aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Elliott Hughes <enh@google.com> 2020-12-05 17:41:39 -0800
committer Rob Landley <rob@landley.net> 2020-12-06 02:24:12 -0600
commit ed3d5eb0eaf74e6686bc2576b2c4d5a5343dfd57 (patch)
tree e0116710fb056cee84f021d8b2c230806679b65b
parent 49c02dbe435681015a88c636749d144044fc5e4a (diff)
download toybox-ed3d5eb0eaf74e6686bc2576b2c4d5a5343dfd57.tar.gz
unicode: new toy.
Based loosely on the Plan9/Inferno utility, and a convenient way to go back and forth between code points and utf8 sequences. This patch also fixes a couple of bugs in wctoutf8 (and the tests for this toy effectively serve as unit tests for wctoutf8/utf8towc).
-rw-r--r-- lib/lib.c 11
-rwxr-xr-x tests/unicode.test 13
-rw-r--r-- toys/pending/unicode.c 65
3 files changed, 84 insertions, 5 deletions
diff --git a/lib/lib.c b/lib/lib.c
index 7786fcc1..3129e3e3 100644
--- a/lib/lib.c
+++ b/lib/lib.c
@@ -349,17 +349,18 @@ int stridx(char *haystack, char needle)
// Convert wc to utf8, returning bytes written. Does not null terminate.
int wctoutf8(char *s, unsigned wc)
{
- int len = (wc>0x7ff)+(wc>0xffff), mask = 12+len+!!len;
+ int len = (wc>0x7ff)+(wc>0xffff), i;
if (wc<128) {
*s = wc;
return 1;
} else {
+ i = len;
do {
- s[1+len] = 0x80+(wc&0x3f);
- wc >>= 7;
- } while (len--);
- *s = wc|mask;
+ s[1+i] = 0x80+(wc&0x3f);
+ wc >>= 6;
+ } while (i--);
+ *s = (((signed char) 0x80) >> (len+1)) | wc;
}
return 2+len;
diff --git a/tests/unicode.test b/tests/unicode.test
new file mode 100755
index 00000000..099231d2
--- /dev/null
+++ b/tests/unicode.test
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+[ -f testing.sh ] && . testing.sh
+
+#testing "name" "command" "result" "infile" "stdin"
+
+testing "text" "unicode 안녕 hi" "U+C548 : 안 : 0xec 0x95 0x88\nU+B155 : 녕 : 0xeb 0x85 0x95\nU+0068 : h\nU+0069 : i\n" "" ""
+testing "code points" "unicode 70 666" "U+0070 : p\nU+0666 : ٦ : 0xd9 0xa6\n" "" ""
+testing "ASCII controls" "unicode 0" "U+0000 : NUL\n" "" ""
+testing "del" "unicode 7f" "U+007F : DEL\n" "" ""
+testing "3-byte" "unicode 30b9" "U+30B9 : ス : 0xe3 0x82 0xb9\n" "" ""
+testing "4-byte" "unicode 10000" "U+10000 : 𐀀 : 0xf0 0x90 0x80 0x80\n" "" ""
+testing "range" "unicode 660-662" "U+0660 : ٠ : 0xd9 0xa0\nU+0661 : ١ : 0xd9 0xa1\nU+0662 : ٢ : 0xd9 0xa2\n" "" ""
diff --git a/toys/pending/unicode.c b/toys/pending/unicode.c
new file mode 100644
index 00000000..0a9eb24a
--- /dev/null
+++ b/toys/pending/unicode.c
@@ -0,0 +1,65 @@
+/* unicode.c - convert between Unicode and UTF-8
+ *
+ * Copyright 2020 The Android Open Source Project.
+ *
+ * Loosely based on the Plan9/Inferno unicode(1).
+
+USE_UNICODE(NEWTOY(unicode, "<1", TOYFLAG_USR|TOYFLAG_BIN))
+
+config UNICODE
+ bool "unicode"
+ default n
+ help
+ usage: unicode [[min]-max]
+
+ Convert between Unicode code points and UTF-8, in both directions.
+*/
+
+#define FOR_unicode
+#include "toys.h"
+
+static void codepoint(unsigned wc) {
+ char *low="NULSOHSTXETXEOTENQACKBELBS HT LF VT FF CR SO SI DLEDC1DC2DC3DC4"
+ "NAKSYNETBCANEM SUBESCFS GS RS US ";
+ unsigned n, i;
+
+ printf("U+%04X : ", wc);
+ if (wc < ' ') printf("%.3s", low+(wc*3));
+ else if (wc == 0x7f) printf("DEL");
+ else {
+ toybuf[n = wctoutf8(toybuf, wc)] = 0;
+ printf("%s%s", toybuf, n>1 ? " :":"");
+ if (n>1) for (i = 0; i < n; i++) printf(" %#02x", toybuf[i]);
+ }
+ xputc('\n');
+}
+
+void unicode_main(void)
+{
+ unsigned from, to;
+ char next, **args;
+
+ for (args = toys.optargs; *args; args++) {
+ // unicode 660-666 => table of `U+0660 : ٠ : 0xd9 0xa0` etc.
+ if (sscanf(*args, "%x-%x%c", &from, &to, &next) == 2) {
+ while (from <= to) codepoint(from++);
+
+ // unicode 666 => just `U+0666 : ٦ : 0xd9 0xa6`.
+ } else if (sscanf(*args, "%x%c", &from, &next) == 1) {
+ codepoint(from);
+
+ // unicode hello => table showing every character in the string.
+ } else {
+ char *s = *args;
+ size_t l = strlen(s);
+ wchar_t wc;
+ int n;
+
+ while ((n = utf8towc(&wc, s, l)) > 0) {
+ codepoint(wc);
+ s += n;
+ l -= n;
+ }
+ }
+ }
+}