diff options
author | Elliott Hughes <enh@google.com> | 2020-12-05 17:41:39 -0800 |
---|---|---|
committer | Rob Landley <rob@landley.net> | 2020-12-06 02:24:12 -0600 |
commit | ed3d5eb0eaf74e6686bc2576b2c4d5a5343dfd57 (patch) | |
tree | e0116710fb056cee84f021d8b2c230806679b65b | |
parent | 49c02dbe435681015a88c636749d144044fc5e4a (diff) | |
download | toybox-ed3d5eb0eaf74e6686bc2576b2c4d5a5343dfd57.tar.gz |
unicode: new toy.
Based loosely on the Plan9/Inferno utility, this is a convenient way to go back
and forth between code points and UTF-8 sequences.
This patch also fixes a couple of bugs in wctoutf8 (and the tests for this
toy effectively serve as unit tests for wctoutf8/utf8towc).
-rw-r--r-- | lib/lib.c | 11 |
-rwxr-xr-x | tests/unicode.test | 13 |
-rw-r--r-- | toys/pending/unicode.c | 65 |
3 files changed, 84 insertions, 5 deletions
@@ -349,17 +349,18 @@ int stridx(char *haystack, char needle) // Convert wc to utf8, returning bytes written. Does not null terminate. int wctoutf8(char *s, unsigned wc) { - int len = (wc>0x7ff)+(wc>0xffff), mask = 12+len+!!len; + int len = (wc>0x7ff)+(wc>0xffff), i; if (wc<128) { *s = wc; return 1; } else { + i = len; do { - s[1+len] = 0x80+(wc&0x3f); - wc >>= 7; - } while (len--); - *s = wc|mask; + s[1+i] = 0x80+(wc&0x3f); + wc >>= 6; + } while (i--); + *s = (((signed char) 0x80) >> (len+1)) | wc; } return 2+len; diff --git a/tests/unicode.test b/tests/unicode.test new file mode 100755 index 00000000..099231d2 --- /dev/null +++ b/tests/unicode.test @@ -0,0 +1,13 @@ +#!/bin/bash + +[ -f testing.sh ] && . testing.sh + +#testing "name" "command" "result" "infile" "stdin" + +testing "text" "unicode 안녕 hi" "U+C548 : 안 : 0xec 0x95 0x88\nU+B155 : 녕 : 0xeb 0x85 0x95\nU+0068 : h\nU+0069 : i\n" "" "" +testing "code points" "unicode 70 666" "U+0070 : p\nU+0666 : ٦ : 0xd9 0xa6\n" "" "" +testing "ASCII controls" "unicode 0" "U+0000 : NUL\n" "" "" +testing "del" "unicode 7f" "U+007F : DEL\n" "" "" +testing "3-byte" "unicode 30b9" "U+30B9 : ス : 0xe3 0x82 0xb9\n" "" "" +testing "4-byte" "unicode 10000" "U+10000 : 𐀀 : 0xf0 0x90 0x80 0x80\n" "" "" +testing "range" "unicode 660-662" "U+0660 : ٠ : 0xd9 0xa0\nU+0661 : ١ : 0xd9 0xa1\nU+0662 : ٢ : 0xd9 0xa2\n" "" "" diff --git a/toys/pending/unicode.c b/toys/pending/unicode.c new file mode 100644 index 00000000..0a9eb24a --- /dev/null +++ b/toys/pending/unicode.c @@ -0,0 +1,65 @@ +/* unicode.c - convert between Unicode and UTF-8 + * + * Copyright 2020 The Android Open Source Project. + * + * Loosely based on the Plan9/Inferno unicode(1). + +USE_UNICODE(NEWTOY(unicode, "<1", TOYFLAG_USR|TOYFLAG_BIN)) + +config UNICODE + bool "unicode" + default n + help + usage: unicode [[min]-max] + + Convert between Unicode code points and UTF-8, in both directions. 
+*/ + +#define FOR_unicode +#include "toys.h" + +static void codepoint(unsigned wc) { + char *low="NULSOHSTXETXEOTENQACKBELBS HT LF VT FF CR SO SI DLEDC1DC2DC3DC4" + "NAKSYNETBCANEM SUBESCFS GS RS US "; + unsigned n, i; + + printf("U+%04X : ", wc); + if (wc < ' ') printf("%.3s", low+(wc*3)); + else if (wc == 0x7f) printf("DEL"); + else { + toybuf[n = wctoutf8(toybuf, wc)] = 0; + printf("%s%s", toybuf, n>1 ? " :":""); + if (n>1) for (i = 0; i < n; i++) printf(" %#02x", toybuf[i]); + } + xputc('\n'); +} + +void unicode_main(void) +{ + unsigned from, to; + char next, **args; + + for (args = toys.optargs; *args; args++) { + // unicode 660-666 => table of `U+0660 : ٠ : 0xd9 0xa0` etc. + if (sscanf(*args, "%x-%x%c", &from, &to, &next) == 2) { + while (from <= to) codepoint(from++); + + // unicode 666 => just `U+0666 : ٦ : 0xd9 0xa6`. + } else if (sscanf(*args, "%x%c", &from, &next) == 1) { + codepoint(from); + + // unicode hello => table showing every character in the string. + } else { + char *s = *args; + size_t l = strlen(s); + wchar_t wc; + int n; + + while ((n = utf8towc(&wc, s, l)) > 0) { + codepoint(wc); + s += n; + l -= n; + } + } + } +} |