unicode: new toy.

Based loosely on the Plan9/Inferno utility, and a convenient way to go back and forth between code points and utf8 sequences. This patch also fixes a couple of bugs in wctoutf8 (and the tests for this toy effectively serve as unit tests for wctoutf8/utf8towc).
author: Elliott Hughes <enh@google.com> 2020-12-05 17:41:39 -0800
committer: Rob Landley <rob@landley.net> 2020-12-06 02:24:12 -0600
commit: ed3d5eb0eaf74e6686bc2576b2c4d5a5343dfd57 (patch)
tree: e0116710fb056cee84f021d8b2c230806679b65b /toys/pending
parent: 49c02dbe435681015a88c636749d144044fc5e4a (diff)
download: toybox-ed3d5eb0eaf74e6686bc2576b2c4d5a5343dfd57.tar.gz
1 files changed, 65 insertions, 0 deletions
diff --git a/toys/pending/unicode.c b/toys/pending/unicode.c
new file mode 100644
index 00000000..0a9eb24a
--- /dev/null
+++ b/toys/pending/unicode.c
@@ -0,0 +1,65 @@
+/* unicode.c - convert between Unicode and UTF-8
+ *
+ * Copyright 2020 The Android Open Source Project.
+ *
+ * Loosely based on the Plan9/Inferno unicode(1).
+
+USE_UNICODE(NEWTOY(unicode, "<1", TOYFLAG_USR|TOYFLAG_BIN))
+
+config UNICODE
+  bool "unicode"
+  default n
+  help
+    usage: unicode HEX[-HEX]|STRING...
+
+    Convert between Unicode code points and UTF-8, in both directions.
+*/
+
+#define FOR_unicode
+#include "toys.h"
+
+// Print one line describing code point wc: "U+XXXX : " followed by either a
+// control-character abbreviation or the character's UTF-8 encoding.
+static void codepoint(unsigned wc)
+{
+  // Abbreviations for U+0000..U+001F, three characters each (space padded),
+  // so the entry for wc starts at offset wc*3.
+  char *low="NULSOHSTXETXEOTENQACKBELBS HT LF VT FF CR SO SI DLEDC1DC2DC3DC4"
+            "NAKSYNETBCANEM SUBESCFS GS RS US ";
+  unsigned n, i;
+
+  printf("U+%04X : ", wc);
+  if (wc < ' ') printf("%.3s", low+(wc*3));
+  else if (wc == 0x7f) printf("DEL");
+  else {
+    // wctoutf8()'s return value is used as the encoded length in bytes.
+    toybuf[n = wctoutf8(toybuf, wc)] = 0;
+    printf("%s%s", toybuf, n>1 ? " :":"");
+    // Multi-byte encodings also list each byte in hex. Bytes of a multi-byte
+    // UTF-8 sequence have the top bit set, so go through unsigned char: if
+    // the buffer is plain char and char is signed, the promoted value would
+    // be negative and print sign-extended (e.g. 0xffffffd9, not 0xd9).
+    if (n>1) for (i = 0; i < n; i++)
+      printf(" %#02x", (unsigned char)toybuf[i]);
+  }
+  xputc('\n');
+}
+
+// Entry point: print a codepoint() line for every code point named by the
+// command line. Each argument is tried as a hex range, then as a single hex
+// code point, and is otherwise decoded as a UTF-8 string.
+void unicode_main(void)
+{
+  unsigned from, to;
+  char next, **args;
+
+  for (args = toys.optargs; *args; args++) {
+    // In both sscanf() calls the trailing %c only converts when something
+    // follows the number(s), so the result equals the count of hex fields
+    // only if the argument holds nothing else after them.
+
+    // unicode 660-666 => table of `U+0660 : ٠ : 0xd9 0xa0` etc.
+    if (sscanf(*args, "%x-%x%c", &from, &to, &next) == 2) {
+      // Compare before incrementing: with `while (from <= to) ...from++` a
+      // range ending at the largest unsigned value would wrap from to 0 and
+      // never terminate.
+      if (from <= to) {
+        do codepoint(from);
+        while (from++ < to);
+      }
+
+    // unicode 666 => just `U+0666 : ٦ : 0xd9 0xa6`.
+    } else if (sscanf(*args, "%x%c", &from, &next) == 1) {
+      codepoint(from);
+
+    // unicode hello => table showing every character in the string.
+    } else {
+      char *s = *args;
+      size_t l = strlen(s);
+      wchar_t wc;
+      int n;
+
+      // Decode one character at a time, advancing by the number of bytes
+      // utf8towc() reports; stop as soon as it returns a non-positive value.
+      while ((n = utf8towc(&wc, s, l)) > 0) {
+        codepoint(wc);
+        s += n;
+        l -= n;
+      }
+    }
+  }
+}
author	Elliott Hughes <enh@google.com>	2020-12-05 17:41:39 -0800
committer	Rob Landley <rob@landley.net>	2020-12-06 02:24:12 -0600
commit	ed3d5eb0eaf74e6686bc2576b2c4d5a5343dfd57 (patch)
tree	e0116710fb056cee84f021d8b2c230806679b65b /toys/pending
parent	49c02dbe435681015a88c636749d144044fc5e4a (diff)
download	toybox-ed3d5eb0eaf74e6686bc2576b2c4d5a5343dfd57.tar.gz