aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Rob Landley <rob@landley.net> 2017-09-05 02:36:24 -0500
committer Rob Landley <rob@landley.net> 2017-09-05 02:36:24 -0500
commit b3e70932b6534e603b03d28c45133b8f991f48fe (patch)
tree da3eff855103f5efd8ae82302bcd8b941c3611c7
parent 6e766936396e2da7fb3820cadb3a9ae823caa9a8 (diff)
download toybox-b3e70932b6534e603b03d28c45133b8f991f48fe.tar.gz
Tweak utf8towc() to return -1 earlier sometimes (instead of -2), and add test
program to compare against libc output.
-rw-r--r-- lib/lib.c 9
-rw-r--r-- toys/example/test_utf8towc.c 42
2 files changed, 45 insertions, 6 deletions
diff --git a/lib/lib.c b/lib/lib.c
index c482dcab..a4b7229b 100644
--- a/lib/lib.c
+++ b/lib/lib.c
@@ -345,20 +345,17 @@ int utf8towc(wchar_t *wc, char *str, unsigned len)
if (len && *str<128) return !!(*wc = *str);
result = first = *(s = str++);
+ if (result<0xc2 || result>0xf4) return -1;
for (mask = 6; (first&0xc0)==0xc0; mask += 5, first <<= 1) {
- if (mask>21) return -1;
if (!--len) return -2;
- c = *(str++);
- if ((c&0xc0) != 0x80) return -1;
+ if (((c = *(str++))&0xc0) != 0x80) return -1;
result = (result<<6)|(c&0x3f);
}
result &= (1<<mask)-1;
c = str-s;
- if (mask==6) return -1;
// Avoid overlong encodings
- if (mask==6 || mask>21 || result<(unsigned []){0x80,0x800,0x10000}[c-2])
- return -1;
+ if (result<(unsigned []){0x80,0x800,0x10000}[c-2]) return -1;
// Limit unicode so it can't encode anything UTF-16 can't.
if (result>0x10ffff || (result>=0xd800 && result<=0xdfff)) return -1;
diff --git a/toys/example/test_utf8towc.c b/toys/example/test_utf8towc.c
new file mode 100644
index 00000000..f939eaa7
--- /dev/null
+++ b/toys/example/test_utf8towc.c
@@ -0,0 +1,42 @@
+/* test_utf8towc() against libc mbrtowc()
+ *
+ * Copyright 2017 Rob Landley <rob@landley.net>
+
+USE_TEST_UTF8TOWC(NEWTOY(test_utf8towc, 0, TOYFLAG_USR|TOYFLAG_BIN))
+
+config TEST_UTF8TOWC
+ bool "test_utf8towc"
+ default n
+ help
+ usage: test_utf8towc
+
+ Print differences between toybox's utf8 conversion routines vs libc du jour.
+*/
+
+#include "toys.h"
+
+void test_utf8towc_main(void) // Decode every nonzero 32-bit pattern with both libc mbrtowc() and utf8towc(), printing each mismatch.
+{
+ mbstate_t mb; // conversion state handed to mbrtowc()
+ int len1, len2; // len1: mbrtowc() result; len2: input byte count, then utf8towc() result
+ unsigned u, h; // u: value under test; h: the same value after htonl()
+ wchar_t wc1, wc2; // wc1: character from mbrtowc(); wc2: character from utf8towc()
+
+ setlocale(LC_ALL, "en_US.UTF-8"); // request a UTF-8 locale for mbrtowc(); the return value is not checked
+
+ memset(&mb, 0, sizeof(mb)); // start from a zeroed conversion state
+ for (u=1; u; u++) { // runs u from 1 upward and stops when the unsigned counter wraps to 0
+ char *str = (void *)&h; // read h one byte at a time
+
+ wc1 = wc2 = 0; // clear both outputs so values from the previous pass are not compared
+ len2 = 4; // NOTE(review): assumes unsigned is 4 bytes -- confirm for the target
+ h = htonl(u); // network byte order: most significant byte of u comes first in memory
+ while (!*str) str++, len2--; // skip leading zero bytes; u is nonzero so a nonzero byte is always found
+
+ len1 = mbrtowc(&wc1, str, len2, &mb); // libc's decode of the remaining len2 bytes
+ if (len1<0) memset(&mb, 0, sizeof(mb)); // mbrtowc() failure values stored in int are expected to be negative; zero the state before the next pass
+ len2 = utf8towc(&wc2, str, len2); // decode the same bytes with utf8towc(); len2 now holds its return value
+ if (len1 != len2 || wc1 != wc2) // report only when the return values or the decoded characters differ
+ printf("%x %d %x %d %x\n", u, len1, wc1, len2, wc2); // columns: u, mbrtowc result, its wchar, utf8towc result, its wchar; NOTE(review): %x with wchar_t assumes an int-sized wchar_t -- confirm
+ }
+}