diff options
-rw-r--r-- | lib/lib.c | 9 | ||||
-rw-r--r-- | toys/example/test_utf8towc.c | 42 |
2 files changed, 45 insertions, 6 deletions
@@ -345,20 +345,17 @@ int utf8towc(wchar_t *wc, char *str, unsigned len)
   if (len && *str<128) return !!(*wc = *str);
 
   result = first = *(s = str++);
+  if (result<0xc2 || result>0xf4) return -1;
   for (mask = 6; (first&0xc0)==0xc0; mask += 5, first <<= 1) {
-    if (mask>21) return -1;
     if (!--len) return -2;
-    c = *(str++);
-    if ((c&0xc0) != 0x80) return -1;
+    if (((c = *(str++))&0xc0) != 0x80) return -1;
     result = (result<<6)|(c&0x3f);
   }
   result &= (1<<mask)-1;
   c = str-s;
-  if (mask==6) return -1;
 
   // Avoid overlong encodings
-  if (mask==6 || mask>21 || result<(unsigned []){0x80,0x800,0x10000}[c-2])
-    return -1;
+  if (result<(unsigned []){0x80,0x800,0x10000}[c-2]) return -1;
 
   // Limit unicode so it can't encode anything UTF-16 can't.
   if (result>0x10ffff || (result>=0xd800 && result<=0xdfff)) return -1;
diff --git a/toys/example/test_utf8towc.c b/toys/example/test_utf8towc.c
new file mode 100644
index 00000000..f939eaa7
--- /dev/null
+++ b/toys/example/test_utf8towc.c
@@ -0,0 +1,42 @@
+/* test_utf8towc() against libc mbrtowc()
+ *
+ * Copyright 2017 Rob Landley <rob@landley.net>
+
+USE_TEST_UTF8TOWC(NEWTOY(test_utf8towc, 0, TOYFLAG_USR|TOYFLAG_BIN))
+
+config TEST_UTF8TOWC
+  bool "test_utf8towc"
+  default n
+  help
+    usage: test_utf8towc
+
+    Print differences between toybox's utf8 conversion routines vs libc du jour.
+*/
+
+#include "toys.h"
+
+void test_utf8towc_main(void)
+{
+  mbstate_t mb;
+  int len1, len2;
+  unsigned u, h;
+  wchar_t wc1, wc2;
+
+  setlocale(LC_ALL, "en_US.UTF-8");
+
+  memset(&mb, 0, sizeof(mb));
+  for (u=1; u; u++) {
+    char *str = (void *)&h;
+
+    wc1 = wc2 = 0;
+    len2 = 4;
+    h = htonl(u);
+    while (!*str) str++, len2--;
+
+    len1 = mbrtowc(&wc1, str, len2, &mb);
+    if (len1<0) memset(&mb, 0, sizeof(mb));
+    len2 = utf8towc(&wc2, str, len2);
+    if (len1 != len2 || wc1 != wc2)
+      printf("%x %d %x %d %x\n", u, len1, wc1, len2, wc2);
+  }
+}