// utf8towc() as introduced for lib/linestack.c by toybox commit 67ddade3373d
// (Rob Landley, 2017-09-02: "In wc, replace mbrtowc() with new utf8towc()
// which doesn't have a context struct or care about locale").
// Declared in lib/lib.h as: int utf8towc(wchar_t *wc, char *str, unsigned len);
// Called from do_wc() in toys/posix/wc.c as
//   clen = utf8towc(&wchar, toybuf+pos, len-pos);
// where -1 means "don't count this byte" and -2 (when not done) means
// "fetch more data".

// Convert utf8 sequence to a unicode wide character.
//
// wc:  receives the decoded code point (only written on success).
// str: start of the utf8 sequence, need not be NUL terminated.
// len: number of readable bytes at str.
//
// Returns the number of bytes consumed (0 when the character is NUL, like
// mbrtowc), -1 for an invalid sequence, or -2 when the sequence is cut off
// by len and more data is needed (including len == 0).
int utf8towc(wchar_t *wc, char *str, unsigned len)
{
  // Read bytes as unsigned so this works whether or not plain char is signed.
  unsigned char *start = (unsigned char *)str, *s = start;
  unsigned result, mask, first;
  int bytes;

  // Nothing readable: don't touch *str, ask the caller for more data.
  if (!len) return -2;

  // fast path ASCII
  if (*s<128) return !!(*wc = *s);

  // Each leading 1 bit after the first in the lead byte announces one
  // continuation byte. mask tracks how many payload bits the result holds:
  // 11 for 2 bytes, 16 for 3 bytes, 21 for 4 bytes.
  result = first = *(s++);
  for (mask = 6; (first&0xc0)==0xc0; mask += 5, first <<= 1) {
    unsigned c;

    if (!--len) return -2;
    c = *(s++);
    if ((c&0xc0) != 0x80) return -1;
    result = (result<<6)|(c&0x3f);
  }

  // mask==6 is a stray continuation byte, mask>21 is a 5+ byte sequence.
  // Reject both before shifting: 1<<mask is undefined once mask reaches the
  // width of int, and bytes-2 below must index the 3 entry table.
  if (mask==6 || mask>21) return -1;
  result &= (1u<<mask)-1;
  bytes = s-start;

  // Avoid overlong encodings
  if (result<(unsigned []){0x80,0x800,0x10000}[bytes-2]) return -1;

  // Limit unicode so it can't encode anything UTF-16 can't.
  if (result>0x10ffff || (result>=0xd800 && result<=0xdfff)) return -1;
  *wc = result;

  return bytes;
}