From 6e766936396e2da7fb3820cadb3a9ae823caa9a8 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Sat, 2 Sep 2017 20:40:24 -0500 Subject: utf8towc() has to be in lib.c if strlower() is going to use it, because scripts/*.c builds against lib.c but not linestack.c. --- lib/lib.c | 34 +++++++++++++++++++++++++++++++++- lib/lib.h | 2 +- lib/linestack.c | 33 +-------------------------------- 3 files changed, 35 insertions(+), 34 deletions(-) diff --git a/lib/lib.c b/lib/lib.c index d011af02..c482dcab 100644 --- a/lib/lib.c +++ b/lib/lib.c @@ -335,6 +335,38 @@ int stridx(char *haystack, char needle) return off-haystack; } +// Convert utf8 sequence to a unicode wide character +int utf8towc(wchar_t *wc, char *str, unsigned len) +{ + unsigned result, mask, first; + char *s, c; + + // fast path ASCII + if (len && *str<128) return !!(*wc = *str); + + result = first = *(s = str++); + for (mask = 6; (first&0xc0)==0xc0; mask += 5, first <<= 1) { + if (mask>21) return -1; + if (!--len) return -2; + c = *(str++); + if ((c&0xc0) != 0x80) return -1; + result = (result<<6)|(c&0x3f); + } + result &= (1<21 || result<(unsigned []){0x80,0x800,0x10000}[c-2]) + return -1; + + // Limit unicode so it can't encode anything UTF-16 can't. + if (result>0x10ffff || (result>=0xd800 && result<=0xdfff)) return -1; + *wc = result; + + return str-s; +} + char *strlower(char *s) { char *try, *new; @@ -348,7 +380,7 @@ char *strlower(char *s) while (*s) { wchar_t c; - int len = mbrtowc(&c, s, MB_CUR_MAX, 0); + int len = utf8towc(&c, s, MB_CUR_MAX); if (len < 1) *(new++) = *(s++); else { diff --git a/lib/lib.h b/lib/lib.h index 37566527..9325a89b 100644 --- a/lib/lib.h +++ b/lib/lib.h @@ -204,6 +204,7 @@ long long xstrtol(char *str, char **end, int base); long long atolx(char *c); long long atolx_range(char *numstr, long long low, long long high); int stridx(char *haystack, char needle); +int utf8towc(wchar_t *wc, char *str, unsigned len); char *strlower(char *s); char *strafter(char *haystack, char *needle); char *chomp(char *s); @@ -257,7 +258,6 @@ void linestack_addstack(struct linestack **lls, struct linestack *throw, void linestack_insert(struct linestack **lls, long pos, char *line, long len); void linestack_append(struct linestack **lls, char *line); struct linestack *linestack_load(char *name); -int utf8towc(wchar_t *wc, char *str, unsigned len); int crunch_escape(FILE *out, int cols, int wc); int crunch_rev_escape(FILE *out, int cols, int wc); int crunch_str(char **str, int width, FILE *out, char *escmore, diff --git a/lib/linestack.c b/lib/linestack.c index 44667106..91dec564 100644 --- a/lib/linestack.c +++ b/lib/linestack.c @@ -80,37 +80,6 @@ struct linestack *linestack_load(char *name) return ls; } -// Convert utf8 sequence to a unicode wide character -int utf8towc(wchar_t *wc, char *str, unsigned len) -{ - unsigned result, mask, first; - char *s, c; - - // fast path ASCII - if (len && *str<128) return !!(*wc = *str); - - result = first = *(s = str++); - for (mask = 6; (first&0xc0)==0xc0; mask += 5, first <<= 1) { - if (!--len) return -2; - c = *(str++); - if ((c&0xc0) != 0x80) return -1; - result = (result<<6)|(c&0x3f); - } - result &= (1<21) return -1; - - // Avoid overlong encodings - if (mask==6 || mask>21 || result<(unsigned []){0x80,0x800,0x10000}[c-2]) - return -1; - - // Limit unicode so it can't encode anything UTF-16 can't. - if (result>0x10ffff || (result>=0xd800 && result<=0xdfff)) return -1; - *wc = result; - - return str-s; -} - // Show width many columns, negative means from right edge, out=0 just measure // if escout, send it unprintable chars, otherwise pass through raw data. // Returns width in columns, moves *str to end of data consumed. @@ -123,7 +92,7 @@ int crunch_str(char **str, int width, FILE *out, char *escmore, for (end = start = *str; *end; columns += col, end += bytes) { wchar_t wc; - if ((bytes = mbrtowc(&wc, end, MB_CUR_MAX, 0))>0 && (col = wcwidth(wc))>=0) + if ((bytes = utf8towc(&wc, end, 4))>0 && (col = wcwidth(wc))>=0) { if (!escmore || wc>255 || !strchr(escmore, wc)) { if (width-columns