aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Rob Landley <rob@landley.net> 2017-09-02 18:15:09 -0500
committer Rob Landley <rob@landley.net> 2017-09-02 18:15:09 -0500
commit 67ddade3373d0fefeff25b48430e5f08c3a7711b (patch)
tree ead8c34b2a28655eef456b0afa104a438cae2a02
parent bebf14cc298eb41d8e5c245e3800aea69ecca08c (diff)
download toybox-67ddade3373d0fefeff25b48430e5f08c3a7711b.tar.gz
In wc, replace mbrtowc() with new utf8towc() which doesn't have a context struct
or care about locale.
-rw-r--r-- lib/lib.h 1
-rw-r--r-- lib/linestack.c 31
-rw-r--r-- toys/posix/wc.c 1
3 files changed, 33 insertions, 0 deletions
diff --git a/lib/lib.h b/lib/lib.h
index bb8dfd7f..37566527 100644
--- a/lib/lib.h
+++ b/lib/lib.h
@@ -257,6 +257,7 @@ void linestack_addstack(struct linestack **lls, struct linestack *throw,
void linestack_insert(struct linestack **lls, long pos, char *line, long len);
void linestack_append(struct linestack **lls, char *line);
struct linestack *linestack_load(char *name);
+int utf8towc(wchar_t *wc, char *str, unsigned len);
int crunch_escape(FILE *out, int cols, int wc);
int crunch_rev_escape(FILE *out, int cols, int wc);
int crunch_str(char **str, int width, FILE *out, char *escmore,
diff --git a/lib/linestack.c b/lib/linestack.c
index 39ba0993..44667106 100644
--- a/lib/linestack.c
+++ b/lib/linestack.c
@@ -80,6 +80,37 @@ struct linestack *linestack_load(char *name)
return ls;
}
// Convert utf8 sequence to a unicode wide character.
//
// wc:  receives the decoded code point (written only on success)
// str: start of the utf8 sequence
// len: number of bytes available at str
//
// Returns the number of bytes consumed (0 when the character is NUL),
// -1 if the bytes are not valid utf8, or -2 if len ran out before the
// sequence was complete (including len == 0).
int utf8towc(wchar_t *wc, char *str, unsigned len)
{
  // Lowest code point that legitimately needs 2, 3 and 4 bytes.
  static const unsigned smallest[] = {0x80, 0x800, 0x10000};
  // Read bytes as unsigned so values >= 0x80 compare and shift correctly
  // even when plain char is signed.
  unsigned char *s = (unsigned char *)str;
  unsigned result, first, count, c;

  // No input: ask for more data rather than reading past the buffer.
  if (!len) return -2;

  // fast path ASCII
  first = *s;
  if (first<128) return !!(*wc = first);

  // Lead bytes announcing 5 or more bytes are never valid. Rejecting them
  // here also caps the loop at 3 continuation bytes, so the mask shift below
  // stays within the width of unsigned.
  if (first>=0xf8) return -1;

  // Each 1 bit following the top bit of the lead byte requests one
  // continuation byte (10xxxxxx) carrying 6 more payload bits.
  result = first;
  for (count = 1; (first&0xc0)==0xc0; count++, first <<= 1) {
    if (count==len) return -2;
    c = s[count];
    if ((c&0xc0) != 0x80) return -1;
    result = (result<<6)|(c&0x3f);
  }

  // A continuation byte cannot start a sequence.
  if (count==1) return -1;

  // Strip the length marker bits: 11, 16 or 21 payload bits remain.
  result &= (1u<<(5*count+1))-1;

  // Avoid overlong encodings
  if (result<smallest[count-2]) return -1;

  // Limit unicode so it can't encode anything UTF-16 can't.
  if (result>0x10ffff || (result>=0xd800 && result<=0xdfff)) return -1;
  *wc = result;

  return count;
}
+
// Show width many columns, negative means from right edge, out=0 just measure
// if escout, send it unprintable chars, otherwise pass through raw data.
// Returns width in columns, moves *str to end of data consumed.
diff --git a/toys/posix/wc.c b/toys/posix/wc.c
index a8c3e452..96ff9788 100644
--- a/toys/posix/wc.c
+++ b/toys/posix/wc.c
@@ -80,6 +80,7 @@ static void do_wc(int fd, char *name)
// next wide size, don't count invalid, fetch more data if necessary
clen = mbrtowc(&wchar, toybuf+pos, len-pos, 0);
+ clen = utf8towc(&wchar, toybuf+pos, len-pos);
if (clen == -1) continue;
if (clen == -2 && !done) break;