From d3025b14b9c13286b79f256d019a99da9425ea0e Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Sat, 15 May 2021 11:14:03 -0500 Subject: Convert utf8towc from wchar_t to unsigned (to match wctoutf8). The maximum unicode code point is 0x10ffff which is 21 bits. --- lib/lib.c | 6 +++--- lib/lib.h | 2 +- lib/linestack.c | 3 +-- lib/llist.c | 2 +- toys/example/demo_utf8towc.c | 4 ++-- toys/pending/vi.c | 5 ++--- toys/posix/cut.c | 2 +- toys/posix/expand.c | 2 +- toys/posix/file.c | 29 +++++++++++++---------------- toys/posix/grep.c | 2 +- toys/posix/wc.c | 3 +-- 11 files changed, 27 insertions(+), 33 deletions(-) diff --git a/lib/lib.c b/lib/lib.c index 87bda4f6..9f9b8136 100644 --- a/lib/lib.c +++ b/lib/lib.c @@ -370,7 +370,7 @@ int wctoutf8(char *s, unsigned wc) // Convert utf8 sequence to a unicode wide character // returns bytes consumed, or -1 if err, or -2 if need more data. -int utf8towc(wchar_t *wc, char *str, unsigned len) +int utf8towc(unsigned *wc, char *str, unsigned len) { unsigned result, mask, first; char *s, c; @@ -403,7 +403,7 @@ char *strlower(char *s) { char *try, *new; int len, mlen = (strlen(s)|7)+9; - wchar_t c; + unsigned c; try = new = xmalloc(mlen); @@ -739,7 +739,7 @@ void loopfiles(char **argv, void (*function)(int fd, char *name)) loopfiles_rw(argv, O_RDONLY|O_CLOEXEC|WARN_ONLY, 0, function); } -// glue to call dl_lines() from loopfiles +// glue to call do_lines() from loopfiles static void (*do_lines_bridge)(char **pline, long len); static void loopfile_lines_bridge(int fd, char *name) { diff --git a/lib/lib.h b/lib/lib.h index f9c04281..cf1920f9 100644 --- a/lib/lib.h +++ b/lib/lib.h @@ -231,7 +231,7 @@ long long atolx(char *c); long long atolx_range(char *numstr, long long low, long long high); int stridx(char *haystack, char needle); int wctoutf8(char *s, unsigned wc); -int utf8towc(wchar_t *wc, char *str, unsigned len); +int utf8towc(unsigned *wc, char *str, unsigned len); char *strlower(char *s); char *strafter(char *haystack, char *needle); char *chomp(char *s); diff --git a/lib/linestack.c b/lib/linestack.c index 0fc83e6b..e6ae1b57 100644 --- a/lib/linestack.c +++ b/lib/linestack.c @@ -93,10 +93,9 @@ int crunch_str(char **str, int width, FILE *out, char *escmore, { int columns = 0, col, bytes; char *start, *end; + unsigned wc; for (end = start = *str; *end; columns += col, end += bytes) { - wchar_t wc; - if ((bytes = utf8towc(&wc, end, 4))>0 && (col = wcwidth(wc))>=0) { if (!escmore || wc>255 || !strchr(escmore, wc)) { if (width-columnsprev is new node) void dlist_add_nomalloc(struct double_list **list, struct double_list *new) { if (*list) { @@ -92,7 +93,6 @@ void dlist_add_nomalloc(struct double_list **list, struct double_list *new) } else *list = new->next = new->prev = new; } - // Add an entry to the end of a doubly linked list struct double_list *dlist_add(struct double_list **list, char *data) { diff --git a/toys/example/demo_utf8towc.c b/toys/example/demo_utf8towc.c index 136be6ca..2c6050b1 100644 --- a/toys/example/demo_utf8towc.c +++ b/toys/example/demo_utf8towc.c @@ -19,8 +19,8 @@ void demo_utf8towc_main(void) { mbstate_t mb; int len1, len2; - unsigned u, h; - wchar_t wc1, wc2; + unsigned u, h, wc2; + wchar_t wc1; memset(&mb, 0, sizeof(mb)); for (u = 1; u<=0x10ffff; u++) { diff --git a/toys/pending/vi.c b/toys/pending/vi.c index da43d5d5..87c49d13 100644 --- a/toys/pending/vi.c +++ b/toys/pending/vi.c @@ -84,7 +84,7 @@ static const char *specials = ",.:;=-+*/(){}<>[]!@#$%^&|\\?\"\'"; //get utf8 length and width at same time static int utf8_lnw(int *width, char *s, int bytes) { - wchar_t wc; + unsigned wc; int length = 1; if (*s == '\t') *width = TT.tabstop; @@ -1312,10 +1312,9 @@ static int crunch_nstr(char **str, int width, int n, FILE *out, char *escmore, { int columns = 0, col, bytes; char *start, *end; + unsigned wc; for (end = start = *str; *end && n>0; columns += col, end += bytes, n -= bytes) { - wchar_t wc; - if ((bytes = utf8towc(&wc, end, 4))>0 && (col = wcwidth(wc))>=0) { if (!escmore || wc>255 || !strchr(escmore, wc)) { if (width-columns32 && !memcmp(toybuf, "\xff\xd8", 2)) xputs("JPEG image data"); + else if (len>32 && !memcmp(s, "\xff\xd8", 2)) xputs("JPEG image data"); // https://en.wikipedia.org/wiki/Java_class_file#General_layout else if (len>8 && strstart(&s, "\xca\xfe\xba\xbe")) @@ -252,9 +252,9 @@ static void do_regular_file(int fd, char *name) else if (len>85 && strstart(&s, "07070")) { char *cpioformat = "unknown type"; - if (toybuf[5] == '7') cpioformat = "pre-SVR4 or odc"; - else if (toybuf[5] == '1') cpioformat = "SVR4 with no CRC"; - else if (toybuf[5] == '2') cpioformat = "SVR4 with CRC"; + if (*s == '7') cpioformat = "pre-SVR4 or odc"; + else if (*s == '1') cpioformat = "SVR4 with no CRC"; + else if (*s == '2') cpioformat = "SVR4 with CRC"; xprintf("ASCII cpio archive (%s)\n", cpioformat); } else if (len>33 && ((magic=peek(&s,2))==0143561 || magic==070707)) { if (magic == 0143561) printf("byte-swapped "); @@ -265,16 +265,12 @@ static void do_regular_file(int fd, char *name) (s[262]!=' ' || s[263]!=' ')?"":" (GNU)"); // zip/jar/apk archive, ODF/OOXML document, or such else if (len>5 && strstart(&s, "PK\03\04")) { - int ver = toybuf[4]; - xprintf("Zip archive data"); - if (ver) xprintf(", requires at least v%d.%d to extract", ver/10, ver%10); + if (*s) xprintf(", requires at least v%d.%d to extract", *s/10, *s%10); xputc('\n'); } else if (len>9 && strstart(&s, "7z\xbc\xaf\x27\x1c")) { - int ver = toybuf[6]*10+toybuf[7]; - xprintf("7-zip archive data"); - if (ver) xprintf(", version %d.%d", ver/10, ver%10); + if (*s || s[1]) xprintf(", version %d.%d", *s, s[1]); xputc('\n'); } else if (len>4 && strstart(&s, "BZh") && isdigit(*s)) xprintf("bzip2 compressed data, block size = %c00k\n", *s); @@ -410,13 +406,14 @@ static void do_regular_file(int fd, char *name) // Whitespace is allowed between the #! and the interpreter while (isspace(*s)) s++; if (strstart(&s, "/usr/bin/env")) while (isspace(*s)) s++; - for (what = s; (s-toybuf)0 && wcwidth(wc)>=0) { i += bytes-1; if (!what) what = "UTF-8 text"; diff --git a/toys/posix/grep.c b/toys/posix/grep.c index 52d10139..8eb3c03a 100644 --- a/toys/posix/grep.c +++ b/toys/posix/grep.c @@ -124,7 +124,7 @@ static void do_grep(int fd, char *name) if (!FLAG(a) && !lseek(fd, 0, SEEK_CUR)) { char buf[256]; int len, i = 0; - wchar_t wc; + unsigned wc; // If the first 256 bytes don't parse as utf8, call it binary. if (0<(len = read(fd, buf, 256))) { diff --git a/toys/posix/wc.c b/toys/posix/wc.c index 910e4690..118e7750 100644 --- a/toys/posix/wc.c +++ b/toys/posix/wc.c @@ -74,6 +74,7 @@ static void do_wc(int fd, char *name) for (;;) { int pos, done = 0, len2 = read(fd, toybuf+len, sizeof(toybuf)-len); + unsigned wchar; if (len2<0) perror_msg_raw(name); else len += len2; @@ -85,8 +86,6 @@ static void do_wc(int fd, char *name) if (FLAG(m)) { // If we've consumed next wide char if (--clen<1) { - wchar_t wchar; - // next wide size, don't count invalid, fetch more data if necessary clen = utf8towc(&wchar, toybuf+pos, len-pos); if (clen == -1) continue; -- cgit v1.2.3