From 6e766936396e2da7fb3820cadb3a9ae823caa9a8 Mon Sep 17 00:00:00 2001
From: Rob Landley <rob@landley.net>
Date: Sat, 2 Sep 2017 20:40:24 -0500
Subject: utf8towc() has to be in lib.c if strlower() is going to use it,
 because scripts/*.c builds against lib.c but not linestack.c.

---
 lib/lib.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

(limited to 'lib/lib.c')

diff --git a/lib/lib.c b/lib/lib.c
index d011af02..c482dcab 100644
--- a/lib/lib.c
+++ b/lib/lib.c
@@ -335,6 +335,38 @@ int stridx(char *haystack, char needle)
   return off-haystack;
 }
 
+// Convert utf8 sequence (at most len bytes of str) to a unicode wide character.
+// Returns bytes consumed (0 for NUL), -1 if invalid, or -2 if need more data.
+int utf8towc(wchar_t *wc, char *str, unsigned len)
+{
+  unsigned result, mask, first;
+  char *s = str, c;
+
+  // fast path ASCII (cast so this works whether or not char is signed)
+  if (!len) return -2;
+  result = first = (unsigned char)*(str++);
+  if (first<128) return !!(*wc = first);
+
+  // Each leading 1 bit after the first demands one more continuation byte
+  for (mask = 6; (first&0xc0)==0xc0; mask += 5, first <<= 1) {
+    if (mask>21) return -1;
+    if (!--len) return -2;
+    if (((c = *(str++))&0xc0) != 0x80) return -1;
+    result = (result<<6)|(c&0x3f);
+  }
+  result &= (1<<mask)-1;
+
+  // Reject stray continuation bytes, 5+ byte forms, and overlong encodings
+  if (mask==6 || mask>21 || result<(unsigned []){0x80,0x800,0x10000}[str-s-2])
+    return -1;
+
+  // Limit unicode so it can't encode anything UTF-16 can't.
+  if (result>0x10ffff || (result>=0xd800 && result<=0xdfff)) return -1;
+  *wc = result;
+
+  return str-s;
+}
+
 char *strlower(char *s)
 {
   char *try, *new;
@@ -348,7 +380,7 @@ char *strlower(char *s)
 
     while (*s) {
       wchar_t c;
-      int len = mbrtowc(&c, s, MB_CUR_MAX, 0);
+      int len = utf8towc(&c, s, MB_CUR_MAX);
 
       if (len < 1) *(new++) = *(s++);
       else {
-- 
cgit v1.2.3