From a659b81dfa435aa19130a8c7dd1bfe8fa9a22131 Mon Sep 17 00:00:00 2001
From: Tomas Heinrich <heinrich.tomas@gmail.com>
Date: Thu, 29 Apr 2010 13:43:39 +0200
Subject: libbb/lineedit: add support for preserving "broken" (non-unicode)
 chars

Signed-off-by: Tomas Heinrich <heinrich.tomas@gmail.com>
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/lineedit.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++----------
 libbb/unicode.c  | 12 +++--------
 2 files changed, 54 insertions(+), 20 deletions(-)

(limited to 'libbb')

diff --git a/libbb/lineedit.c b/libbb/lineedit.c
index dc90846f9..622f9ddfc 100644
--- a/libbb/lineedit.c
+++ b/libbb/lineedit.c
@@ -68,7 +68,7 @@
 
 #undef CHAR_T
 #if ENABLE_UNICODE_SUPPORT
-# define BB_NUL L'\0'
+# define BB_NUL ((wchar_t)0)
 # define CHAR_T wchar_t
 static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); }
 # if ENABLE_FEATURE_EDITING_VI
@@ -92,6 +92,14 @@ static bool BB_ispunct(CHAR_T c) { return ((unsigned)c < 256 && ispunct(c)); }
 #endif
 
 
+# if ENABLE_UNICODE_PRESERVE_BROKEN
+#  define unicode_mark_inv_wchar(wc)   ((wc) | 0x20000000) /* tag wc as a raw byte of an invalid sequence (sets bit 0x20000000) */
+#  define unicode_is_inv_wchar(wc)     ((wc) & 0x20000000) /* nonzero iff wc carries that tag bit */
+# else
+#  define unicode_is_inv_wchar(wc)     0 /* feature disabled: no wchar is ever reported as tagged */
+# endif
+
+
 enum {
 	/* We use int16_t for positions, need to limit line len */
 	MAX_LINELEN = CONFIG_FEATURE_EDITING_MAX_LEN < 0x7ff0
@@ -208,24 +216,58 @@ static size_t load_string(const char *src, int maxsize)
 	ssize_t len = mbstowcs(command_ps, src, maxsize - 1);
 	if (len < 0)
 		len = 0;
-	command_ps[len] = L'\0';
+	command_ps[len] = 0;
 	return len;
 }
-static size_t save_string(char *dst, int maxsize)
+static unsigned save_string(char *dst, unsigned maxsize) /* command_ps -> dst; returns bytes stored, NUL not counted */
 {
+#if !ENABLE_UNICODE_PRESERVE_BROKEN /* plain case: one wcstombs() over the whole line */
 	ssize_t len = wcstombs(dst, command_ps, maxsize - 1);
 	if (len < 0)
 		len = 0;
 	dst[len] = '\0';
 	return len;
+#else /* convert runs of valid wchars; copy tagged invalid bytes through as-is */
+	unsigned dstpos = 0;
+	unsigned srcpos = 0;
+
+	maxsize--; /* keep one byte for the terminating NUL */
+	while (dstpos < maxsize) {
+		wchar_t wc;
+		int n = srcpos; /* start of the current run of valid wchars */
+		while ((wc = command_ps[srcpos]) != 0
+		    && !unicode_is_inv_wchar(wc)
+		) {
+			srcpos++;
+		}
+		command_ps[srcpos] = 0; /* temporarily end the run so wcstombs() stops here */
+		n = wcstombs(dst + dstpos, command_ps + n, maxsize - dstpos);
+		command_ps[srcpos] = wc; /* restore it, also when the conversion failed */
+		if (n < 0) /* should not happen */
+			break;
+		dstpos += n;
+		if (wc == 0) /* usually is */
+			break;
+		/* We do have an invalid byte here! Step over it and emit it raw */
+		srcpos++;
+		if (dstpos == maxsize)
+			break;
+		dst[dstpos++] = (char) wc; /* the cast drops the tag bit, leaving the original byte */
+	}
+	dst[dstpos] = '\0';
+	return dstpos;
+#endif
 }
 /* I thought just fputwc(c, stdout) would work. But no... */
 static void BB_PUTCHAR(wchar_t c)
 {
 	char buf[MB_CUR_MAX + 1];
 	mbstate_t mbst = { 0 };
-	ssize_t len = wcrtomb(buf, c, &mbst);
+	ssize_t len;
 
+	if (unicode_is_inv_wchar(c))
+		c = CONFIG_SUBST_WCHAR;
+	len = wcrtomb(buf, c, &mbst);
 	if (len > 0) {
 		buf[len] = '\0';
 		fputs(buf, stdout);
@@ -238,7 +280,7 @@ static size_t load_string(const char *src, int maxsize)
 	return strlen(command_ps);
 }
 # if ENABLE_FEATURE_TAB_COMPLETION
-static void save_string(char *dst, int maxsize)
+static void save_string(char *dst, unsigned maxsize) /* copies command_ps into dst via safe_strncpy(), passing maxsize as the limit */
 {
 	safe_strncpy(dst, command_ps, maxsize);
 }
@@ -1719,13 +1761,11 @@ static int lineedit_read_key(char *read_key_buffer)
  pushback:
 				/* Invalid sequence. Save all "bad bytes" except first */
 				read_key_ungets(read_key_buffer, unicode_buf + 1, unicode_idx - 1);
-				/*
-				 * ic = unicode_buf[0] sounds even better, but currently
-				 * this does not work: wchar_t[] -> char[] conversion
-				 * when lineedit finishes mangles such "raw bytes"
-				 * (by misinterpreting them as unicode chars):
-				 */
+# if !ENABLE_UNICODE_PRESERVE_BROKEN
 				ic = CONFIG_SUBST_WCHAR;
+# else
+				ic = unicode_mark_inv_wchar(unicode_buf[0]);
+# endif
 			} else {
 				/* Valid unicode char, return its code */
 				ic = wc;
diff --git a/libbb/unicode.c b/libbb/unicode.c
index 83e70b412..d1c6167c7 100644
--- a/libbb/unicode.c
+++ b/libbb/unicode.c
@@ -423,7 +423,6 @@ static int wcwidth(unsigned ucs)
 # if LAST_SUPPORTED_WCHAR >= 0x300
 	/* sorted list of non-overlapping intervals of non-spacing characters */
 	/* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
-	static const struct interval combining[] = {
 #  define BIG_(a,b) { a, b },
 #  define PAIR(a,b)
 #  define ARRAY /* PAIR if < 0x4000 and no more than 4 chars big */ \
@@ -557,10 +556,9 @@ static int wcwidth(unsigned ucs)
 		BIG_(0xFE20, 0xFE23) \
 		BIG_(0xFEFF, 0xFEFF) \
 		BIG_(0xFFF9, 0xFFFB)
-		ARRAY
+	static const struct interval combining[] = { ARRAY };
 #  undef BIG_
 #  undef PAIR
-	};
 #  define BIG_(a,b)
 #  define PAIR(a,b) (a << 2) | (b-a),
 	static const uint16_t combining1[] = { ARRAY };
@@ -668,7 +666,6 @@ int FAST_FUNC unicode_bidi_isrtl(wint_t wc)
 	 * http://www.unicode.org/Public/5.2.0/ucd/extracted/DerivedBidiClass.txt
 	 * Bidi_Class=Left_To_Right | Bidi_Class=Arabic_Letter
 	 */
-	static const struct interval rtl_b[] = {
 #  define BIG_(a,b) { a, b },
 #  define PAIR(a,b)
 #  define ARRAY \
@@ -723,10 +720,9 @@ int FAST_FUNC unicode_bidi_isrtl(wint_t wc)
 		{0x10E7F, 0x10FFF},
 		{0x1E800, 0x1EFFF}
 		*/
-		ARRAY
+	static const struct interval rtl_b[] = { ARRAY };
 #  undef BIG_
 #  undef PAIR
-	};
 #  define BIG_(a,b)
 #  define PAIR(a,b) (a << 2) | (b-a),
 	static const uint16_t rtl_p[] = { ARRAY };
@@ -755,7 +751,6 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc)
 	 * White_Space, Other_Neutral, European_Number, European_Separator,
 	 * European_Terminator, Arabic_Number, Common_Separator
 	 */
-	static const struct interval neutral_b[] = {
 #  define BIG_(a,b) { a, b },
 #  define PAIR(a,b)
 #  define ARRAY \
@@ -929,10 +924,9 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc)
 		{0x1F030, 0x1F093},
 		{0x1F100, 0x1F10A}
 		*/
-		ARRAY
+	static const struct interval neutral_b[] = { ARRAY };
 #  undef BIG_
 #  undef PAIR
-	};
 #  define BIG_(a,b)
 #  define PAIR(a,b) (a << 2) | (b-a),
 	static const uint16_t neutral_p[] = { ARRAY };
-- 
cgit v1.2.3