From 3d5b60693109dc5bdaa977b9a25d715a24a33133 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Sun, 31 Jan 2010 05:55:55 +0100
Subject: ls: fix handling of broken unicode sequences

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/unicode.c    | 47 +++++++++++++++++++++++++----------------------
 testsuite/ls.tests | 46 ++++++++++++++++++++++++----------------------
 2 files changed, 49 insertions(+), 44 deletions(-)

diff --git a/libbb/unicode.c b/libbb/unicode.c
index 4e7e3a96a..7c41ef30b 100644
--- a/libbb/unicode.c
+++ b/libbb/unicode.c
@@ -139,6 +139,8 @@ size_t FAST_FUNC wcstombs(char *dest, const wchar_t *src, size_t n)
 	return org_n - n;
 }
 
+#define ERROR_WCHAR (~(wchar_t)0)
+
 static const char *mbstowc_internal(wchar_t *res, const char *src)
 {
 	int bytes;
@@ -159,16 +161,22 @@ static const char *mbstowc_internal(wchar_t *res, const char *src)
 		c <<= 1;
 		bytes++;
 	} while ((c & 0x80) && bytes < 6);
-	if (bytes == 1)
-		return NULL;
+	if (bytes == 1) {
+		/* A bare "continuation" byte. Say, 80 */
+		*res = ERROR_WCHAR;
+		return src;
+	}
 	c = (uint8_t)(c) >> bytes;
 
 	while (--bytes) {
-		unsigned ch = (unsigned char) *src++;
+		unsigned ch = (unsigned char) *src;
 		if ((ch & 0xc0) != 0x80) {
-			return NULL;
+			/* Missing "continuation" byte. Example: e0 80 */
+			*res = ERROR_WCHAR;
+			return src;
 		}
 		c = (c << 6) + (ch & 0x3f);
+		src++;
 	}
 
 	/* TODO */
@@ -177,8 +185,8 @@ static const char *mbstowc_internal(wchar_t *res, const char *src)
 	/* 11110000 10000000 10000100 10000000 converts to 0x100 */
 	/* correct encoding: 11000100 10000000 */
 	if (c <= 0x7f) { /* crude check */
-		return NULL;
-		//or maybe 0xfffd; /* replacement character */
+		*res = ERROR_WCHAR;
+		return src;
 	}
 
 	*res = c;
@@ -204,7 +212,7 @@ size_t FAST_FUNC mbstowcs(wchar_t *dest, const char *src, size_t n)
 	while (n) {
 		wchar_t wc;
 		src = mbstowc_internal(&wc, src);
-		if (src == NULL) /* error */
+		if (wc == ERROR_WCHAR) /* error */
 			return (size_t) -1L;
 		if (dest)
 			*dest++ = wc;
@@ -312,20 +320,15 @@ static char* FAST_FUNC unicode_conv_to_printable2(uni_stat_t *stats, const char
 				goto subst;
 		}
 #else
-		{
-			const char *src1 = mbstowc_internal(&wc, src);
-			/* src = NULL: invalid sequence is seen,
-			 * else: wc is set, src is advanced to next mb char
-			 */
-			if (src1) { /* no error */
-				if (wc == 0) /* end-of-string */
-					break;
-				src = src1;
-			} else { /* error */
-				src++;
-				goto subst;
-			}
-		}
+		src = mbstowc_internal(&wc, src);
+		/* src is advanced to next mb char
+		 * wc == ERROR_WCHAR: invalid sequence is seen
+		 * else: wc is set
+		 */
+		if (wc == ERROR_WCHAR) /* error */
+			goto subst;
+		if (wc == 0) /* end-of-string */
+			break;
 #endif
 		if (CONFIG_LAST_SUPPORTED_WCHAR && wc > CONFIG_LAST_SUPPORTED_WCHAR)
 			goto subst;
@@ -411,7 +414,7 @@ unsigned FAST_FUNC unicode_padding_to_width(unsigned width, const char *src)
 		}
 #else
 		src = mbstowc_internal(&wc, src);
-		if (!src || wc == 0) /* error, or end-of-string */
+		if (wc == ERROR_WCHAR || wc == 0) /* error, or end-of-string */
 			return width;
 #endif
 		w = wcwidth(wc);
diff --git a/testsuite/ls.tests b/testsuite/ls.tests
index b0c5da7f9..8d0f2c2ea 100755
--- a/testsuite/ls.tests
+++ b/testsuite/ls.tests
@@ -11,9 +11,11 @@ mkdir ls.testdir || exit 1
 
 # testing "test name" "command" "expected result" "file input" "stdin"
 
-# The test isn't passing correctly now - all | chars should line up
-# perfectly in the correctly passed test.
+# With Unicode provided by libc locale, I'm not sure this test can pass.
+# I suspect we might fail to skip exactly correct number of bytes
+# over broked unicode sequences.
 test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
+&& test x"$CONFIG_LOCALE_SUPPORT" != x"y" \
 && test x"$CONFIG_SUBST_WCHAR" = x"63" \
 && test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \
 && testing "ls unicode test" \
@@ -73,40 +75,40 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \
 0053____"?_?_"____________________________________________________________|
 0054_3.3__Sequences_with_last_continuation_byte_missing___________________|
 0055_3.3.1__2-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
-0056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"??"______|
-0057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"???"______|
-0058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"????"______|
-0059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"?????"______|
+0056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
+0057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
+0058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
+0059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______|
 0060_3.3.6__2-byte_sequence_with_last_byte_missing__U-000007FF_:_"?"______|
-0061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"??"______|
-0062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"???"______|
-0063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"????"______|
-0064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"?????"______|
+0061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"?"______|
+0062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"?"______|
+0063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"?"______|
+0064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"?"______|
 0065_3.4__Concatenation_of_incomplete_sequences___________________________|
-0066____"??????????????????????????????"______________________________________________________|
+0066____"??????????"______________________________________________________|
 0067_3.5__Impossible_bytes________________________________________________|
 0068_3.5.1__fe_=_"?"______________________________________________________|
 0069_3.5.2__ff_=_"?"______________________________________________________|
 0070_3.5.3__fe_fe_ff_ff_=_"????"__________________________________________|
 0071_4__Overlong_sequences________________________________________________|
 0072_4.1__Examples_of_an_overlong_ASCII_character_________________________|
-0073_4.1.1_U+002F_=_c0_af_____________=_"??"_______________________________|
-0074_4.1.2_U+002F_=_e0_80_af__________=_"???"_______________________________|
-0075_4.1.3_U+002F_=_f0_80_80_af_______=_"????"_______________________________|
-0076_4.1.4_U+002F_=_f8_80_80_80_af____=_"?????"_______________________________|
-0077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"??????"_______________________________|
+0073_4.1.1_U+002F_=_c0_af_____________=_"?"_______________________________|
+0074_4.1.2_U+002F_=_e0_80_af__________=_"?"_______________________________|
+0075_4.1.3_U+002F_=_f0_80_80_af_______=_"?"_______________________________|
+0076_4.1.4_U+002F_=_f8_80_80_80_af____=_"?"_______________________________|
+0077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"?"_______________________________|
 0078_4.2__Maximum_overlong_sequences______________________________________|
-0079_4.2.1__U-0000007F_=_c1_bf_____________=_"??"__________________________|
+0079_4.2.1__U-0000007F_=_c1_bf_____________=_"?"__________________________|
 0080_4.2.2__U-000007FF_=_e0_9f_bf__________=_"?"__________________________|
 0081_4.2.3__U-0000FFFF_=_f0_8f_bf_bf_______=_"?"__________________________|
 0082_4.2.4__U-001FFFFF_=_f8_87_bf_bf_bf____=_"?"__________________________|
 0083_4.2.5__U-03FFFFFF_=_fc_83_bf_bf_bf_bf_=_"?"__________________________|
 0084_4.3__Overlong_representation_of_the_NUL_character____________________|
-0085_4.3.1__U+0000_=_c0_80_____________=_"??"______________________________|
-0086_4.3.2__U+0000_=_e0_80_80__________=_"???"______________________________|
-0087_4.3.3__U+0000_=_f0_80_80_80_______=_"????"______________________________|
-0088_4.3.4__U+0000_=_f8_80_80_80_80____=_"?????"______________________________|
-0089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"??????"______________________________|
+0085_4.3.1__U+0000_=_c0_80_____________=_"?"______________________________|
+0086_4.3.2__U+0000_=_e0_80_80__________=_"?"______________________________|
+0087_4.3.3__U+0000_=_f0_80_80_80_______=_"?"______________________________|
+0088_4.3.4__U+0000_=_f8_80_80_80_80____=_"?"______________________________|
+0089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"?"______________________________|
 0090_5__Illegal_code_positions____________________________________________|
 0091_5.1_Single_UTF-16_surrogates_________________________________________|
 0092_5.1.1__U+D800_=_ed_a0_80_=_"?"_______________________________________|
-- 
cgit v1.2.3