diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2010-01-31 16:04:30 +0100 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2010-01-31 16:04:30 +0100 |
commit | 40e4e88a28398c49d326b0fdf0d7f100f08b8f8d (patch) | |
tree | 89e7c1880d057393ee6a5596bee77d802b882c3f | |
parent | 344a44fbc5a236a06d840e7776ccbcc4702efa7f (diff) | |
download | busybox-40e4e88a28398c49d326b0fdf0d7f100f08b8f8d.tar.gz |
exclude more invalid unicode chars
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | libbb/unicode_wcwidth.c | 72 | ||||
-rwxr-xr-x | testsuite/ls.tests | 121 |
2 files changed, 183 insertions, 10 deletions
diff --git a/libbb/unicode_wcwidth.c b/libbb/unicode_wcwidth.c index ab62b18f6..410c741ac 100644 --- a/libbb/unicode_wcwidth.c +++ b/libbb/unicode_wcwidth.c @@ -59,8 +59,39 @@ * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c */ -#if CONFIG_LAST_SUPPORTED_WCHAR == 0 -# define LAST_SUPPORTED_WCHAR ((1 << 31) - 1) +/* Assigned Unicode character ranges: + * Plane Range + * 0 0000–FFFF Basic Multilingual Plane + * 1 10000–1FFFF Supplementary Multilingual Plane + * 2 20000–2FFFF Supplementary Ideographic Plane + * 3 30000-3FFFF Tertiary Ideographic Plane (no chars assigned yet) + * 4-13 40000–DFFFF currently unassigned + * 14 E0000–EFFFF Supplementary Special-purpose Plane + * 15 F0000–FFFFF Supplementary Private Use Area-A + * 16 100000–10FFFF Supplementary Private Use Area-B + * + * "Supplementary Special-purpose Plane currently contains non-graphical + * characters in two blocks of 128 and 240 characters. The first block + * is for language tag characters for use when language cannot be indicated + * through other protocols (such as the xml:lang attribute in XML). + * The other block contains glyph variation selectors to indicate + * an alternate glyph for a character that cannot be determined by context." + * + * In simpler terms: it is a tool to fix the "Han unification" mess + * created by Unicode committee, to select Chinese/Japanese/Korean/Taiwan + * version of a character. (They forgot that the whole purpose of the Unicode + * was to be able to write all chars in one charset without such tricks). + * Until East Asian users say it is actually necessary to support these + * code points in console applications like busybox + * (i.e. do these chars ever appear in filenames, hostnames, text files + * and such?), we are treating these code points as invalid. + * + * Tertiary Ideographic Plane is also ignored for now, + * until Unicode committee assigns something there. + */ + +#if CONFIG_LAST_SUPPORTED_WCHAR < 126 || CONFIG_LAST_SUPPORTED_WCHAR > 0x30000 +# define LAST_SUPPORTED_WCHAR 0x30000 #else # define LAST_SUPPORTED_WCHAR CONFIG_LAST_SUPPORTED_WCHAR #endif @@ -429,7 +460,8 @@ static int wcwidth(unsigned ucs) #undef BIG_ #undef PAIR }; -# if LAST_SUPPORTED_WCHAR >= 0x1100 +# if LAST_SUPPORTED_WCHAR >= 0x10000 + /* Combining chars in Supplementary Multilingual Plane 0x1xxxx */ static const struct interval combining0x10000[] = { { 0x0A01, 0x0A03 }, { 0x0A05, 0x0A06 }, { 0x0A0C, 0x0A0F }, { 0x0A38, 0x0A3A }, { 0x0A3F, 0x0A3F }, { 0xD167, 0xD169 }, @@ -462,12 +494,35 @@ static int wcwidth(unsigned ucs) # if LAST_SUPPORTED_WCHAR < 0x1100 return -1; # else - /* binary search in table of non-spacing characters, cont. */ + if (ucs >= LAST_SUPPORTED_WCHAR) + return -1; + + /* High (d800..dbff) and low (dc00..dfff) surrogates are invalid (used only by UTF16) */ + /* We also exclude Private Use Area (e000..f8ff) */ + if (LAST_SUPPORTED_WCHAR >= 0xd800 + && (ucs >= 0xd800 || ucs <= 0xf8ff) + ) { + return -1; + } + + /* 0xfffe and 0xffff in every plane are invalid */ + if (LAST_SUPPORTED_WCHAR >= 0xfffe + && (ucs & 0xfffe) == 0xfffe + ) { + return -1; + } + +# if LAST_SUPPORTED_WCHAR >= 0x10000 + /* binary search in table of non-spacing characters in Supplementary Multilingual Plane */ if (in_interval_table(ucs ^ 0x10000, combining0x10000, ARRAY_SIZE(combining0x10000) - 1)) return 0; - if (ucs == 0xE0001 - || (ucs >= 0xE0020 && ucs <= 0xE007F) - || (ucs >= 0xE0100 && ucs <= 0xE01EF) +# endif + /* Check a few non-spacing chars in Supplementary Special-purpose Plane 0xExxxx */ + if (LAST_SUPPORTED_WCHAR >= 0xE0001 + && ( ucs == 0xE0001 + || (ucs >= 0xE0020 && ucs <= 0xE007F) + || (ucs >= 0xE0100 && ucs <= 0xE01EF) + ) ) { return 0; } @@ -485,8 +540,7 @@ static int wcwidth(unsigned ucs) || (ucs >= 0xfe30 && ucs <= 0xfe6f) /* CJK Compatibility Forms */ || (ucs >= 0xff00 && ucs <= 0xff60) /* Fullwidth Forms */ || (ucs >= 0xffe0 && ucs <= 0xffe6) - || (ucs >= 0x20000 && ucs <= 0x2fffd) - || (ucs >= 0x30000 && ucs <= 0x3fffd) + || ((ucs >> 17) == (2 >> 1)) /* 20000..3ffff: Supplementary and Tertiary Ideographic Planes */ ); # endif #endif diff --git a/testsuite/ls.tests b/testsuite/ls.tests index 60f3eb50f..e08249ea6 100755 --- a/testsuite/ls.tests +++ b/testsuite/ls.tests @@ -18,7 +18,7 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ && test x"$CONFIG_LOCALE_SUPPORT" != x"y" \ && test x"$CONFIG_SUBST_WCHAR" = x"63" \ && test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"767" \ -&& testing "ls unicode test" \ +&& testing "ls unicode test with codepoints limited to 767" \ "(cd ls.testdir && sh ../ls.mk_uni_tests) && ls -1 ls.testdir" \ '0001_1__Some_correct_UTF-8_text___________________________________________| 0002_2__Boundary_condition_test_cases_____________________________________| @@ -132,6 +132,125 @@ test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ 0110_5.3.2__U+FFFF_=_ef_bf_bf_=_"?"_______________________________________| ' "" "" +# Currently fails on "0080_4.2.2__U-000007FF_=_e0_9f_bf" line +test x"$CONFIG_FEATURE_ASSUME_UNICODE" = x"y" \ +&& test x"$CONFIG_LOCALE_SUPPORT" != x"y" \ +&& test x"$CONFIG_SUBST_WCHAR" = x"63" \ +&& test x"$CONFIG_LAST_SUPPORTED_WCHAR" = x"0" \ +&& testing "ls unicode test with unlimited codepoints" \ +"(cd ls.testdir && sh ../ls.mk_uni_tests) && ls -1 ls.testdir" \ +'0001_1__Some_correct_UTF-8_text___________________________________________| +0002_2__Boundary_condition_test_cases_____________________________________| +0003_2.1__First_possible_sequence_of_a_certain_length_____________________| +0004_2.1.2__2_bytes__U-00000080_:________"?"______________________________| +0005_2.1.3__3_bytes__U-00000800_:________"ࠀ"______________________________| +0006_2.1.4__4_bytes__U-00010000_:________"?"______________________________| +0007_2.1.5__5_bytes__U-00200000_:________"?"______________________________| +0008_2.1.6__6_bytes__U-04000000_:________"?"______________________________| +0009_2.2__Last_possible_sequence_of_a_certain_length______________________| +0010_2.2.1__1_byte___U-0000007F_:________"?"______________________________| +0011_2.2.2__2_bytes__U-000007FF_:________"߿"______________________________| +0012_2.2.3__3_bytes__U-0000FFFF_:________"?"______________________________| +0013_2.2.4__4_bytes__U-001FFFFF_:________"?"______________________________| +0014_2.2.5__5_bytes__U-03FFFFFF_:________"?"______________________________| +0015_2.2.6__6_bytes__U-7FFFFFFF_:________"?"______________________________| +0016_2.3__Other_boundary_conditions_______________________________________| +0017_2.3.1__U-0000D7FF_=_ed_9f_bf_=_"?"___________________________________| +0018_2.3.2__U-0000E000_=_ee_80_80_=_"?"___________________________________| +0019_2.3.3__U-0000FFFD_=_ef_bf_bd_=_"?"___________________________________| +0020_2.3.4__U-0010FFFF_=_f4_8f_bf_bf_=_"?"________________________________| +0021_2.3.5__U-00110000_=_f4_90_80_80_=_"?"________________________________| +0022_3__Malformed_sequences_______________________________________________| +0023_3.1__Unexpected_continuation_bytes___________________________________| +0024_3.1.1__First_continuation_byte_0x80:_"?"_____________________________| +0025_3.1.2__Last__continuation_byte_0xbf:_"?"_____________________________| +0026_3.1.3__2_continuation_bytes:_"??"____________________________________| +0027_3.1.4__3_continuation_bytes:_"???"___________________________________| +0028_3.1.5__4_continuation_bytes:_"????"__________________________________| +0029_3.1.6__5_continuation_bytes:_"?????"_________________________________| +0030_3.1.7__6_continuation_bytes:_"??????"________________________________| +0031_3.1.8__7_continuation_bytes:_"???????"_______________________________| +0032_3.1.9__Sequence_of_all_64_possible_continuation_bytes__0x80-0xbf_:___| +0033____"????????????????_________________________________________________| +0034_____????????????????_________________________________________________| +0035_____????????????????_________________________________________________| +0036_____????????????????"________________________________________________| +0037_3.2__Lonely_start_characters_________________________________________| +0038_3.2.1__All_32_first_bytes_of_2-byte_sequences__0xc0-0xdf_,___________| +0039________each_followed_by_a_space_character:___________________________| +0040____"?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?__________________________________| +0041_____?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_"________________________________| +0042_3.2.2__All_16_first_bytes_of_3-byte_sequences__0xe0-0xef_,___________| +0043________each_followed_by_a_space_character:___________________________| +0044____"?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_?_"________________________________| +0045_3.2.3__All_8_first_bytes_of_4-byte_sequences__0xf0-0xf7_,____________| +0046________each_followed_by_a_space_character:___________________________| +0047____"?_?_?_?_?_?_?_?_"________________________________________________| +0048_3.2.4__All_4_first_bytes_of_5-byte_sequences__0xf8-0xfb_,____________| +0049________each_followed_by_a_space_character:___________________________| +0050____"?_?_?_?_"________________________________________________________| +0051_3.2.5__All_2_first_bytes_of_6-byte_sequences__0xfc-0xfd_,____________| +0052________each_followed_by_a_space_character:___________________________| +0053____"?_?_"____________________________________________________________| +0054_3.3__Sequences_with_last_continuation_byte_missing___________________| +0055_3.3.1__2-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| +0056_3.3.2__3-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| +0057_3.3.3__4-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| +0058_3.3.4__5-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| +0059_3.3.5__6-byte_sequence_with_last_byte_missing__U+0000_:_____"?"______| +0060_3.3.6__2-byte_sequence_with_last_byte_missing__U-000007FF_:_"?"______| +0061_3.3.7__3-byte_sequence_with_last_byte_missing__U-0000FFFF_:_"?"______| +0062_3.3.8__4-byte_sequence_with_last_byte_missing__U-001FFFFF_:_"?"______| +0063_3.3.9__5-byte_sequence_with_last_byte_missing__U-03FFFFFF_:_"?"______| +0064_3.3.10_6-byte_sequence_with_last_byte_missing__U-7FFFFFFF_:_"?"______| +0065_3.4__Concatenation_of_incomplete_sequences___________________________| +0066____"??????????"______________________________________________________| +0067_3.5__Impossible_bytes________________________________________________| +0068_3.5.1__fe_=_"?"______________________________________________________| +0069_3.5.2__ff_=_"?"______________________________________________________| +0070_3.5.3__fe_fe_ff_ff_=_"????"__________________________________________| +0071_4__Overlong_sequences________________________________________________| +0072_4.1__Examples_of_an_overlong_ASCII_character_________________________| +0073_4.1.1_U+002F_=_c0_af_____________=_"?"_______________________________| +0074_4.1.2_U+002F_=_e0_80_af__________=_"?"_______________________________| +0075_4.1.3_U+002F_=_f0_80_80_af_______=_"?"_______________________________| +0076_4.1.4_U+002F_=_f8_80_80_80_af____=_"?"_______________________________| +0077_4.1.5_U+002F_=_fc_80_80_80_80_af_=_"?"_______________________________| +0078_4.2__Maximum_overlong_sequences______________________________________| +0079_4.2.1__U-0000007F_=_c1_bf_____________=_"?"__________________________| +0080_4.2.2__U-000007FF_=_e0_9f_bf__________=_"?"__________________________| +0081_4.2.3__U-0000FFFF_=_f0_8f_bf_bf_______=_"?"__________________________| +0082_4.2.4__U-001FFFFF_=_f8_87_bf_bf_bf____=_"?"__________________________| +0083_4.2.5__U-03FFFFFF_=_fc_83_bf_bf_bf_bf_=_"?"__________________________| +0084_4.3__Overlong_representation_of_the_NUL_character____________________| +0085_4.3.1__U+0000_=_c0_80_____________=_"?"______________________________| +0086_4.3.2__U+0000_=_e0_80_80__________=_"?"______________________________| +0087_4.3.3__U+0000_=_f0_80_80_80_______=_"?"______________________________| +0088_4.3.4__U+0000_=_f8_80_80_80_80____=_"?"______________________________| +0089_4.3.5__U+0000_=_fc_80_80_80_80_80_=_"?"______________________________| +0090_5__Illegal_code_positions____________________________________________| +0091_5.1_Single_UTF-16_surrogates_________________________________________| +0092_5.1.1__U+D800_=_ed_a0_80_=_"?"_______________________________________| +0093_5.1.2__U+DB7F_=_ed_ad_bf_=_"?"_______________________________________| +0094_5.1.3__U+DB80_=_ed_ae_80_=_"?"_______________________________________| +0095_5.1.4__U+DBFF_=_ed_af_bf_=_"?"_______________________________________| +0096_5.1.5__U+DC00_=_ed_b0_80_=_"?"_______________________________________| +0097_5.1.6__U+DF80_=_ed_be_80_=_"?"_______________________________________| +0098_5.1.7__U+DFFF_=_ed_bf_bf_=_"?"_______________________________________| +0099_5.2_Paired_UTF-16_surrogates_________________________________________| +0100_5.2.1__U+D800_U+DC00_=_ed_a0_80_ed_b0_80_=_"??"______________________| +0101_5.2.2__U+D800_U+DFFF_=_ed_a0_80_ed_bf_bf_=_"??"______________________| +0102_5.2.3__U+DB7F_U+DC00_=_ed_ad_bf_ed_b0_80_=_"??"______________________| +0103_5.2.4__U+DB7F_U+DFFF_=_ed_ad_bf_ed_bf_bf_=_"??"______________________| +0104_5.2.5__U+DB80_U+DC00_=_ed_ae_80_ed_b0_80_=_"??"______________________| +0105_5.2.6__U+DB80_U+DFFF_=_ed_ae_80_ed_bf_bf_=_"??"______________________| +0106_5.2.7__U+DBFF_U+DC00_=_ed_af_bf_ed_b0_80_=_"??"______________________| +0107_5.2.8__U+DBFF_U+DFFF_=_ed_af_bf_ed_bf_bf_=_"??"______________________| +0108_5.3_Other_illegal_code_positions_____________________________________| +0109_5.3.1__U+FFFE_=_ef_bf_be_=_"?"_______________________________________| +0110_5.3.2__U+FFFF_=_ef_bf_bf_=_"?"_______________________________________| +' "" "" + # Clean up rm -rf ls.testdir 2>/dev/null |