diff options
| -rw-r--r-- | Config.in | 11 | ||||
| -rw-r--r-- | libbb/lineedit.c | 62 | ||||
| -rw-r--r-- | libbb/unicode.c | 12 | ||||
| -rwxr-xr-x | testsuite/ash.tests | 24 | 
4 files changed, 89 insertions, 20 deletions
| @@ -223,6 +223,17 @@ config UNICODE_NEUTRAL_TABLE  	  With this option on, more extensive (and bigger) table  	  of neutral chars will be used. +config UNICODE_PRESERVE_BROKEN +	bool "Make it possible to enter sequences of chars which are not Unicode" +	default n +	depends on UNICODE_SUPPORT +	help +	  With this option on, invalid UTF-8 bytes are not substituted +	  with the selected substitution character. +	  For example, this means that entering 'l', 's', ' ', 0xff, [Enter] +	  at shell prompt will list file named 0xff (single char name +	  with char value 255), not file named '?'. +  config LONG_OPTS  	bool "Support for --long-options"  	default y diff --git a/libbb/lineedit.c b/libbb/lineedit.c index dc90846f9..622f9ddfc 100644 --- a/libbb/lineedit.c +++ b/libbb/lineedit.c @@ -68,7 +68,7 @@  #undef CHAR_T  #if ENABLE_UNICODE_SUPPORT -# define BB_NUL L'\0' +# define BB_NUL ((wchar_t)0)  # define CHAR_T wchar_t  static bool BB_isspace(CHAR_T c) { return ((unsigned)c < 256 && isspace(c)); }  # if ENABLE_FEATURE_EDITING_VI @@ -92,6 +92,14 @@ static bool BB_ispunct(CHAR_T c) { return ((unsigned)c < 256 && ispunct(c)); }  #endif +# if ENABLE_UNICODE_PRESERVE_BROKEN +#  define unicode_mark_inv_wchar(wc)   ((wc) | 0x20000000) +#  define unicode_is_inv_wchar(wc)     ((wc) & 0x20000000) +# else +#  define unicode_is_inv_wchar(wc)     0 +# endif + +  enum {  	/* We use int16_t for positions, need to limit line len */  	MAX_LINELEN = CONFIG_FEATURE_EDITING_MAX_LEN < 0x7ff0 @@ -208,24 +216,58 @@ static size_t load_string(const char *src, int maxsize)  	ssize_t len = mbstowcs(command_ps, src, maxsize - 1);  	if (len < 0)  		len = 0; -	command_ps[len] = L'\0'; +	command_ps[len] = 0;  	return len;  } -static size_t save_string(char *dst, int maxsize) +static unsigned save_string(char *dst, unsigned maxsize)  { +#if !ENABLE_UNICODE_PRESERVE_BROKEN  	ssize_t len = wcstombs(dst, command_ps, maxsize - 1);  	if (len < 0)  		len = 0;  	dst[len] = '\0';  	return len; +#else +	unsigned dstpos = 0; +	unsigned srcpos = 0; + +	maxsize--; +	while (dstpos < maxsize) { +		wchar_t wc; +		int n = srcpos; +		while ((wc = command_ps[srcpos]) != 0 +		    && !unicode_is_inv_wchar(wc) +		) { +			srcpos++; +		} +		command_ps[srcpos] = 0; +		n = wcstombs(dst + dstpos, command_ps + n, maxsize - dstpos); +		if (n < 0) /* should not happen */ +			break; +		dstpos += n; +		if (wc == 0) /* usually is */ +			break; +		/* We do have invalid byte here! */ +		command_ps[srcpos] = wc; /* restore it */ +		srcpos++; +		if (dstpos == maxsize) +			break; +		dst[dstpos++] = (char) wc; +	} +	dst[dstpos] = '\0'; +	return dstpos; +#endif  }  /* I thought just fputwc(c, stdout) would work. But no... */  static void BB_PUTCHAR(wchar_t c)  {  	char buf[MB_CUR_MAX + 1];  	mbstate_t mbst = { 0 }; -	ssize_t len = wcrtomb(buf, c, &mbst); +	ssize_t len; +	if (unicode_is_inv_wchar(c)) +		c = CONFIG_SUBST_WCHAR; +	len = wcrtomb(buf, c, &mbst);  	if (len > 0) {  		buf[len] = '\0';  		fputs(buf, stdout); @@ -238,7 +280,7 @@ static size_t load_string(const char *src, int maxsize)  	return strlen(command_ps);  }  # if ENABLE_FEATURE_TAB_COMPLETION -static void save_string(char *dst, int maxsize) +static void save_string(char *dst, unsigned maxsize)  {  	safe_strncpy(dst, command_ps, maxsize);  } @@ -1719,13 +1761,11 @@ static int lineedit_read_key(char *read_key_buffer)   pushback:  				/* Invalid sequence. Save all "bad bytes" except first */  				read_key_ungets(read_key_buffer, unicode_buf + 1, unicode_idx - 1); -				/* -				 * ic = unicode_buf[0] sounds even better, but currently -				 * this does not work: wchar_t[] -> char[] conversion -				 * when lineedit finishes mangles such "raw bytes" -				 * (by misinterpreting them as unicode chars): -				 */ +# if !ENABLE_UNICODE_PRESERVE_BROKEN  				ic = CONFIG_SUBST_WCHAR; +# else +				ic = unicode_mark_inv_wchar(unicode_buf[0]); +# endif  			} else {  				/* Valid unicode char, return its code */  				ic = wc; diff --git a/libbb/unicode.c b/libbb/unicode.c index 83e70b412..d1c6167c7 100644 --- a/libbb/unicode.c +++ b/libbb/unicode.c @@ -423,7 +423,6 @@ static int wcwidth(unsigned ucs)  # if LAST_SUPPORTED_WCHAR >= 0x300  	/* sorted list of non-overlapping intervals of non-spacing characters */  	/* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ -	static const struct interval combining[] = {  #  define BIG_(a,b) { a, b },  #  define PAIR(a,b)  #  define ARRAY /* PAIR if < 0x4000 and no more than 4 chars big */ \ @@ -557,10 +556,9 @@ static int wcwidth(unsigned ucs)  		BIG_(0xFE20, 0xFE23) \  		BIG_(0xFEFF, 0xFEFF) \  		BIG_(0xFFF9, 0xFFFB) -		ARRAY +	static const struct interval combining[] = { ARRAY };  #  undef BIG_  #  undef PAIR -	};  #  define BIG_(a,b)  #  define PAIR(a,b) (a << 2) | (b-a),  	static const uint16_t combining1[] = { ARRAY }; @@ -668,7 +666,6 @@ int FAST_FUNC unicode_bidi_isrtl(wint_t wc)  	 * http://www.unicode.org/Public/5.2.0/ucd/extracted/DerivedBidiClass.txt  	 * Bidi_Class=Left_To_Right | Bidi_Class=Arabic_Letter  	 */ -	static const struct interval rtl_b[] = {  #  define BIG_(a,b) { a, b },  #  define PAIR(a,b)  #  define ARRAY \ @@ -723,10 +720,9 @@ int FAST_FUNC unicode_bidi_isrtl(wint_t wc)  		{0x10E7F, 0x10FFF},  		{0x1E800, 0x1EFFF}  		*/ -		ARRAY +	static const struct interval rtl_b[] = { ARRAY };  #  undef BIG_  #  undef PAIR -	};  #  define BIG_(a,b)  #  define PAIR(a,b) (a << 2) | (b-a),  	static const uint16_t rtl_p[] = { ARRAY }; @@ -755,7 +751,6 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc)  	 * White_Space, Other_Neutral, European_Number, European_Separator,  	 * European_Terminator, Arabic_Number, Common_Separator  	 */ -	static const struct interval neutral_b[] = {  #  define BIG_(a,b) { a, b },  #  define PAIR(a,b)  #  define ARRAY \ @@ -929,10 +924,9 @@ int FAST_FUNC unicode_bidi_is_neutral_wchar(wint_t wc)  		{0x1F030, 0x1F093},  		{0x1F100, 0x1F10A}  		*/ -		ARRAY +	static const struct interval neutral_b[] = { ARRAY };  #  undef BIG_  #  undef PAIR -	};  #  define BIG_(a,b)  #  define PAIR(a,b) (a << 2) | (b-a),  	static const uint16_t neutral_p[] = { ARRAY }; diff --git a/testsuite/ash.tests b/testsuite/ash.tests index 6b2caf316..ce585beb1 100755 --- a/testsuite/ash.tests +++ b/testsuite/ash.tests @@ -7,8 +7,30 @@  . ./testing.sh +test -f "$bindir/.config" && . "$bindir/.config" +  # testing "test name" "options" "expected result" "file input" "stdin" +if test x"$CONFIG_UNICODE_PRESERVE_BROKEN" = x"y"; then +testing "One byte which is not valid unicode char followed by valid input" \ +	"script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \ +	"\ +00000000  ff 2d 0a                                          |.-.| +00000003 +" \ +	"" \ +	"echo \xff- | hexdump -C >ash.output; exit; exit; exit; exit\n" + +testing "30 bytes which are not valid unicode chars followed by valid input" \ +	"script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \ +	"\ +00000000  ff ff ff ff ff ff ff ff  ff ff ff ff ff ff ff ff  |................| +00000010  ff ff ff ff ff ff ff ff  ff ff ff ff ff ff 2d 0a  |..............-.| +00000020 +" \ +	"" \ +	"echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n" +else  testing "One byte which is not valid unicode char followed by valid input" \  	"script -q -c 'ash' /dev/null >/dev/null; cat ash.output" \  	"\ @@ -27,6 +49,8 @@ testing "30 bytes which are not valid unicode chars followed by valid input" \  " \  	"" \  	"echo \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff- | hexdump -C >ash.output; exit; exit; exit; exit\n" +fi +  # Not sure this behavior is perfect: we lose all invalid input which precedes  # arrow keys and such. In this example, \xff\xff are lost | 
