aboutsummaryrefslogtreecommitdiff
path: root/toys/posix/cut.c
diff options
context:
space:
mode:
authorRob Landley <rob@landley.net>2017-10-25 20:27:33 -0500
committerRob Landley <rob@landley.net>2017-10-25 20:27:33 -0500
commit69bc956f17b107140803ba522bc01ceb83073449 (patch)
tree25913a4a85fa10c109ba091647860793d37b7ff9 /toys/posix/cut.c
parentafde1dbae1d2771f097e2930ba23df88b90f1297 (diff)
downloadtoybox-69bc956f17b107140803ba522bc01ceb83073449.tar.gz
Make -c work with unicode chars, and first stab at -C measuring columns.
Diffstat (limited to 'toys/posix/cut.c')
-rw-r--r--toys/posix/cut.c71
1 files changed, 61 insertions, 10 deletions
diff --git a/toys/posix/cut.c b/toys/posix/cut.c
index f43d3e90..8410e695 100644
--- a/toys/posix/cut.c
+++ b/toys/posix/cut.c
@@ -10,7 +10,7 @@
*
* todo: -n, -s with -c
-USE_CUT(NEWTOY(cut, "b*|c*|f*|F*|O(output-delimiter):d:sDn[!cbf]", TOYFLAG_USR|TOYFLAG_BIN))
+USE_CUT(NEWTOY(cut, "b*|c*|f*|F*|C*|O(output-delimiter):d:sDn[!cbf]", TOYFLAG_USR|TOYFLAG_BIN))
config CUT
bool "cut"
@@ -27,6 +27,7 @@ config CUT
-b select bytes
-c select UTF-8 characters
+ -C select unicode columns
-d use DELIM (default is TAB for -f, run of whitespace for -F)
-D Don't sort/collate selections
-f select fields (words) separated by single DELIM character
@@ -40,12 +41,36 @@ config CUT
GLOBALS(
char *d;
char *O;
- struct arg_list *select[4]; // we treat them the same, so loop through
+ struct arg_list *select[5]; // we treat them the same, so loop through
int pairs;
regex_t reg;
)
+// Return how many leading bytes of start hold the characters that fit in "columns" display columns (measured by wcwidth()).
+// Undecodable bytes are stepped over one at a time; they and non-positive-width chars only count if a later char still fits.
+int unicolumns(char *start, unsigned columns)
+{
+ int i, j = 0; // i: utf8towc() result, then reused for the char's width; j: columns used so far
+ wchar_t wc;
+ char *s = start, *ss = start; // s: scan position; ss: just past the last char known to fit
+
+ // Scan until the column budget is used or utf8towc() returns 0 (presumably end of string - confirm), rounding down at a multicolumn char
+ while (j<columns && (i = utf8towc(&wc, s, 4))) {
+ if (i<0) s++; // negative result is treated as an invalid sequence: advance one byte
+ else {
+ s += i;
+ if (0<(i = wcwidth(wc))) { // only chars with positive width consume columns or move ss
+ if ((j += i)>columns) break; // char would straddle the limit: leave it out
+ ss = s; // char fits entirely, so the result now extends past it
+ }
+ }
+ }
+
+ return ss-start;
+}
+
+
// Apply selections to an input line, producing output
static void cut_line(char **pline, long len)
{
@@ -67,13 +92,39 @@ static void cut_line(char **pline, long len)
// Find start and end of output string for the relevant selection type
if (toys.optflags&FLAG_b) s += start;
- else if (toys.optflags&FLAG_c) {
- if (start) crunch_str(&s, start, 0, 0, 0);
- if (!*s) continue;
- start = s-line;
- ss = s;
- crunch_str(&ss, count, 0, 0, 0);
- count = ss-s;
+ else if (toys.optflags&FLAG_C) {
+ // crunch_str() currently assumes that combining characters get
+ // escaped, to provide an unambiguous visual representation.
+ // This assumes the input string is null terminated.
+ //if (start) crunch_str(&s, start, 0, 0, 0);
+ //if (!*s) continue;
+ //start = s-line;
+ //ss = s;
+ //crunch_str(&ss, count, 0, 0, 0);
+ //count = ss-s;
+
+ s += unicolumns(s, start);
+ count = unicolumns(s, end-start);
+ } else if (toys.optflags&FLAG_c) {
+ wchar_t wc;
+ char *sss;
+
+ // Find start
+ ss = line+len;
+ while (start && s<ss) {
+ if (0<=(j = utf8towc(&wc, s, len))) start--;
+ s += (j<1) ? 1 : j;
+ }
+ if (s == ss) continue;
+
+ // Find end
+ end = count;
+ sss = s;
+ while (end && sss<ss) {
+ if (0<=(j = utf8towc(&wc, sss, len))) end--;
+ sss += (j<1) ? 1 : j;
+ }
+ count = sss-s;
} else {
regmatch_t match;
@@ -163,7 +214,7 @@ void cut_main(void)
// Parse ranges, which are attached to a selection type (only one can be set)
for (i = 0; i<ARRAY_LEN(TT.select); i++) {
- sprintf(buf, "bad -%c", "Ffcb"[i]);
+ sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr
if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range);
}
if (!TT.pairs) error_exit("no selections");