aboutsummaryrefslogtreecommitdiff
path: root/toys/posix/cut.c
blob: 9f7f7458f3e5e3e92f2e85430916d5512d77561f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
/* cut.c - print selected ranges from a file
 *
 * Copyright 2016 Rob Landley <rob@landley.net>
 *
 * http://pubs.opengroup.org/onlinepubs/9699919799/utilities/cut.html
 *
 * Deviations from posix: added -DF. We can only accept 512 selections, and
 * "-" counts as start to end. Using spaces to separate a comma-separated list
 * is silly and inconsistent with dd, ps, cp, and mount.
 *
 * todo: -n, -s with -c

USE_CUT(NEWTOY(cut, "b*|c*|f*|F*|C*|O(output-delimiter):d:sDn[!cbf]", TOYFLAG_USR|TOYFLAG_BIN))

config CUT
  bool "cut"
  default y
  help
    usage: cut [-Ds] [-bcfF LIST] [-dO DELIM] [FILE...]

    Print selected parts of lines from each FILE to standard output.

    Each selection LIST is comma separated, either numbers (counting from 1)
    or dash separated ranges (inclusive, with X- meaning to end of line and -X
    from start). By default selection ranges are sorted and collated, use -D
    to prevent that.

    -b	Select bytes
    -c	Select UTF-8 characters
    -C	Select unicode columns
    -d	Use DELIM (default is TAB for -f, run of whitespace for -F)
    -D	Don't sort/collate selections or match -fF lines without delimiter
    -f	Select fields (words) separated by single DELIM character
    -F	Select fields separated by DELIM regex
    -O	Output delimiter (default one space for -F, input delim for -f)
    -s	Skip lines without delimiters
*/
#define FOR_cut
#include "toys.h"

GLOBALS(
  // Arguments of -d (input delimiter) and -O (output delimiter). cut_main()
  // fills in defaults when they are not given on the command line.
  char *d, *O;
  // Argument lists of the five selection options, indexed in the order
  // "CFfcb" (see cut_main); each one is parsed by get_range().
  struct arg_list *select[5]; // we treat them the same, so loop through

  // Number of start/end selections currently stored in toybuf.
  int pairs;
  // Delimiter regex, compiled by cut_main() when -F is used.
  regex_t reg;
)

// Measure how many bytes of "start" fit into "columns" display columns.
//
// The string is decoded with utf8towc() four bytes at a time: a zero result
// ends the scan, a negative result is treated as one undecodable byte and
// stepped over. Widths come from wcwidth(). The result is the byte offset
// just past the last printable (width > 0) character that still fits, so a
// character too wide for the remaining space is left out entirely, and
// skipped bytes or zero-width characters only count when a later printable
// character also fits.
int unicolumns(char *start, unsigned columns)
{
  char *pos = start, *fit = start;
  int used = 0, got;
  wchar_t wc;

  for (;;) {
    // Stop once the column budget is spent or the decoder reports the end.
    if (!(used<columns)) break;
    got = utf8towc(&wc, pos, 4);
    if (!got) break;

    // Undecodable byte: step over it without consuming any columns.
    if (got<0) {
      pos++;
      continue;
    }
    pos += got;

    // Characters with no positive width never move the "fits" marker.
    got = wcwidth(wc);
    if (got<=0) continue;

    // Round down: a character that overshoots the budget is dropped.
    used += got;
    if (used>columns) break;
    fit = pos;
  }

  return fit-start;
}

// Apply selections to an input line, producing output
//
// Passed to loopfiles_lines() by cut_main() as the per-line callback. pline
// points at the line buffer pointer (a NULL pline is ignored) and len is the
// line length in bytes. A trailing newline is stripped in place, then each of
// the TT.pairs selections stored in toybuf is written to stdout in table
// order, followed by one newline (except when -s drops the line, see below).
static void cut_line(char **pline, long len)
{
  // Selection table filled in by get_range(): pairs[2*i] is the start and
  // pairs[2*i+1] the inclusive end of selection i, both counted from 1.
  unsigned *pairs = (void *)toybuf;
  char *line;
  int i, j;

  if (!pline) return;
  line = *pline;
  // Drop the trailing newline so it is never part of a selection.
  if (len && line[len-1]=='\n') line[--len] = 0;

  // Loop through selections
  for (i=0; i<TT.pairs; i++) {
    unsigned start = pairs[2*i], end = pairs[(2*i)+1], count;
    char *s = line, *ss;

    // input: start/end position, count=difference between them
    // output: s = start of string, len = bytes to output

    // Convert start to a zero-based offset (a start of 0 is left as is) and
    // clamp the range to the byte length of the line. A selection starting
    // at or past the end of the line produces nothing.
    if (start) start--;
    if (start>=len) continue;
    if (!end || end>len) end = len;
    count = end-start;

    // Find start and end of output string for the relevant selection type
    if (toys.optflags&FLAG_b) s += start;
    else if (toys.optflags&FLAG_C) {
      // crunch_str() currently assumes that combining characters get
      // escaped, to provide an unambiguous visual representation.
      // This assumes the input string is null terminated.
      //if (start) crunch_str(&s, start, 0, 0, 0);
      //if (!*s) continue;
      //start = s-line;
      //ss = s;
      //crunch_str(&ss, count, 0, 0, 0);
      //count = ss-s;

      // Skip the bytes covering the first "start" columns, then measure how
      // many bytes cover the following end-start columns.
      s += unicolumns(s, start);
      count = unicolumns(s, end-start);
    } else if (toys.optflags&FLAG_c) {
      wchar_t wc;
      char *sss;

      // Find start
      // Step over "start" characters. A negative utf8towc() result advances
      // one byte without counting it as a character.
      ss = line+len;
      while (start && s<ss) {
        if (0<=(j = utf8towc(&wc, s, len))) start--;
        s += (j<1) ? 1 : j;
      }
      // Selection begins at the end of the line: nothing to print.
      if (s == ss) continue;

      // Find end
      // Same walk for "count" characters from s; "end" is reused as the
      // countdown and sss ends up just past the last selected character.
      end = count;
      sss = s;
      while (end && sss<ss) {
        if (0<=(j = utf8towc(&wc, sss, len))) end--;
        sss += (j<1) ? 1 : j;
      }
      count = sss-s;
    } else {
      regmatch_t match;

      // Loop through skipping appropriate number of fields
      // Pass j=0 skips "start" delimiters to find where the selection begins
      // (saving the original value in "end" for the no-delimiter test below).
      // Pass j=1 then scans across "count" more delimiters, stopping in front
      // of the last one so it is not part of the output.
      for (j = 0; j<2; j++) {
        ss = s;
        if (j) start = count;
        else end = start;
        while (*ss && start) {
          if (toys.optflags&FLAG_f) {
            // -f: any single character found in TT.d is a delimiter.
            if (!strchr(TT.d, *ss++)) continue;
            // On the second pass back up so ss points at the final delimiter.
            if (!--start && j) ss--;
          } else {
            // -F: no further regex match means no more delimiters, so jump
            // to the end of the line (which ends the while loop).
            if (regexec(&TT.reg, ss, 1, &match, REG_NOTBOL|REG_NOTEOL)) {
              ss = line+len;
              continue;
            }
            if (!match.rm_eo) break; // zero length match == no delimiter
            // Skip past the delimiter, except for the final one of the second
            // pass where we stop at its first byte.
            ss += (!--start && j) ? match.rm_so : match.rm_eo;
          }
        }
        // After the first pass s marks the start of the selection. If that is
        // the end of the line, leave early with j still 0.
        if (!j && !*(s = ss)) break;
      }

      // If we never encountered even one separator, print whole line (posix!)
      // This is detected by the first pass running off the end of the line
      // with "start" still equal to the value saved in "end".
      if (!j && end == start) {
        // -D: stop processing selections for this line (the newline after
        // the loop is still written).
        if (toys.optflags&FLAG_D) break;
        // -s: produce no output at all for this line.
        if (toys.optflags&FLAG_s) return;
        fwrite(line, len, 1, stdout);
        break;
      // Selection starts at the end of the line: nothing to print for it.
      } else if (!*s) continue;
      count = ss-s;
    }
    // The output delimiter (when one is set) goes in front of every selection
    // except the one at index 0.
    if (i && TT.O) fputs(TT.O, stdout);
    fwrite(s, count, 1, stdout);
  }
  xputc('\n');
}

// qsort() ordering for selections: each argument points at a start/end pair
// of unsigned values. Pairs are ordered by start, ties broken by end.
// Returns -1, 0 or 1.
static int compar(unsigned *a, unsigned *b)
{
  unsigned k;

  // The first slot that differs decides the order.
  for (k = 0; k<2; k++)
    if (a[k] != b[k]) return (a[k]<b[k]) ? -1 : 1;

  return 0;
}

// parse A or A-B or A- or -B
//
// Callback for comma_args(): parses one element of a selection list and
// appends it to the start/end table kept in toybuf, bumping TT.pairs.
//
//   data  unused
//   str   start of the element (not necessarily NUL terminated at len)
//   len   length of the element in bytes
//
// Returns 0 on success, otherwise a pointer into the argument marking where
// parsing failed (str itself when the range runs backwards).
//
// A missing start is stored as 0 and a missing end as UINT_MAX, so "-" alone
// selects start to end. Explicit numbers must be in 1..UINT_MAX.
static char *get_range(void *data, char *str, int len)
{
  char *end = str;
  unsigned *pairs = (void *)toybuf, i;

  // Every selection occupies two unsigned slots of toybuf, so the capacity is
  // half the number of unsigned values that fit. Checking against the number
  // of ints instead would let the table run off the end of toybuf.
  if (TT.pairs >= sizeof(toybuf)/(2*sizeof(unsigned)))
    perror_exit("select limit");
  pairs += 2*TT.pairs++;

  // Defaults for the open-ended forms; set explicitly rather than relying on
  // toybuf still being zeroed.
  pairs[0] = 0;
  pairs[1] = UINT_MAX;
  for (i = 0; ;i++) {
    // A third number or second dash (A-B-...) is an error.
    if (i==2) return end;
    if (isdigit((unsigned char)*end)) {
      long long ll = estrtol(end, &end, 10);

      if (ll<1 || ll>UINT_MAX || errno) return end;
      pairs[i] = ll;
    }
    // Only step over a dash. Leaving any other character unconsumed lets the
    // length check below catch trailing garbage such as "5x".
    if (*end != '-') break;
    end++;
  }
  // Single number with no dash: start and end are the same position.
  if (!i) pairs[1] = pairs[0];
  // Anything left over inside this element is not part of a valid range.
  if ((end-str)<len) return end;
  if (pairs[0]>pairs[1]) return str;

  // No error
  return 0;
}

// Command entry point: validate option combinations, fill in delimiter
// defaults, parse the selection lists into the start/end table in toybuf,
// optionally sort and merge that table, then run cut_line() over every line
// of every input file.
void cut_main(void)
{
  int i;
  char buf[8];

  // Parse command line arguments
  // -s and -d only make sense together with a field selection (-f or -F).
  if ((toys.optflags&(FLAG_s|FLAG_f|FLAG_F))==FLAG_s)
    error_exit("-s needs -Ff");
  if ((toys.optflags&(FLAG_d|FLAG_f|FLAG_F))==FLAG_d)
    error_exit("-d needs -Ff");
  // Default input delimiter: a run of whitespace for -F, a tab otherwise.
  if (!TT.d) TT.d = (toys.optflags&FLAG_F) ? "[[:space:]][[:space:]]*" : "\t";
  if (toys.optflags&FLAG_F) xregcomp(&TT.reg, TT.d, REG_EXTENDED);
  // Default output delimiter: one space for -F, the input delimiter for -f,
  // none for the other selection types.
  if (!TT.O) {
    if (toys.optflags&FLAG_F) TT.O = " ";
    else if (toys.optflags&FLAG_f) TT.O = TT.d;
  }

  // Parse ranges, which are attached to a selection type (only one can be set)
  for (i = 0; i<ARRAY_LEN(TT.select); i++) {
    // buf is the error message prefix handed to comma_args().
    sprintf(buf, "bad -%c", "CFfcb"[i]); // reverse order from newtoy optstr
    if (TT.select[i]) comma_args(TT.select[i], 0, buf, get_range);
  }
  if (!TT.pairs) error_exit("no selections");

  // Sort and collate selections
  if (!(toys.optflags&FLAG_D)) {
    int from, to;
    unsigned *pairs = (void *)toybuf;

    // One table entry is a start/end pair of unsigned values.
    qsort(toybuf, TT.pairs, 2*sizeof(unsigned), (void *)compar);

    // Compact the sorted table in place: "to" is the entry being built and
    // "from" the next candidate. Entries that start at or before the current
    // end are folded into it; anything starting later opens a new entry.
    for (to = 0, from = 2; from/2 < TT.pairs; from += 2) {
      if (pairs[from] > pairs[to+1]) {
        to += 2;
        // Plain assignments instead of memcpy(): "to" equals "from" until the
        // first merge happens, and memcpy() must not be given overlapping
        // objects.
        pairs[to] = pairs[from];
        pairs[to+1] = pairs[from+1];
      } else if (pairs[from+1] > pairs[to+1]) pairs[to+1] = pairs[from+1];
    }
    TT.pairs = (to/2)+1;
  }

  // For each argument, loop through lines of file and call cut_line() on each
  loopfiles_lines(toys.optargs, cut_line);
}