From 49fcd9aa74b25334f3a8383a1bed67195afb2ad3 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Wed, 18 Jun 2008 19:31:32 -0500 Subject: Most of an susv3 compliant sort implementation (loosely based on the one I wrote back in 2005). Still a few bugs. Needs a _biiiiig_ test suite... --- toys/sort.c | 408 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 408 insertions(+) create mode 100644 toys/sort.c diff --git a/toys/sort.c b/toys/sort.c new file mode 100644 index 00000000..817d86ec --- /dev/null +++ b/toys/sort.c @@ -0,0 +1,408 @@ +/* vi: set sw=4 ts=4: + * + * sort.c - put input lines into order + * + * Copyright 2004, 2008 Rob Landley + * + * See http://www.opengroup.org/onlinepubs/007904975/utilities/sort.html + +USE_SORT(NEWTOY(sort, USE_SORT_BIG("S:T:m" "o:k*t:bgMcszdfi") "run", TOYFLAG_USR|TOYFLAG_BIN)) + +config SORT + bool "sort" + default y + help + usage: sort [-run] [FILE...] + + Sort all lines of text from input files (or stdin) to stdout. + + -r reverse + -u unique lines only + -n numeric order (instead of alphabetical) + +config SORT_BIG + bool " all SuSv3 options (Support -ktcsbdfiozgM)" + default y + depends on SORT + help + usage: sort [-bcdfgiMsz] [-k#[,#[x]] [-t X]] [-o FILE] + + -b ignore leading blanks (or trailing blanks in second part of key) + -c check whether input is sorted + -d dictionary order (use alphanumeric and whitespace chars only) + -f force uppercase (case insensitive sort) + -g general numeric sort (double precision with nan and inf) + -i ignore nonprinting characters + -M month sort (jan, feb, etc). + -s skip fallback sort (only sort with keys) + -z zero (null) terminated input + -k sort by "key" (see below) + -t use a key separator other than whitespace + -o output to FILE instead of stdout + + This version of sort requires floating point. + + Sorting by key looks at a subset of the words on each line. 
-k2 + uses the second word to the end of the line, -k2,2 looks at only + the second word, -k2,4 looks from the start of the second to the end + of the fourth word. Specifying multiple keys uses the later keys as + tie breakers, in order. A type specifier appended to a sort key + (such as -2,2n) applies only to sorting that key. +*/ + +#include "toys.h" +#include + +DEFINE_GLOBALS( + char *key_separator; + struct arg_list *raw_keys; + char *outfile; + char *ignore1, ignore2; // GNU compatibility NOPs for -S and -T. + + void *key_list; + int linecount; + char **lines; +) + +#define TT this.sort + +// The sort types are n, g, and M. +// u, c, s, and z apply to top level only, not to keys. +// b at top level implies bb. +// The remaining options can be applied to search keys. + +#define FLAG_n 1 // Sort type: numeric +#define FLAG_u 2 // Unique +#define FLAG_r 4 // Reverse output order + +#define FLAG_i 8 // Ignore !isprint() +#define FLAG_f 16 // Force uppercase +#define FLAG_d 32 // Ignore !(isalnum()|isspace()) +#define FLAG_z 64 // Input is null terminated, not \n +#define FLAG_s 128 // Stable sort, no ascii fallback at end +#define FLAG_c 256 // Check only. No output, exit(!ordered) +#define FLAG_M 512 // Sort type: month +#define FLAG_g 1024 // Sort type: strtod() +#define FLAG_b 2048 // Ignore leading blanks + +// Left off dealing with FLAG_b/FLAG_bb logic... + +#define FLAG_bb 32768 // Ignore trailing blanks + +struct sort_key +{ + struct sort_key *next_key; // linked list + unsigned range[4]; // start word, start char, end word, end char + int flags; +}; + +// Copy of the part of this string corresponding to a key/flags. 
+ +static char *get_key_data(char *str, struct sort_key *key, int flags) +{ + int start=0, end, len, i, j; + + // Special case whole string, so we don't have to make a copy + + if(key->range[0]==1 && !key->range[1] && !key->range[2] && !key->range[3] + && !(flags&(FLAG_b&FLAG_d&FLAG_f&FLAG_i&FLAG_bb))) return str; + + // Find start of key on first pass, end on second pass + + len = strlen(str); + for (j=0; j<2; j++) { + if (!key->range[2*j]) end=len; + + // Loop through fields + else { + end=0; + for (i=1; i < key->range[2*j]+j; i++) { + + // Skip leading blanks + if (str[end] && !TT.key_separator) + while (isspace(str[end])) end++; + + // Skip body of key + for (; str[end]; end++) { + if (TT.key_separator) { + if (str[end]==*TT.key_separator) break; + } else if (isspace(str[end])) break; + } + } + } + if (!j) start=end; + } + + // Key with explicit separator starts after the separator + if (TT.key_separator && str[start]==*TT.key_separator) start++; + + // Strip leading and trailing whitespace if necessary + if (flags&FLAG_b) while (isspace(str[start])) start++; + if (flags&FLAG_bb) while (end>start && isspace(str[end-1])) end--; + + // Handle offsets on start and end + if (key->range[3]) { + end += key->range[3]-1; + if (end>len) end=len; + } + if (key->range[1]) { + start += key->range[1]-1; + if (start>len) start=len; + } + + // Make the copy + if (endnext_key); + return *pkey = xzalloc(sizeof(struct sort_key)); +} + +// Perform actual comparison +static int compare_values(int flags, char *x, char *y) +{ + int ff = flags & (FLAG_n|FLAG_g|FLAG_M); + + // Ascii sort + if (!ff) return strcmp(x, y); + + if (CFG_SORT_BIG && ff == FLAG_g) { + char *xx,*yy; + double dx = strtod(x,&xx), dy = strtod(y,&yy); + int xinf, yinf; + + // not numbers < NaN < -infinity < numbers < +infinity) + + if (x==xx) return y==yy ? 0 : -1; + if (y==yy) return 1; + + // Check for isnan + if (dx!=dx) return (dy!=dy) ? 0 : -1; + if (dy!=dy) return 1; + + // Check for infinity. 
(Could underflow, but avoids needing libm.) + xinf = (1.0/dx == 0.0); + yinf = (1.0/dy == 0.0); + if (xinf) { + if(dx<0) return (yinf && dy<0) ? 0 : -1; + return (yinf && dy>0) ? 0 : 1; + } + if (yinf) return dy<0 ? 1 : -1; + + return dx>dy ? 1 : (dxdy ? 1 : (dxnext_key) + { + flags = (key->flags) ? key->flags : toys.optflags; + + // Chop out and modify key chunks, handling -dfib + + x = get_key_data(*xx, key, flags); + y = get_key_data(*yy, key, flags); + + retval = compare_values(flags, x, y); + + // Free the copies get_key_data() made. + + if (x != *xx) free(x); + if (y != *yy) free(y); + + if (retval) break; + } + } else retval = compare_values(flags, *xx, *yy); + + // Perform fallback sort if necessary + + if (!retval && !(toys.optflags&FLAG_s)) retval = strcmp(*xx, *yy); + + return retval * ((flags&FLAG_r) ? -1 : 1); +} + +// Callback from loopfiles to handle input files. +static void sort_read(int fd, char *name) +{ + // Read each line from file, appending to a big array. + + for (;;) { + char * line = (CFG_SORT_BIG && (toys.optflags&FLAG_z)) + ? get_rawline(fd, NULL, 0) : get_line(fd); + + if (!line) break; + + // handle -c here so we don't allocate more memory than necessary. + if (CFG_SORT_BIG && (toys.optflags&FLAG_c)) { + int j = (toys.optflags&FLAG_u) ? -1 : 0; + + if (TT.linecount && compare_keys((char *)TT.lines,line)>j) + error_exit("%s: Check line %d\n", name, TT.linecount); + + if (TT.lines) free(TT.lines); + else TT.linecount = 0; + TT.lines = (char **)line; + } else { + if (!(TT.linecount&63)) + TT.lines = xrealloc(TT.lines, sizeof(char *)*(TT.linecount+64)); + TT.lines[TT.linecount] = line; + } + TT.linecount++; + } +} + +void sort_main(void) +{ + int idx, fd = 1; + + // Open output file if necessary. + if (CFG_SORT_BIG && TT.outfile) + fd = xcreate(TT.outfile, O_CREAT|O_TRUNC|O_WRONLY, 0666); + + // Parse -k sort keys. 
+ if (CFG_SORT_BIG && TT.raw_keys) { + struct arg_list *arg; + + for (arg = TT.raw_keys; arg; arg = arg->next) { + struct sort_key *key = add_key(); + char *temp; + int flag; + + idx = 0; + temp = arg->arg; + while (*temp) { + // Start of range + key->range[2*idx] = (unsigned)strtol(temp, &temp, 10); + if (*temp=='.') + key->range[(2*idx)+1] = (unsigned)strtol(temp+1, &temp, 10); + + // Handle flags appended to a key type. + for (;*temp;temp++) { + char *temp2, *optlist; + + // Note that a second comma becomes an "Unknown key" error. + + if (*temp==',' && !idx++) { + temp++; + break; + } + + // Which flag is this? + + optlist = toys.which->options; + temp2 = index(optlist, *temp); + flag = (1<<(optlist-temp2+strlen(optlist)-1)); + + // Was it a flag that can apply to a key? + + if (!temp2 || flag>FLAG_b + || (flag&(FLAG_u|FLAG_c|FLAG_s|FLAG_z))) + { + error_exit("Unknown key option."); + } + // b after , means strip _trailing_ space, not leading. + if (idx && flag==FLAG_b) flag = FLAG_bb; + key->flags |= flag; + } + } + } + } + + // global b flag strips both leading and trailing spaces + if (toys.optflags&FLAG_b) toys.optflags |= FLAG_bb; + + // If no keys, perform alphabetic sort over the whole line. + if (CFG_SORT_BIG && !TT.key_list) add_key()->range[0] = 1; + + // Open input files and read data, populating TT.lines[TT.linecount] + loopfiles(toys.optargs, sort_read); + + // The compare (-c) logic was handled in sort_read(), + // so if we got here, we're done. + if (CFG_SORT_BIG && (toys.optflags&FLAG_c)) return; + + // Perform the actual sort + qsort(TT.lines, TT.linecount, sizeof(char *), compare_keys); + + // handle unique (-u) + if (toys.optflags&FLAG_u) { + int jdx; + + for (jdx=0, idx=1; idx