aboutsummaryrefslogtreecommitdiff
path: root/toys/posix/sed.c
diff options
context:
space:
mode:
authorRob Landley <rob@landley.net>2014-12-21 01:54:54 -0600
committerRob Landley <rob@landley.net>2014-12-21 01:54:54 -0600
commit1a1e0a9d325dd1ca7461a8e8203dd47bab6a8bc1 (patch)
tree6fc7da5f3bbd3ad9ead0fd52c2dc4d67d5a825cc /toys/posix/sed.c
parent32cd2b770fe3ac8c419a29156162f3a037cf47a3 (diff)
downloadtoybox-1a1e0a9d325dd1ca7461a8e8203dd47bab6a8bc1.tar.gz
Promote sed to posix.
Diffstat (limited to 'toys/posix/sed.c')
-rw-r--r--toys/posix/sed.c1002
1 files changed, 1002 insertions, 0 deletions
diff --git a/toys/posix/sed.c b/toys/posix/sed.c
new file mode 100644
index 00000000..7f070208
--- /dev/null
+++ b/toys/posix/sed.c
@@ -0,0 +1,1002 @@
+/* sed.c - stream editor. Thing that does s/// and other stuff.
+ *
+ * Copyright 2014 Rob Landley <rob@landley.net>
+ *
+ * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
+ *
+ * TODO: lines > 2G could signed int wrap length counters. Not just getline()
+ * but N and s///
+
+USE_SED(NEWTOY(sed, "(version)e*f*inr", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LOCALE))
+
+config SED
+ bool "sed"
+ default y
+ help
+ usage: sed [-inr] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
+
+ Stream editor. Apply one or more editing SCRIPTs to each line of input
+ (from FILE or stdin) producing output (by default to stdout).
+
+ -e add SCRIPT to list
+ -f add contents of SCRIPT_FILE to list
+ -i Edit each file in place.
+ -n No default output. (Use the p command to output matched lines.)
+ -r Use extended regular expression syntax.
+ -s Treat input files separately (implied by -i)
+
+ A SCRIPT is a series of one or more COMMANDs separated by newlines or
+ semicolons. All -e SCRIPTs are concatenated together as if separated
+ by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
+ If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
+
+ Each COMMAND may be preceded by an address which limits the command to
+ apply only to the specified line(s). Commands without an address apply to
+ every line. Addresses are of the form:
+
+ [ADDRESS[,ADDRESS]]COMMAND
+
+ The ADDRESS may be a decimal line number (starting at 1), a /regular
+ expression/ within a pair of forward slashes, or the character "$" which
+ matches the last line of input. (In -s or -i mode this matches the last
+ line of each file, otherwise just the last line of the last file.) A single
+ address matches one line, a pair of comma separated addresses match
+ everything from the first address to the second address (inclusive). If
+ both addresses are regular expressions, more than one range of lines in
+ each file can match.
+
+ REGULAR EXPRESSIONS in sed are started and ended by the same character
+ (traditionally / but anything except a backslash or a newline works).
+ Backslashes may be used to escape the delimiter if it occurs in the
+ regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
+ and unicode). An empty regex repeats the previous one. ADDRESS regexes
+ (above) require the first delimeter to be escaped with a backslash when
+ it isn't a forward slash (to distinguish it from the COMMANDs below).
+
+ Sed mostly operates on individual lines one at a time. It reads each line,
+ processes it, and either writes it to the output or discards it before
+ reading the next line. Sed can remember one additional line in a separate
+ buffer (using the h, H, g, G, and x commands), and can read the next line
+ of input early (using the n and N command), but other than that command
+ scripts operate on individual lines of text.
+
+ Each COMMAND starts with a single character. The following commands take
+ no arguments:
+
+ { Start a new command block, continuing until a corresponding "}".
+ Command blocks may nest. If the block has an address, commands within
+ the block are only run for lines within the block's address range.
+
+ } End command block (this command cannot have an address)
+
+ d Delete this line and move on to the next one
+ (ignores remaining COMMANDs)
+
+ D Delete one line of input and restart command SCRIPT (same as "d"
+ unless you've glued lines together with "N" or similar)
+
+ g Get remembered line (overwriting current line)
+
+ G Get remembered line (appending to current line)
+
+ h Remember this line (overwriting remembered line)
+
+ H Remember this line (appending to remembered line, if any)
+
+ l Print line, escaping \abfrtv (but not newline), octal escaping other
+ nonprintable characters, wrapping lines to terminal width with a
+ backslash, and appending $ to actual end of line.
+
+ n Print default output and read next line, replacing current line
+ (If no next line available, quit processing script)
+
+ N Append next line of input to this line, separated by a newline
+ (This advances the line counter for address matching and "=", if no
+ next line available quit processing script without default output)
+
+ p Print this line
+
+ P Print this line up to first newline (from "N")
+
+ q Quit (print default output, no more commands processed or lines read)
+
+ x Exchange this line with remembered line (overwrite in both directions)
+
+ = Print the current line number (followed by a newline)
+
+ The following commands (may) take an argument. The "text" arguments (to
+ the "a", "b", and "c" commands) may end with an unescaped "\" to append
+ the next line (for which leading whitespace is not skipped), and also
+ treat ";" as a literal character (use "\;" instead).
+
+ a [text] Append text to output before attempting to read next line
+
+ b [label] Branch, jumps to :label (or with no label, to end of SCRIPT)
+
+ c [text] Delete line, output text at end of matching address range
+ (ignores remaining COMMANDs)
+
+ i [text] Print text
+
+ r [file] Append contents of file to output before attempting to read
+ next line.
+
+ s/S/R/F Search for regex S, replace matched text with R using flags F.
+ The first character after the "s" (anything but newline or
+ backslash) is the delimiter, escape with \ to use normally.
+
+ The replacement text may contain "&" to substitute the matched
+ text (escape it with backslash for a literal &), or \1 through
+ \9 to substitute a parenthetical subexpression in the regex.
+ You can also use the normal backslash escapes such as \n and
+ a backslash at the end of the line appends the next line.
+
+ The flags are:
+
+ [0-9] A number, substitute only that occurrence of pattern
+ g Global, substitute all occurrences of pattern
+ i Ignore case when matching
+ p Print the line if match was found and replaced
+ w [file] Write (append) line to file if match replaced
+
+ t [label] Test, jump to :label only if an "s" command found a match in
+ this line since last test (replacing with same text counts)
+
+ T [label] Test false, jump only if "s" hasn't found a match.
+
+ w [file] Write (append) line to file
+
+ y/old/new/ Change each character in 'old' to corresponding character
+ in 'new' (with standard backslash escapes, delimiter can be
+ any repeated character except \ or \n)
+
+ : [label] Labeled target for jump commands
+
+ # Comment, ignore rest of this line of SCRIPT
+
+ Deviations from posix: allow extended regular expressions with -r,
+ editing in place with -i, separate with -s, printf escapes in text, line
+ continuations, semicolons after all commands, 2-address anywhere an
+ address is allowed, "T" command, multiline continuations for [abc],
+ \; to end [abc] argument before end of line.
+*/
+
+#define FOR_sed
+#include "toys.h"
+
+GLOBALS(
+ struct arg_list *f;
+ struct arg_list *e;
+
+ // processed pattern list
+ struct double_list *pattern;
+
+ char *nextline, *remember;
+ void *restart, *lastregex;
+ long nextlen, rememberlen, count;
+ int fdout, noeol;
+ unsigned xx;
+)
+
+struct step {
+ struct step *next, *prev;
+
+ // Begin and end of each match
+ long lmatch[2];
+ int rmatch[2], arg1, arg2, w; // offsets because remalloc()
+ unsigned not, hit, sflags;
+ char c; // action
+};
+
+// Write out line with potential embedded NUL, handling eol/noeol
+static int emit(char *line, long len, int eol)
+{
+ int l, old = line[len];
+
+ if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
+ if (eol) line[len++] = '\n';
+ if (!len) return 0;
+ TT.noeol = len && !eol;
+ l = writeall(TT.fdout, line, len);
+ if (eol) line[len-1] = old;
+ if (l != len) {
+ perror_msg("short write");
+
+ return 1;
+ }
+
+ return 0;
+}
+
+// Do regex matching handling embedded NUL bytes in string. Note that
+// neither the pattern nor the match can currently include NUL bytes
+// (even with wildcards) and string must be null terminated.
+static int ghostwheel(regex_t *preg, char *string, long len, int nmatch,
+ regmatch_t pmatch[], int eflags)
+{
+ char *s = string;
+
+ for (;;) {
+ long ll = 0;
+ int rc;
+
+ while (len && !*s) {
+ s++;
+ len--;
+ }
+ while (s[ll] && ll<len) ll++;
+
+ rc = regexec(preg, s, nmatch, pmatch, eflags);
+ if (!rc) {
+ for (rc = 0; rc<nmatch && pmatch[rc].rm_so!=-1; rc++) {
+ pmatch[rc].rm_so += s-string;
+ pmatch[rc].rm_eo += s-string;
+ }
+
+ return 0;
+ }
+ if (ll==len) return rc;
+
+ s += ll;
+ len -= ll;
+ }
+}
+
+// Extend allocation to include new string, with newline between if newlen<0
+
+static char *extend_string(char **old, char *new, int oldlen, int newlen)
+{
+ int newline = newlen < 0;
+ char *s;
+
+ if (newline) newlen = -newlen;
+ s = *old = xrealloc(*old, oldlen+newlen+newline+1);
+ if (newline) s[oldlen++] = '\n';
+ memcpy(s+oldlen, new, newlen);
+ s[oldlen+newlen] = 0;
+
+ return s+oldlen+newlen+1;
+}
+
+// An empty regex repeats the previous one
+void *get_regex(void *trump, int offset)
+{
+ if (!offset) {
+ if (!TT.lastregex) error_exit("no previous regex");
+ return TT.lastregex;
+ }
+
+ return TT.lastregex = offset+(char *)trump;
+}
+
+// Apply pattern to line from input file
+static void walk_pattern(char **pline, long plen)
+{
+ struct append {
+ struct append *next, *prev;
+ int file;
+ char *str;
+ } *append = 0;
+ char *line = TT.nextline;
+ long len = TT.nextlen;
+ struct step *logrus;
+ int eol = 0, tea = 0;
+
+ // Grab next line for deferred processing (EOF detection: we get a NULL
+ // pline at EOF to flush last line). Note that only end of _last_ input
+ // file matches $ (unless we're doing -i).
+ TT.nextline = 0;
+ TT.nextlen = 0;
+ if (pline) {
+ TT.nextline = *pline;
+ TT.nextlen = plen;
+ *pline = 0;
+ }
+
+ if (!line || !len) return;
+ if (line[len-1] == '\n') line[--len] = eol++;
+ TT.count++;
+
+ logrus = TT.restart ? TT.restart : (void *)TT.pattern;
+ TT.restart = 0;
+
+ while (logrus) {
+ char *str, c = logrus->c;
+
+ // Have we got a line or regex matching range for this rule?
+ if (*logrus->lmatch || *logrus->rmatch) {
+ int miss = 0;
+ long lm;
+
+ // In a match that might end?
+ if (logrus->hit) {
+ if (!(lm = logrus->lmatch[1])) {
+ if (!logrus->rmatch[1]) logrus->hit = 0;
+ else {
+ void *rm = get_regex(logrus, logrus->rmatch[1]);
+
+ // regex match end includes matching line, so defer deactivation
+ if (line && !ghostwheel(rm, line, len, 0, 0, 0)) miss = 1;
+ }
+ } else if (lm > 0 && lm < TT.count) logrus->hit = 0;
+
+ // Start a new match?
+ } else {
+ if (!(lm = *logrus->lmatch)) {
+ void *rm = get_regex(logrus, *logrus->rmatch);
+
+ if (line && !ghostwheel(rm, line, len, 0, 0, 0)) logrus->hit++;
+ } else if (lm == TT.count || (lm == -1 && !pline)) logrus->hit++;
+
+ if (!logrus->lmatch[1] && !logrus->rmatch[1]) miss = 1;
+ }
+
+ // Didn't match?
+ lm = !(logrus->hit ^ logrus->not);
+
+ // Deferred disable from regex end match
+ if (miss || logrus->lmatch[1] == TT.count) logrus->hit = 0;
+
+ if (lm) {
+ // Handle skipping curly bracket command group
+ if (c == '{') {
+ int curly = 1;
+
+ while (curly) {
+ logrus = logrus->next;
+ if (logrus->c == '{') curly++;
+ if (logrus->c == '}') curly--;
+ }
+ }
+ logrus = logrus->next;
+ continue;
+ }
+ }
+
+ // A deleted line can still update line match state for later commands
+ if (!line) {
+ logrus = logrus->next;
+ continue;
+ }
+
+ // Process command
+
+ if (c=='a' || c=='r') {
+ struct append *a = xzalloc(sizeof(struct append));
+ a->str = logrus->arg1+(char *)logrus;
+ a->file = c== 'r';
+ dlist_add_nomalloc((void *)&append, (void *)a);
+ } else if (c=='b' || c=='t' || c=='T') {
+ int t = tea;
+
+ if (c != 'b') tea = 0;
+ if (c=='b' || t^(c=='T')) {
+ if (!logrus->arg1) break;
+ str = logrus->arg1+(char *)logrus;
+ for (logrus = (void *)TT.pattern; logrus; logrus = logrus->next)
+ if (logrus->c == ':' && !strcmp(logrus->arg1+(char *)logrus, str))
+ break;
+ if (!logrus) error_exit("no :%s", str);
+ }
+ } else if (c=='c') {
+ str = logrus->arg1+(char *)logrus;
+ if (!logrus->hit) emit(str, strlen(str), 1);
+ free(line);
+ line = 0;
+ continue;
+ } else if (c=='d') {
+ free(line);
+ line = 0;
+ continue;
+ } else if (c=='D') {
+ // Delete up to \n or end of buffer
+ str = line;
+ while ((str-line)<len) if (*(str++) == '\n') break;
+ len -= str - line;
+ memmove(line, str, len);
+
+ // if "delete" blanks line, disable further processing
+ // otherwise trim and restart script
+ if (!len) {
+ free(line);
+ line = 0;
+ } else {
+ line[len] = 0;
+ logrus = (void *)TT.pattern;
+ }
+ continue;
+ } else if (c=='g') {
+ free(line);
+ line = xstrdup(TT.remember);
+ len = TT.rememberlen;
+ } else if (c=='G') {
+ line = xrealloc(line, len+TT.rememberlen+2);
+ line[len++] = '\n';
+ memcpy(line+len, TT.remember, TT.rememberlen);
+ line[len += TT.rememberlen] = 0;
+ } else if (c=='h') {
+ free(TT.remember);
+ TT.remember = xstrdup(line);
+ TT.rememberlen = len;
+ } else if (c=='H') {
+ TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
+ TT.remember[TT.rememberlen++] = '\n';
+ memcpy(TT.remember+TT.rememberlen, line, len);
+ TT.remember[TT.rememberlen += len] = 0;
+ } else if (c=='i') {
+ str = logrus->arg1+(char *)logrus;
+ emit(str, strlen(str), 1);
+ } else if (c=='l') {
+ int i, x, off;
+
+ if (!TT.xx) {
+ terminal_size(&TT.xx, 0);
+ if (!TT.xx) TT.xx = 80;
+ if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
+ if (TT.xx > 4) TT.xx -= 4;
+ }
+
+ for (i = off = 0; i<len; i++) {
+ if (off >= TT.xx) {
+ toybuf[off++] = '\\';
+ emit(toybuf, off, 1);
+ off = 0;
+ }
+ x = stridx("\\\a\b\f\r\t\v", line[i]);
+ if (x != -1) {
+ toybuf[off++] = '\\';
+ toybuf[off++] = "\\abfrtv"[x];
+ } else if (line[i] >= ' ') toybuf[off++] = line[i];
+ else off += sprintf(toybuf+off, "\\%03o", line[i]);
+ }
+ toybuf[off++] = '$';
+ emit(toybuf, off, 1);
+ } else if (c=='n') {
+ TT.restart = logrus->next;
+
+ break;
+ } else if (c=='N') {
+ // Can't just grab next line because we could have multiple N and
+ // we need to actually read ahead to get N;$p EOF detection right.
+ if (pline) {
+ TT.restart = logrus->next;
+ extend_string(&line, TT.nextline, len, -TT.nextlen);
+ free(TT.nextline);
+ TT.nextline = line;
+ TT.nextlen += len + 1;
+ line = 0;
+ }
+
+ // Pending append goes out right after N
+ goto done;
+ } else if (c=='p' || c=='P') {
+ char *l = (c=='P') ? strchr(line, '\n') : 0;
+
+ if (emit(line, l ? l-line : len, eol)) break;
+ } else if (c=='q') {
+ if (pline) *pline = (void *)1;
+ free(TT.nextline);
+ TT.nextline = 0;
+ TT.nextlen = 0;
+
+ break;
+ } else if (c=='s') {
+ char *rline = line, *new = logrus->arg2 + (char *)logrus, *swap, *rswap;
+ regmatch_t *match = (void *)toybuf;
+ regex_t *reg = get_regex(logrus, logrus->arg1);
+ int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
+
+ // Find match in remaining line (up to remaining len)
+ while (!ghostwheel(reg, rline, rlen, 10, match, mflags)) {
+ mflags = REG_NOTBOL;
+
+ // Zero length matches don't count immediately after a previous match
+ mlen = match[0].rm_eo-match[0].rm_so;
+ if (!mlen && !zmatch) {
+ if (!rlen--) break;
+ rline++;
+ zmatch++;
+ continue;
+ } else zmatch = 0;
+
+ // If we're replacing only a specific match, skip if this isn't it
+ off = logrus->sflags>>3;
+ if (off && off != ++count) {
+ rline += match[0].rm_eo;
+ rlen -= match[0].rm_eo;
+
+ continue;
+ }
+ // The fact getline() can allocate unbounded amounts of memory is
+ // a bigger issue, but while we're here check for integer overflow
+ if (match[0].rm_eo > INT_MAX) perror_exit(0);
+
+ // newlen = strlen(new) but with \1 and & and printf escapes
+ for (off = newlen = 0; new[off]; off++) {
+ int cc = -1;
+
+ if (new[off] == '&') cc = 0;
+ else if (new[off] == '\\') cc = new[++off] - '0';
+ if (cc < 0 || cc > 9) {
+ newlen++;
+ continue;
+ }
+ newlen += match[cc].rm_eo-match[cc].rm_so;
+ }
+
+ // Allocate new size, copy start/end around match. (Can't extend in
+ // place because backrefs may refer to text after it's overwritten.)
+ len += newlen-mlen;
+ swap = xmalloc(len+1);
+ rswap = swap+(rline-line)+match[0].rm_so;
+ memcpy(swap, line, (rline-line)+match[0].rm_so);
+ memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
+
+ // copy in new replacement text
+ for (off = mlen = 0; new[off]; off++) {
+ int cc = 0, ll;
+
+ if (new[off] == '\\') {
+ cc = new[++off] - '0';
+ if (cc<0 || cc>9) {
+ if (!(rswap[mlen++] = unescape(new[off])))
+ rswap[mlen-1] = new[off];
+
+ continue;
+ } else if (match[cc].rm_so == -1) error_exit("no s//\\%d/", cc);
+ } else if (new[off] != '&') {
+ rswap[mlen++] = new[off];
+
+ continue;
+ }
+
+ ll = match[cc].rm_eo-match[cc].rm_so;
+ memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
+ mlen += ll;
+ }
+
+ rline = rswap+newlen;
+ free(line);
+ line = swap;
+
+ // Stop after first substitution unless we have flag g
+ if (!(logrus->sflags & 2)) break;
+ }
+
+ if (mflags) {
+ // flag p
+ if (logrus->sflags & 4) emit(line, len, eol);
+
+ tea = 1;
+ if (logrus->w) goto writenow;
+ }
+ } else if (c=='w') {
+ int fd, noeol;
+ char *name;
+
+writenow:
+ // Swap out emit() context
+ fd = TT.fdout;
+ noeol = TT.noeol;
+
+ // We save filehandle and newline status before filename
+ name = logrus->w + (char *)logrus;
+ memcpy(&TT.fdout, name, 4);
+ name += 4;
+ TT.noeol = *(name++);
+
+ // write, then save/restore context
+ if (emit(line, len, eol))
+ perror_exit("w '%s'", logrus->arg1+(char *)logrus);
+ *(--name) = TT.noeol;
+ TT.noeol = noeol;
+ TT.fdout = fd;
+ } else if (c=='x') {
+ long swap = TT.rememberlen;
+
+ str = TT.remember;
+ TT.remember = line;
+ line = str;
+ TT.rememberlen = len;
+ len = swap;
+ } else if (c=='y') {
+ char *from, *to = (char *)logrus;
+ int i, j;
+
+ from = to+logrus->arg1;
+ to += logrus->arg2;
+
+ for (i = 0; i < len; i++) {
+ j = stridx(from, line[i]);
+ if (j != -1) line[i] = to[j];
+ }
+ } else if (c=='=') {
+ sprintf(toybuf, "%ld", TT.count);
+ emit(toybuf, strlen(toybuf), 1);
+ }
+
+ logrus = logrus->next;
+ }
+
+ if (line && !(toys.optflags & FLAG_n)) emit(line, len, eol);
+
+done:
+ free(line);
+
+ if (dlist_terminate(append)) while (append) {
+ struct append *a = append->next;
+
+ if (append->file) {
+ int fd = xopen(append->str, O_RDONLY);
+
+ // Force newline if noeol pending
+ emit(0, 0, 0);
+ xsendfile(fd, TT.fdout);
+ close(fd);
+ } else emit(append->str, strlen(append->str), 1);
+ free(append);
+ append = a;
+ }
+}
+
+// Genericish function, can probably get moved to lib.c
+
+// Iterate over lines in file, calling function. Function can write 0 to
+// the line pointer if they want to keep it, or 1 to terminate processing,
+// otherwise line is freed. Passed file descriptor is closed at the end.
+static void do_lines(int fd, char *name, void (*call)(char **pline, long len))
+{
+ FILE *fp = fd ? xfdopen(fd, "r") : stdin;
+
+ for (;;) {
+ char *line = 0;
+ ssize_t len;
+
+ len = getline(&line, (void *)&len, fp);
+ if (len > 0) {
+ call(&line, len);
+ if (line == (void *)1) break;
+ free(line);
+ } else break;
+ }
+
+ if (fd) fclose(fp);
+}
+
+static void do_sed(int fd, char *name)
+{
+ int i = toys.optflags & FLAG_i;
+ char *tmp;
+
+ if (i) {
+ struct step *primal;
+
+ if (!fd && *name=='-') {
+ error_msg("no -i on stdin");
+ return;
+ }
+ TT.fdout = copy_tempfile(fd, name, &tmp);
+ TT.count = 0;
+ for (primal = (void *)TT.pattern; primal; primal = primal->next)
+ primal->hit = 0;
+ }
+ do_lines(fd, name, walk_pattern);
+ if (i) {
+ walk_pattern(0, 0);
+ replace_tempfile(-1, TT.fdout, &tmp);
+ TT.fdout = 1;
+ TT.nextline = 0;
+ TT.nextlen = TT.noeol = 0;
+ }
+}
+
+// Copy chunk of string between two delimiters, converting printf escapes.
+// returns processed copy of string (0 if error), *pstr advances to next
+// unused char. if delim (or *delim) is 0 uses/saves starting char as delimiter
+// if regxex, ignore delimiter in [ranges]
+static char *unescape_delimited_string(char **pstr, char *delim, int regex)
+{
+ char *to, *from, d;
+
+ to = from = *pstr;
+ if (!delim || !*delim) {
+ if (!(d = *(from++))) return 0;
+ if (d == '\\') d = *(from++);
+ if (!d || d == '\\') return 0;
+ if (delim) *delim = d;
+ } else d = *delim;
+ to = delim = xmalloc(strlen(*pstr)+1);
+
+ while (*from != d) {
+ if (!*from) return 0;
+
+ // delimiter in regex character range doesn't count
+ if (*from == '[') {
+ int len = 1;
+
+ if (from[len] == ']') len++;
+ while (from[len] != ']') if (!from[len++]) return 0;
+ memmove(to, from, ++len);
+ to += len;
+ from += len;
+ continue;
+ }
+ if (*from == '\\') {
+ if (!from[1]) return 0;
+
+ // Check escaped end delimiter before printf style escapes.
+ if (from[1] == d) from++;
+ else if (from[1]=='\\') *(to++) = *(from++);
+ else {
+ char c = unescape(from[1]);
+
+ if (c) {
+ *(to++) = c;
+ from+=2;
+ continue;
+ }
+ }
+ }
+ *(to++) = *(from++);
+ }
+ *to = 0;
+ *pstr = from+1;
+
+ return delim;
+}
+
+// Translate primal pattern into walkable form.
+static void jewel_of_judgement(char **pline, long len)
+{
+ struct step *corwin = (void *)TT.pattern;
+ char *line = *pline, *reg, c, *errstart = *pline;
+ int i;
+
+ // Append additional line to pattern argument string?
+ if (corwin && corwin->prev->hit) {
+ // Remove half-finished entry from list so remalloc() doesn't confuse it
+ TT.pattern = TT.pattern->prev;
+ corwin = dlist_pop(&TT.pattern);
+ corwin->hit = 0;
+ c = corwin->c;
+ reg = (char *)corwin;
+ reg += corwin->arg1 + strlen(reg + corwin->arg1);
+
+ // Resume parsing
+ goto append;
+ }
+
+ // Loop through commands in line
+
+ corwin = 0;
+ for (;;) {
+ if (corwin) dlist_add_nomalloc(&TT.pattern, (void *)corwin);
+
+ while (isspace(*line) || *line == ';') line++;
+ if (!*line || *line == '#') return;
+
+ errstart = line;
+ memset(toybuf, 0, sizeof(struct step));
+ corwin = (void *)toybuf;
+ reg = toybuf + sizeof(struct step);
+
+ // Parse address range (if any)
+ for (i = 0; i < 2; i++) {
+ if (*line == ',') line++;
+ else if (i) break;
+
+ if (isdigit(*line)) corwin->lmatch[i] = strtol(line, &line, 0);
+ else if (*line == '$') {
+ corwin->lmatch[i] = -1;
+ line++;
+ } else if (*line == '/' || *line == '\\') {
+ char *s = line;
+
+ if (!(s = unescape_delimited_string(&line, 0, 1))) goto brand;
+ if (!*s) corwin->rmatch[i] = 0;
+ else {
+ xregcomp((void *)reg, s, (toys.optflags & FLAG_r)*REG_EXTENDED);
+ corwin->rmatch[i] = reg-toybuf;
+ reg += sizeof(regex_t);
+ }
+ free(s);
+ } else break;
+ }
+
+ while (isspace(*line)) line++;
+ if (!*line) break;
+
+ while (*line == '!') {
+ corwin->not = 1;
+ line++;
+ }
+ while (isspace(*line)) line++;
+
+ c = corwin->c = *(line++);
+ if (strchr("}:", c) && i) break;
+ if (strchr("aiqr=", c) && i>1) break;
+
+ // Add step to pattern
+ corwin = xmalloc(reg-toybuf);
+ memcpy(corwin, toybuf, reg-toybuf);
+ reg = (reg-toybuf) + (char *)corwin;
+
+ // Parse arguments by command type
+ if (c == '{') TT.nextlen++;
+ else if (c == '}') {
+ if (!TT.nextlen--) break;
+ } else if (c == 's') {
+ char *merlin, *fiona, delim = 0;
+
+ // s/pattern/replacement/flags
+
+ // get pattern (just record, we parse it later)
+ corwin->arg1 = reg - (char *)corwin;
+ if (!(merlin = unescape_delimited_string(&line, &delim, 1))) goto brand;
+
+ // get replacement - don't replace escapes because \1 and \& need
+ // processing later, after we replace \\ with \ we can't tell \\1 from \1
+ fiona = line;
+ while (*line != delim) {
+ if (!*line) goto brand;
+ if (*line == '\\') {
+ if (!line[1]) goto brand;
+ line += 2;
+ } else line++;
+ }
+
+ corwin->arg2 = corwin->arg1 + sizeof(regex_t);
+ reg = extend_string((void *)&corwin, fiona, corwin->arg2, line-fiona)+1;
+
+ // get flags
+ for (line++; *line; line++) {
+ long l;
+
+ if (isspace(*line) && *line != '\n') continue;
+
+ if (0 <= (l = stridx("igp", *line))) corwin->sflags |= 1<<l;
+ else if (!(corwin->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
+ corwin->sflags |= l << 3;
+ line--;
+ } else break;
+ }
+
+ // We deferred actually parsing the regex until we had the s///i flag
+ // allocating the space was done by extend_string() above
+ if (!*merlin) corwin->arg1 = 0;
+ else xregcomp((void *)(corwin->arg1 + (char *)corwin), merlin,
+ ((toys.optflags & FLAG_r)*REG_EXTENDED)|((corwin->sflags&1)*REG_ICASE));
+ free(merlin);
+ if (*line == 'w') {
+ line++;
+ goto writenow;
+ }
+ } else if (c == 'w') {
+ int fd, delim;
+ char *cc;
+
+ // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
+ // eol status, and to retain the filename for error messages, we'd need
+ // to go up to arg5 just for this. Compromise: dynamically allocate the
+ // filehandle and eol status.
+
+writenow:
+ while (isspace(*line)) line++;
+ if (!*line) goto brand;
+ for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
+ delim = *cc;
+ *cc = 0;
+ fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
+ *cc = delim;
+
+ corwin->w = reg - (char *)corwin;
+ corwin = xrealloc(corwin, corwin->w+(cc-line)+6);
+ reg = corwin->w + (char *)corwin;
+
+ memcpy(reg, &fd, 4);
+ reg += 4;
+ *(reg++) = 0;
+ memcpy(reg, line, delim);
+ reg += delim;
+ *(reg++) = 0;
+
+ line = cc;
+ if (delim) line += 2;
+ } else if (c == 'y') {
+ char *s, delim = 0;
+ int len;
+
+ if (!(s = unescape_delimited_string(&line, &delim, 0))) goto brand;
+ corwin->arg1 = reg-(char *)corwin;
+ len = strlen(s);
+ reg = extend_string((void *)&corwin, s, reg-(char *)corwin, len);
+ free(s);
+ corwin->arg2 = reg-(char *)corwin;
+ if (!(s = unescape_delimited_string(&line, &delim, 0))) goto brand;
+ if (len != strlen(s)) goto brand;
+ reg = extend_string((void *)&corwin, s, reg-(char*)corwin, len);
+ free(s);
+ } else if (strchr("abcirtTw:", c)) {
+ int end, class;
+
+ // Trim whitespace from "b ;" and ": blah " but only first space in "w x "
+
+ while (isspace(*line)) {
+ if (!strchr("btT", c) || *line != '\n') line++;
+ else break;
+ }
+append:
+ class = !strchr("btT:", c);
+ end = strcspn(line, class ? "\n" : "; \t\r\n\v\f");
+
+ if (!end) {
+ if (!strchr("btT", c)) break;
+ continue;
+ }
+
+ // Extend allocation to include new string. We use offsets instead of
+ // pointers so realloc() moving stuff doesn't break things. Do it
+ // here instead of toybuf so there's no maximum size.
+ if (!corwin->arg1) corwin->arg1 = reg - (char*)corwin;
+ reg = extend_string((void *)&corwin, line, reg - (char *)corwin, end);
+ line += end;
+
+ // Line continuation? (Two slightly different input methods, -e with
+ // embedded newline vs -f line by line. Must parse both correctly.)
+ if (class && line[-1] == '\\') {
+ reg[-2] = 0;
+ if (*line && line[1]) {
+ reg -= 2;
+ line++;
+ goto append;
+ } else corwin->hit++;
+ }
+
+ // Commands that take no arguments
+ } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
+ }
+
+brand:
+ // Reminisce about chestnut trees.
+ error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1, *line);
+}
+
+void sed_main(void)
+{
+ struct arg_list *dworkin;
+ char **args = toys.optargs;
+
+ // Lie to autoconf when it asks stupid questions, so configure regexes
+ // that look for "GNU sed version %f" greater than some old buggy number
+ // don't fail us for not matching their narrow expectations.
+ if (toys.optflags & FLAG_version) {
+ xprintf("This is not GNU sed version 9.0\n");
+ return;
+ }
+
+ // Need a pattern. If no unicorns about, fight serpent and take its eye.
+ if (!TT.e && !TT.f) {
+ if (!*toys.optargs) error_exit("no pattern");
+ (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
+ }
+
+ // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
+ // so handle all -e, then all -f. (At least the behavior's consistent.)
+
+ for (dworkin = TT.e; dworkin; dworkin = dworkin->next)
+ jewel_of_judgement(&dworkin->arg, strlen(dworkin->arg));
+ for (dworkin = TT.f; dworkin; dworkin = dworkin->next)
+ do_lines(xopen(dworkin->arg, O_RDONLY), dworkin->arg, jewel_of_judgement);
+ dlist_terminate(TT.pattern);
+ if (TT.nextlen) error_exit("no }");
+
+ TT.fdout = 1;
+ TT.remember = xstrdup("");
+
+ // Inflict pattern upon input files
+ loopfiles_rw(args, O_RDONLY, 0, 0, do_sed);
+
+ if (!(toys.optflags & FLAG_i)) walk_pattern(0, 0);
+
+ // todo: need to close fd when done for TOYBOX_FREE?
+}