Promote sed to posix.

author: Rob Landley <rob@landley.net> 2014-12-21 01:54:54 -0600
committer: Rob Landley <rob@landley.net> 2014-12-21 01:54:54 -0600
commit: 1a1e0a9d325dd1ca7461a8e8203dd47bab6a8bc1 (patch)
tree: 6fc7da5f3bbd3ad9ead0fd52c2dc4d67d5a825cc /toys/posix/sed.c
parent: 32cd2b770fe3ac8c419a29156162f3a037cf47a3 (diff)
download: toybox-1a1e0a9d325dd1ca7461a8e8203dd47bab6a8bc1.tar.gz
1 files changed, 1002 insertions, 0 deletions
diff --git a/toys/posix/sed.c b/toys/posix/sed.c
new file mode 100644
index 00000000..7f070208
--- /dev/null
+++ b/toys/posix/sed.c
@@ -0,0 +1,1002 @@
+/* sed.c - stream editor. Thing that does s/// and other stuff.
+ *
+ * Copyright 2014 Rob Landley <rob@landley.net>
+ *
+ * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
+ *
+ * TODO: lines > 2G could signed int wrap length counters. Not just getline()
+ * but N and s///
+
+USE_SED(NEWTOY(sed, "(version)e*f*inr", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LOCALE))
+
+config SED
+  bool "sed"
+  default y
+  help
+    usage: sed [-inr] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
+
+    Stream editor. Apply one or more editing SCRIPTs to each line of input
+    (from FILE or stdin) producing output (by default to stdout).
+
+    -e	add SCRIPT to list
+    -f	add contents of SCRIPT_FILE to list
+    -i	Edit each file in place.
+    -n	No default output. (Use the p command to output matched lines.)
+    -r	Use extended regular expression syntax.
+    -s	Treat input files separately (implied by -i)
+
+    A SCRIPT is a series of one or more COMMANDs separated by newlines or
+    semicolons. All -e SCRIPTs are concatenated together as if separated
+    by newlines, followed by all lines from -f SCRIPT_FILEs, in order.
+    If no -e or -f SCRIPTs are specified, the first argument is the SCRIPT.
+
+    Each COMMAND may be preceded by an address which limits the command to
+    apply only to the specified line(s). Commands without an address apply to
+    every line. Addresses are of the form:
+
+      [ADDRESS[,ADDRESS]]COMMAND
+
+    The ADDRESS may be a decimal line number (starting at 1), a /regular
+    expression/ within a pair of forward slashes, or the character "$" which
+    matches the last line of input. (In -s or -i mode this matches the last
+    line of each file, otherwise just the last line of the last file.) A single
+    address matches one line, a pair of comma separated addresses match
+    everything from the first address to the second address (inclusive). If
+    both addresses are regular expressions, more than one range of lines in
+    each file can match.
+
+    REGULAR EXPRESSIONS in sed are started and ended by the same character
+    (traditionally / but anything except a backslash or a newline works).
+    Backslashes may be used to escape the delimiter if it occurs in the
+    regex, and for the usual printf escapes (\abcefnrtv and octal, hex,
+    and unicode). An empty regex repeats the previous one. ADDRESS regexes
+    (above) require the first delimeter to be escaped with a backslash when
+    it isn't a forward slash (to distinguish it from the COMMANDs below).
+
+    Sed mostly operates on individual lines one at a time. It reads each line,
+    processes it, and either writes it to the output or discards it before
+    reading the next line. Sed can remember one additional line in a separate
+    buffer (using the h, H, g, G, and x commands), and can read the next line
+    of input early (using the n and N command), but other than that command
+    scripts operate on individual lines of text.
+
+    Each COMMAND starts with a single character. The following commands take
+    no arguments:
+
+      {  Start a new command block, continuing until a corresponding "}".
+         Command blocks may nest. If the block has an address, commands within
+         the block are only run for lines within the block's address range.
+
+      }  End command block (this command cannot have an address)
+
+      d  Delete this line and move on to the next one
+         (ignores remaining COMMANDs)
+
+      D  Delete one line of input and restart command SCRIPT (same as "d"
+         unless you've glued lines together with "N" or similar)
+
+      g  Get remembered line (overwriting current line)
+
+      G  Get remembered line (appending to current line)
+
+      h  Remember this line (overwriting remembered line)
+
+      H  Remember this line (appending to remembered line, if any)
+
+      l  Print line, escaping \abfrtv (but not newline), octal escaping other
+         nonprintable characters, wrapping lines to terminal width with a
+         backslash, and appending $ to actual end of line.
+
+      n  Print default output and read next line, replacing current line
+         (If no next line available, quit processing script)
+
+      N  Append next line of input to this line, separated by a newline
+         (This advances the line counter for address matching and "=", if no
+         next line available quit processing script without default output)
+
+      p  Print this line
+
+      P  Print this line up to first newline (from "N")
+
+      q  Quit (print default output, no more commands processed or lines read)
+
+      x  Exchange this line with remembered line (overwrite in both directions)
+
+      =  Print the current line number (followed by a newline)
+
+    The following commands (may) take an argument. The "text" arguments (to
+    the "a", "b", and "c" commands) may end with an unescaped "\" to append
+    the next line (for which leading whitespace is not skipped), and also
+    treat ";" as a literal character (use "\;" instead).
+
+      a [text]   Append text to output before attempting to read next line
+
+      b [label]  Branch, jumps to :label (or with no label, to end of SCRIPT)
+
+      c [text]   Delete line, output text at end of matching address range
+                 (ignores remaining COMMANDs)
+
+      i [text]   Print text
+
+      r [file]   Append contents of file to output before attempting to read
+                 next line.
+
+      s/S/R/F    Search for regex S, replace matched text with R using flags F.
+                 The first character after the "s" (anything but newline or
+                 backslash) is the delimiter, escape with \ to use normally.
+
+                 The replacement text may contain "&" to substitute the matched
+                 text (escape it with backslash for a literal &), or \1 through
+                 \9 to substitute a parenthetical subexpression in the regex.
+                 You can also use the normal backslash escapes such as \n and
+                 a backslash at the end of the line appends the next line.
+
+                 The flags are:
+
+                 [0-9]    A number, substitute only that occurrence of pattern
+                 g        Global, substitute all occurrences of pattern
+                 i        Ignore case when matching
+                 p        Print the line if match was found and replaced
+                 w [file] Write (append) line to file if match replaced
+
+      t [label]  Test, jump to :label only if an "s" command found a match in
+                 this line since last test (replacing with same text counts)
+
+      T [label]  Test false, jump only if "s" hasn't found a match.
+
+      w [file]   Write (append) line to file
+
+      y/old/new/ Change each character in 'old' to corresponding character
+                 in 'new' (with standard backslash escapes, delimiter can be
+                 any repeated character except \ or \n)
+
+      : [label]  Labeled target for jump commands
+
+      #  Comment, ignore rest of this line of SCRIPT
+
+    Deviations from posix: allow extended regular expressions with -r,
+    editing in place with -i, separate with -s, printf escapes in text, line
+    continuations, semicolons after all commands, 2-address anywhere an
+    address is allowed, "T" command, multiline continuations for [abc],
+    \; to end [abc] argument before end of line.
+*/
+
+#define FOR_sed
+#include "toys.h"
+
+GLOBALS(
+  struct arg_list *f;
+  struct arg_list *e;
+
+  // processed pattern list
+  struct double_list *pattern;
+
+  char *nextline, *remember;
+  void *restart, *lastregex;
+  long nextlen, rememberlen, count;
+  int fdout, noeol;
+  unsigned xx;
+)
+
+struct step {
+  struct step *next, *prev;
+
+  // Begin and end of each match
+  long lmatch[2];
+  int rmatch[2], arg1, arg2, w; // offsets because remalloc()
+  unsigned not, hit, sflags;
+  char c; // action
+};
+
+// Write out line with potential embedded NUL, handling eol/noeol
+static int emit(char *line, long len, int eol)
+{
+  int l, old = line[len];
+
+  if (TT.noeol && !writeall(TT.fdout, "\n", 1)) return 1;
+  if (eol) line[len++] = '\n';
+  if (!len) return 0;
+  TT.noeol = len && !eol;
+  l = writeall(TT.fdout, line, len);
+  if (eol) line[len-1] = old;
+  if (l != len) {
+    perror_msg("short write");
+
+    return 1;
+  }
+
+  return 0;
+}
+
+// Do regex matching handling embedded NUL bytes in string. Note that
+// neither the pattern nor the match can currently include NUL bytes
+// (even with wildcards) and string must be null terminated.
+static int ghostwheel(regex_t *preg, char *string, long len, int nmatch,
+  regmatch_t pmatch[], int eflags)
+{
+  char *s = string;
+
+  for (;;) {
+    long ll = 0;
+    int rc;
+
+    while (len && !*s) {
+      s++;
+      len--;
+    }
+    while (s[ll] && ll<len) ll++;
+
+    rc = regexec(preg, s, nmatch, pmatch, eflags);
+    if (!rc) {
+      for (rc = 0; rc<nmatch && pmatch[rc].rm_so!=-1; rc++) {
+        pmatch[rc].rm_so += s-string;
+        pmatch[rc].rm_eo += s-string;
+      }
+          
+      return 0;
+    }
+    if (ll==len) return rc;
+
+    s += ll;
+    len -= ll;
+  }
+}
+
+// Extend allocation to include new string, with newline between if newlen<0
+
+static char *extend_string(char **old, char *new, int oldlen, int newlen)
+{
+  int newline = newlen < 0;
+  char *s;
+
+  if (newline) newlen = -newlen;
+  s = *old = xrealloc(*old, oldlen+newlen+newline+1);
+  if (newline) s[oldlen++] = '\n';
+  memcpy(s+oldlen, new, newlen);
+  s[oldlen+newlen] = 0;
+
+  return s+oldlen+newlen+1;
+}
+
+// An empty regex repeats the previous one
+void *get_regex(void *trump, int offset)
+{
+  if (!offset) {
+    if (!TT.lastregex) error_exit("no previous regex");
+    return TT.lastregex;
+  }
+
+  return TT.lastregex = offset+(char *)trump;
+}
+
+// Apply pattern to line from input file
+static void walk_pattern(char **pline, long plen)
+{
+  struct append {
+    struct append *next, *prev;
+    int file;
+    char *str;
+  } *append = 0;
+  char *line = TT.nextline;
+  long len = TT.nextlen;
+  struct step *logrus;
+  int eol = 0, tea = 0;
+
+  // Grab next line for deferred processing (EOF detection: we get a NULL
+  // pline at EOF to flush last line). Note that only end of _last_ input
+  // file matches $ (unless we're doing -i).
+  TT.nextline = 0;
+  TT.nextlen = 0;
+  if (pline) {
+    TT.nextline = *pline;
+    TT.nextlen = plen;
+    *pline = 0;
+  }
+
+  if (!line || !len) return;
+  if (line[len-1] == '\n') line[--len] = eol++;
+  TT.count++;
+
+  logrus = TT.restart ? TT.restart : (void *)TT.pattern;
+  TT.restart = 0;
+
+  while (logrus) {
+    char *str, c = logrus->c;
+
+    // Have we got a line or regex matching range for this rule?
+    if (*logrus->lmatch || *logrus->rmatch) {
+      int miss = 0;
+      long lm;
+
+      // In a match that might end?
+      if (logrus->hit) {
+        if (!(lm = logrus->lmatch[1])) {
+          if (!logrus->rmatch[1]) logrus->hit = 0;
+          else {
+            void *rm = get_regex(logrus, logrus->rmatch[1]);
+
+            // regex match end includes matching line, so defer deactivation
+            if (line && !ghostwheel(rm, line, len, 0, 0, 0)) miss = 1;
+          }
+        } else if (lm > 0 && lm < TT.count) logrus->hit = 0;
+
+      // Start a new match?
+      } else {
+        if (!(lm = *logrus->lmatch)) {
+          void *rm = get_regex(logrus, *logrus->rmatch);
+
+          if (line && !ghostwheel(rm, line, len, 0, 0, 0)) logrus->hit++;
+        } else if (lm == TT.count || (lm == -1 && !pline)) logrus->hit++;
+
+        if (!logrus->lmatch[1] && !logrus->rmatch[1]) miss = 1;
+      } 
+
+      // Didn't match?
+      lm = !(logrus->hit ^ logrus->not);
+
+      // Deferred disable from regex end match
+      if (miss || logrus->lmatch[1] == TT.count) logrus->hit = 0;
+
+      if (lm) {
+        // Handle skipping curly bracket command group
+        if (c == '{') {
+          int curly = 1;
+
+          while (curly) {
+            logrus = logrus->next;
+            if (logrus->c == '{') curly++;
+            if (logrus->c == '}') curly--;
+          }
+        }
+        logrus = logrus->next;
+        continue;
+      }
+    }
+
+    // A deleted line can still update line match state for later commands
+    if (!line) {
+      logrus = logrus->next;
+      continue;
+    }
+
+    // Process command
+
+    if (c=='a' || c=='r') {
+      struct append *a = xzalloc(sizeof(struct append));
+      a->str = logrus->arg1+(char *)logrus;
+      a->file = c== 'r';
+      dlist_add_nomalloc((void *)&append, (void *)a);
+    } else if (c=='b' || c=='t' || c=='T') {
+      int t = tea;
+
+      if (c != 'b') tea = 0;
+      if (c=='b' || t^(c=='T')) {
+        if (!logrus->arg1) break;
+        str = logrus->arg1+(char *)logrus;
+        for (logrus = (void *)TT.pattern; logrus; logrus = logrus->next)
+          if (logrus->c == ':' && !strcmp(logrus->arg1+(char *)logrus, str))
+            break;
+        if (!logrus) error_exit("no :%s", str);
+      }
+    } else if (c=='c') {
+      str = logrus->arg1+(char *)logrus;
+      if (!logrus->hit) emit(str, strlen(str), 1);
+      free(line);
+      line = 0;
+      continue;
+    } else if (c=='d') {
+      free(line);
+      line = 0;
+      continue;
+    } else if (c=='D') {
+      // Delete up to \n or end of buffer
+      str = line;
+      while ((str-line)<len) if (*(str++) == '\n') break;
+      len -= str - line;
+      memmove(line, str, len);
+
+      // if "delete" blanks line, disable further processing
+      // otherwise trim and restart script
+      if (!len) {
+        free(line);
+        line = 0;
+      } else {
+        line[len] = 0;
+        logrus = (void *)TT.pattern;
+      }
+      continue;
+    } else if (c=='g') {
+      free(line);
+      line = xstrdup(TT.remember);
+      len = TT.rememberlen;
+    } else if (c=='G') {
+      line = xrealloc(line, len+TT.rememberlen+2);
+      line[len++] = '\n';
+      memcpy(line+len, TT.remember, TT.rememberlen);
+      line[len += TT.rememberlen] = 0;
+    } else if (c=='h') {
+      free(TT.remember);
+      TT.remember = xstrdup(line);
+      TT.rememberlen = len;
+    } else if (c=='H') {
+      TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
+      TT.remember[TT.rememberlen++] = '\n';
+      memcpy(TT.remember+TT.rememberlen, line, len);
+      TT.remember[TT.rememberlen += len] = 0;
+    } else if (c=='i') {
+      str = logrus->arg1+(char *)logrus;
+      emit(str, strlen(str), 1);
+    } else if (c=='l') {
+      int i, x, off;
+
+      if (!TT.xx) {
+        terminal_size(&TT.xx, 0);
+        if (!TT.xx) TT.xx = 80;
+        if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
+        if (TT.xx > 4) TT.xx -= 4;
+      }
+
+      for (i = off = 0; i<len; i++) {
+        if (off >= TT.xx) {
+          toybuf[off++] = '\\';
+          emit(toybuf, off, 1);
+          off = 0;
+        }
+        x = stridx("\\\a\b\f\r\t\v", line[i]);
+        if (x != -1) {
+          toybuf[off++] = '\\';
+          toybuf[off++] = "\\abfrtv"[x];
+        } else if (line[i] >= ' ') toybuf[off++] = line[i];
+        else off += sprintf(toybuf+off, "\\%03o", line[i]);
+      }
+      toybuf[off++] = '$';
+      emit(toybuf, off, 1);
+    } else if (c=='n') {
+      TT.restart = logrus->next;
+
+      break;
+    } else if (c=='N') {
+      // Can't just grab next line because we could have multiple N and
+      // we need to actually read ahead to get N;$p EOF detection right.
+      if (pline) {
+        TT.restart = logrus->next;
+        extend_string(&line, TT.nextline, len, -TT.nextlen);
+        free(TT.nextline);
+        TT.nextline = line;
+        TT.nextlen += len + 1;
+        line = 0;
+      }
+
+      // Pending append goes out right after N
+      goto done; 
+    } else if (c=='p' || c=='P') {
+      char *l = (c=='P') ? strchr(line, '\n') : 0;
+
+      if (emit(line, l ? l-line : len, eol)) break;
+    } else if (c=='q') {
+      if (pline) *pline = (void *)1;
+      free(TT.nextline);
+      TT.nextline = 0;
+      TT.nextlen = 0;
+
+      break;
+    } else if (c=='s') {
+      char *rline = line, *new = logrus->arg2 + (char *)logrus, *swap, *rswap;
+      regmatch_t *match = (void *)toybuf;
+      regex_t *reg = get_regex(logrus, logrus->arg1);
+      int mflags = 0, count = 0, zmatch = 1, rlen = len, mlen, off, newlen;
+
+      // Find match in remaining line (up to remaining len)
+      while (!ghostwheel(reg, rline, rlen, 10, match, mflags)) {
+        mflags = REG_NOTBOL;
+
+        // Zero length matches don't count immediately after a previous match
+        mlen = match[0].rm_eo-match[0].rm_so;
+        if (!mlen && !zmatch) {
+          if (!rlen--) break;
+          rline++;
+          zmatch++;
+          continue;
+        } else zmatch = 0;
+
+        // If we're replacing only a specific match, skip if this isn't it
+        off = logrus->sflags>>3;
+        if (off && off != ++count) {
+          rline += match[0].rm_eo;
+          rlen -= match[0].rm_eo;
+
+          continue;
+        }
+        // The fact getline() can allocate unbounded amounts of memory is
+        // a bigger issue, but while we're here check for integer overflow
+        if (match[0].rm_eo > INT_MAX) perror_exit(0);
+
+        // newlen = strlen(new) but with \1 and & and printf escapes
+        for (off = newlen = 0; new[off]; off++) {
+          int cc = -1;
+
+          if (new[off] == '&') cc = 0;
+          else if (new[off] == '\\') cc = new[++off] - '0';
+          if (cc < 0 || cc > 9) {
+            newlen++;
+            continue;
+          }
+          newlen += match[cc].rm_eo-match[cc].rm_so;
+        }
+
+        // Allocate new size, copy start/end around match. (Can't extend in
+        // place because backrefs may refer to text after it's overwritten.)
+        len += newlen-mlen;
+        swap = xmalloc(len+1);
+        rswap = swap+(rline-line)+match[0].rm_so;
+        memcpy(swap, line, (rline-line)+match[0].rm_so);
+        memcpy(rswap+newlen, rline+match[0].rm_eo, (rlen -= match[0].rm_eo)+1);
+
+        // copy in new replacement text
+        for (off = mlen = 0; new[off]; off++) {
+          int cc = 0, ll;
+
+          if (new[off] == '\\') {
+            cc = new[++off] - '0';
+            if (cc<0 || cc>9) {
+              if (!(rswap[mlen++] = unescape(new[off])))
+                rswap[mlen-1] = new[off];
+
+              continue;
+            } else if (match[cc].rm_so == -1) error_exit("no s//\\%d/", cc);
+          } else if (new[off] != '&') {
+            rswap[mlen++] = new[off];
+
+            continue;
+          }
+
+          ll = match[cc].rm_eo-match[cc].rm_so;
+          memcpy(rswap+mlen, rline+match[cc].rm_so, ll);
+          mlen += ll;
+        }
+
+        rline = rswap+newlen;
+        free(line);
+        line = swap;
+
+        // Stop after first substitution unless we have flag g
+        if (!(logrus->sflags & 2)) break;
+      }
+
+      if (mflags) {
+        // flag p
+        if (logrus->sflags & 4) emit(line, len, eol);
+
+        tea = 1;
+        if (logrus->w) goto writenow;
+      }
+    } else if (c=='w') {
+      int fd, noeol;
+      char *name;
+
+writenow:
+      // Swap out emit() context
+      fd = TT.fdout;
+      noeol = TT.noeol;
+
+      // We save filehandle and newline status before filename
+      name = logrus->w + (char *)logrus;
+      memcpy(&TT.fdout, name, 4);
+      name += 4;
+      TT.noeol = *(name++);
+
+      // write, then save/restore context
+      if (emit(line, len, eol))
+        perror_exit("w '%s'", logrus->arg1+(char *)logrus);
+      *(--name) = TT.noeol;
+      TT.noeol = noeol;
+      TT.fdout = fd;
+    } else if (c=='x') {
+      long swap = TT.rememberlen;
+
+      str = TT.remember;
+      TT.remember = line;
+      line = str;
+      TT.rememberlen = len;
+      len = swap;
+    } else if (c=='y') {
+      char *from, *to = (char *)logrus;
+      int i, j;
+
+      from = to+logrus->arg1;
+      to += logrus->arg2;
+
+      for (i = 0; i < len; i++) {
+        j = stridx(from, line[i]);
+        if (j != -1) line[i] = to[j];
+      }
+    } else if (c=='=') {
+      sprintf(toybuf, "%ld", TT.count);
+      emit(toybuf, strlen(toybuf), 1);
+    }
+
+    logrus = logrus->next;
+  }
+
+  if (line && !(toys.optflags & FLAG_n)) emit(line, len, eol);
+
+done:
+  free(line);
+
+  if (dlist_terminate(append)) while (append) {
+    struct append *a = append->next;
+
+    if (append->file) {
+      int fd = xopen(append->str, O_RDONLY);
+
+      // Force newline if noeol pending
+      emit(0, 0, 0);
+      xsendfile(fd, TT.fdout);
+      close(fd);
+    } else emit(append->str, strlen(append->str), 1);
+    free(append);
+    append = a;
+  }
+}
+
+// Genericish function, can probably get moved to lib.c
+
+// Iterate over lines in file, calling function. Function can write 0 to
+// the line pointer if they want to keep it, or 1 to terminate processing,
+// otherwise line is freed. Passed file descriptor is closed at the end.
+static void do_lines(int fd, char *name, void (*call)(char **pline, long len))
+{
+  FILE *fp = fd ? xfdopen(fd, "r") : stdin;
+
+  for (;;) {
+    char *line = 0;
+    ssize_t len;
+
+    len = getline(&line, (void *)&len, fp);
+    if (len > 0) {
+      call(&line, len);
+      if (line == (void *)1) break;
+      free(line);
+    } else break;
+  }
+
+  if (fd) fclose(fp);
+}
+
+static void do_sed(int fd, char *name)
+{
+  int i = toys.optflags & FLAG_i;
+  char *tmp;
+
+  if (i) {
+    struct step *primal;
+
+    if (!fd && *name=='-') {
+      error_msg("no -i on stdin");
+      return;
+    }
+    TT.fdout = copy_tempfile(fd, name, &tmp);
+    TT.count = 0;
+    for (primal = (void *)TT.pattern; primal; primal = primal->next)
+      primal->hit = 0;
+  }
+  do_lines(fd, name, walk_pattern);
+  if (i) {
+    walk_pattern(0, 0);
+    replace_tempfile(-1, TT.fdout, &tmp);
+    TT.fdout = 1;
+    TT.nextline = 0;
+    TT.nextlen = TT.noeol = 0;
+  }
+}
+
+// Copy chunk of string between two delimiters, converting printf escapes.
+// returns processed copy of string (0 if error), *pstr advances to next
+// unused char. if delim (or *delim) is 0 uses/saves starting char as delimiter
+// if regxex, ignore delimiter in [ranges]
+static char *unescape_delimited_string(char **pstr, char *delim, int regex)
+{
+  char *to, *from, d;
+
+  to = from = *pstr;
+  if (!delim || !*delim) {
+    if (!(d = *(from++))) return 0;
+    if (d == '\\') d = *(from++);
+    if (!d || d == '\\') return 0;
+    if (delim) *delim = d;
+  } else d = *delim;
+  to = delim = xmalloc(strlen(*pstr)+1);
+
+  while (*from != d) {
+    if (!*from) return 0;
+
+    // delimiter in regex character range doesn't count
+    if (*from == '[') {
+      int len = 1;
+
+      if (from[len] == ']') len++;
+      while (from[len] != ']') if (!from[len++]) return 0;
+      memmove(to, from, ++len);
+      to += len;
+      from += len;
+      continue;
+    }
+    if (*from == '\\') {
+      if (!from[1]) return 0;
+
+      // Check escaped end delimiter before printf style escapes.
+      if (from[1] == d) from++;
+      else if (from[1]=='\\') *(to++) = *(from++);
+      else {
+        char c = unescape(from[1]);
+
+        if (c) {
+          *(to++) = c;
+          from+=2;
+          continue;
+        }
+      }
+    }
+    *(to++) = *(from++);
+  }
+  *to = 0;
+  *pstr = from+1;
+
+  return delim;
+}
+
+// Translate primal pattern into walkable form.
+static void jewel_of_judgement(char **pline, long len)
+{
+  struct step *corwin = (void *)TT.pattern;
+  char *line = *pline, *reg, c, *errstart = *pline;
+  int i;
+
+  // Append additional line to pattern argument string?
+  if (corwin && corwin->prev->hit) {
+    // Remove half-finished entry from list so remalloc() doesn't confuse it
+    TT.pattern = TT.pattern->prev;
+    corwin = dlist_pop(&TT.pattern);
+    corwin->hit = 0;
+    c = corwin->c;
+    reg = (char *)corwin;
+    reg += corwin->arg1 + strlen(reg + corwin->arg1);
+
+    // Resume parsing
+    goto append;
+  }
+
+  // Loop through commands in line
+
+  corwin = 0;
+  for (;;) {
+    if (corwin) dlist_add_nomalloc(&TT.pattern, (void *)corwin);
+
+    while (isspace(*line) || *line == ';') line++;
+    if (!*line || *line == '#') return;
+
+    errstart = line;
+    memset(toybuf, 0, sizeof(struct step));
+    corwin = (void *)toybuf;
+    reg = toybuf + sizeof(struct step);
+
+    // Parse address range (if any)
+    for (i = 0; i < 2; i++) {
+      if (*line == ',') line++;
+      else if (i) break;
+
+      if (isdigit(*line)) corwin->lmatch[i] = strtol(line, &line, 0);
+      else if (*line == '$') {
+        corwin->lmatch[i] = -1;
+        line++;
+      } else if (*line == '/' || *line == '\\') {
+        char *s = line;
+
+        if (!(s = unescape_delimited_string(&line, 0, 1))) goto brand;
+        if (!*s) corwin->rmatch[i] = 0;
+        else {
+          xregcomp((void *)reg, s, (toys.optflags & FLAG_r)*REG_EXTENDED);
+          corwin->rmatch[i] = reg-toybuf;
+          reg += sizeof(regex_t);
+        }
+        free(s);
+      } else break;
+    }
+
+    while (isspace(*line)) line++;
+    if (!*line) break;
+
+    while (*line == '!') {
+      corwin->not = 1;
+      line++;
+    }
+    while (isspace(*line)) line++;
+
+    c = corwin->c = *(line++);
+    if (strchr("}:", c) && i) break;
+    if (strchr("aiqr=", c) && i>1) break;
+
+    // Add step to pattern
+    corwin = xmalloc(reg-toybuf);
+    memcpy(corwin, toybuf, reg-toybuf);
+    reg = (reg-toybuf) + (char *)corwin;
+
+    // Parse arguments by command type
+    if (c == '{') TT.nextlen++;
+    else if (c == '}') {
+      if (!TT.nextlen--) break;
+    } else if (c == 's') {
+      char *merlin, *fiona, delim = 0;
+
+      // s/pattern/replacement/flags
+
+      // get pattern (just record, we parse it later)
+      corwin->arg1 = reg - (char *)corwin;
+      if (!(merlin = unescape_delimited_string(&line, &delim, 1))) goto brand;
+
+      // get replacement - don't replace escapes because \1 and \& need
+      // processing later, after we replace \\ with \ we can't tell \\1 from \1
+      fiona = line;
+      while (*line != delim) {
+        if (!*line) goto brand;
+        if (*line == '\\') {
+          if (!line[1]) goto brand;
+          line += 2;
+        } else line++;
+      }
+
+      corwin->arg2 = corwin->arg1 + sizeof(regex_t);
+      reg = extend_string((void *)&corwin, fiona, corwin->arg2, line-fiona)+1;
+
+      // get flags
+      for (line++; *line; line++) {
+        long l;
+
+        if (isspace(*line) && *line != '\n') continue;
+
+        if (0 <= (l = stridx("igp", *line))) corwin->sflags |= 1<<l;
+        else if (!(corwin->sflags>>3) && 0<(l = strtol(line, &line, 10))) {
+          corwin->sflags |= l << 3;
+          line--;
+        } else break;
+      }
+
+      // We deferred actually parsing the regex until we had the s///i flag
+      // allocating the space was done by extend_string() above
+      if (!*merlin) corwin->arg1 = 0;
+      else xregcomp((void *)(corwin->arg1 + (char *)corwin), merlin,
+        ((toys.optflags & FLAG_r)*REG_EXTENDED)|((corwin->sflags&1)*REG_ICASE));
+      free(merlin);
+      if (*line == 'w') {
+        line++;
+        goto writenow;
+      }
+    } else if (c == 'w') {
+      int fd, delim;
+      char *cc;
+
+      // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
+      // eol status, and to retain the filename for error messages, we'd need
+      // to go up to arg5 just for this. Compromise: dynamically allocate the
+      // filehandle and eol status.
+
+writenow:
+      while (isspace(*line)) line++;
+      if (!*line) goto brand;
+      for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
+      delim = *cc;
+      *cc = 0;
+      fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0644);
+      *cc = delim;
+
+      corwin->w = reg - (char *)corwin;
+      corwin = xrealloc(corwin, corwin->w+(cc-line)+6);
+      reg = corwin->w + (char *)corwin;
+
+      memcpy(reg, &fd, 4);
+      reg += 4;
+      *(reg++) = 0;
+      memcpy(reg, line, delim);
+      reg += delim;
+      *(reg++) = 0;
+
+      line = cc;
+      if (delim) line += 2;
+    } else if (c == 'y') {
+      char *s, delim = 0;
+      int len;
+
+      if (!(s = unescape_delimited_string(&line, &delim, 0))) goto brand;
+      corwin->arg1 = reg-(char *)corwin;
+      len = strlen(s);
+      reg = extend_string((void *)&corwin, s, reg-(char *)corwin, len);
+      free(s);
+      corwin->arg2 = reg-(char *)corwin;
+      if (!(s = unescape_delimited_string(&line, &delim, 0))) goto brand;
+      if (len != strlen(s)) goto brand;
+      reg = extend_string((void *)&corwin, s, reg-(char*)corwin, len);
+      free(s);
+    } else if (strchr("abcirtTw:", c)) {
+      int end, class;
+
+      // Trim whitespace from "b ;" and ": blah " but only first space in "w x "
+
+      while (isspace(*line)) {
+        if (!strchr("btT", c) || *line != '\n') line++;
+        else break;
+      }
+append:
+      class = !strchr("btT:", c);
+      end = strcspn(line, class ? "\n" : "; \t\r\n\v\f");
+
+      if (!end) {
+        if (!strchr("btT", c)) break;
+        continue;
+      }
+
+      // Extend allocation to include new string. We use offsets instead of
+      // pointers so realloc() moving stuff doesn't break things. Do it
+      // here instead of toybuf so there's no maximum size.
+      if (!corwin->arg1) corwin->arg1 = reg - (char*)corwin;
+      reg = extend_string((void *)&corwin, line, reg - (char *)corwin, end); 
+      line += end;
+
+      // Line continuation? (Two slightly different input methods, -e with
+      // embedded newline vs -f line by line. Must parse both correctly.)
+      if (class && line[-1] == '\\') {
+        reg[-2] = 0;
+        if (*line && line[1]) {
+          reg -= 2;
+          line++;
+          goto append;
+        } else corwin->hit++;
+      }
+
+    // Commands that take no arguments
+    } else if (!strchr("{dDgGhHlnNpPqx=", c)) break;
+  }
+
+brand:
+  // Reminisce about chestnut trees.
+  error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1, *line);
+}
+
+void sed_main(void)
+{
+  struct arg_list *dworkin;
+  char **args = toys.optargs;
+
+  // Lie to autoconf when it asks stupid questions, so configure regexes
+  // that look for "GNU sed version %f" greater than some old buggy number
+  // don't fail us for not matching their narrow expectations.
+  if (toys.optflags & FLAG_version) {
+    xprintf("This is not GNU sed version 9.0\n");
+    return;
+  }
+
+  // Need a pattern. If no unicorns about, fight serpent and take its eye.
+  if (!TT.e && !TT.f) {
+    if (!*toys.optargs) error_exit("no pattern");
+    (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
+  }
+
+  // Option parsing infrastructure can't interlace "-e blah -f blah -e blah"
+  // so handle all -e, then all -f. (At least the behavior's consistent.)
+
+  for (dworkin = TT.e; dworkin; dworkin = dworkin->next)
+    jewel_of_judgement(&dworkin->arg, strlen(dworkin->arg));
+  for (dworkin = TT.f; dworkin; dworkin = dworkin->next)
+    do_lines(xopen(dworkin->arg, O_RDONLY), dworkin->arg, jewel_of_judgement);
+  dlist_terminate(TT.pattern);
+  if (TT.nextlen) error_exit("no }");  
+
+  TT.fdout = 1;
+  TT.remember = xstrdup("");
+
+  // Inflict pattern upon input files
+  loopfiles_rw(args, O_RDONLY, 0, 0, do_sed);
+
+  if (!(toys.optflags & FLAG_i)) walk_pattern(0, 0);
+
+  // todo: need to close fd when done for TOYBOX_FREE?
+}
author	Rob Landley <rob@landley.net>	2014-12-21 01:54:54 -0600
committer	Rob Landley <rob@landley.net>	2014-12-21 01:54:54 -0600
commit	1a1e0a9d325dd1ca7461a8e8203dd47bab6a8bc1 (patch)
tree	6fc7da5f3bbd3ad9ead0fd52c2dc4d67d5a825cc /toys/posix/sed.c
parent	32cd2b770fe3ac8c419a29156162f3a037cf47a3 (diff)
download	toybox-1a1e0a9d325dd1ca7461a8e8203dd47bab6a8bc1.tar.gz