From cb652fa275c1806277ff07a98c864dc8c0335e0f Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Wed, 12 Nov 2014 14:47:57 -0600 Subject: Implement more sed plumbing, including s/// (without \1 or & yet). --- toys/pending/sed.c | 304 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 210 insertions(+), 94 deletions(-) diff --git a/toys/pending/sed.c b/toys/pending/sed.c index 551aff9d..3d3dcbd3 100644 --- a/toys/pending/sed.c +++ b/toys/pending/sed.c @@ -179,10 +179,8 @@ struct step { // Begin and end of each match long lmatch[2]; - int rmatch[2]; // offset to regex_t, because realloc() would confuse pointer + int rmatch[2], arg1, arg2, w; // offsets because remalloc() unsigned not, hit, sflags; - - int arg1, arg2, arg3; // offset start of to argument (string or regex_t) char c; // action }; @@ -208,6 +206,12 @@ static int ghostwheel(regex_t *preg, char *string, long len, int nmatch, regmatch_t pmatch[], int eflags) { /* + while (len && !*string) { + string++; + len--; + int l = strlen(string); + if (len != strlen(string)) + // todo: this long start = 0, rc = 0, matches = 0; @@ -221,8 +225,8 @@ static int ghostwheel(regex_t *preg, char *string, long len, int nmatch, return rc; */ - return regexec(preg, string, nmatch, pmatch, eflags); + return regexec(preg, string, nmatch, pmatch, eflags); } // Extend allocation to include new string, with newline between if newlen<0 @@ -248,7 +252,7 @@ static void walk_pattern(char **pline, long plen) struct append *next, *prev; int file; char *str; - } *append; + } *append = 0; char *line = TT.nextline; long len = TT.nextlen; struct step *logrus; @@ -269,6 +273,7 @@ static void walk_pattern(char **pline, long plen) logrus = TT.restart ? TT.restart : (void *)TT.pattern; TT.restart = 0; + while (logrus) { char *str, c = logrus->c; @@ -311,6 +316,7 @@ static void walk_pattern(char **pline, long plen) if (logrus->c == '}') curly--; } } + logrus = logrus->next; continue; } // Deferred disable from regex end match @@ -324,10 +330,14 @@ static void walk_pattern(char **pline, long plen) a->str = logrus->arg1+(char *)logrus; a->file = c== 'r'; dlist_add_nomalloc((void *)&append, (void *)a); - } else if (c=='b' || c=='t' || c=='T') { - if (c=='b' || tea^(c=='T')) { + } else if (c=='b' || c=='t' || c=='T') { + int t = tea; + + if (c != 'b') tea = 0; + if (c=='b' || t^(c=='T')) { str = logrus->arg1+(char *)logrus; + if (!*str) break; for (logrus = (void *)TT.pattern; logrus; logrus = logrus->next) if (logrus->c == ':' && !strcmp(logrus->arg1+(char *)logrus, str)) @@ -389,18 +399,76 @@ static void walk_pattern(char **pline, long plen) } else if (c=='p') { if (emit(line, len, eol)) break; } else if (c=='q') break; -// else if (c=='s') { -// tea = 1; -// } - else if (c=='w') { - int fd = TT.fdout, noeol = TT.noeol; - - TT.fdout = logrus->arg2; - TT.noeol = logrus->arg3; - + else if (c=='s') { + char *rline = line, *new = logrus->arg2 + (char *)logrus;; + regmatch_t *match = (void *)toybuf; + regex_t *reg = (void *)(logrus->arg1+(char *)logrus); + unsigned mflags = 0, i = 0, rlen = len, mlen, off, newlen; + + // Find match + while (!ghostwheel(reg, rline, rlen, 10, match, mflags)) { + mflags = REG_NOTBOL; + + // Zero length matches don't count immediately after last match + mlen = match[0].rm_eo-match[0].rm_so; + if (!mlen && !match[0].rm_so) { + if (!rlen--) break; + rline++; + continue; + } + + // Replace only a specific match? + off = logrus->sflags>>3; + if (off && off != ++i) { + rline += match[0].rm_eo; + rlen -= match[0].rm_eo; + + continue; + } + + // The strlen() and memcpy don't support \1 or & yet. + newlen = strlen(new); + + len = len-mlen+newlen; + off = rline-line; + line = xrealloc(line, len+1); + rline = line+off; + + rlen -= match[0].rm_eo; + memmove(rline+match[0].rm_so+newlen, rline+match[0].rm_eo, rlen+1); + memcpy(rline+match[0].rm_so, new, newlen); + rline += newlen+match[0].rm_so; + + // flag g + if (!(logrus->sflags & 2)) break; + } + + if (mflags) { + // flag p + if (logrus->sflags & 4) emit(line, len, eol); + + tea = 1; + if (logrus->w) goto writenow; + } + } else if (c=='w') { + int fd, noeol; + char *name; + +writenow: + // Swap out emit() context + fd = TT.fdout; + noeol = TT.noeol; + + // We save filehandle and newline status before filename + name = logrus->w + (char *)logrus; + memcpy(&TT.fdout, name, 4); + name += 4; + TT.noeol = *(name++); + + // write, then save/restore context if (emit(line, len, eol)) perror_exit("w '%s'", logrus->arg1+(char *)logrus); - logrus->arg3 = TT.noeol; + *(--name) = TT.noeol; TT.noeol = noeol; TT.fdout = fd; } else if (c=='x') { @@ -411,9 +479,18 @@ static void walk_pattern(char **pline, long plen) line = str; TT.rememberlen = len; len = swap; -// } else if (c=='y') { + } else if (c=='y') { + char *from, *to = (char *)logrus; + int i, j; + + from = to+logrus->arg1; + to += logrus->arg2; + + for (i = 0; i < len; i++) { + j = stridx(from, line[i]); + if (j != -1) line[i] = to[j]; + } } else if (c=='=') xprintf("%ld\n", TT.count); - // labcirstTwy else if (c!=':') error_exit("todo: %c", c); logrus = logrus->next; @@ -499,47 +576,50 @@ static void do_sed(int fd, char *name) } } -// Note: removing backslash escapes edits the source string, which could -// be from the environment space via -e, which could screw up what -// "ps" sees, and I'm ok with that. +// Note: removing backslash escapes and null terminating edits the source +// string, which could be from the environment space via -e, which could +// screw up what "ps" sees, and I'm ok with that. (Modifying the environment +// space like that means sed is very, very not reentrant.) + +// Ok, what happens if we xexec() sed with constant arguments then? +// TODO: ^^^ that -// extract regex up to delimeter, converting \escapes -// You can't use \ as delimiter because how would you escape anything? -static int parse_regex(regex_t *reg, char **pstr, char delim) +// returns length of processed string, *pstr advances to next unused char, +// if delim (or *delim) is 0 uses starting char as delimiter, otherwise +// parses and saves delimiter from first character(s) +static int unescape_delimited_string(char **pstr, char *delim) { - char *to, *from = *pstr; + char *to, *from = *pstr, d; int rc; - if (delim == '\\') rc = 0; - else { - for (to = from; *from != delim; *(to++) = *(from++)) { - if (!*from) break; - if (*from == '\\') { - if (!from[1]) break; - - // Check escaped end delimiter before printf style escapes. - if (from[1] == delim) from++; - else { - char c = unescape(from[1]); - - if (c) { - *to = c; - from++; - } + if (!delim || !*delim) { + if (!(d = *(from++))) return -1; + if (d == '\\') d = *(from++); + if (!d || d == '\\') return -1; + if (delim) *delim = d; + } else d = *delim; + + for (to = *pstr; *from != d; *(to++) = *(from++)) { + if (!*from) return -1; + if (*from == '\\') { + if (!from[1]) return -1; + + // Check escaped end delimiter before printf style escapes. + if (from[1] == d) from++; + else { + char c = unescape(from[1]); + + if (c) { + *to = c; + from++; } } } - rc = (*from == delim); } + rc = to-*pstr; + *to = 0; + *pstr = from+1; - if (rc) { - delim = *to; - *to = 0; - xregcomp(reg, *pstr, ((toys.optflags & FLAG_r)*REG_EXTENDED)|REG_NOSUB); - *to = delim; - } - *pstr = from + rc; - return rc; } @@ -587,14 +667,11 @@ static void jewel_of_judgement(char **pline, long len) corwin->lmatch[i] = -1; line++; } else if (*line == '/' || *line == '\\') { - char delim = *(line++); - - if (delim == '\\') { - if (!*line) goto brand; - delim = *(line++); - } + char *s = line; - if (!parse_regex((void *)reg, &line, delim)) goto brand; + if (-1 == unescape_delimited_string(&line, 0)) goto brand; + xregcomp((void *)reg, s, + ((toys.optflags & FLAG_r)*REG_EXTENDED)|REG_NOSUB); corwin->rmatch[i] = reg-toybuf; reg += sizeof(regex_t); } else break; @@ -603,7 +680,10 @@ static void jewel_of_judgement(char **pline, long len) while (isspace(*line)) line++; if (!*line) break; - while (*line == '!') corwin->not = 1; + while (*line == '!') { + corwin->not = 1; + line++; + } while (isspace(*line)) line++; c = corwin->c = *(line++); @@ -620,55 +700,89 @@ static void jewel_of_judgement(char **pline, long len) else if (c == '}') { if (!TT.nextlen--) break; } else if (c == 's') { - char delim = *line; + void *merlin, *fiona; int end; + char delim = 0; // s/pattern/replacement/flags - if (delim) line++; - else break; - - // get pattern - end = reg - (char *)corwin; - corwin = xrealloc(corwin, end+sizeof(regex_t)); - reg = end + (char *)corwin; - if (!parse_regex((void *)reg, &line, delim)) break; - corwin->arg1 = reg-toybuf; - reg += sizeof(regex_t); + // get pattern (just record, we parse it later) + corwin->arg1 = reg - (char *)corwin; + merlin = line; + if (-1 == unescape_delimited_string(&line, &delim)) goto brand; // get replacement + fiona = line; + if (-1 == (end = unescape_delimited_string(&line, &delim))) goto brand; + corwin->arg2 = corwin->arg1 + sizeof(regex_t); + reg = extend_string((void *)&corwin, fiona, corwin->arg2, end); - for (end = 0; line[end] != delim; end++) { - if (line[end] == '\\') end++; - if (!line[end]) goto brand; - } - - corwin->arg2 = reg - (char*)corwin; - reg = extend_string((void *)&corwin, line, corwin->arg2, end); - line += end+1; - - // parse s///flags - for (;;) { + // get flags + for (; *line && *line != ';'; line++) { long l; - line++; - if (!*line || *line == ';') break; if (isspace(*line)) continue; - if (0 <= (l = stridx("gpi", *line))) corwin->sflags |= 1<sflags >> 4 && 0<(l = strtol(line+end, &line, 10))) { - corwin->sflags |= l << 4; + if (0 <= (l = stridx("igp", *line))) corwin->sflags |= 1<sflags >> 3 && 0<(l = strtol(line+end, &line, 10))) { + corwin->sflags |= l << 3; line--; - } else if (*line == 'w') { - while (isspace(*++line)); - if (!*line) goto brand; - corwin->arg3 = reg - (char *)corwin; - reg = extend_string((void *)&corwin, line, corwin->arg3, - strlen(line)); - } else goto brand; + } else if (*line == 'w') break; + else goto brand; + } + + // We deferred actually parsing the rexex until we had the s///i flag + // allocating the space was done by extend_string() above + xregcomp((void *)(corwin->arg1 + (char *)corwin), merlin, + ((toys.optflags & FLAG_r)*REG_EXTENDED)|((corwin->sflags&1)*REG_ICASE)); + if (*line == 'w') { + line++; + goto writenow; } + } else if (c == 'w') { + int fd, delim; + char *cc; + + // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and + // eol status, and to retain the filename for error messages, we'd need + // to go up to arg5 just for this. Compromise: dynamically allocate the + // filehandle and eol status. + +writenow: + while (isspace(*line)) line++; + if (!*line) goto brand; + for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break; + delim = *cc; + *cc = 0; + fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC, 0544); + *cc = delim; + + delim = cc-line; + corwin->w = reg - (char *)corwin; + corwin = xrealloc(corwin, corwin->w + delim + 6); + reg = corwin->w + (char *)corwin; + + memcpy(reg, &fd, 4); + reg += 4; + *(reg++) = 0; + memcpy(reg, line, delim); + reg += delim; + *(reg++) = 0; + + line = cc; + if (delim) line += 2; } else if (c == 'y') { - // y/old/new/ + char *s = line, delim = 0; + int len1, len2; + + if (-1 == (len1 = unescape_delimited_string(&line, &delim))) goto brand; + corwin->arg1 = reg-(char *)corwin; + reg = extend_string((void *)&corwin, s, reg-(char *)corwin, len1); + s = line; + corwin->arg2 = reg-(char *)corwin; + if (-1 == (len2 = unescape_delimited_string(&line, &delim))) goto brand; + if (len1 != len2) goto brand; + reg = extend_string((void *)&corwin, s, reg-(char*)corwin, len2); } else if (strchr("abcirtTw:", c)) { int end, class; @@ -742,4 +856,6 @@ void sed_main(void) loopfiles_rw(args, O_RDONLY, 0, 0, do_sed); if (!(toys.optflags & FLAG_i)) walk_pattern(0, 0); + + // todo: need to close fd when done for TOYBOX_FREE? } -- cgit v1.2.3