From 9b0d8691b9dad6674ea6d778bebcf3bbd76cc6de Mon Sep 17 00:00:00 2001
From: Rob Landley
Date: Mon, 20 Oct 2014 21:07:16 -0500
Subject: Random in-progress snapshot of sed, not finished yet.

---
Review notes (commentary only; git am drops text between the "---" line and
the diff):

 * Line breaks and indentation were restored from a whitespace-collapsed
   copy of this patch. Hunk sizes were checked against the @@ headers and
   the diffstat; context-line indentation is inferred and should be
   confirmed against the tree before applying.
 * Changes relative to the original snapshot, all line-count neutral:
   - sed_main(): test "toys.optflags & FLAG_version". The original used the
     bare FLAG_version constant as the condition, unlike the FLAG_r test
     in jewel_of_judgement().
   - do_lines(): only invoke the callback when getline() returned data, so
     callbacks are never handed the failed read at EOF.
   - chop_lines(): pass the length of the current line (ll) to the callback
     instead of the length of the remaining blob.
   - jewel_of_judgement(): write the unescape() result where the for loop's
     copy step picks it up. The original "*to = c" store was overwritten by
     that same copy step.
 * The index hashes below describe the original snapshot, not this revision.

 toys/pending/sed.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 150 insertions(+), 8 deletions(-)

(limited to 'toys')

diff --git a/toys/pending/sed.c b/toys/pending/sed.c
index 83119bf8..338680af 100644
--- a/toys/pending/sed.c
+++ b/toys/pending/sed.c
@@ -4,7 +4,7 @@
  *
  * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
  *
- * todo "-e blah -f blah -e blah" what order?
+ * todo "-e blah -f blah -e blah" what order? (All -e, then all -f.)
  * What happens when first address matched, then EOF? How about ",42" or "1,"
  * Does $ match last line of file or last line of input
  * If file doesn't end with newline
@@ -12,8 +12,13 @@
  * space before address
  * numerical addresses that cross, select one line
  * test backslash escapes in regex; share code with printf?
+ * address counts lines cumulatively across files
+ * Why can't I start an address with \\ (posix says no, but _why_?)
+ * Fun with \nblah\nn vs \tblah\tt
+ *
+ * echo -e "one\ntwo\nthree" | sed -n '$,$p'
 
-USE_SED(NEWTOY(sed, "e*f*inr", TOYFLAG_USR|TOYFLAG_BIN))
+USE_SED(NEWTOY(sed, "(version)e*f*inr", TOYFLAG_USR|TOYFLAG_BIN))
 
 config SED
   bool "sed"
@@ -75,27 +80,68 @@ GLOBALS(
   struct arg_list *f;
   struct arg_list *e;
 
-  void *pattern;
+  // processed pattern list
+  struct double_list *pattern;
 )
 
+struct step {
+  struct step *next, *prev;
+
+  // Begin and end of each match
+  long lmatch[2];
+  regex_t *rmatch[2];
+
+  // Action
+  char c;
+};
+
+// Apply pattern to line from input file
 static void do_line(char **pline, long len)
 {
   printf("len=%ld line=%s\n", len, *pline);
 }
 
+// Genericish function, can probably get moved to lib.c
+
+// Iterate over lines in file, calling function. Function can write NULL to
+// the line pointer if they want to keep it, otherwise line is freed.
+// Passed file descriptor is closed at the end of processing.
 static void do_lines(int fd, char *name, void (*call)(char **pline, long len))
 {
-  FILE *fp = fdopen(fd, "r");
+  FILE *fp = xfdopen(fd, "r");
 
   for (;;) {
     char *line = 0;
     ssize_t len;
 
     len = getline(&line, (void *)&len, fp);
-    do_line(&line, len);
+    if (len > 0) call(&line, len);
     free(line);
     if (len < 1) break;
   }
+  fclose(fp);
+}
+
+// Iterate over newline delimited data blob (potentially with embedded NUL),
+// call function on each line.
+static void chop_lines(char *data, long len,
+  void (*call)(char **pline, long len))
+{
+  long ll;
+
+  for (ll = 0; ll < len; ll++) {
+    if (data[ll] == '\n') {
+      char *c = data;
+
+      data[ll] = 0;
+      call(&c, ll);
+      data[ll++] = '\n';
+      data += ll;
+      len -= ll;
+      ll = -1;
+    }
+  }
+  if (len) call(&data, len);
 }
 
 static void do_sed(int fd, char *name)
@@ -103,16 +149,112 @@ static void do_sed(int fd, char *name)
   do_lines(fd, name, do_line);
 }
 
+// Translate primal pattern into walkable form.
+static void jewel_of_judgement(char **pline, long len)
+{
+  struct step *corwin;
+  char *line = *pline, *reg;
+  int i;
+
+  while (isspace(*line)) line++;
+  if (*line == '#') return;
+
+  memset(toybuf, 0, sizeof(struct step));
+  corwin = (void *)toybuf;
+  reg = toybuf + sizeof(struct step);
+
+  // Parse address range (if any)
+  for (i = 0; i < 2; i++) {
+    if (*line == ',') line++;
+    else if (i) break;
+
+    if (isdigit(*line)) corwin->lmatch[i] = strtol(line, &line, 0);
+    else if (*line == '$') {
+      corwin->lmatch[i] = -1;
+      line++;
+    } else if (*line == '/' || *line == '\\') {
+      char delim = *(line++), slash = 0, *to, *from;
+
+      if (delim == '\\') {
+        if (!*line) goto brand;
+        slash = delim = *(line++);
+      }
+
+      // Removing backslash escapes edits the source string, which could
+      // be from the environment space via -e, which could screw up what
+      // "ps" sees, and I'm ok with that.
+      for (to = from = line; *from != delim; *(to++) = *(from++)) {
+        if (!*from) goto brand;
+        if (*from == '\\') {
+          if (!from[1]) goto brand;
+
+          // Check escaped end delimiter before printf style escapes.
+          if (from[1] == slash) from++;
+          else {
+            char c = unescape(from[1]);
+
+            if (c) {
+              from++;
+              *from = c;
+            }
+          }
+        }
+      }
+      slash = *to;
+      *to = 0;
+      xregcomp(corwin->rmatch[i] = (void *)reg, line,
+        ((toys.optflags & FLAG_r)*REG_EXTENDED)|REG_NOSUB);
+      *to = slash;
+      reg += sizeof(regex_t);
+    } else break;
+  }
+
+  while (isspace(*line)) line++;
+
+  if (!*line || !strchr("p", *line)) goto brand;
+
+  // Add step to pattern
+  corwin = xmalloc(reg-toybuf);
+  memcpy(corwin, toybuf, reg-toybuf);
+  dlist_add_nomalloc(&TT.pattern, (void *)corwin);
+
+  return;
+
+brand:
+
+  // Reminisce about chestnut trees.
+  error_exit("bad pattern '%s'@%ld (%c)", *pline, line-*pline, *line);
+}
+
 void sed_main(void)
 {
+  struct arg_list *dworkin;
   char **args = toys.optargs;
 
-  // Need a pattern
-  if (!TT.e) {
+  // Lie to autoconf when it asks stupid questions, so configure regexes
+  // that look for "GNU sed version %f" greater than some old buggy number
+  // don't fail us for not matching their narrow expectations.
+  if (toys.optflags & FLAG_version) {
+    xprintf("This is not GNU sed version 9.0\n");
+    return;
+  }
+
+  // Need a pattern. If no unicorns about, fight dragon and take its eye.
+  if (!TT.e && !TT.f) {
     if (!*toys.optargs) error_exit("no pattern");
     (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
   }
 
+  for (dworkin = TT.e; dworkin; dworkin = dworkin->next) {
+    chop_lines(dworkin->arg, strlen(dworkin->arg), jewel_of_judgement);
+  }
+
+  for (dworkin = TT.f; dworkin; dworkin = dworkin->next) {
+    int fd = xopen(dworkin->arg, O_RDONLY);
+
+    do_lines(fd, dworkin->arg, jewel_of_judgement);
+  }
+
   // Inflict pattern upon input files
-  loopfiles(args, do_sed);
+  loopfiles_rw(args, O_RDONLY, 0, 0, do_sed);
 }
-- 
cgit v1.2.3