1 files changed, 682 insertions, 0 deletions
diff --git a/usr.bin/grep/util.c b/usr.bin/grep/util.c
new file mode 100644
index 0000000..e16d08e
--- /dev/null
+++ b/usr.bin/grep/util.c
@@ -0,0 +1,682 @@
+/*	$OpenBSD: util.c,v 1.63 2020/07/23 20:19:27 martijn Exp $	*/
+
+/*-
+ * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <ctype.h>
+#include <err.h>
+#include <errno.h>
+#include <fts.h>
+#include <regex.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <zlib.h>
+
+#include "grep.h"
+
+/*
+ * Process a file line by line...
+ */
+
+static int	linesqueued;
+static int	procline(str_t *l, int);
+static int	grep_search(fastgrep_t *, char *, size_t, regmatch_t *pmatch, int);
+#ifndef SMALL
+static bool	grep_cmp(const char *, const char *, size_t);
+static void	grep_revstr(unsigned char *, int);
+#endif
+
+int
+grep_tree(char **argv)
+{
+	FTS	*fts;
+	FTSENT	*p;
+	int	c, fts_flags;
+	char	*dot_argv[] = { ".", NULL };
+	char	*path;
+
+	/* default to . if no path given */
+	if (argv[0] == NULL)
+		argv = dot_argv;
+
+	c = 0;
+
+	fts_flags = FTS_PHYSICAL | FTS_NOSTAT | FTS_NOCHDIR;
+
+	if (!(fts = fts_open(argv, fts_flags, NULL)))
+		err(2, NULL);
+	while ((p = fts_read(fts)) != NULL) {
+		switch (p->fts_info) {
+		case FTS_DNR:
+			break;
+		case FTS_ERR:
+			file_err = 1;
+			if(!sflag)
+				warnc(p->fts_errno, "%s", p->fts_path);
+			break;
+		case FTS_D:
+		case FTS_DP:
+			break;
+		default:
+			path = p->fts_path;
+			/* skip "./" if implied */
+			if (argv == dot_argv && p->fts_pathlen >= 2)
+				path += 2;
+			c |= procfile(path);
+			break;
+		}
+	}
+	if (errno)
+		err(2, "fts_read");
+	fts_close(fts);
+	return c;
+}
+
+int
+procfile(char *fn)
+{
+	str_t ln;
+	file_t *f;
+	int t, z, nottext, overflow = 0;
+	unsigned long long c;
+
+	mcount = mlimit;
+
+	if (fn == NULL) {
+		fn = "(standard input)";
+		f = grep_fdopen(STDIN_FILENO);
+	} else {
+		f = grep_open(fn);
+	}
+	if (f == NULL) {
+		if (errno == EISDIR)
+			return 0;
+		file_err = 1;
+		if (!sflag)
+			warn("%s", fn);
+		return 0;
+	}
+
+	nottext = grep_bin_file(f);
+	if (nottext && binbehave == BIN_FILE_SKIP) {
+		grep_close(f);
+		return 0;
+	}
+
+	ln.file = fn;
+	if (labelname)
+		ln.file = (char *)labelname;
+	ln.line_no = 0;
+	ln.len = 0;
+	linesqueued = 0;
+	tail = 0;
+	ln.off = -1;
+
+	if (Bflag > 0)
+		initqueue();
+	for (c = 0;  c == 0 || !(lflag || qflag); ) {
+		if (mflag && mlimit == 0)
+			break;
+		ln.off += ln.len + 1;
+		if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
+			break;
+		if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
+			--ln.len;
+		ln.line_no++;
+
+		z = tail;
+
+		if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
+			enqueue(&ln);
+			linesqueued++;
+		}
+		if (ULLONG_MAX - c < (unsigned long long)t)
+			overflow = 1;
+		else
+			c += t;
+		if (mflag && mcount <= 0)
+			break;
+	}
+	if (Bflag > 0)
+		clearqueue();
+	grep_close(f);
+
+	if (cflag) {
+		if (!hflag)
+			printf("%s:", ln.file);
+		printf("%llu%s\n", c, overflow ? "+" : "");
+	}
+	if (lflag && c != 0)
+		printf("%s\n", fn);
+	if (Lflag && c == 0)
+		printf("%s\n", fn);
+	if (c && !cflag && !lflag && !Lflag &&
+	    binbehave == BIN_FILE_BIN && nottext && !qflag)
+		printf("Binary file %s matches\n", fn);
+
+	return overflow || c != 0;
+}
+
+
+/*
+ * Process an individual line in a file. Return non-zero if it matches.
+ */
+
+#define isword(x) (isalnum((unsigned char)x) || (x) == '_')
+
+static int
+procline(str_t *l, int nottext)
+{
+	regmatch_t	pmatch = { 0 };
+	int		c, i, r;
+	regoff_t	offset;
+
+	/* size_t will be converted to regoff_t. ssize_t is guaranteed to fit
+	 * into regoff_t */
+	if (l->len > SSIZE_MAX) {
+		errx(2, "Line is too big to process");
+	}
+
+	c = 0;
+	i = 0;
+	if (matchall) {
+		c = 1;
+		goto print;
+	}
+
+	for (i = 0; i < patterns; i++) {
+		offset = 0;
+redo:
+		if (fg_pattern[i].pattern) {
+			int flags = 0;
+			if (offset)
+				flags |= REG_NOTBOL;
+			r = grep_search(&fg_pattern[i], l->dat + offset,
+			    l->len - offset, &pmatch, flags);
+			pmatch.rm_so += offset;
+			pmatch.rm_eo += offset;
+		} else {
+			int flags = eflags;
+			if (offset)
+				flags |= REG_NOTBOL;
+			pmatch.rm_so = offset;
+			pmatch.rm_eo = l->len;
+			r = regexec(&r_pattern[i], l->dat, 1, &pmatch, flags);
+		}
+		if (r == 0 && xflag) {
+			if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
+				r = REG_NOMATCH;
+		}
+		if (r == 0) {
+			c = 1;
+			if (oflag && pmatch.rm_so != pmatch.rm_eo)
+				goto print;
+			break;
+		}
+	}
+	if (oflag)
+		return c;
+print:
+	if (vflag)
+		c = !c;
+
+	/* Count the matches if we have a match limit */
+	if (mflag)
+		mcount -= c;
+
+	if (c && binbehave == BIN_FILE_BIN && nottext)
+		return c; /* Binary file */
+
+	if ((tail > 0 || c) && !cflag && !qflag) {
+		if (c) {
+			if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
+			    (Aflag || Bflag))
+				printf("--\n");
+			first = 1;
+			tail = Aflag;
+			if (Bflag > 0)
+				printqueue();
+			linesqueued = 0;
+			printline(l, ':', oflag ? &pmatch : NULL);
+		} else {
+			printline(l, '-', oflag ? &pmatch : NULL);
+			tail--;
+		}
+	}
+	if (oflag && !matchall) {
+		offset = pmatch.rm_eo;
+		goto redo;
+	}
+	return c;
+}
+
+#ifndef SMALL
+void
+fgrepcomp(fastgrep_t *fg, const unsigned char *pattern)
+{
+	int i;
+
+	/* Initialize. */
+	fg->patternLen = strlen(pattern);
+	fg->bol = 0;
+	fg->eol = 0;
+	fg->wmatch = wflag;
+	fg->reversedSearch = 0;
+
+	/*
+	 * Make a copy and upper case it for later if in -i mode,
+	 * else just copy the pointer.
+	 */
+	if (iflag) {
+		fg->pattern = grep_malloc(fg->patternLen + 1);
+		for (i = 0; i < fg->patternLen; i++)
+			fg->pattern[i] = toupper(pattern[i]);
+		fg->pattern[fg->patternLen] = '\0';
+	} else
+		fg->pattern = (unsigned char *)pattern;	/* really const */
+
+	/* Preprocess pattern. */
+	for (i = 0; i <= UCHAR_MAX; i++)
+		fg->qsBc[i] = fg->patternLen;
+	for (i = 1; i < fg->patternLen; i++) {
+		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
+		/*
+		 * If case is ignored, make the jump apply to both upper and
+		 * lower cased characters.  As the pattern is stored in upper
+		 * case, apply the same to the lower case equivalents.
+		 */
+		if (iflag)
+			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
+	}
+}
+#endif
+
+/*
+ * Returns: -1 on failure, 0 on success
+ */
+int
+fastcomp(fastgrep_t *fg, const char *pattern)
+{
+#ifdef SMALL
+	return -1;
+#else
+	int i;
+	int bol = 0;
+	int eol = 0;
+	int shiftPatternLen;
+	int hasDot = 0;
+	int firstHalfDot = -1;
+	int firstLastHalfDot = -1;
+	int lastHalfDot = 0;
+
+	/* Initialize. */
+	fg->patternLen = strlen(pattern);
+	fg->bol = 0;
+	fg->eol = 0;
+	fg->wmatch = 0;
+	fg->reversedSearch = 0;
+
+	/* Remove end-of-line character ('$'). */
+	if (fg->patternLen > 0 && pattern[fg->patternLen - 1] == '$') {
+		eol++;
+		fg->eol = 1;
+		fg->patternLen--;
+	}
+
+	/* Remove beginning-of-line character ('^'). */
+	if (pattern[0] == '^') {
+		bol++;
+		fg->bol = 1;
+		fg->patternLen--;
+	}
+
+	/* Remove enclosing [[:<:]] and [[:>:]] (word match). */
+	if (wflag) {
+		/* basic re's use \( \), extended re's ( ) */
+		int extra = Eflag ? 1 : 2;
+		fg->patternLen -= 14 + 2 * extra;
+		fg->wmatch = 7 + extra;
+	} else if (fg->patternLen >= 14 &&
+	    strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 &&
+	    strncmp(pattern + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
+		fg->patternLen -= 14;
+		fg->wmatch = 7;
+	}
+
+	/*
+	 * Copy pattern minus '^' and '$' characters as well as word
+	 * match character classes at the beginning and ending of the
+	 * string respectively.
+	 */
+	fg->pattern = grep_malloc(fg->patternLen + 1);
+	memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen);
+	fg->pattern[fg->patternLen] = '\0';
+
+	/* Look for ways to cheat...er...avoid the full regex engine. */
+	for (i = 0; i < fg->patternLen; i++)
+	{
+		switch (fg->pattern[i]) {
+		case '.':
+			hasDot = i;
+			if (i < fg->patternLen / 2) {
+				if (firstHalfDot < 0)
+					/* Closest dot to the beginning */
+					firstHalfDot = i;
+			} else {
+				/* Closest dot to the end of the pattern. */
+				lastHalfDot = i;
+				if (firstLastHalfDot < 0)
+					firstLastHalfDot = i;
+			}
+			break;
+		case '(': case ')':
+		case '{': case '}':
+			/* Special in BRE if preceded by '\\' */
+		case '?':
+		case '+':
+		case '|':
+			/* Not special in BRE. */
+			if (!Eflag)
+				goto nonspecial;
+		case '\\':
+		case '*':
+		case '[': case ']':
+			/* Free memory and let others know this is empty. */
+			free(fg->pattern);
+			fg->pattern = NULL;
+			return (-1);
+		default:
+nonspecial:
+			if (iflag)
+				fg->pattern[i] = toupper(fg->pattern[i]);
+			break;
+		}
+	}
+
+	/*
+	 * Determine if a reverse search would be faster based on the placement
+	 * of the dots.
+	 */
+	if ((!(lflag || cflag || oflag)) && ((!(bol || eol)) &&
+	    ((lastHalfDot) && ((firstHalfDot < 0) ||
+	    ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
+		fg->reversedSearch = 1;
+		hasDot = fg->patternLen - (firstHalfDot < 0 ?
+		    firstLastHalfDot : firstHalfDot) - 1;
+		grep_revstr(fg->pattern, fg->patternLen);
+	}
+
+	/*
+	 * Normal Quick Search would require a shift based on the position the
+	 * next character after the comparison is within the pattern.  With
+	 * wildcards, the position of the last dot effects the maximum shift
+	 * distance.
+	 * The closer to the end the wild card is the slower the search.  A
+	 * reverse version of this algorithm would be useful for wildcards near
+	 * the end of the string.
+	 *
+	 * Examples:
+	 * Pattern	Max shift
+	 * -------	---------
+	 * this		5
+	 * .his		4
+	 * t.is		3
+	 * th.s		2
+	 * thi.		1
+	 */
+
+	/* Adjust the shift based on location of the last dot ('.'). */
+	shiftPatternLen = fg->patternLen - hasDot;
+
+	/* Preprocess pattern. */
+	for (i = 0; i <= UCHAR_MAX; i++)
+		fg->qsBc[i] = shiftPatternLen;
+	for (i = hasDot + 1; i < fg->patternLen; i++) {
+		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
+		/*
+		 * If case is ignored, make the jump apply to both upper and
+		 * lower cased characters.  As the pattern is stored in upper
+		 * case, apply the same to the lower case equivalents.
+		 */
+		if (iflag)
+			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
+	}
+
+	/*
+	 * Put pattern back to normal after pre-processing to allow for easy
+	 * comparisons later.
+	 */
+	if (fg->reversedSearch)
+		grep_revstr(fg->pattern, fg->patternLen);
+
+	return (0);
+#endif
+}
+
+/*
+ * Word boundaries using regular expressions are defined as the point
+ * of transition from a non-word char to a word char, or vice versa.
+ * This means that grep -w +a and grep -w a+ never match anything,
+ * because they lack a starting or ending transition, but grep -w a+b
+ * does match a line containing a+b.
+ */
+#define wmatch(d, l, s, e)	\
+	((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
+	  e > s && isword(d[s]) && isword(d[e-1]))
+
+static int
+grep_search(fastgrep_t *fg, char *data, size_t dataLen, regmatch_t *pmatch,
+    int flags)
+{
+#ifdef SMALL
+	return 0;
+#else
+	regoff_t j;
+	int rtrnVal = REG_NOMATCH;
+
+	pmatch->rm_so = -1;
+	pmatch->rm_eo = -1;
+
+	/* No point in going farther if we do not have enough data. */
+	if (dataLen < fg->patternLen)
+		return (rtrnVal);
+
+	/* Only try once at the beginning or ending of the line. */
+	if (fg->bol || fg->eol) {
+		if (fg->bol && (flags & REG_NOTBOL))
+			return 0;
+		/* Simple text comparison. */
+		/* Verify data is >= pattern length before searching on it. */
+		if (dataLen >= fg->patternLen) {
+			/* Determine where in data to start search at. */
+			if (fg->eol)
+				j = dataLen - fg->patternLen;
+			else
+				j = 0;
+			if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
+				if (grep_cmp(fg->pattern, data + j,
+				    fg->patternLen)) {
+					pmatch->rm_so = j;
+					pmatch->rm_eo = j + fg->patternLen;
+					if (!fg->wmatch || wmatch(data, dataLen,
+					    pmatch->rm_so, pmatch->rm_eo))
+						rtrnVal = 0;
+				}
+		}
+	} else if (fg->reversedSearch) {
+		/* Quick Search algorithm. */
+		j = dataLen;
+		do {
+			if (grep_cmp(fg->pattern, data + j - fg->patternLen,
+			    fg->patternLen)) {
+				pmatch->rm_so = j - fg->patternLen;
+				pmatch->rm_eo = j;
+				if (!fg->wmatch || wmatch(data, dataLen,
+				    pmatch->rm_so, pmatch->rm_eo)) {
+					rtrnVal = 0;
+					break;
+				}
+			}
+			/* Shift if within bounds, otherwise, we are done. */
+			if (j == fg->patternLen)
+				break;
+			j -= fg->qsBc[(unsigned char)data[j - fg->patternLen - 1]];
+		} while (j >= fg->patternLen);
+	} else {
+		/* Quick Search algorithm. */
+		j = 0;
+		do {
+			if (grep_cmp(fg->pattern, data + j, fg->patternLen)) {
+				pmatch->rm_so = j;
+				pmatch->rm_eo = j + fg->patternLen;
+				if (fg->patternLen == 0 || !fg->wmatch ||
+				    wmatch(data, dataLen, pmatch->rm_so,
+				    pmatch->rm_eo)) {
+					rtrnVal = 0;
+					break;
+				}
+			}
+
+			/* Shift if within bounds, otherwise, we are done. */
+			if (j + fg->patternLen == dataLen)
+				break;
+			else
+				j += fg->qsBc[(unsigned char)data[j + fg->patternLen]];
+		} while (j <= (dataLen - fg->patternLen));
+	}
+
+	return (rtrnVal);
+#endif
+}
+
+
+void *
+grep_malloc(size_t size)
+{
+	void	*ptr;
+
+	if ((ptr = malloc(size)) == NULL)
+		err(2, "malloc");
+	return ptr;
+}
+
+void *
+grep_calloc(size_t nmemb, size_t size)
+{
+	void	*ptr;
+
+	if ((ptr = calloc(nmemb, size)) == NULL)
+		err(2, "calloc");
+	return ptr;
+}
+
+void *
+grep_realloc(void *ptr, size_t size)
+{
+	if ((ptr = realloc(ptr, size)) == NULL)
+		err(2, "realloc");
+	return ptr;
+}
+
+void *
+grep_reallocarray(void *ptr, size_t nmemb, size_t size)
+{
+	if ((ptr = reallocarray(ptr, nmemb, size)) == NULL)
+		err(2, "reallocarray");
+	return ptr;
+}
+
+#ifndef SMALL
+/*
+ * Returns:	true on success, false on failure
+ */
+static bool
+grep_cmp(const char *pattern, const char *data, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len; i++) {
+		if (((pattern[i] == data[i]) || (!Fflag && pattern[i] == '.'))
+		    || (iflag && pattern[i] == toupper((unsigned char)data[i])))
+			continue;
+		return false;
+	}
+
+	return true;
+}
+
+static void
+grep_revstr(unsigned char *str, int len)
+{
+	int i;
+	char c;
+
+	for (i = 0; i < len / 2; i++) {
+		c = str[i];
+		str[i] = str[len - i - 1];
+		str[len - i - 1] = c;
+	}
+}
+#endif
+
+void
+printline(str_t *line, int sep, regmatch_t *pmatch)
+{
+	int n;
+
+	n = 0;
+	if (!hflag) {
+		fputs(line->file, stdout);
+		++n;
+	}
+	if (nflag) {
+		if (n)
+			putchar(sep);
+		printf("%lld", line->line_no);
+		++n;
+	}
+	if (bflag) {
+		if (n)
+			putchar(sep);
+		printf("%lld", (long long)line->off +
+		    (pmatch ? pmatch->rm_so : 0));
+		++n;
+	}
+	if (n)
+		putchar(sep);
+	if (pmatch)
+		fwrite(line->dat + pmatch->rm_so,
+		    pmatch->rm_eo - pmatch->rm_so, 1, stdout);
+	else
+		fwrite(line->dat, line->len, 1, stdout);
+	putchar('\n');
+}