From 48162c4ee3fb013c09cecea52c6403a33526f172 Mon Sep 17 00:00:00 2001
From: Rob Landley
Date: Mon, 6 May 2019 13:16:24 -0500
Subject: Greatly simplify and speed up regexec0() using REG_STARTEND.

This is a 15 year old freebsd extension (presumably thus also available on
MacOS) that glibc adopted in 2004, uClibc adopted in 2005, and bionic
supports.

The only thing that DOESN'T support it is musl, once again because its
maintainer explicitly decided not to
(https://www.openwall.com/lists/musl/2013/01/15/26), so add an #ifdef to let
musl stay uniquely broken. (It'll stop at first NUL, everything else can
match NULs).

Finally fixes "s/x/y/g on a megabyte line of x's takes forever" issue.
---
 lib/lib.c         | 37 +++++++------------------------------
 lib/portability.h |  3 +++
 tests/sed.test    |  6 ++++++
 3 files changed, 16 insertions(+), 30 deletions(-)

diff --git a/lib/lib.c b/lib/lib.c
index 6df9566c..9df04906 100644
--- a/lib/lib.c
+++ b/lib/lib.c
@@ -1317,39 +1317,16 @@ int readlink0(char *path, char *buf, int len)
   return readlinkat0(AT_FDCWD, path, buf, len);
 }
 
-// Do regex matching handling embedded NUL bytes in string (hence extra len
-// argument). Note that neither the pattern nor the match can currently include
-// NUL bytes (even with wildcards) and string must be null terminated at
-// string[len]. But this can find a match after the first NUL.
+// Do regex matching with len argument to handle embedded NUL bytes in string
 int regexec0(regex_t *preg, char *string, long len, int nmatch,
-  regmatch_t pmatch[], int eflags)
+  regmatch_t *pmatch, int eflags)
 {
-  char *s = string;
+  regmatch_t backup;
 
-  for (;;) {
-    int rc = regexec(preg, s, nmatch, pmatch, eflags);
-
-    // check for match
-    if (!rc) {
-      for (rc = 0; rc<nmatch && pmatch[rc].rm_so!=-1; rc++) {
-        pmatch[rc].rm_so += s-string;
-        pmatch[rc].rm_eo += s-string;
-      }
-
-      return 0;
-    }
-
-    // advance past NUL bytes and try again
-    while (len && *s) {
-      s++;
-      len--;
-    }
-    while (len && !*s) {
-      s++;
-      len--;
-    }
-    if (!len) return rc;
-  }
+  if (!nmatch) pmatch = &backup;
+  pmatch->rm_so = 0;
+  pmatch->rm_eo = len;
+  return regexec(preg, string, nmatch, pmatch, eflags|REG_STARTEND);
 }
 
 // Return user name or string representation of number, returned buffer
diff --git a/lib/portability.h b/lib/portability.h
index 96458266..ccb1b1c5 100644
--- a/lib/portability.h
+++ b/lib/portability.h
@@ -6,6 +6,9 @@
 
 // For musl
 #define _ALL_SOURCE
+#ifndef REG_STARTEND
+#define REG_STARTEND 0
+#endif
 
 #ifdef __APPLE__
 // macOS 10.13 doesn't have the POSIX 2008 direct access to timespec in
diff --git a/tests/sed.test b/tests/sed.test
index 6b27fff8..e5ec11bd 100755
--- a/tests/sed.test
+++ b/tests/sed.test
@@ -176,4 +176,10 @@ testing '\n with empty capture' \
 testing '\n too high' \
   'sed -E "s/(.*)/\2/p" 2>/dev/null || echo OK' "OK\n" "" "foo"
 
+# Performance test
+X=x; Y=20; while [ $Y -gt 0 ]; do X=$X$X; Y=$(($Y-1)); done
+testing 'megabyte s/x/y/g (5 sec timeout)' "timeout 5 sed 's/x/y/g' | sha1sum" \
+  '138c1fa7c3f64186203b0192fb4abdb33cb4e98a -\n' '' "$X\n"
+unset X Y
+
 # -i with $ last line test
-- 
cgit v1.2.3