From d4a7728dc3b37e2956034f18fc26c04bc0aa2b0e Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Mon, 18 Jan 2010 04:57:17 +0100 Subject: diff: rewrite. much smaller and more correct function old new delta diffreg 1815 3156 +1341 diff_main 860 1194 +334 read_token - 161 +161 skip_dir - 44 +44 seek_ft - 31 +31 fetch 362 392 +30 line_compar - 21 +21 add_to_dirlist 65 61 -4 print_only 16 - -16 skipline 51 - -51 newcand 85 - -85 asciifile 89 - -89 print_status 178 73 -105 make_temp 148 - -148 files_differ 168 - -168 get_recursive_dirlist 182 - -182 sort 226 - -226 prepare 283 - -283 change 283 - -283 do_diff 416 - -416 dump_unified_vec 457 - -457 check 1512 595 -917 ------------------------------------------------------------------------------ (add/remove: 4/12 grow/shrink: 3/3 up/down: 1962/-3430) Total: -1468 bytes Signed-off-by: Matheus Izvekov Signed-off-by: Denys Vlasenko --- editors/diff.c | 1840 ++++++++++++++++++++++---------------------------------- 1 file changed, 709 insertions(+), 1131 deletions(-) (limited to 'editors/diff.c') diff --git a/editors/diff.c b/editors/diff.c index 745ef0a33..af6917a03 100644 --- a/editors/diff.c +++ b/editors/diff.c @@ -2,6 +2,7 @@ /* * Mini diff implementation for busybox, adapted from OpenBSD diff. * + * Copyright (C) 2010 by Matheus Izvekov * Copyright (C) 2006 by Robert Sullivan * Copyright (c) 2003 Todd C. Miller * @@ -12,386 +13,310 @@ * Licensed under GPLv2 or later, see file LICENSE in this tarball for details. */ -#include "libbb.h" - -#define dbg_error_msg(...) ((void)0) -//#define dbg_error_msg(...) bb_error_msg(__VA_ARGS__) - -// #define FSIZE_MAX 32768 - -/* NOINLINEs added to prevent gcc from merging too much into diffreg() - * (it bites more than it can (efficiently) chew). */ - -/* - * Output flags - */ -enum { - /* Print a header/footer between files */ - /* D_HEADER = 1, - unused */ - /* Treat file as empty (/dev/null) */ - D_EMPTY1 = 2 * ENABLE_FEATURE_DIFF_DIR, - D_EMPTY2 = 4 * ENABLE_FEATURE_DIFF_DIR, -}; - /* - * Status values for print_status() and diffreg() return values - * Guide: - * D_SAME - files are the same - * D_DIFFER - files differ - * D_BINARY - binary files differ - * D_COMMON - subdirectory common to both dirs - * D_ONLY - file only exists in one dir - * D_ISDIR1 - path1 a dir, path2 a file - * D_ISDIR2 - path1 a file, path2 a dir - * D_ERROR - error occurred - * D_SKIPPED1 - skipped path1 as it is a special file - * D_SKIPPED2 - skipped path2 as it is a special file + * The following code uses an algorithm due to Harold Stone, + * which finds a pair of longest identical subsequences in + * the two files. + * + * The major goal is to generate the match vector J. + * J[i] is the index of the line in file1 corresponding + * to line i in file0. J[i] = 0 if there is no + * such line in file1. + * + * Lines are hashed so as to work in core. All potential + * matches are located by sorting the lines of each file + * on the hash (called "value"). In particular, this + * collects the equivalence classes in file1 together. + * Subroutine equiv replaces the value of each line in + * file0 by the index of the first element of its + * matching equivalence in (the reordered) file1. + * To save space equiv squeezes file1 into a single + * array member in which the equivalence classes + * are simply concatenated, except that their first + * members are flagged by changing sign. + * + * Next the indices that point into member are unsorted into + * array class according to the original order of file0. + * + * The cleverness lies in routine stone. This marches + * through the lines of file0, developing a vector klist + * of "k-candidates". At step i a k-candidate is a matched + * pair of lines x,y (x in file0, y in file1) such that + * there is a common subsequence of length k + * between the first i lines of file0 and the first y + * lines of file1, but there is no such subsequence for + * any smaller y. x is the earliest possible mate to y + * that occurs in such a subsequence. + * + * Whenever any of the members of the equivalence class of + * lines in file1 matable to a line in file0 has serial number + * less than the y of some k-candidate, that k-candidate + * with the smallest such y is replaced. The new + * k-candidate is chained (via pred) to the current + * k-1 candidate so that the actual subsequence can + * be recovered. When a member has serial number greater + * that the y of all k-candidates, the klist is extended. + * At the end, the longest subsequence is pulled out + * and placed in the array J by unravel + * + * With J in hand, the matches there recorded are + * checked against reality to assure that no spurious + * matches have crept in due to hashing. If they have, + * they are broken, and "jackpot" is recorded--a harmless + * matter except that a true match for a spuriously + * mated line may now be unnecessarily reported as a change. + * + * Much of the complexity of the program comes simply + * from trying to minimize core utilization and + * maximize the range of doable problems by dynamically + * allocating what is needed and reusing what is not. + * The core requirements for problems larger than somewhat + * are (in words) 2*length(file0) + length(file1) + + * 3*(number of k-candidates installed), typically about + * 6n words for files of length n. */ -#define D_SAME 0 -#define D_DIFFER (1 << 0) -#define D_BINARY (1 << 1) -#define D_COMMON (1 << 2) -/*#define D_ONLY (1 << 3) - unused */ -#define D_ISDIR1 (1 << 4) -#define D_ISDIR2 (1 << 5) -#define D_ERROR (1 << 6) -#define D_SKIPPED1 (1 << 7) -#define D_SKIPPED2 (1 << 8) - -/* Command line options */ -#define FLAG_a (1 << 0) -#define FLAG_b (1 << 1) -#define FLAG_d (1 << 2) -#define FLAG_i (1 << 3) -#define FLAG_L (1 << 4) -#define FLAG_N (1 << 5) -#define FLAG_q (1 << 6) -#define FLAG_r (1 << 7) -#define FLAG_s (1 << 8) -#define FLAG_S (1 << 9) -#define FLAG_t (1 << 10) -#define FLAG_T (1 << 11) -#define FLAG_U (1 << 12) -#define FLAG_w (1 << 13) +#include "libbb.h" -struct cand { - int x; - int y; - int pred; -}; +#if 0 +//#define dbg_error_msg(...) bb_error_msg(__VA_ARGS__) +#else +#define dbg_error_msg(...) ((void)0) +#endif -struct line { - int serial; - int value; +enum { /* print_status() and diffreg() return values */ + STATUS_SAME, /* files are the same */ + STATUS_DIFFER, /* files differ */ + STATUS_BINARY, /* binary files differ */ }; -/* - * The following struct is used to record change information - * doing a "context" or "unified" diff. (see routine "change" to - * understand the highly mnemonic field names) - */ -struct context_vec { - int a; /* start line in old file */ - int b; /* end line in old file */ - int c; /* start line in new file */ - int d; /* end line in new file */ +enum { /* Commandline flags */ + FLAG_a, + FLAG_b, + FLAG_d, + FLAG_i, /* unused */ + FLAG_L, /* unused */ + FLAG_N, + FLAG_q, + FLAG_r, + FLAG_s, + FLAG_S, /* unused */ + FLAG_t, + FLAG_T, + FLAG_U, /* unused */ + FLAG_w, }; +#define FLAG(x) (1 << FLAG_##x) - -#define g_read_buf bb_common_bufsiz1 +/* We cache file position to avoid excessive seeking */ +typedef struct FILE_and_pos_t { + FILE *ft_fp; + off_t ft_pos; +} FILE_and_pos_t; struct globals { - bool anychange; smallint exit_status; int opt_U_context; - int context_idx; - IF_FEATURE_DIFF_DIR(int dl_count;) - IF_FEATURE_DIFF_DIR(char **dl;) - char *opt_S_start; - const char *label1; - const char *label2; - int *J; /* will be overlaid on class */ - int clen; - int pref, suff; /* length of prefix and suffix */ - int nlen[2]; - int slen[2]; - int clistlen; /* the length of clist */ - struct cand *clist; /* merely a free storage pot for candidates */ - long *ixnew; /* will be overlaid on nfile[1] */ - long *ixold; /* will be overlaid on klist */ - struct line *nfile[2]; - struct line *sfile[2]; /* shortened by pruning common prefix/suffix */ - struct context_vec *context_vector; - char *tempname1, *tempname2; - struct stat stb1, stb2; + char *label[2]; + struct stat stb[2]; }; #define G (*ptr_to_globals) -#define anychange (G.anychange ) #define exit_status (G.exit_status ) #define opt_U_context (G.opt_U_context ) -#define context_idx (G.context_idx ) -#define dl_count (G.dl_count ) -#define dl (G.dl ) -#define opt_S_start (G.opt_S_start ) -#define label1 (G.label1 ) -#define label2 (G.label2 ) -#define J (G.J ) -#define clen (G.clen ) -#define pref (G.pref ) -#define suff (G.suff ) -#define nlen (G.nlen ) -#define slen (G.slen ) -#define clistlen (G.clistlen ) -#define clist (G.clist ) -#define ixnew (G.ixnew ) -#define ixold (G.ixold ) -#define nfile (G.nfile ) -#define sfile (G.sfile ) -#define context_vector (G.context_vector ) -#define stb1 (G.stb1 ) -#define stb2 (G.stb2 ) -#define tempname1 (G.tempname1 ) -#define tempname2 (G.tempname2 ) +#define label (G.label ) +#define stb (G.stb ) #define INIT_G() do { \ SET_PTR_TO_GLOBALS(xzalloc(sizeof(G))); \ opt_U_context = 3; \ - context_vector = xrealloc_vector(context_vector, 6, 0); \ } while (0) +typedef int token_t; -#if ENABLE_FEATURE_DIFF_DIR -static void print_only(const char *path, const char *entry) -{ - printf("Only in %s: %s\n", path, entry); -} -#endif +enum { + /* Public */ + TOK_EMPTY = 1 << 9, /* Line fully processed, you can proceed to the next */ + TOK_EOF = 1 << 10, /* File ended */ + /* Private (Only to be used by read_token() */ + TOK_EOL = 1 << 11, /* we saw EOL (sticky) */ + TOK_SPACE = 1 << 12, /* used -b code, means we are skipping spaces */ + SHIFT_EOF = (sizeof(token_t)*8 - 8) - 1, + CHAR_MASK = 0x1ff, /* 8th bit is used to distinguish EOF from 0xff */ +}; +/* Restores full EOF from one 8th bit: */ +//#define TOK2CHAR(t) (((t) << SHIFT_EOF) >> SHIFT_EOF) +/* We don't really need the above, we only need to have EOF != any_real_char: */ +#define TOK2CHAR(t) ((t) & CHAR_MASK) -static void print_status(int val, char *_path1, char *_path2) +static void seek_ft(FILE_and_pos_t *ft, off_t pos) { - /*const char *const _entry = entry ? entry : "";*/ - /*char *const _path1 = entry ? concat_path_file(path1, _entry) : path1;*/ - /*char *const _path2 = entry ? concat_path_file(path2, _entry) : path2;*/ - - switch (val) { -/* case D_ONLY: - print_only(path1, entry); - break; -*/ - case D_COMMON: - printf("Common subdirectories: %s and %s\n", _path1, _path2); - break; - case D_BINARY: - printf("Binary files %s and %s differ\n", _path1, _path2); - break; - case D_DIFFER: - if (option_mask32 & FLAG_q) - printf("Files %s and %s differ\n", _path1, _path2); - break; - case D_SAME: - if (option_mask32 & FLAG_s) - printf("Files %s and %s are identical\n", _path1, _path2); - break; - case D_ISDIR1: - printf("File %s is a %s while file %s is a %s\n", - _path1, "directory", _path2, "regular file"); - break; - case D_ISDIR2: - printf("File %s is a %s while file %s is a %s\n", - _path1, "regular file", _path2, "directory"); - break; - case D_SKIPPED1: - printf("File %s is not a regular file or directory and was skipped\n", - _path1); - break; - case D_SKIPPED2: - printf("File %s is not a regular file or directory and was skipped\n", - _path2); - break; - } -/* - if (entry) { - free(_path1); - free(_path2); + if (ft->ft_pos != pos) { + ft->ft_pos = pos; + fseeko(ft->ft_fp, pos, SEEK_SET); } -*/ } - -/* Read line, return its nonzero hash. Return 0 if EOF. - * - * Hash function taken from Robert Sedgewick, Algorithms in C, 3d ed., p 578. +/* Reads tokens from given fp, handling -b and -w flags + * The user must reset tok every line start */ -static ALWAYS_INLINE int fiddle_sum(int sum, int t) +static int read_token(FILE_and_pos_t *ft, token_t tok) { - return sum * 127 + t; -} -static int readhash(FILE *fp) -{ - int i, t; - int sum; - - sum = 1; - i = 0; - if (!(option_mask32 & (FLAG_b | FLAG_w))) { - while ((t = getc(fp)) != '\n') { - if (t == EOF) { - if (i == 0) - return 0; - break; - } - sum = fiddle_sum(sum, t); - i = 1; - } - } else { - int space = 0; + tok |= TOK_EMPTY; + while (!(tok & TOK_EOL)) { + bool is_space; + int t; + + t = fgetc(ft->ft_fp); + if (t != EOF) + ft->ft_pos++; + is_space = (t == EOF || isspace(t)); + + /* If t == EOF (-1), set both TOK_EOF and TOK_EOL */ + tok |= (t & (TOK_EOF + TOK_EOL)); + /* Only EOL? */ + if (t == '\n') + tok |= TOK_EOL; + + if ((option_mask32 & FLAG(w)) && is_space) + continue; - while (1) { - t = getc(fp); - switch (t) { - case '\t': - case '\r': - case '\v': - case '\f': - case ' ': - space = 1; - continue; - default: - if (space && !(option_mask32 & FLAG_w)) { - i = 1; - space = 0; - } - sum = fiddle_sum(sum, t); - i = 1; - continue; - case EOF: - if (i == 0) - return 0; - /* FALLTHROUGH */ - case '\n': - break; + /* Trim char value to low 9 bits */ + t &= CHAR_MASK; + + if (option_mask32 & FLAG(b)) { + /* Was prev char whitespace? */ + if (tok & TOK_SPACE) { /* yes */ + if (is_space) /* this one too, ignore it */ + continue; + tok &= ~TOK_SPACE; + } else if (is_space) { + /* 1st whitespace char. + * Set TOK_SPACE and replace char by ' ' */ + t = TOK_SPACE + ' '; } - break; } + /* Clear EMPTY */ + tok &= ~(TOK_EMPTY + CHAR_MASK); + /* Assign char value (low 9 bits) and maybe set TOK_SPACE */ + tok |= t; + break; } - /* - * There is a remote possibility that we end up with a zero sum. - * Zero is used as an EOF marker, so return 1 instead. - */ - return (sum == 0 ? 1 : sum); +#if 0 + bb_error_msg("fp:%p tok:%x '%c'%s%s%s%s", fp, tok, tok & 0xff + , tok & TOK_EOF ? " EOF" : "" + , tok & TOK_EOL ? " EOL" : "" + , tok & TOK_EMPTY ? " EMPTY" : "" + , tok & TOK_SPACE ? " SPACE" : "" + ); +#endif + return tok; } +struct cand { + int x; + int y; + int pred; +}; -/* Our diff implementation is using seek. - * When we meet non-seekable file, we must make a temp copy. - */ -static char *make_temp(FILE *f, struct stat *sb) +static int search(const int *c, int k, int y, const struct cand *list) { - char *name; - int fd; - - if (S_ISREG(sb->st_mode) || S_ISBLK(sb->st_mode)) - return NULL; - name = xstrdup("/tmp/difXXXXXX"); - fd = mkstemp(name); - if (fd < 0) - bb_perror_msg_and_die("mkstemp"); - if (bb_copyfd_eof(fileno(f), fd) < 0) { - clean_up: - unlink(name); - xfunc_die(); /* error message is printed by bb_copyfd_eof */ - } - fstat(fd, sb); - close(fd); - if (freopen(name, "r+", f) == NULL) { - bb_perror_msg("freopen"); - goto clean_up; - } - return name; -} - + if (list[c[k]].y < y) /* quick look for typical case */ + return k + 1; -/* - * Check to see if the given files differ. - * Returns 0 if they are the same, 1 if different, and -1 on error. - */ -static NOINLINE int files_differ(FILE *f1, FILE *f2) -{ - size_t i, j; - - /* Prevent making copies for "/dev/null" (too common) */ - /* Deal with input from pipes etc */ - tempname1 = make_temp(f1, &stb1); - tempname2 = make_temp(f2, &stb2); - if (stb1.st_size != stb2.st_size) { - return 1; - } - while (1) { - i = fread(g_read_buf, 1, COMMON_BUFSIZE/2, f1); - j = fread(g_read_buf + COMMON_BUFSIZE/2, 1, COMMON_BUFSIZE/2, f2); - if (i != j) - return 1; - if (i == 0) - return (ferror(f1) || ferror(f2)) ? -1 : 0; - if (memcmp(g_read_buf, - g_read_buf + COMMON_BUFSIZE/2, i) != 0) - return 1; + for (int i = 0, j = k + 1;;) { + const int l = (i + j) >> 1; + if (l > i) { + const int t = list[c[l]].y; + if (t > y) + j = l; + else if (t < y) + i = l; + else + return l; + } else + return l + 1; } } - -static void prepare(int i, FILE *fp /*, off_t filesize*/) +static unsigned isqrt(unsigned n) { - struct line *p; - int h; - size_t j, sz; - - rewind(fp); - - /*sz = (filesize <= FSIZE_MAX ? filesize : FSIZE_MAX) / 25;*/ - /*if (sz < 100)*/ - sz = 100; - - p = xmalloc((sz + 3) * sizeof(p[0])); - j = 0; - while ((h = readhash(fp)) != 0) { /* while not EOF */ - if (j == sz) { - sz = sz * 3 / 2; - p = xrealloc(p, (sz + 3) * sizeof(p[0])); - } - p[++j].value = h; + unsigned x = 1; + while (1) { + const unsigned y = x; + x = ((n / x) + x) >> 1; + if (x <= (y + 1) && x >= (y - 1)) + return x; } - nlen[i] = j; - nfile[i] = p; } - -static void prune(void) +static void stone(const int *a, int n, const int *b, int *J, int pref) { - int i, j; - - for (pref = 0; pref < nlen[0] && pref < nlen[1] && - nfile[0][pref + 1].value == nfile[1][pref + 1].value; pref++) - continue; - for (suff = 0; suff < nlen[0] - pref && suff < nlen[1] - pref && - nfile[0][nlen[0] - suff].value == nfile[1][nlen[1] - suff].value; - suff++) - continue; - for (j = 0; j < 2; j++) { - sfile[j] = nfile[j] + pref; - slen[j] = nlen[j] - pref - suff; - for (i = 0; i <= slen[j]; i++) - sfile[j][i].serial = i; + const unsigned isq = isqrt(n); + const unsigned bound = + (option_mask32 & FLAG(d)) ? UINT_MAX : MAX(256, isq); + int clen = 1; + int clistlen = 100; + int k = 0; + struct cand *clist = xzalloc(clistlen * sizeof(clist[0])); + int *klist = xzalloc((n + 2) * sizeof(klist[0])); + /*clist[0] = (struct cand){0}; - xzalloc did it */ + /*klist[0] = 0; */ + + for (struct cand cand = {1}; cand.x <= n; cand.x++) { + int j = a[cand.x], oldl = 0; + unsigned numtries = 0; + if (j == 0) + continue; + cand.y = -b[j]; + cand.pred = klist[0]; + do { + int l, tc; + if (cand.y <= clist[cand.pred].y) + continue; + l = search(klist, k, cand.y, clist); + if (l != oldl + 1) + cand.pred = klist[l - 1]; + if (l <= k && clist[klist[l]].y <= cand.y) + continue; + if (clen == clistlen) { + clistlen = clistlen * 11 / 10; + clist = xrealloc(clist, clistlen * sizeof(clist[0])); + } + clist[clen] = cand; + tc = klist[l]; + klist[l] = clen++; + if (l <= k) { + cand.pred = tc; + oldl = l; + numtries++; + } else { + k++; + break; + } + } while ((cand.y = b[++j]) > 0 && numtries < bound); } + /* Unravel */ + for (struct cand *q = clist + klist[k]; q->y; q = clist + q->pred) + J[q->x + pref] = q->y + pref; + free(klist); + free(clist); } +struct line { + /* 'serial' is not used in the begining, so we reuse it + * to store line offsets, thus reducing memory pressure + */ + union { + unsigned serial; + off_t offset; + }; + unsigned value; +}; static void equiv(struct line *a, int n, struct line *b, int m, int *c) { - int i, j; + int i = 1, j = 1; - i = j = 1; while (i <= n && j <= m) { if (a[i].value < b[j].value) a[i++].value = 0; @@ -414,286 +339,28 @@ static void equiv(struct line *a, int n, struct line *b, int m, int *c) c[j] = -1; } - -static int isqrt(int n) -{ - int y, x; - - if (n == 0) - return 0; - x = 1; - do { - y = x; - x = n / x; - x += y; - x /= 2; - } while ((x - y) > 1 || (x - y) < -1); - - return x; -} - - -static int newcand(int x, int y, int pred) +static void unsort(const struct line *f, int l, int *b) { - struct cand *q; - - if (clen == clistlen) { - clistlen = clistlen * 11 / 10; - clist = xrealloc(clist, clistlen * sizeof(struct cand)); - } - q = clist + clen; - q->x = x; - q->y = y; - q->pred = pred; - return clen++; -} - - -static int search(int *c, int k, int y) -{ - int i, j, l, t; - - if (clist[c[k]].y < y) /* quick look for typical case */ - return k + 1; - i = 0; - j = k + 1; - while (1) { - l = i + j; - if ((l >>= 1) <= i) - break; - t = clist[c[l]].y; - if (t > y) - j = l; - else if (t < y) - i = l; - else - return l; - } - return l + 1; -} - - -static int stone(int *a, int n, int *b, int *c) -{ - int i, k, y, j, l; - int oldc, tc, oldl; - unsigned numtries; - int isq = isqrt(n); -#if ENABLE_FEATURE_DIFF_MINIMAL - const unsigned bound = - (option_mask32 & FLAG_d) ? UINT_MAX : MAX(256, isq); -#else - const unsigned bound = MAX(256, isq); -#endif - - k = 0; - c[0] = newcand(0, 0, 0); - for (i = 1; i <= n; i++) { - j = a[i]; - if (j == 0) - continue; - y = -b[j]; - oldl = 0; - oldc = c[0]; - numtries = 0; - do { - if (y <= clist[oldc].y) - continue; - l = search(c, k, y); - if (l != oldl + 1) - oldc = c[l - 1]; - if (l <= k) { - if (clist[c[l]].y <= y) - continue; - tc = c[l]; - c[l] = newcand(i, y, oldc); - oldc = tc; - oldl = l; - numtries++; - } else { - c[l] = newcand(i, y, oldc); - k++; - break; - } - } while ((y = b[++j]) > 0 && numtries < bound); - } - return k; -} - - -static void unravel(int p) -{ - struct cand *q; - int i; - - for (i = 0; i <= nlen[0]; i++) - J[i] = i <= pref ? i : i > nlen[0] - suff ? i + nlen[1] - nlen[0] : 0; - for (q = clist + p; q->y != 0; q = clist + q->pred) - J[q->x + pref] = q->y + pref; -} - - -static void unsort(struct line *f, int l, int *b) -{ - int *a, i; - - a = xmalloc((l + 1) * sizeof(int)); - for (i = 1; i <= l; i++) + int *a = xmalloc((l + 1) * sizeof(a[0])); + for (int i = 1; i <= l; i++) a[f[i].serial] = f[i].value; - for (i = 1; i <= l; i++) + for (int i = 1; i <= l; i++) b[i] = a[i]; free(a); } - -static int skipline(FILE *f) -{ - int i, c; - - for (i = 1; (c = getc(f)) != '\n' && c != EOF; i++) - continue; - return i; -} - - -/* - * Check does double duty: - * 1. ferret out any fortuitous correspondences due - * to confounding by hashing (which result in "jackpot") - * 2. collect random access indexes to the two files - */ -static NOINLINE void check(FILE *f1, FILE *f2) -{ - int i, j, jackpot, c, d; - long ctold, ctnew; - - rewind(f1); - rewind(f2); - j = 1; - ixold[0] = ixnew[0] = 0; - jackpot = 0; - ctold = ctnew = 0; - for (i = 1; i <= nlen[0]; i++) { - if (J[i] == 0) { - ixold[i] = ctold += skipline(f1); - continue; - } - while (j < J[i]) { - ixnew[j] = ctnew += skipline(f2); - j++; - } - if (option_mask32 & (FLAG_b | FLAG_w | FLAG_i)) { - while (1) { - c = getc(f1); - d = getc(f2); - /* - * GNU diff ignores a missing newline - * in one file if bflag || wflag. - */ - if ((option_mask32 & (FLAG_b | FLAG_w)) - && ((c == EOF && d == '\n') || (c == '\n' && d == EOF)) - ) { - break; - } - ctold++; - ctnew++; - if ((option_mask32 & FLAG_b) && isspace(c) && isspace(d)) { - do { - if (c == '\n') - break; - ctold++; - c = getc(f1); - } while (isspace(c)); - do { - if (d == '\n') - break; - ctnew++; - d = getc(f2); - } while (isspace(d)); - } else if (option_mask32 & FLAG_w) { - while (isspace(c) && c != '\n') { - c = getc(f1); - ctold++; - } - while (isspace(d) && d != '\n') { - d = getc(f2); - ctnew++; - } - } - if (c != d) { - jackpot++; - J[i] = 0; - if (c != '\n' && c != EOF) - ctold += skipline(f1); - if (d != '\n' && c != EOF) - ctnew += skipline(f2); - break; - } - if (c == '\n' || c == EOF) - break; - } - } else { - while (1) { - ctold++; - ctnew++; - c = getc(f1); - d = getc(f2); - if (c != d) { - J[i] = 0; - if (c != '\n' && c != EOF) - ctold += skipline(f1); -/* was buggy? "if (d != '\n' && c != EOF)" */ - if (d != '\n' && d != EOF) - ctnew += skipline(f2); - break; - } - if (c == '\n' || c == EOF) - break; - } - } - ixold[i] = ctold; - ixnew[j] = ctnew; - j++; - } - for (; j <= nlen[1]; j++) - ixnew[j] = ctnew += skipline(f2); -} - - -/* shellsort CACM #201 */ -static void sort(struct line *a, int n) +static int line_compar(const void *a, const void *b) { - struct line *ai, *aim, w; - int j, m = 0, k; - - if (n == 0) - return; - for (j = 1; j <= n; j *= 2) - m = 2 * j - 1; - for (m /= 2; m != 0; m /= 2) { - k = n - m; - for (j = 1; j <= k; j++) { - for (ai = &a[j]; ai > a; ai -= m) { - aim = &ai[m]; - if (aim < ai) - break; /* wraparound */ - if (aim->value > ai[0].value - || (aim->value == ai[0].value && aim->serial > ai[0].serial) - ) { - break; - } - w.value = ai[0].value; - ai[0].value = aim->value; - aim->value = w.value; - w.serial = ai[0].serial; - ai[0].serial = aim->serial; - aim->serial = w.serial; - } - } - } +#define l0 ((const struct line*)a) +#define l1 ((const struct line*)b) + int r = l0->value - l1->value; + if (r) + return r; + return l0->serial - l1->serial; +#undef l0 +#undef l1 } - static void uni_range(int a, int b) { if (a < b) @@ -704,33 +371,23 @@ static void uni_range(int a, int b) printf("%d,0", b); } - -static void fetch(long *f, int a, int b, FILE *lb, int ch) +static void fetch(FILE_and_pos_t *ft, const off_t *ix, int a, int b, int ch) { - int i, j, c, lastc, col, nc; - - if (a > b) - return; - for (i = a; i <= b; i++) { - fseek(lb, f[i - 1], SEEK_SET); - nc = f[i] - f[i - 1]; - if (ch != '\0') { - putchar(ch); - if (option_mask32 & FLAG_T) - putchar('\t'); - } - col = 0; - for (j = 0, lastc = '\0'; j < nc; j++, lastc = c) { - c = getc(lb); + for (int i = a; i <= b; i++) { + seek_ft(ft, ix[i - 1]); + putchar(ch); + if (option_mask32 & FLAG(T)) + putchar('\t'); + for (int j = 0, col = 0; j < ix[i] - ix[i - 1]; j++) { + int c = fgetc(ft->ft_fp); if (c == EOF) { printf("\n\\ No newline at end of file\n"); return; } - if (c == '\t' && (option_mask32 & FLAG_t)) { - do { - putchar(' '); - } while (++col & 7); - } else { + ft->ft_pos++; + if (c == '\t' && (option_mask32 & FLAG(t))) + do putchar(' '); while (++col & 7); + else { putchar(c); col++; } @@ -738,540 +395,477 @@ static void fetch(long *f, int a, int b, FILE *lb, int ch) } } - -#if ENABLE_FEATURE_DIFF_BINARY -static int asciifile(FILE *f) +/* Creates the match vector J, where J[i] is the index + * of the line in the new file corresponding to the line i + * in the old file. Lines start at 1 instead of 0, that value + * being used instead to denote no corresponding line. + * This vector is dynamically allocated and must be freed by the caller. + * + * * fp is an input parameter, where fp[0] and fp[1] are the open + * old file and new file respectively. + * * nlen is an output variable, where nlen[0] and nlen[1] + * gets the number of lines in the old and new file respectively. + * * ix is an output variable, where ix[0] and ix[1] gets + * assigned dynamically allocated vectors of the offsets of the lines + * of the old and new file respectively. These must be freed by the caller. + */ +static int *create_J(FILE_and_pos_t ft[2], int nlen[2], off_t *ix[2]) { - int i, cnt; - - if (option_mask32 & FLAG_a) - return 1; - rewind(f); - cnt = fread(g_read_buf, 1, COMMON_BUFSIZE, f); - for (i = 0; i < cnt; i++) { - if (!isprint_asciionly(g_read_buf[i]) - && !isspace(g_read_buf[i]) - ) { - return 0; + int *J, slen[2], *class, *member; + struct line *nfile[2], *sfile[2]; + int pref = 0, suff = 0; + + /* Lines of both files are hashed, and in the process + * their offsets are stored in the array ix[fileno] + * where fileno == 0 points to the old file, and + * fileno == 1 points to the new one. + */ + for (int i = 0; i < 2; i++) { + unsigned hash; + token_t tok; + size_t sz = 100; + nfile[i] = xmalloc((sz + 3) * sizeof(nfile[i][0])); + seek_ft(&ft[i], 0); + + nlen[i] = 0; + /* We could zalloc nfile, but then zalloc starts showing in gprof at ~1% */ + nfile[i][0].offset = 0; + goto start; /* saves code */ + while (1) { + tok = read_token(&ft[i], tok); + if (!(tok & TOK_EMPTY)) { + /* Hash algorithm taken from Robert Sedgewick, Algorithms in C, 3d ed., p 578. */ + hash = hash * 128 - hash + TOK2CHAR(tok); + continue; + } + if (nlen[i]++ == sz) { + sz = sz * 3 / 2; + nfile[i] = xrealloc(nfile[i], (sz + 3) * sizeof(nfile[i][0])); + } + /* line_compar needs hashes fit into positive int */ + nfile[i][nlen[i]].value = hash & INT_MAX; + /* like ftello(ft[i].ft_fp) but faster (avoids lseek syscall) */ + nfile[i][nlen[i]].offset = ft[i].ft_pos; + if (tok & TOK_EOF) { + /* EOF counts as a token, so we have to adjust it here */ + nfile[i][nlen[i]].offset++; + break; + } +start: + hash = tok = 0; } + /* Exclude lone EOF line from the end of the file, to make fetch()'s job easier */ + if (nfile[i][nlen[i]].offset - nfile[i][nlen[i] - 1].offset == 1) + nlen[i]--; + /* Now we copy the line offsets into ix */ + ix[i] = xmalloc((nlen[i] + 2) * sizeof(ix[i][0])); + for (int j = 0; j < nlen[i] + 1; j++) + ix[i][j] = nfile[i][j].offset; + } + + /* lenght of prefix and suffix is calculated */ + for (; pref < nlen[0] && pref < nlen[1] && + nfile[0][pref + 1].value == nfile[1][pref + 1].value; + pref++); + for (; suff < nlen[0] - pref && suff < nlen[1] - pref && + nfile[0][nlen[0] - suff].value == nfile[1][nlen[1] - suff].value; + suff++); + /* Arrays are pruned by the suffix and prefix lenght, + * the result being sorted and stored in sfile[fileno], + * and their sizes are stored in slen[fileno] + */ + for (int j = 0; j < 2; j++) { + sfile[j] = nfile[j] + pref; + slen[j] = nlen[j] - pref - suff; + for (int i = 0; i <= slen[j]; i++) + sfile[j][i].serial = i; + qsort(sfile[j] + 1, slen[j], sizeof(*sfile[j]), line_compar); } - return 1; -} -#else -#define asciifile(f) 1 -#endif - - -/* dump accumulated "unified" diff changes */ -static void dump_unified_vec(FILE *f1, FILE *f2) -{ - struct context_vec *cvp = context_vector; - int lowa, upb, lowc, upd; - int a, b, c, d; - char ch; - - if (context_idx < 0) - return; - - dbg_error_msg("dumping %d context_vecs", context_idx+1); - - b = d = 0; /* gcc */ - lowa = MAX(1, cvp->a - opt_U_context); - upb = MIN(nlen[0], context_vector[context_idx].b + opt_U_context); - lowc = MAX(1, cvp->c - opt_U_context); - upd = MIN(nlen[1], context_vector[context_idx].d + opt_U_context); - - printf("@@ -"); - uni_range(lowa, upb); - printf(" +"); - uni_range(lowc, upd); - printf(" @@\n"); - - /* - * Output changes in "unified" diff format--the old and new lines - * are printed together. + /* nfile arrays are reused to reduce memory pressure + * The #if zeroed out section performs the same task as the + * one in the #else section. + * Peak memory usage is higher, but one array copy is avoided + * by not using unsort() */ - for (; cvp <= &context_vector[context_idx]; cvp++) { - a = cvp->a; - b = cvp->b; - c = cvp->c; - d = cvp->d; - - /* - * c: both new and old changes - * d: only changes in the old file - * a: only changes in the new file - */ - if (a <= b && c <= d) - ch = 'c'; - else - ch = (a <= b) ? 'd' : 'a'; #if 0 - switch (ch) { - case 'c': -// fetch() seeks! - fetch(ixold, lowa, a - 1, f1, ' '); - fetch(ixold, a, b, f1, '-'); - fetch(ixnew, c, d, f2, '+'); - break; - case 'd': - fetch(ixold, lowa, a - 1, f1, ' '); - fetch(ixold, a, b, f1, '-'); - break; - case 'a': - fetch(ixnew, lowc, c - 1, f2, ' '); - fetch(ixnew, c, d, f2, '+'); - break; - } + member = xmalloc((slen[1] + 2) * sizeof(member[0])); + equiv(sfile[0], slen[0], sfile[1], slen[1], member); + free(nfile[1]); + + class = xmalloc((slen[0] + 1) * sizeof(class[0])); + for (int i = 1; i <= slen[0]; i++) /* Unsorting */ + class[sfile[0][i].serial] = sfile[0][i].value; + free(nfile[0]); #else - if (ch == 'c' || ch == 'd') { - fetch(ixold, lowa, a - 1, f1, ' '); - fetch(ixold, a, b, f1, '-'); - } - if (ch == 'a') - fetch(ixnew, lowc, c - 1, f2, ' '); - if (ch == 'c' || ch == 'a') - fetch(ixnew, c, d, f2, '+'); -#endif - lowa = b + 1; - lowc = d + 1; - } - fetch(ixnew, d + 1, upd, f2, ' '); + member = (int *)nfile[1]; + equiv(sfile[0], slen[0], sfile[1], slen[1], member); + member = xrealloc(member, (slen[1] + 2) * sizeof(member[0])); - context_idx = -1; -} + class = (int *)nfile[0]; + unsort(sfile[0], slen[0], (int *)nfile[0]); + class = xrealloc(class, (slen[0] + 2) * sizeof(class[0])); +#endif + J = xmalloc((nlen[0] + 2) * sizeof(J[0])); + /* The elements of J which fall inside the prefix and suffix regions + * are marked as unchanged, while the ones which fall outside + * are initialized with 0 (no matches), so that function stone can + * then assign them their right values + */ + for (int i = 0, delta = nlen[1] - nlen[0]; i <= nlen[0]; i++) + J[i] = i <= pref ? i : + i > (nlen[0] - suff) ? (i + delta) : 0; + /* Here the magic is performed */ + stone(class, slen[0], member, J, pref); + J[nlen[0] + 1] = nlen[1] + 1; + free(class); + free(member); -static void print_header(const char *file1, const char *file2) -{ - if (label1) - printf("--- %s\n", label1); - else - printf("--- %s\t%s", file1, ctime(&stb1.st_mtime)); - if (label2) - printf("+++ %s\n", label2); - else - printf("+++ %s\t%s", file2, ctime(&stb2.st_mtime)); -} + /* Both files are rescanned, in an effort to find any lines + * which, due to limitations intrinsic to any hashing algorithm, + * are different but ended up confounded as the same + */ + for (int i = 1; i <= nlen[0]; i++) { + if (!J[i]) + continue; + seek_ft(&ft[0], ix[0][i - 1]); + seek_ft(&ft[1], ix[1][J[i] - 1]); -/* - * Indicate that there is a difference between lines a and b of the from file - * to get to lines c to d of the to file. If a is greater than b then there - * are no lines in the from file involved and this means that there were - * lines appended (beginning at b). If c is greater than d then there are - * lines missing from the to file. - */ -static void change(const char *file1, FILE *f1, const char *file2, FILE *f2, - int a, int b, int c, int d) -{ - if ((a > b && c > d) || (option_mask32 & FLAG_q)) { -//compat BUG: "diff -ub F1 F2" will output nothing, but will exit 1 -//if F1 and F2 differ only in whitespace. "standard" diff exits 0. -//This is the place where this erroneous exitcode is set: - dbg_error_msg("%d: abcd:%d,%d,%d,%d, anychange=1", __LINE__, a,b,c,d); - anychange = 1; - return; - } + for (int j = J[i]; i <= nlen[0] && J[i] == j; i++, j++) { + token_t tok0 = 0, tok1 = 0; + do { + tok0 = read_token(&ft[0], tok0); + tok1 = read_token(&ft[1], tok1); - if (anychange == 0) { - /* - * Print the context/unidiff header first time through. - */ - print_header(file1, file2); - } else if (a > context_vector[context_idx].b + (2 * opt_U_context) + 1 - && c > context_vector[context_idx].d + (2 * opt_U_context) + 1 - ) { - /* - * If this change is more than 'context' lines from the - * previous change, dump the record and reset it. - */ -// dump_unified_vec() seeks! - dump_unified_vec(f1, f2); + if (((tok0 ^ tok1) & TOK_EMPTY) != 0 /* one is empty (not both) */ + || (!(tok0 & TOK_EMPTY) && TOK2CHAR(tok0) != TOK2CHAR(tok1))) + J[i] = 0; /* Break the correspondence */ + } while (!(tok0 & tok1 & TOK_EMPTY)); + } } - context_idx++; - context_vector = xrealloc_vector(context_vector, 6, context_idx); - context_vector[context_idx].a = a; - context_vector[context_idx].b = b; - context_vector[context_idx].c = c; - context_vector[context_idx].d = d; - dbg_error_msg("new context_vec[%d]:%d,%d,%d,%d", context_idx, a,b,c,d); - anychange = 1; -} - -static void output(const char *file1, FILE *f1, const char *file2, FILE *f2) -{ - /* Note that j0 and j1 can't be used as they are defined in math.h. - * This also allows the rather amusing variable 'j00'... */ - int m, i0, i1, j00, j01; - - rewind(f1); - rewind(f2); - m = nlen[0]; - J[0] = 0; - J[m + 1] = nlen[1] + 1; - for (i0 = 1; i0 <= m; i0 = i1 + 1) { - while (i0 <= m && J[i0] == J[i0 - 1] + 1) - i0++; - j00 = J[i0 - 1] + 1; - i1 = i0 - 1; - while (i1 < m && J[i1 + 1] == 0) - i1++; - j01 = J[i1 + 1] - 1; - J[i1] = j01; -// change() seeks! - change(file1, f1, file2, f2, i0, i1, j00, j01); - } - if (m == 0) { -// change() seeks! - change(file1, f1, file2, f2, 1, 0, 1, nlen[1]); - } - if (anychange != 0 && !(option_mask32 & FLAG_q)) { -// dump_unified_vec() seeks! - dump_unified_vec(f1, f2); - } + return J; } /* - * The following code uses an algorithm due to Harold Stone, - * which finds a pair of longest identical subsequences in - * the two files. - * - * The major goal is to generate the match vector J. - * J[i] is the index of the line in file1 corresponding - * to line i in file0. J[i] = 0 if there is no - * such line in file1. - * - * Lines are hashed so as to work in core. All potential - * matches are located by sorting the lines of each file - * on the hash (called "value"). In particular, this - * collects the equivalence classes in file1 together. - * Subroutine equiv replaces the value of each line in - * file0 by the index of the first element of its - * matching equivalence in (the reordered) file1. - * To save space equiv squeezes file1 into a single - * array member in which the equivalence classes - * are simply concatenated, except that their first - * members are flagged by changing sign. - * - * Next the indices that point into member are unsorted into - * array class according to the original order of file0. - * - * The cleverness lies in routine stone. This marches - * through the lines of file0, developing a vector klist - * of "k-candidates". At step i a k-candidate is a matched - * pair of lines x,y (x in file0, y in file1) such that - * there is a common subsequence of length k - * between the first i lines of file0 and the first y - * lines of file1, but there is no such subsequence for - * any smaller y. x is the earliest possible mate to y - * that occurs in such a subsequence. - * - * Whenever any of the members of the equivalence class of - * lines in file1 matable to a line in file0 has serial number - * less than the y of some k-candidate, that k-candidate - * with the smallest such y is replaced. The new - * k-candidate is chained (via pred) to the current - * k-1 candidate so that the actual subsequence can - * be recovered. When a member has serial number greater - * that the y of all k-candidates, the klist is extended. - * At the end, the longest subsequence is pulled out - * and placed in the array J by unravel - * - * With J in hand, the matches there recorded are - * checked against reality to assure that no spurious - * matches have crept in due to hashing. If they have, - * they are broken, and "jackpot" is recorded--a harmless - * matter except that a true match for a spuriously - * mated line may now be unnecessarily reported as a change. - * - * Much of the complexity of the program comes simply - * from trying to minimize core utilization and - * maximize the range of doable problems by dynamically - * allocating what is needed and reusing what is not. - * The core requirements for problems larger than somewhat - * are (in words) 2*length(file0) + length(file1) + - * 3*(number of k-candidates installed), typically about - * 6n words for files of length n. + * The following struct is used to record change information + * doing a "context" or "unified" diff. */ -/* NB: files can be not REGular. The only sure thing that they - * are not both DIRectories. */ -static unsigned diffreg(const char *file1, const char *file2, int flags) +struct context_vec { + int a; /* start line in old file */ + int b; /* end line in old file */ + int c; /* start line in new file */ + int d; /* end line in new file */ +}; + +static bool diff(FILE_and_pos_t ft[2], char *file[2]) { - int *member; /* will be overlaid on nfile[1] */ - int *class; /* will be overlaid on nfile[0] */ - int *klist; /* will be overlaid on nfile[0] after class */ - FILE *f1; - FILE *f2; - unsigned rval; - int i; - - anychange = 0; - context_idx = -1; - tempname1 = tempname2 = NULL; - - /* Is any of them a directory? Then it's simple */ - if (S_ISDIR(stb1.st_mode) != S_ISDIR(stb2.st_mode)) - return (S_ISDIR(stb1.st_mode) ? D_ISDIR1 : D_ISDIR2); - - /* None of them are directories */ - rval = D_SAME; - - if (flags & D_EMPTY1) - /* can't be stdin, but xfopen_stdin() is smaller code */ - file1 = bb_dev_null; - f1 = xfopen_stdin(file1); - if (flags & D_EMPTY2) - file2 = bb_dev_null; - f2 = xfopen_stdin(file2); - - /* NB: if D_EMPTY1/2 is set, other file is always a regular file, - * not pipe/fifo/chardev/etc - D_EMPTY is used by "diff -r" only, - * and it never diffs non-ordinary files in subdirs. */ - if (!(flags & (D_EMPTY1 | D_EMPTY2))) { - /* Quick check whether they are different */ - /* NB: copies non-REG files to tempfiles and fills tempname1/2 */ - i = files_differ(f1, f2); - if (i != 1) { /* not different? */ - if (i != 0) /* error? */ - exit_status |= 2; - goto closem; - } - } + int nlen[2]; + off_t *ix[2]; + int *J = create_J(ft, nlen, ix); - if (!asciifile(f1) || !asciifile(f2)) { - rval = D_BINARY; - exit_status |= 1; - goto closem; - } + bool anychange = false; + struct context_vec *vec = NULL; + int idx = -1, i = 1; -// Rewind inside! - prepare(0, f1 /*, stb1.st_size*/); - prepare(1, f2 /*, stb2.st_size*/); - prune(); - sort(sfile[0], slen[0]); - sort(sfile[1], slen[1]); + do { + while (1) { + struct context_vec v; - member = (int *) nfile[1]; - equiv(sfile[0], slen[0], sfile[1], slen[1], member); -//TODO: xrealloc_vector? - member = xrealloc(member, (slen[1] + 2) * sizeof(int)); - - class = (int *) nfile[0]; - unsort(sfile[0], slen[0], class); - class = xrealloc(class, (slen[0] + 2) * sizeof(int)); - - klist = xmalloc((slen[0] + 2) * sizeof(int)); - clen = 0; - clistlen = 100; - clist = xmalloc(clistlen * sizeof(struct cand)); - i = stone(class, slen[0], member, klist); - free(member); - free(class); + for (v.a = i; v.a <= nlen[0] && J[v.a] == J[v.a - 1] + 1; v.a++) + continue; + v.c = J[v.a - 1] + 1; - J = xrealloc(J, (nlen[0] + 2) * sizeof(int)); - unravel(klist[i]); - free(clist); - free(klist); + for (v.b = v.a - 1; v.b < nlen[0] && !J[v.b + 1]; v.b++) + continue; + v.d = J[v.b + 1] - 1; + /* + * Indicate that there is a difference between lines a and b of the 'from' file + * to get to lines c to d of the 'to' file. If a is greater than b then there + * are no lines in the 'from' file involved and this means that there were + * lines appended (beginning at b). If c is greater than d then there are + * lines missing from the 'to' file. + */ + if (v.a <= v.b || v.c <= v.d) { + /* + * If this change is more than 'context' lines from the + * previous change, dump the record and reset it. + */ + if (idx >= 0 + && v.a > vec[idx].b + (2 * opt_U_context) + 1 + && v.c > vec[idx].d + (2 * opt_U_context) + 1 + ) { + break; + } + vec = xrealloc_vector(vec, 6, ++idx); + vec[idx] = v; + } + + i = v.b + 1; + if (i > nlen[0]) + break; + J[v.b] = v.d; + } + if (idx < 0) + continue; + if (!(option_mask32 & FLAG(q))) { + struct context_vec *cvp = vec; + int lowa = MAX(1, cvp->a - opt_U_context); + int upb = MIN(nlen[0], vec[idx].b + opt_U_context); + int lowc = MAX(1, cvp->c - opt_U_context); + int upd = MIN(nlen[1], vec[idx].d + opt_U_context); + + if (!anychange) { + /* Print the context/unidiff header first time through */ + printf("--- %s\n", label[0] ?: file[0]); + printf("+++ %s\n", label[1] ?: file[1]); + } - ixold = xrealloc(ixold, (nlen[0] + 2) * sizeof(long)); - ixnew = xrealloc(ixnew, (nlen[1] + 2) * sizeof(long)); -// Rewind inside! - check(f1, f2); -// Rewind inside! - output(file1, f1, file2, f2); + printf("@@ -"); + uni_range(lowa, upb); + printf(" +"); + uni_range(lowc, upd); + printf(" @@\n"); - closem: - if (anychange) { - exit_status |= 1; - dbg_error_msg("exit_status|=1 = %d", exit_status); - if (rval == D_SAME) - rval = D_DIFFER; - } - fclose_if_not_stdin(f1); - fclose_if_not_stdin(f2); - if (tempname1) { - unlink(tempname1); - free(tempname1); - } - if (tempname2) { - unlink(tempname2); - free(tempname2); - } - return rval; + /* + * Output changes in "unified" diff format--the old and new lines + * are printed together. + */ + while (1) { + bool end = cvp > &vec[idx]; + fetch(&ft[0], ix[0], lowa, end ? upb : cvp->a - 1, ' '); + if (end) + break; + fetch(&ft[0], ix[0], cvp->a, cvp->b, '-'); + fetch(&ft[1], ix[1], cvp->c, cvp->d, '+'); + lowa = cvp++->b + 1; + } + } + idx = -1; + anychange = true; + } while (i <= nlen[0]); + + free(vec); + free(ix[0]); + free(ix[1]); + free(J); + return anychange; } - -#if ENABLE_FEATURE_DIFF_DIR -static void do_diff(char *dir1, char *path1, char *dir2, char *path2) +static int diffreg(char *file[2]) { - int flags = 0; /*D_HEADER;*/ - int val; - char *fullpath1 = NULL; /* if -N */ - char *fullpath2 = NULL; - - if (path1) - fullpath1 = concat_path_file(dir1, path1); - if (path2) - fullpath2 = concat_path_file(dir2, path2); - - if (!fullpath1 || stat(fullpath1, &stb1) != 0) { - flags |= D_EMPTY1; - memset(&stb1, 0, sizeof(stb1)); - if (path2) { - free(fullpath1); - fullpath1 = concat_path_file(dir1, path2); + FILE_and_pos_t ft[2]; + bool binary = false, differ = false; + int status = STATUS_SAME; + + for (int i = 0; i < 2; i++) { + int fd = open_or_warn_stdin(file[i]); + if (fd == -1) + xfunc_die(); + /* Our diff implementation is using seek. + * When we meet non-seekable file, we must make a temp copy. + */ + ft[i].ft_pos = 0; + if (lseek(fd, 0, SEEK_SET) == -1 && errno == ESPIPE) { + char name[] = "/tmp/difXXXXXX"; + int fd_tmp = mkstemp(name); + if (fd_tmp < 0) + bb_perror_msg_and_die("mkstemp"); + unlink(name); + ft[i].ft_pos = bb_copyfd_eof(fd, fd_tmp); + /* error message is printed by bb_copyfd_eof */ + if (ft[i].ft_pos < 0) + xfunc_die(); + fstat(fd, &stb[i]); + if (fd) /* Prevents closing of stdin */ + close(fd); + fd = fd_tmp; } + ft[i].ft_fp = fdopen(fd, "r"); } - if (!fullpath2 || stat(fullpath2, &stb2) != 0) { - flags |= D_EMPTY2; - memset(&stb2, 0, sizeof(stb2)); - stb2.st_mode = stb1.st_mode; - if (path1) { - free(fullpath2); - fullpath2 = concat_path_file(dir2, path1); + + while (1) { + const size_t sz = COMMON_BUFSIZE / 2; + char *const buf0 = bb_common_bufsiz1; + char *const buf1 = buf0 + sz; + int i, j; + i = fread(buf0, 1, sz, ft[0].ft_fp); + ft[0].ft_pos += i; + j = fread(buf1, 1, sz, ft[1].ft_fp); + ft[1].ft_pos += j; + if (i != j) { + differ = true; + i = MIN(i, j); + } + if (i == 0) + break; + for (int k = 0; k < i; k++) { + if (!buf0[k] || !buf1[k]) + binary = true; + if (buf0[k] != buf1[k]) + differ = true; } } - - if (stb1.st_mode == 0) - stb1.st_mode = stb2.st_mode; - - if (S_ISDIR(stb1.st_mode) && S_ISDIR(stb2.st_mode)) { - printf("Common subdirectories: %s and %s\n", fullpath1, fullpath2); - goto ret; + if (differ) { + if (binary && !(option_mask32 & FLAG(a))) + status = STATUS_BINARY; + else if (diff(ft, file)) + status = STATUS_DIFFER; } + if (status != STATUS_SAME) + exit_status |= 1; - if (!S_ISREG(stb1.st_mode) && !S_ISDIR(stb1.st_mode)) - val = D_SKIPPED1; - else if (!S_ISREG(stb2.st_mode) && !S_ISDIR(stb2.st_mode)) - val = D_SKIPPED2; - else { - /* Both files are either REGular or DIRectories */ - val = diffreg(fullpath1, fullpath2, flags); - } + fclose_if_not_stdin(ft[0].ft_fp); + fclose_if_not_stdin(ft[1].ft_fp); - print_status(val, fullpath1, fullpath2 /*, NULL*/); - ret: - free(fullpath1); - free(fullpath2); + return status; } -#endif +static void print_status(int status, char *path[2]) +{ + switch (status) { + case STATUS_BINARY: + case STATUS_DIFFER: + if ((option_mask32 & FLAG(q)) || status == STATUS_BINARY) + printf("Files %s and %s differ\n", path[0], path[1]); + break; + case STATUS_SAME: + if (option_mask32 & FLAG(s)) + printf("Files %s and %s are identical\n", path[0], path[1]); + break; + } +} #if ENABLE_FEATURE_DIFF_DIR +struct dlist { + size_t len; + int s, e; + char **dl; +}; + /* This function adds a filename to dl, the directory listing. */ static int FAST_FUNC add_to_dirlist(const char *filename, struct stat *sb UNUSED_PARAM, - void *userdata, - int depth UNUSED_PARAM) + void *userdata, int depth UNUSED_PARAM) { - dl = xrealloc_vector(dl, 5, dl_count); - dl[dl_count] = xstrdup(filename + (int)(ptrdiff_t)userdata); - dl_count++; + struct dlist *const l = userdata; + l->dl = xrealloc_vector(l->dl, 6, l->e); + /* + 1 skips "/" after dirname */ + l->dl[l->e] = xstrdup(filename + l->len + 1); + l->e++; return TRUE; } - -/* This returns a sorted directory listing. */ -static char **get_recursive_dirlist(char *path) +/* If recursion is not set, this function adds the directory + * to the list and prevents recursive_action from recursing into it. + */ +static int FAST_FUNC skip_dir(const char *filename, + struct stat *sb, void *userdata, + int depth) { - dl_count = 0; - dl = xzalloc(sizeof(dl[0])); - - /* We need to trim root directory prefix. - * Using void *userdata to specify its length, - * add_to_dirlist will remove it. */ - if (option_mask32 & FLAG_r) { - recursive_action(path, ACTION_RECURSE|ACTION_FOLLOWLINKS, - add_to_dirlist, /* file_action */ - NULL, /* dir_action */ - (void*)(ptrdiff_t)(strlen(path) + 1), - 0); - } else { - DIR *dp; - struct dirent *ep; - - dp = warn_opendir(path); - while ((ep = readdir(dp))) { - if (!strcmp(ep->d_name, "..") || LONE_CHAR(ep->d_name, '.')) - continue; - add_to_dirlist(ep->d_name, NULL, (void*)(int)0, 0); - } - closedir(dp); + if (!(option_mask32 & FLAG(r)) && depth) { + add_to_dirlist(filename, sb, userdata, depth); + return SKIP; } - - /* Sort dl alphabetically. */ - qsort_string_vector(dl, dl_count); - - dl[dl_count] = NULL; - return dl; + return TRUE; } - -static void diffdir(char *p1, char *p2) +static void diffdir(char *p[2], const char *s_start) { - char **dirlist1, **dirlist2; - char *dp1, *dp2; - int pos; - - /* Check for trailing slashes. */ - dp1 = last_char_is(p1, '/'); - if (dp1 != NULL) - *dp1 = '\0'; - dp2 = last_char_is(p2, '/'); - if (dp2 != NULL) - *dp2 = '\0'; - - /* Get directory listings for p1 and p2. */ - dirlist1 = get_recursive_dirlist(p1); - dirlist2 = get_recursive_dirlist(p2); - - /* If -S was set, find the starting point. */ - if (opt_S_start) { - while (*dirlist1 != NULL && strcmp(*dirlist1, opt_S_start) < 0) - dirlist1++; - while (*dirlist2 != NULL && strcmp(*dirlist2, opt_S_start) < 0) - dirlist2++; - if ((*dirlist1 == NULL) || (*dirlist2 == NULL)) - bb_error_msg(bb_msg_invalid_arg, "NULL", "-S"); + struct dlist list[2]; + + memset(&list, 0, sizeof(list)); + for (int i = 0; i < 2; i++) { + /*list[i].s = list[i].e = 0; - memset did it */ + /*list[i].dl = NULL; */ + + /* We need to trim root directory prefix. + * Using list.len to specify its length, + * add_to_dirlist will remove it. */ + list[i].len = strlen(p[i]); + recursive_action(p[i], ACTION_RECURSE | ACTION_FOLLOWLINKS, + add_to_dirlist, skip_dir, &list[i], 0); + /* Sort dl alphabetically. + * GNU diff does this ignoring any number of trailing dots. + * We don't, so for us dotted files almost always are + * first on the list. + */ + qsort_string_vector(list[i].dl, list[i].e); + /* If -S was set, find the starting point. */ + if (!s_start) + continue; + while (list[i].s < list[i].e && strcmp(list[i].dl[list[i].s], s_start) < 0) + list[i].s++; } - /* Now that both dirlist1 and dirlist2 contain sorted directory * listings, we can start to go through dirlist1. If both listings * contain the same file, then do a normal diff. Otherwise, behaviour * is determined by whether the -N flag is set. */ - while (*dirlist1 != NULL || *dirlist2 != NULL) { - dp1 = *dirlist1; - dp2 = *dirlist2; - pos = dp1 == NULL ? 1 : (dp2 == NULL ? -1 : strcmp(dp1, dp2)); + while (1) { + char *dp[2]; + int pos; + int k; + + dp[0] = list[0].s < list[0].e ? list[0].dl[list[0].s] : NULL; + dp[1] = list[1].s < list[1].e ? list[1].dl[list[1].s] : NULL; + if (!dp[0] && !dp[1]) + break; + pos = !dp[0] ? 1 : (!dp[1] ? -1 : strcmp(dp[0], dp[1])); + k = pos > 0; + if (pos && !(option_mask32 & FLAG(N))) + printf("Only in %s: %s\n", p[k], dp[k]); + else { + char *fullpath[2], *path[2]; /* if -N */ + + for (int i = 0; i < 2; i++) { + if (pos == 0 || i == k) { + path[i] = fullpath[i] = concat_path_file(p[i], dp[i]); + stat(fullpath[i], &stb[i]); + } else { + fullpath[i] = concat_path_file(p[i], dp[1 - i]); + path[i] = (char *)bb_dev_null; + } + } + if (pos) + stat(fullpath[k], &stb[1 - k]); + + if (S_ISDIR(stb[0].st_mode) && S_ISDIR(stb[1].st_mode)) + printf("Common subdirectories: %s and %s\n", fullpath[0], fullpath[1]); + else if (!S_ISREG(stb[0].st_mode) && !S_ISDIR(stb[0].st_mode)) + printf("File %s is not a regular file or directory and was skipped\n", fullpath[0]); + else if (!S_ISREG(stb[1].st_mode) && !S_ISDIR(stb[1].st_mode)) + printf("File %s is not a regular file or directory and was skipped\n", fullpath[1]); + else if (S_ISDIR(stb[0].st_mode) != S_ISDIR(stb[1].st_mode)) { + if (S_ISDIR(stb[0].st_mode)) + printf("File %s is a %s while file %s is a %s\n", fullpath[0], "directory", fullpath[1], "regular file"); + else + printf("File %s is a %s while file %s is a %s\n", fullpath[0], "regular file", fullpath[1], "directory"); + } else + print_status(diffreg(path), fullpath); + + free(fullpath[0]); + free(fullpath[1]); + } + free(dp[k]); + list[k].s++; if (pos == 0) { - do_diff(p1, dp1, p2, dp2); - dirlist1++; - dirlist2++; - } else if (pos < 0) { - if (option_mask32 & FLAG_N) - do_diff(p1, dp1, p2, NULL); - else - print_only(p1, dp1); - dirlist1++; - } else { - if (option_mask32 & FLAG_N) - do_diff(p1, NULL, p2, dp2); - else - print_only(p2, dp2); - dirlist2++; + free(dp[1 - k]); + list[1 - k].s++; } } + if (ENABLE_FEATURE_CLEAN_UP) { + free(list[0].dl); + free(list[1].dl); + } } #endif - int diff_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int diff_main(int argc UNUSED_PARAM, char **argv) { int gotstdin = 0; - char *f1, *f2; + char *file[2], *s_start = NULL; llist_t *L_arg = NULL; INIT_G(); @@ -1280,65 +874,49 @@ int diff_main(int argc UNUSED_PARAM, char **argv) opt_complementary = "=2:L::U+"; getopt32(argv, "abdiL:NqrsS:tTU:wu" "p" /* ignored (for compatibility) */, - &L_arg, &opt_S_start, &opt_U_context); - /*argc -= optind;*/ + &L_arg, &s_start, &opt_U_context); argv += optind; while (L_arg) { - if (label1 && label2) + if (label[0] && label[1]) bb_show_usage(); - if (label1) /* then label2 is NULL */ - label2 = label1; - label1 = llist_pop(&L_arg); + if (label[0]) /* then label[1] is NULL */ + label[1] = label[0]; + label[0] = llist_pop(&L_arg); } - - /* - * Do sanity checks, fill in stb1 and stb2 and call the appropriate - * driver routine. Both drivers use the contents of stb1 and stb2. - */ - f1 = argv[0]; - f2 = argv[1]; - /* Compat: "diff file name_which_doesnt_exist" exits with 2 */ xfunc_error_retval = 2; - if (LONE_DASH(f1)) { - fstat(STDIN_FILENO, &stb1); - gotstdin++; - } else - xstat(f1, &stb1); - if (LONE_DASH(f2)) { - fstat(STDIN_FILENO, &stb2); - gotstdin++; - } else - xstat(f2, &stb2); + for (int i = 0; i < 2; i++) { + file[i] = argv[i]; + /* Compat: "diff file name_which_doesnt_exist" exits with 2 */ + if (LONE_DASH(file[i])) { + fstat(STDIN_FILENO, &stb[i]); + gotstdin++; + } else + xstat(file[i], &stb[i]); + } xfunc_error_retval = 1; - - if (gotstdin && (S_ISDIR(stb1.st_mode) || S_ISDIR(stb2.st_mode))) + if (gotstdin && (S_ISDIR(stb[0].st_mode) || S_ISDIR(stb[1].st_mode))) bb_error_msg_and_die("can't compare stdin to a directory"); - if (S_ISDIR(stb1.st_mode) && S_ISDIR(stb2.st_mode)) { + if (S_ISDIR(stb[0].st_mode) && S_ISDIR(stb[1].st_mode)) { #if ENABLE_FEATURE_DIFF_DIR - diffdir(f1, f2); - return exit_status; + diffdir(file, s_start); #else bb_error_msg_and_die("no support for directory comparison"); #endif - } + } else { + bool dirfile = S_ISDIR(stb[0].st_mode) || S_ISDIR(stb[1].st_mode); + bool dir = S_ISDIR(stb[1].st_mode); + if (dirfile) { + const char *slash = strrchr(file[!dir], '/'); + file[dir] = concat_path_file(file[dir], slash ? slash + 1 : file[!dir]); + xstat(file[dir], &stb[dir]); + } + /* diffreg can get non-regular files here */ + print_status(gotstdin > 1 ? STATUS_SAME : diffreg(file), file); - if (S_ISDIR(stb1.st_mode)) { /* "diff dir file" */ - /* NB: "diff dir dir2/dir3/file" must become - * "diff dir/file dir2/dir3/file" */ - char *slash = strrchr(f2, '/'); - f1 = concat_path_file(f1, slash ? slash + 1 : f2); - xstat(f1, &stb1); - } - if (S_ISDIR(stb2.st_mode)) { - char *slash = strrchr(f1, '/'); - f2 = concat_path_file(f2, slash ? slash + 1 : f1); - xstat(f2, &stb2); + if (dirfile) + free(file[dir]); } - /* diffreg can get non-regular files here, - * they are not both DIRestories */ - print_status((gotstdin > 1 ? D_SAME : diffreg(f1, f2, 0)), - f1, f2 /*, NULL*/); return exit_status; } -- cgit v1.2.3