aboutsummaryrefslogtreecommitdiff
path: root/coreutils/sort.c
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2018-04-04 17:02:32 +0200
committerDenys Vlasenko <vda.linux@googlemail.com>2018-04-04 17:03:41 +0200
commitc29c2e60d8677cdec142a88609a3d040e4719439 (patch)
tree9d23487088699fb4f1b4aabd9361c4a7c7d21edf /coreutils/sort.c
parentee1fd1246e7d25990531dbbb2d267605b8914d5d (diff)
downloadbusybox-c29c2e60d8677cdec142a88609a3d040e4719439.tar.gz
sort: FEATURE_SORT_OPTIMIZE_MEMORY
On sorting all kernel/linux/arch/ *.[ch] files, this reduces memory usage by 6%. yes | head -99999999 | sort goes down from 1900Mb to 380 Mb. function old new delta sort_main 862 1098 +236 Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'coreutils/sort.c')
-rw-r--r--coreutils/sort.c86
1 files changed, 80 insertions, 6 deletions
diff --git a/coreutils/sort.c b/coreutils/sort.c
index b39297a26..b8bb6871d 100644
--- a/coreutils/sort.c
+++ b/coreutils/sort.c
@@ -18,16 +18,24 @@
//config: sort is used to sort lines of text in specified files.
//config:
//config:config FEATURE_SORT_BIG
-//config: bool "Full SuSv3 compliant sort (support -ktcsbdfiozgM)"
+//config: bool "Full SuSv3 compliant sort (support -ktcbdfiozgM)"
//config: default y
//config: depends on SORT
//config: help
-//config: Without this, sort only supports -r, -u, and an integer version
+//config: Without this, sort only supports -r, -u, -s, and an integer version
//config: of -n. Selecting this adds sort keys, floating point support, and
//config: more. This adds a little over 3k to a nonstatic build on x86.
//config:
//config: The SuSv3 sort standard is available at:
//config: http://www.opengroup.org/onlinepubs/007904975/utilities/sort.html
+//config:
+//config:config FEATURE_SORT_OPTIMIZE_MEMORY
+//config: bool "Use less memory (but might be slower)"
+//config: default n # defaults to N since we are size-paranoid tribe
+//config: depends on SORT
+//config: help
+//config: Attempt to use less memory (by storing only one copy
+//config: of duplicated lines, and such). Useful if you work on huge files.
//applet:IF_SORT(APPLET_NOEXEC(sort, sort, BB_DIR_USR_BIN, BB_SUID_DROP, sort))
@@ -46,19 +54,17 @@
//usage: "\n -f Ignore case"
//usage: "\n -i Ignore unprintable characters"
//usage: "\n -d Dictionary order (blank or alphanumeric only)"
-//usage: "\n -g General numerical sort"
-//usage: "\n -M Sort month"
//usage: )
//-h, --human-numeric-sort: compare human readable numbers (e.g., 2K 1G)
//usage: "\n -n Sort numbers"
//usage: IF_FEATURE_SORT_BIG(
+//usage: "\n -g General numerical sort"
+//usage: "\n -M Sort month"
//usage: "\n -t CHAR Field separator"
//usage: "\n -k N[,M] Sort by Nth field"
//usage: )
//usage: "\n -r Reverse sort order"
-//usage: IF_FEATURE_SORT_BIG(
//usage: "\n -s Stable (don't sort ties alphabetically)"
-//usage: )
//usage: "\n -u Suppress duplicate lines"
//usage: IF_FEATURE_SORT_BIG(
//usage: "\n -z Lines are terminated by NUL, not newline"
@@ -404,6 +410,14 @@ int sort_main(int argc UNUSED_PARAM, char **argv)
int i;
int linecount;
unsigned opts;
+#if ENABLE_FEATURE_SORT_OPTIMIZE_MEMORY
+ bool can_drop_dups;
+ size_t prev_len = 0;
+ /* Postpone optimizing if the input is small, < 16k lines:
+ * even just free()ing duplicate lines takes time.
+ */
+ size_t count_to_optimize_dups = 0x3fff;
+#endif
xfunc_error_retval = 2;
@@ -412,6 +426,29 @@ int sort_main(int argc UNUSED_PARAM, char **argv)
sort_opt_str,
&str_ignored, &str_ignored, &str_o, &lst_k, &str_t
);
+#if ENABLE_FEATURE_SORT_OPTIMIZE_MEMORY
+ /* Can drop dups only if -u but no "complicating" options,
+ * IOW: if we do a full line compares. Safe options:
+ * -o FILE Output to FILE
+ * -z Lines are terminated by NUL, not newline
+ * -r Reverse sort order
+ * -s Stable (don't sort ties alphabetically)
+ * Not sure about these:
+ * -b Ignore leading blanks
+ * -f Ignore case
+ * -i Ignore unprintable characters
+ * -d Dictionary order (blank or alphanumeric only)
+ * -n Sort numbers
+ * -g General numerical sort
+ * -M Sort month
+ */
+ can_drop_dups = ((opts & ~(FLAG_o|FLAG_z|FLAG_r|FLAG_s)) == FLAG_u);
+ /* Stable sort needs every line to be uniquely allocated,
+ * disable optimization to reuse strings:
+ */
+ if (opts & FLAG_s)
+ count_to_optimize_dups = (size_t)-1L;
+#endif
/* global b strips leading and trailing spaces */
if (opts & FLAG_b)
option_mask32 |= FLAG_bb;
@@ -489,6 +526,41 @@ int sort_main(int argc UNUSED_PARAM, char **argv)
char *line = GET_LINE(fp);
if (!line)
break;
+
+#if ENABLE_FEATURE_SORT_OPTIMIZE_MEMORY
+ if (count_to_optimize_dups != 0)
+ count_to_optimize_dups--;
+ if (count_to_optimize_dups == 0) {
+ size_t len;
+ char *new_line;
+ char first = *line;
+
+ /* On kernel/linux/arch/ *.[ch] files,
+ * this reduces memory usage by 6%.
+ * yes | head -99999999 | sort
+ * goes down from 1900Mb to 380 Mb.
+ */
+ if (first == '\0' || first == '\n') {
+ len = !(first == '\0');
+ new_line = (char*)"\n" + 1 - len;
+ goto replace;
+ }
+ len = strlen(line);
+ if (len <= prev_len) {
+ new_line = lines[linecount-1] + (prev_len - len);
+ if (strcmp(line, new_line) == 0) {
+ if (can_drop_dups && prev_len == len) {
+ free(line);
+ continue;
+ }
+ replace:
+ free(line);
+ line = new_line;
+ }
+ }
+ prev_len = len;
+ }
+#endif
lines = xrealloc_vector(lines, 6, linecount);
lines[linecount++] = line;
}
@@ -510,6 +582,8 @@ int sort_main(int argc UNUSED_PARAM, char **argv)
}
return EXIT_SUCCESS;
}
+#else
+//TODO: lighter version which only drops total dups if can_drop_dups == true
#endif
/* For stable sort, store original line position beyond terminating NUL */