aboutsummaryrefslogtreecommitdiff
path: root/archival/libarchive
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2018-02-05 00:34:08 +0100
committerDenys Vlasenko <vda.linux@googlemail.com>2018-02-05 00:34:08 +0100
commitc2a51b0cf16918482c993c4998a2a920e499a43f (patch)
treef138a8f9dfc180c60446e682ab4436c3bdfad127 /archival/libarchive
parentf75a7c04397b8b3409b5c4f5a4438654b6b830ce (diff)
downloadbusybox-c2a51b0cf16918482c993c4998a2a920e499a43f.tar.gz
bzip2: work around bad compiler optimization
gc-6.1.1 x86_64: function old new delta generateMTFValues 380 367 -13 gcc-4.3.1 386: function old new delta inner_loop - 41 +41 generateMTFValues 357 294 -63 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 0/1 up/down: 41/-63) Total: -22 bytes gcc-6.3.0 386: function old new delta inner_loop - 36 +36 generateMTFValues 363 250 -113 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 0/1 up/down: 36/-113) Total: -77 bytes The last case, gcc-6.3.0, runs almost 3 times faster after this change. Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'archival/libarchive')
-rw-r--r--archival/libarchive/bz/compress.c75
1 files changed, 43 insertions, 32 deletions
diff --git a/archival/libarchive/bz/compress.c b/archival/libarchive/bz/compress.c
index f65076758..462740b6c 100644
--- a/archival/libarchive/bz/compress.c
+++ b/archival/libarchive/bz/compress.c
@@ -158,6 +158,38 @@ void makeMaps_e(EState* s)
/*---------------------------------------------------*/
+/*
+ * This bit of code is performance-critical.
+ * On 32bit x86, gcc-6.3.0 was observed to spill ryy_j to stack,
+ * resulting in abysmal performance (x3 slowdown).
+ * Forcing it into a separate function alleviates register pressure,
+ * and spillage no longer happens.
+ * Other versions of gcc do not exhibit this problem, but out-of-line code
+ * seems to be helping them too (code is both smaller and faster).
+ * Therefore NOINLINE is enabled for the entire 32bit x86 arch for now,
+ * without a check for gcc version.
+ */
+static
+#if defined __i386__
+NOINLINE
+#endif
+int inner_loop(uint8_t *yy, uint8_t ll_i)
+{
+ register uint8_t rtmp;
+ register uint8_t* ryy_j;
+ rtmp = yy[1];
+ yy[1] = yy[0];
+ ryy_j = &(yy[1]);
+ while (ll_i != rtmp) {
+ register uint8_t rtmp2;
+ ryy_j++;
+ rtmp2 = rtmp;
+ rtmp = *ryy_j;
+ *ryy_j = rtmp2;
+ }
+ yy[0] = rtmp;
+ return ryy_j - &(yy[0]);
+}
static NOINLINE
void generateMTFValues(EState* s)
{
@@ -165,7 +197,6 @@ void generateMTFValues(EState* s)
int i;
int zPend;
int32_t wr;
- int32_t EOB;
/*
* After sorting (eg, here),
@@ -189,15 +220,12 @@ void generateMTFValues(EState* s)
* compressBlock().
*/
uint32_t* ptr = s->ptr;
- uint8_t* block = s->block;
- uint16_t* mtfv = s->mtfv;
makeMaps_e(s);
- EOB = s->nInUse+1;
wr = 0;
zPend = 0;
- for (i = 0; i <= EOB; i++)
+ for (i = 0; i <= s->nInUse+1; i++)
s->mtfFreq[i] = 0;
for (i = 0; i < s->nInUse; i++)
@@ -211,7 +239,7 @@ void generateMTFValues(EState* s)
j = ptr[i] - 1;
if (j < 0)
j += s->nblock;
- ll_i = s->unseqToSeq[block[j]];
+ ll_i = s->unseqToSeq[s->block[j]];
AssertD(ll_i < s->nInUse, "generateMTFValues(2a)");
if (yy[0] == ll_i) {
@@ -225,15 +253,15 @@ void generateMTFValues(EState* s)
while (1) {
#if 0
if (zPend & 1) {
- mtfv[wr] = BZ_RUNB; wr++;
+ s->mtfv[wr] = BZ_RUNB; wr++;
s->mtfFreq[BZ_RUNB]++;
} else {
- mtfv[wr] = BZ_RUNA; wr++;
+ s->mtfv[wr] = BZ_RUNA; wr++;
s->mtfFreq[BZ_RUNA]++;
}
#else /* same as above, since BZ_RUNA is 0 and BZ_RUNB is 1 */
unsigned run = zPend & 1;
- mtfv[wr] = run;
+ s->mtfv[wr] = run;
wr++;
s->mtfFreq[run]++;
#endif
@@ -247,36 +275,19 @@ void generateMTFValues(EState* s)
goto end;
zPend = 0;
}
- {
- register uint8_t rtmp;
- register uint8_t* ryy_j;
- register uint8_t rll_i;
- rtmp = yy[1];
- yy[1] = yy[0];
- ryy_j = &(yy[1]);
- rll_i = ll_i;
- while (rll_i != rtmp) {
- register uint8_t rtmp2;
- ryy_j++;
- rtmp2 = rtmp;
- rtmp = *ryy_j;
- *ryy_j = rtmp2;
- }
- yy[0] = rtmp;
- j = ryy_j - &(yy[0]);
- mtfv[wr] = j+1;
- wr++;
- s->mtfFreq[j+1]++;
- }
+ j = inner_loop(yy, ll_i);
+ s->mtfv[wr] = j+1;
+ wr++;
+ s->mtfFreq[j+1]++;
}
i = -1;
if (zPend > 0)
goto process_zPend; /* "process it and come back here" */
end:
- mtfv[wr] = EOB;
+ s->mtfv[wr] = s->nInUse+1;
wr++;
- s->mtfFreq[EOB]++;
+ s->mtfFreq[s->nInUse+1]++;
s->nMTF = wr;
}