From 4ff933c0e7895bd1ac1fe9793117f4d69de35514 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 30 Jul 2014 14:18:57 +0200 Subject: sha3: code shrink (and speedup for SHA3_SMALL=0) function old new delta sha3_process_block72 1454 1359 -95 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha.c | 82 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 16 deletions(-) (limited to 'libbb/hash_md5_sha.c') diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index dff583ad1..4cef2aba1 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c @@ -1141,22 +1141,30 @@ static void sha3_process_block72(uint64_t *state) #undef RhoPi } /* Chi */ - for (x = 0; x <= 20; x += 5) { - /* - * Can write this in terms of uint32 too, - * but why? compiler does it automatically. - */ - uint64_t BC0, BC1, BC2, BC3, BC4; - BC0 = state[x + 0]; - BC1 = state[x + 1]; - BC2 = state[x + 2]; - state[x + 0] = BC0 ^ ((~BC1) & BC2); - BC3 = state[x + 3]; - state[x + 1] = BC1 ^ ((~BC2) & BC3); - BC4 = state[x + 4]; - state[x + 2] = BC2 ^ ((~BC3) & BC4); - state[x + 3] = BC3 ^ ((~BC4) & BC0); - state[x + 4] = BC4 ^ ((~BC0) & BC1); + for (x = 0; x <= 40;) { + uint32_t BC0, BC1, BC2, BC3, BC4; + BC0 = s32[x + 0*2]; + BC1 = s32[x + 1*2]; + BC2 = s32[x + 2*2]; + s32[x + 0*2] = BC0 ^ ((~BC1) & BC2); + BC3 = s32[x + 3*2]; + s32[x + 1*2] = BC1 ^ ((~BC2) & BC3); + BC4 = s32[x + 4*2]; + s32[x + 2*2] = BC2 ^ ((~BC3) & BC4); + s32[x + 3*2] = BC3 ^ ((~BC4) & BC0); + s32[x + 4*2] = BC4 ^ ((~BC0) & BC1); + x++; + BC0 = s32[x + 0*2]; + BC1 = s32[x + 1*2]; + BC2 = s32[x + 2*2]; + s32[x + 0*2] = BC0 ^ ((~BC1) & BC2); + BC3 = s32[x + 3*2]; + s32[x + 1*2] = BC1 ^ ((~BC2) & BC3); + BC4 = s32[x + 4*2]; + s32[x + 2*2] = BC2 ^ ((~BC3) & BC4); + s32[x + 3*2] = BC3 ^ ((~BC4) & BC0); + s32[x + 4*2] = BC4 ^ ((~BC0) & BC1); + x += 9; } /* Iota */ s32[0] ^= IOTA_CONST_0bits & 1; @@ -1275,6 +1283,7 @@ static void sha3_process_block72(uint64_t *state) #undef RhoPi_twice } /* Chi */ +#if LONG_MAX > 0x7fffffff for (x = 0; x <= 20; x += 5) { uint64_t BC0, BC1, BC2, BC3, BC4; BC0 = state[x + 0]; @@ -1288,6 +1297,47 @@ static void sha3_process_block72(uint64_t *state) state[x + 3] = BC3 ^ ((~BC4) & BC0); state[x + 4] = BC4 ^ ((~BC0) & BC1); } +#else + /* Reduced register pressure version + * for register-starved 32-bit arches + * (i386: -95 bytes, and it is _faster_) + */ + for (x = 0; x <= 40;) { + uint32_t BC0, BC1, BC2, BC3, BC4; + uint32_t *const s32 = (uint32_t*)state; +# if SHA3_SMALL + do_half: +#endif + BC0 = s32[x + 0*2]; + BC1 = s32[x + 1*2]; + BC2 = s32[x + 2*2]; + s32[x + 0*2] = BC0 ^ ((~BC1) & BC2); + BC3 = s32[x + 3*2]; + s32[x + 1*2] = BC1 ^ ((~BC2) & BC3); + BC4 = s32[x + 4*2]; + s32[x + 2*2] = BC2 ^ ((~BC3) & BC4); + s32[x + 3*2] = BC3 ^ ((~BC4) & BC0); + s32[x + 4*2] = BC4 ^ ((~BC0) & BC1); + x++; +# if SHA3_SMALL + if (x & 1) + goto do_half; + x += 8; +# else + BC0 = s32[x + 0*2]; + BC1 = s32[x + 1*2]; + BC2 = s32[x + 2*2]; + s32[x + 0*2] = BC0 ^ ((~BC1) & BC2); + BC3 = s32[x + 3*2]; + s32[x + 1*2] = BC1 ^ ((~BC2) & BC3); + BC4 = s32[x + 4*2]; + s32[x + 2*2] = BC2 ^ ((~BC3) & BC4); + s32[x + 3*2] = BC3 ^ ((~BC4) & BC0); + s32[x + 4*2] = BC4 ^ ((~BC0) & BC1); + x += 9; +# endif + } +#endif /* Iota */ state[0] ^= IOTA_CONST[round] | (uint32_t)((IOTA_CONST_bit31 << round) & 0x80000000) -- cgit v1.2.3