From 9f00a0fdb159432f1d7232253e2180d85e5eca32 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 8 Dec 2018 13:34:43 +0100 Subject: tls: make RIGHTSHIFTX() in AES-GCM faster Signed-off-by: Denys Vlasenko --- networking/tls_aesgcm.c | 63 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 56 insertions(+), 7 deletions(-) (limited to 'networking/tls_aesgcm.c') diff --git a/networking/tls_aesgcm.c b/networking/tls_aesgcm.c index 688df85fb..008dc9b5d 100644 --- a/networking/tls_aesgcm.c +++ b/networking/tls_aesgcm.c @@ -35,17 +35,66 @@ static ALWAYS_INLINE void FlattenSzInBits(byte* buf, word32 sz) static void RIGHTSHIFTX(byte* x) { - int i; - int carryOut = 0; - int carryIn = 0; - int borrow = x[15] & 0x01; +#define l ((unsigned long*)x) +#if 0 + // Generic byte-at-a-time algorithm + int i; + byte carryIn = (x[15] & 0x01) ? 0xE1 : 0; for (i = 0; i < AES_BLOCK_SIZE; i++) { - carryOut = x[i] & 0x01; - x[i] = (x[i] >> 1) | (carryIn ? 0x80 : 0); + byte carryOut = (x[i] << 7); // zero, or 0x80 + x[i] = (x[i] >> 1) ^ carryIn; + carryIn = carryOut; + } + +#elif BB_BIG_ENDIAN + + // Big-endian can shift-right in larger than byte chunks + // (we use the fact that 'x' is long-aligned) + unsigned long carryIn = (x[15] & 0x01) + ? ((unsigned long)0xE1 << (LONG_BIT-8)) + : 0; +# if ULONG_MAX <= 0xffffffff + int i; + for (i = 0; i < AES_BLOCK_SIZE/sizeof(long); i++) { + unsigned long carryOut = l[i] << (LONG_BIT-1); // zero, or 0x800..00 + l[i] = (l[i] >> 1) ^ carryIn; + carryIn = carryOut; + } +# else + // 64-bit code: need to process only 2 words + unsigned long carryOut = l[0] << (LONG_BIT-1); // zero, or 0x800..00 + l[0] = (l[0] >> 1) ^ carryIn; + l[1] = (l[1] >> 1) ^ carryOut; +# endif + +#else /* LITTLE_ENDIAN */ + + // In order to use word-sized ops, little-endian needs to byteswap. + // On x86, code size increase is ~10 bytes compared to byte-by-byte. + unsigned long carryIn = (x[15] & 0x01) + ? ((unsigned long)0xE1 << (LONG_BIT-8)) + : 0; +# if ULONG_MAX <= 0xffffffff + int i; + for (i = 0; i < AES_BLOCK_SIZE/sizeof(long); i++) { + unsigned long ti = SWAP_BE32(l[i]); + unsigned long carryOut = ti << (LONG_BIT-1); // zero, or 0x800..00 + ti = (ti >> 1) ^ carryIn; + l[i] = SWAP_BE32(ti); carryIn = carryOut; } - if (borrow) x[0] ^= 0xE1; +# else + // 64-bit code: need to process only 2 words + unsigned long tt = SWAP_BE64(l[0]); + unsigned long carryOut = tt << (LONG_BIT-1); // zero, or 0x800..00 + tt = (tt >> 1) ^ carryIn; l[0] = SWAP_BE64(tt); + tt = SWAP_BE64(l[1]); + tt = (tt >> 1) ^ carryOut; l[1] = SWAP_BE64(tt); +# endif + +#endif /* LITTLE_ENDIAN */ +#undef l } static void GMULT(byte* X, byte* Y) -- cgit v1.2.3