aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--networking/tls.c225
-rw-r--r--networking/tls.h73
-rw-r--r--networking/tls_pstm.c2254
-rw-r--r--networking/tls_pstm.h238
-rw-r--r--networking/tls_pstm_montgomery_reduce.c423
-rw-r--r--networking/tls_pstm_mul_comba.c777
-rw-r--r--networking/tls_pstm_sqr_comba.c1107
-rw-r--r--networking/tls_rsa.c203
-rw-r--r--networking/tls_rsa.h18
9 files changed, 5281 insertions, 37 deletions
diff --git a/networking/tls.c b/networking/tls.c
index 69c81b558..b0a4f7e75 100644
--- a/networking/tls.c
+++ b/networking/tls.c
@@ -1,7 +1,7 @@
/*
- * Licensed under GPLv2, see file LICENSE in this source tree.
- *
* Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
*/
//config:config TLS
//config: bool "tls (debugging)"
@@ -10,6 +10,11 @@
//applet:IF_TLS(APPLET(tls, BB_DIR_USR_BIN, BB_SUID_DROP))
//kbuild:lib-$(CONFIG_TLS) += tls.o
+//kbuild:lib-$(CONFIG_TLS) += tls_pstm.o
+//kbuild:lib-$(CONFIG_TLS) += tls_pstm_montgomery_reduce.o
+//kbuild:lib-$(CONFIG_TLS) += tls_pstm_mul_comba.o
+//kbuild:lib-$(CONFIG_TLS) += tls_pstm_sqr_comba.o
+//kbuild:lib-$(CONFIG_TLS) += tls_rsa.o
////kbuild:lib-$(CONFIG_TLS) += tls_ciphers.o
////kbuild:lib-$(CONFIG_TLS) += tls_aes.o
////kbuild:lib-$(CONFIG_TLS) += tls_aes_gcm.o
@@ -18,9 +23,7 @@
//usage: "HOST[:PORT]"
//usage:#define tls_full_usage "\n\n"
-#include "libbb.h"
-//#include "tls_cryptoapi.h"
-//#include "tls_ciphers.h"
+#include "tls.h"
#if 1
# define dbg(...) fprintf(stderr, __VA_ARGS__)
@@ -28,23 +31,26 @@
# define dbg(...) ((void)0)
#endif
-#define RECORD_TYPE_CHANGE_CIPHER_SPEC 20
-#define RECORD_TYPE_ALERT 21
-#define RECORD_TYPE_HANDSHAKE 22
-#define RECORD_TYPE_APPLICATION_DATA 23
-
-#define HANDSHAKE_HELLO_REQUEST 0
-#define HANDSHAKE_CLIENT_HELLO 1
-#define HANDSHAKE_SERVER_HELLO 2
-#define HANDSHAKE_HELLO_VERIFY_REQUEST 3
-#define HANDSHAKE_NEW_SESSION_TICKET 4
-#define HANDSHAKE_CERTIFICATE 11
-#define HANDSHAKE_SERVER_KEY_EXCHANGE 12
-#define HANDSHAKE_CERTIFICATE_REQUEST 13
-#define HANDSHAKE_SERVER_HELLO_DONE 14
-#define HANDSHAKE_CERTIFICATE_VERIFY 15
-#define HANDSHAKE_CLIENT_KEY_EXCHANGE 16
-#define HANDSHAKE_FINISHED 20
+#define RECORD_TYPE_CHANGE_CIPHER_SPEC 20
+#define RECORD_TYPE_ALERT 21
+#define RECORD_TYPE_HANDSHAKE 22
+#define RECORD_TYPE_APPLICATION_DATA 23
+
+#define HANDSHAKE_HELLO_REQUEST 0
+#define HANDSHAKE_CLIENT_HELLO 1
+#define HANDSHAKE_SERVER_HELLO 2
+#define HANDSHAKE_HELLO_VERIFY_REQUEST 3
+#define HANDSHAKE_NEW_SESSION_TICKET 4
+#define HANDSHAKE_CERTIFICATE 11
+#define HANDSHAKE_SERVER_KEY_EXCHANGE 12
+#define HANDSHAKE_CERTIFICATE_REQUEST 13
+#define HANDSHAKE_SERVER_HELLO_DONE 14
+#define HANDSHAKE_CERTIFICATE_VERIFY 15
+#define HANDSHAKE_CLIENT_KEY_EXCHANGE 16
+#define HANDSHAKE_FINISHED 20
+
+#define SSL_HS_RANDOM_SIZE 32
+#define SSL_HS_RSA_PREMASTER_SIZE 48
#define SSL_NULL_WITH_NULL_NULL 0x0000
#define SSL_RSA_WITH_NULL_MD5 0x0001
@@ -112,6 +118,7 @@
//TLS 1.2
#define TLS_MAJ 3
#define TLS_MIN 3
+//#define CIPHER_ID TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA // ok, recvs SERVER_KEY_EXCHANGE *** matrixssl uses this on my box
//#define CIPHER_ID TLS_RSA_WITH_AES_256_CBC_SHA256 // ok, no SERVER_KEY_EXCHANGE
// All GCMs:
//#define CIPHER_ID TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384 // SSL_ALERT_HANDSHAKE_FAILURE
@@ -123,9 +130,9 @@
//#define CIPHER_ID TLS_ECDH_RSA_WITH_AES_256_GCM_SHA384
//#define CIPHER_ID TLS_ECDH_RSA_WITH_AES_128_GCM_SHA256 // SSL_ALERT_HANDSHAKE_FAILURE
//#define CIPHER_ID TLS_RSA_WITH_AES_256_GCM_SHA384 // ok, no SERVER_KEY_EXCHANGE
-#define CIPHER_ID TLS_RSA_WITH_AES_128_GCM_SHA256 // ok, no SERVER_KEY_EXCHANGE
+#define CIPHER_ID TLS_RSA_WITH_AES_128_GCM_SHA256 // ok, no SERVER_KEY_EXCHANGE *** select this?
//#define CIPHER_ID TLS_DH_anon_WITH_AES_256_CBC_SHA // SSL_ALERT_HANDSHAKE_FAILURE
-// (tested b/c this one doesn't req server certs... no luck)
+//^^^^^^^^^^^^^^^^^^^^^^^ (tested b/c this one doesn't req server certs... no luck)
//test TLS_RSA_WITH_AES_128_CBC_SHA, in tls 1.2 it's mandated to be always supported
struct record_hdr {
@@ -137,8 +144,7 @@ struct record_hdr {
typedef struct tls_state {
int fd;
- uint8_t *pubkey;
- int pubkey_len;
+ psRsaKey_t server_rsa_pub_key;
// RFC 5246
// |6.2.1. Fragmentation
@@ -170,6 +176,12 @@ typedef struct tls_state {
uint8_t inbuf[18*1024];
} tls_state_t;
+/* Fill buf with len bytes read from /dev/urandom; die unless exactly len bytes were obtained */
+void tls_get_random(void *buf, unsigned len)
+{
+	if (open_read_close("/dev/urandom", buf, len) == len)
+		return;
+	xfunc_die();
+}
+
static
tls_state_t *new_tls_state(void)
{
@@ -286,7 +298,7 @@ static void send_client_hello(tls_state_t *tls)
hello.len24_lo = (sizeof(hello) - sizeof(hello.xhdr) - 4);
hello.proto_maj = TLS_MAJ;
hello.proto_min = TLS_MIN;
- open_read_close("/dev/urandom", hello.rand32, sizeof(hello.rand32));
+ tls_get_random(hello.rand32, sizeof(hello.rand32));
//hello.session_id_len = 0;
//hello.cipherid_len16_hi = 0;
hello.cipherid_len16_lo = 2 * 1;
@@ -407,7 +419,18 @@ static uint8_t *skip_der_item(uint8_t *der, uint8_t *end)
return new_der;
}
-static void *find_key_in_der_cert(int *key_len, uint8_t *der, int len)
+/*
+ * Load the content bytes of the DER item at der (bounded by end) into pstm_n,
+ * most significant byte first.
+ * Dies on an item with no content bytes: there is no first byte to look at,
+ * and an empty INTEGER is not a valid DER encoding anyway.
+ */
+static void der_binary_to_pstm(pstm_int *pstm_n, uint8_t *der, uint8_t *end)
+{
+	uint8_t *bin_ptr;
+	unsigned len = get_der_len(&bin_ptr, der, end);
+
+	if (len == 0)
+		xfunc_die();
+	dbg("binary bytes:%u, first:0x%02x\n", len, bin_ptr[0]);
+	pstm_init_for_read_unsigned_bin(/*pool:*/ NULL, pstm_n, len);
+	pstm_read_unsigned_bin(pstm_n, bin_ptr, len);
+}
+
+static void find_key_in_der_cert(tls_state_t *tls, uint8_t *der, int len)
{
/* Certificate is a DER-encoded data structure. Each DER element has a length,
* which makes it easy to skip over large compound elements of any complexity
@@ -504,19 +527,43 @@ static void *find_key_in_der_cert(int *key_len, uint8_t *der, int len)
der = skip_der_item(der, end); /* validity */
der = skip_der_item(der, end); /* subject */
- /* enter "subjectPublicKeyInfo" */
+ /* enter subjectPublicKeyInfo */
der = enter_der_item(der, &end);
-
- /* skip "subjectPublicKeyInfo.algorithm" */
+ { /* check subjectPublicKeyInfo.algorithm */
+ static const uint8_t expected[] = {
+ 0x30,0x0d, // SEQ 13 bytes
+ 0x06,0x09, 0x2a,0x86,0x48,0x86,0xf7,0x0d,0x01,0x01,0x01, // OID RSA_KEY_ALG 42.134.72.134.247.13.1.1.1
+ //0x05,0x00, // NULL
+ };
+ if (memcmp(der, expected, sizeof(expected)) != 0)
+ bb_error_msg_and_die("not RSA key");
+ }
+ /* skip subjectPublicKeyInfo.algorithm */
der = skip_der_item(der, end);
- /* enter "subjectPublicKeyInfo.publicKey" */
+ /* enter subjectPublicKeyInfo.publicKey */
// die_if_not_this_der_type(der, end, 0x03); /* must be BITSTRING */
der = enter_der_item(der, &end);
- /* return a copy */
- *key_len = end - der;
- dbg("copying key bytes:%u, first:0x%02x\n", *key_len, der[0]);
- return xmemdup(der, *key_len);
+ /* parse RSA key: */
+//based on getAsnRsaPubKey(), pkcs1ParsePrivBin() is also of note
+ dbg("key bytes:%u, first:0x%02x\n", (int)(end - der), der[0]);
+ if (end - der < 14) xfunc_die();
+ /* example format:
+ * ignore bits: 00
+ * SEQ 0x018a/394 bytes: 3082018a
+ * INTEGER 0x0181/385 bytes (modulus): 02820181 XX...XXX
+ * INTEGER 3 bytes (exponent): 0203 010001
+ */
+ if (*der != 0) /* "ignore bits", should be 0 */
+ xfunc_die();
+ der++;
+ der = enter_der_item(der, &end); /* enter SEQ */
+ //memset(tls->server_rsa_pub_key, 0, sizeof(tls->server_rsa_pub_key));
+ der_binary_to_pstm(&tls->server_rsa_pub_key.N, der, end); /* modulus */
+ der = skip_der_item(der, end);
+ der_binary_to_pstm(&tls->server_rsa_pub_key.e, der, end); /* exponent */
+ tls->server_rsa_pub_key.size = pstm_unsigned_bin_size(&tls->server_rsa_pub_key.N);
+ dbg("server_rsa_pub_key.size:%d\n", tls->server_rsa_pub_key.size);
}
static void get_server_cert_or_die(tls_state_t *tls)
@@ -553,7 +600,107 @@ static void get_server_cert_or_die(tls_state_t *tls)
len = len1;
if (len)
- tls->pubkey = find_key_in_der_cert(&tls->pubkey_len, certbuf + 10, len);
+ find_key_in_der_cert(tls, certbuf + 10, len);
+}
+
+/*
+ * Build and send the CLIENT_KEY_EXCHANGE handshake record: a fresh random
+ * premaster secret (first two bytes overwritten with the protocol version),
+ * RSA-encrypted with the server's public key.
+ * All length fields are derived from the server key size, so the record is
+ * well-formed for any key that fits into the key[] buffer.
+ * Dies if the key does not fit or if encryption reports an error.
+ */
+static void send_client_key_exchange(tls_state_t *tls)
+{
+#if 0 //matrixssl code snippets:
+	int32 csRsaEncryptPub(psPool_t *pool, psPubKey_t *key,
+		unsigned char *in, uint32 inlen, unsigned char *out, uint32 outlen,
+		void *data)
+	{
+		psAssert(key->type == PS_RSA);
+		return psRsaEncryptPub(pool, (psRsaKey_t*)key->key, in, inlen, out, outlen,
+			data);
+	}
+...
+	/* pkaAfter.user is buffer len */
+	if ((rc = csRsaEncryptPub(pka->pool, &ssl->sec.cert->publicKey,
+			ssl->sec.premaster, ssl->sec.premasterSize, pka->outbuf,
+			pka->user, pka->data)) < 0) {
+		if (rc == PS_PENDING) {
+			/* For these ClientKeyExchange paths, we do want to come
+				back through nowDoCkePka for a double pass so each
+				case can manage its own pkaAfter and to make sure
+				psX509FreeCert and sslCreateKeys() are hit below. */
+			return rc;
+		}
+		psTraceIntInfo("csRsaEncryptPub in CKE failed %d\n", rc);
+		return MATRIXSSL_ERROR;
+	}
+	/* RSA closed the pool on second pass */
+	pka->pool = NULL;
+	clearPkaAfter(ssl);
+...
+#ifdef USE_RSA_CIPHER_SUITE
+/*
+	Standard RSA suite
+*/
+	ssl->sec.premasterSize = SSL_HS_RSA_PREMASTER_SIZE;
+	ssl->sec.premaster = psMalloc(ssl->hsPool,
+		SSL_HS_RSA_PREMASTER_SIZE);
+	if (ssl->sec.premaster == NULL) {
+		return SSL_MEM_ERROR;
+	}
+
+	ssl->sec.premaster[0] = ssl->reqMajVer;
+	ssl->sec.premaster[1] = ssl->reqMinVer;
+	if (matrixCryptoGetPrngData(ssl->sec.premaster + 2,
+			SSL_HS_RSA_PREMASTER_SIZE - 2, ssl->userPtr) < 0) {
+		return MATRIXSSL_ERROR;
+	}
+
+	/* Shedule RSA encryption. Put tmp pool under control of After */
+	pkaAfter->type = PKA_AFTER_RSA_ENCRYPT;
+	pkaAfter->outbuf = c;
+	pkaAfter->data = pkiData;
+	pkaAfter->pool = pkiPool;
+	pkaAfter->user = (uint32)(end - c); /* Available space */
+
+	c += keyLen;
+#endif
+#endif // 0
+
+	struct client_key_exchange {
+		struct record_hdr xhdr;
+		uint8_t type;
+		uint8_t len24_hi, len24_mid, len24_lo;
+		uint8_t keylen16_hi, keylen16_lo; /* exist for RSA, but not for some other key types */
+//had a bug when had no keylen: we:
+//write(3, "\x16\x03\x03\x01\x84\x10\x00\x01\x80\xXX\xXX\xXX\xXX\xXX\xXX...", 393) = 393
+//openssl:
+//write to 0xe9a090 [0xf9ac20] (395 bytes => 395 (0x18B))
+//0000 - 16 03 03 01 86 10 00 01 -82 01 80 xx xx xx xx xx
+		uint8_t key[512]; /* room for RSA moduli of up to 4096 bits */
+	};
+	struct client_key_exchange record;
+	uint8_t premaster[SSL_HS_RSA_PREMASTER_SIZE];
+	/* the encrypted premaster is as long as the server's modulus */
+	unsigned keylen = tls->server_rsa_pub_key.size;
+	unsigned hs_len;  /* handshake body: keylen16 field + encrypted key */
+	unsigned rec_len; /* record payload: handshake type + len24 + body */
+
+	if (keylen == 0 || keylen > sizeof(record.key))
+		bb_error_msg_and_die("unsupported RSA key size %u", keylen);
+	hs_len = keylen + 2;
+	rec_len = hs_len + 4;
+
+	memset(&record, 0, sizeof(record));
+	record.xhdr.type = RECORD_TYPE_HANDSHAKE;
+	record.xhdr.proto_maj = TLS_MAJ;
+	record.xhdr.proto_min = TLS_MIN;
+	record.xhdr.len16_hi = rec_len >> 8;
+	record.xhdr.len16_lo = rec_len & 0xff;
+	record.type = HANDSHAKE_CLIENT_KEY_EXCHANGE;
+	//record.len24_hi = 0;
+	record.len24_mid = hs_len >> 8;
+	record.len24_lo = hs_len & 0xff;
+	record.keylen16_hi = keylen >> 8;
+	record.keylen16_lo = keylen & 0xff;
+
+	tls_get_random(premaster, sizeof(premaster));
+	premaster[0] = TLS_MAJ;
+	premaster[1] = TLS_MIN;
+	/* failure codes are negative (see the PS_* values in tls.h) */
+	if (psRsaEncryptPub(/*pool:*/ NULL,
+			/* psRsaKey_t* */ &tls->server_rsa_pub_key,
+			premaster, /*inlen:*/ sizeof(premaster),
+			record.key, sizeof(record.key),
+			data_param_ignored
+	) < 0) {
+		bb_error_msg_and_die("RSA encryption failed");
+	}
+
+	/* send only the header and the bytes actually used, not the whole buffer */
+	xwrite(tls->fd, &record, sizeof(record.xhdr) + rec_len);
}
static void tls_handshake(tls_state_t *tls)
@@ -614,6 +761,8 @@ static void tls_handshake(tls_state_t *tls)
// 459 bytes:
// 0c 00|01|c7 03|00|17|41|04|87|94|2e|2f|68|d0|c9|f4|97|a8|2d|ef|ed|67|ea|c6|f3|b3|56|47|5d|27|b6|bd|ee|70|25|30|5e|b0|8e|f6|21|5a...
//SvKey len=455^
+ // with TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA: 461 bytes:
+ // 0c 00|01|c9 03|00|17|41|04|cd|9b|b4|29|1f|f6|b0|c2|84|82|7f|29|6a|47|4e|ec|87|0b|c1|9c|69|e1|f8|c6|d0|53|e9|27|90|a5|c8|02|15|75...
dbg("got SERVER_KEY_EXCHANGE\n");
len = xread_tls_block(tls);
break;
@@ -624,6 +773,8 @@ static void tls_handshake(tls_state_t *tls)
case HANDSHAKE_SERVER_HELLO_DONE:
// 0e 000000 (len:0)
dbg("got SERVER_HELLO_DONE\n");
+ send_client_key_exchange(tls);
+ len = xread_tls_block(tls);
break;
default:
tls_error_die(tls);
diff --git a/networking/tls.h b/networking/tls.h
new file mode 100644
index 000000000..20317ecc3
--- /dev/null
+++ b/networking/tls.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+#include "libbb.h"
+
+/* config tweaks */
+#define HAVE_NATIVE_INT64 1
+#undef DISABLE_PSTM
+#undef USE_1024_KEY_SPEED_OPTIMIZATIONS
+#undef USE_2048_KEY_SPEED_OPTIMIZATIONS
+//TODO: enable to use asm:
+//#if defined(__GNUC__) && defined(__i386__) -> #define PSTM_32BIT and PSTM_X86
+//#if defined(__GNUC__) && defined(__x86_64__) -> #define PSTM_64BIT and PSTM_X86_64
+//ARM and MIPS also have these
+
+
+#define PS_SUCCESS 0
+#define PS_FAILURE -1
+#define PS_ARG_FAIL -6 /* Failure due to bad function param */
+#define PS_PLATFORM_FAIL -7 /* Failure as a result of system call error */
+#define PS_MEM_FAIL -8 /* Failure to allocate requested memory */
+#define PS_LIMIT_FAIL -9 /* Failure on sanity/limit tests */
+
+#define PS_TRUE 1
+#define PS_FALSE 0
+
+#if BB_BIG_ENDIAN
+# define ENDIAN_BIG 1
+# undef ENDIAN_LITTLE
+//#???? ENDIAN_32BITWORD
+// controls only STORE32L, which we don't use
+#else
+# define ENDIAN_LITTLE 1
+# undef ENDIAN_BIG
+#endif
+
+typedef uint64_t uint64;
+typedef int64_t int64;
+typedef uint32_t uint32;
+typedef int32_t int32;
+typedef uint16_t uint16;
+typedef int16_t int16;
+
+//FIXME
+typedef char psPool_t;
+
+//#ifdef PS_PUBKEY_OPTIMIZE_FOR_SMALLER_RAM
+#define PS_EXPTMOD_WINSIZE 3
+//#ifdef PS_PUBKEY_OPTIMIZE_FOR_FASTER_SPEED
+//#define PS_EXPTMOD_WINSIZE 5
+
+#define PUBKEY_TYPE 0x01
+#define PRIVKEY_TYPE 0x02
+
+void tls_get_random(void *buf, unsigned len);
+
+#define matrixCryptoGetPrngData(buf, len, userPtr) (tls_get_random(buf, len), PS_SUCCESS)
+
+#define psFree(p, pool) free(p)
+#define psTraceCrypto(msg) bb_error_msg_and_die(msg)
+
+/* "Secure" zerofill: mapped to plain memset() here, so the compiler may optimize it away */
+#define memset_s(A,B,C,D) memset((A),(C),(D))
+/* "Constant time" memory comparison: mapped to plain memcmp() here, which is not guaranteed to be constant-time */
+#define memcmpct(s1, s2, len) memcmp((s1), (s2), (len))
+#undef min
+#define min(x, y) ((x) < (y) ? (x) : (y))
+
+
+#include "tls_pstm.h"
+#include "tls_rsa.h"
diff --git a/networking/tls_pstm.c b/networking/tls_pstm.c
new file mode 100644
index 000000000..0d797f87f
--- /dev/null
+++ b/networking/tls_pstm.c
@@ -0,0 +1,2254 @@
+/*
+ * Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+#include "tls.h"
+
+/**
+ * @file pstm.c
+ * @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
+ *
+ * Multiprecision number implementation.
+ */
+/*
+ * Copyright (c) 2013-2015 INSIDE Secure Corporation
+ * Copyright (c) PeerSec Networks, 2002-2011
+ * All Rights Reserved
+ *
+ * The latest version of this code is available at http://www.matrixssl.org
+ *
+ * This software is open source; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This General Public License does NOT permit incorporating this software
+ * into proprietary programs. If you are unable to comply with the GPL, a
+ * commercial license for this software may be purchased from INSIDE at
+ * http://www.insidesecure.com/eng/Company/Locations
+ *
+ * This program is distributed in WITHOUT ANY WARRANTY; without even the
+ * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+/******************************************************************************/
+
+///bbox
+//#include "../cryptoApi.h"
+#ifndef DISABLE_PSTM
+
+static int32 pstm_mul_2d(pstm_int *a, int16 b, pstm_int *c);
+
+/******************************************************************************/
+/*
+	Initialize a pstm_int with room for 'size' digits.
+	The value starts out as zero with a non-negative sign.
+	Always returns PSTM_OKAY.
+ */
+int32 pstm_init_size(psPool_t *pool, pstm_int * a, uint32 size)
+{
+	/* bookkeeping fields first; none of them depends on the buffer */
+	a->sign = PSTM_ZPOS;
+	a->used = 0;
+	a->alloc = (int16)size;
+	a->pool = pool;
+	/*
+	 * No explicit zeroing loop: the digit buffer is expected to come back
+	 * already cleared from xzalloc().
+	 */
+	a->dp = xzalloc(sizeof (pstm_digit) * size);
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Initialize a pstm_int with the default capacity of PSTM_DEFAULT_INIT digits.
+	The value starts out as zero with a non-negative sign.
+	Always returns PSTM_OKAY.
+*/
+int32 pstm_init(psPool_t *pool, pstm_int * a)
+{
+	/* sign, digit count, capacity and owning pool */
+	a->sign = PSTM_ZPOS;
+	a->used = 0;
+	a->alloc = PSTM_DEFAULT_INIT;
+	a->pool = pool;
+	/*
+	 * No explicit zeroing loop: the digit buffer is expected to come back
+	 * already cleared from xzalloc().
+	 */
+	a->dp = xzalloc(sizeof (pstm_digit) * PSTM_DEFAULT_INIT);
+
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Make sure a can hold at least 'size' digits.
+	Does nothing when the capacity is already sufficient; otherwise the digit
+	buffer is reallocated and the added digits are set to zero.
+	Always returns PSTM_OKAY.
+ */
+int32 pstm_grow(pstm_int * a, int16 size)
+{
+	int16 idx;
+
+	/* already large enough */
+	if (a->alloc >= size) {
+		return PSTM_OKAY;
+	}
+
+	a->dp = xrealloc(a->dp, sizeof (pstm_digit) * size);
+
+	/* clear the freshly added tail, from the old capacity up to the new one */
+	for (idx = a->alloc; idx < size; idx++) {
+		a->dp[idx] = 0;
+	}
+	a->alloc = size;
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Copy a into b (b must already be initialized).
+	b is grown if it cannot hold a's digits; digits of b above a's length are
+	cleared. Returns PSTM_OKAY, or the error reported by pstm_grow().
+ */
+int32 pstm_copy(pstm_int * a, pstm_int * b)
+{
+	int32 res, idx;
+
+	/* copying onto itself is a no-op */
+	if (a == b) {
+		return PSTM_OKAY;
+	}
+
+	/* make room in the destination */
+	if (b->alloc < a->used) {
+		res = pstm_grow(b, a->used);
+		if (res != PSTM_OKAY) {
+			return res;
+		}
+	}
+
+	/* transfer every digit in use */
+	for (idx = 0; idx < a->used; idx++) {
+		b->dp[idx] = a->dp[idx];
+	}
+	/* wipe whatever b held above that */
+	for (; idx < b->used; idx++) {
+		b->dp[idx] = 0;
+	}
+
+	/* digit count and sign follow the source */
+	b->used = a->used;
+	b->sign = a->sign;
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Trim leading zero digits.
+
+	Lowers 'used' until the most significant digit in use is non-zero (or
+	no digits remain). A value with no digits left gets its sign reset to
+	non-negative.
+*/
+void pstm_clamp(pstm_int * a)
+{
+	/* drop the top digit for as long as it is zero */
+	for (; a->used > 0; --(a->used)) {
+		if (a->dp[a->used - 1] != 0) {
+			break;
+		}
+	}
+	/* zero is never negative */
+	if (a->used == 0) {
+		a->sign = PSTM_ZPOS;
+	}
+}
+
+/******************************************************************************/
+/*
+	Release one pstm_int.
+	Zeroes the digits in use, frees the digit buffer and resets all fields.
+	Safe to call with NULL or with an already cleared value.
+ */
+void pstm_clear(pstm_int * a)
+{
+	int32 idx;
+
+	/* nothing to release */
+	if (a == NULL || a->dp == NULL) {
+		return;
+	}
+
+	/* wipe the digits in use before handing the memory back */
+	for (idx = 0; idx < a->used; idx++) {
+		a->dp[idx] = 0;
+	}
+	psFree (a->dp, a->pool);
+
+	/* leave a recognizable "empty" state behind */
+	a->dp = NULL;
+	a->used = 0;
+	a->alloc = 0;
+	a->sign = PSTM_ZPOS;
+}
+
+/******************************************************************************/
+/*
+	Release up to eight pstm_ints.
+	Arguments are processed in order and processing stops at the first NULL
+	argument, so anything after a NULL is left untouched.
+ */
+void pstm_clear_multi(pstm_int *mp0, pstm_int *mp1, pstm_int *mp2,
+					pstm_int *mp3, pstm_int *mp4, pstm_int *mp5,
+					pstm_int *mp6, pstm_int *mp7)
+{
+	/* NULL-terminated so the walk below always ends */
+	pstm_int *list[] = {
+		mp0, mp1, mp2, mp3, mp4, mp5, mp6, mp7, NULL
+	};
+	pstm_int **cur;
+
+	for (cur = list; *cur != NULL; cur++) {
+		/* skip values whose buffer is already gone */
+		if ((*cur)->dp != NULL) {
+			pstm_clear(*cur);
+		}
+	}
+}
+
+/******************************************************************************/
+/*
+	Set a to zero: clear every allocated digit, not just the ones in use,
+	and reset the digit count and sign.
+ */
+void pstm_zero(pstm_int * a)
+{
+	int32 idx;
+
+	for (idx = 0; idx < a->alloc; idx++) {
+		a->dp[idx] = 0;
+	}
+	a->used = 0;
+	a->sign = PSTM_ZPOS;
+}
+
+
+/******************************************************************************/
+/*
+	Compare the magnitudes of two ints, ignoring their signs.
+	Returns PSTM_GT, PSTM_LT or PSTM_EQ.
+ */
+int32 pstm_cmp_mag(pstm_int * a, pstm_int * b)
+{
+	int32 idx;
+
+	/* the value with more digits in use is the larger one */
+	if (a->used != b->used) {
+		return (a->used > b->used) ? PSTM_GT : PSTM_LT;
+	}
+
+	/* same length: the first differing digit from the top decides */
+	for (idx = a->used - 1; idx >= 0; idx--) {
+		if (a->dp[idx] != b->dp[idx]) {
+			return (a->dp[idx] > b->dp[idx]) ? PSTM_GT : PSTM_LT;
+		}
+	}
+	return PSTM_EQ;
+}
+
+/******************************************************************************/
+/*
+	Signed comparison of two ints.
+	Returns PSTM_GT, PSTM_LT or PSTM_EQ.
+ */
+int32 pstm_cmp(pstm_int * a, pstm_int * b)
+{
+	/* different signs: the negative one is the smaller */
+	if (a->sign != b->sign) {
+		return (a->sign == PSTM_NEG) ? PSTM_LT : PSTM_GT;
+	}
+	/* same sign: compare magnitudes, with operands swapped for negatives */
+	return (a->sign == PSTM_NEG) ? pstm_cmp_mag(b, a) : pstm_cmp_mag(a, b);
+}
+
+/******************************************************************************/
+/*
+	Initialize a pstm_int sized for a following pstm_read_unsigned_bin()
+	of 'len' bytes, since the length of the byte stream is known up front.
+	Returns whatever pstm_init_size() returns.
+*/
+int32 pstm_init_for_read_unsigned_bin(psPool_t *pool, pstm_int *a, uint32 len)
+{
+	/* number of completely filled digits, and the bits they hold */
+	size_t whole_digits = len / sizeof(pstm_digit);
+	size_t whole_bits = whole_digits * (sizeof(pstm_digit) * CHAR_BIT);
+	int32 size;
+
+	/*
+	 * The "+ 2":
+	 *   1 to round up for the remainder lost in the integer division
+	 *   1 for the initial carry of '1' bits that fall between DIGIT_BIT and 8
+	 */
+	size = (whole_bits / DIGIT_BIT) + 2;
+	return pstm_init_size(pool, a, size);
+}
+
+
+/******************************************************************************/
+/*
+	Read c bytes at b (most significant byte first) into a.
+	The caller should have used pstm_init_for_read_unsigned_bin() first;
+	a is grown here if it turns out to be too small.
+
+	c == 0 leaves a at zero and succeeds; c < 0 is rejected with
+	PS_ARG_FAIL. Both are handled up front because the loops below are
+	only correct for a positive length: the big-endian unrolled loop would
+	write in front of the digit buffer for c == 0.
+
+	Returns PS_SUCCESS, PS_ARG_FAIL, PSTM_MEM or PS_MEM_FAIL.
+*/
+int32 pstm_read_unsigned_bin(pstm_int *a, unsigned char *b, int32 c)
+{
+	/* zero the int */
+	pstm_zero (a);
+
+	if (c <= 0) {
+		return (c == 0) ? PS_SUCCESS : PS_ARG_FAIL;
+	}
+
+/*
+	If we know the endianness of this architecture, and we're using
+	32-bit pstm_digits, we can optimize this
+*/
+#if (defined(ENDIAN_LITTLE) || defined(ENDIAN_BIG)) && !defined(PSTM_64BIT)
+	/* But not for both simultaneously */
+#if defined(ENDIAN_LITTLE) && defined(ENDIAN_BIG)
+#error Both ENDIAN_LITTLE and ENDIAN_BIG defined.
+#endif
+	{
+		unsigned char *pd;
+		/* input longer than the largest supported value: keep only the trailing bytes */
+		if ((unsigned)c > (PSTM_MAX_SIZE * sizeof(pstm_digit))) {
+			uint32 excess = c - (PSTM_MAX_SIZE * sizeof(pstm_digit));
+			c -= excess;
+			b += excess;
+		}
+		a->used = (int16)((c + sizeof(pstm_digit) - 1)/sizeof(pstm_digit));
+		if (a->alloc < a->used) {
+			if (pstm_grow(a, a->used) != PSTM_OKAY) {
+				return PSTM_MEM;
+			}
+		}
+		pd = (unsigned char *)a->dp;
+		/* read the bytes in */
+#ifdef ENDIAN_BIG
+		{
+			/* Use Duff's device to unroll the loop. */
+			int32 idx = (c - 1) & ~3;
+			switch (c % 4) {
+			case 0: do { pd[idx+0] = *b++;
+			case 3:      pd[idx+1] = *b++;
+			case 2:      pd[idx+2] = *b++;
+			case 1:      pd[idx+3] = *b++;
+				idx -= 4;
+				} while ((c -= 4) > 0);
+			}
+		}
+#else
+		/* the first input byte lands in the highest byte position */
+		for (c -= 1; c >= 0; c -= 1) {
+			pd[c] = *b++;
+		}
+#endif
+	}
+#else
+	/* Big enough based on the len? */
+	a->used = (((c / sizeof(pstm_digit)) * (sizeof(pstm_digit) * CHAR_BIT))
+		/ DIGIT_BIT) + 2;
+
+	if (a->alloc < a->used) {
+		if (pstm_grow(a, a->used) != PSTM_OKAY) {
+			return PSTM_MEM;
+		}
+	}
+	/* read the bytes in */
+	for (; c > 0; c--) {
+		if (pstm_mul_2d (a, 8, a) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+		a->dp[0] |= *b++;
+		a->used += 1;
+	}
+#endif
+
+	pstm_clamp (a);
+	return PS_SUCCESS;
+}
+
+/******************************************************************************/
+/*
+	Number of significant bits in a; 0 when no digits are in use.
+*/
+int16 pstm_count_bits (pstm_int * a)
+{
+	int16 bits;
+	pstm_digit top;
+
+	if (a->used == 0) {
+		return 0;
+	}
+
+	/* every digit below the top one contributes DIGIT_BIT bits */
+	bits = (a->used - 1) * DIGIT_BIT;
+
+	/* add one bit per shift needed to empty the top digit */
+	for (top = a->dp[a->used - 1]; top > ((pstm_digit) 0); top >>= ((pstm_digit) 1)) {
+		++bits;
+	}
+	return bits;
+}
+
+/******************************************************************************/
+/* Number of bytes needed to hold the significant bits of a */
+int32 pstm_unsigned_bin_size(pstm_int *a)
+{
+	int32 bits = pstm_count_bits (a);
+	int32 bytes = bits / 8;
+
+	/* a partially filled last byte still counts */
+	if ((bits & 7) != 0) {
+		bytes += 1;
+	}
+	return bytes;
+}
+
+/******************************************************************************/
+/* Set a to the single-digit value b */
+void pstm_set(pstm_int *a, pstm_digit b)
+{
+	pstm_zero(a);
+	a->dp[0] = b;
+	/* a zero digit means no digits are in use */
+	a->used = (b != 0) ? 1 : 0;
+}
+
+/******************************************************************************/
+/*
+	Shift a right by x whole digits.
+	Shifting by at least as many digits as are in use leaves zero.
+*/
+void pstm_rshd(pstm_int *a, int16 x)
+{
+	int16 pos;
+	int32 remaining;
+
+	/* everything is shifted out */
+	if (x >= a->used) {
+		pstm_zero(a);
+		return;
+	}
+
+	/* move the surviving digits down to the bottom */
+	remaining = a->used - x;
+	for (pos = 0; pos < remaining; pos++) {
+		a->dp[pos] = a->dp[pos + x];
+	}
+
+	/* clear the positions they vacated */
+	while (pos < a->used) {
+		a->dp[pos++] = 0;
+	}
+
+	a->used = remaining;
+	pstm_clamp(a);
+}
+
+/******************************************************************************/
+/*
+	Shift a left by b whole digits, growing it as needed.
+	A shift count of zero or less leaves a unchanged.
+	Returns PSTM_OKAY, or the error reported by pstm_grow().
+ */
+int32 pstm_lshd(pstm_int * a, int16 b)
+{
+	int16 idx;
+	int32 res;
+
+	/* nothing to shift */
+	if (b <= 0) {
+		return PSTM_OKAY;
+	}
+
+	/* make room for the extra digits */
+	if (a->alloc < a->used + b) {
+		res = pstm_grow(a, a->used + b);
+		if (res != PSTM_OKAY) {
+			return res;
+		}
+	}
+
+	a->used += b;
+
+	/*
+	 * Walk from the top down so that no digit is overwritten before it
+	 * has been moved: positions b and above take the digit from b places
+	 * lower, positions below b become zero.
+	 */
+	for (idx = a->used - 1; idx >= 0; idx--) {
+		a->dp[idx] = (idx >= b) ? a->dp[idx - b] : 0;
+	}
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Compute a = 2**b.
+	A negative b leaves a at zero and still returns PSTM_OKAY.
+	Returns PS_LIMIT_FAIL when the bit lies at or beyond digit PSTM_MAX_SIZE,
+	PS_MEM_FAIL when growing a fails.
+*/
+int32 pstm_2expt(pstm_int *a, int16 b)
+{
+	int16 digit_idx;
+
+	/* start from zero in every case */
+	pstm_zero (a);
+
+	if (b < 0) {
+		return PSTM_OKAY;
+	}
+
+	/* which digit receives the bit */
+	digit_idx = b / DIGIT_BIT;
+	if (digit_idx >= PSTM_MAX_SIZE) {
+		return PS_LIMIT_FAIL;
+	}
+
+	a->used = digit_idx + 1;
+	if (a->used > a->alloc && pstm_grow(a, a->used) != PSTM_OKAY) {
+		return PS_MEM_FAIL;
+	}
+
+	/* and where inside that digit it goes */
+	a->dp[digit_idx] = ((pstm_digit)1) << (b % DIGIT_BIT);
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Compute b = a * 2 (a one-bit left shift); a and b may be the same object.
+	b is grown to hold one digit more than a uses.
+	Returns PSTM_OKAY, or the error reported by pstm_grow().
+*/
+int32 pstm_mul_2(pstm_int * a, pstm_int * b)
+{
+	int32 res;
+	int16 idx, prev_used;
+	pstm_digit carry, cur;
+
+	/* the result may need one more digit than the source */
+	if (b->alloc < a->used + 1) {
+		res = pstm_grow(b, a->used + 1);
+		if (res != PSTM_OKAY) {
+			return res;
+		}
+	}
+	prev_used = b->used;
+	b->used = a->used;
+
+	carry = 0;
+	for (idx = 0; idx < a->used; idx++) {
+		/* read the source digit once, before the destination is written */
+		cur = a->dp[idx];
+		/* shift up by one and bring in the bit carried from below */
+		b->dp[idx] = ((cur << ((pstm_digit)1)) | carry);
+		/* the top bit of this digit is the carry into the next one */
+		carry = cur >> ((pstm_digit)(DIGIT_BIT - 1));
+	}
+
+	/* a carry out of the top digit becomes a new leading digit of 1 */
+	if (carry != 0 && b->used != (PSTM_MAX_SIZE-1)) {
+		b->dp[idx] = 1;
+		++(b->used);
+	}
+
+	/* clear digits the destination used before but that were not written now */
+	for (idx = b->used; idx < prev_used; idx++) {
+		b->dp[idx] = 0;
+	}
+	b->sign = a->sign;
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Unsigned subtraction c = |a| - |b|. The caller must guarantee
+	|a| >= |b|; only the digit counts are checked here.
+	Returns PS_LIMIT_FAIL if b uses more digits than a, the pstm_grow()
+	error if c cannot be enlarged, PSTM_OKAY otherwise.
+*/
+int32 s_pstm_sub(pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	int16 oldbused, oldused;
+	int32 x;
+	pstm_word t;
+
+	if (b->used > a->used) {
+		return PS_LIMIT_FAIL;
+	}
+	if (c->alloc < a->used) {
+		if ((x = pstm_grow (c, a->used)) != PSTM_OKAY) {
+			return x;
+		}
+	}
+	/* remember the lengths before c->used is rewritten (c may be a or b) */
+	oldused = c->used;
+	oldbused = b->used;
+	c->used = a->used;
+	t = 0;
+
+	/* digits present in both operands; t carries the borrow (0 or 1) */
+	for (x = 0; x < oldbused; x++) {
+		t = ((pstm_word)a->dp[x]) - (((pstm_word)b->dp[x]) + t);
+		c->dp[x] = (pstm_digit)t;
+		t = (t >> DIGIT_BIT)&1;
+	}
+	/* remaining digits of a: only the borrow is left to subtract */
+	for (; x < a->used; x++) {
+		t = ((pstm_word)a->dp[x]) - t;
+		c->dp[x] = (pstm_digit)t;
+		/*
+		 * Reduce to a single borrow bit, as the loop above does. Without
+		 * the mask, all the upper bits of t would be subtracted from the
+		 * next digit whenever the borrow crosses more than one digit.
+		 */
+		t = (t >> DIGIT_BIT)&1;
+	}
+	/* clear digits c used before but that were not written now */
+	for (; x < oldused; x++) {
+		c->dp[x] = 0;
+	}
+	pstm_clamp(c);
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Unsigned addition c = |a| + |b|; c may be the same object as a or b.
+	Returns PS_MEM_FAIL if c cannot be enlarged, PSTM_OKAY otherwise.
+*/
+static int32 s_pstm_add(pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	int16 x, y, oldused, aused, bused;
+	register pstm_word t, adp, bdp;
+
+	/*
+	 * Take the operand lengths before c->used is rewritten: when c is one
+	 * of the operands, that operand's own 'used' changes below.
+	 */
+	aused = a->used;
+	bused = b->used;
+
+	/* the sum has at least as many digits as the longer operand */
+	y = aused;
+	if (bused > y) {
+		y = bused;
+	}
+	oldused = c->used;
+	c->used = y;
+
+	if (c->used > c->alloc) {
+		if (pstm_grow(c, c->used) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+
+	t = 0;
+	for (x = 0; x < y; x++) {
+		/*
+		 * An operand contributes zero at and above its own digit count;
+		 * index 'used' itself is already outside the digits in use and
+		 * may be outside the allocation.
+		 */
+		adp = (x < aused) ? (pstm_word)a->dp[x] : 0;
+		bdp = (x < bused) ? (pstm_word)b->dp[x] : 0;
+		t += (adp) + (bdp);
+		c->dp[x] = (pstm_digit)t;
+		t >>= DIGIT_BIT;
+	}
+	/* a carry out of the top digit becomes one more digit */
+	if (t != 0 && x < PSTM_MAX_SIZE) {
+		if (c->used == c->alloc) {
+			if (pstm_grow(c, c->alloc + 1) != PSTM_OKAY) {
+				return PS_MEM_FAIL;
+			}
+		}
+		c->dp[c->used++] = (pstm_digit)t;
+		++x;
+	}
+
+	c->used = x;
+	/* clear digits c used before but that were not written now */
+	for (; x < oldused; x++) {
+		c->dp[x] = 0;
+	}
+	pstm_clamp(c);
+	return PSTM_OKAY;
+}
+
+
+/******************************************************************************/
+/*
+	Signed subtraction c = a - b.
+	Returns PS_SUCCESS, or the error reported by the unsigned helper used.
+*/
+int32 pstm_sub(pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	int32 res;
+	/* read both signs up front, before c (possibly a or b) is modified */
+	int16 sa = a->sign;
+	int16 sb = b->sign;
+
+	if (sa != sb) {
+		/*
+		 * Operands of opposite sign: add the magnitudes and keep the
+		 * sign of the first operand.
+		 */
+		c->sign = sa;
+		res = s_pstm_add (a, b, c);
+	} else if (pstm_cmp_mag (a, b) != PSTM_LT) {
+		/* same sign, |a| >= |b|: |a| - |b| with the sign of the first */
+		c->sign = sa;
+		res = s_pstm_sub (a, b, c);
+	} else {
+		/* same sign, |a| < |b|: |b| - |a| with the opposite sign */
+		c->sign = (sa == PSTM_ZPOS) ? PSTM_NEG : PSTM_ZPOS;
+		res = s_pstm_sub (b, a, c);
+	}
+
+	if (res != PSTM_OKAY) {
+		return res;
+	}
+	return PS_SUCCESS;
+}
+
+/******************************************************************************/
+/*
+	c = a - b, where b is a single digit.
+	b is wrapped in a temporary pstm_int and passed to pstm_sub().
+	Returns PS_MEM_FAIL if the temporary cannot be set up, otherwise the
+	result of pstm_sub().
+*/
+int32 pstm_sub_d(psPool_t *pool, pstm_int *a, pstm_digit b, pstm_int *c)
+{
+	pstm_int digit_val;
+	int32 rc;
+
+	/* capacity is given in digits; sizeof(pstm_digit) of them are requested */
+	if (pstm_init_size(pool, &digit_val, sizeof(pstm_digit)) != PSTM_OKAY) {
+		return PS_MEM_FAIL;
+	}
+	pstm_set(&digit_val, b);
+	rc = pstm_sub(a, &digit_val, c);
+	/* the temporary is released whether or not the subtraction worked */
+	pstm_clear(&digit_val);
+	return rc;
+}
+
+/******************************************************************************/
+/*
+	Sets up Montgomery reduction for the modulus a.
+
+	Computes, from the least significant digit of a only, the value
+	rho = -1/a mod 2**DIGIT_BIT and stores it in *rho.
+	Returns PS_ARG_FAIL if that digit is even (no inverse exists modulo a
+	power of two), PSTM_OKAY otherwise.
+*/
+int32 pstm_montgomery_setup(pstm_int *a, pstm_digit *rho)
+{
+	pstm_digit m0, inv;
+
+	m0 = a->dp[0];
+
+	if ((m0 & 1) == 0) {
+		psTraceCrypto("pstm_montogomery_setup failure\n");
+		return PS_ARG_FAIL;
+	}
+
+/*
+	Fast inversion mod 2**k, doubling the number of correct bits per step:
+	XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
+*/
+	inv = (((m0 + 2) & 4) << 1) + m0;	/* inv*a == 1 mod 2**4 */
+	inv = inv * (2 - m0 * inv);		/* inv*a == 1 mod 2**8 */
+	inv = inv * (2 - m0 * inv);		/* inv*a == 1 mod 2**16 */
+	inv = inv * (2 - m0 * inv);		/* inv*a == 1 mod 2**32 */
+#ifdef PSTM_64BIT
+	inv = inv * (2 - m0 * inv);		/* inv*a == 1 mod 2**64 */
+#endif
+	/* negate the inverse modulo the digit base: rho = -1/m mod b */
+	*rho = (pstm_digit)(((pstm_word) 1 << ((pstm_word) DIGIT_BIT)) -
+		((pstm_word)inv));
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+ *	Computes a = B**n mod b (B being the digit base, n = b->used) using only
+ *	doubling and conditional subtraction -- no division or multiplication.
+ *	Used to normalize numbers for a Montgomery system.
+ *
+ *	Returns PSTM_OKAY, the error from pstm_2expt(), or PS_MEM_FAIL if a
+ *	doubling or subtraction step fails.
+ */
+int32 pstm_montgomery_calc_normalization(pstm_int *a, pstm_int *b)
+{
+	int16 top_bits;
+	int32 rc, step;
+
+	/* number of bits of b's most significant digit that are in use */
+	top_bits = pstm_count_bits (b) % DIGIT_BIT;
+	if (top_bits == 0) {
+		top_bits = DIGIT_BIT;
+	}
+
+	/* start from A = B^(n-1) * 2^(top_bits-1) */
+	if (b->used <= 1) {
+		pstm_set(a, 1);
+		top_bits = 1;
+	} else {
+		rc = pstm_2expt (a, (b->used - 1) * DIGIT_BIT + top_bits - 1);
+		if (rc != PSTM_OKAY) {
+			return rc;
+		}
+	}
+
+	/* double the remaining times, keeping the value below b after each step */
+	step = top_bits - 1;
+	while (step < (int32)DIGIT_BIT) {
+		if (pstm_mul_2 (a, a) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+		if (pstm_cmp_mag (a, b) != PSTM_LT) {
+			if (s_pstm_sub (a, b, a) != PSTM_OKAY) {
+				return PS_MEM_FAIL;
+			}
+		}
+		step++;
+	}
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	c = a * 2**b  (left shift by b bits)
+
+	Whole-digit shifts are delegated to pstm_lshd(); the remaining
+	sub-digit shift is done in place on c, carrying bits upward.
+	Returns PS_MEM_FAIL if copying, shifting or growing c fails.
+*/
+static int32 pstm_mul_2d(pstm_int *a, int16 b, pstm_int *c)
+{
+	pstm_digit spill, next_spill, down_shift;
+	int16 i, bits;
+
+	/* work on a copy of the input */
+	if (pstm_copy(a, c) != PSTM_OKAY) {
+		return PS_MEM_FAIL;
+	}
+
+	/* shift by whole digits first */
+	if (b >= DIGIT_BIT) {
+		if (pstm_lshd(c, b/DIGIT_BIT) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+	bits = b % DIGIT_BIT;
+
+	/* then by the leftover bit count, if any */
+	if (bits != 0) {
+		spill = 0;
+		down_shift = DIGIT_BIT - bits;
+		i = 0;
+		while (i < c->used) {
+			/* bits pushed out of this digit feed the next one up */
+			next_spill = c->dp[i] >> down_shift;
+			c->dp[i] = (c->dp[i] << bits) + spill;
+			spill = next_spill;
+			i++;
+		}
+		/* append the final carry as a new top digit if there is room */
+		if (spill && i < PSTM_MAX_SIZE) {
+			if (c->used == c->alloc) {
+				if (pstm_grow(c, c->alloc + 1) != PSTM_OKAY) {
+					return PS_MEM_FAIL;
+				}
+			}
+			c->dp[c->used++] = spill;
+		}
+	}
+	pstm_clamp(c);
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	c = a mod 2**b  (keep only the low b bits of a)
+
+	b <= 0 yields zero. If 2**b is at least as wide as a, c is simply a
+	copy of a. Returns PS_MEM_FAIL if the copy fails, PSTM_OKAY otherwise.
+*/
+static int32 pstm_mod_2d(pstm_int *a, int16 b, pstm_int *c)
+{
+	int16 x, top, partial;
+
+	/* zero if count less than or equal to zero */
+	if (b <= 0) {
+		pstm_zero(c);
+		return PSTM_OKAY;
+	}
+
+	/* get copy of input */
+	if (pstm_copy(a, c) != PSTM_OKAY) {
+		return PS_MEM_FAIL;
+	}
+
+	/* if 2**b is larger than a we just return the copy */
+	if (b >= (DIGIT_BIT * a->used)) {
+		return PSTM_OKAY;
+	}
+
+	top = b / DIGIT_BIT;		/* index of the digit that holds bit b */
+	partial = b % DIGIT_BIT;	/* how many low bits of that digit survive */
+
+	/* zero digits above the last digit of the modulus */
+	for (x = top + (partial == 0 ? 0 : 1); x < c->used; x++) {
+		c->dp[x] = 0;
+	}
+/*
+	Clear the high bits of the digit that straddles the boundary. The shift
+	count must be derived from (b % DIGIT_BIT): using (DIGIT_BIT - b) goes
+	negative for b > DIGIT_BIT, which is undefined behaviour. When b is a
+	whole number of digits nothing straddles, and the loop above has already
+	cleared dp[top].
+*/
+	if (partial != 0) {
+		c->dp[top] &= ((pstm_digit)~((pstm_digit)0)) >> (DIGIT_BIT - partial);
+	}
+	pstm_clamp (c);
+	return PSTM_OKAY;
+}
+
+
+/******************************************************************************/
+/*
+	c = a * b, where b is a single digit.
+
+	c is grown to a->used + 1 digits if needed and takes the sign of a.
+	Digits of c beyond the new length that were in use before are zeroed.
+	Returns the pstm_grow() error on allocation failure.
+*/
+int32 pstm_mul_d(pstm_int *a, pstm_digit b, pstm_int *c)
+{
+	pstm_word acc;
+	int32 rc;
+	int16 i, prev_used;
+
+	if (c->alloc < a->used + 1) {
+		rc = pstm_grow (c, a->used + 1);
+		if (rc != PSTM_OKAY) {
+			return rc;
+		}
+	}
+	/* remember the old length before overwriting it (c may alias a) */
+	prev_used = c->used;
+	c->used = a->used;
+	c->sign = a->sign;
+
+	/* schoolbook digit-by-digit multiply with a running carry in acc */
+	acc = 0;
+	i = 0;
+	while (i < a->used) {
+		acc += ((pstm_word)a->dp[i]) * ((pstm_word)b);
+		c->dp[i] = (pstm_digit)acc;
+		acc >>= DIGIT_BIT;
+		i++;
+	}
+	/* final carry becomes a new top digit unless a is already at max size */
+	if (acc != 0 && (a->used != PSTM_MAX_SIZE)) {
+		c->dp[c->used++] = (pstm_digit)acc;
+		++i;
+	}
+	/* wipe stale digits left over from c's previous value */
+	while (i < prev_used) {
+		c->dp[i] = 0;
+		i++;
+	}
+	pstm_clamp(c);
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	c = a / 2**b  (right shift by b bits); optionally d = a mod 2**b.
+
+	d may be NULL when the remainder is not wanted. A shift count <= 0
+	copies a to c and sets d (if given) to zero.
+	Returns PS_MEM_FAIL if any copy/init step fails, PSTM_OKAY otherwise.
+*/
+int32 pstm_div_2d(psPool_t *pool, pstm_int *a, int16 b, pstm_int *c,
+			pstm_int *d)
+{
+	pstm_digit bit_shift, carry_in, carry_out;
+	int32 rc;
+	int16 i;
+	pstm_int rem;
+
+	/* nothing to shift: quotient is a itself, remainder is zero */
+	if (b <= 0) {
+		if (pstm_copy (a, c) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+		if (d != NULL) {
+			pstm_zero (d);
+		}
+		return PSTM_OKAY;
+	}
+
+	/* compute the remainder first, into a temporary (only if requested) */
+	if (d != NULL) {
+		if (pstm_init(pool, &rem) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+		if (pstm_mod_2d (a, b, &rem) != PSTM_OKAY) {
+			rc = PS_MEM_FAIL;
+			goto LBL_DONE;
+		}
+	}
+
+	/* quotient starts as a copy of a */
+	if (pstm_copy(a, c) != PSTM_OKAY) {
+		rc = PS_MEM_FAIL;
+		goto LBL_DONE;
+	}
+
+	/* drop whole digits */
+	if (b >= (int32)DIGIT_BIT) {
+		pstm_rshd (c, b / DIGIT_BIT);
+	}
+
+	/* then shift by the remaining bit count (< DIGIT_BIT) */
+	bit_shift = (pstm_digit) (b % DIGIT_BIT);
+	if (bit_shift != 0) {
+		/* selects the bits that fall out of the bottom of each digit */
+		pstm_digit low_mask = (((pstm_digit)1) << bit_shift) - 1;
+		/* where those bits land in the next lower digit */
+		pstm_digit up_shift = DIGIT_BIT - bit_shift;
+
+		carry_in = 0;
+		/* walk from the most significant digit downward */
+		for (i = c->used - 1; i >= 0; i--) {
+			carry_out = c->dp[i] & low_mask;
+			c->dp[i] = (c->dp[i] >> bit_shift) | (carry_in << up_shift);
+			carry_in = carry_out;
+		}
+	}
+	pstm_clamp (c);
+
+	rc = PSTM_OKAY;
+LBL_DONE:
+	/* hand the remainder to the caller and release the temporary */
+	if (d != NULL) {
+		if (pstm_copy(&rem, d) != PSTM_OKAY) {
+			rc = PS_MEM_FAIL;
+		}
+		pstm_clear(&rem);
+	}
+	return rc;
+}
+
+/******************************************************************************/
+/*
+	b = a / 2  (right shift by one bit)
+
+	b is grown to hold a->used digits if necessary and takes the sign of a.
+	a and b may be the same object. Returns PS_MEM_FAIL if growing fails.
+*/
+int32 pstm_div_2(pstm_int * a, pstm_int * b)
+{
+	pstm_digit carry, low_bit;
+	int16 i, prev_used;
+
+	if (b->alloc < a->used) {
+		if (pstm_grow(b, a->used) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+	prev_used = b->used;
+	b->used = a->used;
+
+	/* from the top digit down, moving each dropped low bit into the top of
+	   the next lower digit */
+	carry = 0;
+	for (i = b->used - 1; i >= 0; i--) {
+		/* read the source digit fully before storing: a may alias b */
+		low_bit = a->dp[i] & 1;
+		b->dp[i] = (a->dp[i] >> 1) | (carry << (DIGIT_BIT - 1));
+		carry = low_bit;
+	}
+
+	/* zero digits that belonged to b's previous, longer value */
+	for (i = b->used; i < prev_used; i++) {
+		b->dp[i] = 0;
+	}
+
+	b->sign = a->sign;
+	pstm_clamp (b);
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Initializes "a" and copies b into it.
+
+	If a and b are the same object nothing is done. With toSqr set, a is
+	sized for a subsequent squaring when b already uses roughly half (or
+	more) of its allocation, to avoid a later pstm_grow().
+	Returns the error from pstm_init_size() or pstm_copy().
+ */
+int32 pstm_init_copy(psPool_t *pool, pstm_int * a, pstm_int * b, int16 toSqr)
+{
+	int16 want;
+	int32 rc;
+
+	if (a == b) {
+		return PSTM_OKAY;
+	}
+
+	/* default: same capacity as the source */
+	want = b->alloc;
+
+	/* smart-size: leave room for the doubled digit count of a square */
+	if (toSqr && (b->used * 2) + 2 >= want) {
+		want = (b->used * 2) + 3;
+	}
+
+	rc = pstm_init_size(pool, a, want);
+	if (rc != PSTM_OKAY) {
+		return rc;
+	}
+	return pstm_copy(b, a);
+}
+
+/******************************************************************************/
+/*
+ With some compilers, we have seen issues linking with the builtin
+ 64 bit division routine. The issues with either manifest in a failure
+ to find 'udivdi3' at link time, or a runtime invalid instruction fault
+ during an RSA operation.
+ The routine below divides a 64 bit unsigned int by a 32 bit unsigned int
+ explicitly, rather than using the division operation
+ The 64 bit result is placed in the 'numerator' parameter
+ The 32 bit mod (remainder) of the division is the return parameter
+ Based on implementations by:
+ Copyright (C) 2003 Bernardo Innocenti <bernie@develer.com>
+ Copyright (C) 1999 Hewlett-Packard Co
+ Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
+*/
+#if defined(USE_MATRIX_DIV64) && defined(PSTM_32BIT)
+/* Shift-and-subtract 64/32 division: quotient is written back through
+   'numerator', the 32 bit remainder is returned. */
+static uint32 psDiv64(uint64 *numerator, uint32 denominator)
+{
+	uint64 remainder = *numerator;
+	uint64 divisor = denominator;
+	uint64 quotient = 0;
+	uint64 bit = 1;
+	uint32 upper = remainder >> 32;
+
+	/* reduce the high word with a native 32 bit division first */
+	if (upper >= denominator) {
+		upper /= denominator;
+		quotient = (uint64) upper << 32;
+		remainder -= (uint64) (upper * denominator) << 32;
+	}
+	/* scale the divisor up until it reaches the remainder, stopping
+	   before its top bit would be shifted out */
+	while ((int64)divisor > 0 && divisor < remainder) {
+		divisor <<= 1;
+		bit <<= 1;
+	}
+	/* classic restoring division, one quotient bit per pass */
+	for (;;) {
+		if (remainder >= divisor) {
+			remainder -= divisor;
+			quotient += bit;
+		}
+		divisor >>= 1;
+		bit >>= 1;
+		if (!bit) {
+			break;
+		}
+	}
+	*numerator = quotient;
+	return remainder;
+}
+#endif /* USE_MATRIX_DIV64 */
+
+#if defined(USE_MATRIX_DIV128) && defined(PSTM_64BIT)
+typedef unsigned long uint128 __attribute__ ((mode(TI)));
+/* 128/64 counterpart of the shift-and-subtract divide: quotient is written
+   back through 'numerator', the 64 bit remainder is returned. */
+static uint64 psDiv128(uint128 *numerator, uint64 denominator)
+{
+	uint128 remainder = *numerator;
+	uint128 divisor = denominator;
+	uint128 quotient = 0;
+	uint128 bit = 1;
+	uint64 upper = remainder >> 64;
+
+	/* reduce the high word with a native 64 bit division first */
+	if (upper >= denominator) {
+		upper /= denominator;
+		quotient = (uint128) upper << 64;
+		remainder -= (uint128) (upper * denominator) << 64;
+	}
+	/* scale the divisor up until it reaches the remainder */
+	while ((uint128)divisor > 0 && divisor < remainder) {
+		divisor <<= 1;
+		bit <<= 1;
+	}
+	/* restoring division, one quotient bit per pass */
+	for (;;) {
+		if (remainder >= divisor) {
+			remainder -= divisor;
+			quotient += bit;
+		}
+		divisor >>= 1;
+		bit >>= 1;
+		if (!bit) {
+			break;
+		}
+	}
+	*numerator = quotient;
+	return remainder;
+}
+#endif /* USE_MATRIX_DIV128 */
+
+/******************************************************************************/
+/*
+ Multiple-precision division: a / b => c*b + d == a
+
+ c receives the quotient and d the remainder; either may be NULL if that
+ result is not wanted.
+
+ Returns PS_LIMIT_FAIL if b is zero, PS_MEM_FAIL or a helper's error code
+ if a temporary cannot be set up or an intermediate step fails, and
+ PSTM_OKAY on success.
+
+ The quotient's sign is PSTM_ZPOS when a and b have the same sign and
+ PSTM_NEG otherwise. The working remainder is given the sign of a unless
+ its digit count is zero at that point.
+*/
+int32 pstm_div(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c,
+ pstm_int *d)
+{
+ pstm_int q, x, y, t1, t2;
+ int32 res;
+ int16 n, t, i, norm, neg;
+
+ /* is divisor zero ? */
+ if (pstm_iszero (b) == 1) {
+ return PS_LIMIT_FAIL;
+ }
+
+ /* if a < b then q=0, r = a */
+ if (pstm_cmp_mag (a, b) == PSTM_LT) {
+ if (d != NULL) {
+ if (pstm_copy(a, d) != PSTM_OKAY) {
+ return PS_MEM_FAIL;
+ }
+ }
+ if (c != NULL) {
+ pstm_zero (c);
+ }
+ return PSTM_OKAY;
+ }
+/*
+ Smart-size inits
+ t1/t2 are scratch values for the quotient-digit estimate,
+ x is the working copy of a (becomes the remainder),
+ y is the working copy of b.
+*/
+ if ((res = pstm_init_size(pool, &t1, a->alloc)) != PSTM_OKAY) {
+ return res;
+ }
+ if ((res = pstm_init_size(pool, &t2, 3)) != PSTM_OKAY) {
+ goto LBL_T1;
+ }
+ if ((res = pstm_init_copy(pool, &x, a, 0)) != PSTM_OKAY) {
+ goto LBL_T2;
+ }
+/*
+ Used to be an init_copy on b but pstm_grow was always hit with triple size
+*/
+ if ((res = pstm_init_size(pool, &y, b->used * 3)) != PSTM_OKAY) {
+ goto LBL_X;
+ }
+ if ((res = pstm_copy(b, &y)) != PSTM_OKAY) {
+ goto LBL_Y;
+ }
+
+ /* fix the sign: remember the quotient's sign, then work on magnitudes */
+ neg = (a->sign == b->sign) ? PSTM_ZPOS : PSTM_NEG;
+ x.sign = y.sign = PSTM_ZPOS;
+
+ /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
+ norm = pstm_count_bits(&y) % DIGIT_BIT;
+ if (norm < (int32)(DIGIT_BIT-1)) {
+ norm = (DIGIT_BIT-1) - norm;
+ if ((res = pstm_mul_2d(&x, norm, &x)) != PSTM_OKAY) {
+ goto LBL_Y;
+ }
+ if ((res = pstm_mul_2d(&y, norm, &y)) != PSTM_OKAY) {
+ goto LBL_Y;
+ }
+ } else {
+ norm = 0;
+ }
+
+ /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
+ n = x.used - 1;
+ t = y.used - 1;
+
+ if ((res = pstm_init_size(pool, &q, n - t + 1)) != PSTM_OKAY) {
+ goto LBL_Y;
+ }
+ q.used = n - t + 1;
+
+ /* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
+ if ((res = pstm_lshd(&y, n - t)) != PSTM_OKAY) { /* y = y*b**{n-t} */
+ goto LBL_Q;
+ }
+
+ while (pstm_cmp (&x, &y) != PSTM_LT) {
+ ++(q.dp[n - t]);
+ if ((res = pstm_sub(&x, &y, &x)) != PSTM_OKAY) {
+ goto LBL_Q;
+ }
+ }
+
+ /* reset y by shifting it back down */
+ pstm_rshd (&y, n - t);
+
+ /* step 3. for i from n down to (t + 1) */
+ for (i = n; i >= (t + 1); i--) {
+ if (i > x.used) {
+ continue;
+ }
+
+ /* step 3.1 if xi == yt then set q{i-t-1} to b-1,
+ * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
+ if (x.dp[i] == y.dp[t]) {
+ q.dp[i - t - 1] = (pstm_digit)((((pstm_word)1) << DIGIT_BIT) - 1);
+ } else {
+ pstm_word tmp;
+ tmp = ((pstm_word) x.dp[i]) << ((pstm_word) DIGIT_BIT);
+ tmp |= ((pstm_word) x.dp[i - 1]);
+ /* double-digit by single-digit divide; the helper variants write the
+ quotient back into tmp and their returned remainder is ignored */
+#if defined(USE_MATRIX_DIV64) && defined(PSTM_32BIT)
+ psDiv64(&tmp, y.dp[t]);
+#elif defined(USE_MATRIX_DIV128) && defined(PSTM_64BIT)
+ psDiv128(&tmp, y.dp[t]);
+#else
+ tmp /= ((pstm_word) y.dp[t]);
+#endif /* USE_MATRIX_DIV64 */
+ q.dp[i - t - 1] = (pstm_digit) (tmp);
+ }
+
+ /* while (q{i-t-1} * (yt * b + y{t-1})) >
+ xi * b**2 + xi-1 * b + xi-2
+
+ do q{i-t-1} -= 1;
+ */
+ /* pre-increment so the do/while below can decrement before each test */
+ q.dp[i - t - 1] = (q.dp[i - t - 1] + 1);
+ do {
+ q.dp[i - t - 1] = (q.dp[i - t - 1] - 1);
+
+ /* find left hand */
+ pstm_zero (&t1);
+ t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
+ t1.dp[1] = y.dp[t];
+ t1.used = 2;
+ if ((res = pstm_mul_d (&t1, q.dp[i - t - 1], &t1)) != PSTM_OKAY) {
+ goto LBL_Q;
+ }
+
+ /* find right hand */
+ t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
+ t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
+ t2.dp[2] = x.dp[i];
+ t2.used = 3;
+ } while (pstm_cmp_mag(&t1, &t2) == PSTM_GT);
+
+ /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
+ if ((res = pstm_mul_d(&y, q.dp[i - t - 1], &t1)) != PSTM_OKAY) {
+ goto LBL_Q;
+ }
+
+ if ((res = pstm_lshd(&t1, i - t - 1)) != PSTM_OKAY) {
+ goto LBL_Q;
+ }
+
+ if ((res = pstm_sub(&x, &t1, &x)) != PSTM_OKAY) {
+ goto LBL_Q;
+ }
+
+ /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
+ if (x.sign == PSTM_NEG) {
+ if ((res = pstm_copy(&y, &t1)) != PSTM_OKAY) {
+ goto LBL_Q;
+ }
+ if ((res = pstm_lshd (&t1, i - t - 1)) != PSTM_OKAY) {
+ goto LBL_Q;
+ }
+ if ((res = pstm_add (&x, &t1, &x)) != PSTM_OKAY) {
+ goto LBL_Q;
+ }
+ q.dp[i - t - 1] = q.dp[i - t - 1] - 1;
+ }
+ }
+/*
+ now q is the quotient and x is the remainder (which we have to normalize)
+*/
+ /* get sign before writing to c */
+ x.sign = x.used == 0 ? PSTM_ZPOS : a->sign;
+
+ if (c != NULL) {
+ pstm_clamp (&q);
+ if (pstm_copy (&q, c) != PSTM_OKAY) {
+ res = PS_MEM_FAIL;
+ goto LBL_Q;
+ }
+ c->sign = neg;
+ }
+
+ if (d != NULL) {
+ /* undo the normalization shift applied to x above */
+ if ((res = pstm_div_2d (pool, &x, norm, &x, NULL)) != PSTM_OKAY) {
+ goto LBL_Q;
+ }
+/*
+ the following is a kludge, essentially we were seeing the right
+ remainder but with excess digits that should have been zero
+ */
+ for (i = b->used; i < x.used; i++) {
+ x.dp[i] = 0;
+ }
+ pstm_clamp(&x);
+ if (pstm_copy (&x, d) != PSTM_OKAY) {
+ res = PS_MEM_FAIL;
+ goto LBL_Q;
+ }
+ }
+
+ res = PSTM_OKAY;
+
+ /* cleanup chain: each label releases one temporary and falls through to
+ the labels for everything initialized before it */
+LBL_Q:pstm_clear (&q);
+LBL_Y:pstm_clear (&y);
+LBL_X:pstm_clear (&x);
+LBL_T2:pstm_clear (&t2);
+LBL_T1:pstm_clear (&t1);
+
+ return res;
+}
+
+/******************************************************************************/
+/*
+	Exchanges the contents of two integers by value (struct copy), for
+	callers that cannot simply swap the pstm_int pointers themselves.
+*/
+void pstm_exch(pstm_int * a, pstm_int * b)
+{
+	pstm_int saved_a = *a;
+
+	*a = *b;
+	*b = saved_a;
+}
+
+/******************************************************************************/
+/*
+	c = a mod b
+
+	Takes the remainder of a / b; if its sign differs from the sign of b,
+	b is added once. Otherwise the remainder is swapped into c, and c's
+	previous contents are released together with the temporary.
+	Returns the error from pstm_init_size(), pstm_div() or pstm_add().
+*/
+int32 pstm_mod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	pstm_int rem;
+	int32 rc;
+
+	/* smart-size: the remainder never needs more digits than b has */
+	rc = pstm_init_size(pool, &rem, b->alloc);
+	if (rc != PSTM_OKAY) {
+		return rc;
+	}
+
+	rc = pstm_div(pool, a, b, NULL, &rem);
+	if (rc == PSTM_OKAY) {
+		if (rem.sign == b->sign) {
+			pstm_exch (&rem, c);
+		} else {
+			rc = pstm_add(&rem, b, c);
+		}
+	}
+	pstm_clear (&rem);
+	return rc;
+}
+
+/******************************************************************************/
+/*
+	d = a * b (mod c)
+
+	The full product is built in a temporary and then reduced with
+	pstm_mod(). Returns the first error reported by any step.
+*/
+int32 pstm_mulmod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c,
+			pstm_int *d)
+{
+	pstm_int product;
+	int16 digits;
+	int32 rc;
+
+/*
+	Smart-size the temporary. The output d is influenced by this local, so
+	when d is also the input a, do not size it below a's current allocation,
+	because that would lead to a pstm_grow in RSA operations.
+*/
+	digits = a->used + b->used + 1;
+	if (a == d && digits < a->alloc) {
+		digits = a->alloc;
+	}
+
+	rc = pstm_init_size(pool, &product, digits);
+	if (rc != PSTM_OKAY) {
+		return rc;
+	}
+
+	rc = pstm_mul_comba(pool, a, b, &product, NULL, 0);
+	if (rc == PSTM_OKAY) {
+		rc = pstm_mod(pool, &product, c, d);
+	}
+	pstm_clear(&product);
+	return rc;
+}
+
+/******************************************************************************/
+/*
+ *	Y = G**X (mod P), using Montgomery reduction and a sliding window over
+ *	the bits of the exponent.
+ *	Some restrictions... X must be positive and < P. P must be odd
+ *	(pstm_montgomery_setup() rejects an even modulus).
+ *
+ *	Returns PSTM_OKAY on success, otherwise the error code of the step
+ *	that failed. All temporaries are released on every path.
+ */
+int32 pstm_exptmod(psPool_t *pool, pstm_int *G, pstm_int *X, pstm_int *P,
+			pstm_int *Y)
+{
+	pstm_int M[32], res; /* Keep this winsize based: (1 << max_winsize) */
+	pstm_digit buf, mp;
+	pstm_digit *paD;
+	int32 err, bitbuf;
+	int16 bitcpy, bitcnt, mode, digidx, x, y, winsize;
+	uint32 paDlen;
+
+	/* set window size from what user set as optimization */
+	x = pstm_count_bits(X);
+	if (x < 50) {
+		winsize = 2;
+	} else {
+		winsize = PS_EXPTMOD_WINSIZE;
+	}
+
+	/* now setup montgomery */
+	if ((err = pstm_montgomery_setup (P, &mp)) != PSTM_OKAY) {
+		return err;
+	}
+
+	/* setup result */
+	if ((err = pstm_init_size(pool, &res, (P->used * 2) + 1)) != PSTM_OKAY) {
+		return err;
+	}
+/*
+	create M table
+	The M table contains powers of the input base, e.g. M[x] = G^x mod P
+	The first half of the table is not computed though except for M[0] and M[1]
+ */
+	/* now we need R mod m */
+	if ((err = pstm_montgomery_calc_normalization (&res, P)) != PSTM_OKAY) {
+		goto LBL_RES;
+	}
+/*
+	init M array
+	init first cell
+ */
+	if ((err = pstm_init_size(pool, &M[1], res.used)) != PSTM_OKAY) {
+		goto LBL_RES;
+	}
+
+	/* now set M[1] to G * R mod m */
+	if (pstm_cmp_mag(P, G) != PSTM_GT) {
+		/* G >= P so we reduce it first */
+		if ((err = pstm_mod(pool, G, P, &M[1])) != PSTM_OKAY) {
+			goto LBL_M;
+		}
+	} else {
+		if ((err = pstm_copy(G, &M[1])) != PSTM_OKAY) {
+			goto LBL_M;
+		}
+	}
+	if ((err = pstm_mulmod (pool, &M[1], &res, P, &M[1])) != PSTM_OKAY) {
+		goto LBL_M;
+	}
+/*
+	Pre-allocated digit. Used for mul, sqr, AND reduce
+*/
+	paDlen = ((M[1].used + 3) * 2) * sizeof(pstm_digit);
+	paD = xzalloc(paDlen);
+/*
+	compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times
+ */
+	if (pstm_init_copy(pool, &M[1 << (winsize - 1)], &M[1], 1) != PSTM_OKAY) {
+		err = PS_MEM_FAIL;
+		goto LBL_PAD;
+	}
+	for (x = 0; x < (winsize - 1); x++) {
+/*
+		M[1<<(winsize-1)] is already initialized here, and LBL_PAD does not
+		release it, so it must be cleared explicitly on failure or it leaks.
+*/
+		if ((err = pstm_sqr_comba (pool, &M[1 << (winsize - 1)],
+				&M[1 << (winsize - 1)], paD, paDlen)) != PSTM_OKAY) {
+			pstm_clear(&M[1 << (winsize - 1)]);
+			goto LBL_PAD;
+		}
+		if ((err = pstm_montgomery_reduce(pool, &M[1 << (winsize - 1)], P, mp,
+				paD, paDlen)) != PSTM_OKAY) {
+			pstm_clear(&M[1 << (winsize - 1)]);
+			goto LBL_PAD;
+		}
+	}
+/*
+	now init the second half of the array
+*/
+	for (x = (1<<(winsize-1)) + 1; x < (1 << winsize); x++) {
+		if ((err = pstm_init_size(pool, &M[x], M[1<<(winsize-1)].alloc + 1))
+				!= PSTM_OKAY) {
+			/* release only the entries initialized so far */
+			for (y = 1<<(winsize-1); y < x; y++) {
+				pstm_clear(&M[y]);
+			}
+			goto LBL_PAD;
+		}
+	}
+
+	/* create upper table */
+	for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+		if ((err = pstm_mul_comba(pool, &M[x - 1], &M[1], &M[x], paD, paDlen))
+				!= PSTM_OKAY) {
+			goto LBL_MARRAY;
+		}
+		if ((err = pstm_montgomery_reduce(pool, &M[x], P, mp, paD, paDlen)) !=
+				PSTM_OKAY) {
+			goto LBL_MARRAY;
+		}
+	}
+
+/*
+	set initial mode and bit cnt
+	mode 0: still skipping leading zero bits of the exponent
+	mode 1: between windows (a zero bit just squares)
+	mode 2: currently collecting bits into a window
+*/
+	mode = 0;
+	bitcnt = 1;
+	buf = 0;
+	digidx = X->used - 1;
+	bitcpy = 0;
+	bitbuf = 0;
+
+	for (;;) {
+		/* grab next digit as required */
+		if (--bitcnt == 0) {
+			/* if digidx == -1 we are out of digits so break */
+			if (digidx == -1) {
+				break;
+			}
+			/* read next digit and reset bitcnt */
+			buf = X->dp[digidx--];
+			bitcnt = (int32)DIGIT_BIT;
+		}
+
+		/* grab the next msb from the exponent */
+		y = (pstm_digit)(buf >> (DIGIT_BIT - 1)) & 1;
+		buf <<= (pstm_digit)1;
+/*
+		If the bit is zero and mode == 0 then we ignore it.
+		These represent the leading zero bits before the first 1 bit
+		in the exponent. Technically this opt is not required but it
+		does lower the # of trivial squaring/reductions used
+*/
+		if (mode == 0 && y == 0) {
+			continue;
+		}
+
+		/* if the bit is zero and mode == 1 then we square */
+		if (mode == 1 && y == 0) {
+			if ((err = pstm_sqr_comba(pool, &res, &res, paD, paDlen)) !=
+					PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+			if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, paDlen))
+					!= PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+			continue;
+		}
+
+		/* else we add it to the window */
+		bitbuf |= (y << (winsize - ++bitcpy));
+		mode = 2;
+
+		if (bitcpy == winsize) {
+			/* ok window is filled so square as required and mul square first */
+			for (x = 0; x < winsize; x++) {
+				if ((err = pstm_sqr_comba(pool, &res, &res, paD, paDlen)) !=
+						PSTM_OKAY) {
+					goto LBL_MARRAY;
+				}
+				if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD,
+						paDlen)) != PSTM_OKAY) {
+					goto LBL_MARRAY;
+				}
+			}
+
+			/* then multiply */
+			if ((err = pstm_mul_comba(pool, &res, &M[bitbuf], &res, paD,
+					paDlen)) != PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+			if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, paDlen))
+					!= PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+
+			/* empty window and reset */
+			bitcpy = 0;
+			bitbuf = 0;
+			mode = 1;
+		}
+	}
+
+	/* if bits remain then square/multiply */
+	if (mode == 2 && bitcpy > 0) {
+		/* square then multiply if the bit is set */
+		for (x = 0; x < bitcpy; x++) {
+			if ((err = pstm_sqr_comba(pool, &res, &res, paD, paDlen)) !=
+					PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+			if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, paDlen))
+					!= PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+
+			/* get next bit of the window */
+			bitbuf <<= 1;
+			if ((bitbuf & (1 << winsize)) != 0) {
+				/* then multiply */
+				if ((err = pstm_mul_comba(pool, &res, &M[1], &res, paD, paDlen))
+						!= PSTM_OKAY) {
+					goto LBL_MARRAY;
+				}
+				if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD,
+						paDlen)) != PSTM_OKAY) {
+					goto LBL_MARRAY;
+				}
+			}
+		}
+	}
+/*
+	Fix up result if Montgomery reduction is used recall that any value in a
+	Montgomery system is actually multiplied by R mod n. So we have to reduce
+	one more time to cancel out the factor of R.
+*/
+	if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, paDlen)) !=
+			PSTM_OKAY) {
+		goto LBL_MARRAY;
+	}
+	/* copy res into Y */
+	if ((err = pstm_copy (&res, Y)) != PSTM_OKAY) {
+		goto LBL_MARRAY;
+	}
+	err = PSTM_OKAY;
+	/* cleanup chain: each label falls through to the ones below it */
+LBL_MARRAY:
+	for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+		pstm_clear(&M[x]);
+	}
+LBL_PAD:psFree(paD, pool);
+LBL_M: pstm_clear(&M[1]);
+LBL_RES:pstm_clear(&res);
+	return err;
+}
+
+/******************************************************************************/
+/*
+	Signed addition: c = a + b.
+
+	Magnitudes are combined with s_pstm_add/s_pstm_sub; the sign of the
+	result is chosen here and stored in c->sign before the helper runs.
+	Returns PS_SUCCESS, or the error code from the magnitude helper.
+*/
+int32 pstm_add(pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	int16 sign_a = a->sign;
+	int16 sign_b = b->sign;
+	int32 rc;
+
+	if (sign_a == sign_b) {
+		/* same sign: add the magnitudes and keep that sign */
+		c->sign = sign_a;
+		rc = s_pstm_add (a, b, c);
+	} else if (pstm_cmp_mag (a, b) == PSTM_LT) {
+		/* opposite signs, |b| larger: |b| - |a| with the sign of b */
+		c->sign = sign_b;
+		rc = s_pstm_sub (b, a, c);
+	} else {
+		/* opposite signs, |a| larger or equal: |a| - |b| with the sign of a */
+		c->sign = sign_a;
+		rc = s_pstm_sub (a, b, c);
+	}
+
+	if (rc != PSTM_OKAY) {
+		return rc;
+	}
+	return PS_SUCCESS;
+}
+
+/******************************************************************************/
+/*
+	Reverses the first len bytes of s in place; used by the radix
+	(byte-string conversion) code.
+*/
+static void pstm_reverse (unsigned char *s, int16 len)
+{
+	int32 lo, hi;
+	unsigned char tmp;
+
+	/* swap the outermost pair and walk both ends toward the middle */
+	for (lo = 0, hi = len - 1; lo < hi; ++lo, --hi) {
+		tmp = s[lo];
+		s[lo] = s[hi];
+		s[hi] = tmp;
+	}
+}
+/******************************************************************************/
+/*
+	Writes the magnitude of a into b as bytes, least significant byte
+	first (No Reverse). Useful in some of the EIP-154 PKA stuff where
+	special byte order seems to come into play more often.
+
+	No length is passed in, so b must already be large enough for every
+	byte of a. Returns PS_SUCCESS, or the error of the failing helper.
+*/
+int32 pstm_to_unsigned_bin_nr(psPool_t *pool, pstm_int *a, unsigned char *b)
+{
+	pstm_int work = { 0 };
+	int32 rc;
+	int16 pos;
+
+	/* operate on a scratch copy so a itself is left untouched */
+	rc = pstm_init_copy(pool, &work, a, 0);
+	if (rc != PSTM_OKAY) {
+		return rc;
+	}
+
+	/* peel off the low byte, then shift the copy right by 8 bits */
+	for (pos = 0; pstm_iszero (&work) == 0; pos++) {
+		b[pos] = (unsigned char) (work.dp[0] & 255);
+		rc = pstm_div_2d (pool, &work, 8, &work, NULL);
+		if (rc != PSTM_OKAY) {
+			pstm_clear(&work);
+			return rc;
+		}
+	}
+	pstm_clear(&work);
+	return PS_SUCCESS;
+}
+/******************************************************************************/
+/*
+	Writes the magnitude of a into b as bytes, most significant byte first.
+
+	Bytes are produced low-to-high and then reversed in place. No length
+	is passed in, so b must already be large enough for every byte of a.
+	Returns PS_SUCCESS, or the error of the failing helper.
+*/
+int32 pstm_to_unsigned_bin(psPool_t *pool, pstm_int *a, unsigned char *b)
+{
+	pstm_int work = { 0 };
+	int32 rc;
+	int16 count;
+
+	/* operate on a scratch copy so a itself is left untouched */
+	rc = pstm_init_copy(pool, &work, a, 0);
+	if (rc != PSTM_OKAY) {
+		return rc;
+	}
+
+	/* peel off the low byte, then shift the copy right by 8 bits */
+	for (count = 0; pstm_iszero (&work) == 0; count++) {
+		b[count] = (unsigned char) (work.dp[0] & 255);
+		rc = pstm_div_2d (pool, &work, 8, &work, NULL);
+		if (rc != PSTM_OKAY) {
+			pstm_clear(&work);
+			return rc;
+		}
+	}
+	/* flip the little-endian byte run into big-endian order */
+	pstm_reverse (b, count);
+	pstm_clear(&work);
+	return PS_SUCCESS;
+}
+
+/******************************************************************************/
+/*
+	Compares a against the single digit b.
+	Returns PSTM_LT, PSTM_EQ or PSTM_GT.
+*/
+int32 pstm_cmp_d(pstm_int *a, pstm_digit b)
+{
+	/* a negative value, or an empty a against a non-zero digit, is smaller */
+	if (a->sign == PSTM_NEG || (b && a->used == 0)) {
+		return PSTM_LT;
+	}
+
+	/* more than one digit in use cannot fit in a single digit */
+	if (a->used > 1) {
+		return PSTM_GT;
+	}
+
+	/* what remains is a plain comparison of the lowest digit */
+	if (a->dp[0] > b) {
+		return PSTM_GT;
+	}
+	if (a->dp[0] < b) {
+		return PSTM_LT;
+	}
+	return PSTM_EQ;
+}
+
+/*
+	c = 1/a (mod b), general case (binary extended Euclid; works for an
+	even modulus as long as a mod b is odd).
+
+	Need invmod for ECC and also private key loading for hardware crypto
+	in cases where dQ > dP. The values must be switched and a new qP must be
+	calculated using this function.
+
+	Returns PS_LIMIT_FAIL if b is negative or zero, PS_FAILURE if no inverse
+	exists (both values even, or gcd != 1), PS_MEM_FAIL or a helper's error
+	code if an intermediate step fails, PSTM_OKAY on success.
+*/
+static int32 pstm_invmod_slow(psPool_t *pool, pstm_int * a, pstm_int * b,
+				pstm_int * c)
+{
+	pstm_int x, y, u, v, A, B, C, D;
+	int32 res;
+
+	/* b cannot be negative or zero */
+	if (b->sign == PSTM_NEG || pstm_iszero(b) == 1) {
+		return PS_LIMIT_FAIL;
+	}
+
+	/* init temps */
+	if (pstm_init_size(pool, &x, b->used) != PSTM_OKAY) {
+		return PS_MEM_FAIL;
+	}
+
+	/* x = a mod b, y = b */
+	if ((res = pstm_mod(pool, a, b, &x)) != PSTM_OKAY) {
+		goto LBL_X;
+	}
+
+/*
+	The status must be captured here: res still holds PSTM_OKAY from the
+	pstm_mod() call above, so jumping to cleanup without assigning it would
+	report success to the caller although nothing was computed.
+*/
+	if ((res = pstm_init_copy(pool, &y, b, 0)) != PSTM_OKAY) {
+		goto LBL_X;
+	}
+
+	/* 2. [modified] if x,y are both even then return an error! */
+	if (pstm_iseven (&x) == 1 && pstm_iseven (&y) == 1) {
+		res = PS_FAILURE;
+		goto LBL_Y;
+	}
+
+	/* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+	if ((res = pstm_init_copy(pool, &u, &x, 0)) != PSTM_OKAY) {
+		goto LBL_Y;
+	}
+	if ((res = pstm_init_copy(pool, &v, &y, 0)) != PSTM_OKAY) {
+		goto LBL_U;
+	}
+
+	if ((res = pstm_init_size(pool, &A, sizeof(pstm_digit))) != PSTM_OKAY) {
+		goto LBL_V;
+	}
+
+	if ((res = pstm_init_size(pool, &D, sizeof(pstm_digit))) != PSTM_OKAY) {
+		goto LBL_A;
+	}
+	pstm_set (&A, 1);
+	pstm_set (&D, 1);
+
+	if ((res = pstm_init(pool, &B)) != PSTM_OKAY) {
+		goto LBL_D;
+	}
+	if ((res = pstm_init(pool, &C)) != PSTM_OKAY) {
+		goto LBL_B;
+	}
+
+top:
+	/* 4. while u is even do */
+	while (pstm_iseven (&u) == 1) {
+		/* 4.1 u = u/2 */
+		if ((res = pstm_div_2 (&u, &u)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+
+		/* 4.2 if A or B is odd then */
+		if (pstm_isodd (&A) == 1 || pstm_isodd (&B) == 1) {
+			/* A = (A+y)/2, B = (B-x)/2 */
+			if ((res = pstm_add (&A, &y, &A)) != PSTM_OKAY) {
+				goto LBL_C;
+			}
+			if ((res = pstm_sub (&B, &x, &B)) != PSTM_OKAY) {
+				goto LBL_C;
+			}
+		}
+		/* A = A/2, B = B/2 */
+		if ((res = pstm_div_2 (&A, &A)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+		if ((res = pstm_div_2 (&B, &B)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+	}
+
+	/* 5. while v is even do */
+	while (pstm_iseven (&v) == 1) {
+		/* 5.1 v = v/2 */
+		if ((res = pstm_div_2 (&v, &v)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+
+		/* 5.2 if C or D is odd then */
+		if (pstm_isodd (&C) == 1 || pstm_isodd (&D) == 1) {
+			/* C = (C+y)/2, D = (D-x)/2 */
+			if ((res = pstm_add (&C, &y, &C)) != PSTM_OKAY) {
+				goto LBL_C;
+			}
+			if ((res = pstm_sub (&D, &x, &D)) != PSTM_OKAY) {
+				goto LBL_C;
+			}
+		}
+		/* C = C/2, D = D/2 */
+		if ((res = pstm_div_2 (&C, &C)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+		if ((res = pstm_div_2 (&D, &D)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+	}
+
+	/* 6. if u >= v then */
+	if (pstm_cmp (&u, &v) != PSTM_LT) {
+		/* u = u - v, A = A - C, B = B - D */
+		if ((res = pstm_sub (&u, &v, &u)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+		if ((res = pstm_sub (&A, &C, &A)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+		if ((res = pstm_sub (&B, &D, &B)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+	} else {
+		/* v = v - u, C = C - A, D = D - B */
+		if ((res = pstm_sub (&v, &u, &v)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+		if ((res = pstm_sub (&C, &A, &C)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+		if ((res = pstm_sub (&D, &B, &D)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+	}
+
+	/* if u is not zero go back to step 4 */
+	if (pstm_iszero (&u) == 0)
+		goto top;
+
+	/* the loop is done: v holds the gcd and C the candidate inverse */
+
+	/* if v != 1 then there is no inverse */
+	if (pstm_cmp_d (&v, 1) != PSTM_EQ) {
+		res = PS_FAILURE;
+		goto LBL_C;
+	}
+
+	/* if its too low, bring C up into range */
+	while (pstm_cmp_d(&C, 0) == PSTM_LT) {
+		if ((res = pstm_add(&C, b, &C)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+	}
+
+	/* too big, bring C down below b */
+	while (pstm_cmp_mag(&C, b) != PSTM_LT) {
+		if ((res = pstm_sub(&C, b, &C)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+	}
+
+	/* C is now the inverse */
+	if ((res = pstm_copy(&C, c)) != PSTM_OKAY) {
+		goto LBL_C;
+	}
+	res = PSTM_OKAY;
+
+	/* cleanup chain: each label releases one temporary and falls through */
+LBL_C: pstm_clear(&C);
+LBL_D: pstm_clear(&D);
+LBL_B: pstm_clear(&B);
+LBL_A: pstm_clear(&A);
+LBL_V: pstm_clear(&v);
+LBL_U: pstm_clear(&u);
+LBL_Y: pstm_clear(&y);
+LBL_X: pstm_clear(&x);
+
+	return res;
+}
+
+/*
+ c = 1/a (mod b)
+
+ Fast path for odd b only; an even b is handed to pstm_invmod_slow().
+ Only the B/D half of the extended-Euclid coefficients is tracked here.
+
+ Returns PS_LIMIT_FAIL if the main loop exceeds its iteration cap,
+ PS_FAILURE if no inverse exists (gcd != 1), a helper's error code if an
+ intermediate step fails, PSTM_OKAY on success.
+*/
+int32 pstm_invmod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c)
+{
+ pstm_int x, y, u, v, B, D;
+ int32 res;
+ uint16 neg, sanity;
+
+ /* 2. [modified] b must be odd */
+ if (pstm_iseven (b) == 1) {
+ return pstm_invmod_slow(pool, a,b,c);
+ }
+
+ /* x == modulus, y == value to invert */
+ if ((res = pstm_init_copy(pool, &x, b, 0)) != PSTM_OKAY) {
+ return res;
+ }
+
+ if ((res = pstm_init_size(pool, &y, a->alloc)) != PSTM_OKAY) {
+ goto LBL_X;
+ }
+
+ /* we need y = |a| */
+ /* NOTE(review): the result of pstm_abs() is not checked -- confirm it
+ cannot fail for a destination sized from a->alloc */
+ pstm_abs(a, &y);
+
+ /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+ if ((res = pstm_init_copy(pool, &u, &x, 0)) != PSTM_OKAY) {
+ goto LBL_Y;
+ }
+ if ((res = pstm_init_copy(pool, &v, &y, 0)) != PSTM_OKAY) {
+ goto LBL_U;
+ }
+ if ((res = pstm_init(pool, &B)) != PSTM_OKAY) {
+ goto LBL_V;
+ }
+ if ((res = pstm_init(pool, &D)) != PSTM_OKAY) {
+ goto LBL_B;
+ }
+
+ pstm_set (&D, 1);
+
+ /* counts passes through the main loop, see the cap below */
+ sanity = 0;
+top:
+ /* 4. while u is even do */
+ while (pstm_iseven (&u) == 1) {
+ /* 4.1 u = u/2 */
+ if ((res = pstm_div_2 (&u, &u)) != PSTM_OKAY) {
+ goto LBL_D;
+ }
+
+ /* 4.2 if B is odd then */
+ if (pstm_isodd (&B) == 1) {
+ /* B = B - x, which makes it even so it can be halved below */
+ if ((res = pstm_sub (&B, &x, &B)) != PSTM_OKAY) {
+ goto LBL_D;
+ }
+ }
+ /* B = B/2 */
+ if ((res = pstm_div_2 (&B, &B)) != PSTM_OKAY) {
+ goto LBL_D;
+ }
+ }
+
+ /* 5. while v is even do */
+ while (pstm_iseven (&v) == 1) {
+ /* 5.1 v = v/2 */
+ if ((res = pstm_div_2 (&v, &v)) != PSTM_OKAY) {
+ goto LBL_D;
+ }
+ /* 5.2 if D is odd then */
+ if (pstm_isodd (&D) == 1) {
+ /* D = (D-x)/2 */
+ if ((res = pstm_sub (&D, &x, &D)) != PSTM_OKAY) {
+ goto LBL_D;
+ }
+ }
+ /* D = D/2 */
+ if ((res = pstm_div_2 (&D, &D)) != PSTM_OKAY) {
+ goto LBL_D;
+ }
+ }
+
+ /* 6. if u >= v then */
+ if (pstm_cmp (&u, &v) != PSTM_LT) {
+ /* u = u - v, B = B - D */
+ if ((res = pstm_sub (&u, &v, &u)) != PSTM_OKAY) {
+ goto LBL_D;
+ }
+ if ((res = pstm_sub (&B, &D, &B)) != PSTM_OKAY) {
+ goto LBL_D;
+ }
+ } else {
+ /* v = v - u, D = D - B */
+ if ((res = pstm_sub (&v, &u, &v)) != PSTM_OKAY) {
+ goto LBL_D;
+ }
+ if ((res = pstm_sub (&D, &B, &D)) != PSTM_OKAY) {
+ goto LBL_D;
+ }
+ }
+
+ /* give up with PS_LIMIT_FAIL once the loop has run more than ~1000 times */
+ /* NOTE(review): confirm this cap is large enough for the biggest moduli
+ callers pass in */
+ if (sanity++ > 1000) {
+ res = PS_LIMIT_FAIL;
+ goto LBL_D;
+ }
+ /* if u is not zero go back to step 4 */
+ if (pstm_iszero (&u) == 0) {
+ goto top;
+ }
+
+ /* the loop is done: v holds the gcd and D the candidate inverse */
+
+ /* if v != 1 then there is no inverse */
+ if (pstm_cmp_d (&v, 1) != PSTM_EQ) {
+ res = PS_FAILURE;
+ goto LBL_D;
+ }
+
+ /* D is now the inverse; add the modulus until it is non-negative */
+ neg = a->sign;
+ while (D.sign == PSTM_NEG) {
+ if ((res = pstm_add (&D, b, &D)) != PSTM_OKAY) {
+ goto LBL_D;
+ }
+ }
+ if ((res = pstm_copy (&D, c)) != PSTM_OKAY) {
+ goto LBL_D;
+ }
+ /* the result is given the sign of the input a */
+ c->sign = neg;
+ res = PSTM_OKAY;
+
+ /* cleanup chain: each label releases one temporary and falls through */
+LBL_D: pstm_clear(&D);
+LBL_B: pstm_clear(&B);
+LBL_V: pstm_clear(&v);
+LBL_U: pstm_clear(&u);
+LBL_Y: pstm_clear(&y);
+LBL_X: pstm_clear(&x);
+ return res;
+}
+#endif /* !DISABLE_PSTM */
+/******************************************************************************/
diff --git a/networking/tls_pstm.h b/networking/tls_pstm.h
new file mode 100644
index 000000000..1affc1b69
--- /dev/null
+++ b/networking/tls_pstm.h
@@ -0,0 +1,238 @@
+/**
+ * @file pstm.h
+ * @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
+ *
+ * multiple-precision integer library.
+ */
+/*
+ * Copyright (c) 2013-2015 INSIDE Secure Corporation
+ * Copyright (c) PeerSec Networks, 2002-2011
+ * All Rights Reserved
+ *
+ * The latest version of this code is available at http://www.matrixssl.org
+ *
+ * This software is open source; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This General Public License does NOT permit incorporating this software
+ * into proprietary programs. If you are unable to comply with the GPL, a
+ * commercial license for this software may be purchased from INSIDE at
+ * http://www.insidesecure.com/eng/Company/Locations
+ *
+ * This program is distributed in WITHOUT ANY WARRANTY; without even the
+ * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+/******************************************************************************/
+
+#ifndef _h_PSTMATH
+#define _h_PSTMATH
+#ifndef DISABLE_PSTM
+
+/* Define this here to avoid including circular limits.h on some platforms */
+#ifndef CHAR_BIT
+#define CHAR_BIT 8
+#endif
+
+/******************************************************************************/
+/*
+ If native 64 bit integers are not supported, we do not support 32x32->64
+ in hardware, so we must set the 16 bit flag to produce 16x16->32 products.
+*/
+#ifndef HAVE_NATIVE_INT64
+ #define PSTM_16BIT
+#endif /* ! HAVE_NATIVE_INT64 */
+
+/******************************************************************************/
+/*
+ Some default configurations.
+
+ pstm_word should be the largest value the processor can hold as the product
+ of a multiplication. Most platforms support a 32x32->64 MAC instruction,
+ so 64bits is the default pstm_word size.
+ pstm_digit should be half the size of pstm_word
+ */
+#ifdef PSTM_8BIT
+/* 8-bit digits, 16-bit word products */
+ typedef unsigned char pstm_digit;
+ typedef unsigned short pstm_word;
+ #define DIGIT_BIT 8
+
+#elif defined(PSTM_16BIT)
+/* 16-bit digits, 32-bit word products */
+ typedef unsigned short pstm_digit;
+ typedef unsigned long pstm_word;
+ #define DIGIT_BIT 16
+
+#elif defined(PSTM_64BIT)
+/* 64-bit digits, 128-bit word products */
+ #ifndef __GNUC__
+ #error "64bit digits requires GCC"
+ #endif
+ typedef unsigned long pstm_digit;
+ typedef unsigned long pstm_word __attribute__ ((mode(TI)));
+ #define DIGIT_BIT 64
+
+#else
+/* This is the default case, 32-bit digits, 64-bit word products */
+ typedef uint32 pstm_digit;
+ typedef uint64 pstm_word;
+ #define DIGIT_BIT 32
+ #define PSTM_32BIT
+#endif /* digit and word size */
+
+#define PSTM_MASK (pstm_digit)(-1)
+#define PSTM_DIGIT_MAX PSTM_MASK
+
+/******************************************************************************/
+/*
+ equalities
+ */
+#define PSTM_LT -1 /* less than */
+#define PSTM_EQ 0 /* equal to */
+#define PSTM_GT 1 /* greater than */
+
+#define PSTM_ZPOS 0 /* positive integer */
+#define PSTM_NEG 1 /* negative */
+
+#define PSTM_OKAY PS_SUCCESS
+#define PSTM_MEM PS_MEM_FAIL
+
+/******************************************************************************/
+/*
+ Various build options
+ */
+#define PSTM_DEFAULT_INIT 64 /* default (64) digits of allocation */
+#define PSTM_MAX_SIZE 4096
+
+/*
+ * Multi-precision integer.
+ *
+ * used  - number of digits of dp[] currently holding the value;
+ *         used == 0 is how pstm_iszero() recognizes zero
+ * alloc - presumably the number of digits dp[] has room for (callers
+ *         compare it against the size they need before pstm_grow()) -
+ *         TODO confirm against pstm_grow()
+ * sign  - PSTM_ZPOS or PSTM_NEG
+ * dp    - the digit array; dp[0] is the least significant digit
+ *         (pstm_iseven()/pstm_isodd() test its low bit)
+ * pool  - psPool_t handle carried with the number; its semantics are
+ *         not visible in this header
+ */
+typedef struct {
+ int16 used, alloc, sign;
+ pstm_digit *dp;
+ psPool_t *pool;
+} pstm_int;
+
+/******************************************************************************/
+/*
+ Operations on large integers
+ */
+/*
+ * Predicates: each yields PS_TRUE or PS_FALSE.
+ * Zero (used == 0) is reported as neither even nor odd, because both
+ * pstm_iseven() and pstm_isodd() require used > 0.
+ * pstm_iseven()/pstm_isodd() evaluate their argument twice: do not pass
+ * an expression with side effects.
+ */
+#define pstm_iszero(a) (((a)->used == 0) ? PS_TRUE : PS_FALSE)
+#define pstm_iseven(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 0)) ? PS_TRUE : PS_FALSE)
+#define pstm_isodd(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? PS_TRUE : PS_FALSE)
+/*
+ * b = |a|: copies a into b, then clears b's sign.
+ * NOTE(review): this expands to a bare { ... } block, not do { } while (0),
+ * so "if (x) pstm_abs(a, b); else ..." will not compile; the return value
+ * of pstm_copy() is also discarded.
+ */
+#define pstm_abs(a, b) { pstm_copy(a, b); (b)->sign = 0; }
+
+extern void pstm_set(pstm_int *a, pstm_digit b);
+
+extern void pstm_zero(pstm_int * a);
+
+extern int32 pstm_init(psPool_t *pool, pstm_int * a);
+
+extern int32 pstm_init_size(psPool_t *pool, pstm_int * a, uint32 size);
+
+extern int32 pstm_init_copy(psPool_t *pool, pstm_int * a, pstm_int * b,
+ int16 toSqr);
+
+extern int16 pstm_count_bits (pstm_int * a);
+
+extern int32 pstm_init_for_read_unsigned_bin(psPool_t *pool, pstm_int *a,
+ uint32 len);
+
+extern int32 pstm_read_unsigned_bin(pstm_int *a, unsigned char *b, int32 c);
+
+extern int32 pstm_unsigned_bin_size(pstm_int *a);
+
+extern int32 pstm_copy(pstm_int * a, pstm_int * b);
+
+extern void pstm_exch(pstm_int * a, pstm_int * b);
+
+extern void pstm_clear(pstm_int * a);
+
+extern void pstm_clear_multi(pstm_int *mp0, pstm_int *mp1, pstm_int *mp2,
+ pstm_int *mp3, pstm_int *mp4, pstm_int *mp5, pstm_int *mp6,
+ pstm_int *mp7);
+
+extern int32 pstm_grow(pstm_int * a, int16 size);
+
+extern void pstm_clamp(pstm_int * a);
+
+extern int32 pstm_cmp(pstm_int * a, pstm_int * b);
+
+extern int32 pstm_cmp_mag(pstm_int * a, pstm_int * b);
+
+extern void pstm_rshd(pstm_int *a, int16 x);
+
+extern int32 pstm_lshd(pstm_int * a, int16 b);
+
+extern int32 pstm_div(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c,
+ pstm_int *d);
+
+extern int32 pstm_div_2d(psPool_t *pool, pstm_int *a, int16 b, pstm_int *c,
+ pstm_int *d);
+
+extern int32 pstm_div_2(pstm_int * a, pstm_int * b);
+
+extern int32 s_pstm_sub(pstm_int *a, pstm_int *b, pstm_int *c);
+
+extern int32 pstm_sub(pstm_int *a, pstm_int *b, pstm_int *c);
+
+extern int32 pstm_sub_d(psPool_t *pool, pstm_int *a, pstm_digit b, pstm_int *c);
+
+extern int32 pstm_mul_2(pstm_int * a, pstm_int * b);
+
+extern int32 pstm_mod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c);
+
+extern int32 pstm_mulmod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c,
+ pstm_int *d);
+
+extern int32 pstm_exptmod(psPool_t *pool, pstm_int *G, pstm_int *X, pstm_int *P,
+ pstm_int *Y);
+
+extern int32 pstm_2expt(pstm_int *a, int16 b);
+
+extern int32 pstm_add(pstm_int *a, pstm_int *b, pstm_int *c);
+
+extern int32 pstm_to_unsigned_bin(psPool_t *pool, pstm_int *a,
+ unsigned char *b);
+
+extern int32 pstm_to_unsigned_bin_nr(psPool_t *pool, pstm_int *a,
+ unsigned char *b);
+
+extern int32 pstm_montgomery_setup(pstm_int *a, pstm_digit *rho);
+
+///bbox: pool unused
+/*
+ * Each function-like macro below has the same name and argument count as
+ * the prototype that follows it, so the preprocessor rewrites the
+ * prototype (and the definition, and every call) to drop the leading
+ * "pool" argument. The real functions therefore take one parameter
+ * fewer than the declarations appear to show.
+ */
+#define pstm_montgomery_reduce(pool, a, m, mp, paD, paDlen) \
+ pstm_montgomery_reduce( a, m, mp, paD, paDlen)
+extern int32 pstm_montgomery_reduce(psPool_t *pool, pstm_int *a, pstm_int *m,
+ pstm_digit mp, pstm_digit *paD, uint32 paDlen);
+
+/* bbox: pool unused here as well (same macro trick as above) */
+#define pstm_mul_comba(pool, A, B, C, paD, paDlen) \
+ pstm_mul_comba( A, B, C, paD, paDlen)
+extern int32 pstm_mul_comba(psPool_t *pool, pstm_int *A, pstm_int *B,
+ pstm_int *C, pstm_digit *paD, uint32 paDlen);
+
+///bbox: pool unused
+#define pstm_sqr_comba(pool, A, B, paD, paDlen) \
+ pstm_sqr_comba( A, B, paD, paDlen)
+extern int32 pstm_sqr_comba(psPool_t *pool, pstm_int *A, pstm_int *B,
+ pstm_digit *paD, uint32 paDlen);
+
+extern int32 pstm_cmp_d(pstm_int *a, pstm_digit b);
+
+extern int32 pstm_montgomery_calc_normalization(pstm_int *a, pstm_int *b);
+
+extern int32 pstm_mul_d(pstm_int *a, pstm_digit b, pstm_int *c);
+
+extern int32 pstm_invmod(psPool_t *pool, pstm_int * a, pstm_int * b,
+ pstm_int * c);
+
+#else /* DISABLE_PSTM */
+ typedef int32 pstm_int;
+#endif /* !DISABLE_PSTM */
+#endif /* _h_PSTMATH */
+
diff --git a/networking/tls_pstm_montgomery_reduce.c b/networking/tls_pstm_montgomery_reduce.c
new file mode 100644
index 000000000..c231c4ddf
--- /dev/null
+++ b/networking/tls_pstm_montgomery_reduce.c
@@ -0,0 +1,423 @@
+/*
+ * Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+#include "tls.h"
+
+/**
+ * @file pstm_montgomery_reduce.c
+ * @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
+ *
+ * Multiprecision Montgomery Reduction.
+ */
+/*
+ * Copyright (c) 2013-2015 INSIDE Secure Corporation
+ * Copyright (c) PeerSec Networks, 2002-2011
+ * All Rights Reserved
+ *
+ * The latest version of this code is available at http://www.matrixssl.org
+ *
+ * This software is open source; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This General Public License does NOT permit incorporating this software
+ * into proprietary programs. If you are unable to comply with the GPL, a
+ * commercial license for this software may be purchased from INSIDE at
+ * http://www.insidesecure.com/eng/Company/Locations
+ *
+ * This program is distributed in WITHOUT ANY WARRANTY; without even the
+ * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+/******************************************************************************/
+
+///bbox
+//#include "../cryptoApi.h"
+#ifndef DISABLE_PSTM
+
+/******************************************************************************/
+
+#if defined(PSTM_X86)
+/* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
+#if !defined(__GNUC__) || !defined(__i386__) || !defined(PSTM_32BIT)
+#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
+#endif
+//#pragma message ("Using 32 bit x86 Assembly Optimizations")
+
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+ mu = c[x] * mp
+
+#define INNERMUL \
+asm( \
+ "movl %5,%%eax \n\t" \
+ "mull %4 \n\t" \
+ "addl %1,%%eax \n\t" \
+ "adcl $0,%%edx \n\t" \
+ "addl %%eax,%0 \n\t" \
+ "adcl $0,%%edx \n\t" \
+ "movl %%edx,%1 \n\t" \
+:"=g"(_c[LO]), "=r"(cy) \
+:"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \
+: "%eax", "%edx", "%cc")
+
+#define PROPCARRY \
+asm( \
+ "addl %1,%0 \n\t" \
+ "setb %%al \n\t" \
+ "movzbl %%al,%1 \n\t" \
+:"=g"(_c[LO]), "=r"(cy) \
+:"0"(_c[LO]), "1"(cy) \
+: "%eax", "%cc")
+
+/******************************************************************************/
+#elif defined(PSTM_X86_64)
+/* x86-64 optimized */
+#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
+#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
+#endif
+//#pragma message ("Using 64 bit x86_64 Assembly Optimizations")
+
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+mu = c[x] * mp
+
+#define INNERMUL \
+asm( \
+ "movq %5,%%rax \n\t" \
+ "mulq %4 \n\t" \
+ "addq %1,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "addq %%rax,%0 \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ :"=g"(_c[LO]), "=r"(cy) \
+ :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
+ : "%rax", "%rdx", "cc")
+
+#define INNERMUL8 \
+asm( \
+ "movq 0(%5),%%rax \n\t" \
+ "movq 0(%2),%%r10 \n\t" \
+ "movq 0x8(%5),%%r11 \n\t" \
+ "mulq %4 \n\t" \
+ "addq %%r10,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq 0x8(%2),%%r10 \n\t" \
+ "addq %3,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rax,0(%0) \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ \
+ "movq %%r11,%%rax \n\t" \
+ "movq 0x10(%5),%%r11 \n\t" \
+ "mulq %4 \n\t" \
+ "addq %%r10,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq 0x10(%2),%%r10 \n\t" \
+ "addq %3,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rax,0x8(%0) \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ \
+ "movq %%r11,%%rax \n\t" \
+ "movq 0x18(%5),%%r11 \n\t" \
+ "mulq %4 \n\t" \
+ "addq %%r10,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq 0x18(%2),%%r10 \n\t" \
+ "addq %3,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rax,0x10(%0) \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ \
+ "movq %%r11,%%rax \n\t" \
+ "movq 0x20(%5),%%r11 \n\t" \
+ "mulq %4 \n\t" \
+ "addq %%r10,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq 0x20(%2),%%r10 \n\t" \
+ "addq %3,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rax,0x18(%0) \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ \
+ "movq %%r11,%%rax \n\t" \
+ "movq 0x28(%5),%%r11 \n\t" \
+ "mulq %4 \n\t" \
+ "addq %%r10,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq 0x28(%2),%%r10 \n\t" \
+ "addq %3,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rax,0x20(%0) \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ \
+ "movq %%r11,%%rax \n\t" \
+ "movq 0x30(%5),%%r11 \n\t" \
+ "mulq %4 \n\t" \
+ "addq %%r10,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq 0x30(%2),%%r10 \n\t" \
+ "addq %3,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rax,0x28(%0) \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ \
+ "movq %%r11,%%rax \n\t" \
+ "movq 0x38(%5),%%r11 \n\t" \
+ "mulq %4 \n\t" \
+ "addq %%r10,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq 0x38(%2),%%r10 \n\t" \
+ "addq %3,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rax,0x30(%0) \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ \
+ "movq %%r11,%%rax \n\t" \
+ "mulq %4 \n\t" \
+ "addq %%r10,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "addq %3,%%rax \n\t" \
+ "adcq $0,%%rdx \n\t" \
+ "movq %%rax,0x38(%0) \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ \
+ :"=r"(_c), "=r"(cy) \
+ : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
+ : "%rax", "%rdx", "%r10", "%r11", "cc")
+
+#define PROPCARRY \
+asm( \
+ "addq %1,%0 \n\t" \
+ "setb %%al \n\t" \
+ "movzbq %%al,%1 \n\t" \
+ :"=g"(_c[LO]), "=r"(cy) \
+ :"0"(_c[LO]), "1"(cy) \
+ : "%rax", "cc")
+
+/******************************************************************************/
+#elif defined(PSTM_ARM)
+
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+mu = c[x] * mp
+
+#ifdef __thumb2__
+//#pragma message ("Using 32 bit ARM Thumb2 Assembly Optimizations")
+#define INNERMUL \
+asm( \
+ " LDR r0,%1 \n\t" \
+ " ADDS r0,r0,%0 \n\t" \
+ " ITE CS \n\t" \
+ " MOVCS %0,#1 \n\t" \
+ " MOVCC %0,#0 \n\t" \
+ " UMLAL r0,%0,%3,%4 \n\t" \
+ " STR r0,%1 \n\t" \
+ :"=r"(cy),"=m"(_c[0])\
+ :"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
+ :"r0","%cc");
+#define PROPCARRY \
+asm( \
+ " LDR r0,%1 \n\t" \
+ " ADDS r0,r0,%0 \n\t" \
+ " STR r0,%1 \n\t" \
+ " ITE CS \n\t" \
+ " MOVCS %0,#1 \n\t" \
+ " MOVCC %0,#0 \n\t" \
+ :"=r"(cy),"=m"(_c[0])\
+ :"0"(cy),"m"(_c[0])\
+ :"r0","%cc");
+#else /* Non-Thumb2 code */
+//#pragma message ("Using 32 bit ARM Assembly Optimizations")
+#define INNERMUL \
+asm( \
+ " LDR r0,%1 \n\t" \
+ " ADDS r0,r0,%0 \n\t" \
+ " MOVCS %0,#1 \n\t" \
+ " MOVCC %0,#0 \n\t" \
+ " UMLAL r0,%0,%3,%4 \n\t" \
+ " STR r0,%1 \n\t" \
+ :"=r"(cy),"=m"(_c[0])\
+ :"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0])\
+ :"r0","%cc");
+#define PROPCARRY \
+asm( \
+ " LDR r0,%1 \n\t" \
+ " ADDS r0,r0,%0 \n\t" \
+ " STR r0,%1 \n\t" \
+ " MOVCS %0,#1 \n\t" \
+ " MOVCC %0,#0 \n\t" \
+ :"=r"(cy),"=m"(_c[0])\
+ :"0"(cy),"m"(_c[0])\
+ :"r0","%cc");
+#endif /* __thumb2__ */
+
+
+/******************************************************************************/
+#elif defined(PSTM_MIPS)
+/* MIPS32 */
+//#pragma message ("Using 32 bit MIPS Assembly Optimizations")
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+#define LOOP_START \
+mu = c[x] * mp
+
+#define INNERMUL \
+asm( \
+ " multu %3,%4 \n\t" \
+ " mflo $12 \n\t" \
+ " mfhi $13 \n\t" \
+ " addu $12,$12,%0 \n\t" \
+ " sltu $10,$12,%0 \n\t" \
+ " addu $13,$13,$10 \n\t" \
+ " lw $10,%1 \n\t" \
+ " addu $12,$12,$10 \n\t" \
+ " sltu $10,$12,$10 \n\t" \
+ " addu %0,$13,$10 \n\t" \
+ " sw $12,%1 \n\t" \
+ :"=r"(cy),"=m"(_c[0])\
+ :"r"(cy),"r"(mu),"r"(tmpm[0]),"r"(_c[0])\
+ :"$10","$12","$13")\
+; ++tmpm;
+
+#define PROPCARRY \
+asm( \
+ " lw $10,%1 \n\t" \
+ " addu $10,$10,%0 \n\t" \
+ " sw $10,%1 \n\t" \
+ " sltu %0,$10,%0 \n\t" \
+ :"=r"(cy),"=m"(_c[0])\
+ :"r"(cy),"r"(_c[0])\
+ :"$10");
+
+
+/******************************************************************************/
+#else
+
+/* ISO C code */
+/*
+ * Portable versions of the hooks used by pstm_montgomery_reduce().
+ * They refer to that function's locals by name: c, x, mp, mu, _c, cy, tmpm.
+ */
+/* MONT_START, MONT_FINI and LOOP_END expand to nothing in this variant */
+#define MONT_START
+#define MONT_FINI
+#define LOOP_END
+/* per outer round: mu = c[x] * mp, truncated to one digit */
+#define LOOP_START \
+ mu = c[x] * mp
+
+/*
+ * One inner step: t = _c[0] + cy + mu * (*tmpm), computed in pstm_word.
+ * The low digit of t is stored back to _c[0], the high digit becomes the
+ * new cy. Advances tmpm; the caller advances _c.
+ */
+#define INNERMUL \
+ do { pstm_word t; \
+ t = ((pstm_word)_c[0] + (pstm_word)cy) + \
+ (((pstm_word)mu) * ((pstm_word)*tmpm++)); \
+ _c[0] = (pstm_digit)t; \
+ cy = (pstm_digit)(t >> DIGIT_BIT); \
+ } while (0)
+
+/*
+ * Add the pending carry into _c[0]; cy becomes 1 if that addition
+ * wrapped around (result smaller than the carry added), else 0.
+ */
+#define PROPCARRY \
+ do { pstm_digit t = _c[0] += cy; cy = (t < cy); } while (0)
+
+#endif
+
+/******************************************************************************/
+
+#define LO 0
+
+/*
+ * Montgomery reduction: computes x/R == x (mod N), replacing 'a' in place.
+ *
+ * pool   - only handed to psFree(); the bbox wrapper macro of the same
+ *          name drops this argument from the real prototype
+ * a      - in: value to reduce; out: the reduced value
+ * m      - modulus; m->used sets the working size 'pa'
+ * mp     - per-round multiplier: each round uses mu = c[x] * mp
+ *          (presumably the value produced by pstm_montgomery_setup() -
+ *          confirm against the caller)
+ * paD    - optional caller-supplied scratch buffer, may be NULL
+ * paDlen - size of paD in BYTES (it is passed to memset() as a byte count)
+ *
+ * The scratch area must hold 2*pa+1 digits. The caller's buffer is used
+ * only when it is at least that many digits large (measured in bytes);
+ * otherwise a zeroed heap buffer of the right size is allocated and freed
+ * before returning.
+ *
+ * Returns PSTM_OKAY on success, PS_LIMIT_FAIL when the operands do not
+ * fit the working buffers, PS_MEM_FAIL when 'a' cannot be grown or the
+ * final subtraction fails.
+ */
+int32 pstm_montgomery_reduce(psPool_t *pool, pstm_int *a, pstm_int *m,
+		pstm_digit mp, pstm_digit *paD, uint32 paDlen)
+{
+	/*
+	 * c, _c, tmpm, mu, x and the per-round cy are referenced by name from
+	 * the LOOP_START / INNERMUL / PROPCARRY macros: do not rename them.
+	 */
+	pstm_digit *c, *_c, *tmpm, mu;
+	int32 oldused, x, y;
+	int16 pa;
+	uint32 ndigits, nbytes;
+
+	pa = m->used;
+	if (pa > a->alloc) {
+		/* Sanity test for bad numbers. This will confirm no buffer overruns */
+		return PS_LIMIT_FAIL;
+	}
+
+	/* the scratch area holds 2*pa+1 digits */
+	ndigits = 2 * (uint32)pa + 1;
+	nbytes = ndigits * (uint32)sizeof(pstm_digit);
+
+	/* the input is copied into the scratch area, so it must fit there */
+	oldused = a->used;
+	if (oldused > (int32)ndigits) {
+		return PS_LIMIT_FAIL;
+	}
+
+	/* the result is written back as pa+1 digits: make sure a has room */
+	if (a->alloc < pa + 1) {
+		if (pstm_grow(a, pa + 1) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+
+	/* paDlen is a byte count, so compare it against bytes, not digits */
+	if (paD && paDlen >= nbytes) {
+		c = paD;
+		memset(c, 0x0, paDlen);
+	} else {
+		c = xzalloc(nbytes);
+	}
+
+	/* copy the input */
+	for (x = 0; x < oldused; x++) {
+		c[x] = a->dp[x];
+	}
+
+	MONT_START;
+
+	for (x = 0; x < pa; x++) {
+		pstm_digit cy = 0;
+		/* get Mu for this round */
+		LOOP_START;
+		_c = c + x;
+		tmpm = m->dp;
+		y = 0;
+#ifdef PSTM_X86_64
+		/* x86-64 only: process eight digits per step while possible */
+		for (; y < (pa & ~7); y += 8) {
+			INNERMUL8;
+			_c += 8;
+			tmpm += 8;
+		}
+#endif /* PSTM_X86_64 */
+		for (; y < pa; y++) {
+			INNERMUL;
+			++_c;
+		}
+		LOOP_END;
+		/* ripple the remaining carry into the higher digits */
+		while (cy) {
+			PROPCARRY;
+			++_c;
+		}
+	}
+
+	/* now copy out: the result is the upper pa+1 digits of the scratch */
+	_c = c + pa;
+	tmpm = a->dp;
+	for (x = 0; x < pa+1; x++) {
+		*tmpm++ = *_c++;
+	}
+
+	/* clear digits of the old value that lie above the result */
+	for (; x < oldused; x++) {
+		*tmpm++ = 0;
+	}
+
+	MONT_FINI;
+
+	a->used = pa+1;
+	pstm_clamp(a);
+
+	/* reuse x as return code */
+	x = PSTM_OKAY;
+
+	/* if A >= m then A = A - m */
+	if (pstm_cmp_mag (a, m) != PSTM_LT) {
+		if (s_pstm_sub (a, m, a) != PSTM_OKAY) {
+			x = PS_MEM_FAIL;
+		}
+	}
+
+	/* free the scratch area only if it was allocated here */
+	if (c != paD) {
+		psFree(c, pool);
+	}
+	return x;
+}
+
+#endif /* !DISABLE_PSTM */
+/******************************************************************************/
diff --git a/networking/tls_pstm_mul_comba.c b/networking/tls_pstm_mul_comba.c
new file mode 100644
index 000000000..6e051baeb
--- /dev/null
+++ b/networking/tls_pstm_mul_comba.c
@@ -0,0 +1,777 @@
+/*
+ * Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+#include "tls.h"
+
+/**
+ * @file pstm_mul_comba.c
+ * @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
+ *
+ * Multiprecision multiplication with Comba technique.
+ */
+/*
+ * Copyright (c) 2013-2015 INSIDE Secure Corporation
+ * Copyright (c) PeerSec Networks, 2002-2011
+ * All Rights Reserved
+ *
+ * The latest version of this code is available at http://www.matrixssl.org
+ *
+ * This software is open source; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This General Public License does NOT permit incorporating this software
+ * into proprietary programs. If you are unable to comply with the GPL, a
+ * commercial license for this software may be purchased from INSIDE at
+ * http://www.insidesecure.com/eng/Company/Locations
+ *
+ * This program is distributed in WITHOUT ANY WARRANTY; without even the
+ * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+/******************************************************************************/
+
+///bbox
+//#include "../cryptoApi.h"
+#ifndef DISABLE_PSTM
+
+/******************************************************************************/
+#if defined(PSTM_X86)
+/* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
+#if !defined(__GNUC__) || !defined(__i386__) || !defined(PSTM_32BIT)
+#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
+#endif
+//#pragma message ("Using 32 bit x86 Assembly Optimizations")
+
+/* anything you need at the start */
+#define COMBA_START
+
+/* clear the chaining variables */
+#define COMBA_CLEAR \
+ c0 = c1 = c2 = 0;
+
+/* forward the carry to the next digit */
+#define COMBA_FORWARD \
+ do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+/* store the first sum */
+#define COMBA_STORE(x) \
+ x = c0;
+
+/* store the second sum [carry] */
+#define COMBA_STORE2(x) \
+ x = c1;
+
+/* anything you need at the end */
+#define COMBA_FINI
+
+/* this should multiply i and j */
+#define MULADD(i, j) \
+asm( \
+ "movl %6,%%eax \n\t" \
+ "mull %7 \n\t" \
+ "addl %%eax,%0 \n\t" \
+ "adcl %%edx,%1 \n\t" \
+ "adcl $0,%2 \n\t" \
+ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
+
+/******************************************************************************/
+#elif defined(PSTM_X86_64)
+/* x86-64 optimized */
+#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
+#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
+#endif
+//#pragma message ("Using 64 bit x86_64 Assembly Optimizations")
+
+/* anything you need at the start */
+#define COMBA_START
+
+/* clear the chaining variables */
+#define COMBA_CLEAR \
+c0 = c1 = c2 = 0;
+
+/* forward the carry to the next digit */
+#define COMBA_FORWARD \
+do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+/* store the first sum */
+#define COMBA_STORE(x) \
+x = c0;
+
+/* store the second sum [carry] */
+#define COMBA_STORE2(x) \
+x = c1;
+
+/* anything you need at the end */
+#define COMBA_FINI
+
+/* this should multiply i and j */
+#define MULADD(i, j) \
+asm ( \
+ "movq %6,%%rax \n\t" \
+ "mulq %7 \n\t" \
+ "addq %%rax,%0 \n\t" \
+ "adcq %%rdx,%1 \n\t" \
+ "adcq $0,%2 \n\t" \
+ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
+
+/******************************************************************************/
+#elif defined(PSTM_ARM)
+/* ARM code */
+//#pragma message ("Using 32 bit ARM Assembly Optimizations")
+
+#define COMBA_START
+
+#define COMBA_CLEAR \
+c0 = c1 = c2 = 0;
+
+#define COMBA_FORWARD \
+do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_STORE(x) \
+x = c0;
+
+#define COMBA_STORE2(x) \
+x = c1;
+
+#define COMBA_FINI
+
+#define MULADD(i, j) \
+asm( \
+ " UMULL r0,r1,%6,%7 \n\t" \
+ " ADDS %0,%0,r0 \n\t" \
+ " ADCS %1,%1,r1 \n\t" \
+ " ADC %2,%2,#0 \n\t" \
+ :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");
+
+/******************************************************************************/
+#elif defined(PSTM_MIPS)
+/* MIPS32 code */
+//#pragma message ("Using 32 bit MIPS Assembly Optimizations")
+
+#define COMBA_START
+
+#define COMBA_CLEAR \
+c0 = c1 = c2 = 0;
+
+#define COMBA_FORWARD \
+do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_STORE(x) \
+x = c0;
+
+#define COMBA_STORE2(x) \
+x = c1;
+
+#define COMBA_FINI
+
+#define MULADD(i, j) \
+asm( \
+ " multu %6,%7 \n\t" \
+ " mflo $12 \n\t" \
+ " mfhi $13 \n\t" \
+ " addu %0,%0,$12 \n\t" \
+ " sltu $12,%0,$12 \n\t" \
+ " addu %1,%1,$13 \n\t" \
+ " sltu $13,%1,$13 \n\t" \
+ " addu %1,%1,$12 \n\t" \
+ " sltu $12,%1,$12 \n\t" \
+ " addu %2,%2,$13 \n\t" \
+ " addu %2,%2,$12 \n\t" \
+ :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12","$13");
+
+/******************************************************************************/
+#else
+
+/*
+ * Portable (ISO C) versions of the Comba hooks. They operate on the
+ * three-digit accumulator c2:c1:c0 that the calling function declares.
+ * NOTE(review): several of these macros already end in ';' (including
+ * the do { } while (0); ones), so a use followed by ';' yields an extra
+ * empty statement and cannot sit in an unbraced if/else.
+ */
+/* anything you need at the start (nothing here) */
+#define COMBA_START
+
+/* clear the chaining variables */
+#define COMBA_CLEAR \
+ c0 = c1 = c2 = 0;
+
+/* forward the carry to the next digit */
+#define COMBA_FORWARD \
+ do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+/* store the first sum */
+#define COMBA_STORE(x) \
+ x = c0;
+
+/* store the second sum [carry] */
+#define COMBA_STORE2(x) \
+ x = c1;
+
+/* anything you need at the end (nothing here) */
+#define COMBA_FINI
+
+/*
+ * Accumulate i * j into c2:c1:c0. The product is formed in pstm_word and
+ * added to c0; its upper half is added to c1; what overflows c1 is added
+ * to c2. The arguments are pasted in without parentheses, so pass
+ * primary/unary expressions only (e.g. *p++ or at[3]).
+ */
+#define MULADD(i, j) \
+ do { pstm_word t; \
+ t = (pstm_word)c0 + ((pstm_word)i) * ((pstm_word)j); c0 = (pstm_digit)t; \
+ t = (pstm_word)c1 + (t >> DIGIT_BIT); \
+ c1 = (pstm_digit)t; c2 += (pstm_digit)(t >> DIGIT_BIT); \
+ } while (0);
+
+#endif
+
+/******************************************************************************/
+/*
+ * Generic PxQ Comba multiplier: C = A * B.
+ *
+ * The product is built column by column in a scratch buffer and only
+ * then copied into C. The scratch buffer is paD when the caller supplies
+ * one of at least (A->used + B->used) digits (paDlen is in bytes);
+ * otherwise a zeroed heap buffer is used and released before returning.
+ *
+ * Returns PS_SUCCESS, or PS_MEM_FAIL if C cannot be grown.
+ */
+///bbox: pool unused
+#define pstm_mul_comba_gen(pool, A, B, C, paD, paDlen) \
+	pstm_mul_comba_gen(      A, B, C, paD, paDlen)
+static int32 pstm_mul_comba_gen(psPool_t *pool, pstm_int *A, pstm_int *B,
+		pstm_int *C, pstm_digit *paD, uint32 paDlen)
+{
+	/* c0/c1/c2 are named by the COMBA_xxx and MULADD macros */
+	pstm_digit c0, c1, c2;
+	pstm_digit *a_dig, *b_dig, *prod;
+	int32 col, a_idx, b_idx, terms, k, prev_used;
+	int16 ndigits, heap_prod;
+
+	COMBA_START;
+	COMBA_CLEAR;
+
+	/* size of the output, before trimming */
+	ndigits = A->used + B->used;
+
+	/* make room in C before doing any work */
+	if (C->alloc < ndigits && pstm_grow(C, ndigits) != PSTM_OKAY) {
+		return PS_MEM_FAIL;
+	}
+
+	/* no caller buffer, or one that is too small: fall back to the heap */
+	heap_prod = (paD == NULL || paDlen < (sizeof(pstm_digit) * ndigits));
+	if (heap_prod) {
+		prod = xzalloc(sizeof(pstm_digit) * ndigits);
+	} else {
+		prod = paD;
+		memset(prod, 0x0, paDlen);
+	}
+
+	for (col = 0; col < ndigits; col++) {
+		/* first pair of digits contributing to this column */
+		b_idx = min(col, B->used-1);
+		a_idx = col - b_idx;
+		a_dig = A->dp + a_idx;
+		b_dig = B->dp + b_idx;
+
+		/* number of digit pairs in this column: A is walked upwards
+		 * and B downwards until either one runs out */
+		terms = min(A->used-a_idx, b_idx+1);
+
+		/* shift the accumulator, then add up this column's products */
+		COMBA_FORWARD;
+		for (k = terms; k > 0; k--) {
+			MULADD(*a_dig++, *b_dig--);
+		}
+
+		/* store term */
+		COMBA_STORE(prod[col]);
+	}
+	COMBA_FINI;
+
+	/* set up the destination */
+	prev_used = C->used;
+	C->used = ndigits;
+	C->sign = A->sign ^ B->sign;
+	for (col = 0; col < ndigits; col++) {
+		C->dp[col] = prod[col];
+	}
+	/* clear digits left over from the previous, longer value of C */
+	for (; col < prev_used; col++) {
+		C->dp[col] = 0;
+	}
+	pstm_clamp(C);
+
+	if (heap_prod) {
+		psFree(prod, pool);
+	}
+
+	return PS_SUCCESS;
+}
+
+/******************************************************************************/
+#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
+static int32 pstm_mul_comba16(pstm_int *A, pstm_int *B, pstm_int *C)
+{
+ pstm_digit c0, c1, c2, at[32];
+
+ if (C->alloc < 32) {
+ if (pstm_grow(C, 32) != PSTM_OKAY) {
+ return PS_MEM_FAIL;
+ }
+ }
+ memcpy(at, A->dp, 16 * sizeof(pstm_digit));
+ memcpy(at+16, B->dp, 16 * sizeof(pstm_digit));
+
+ COMBA_START;
+
+ COMBA_CLEAR;
+ /* 0 */
+ MULADD(at[0], at[16]);
+ COMBA_STORE(C->dp[0]);
+ /* 1 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[17]); MULADD(at[1], at[16]);
+ COMBA_STORE(C->dp[1]);
+ /* 2 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[18]); MULADD(at[1], at[17]); MULADD(at[2], at[16]);
+ COMBA_STORE(C->dp[2]);
+ /* 3 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[19]); MULADD(at[1], at[18]); MULADD(at[2], at[17]); MULADD(at[3], at[16]);
+ COMBA_STORE(C->dp[3]);
+ /* 4 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[20]); MULADD(at[1], at[19]); MULADD(at[2], at[18]); MULADD(at[3], at[17]); MULADD(at[4], at[16]);
+ COMBA_STORE(C->dp[4]);
+ /* 5 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[21]); MULADD(at[1], at[20]); MULADD(at[2], at[19]); MULADD(at[3], at[18]); MULADD(at[4], at[17]); MULADD(at[5], at[16]);
+ COMBA_STORE(C->dp[5]);
+ /* 6 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[22]); MULADD(at[1], at[21]); MULADD(at[2], at[20]); MULADD(at[3], at[19]); MULADD(at[4], at[18]); MULADD(at[5], at[17]); MULADD(at[6], at[16]);
+ COMBA_STORE(C->dp[6]);
+ /* 7 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[23]); MULADD(at[1], at[22]); MULADD(at[2], at[21]); MULADD(at[3], at[20]); MULADD(at[4], at[19]); MULADD(at[5], at[18]); MULADD(at[6], at[17]); MULADD(at[7], at[16]);
+ COMBA_STORE(C->dp[7]);
+ /* 8 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[24]); MULADD(at[1], at[23]); MULADD(at[2], at[22]); MULADD(at[3], at[21]); MULADD(at[4], at[20]); MULADD(at[5], at[19]); MULADD(at[6], at[18]); MULADD(at[7], at[17]); MULADD(at[8], at[16]);
+ COMBA_STORE(C->dp[8]);
+ /* 9 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[25]); MULADD(at[1], at[24]); MULADD(at[2], at[23]); MULADD(at[3], at[22]); MULADD(at[4], at[21]); MULADD(at[5], at[20]); MULADD(at[6], at[19]); MULADD(at[7], at[18]); MULADD(at[8], at[17]); MULADD(at[9], at[16]);
+ COMBA_STORE(C->dp[9]);
+ /* 10 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[26]); MULADD(at[1], at[25]); MULADD(at[2], at[24]); MULADD(at[3], at[23]); MULADD(at[4], at[22]); MULADD(at[5], at[21]); MULADD(at[6], at[20]); MULADD(at[7], at[19]); MULADD(at[8], at[18]); MULADD(at[9], at[17]); MULADD(at[10], at[16]);
+ COMBA_STORE(C->dp[10]);
+ /* 11 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[27]); MULADD(at[1], at[26]); MULADD(at[2], at[25]); MULADD(at[3], at[24]); MULADD(at[4], at[23]); MULADD(at[5], at[22]); MULADD(at[6], at[21]); MULADD(at[7], at[20]); MULADD(at[8], at[19]); MULADD(at[9], at[18]); MULADD(at[10], at[17]); MULADD(at[11], at[16]);
+ COMBA_STORE(C->dp[11]);
+ /* 12 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[28]); MULADD(at[1], at[27]); MULADD(at[2], at[26]); MULADD(at[3], at[25]); MULADD(at[4], at[24]); MULADD(at[5], at[23]); MULADD(at[6], at[22]); MULADD(at[7], at[21]); MULADD(at[8], at[20]); MULADD(at[9], at[19]); MULADD(at[10], at[18]); MULADD(at[11], at[17]); MULADD(at[12], at[16]);
+ COMBA_STORE(C->dp[12]);
+ /* 13 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[29]); MULADD(at[1], at[28]); MULADD(at[2], at[27]); MULADD(at[3], at[26]); MULADD(at[4], at[25]); MULADD(at[5], at[24]); MULADD(at[6], at[23]); MULADD(at[7], at[22]); MULADD(at[8], at[21]); MULADD(at[9], at[20]); MULADD(at[10], at[19]); MULADD(at[11], at[18]); MULADD(at[12], at[17]); MULADD(at[13], at[16]);
+ COMBA_STORE(C->dp[13]);
+ /* 14 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[30]); MULADD(at[1], at[29]); MULADD(at[2], at[28]); MULADD(at[3], at[27]); MULADD(at[4], at[26]); MULADD(at[5], at[25]); MULADD(at[6], at[24]); MULADD(at[7], at[23]); MULADD(at[8], at[22]); MULADD(at[9], at[21]); MULADD(at[10], at[20]); MULADD(at[11], at[19]); MULADD(at[12], at[18]); MULADD(at[13], at[17]); MULADD(at[14], at[16]);
+ COMBA_STORE(C->dp[14]);
+ /* 15 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[31]); MULADD(at[1], at[30]); MULADD(at[2], at[29]); MULADD(at[3], at[28]); MULADD(at[4], at[27]); MULADD(at[5], at[26]); MULADD(at[6], at[25]); MULADD(at[7], at[24]); MULADD(at[8], at[23]); MULADD(at[9], at[22]); MULADD(at[10], at[21]); MULADD(at[11], at[20]); MULADD(at[12], at[19]); MULADD(at[13], at[18]); MULADD(at[14], at[17]); MULADD(at[15], at[16]);
+ COMBA_STORE(C->dp[15]);
+ /* 16 */
+ COMBA_FORWARD;
+ MULADD(at[1], at[31]); MULADD(at[2], at[30]); MULADD(at[3], at[29]); MULADD(at[4], at[28]); MULADD(at[5], at[27]); MULADD(at[6], at[26]); MULADD(at[7], at[25]); MULADD(at[8], at[24]); MULADD(at[9], at[23]); MULADD(at[10], at[22]); MULADD(at[11], at[21]); MULADD(at[12], at[20]); MULADD(at[13], at[19]); MULADD(at[14], at[18]); MULADD(at[15], at[17]);
+ COMBA_STORE(C->dp[16]);
+ /* 17 */
+ COMBA_FORWARD;
+ MULADD(at[2], at[31]); MULADD(at[3], at[30]); MULADD(at[4], at[29]); MULADD(at[5], at[28]); MULADD(at[6], at[27]); MULADD(at[7], at[26]); MULADD(at[8], at[25]); MULADD(at[9], at[24]); MULADD(at[10], at[23]); MULADD(at[11], at[22]); MULADD(at[12], at[21]); MULADD(at[13], at[20]); MULADD(at[14], at[19]); MULADD(at[15], at[18]);
+ COMBA_STORE(C->dp[17]);
+ /* 18 */
+ COMBA_FORWARD;
+ MULADD(at[3], at[31]); MULADD(at[4], at[30]); MULADD(at[5], at[29]); MULADD(at[6], at[28]); MULADD(at[7], at[27]); MULADD(at[8], at[26]); MULADD(at[9], at[25]); MULADD(at[10], at[24]); MULADD(at[11], at[23]); MULADD(at[12], at[22]); MULADD(at[13], at[21]); MULADD(at[14], at[20]); MULADD(at[15], at[19]);
+ COMBA_STORE(C->dp[18]);
+ /* 19 */
+ COMBA_FORWARD;
+ MULADD(at[4], at[31]); MULADD(at[5], at[30]); MULADD(at[6], at[29]); MULADD(at[7], at[28]); MULADD(at[8], at[27]); MULADD(at[9], at[26]); MULADD(at[10], at[25]); MULADD(at[11], at[24]); MULADD(at[12], at[23]); MULADD(at[13], at[22]); MULADD(at[14], at[21]); MULADD(at[15], at[20]);
+ COMBA_STORE(C->dp[19]);
+ /* 20 */
+ COMBA_FORWARD;
+ MULADD(at[5], at[31]); MULADD(at[6], at[30]); MULADD(at[7], at[29]); MULADD(at[8], at[28]); MULADD(at[9], at[27]); MULADD(at[10], at[26]); MULADD(at[11], at[25]); MULADD(at[12], at[24]); MULADD(at[13], at[23]); MULADD(at[14], at[22]); MULADD(at[15], at[21]);
+ COMBA_STORE(C->dp[20]);
+ /* 21 */
+ COMBA_FORWARD;
+ MULADD(at[6], at[31]); MULADD(at[7], at[30]); MULADD(at[8], at[29]); MULADD(at[9], at[28]); MULADD(at[10], at[27]); MULADD(at[11], at[26]); MULADD(at[12], at[25]); MULADD(at[13], at[24]); MULADD(at[14], at[23]); MULADD(at[15], at[22]);
+ COMBA_STORE(C->dp[21]);
+ /* 22 */
+ COMBA_FORWARD;
+ MULADD(at[7], at[31]); MULADD(at[8], at[30]); MULADD(at[9], at[29]); MULADD(at[10], at[28]); MULADD(at[11], at[27]); MULADD(at[12], at[26]); MULADD(at[13], at[25]); MULADD(at[14], at[24]); MULADD(at[15], at[23]);
+ COMBA_STORE(C->dp[22]);
+ /* 23 */
+ COMBA_FORWARD;
+ MULADD(at[8], at[31]); MULADD(at[9], at[30]); MULADD(at[10], at[29]); MULADD(at[11], at[28]); MULADD(at[12], at[27]); MULADD(at[13], at[26]); MULADD(at[14], at[25]); MULADD(at[15], at[24]);
+ COMBA_STORE(C->dp[23]);
+ /* 24 */
+ COMBA_FORWARD;
+ MULADD(at[9], at[31]); MULADD(at[10], at[30]); MULADD(at[11], at[29]); MULADD(at[12], at[28]); MULADD(at[13], at[27]); MULADD(at[14], at[26]); MULADD(at[15], at[25]);
+ COMBA_STORE(C->dp[24]);
+ /* 25 */
+ COMBA_FORWARD;
+ MULADD(at[10], at[31]); MULADD(at[11], at[30]); MULADD(at[12], at[29]); MULADD(at[13], at[28]); MULADD(at[14], at[27]); MULADD(at[15], at[26]);
+ COMBA_STORE(C->dp[25]);
+ /* 26 */
+ COMBA_FORWARD;
+ MULADD(at[11], at[31]); MULADD(at[12], at[30]); MULADD(at[13], at[29]); MULADD(at[14], at[28]); MULADD(at[15], at[27]);
+ COMBA_STORE(C->dp[26]);
+ /* 27 */
+ COMBA_FORWARD;
+ MULADD(at[12], at[31]); MULADD(at[13], at[30]); MULADD(at[14], at[29]); MULADD(at[15], at[28]);
+ COMBA_STORE(C->dp[27]);
+ /* 28 */
+ COMBA_FORWARD;
+ MULADD(at[13], at[31]); MULADD(at[14], at[30]); MULADD(at[15], at[29]);
+ COMBA_STORE(C->dp[28]);
+ /* 29 */
+ COMBA_FORWARD;
+ MULADD(at[14], at[31]); MULADD(at[15], at[30]);
+ COMBA_STORE(C->dp[29]);
+ /* 30 */
+ COMBA_FORWARD;
+ MULADD(at[15], at[31]);
+ COMBA_STORE(C->dp[30]);
+ COMBA_STORE2(C->dp[31]);
+ C->used = 32;
+ C->sign = A->sign ^ B->sign;
+ pstm_clamp(C);
+ COMBA_FINI;
+ return PSTM_OKAY;
+}
+#endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */
+
+
+#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
+/*
+ * Unrolled product C = A * B for operands of 32 digits each.
+ *
+ * Exactly 32 digits are copied from A->dp and 32 from B->dp into the local
+ * array at[] (A in at[0..31], B in at[32..63]) before the first
+ * COMBA_STORE into C->dp. Each numbered section below handles one output
+ * column k: it feeds the products at[i] * at[32 + j] with i + j == k to
+ * MULADD and then stores one digit to C->dp[k].
+ *
+ * C is grown to 64 digits when C->alloc is smaller. The result sign is
+ * A->sign ^ B->sign and pstm_clamp() is applied to C before returning.
+ *
+ * Returns PSTM_OKAY on success, or PS_MEM_FAIL if pstm_grow() fails.
+ *
+ * NOTE(review): MULADD and the COMBA_* macros are defined outside this
+ * function; c0/c1/c2 are presumably the accumulator they operate on.
+ * NOTE(review): pstm_mul_comba() calls this only when A->used == 32 and
+ * B->used == 32, in which case out_size is 64 and none of the early-out
+ * branches is taken - confirm whether any other caller exists.
+ */
+static int32 pstm_mul_comba32(pstm_int *A, pstm_int *B, pstm_int *C)
+{
+ pstm_digit c0, c1, c2, at[64];
+ int32 out_size;
+
+ if (C->alloc < 64) {
+ if (pstm_grow(C, 64) != PSTM_OKAY) {
+ return PS_MEM_FAIL;
+ }
+ }
+
+ /* Combined digit count; only consulted by the early-out checks below. */
+ out_size = A->used + B->used;
+ /* Snapshot both operands; presumably this lets C alias A or B - TODO confirm. */
+ memcpy(at, A->dp, 32 * sizeof(pstm_digit));
+ memcpy(at+32, B->dp, 32 * sizeof(pstm_digit));
+ COMBA_START;
+
+ COMBA_CLEAR;
+ /* 0 */
+ MULADD(at[0], at[32]);
+ COMBA_STORE(C->dp[0]);
+ /* 1 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[33]); MULADD(at[1], at[32]);
+ COMBA_STORE(C->dp[1]);
+ /* 2 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[34]); MULADD(at[1], at[33]); MULADD(at[2], at[32]);
+ COMBA_STORE(C->dp[2]);
+ /* 3 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[35]); MULADD(at[1], at[34]); MULADD(at[2], at[33]); MULADD(at[3], at[32]);
+ COMBA_STORE(C->dp[3]);
+ /* 4 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[36]); MULADD(at[1], at[35]); MULADD(at[2], at[34]); MULADD(at[3], at[33]); MULADD(at[4], at[32]);
+ COMBA_STORE(C->dp[4]);
+ /* 5 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[37]); MULADD(at[1], at[36]); MULADD(at[2], at[35]); MULADD(at[3], at[34]); MULADD(at[4], at[33]); MULADD(at[5], at[32]);
+ COMBA_STORE(C->dp[5]);
+ /* 6 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[38]); MULADD(at[1], at[37]); MULADD(at[2], at[36]); MULADD(at[3], at[35]); MULADD(at[4], at[34]); MULADD(at[5], at[33]); MULADD(at[6], at[32]);
+ COMBA_STORE(C->dp[6]);
+ /* 7 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[39]); MULADD(at[1], at[38]); MULADD(at[2], at[37]); MULADD(at[3], at[36]); MULADD(at[4], at[35]); MULADD(at[5], at[34]); MULADD(at[6], at[33]); MULADD(at[7], at[32]);
+ COMBA_STORE(C->dp[7]);
+ /* 8 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[40]); MULADD(at[1], at[39]); MULADD(at[2], at[38]); MULADD(at[3], at[37]); MULADD(at[4], at[36]); MULADD(at[5], at[35]); MULADD(at[6], at[34]); MULADD(at[7], at[33]); MULADD(at[8], at[32]);
+ COMBA_STORE(C->dp[8]);
+ /* 9 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[41]); MULADD(at[1], at[40]); MULADD(at[2], at[39]); MULADD(at[3], at[38]); MULADD(at[4], at[37]); MULADD(at[5], at[36]); MULADD(at[6], at[35]); MULADD(at[7], at[34]); MULADD(at[8], at[33]); MULADD(at[9], at[32]);
+ COMBA_STORE(C->dp[9]);
+ /* 10 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[42]); MULADD(at[1], at[41]); MULADD(at[2], at[40]); MULADD(at[3], at[39]); MULADD(at[4], at[38]); MULADD(at[5], at[37]); MULADD(at[6], at[36]); MULADD(at[7], at[35]); MULADD(at[8], at[34]); MULADD(at[9], at[33]); MULADD(at[10], at[32]);
+ COMBA_STORE(C->dp[10]);
+ /* 11 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[43]); MULADD(at[1], at[42]); MULADD(at[2], at[41]); MULADD(at[3], at[40]); MULADD(at[4], at[39]); MULADD(at[5], at[38]); MULADD(at[6], at[37]); MULADD(at[7], at[36]); MULADD(at[8], at[35]); MULADD(at[9], at[34]); MULADD(at[10], at[33]); MULADD(at[11], at[32]);
+ COMBA_STORE(C->dp[11]);
+ /* 12 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[44]); MULADD(at[1], at[43]); MULADD(at[2], at[42]); MULADD(at[3], at[41]); MULADD(at[4], at[40]); MULADD(at[5], at[39]); MULADD(at[6], at[38]); MULADD(at[7], at[37]); MULADD(at[8], at[36]); MULADD(at[9], at[35]); MULADD(at[10], at[34]); MULADD(at[11], at[33]); MULADD(at[12], at[32]);
+ COMBA_STORE(C->dp[12]);
+ /* 13 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[45]); MULADD(at[1], at[44]); MULADD(at[2], at[43]); MULADD(at[3], at[42]); MULADD(at[4], at[41]); MULADD(at[5], at[40]); MULADD(at[6], at[39]); MULADD(at[7], at[38]); MULADD(at[8], at[37]); MULADD(at[9], at[36]); MULADD(at[10], at[35]); MULADD(at[11], at[34]); MULADD(at[12], at[33]); MULADD(at[13], at[32]);
+ COMBA_STORE(C->dp[13]);
+ /* 14 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[46]); MULADD(at[1], at[45]); MULADD(at[2], at[44]); MULADD(at[3], at[43]); MULADD(at[4], at[42]); MULADD(at[5], at[41]); MULADD(at[6], at[40]); MULADD(at[7], at[39]); MULADD(at[8], at[38]); MULADD(at[9], at[37]); MULADD(at[10], at[36]); MULADD(at[11], at[35]); MULADD(at[12], at[34]); MULADD(at[13], at[33]); MULADD(at[14], at[32]);
+ COMBA_STORE(C->dp[14]);
+ /* 15 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[47]); MULADD(at[1], at[46]); MULADD(at[2], at[45]); MULADD(at[3], at[44]); MULADD(at[4], at[43]); MULADD(at[5], at[42]); MULADD(at[6], at[41]); MULADD(at[7], at[40]); MULADD(at[8], at[39]); MULADD(at[9], at[38]); MULADD(at[10], at[37]); MULADD(at[11], at[36]); MULADD(at[12], at[35]); MULADD(at[13], at[34]); MULADD(at[14], at[33]); MULADD(at[15], at[32]);
+ COMBA_STORE(C->dp[15]);
+ /* 16 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[48]); MULADD(at[1], at[47]); MULADD(at[2], at[46]); MULADD(at[3], at[45]); MULADD(at[4], at[44]); MULADD(at[5], at[43]); MULADD(at[6], at[42]); MULADD(at[7], at[41]); MULADD(at[8], at[40]); MULADD(at[9], at[39]); MULADD(at[10], at[38]); MULADD(at[11], at[37]); MULADD(at[12], at[36]); MULADD(at[13], at[35]); MULADD(at[14], at[34]); MULADD(at[15], at[33]); MULADD(at[16], at[32]);
+ COMBA_STORE(C->dp[16]);
+ /* 17 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[49]); MULADD(at[1], at[48]); MULADD(at[2], at[47]); MULADD(at[3], at[46]); MULADD(at[4], at[45]); MULADD(at[5], at[44]); MULADD(at[6], at[43]); MULADD(at[7], at[42]); MULADD(at[8], at[41]); MULADD(at[9], at[40]); MULADD(at[10], at[39]); MULADD(at[11], at[38]); MULADD(at[12], at[37]); MULADD(at[13], at[36]); MULADD(at[14], at[35]); MULADD(at[15], at[34]); MULADD(at[16], at[33]); MULADD(at[17], at[32]);
+ COMBA_STORE(C->dp[17]);
+ /* 18 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[50]); MULADD(at[1], at[49]); MULADD(at[2], at[48]); MULADD(at[3], at[47]); MULADD(at[4], at[46]); MULADD(at[5], at[45]); MULADD(at[6], at[44]); MULADD(at[7], at[43]); MULADD(at[8], at[42]); MULADD(at[9], at[41]); MULADD(at[10], at[40]); MULADD(at[11], at[39]); MULADD(at[12], at[38]); MULADD(at[13], at[37]); MULADD(at[14], at[36]); MULADD(at[15], at[35]); MULADD(at[16], at[34]); MULADD(at[17], at[33]); MULADD(at[18], at[32]);
+ COMBA_STORE(C->dp[18]);
+ /* 19 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[51]); MULADD(at[1], at[50]); MULADD(at[2], at[49]); MULADD(at[3], at[48]); MULADD(at[4], at[47]); MULADD(at[5], at[46]); MULADD(at[6], at[45]); MULADD(at[7], at[44]); MULADD(at[8], at[43]); MULADD(at[9], at[42]); MULADD(at[10], at[41]); MULADD(at[11], at[40]); MULADD(at[12], at[39]); MULADD(at[13], at[38]); MULADD(at[14], at[37]); MULADD(at[15], at[36]); MULADD(at[16], at[35]); MULADD(at[17], at[34]); MULADD(at[18], at[33]); MULADD(at[19], at[32]);
+ COMBA_STORE(C->dp[19]);
+ /* 20 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[52]); MULADD(at[1], at[51]); MULADD(at[2], at[50]); MULADD(at[3], at[49]); MULADD(at[4], at[48]); MULADD(at[5], at[47]); MULADD(at[6], at[46]); MULADD(at[7], at[45]); MULADD(at[8], at[44]); MULADD(at[9], at[43]); MULADD(at[10], at[42]); MULADD(at[11], at[41]); MULADD(at[12], at[40]); MULADD(at[13], at[39]); MULADD(at[14], at[38]); MULADD(at[15], at[37]); MULADD(at[16], at[36]); MULADD(at[17], at[35]); MULADD(at[18], at[34]); MULADD(at[19], at[33]); MULADD(at[20], at[32]);
+ COMBA_STORE(C->dp[20]);
+ /* 21 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[53]); MULADD(at[1], at[52]); MULADD(at[2], at[51]); MULADD(at[3], at[50]); MULADD(at[4], at[49]); MULADD(at[5], at[48]); MULADD(at[6], at[47]); MULADD(at[7], at[46]); MULADD(at[8], at[45]); MULADD(at[9], at[44]); MULADD(at[10], at[43]); MULADD(at[11], at[42]); MULADD(at[12], at[41]); MULADD(at[13], at[40]); MULADD(at[14], at[39]); MULADD(at[15], at[38]); MULADD(at[16], at[37]); MULADD(at[17], at[36]); MULADD(at[18], at[35]); MULADD(at[19], at[34]); MULADD(at[20], at[33]); MULADD(at[21], at[32]);
+ COMBA_STORE(C->dp[21]);
+ /* 22 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[54]); MULADD(at[1], at[53]); MULADD(at[2], at[52]); MULADD(at[3], at[51]); MULADD(at[4], at[50]); MULADD(at[5], at[49]); MULADD(at[6], at[48]); MULADD(at[7], at[47]); MULADD(at[8], at[46]); MULADD(at[9], at[45]); MULADD(at[10], at[44]); MULADD(at[11], at[43]); MULADD(at[12], at[42]); MULADD(at[13], at[41]); MULADD(at[14], at[40]); MULADD(at[15], at[39]); MULADD(at[16], at[38]); MULADD(at[17], at[37]); MULADD(at[18], at[36]); MULADD(at[19], at[35]); MULADD(at[20], at[34]); MULADD(at[21], at[33]); MULADD(at[22], at[32]);
+ COMBA_STORE(C->dp[22]);
+ /* 23 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[55]); MULADD(at[1], at[54]); MULADD(at[2], at[53]); MULADD(at[3], at[52]); MULADD(at[4], at[51]); MULADD(at[5], at[50]); MULADD(at[6], at[49]); MULADD(at[7], at[48]); MULADD(at[8], at[47]); MULADD(at[9], at[46]); MULADD(at[10], at[45]); MULADD(at[11], at[44]); MULADD(at[12], at[43]); MULADD(at[13], at[42]); MULADD(at[14], at[41]); MULADD(at[15], at[40]); MULADD(at[16], at[39]); MULADD(at[17], at[38]); MULADD(at[18], at[37]); MULADD(at[19], at[36]); MULADD(at[20], at[35]); MULADD(at[21], at[34]); MULADD(at[22], at[33]); MULADD(at[23], at[32]);
+ COMBA_STORE(C->dp[23]);
+ /* 24 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[56]); MULADD(at[1], at[55]); MULADD(at[2], at[54]); MULADD(at[3], at[53]); MULADD(at[4], at[52]); MULADD(at[5], at[51]); MULADD(at[6], at[50]); MULADD(at[7], at[49]); MULADD(at[8], at[48]); MULADD(at[9], at[47]); MULADD(at[10], at[46]); MULADD(at[11], at[45]); MULADD(at[12], at[44]); MULADD(at[13], at[43]); MULADD(at[14], at[42]); MULADD(at[15], at[41]); MULADD(at[16], at[40]); MULADD(at[17], at[39]); MULADD(at[18], at[38]); MULADD(at[19], at[37]); MULADD(at[20], at[36]); MULADD(at[21], at[35]); MULADD(at[22], at[34]); MULADD(at[23], at[33]); MULADD(at[24], at[32]);
+ COMBA_STORE(C->dp[24]);
+ /* 25 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[57]); MULADD(at[1], at[56]); MULADD(at[2], at[55]); MULADD(at[3], at[54]); MULADD(at[4], at[53]); MULADD(at[5], at[52]); MULADD(at[6], at[51]); MULADD(at[7], at[50]); MULADD(at[8], at[49]); MULADD(at[9], at[48]); MULADD(at[10], at[47]); MULADD(at[11], at[46]); MULADD(at[12], at[45]); MULADD(at[13], at[44]); MULADD(at[14], at[43]); MULADD(at[15], at[42]); MULADD(at[16], at[41]); MULADD(at[17], at[40]); MULADD(at[18], at[39]); MULADD(at[19], at[38]); MULADD(at[20], at[37]); MULADD(at[21], at[36]); MULADD(at[22], at[35]); MULADD(at[23], at[34]); MULADD(at[24], at[33]); MULADD(at[25], at[32]);
+ COMBA_STORE(C->dp[25]);
+ /* 26 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[58]); MULADD(at[1], at[57]); MULADD(at[2], at[56]); MULADD(at[3], at[55]); MULADD(at[4], at[54]); MULADD(at[5], at[53]); MULADD(at[6], at[52]); MULADD(at[7], at[51]); MULADD(at[8], at[50]); MULADD(at[9], at[49]); MULADD(at[10], at[48]); MULADD(at[11], at[47]); MULADD(at[12], at[46]); MULADD(at[13], at[45]); MULADD(at[14], at[44]); MULADD(at[15], at[43]); MULADD(at[16], at[42]); MULADD(at[17], at[41]); MULADD(at[18], at[40]); MULADD(at[19], at[39]); MULADD(at[20], at[38]); MULADD(at[21], at[37]); MULADD(at[22], at[36]); MULADD(at[23], at[35]); MULADD(at[24], at[34]); MULADD(at[25], at[33]); MULADD(at[26], at[32]);
+ COMBA_STORE(C->dp[26]);
+ /* 27 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[59]); MULADD(at[1], at[58]); MULADD(at[2], at[57]); MULADD(at[3], at[56]); MULADD(at[4], at[55]); MULADD(at[5], at[54]); MULADD(at[6], at[53]); MULADD(at[7], at[52]); MULADD(at[8], at[51]); MULADD(at[9], at[50]); MULADD(at[10], at[49]); MULADD(at[11], at[48]); MULADD(at[12], at[47]); MULADD(at[13], at[46]); MULADD(at[14], at[45]); MULADD(at[15], at[44]); MULADD(at[16], at[43]); MULADD(at[17], at[42]); MULADD(at[18], at[41]); MULADD(at[19], at[40]); MULADD(at[20], at[39]); MULADD(at[21], at[38]); MULADD(at[22], at[37]); MULADD(at[23], at[36]); MULADD(at[24], at[35]); MULADD(at[25], at[34]); MULADD(at[26], at[33]); MULADD(at[27], at[32]);
+ COMBA_STORE(C->dp[27]);
+ /* 28 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[60]); MULADD(at[1], at[59]); MULADD(at[2], at[58]); MULADD(at[3], at[57]); MULADD(at[4], at[56]); MULADD(at[5], at[55]); MULADD(at[6], at[54]); MULADD(at[7], at[53]); MULADD(at[8], at[52]); MULADD(at[9], at[51]); MULADD(at[10], at[50]); MULADD(at[11], at[49]); MULADD(at[12], at[48]); MULADD(at[13], at[47]); MULADD(at[14], at[46]); MULADD(at[15], at[45]); MULADD(at[16], at[44]); MULADD(at[17], at[43]); MULADD(at[18], at[42]); MULADD(at[19], at[41]); MULADD(at[20], at[40]); MULADD(at[21], at[39]); MULADD(at[22], at[38]); MULADD(at[23], at[37]); MULADD(at[24], at[36]); MULADD(at[25], at[35]); MULADD(at[26], at[34]); MULADD(at[27], at[33]); MULADD(at[28], at[32]);
+ COMBA_STORE(C->dp[28]);
+ /* 29 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[61]); MULADD(at[1], at[60]); MULADD(at[2], at[59]); MULADD(at[3], at[58]); MULADD(at[4], at[57]); MULADD(at[5], at[56]); MULADD(at[6], at[55]); MULADD(at[7], at[54]); MULADD(at[8], at[53]); MULADD(at[9], at[52]); MULADD(at[10], at[51]); MULADD(at[11], at[50]); MULADD(at[12], at[49]); MULADD(at[13], at[48]); MULADD(at[14], at[47]); MULADD(at[15], at[46]); MULADD(at[16], at[45]); MULADD(at[17], at[44]); MULADD(at[18], at[43]); MULADD(at[19], at[42]); MULADD(at[20], at[41]); MULADD(at[21], at[40]); MULADD(at[22], at[39]); MULADD(at[23], at[38]); MULADD(at[24], at[37]); MULADD(at[25], at[36]); MULADD(at[26], at[35]); MULADD(at[27], at[34]); MULADD(at[28], at[33]); MULADD(at[29], at[32]);
+ COMBA_STORE(C->dp[29]);
+ /* 30 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[62]); MULADD(at[1], at[61]); MULADD(at[2], at[60]); MULADD(at[3], at[59]); MULADD(at[4], at[58]); MULADD(at[5], at[57]); MULADD(at[6], at[56]); MULADD(at[7], at[55]); MULADD(at[8], at[54]); MULADD(at[9], at[53]); MULADD(at[10], at[52]); MULADD(at[11], at[51]); MULADD(at[12], at[50]); MULADD(at[13], at[49]); MULADD(at[14], at[48]); MULADD(at[15], at[47]); MULADD(at[16], at[46]); MULADD(at[17], at[45]); MULADD(at[18], at[44]); MULADD(at[19], at[43]); MULADD(at[20], at[42]); MULADD(at[21], at[41]); MULADD(at[22], at[40]); MULADD(at[23], at[39]); MULADD(at[24], at[38]); MULADD(at[25], at[37]); MULADD(at[26], at[36]); MULADD(at[27], at[35]); MULADD(at[28], at[34]); MULADD(at[29], at[33]); MULADD(at[30], at[32]);
+ COMBA_STORE(C->dp[30]);
+ /* 31 */
+ COMBA_FORWARD;
+ MULADD(at[0], at[63]); MULADD(at[1], at[62]); MULADD(at[2], at[61]); MULADD(at[3], at[60]); MULADD(at[4], at[59]); MULADD(at[5], at[58]); MULADD(at[6], at[57]); MULADD(at[7], at[56]); MULADD(at[8], at[55]); MULADD(at[9], at[54]); MULADD(at[10], at[53]); MULADD(at[11], at[52]); MULADD(at[12], at[51]); MULADD(at[13], at[50]); MULADD(at[14], at[49]); MULADD(at[15], at[48]); MULADD(at[16], at[47]); MULADD(at[17], at[46]); MULADD(at[18], at[45]); MULADD(at[19], at[44]); MULADD(at[20], at[43]); MULADD(at[21], at[42]); MULADD(at[22], at[41]); MULADD(at[23], at[40]); MULADD(at[24], at[39]); MULADD(at[25], at[38]); MULADD(at[26], at[37]); MULADD(at[27], at[36]); MULADD(at[28], at[35]); MULADD(at[29], at[34]); MULADD(at[30], at[33]); MULADD(at[31], at[32]);
+ COMBA_STORE(C->dp[31]);
+ /* From here on each column has one term fewer: the lowest A index rises by one per column while the B index stays capped at at[63]. */
+ /* 32 */
+ COMBA_FORWARD;
+ MULADD(at[1], at[63]); MULADD(at[2], at[62]); MULADD(at[3], at[61]); MULADD(at[4], at[60]); MULADD(at[5], at[59]); MULADD(at[6], at[58]); MULADD(at[7], at[57]); MULADD(at[8], at[56]); MULADD(at[9], at[55]); MULADD(at[10], at[54]); MULADD(at[11], at[53]); MULADD(at[12], at[52]); MULADD(at[13], at[51]); MULADD(at[14], at[50]); MULADD(at[15], at[49]); MULADD(at[16], at[48]); MULADD(at[17], at[47]); MULADD(at[18], at[46]); MULADD(at[19], at[45]); MULADD(at[20], at[44]); MULADD(at[21], at[43]); MULADD(at[22], at[42]); MULADD(at[23], at[41]); MULADD(at[24], at[40]); MULADD(at[25], at[39]); MULADD(at[26], at[38]); MULADD(at[27], at[37]); MULADD(at[28], at[36]); MULADD(at[29], at[35]); MULADD(at[30], at[34]); MULADD(at[31], at[33]);
+ COMBA_STORE(C->dp[32]);
+ /* 33 */
+ COMBA_FORWARD;
+ MULADD(at[2], at[63]); MULADD(at[3], at[62]); MULADD(at[4], at[61]); MULADD(at[5], at[60]); MULADD(at[6], at[59]); MULADD(at[7], at[58]); MULADD(at[8], at[57]); MULADD(at[9], at[56]); MULADD(at[10], at[55]); MULADD(at[11], at[54]); MULADD(at[12], at[53]); MULADD(at[13], at[52]); MULADD(at[14], at[51]); MULADD(at[15], at[50]); MULADD(at[16], at[49]); MULADD(at[17], at[48]); MULADD(at[18], at[47]); MULADD(at[19], at[46]); MULADD(at[20], at[45]); MULADD(at[21], at[44]); MULADD(at[22], at[43]); MULADD(at[23], at[42]); MULADD(at[24], at[41]); MULADD(at[25], at[40]); MULADD(at[26], at[39]); MULADD(at[27], at[38]); MULADD(at[28], at[37]); MULADD(at[29], at[36]); MULADD(at[30], at[35]); MULADD(at[31], at[34]);
+ COMBA_STORE(C->dp[33]);
+ /* 34 */
+ COMBA_FORWARD;
+ MULADD(at[3], at[63]); MULADD(at[4], at[62]); MULADD(at[5], at[61]); MULADD(at[6], at[60]); MULADD(at[7], at[59]); MULADD(at[8], at[58]); MULADD(at[9], at[57]); MULADD(at[10], at[56]); MULADD(at[11], at[55]); MULADD(at[12], at[54]); MULADD(at[13], at[53]); MULADD(at[14], at[52]); MULADD(at[15], at[51]); MULADD(at[16], at[50]); MULADD(at[17], at[49]); MULADD(at[18], at[48]); MULADD(at[19], at[47]); MULADD(at[20], at[46]); MULADD(at[21], at[45]); MULADD(at[22], at[44]); MULADD(at[23], at[43]); MULADD(at[24], at[42]); MULADD(at[25], at[41]); MULADD(at[26], at[40]); MULADD(at[27], at[39]); MULADD(at[28], at[38]); MULADD(at[29], at[37]); MULADD(at[30], at[36]); MULADD(at[31], at[35]);
+ COMBA_STORE(C->dp[34]);
+ /* 35 */
+ COMBA_FORWARD;
+ MULADD(at[4], at[63]); MULADD(at[5], at[62]); MULADD(at[6], at[61]); MULADD(at[7], at[60]); MULADD(at[8], at[59]); MULADD(at[9], at[58]); MULADD(at[10], at[57]); MULADD(at[11], at[56]); MULADD(at[12], at[55]); MULADD(at[13], at[54]); MULADD(at[14], at[53]); MULADD(at[15], at[52]); MULADD(at[16], at[51]); MULADD(at[17], at[50]); MULADD(at[18], at[49]); MULADD(at[19], at[48]); MULADD(at[20], at[47]); MULADD(at[21], at[46]); MULADD(at[22], at[45]); MULADD(at[23], at[44]); MULADD(at[24], at[43]); MULADD(at[25], at[42]); MULADD(at[26], at[41]); MULADD(at[27], at[40]); MULADD(at[28], at[39]); MULADD(at[29], at[38]); MULADD(at[30], at[37]); MULADD(at[31], at[36]);
+ COMBA_STORE(C->dp[35]);
+ /* 36 */
+ COMBA_FORWARD;
+ MULADD(at[5], at[63]); MULADD(at[6], at[62]); MULADD(at[7], at[61]); MULADD(at[8], at[60]); MULADD(at[9], at[59]); MULADD(at[10], at[58]); MULADD(at[11], at[57]); MULADD(at[12], at[56]); MULADD(at[13], at[55]); MULADD(at[14], at[54]); MULADD(at[15], at[53]); MULADD(at[16], at[52]); MULADD(at[17], at[51]); MULADD(at[18], at[50]); MULADD(at[19], at[49]); MULADD(at[20], at[48]); MULADD(at[21], at[47]); MULADD(at[22], at[46]); MULADD(at[23], at[45]); MULADD(at[24], at[44]); MULADD(at[25], at[43]); MULADD(at[26], at[42]); MULADD(at[27], at[41]); MULADD(at[28], at[40]); MULADD(at[29], at[39]); MULADD(at[30], at[38]); MULADD(at[31], at[37]);
+ COMBA_STORE(C->dp[36]);
+ /* 37 */
+ COMBA_FORWARD;
+ MULADD(at[6], at[63]); MULADD(at[7], at[62]); MULADD(at[8], at[61]); MULADD(at[9], at[60]); MULADD(at[10], at[59]); MULADD(at[11], at[58]); MULADD(at[12], at[57]); MULADD(at[13], at[56]); MULADD(at[14], at[55]); MULADD(at[15], at[54]); MULADD(at[16], at[53]); MULADD(at[17], at[52]); MULADD(at[18], at[51]); MULADD(at[19], at[50]); MULADD(at[20], at[49]); MULADD(at[21], at[48]); MULADD(at[22], at[47]); MULADD(at[23], at[46]); MULADD(at[24], at[45]); MULADD(at[25], at[44]); MULADD(at[26], at[43]); MULADD(at[27], at[42]); MULADD(at[28], at[41]); MULADD(at[29], at[40]); MULADD(at[30], at[39]); MULADD(at[31], at[38]);
+ COMBA_STORE(C->dp[37]);
+ /* 38 */
+ COMBA_FORWARD;
+ MULADD(at[7], at[63]); MULADD(at[8], at[62]); MULADD(at[9], at[61]); MULADD(at[10], at[60]); MULADD(at[11], at[59]); MULADD(at[12], at[58]); MULADD(at[13], at[57]); MULADD(at[14], at[56]); MULADD(at[15], at[55]); MULADD(at[16], at[54]); MULADD(at[17], at[53]); MULADD(at[18], at[52]); MULADD(at[19], at[51]); MULADD(at[20], at[50]); MULADD(at[21], at[49]); MULADD(at[22], at[48]); MULADD(at[23], at[47]); MULADD(at[24], at[46]); MULADD(at[25], at[45]); MULADD(at[26], at[44]); MULADD(at[27], at[43]); MULADD(at[28], at[42]); MULADD(at[29], at[41]); MULADD(at[30], at[40]); MULADD(at[31], at[39]);
+ COMBA_STORE(C->dp[38]);
+
+ /* early out at 40 digits, 40*32==1280, or two 640 bit operands */
+ /* Each early out finishes like the end of the function: COMBA_STORE2 writes the last digit (presumably the leftover carry), then used/sign are set and C is clamped. */
+ if (out_size <= 40) { COMBA_STORE2(C->dp[39]); C->used = 40; C->sign = A->sign ^ B->sign; pstm_clamp(C); COMBA_FINI; return PSTM_OKAY; }
+
+ /* 39 */
+ COMBA_FORWARD;
+ MULADD(at[8], at[63]); MULADD(at[9], at[62]); MULADD(at[10], at[61]); MULADD(at[11], at[60]); MULADD(at[12], at[59]); MULADD(at[13], at[58]); MULADD(at[14], at[57]); MULADD(at[15], at[56]); MULADD(at[16], at[55]); MULADD(at[17], at[54]); MULADD(at[18], at[53]); MULADD(at[19], at[52]); MULADD(at[20], at[51]); MULADD(at[21], at[50]); MULADD(at[22], at[49]); MULADD(at[23], at[48]); MULADD(at[24], at[47]); MULADD(at[25], at[46]); MULADD(at[26], at[45]); MULADD(at[27], at[44]); MULADD(at[28], at[43]); MULADD(at[29], at[42]); MULADD(at[30], at[41]); MULADD(at[31], at[40]);
+ COMBA_STORE(C->dp[39]);
+ /* 40 */
+ COMBA_FORWARD;
+ MULADD(at[9], at[63]); MULADD(at[10], at[62]); MULADD(at[11], at[61]); MULADD(at[12], at[60]); MULADD(at[13], at[59]); MULADD(at[14], at[58]); MULADD(at[15], at[57]); MULADD(at[16], at[56]); MULADD(at[17], at[55]); MULADD(at[18], at[54]); MULADD(at[19], at[53]); MULADD(at[20], at[52]); MULADD(at[21], at[51]); MULADD(at[22], at[50]); MULADD(at[23], at[49]); MULADD(at[24], at[48]); MULADD(at[25], at[47]); MULADD(at[26], at[46]); MULADD(at[27], at[45]); MULADD(at[28], at[44]); MULADD(at[29], at[43]); MULADD(at[30], at[42]); MULADD(at[31], at[41]);
+ COMBA_STORE(C->dp[40]);
+ /* 41 */
+ COMBA_FORWARD;
+ MULADD(at[10], at[63]); MULADD(at[11], at[62]); MULADD(at[12], at[61]); MULADD(at[13], at[60]); MULADD(at[14], at[59]); MULADD(at[15], at[58]); MULADD(at[16], at[57]); MULADD(at[17], at[56]); MULADD(at[18], at[55]); MULADD(at[19], at[54]); MULADD(at[20], at[53]); MULADD(at[21], at[52]); MULADD(at[22], at[51]); MULADD(at[23], at[50]); MULADD(at[24], at[49]); MULADD(at[25], at[48]); MULADD(at[26], at[47]); MULADD(at[27], at[46]); MULADD(at[28], at[45]); MULADD(at[29], at[44]); MULADD(at[30], at[43]); MULADD(at[31], at[42]);
+ COMBA_STORE(C->dp[41]);
+ /* 42 */
+ COMBA_FORWARD;
+ MULADD(at[11], at[63]); MULADD(at[12], at[62]); MULADD(at[13], at[61]); MULADD(at[14], at[60]); MULADD(at[15], at[59]); MULADD(at[16], at[58]); MULADD(at[17], at[57]); MULADD(at[18], at[56]); MULADD(at[19], at[55]); MULADD(at[20], at[54]); MULADD(at[21], at[53]); MULADD(at[22], at[52]); MULADD(at[23], at[51]); MULADD(at[24], at[50]); MULADD(at[25], at[49]); MULADD(at[26], at[48]); MULADD(at[27], at[47]); MULADD(at[28], at[46]); MULADD(at[29], at[45]); MULADD(at[30], at[44]); MULADD(at[31], at[43]);
+ COMBA_STORE(C->dp[42]);
+ /* 43 */
+ COMBA_FORWARD;
+ MULADD(at[12], at[63]); MULADD(at[13], at[62]); MULADD(at[14], at[61]); MULADD(at[15], at[60]); MULADD(at[16], at[59]); MULADD(at[17], at[58]); MULADD(at[18], at[57]); MULADD(at[19], at[56]); MULADD(at[20], at[55]); MULADD(at[21], at[54]); MULADD(at[22], at[53]); MULADD(at[23], at[52]); MULADD(at[24], at[51]); MULADD(at[25], at[50]); MULADD(at[26], at[49]); MULADD(at[27], at[48]); MULADD(at[28], at[47]); MULADD(at[29], at[46]); MULADD(at[30], at[45]); MULADD(at[31], at[44]);
+ COMBA_STORE(C->dp[43]);
+ /* 44 */
+ COMBA_FORWARD;
+ MULADD(at[13], at[63]); MULADD(at[14], at[62]); MULADD(at[15], at[61]); MULADD(at[16], at[60]); MULADD(at[17], at[59]); MULADD(at[18], at[58]); MULADD(at[19], at[57]); MULADD(at[20], at[56]); MULADD(at[21], at[55]); MULADD(at[22], at[54]); MULADD(at[23], at[53]); MULADD(at[24], at[52]); MULADD(at[25], at[51]); MULADD(at[26], at[50]); MULADD(at[27], at[49]); MULADD(at[28], at[48]); MULADD(at[29], at[47]); MULADD(at[30], at[46]); MULADD(at[31], at[45]);
+ COMBA_STORE(C->dp[44]);
+ /* 45 */
+ COMBA_FORWARD;
+ MULADD(at[14], at[63]); MULADD(at[15], at[62]); MULADD(at[16], at[61]); MULADD(at[17], at[60]); MULADD(at[18], at[59]); MULADD(at[19], at[58]); MULADD(at[20], at[57]); MULADD(at[21], at[56]); MULADD(at[22], at[55]); MULADD(at[23], at[54]); MULADD(at[24], at[53]); MULADD(at[25], at[52]); MULADD(at[26], at[51]); MULADD(at[27], at[50]); MULADD(at[28], at[49]); MULADD(at[29], at[48]); MULADD(at[30], at[47]); MULADD(at[31], at[46]);
+ COMBA_STORE(C->dp[45]);
+ /* 46 */
+ COMBA_FORWARD;
+ MULADD(at[15], at[63]); MULADD(at[16], at[62]); MULADD(at[17], at[61]); MULADD(at[18], at[60]); MULADD(at[19], at[59]); MULADD(at[20], at[58]); MULADD(at[21], at[57]); MULADD(at[22], at[56]); MULADD(at[23], at[55]); MULADD(at[24], at[54]); MULADD(at[25], at[53]); MULADD(at[26], at[52]); MULADD(at[27], at[51]); MULADD(at[28], at[50]); MULADD(at[29], at[49]); MULADD(at[30], at[48]); MULADD(at[31], at[47]);
+ COMBA_STORE(C->dp[46]);
+
+ /* early out at 48 digits, 48*32==1536, or two 768 bit operands */
+ if (out_size <= 48) { COMBA_STORE2(C->dp[47]); C->used = 48; C->sign = A->sign ^ B->sign; pstm_clamp(C); COMBA_FINI; return PSTM_OKAY; }
+
+ /* 47 */
+ COMBA_FORWARD;
+ MULADD(at[16], at[63]); MULADD(at[17], at[62]); MULADD(at[18], at[61]); MULADD(at[19], at[60]); MULADD(at[20], at[59]); MULADD(at[21], at[58]); MULADD(at[22], at[57]); MULADD(at[23], at[56]); MULADD(at[24], at[55]); MULADD(at[25], at[54]); MULADD(at[26], at[53]); MULADD(at[27], at[52]); MULADD(at[28], at[51]); MULADD(at[29], at[50]); MULADD(at[30], at[49]); MULADD(at[31], at[48]);
+ COMBA_STORE(C->dp[47]);
+ /* 48 */
+ COMBA_FORWARD;
+ MULADD(at[17], at[63]); MULADD(at[18], at[62]); MULADD(at[19], at[61]); MULADD(at[20], at[60]); MULADD(at[21], at[59]); MULADD(at[22], at[58]); MULADD(at[23], at[57]); MULADD(at[24], at[56]); MULADD(at[25], at[55]); MULADD(at[26], at[54]); MULADD(at[27], at[53]); MULADD(at[28], at[52]); MULADD(at[29], at[51]); MULADD(at[30], at[50]); MULADD(at[31], at[49]);
+ COMBA_STORE(C->dp[48]);
+ /* 49 */
+ COMBA_FORWARD;
+ MULADD(at[18], at[63]); MULADD(at[19], at[62]); MULADD(at[20], at[61]); MULADD(at[21], at[60]); MULADD(at[22], at[59]); MULADD(at[23], at[58]); MULADD(at[24], at[57]); MULADD(at[25], at[56]); MULADD(at[26], at[55]); MULADD(at[27], at[54]); MULADD(at[28], at[53]); MULADD(at[29], at[52]); MULADD(at[30], at[51]); MULADD(at[31], at[50]);
+ COMBA_STORE(C->dp[49]);
+ /* 50 */
+ COMBA_FORWARD;
+ MULADD(at[19], at[63]); MULADD(at[20], at[62]); MULADD(at[21], at[61]); MULADD(at[22], at[60]); MULADD(at[23], at[59]); MULADD(at[24], at[58]); MULADD(at[25], at[57]); MULADD(at[26], at[56]); MULADD(at[27], at[55]); MULADD(at[28], at[54]); MULADD(at[29], at[53]); MULADD(at[30], at[52]); MULADD(at[31], at[51]);
+ COMBA_STORE(C->dp[50]);
+ /* 51 */
+ COMBA_FORWARD;
+ MULADD(at[20], at[63]); MULADD(at[21], at[62]); MULADD(at[22], at[61]); MULADD(at[23], at[60]); MULADD(at[24], at[59]); MULADD(at[25], at[58]); MULADD(at[26], at[57]); MULADD(at[27], at[56]); MULADD(at[28], at[55]); MULADD(at[29], at[54]); MULADD(at[30], at[53]); MULADD(at[31], at[52]);
+ COMBA_STORE(C->dp[51]);
+ /* 52 */
+ COMBA_FORWARD;
+ MULADD(at[21], at[63]); MULADD(at[22], at[62]); MULADD(at[23], at[61]); MULADD(at[24], at[60]); MULADD(at[25], at[59]); MULADD(at[26], at[58]); MULADD(at[27], at[57]); MULADD(at[28], at[56]); MULADD(at[29], at[55]); MULADD(at[30], at[54]); MULADD(at[31], at[53]);
+ COMBA_STORE(C->dp[52]);
+ /* 53 */
+ COMBA_FORWARD;
+ MULADD(at[22], at[63]); MULADD(at[23], at[62]); MULADD(at[24], at[61]); MULADD(at[25], at[60]); MULADD(at[26], at[59]); MULADD(at[27], at[58]); MULADD(at[28], at[57]); MULADD(at[29], at[56]); MULADD(at[30], at[55]); MULADD(at[31], at[54]);
+ COMBA_STORE(C->dp[53]);
+ /* 54 */
+ COMBA_FORWARD;
+ MULADD(at[23], at[63]); MULADD(at[24], at[62]); MULADD(at[25], at[61]); MULADD(at[26], at[60]); MULADD(at[27], at[59]); MULADD(at[28], at[58]); MULADD(at[29], at[57]); MULADD(at[30], at[56]); MULADD(at[31], at[55]);
+ COMBA_STORE(C->dp[54]);
+
+ /* early out at 56 digits, 56*32==1792, or two 896 bit operands */
+ if (out_size <= 56) { COMBA_STORE2(C->dp[55]); C->used = 56; C->sign = A->sign ^ B->sign; pstm_clamp(C); COMBA_FINI; return PSTM_OKAY; }
+
+ /* 55 */
+ COMBA_FORWARD;
+ MULADD(at[24], at[63]); MULADD(at[25], at[62]); MULADD(at[26], at[61]); MULADD(at[27], at[60]); MULADD(at[28], at[59]); MULADD(at[29], at[58]); MULADD(at[30], at[57]); MULADD(at[31], at[56]);
+ COMBA_STORE(C->dp[55]);
+ /* 56 */
+ COMBA_FORWARD;
+ MULADD(at[25], at[63]); MULADD(at[26], at[62]); MULADD(at[27], at[61]); MULADD(at[28], at[60]); MULADD(at[29], at[59]); MULADD(at[30], at[58]); MULADD(at[31], at[57]);
+ COMBA_STORE(C->dp[56]);
+ /* 57 */
+ COMBA_FORWARD;
+ MULADD(at[26], at[63]); MULADD(at[27], at[62]); MULADD(at[28], at[61]); MULADD(at[29], at[60]); MULADD(at[30], at[59]); MULADD(at[31], at[58]);
+ COMBA_STORE(C->dp[57]);
+ /* 58 */
+ COMBA_FORWARD;
+ MULADD(at[27], at[63]); MULADD(at[28], at[62]); MULADD(at[29], at[61]); MULADD(at[30], at[60]); MULADD(at[31], at[59]);
+ COMBA_STORE(C->dp[58]);
+ /* 59 */
+ COMBA_FORWARD;
+ MULADD(at[28], at[63]); MULADD(at[29], at[62]); MULADD(at[30], at[61]); MULADD(at[31], at[60]);
+ COMBA_STORE(C->dp[59]);
+ /* 60 */
+ COMBA_FORWARD;
+ MULADD(at[29], at[63]); MULADD(at[30], at[62]); MULADD(at[31], at[61]);
+ COMBA_STORE(C->dp[60]);
+ /* 61 */
+ COMBA_FORWARD;
+ MULADD(at[30], at[63]); MULADD(at[31], at[62]);
+ COMBA_STORE(C->dp[61]);
+ /* 62 */
+ COMBA_FORWARD;
+ MULADD(at[31], at[63]);
+ COMBA_STORE(C->dp[62]);
+ COMBA_STORE2(C->dp[63]);
+ C->used = 64;
+ C->sign = A->sign ^ B->sign;
+ pstm_clamp(C);
+ COMBA_FINI;
+ return PSTM_OKAY;
+}
+#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
+
+/******************************************************************************/
+
+/*
+ * Multiply entry point: picks an unrolled routine when one is compiled in
+ * and both operands have exactly the matching digit count, otherwise
+ * defers to pstm_mul_comba_gen().
+ *
+ *   A->used == 16 && B->used == 16  -> pstm_mul_comba16()
+ *       (only with USE_1024_KEY_SPEED_OPTIMIZATIONS)
+ *   A->used == 32 && B->used == 32  -> pstm_mul_comba32()
+ *       (only with USE_2048_KEY_SPEED_OPTIMIZATIONS)
+ *   anything else                   -> pstm_mul_comba_gen()
+ *
+ * pool, paD and paDlen are passed through untouched to the generic
+ * routine; the unrolled routines do not receive them.
+ * Returns whatever the selected routine returns.
+ */
+int32 pstm_mul_comba(psPool_t *pool, pstm_int *A, pstm_int *B, pstm_int *C,
+		pstm_digit *paD, uint32 paDlen)
+{
+#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
+	if (A->used == 16 && B->used == 16) {
+		return pstm_mul_comba16(A, B, C);
+	}
+#endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */
+#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
+	if (A->used == 32 && B->used == 32) {
+		return pstm_mul_comba32(A, B, C);
+	}
+#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
+	/* No specialised size matched (or none is compiled in). */
+	return pstm_mul_comba_gen(pool, A, B, C, paD, paDlen);
+}
+
+#endif /* !DISABLE_PSTM */
+/******************************************************************************/
diff --git a/networking/tls_pstm_sqr_comba.c b/networking/tls_pstm_sqr_comba.c
new file mode 100644
index 000000000..98186d31f
--- /dev/null
+++ b/networking/tls_pstm_sqr_comba.c
@@ -0,0 +1,1107 @@
+/*
+ * Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+#include "tls.h"
+
+/**
+ * @file pstm_sqr_comba.c
+ * @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
+ *
+ * Multiprecision Squaring with Comba technique.
+ */
+/*
+ * Copyright (c) 2013-2015 INSIDE Secure Corporation
+ * Copyright (c) PeerSec Networks, 2002-2011
+ * All Rights Reserved
+ *
+ * The latest version of this code is available at http://www.matrixssl.org
+ *
+ * This software is open source; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This General Public License does NOT permit incorporating this software
+ * into proprietary programs. If you are unable to comply with the GPL, a
+ * commercial license for this software may be purchased from INSIDE at
+ * http://www.insidesecure.com/eng/Company/Locations
+ *
+ * This program is distributed in WITHOUT ANY WARRANTY; without even the
+ * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+/******************************************************************************/
+
+///bbox
+//#include "../cryptoApi.h"
+#ifndef DISABLE_PSTM
+
+/******************************************************************************/
+#if defined(PSTM_X86)
+/* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
+#if !defined(__GNUC__) || !defined(__i386__)
+#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
+#endif
+//#pragma message ("Using 32 bit x86 Assembly Optimizations")
+
+/*
+ * Column-accumulator helpers for the comba squarer, 32-bit x86 inline asm.
+ *
+ * The macros operate on variables named c0, c1, c2 and sc0, sc1, sc2 that
+ * must exist in the scope where they are expanded. Each triple is used as a
+ * three-digit accumulator with c0 (sc0) the least significant digit.
+ * Every macro body already ends in ';', so "MACRO;" at the use site leaves
+ * a harmless empty statement behind.
+ */
+#define COMBA_START
+
+/* zero the main accumulator */
+#define CLEAR_CARRY \
+ c0 = c1 = c2 = 0;
+
+/* store the lowest accumulator digit (the finished column) into x */
+#define COMBA_STORE(x) \
+ x = c0;
+
+/* store the middle accumulator digit into x (used for the final output digit) */
+#define COMBA_STORE2(x) \
+ x = c1;
+
+/* shift the accumulator down one digit to start the next column */
+#define CARRY_FORWARD \
+ do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_FINI
+
+/* c2:c1:c0 += i * i (the j argument is not referenced by this variant) */
+#define SQRADD(i, j) \
+asm( \
+ "movl %6,%%eax \n\t" \
+ "mull %%eax \n\t" \
+ "addl %%eax,%0 \n\t" \
+ "adcl %%edx,%1 \n\t" \
+ "adcl $0,%2 \n\t" \
+ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
+
+/* c2:c1:c0 += 2 * (i * j): the product is computed once and added twice */
+#define SQRADD2(i, j) \
+asm( \
+ "movl %6,%%eax \n\t" \
+ "mull %7 \n\t" \
+ "addl %%eax,%0 \n\t" \
+ "adcl %%edx,%1 \n\t" \
+ "adcl $0,%2 \n\t" \
+ "addl %%eax,%0 \n\t" \
+ "adcl %%edx,%1 \n\t" \
+ "adcl $0,%2 \n\t" \
+ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
+
+/* start a secondary sum: sc1:sc0 = i * j, sc2 = 0 */
+#define SQRADDSC(i, j) \
+asm( \
+ "movl %6,%%eax \n\t" \
+ "mull %7 \n\t" \
+ "movl %%eax,%0 \n\t" \
+ "movl %%edx,%1 \n\t" \
+ "xorl %2,%2 \n\t" \
+ :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
+
+/* continue the secondary sum: sc2:sc1:sc0 += i * j */
+#define SQRADDAC(i, j) \
+asm( \
+ "movl %6,%%eax \n\t" \
+ "mull %7 \n\t" \
+ "addl %%eax,%0 \n\t" \
+ "adcl %%edx,%1 \n\t" \
+ "adcl $0,%2 \n\t" \
+ :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
+
+/* fold the secondary sum into the main one, doubled: c2:c1:c0 += 2 * sc2:sc1:sc0 */
+#define SQRADDDB \
+asm( \
+ "addl %6,%0 \n\t" \
+ "adcl %7,%1 \n\t" \
+ "adcl %8,%2 \n\t" \
+ "addl %6,%0 \n\t" \
+ "adcl %7,%1 \n\t" \
+ "adcl %8,%2 \n\t" \
+ :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "%cc");
+
+/******************************************************************************/
+#elif defined(PSTM_X86_64)
+/* x86-64 optimized */
+#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
+#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
+#endif
+//#pragma message ("Using 64 bit x86_64 Assembly Optimizations")
+
+/*
+ * Column-accumulator helpers for the comba squarer, x86-64 inline asm.
+ *
+ * The macros operate on variables named c0, c1, c2 and sc0, sc1, sc2 that
+ * must exist in the scope where they are expanded. Each triple is used as a
+ * three-digit accumulator with c0 (sc0) the least significant digit.
+ * Every macro body already ends in ';'.
+ */
+#define COMBA_START
+
+/* zero the main accumulator */
+#define CLEAR_CARRY \
+c0 = c1 = c2 = 0;
+
+/* store the lowest accumulator digit (the finished column) into x */
+#define COMBA_STORE(x) \
+x = c0;
+
+/* store the middle accumulator digit into x (used for the final output digit) */
+#define COMBA_STORE2(x) \
+x = c1;
+
+/* shift the accumulator down one digit to start the next column */
+#define CARRY_FORWARD \
+do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_FINI
+
+/* c2:c1:c0 += i * i (the j argument is not referenced by this variant) */
+#define SQRADD(i, j) \
+asm( \
+ "movq %6,%%rax \n\t" \
+ "mulq %%rax \n\t" \
+ "addq %%rax,%0 \n\t" \
+ "adcq %%rdx,%1 \n\t" \
+ "adcq $0,%2 \n\t" \
+ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc");
+
+/* c2:c1:c0 += 2 * (i * j): the product is computed once and added twice */
+#define SQRADD2(i, j) \
+asm( \
+ "movq %6,%%rax \n\t" \
+ "mulq %7 \n\t" \
+ "addq %%rax,%0 \n\t" \
+ "adcq %%rdx,%1 \n\t" \
+ "adcq $0,%2 \n\t" \
+ "addq %%rax,%0 \n\t" \
+ "adcq %%rdx,%1 \n\t" \
+ "adcq $0,%2 \n\t" \
+ :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
+
+/* start a secondary sum: sc1:sc0 = i * j, sc2 = 0 */
+#define SQRADDSC(i, j) \
+asm( \
+ "movq %6,%%rax \n\t" \
+ "mulq %7 \n\t" \
+ "movq %%rax,%0 \n\t" \
+ "movq %%rdx,%1 \n\t" \
+ "xorq %2,%2 \n\t" \
+ :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
+
+/* continue the secondary sum: sc2:sc1:sc0 += i * j */
+#define SQRADDAC(i, j) \
+asm( \
+ "movq %6,%%rax \n\t" \
+ "mulq %7 \n\t" \
+ "addq %%rax,%0 \n\t" \
+ "adcq %%rdx,%1 \n\t" \
+ "adcq $0,%2 \n\t" \
+ :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
+
+/* fold the secondary sum into the main one, doubled: c2:c1:c0 += 2 * sc2:sc1:sc0 */
+#define SQRADDDB \
+asm( \
+ "addq %6,%0 \n\t" \
+ "adcq %7,%1 \n\t" \
+ "adcq %8,%2 \n\t" \
+ "addq %6,%0 \n\t" \
+ "adcq %7,%1 \n\t" \
+ "adcq %8,%2 \n\t" \
+ :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
+
+/******************************************************************************/
+#elif defined(PSTM_ARM)
+/* ARM code */
+//#pragma message ("Using 32 bit ARM Assembly Optimizations")
+
+/*
+ * Column-accumulator helpers for the comba squarer, 32-bit ARM inline asm.
+ *
+ * The macros operate on variables named c0, c1, c2 and sc0, sc1, sc2 that
+ * must exist in the scope where they are expanded. Each triple is used as a
+ * three-digit accumulator with c0 (sc0) the least significant digit.
+ * r0 and r1 hold the UMULL result and are listed as clobbered where used.
+ */
+#define COMBA_START
+
+/* zero the main accumulator */
+#define CLEAR_CARRY \
+c0 = c1 = c2 = 0;
+
+/* store the lowest accumulator digit (the finished column) into x */
+#define COMBA_STORE(x) \
+x = c0;
+
+/* store the middle accumulator digit into x (used for the final output digit) */
+#define COMBA_STORE2(x) \
+x = c1;
+
+/* shift the accumulator down one digit to start the next column */
+#define CARRY_FORWARD \
+do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_FINI
+
+/* c2:c1:c0 += i * i (the j argument is not referenced by this variant) */
+#define SQRADD(i, j) \
+asm( \
+" UMULL r0,r1,%6,%6 \n\t" \
+" ADDS %0,%0,r0 \n\t" \
+" ADCS %1,%1,r1 \n\t" \
+" ADC %2,%2,#0 \n\t" \
+:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "%cc");
+
+/* c2:c1:c0 += 2 * (i * j): the product is computed once and added twice */
+#define SQRADD2(i, j) \
+asm( \
+" UMULL r0,r1,%6,%7 \n\t" \
+" ADDS %0,%0,r0 \n\t" \
+" ADCS %1,%1,r1 \n\t" \
+" ADC %2,%2,#0 \n\t" \
+" ADDS %0,%0,r0 \n\t" \
+" ADCS %1,%1,r1 \n\t" \
+" ADC %2,%2,#0 \n\t" \
+:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");
+
+/* start a secondary sum: sc1:sc0 = i * j, sc2 = 0 */
+#define SQRADDSC(i, j) \
+asm( \
+" UMULL %0,%1,%6,%7 \n\t" \
+" SUB %2,%2,%2 \n\t" \
+:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "%cc");
+
+/* continue the secondary sum: sc2:sc1:sc0 += i * j */
+#define SQRADDAC(i, j) \
+asm( \
+" UMULL r0,r1,%6,%7 \n\t" \
+" ADDS %0,%0,r0 \n\t" \
+" ADCS %1,%1,r1 \n\t" \
+" ADC %2,%2,#0 \n\t" \
+:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "%cc");
+
+/*
+ * Fold the secondary sum into the main one, doubled:
+ * c2:c1:c0 += 2 * sc2:sc1:sc0.
+ * Note the operand order: here %3..%5 are sc0..sc2 and %6..%8 are the tied
+ * c0..c2 inputs, unlike the other macros in this group.
+ */
+#define SQRADDDB \
+asm( \
+" ADDS %0,%0,%3 \n\t" \
+" ADCS %1,%1,%4 \n\t" \
+" ADC %2,%2,%5 \n\t" \
+" ADDS %0,%0,%3 \n\t" \
+" ADCS %1,%1,%4 \n\t" \
+" ADC %2,%2,%5 \n\t" \
+:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
+
+/******************************************************************************/
+#elif defined(PSTM_MIPS)
+/* MIPS32 */
+//#pragma message ("Using 32 bit MIPS Assembly Optimizations")
+
+/*
+ * Column-accumulator helpers for the comba squarer, MIPS32 inline asm.
+ *
+ * The macros operate on variables named c0, c1, c2 and sc0, sc1, sc2 that
+ * must exist in the scope where they are expanded. Each triple is used as a
+ * three-digit accumulator with c0 (sc0) the least significant digit.
+ * There is no carry flag: each carry is recovered with sltu (result < addend)
+ * into a scratch register ($10..$15, named in the clobber lists) and then
+ * added into the next digit.
+ */
+#define COMBA_START
+
+/* zero the main accumulator */
+#define CLEAR_CARRY \
+c0 = c1 = c2 = 0;
+
+/* store the lowest accumulator digit (the finished column) into x */
+#define COMBA_STORE(x) \
+x = c0;
+
+/* store the middle accumulator digit into x (used for the final output digit) */
+#define COMBA_STORE2(x) \
+x = c1;
+
+/* shift the accumulator down one digit to start the next column */
+#define CARRY_FORWARD \
+do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_FINI
+
+/* c2:c1:c0 += i * i (the j argument is not referenced by this variant) */
+#define SQRADD(i, j) \
+asm( \
+ " multu %6,%6 \n\t" \
+ " mflo $12 \n\t" \
+ " mfhi $13 \n\t" \
+ " addu %0,%0,$12 \n\t" \
+ " sltu $12,%0,$12 \n\t" \
+ " addu %1,%1,$13 \n\t" \
+ " sltu $13,%1,$13 \n\t" \
+ " addu %1,%1,$12 \n\t" \
+ " sltu $12,%1,$12 \n\t" \
+ " addu %2,%2,$13 \n\t" \
+ " addu %2,%2,$12 \n\t" \
+ :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"$12","$13");
+
+/*
+ * c2:c1:c0 += 2 * (i * j). The product is kept in $12/$13 and added twice;
+ * $14/$15 carry the intermediate carries so the product is not overwritten.
+ */
+#define SQRADD2(i, j) \
+asm( \
+ " multu %6,%7 \n\t" \
+ " mflo $12 \n\t" \
+ " mfhi $13 \n\t" \
+ \
+ " addu %0,%0,$12 \n\t" \
+ " sltu $14,%0,$12 \n\t" \
+ " addu %1,%1,$13 \n\t" \
+ " sltu $15,%1,$13 \n\t" \
+ " addu %1,%1,$14 \n\t" \
+ " sltu $14,%1,$14 \n\t" \
+ " addu %2,%2,$15 \n\t" \
+ " addu %2,%2,$14 \n\t" \
+ \
+ " addu %0,%0,$12 \n\t" \
+ " sltu $14,%0,$12 \n\t" \
+ " addu %1,%1,$13 \n\t" \
+ " sltu $15,%1,$13 \n\t" \
+ " addu %1,%1,$14 \n\t" \
+ " sltu $14,%1,$14 \n\t" \
+ " addu %2,%2,$15 \n\t" \
+ " addu %2,%2,$14 \n\t" \
+ :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12", "$13", "$14", "$15");
+
+/* start a secondary sum: sc1:sc0 = i * j, sc2 = 0 */
+#define SQRADDSC(i, j) \
+asm( \
+ " multu %6,%7 \n\t" \
+ " mflo %0 \n\t" \
+ " mfhi %1 \n\t" \
+ " xor %2,%2,%2 \n\t" \
+ :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "%cc");
+
+/*
+ * Continue the secondary sum: sc2:sc1:sc0 += i * j.
+ * NOTE(review): "$14" is in the clobber list but not referenced by the asm.
+ */
+#define SQRADDAC(i, j) \
+asm( \
+ " multu %6,%7 \n\t" \
+ " mflo $12 \n\t" \
+ " mfhi $13 \n\t" \
+ " addu %0,%0,$12 \n\t" \
+ " sltu $12,%0,$12 \n\t" \
+ " addu %1,%1,$13 \n\t" \
+ " sltu $13,%1,$13 \n\t" \
+ " addu %1,%1,$12 \n\t" \
+ " sltu $12,%1,$12 \n\t" \
+ " addu %2,%2,$13 \n\t" \
+ " addu %2,%2,$12 \n\t" \
+ :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"$12", "$13", "$14");
+
+/*
+ * Fold the secondary sum into the main one, doubled:
+ * c2:c1:c0 += 2 * sc2:sc1:sc0 (the same add sequence is issued twice).
+ * Here %3..%5 are sc0..sc2; $10/$11 hold the carries.
+ */
+#define SQRADDDB \
+asm( \
+ " addu %0,%0,%3 \n\t" \
+ " sltu $10,%0,%3 \n\t" \
+ " addu %1,%1,$10 \n\t" \
+ " sltu $10,%1,$10 \n\t" \
+ " addu %1,%1,%4 \n\t" \
+ " sltu $11,%1,%4 \n\t" \
+ " addu %2,%2,$10 \n\t" \
+ " addu %2,%2,$11 \n\t" \
+ " addu %2,%2,%5 \n\t" \
+ \
+ " addu %0,%0,%3 \n\t" \
+ " sltu $10,%0,%3 \n\t" \
+ " addu %1,%1,$10 \n\t" \
+ " sltu $10,%1,$10 \n\t" \
+ " addu %1,%1,%4 \n\t" \
+ " sltu $11,%1,%4 \n\t" \
+ " addu %2,%2,$10 \n\t" \
+ " addu %2,%2,$11 \n\t" \
+ " addu %2,%2,%5 \n\t" \
+ :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "$10", "$11");
+
+#else
+/******************************************************************************/
+#define PSTM_ISO
+/* ISO C portable code */
+
+/*
+ * Column-accumulator helpers for the comba squarer, portable C version.
+ *
+ * The macros operate on variables named c0, c1, c2 and sc0, sc1, sc2 that
+ * must exist in the scope where they are expanded. Each triple is used as a
+ * three-digit accumulator with c0 (sc0) the least significant digit.
+ * Arithmetic is done in pstm_word and split back into digits with
+ * DIGIT_BIT shifts. SQRADD2 additionally needs a pstm_word named tt in the
+ * enclosing scope (the functions below declare it under #ifdef PSTM_ISO).
+ */
+#define COMBA_START
+
+/* zero the main accumulator */
+#define CLEAR_CARRY \
+ c0 = c1 = c2 = 0;
+
+/* store the lowest accumulator digit (the finished column) into x */
+#define COMBA_STORE(x) \
+ x = c0;
+
+/* store the middle accumulator digit into x (used for the final output digit) */
+#define COMBA_STORE2(x) \
+ x = c1;
+
+/* shift the accumulator down one digit to start the next column */
+#define CARRY_FORWARD \
+ do { c0 = c1; c1 = c2; c2 = 0; } while (0);
+
+#define COMBA_FINI
+
+/* c2:c1:c0 += i * j (callers in this file pass the same digit for i and j) */
+#define SQRADD(i, j) \
+ do { pstm_word t; \
+ t = c0 + ((pstm_word)i) * ((pstm_word)j); c0 = (pstm_digit)t; \
+ t = c1 + (t >> DIGIT_BIT); \
+ c1 = (pstm_digit)t; c2 += (pstm_digit)(t >> DIGIT_BIT); \
+ } while (0);
+
+
+/*
+ * c2:c1:c0 += 2 * (i * j). The product t is computed once and added twice,
+ * propagating carries through the outer-scope temporary tt each time.
+ */
+#define SQRADD2(i, j) \
+ do { pstm_word t; \
+ t = ((pstm_word)i) * ((pstm_word)j); \
+ tt = (pstm_word)c0 + t; c0 = (pstm_digit)tt; \
+ tt = (pstm_word)c1 + (tt >> DIGIT_BIT); \
+ c1 = (pstm_digit)tt; c2 += (pstm_digit)(tt >> DIGIT_BIT); \
+ tt = (pstm_word)c0 + t; c0 = (pstm_digit)tt; \
+ tt = (pstm_word)c1 + (tt >> DIGIT_BIT); \
+ c1 = (pstm_digit)tt; c2 += (pstm_digit)(tt >> DIGIT_BIT); \
+ } while (0);
+
+/* start a secondary sum: sc1:sc0 = i * j, sc2 = 0 */
+#define SQRADDSC(i, j) \
+ do { pstm_word t; \
+ t = ((pstm_word)i) * ((pstm_word)j); \
+ sc0 = (pstm_digit)t; sc1 = (pstm_digit)(t >> DIGIT_BIT); sc2 = 0; \
+ } while (0);
+
+/* continue the secondary sum: sc2:sc1:sc0 += i * j */
+#define SQRADDAC(i, j) \
+ do { pstm_word t; \
+ t = ((pstm_word)sc0) + ((pstm_word)i) * ((pstm_word)j); \
+ sc0 = (pstm_digit)t; \
+ t = ((pstm_word)sc1) + (t >> DIGIT_BIT); sc1 = (pstm_digit)t; \
+ sc2 += (pstm_digit)(t >> DIGIT_BIT); \
+ } while (0);
+
+/* fold the secondary sum into the main one, doubled: c2:c1:c0 += 2 * sc2:sc1:sc0 */
+#define SQRADDDB \
+ do { pstm_word t; \
+ t = ((pstm_word)sc0) + ((pstm_word)sc0) + ((pstm_word)c0); \
+ c0 = (pstm_digit)t; \
+ t = ((pstm_word)sc1) + ((pstm_word)sc1) + c1 + (t >> DIGIT_BIT); \
+ c1 = (pstm_digit)t; \
+ c2 = c2 + sc2 + sc2 + (pstm_digit)(t >> DIGIT_BIT); \
+ } while (0);
+
+#endif /* ISO_C */
+
+/******************************************************************************/
+/*
+	Non-unrolled comba squarer: B = A * A for an operand of any digit count.
+
+	pool    not used in this build; the wrapper macro below drops it from
+	        both the definition and every call
+	A       operand to square
+	B       receives the result; grown to 2 * A->used digits if its current
+	        allocation is smaller
+	paD     optional caller-supplied scratch buffer, may be NULL
+	paDlen  size of paD in bytes; if it is too small for 2 * A->used digits
+	        a temporary buffer is allocated instead
+
+	The columns are accumulated in the scratch buffer and copied to B only
+	at the end, so A->dp is completely read before B->dp is written.
+
+	Returns PS_SUCCESS, or PS_MEM_FAIL if B could not be grown.
+ */
+///bbox: pool unused
+#define pstm_sqr_comba_gen(pool, A, B, paD, paDlen) \
+	pstm_sqr_comba_gen(      A, B, paD, paDlen)
+static int32 pstm_sqr_comba_gen(psPool_t *pool, pstm_int *A, pstm_int *B,
+		pstm_digit *paD, uint32 paDlen)
+{
+	int16 paDfail, pa;
+	int32 ix, iz;
+	pstm_digit c0, c1, c2, *dst;
+#ifdef PSTM_ISO
+	pstm_word tt;
+#endif
+
+	paDfail = 0;
+	/* number of output digits to produce */
+	pa = A->used + A->used;
+
+	COMBA_START;
+	CLEAR_CARRY;
+
+	/* if B is not large enough, grow it and continue */
+	if (B->alloc < pa) {
+		if (pstm_grow(B, pa) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+
+	/* pick the scratch buffer the columns are written to */
+	if (paD != NULL) {
+		if (paDlen < (sizeof(pstm_digit) * pa)) {
+			paDfail = 1; /* have a paD, but it's not big enough */
+			dst = xzalloc(sizeof(pstm_digit) * pa);
+		} else {
+			dst = paD;
+			memset(dst, 0x0, paDlen);
+		}
+	} else {
+		dst = xzalloc(sizeof(pstm_digit) * pa);
+	}
+
+	for (ix = 0; ix < pa; ix++) {
+		int32 tx, ty, iy;
+		pstm_digit *tmpy, *tmpx;
+
+		/* get offsets into the two ends of the operand */
+		ty = min(A->used-1, ix);
+		tx = ix - ty;
+
+		/* setup temp aliases */
+		tmpx = A->dp + tx;
+		tmpy = A->dp + ty;
+
+		/*
+			This is the number of times the loop will iterate,
+			while (tx++ < a->used && ty-- >= 0) { ... }
+		*/
+		iy = min(A->used-tx, ty+1);
+
+		/*
+			For squaring, tx can never equal ty inside the loop. We halve
+			the distance since they approach at a rate of 2x, rounding up
+			because odd cases need to be executed.
+		*/
+		iy = min(iy, (ty-tx+1)>>1);
+
+		/* forward carries */
+		CARRY_FORWARD;
+
+		/* off-diagonal products appear twice in the square */
+		for (iz = 0; iz < iy; iz++) {
+			SQRADD2(*tmpx++, *tmpy--);
+		}
+
+		/* even columns have the square term in them */
+		if ((ix&1) == 0) {
+			SQRADD(A->dp[ix>>1], A->dp[ix>>1]);
+		}
+
+		/* store it */
+		COMBA_STORE(dst[ix]);
+	}
+
+	COMBA_FINI;
+
+	/* setup dest */
+	iz = B->used;
+	B->used = pa;
+	/*
+		A square is never negative. Set the sign explicitly, as the unrolled
+		squarers do, so that a stale negative sign in B (for example when
+		squaring a negative value in place) does not leak into the result.
+	*/
+	B->sign = PSTM_ZPOS;
+	{
+		pstm_digit *tmpc;
+		tmpc = B->dp;
+		for (ix = 0; ix < pa; ix++) {
+			*tmpc++ = dst[ix];
+		}
+		/* clear unused digits (that existed in the old copy of B) */
+		for (; ix < iz; ix++) {
+			*tmpc++ = 0;
+		}
+	}
+	pstm_clamp(B);
+
+	/* release the scratch buffer only if it was allocated here */
+	if ((paD == NULL) || paDfail == 1) {
+		psFree(dst, pool);
+	}
+	return PS_SUCCESS;
+}
+
+/******************************************************************************/
+/*
+ Unrolled Comba loop for 1024 bit keys
+ */
+#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
+/*
+ * Fully unrolled comba squarer: B = A * A, reading digits a[0]..a[15] of A
+ * and producing 32 output digits.
+ *
+ * NOTE(review): the function itself does not check A->used; presumably the
+ * caller only dispatches here when A->used == 16 - confirm at the call site.
+ *
+ * Each "output k" step handles one result column: CARRY_FORWARD shifts the
+ * accumulator, the products a[i]*a[j] with i + j == k are added (doubled
+ * for i != j, once for the square term a[k/2]^2 on even k), and the low
+ * digit is stored. Columns with few cross products use SQRADD2 directly;
+ * the others sum the cross products in sc0..sc2 (SQRADDSC/SQRADDAC) and
+ * double them in one go with SQRADDDB.
+ *
+ * The result is built in the local array b[] and copied to B->dp at the
+ * end, so all reads of A->dp happen before B->dp is written.
+ *
+ * Returns PSTM_OKAY, or PS_MEM_FAIL if B could not be grown to 32 digits.
+ */
+static int32 pstm_sqr_comba16(pstm_int *A, pstm_int *B)
+{
+ pstm_digit *a, b[32], c0, c1, c2, sc0, sc1, sc2;
+#ifdef PSTM_ISO
+ pstm_word tt;
+#endif
+
+ /* make sure B can hold all 32 result digits */
+ if (B->alloc < 32) {
+ if (pstm_grow(B, 32) != PSTM_OKAY) {
+ return PS_MEM_FAIL;
+ }
+ }
+ a = A->dp;
+ sc0 = sc1 = sc2 = 0;
+
+ COMBA_START;
+
+ /* clear carries */
+ CLEAR_CARRY;
+
+ /* output 0 */
+ SQRADD(a[0],a[0]);
+ COMBA_STORE(b[0]);
+
+ /* output 1 */
+ CARRY_FORWARD;
+ SQRADD2(a[0], a[1]);
+ COMBA_STORE(b[1]);
+
+ /* output 2 */
+ CARRY_FORWARD;
+ SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
+ COMBA_STORE(b[2]);
+
+ /* output 3 */
+ CARRY_FORWARD;
+ SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
+ COMBA_STORE(b[3]);
+
+ /* output 4 */
+ CARRY_FORWARD;
+ SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
+ COMBA_STORE(b[4]);
+
+ /* output 5 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
+ COMBA_STORE(b[5]);
+
+ /* output 6 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
+ COMBA_STORE(b[6]);
+
+ /* output 7 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
+ COMBA_STORE(b[7]);
+
+ /* output 8 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
+ COMBA_STORE(b[8]);
+
+ /* output 9 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
+ COMBA_STORE(b[9]);
+
+ /* output 10 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
+ COMBA_STORE(b[10]);
+
+ /* output 11 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
+ COMBA_STORE(b[11]);
+
+ /* output 12 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
+ COMBA_STORE(b[12]);
+
+ /* output 13 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
+ COMBA_STORE(b[13]);
+
+ /* output 14 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
+ COMBA_STORE(b[14]);
+
+ /* output 15 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
+ COMBA_STORE(b[15]);
+
+ /* output 16 */
+ CARRY_FORWARD;
+ SQRADDSC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
+ COMBA_STORE(b[16]);
+
+ /* output 17 */
+ CARRY_FORWARD;
+ SQRADDSC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
+ COMBA_STORE(b[17]);
+
+ /* output 18 */
+ CARRY_FORWARD;
+ SQRADDSC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
+ COMBA_STORE(b[18]);
+
+ /* output 19 */
+ CARRY_FORWARD;
+ SQRADDSC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
+ COMBA_STORE(b[19]);
+
+ /* output 20 */
+ CARRY_FORWARD;
+ SQRADDSC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
+ COMBA_STORE(b[20]);
+
+ /* output 21 */
+ CARRY_FORWARD;
+ SQRADDSC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
+ COMBA_STORE(b[21]);
+
+ /* output 22 */
+ CARRY_FORWARD;
+ SQRADDSC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
+ COMBA_STORE(b[22]);
+
+ /* output 23 */
+ CARRY_FORWARD;
+ SQRADDSC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
+ COMBA_STORE(b[23]);
+
+ /* output 24 */
+ CARRY_FORWARD;
+ SQRADDSC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
+ COMBA_STORE(b[24]);
+
+ /* output 25 */
+ CARRY_FORWARD;
+ SQRADDSC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
+ COMBA_STORE(b[25]);
+
+ /* output 26 */
+ CARRY_FORWARD;
+ SQRADD2(a[11], a[15]); SQRADD2(a[12], a[14]); SQRADD(a[13], a[13]);
+ COMBA_STORE(b[26]);
+
+ /* output 27 */
+ CARRY_FORWARD;
+ SQRADD2(a[12], a[15]); SQRADD2(a[13], a[14]);
+ COMBA_STORE(b[27]);
+
+ /* output 28 */
+ CARRY_FORWARD;
+ SQRADD2(a[13], a[15]); SQRADD(a[14], a[14]);
+ COMBA_STORE(b[28]);
+
+ /* output 29 */
+ CARRY_FORWARD;
+ SQRADD2(a[14], a[15]);
+ COMBA_STORE(b[29]);
+
+ /* output 30; the remaining carry digit becomes output 31 */
+ CARRY_FORWARD;
+ SQRADD(a[15], a[15]);
+ COMBA_STORE(b[30]);
+ COMBA_STORE2(b[31]);
+ COMBA_FINI;
+
+ /* publish the result: a square is non-negative, then trim via pstm_clamp */
+ B->used = 32;
+ B->sign = PSTM_ZPOS;
+ memcpy(B->dp, b, 32 * sizeof(pstm_digit));
+ pstm_clamp(B);
+ return PSTM_OKAY;
+}
+#endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */
+
+
+#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
+static int32 pstm_sqr_comba32(pstm_int *A, pstm_int *B)
+{
+ pstm_digit *a, b[64], c0, c1, c2, sc0, sc1, sc2;
+#ifdef PSTM_ISO
+ pstm_word tt;
+#endif
+
+ if (B->alloc < 64) {
+ if (pstm_grow(B, 64) != PSTM_OKAY) {
+ return PS_MEM_FAIL;
+ }
+ }
+ sc0 = sc1 = sc2 = 0;
+ a = A->dp;
+ COMBA_START;
+
+ /* clear carries */
+ CLEAR_CARRY;
+
+ /* output 0 */
+ SQRADD(a[0],a[0]);
+ COMBA_STORE(b[0]);
+
+ /* output 1 */
+ CARRY_FORWARD;
+ SQRADD2(a[0], a[1]);
+ COMBA_STORE(b[1]);
+
+ /* output 2 */
+ CARRY_FORWARD;
+ SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
+ COMBA_STORE(b[2]);
+
+ /* output 3 */
+ CARRY_FORWARD;
+ SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
+ COMBA_STORE(b[3]);
+
+ /* output 4 */
+ CARRY_FORWARD;
+ SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
+ COMBA_STORE(b[4]);
+
+ /* output 5 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
+ COMBA_STORE(b[5]);
+
+ /* output 6 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
+ COMBA_STORE(b[6]);
+
+ /* output 7 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
+ COMBA_STORE(b[7]);
+
+ /* output 8 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
+ COMBA_STORE(b[8]);
+
+ /* output 9 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
+ COMBA_STORE(b[9]);
+
+ /* output 10 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
+ COMBA_STORE(b[10]);
+
+ /* output 11 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
+ COMBA_STORE(b[11]);
+
+ /* output 12 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
+ COMBA_STORE(b[12]);
+
+ /* output 13 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
+ COMBA_STORE(b[13]);
+
+ /* output 14 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
+ COMBA_STORE(b[14]);
+
+ /* output 15 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
+ COMBA_STORE(b[15]);
+
+ /* output 16 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
+ COMBA_STORE(b[16]);
+
+ /* output 17 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[17]); SQRADDAC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
+ COMBA_STORE(b[17]);
+
+ /* output 18 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[18]); SQRADDAC(a[1], a[17]); SQRADDAC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
+ COMBA_STORE(b[18]);
+
+ /* output 19 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[19]); SQRADDAC(a[1], a[18]); SQRADDAC(a[2], a[17]); SQRADDAC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
+ COMBA_STORE(b[19]);
+
+ /* output 20 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[20]); SQRADDAC(a[1], a[19]); SQRADDAC(a[2], a[18]); SQRADDAC(a[3], a[17]); SQRADDAC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
+ COMBA_STORE(b[20]);
+
+ /* output 21 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[21]); SQRADDAC(a[1], a[20]); SQRADDAC(a[2], a[19]); SQRADDAC(a[3], a[18]); SQRADDAC(a[4], a[17]); SQRADDAC(a[5], a[16]); SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
+ COMBA_STORE(b[21]);
+
+ /* output 22 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[22]); SQRADDAC(a[1], a[21]); SQRADDAC(a[2], a[20]); SQRADDAC(a[3], a[19]); SQRADDAC(a[4], a[18]); SQRADDAC(a[5], a[17]); SQRADDAC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
+ COMBA_STORE(b[22]);
+
+ /* output 23 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[23]); SQRADDAC(a[1], a[22]); SQRADDAC(a[2], a[21]); SQRADDAC(a[3], a[20]); SQRADDAC(a[4], a[19]); SQRADDAC(a[5], a[18]); SQRADDAC(a[6], a[17]); SQRADDAC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
+ COMBA_STORE(b[23]);
+
+ /* output 24 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[24]); SQRADDAC(a[1], a[23]); SQRADDAC(a[2], a[22]); SQRADDAC(a[3], a[21]); SQRADDAC(a[4], a[20]); SQRADDAC(a[5], a[19]); SQRADDAC(a[6], a[18]); SQRADDAC(a[7], a[17]); SQRADDAC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
+ COMBA_STORE(b[24]);
+
+ /* output 25 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[25]); SQRADDAC(a[1], a[24]); SQRADDAC(a[2], a[23]); SQRADDAC(a[3], a[22]); SQRADDAC(a[4], a[21]); SQRADDAC(a[5], a[20]); SQRADDAC(a[6], a[19]); SQRADDAC(a[7], a[18]); SQRADDAC(a[8], a[17]); SQRADDAC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
+ COMBA_STORE(b[25]);
+
+ /* output 26 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[26]); SQRADDAC(a[1], a[25]); SQRADDAC(a[2], a[24]); SQRADDAC(a[3], a[23]); SQRADDAC(a[4], a[22]); SQRADDAC(a[5], a[21]); SQRADDAC(a[6], a[20]); SQRADDAC(a[7], a[19]); SQRADDAC(a[8], a[18]); SQRADDAC(a[9], a[17]); SQRADDAC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]);
+ COMBA_STORE(b[26]);
+
+ /* output 27 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[27]); SQRADDAC(a[1], a[26]); SQRADDAC(a[2], a[25]); SQRADDAC(a[3], a[24]); SQRADDAC(a[4], a[23]); SQRADDAC(a[5], a[22]); SQRADDAC(a[6], a[21]); SQRADDAC(a[7], a[20]); SQRADDAC(a[8], a[19]); SQRADDAC(a[9], a[18]); SQRADDAC(a[10], a[17]); SQRADDAC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB;
+ COMBA_STORE(b[27]);
+
+ /* output 28 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[28]); SQRADDAC(a[1], a[27]); SQRADDAC(a[2], a[26]); SQRADDAC(a[3], a[25]); SQRADDAC(a[4], a[24]); SQRADDAC(a[5], a[23]); SQRADDAC(a[6], a[22]); SQRADDAC(a[7], a[21]); SQRADDAC(a[8], a[20]); SQRADDAC(a[9], a[19]); SQRADDAC(a[10], a[18]); SQRADDAC(a[11], a[17]); SQRADDAC(a[12], a[16]); SQRADDAC(a[13], a[15]); SQRADDDB; SQRADD(a[14], a[14]);
+ COMBA_STORE(b[28]);
+
+ /* output 29 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[29]); SQRADDAC(a[1], a[28]); SQRADDAC(a[2], a[27]); SQRADDAC(a[3], a[26]); SQRADDAC(a[4], a[25]); SQRADDAC(a[5], a[24]); SQRADDAC(a[6], a[23]); SQRADDAC(a[7], a[22]); SQRADDAC(a[8], a[21]); SQRADDAC(a[9], a[20]); SQRADDAC(a[10], a[19]); SQRADDAC(a[11], a[18]); SQRADDAC(a[12], a[17]); SQRADDAC(a[13], a[16]); SQRADDAC(a[14], a[15]); SQRADDDB;
+ COMBA_STORE(b[29]);
+
+ /* output 30 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[30]); SQRADDAC(a[1], a[29]); SQRADDAC(a[2], a[28]); SQRADDAC(a[3], a[27]); SQRADDAC(a[4], a[26]); SQRADDAC(a[5], a[25]); SQRADDAC(a[6], a[24]); SQRADDAC(a[7], a[23]); SQRADDAC(a[8], a[22]); SQRADDAC(a[9], a[21]); SQRADDAC(a[10], a[20]); SQRADDAC(a[11], a[19]); SQRADDAC(a[12], a[18]); SQRADDAC(a[13], a[17]); SQRADDAC(a[14], a[16]); SQRADDDB; SQRADD(a[15], a[15]);
+ COMBA_STORE(b[30]);
+
+ /* output 31 */
+ CARRY_FORWARD;
+ SQRADDSC(a[0], a[31]); SQRADDAC(a[1], a[30]); SQRADDAC(a[2], a[29]); SQRADDAC(a[3], a[28]); SQRADDAC(a[4], a[27]); SQRADDAC(a[5], a[26]); SQRADDAC(a[6], a[25]); SQRADDAC(a[7], a[24]); SQRADDAC(a[8], a[23]); SQRADDAC(a[9], a[22]); SQRADDAC(a[10], a[21]); SQRADDAC(a[11], a[20]); SQRADDAC(a[12], a[19]); SQRADDAC(a[13], a[18]); SQRADDAC(a[14], a[17]); SQRADDAC(a[15], a[16]); SQRADDDB;
+ COMBA_STORE(b[31]);
+
+ /* output 32 */
+ CARRY_FORWARD;
+ SQRADDSC(a[1], a[31]); SQRADDAC(a[2], a[30]); SQRADDAC(a[3], a[29]); SQRADDAC(a[4], a[28]); SQRADDAC(a[5], a[27]); SQRADDAC(a[6], a[26]); SQRADDAC(a[7], a[25]); SQRADDAC(a[8], a[24]); SQRADDAC(a[9], a[23]); SQRADDAC(a[10], a[22]); SQRADDAC(a[11], a[21]); SQRADDAC(a[12], a[20]); SQRADDAC(a[13], a[19]); SQRADDAC(a[14], a[18]); SQRADDAC(a[15], a[17]); SQRADDDB; SQRADD(a[16], a[16]);
+ COMBA_STORE(b[32]);
+
+ /* output 33 */
+ CARRY_FORWARD;
+ SQRADDSC(a[2], a[31]); SQRADDAC(a[3], a[30]); SQRADDAC(a[4], a[29]); SQRADDAC(a[5], a[28]); SQRADDAC(a[6], a[27]); SQRADDAC(a[7], a[26]); SQRADDAC(a[8], a[25]); SQRADDAC(a[9], a[24]); SQRADDAC(a[10], a[23]); SQRADDAC(a[11], a[22]); SQRADDAC(a[12], a[21]); SQRADDAC(a[13], a[20]); SQRADDAC(a[14], a[19]); SQRADDAC(a[15], a[18]); SQRADDAC(a[16], a[17]); SQRADDDB;
+ COMBA_STORE(b[33]);
+
+ /* output 34 */
+ CARRY_FORWARD;
+ SQRADDSC(a[3], a[31]); SQRADDAC(a[4], a[30]); SQRADDAC(a[5], a[29]); SQRADDAC(a[6], a[28]); SQRADDAC(a[7], a[27]); SQRADDAC(a[8], a[26]); SQRADDAC(a[9], a[25]); SQRADDAC(a[10], a[24]); SQRADDAC(a[11], a[23]); SQRADDAC(a[12], a[22]); SQRADDAC(a[13], a[21]); SQRADDAC(a[14], a[20]); SQRADDAC(a[15], a[19]); SQRADDAC(a[16], a[18]); SQRADDDB; SQRADD(a[17], a[17]);
+ COMBA_STORE(b[34]);
+
+ /* output 35 */
+ CARRY_FORWARD;
+ SQRADDSC(a[4], a[31]); SQRADDAC(a[5], a[30]); SQRADDAC(a[6], a[29]); SQRADDAC(a[7], a[28]); SQRADDAC(a[8], a[27]); SQRADDAC(a[9], a[26]); SQRADDAC(a[10], a[25]); SQRADDAC(a[11], a[24]); SQRADDAC(a[12], a[23]); SQRADDAC(a[13], a[22]); SQRADDAC(a[14], a[21]); SQRADDAC(a[15], a[20]); SQRADDAC(a[16], a[19]); SQRADDAC(a[17], a[18]); SQRADDDB;
+ COMBA_STORE(b[35]);
+
+ /* output 36 */
+ CARRY_FORWARD;
+ SQRADDSC(a[5], a[31]); SQRADDAC(a[6], a[30]); SQRADDAC(a[7], a[29]); SQRADDAC(a[8], a[28]); SQRADDAC(a[9], a[27]); SQRADDAC(a[10], a[26]); SQRADDAC(a[11], a[25]); SQRADDAC(a[12], a[24]); SQRADDAC(a[13], a[23]); SQRADDAC(a[14], a[22]); SQRADDAC(a[15], a[21]); SQRADDAC(a[16], a[20]); SQRADDAC(a[17], a[19]); SQRADDDB; SQRADD(a[18], a[18]);
+ COMBA_STORE(b[36]);
+
+ /* output 37 */
+ CARRY_FORWARD;
+ SQRADDSC(a[6], a[31]); SQRADDAC(a[7], a[30]); SQRADDAC(a[8], a[29]); SQRADDAC(a[9], a[28]); SQRADDAC(a[10], a[27]); SQRADDAC(a[11], a[26]); SQRADDAC(a[12], a[25]); SQRADDAC(a[13], a[24]); SQRADDAC(a[14], a[23]); SQRADDAC(a[15], a[22]); SQRADDAC(a[16], a[21]); SQRADDAC(a[17], a[20]); SQRADDAC(a[18], a[19]); SQRADDDB;
+ COMBA_STORE(b[37]);
+
+ /* output 38 */
+ CARRY_FORWARD;
+ SQRADDSC(a[7], a[31]); SQRADDAC(a[8], a[30]); SQRADDAC(a[9], a[29]); SQRADDAC(a[10], a[28]); SQRADDAC(a[11], a[27]); SQRADDAC(a[12], a[26]); SQRADDAC(a[13], a[25]); SQRADDAC(a[14], a[24]); SQRADDAC(a[15], a[23]); SQRADDAC(a[16], a[22]); SQRADDAC(a[17], a[21]); SQRADDAC(a[18], a[20]); SQRADDDB; SQRADD(a[19], a[19]);
+ COMBA_STORE(b[38]);
+
+ /* output 39 */
+ CARRY_FORWARD;
+ SQRADDSC(a[8], a[31]); SQRADDAC(a[9], a[30]); SQRADDAC(a[10], a[29]); SQRADDAC(a[11], a[28]); SQRADDAC(a[12], a[27]); SQRADDAC(a[13], a[26]); SQRADDAC(a[14], a[25]); SQRADDAC(a[15], a[24]); SQRADDAC(a[16], a[23]); SQRADDAC(a[17], a[22]); SQRADDAC(a[18], a[21]); SQRADDAC(a[19], a[20]); SQRADDDB;
+ COMBA_STORE(b[39]);
+
+ /* output 40 */
+ CARRY_FORWARD;
+ SQRADDSC(a[9], a[31]); SQRADDAC(a[10], a[30]); SQRADDAC(a[11], a[29]); SQRADDAC(a[12], a[28]); SQRADDAC(a[13], a[27]); SQRADDAC(a[14], a[26]); SQRADDAC(a[15], a[25]); SQRADDAC(a[16], a[24]); SQRADDAC(a[17], a[23]); SQRADDAC(a[18], a[22]); SQRADDAC(a[19], a[21]); SQRADDDB; SQRADD(a[20], a[20]);
+ COMBA_STORE(b[40]);
+
+ /* output 41 */
+ CARRY_FORWARD;
+ SQRADDSC(a[10], a[31]); SQRADDAC(a[11], a[30]); SQRADDAC(a[12], a[29]); SQRADDAC(a[13], a[28]); SQRADDAC(a[14], a[27]); SQRADDAC(a[15], a[26]); SQRADDAC(a[16], a[25]); SQRADDAC(a[17], a[24]); SQRADDAC(a[18], a[23]); SQRADDAC(a[19], a[22]); SQRADDAC(a[20], a[21]); SQRADDDB;
+ COMBA_STORE(b[41]);
+
+ /* output 42 */
+ CARRY_FORWARD;
+ SQRADDSC(a[11], a[31]); SQRADDAC(a[12], a[30]); SQRADDAC(a[13], a[29]); SQRADDAC(a[14], a[28]); SQRADDAC(a[15], a[27]); SQRADDAC(a[16], a[26]); SQRADDAC(a[17], a[25]); SQRADDAC(a[18], a[24]); SQRADDAC(a[19], a[23]); SQRADDAC(a[20], a[22]); SQRADDDB; SQRADD(a[21], a[21]);
+ COMBA_STORE(b[42]);
+
+ /* output 43 */
+ CARRY_FORWARD;
+ SQRADDSC(a[12], a[31]); SQRADDAC(a[13], a[30]); SQRADDAC(a[14], a[29]); SQRADDAC(a[15], a[28]); SQRADDAC(a[16], a[27]); SQRADDAC(a[17], a[26]); SQRADDAC(a[18], a[25]); SQRADDAC(a[19], a[24]); SQRADDAC(a[20], a[23]); SQRADDAC(a[21], a[22]); SQRADDDB;
+ COMBA_STORE(b[43]);
+
+ /* output 44 */
+ CARRY_FORWARD;
+ SQRADDSC(a[13], a[31]); SQRADDAC(a[14], a[30]); SQRADDAC(a[15], a[29]); SQRADDAC(a[16], a[28]); SQRADDAC(a[17], a[27]); SQRADDAC(a[18], a[26]); SQRADDAC(a[19], a[25]); SQRADDAC(a[20], a[24]); SQRADDAC(a[21], a[23]); SQRADDDB; SQRADD(a[22], a[22]);
+ COMBA_STORE(b[44]);
+
+ /* output 45 */
+ CARRY_FORWARD;
+ SQRADDSC(a[14], a[31]); SQRADDAC(a[15], a[30]); SQRADDAC(a[16], a[29]); SQRADDAC(a[17], a[28]); SQRADDAC(a[18], a[27]); SQRADDAC(a[19], a[26]); SQRADDAC(a[20], a[25]); SQRADDAC(a[21], a[24]); SQRADDAC(a[22], a[23]); SQRADDDB;
+ COMBA_STORE(b[45]);
+
+ /* output 46 */
+ CARRY_FORWARD;
+ SQRADDSC(a[15], a[31]); SQRADDAC(a[16], a[30]); SQRADDAC(a[17], a[29]); SQRADDAC(a[18], a[28]); SQRADDAC(a[19], a[27]); SQRADDAC(a[20], a[26]); SQRADDAC(a[21], a[25]); SQRADDAC(a[22], a[24]); SQRADDDB; SQRADD(a[23], a[23]);
+ COMBA_STORE(b[46]);
+
+ /* output 47 */
+ CARRY_FORWARD;
+ SQRADDSC(a[16], a[31]); SQRADDAC(a[17], a[30]); SQRADDAC(a[18], a[29]); SQRADDAC(a[19], a[28]); SQRADDAC(a[20], a[27]); SQRADDAC(a[21], a[26]); SQRADDAC(a[22], a[25]); SQRADDAC(a[23], a[24]); SQRADDDB;
+ COMBA_STORE(b[47]);
+
+ /* output 48 */
+ CARRY_FORWARD;
+ SQRADDSC(a[17], a[31]); SQRADDAC(a[18], a[30]); SQRADDAC(a[19], a[29]); SQRADDAC(a[20], a[28]); SQRADDAC(a[21], a[27]); SQRADDAC(a[22], a[26]); SQRADDAC(a[23], a[25]); SQRADDDB; SQRADD(a[24], a[24]);
+ COMBA_STORE(b[48]);
+
+ /* output 49 */
+ CARRY_FORWARD;
+ SQRADDSC(a[18], a[31]); SQRADDAC(a[19], a[30]); SQRADDAC(a[20], a[29]); SQRADDAC(a[21], a[28]); SQRADDAC(a[22], a[27]); SQRADDAC(a[23], a[26]); SQRADDAC(a[24], a[25]); SQRADDDB;
+ COMBA_STORE(b[49]);
+
+ /* output 50 */
+ CARRY_FORWARD;
+ SQRADDSC(a[19], a[31]); SQRADDAC(a[20], a[30]); SQRADDAC(a[21], a[29]); SQRADDAC(a[22], a[28]); SQRADDAC(a[23], a[27]); SQRADDAC(a[24], a[26]); SQRADDDB; SQRADD(a[25], a[25]);
+ COMBA_STORE(b[50]);
+
+ /* output 51 */
+ CARRY_FORWARD;
+ SQRADDSC(a[20], a[31]); SQRADDAC(a[21], a[30]); SQRADDAC(a[22], a[29]); SQRADDAC(a[23], a[28]); SQRADDAC(a[24], a[27]); SQRADDAC(a[25], a[26]); SQRADDDB;
+ COMBA_STORE(b[51]);
+
+ /* output 52 */
+ CARRY_FORWARD;
+ SQRADDSC(a[21], a[31]); SQRADDAC(a[22], a[30]); SQRADDAC(a[23], a[29]); SQRADDAC(a[24], a[28]); SQRADDAC(a[25], a[27]); SQRADDDB; SQRADD(a[26], a[26]);
+ COMBA_STORE(b[52]);
+
+ /* output 53 */
+ CARRY_FORWARD;
+ SQRADDSC(a[22], a[31]); SQRADDAC(a[23], a[30]); SQRADDAC(a[24], a[29]); SQRADDAC(a[25], a[28]); SQRADDAC(a[26], a[27]); SQRADDDB;
+ COMBA_STORE(b[53]);
+
+ /* output 54 */
+ CARRY_FORWARD;
+ SQRADDSC(a[23], a[31]); SQRADDAC(a[24], a[30]); SQRADDAC(a[25], a[29]); SQRADDAC(a[26], a[28]); SQRADDDB; SQRADD(a[27], a[27]);
+ COMBA_STORE(b[54]);
+
+ /* output 55 */
+ CARRY_FORWARD;
+ SQRADDSC(a[24], a[31]); SQRADDAC(a[25], a[30]); SQRADDAC(a[26], a[29]); SQRADDAC(a[27], a[28]); SQRADDDB;
+ COMBA_STORE(b[55]);
+
+ /* output 56 */
+ CARRY_FORWARD;
+ SQRADDSC(a[25], a[31]); SQRADDAC(a[26], a[30]); SQRADDAC(a[27], a[29]); SQRADDDB; SQRADD(a[28], a[28]);
+ COMBA_STORE(b[56]);
+
+ /* output 57 */
+ CARRY_FORWARD;
+ SQRADDSC(a[26], a[31]); SQRADDAC(a[27], a[30]); SQRADDAC(a[28], a[29]); SQRADDDB;
+ COMBA_STORE(b[57]);
+
+ /* output 58 */
+ CARRY_FORWARD;
+ SQRADD2(a[27], a[31]); SQRADD2(a[28], a[30]); SQRADD(a[29], a[29]);
+ COMBA_STORE(b[58]);
+
+ /* output 59 */
+ CARRY_FORWARD;
+ SQRADD2(a[28], a[31]); SQRADD2(a[29], a[30]);
+ COMBA_STORE(b[59]);
+
+ /* output 60 */
+ CARRY_FORWARD;
+ SQRADD2(a[29], a[31]); SQRADD(a[30], a[30]);
+ COMBA_STORE(b[60]);
+
+ /* output 61 */
+ CARRY_FORWARD;
+ SQRADD2(a[30], a[31]);
+ COMBA_STORE(b[61]);
+
+ /* output 62 */
+ CARRY_FORWARD;
+ SQRADD(a[31], a[31]);
+ COMBA_STORE(b[62]);
+ COMBA_STORE2(b[63]);
+ COMBA_FINI;
+
+ B->used = 64;
+ B->sign = PSTM_ZPOS;
+ memcpy(B->dp, b, 64 * sizeof(pstm_digit));
+ pstm_clamp(B);
+ return PSTM_OKAY;
+}
+#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
+
+/******************************************************************************/
+/*
+ * Squaring entry point: pick a fixed-size unrolled routine when one is
+ * compiled in and A has exactly the matching number of digits, otherwise
+ * use the generic routine.
+ *
+ *  - USE_1024_KEY_SPEED_OPTIMIZATIONS and A->used == 16: pstm_sqr_comba16()
+ *  - USE_2048_KEY_SPEED_OPTIMIZATIONS and A->used == 32: pstm_sqr_comba32()
+ *  - anything else: pstm_sqr_comba_gen()
+ *
+ * pool, paD and paDlen are only forwarded to pstm_sqr_comba_gen(); the
+ * fixed-size routines take just A and B. The return value is whatever the
+ * selected routine returns.
+ */
+int32 pstm_sqr_comba(psPool_t *pool, pstm_int *A, pstm_int *B, pstm_digit *paD,
+		uint32 paDlen)
+{
+#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
+	/* 16-digit operand: dedicated unrolled variant */
+	if (A->used == 16) {
+		return pstm_sqr_comba16(A, B);
+	}
+#endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */
+#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
+	/* 32-digit operand: dedicated unrolled variant */
+	if (A->used == 32) {
+		return pstm_sqr_comba32(A, B);
+	}
+#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
+	/* No specialised routine applies */
+	return pstm_sqr_comba_gen(pool, A, B, paD, paDlen);
+}
+
+#endif /* DISABLE_PSTM */
+/******************************************************************************/
diff --git a/networking/tls_rsa.c b/networking/tls_rsa.c
new file mode 100644
index 000000000..058b09cee
--- /dev/null
+++ b/networking/tls_rsa.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+#include "tls.h"
+
+/*
+ * Build a PKCS#1 v1.5 style padded block in out[0..outlen-1]:
+ *
+ *     0x00 | cryptType | padding | 0x00 | in[0..inlen-1]
+ *
+ * The padding is (outlen - 3 - inlen) bytes long and must be at least
+ * 8 bytes. For cryptType == PUBKEY_TYPE it is all 0xFF; for any other
+ * cryptType it is PRNG output with every zero byte replaced by 0x01.
+ *
+ * Returns outlen on success, PS_LIMIT_FAIL if the data does not fit with
+ * at least 8 bytes of padding, PS_PLATFORM_FAIL if the PRNG call reports
+ * an error.
+ *
+ * The function-like macro below also rewrites the definition that follows
+ * it, so the compiled function has no userPtr parameter. The userPtr
+ * mentioned in the matrixCryptoGetPrngData() call therefore only compiles
+ * if that name is itself a macro which discards the argument
+ * (NOTE(review): presumably defined that way in tls.h - confirm).
+ */
+#define pkcs1Pad(in, inlen, out, outlen, cryptType, userPtr) \
+	pkcs1Pad(in, inlen, out, outlen, cryptType)
+static ///bbox
+int32 pkcs1Pad(unsigned char *in, uint32 inlen, unsigned char *out,
+		uint32 outlen, int32 cryptType, void *userPtr)
+{
+	unsigned char *c;
+	uint32 padLen;
+	uint32 i;
+
+	/*
+	 * Validate before subtracting: the lengths are unsigned, so
+	 * "outlen - 3 - inlen" wraps when inlen is too large, and a wrapped
+	 * value can look like a valid positive length once narrowed to int32.
+	 * Need 3 framing bytes plus at least 8 padding bytes, and outlen must
+	 * be representable in the int32 return value.
+	 */
+	if (outlen < 3 + 8 || outlen > 0x7FFFFFFF || inlen > outlen - (3 + 8)) {
+		psTraceCrypto("pkcs1Pad failure\n");
+		return PS_LIMIT_FAIL;
+	}
+	padLen = outlen - 3 - inlen;
+
+	c = out;
+	*c++ = 0x00;
+	*c++ = (unsigned char)cryptType;
+	if (cryptType == PUBKEY_TYPE) {
+		memset(c, 0xFF, padLen);
+	} else {
+		if (matrixCryptoGetPrngData(c, padLen, userPtr) < 0) {
+			return PS_PLATFORM_FAIL;
+		}
+/*
+	SECURITY: Read through the random data and change all 0x0 to 0x01.
+	This is per spec that no random bytes should be 0
+*/
+		for (i = 0; i < padLen; i++) {
+			if (c[i] == 0x0) {
+				c[i] = 0x01;
+			}
+		}
+	}
+	c += padLen;
+	/* Zero separator between the padding and the payload */
+	*c++ = 0x00;
+	memcpy(c, in, inlen);
+
+	return outlen;
+}
+
+/*
+ * Raw RSA operation on an already padded input block.
+ *
+ * The input bytes are read into a big integer (tmp), transformed according
+ * to 'type', and written to 'out':
+ *   - PUBKEY_TYPE:  pstm_exptmod() of tmp with key->e and key->N.
+ *   - PRIVKEY_TYPE, key->optimized == 0: pstm_exptmod() of tmp with key->d
+ *     and key->N.
+ *   - PRIVKEY_TYPE, key->optimized != 0: two smaller exponentiations
+ *     (dP with p, dQ with q) recombined using qP and q, see below.
+ *
+ * 'in' and 'out' may refer to the same buffer: the input is fully copied
+ * into tmp before 'out' is written.
+ *
+ * *outlen is the capacity of 'out' on entry and is set to the number of
+ * result bytes once the size check has passed.
+ *
+ * Returns PS_SUCCESS, or: PS_ARG_FAIL for a NULL in/out/outlen/key,
+ * PS_LIMIT_FAIL if the input value compares greater than key->N, -1 if the
+ * byte size of key->N exceeds *outlen, PS_FAILURE for everything else
+ * (allocation or arithmetic failure, unknown 'type').
+ *
+ * The function-like macro below also rewrites the definition that follows
+ * it, so the compiled function has no 'data' parameter; callers may still
+ * write the extra argument and it is discarded by the preprocessor.
+ */
+#define psRsaCrypt(pool, in, inlen, out, outlen, key, type, data) \
+ psRsaCrypt(pool, in, inlen, out, outlen, key, type)
+static ///bbox
+int32 psRsaCrypt(psPool_t *pool, const unsigned char *in, uint32 inlen,
+ unsigned char *out, uint32 *outlen, psRsaKey_t *key, int32 type,
+ void *data)
+{
+ pstm_int tmp, tmpa, tmpb;
+ int32 res;
+ uint32 x;
+
+ if (in == NULL || out == NULL || outlen == NULL || key == NULL) {
+ psTraceCrypto("NULL parameter error in psRsaCrypt\n");
+ return PS_ARG_FAIL;
+ }
+
+ /* NULL dp pointers let the cleanup at 'done' run before tmpa/tmpb exist */
+ tmp.dp = tmpa.dp = tmpb.dp = NULL;
+
+ /* Init and copy into tmp */
+ if (pstm_init_for_read_unsigned_bin(pool, &tmp, inlen + sizeof(pstm_digit))
+ != PS_SUCCESS) {
+ return PS_FAILURE;
+ }
+ if (pstm_read_unsigned_bin(&tmp, (unsigned char *)in, inlen) != PS_SUCCESS){
+ pstm_clear(&tmp);
+ return PS_FAILURE;
+ }
+ /* Sanity check on the input */
+ /* Reject an input value larger than the modulus N */
+ if (pstm_cmp(&key->N, &tmp) == PSTM_LT) {
+ res = PS_LIMIT_FAIL;
+ goto done;
+ }
+ if (type == PRIVKEY_TYPE) {
+ if (key->optimized) {
+ /* Scratch integers sized after p and q for the two half-size results */
+ if (pstm_init_size(pool, &tmpa, key->p.alloc) != PS_SUCCESS) {
+ res = PS_FAILURE;
+ goto done;
+ }
+ if (pstm_init_size(pool, &tmpb, key->q.alloc) != PS_SUCCESS) {
+ /* tmpa is cleared here and again at 'done' */
+ /* NOTE(review): assumes pstm_clear() tolerates a second call - confirm */
+ pstm_clear(&tmpa);
+ res = PS_FAILURE;
+ goto done;
+ }
+ /* tmpa: exponentiation of the input with dP and p */
+ if (pstm_exptmod(pool, &tmp, &key->dP, &key->p, &tmpa) !=
+ PS_SUCCESS) {
+ psTraceCrypto("decrypt error: pstm_exptmod dP, p\n");
+ goto error;
+ }
+ /* tmpb: exponentiation of the input with dQ and q */
+ if (pstm_exptmod(pool, &tmp, &key->dQ, &key->q, &tmpb) !=
+ PS_SUCCESS) {
+ psTraceCrypto("decrypt error: pstm_exptmod dQ, q\n");
+ goto error;
+ }
+ /* Recombine in place: tmp = tmpa - tmpb ... */
+ if (pstm_sub(&tmpa, &tmpb, &tmp) != PS_SUCCESS) {
+ psTraceCrypto("decrypt error: sub tmpb, tmp\n");
+ goto error;
+ }
+ /* ... tmp = tmp * qP, reduced with p ... */
+ if (pstm_mulmod(pool, &tmp, &key->qP, &key->p, &tmp) != PS_SUCCESS) {
+ psTraceCrypto("decrypt error: pstm_mulmod qP, p\n");
+ goto error;
+ }
+ /* ... tmp = tmp * q ... */
+ if (pstm_mul_comba(pool, &tmp, &key->q, &tmp, NULL, 0)
+ != PS_SUCCESS){
+ psTraceCrypto("decrypt error: pstm_mul q \n");
+ goto error;
+ }
+ /* ... tmp = tmp + tmpb */
+ if (pstm_add(&tmp, &tmpb, &tmp) != PS_SUCCESS) {
+ psTraceCrypto("decrypt error: pstm_add tmp \n");
+ goto error;
+ }
+ } else {
+ /* Non-optimized private operation: single exponentiation with d and N */
+ if (pstm_exptmod(pool, &tmp, &key->d, &key->N, &tmp) !=
+ PS_SUCCESS) {
+ psTraceCrypto("psRsaCrypt error: pstm_exptmod\n");
+ goto error;
+ }
+ }
+ } else if (type == PUBKEY_TYPE) {
+ /* Public operation: single exponentiation with e and N */
+ if (pstm_exptmod(pool, &tmp, &key->e, &key->N, &tmp) != PS_SUCCESS) {
+ psTraceCrypto("psRsaCrypt error: pstm_exptmod\n");
+ goto error;
+ }
+ } else {
+ psTraceCrypto("psRsaCrypt error: invalid type param\n");
+ goto error;
+ }
+ /* Read it back */
+ /* The output length is derived from the modulus, not from the result */
+ x = pstm_unsigned_bin_size(&key->N);
+
+ if ((uint32)x > *outlen) {
+ res = -1;
+ psTraceCrypto("psRsaCrypt error: pstm_unsigned_bin_size\n");
+ goto done;
+ }
+ /* We want the encrypted value to always be the key size. Pad with 0x0 */
+ /*
+ * NOTE(review): if key->size is larger than the byte size of N, this loop
+ * advances 'out' and the memset below then clears x bytes starting at the
+ * advanced pointer, while only the byte size of N was checked against
+ * *outlen above. Verify that callers always set key->size to the byte
+ * size of N.
+ */
+ while ((uint32)x < (unsigned long)key->size) {
+ *out++ = 0x0;
+ x++;
+ }
+
+ *outlen = x;
+ /* Convert it */
+ memset(out, 0x0, x);
+
+ /* Store the result at the end of the zeroed area, leaving leading zeros in front of it */
+ if (pstm_to_unsigned_bin(pool, &tmp, out+(x-pstm_unsigned_bin_size(&tmp)))
+ != PS_SUCCESS) {
+ psTraceCrypto("psRsaCrypt error: pstm_to_unsigned_bin\n");
+ goto error;
+ }
+ /* Clean up and return */
+ res = PS_SUCCESS;
+ goto done;
+error:
+ /* Generic failure: falls through into the common cleanup */
+ res = PS_FAILURE;
+done:
+ /* tmpa/tmpb are only ever initialized on the optimized private-key path */
+ if (type == PRIVKEY_TYPE && key->optimized) {
+ pstm_clear_multi(&tmpa, &tmpb, NULL, NULL, NULL, NULL, NULL, NULL);
+ }
+ pstm_clear(&tmp);
+ return res;
+}
+
+/*
+ * Pad 'in' to the key size and apply the RSA public-key operation.
+ *
+ * 'out' must have room for at least key->size bytes (outlen is its
+ * capacity). The input is first padded into 'out' by pkcs1Pad() and then
+ * transformed in place by psRsaCrypt() with PUBKEY_TYPE.
+ *
+ * pkcs1Pad() is deliberately called with PRIVKEY_TYPE: in pkcs1Pad() only
+ * PUBKEY_TYPE selects the constant 0xFF padding, every other value selects
+ * the random non-zero padding.
+ *
+ * Returns key->size on success, PS_ARG_FAIL if outlen is smaller than
+ * key->size, the negative code from pkcs1Pad()/psRsaCrypt() if either
+ * fails, or PS_FAILURE if the produced length differs from key->size.
+ *
+ * 'data' is only handed to pkcs1Pad()/psRsaCrypt(), both of which are
+ * function-like macros in this file that drop their last argument.
+ */
+int32 psRsaEncryptPub(psPool_t *pool, psRsaKey_t *key,
+		unsigned char *in, uint32 inlen,
+		unsigned char *out, uint32 outlen, void *data)
+{
+	uint32 keyBytes;
+	int32 rc;
+
+	keyBytes = key->size;
+	if (keyBytes > outlen) {
+		psTraceCrypto("Error on bad outlen parameter to psRsaEncryptPub\n");
+		return PS_ARG_FAIL;
+	}
+
+	/* Step 1: build the padded block directly in the output buffer */
+	rc = pkcs1Pad(in, inlen, out, keyBytes, PRIVKEY_TYPE, data);
+	if (rc < PS_SUCCESS) {
+		psTraceCrypto("Error padding psRsaEncryptPub. Likely data too long\n");
+		return rc;
+	}
+
+	/* Step 2: public-key operation, in place; outlen receives the size */
+	rc = psRsaCrypt(pool, out, keyBytes, out, &outlen, key,
+			PUBKEY_TYPE, data);
+	if (rc < PS_SUCCESS) {
+		psTraceCrypto("Error performing psRsaEncryptPub\n");
+		return rc;
+	}
+
+	if (outlen != keyBytes) {
+		psTraceCrypto("Encrypted size error in psRsaEncryptPub\n");
+		return PS_FAILURE;
+	}
+	return keyBytes;
+}
diff --git a/networking/tls_rsa.h b/networking/tls_rsa.h
new file mode 100644
index 000000000..3281087c7
--- /dev/null
+++ b/networking/tls_rsa.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+
+/*
+ * RSA key as consumed by psRsaCrypt() / psRsaEncryptPub() in tls_rsa.c.
+ * The member order is part of the layout and must not change.
+ */
+typedef struct {
+	pstm_int e;      /* exponent used together with N for PUBKEY_TYPE */
+	pstm_int d;      /* exponent used together with N for PRIVKEY_TYPE
+	                  * when 'optimized' is 0 */
+	pstm_int N;      /* modulus; also bounds the input value and
+	                  * determines the output length */
+	pstm_int qP;     /* multiplier applied (reduced with p) when
+	                  * recombining the two halves on the optimized path */
+	pstm_int dP;     /* exponent used together with p on the optimized path */
+	pstm_int dQ;     /* exponent used together with q on the optimized path */
+	pstm_int p;      /* modulus paired with dP */
+	pstm_int q;      /* modulus paired with dQ */
+	uint32 size;     /* Size of the key in bytes */
+	int32 optimized; /* 1 for optimized: PRIVKEY_TYPE operations use
+	                  * p, q, dP, dQ, qP instead of d */
+	psPool_t *pool;  /* not referenced by the code in tls_rsa.c */
+} psRsaKey_t;
+
+/*
+ * psRsaEncryptPub() is written with a trailing 'data' argument at call
+ * sites, in this prototype and in its definition. The function-like macro
+ * below rewrites all of them, so the function that is actually declared
+ * and compiled takes six parameters and 'data' is dropped by the
+ * preprocessor. The macro has to stay ahead of the prototype for that to
+ * work.
+ */
+#define psRsaEncryptPub(pool, key, in, inlen, out, outlen, data) \
+ psRsaEncryptPub(pool, key, in, inlen, out, outlen)
+int32 psRsaEncryptPub(psPool_t *pool, psRsaKey_t *key,
+ unsigned char *in, uint32 inlen,
+ unsigned char *out, uint32 outlen, void *data);