1 files changed, 2254 insertions, 0 deletions
diff --git a/networking/tls_pstm.c b/networking/tls_pstm.c
new file mode 100644
index 000000000..0d797f87f
--- /dev/null
+++ b/networking/tls_pstm.c
@@ -0,0 +1,2254 @@
+/*
+ * Copyright (C) 2017 Denys Vlasenko
+ *
+ * Licensed under GPLv2, see file LICENSE in this source tree.
+ */
+#include "tls.h"
+
+/**
+ *	@file    pstm.c
+ *	@version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
+ *
+ *	Multiprecision number implementation.
+ */
+/*
+ *	Copyright (c) 2013-2015 INSIDE Secure Corporation
+ *	Copyright (c) PeerSec Networks, 2002-2011
+ *	All Rights Reserved
+ *
+ *	The latest version of this code is available at http://www.matrixssl.org
+ *
+ *	This software is open source; you can redistribute it and/or modify
+ *	it under the terms of the GNU General Public License as published by
+ *	the Free Software Foundation; either version 2 of the License, or
+ *	(at your option) any later version.
+ *
+ *	This General Public License does NOT permit incorporating this software
+ *	into proprietary programs.  If you are unable to comply with the GPL, a
+ *	commercial license for this software may be purchased from INSIDE at
+ *	http://www.insidesecure.com/eng/Company/Locations
+ *
+ *	This program is distributed in WITHOUT ANY WARRANTY; without even the
+ *	implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *	See the GNU General Public License for more details.
+ *
+ *	You should have received a copy of the GNU General Public License
+ *	along with this program; if not, write to the Free Software
+ *	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *	http://www.gnu.org/copyleft/gpl.html
+ */
+/******************************************************************************/
+
+///bbox
+//#include "../cryptoApi.h"
+#ifndef DISABLE_PSTM
+
+static int32 pstm_mul_2d(pstm_int *a, int16 b, pstm_int *c);
+
+/******************************************************************************/
+/*
+	init an pstm_int for a given size
+ */
+int32 pstm_init_size(psPool_t *pool, pstm_int * a, uint32 size)
+{
+//	uint16		x;
+
+/*
+	alloc mem
+ */
+	a->dp = xzalloc(sizeof (pstm_digit) * size);
+	a->pool = pool;
+	a->used  = 0;
+	a->alloc = (int16)size;
+	a->sign  = PSTM_ZPOS;
+/*
+	zero the digits
+ */
+///bbox
+//	for (x = 0; x < size; x++) {
+//		a->dp[x] = 0;
+//	}
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Init a new pstm_int.
+*/
+int32 pstm_init(psPool_t *pool, pstm_int * a)
+{
+//	int32		i;
+/*
+	allocate memory required and clear it
+ */
+	a->dp = xzalloc(sizeof (pstm_digit) * PSTM_DEFAULT_INIT);
+/*
+	set the digits to zero
+ */
+///bbox
+//	for (i = 0; i < PSTM_DEFAULT_INIT; i++) {
+//		a->dp[i] = 0;
+//	}
+/*
+	set the used to zero, allocated digits to the default precision and sign
+	to positive
+ */
+	a->pool = pool;
+	a->used  = 0;
+	a->alloc = PSTM_DEFAULT_INIT;
+	a->sign  = PSTM_ZPOS;
+
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Grow as required
+ */
+int32 pstm_grow(pstm_int * a, int16 size)
+{
+	int16			i;
+	pstm_digit		*tmp;
+
+/*
+	If the alloc size is smaller alloc more ram.
+ */
+	if (a->alloc < size) {
+/*
+		Reallocate the array a->dp
+
+		We store the return in a temporary variable in case the operation
+		failed we don't want to overwrite the dp member of a.
+*/
+		tmp = xrealloc(a->dp, sizeof (pstm_digit) * size);
+/*
+		reallocation succeeded so set a->dp
+ */
+		a->dp = tmp;
+/*
+		zero excess digits
+ */
+		i			= a->alloc;
+		a->alloc	= size;
+		for (; i < a->alloc; i++) {
+			a->dp[i] = 0;
+		}
+	}
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	copy, b = a (b must be pre-allocated)
+ */
+int32 pstm_copy(pstm_int * a, pstm_int * b)
+{
+	int32	res, n;
+
+/*
+	If dst == src do nothing
+ */
+	if (a == b) {
+		return PSTM_OKAY;
+	}
+/*
+	Grow dest
+ */
+	if (b->alloc < a->used) {
+		if ((res = pstm_grow (b, a->used)) != PSTM_OKAY) {
+			return res;
+		}
+	}
+/*
+	Zero b and copy the parameters over
+ */
+	{
+		register pstm_digit *tmpa, *tmpb;
+
+		/* pointer aliases */
+		/* source */
+		tmpa = a->dp;
+
+		/* destination */
+		tmpb = b->dp;
+
+		/* copy all the digits */
+		for (n = 0; n < a->used; n++) {
+			*tmpb++ = *tmpa++;
+		}
+
+		/* clear high digits */
+		for (; n < b->used; n++) {
+			*tmpb++ = 0;
+		}
+	}
+/*
+	copy used count and sign
+ */
+	b->used = a->used;
+	b->sign = a->sign;
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Trim unused digits
+
+	This is used to ensure that leading zero digits are trimed and the
+	leading "used" digit will be non-zero. Typically very fast.  Also fixes
+	the sign if there are no more leading digits
+*/
+void pstm_clamp(pstm_int * a)
+{
+/*	decrease used while the most significant digit is zero. */
+	while (a->used > 0 && a->dp[a->used - 1] == 0) {
+		--(a->used);
+	}
+/*	reset the sign flag if used == 0 */
+	if (a->used == 0) {
+		a->sign = PSTM_ZPOS;
+	}
+}
+
+/******************************************************************************/
+/*
+	clear one (frees).
+ */
+void pstm_clear(pstm_int * a)
+{
+	int32		i;
+/*
+	only do anything if a hasn't been freed previously
+ */
+	if (a != NULL && a->dp != NULL) {
+/*
+		first zero the digits
+ */
+		for (i = 0; i < a->used; i++) {
+			a->dp[i] = 0;
+		}
+
+		psFree (a->dp, a->pool);
+/*
+		reset members to make debugging easier
+ */
+		a->dp		= NULL;
+		a->alloc	= a->used = 0;
+		a->sign		= PSTM_ZPOS;
+	}
+}
+
+/******************************************************************************/
+/*
+	clear many (frees).
+ */
+void pstm_clear_multi(pstm_int *mp0, pstm_int *mp1, pstm_int *mp2,
+					pstm_int *mp3, pstm_int *mp4, pstm_int *mp5,
+					pstm_int *mp6, pstm_int *mp7)
+{
+	int32		n;		/* Number of ok inits */
+
+	pstm_int	*tempArray[9];
+
+	tempArray[0] = mp0;
+	tempArray[1] = mp1;
+	tempArray[2] = mp2;
+	tempArray[3] = mp3;
+	tempArray[4] = mp4;
+	tempArray[5] = mp5;
+	tempArray[6] = mp6;
+	tempArray[7] = mp7;
+	tempArray[8] = NULL;
+
+	for (n = 0; tempArray[n] != NULL; n++) {
+		if ((tempArray[n] != NULL) && (tempArray[n]->dp != NULL)) {
+			pstm_clear(tempArray[n]);
+		}
+	}
+}
+
+/******************************************************************************/
+/*
+	Set to zero.
+ */
+void pstm_zero(pstm_int * a)
+{
+	int32		n;
+	pstm_digit	*tmp;
+
+	a->sign = PSTM_ZPOS;
+	a->used = 0;
+
+	tmp = a->dp;
+	for (n = 0; n < a->alloc; n++) {
+		*tmp++ = 0;
+	}
+}
+
+
+/******************************************************************************/
+/*
+	Compare maginitude of two ints (unsigned).
+ */
+int32 pstm_cmp_mag(pstm_int * a, pstm_int * b)
+{
+	int16		n;
+	pstm_digit	*tmpa, *tmpb;
+
+/*
+	compare based on # of non-zero digits
+ */
+	if (a->used > b->used) {
+		return PSTM_GT;
+	}
+
+	if (a->used < b->used) {
+		return PSTM_LT;
+	}
+
+	/* alias for a */
+	tmpa = a->dp + (a->used - 1);
+
+	/* alias for b */
+	tmpb = b->dp + (a->used - 1);
+
+/*
+	compare based on digits
+ */
+	for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
+		if (*tmpa > *tmpb) {
+			return PSTM_GT;
+		}
+		if (*tmpa < *tmpb) {
+			return PSTM_LT;
+		}
+	}
+	return PSTM_EQ;
+}
+
+/******************************************************************************/
+/*
+	Compare two ints (signed)
+ */
+int32 pstm_cmp(pstm_int * a, pstm_int * b)
+{
+/*
+	compare based on sign
+ */
+	if (a->sign != b->sign) {
+		if (a->sign == PSTM_NEG) {
+			return PSTM_LT;
+		} else {
+			return PSTM_GT;
+		}
+	}
+/*
+	compare digits
+ */
+	if (a->sign == PSTM_NEG) {
+		/* if negative compare opposite direction */
+		return pstm_cmp_mag(b, a);
+	} else {
+		return pstm_cmp_mag(a, b);
+	}
+}
+
+/******************************************************************************/
+/*
+	pstm_ints can be initialized more precisely when they will populated
+	using pstm_read_unsigned_bin since the length of the byte stream is known
+*/
+int32 pstm_init_for_read_unsigned_bin(psPool_t *pool, pstm_int *a, uint32 len)
+{
+	int32 size;
+/*
+	Need to set this based on how many words max it will take to store the bin.
+	The magic + 2:
+		1 to round up for the remainder of this integer math
+		1 for the initial carry of '1' bits that fall between DIGIT_BIT and 8
+*/
+	size = (((len / sizeof(pstm_digit)) * (sizeof(pstm_digit) * CHAR_BIT))
+		/ DIGIT_BIT) + 2;
+	return pstm_init_size(pool, a, size);
+}
+
+
+/******************************************************************************/
+/*
+	Reads a unsigned char array into pstm_int format.  User should have
+	called pstm_init_for_read_unsigned_bin first.  There is some grow logic
+	here if the default pstm_init was used but we don't really want to hit it.
+*/
+int32 pstm_read_unsigned_bin(pstm_int *a, unsigned char *b, int32 c)
+{
+	/* zero the int */
+	pstm_zero (a);
+
+/*
+	If we know the endianness of this architecture, and we're using
+	32-bit pstm_digits, we can optimize this
+*/
+#if (defined(ENDIAN_LITTLE) || defined(ENDIAN_BIG)) && !defined(PSTM_64BIT)
+  /* But not for both simultaneously */
+#if defined(ENDIAN_LITTLE) && defined(ENDIAN_BIG)
+#error Both ENDIAN_LITTLE and ENDIAN_BIG defined.
+#endif
+	{
+		unsigned char *pd;
+		if ((unsigned)c > (PSTM_MAX_SIZE * sizeof(pstm_digit))) {
+			uint32 excess = c - (PSTM_MAX_SIZE * sizeof(pstm_digit));
+			c -= excess;
+			b += excess;
+		}
+		a->used = (int16)((c + sizeof(pstm_digit) - 1)/sizeof(pstm_digit));
+		if (a->alloc < a->used) {
+			if (pstm_grow(a, a->used) != PSTM_OKAY) {
+				return PSTM_MEM;
+			}
+		}
+		pd = (unsigned char *)a->dp;
+		/* read the bytes in */
+#ifdef ENDIAN_BIG
+		{
+			/* Use Duff's device to unroll the loop. */
+			int32 idx = (c - 1) & ~3;
+			switch (c % 4) {
+				case 0:	do { pd[idx+0] = *b++;
+					case 3:	     pd[idx+1] = *b++;
+					case 2:	     pd[idx+2] = *b++;
+					case 1:	     pd[idx+3] = *b++;
+					idx -= 4;
+				} while ((c -= 4) > 0);
+			}
+		}
+#else
+		for (c -= 1; c >= 0; c -= 1) {
+			pd[c] = *b++;
+		}
+#endif
+	}
+#else
+	/* Big enough based on the len? */
+	a->used = (((c / sizeof(pstm_digit)) * (sizeof(pstm_digit) * CHAR_BIT))
+		/ DIGIT_BIT) + 2;
+
+	if (a->alloc < a->used) {
+		if (pstm_grow(a, a->used) != PSTM_OKAY) {
+			return PSTM_MEM;
+		}
+	}
+	/* read the bytes in */
+	for (; c > 0; c--) {
+		if (pstm_mul_2d (a, 8, a) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+		a->dp[0] |= *b++;
+		a->used += 1;
+	}
+#endif
+
+	pstm_clamp (a);
+	return PS_SUCCESS;
+}
+
+/******************************************************************************/
+/*
+*/
+int16 pstm_count_bits (pstm_int * a)
+{
+	int16     r;
+	pstm_digit q;
+
+	if (a->used == 0) {
+		return 0;
+	}
+
+	/* get number of digits and add that */
+	r = (a->used - 1) * DIGIT_BIT;
+
+	/* take the last digit and count the bits in it */
+	q = a->dp[a->used - 1];
+	while (q > ((pstm_digit) 0)) {
+		++r;
+		q >>= ((pstm_digit) 1);
+	}
+	return r;
+}
+
+/******************************************************************************/
+int32 pstm_unsigned_bin_size(pstm_int *a)
+{
+	int32     size = pstm_count_bits (a);
+	return (size / 8 + ((size & 7) != 0 ? 1 : 0));
+}
+
+/******************************************************************************/
+void pstm_set(pstm_int *a, pstm_digit b)
+{
+   pstm_zero(a);
+   a->dp[0] = b;
+   a->used  = a->dp[0] ? 1 : 0;
+}
+
+/******************************************************************************/
+/*
+	Right shift
+*/
+void pstm_rshd(pstm_int *a, int16 x)
+{
+	int16 y;
+
+	/* too many digits just zero and return */
+	if (x >= a->used) {
+		pstm_zero(a);
+		return;
+	}
+
+	/* shift */
+	for (y = 0; y < a->used - x; y++) {
+		a->dp[y] = a->dp[y+x];
+	}
+
+	/* zero rest */
+	for (; y < a->used; y++) {
+		a->dp[y] = 0;
+	}
+
+	/* decrement count */
+	a->used -= x;
+	pstm_clamp(a);
+}
+
+/******************************************************************************/
+/*
+	Shift left a certain amount of digits.
+ */
+int32 pstm_lshd(pstm_int * a, int16 b)
+{
+	int16	x;
+	int32	res;
+
+/*
+	If its less than zero return.
+ */
+	if (b <= 0) {
+		return PSTM_OKAY;
+	}
+/*
+	Grow to fit the new digits.
+ */
+	if (a->alloc < a->used + b) {
+		if ((res = pstm_grow (a, a->used + b)) != PSTM_OKAY) {
+			return res;
+		}
+	}
+
+	{
+		register pstm_digit *top, *bottom;
+/*
+		Increment the used by the shift amount then copy upwards.
+ */
+		a->used += b;
+
+		/* top */
+		top = a->dp + a->used - 1;
+
+		/* base */
+		bottom = a->dp + a->used - 1 - b;
+/*
+		This is implemented using a sliding window except the window goes the
+		other way around.  Copying from the bottom to the top.
+ */
+		for (x = a->used - 1; x >= b; x--) {
+			*top-- = *bottom--;
+		}
+
+		/* zero the lower digits */
+		top = a->dp;
+		for (x = 0; x < b; x++) {
+			*top++ = 0;
+		}
+	}
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	computes a = 2**b
+*/
+int32 pstm_2expt(pstm_int *a, int16 b)
+{
+	int16     z;
+
+   /* zero a as per default */
+	pstm_zero (a);
+
+	if (b < 0) {
+		return PSTM_OKAY;
+	}
+
+	z = b / DIGIT_BIT;
+	if (z >= PSTM_MAX_SIZE) {
+		return PS_LIMIT_FAIL;
+	}
+
+	/* set the used count of where the bit will go */
+	a->used = z + 1;
+
+	if (a->used > a->alloc) {
+		if (pstm_grow(a, a->used) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+
+	/* put the single bit in its place */
+	a->dp[z] = ((pstm_digit)1) << (b % DIGIT_BIT);
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+
+*/
+int32 pstm_mul_2(pstm_int * a, pstm_int * b)
+{
+	int32	res;
+	int16	x, oldused;
+
+/*
+	grow to accomodate result
+ */
+	if (b->alloc < a->used + 1) {
+		if ((res = pstm_grow (b, a->used + 1)) != PSTM_OKAY) {
+			return res;
+		}
+	}
+	oldused = b->used;
+	b->used = a->used;
+
+	{
+		register pstm_digit r, rr, *tmpa, *tmpb;
+
+		/* alias for source */
+		tmpa = a->dp;
+
+		/* alias for dest */
+		tmpb = b->dp;
+
+		/* carry */
+		r = 0;
+		for (x = 0; x < a->used; x++) {
+/*
+			get what will be the *next* carry bit from the
+			MSB of the current digit
+*/
+			rr = *tmpa >> ((pstm_digit)(DIGIT_BIT - 1));
+/*
+			now shift up this digit, add in the carry [from the previous]
+*/
+			*tmpb++ = ((*tmpa++ << ((pstm_digit)1)) | r);
+/*
+			copy the carry that would be from the source
+			digit into the next iteration
+*/
+			r = rr;
+		}
+
+		/* new leading digit? */
+		if (r != 0 && b->used != (PSTM_MAX_SIZE-1)) {
+			/* add a MSB which is always 1 at this point */
+			*tmpb = 1;
+			++(b->used);
+		}
+/*
+		now zero any excess digits on the destination that we didn't write to
+*/
+		tmpb = b->dp + b->used;
+		for (x = b->used; x < oldused; x++) {
+			*tmpb++ = 0;
+		}
+	}
+	b->sign = a->sign;
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	unsigned subtraction ||a|| >= ||b|| ALWAYS!
+*/
+int32 s_pstm_sub(pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	int16		oldbused, oldused;
+	int32		x;
+	pstm_word	t;
+
+	if (b->used > a->used) {
+		return PS_LIMIT_FAIL;
+	}
+	if (c->alloc < a->used) {
+		if ((x = pstm_grow (c, a->used)) != PSTM_OKAY) {
+			return x;
+		}
+	}
+	oldused  = c->used;
+	oldbused = b->used;
+	c->used  = a->used;
+	t = 0;
+
+	for (x = 0; x < oldbused; x++) {
+		t = ((pstm_word)a->dp[x]) - (((pstm_word)b->dp[x]) + t);
+		c->dp[x] = (pstm_digit)t;
+		t = (t >> DIGIT_BIT)&1;
+	}
+	for (; x < a->used; x++) {
+		t = ((pstm_word)a->dp[x]) - t;
+		c->dp[x] = (pstm_digit)t;
+		t = (t >> DIGIT_BIT);
+	}
+	for (; x < oldused; x++) {
+		c->dp[x] = 0;
+	}
+	pstm_clamp(c);
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	unsigned addition
+*/
+static int32 s_pstm_add(pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	int16				x, y, oldused;
+	register pstm_word	t, adp, bdp;
+
+	y = a->used;
+	if (b->used > y) {
+		y = b->used;
+	}
+	oldused = c->used;
+	c->used = y;
+
+	if (c->used > c->alloc) {
+		if (pstm_grow(c, c->used) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+
+	t = 0;
+	for (x = 0; x < y; x++) {
+		if (a->used < x) {
+			adp = 0;
+		} else {
+			adp = (pstm_word)a->dp[x];
+		}
+		if (b->used < x) {
+			bdp = 0;
+		} else {
+			bdp = (pstm_word)b->dp[x];
+		}
+		t         += (adp) + (bdp);
+		c->dp[x]   = (pstm_digit)t;
+		t        >>= DIGIT_BIT;
+	}
+	if (t != 0 && x < PSTM_MAX_SIZE) {
+		if (c->used == c->alloc) {
+			if (pstm_grow(c, c->alloc + 1) != PSTM_OKAY) {
+				return PS_MEM_FAIL;
+			}
+		}
+		c->dp[c->used++] = (pstm_digit)t;
+		++x;
+	}
+
+	c->used = x;
+	for (; x < oldused; x++) {
+		c->dp[x] = 0;
+	}
+	pstm_clamp(c);
+	return PSTM_OKAY;
+}
+
+
+/******************************************************************************/
+/*
+
+*/
+int32 pstm_sub(pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	int32     res;
+	int16     sa, sb;
+
+	sa = a->sign;
+	sb = b->sign;
+
+	if (sa != sb) {
+/*
+		subtract a negative from a positive, OR a positive from a negative.
+		For both, ADD their magnitudes, and use the sign of the first number.
+ */
+		c->sign = sa;
+		if ((res = s_pstm_add (a, b, c)) != PSTM_OKAY) {
+			return res;
+		}
+	} else {
+/*
+		subtract a positive from a positive, OR a negative from a negative.
+		First, take the difference between their magnitudes, then...
+ */
+		if (pstm_cmp_mag (a, b) != PSTM_LT) {
+			/* Copy the sign from the first */
+			c->sign = sa;
+			/* The first has a larger or equal magnitude */
+			if ((res = s_pstm_sub (a, b, c)) != PSTM_OKAY) {
+				return res;
+			}
+		} else {
+			/* The result has the _opposite_ sign from the first number. */
+			c->sign = (sa == PSTM_ZPOS) ? PSTM_NEG : PSTM_ZPOS;
+			/* The second has a larger magnitude */
+			if ((res = s_pstm_sub (b, a, c)) != PSTM_OKAY) {
+				return res;
+			}
+		}
+	}
+	return PS_SUCCESS;
+}
+
+/******************************************************************************/
+/*
+	c = a - b
+*/
+int32 pstm_sub_d(psPool_t *pool, pstm_int *a, pstm_digit b, pstm_int *c)
+{
+	pstm_int	tmp;
+	int32		res;
+
+	if (pstm_init_size(pool, &tmp, sizeof(pstm_digit)) != PSTM_OKAY) {
+		return PS_MEM_FAIL;
+	}
+	pstm_set(&tmp, b);
+	res = pstm_sub(a, &tmp, c);
+	pstm_clear(&tmp);
+	return res;
+}
+
+/******************************************************************************/
+/*
+	setups the montgomery reduction
+*/
+int32 pstm_montgomery_setup(pstm_int *a, pstm_digit *rho)
+{
+	pstm_digit x, b;
+
+/*
+	fast inversion mod 2**k
+	Based on the fact that
+	XA = 1 (mod 2**n)	=>  (X(2-XA)) A = 1 (mod 2**2n)
+						=>  2*X*A - X*X*A*A = 1
+						=>  2*(1) - (1)     = 1
+ */
+	b = a->dp[0];
+
+	if ((b & 1) == 0) {
+		psTraceCrypto("pstm_montogomery_setup failure\n");
+		return PS_ARG_FAIL;
+	}
+
+	x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
+	x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
+	x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
+	x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
+#ifdef PSTM_64BIT
+	x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
+#endif
+	/* rho = -1/m mod b */
+	*rho = (pstm_digit)(((pstm_word) 1 << ((pstm_word) DIGIT_BIT)) -
+		((pstm_word)x));
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+ *	computes a = B**n mod b without division or multiplication useful for
+ *	normalizing numbers in a Montgomery system.
+ */
+int32 pstm_montgomery_calc_normalization(pstm_int *a, pstm_int *b)
+{
+	int32     x;
+	int16     bits;
+
+	/* how many bits of last digit does b use */
+	bits = pstm_count_bits (b) % DIGIT_BIT;
+	if (!bits) bits = DIGIT_BIT;
+
+	/* compute A = B^(n-1) * 2^(bits-1) */
+	if (b->used > 1) {
+		if ((x = pstm_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) !=
+				PSTM_OKAY) {
+			return x;
+		}
+	} else {
+		pstm_set(a, 1);
+		bits = 1;
+	}
+
+	/* now compute C = A * B mod b */
+	for (x = bits - 1; x < (int32)DIGIT_BIT; x++) {
+		if (pstm_mul_2 (a, a) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+		if (pstm_cmp_mag (a, b) != PSTM_LT) {
+			if (s_pstm_sub (a, b, a) != PSTM_OKAY) {
+				return PS_MEM_FAIL;
+			}
+		}
+	}
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	c = a * 2**d
+*/
+static int32 pstm_mul_2d(pstm_int *a, int16 b, pstm_int *c)
+{
+	pstm_digit	carry, carrytmp, shift;
+	int16		x;
+
+	/* copy it */
+	if (pstm_copy(a, c) != PSTM_OKAY) {
+		return PS_MEM_FAIL;
+	}
+
+	/* handle whole digits */
+	if (b >= DIGIT_BIT) {
+		if (pstm_lshd(c, b/DIGIT_BIT) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+	b %= DIGIT_BIT;
+
+	/* shift the digits */
+	if (b != 0) {
+		carry = 0;
+		shift = DIGIT_BIT - b;
+		for (x = 0; x < c->used; x++) {
+			carrytmp = c->dp[x] >> shift;
+			c->dp[x] = (c->dp[x] << b) + carry;
+			carry = carrytmp;
+		}
+		/* store last carry if room */
+		if (carry && x < PSTM_MAX_SIZE) {
+			if (c->used == c->alloc) {
+				if (pstm_grow(c, c->alloc + 1) != PSTM_OKAY) {
+					return PS_MEM_FAIL;
+				}
+			}
+			c->dp[c->used++] = carry;
+		}
+	}
+	pstm_clamp(c);
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	c = a mod 2**d
+*/
+static int32 pstm_mod_2d(pstm_int *a, int16 b, pstm_int *c)
+{
+	int16	x;
+
+	/* zero if count less than or equal to zero */
+	if (b <= 0) {
+		pstm_zero(c);
+		return PSTM_OKAY;
+	}
+
+	/* get copy of input */
+	if (pstm_copy(a, c) != PSTM_OKAY) {
+		return PS_MEM_FAIL;
+	}
+
+	/* if 2**d is larger than we just return */
+	if (b >= (DIGIT_BIT * a->used)) {
+		return PSTM_OKAY;
+	}
+
+	/* zero digits above the last digit of the modulus */
+	for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++)
+	{
+		c->dp[x] = 0;
+	}
+	/* clear the digit that is not completely outside/inside the modulus */
+	c->dp[b / DIGIT_BIT] &= ~((pstm_digit)0) >> (DIGIT_BIT - b);
+	pstm_clamp (c);
+	return PSTM_OKAY;
+}
+
+
+/******************************************************************************/
+/*
+	c = a * b
+*/
+int32 pstm_mul_d(pstm_int *a, pstm_digit b, pstm_int *c)
+{
+	pstm_word	w;
+	int32		res;
+	int16		x, oldused;
+
+	if (c->alloc < a->used + 1) {
+		if ((res = pstm_grow (c, a->used + 1)) != PSTM_OKAY) {
+			return res;
+		}
+	}
+	oldused = c->used;
+	c->used = a->used;
+	c->sign = a->sign;
+	w       = 0;
+	for (x = 0; x < a->used; x++) {
+		w         = ((pstm_word)a->dp[x]) * ((pstm_word)b) + w;
+		c->dp[x]  = (pstm_digit)w;
+		w         = w >> DIGIT_BIT;
+	}
+	if (w != 0 && (a->used != PSTM_MAX_SIZE)) {
+		c->dp[c->used++] = (pstm_digit)w;
+		++x;
+	}
+	for (; x < oldused; x++) {
+		c->dp[x] = 0;
+	}
+	pstm_clamp(c);
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	c = a / 2**b
+*/
+int32 pstm_div_2d(psPool_t *pool, pstm_int *a, int16 b, pstm_int *c,
+					pstm_int *d)
+{
+	pstm_digit	D, r, rr;
+	int32		res;
+	int16		x;
+	pstm_int	t;
+
+	/* if the shift count is <= 0 then we do no work */
+	if (b <= 0) {
+		if (pstm_copy (a, c) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+		if (d != NULL) {
+			pstm_zero (d);
+		}
+		return PSTM_OKAY;
+	}
+
+	/* get the remainder */
+	if (d != NULL) {
+		if (pstm_init(pool, &t) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+		if (pstm_mod_2d (a, b, &t) != PSTM_OKAY) {
+			res = PS_MEM_FAIL;
+			goto LBL_DONE;
+		}
+	}
+
+	/* copy */
+	if (pstm_copy(a, c) != PSTM_OKAY) {
+		res = PS_MEM_FAIL;
+		goto LBL_DONE;
+	}
+
+	/* shift by as many digits in the bit count */
+	if (b >= (int32)DIGIT_BIT) {
+		pstm_rshd (c, b / DIGIT_BIT);
+	}
+
+	/* shift any bit count < DIGIT_BIT */
+	D = (pstm_digit) (b % DIGIT_BIT);
+	if (D != 0) {
+		register pstm_digit *tmpc, mask, shift;
+
+		/* mask */
+		mask = (((pstm_digit)1) << D) - 1;
+
+		/* shift for lsb */
+		shift = DIGIT_BIT - D;
+
+		/* alias */
+		tmpc = c->dp + (c->used - 1);
+
+		/* carry */
+		r = 0;
+		for (x = c->used - 1; x >= 0; x--) {
+			/* get the lower  bits of this word in a temp */
+			rr = *tmpc & mask;
+
+			/* shift the current word and mix in the carry bits from previous */
+			*tmpc = (*tmpc >> D) | (r << shift);
+			--tmpc;
+
+			/* set the carry to the carry bits of the current word above */
+			r = rr;
+		}
+	}
+	pstm_clamp (c);
+
+	res = PSTM_OKAY;
+LBL_DONE:
+	if (d != NULL) {
+		if (pstm_copy(&t, d) != PSTM_OKAY) {
+			res = PS_MEM_FAIL;
+		}
+		pstm_clear(&t);
+	}
+	return res;
+}
+
+/******************************************************************************/
+/*
+	b = a/2
+*/
+int32 pstm_div_2(pstm_int * a, pstm_int * b)
+{
+	int16	x, oldused;
+
+	if (b->alloc < a->used) {
+		if (pstm_grow(b, a->used) != PSTM_OKAY) {
+			return PS_MEM_FAIL;
+		}
+	}
+	oldused = b->used;
+	b->used = a->used;
+	{
+		register pstm_digit r, rr, *tmpa, *tmpb;
+
+		/* source alias */
+		tmpa = a->dp + b->used - 1;
+
+		/* dest alias */
+		tmpb = b->dp + b->used - 1;
+
+		/* carry */
+		r = 0;
+		for (x = b->used - 1; x >= 0; x--) {
+			/* get the carry for the next iteration */
+			rr = *tmpa & 1;
+
+			/* shift the current digit, add in carry and store */
+			*tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
+
+			/* forward carry to next iteration */
+			r = rr;
+		}
+
+		/* zero excess digits */
+		tmpb = b->dp + b->used;
+		for (x = b->used; x < oldused; x++) {
+			*tmpb++ = 0;
+		}
+	}
+	b->sign = a->sign;
+	pstm_clamp (b);
+	return PSTM_OKAY;
+}
+
+/******************************************************************************/
+/*
+	Creates "a" then copies b into it
+ */
+int32 pstm_init_copy(psPool_t *pool, pstm_int * a, pstm_int * b, int16 toSqr)
+{
+	int16	x;
+	int32	res;
+
+	if (a == b) {
+		return PSTM_OKAY;
+	}
+	x = b->alloc;
+
+	if (toSqr) {
+/*
+		Smart-size:  Increasing size of a if b->used is roughly half
+		of b->alloc because usage has shown that a lot of these copies
+		go on to be squared and need these extra digits
+*/
+		if ((b->used * 2) + 2 >= x) {
+			x = (b->used * 2) + 3;
+		}
+	}
+	if ((res = pstm_init_size(pool, a, x)) != PSTM_OKAY) {
+		return res;
+	}
+	return pstm_copy(b, a);
+}
+
+/******************************************************************************/
+/*
+	With some compilers, we have seen issues linking with the builtin
+	64 bit division routine. The issues with either manifest in a failure
+	to find 'udivdi3' at link time, or a runtime invalid instruction fault
+	during an RSA operation.
+	The routine below divides a 64 bit unsigned int by a 32 bit unsigned int
+	explicitly, rather than using the division operation
+		The 64 bit result is placed in the 'numerator' parameter
+		The 32 bit mod (remainder) of the division is the return parameter
+	Based on implementations by:
+		Copyright (C) 2003 Bernardo Innocenti <bernie@develer.com>
+		Copyright (C) 1999 Hewlett-Packard Co
+		Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
+*/
+#if defined(USE_MATRIX_DIV64) && defined(PSTM_32BIT)
+static uint32 psDiv64(uint64 *numerator, uint32 denominator)
+{
+	uint64	rem = *numerator;
+	uint64	b = denominator;
+	uint64	res = 0;
+	uint64	d = 1;
+	uint32	high = rem >> 32;
+
+	if (high >= denominator) {
+		high /= denominator;
+		res = (uint64) high << 32;
+		rem -= (uint64) (high * denominator) << 32;
+	}
+	while ((int64)b > 0 && b < rem) {
+		b = b+b;
+		d = d+d;
+	}
+	do {
+		if (rem >= b) {
+			rem -= b;
+			res += d;
+		}
+		b >>= 1;
+		d >>= 1;
+	} while (d);
+	*numerator = res;
+	return rem;
+}
+#endif /* USE_MATRIX_DIV64 */
+
+#if defined(USE_MATRIX_DIV128) && defined(PSTM_64BIT)
+typedef unsigned long	uint128 __attribute__ ((mode(TI)));
+static uint64 psDiv128(uint128 *numerator, uint64 denominator)
+{
+	uint128	rem = *numerator;
+	uint128	b = denominator;
+	uint128	res = 0;
+	uint128	d = 1;
+	uint64	high = rem >> 64;
+
+	if (high >= denominator) {
+		high /= denominator;
+		res = (uint128) high << 64;
+		rem -= (uint128) (high * denominator) << 64;
+	}
+	while ((uint128)b > 0 && b < rem) {
+		b = b+b;
+		d = d+d;
+	}
+	do {
+		if (rem >= b) {
+			rem -= b;
+			res += d;
+		}
+		b >>= 1;
+		d >>= 1;
+	} while (d);
+	*numerator = res;
+	return rem;
+}
+#endif /* USE_MATRIX_DIV128 */
+
+/******************************************************************************/
+/*
+	a/b => cb + d == a
+*/
+int32 pstm_div(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c,
+				pstm_int *d)
+{
+	pstm_int	q, x, y, t1, t2;
+	int32		res;
+	int16		n, t, i, norm, neg;
+
+	/* is divisor zero ? */
+	if (pstm_iszero (b) == 1) {
+		return PS_LIMIT_FAIL;
+	}
+
+	/* if a < b then q=0, r = a */
+	if (pstm_cmp_mag (a, b) == PSTM_LT) {
+		if (d != NULL) {
+			if (pstm_copy(a, d) != PSTM_OKAY) {
+				return PS_MEM_FAIL;
+			}
+		}
+		if (c != NULL) {
+			pstm_zero (c);
+		}
+		return PSTM_OKAY;
+	}
+/*
+	Smart-size inits
+*/
+	if ((res = pstm_init_size(pool, &t1, a->alloc)) != PSTM_OKAY) {
+		return res;
+	}
+	if ((res = pstm_init_size(pool, &t2, 3)) != PSTM_OKAY) {
+		goto LBL_T1;
+	}
+	if ((res = pstm_init_copy(pool, &x, a, 0)) != PSTM_OKAY) {
+		goto LBL_T2;
+	}
+/*
+	Used to be an init_copy on b but pstm_grow was always hit with triple size
+*/
+	if ((res = pstm_init_size(pool, &y, b->used * 3)) != PSTM_OKAY) {
+		goto LBL_X;
+	}
+	if ((res = pstm_copy(b, &y)) != PSTM_OKAY) {
+		goto LBL_Y;
+	}
+
+	/* fix the sign */
+	neg = (a->sign == b->sign) ? PSTM_ZPOS : PSTM_NEG;
+	x.sign = y.sign = PSTM_ZPOS;
+
+	/* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
+	norm = pstm_count_bits(&y) % DIGIT_BIT;
+	if (norm < (int32)(DIGIT_BIT-1)) {
+		norm = (DIGIT_BIT-1) - norm;
+		if ((res = pstm_mul_2d(&x, norm, &x)) != PSTM_OKAY) {
+			goto LBL_Y;
+		}
+		if ((res = pstm_mul_2d(&y, norm, &y)) != PSTM_OKAY) {
+			goto LBL_Y;
+		}
+	} else {
+		norm = 0;
+	}
+
+	/* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
+	n = x.used - 1;
+	t = y.used - 1;
+
+	if ((res = pstm_init_size(pool, &q, n - t + 1)) != PSTM_OKAY) {
+		goto LBL_Y;
+	}
+	q.used = n - t + 1;
+
+	/* while (x >= y*b**n-t) do { q[n-t] += 1; x -= y*b**{n-t} } */
+	if ((res = pstm_lshd(&y, n - t)) != PSTM_OKAY) { /* y = y*b**{n-t} */
+		goto LBL_Q;
+	}
+
+	while (pstm_cmp (&x, &y) != PSTM_LT) {
+		++(q.dp[n - t]);
+		if ((res = pstm_sub(&x, &y, &x)) != PSTM_OKAY) {
+			goto LBL_Q;
+		}
+	}
+
+	/* reset y by shifting it back down */
+	pstm_rshd (&y, n - t);
+
+	/* step 3. for i from n down to (t + 1) */
+	for (i = n; i >= (t + 1); i--) {
+		if (i > x.used) {
+			continue;
+		}
+
+		/* step 3.1 if xi == yt then set q{i-t-1} to b-1,
+		 * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
+		if (x.dp[i] == y.dp[t]) {
+			q.dp[i - t - 1] = (pstm_digit)((((pstm_word)1) << DIGIT_BIT) - 1);
+		} else {
+			pstm_word tmp;
+			tmp = ((pstm_word) x.dp[i]) << ((pstm_word) DIGIT_BIT);
+			tmp |= ((pstm_word) x.dp[i - 1]);
+#if defined(USE_MATRIX_DIV64) && defined(PSTM_32BIT)
+			psDiv64(&tmp, y.dp[t]);
+#elif defined(USE_MATRIX_DIV128) && defined(PSTM_64BIT)
+			psDiv128(&tmp, y.dp[t]);
+#else
+			tmp /= ((pstm_word) y.dp[t]);
+#endif /* USE_MATRIX_DIV64 */
+			q.dp[i - t - 1] = (pstm_digit) (tmp);
+		}
+
+		/* while (q{i-t-1} * (yt * b + y{t-1})) >
+			 xi * b**2 + xi-1 * b + xi-2
+
+			do q{i-t-1} -= 1;
+		*/
+		q.dp[i - t - 1] = (q.dp[i - t - 1] + 1);
+		do {
+			q.dp[i - t - 1] = (q.dp[i - t - 1] - 1);
+
+			/* find left hand */
+			pstm_zero (&t1);
+			t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
+			t1.dp[1] = y.dp[t];
+			t1.used = 2;
+			if ((res = pstm_mul_d (&t1, q.dp[i - t - 1], &t1)) != PSTM_OKAY) {
+				goto LBL_Q;
+			}
+
+			/* find right hand */
+			t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
+			t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
+			t2.dp[2] = x.dp[i];
+			t2.used = 3;
+		} while (pstm_cmp_mag(&t1, &t2) == PSTM_GT);
+
+		/* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
+		if ((res = pstm_mul_d(&y, q.dp[i - t - 1], &t1)) != PSTM_OKAY) {
+			goto LBL_Q;
+		}
+
+		if ((res = pstm_lshd(&t1, i - t - 1)) != PSTM_OKAY) {
+			goto LBL_Q;
+		}
+
+		if ((res = pstm_sub(&x, &t1, &x)) != PSTM_OKAY) {
+			goto LBL_Q;
+		}
+
+		/* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
+		if (x.sign == PSTM_NEG) {
+			if ((res = pstm_copy(&y, &t1)) != PSTM_OKAY) {
+				goto LBL_Q;
+			}
+			if ((res = pstm_lshd (&t1, i - t - 1)) != PSTM_OKAY) {
+				goto LBL_Q;
+			}
+			if ((res = pstm_add (&x, &t1, &x)) != PSTM_OKAY) {
+				goto LBL_Q;
+			}
+			q.dp[i - t - 1] = q.dp[i - t - 1] - 1;
+		}
+	}
+/*
+	now q is the quotient and x is the remainder (which we have to normalize)
+*/
+	/* get sign before writing to c */
+	x.sign = x.used == 0 ? PSTM_ZPOS : a->sign;
+
+	if (c != NULL) {
+		pstm_clamp (&q);
+		if (pstm_copy (&q, c) != PSTM_OKAY) {
+			res = PS_MEM_FAIL;
+			goto LBL_Q;
+		}
+		c->sign = neg;
+	}
+
+	if (d != NULL) {
+		if ((res = pstm_div_2d (pool, &x, norm, &x, NULL)) != PSTM_OKAY) {
+			goto LBL_Q;
+		}
+/*
+		the following is a kludge, essentially we were seeing the right
+		remainder but with excess digits that should have been zero
+ */
+		for (i = b->used; i < x.used; i++) {
+			x.dp[i] = 0;
+		}
+		pstm_clamp(&x);
+		if (pstm_copy (&x, d) != PSTM_OKAY) {
+			res = PS_MEM_FAIL;
+			goto LBL_Q;
+		}
+	}
+
+	res = PSTM_OKAY;
+
+LBL_Q:pstm_clear (&q);
+LBL_Y:pstm_clear (&y);
+LBL_X:pstm_clear (&x);
+LBL_T2:pstm_clear (&t2);
+LBL_T1:pstm_clear (&t1);
+
+	return res;
+}
+
+/******************************************************************************/
+/*
+	Swap the elements of two integers, for cases where you can't simply swap
+	the pstm_int pointers around
+*/
+void pstm_exch(pstm_int * a, pstm_int * b)
+{
+	pstm_int		t;
+
+	t	= *a;
+	*a	= *b;
+	*b	= t;
+}
+
+/******************************************************************************/
+/*
+	c = a mod b, 0 <= c < b
+*/
+int32 pstm_mod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	pstm_int	t;
+	int32		err;
+/*
+	Smart-size
+*/
+	if ((err = pstm_init_size(pool, &t, b->alloc)) != PSTM_OKAY) {
+		return err;
+	}
+	if ((err = pstm_div(pool, a, b, NULL, &t)) != PSTM_OKAY) {
+		pstm_clear (&t);
+		return err;
+	}
+	if (t.sign != b->sign) {
+		err = pstm_add(&t, b, c);
+	} else {
+		pstm_exch (&t, c);
+	}
+	pstm_clear (&t);
+	return err;
+}
+
+/******************************************************************************/
+/*
+	d = a * b (mod c)
+*/
+int32 pstm_mulmod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c,
+			pstm_int *d)
+{
+	int32		res;
+	int16		size;
+	pstm_int	tmp;
+
+/*
+	Smart-size pstm_inits.  d is an output that is influenced by this local 't'
+	so don't shrink 'd' if it wants to becuase this will lead to an pstm_grow
+	in RSA operations
+*/
+	size = a->used + b->used + 1;
+	if ((a == d) && (size < a->alloc)) {
+		size = a->alloc;
+	}
+	if ((res = pstm_init_size(pool, &tmp, size)) != PSTM_OKAY) {
+		return res;
+	}
+	if ((res = pstm_mul_comba(pool, a, b, &tmp, NULL, 0)) != PSTM_OKAY) {
+		pstm_clear(&tmp);
+		return res;
+	}
+	res = pstm_mod(pool, &tmp, c, d);
+	pstm_clear(&tmp);
+	return res;
+}
+
+/******************************************************************************/
+/*
+ *	y = g**x (mod b)
+ *	Some restrictions... x must be positive and < b
+ */
+int32 pstm_exptmod(psPool_t *pool, pstm_int *G, pstm_int *X, pstm_int *P,
+			pstm_int *Y)
+{
+	pstm_int	M[32], res; /* Keep this winsize based: (1 << max_winsize) */
+	pstm_digit	buf, mp;
+	pstm_digit	*paD;
+	int32		err, bitbuf;
+	int16		bitcpy, bitcnt, mode, digidx, x, y, winsize;
+	uint32		paDlen;
+
+	/* set window size from what user set as optimization */
+	x = pstm_count_bits(X);
+	if (x < 50) {
+		winsize = 2;
+	} else {
+		winsize = PS_EXPTMOD_WINSIZE;
+	}
+
+	/* now setup montgomery  */
+	if ((err = pstm_montgomery_setup (P, &mp)) != PSTM_OKAY) {
+		return err;
+	}
+
+	/* setup result */
+	if ((err = pstm_init_size(pool, &res, (P->used * 2) + 1)) != PSTM_OKAY) {
+		return err;
+	}
+/*
+	create M table
+	The M table contains powers of the input base, e.g. M[x] = G^x mod P
+	The first half of the table is not computed though except for M[0] and M[1]
+ */
+	/* now we need R mod m */
+	if ((err = pstm_montgomery_calc_normalization (&res, P)) != PSTM_OKAY) {
+		goto LBL_RES;
+	}
+/*
+	init M array
+	init first cell
+ */
+	if ((err = pstm_init_size(pool, &M[1], res.used)) != PSTM_OKAY) {
+		goto LBL_RES;
+	}
+
+	/* now set M[1] to G * R mod m */
+	if (pstm_cmp_mag(P, G) != PSTM_GT) {
+		/* G > P so we reduce it first */
+		if ((err = pstm_mod(pool, G, P, &M[1])) != PSTM_OKAY) {
+			goto LBL_M;
+		}
+	} else {
+		if ((err = pstm_copy(G, &M[1])) != PSTM_OKAY) {
+			goto LBL_M;
+		}
+	}
+	if ((err = pstm_mulmod (pool, &M[1], &res, P, &M[1])) != PSTM_OKAY) {
+		goto LBL_M;
+	}
+/*
+	Pre-allocated digit.  Used for mul, sqr, AND reduce
+*/
+	paDlen = ((M[1].used + 3) * 2) * sizeof(pstm_digit);
+	paD = xzalloc(paDlen);
+/*
+ 	compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times
+ */
+	if (pstm_init_copy(pool, &M[1 << (winsize - 1)], &M[1], 1) != PSTM_OKAY) {
+		err = PS_MEM_FAIL;
+		goto LBL_PAD;
+	}
+	for (x = 0; x < (winsize - 1); x++) {
+		if ((err = pstm_sqr_comba (pool, &M[1 << (winsize - 1)],
+				&M[1 << (winsize - 1)], paD, paDlen)) != PSTM_OKAY) {
+			goto LBL_PAD;
+		}
+		if ((err = pstm_montgomery_reduce(pool, &M[1 << (winsize - 1)], P, mp,
+				paD, paDlen)) != PSTM_OKAY) {
+			goto LBL_PAD;
+		}
+	}
+/*
+	now init the second half of the array
+*/
+	for (x = (1<<(winsize-1)) + 1; x < (1 << winsize); x++) {
+		if ((err = pstm_init_size(pool, &M[x], M[1<<(winsize-1)].alloc + 1))
+				!= PSTM_OKAY) {
+			for (y = 1<<(winsize-1); y < x; y++) {
+				pstm_clear(&M[y]);
+			}
+			goto LBL_PAD;
+		}
+	}
+
+	/* create upper table */
+	for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+		if ((err = pstm_mul_comba(pool, &M[x - 1], &M[1], &M[x], paD, paDlen))
+				!= PSTM_OKAY) {
+			goto LBL_MARRAY;
+		}
+		if ((err = pstm_montgomery_reduce(pool, &M[x], P, mp, paD, paDlen)) !=
+				PSTM_OKAY) {
+			goto LBL_MARRAY;
+		}
+	}
+
+	/* set initial mode and bit cnt */
+	mode   = 0;
+	bitcnt = 1;
+	buf    = 0;
+	digidx = X->used - 1;
+	bitcpy = 0;
+	bitbuf = 0;
+
+	for (;;) {
+		/* grab next digit as required */
+		if (--bitcnt == 0) {
+			/* if digidx == -1 we are out of digits so break */
+			if (digidx == -1) {
+				break;
+			}
+			/* read next digit and reset bitcnt */
+			buf    = X->dp[digidx--];
+			bitcnt = (int32)DIGIT_BIT;
+		}
+
+		/* grab the next msb from the exponent */
+		y     = (pstm_digit)(buf >> (DIGIT_BIT - 1)) & 1;
+		buf <<= (pstm_digit)1;
+/*
+		 If the bit is zero and mode == 0 then we ignore it.
+		 These represent the leading zero bits before the first 1 bit
+		 in the exponent.  Technically this opt is not required but it
+		 does lower the # of trivial squaring/reductions used
+*/
+		if (mode == 0 && y == 0) {
+			continue;
+		}
+
+		/* if the bit is zero and mode == 1 then we square */
+		if (mode == 1 && y == 0) {
+			if ((err = pstm_sqr_comba(pool, &res, &res, paD, paDlen)) !=
+					PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+			if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, paDlen))
+					!= PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+			continue;
+		}
+
+		/* else we add it to the window */
+		bitbuf |= (y << (winsize - ++bitcpy));
+		mode    = 2;
+
+		if (bitcpy == winsize) {
+			/* ok window is filled so square as required and mul square first */
+			for (x = 0; x < winsize; x++) {
+				if ((err = pstm_sqr_comba(pool, &res, &res, paD, paDlen)) !=
+						PSTM_OKAY) {
+					goto LBL_MARRAY;
+				}
+				if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD,
+						paDlen)) != PSTM_OKAY) {
+					goto LBL_MARRAY;
+				}
+			}
+
+			/* then multiply */
+			if ((err = pstm_mul_comba(pool, &res, &M[bitbuf], &res, paD,
+					paDlen)) != PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+			if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, paDlen))
+					!= PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+
+			/* empty window and reset */
+			bitcpy = 0;
+			bitbuf = 0;
+			mode   = 1;
+		}
+	}
+
+	/* if bits remain then square/multiply */
+	if (mode == 2 && bitcpy > 0) {
+		/* square then multiply if the bit is set */
+		for (x = 0; x < bitcpy; x++) {
+			if ((err = pstm_sqr_comba(pool, &res, &res, paD, paDlen)) !=
+					PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+			if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, paDlen))
+					!= PSTM_OKAY) {
+				goto LBL_MARRAY;
+			}
+
+			/* get next bit of the window */
+			bitbuf <<= 1;
+			if ((bitbuf & (1 << winsize)) != 0) {
+			/* then multiply */
+				if ((err = pstm_mul_comba(pool, &res, &M[1], &res, paD, paDlen))
+						!= PSTM_OKAY) {
+					goto LBL_MARRAY;
+				}
+				if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD,
+						paDlen)) != PSTM_OKAY) {
+					goto LBL_MARRAY;
+				}
+			}
+		}
+	}
+/*
+	Fix up result if Montgomery reduction is used recall that any value in a
+	Montgomery system is actually multiplied by R mod n.  So we have to reduce
+	one more time to cancel out the factor of R.
+*/
+	if ((err = pstm_montgomery_reduce(pool, &res, P, mp, paD, paDlen)) !=
+			PSTM_OKAY) {
+		goto LBL_MARRAY;
+	}
+	/* swap res with Y */
+	if ((err = pstm_copy (&res, Y)) != PSTM_OKAY) {
+		goto LBL_MARRAY;
+	}
+	err = PSTM_OKAY;
+LBL_MARRAY:
+	for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+		pstm_clear(&M[x]);
+	}
+LBL_PAD:psFree(paD, pool);
+LBL_M: pstm_clear(&M[1]);
+LBL_RES:pstm_clear(&res);
+	return err;
+}
+
+/******************************************************************************/
+/*
+
+*/
+int32 pstm_add(pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	int32	res;
+	int16	sa, sb;
+
+	/* get sign of both inputs */
+	sa = a->sign;
+	sb = b->sign;
+
+	/* handle two cases, not four */
+	if (sa == sb) {
+		/* both positive or both negative, add their mags, copy the sign */
+		c->sign = sa;
+		if ((res = s_pstm_add (a, b, c)) != PSTM_OKAY) {
+			return res;
+		}
+  } else {
+/*
+		one positive, the other negative
+		subtract the one with the greater magnitude from the one of the lesser
+		magnitude. The result gets the sign of the one with the greater mag.
+ */
+		if (pstm_cmp_mag (a, b) == PSTM_LT) {
+			c->sign = sb;
+			if ((res = s_pstm_sub (b, a, c)) != PSTM_OKAY) {
+				return res;
+			}
+		} else {
+			c->sign = sa;
+			if ((res = s_pstm_sub (a, b, c)) != PSTM_OKAY) {
+				return res;
+			}
+		}
+	}
+	return PS_SUCCESS;
+}
+
+/******************************************************************************/
+/*
+	reverse an array, used for radix code
+*/
+static void pstm_reverse (unsigned char *s, int16 len)
+{
+	int32     ix, iy;
+	unsigned char t;
+
+	ix = 0;
+	iy = len - 1;
+	while (ix < iy) {
+		t     = s[ix];
+		s[ix] = s[iy];
+		s[iy] = t;
+		++ix;
+		--iy;
+	}
+}
+/******************************************************************************/
+/*
+	No reverse.  Useful in some of the EIP-154 PKA stuff where special byte
+	order seems to come into play more often
+*/
+int32 pstm_to_unsigned_bin_nr(psPool_t *pool, pstm_int *a, unsigned char *b)
+{
+	int32     res;
+	int16     x;
+	pstm_int  t = { 0 };
+
+	if ((res = pstm_init_copy(pool, &t, a, 0)) != PSTM_OKAY) {
+		return res;
+	}
+
+	x = 0;
+	while (pstm_iszero (&t) == 0) {
+		b[x++] = (unsigned char) (t.dp[0] & 255);
+		if ((res = pstm_div_2d (pool, &t, 8, &t, NULL)) != PSTM_OKAY) {
+			pstm_clear(&t);
+			return res;
+		}
+	}
+	pstm_clear(&t);
+	return PS_SUCCESS;
+}
+/******************************************************************************/
+/*
+
+*/
+int32 pstm_to_unsigned_bin(psPool_t *pool, pstm_int *a, unsigned char *b)
+{
+	int32     res;
+	int16     x;
+	pstm_int  t = { 0 };
+
+	if ((res = pstm_init_copy(pool, &t, a, 0)) != PSTM_OKAY) {
+		return res;
+	}
+
+	x = 0;
+	while (pstm_iszero (&t) == 0) {
+		b[x++] = (unsigned char) (t.dp[0] & 255);
+		if ((res = pstm_div_2d (pool, &t, 8, &t, NULL)) != PSTM_OKAY) {
+			pstm_clear(&t);
+			return res;
+		}
+	}
+	pstm_reverse (b, x);
+	pstm_clear(&t);
+	return PS_SUCCESS;
+}
+
+/******************************************************************************/
+/*
+	compare against a single digit
+*/
+int32 pstm_cmp_d(pstm_int *a, pstm_digit b)
+{
+	/* compare based on sign */
+	if ((b && a->used == 0) || a->sign == PSTM_NEG) {
+		return PSTM_LT;
+	}
+
+	/* compare based on magnitude */
+	if (a->used > 1) {
+		return PSTM_GT;
+	}
+
+	/* compare the only digit of a to b */
+	if (a->dp[0] > b) {
+		return PSTM_GT;
+	} else if (a->dp[0] < b) {
+		return PSTM_LT;
+	} else {
+		return PSTM_EQ;
+	}
+}
+
+/*
+	Need invmod for ECC and also private key loading for hardware crypto
+	in cases where dQ > dP.  The values must be switched and a new qP must be
+	calculated using this function
+*/
+static int32 pstm_invmod_slow(psPool_t *pool, pstm_int * a, pstm_int * b,
+				pstm_int * c)
+{
+	pstm_int  x, y, u, v, A, B, C, D;
+	int32     res;
+
+	/* b cannot be negative */
+	if (b->sign == PSTM_NEG || pstm_iszero(b) == 1) {
+		return PS_LIMIT_FAIL;
+	}
+
+	/* init temps */
+	if (pstm_init_size(pool, &x, b->used) != PSTM_OKAY) {
+		return PS_MEM_FAIL;
+	}
+
+	/* x = a, y = b */
+	if ((res = pstm_mod(pool, a, b, &x)) != PSTM_OKAY) {
+		goto LBL_X;
+	}
+
+	if (pstm_init_copy(pool, &y, b, 0) != PSTM_OKAY) {
+		goto LBL_X;
+	}
+
+	/* 2. [modified] if x,y are both even then return an error! */
+	if (pstm_iseven (&x) == 1 && pstm_iseven (&y) == 1) {
+		res = PS_FAILURE;
+		goto LBL_Y;
+	}
+
+	/* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+	if ((res = pstm_init_copy(pool, &u, &x, 0)) != PSTM_OKAY) {
+		goto LBL_Y;
+	}
+	if ((res = pstm_init_copy(pool, &v, &y, 0)) != PSTM_OKAY) {
+		goto LBL_U;
+	}
+
+	if ((res = pstm_init_size(pool, &A, sizeof(pstm_digit))) != PSTM_OKAY) {
+		goto LBL_V;
+	}
+
+	if ((res = pstm_init_size(pool, &D, sizeof(pstm_digit))) != PSTM_OKAY) {
+		goto LBL_A;
+	}
+	pstm_set (&A, 1);
+	pstm_set (&D, 1);
+
+	if ((res = pstm_init(pool, &B)) != PSTM_OKAY) {
+		goto LBL_D;
+	}
+	if ((res = pstm_init(pool, &C)) != PSTM_OKAY) {
+		goto LBL_B;
+	}
+
+top:
+	/* 4.  while u is even do */
+	while (pstm_iseven (&u) == 1) {
+		/* 4.1 u = u/2 */
+		if ((res = pstm_div_2 (&u, &u)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+
+		/* 4.2 if A or B is odd then */
+		if (pstm_isodd (&A) == 1 || pstm_isodd (&B) == 1) {
+			/* A = (A+y)/2, B = (B-x)/2 */
+			if ((res = pstm_add (&A, &y, &A)) != PSTM_OKAY) {
+				goto LBL_C;
+			}
+			if ((res = pstm_sub (&B, &x, &B)) != PSTM_OKAY) {
+				goto LBL_C;
+			}
+		}
+		/* A = A/2, B = B/2 */
+		if ((res = pstm_div_2 (&A, &A)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+		if ((res = pstm_div_2 (&B, &B)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+	}
+
+	/* 5.  while v is even do */
+	while (pstm_iseven (&v) == 1) {
+		/* 5.1 v = v/2 */
+		if ((res = pstm_div_2 (&v, &v)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+
+		/* 5.2 if C or D is odd then */
+		if (pstm_isodd (&C) == 1 || pstm_isodd (&D) == 1) {
+			/* C = (C+y)/2, D = (D-x)/2 */
+			if ((res = pstm_add (&C, &y, &C)) != PSTM_OKAY) {
+				goto LBL_C;
+			}
+			if ((res = pstm_sub (&D, &x, &D)) != PSTM_OKAY) {
+				goto LBL_C;
+			}
+		}
+		/* C = C/2, D = D/2 */
+		if ((res = pstm_div_2 (&C, &C)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+		if ((res = pstm_div_2 (&D, &D)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+	}
+
+	/* 6.  if u >= v then */
+	if (pstm_cmp (&u, &v) != PSTM_LT) {
+		/* u = u - v, A = A - C, B = B - D */
+		if ((res = pstm_sub (&u, &v, &u)) != PSTM_OKAY) {
+				goto LBL_C;
+		}
+		if ((res = pstm_sub (&A, &C, &A)) != PSTM_OKAY) {
+				goto LBL_C;
+		}
+		if ((res = pstm_sub (&B, &D, &B)) != PSTM_OKAY) {
+				goto LBL_C;
+		}
+	} else {
+		/* v - v - u, C = C - A, D = D - B */
+		if ((res = pstm_sub (&v, &u, &v)) != PSTM_OKAY) {
+				goto LBL_C;
+		}
+		if ((res = pstm_sub (&C, &A, &C)) != PSTM_OKAY) {
+				goto LBL_C;
+		}
+		if ((res = pstm_sub (&D, &B, &D)) != PSTM_OKAY) {
+				goto LBL_C;
+		}
+	}
+
+	/* if not zero goto step 4 */
+	if (pstm_iszero (&u) == 0)
+		goto top;
+
+	/* now a = C, b = D, gcd == g*v */
+
+	/* if v != 1 then there is no inverse */
+	if (pstm_cmp_d (&v, 1) != PSTM_EQ) {
+		res = PS_FAILURE;
+		goto LBL_C;
+	}
+
+	/* if its too low */
+	while (pstm_cmp_d(&C, 0) == PSTM_LT) {
+		if ((res = pstm_add(&C, b, &C)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+	}
+
+	/* too big */
+	while (pstm_cmp_mag(&C, b) != PSTM_LT) {
+		if ((res = pstm_sub(&C, b, &C)) != PSTM_OKAY) {
+			goto LBL_C;
+		}
+	}
+
+	/* C is now the inverse */
+	if ((res = pstm_copy(&C, c)) != PSTM_OKAY) {
+		goto LBL_C;
+	}
+	res = PSTM_OKAY;
+
+LBL_C: pstm_clear(&C);
+LBL_D: pstm_clear(&D);
+LBL_B: pstm_clear(&B);
+LBL_A: pstm_clear(&A);
+LBL_V: pstm_clear(&v);
+LBL_U: pstm_clear(&u);
+LBL_Y: pstm_clear(&y);
+LBL_X: pstm_clear(&x);
+
+	return res;
+}
+
+/* c = 1/a (mod b) for odd b only */
+int32 pstm_invmod(psPool_t *pool, pstm_int *a, pstm_int *b, pstm_int *c)
+{
+	pstm_int	x, y, u, v, B, D;
+	int32		res;
+	uint16		neg, sanity;
+
+	/* 2. [modified] b must be odd   */
+	if (pstm_iseven (b) == 1) {
+		return pstm_invmod_slow(pool, a,b,c);
+	}
+
+	/* x == modulus, y == value to invert */
+	if ((res = pstm_init_copy(pool, &x, b, 0)) != PSTM_OKAY) {
+		return res;
+	}
+
+	if ((res = pstm_init_size(pool, &y, a->alloc)) != PSTM_OKAY) {
+		goto LBL_X;
+	}
+
+	/* we need y = |a| */
+	pstm_abs(a, &y);
+
+	/* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+	if ((res = pstm_init_copy(pool, &u, &x, 0)) != PSTM_OKAY) {
+		goto LBL_Y;
+	}
+	if ((res = pstm_init_copy(pool, &v, &y, 0)) != PSTM_OKAY) {
+		goto LBL_U;
+	}
+	if ((res = pstm_init(pool, &B)) != PSTM_OKAY) {
+		goto LBL_V;
+	}
+	if ((res = pstm_init(pool, &D)) != PSTM_OKAY) {
+		goto LBL_B;
+	}
+
+	pstm_set (&D, 1);
+
+	sanity = 0;
+top:
+	/* 4.  while u is even do */
+	while (pstm_iseven (&u) == 1) {
+		/* 4.1 u = u/2 */
+		if ((res = pstm_div_2 (&u, &u)) != PSTM_OKAY) {
+			goto LBL_D;
+		}
+
+		/* 4.2 if B is odd then */
+		if (pstm_isodd (&B) == 1) {
+			if ((res = pstm_sub (&B, &x, &B)) != PSTM_OKAY) {
+				goto LBL_D;
+			}
+		}
+		/* B = B/2 */
+		if ((res = pstm_div_2 (&B, &B)) !=  PSTM_OKAY) {
+			goto LBL_D;
+		}
+	}
+
+	/* 5.  while v is even do */
+	while (pstm_iseven (&v) == 1) {
+		/* 5.1 v = v/2 */
+		if ((res = pstm_div_2 (&v, &v)) != PSTM_OKAY) {
+			goto LBL_D;
+		}
+		/* 5.2 if D is odd then */
+		if (pstm_isodd (&D) == 1) {
+			/* D = (D-x)/2 */
+			if ((res = pstm_sub (&D, &x, &D)) != PSTM_OKAY) {
+				goto LBL_D;
+			}
+		}
+		/* D = D/2 */
+		if ((res = pstm_div_2 (&D, &D)) !=  PSTM_OKAY) {
+			goto LBL_D;
+		}
+	}
+
+	/* 6.  if u >= v then */
+	if (pstm_cmp (&u, &v) != PSTM_LT) {
+		/* u = u - v, B = B - D */
+		if ((res = pstm_sub (&u, &v, &u)) != PSTM_OKAY) {
+				goto LBL_D;
+		}
+		if ((res = pstm_sub (&B, &D, &B)) != PSTM_OKAY) {
+				goto LBL_D;
+		}
+	} else {
+		/* v - v - u, D = D - B */
+		if ((res = pstm_sub (&v, &u, &v)) != PSTM_OKAY) {
+				goto LBL_D;
+		}
+		if ((res = pstm_sub (&D, &B, &D)) != PSTM_OKAY) {
+				goto LBL_D;
+		}
+	}
+
+	/* if not zero goto step 4 */
+	if (sanity++ > 1000) {
+		res = PS_LIMIT_FAIL;
+		goto LBL_D;
+	}
+	if (pstm_iszero (&u) == 0) {
+		goto top;
+	}
+
+	/* now a = C, b = D, gcd == g*v */
+
+	/* if v != 1 then there is no inverse */
+	if (pstm_cmp_d (&v, 1) != PSTM_EQ) {
+		res = PS_FAILURE;
+		goto LBL_D;
+	}
+
+	/* b is now the inverse */
+	neg = a->sign;
+	while (D.sign == PSTM_NEG) {
+		if ((res = pstm_add (&D, b, &D)) != PSTM_OKAY) {
+			goto LBL_D;
+		}
+	}
+	if ((res = pstm_copy (&D, c)) != PSTM_OKAY) {
+		goto LBL_D;
+	}
+	c->sign = neg;
+	res = PSTM_OKAY;
+
+LBL_D: pstm_clear(&D);
+LBL_B: pstm_clear(&B);
+LBL_V: pstm_clear(&v);
+LBL_U: pstm_clear(&u);
+LBL_Y: pstm_clear(&y);
+LBL_X: pstm_clear(&x);
+	return res;
+}
+#endif /* !DISABLE_PSTM */
+/******************************************************************************/