dropbear: bn_fast_s_mp_sqr.c comparison

comparison bn_fast_s_mp_sqr.c @ 142:d29b64170cf0 libtommath-orig

import of libtommath 0.32

author	Matt Johnston <matt@ucc.asn.au>
date	Sun, 19 Dec 2004 11:33:56 +0000
parents	86e0b50a9b58
children	d8254fc979e9

comparison

equal deleted inserted replaced

-:e1037a1e12e7
+:d29b64170cf0
+#include <tommath.h>
+#ifdef BN_FAST_S_MP_SQR_C
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
 * LibTomMath is a library that provides multiple-precision
 * integer arithmetic as well as number theoretic functionality.
 *
 * The library is free for all purposes without any express
 * guarantee it works.
 *
 * Tom St Denis, [email protected], http://math.libtomcrypt.org
 */
-#include <tommath.h>
 /* fast squaring
 *
 * This is the comba method where the columns of the product
 * are computed first then the carries are computed.  This is
 * because 64-bit shifts are slow!
 *
 * Based on Algorithm 14.16 on pp.597 of HAC.
 *
 */
+/* the gist of squaring...
+you do like mult except the offset of the tmpx [one that starts closer to zero]
+can't equal the offset of tmpy.  So basically you set up iy like before then you min it with
+(ty-tx) so that it never happens.  You double all those you add in the inner loop.
+After that loop you do the squares and add them in.
+Remove W2 and don't memset W
+*/
 int fast_s_mp_sqr (mp_int * a, mp_int * b)
 {
-int     olduse, newused, res, ix, pa;
+int       olduse, res, pa, ix, iz;
-mp_word W2[MP_WARRAY], W[MP_WARRAY];
+mp_digit   W[MP_WARRAY], *tmpx;
+mp_word   W1;
-/* calculate size of product and allocate as required */
+/* grow the destination as required */
-pa = a->used;
+pa = a->used + a->used;
-newused = pa + pa + 1;
+if (b->alloc < pa) {
-if (b->alloc < newused) {
+if ((res = mp_grow (b, pa)) != MP_OKAY) {
-if ((res = mp_grow (b, newused)) != MP_OKAY) {
 return res;
 }
 }
-/* zero temp buffer (columns)
+/* number of output digits to produce */
-* Note that there are two buffers.  Since squaring requires
+W1 = 0;
-* an outer and inner product and the inner product requires
+for (ix = 0; ix <= pa; ix++) {
-* computing a product and doubling it (a relatively expensive
+int      tx, ty, iy;
-* op to perform n**2 times if you don't have to) the inner and
+mp_word  _W;
-* outer products are computed in different buffers.  This way
+mp_digit *tmpy;
-* the inner product can be doubled using n doublings instead of
-* n**2
-*/
-memset (W,  0, newused * sizeof (mp_word));
-memset (W2, 0, newused * sizeof (mp_word));
-/* This computes the inner product.  To simplify the inner N**2 loop
+/* clear counter */
-* the multiplication by two is done afterwards in the N loop.
+_W = 0;
-*/
-for (ix = 0; ix < pa; ix++) {
-/* compute the outer product
-*
-* Note that every outer product is computed
-* for a particular column only once which means that
-* there is no need to do a double precision addition
-* into the W2[] array.
-*/
-W2[ix + ix] = ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
-{
+/* get offsets into the two bignums */
-register mp_digit tmpx, *tmpy;
+ty = MIN(a->used-1, ix);
-register mp_word *_W;
+tx = ix - ty;
-register int iy;
-/* copy of left side */
+/* setup temp aliases */
-tmpx = a->dp[ix];
+tmpx = a->dp + tx;
+tmpy = a->dp + ty;
-/* alias for right side */
+/* this is the number of times the loop will iterate, essentially it's
-tmpy = a->dp + (ix + 1);
+while (tx++ < a->used && ty-- >= 0) { ... }
+*/
+iy = MIN(a->used-tx, ty+1);
-/* the column to store the result in */
+/* now for squaring tx can never equal ty
-_W = W + (ix + ix + 1);
+* we halve the distance since they approach at a rate of 2x
+* and we have to round because odd cases need to be executed
+*/
+iy = MIN(iy, (ty-tx+1)>>1);
-/* inner products */
+/* execute loop */
-for (iy = ix + 1; iy < pa; iy++) {
+for (iz = 0; iz < iy; iz++) {
-*_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
+_W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
 }
-}
+/* double the inner product and add carry */
+_W = _W + _W + W1;
+/* even columns have the square term in them */
+if ((ix&1) == 0) {
+_W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
+}
+/* store it */
+W[ix] = _W;
+/* make next carry */
+W1 = _W >> ((mp_word)DIGIT_BIT);
 }
 /* setup dest */
 olduse  = b->used;
-b->used = newused;
+b->used = a->used+a->used;
-/* now compute digits
-*
-* We have to double the inner product sums, add in the
-* outer product sums, propagate carries and convert
-* to single precision.
-*/
 {
-register mp_digit *tmpb;
+mp_digit *tmpb;
+tmpb = b->dp;
+for (ix = 0; ix < pa; ix++) {
+*tmpb++ = W[ix] & MP_MASK;
+}
-/* double first value, since the inner products are
+/* clear unused digits [that existed in the old copy of b] */
-* half of what they should be
-*/
-W[0] += W[0] + W2[0];
-tmpb = b->dp;
-for (ix = 1; ix < newused; ix++) {
-/* double/add next digit */
-W[ix] += W[ix] + W2[ix];
-/* propagate carry forwards [from the previous digit] */
-W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-/* store the current digit now that the carry isn't
-* needed
-*/
-*tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-}
-/* set the last value.  Note even if the carry is zero
-* this is required since the next step will not zero
-* it if b originally had a value at b->dp[2*a.used]
-*/
-*tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
-/* clear high digits of b if there were any originally */
 for (; ix < olduse; ix++) {
 *tmpb++ = 0;
 }
 }
 mp_clamp (b);
 return MP_OKAY;
 }
+#endif

Mercurial > dropbear

comparison bn_fast_s_mp_sqr.c @ 142:d29b64170cf0 libtommath-orig