Mercurial > dropbear
diff bn_fast_s_mp_mul_digs.c @ 2:86e0b50a9b58 libtommath-orig ltm-0.30-orig
ltm 0.30 orig import
author | Matt Johnston <matt@ucc.asn.au> |
---|---|
date | Mon, 31 May 2004 18:25:22 +0000 |
parents | |
children | d29b64170cf0 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bn_fast_s_mp_mul_digs.c Mon May 31 18:25:22 2004 +0000 @@ -0,0 +1,130 @@ +/* LibTomMath, multiple-precision integer library -- Tom St Denis + * + * LibTomMath is a library that provides multiple-precision + * integer arithmetic as well as number theoretic functionality. + * + * The library was designed directly after the MPI library by + * Michael Fromberger but has been written from scratch with + * additional optimizations in place. + * + * The library is free for all purposes without any express + * guarantee it works. + * + * Tom St Denis, [email protected], http://math.libtomcrypt.org + */ +#include <tommath.h> + +/* Fast (comba) multiplier + * + * This is the fast column-array [comba] multiplier. It is + * designed to compute the columns of the product first + * then handle the carries afterwards. This has the effect + * of making the nested loops that compute the columns very + * simple and schedulable on super-scalar processors. + * + * This has been modified to produce a variable number of + * digits of output so if say only a half-product is required + * you don't have to compute the upper half (a feature + * required for fast Barrett reduction). + * + * Based on Algorithm 14.12 on pp.595 of HAC. + * + */ +int +fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) +{ + int olduse, res, pa, ix; + mp_word W[MP_WARRAY]; + + /* grow the destination as required */ + if (c->alloc < digs) { + if ((res = mp_grow (c, digs)) != MP_OKAY) { + return res; + } + } + + /* clear temp buf (the columns) */ + memset (W, 0, sizeof (mp_word) * digs); + + /* calculate the columns */ + pa = a->used; + for (ix = 0; ix < pa; ix++) { + /* this multiplier has been modified to allow you to + * control how many digits of output are produced. + * So at most we want to make upto "digs" digits of output. + * + * this adds products to distinct columns (at ix+iy) of W + * note that each step through the loop is not dependent on + * the previous which means the compiler can easily unroll + * the loop without scheduling problems + */ + { + register mp_digit tmpx, *tmpy; + register mp_word *_W; + register int iy, pb; + + /* alias for the the word on the left e.g. A[ix] * A[iy] */ + tmpx = a->dp[ix]; + + /* alias for the right side */ + tmpy = b->dp; + + /* alias for the columns, each step through the loop adds a new + term to each column + */ + _W = W + ix; + + /* the number of digits is limited by their placement. E.g. + we avoid multiplying digits that will end up above the # of + digits of precision requested + */ + pb = MIN (b->used, digs - ix); + + for (iy = 0; iy < pb; iy++) { + *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++); + } + } + + } + + /* setup dest */ + olduse = c->used; + c->used = digs; + + { + register mp_digit *tmpc; + + /* At this point W[] contains the sums of each column. To get the + * correct result we must take the extra bits from each column and + * carry them down + * + * Note that while this adds extra code to the multiplier it + * saves time since the carry propagation is removed from the + * above nested loop.This has the effect of reducing the work + * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the + * cost of the shifting. On very small numbers this is slower + * but on most cryptographic size numbers it is faster. + * + * In this particular implementation we feed the carries from + * behind which means when the loop terminates we still have one + * last digit to copy + */ + tmpc = c->dp; + for (ix = 1; ix < digs; ix++) { + /* forward the carry from the previous temp */ + W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT)); + + /* now extract the previous digit [below the carry] */ + *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); + } + /* fetch the last digit */ + *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK)); + + /* clear unused digits [that existed in the old copy of c] */ + for (; ix < olduse; ix++) { + *tmpc++ = 0; + } + } + mp_clamp (c); + return MP_OKAY; +}