Mercurial > dropbear
comparison bn_fast_s_mp_mul_digs.c @ 2:86e0b50a9b58 libtommath-orig ltm-0.30-orig
ltm 0.30 orig import
author | Matt Johnston <matt@ucc.asn.au> |
---|---|
date | Mon, 31 May 2004 18:25:22 +0000 |
parents | |
children | d29b64170cf0 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 2:86e0b50a9b58 |
---|---|
1 /* LibTomMath, multiple-precision integer library -- Tom St Denis | |
2 * | |
3 * LibTomMath is a library that provides multiple-precision | |
4 * integer arithmetic as well as number theoretic functionality. | |
5 * | |
6 * The library was designed directly after the MPI library by | |
7 * Michael Fromberger but has been written from scratch with | |
8 * additional optimizations in place. | |
9 * | |
10 * The library is free for all purposes without any express | |
11 * guarantee it works. | |
12 * | |
13 * Tom St Denis, [email protected], http://math.libtomcrypt.org | |
14 */ | |
15 #include <tommath.h> | |
16 | |
17 /* Fast (comba) multiplier | |
18 * | |
19 * This is the fast column-array [comba] multiplier. It is | |
20 * designed to compute the columns of the product first | |
21 * then handle the carries afterwards. This has the effect | |
22 * of making the nested loops that compute the columns very | |
23 * simple and schedulable on super-scalar processors. | |
24 * | |
25 * This has been modified to produce a variable number of | |
26 * digits of output so if say only a half-product is required | |
27 * you don't have to compute the upper half (a feature | |
28 * required for fast Barrett reduction). | |
29 * | |
30 * Based on Algorithm 14.12 on pp.595 of HAC. | |
31 * | |
32 */ | |
33 int | |
34 fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) | |
35 { | |
36 int olduse, res, pa, ix; | |
37 mp_word W[MP_WARRAY]; | |
38 | |
39 /* grow the destination as required */ | |
40 if (c->alloc < digs) { | |
41 if ((res = mp_grow (c, digs)) != MP_OKAY) { | |
42 return res; | |
43 } | |
44 } | |
45 | |
46 /* clear temp buf (the columns) */ | |
47 memset (W, 0, sizeof (mp_word) * digs); | |
48 | |
49 /* calculate the columns */ | |
50 pa = a->used; | |
51 for (ix = 0; ix < pa; ix++) { | |
52 /* this multiplier has been modified to allow you to | |
53 * control how many digits of output are produced. | |
54 * So at most we want to make upto "digs" digits of output. | |
55 * | |
56 * this adds products to distinct columns (at ix+iy) of W | |
57 * note that each step through the loop is not dependent on | |
58 * the previous which means the compiler can easily unroll | |
59 * the loop without scheduling problems | |
60 */ | |
61 { | |
62 register mp_digit tmpx, *tmpy; | |
63 register mp_word *_W; | |
64 register int iy, pb; | |
65 | |
66 /* alias for the the word on the left e.g. A[ix] * A[iy] */ | |
67 tmpx = a->dp[ix]; | |
68 | |
69 /* alias for the right side */ | |
70 tmpy = b->dp; | |
71 | |
72 /* alias for the columns, each step through the loop adds a new | |
73 term to each column | |
74 */ | |
75 _W = W + ix; | |
76 | |
77 /* the number of digits is limited by their placement. E.g. | |
78 we avoid multiplying digits that will end up above the # of | |
79 digits of precision requested | |
80 */ | |
81 pb = MIN (b->used, digs - ix); | |
82 | |
83 for (iy = 0; iy < pb; iy++) { | |
84 *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++); | |
85 } | |
86 } | |
87 | |
88 } | |
89 | |
90 /* setup dest */ | |
91 olduse = c->used; | |
92 c->used = digs; | |
93 | |
94 { | |
95 register mp_digit *tmpc; | |
96 | |
97 /* At this point W[] contains the sums of each column. To get the | |
98 * correct result we must take the extra bits from each column and | |
99 * carry them down | |
100 * | |
101 * Note that while this adds extra code to the multiplier it | |
102 * saves time since the carry propagation is removed from the | |
103 * above nested loop.This has the effect of reducing the work | |
104 * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the | |
105 * cost of the shifting. On very small numbers this is slower | |
106 * but on most cryptographic size numbers it is faster. | |
107 * | |
108 * In this particular implementation we feed the carries from | |
109 * behind which means when the loop terminates we still have one | |
110 * last digit to copy | |
111 */ | |
112 tmpc = c->dp; | |
113 for (ix = 1; ix < digs; ix++) { | |
114 /* forward the carry from the previous temp */ | |
115 W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT)); | |
116 | |
117 /* now extract the previous digit [below the carry] */ | |
118 *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); | |
119 } | |
120 /* fetch the last digit */ | |
121 *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK)); | |
122 | |
123 /* clear unused digits [that existed in the old copy of c] */ | |
124 for (; ix < olduse; ix++) { | |
125 *tmpc++ = 0; | |
126 } | |
127 } | |
128 mp_clamp (c); | |
129 return MP_OKAY; | |
130 } |