2
|
1 /* LibTomMath, multiple-precision integer library -- Tom St Denis |
|
2 * |
|
3 * LibTomMath is a library that provides multiple-precision |
|
4 * integer arithmetic as well as number theoretic functionality. |
|
5 * |
|
6 * The library was designed directly after the MPI library by |
|
7 * Michael Fromberger but has been written from scratch with |
|
8 * additional optimizations in place. |
|
9 * |
|
10 * The library is free for all purposes without any express |
|
11 * guarantee it works. |
|
12 * |
|
13 * Tom St Denis, [email protected], http://math.libtomcrypt.org |
|
14 */ |
|
15 #include <tommath.h> |
|
16 |
|
17 /* Fast (comba) multiplier |
|
18 * |
|
19 * This is the fast column-array [comba] multiplier. It is |
|
20 * designed to compute the columns of the product first |
|
21 * then handle the carries afterwards. This has the effect |
|
22 * of making the nested loops that compute the columns very |
|
23 * simple and schedulable on super-scalar processors. |
|
24 * |
|
25 * This has been modified to produce a variable number of |
|
26 * digits of output so if say only a half-product is required |
|
27 * you don't have to compute the upper half (a feature |
|
28 * required for fast Barrett reduction). |
|
29 * |
|
30 * Based on Algorithm 14.12 on pp.595 of HAC. |
|
31 * |
|
32 */ |
|
33 int |
|
34 fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) |
|
35 { |
|
36 int olduse, res, pa, ix; |
|
37 mp_word W[MP_WARRAY]; |
|
38 |
|
39 /* grow the destination as required */ |
|
40 if (c->alloc < digs) { |
|
41 if ((res = mp_grow (c, digs)) != MP_OKAY) { |
|
42 return res; |
|
43 } |
|
44 } |
|
45 |
|
46 /* clear temp buf (the columns) */ |
|
47 memset (W, 0, sizeof (mp_word) * digs); |
|
48 |
|
49 /* calculate the columns */ |
|
50 pa = a->used; |
|
51 for (ix = 0; ix < pa; ix++) { |
|
52 /* this multiplier has been modified to allow you to |
|
53 * control how many digits of output are produced. |
|
54 * So at most we want to make upto "digs" digits of output. |
|
55 * |
|
56 * this adds products to distinct columns (at ix+iy) of W |
|
57 * note that each step through the loop is not dependent on |
|
58 * the previous which means the compiler can easily unroll |
|
59 * the loop without scheduling problems |
|
60 */ |
|
61 { |
|
62 register mp_digit tmpx, *tmpy; |
|
63 register mp_word *_W; |
|
64 register int iy, pb; |
|
65 |
|
66 /* alias for the the word on the left e.g. A[ix] * A[iy] */ |
|
67 tmpx = a->dp[ix]; |
|
68 |
|
69 /* alias for the right side */ |
|
70 tmpy = b->dp; |
|
71 |
|
72 /* alias for the columns, each step through the loop adds a new |
|
73 term to each column |
|
74 */ |
|
75 _W = W + ix; |
|
76 |
|
77 /* the number of digits is limited by their placement. E.g. |
|
78 we avoid multiplying digits that will end up above the # of |
|
79 digits of precision requested |
|
80 */ |
|
81 pb = MIN (b->used, digs - ix); |
|
82 |
|
83 for (iy = 0; iy < pb; iy++) { |
|
84 *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++); |
|
85 } |
|
86 } |
|
87 |
|
88 } |
|
89 |
|
90 /* setup dest */ |
|
91 olduse = c->used; |
|
92 c->used = digs; |
|
93 |
|
94 { |
|
95 register mp_digit *tmpc; |
|
96 |
|
97 /* At this point W[] contains the sums of each column. To get the |
|
98 * correct result we must take the extra bits from each column and |
|
99 * carry them down |
|
100 * |
|
101 * Note that while this adds extra code to the multiplier it |
|
102 * saves time since the carry propagation is removed from the |
|
103 * above nested loop.This has the effect of reducing the work |
|
104 * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the |
|
105 * cost of the shifting. On very small numbers this is slower |
|
106 * but on most cryptographic size numbers it is faster. |
|
107 * |
|
108 * In this particular implementation we feed the carries from |
|
109 * behind which means when the loop terminates we still have one |
|
110 * last digit to copy |
|
111 */ |
|
112 tmpc = c->dp; |
|
113 for (ix = 1; ix < digs; ix++) { |
|
114 /* forward the carry from the previous temp */ |
|
115 W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT)); |
|
116 |
|
117 /* now extract the previous digit [below the carry] */ |
|
118 *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); |
|
119 } |
|
120 /* fetch the last digit */ |
|
121 *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK)); |
|
122 |
|
123 /* clear unused digits [that existed in the old copy of c] */ |
|
124 for (; ix < olduse; ix++) { |
|
125 *tmpc++ = 0; |
|
126 } |
|
127 } |
|
128 mp_clamp (c); |
|
129 return MP_OKAY; |
|
130 } |