1
|
1 /* LibTomMath, multiple-precision integer library -- Tom St Denis |
|
2 * |
|
3 * LibTomMath is a library that provides multiple-precision |
|
4 * integer arithmetic as well as number theoretic functionality. |
|
5 * |
|
6 * The library was designed directly after the MPI library by |
|
7 * Michael Fromberger but has been written from scratch with |
|
8 * additional optimizations in place. |
|
9 * |
|
10 * The library is free for all purposes without any express |
|
11 * guarantee it works. |
|
12 * |
|
13 * Tom St Denis, [email protected], http://math.libtomcrypt.org |
|
14 */ |
|
15 #include <tommath.h> |
|
16 |
|
17 /* fast squaring |
|
18 * |
|
19 * This is the comba method where the columns of the product |
|
20 * are computed first then the carries are computed. This |
|
21 * has the effect of making a very simple inner loop that |
|
22 * is executed the most |
|
23 * |
|
24 * W2 represents the outer products and W the inner. |
|
25 * |
|
26 * A further optimizations is made because the inner |
|
27 * products are of the form "A * B * 2". The *2 part does |
|
28 * not need to be computed until the end which is good |
|
29 * because 64-bit shifts are slow! |
|
30 * |
|
31 * Based on Algorithm 14.16 on pp.597 of HAC. |
|
32 * |
|
33 */ |
|
34 int fast_s_mp_sqr (mp_int * a, mp_int * b) |
|
35 { |
|
36 int olduse, newused, res, ix, pa; |
|
37 mp_word W2[MP_WARRAY], W[MP_WARRAY]; |
|
38 |
|
39 /* calculate size of product and allocate as required */ |
|
40 pa = a->used; |
|
41 newused = pa + pa + 1; |
|
42 if (b->alloc < newused) { |
|
43 if ((res = mp_grow (b, newused)) != MP_OKAY) { |
|
44 return res; |
|
45 } |
|
46 } |
|
47 |
|
48 /* zero temp buffer (columns) |
|
49 * Note that there are two buffers. Since squaring requires |
|
50 * a outer and inner product and the inner product requires |
|
51 * computing a product and doubling it (a relatively expensive |
|
52 * op to perform n**2 times if you don't have to) the inner and |
|
53 * outer products are computed in different buffers. This way |
|
54 * the inner product can be doubled using n doublings instead of |
|
55 * n**2 |
|
56 */ |
|
57 memset (W, 0, newused * sizeof (mp_word)); |
|
58 memset (W2, 0, newused * sizeof (mp_word)); |
|
59 |
|
60 /* This computes the inner product. To simplify the inner N**2 loop |
|
61 * the multiplication by two is done afterwards in the N loop. |
|
62 */ |
|
63 for (ix = 0; ix < pa; ix++) { |
|
64 /* compute the outer product |
|
65 * |
|
66 * Note that every outer product is computed |
|
67 * for a particular column only once which means that |
|
68 * there is no need todo a double precision addition |
|
69 * into the W2[] array. |
|
70 */ |
|
71 W2[ix + ix] = ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]); |
|
72 |
|
73 { |
|
74 register mp_digit tmpx, *tmpy; |
|
75 register mp_word *_W; |
|
76 register int iy; |
|
77 |
|
78 /* copy of left side */ |
|
79 tmpx = a->dp[ix]; |
|
80 |
|
81 /* alias for right side */ |
|
82 tmpy = a->dp + (ix + 1); |
|
83 |
|
84 /* the column to store the result in */ |
|
85 _W = W + (ix + ix + 1); |
|
86 |
|
87 /* inner products */ |
|
88 for (iy = ix + 1; iy < pa; iy++) { |
|
89 *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++); |
|
90 } |
|
91 } |
|
92 } |
|
93 |
|
94 /* setup dest */ |
|
95 olduse = b->used; |
|
96 b->used = newused; |
|
97 |
|
98 /* now compute digits |
|
99 * |
|
100 * We have to double the inner product sums, add in the |
|
101 * outer product sums, propagate carries and convert |
|
102 * to single precision. |
|
103 */ |
|
104 { |
|
105 register mp_digit *tmpb; |
|
106 |
|
107 /* double first value, since the inner products are |
|
108 * half of what they should be |
|
109 */ |
|
110 W[0] += W[0] + W2[0]; |
|
111 |
|
112 tmpb = b->dp; |
|
113 for (ix = 1; ix < newused; ix++) { |
|
114 /* double/add next digit */ |
|
115 W[ix] += W[ix] + W2[ix]; |
|
116 |
|
117 /* propagate carry forwards [from the previous digit] */ |
|
118 W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT)); |
|
119 |
|
120 /* store the current digit now that the carry isn't |
|
121 * needed |
|
122 */ |
|
123 *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); |
|
124 } |
|
125 /* set the last value. Note even if the carry is zero |
|
126 * this is required since the next step will not zero |
|
127 * it if b originally had a value at b->dp[2*a.used] |
|
128 */ |
|
129 *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK)); |
|
130 |
|
131 /* clear high digits of b if there were any originally */ |
|
132 for (; ix < olduse; ix++) { |
|
133 *tmpb++ = 0; |
|
134 } |
|
135 } |
|
136 |
|
137 mp_clamp (b); |
|
138 return MP_OKAY; |
|
139 } |