Mercurial > dropbear
comparison bn_fast_s_mp_sqr.c @ 2:86e0b50a9b58 libtommath-orig ltm-0.30-orig
ltm 0.30 orig import
author | Matt Johnston <matt@ucc.asn.au> |
---|---|
date | Mon, 31 May 2004 18:25:22 +0000 |
parents | |
children | d29b64170cf0 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 2:86e0b50a9b58 |
---|---|
1 /* LibTomMath, multiple-precision integer library -- Tom St Denis | |
2 * | |
3 * LibTomMath is a library that provides multiple-precision | |
4 * integer arithmetic as well as number theoretic functionality. | |
5 * | |
6 * The library was designed directly after the MPI library by | |
7 * Michael Fromberger but has been written from scratch with | |
8 * additional optimizations in place. | |
9 * | |
10 * The library is free for all purposes without any express | |
11 * guarantee it works. | |
12 * | |
13 * Tom St Denis, [email protected], http://math.libtomcrypt.org | |
14 */ | |
15 #include <tommath.h> | |
16 | |
17 /* fast squaring | |
18 * | |
19 * This is the comba method where the columns of the product | |
20 * are computed first then the carries are computed. This | |
21 * has the effect of making a very simple inner loop that | |
22 * is executed the most | |
23 * | |
24 * W2 represents the outer products and W the inner. | |
25 * | |
26 * A further optimizations is made because the inner | |
27 * products are of the form "A * B * 2". The *2 part does | |
28 * not need to be computed until the end which is good | |
29 * because 64-bit shifts are slow! | |
30 * | |
31 * Based on Algorithm 14.16 on pp.597 of HAC. | |
32 * | |
33 */ | |
34 int fast_s_mp_sqr (mp_int * a, mp_int * b) | |
35 { | |
36 int olduse, newused, res, ix, pa; | |
37 mp_word W2[MP_WARRAY], W[MP_WARRAY]; | |
38 | |
39 /* calculate size of product and allocate as required */ | |
40 pa = a->used; | |
41 newused = pa + pa + 1; | |
42 if (b->alloc < newused) { | |
43 if ((res = mp_grow (b, newused)) != MP_OKAY) { | |
44 return res; | |
45 } | |
46 } | |
47 | |
48 /* zero temp buffer (columns) | |
49 * Note that there are two buffers. Since squaring requires | |
50 * a outer and inner product and the inner product requires | |
51 * computing a product and doubling it (a relatively expensive | |
52 * op to perform n**2 times if you don't have to) the inner and | |
53 * outer products are computed in different buffers. This way | |
54 * the inner product can be doubled using n doublings instead of | |
55 * n**2 | |
56 */ | |
57 memset (W, 0, newused * sizeof (mp_word)); | |
58 memset (W2, 0, newused * sizeof (mp_word)); | |
59 | |
60 /* This computes the inner product. To simplify the inner N**2 loop | |
61 * the multiplication by two is done afterwards in the N loop. | |
62 */ | |
63 for (ix = 0; ix < pa; ix++) { | |
64 /* compute the outer product | |
65 * | |
66 * Note that every outer product is computed | |
67 * for a particular column only once which means that | |
68 * there is no need todo a double precision addition | |
69 * into the W2[] array. | |
70 */ | |
71 W2[ix + ix] = ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]); | |
72 | |
73 { | |
74 register mp_digit tmpx, *tmpy; | |
75 register mp_word *_W; | |
76 register int iy; | |
77 | |
78 /* copy of left side */ | |
79 tmpx = a->dp[ix]; | |
80 | |
81 /* alias for right side */ | |
82 tmpy = a->dp + (ix + 1); | |
83 | |
84 /* the column to store the result in */ | |
85 _W = W + (ix + ix + 1); | |
86 | |
87 /* inner products */ | |
88 for (iy = ix + 1; iy < pa; iy++) { | |
89 *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++); | |
90 } | |
91 } | |
92 } | |
93 | |
94 /* setup dest */ | |
95 olduse = b->used; | |
96 b->used = newused; | |
97 | |
98 /* now compute digits | |
99 * | |
100 * We have to double the inner product sums, add in the | |
101 * outer product sums, propagate carries and convert | |
102 * to single precision. | |
103 */ | |
104 { | |
105 register mp_digit *tmpb; | |
106 | |
107 /* double first value, since the inner products are | |
108 * half of what they should be | |
109 */ | |
110 W[0] += W[0] + W2[0]; | |
111 | |
112 tmpb = b->dp; | |
113 for (ix = 1; ix < newused; ix++) { | |
114 /* double/add next digit */ | |
115 W[ix] += W[ix] + W2[ix]; | |
116 | |
117 /* propagate carry forwards [from the previous digit] */ | |
118 W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT)); | |
119 | |
120 /* store the current digit now that the carry isn't | |
121 * needed | |
122 */ | |
123 *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); | |
124 } | |
125 /* set the last value. Note even if the carry is zero | |
126 * this is required since the next step will not zero | |
127 * it if b originally had a value at b->dp[2*a.used] | |
128 */ | |
129 *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK)); | |
130 | |
131 /* clear high digits of b if there were any originally */ | |
132 for (; ix < olduse; ix++) { | |
133 *tmpb++ = 0; | |
134 } | |
135 } | |
136 | |
137 mp_clamp (b); | |
138 return MP_OKAY; | |
139 } |