Mercurial > dropbear
comparison bn_fast_s_mp_sqr.c @ 142:d29b64170cf0 libtommath-orig
import of libtommath 0.32
author | Matt Johnston <matt@ucc.asn.au> |
---|---|
date | Sun, 19 Dec 2004 11:33:56 +0000 |
parents | 86e0b50a9b58 |
children | d8254fc979e9 |
comparison
equal
deleted
inserted
replaced
19:e1037a1e12e7 | 142:d29b64170cf0 |
---|---|
1 #include <tommath.h> | |
2 #ifdef BN_FAST_S_MP_SQR_C | |
1 /* LibTomMath, multiple-precision integer library -- Tom St Denis | 3 /* LibTomMath, multiple-precision integer library -- Tom St Denis |
2 * | 4 * |
3 * LibTomMath is a library that provides multiple-precision | 5 * LibTomMath is a library that provides multiple-precision |
4 * integer arithmetic as well as number theoretic functionality. | 6 * integer arithmetic as well as number theoretic functionality. |
5 * | 7 * |
10 * The library is free for all purposes without any express | 12 * The library is free for all purposes without any express |
11 * guarantee it works. | 13 * guarantee it works. |
12 * | 14 * |
13 * Tom St Denis, [email protected], http://math.libtomcrypt.org | 15 * Tom St Denis, [email protected], http://math.libtomcrypt.org |
14 */ | 16 */ |
15 #include <tommath.h> | |
16 | 17 |
17 /* fast squaring | 18 /* fast squaring |
18 * | 19 * |
19 * This is the comba method where the columns of the product | 20 * This is the comba method where the columns of the product |
20 * are computed first then the carries are computed. This | 21 * are computed first then the carries are computed. This |
29 * because 64-bit shifts are slow! | 30 * because 64-bit shifts are slow! |
30 * | 31 * |
31 * Based on Algorithm 14.16 on pp.597 of HAC. | 32 * Based on Algorithm 14.16 on pp.597 of HAC. |
32 * | 33 * |
33 */ | 34 */ |
35 /* the jist of squaring... | |
36 | |
37 you do like mult except the offset of the tmpx [one that starts closer to zero] | |
38 can't equal the offset of tmpy. So basically you set up iy like before then you min it with | |
39 (ty-tx) so that it never happens. You double all those you add in the inner loop | |
40 | |
41 After that loop you do the squares and add them in. | |
42 | |
43 Remove W2 and don't memset W | |
44 | |
45 */ | |
46 | |
34 int fast_s_mp_sqr (mp_int * a, mp_int * b) | 47 int fast_s_mp_sqr (mp_int * a, mp_int * b) |
35 { | 48 { |
36 int olduse, newused, res, ix, pa; | 49 int olduse, res, pa, ix, iz; |
37 mp_word W2[MP_WARRAY], W[MP_WARRAY]; | 50 mp_digit W[MP_WARRAY], *tmpx; |
51 mp_word W1; | |
38 | 52 |
39 /* calculate size of product and allocate as required */ | 53 /* grow the destination as required */ |
40 pa = a->used; | 54 pa = a->used + a->used; |
41 newused = pa + pa + 1; | 55 if (b->alloc < pa) { |
42 if (b->alloc < newused) { | 56 if ((res = mp_grow (b, pa)) != MP_OKAY) { |
43 if ((res = mp_grow (b, newused)) != MP_OKAY) { | |
44 return res; | 57 return res; |
45 } | 58 } |
46 } | 59 } |
47 | 60 |
48 /* zero temp buffer (columns) | 61 /* number of output digits to produce */ |
49 * Note that there are two buffers. Since squaring requires | 62 W1 = 0; |
50 * a outer and inner product and the inner product requires | 63 for (ix = 0; ix <= pa; ix++) { |
51 * computing a product and doubling it (a relatively expensive | 64 int tx, ty, iy; |
52 * op to perform n**2 times if you don't have to) the inner and | 65 mp_word _W; |
53 * outer products are computed in different buffers. This way | 66 mp_digit *tmpy; |
54 * the inner product can be doubled using n doublings instead of | |
55 * n**2 | |
56 */ | |
57 memset (W, 0, newused * sizeof (mp_word)); | |
58 memset (W2, 0, newused * sizeof (mp_word)); | |
59 | 67 |
60 /* This computes the inner product. To simplify the inner N**2 loop | 68 /* clear counter */ |
61 * the multiplication by two is done afterwards in the N loop. | 69 _W = 0; |
62 */ | |
63 for (ix = 0; ix < pa; ix++) { | |
64 /* compute the outer product | |
65 * | |
66 * Note that every outer product is computed | |
67 * for a particular column only once which means that | |
68 * there is no need todo a double precision addition | |
69 * into the W2[] array. | |
70 */ | |
71 W2[ix + ix] = ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]); | |
72 | 70 |
73 { | 71 /* get offsets into the two bignums */ |
74 register mp_digit tmpx, *tmpy; | 72 ty = MIN(a->used-1, ix); |
75 register mp_word *_W; | 73 tx = ix - ty; |
76 register int iy; | |
77 | 74 |
78 /* copy of left side */ | 75 /* setup temp aliases */ |
79 tmpx = a->dp[ix]; | 76 tmpx = a->dp + tx; |
77 tmpy = a->dp + ty; | |
80 | 78 |
81 /* alias for right side */ | 79 /* this is the number of times the loop will iterrate, essentially its |
82 tmpy = a->dp + (ix + 1); | 80 while (tx++ < a->used && ty-- >= 0) { ... } |
81 */ | |
82 iy = MIN(a->used-tx, ty+1); | |
83 | 83 |
84 /* the column to store the result in */ | 84 /* now for squaring tx can never equal ty |
85 _W = W + (ix + ix + 1); | 85 * we halve the distance since they approach at a rate of 2x |
86 * and we have to round because odd cases need to be executed | |
87 */ | |
88 iy = MIN(iy, (ty-tx+1)>>1); | |
86 | 89 |
87 /* inner products */ | 90 /* execute loop */ |
88 for (iy = ix + 1; iy < pa; iy++) { | 91 for (iz = 0; iz < iy; iz++) { |
89 *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++); | 92 _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--); |
90 } | 93 } |
91 } | 94 |
95 /* double the inner product and add carry */ | |
96 _W = _W + _W + W1; | |
97 | |
98 /* even columns have the square term in them */ | |
99 if ((ix&1) == 0) { | |
100 _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]); | |
101 } | |
102 | |
103 /* store it */ | |
104 W[ix] = _W; | |
105 | |
106 /* make next carry */ | |
107 W1 = _W >> ((mp_word)DIGIT_BIT); | |
92 } | 108 } |
93 | 109 |
94 /* setup dest */ | 110 /* setup dest */ |
95 olduse = b->used; | 111 olduse = b->used; |
96 b->used = newused; | 112 b->used = a->used+a->used; |
97 | 113 |
98 /* now compute digits | |
99 * | |
100 * We have to double the inner product sums, add in the | |
101 * outer product sums, propagate carries and convert | |
102 * to single precision. | |
103 */ | |
104 { | 114 { |
105 register mp_digit *tmpb; | 115 mp_digit *tmpb; |
116 tmpb = b->dp; | |
117 for (ix = 0; ix < pa; ix++) { | |
118 *tmpb++ = W[ix] & MP_MASK; | |
119 } | |
106 | 120 |
107 /* double first value, since the inner products are | 121 /* clear unused digits [that existed in the old copy of c] */ |
108 * half of what they should be | |
109 */ | |
110 W[0] += W[0] + W2[0]; | |
111 | |
112 tmpb = b->dp; | |
113 for (ix = 1; ix < newused; ix++) { | |
114 /* double/add next digit */ | |
115 W[ix] += W[ix] + W2[ix]; | |
116 | |
117 /* propagate carry forwards [from the previous digit] */ | |
118 W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT)); | |
119 | |
120 /* store the current digit now that the carry isn't | |
121 * needed | |
122 */ | |
123 *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); | |
124 } | |
125 /* set the last value. Note even if the carry is zero | |
126 * this is required since the next step will not zero | |
127 * it if b originally had a value at b->dp[2*a.used] | |
128 */ | |
129 *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK)); | |
130 | |
131 /* clear high digits of b if there were any originally */ | |
132 for (; ix < olduse; ix++) { | 122 for (; ix < olduse; ix++) { |
133 *tmpb++ = 0; | 123 *tmpb++ = 0; |
134 } | 124 } |
135 } | 125 } |
136 | |
137 mp_clamp (b); | 126 mp_clamp (b); |
138 return MP_OKAY; | 127 return MP_OKAY; |
139 } | 128 } |
129 #endif |