comparison bn_fast_s_mp_sqr.c @ 142:d29b64170cf0 libtommath-orig

import of libtommath 0.32
author Matt Johnston <matt@ucc.asn.au>
date Sun, 19 Dec 2004 11:33:56 +0000
parents 86e0b50a9b58
children d8254fc979e9
comparison
equal deleted inserted replaced
19:e1037a1e12e7 142:d29b64170cf0
1 #include <tommath.h>
2 #ifdef BN_FAST_S_MP_SQR_C
1 /* LibTomMath, multiple-precision integer library -- Tom St Denis 3 /* LibTomMath, multiple-precision integer library -- Tom St Denis
2 * 4 *
3 * LibTomMath is a library that provides multiple-precision 5 * LibTomMath is a library that provides multiple-precision
4 * integer arithmetic as well as number theoretic functionality. 6 * integer arithmetic as well as number theoretic functionality.
5 * 7 *
10 * The library is free for all purposes without any express 12 * The library is free for all purposes without any express
11 * guarantee it works. 13 * guarantee it works.
12 * 14 *
13 * Tom St Denis, [email protected], http://math.libtomcrypt.org 15 * Tom St Denis, [email protected], http://math.libtomcrypt.org
14 */ 16 */
15 #include <tommath.h>
16 17
17 /* fast squaring 18 /* fast squaring
18 * 19 *
19 * This is the comba method where the columns of the product 20 * This is the comba method where the columns of the product
20 * are computed first then the carries are computed. This 21 * are computed first then the carries are computed. This
29 * because 64-bit shifts are slow! 30 * because 64-bit shifts are slow!
30 * 31 *
31 * Based on Algorithm 14.16 on pp.597 of HAC. 32 * Based on Algorithm 14.16 on pp.597 of HAC.
32 * 33 *
33 */ 34 */
35 /* the jist of squaring...
36
37 you do like mult except the offset of the tmpx [one that starts closer to zero]
38 can't equal the offset of tmpy. So basically you set up iy like before then you min it with
39 (ty-tx) so that it never happens. You double all those you add in the inner loop
40
41 After that loop you do the squares and add them in.
42
43 Remove W2 and don't memset W
44
45 */
46
34 int fast_s_mp_sqr (mp_int * a, mp_int * b) 47 int fast_s_mp_sqr (mp_int * a, mp_int * b)
35 { 48 {
36 int olduse, newused, res, ix, pa; 49 int olduse, res, pa, ix, iz;
37 mp_word W2[MP_WARRAY], W[MP_WARRAY]; 50 mp_digit W[MP_WARRAY], *tmpx;
51 mp_word W1;
38 52
39 /* calculate size of product and allocate as required */ 53 /* grow the destination as required */
40 pa = a->used; 54 pa = a->used + a->used;
41 newused = pa + pa + 1; 55 if (b->alloc < pa) {
42 if (b->alloc < newused) { 56 if ((res = mp_grow (b, pa)) != MP_OKAY) {
43 if ((res = mp_grow (b, newused)) != MP_OKAY) {
44 return res; 57 return res;
45 } 58 }
46 } 59 }
47 60
48 /* zero temp buffer (columns) 61 /* number of output digits to produce */
49 * Note that there are two buffers. Since squaring requires 62 W1 = 0;
50 * a outer and inner product and the inner product requires 63 for (ix = 0; ix <= pa; ix++) {
51 * computing a product and doubling it (a relatively expensive 64 int tx, ty, iy;
52 * op to perform n**2 times if you don't have to) the inner and 65 mp_word _W;
53 * outer products are computed in different buffers. This way 66 mp_digit *tmpy;
54 * the inner product can be doubled using n doublings instead of
55 * n**2
56 */
57 memset (W, 0, newused * sizeof (mp_word));
58 memset (W2, 0, newused * sizeof (mp_word));
59 67
60 /* This computes the inner product. To simplify the inner N**2 loop 68 /* clear counter */
61 * the multiplication by two is done afterwards in the N loop. 69 _W = 0;
62 */
63 for (ix = 0; ix < pa; ix++) {
64 /* compute the outer product
65 *
66 * Note that every outer product is computed
67 * for a particular column only once which means that
68 * there is no need todo a double precision addition
69 * into the W2[] array.
70 */
71 W2[ix + ix] = ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
72 70
73 { 71 /* get offsets into the two bignums */
74 register mp_digit tmpx, *tmpy; 72 ty = MIN(a->used-1, ix);
75 register mp_word *_W; 73 tx = ix - ty;
76 register int iy;
77 74
78 /* copy of left side */ 75 /* setup temp aliases */
79 tmpx = a->dp[ix]; 76 tmpx = a->dp + tx;
77 tmpy = a->dp + ty;
80 78
81 /* alias for right side */ 79 /* this is the number of times the loop will iterrate, essentially its
82 tmpy = a->dp + (ix + 1); 80 while (tx++ < a->used && ty-- >= 0) { ... }
81 */
82 iy = MIN(a->used-tx, ty+1);
83 83
84 /* the column to store the result in */ 84 /* now for squaring tx can never equal ty
85 _W = W + (ix + ix + 1); 85 * we halve the distance since they approach at a rate of 2x
86 * and we have to round because odd cases need to be executed
87 */
88 iy = MIN(iy, (ty-tx+1)>>1);
86 89
87 /* inner products */ 90 /* execute loop */
88 for (iy = ix + 1; iy < pa; iy++) { 91 for (iz = 0; iz < iy; iz++) {
89 *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++); 92 _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
90 } 93 }
91 } 94
95 /* double the inner product and add carry */
96 _W = _W + _W + W1;
97
98 /* even columns have the square term in them */
99 if ((ix&1) == 0) {
100 _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
101 }
102
103 /* store it */
104 W[ix] = _W;
105
106 /* make next carry */
107 W1 = _W >> ((mp_word)DIGIT_BIT);
92 } 108 }
93 109
94 /* setup dest */ 110 /* setup dest */
95 olduse = b->used; 111 olduse = b->used;
96 b->used = newused; 112 b->used = a->used+a->used;
97 113
98 /* now compute digits
99 *
100 * We have to double the inner product sums, add in the
101 * outer product sums, propagate carries and convert
102 * to single precision.
103 */
104 { 114 {
105 register mp_digit *tmpb; 115 mp_digit *tmpb;
116 tmpb = b->dp;
117 for (ix = 0; ix < pa; ix++) {
118 *tmpb++ = W[ix] & MP_MASK;
119 }
106 120
107 /* double first value, since the inner products are 121 /* clear unused digits [that existed in the old copy of c] */
108 * half of what they should be
109 */
110 W[0] += W[0] + W2[0];
111
112 tmpb = b->dp;
113 for (ix = 1; ix < newused; ix++) {
114 /* double/add next digit */
115 W[ix] += W[ix] + W2[ix];
116
117 /* propagate carry forwards [from the previous digit] */
118 W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
119
120 /* store the current digit now that the carry isn't
121 * needed
122 */
123 *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
124 }
125 /* set the last value. Note even if the carry is zero
126 * this is required since the next step will not zero
127 * it if b originally had a value at b->dp[2*a.used]
128 */
129 *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
130
131 /* clear high digits of b if there were any originally */
132 for (; ix < olduse; ix++) { 122 for (; ix < olduse; ix++) {
133 *tmpb++ = 0; 123 *tmpb++ = 0;
134 } 124 }
135 } 125 }
136
137 mp_clamp (b); 126 mp_clamp (b);
138 return MP_OKAY; 127 return MP_OKAY;
139 } 128 }
129 #endif