142
|
1 #include <tommath.h> |
|
2 #ifdef BN_FAST_S_MP_SQR_C |
2
|
3 /* LibTomMath, multiple-precision integer library -- Tom St Denis |
|
4 * |
|
5 * LibTomMath is a library that provides multiple-precision |
|
6 * integer arithmetic as well as number theoretic functionality. |
|
7 * |
|
8 * The library was designed directly after the MPI library by |
|
9 * Michael Fromberger but has been written from scratch with |
|
10 * additional optimizations in place. |
|
11 * |
|
12 * The library is free for all purposes without any express |
|
13 * guarantee it works. |
|
14 * |
|
15 * Tom St Denis, [email protected], http://math.libtomcrypt.org |
|
16 */ |
|
17 |
|
18 /* fast squaring |
|
19 * |
|
20 * This is the comba method where the columns of the product |
|
21 * are computed first then the carries are computed. This |
|
22 * has the effect of making a very simple inner loop that |
|
23 * is executed the most |
|
24 * |
|
25 * W2 represents the outer products and W the inner. |
|
26 * |
|
27 * A further optimizations is made because the inner |
|
28 * products are of the form "A * B * 2". The *2 part does |
|
29 * not need to be computed until the end which is good |
|
30 * because 64-bit shifts are slow! |
|
31 * |
|
32 * Based on Algorithm 14.16 on pp.597 of HAC. |
|
33 * |
|
34 */ |
142
|
35 /* the jist of squaring... |
|
36 |
|
37 you do like mult except the offset of the tmpx [one that starts closer to zero] |
|
38 can't equal the offset of tmpy. So basically you set up iy like before then you min it with |
|
39 (ty-tx) so that it never happens. You double all those you add in the inner loop |
|
40 |
|
41 After that loop you do the squares and add them in. |
|
42 |
|
43 Remove W2 and don't memset W |
|
44 |
|
45 */ |
|
46 |
2
|
47 int fast_s_mp_sqr (mp_int * a, mp_int * b) |
|
48 { |
142
|
49 int olduse, res, pa, ix, iz; |
|
50 mp_digit W[MP_WARRAY], *tmpx; |
|
51 mp_word W1; |
2
|
52 |
142
|
53 /* grow the destination as required */ |
|
54 pa = a->used + a->used; |
|
55 if (b->alloc < pa) { |
|
56 if ((res = mp_grow (b, pa)) != MP_OKAY) { |
2
|
57 return res; |
|
58 } |
|
59 } |
|
60 |
142
|
61 /* number of output digits to produce */ |
|
62 W1 = 0; |
|
63 for (ix = 0; ix <= pa; ix++) { |
|
64 int tx, ty, iy; |
|
65 mp_word _W; |
|
66 mp_digit *tmpy; |
|
67 |
|
68 /* clear counter */ |
|
69 _W = 0; |
|
70 |
|
71 /* get offsets into the two bignums */ |
|
72 ty = MIN(a->used-1, ix); |
|
73 tx = ix - ty; |
|
74 |
|
75 /* setup temp aliases */ |
|
76 tmpx = a->dp + tx; |
|
77 tmpy = a->dp + ty; |
|
78 |
|
79 /* this is the number of times the loop will iterrate, essentially its |
|
80 while (tx++ < a->used && ty-- >= 0) { ... } |
|
81 */ |
|
82 iy = MIN(a->used-tx, ty+1); |
2
|
83 |
142
|
84 /* now for squaring tx can never equal ty |
|
85 * we halve the distance since they approach at a rate of 2x |
|
86 * and we have to round because odd cases need to be executed |
|
87 */ |
|
88 iy = MIN(iy, (ty-tx+1)>>1); |
|
89 |
|
90 /* execute loop */ |
|
91 for (iz = 0; iz < iy; iz++) { |
|
92 _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--); |
|
93 } |
2
|
94 |
142
|
95 /* double the inner product and add carry */ |
|
96 _W = _W + _W + W1; |
2
|
97 |
142
|
98 /* even columns have the square term in them */ |
|
99 if ((ix&1) == 0) { |
|
100 _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]); |
|
101 } |
2
|
102 |
142
|
103 /* store it */ |
|
104 W[ix] = _W; |
|
105 |
|
106 /* make next carry */ |
|
107 W1 = _W >> ((mp_word)DIGIT_BIT); |
2
|
108 } |
|
109 |
|
110 /* setup dest */ |
|
111 olduse = b->used; |
142
|
112 b->used = a->used+a->used; |
2
|
113 |
|
114 { |
142
|
115 mp_digit *tmpb; |
2
|
116 tmpb = b->dp; |
142
|
117 for (ix = 0; ix < pa; ix++) { |
|
118 *tmpb++ = W[ix] & MP_MASK; |
|
119 } |
2
|
120 |
142
|
121 /* clear unused digits [that existed in the old copy of c] */ |
2
|
122 for (; ix < olduse; ix++) { |
|
123 *tmpb++ = 0; |
|
124 } |
|
125 } |
|
126 mp_clamp (b); |
|
127 return MP_OKAY; |
|
128 } |
142
|
129 #endif |