dropbear: tomsfastmath/src/mont/fp_montgomery_reduce.c @ 643:a362b62d38b2 (branch dropbear-tfm)

Add tomsfastmath from git rev bfa4582842bc3bab42e4be4aed5703437049502a
with Makefile.in renamed

author: Matt Johnston <matt@ucc.asn.au>
date:   Wed, 23 Nov 2011 18:10:20 +0700
parent: 642:33fd2f3499d2
/* TomsFastMath, a fast ISO C bignum library.
 *
 * This project is meant to fill in where LibTomMath
 * falls short.  That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
#include <tfm.h>

/******************************************************************/
#if defined(TFM_X86) && !defined(TFM_SSE2)
/* x86-32 code */

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
asm( \
   "movl %5,%%eax \n\t" \
   "mull %4 \n\t" \
   "addl %1,%%eax \n\t" \
   "adcl $0,%%edx \n\t" \
   "addl %%eax,%0 \n\t" \
   "adcl $0,%%edx \n\t" \
   "movl %%edx,%1 \n\t" \
:"=g"(_c[LO]), "=r"(cy) \
:"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \
: "%eax", "%edx", "%cc")

#define PROPCARRY \
asm( \
   "addl %1,%0 \n\t" \
   "setb %%al \n\t" \
   "movzbl %%al,%1 \n\t" \
:"=g"(_c[LO]), "=r"(cy) \
:"0"(_c[LO]), "1"(cy) \
: "%eax", "%cc")

/******************************************************************/
#elif defined(TFM_X86_64)
/* x86-64 code */

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
asm( \
   "movq %5,%%rax \n\t" \
   "mulq %4 \n\t" \
   "addq %1,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "addq %%rax,%0 \n\t" \
   "adcq $0,%%rdx \n\t" \
   "movq %%rdx,%1 \n\t" \
:"=g"(_c[LO]), "=r"(cy) \
:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
: "%rax", "%rdx", "%cc")

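/* INNERMUL8: eight INNERMUL steps unrolled.  %%r10 stages the next digit
   of _c and %%r11 the next word of tmpm, so each load overlaps the
   latency of the preceding mulq. */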
#define INNERMUL8 \
asm( \
   "movq 0(%5),%%rax \n\t" \
   "movq 0(%2),%%r10 \n\t" \
   "movq 0x8(%5),%%r11 \n\t" \
   "mulq %4 \n\t" \
   "addq %%r10,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "movq 0x8(%2),%%r10 \n\t" \
   "addq %3,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "movq %%rax,0(%0) \n\t" \
   "movq %%rdx,%1 \n\t" \
\
   "movq %%r11,%%rax \n\t" \
   "movq 0x10(%5),%%r11 \n\t" \
   "mulq %4 \n\t" \
   "addq %%r10,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "movq 0x10(%2),%%r10 \n\t" \
   "addq %3,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "movq %%rax,0x8(%0) \n\t" \
   "movq %%rdx,%1 \n\t" \
\
   "movq %%r11,%%rax \n\t" \
   "movq 0x18(%5),%%r11 \n\t" \
   "mulq %4 \n\t" \
   "addq %%r10,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "movq 0x18(%2),%%r10 \n\t" \
   "addq %3,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "movq %%rax,0x10(%0) \n\t" \
   "movq %%rdx,%1 \n\t" \
\
   "movq %%r11,%%rax \n\t" \
   "movq 0x20(%5),%%r11 \n\t" \
   "mulq %4 \n\t" \
   "addq %%r10,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "movq 0x20(%2),%%r10 \n\t" \
   "addq %3,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "movq %%rax,0x18(%0) \n\t" \
   "movq %%rdx,%1 \n\t" \
\
   "movq %%r11,%%rax \n\t" \
   "movq 0x28(%5),%%r11 \n\t" \
   "mulq %4 \n\t" \
   "addq %%r10,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "movq 0x28(%2),%%r10 \n\t" \
   "addq %3,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "movq %%rax,0x20(%0) \n\t" \
   "movq %%rdx,%1 \n\t" \
\
   "movq %%r11,%%rax \n\t" \
   "movq 0x30(%5),%%r11 \n\t" \
   "mulq %4 \n\t" \
   "addq %%r10,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "movq 0x30(%2),%%r10 \n\t" \
   "addq %3,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "movq %%rax,0x28(%0) \n\t" \
   "movq %%rdx,%1 \n\t" \
\
   "movq %%r11,%%rax \n\t" \
   "movq 0x38(%5),%%r11 \n\t" \
   "mulq %4 \n\t" \
   "addq %%r10,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "movq 0x38(%2),%%r10 \n\t" \
   "addq %3,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "movq %%rax,0x30(%0) \n\t" \
   "movq %%rdx,%1 \n\t" \
\
   "movq %%r11,%%rax \n\t" \
   "mulq %4 \n\t" \
   "addq %%r10,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "addq %3,%%rax \n\t" \
   "adcq $0,%%rdx \n\t" \
   "movq %%rax,0x38(%0) \n\t" \
   "movq %%rdx,%1 \n\t" \
\
:"=r"(_c), "=r"(cy) \
: "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
: "%rax", "%rdx", "%r10", "%r11", "%cc")


#define PROPCARRY \
asm( \
   "addq %1,%0 \n\t" \
   "setb %%al \n\t" \
   "movzbq %%al,%1 \n\t" \
:"=g"(_c[LO]), "=r"(cy) \
:"0"(_c[LO]), "1"(cy) \
: "%rax", "%cc")

/******************************************************************/
#elif defined(TFM_SSE2)
/* SSE2 code (assumes 32-bit fp_digits) */
/* MMX register assignments (the SSE2 pmuludq form operating on mm regs):
 * mm0  *tmpm++, then mu * (*tmpm++)
 * mm1  c[x], then mu
 * mm2  mp
 * mm3  cy
 * mm4  _c[LO]
 */

#define MONT_START \
   asm("movd %0,%%mm2"::"g"(mp))

#define MONT_FINI \
   asm("emms")

#define LOOP_START \
asm( \
   "movd %0,%%mm1 \n\t" \
   "pxor %%mm3,%%mm3 \n\t" \
   "pmuludq %%mm2,%%mm1 \n\t" \
   :: "g"(c[x]))

/* pmuludq on mmx registers does a 32x32->64 multiply. */
#define INNERMUL \
asm( \
   "movd %1,%%mm4 \n\t" \
   "movd %2,%%mm0 \n\t" \
   "paddq %%mm4,%%mm3 \n\t" \
   "pmuludq %%mm1,%%mm0 \n\t" \
   "paddq %%mm0,%%mm3 \n\t" \
   "movd %%mm3,%0 \n\t" \
   "psrlq $32, %%mm3 \n\t" \
:"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );

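/* INNERMUL8: eight INNERMUL steps unrolled; mm5/mm6/mm7 rotate between
   holding the next word of tmpm and the next digit of _c so the loads
   overlap the pmuludq latency. */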
#define INNERMUL8 \
asm( \
   "movd 0(%1),%%mm4 \n\t" \
   "movd 0(%2),%%mm0 \n\t" \
   "paddq %%mm4,%%mm3 \n\t" \
   "pmuludq %%mm1,%%mm0 \n\t" \
   "movd 4(%2),%%mm5 \n\t" \
   "paddq %%mm0,%%mm3 \n\t" \
   "movd 4(%1),%%mm6 \n\t" \
   "movd %%mm3,0(%0) \n\t" \
   "psrlq $32, %%mm3 \n\t" \
\
   "paddq %%mm6,%%mm3 \n\t" \
   "pmuludq %%mm1,%%mm5 \n\t" \
   "movd 8(%2),%%mm6 \n\t" \
   "paddq %%mm5,%%mm3 \n\t" \
   "movd 8(%1),%%mm7 \n\t" \
   "movd %%mm3,4(%0) \n\t" \
   "psrlq $32, %%mm3 \n\t" \
\
   "paddq %%mm7,%%mm3 \n\t" \
   "pmuludq %%mm1,%%mm6 \n\t" \
   "movd 12(%2),%%mm7 \n\t" \
   "paddq %%mm6,%%mm3 \n\t" \
   "movd 12(%1),%%mm5 \n\t" \
   "movd %%mm3,8(%0) \n\t" \
   "psrlq $32, %%mm3 \n\t" \
\
   "paddq %%mm5,%%mm3 \n\t" \
   "pmuludq %%mm1,%%mm7 \n\t" \
   "movd 16(%2),%%mm5 \n\t" \
   "paddq %%mm7,%%mm3 \n\t" \
   "movd 16(%1),%%mm6 \n\t" \
   "movd %%mm3,12(%0) \n\t" \
   "psrlq $32, %%mm3 \n\t" \
\
   "paddq %%mm6,%%mm3 \n\t" \
   "pmuludq %%mm1,%%mm5 \n\t" \
   "movd 20(%2),%%mm6 \n\t" \
   "paddq %%mm5,%%mm3 \n\t" \
   "movd 20(%1),%%mm7 \n\t" \
   "movd %%mm3,16(%0) \n\t" \
   "psrlq $32, %%mm3 \n\t" \
\
   "paddq %%mm7,%%mm3 \n\t" \
   "pmuludq %%mm1,%%mm6 \n\t" \
   "movd 24(%2),%%mm7 \n\t" \
   "paddq %%mm6,%%mm3 \n\t" \
   "movd 24(%1),%%mm5 \n\t" \
   "movd %%mm3,20(%0) \n\t" \
   "psrlq $32, %%mm3 \n\t" \
\
   "paddq %%mm5,%%mm3 \n\t" \
   "pmuludq %%mm1,%%mm7 \n\t" \
   "movd 28(%2),%%mm5 \n\t" \
   "paddq %%mm7,%%mm3 \n\t" \
   "movd 28(%1),%%mm6 \n\t" \
   "movd %%mm3,24(%0) \n\t" \
   "psrlq $32, %%mm3 \n\t" \
\
   "paddq %%mm6,%%mm3 \n\t" \
   "pmuludq %%mm1,%%mm5 \n\t" \
   "paddq %%mm5,%%mm3 \n\t" \
   "movd %%mm3,28(%0) \n\t" \
   "psrlq $32, %%mm3 \n\t" \
:"=r"(_c) : "0"(_c), "g"(tmpm) );

#define LOOP_END \
asm( "movd %%mm3,%0 \n" :"=r"(cy))

#define PROPCARRY \
asm( \
   "addl %1,%0 \n\t" \
   "setb %%al \n\t" \
   "movzbl %%al,%1 \n\t" \
:"=g"(_c[LO]), "=r"(cy) \
:"0"(_c[LO]), "1"(cy) \
: "%eax", "%cc")

/******************************************************************/
#elif defined(TFM_ARM)
/* ARMv4 code */

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

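/* INNERMUL: ADDS folds the incoming carry into the current digit and
   MOVCS/MOVCC turn the carry flag into 0/1 in cy, then UMLAL adds the
   64-bit product mu * (*tmpm) into the {cy:r0} pair. */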
#define INNERMUL \
asm( \
   " LDR r0,%1 \n\t" \
   " ADDS r0,r0,%0 \n\t" \
   " MOVCS %0,#1 \n\t" \
   " MOVCC %0,#0 \n\t" \
   " UMLAL r0,%0,%3,%4 \n\t" \
   " STR r0,%1 \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","%cc");

#define PROPCARRY \
asm( \
   " LDR r0,%1 \n\t" \
   " ADDS r0,r0,%0 \n\t" \
   " STR r0,%1 \n\t" \
   " MOVCS %0,#1 \n\t" \
   " MOVCC %0,#0 \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","%cc");

/******************************************************************/
#elif defined(TFM_PPC32)

/* PPC32 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

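/* INNERMUL: mullw/mulhwu form the 64-bit product mu * tmpm[0] in
   registers 16/17; the addc/addze pairs fold in cy and the current
   digit, leaving the high half as the new carry. */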
#define INNERMUL \
asm( \
   " mullw 16,%3,%4 \n\t" \
   " mulhwu 17,%3,%4 \n\t" \
   " addc 16,16,%0 \n\t" \
   " addze 17,17 \n\t" \
   " lwz 18,%1 \n\t" \
   " addc 16,16,18 \n\t" \
   " addze %0,17 \n\t" \
   " stw 16,%1 \n\t" \
:"=r"(cy),"=g"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","%cc"); ++tmpm;

#define PROPCARRY \
asm( \
   " lwz 16,%1 \n\t" \
   " addc 16,16,%0 \n\t" \
   " stw 16,%1 \n\t" \
   " xor %0,%0,%0 \n\t" \
   " addze %0,%0 \n\t" \
:"=r"(cy),"=g"(_c[0]):"0"(cy),"1"(_c[0]):"16","%cc");

/******************************************************************/
#elif defined(TFM_PPC64)

/* PPC64 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
asm( \
   " mulld r16,%3,%4 \n\t" \
   " mulhdu r17,%3,%4 \n\t" \
   " addc r16,r16,%0 \n\t" \
   " addze r17,r17 \n\t" \
   " ldx r18,0,%1 \n\t" \
   " addc r16,r16,r18 \n\t" \
   " addze %0,r17 \n\t" \
   " stdx r16,0,%1 \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"r16", "r17", "r18","%cc"); ++tmpm;

#define PROPCARRY \
asm( \
   " ldx r16,0,%1 \n\t" \
   " addc r16,r16,%0 \n\t" \
   " stdx r16,0,%1 \n\t" \
   " xor %0,%0,%0 \n\t" \
   " addze %0,%0 \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","%cc");

/******************************************************************/
#elif defined(TFM_AVR32)

/* AVR32 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
asm( \
   " ld.w r2,%1 \n\t" \
   " add r2,%0 \n\t" \
   " eor r3,r3 \n\t" \
   " acr r3 \n\t" \
   " macu.d r2,%3,%4 \n\t" \
   " st.w %1,r2 \n\t" \
   " mov %0,r3 \n\t" \
:"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");

#define PROPCARRY \
asm( \
   " ld.w r2,%1 \n\t" \
   " add r2,%0 \n\t" \
   " st.w %1,r2 \n\t" \
   " eor %0,%0 \n\t" \
   " acr %0 \n\t" \
404 :"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","%cc"); | |

/******************************************************************/
#elif defined(TFM_MIPS)

/* MIPS */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
asm( \
   " multu %3,%4 \n\t" \
   " mflo $12 \n\t" \
   " mfhi $13 \n\t" \
   " addu $12,$12,%0 \n\t" \
   " sltu $10,$12,%0 \n\t" \
   " addu $13,$13,$10 \n\t" \
   " lw $10,%1 \n\t" \
   " addu $12,$12,$10 \n\t" \
   " sltu $10,$12,$10 \n\t" \
   " addu %0,$13,$10 \n\t" \
   " sw $12,%1 \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"$10","$12","$13"); ++tmpm;

#define PROPCARRY \
asm( \
   " lw $10,%1 \n\t" \
   " addu $10,$10,%0 \n\t" \
   " sw $10,%1 \n\t" \
   " sltu %0,$10,%0 \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"$10");

/******************************************************************/
#else

/* ISO C code */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
   do { fp_word t; \
   _c[0] = t = ((fp_word)_c[0] + (fp_word)cy) + \
               (((fp_word)mu) * ((fp_word)*tmpm++)); \
   cy = (t >> DIGIT_BIT); \
   } while (0)

#define PROPCARRY \
   do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)

#endif
/******************************************************************/
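
/* Illustration of the reference macros above: LOOP_START picks
 * mu = c[x] * mp mod B (B = 2^DIGIT_BIT, mp = -1/m mod B) so that adding
 * mu*m zeros the digit c[x]; INNERMUL is one multiply-accumulate step of
 * that addition.  A standalone sketch for 32-bit digits follows; the
 * demo_* names are hypothetical and not part of TFM. */
#if 0   /* reference sketch only, not compiled */
#include <stdint.h>

typedef uint32_t demo_digit;
typedef uint64_t demo_word;

static demo_digit demo_innermul(demo_digit *c, demo_digit mu,
                                demo_digit m, demo_digit cy)
{
   /* *c += mu*m + cy in double-width arithmetic; high half is the carry */
   demo_word t = (demo_word)*c + (demo_word)cy
               + (demo_word)mu * (demo_word)m;
   *c = (demo_digit)t;
   return (demo_digit)(t >> 32);
}
#endif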


#define LO 0

#ifdef TFM_SMALL_MONT_SET
#include "fp_mont_small.i"
#endif

/* computes x/R == x (mod N) via Montgomery Reduction */
void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
{
   fp_digit c[FP_SIZE], *_c, *tmpm, mu;
   int oldused, x, y, pa;

   /* bail if too large */
   if (m->used > (FP_SIZE/2)) {
      return;
   }

#ifdef TFM_SMALL_MONT_SET
   if (m->used <= 16) {
      fp_montgomery_reduce_small(a, m, mp);
      return;
   }
#endif

#if defined(USE_MEMSET)
   /* now zero the buff */
   memset(c, 0, sizeof c);
#endif
   pa = m->used;

   /* copy the input */
   oldused = a->used;
   for (x = 0; x < oldused; x++) {
      c[x] = a->dp[x];
   }
#if !defined(USE_MEMSET)
   for (; x < 2*pa+1; x++) {
      c[x] = 0;
   }
#endif
   MONT_START;

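   /* one pass per digit of m: mu is chosen so that adding mu*m at digit
      offset x makes c[x] zero, then the carry is pushed upward */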
   for (x = 0; x < pa; x++) {
      fp_digit cy = 0;
      /* get Mu for this round */
      LOOP_START;
      _c = c + x;
      tmpm = m->dp;
      y = 0;
#if (defined(TFM_SSE2) || defined(TFM_X86_64))
      for (; y < (pa & ~7); y += 8) {
         INNERMUL8;
         _c   += 8;
         tmpm += 8;
      }
#endif

      for (; y < pa; y++) {
         INNERMUL;
         ++_c;
      }
      LOOP_END;
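      /* ripple any carry left by the inner loop into the higher digits */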
      while (cy) {
         PROPCARRY;
         ++_c;
      }
   }

   /* now copy out */
   _c   = c + pa;
   tmpm = a->dp;
   for (x = 0; x < pa+1; x++) {
      *tmpm++ = *_c++;
   }

   for (; x < oldused; x++) {
      *tmpm++ = 0;
   }

   MONT_FINI;

   a->used = pa+1;
   fp_clamp(a);

   /* if A >= m then A = A - m */
   if (fp_cmp_mag (a, m) != FP_LT) {
      s_fp_sub (a, m, a);
   }
}

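/* Typical call sequence (sketch only; uses the TFM API declared in
 * tfm.h, error checking omitted).  fp_montgomery_reduce divides out
 * R = B^(m->used), so operands must first be brought into the
 * Montgomery domain. */
#if 0   /* usage sketch only, not compiled */
static void demo_mulmod_mont(fp_int *a, fp_int *b, fp_int *m, fp_int *res)
{
   fp_digit mp;
   fp_int   R;

   fp_init(&R);
   fp_montgomery_setup(m, &mp);              /* mp = -1/m mod B */
   fp_montgomery_calc_normalization(&R, m);  /* R mod m */

   fp_mulmod(a, &R, m, res);                 /* res = a*R mod m */
   fp_mul(res, b, res);                      /* res = a*R*b */
   fp_montgomery_reduce(res, m, mp);         /* res = a*b mod m */
}
#endif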

/* $Source$ */
/* $Revision$ */
/* $Date$ */