tomsfastmath/src/mont/fp_montgomery_reduce.c @ 643:a362b62d38b2 (dropbear-tfm)

Add tomsfastmath from git rev bfa4582842bc3bab42e4be4aed5703437049502a with Makefile.in renamed
author Matt Johnston <matt@ucc.asn.au>
date Wed, 23 Nov 2011 18:10:20 +0700
/* TomsFastMath, a fast ISO C bignum library.
 *
 * This project is meant to fill in where LibTomMath
 * falls short. That is speed ;-)
 *
 * This project is public domain and free for all purposes.
 *
 * Tom St Denis, tomstdenis@gmail.com
 */
#include <tfm.h>

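/* Each architecture section below defines the same set of hooks used by
 * fp_montgomery_reduce() at the bottom of this file:
 *
 *   MONT_START / MONT_FINI  one-time setup and teardown for the reduction
 *   LOOP_START              compute mu = c[x] * mp (mod beta) for this row
 *   INNERMUL                _c[0] += mu * (*tmpm++) + cy, new carry -> cy
 *   INNERMUL8 (optional)    eight INNERMUL steps unrolled
 *   LOOP_END                flush any per-row carry state into cy
 *   PROPCARRY               ripple cy into _c[0]
 *
 * The ISO C fallback at the end of the file gives the reference semantics.
 */
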
/******************************************************************/
#if defined(TFM_X86) && !defined(TFM_SSE2)
/* x86-32 code */

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

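/* mu is the standard Montgomery quotient digit: with beta = 2^DIGIT_BIT and
 * mp = -1/m mod beta (from fp_montgomery_setup), mu = c[x] * mp mod beta
 * makes c[x] + mu * m->dp[0] == 0 (mod beta), so each pass zeroes one low
 * digit.  For illustration with beta = 10, m = 7 (mp = -1/7 = 7 mod 10) and
 * c[x] = 4: mu = 4*7 mod 10 = 8, and 4 + 8*7 = 60 == 0 (mod 10).
 */
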
#define INNERMUL \
   asm( \
      "movl %5,%%eax \n\t" \
      "mull %4       \n\t" \
      "addl %1,%%eax \n\t" \
      "adcl $0,%%edx \n\t" \
      "addl %%eax,%0 \n\t" \
      "adcl $0,%%edx \n\t" \
      "movl %%edx,%1 \n\t" \
      :"=g"(_c[LO]), "=r"(cy) \
      :"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \
      : "%eax", "%edx", "cc")

#define PROPCARRY \
   asm( \
      "addl   %1,%0   \n\t" \
      "setb   %%al    \n\t" \
      "movzbl %%al,%1 \n\t" \
      :"=g"(_c[LO]), "=r"(cy) \
      :"0"(_c[LO]), "1"(cy) \
      : "%eax", "cc")

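/* In ISO C terms (cf. the portable version at the end of this file), the
 * INNERMUL above computes, with t a double-width temporary:
 *   t      = (fp_word)_c[LO] + (fp_word)cy + (fp_word)mu * (*tmpm++);
 *   _c[LO] = (fp_digit)t;
 *   cy     = (fp_digit)(t >> DIGIT_BIT);
 * and PROPCARRY adds cy into _c[LO], leaving the carry-out (0 or 1) in cy.
 */
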
/******************************************************************/
#elif defined(TFM_X86_64)
/* x86-64 code */

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
   asm( \
      "movq %5,%%rax \n\t" \
      "mulq %4       \n\t" \
      "addq %1,%%rax \n\t" \
      "adcq $0,%%rdx \n\t" \
      "addq %%rax,%0 \n\t" \
      "adcq $0,%%rdx \n\t" \
      "movq %%rdx,%1 \n\t" \
      :"=g"(_c[LO]), "=r"(cy) \
      :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
      : "%rax", "%rdx", "cc")

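/* INNERMUL8 below is eight INNERMUL steps unrolled and software-pipelined:
 * r11 always holds the next word of tmpm and r10 the next word of _c, loaded
 * while the current 64x64->128 mulq is in flight; %1 (cy) carries the high
 * half of each product into the following step.
 */
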
#define INNERMUL8 \
   asm( \
      "movq 0(%5),%%rax    \n\t" \
      "movq 0(%2),%%r10    \n\t" \
      "movq 0x8(%5),%%r11  \n\t" \
      "mulq %4             \n\t" \
      "addq %%r10,%%rax    \n\t" \
      "adcq $0,%%rdx       \n\t" \
      "movq 0x8(%2),%%r10  \n\t" \
      "addq %3,%%rax       \n\t" \
      "adcq $0,%%rdx       \n\t" \
      "movq %%rax,0(%0)    \n\t" \
      "movq %%rdx,%1       \n\t" \
      \
      "movq %%r11,%%rax    \n\t" \
      "movq 0x10(%5),%%r11 \n\t" \
      "mulq %4             \n\t" \
      "addq %%r10,%%rax    \n\t" \
      "adcq $0,%%rdx       \n\t" \
      "movq 0x10(%2),%%r10 \n\t" \
      "addq %3,%%rax       \n\t" \
      "adcq $0,%%rdx       \n\t" \
      "movq %%rax,0x8(%0)  \n\t" \
      "movq %%rdx,%1       \n\t" \
      \
      "movq %%r11,%%rax    \n\t" \
      "movq 0x18(%5),%%r11 \n\t" \
      "mulq %4             \n\t" \
      "addq %%r10,%%rax    \n\t" \
      "adcq $0,%%rdx       \n\t" \
      "movq 0x18(%2),%%r10 \n\t" \
      "addq %3,%%rax       \n\t" \
      "adcq $0,%%rdx       \n\t" \
      "movq %%rax,0x10(%0) \n\t" \
      "movq %%rdx,%1       \n\t" \
      \
      "movq %%r11,%%rax    \n\t" \
      "movq 0x20(%5),%%r11 \n\t" \
      "mulq %4             \n\t" \
      "addq %%r10,%%rax    \n\t" \
      "adcq $0,%%rdx       \n\t" \
      "movq 0x20(%2),%%r10 \n\t" \
      "addq %3,%%rax       \n\t" \
      "adcq $0,%%rdx       \n\t" \
      "movq %%rax,0x18(%0) \n\t" \
      "movq %%rdx,%1       \n\t" \
      \
      "movq %%r11,%%rax    \n\t" \
      "movq 0x28(%5),%%r11 \n\t" \
      "mulq %4             \n\t" \
      "addq %%r10,%%rax    \n\t" \
      "adcq $0,%%rdx       \n\t" \
      "movq 0x28(%2),%%r10 \n\t" \
      "addq %3,%%rax       \n\t" \
      "adcq $0,%%rdx       \n\t" \
      "movq %%rax,0x20(%0) \n\t" \
      "movq %%rdx,%1       \n\t" \
      \
      "movq %%r11,%%rax    \n\t" \
      "movq 0x30(%5),%%r11 \n\t" \
      "mulq %4             \n\t" \
      "addq %%r10,%%rax    \n\t" \
      "adcq $0,%%rdx       \n\t" \
      "movq 0x30(%2),%%r10 \n\t" \
      "addq %3,%%rax       \n\t" \
      "adcq $0,%%rdx       \n\t" \
      "movq %%rax,0x28(%0) \n\t" \
      "movq %%rdx,%1       \n\t" \
      \
      "movq %%r11,%%rax    \n\t" \
      "movq 0x38(%5),%%r11 \n\t" \
      "mulq %4             \n\t" \
      "addq %%r10,%%rax    \n\t" \
      "adcq $0,%%rdx       \n\t" \
      "movq 0x38(%2),%%r10 \n\t" \
      "addq %3,%%rax       \n\t" \
      "adcq $0,%%rdx       \n\t" \
      "movq %%rax,0x30(%0) \n\t" \
      "movq %%rdx,%1       \n\t" \
      \
      "movq %%r11,%%rax    \n\t" \
      "mulq %4             \n\t" \
      "addq %%r10,%%rax    \n\t" \
      "adcq $0,%%rdx       \n\t" \
      "addq %3,%%rax       \n\t" \
      "adcq $0,%%rdx       \n\t" \
      "movq %%rax,0x38(%0) \n\t" \
      "movq %%rdx,%1       \n\t" \
      \
      :"=r"(_c), "=r"(cy) \
      : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm) \
      : "%rax", "%rdx", "%r10", "%r11", "cc")


#define PROPCARRY \
   asm( \
      "addq   %1,%0   \n\t" \
      "setb   %%al    \n\t" \
      "movzbq %%al,%1 \n\t" \
      :"=g"(_c[LO]), "=r"(cy) \
      :"0"(_c[LO]), "1"(cy) \
      : "%rax", "cc")

/******************************************************************/
#elif defined(TFM_SSE2)
/* SSE2 code (assumes 32-bit fp_digits) */
/* MMX register assignments (the SSE2 integer ops below work on the
 * 64-bit MMX registers):
 * mm0  *tmpm++, then mu * (*tmpm++)
 * mm1  c[x], then mu
 * mm2  mp
 * mm3  cy
 * mm4  _c[LO]
 */

#define MONT_START \
   asm("movd %0,%%mm2"::"g"(mp))

#define MONT_FINI \
   asm("emms")

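/* emms is required in MONT_FINI: the mm0-mm7 registers alias the x87
 * floating point stack, so MMX state must be cleared before any FPU code
 * runs again.
 */
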
#define LOOP_START \
   asm( \
      "movd %0,%%mm1       \n\t" \
      "pxor %%mm3,%%mm3    \n\t" \
      "pmuludq %%mm2,%%mm1 \n\t" \
      :: "g"(c[x]))

/* pmuludq on mmx registers does a 32x32->64 multiply. */
#define INNERMUL \
   asm( \
      "movd %1,%%mm4       \n\t" \
      "movd %2,%%mm0       \n\t" \
      "paddq %%mm4,%%mm3   \n\t" \
      "pmuludq %%mm1,%%mm0 \n\t" \
      "paddq %%mm0,%%mm3   \n\t" \
      "movd %%mm3,%0       \n\t" \
      "psrlq $32, %%mm3    \n\t" \
      :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );

#define INNERMUL8 \
   asm( \
      "movd 0(%1),%%mm4    \n\t" \
      "movd 0(%2),%%mm0    \n\t" \
      "paddq %%mm4,%%mm3   \n\t" \
      "pmuludq %%mm1,%%mm0 \n\t" \
      "movd 4(%2),%%mm5    \n\t" \
      "paddq %%mm0,%%mm3   \n\t" \
      "movd 4(%1),%%mm6    \n\t" \
      "movd %%mm3,0(%0)    \n\t" \
      "psrlq $32, %%mm3    \n\t" \
      \
      "paddq %%mm6,%%mm3   \n\t" \
      "pmuludq %%mm1,%%mm5 \n\t" \
      "movd 8(%2),%%mm6    \n\t" \
      "paddq %%mm5,%%mm3   \n\t" \
      "movd 8(%1),%%mm7    \n\t" \
      "movd %%mm3,4(%0)    \n\t" \
      "psrlq $32, %%mm3    \n\t" \
      \
      "paddq %%mm7,%%mm3   \n\t" \
      "pmuludq %%mm1,%%mm6 \n\t" \
      "movd 12(%2),%%mm7   \n\t" \
      "paddq %%mm6,%%mm3   \n\t" \
      "movd 12(%1),%%mm5   \n\t" \
      "movd %%mm3,8(%0)    \n\t" \
      "psrlq $32, %%mm3    \n\t" \
      \
      "paddq %%mm5,%%mm3   \n\t" \
      "pmuludq %%mm1,%%mm7 \n\t" \
      "movd 16(%2),%%mm5   \n\t" \
      "paddq %%mm7,%%mm3   \n\t" \
      "movd 16(%1),%%mm6   \n\t" \
      "movd %%mm3,12(%0)   \n\t" \
      "psrlq $32, %%mm3    \n\t" \
      \
      "paddq %%mm6,%%mm3   \n\t" \
      "pmuludq %%mm1,%%mm5 \n\t" \
      "movd 20(%2),%%mm6   \n\t" \
      "paddq %%mm5,%%mm3   \n\t" \
      "movd 20(%1),%%mm7   \n\t" \
      "movd %%mm3,16(%0)   \n\t" \
      "psrlq $32, %%mm3    \n\t" \
      \
      "paddq %%mm7,%%mm3   \n\t" \
      "pmuludq %%mm1,%%mm6 \n\t" \
      "movd 24(%2),%%mm7   \n\t" \
      "paddq %%mm6,%%mm3   \n\t" \
      "movd 24(%1),%%mm5   \n\t" \
      "movd %%mm3,20(%0)   \n\t" \
      "psrlq $32, %%mm3    \n\t" \
      \
      "paddq %%mm5,%%mm3   \n\t" \
      "pmuludq %%mm1,%%mm7 \n\t" \
      "movd 28(%2),%%mm5   \n\t" \
      "paddq %%mm7,%%mm3   \n\t" \
      "movd 28(%1),%%mm6   \n\t" \
      "movd %%mm3,24(%0)   \n\t" \
      "psrlq $32, %%mm3    \n\t" \
      \
      "paddq %%mm6,%%mm3   \n\t" \
      "pmuludq %%mm1,%%mm5 \n\t" \
      "paddq %%mm5,%%mm3   \n\t" \
      "movd %%mm3,28(%0)   \n\t" \
      "psrlq $32, %%mm3    \n\t" \
      :"=r"(_c) : "0"(_c), "g"(tmpm) );

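/* LOOP_END drains the carry still sitting in mm3 into cy, so the plain
 * integer PROPCARRY loop below can finish the row.
 */
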
#define LOOP_END \
   asm( "movd %%mm3,%0 \n" :"=r"(cy))

#define PROPCARRY \
   asm( \
      "addl   %1,%0   \n\t" \
      "setb   %%al    \n\t" \
      "movzbl %%al,%1 \n\t" \
      :"=g"(_c[LO]), "=r"(cy) \
      :"0"(_c[LO]), "1"(cy) \
      : "%eax", "cc")

/******************************************************************/
#elif defined(TFM_ARM)
/* ARMv4 code */

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

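/* In INNERMUL below, ADDS folds the incoming carry into the low word while
 * setting the C flag; MOVCS/MOVCC rematerialise that flag as 0/1 in cy, and
 * UMLAL then adds the 64-bit product mu * (*tmpm) into the {r0, cy} pair.
 */
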
#define INNERMUL \
   asm( \
      " LDR   r0,%1       \n\t" \
      " ADDS  r0,r0,%0    \n\t" \
      " MOVCS %0,#1       \n\t" \
      " MOVCC %0,#0       \n\t" \
      " UMLAL r0,%0,%3,%4 \n\t" \
      " STR   r0,%1       \n\t" \
      :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");

#define PROPCARRY \
   asm( \
      " LDR   r0,%1    \n\t" \
      " ADDS  r0,r0,%0 \n\t" \
      " STR   r0,%1    \n\t" \
      " MOVCS %0,#1    \n\t" \
      " MOVCC %0,#0    \n\t" \
      :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");

/******************************************************************/
#elif defined(TFM_PPC32)

/* PPC32 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
   asm( \
      " mullw  16,%3,%4 \n\t" \
      " mulhwu 17,%3,%4 \n\t" \
      " addc   16,16,%0 \n\t" \
      " addze  17,17    \n\t" \
      " lwz    18,%1    \n\t" \
      " addc   16,16,18 \n\t" \
      " addze  %0,17    \n\t" \
      " stw    16,%1    \n\t" \
      :"=r"(cy),"=g"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;

#define PROPCARRY \
   asm( \
      " lwz   16,%1    \n\t" \
      " addc  16,16,%0 \n\t" \
      " stw   16,%1    \n\t" \
      " xor   %0,%0,%0 \n\t" \
      " addze %0,%0    \n\t" \
      :"=r"(cy),"=g"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");

/******************************************************************/
#elif defined(TFM_PPC64)

/* PPC64 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
   asm( \
      " mulld  r16,%3,%4   \n\t" \
      " mulhdu r17,%3,%4   \n\t" \
      " addc   r16,r16,%0  \n\t" \
      " addze  r17,r17     \n\t" \
      " ldx    r18,0,%1    \n\t" \
      " addc   r16,r16,r18 \n\t" \
      " addze  %0,r17      \n\t" \
      " stdx   r16,0,%1    \n\t" \
      :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"r16", "r17", "r18","cc"); ++tmpm;

#define PROPCARRY \
   asm( \
      " ldx   r16,0,%1   \n\t" \
      " addc  r16,r16,%0 \n\t" \
      " stdx  r16,0,%1   \n\t" \
      " xor   %0,%0,%0   \n\t" \
      " addze %0,%0      \n\t" \
      :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","cc");

/******************************************************************/
#elif defined(TFM_AVR32)

/* AVR32 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
   asm( \
      " ld.w   r2,%1    \n\t" \
      " add    r2,%0    \n\t" \
      " eor    r3,r3    \n\t" \
      " acr    r3       \n\t" \
      " macu.d r2,%3,%4 \n\t" \
      " st.w   %1,r2    \n\t" \
      " mov    %0,r3    \n\t" \
      :"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");

#define PROPCARRY \
   asm( \
      " ld.w r2,%1 \n\t" \
      " add  r2,%0 \n\t" \
      " st.w %1,r2 \n\t" \
      " eor  %0,%0 \n\t" \
      " acr  %0    \n\t" \
      :"=r"(cy),"=r"(_c):"0"(cy),"1"(_c):"r2","cc");

/******************************************************************/
#elif defined(TFM_MIPS)

/* MIPS */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
   asm( \
      " multu %3,%4       \n\t" \
      " mflo  $12         \n\t" \
      " mfhi  $13         \n\t" \
      " addu  $12,$12,%0  \n\t" \
      " sltu  $10,$12,%0  \n\t" \
      " addu  $13,$13,$10 \n\t" \
      " lw    $10,%1      \n\t" \
      " addu  $12,$12,$10 \n\t" \
      " sltu  $10,$12,$10 \n\t" \
      " addu  %0,$13,$10  \n\t" \
      " sw    $12,%1      \n\t" \
      :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"$10","$12","$13"); ++tmpm;

#define PROPCARRY \
   asm( \
      " lw   $10,%1     \n\t" \
      " addu $10,$10,%0 \n\t" \
      " sw   $10,%1     \n\t" \
      " sltu %0,$10,%0  \n\t" \
      :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"$10");

/******************************************************************/
#else

/* ISO C code */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
   do { fp_word t;                                      \
      _c[0] = t = ((fp_word)_c[0] + (fp_word)cy) +      \
                  (((fp_word)mu) * ((fp_word)*tmpm++)); \
      cy = (t >> DIGIT_BIT);                            \
   } while (0)

#define PROPCARRY \
   do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)

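/* A small worked example of the portable INNERMUL, assuming 8-bit digits
 * for readability (DIGIT_BIT = 8, fp_word 16 bits): with _c[0] = 0xF0,
 * cy = 0x02, mu = 0x03 and *tmpm = 0x51, t = 0xF0 + 0x02 + 0x03*0x51 = 0x1E5,
 * so the digit becomes 0xE5 and cy = 0x01.  PROPCARRY detects overflow with
 * the (t < cy) idiom: the sum wrapped iff the result is smaller than the
 * addend.
 */
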
#endif
/******************************************************************/


#define LO 0

#ifdef TFM_SMALL_MONT_SET
#include "fp_mont_small.i"
#endif

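/* Montgomery reduction in brief: with beta = 2^DIGIT_BIT, R = beta^(m->used)
 * and mp = -1/m mod beta, each outer pass adds mu * m to c so that the lowest
 * remaining digit becomes zero; after m->used passes the low half of c is all
 * zero and shifting it off divides exactly by R, giving a * R^-1 mod m
 * (up to one final conditional subtraction).
 */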
/* computes a = a * R^-1 (mod m) via Montgomery reduction */
void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
{
   fp_digit c[FP_SIZE], *_c, *tmpm, mu;
   int      oldused, x, y, pa;

   /* bail if too large */
   if (m->used > (FP_SIZE/2)) {
      return;
   }

#ifdef TFM_SMALL_MONT_SET
   if (m->used <= 16) {
      fp_montgomery_reduce_small(a, m, mp);
      return;
   }
#endif

#if defined(USE_MEMSET)
   /* now zero the buffer */
   memset(c, 0, sizeof c);
#endif
   pa = m->used;

   /* copy the input */
   oldused = a->used;
   for (x = 0; x < oldused; x++) {
      c[x] = a->dp[x];
   }
#if !defined(USE_MEMSET)
   for (; x < 2*pa+1; x++) {
      c[x] = 0;
   }
#endif
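   /* c[] needs up to 2*pa+1 live digits: pa input digits, pa more that the
    * mu*m accumulation can spill into, and one for the final carry. */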
   MONT_START;

   for (x = 0; x < pa; x++) {
      fp_digit cy = 0;
      /* get Mu for this round */
      LOOP_START;
      _c   = c + x;
      tmpm = m->dp;
      y = 0;
#if (defined(TFM_SSE2) || defined(TFM_X86_64))
      for (; y < (pa & ~7); y += 8) {
         INNERMUL8;
         _c   += 8;
         tmpm += 8;
      }
#endif

      for (; y < pa; y++) {
         INNERMUL;
         ++_c;
      }
      LOOP_END;
      while (cy) {
         PROPCARRY;
         ++_c;
      }
   }

   /* now copy out */
   _c   = c + pa;
   tmpm = a->dp;
   for (x = 0; x < pa+1; x++) {
      *tmpm++ = *_c++;
   }

   for (; x < oldused; x++) {
      *tmpm++ = 0;
   }

   MONT_FINI;

   a->used = pa+1;
   fp_clamp(a);

   /* if A >= m then A = A - m */
   if (fp_cmp_mag (a, m) != FP_LT) {
      s_fp_sub (a, m, a);
   }
}
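
/* Usage sketch (not part of this file; assumes the public TFM API as used by
 * fp_exptmod): convert into Montgomery form, multiply or square with a
 * reduction after each step, then reduce once more to leave Montgomery form.
 *
 *   fp_int   x, m, R;
 *   fp_digit mp;
 *   fp_montgomery_setup(&m, &mp);               // mp = -1/m mod beta
 *   fp_montgomery_calc_normalization(&R, &m);   // R = beta^(m.used) mod m
 *   fp_mulmod(&x, &R, &m, &x);                  // x -> x*R mod m
 *   fp_sqr(&x, &x);                             // (x*R)^2
 *   fp_montgomery_reduce(&x, &m, mp);           // -> x^2 * R mod m
 *   fp_montgomery_reduce(&x, &m, mp);           // -> x^2 mod m
 */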

/* $Source$ */
/* $Revision$ */
/* $Date$ */