comparison curve25519-donna.c @ 1434:27b9ddb06b09

Update curve25519-donna to f7837adf95a2c2dcc36233cb02a1fb34081c0c4a
author Matt Johnston <matt@ucc.asn.au>
date Sat, 24 Jun 2017 11:53:32 +0800
parents d3925ed45a85
children
comparison
equal deleted inserted replaced
1433:b19877938d6a 1434:27b9ddb06b09
41 * djb's sample implementation of curve25519 is written in a special assembly 41 * djb's sample implementation of curve25519 is written in a special assembly
42 * language called qhasm and uses the floating point registers. 42 * language called qhasm and uses the floating point registers.
43 * 43 *
44 * This is, almost, a clean room reimplementation from the curve25519 paper. It 44 * This is, almost, a clean room reimplementation from the curve25519 paper. It
45 * uses many of the tricks described therein. Only the crecip function is taken 45 * uses many of the tricks described therein. Only the crecip function is taken
46 * from the sample implementation. 46 * from the sample implementation. */
47 */
48 47
49 #include <string.h> 48 #include <string.h>
50 #include <stdint.h> 49 #include <stdint.h>
51 50
52 #ifdef _MSC_VER 51 #ifdef _MSC_VER
61 * 60 *
62 * Field elements are written as an array of signed, 64-bit limbs, least 61 * Field elements are written as an array of signed, 64-bit limbs, least
63 * significant first. The value of the field element is: 62 * significant first. The value of the field element is:
64 * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ... 63 * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ...
65 * 64 *
66 * i.e. the limbs are 26, 25, 26, 25, ... bits wide. 65 * i.e. the limbs are 26, 25, 26, 25, ... bits wide. */
67 */
68 66
69 /* Sum two numbers: output += in */ 67 /* Sum two numbers: output += in */
70 static void fsum(limb *output, const limb *in) { 68 static void fsum(limb *output, const limb *in) {
71 unsigned i; 69 unsigned i;
72 for (i = 0; i < 10; i += 2) { 70 for (i = 0; i < 10; i += 2) {
73 output[0+i] = (output[0+i] + in[0+i]); 71 output[0+i] = output[0+i] + in[0+i];
74 output[1+i] = (output[1+i] + in[1+i]); 72 output[1+i] = output[1+i] + in[1+i];
75 } 73 }
76 } 74 }
77 75
78 /* Find the difference of two numbers: output = in - output 76 /* Find the difference of two numbers: output = in - output
79 * (note the order of the arguments!) 77 * (note the order of the arguments!). */
80 */
81 static void fdifference(limb *output, const limb *in) { 78 static void fdifference(limb *output, const limb *in) {
82 unsigned i; 79 unsigned i;
83 for (i = 0; i < 10; ++i) { 80 for (i = 0; i < 10; ++i) {
84 output[i] = (in[i] - output[i]); 81 output[i] = in[i] - output[i];
85 } 82 }
86 } 83 }
87 84
88 /* Multiply a number by a scalar: output = in * scalar */ 85 /* Multiply a number by a scalar: output = in * scalar */
89 static void fscalar_product(limb *output, const limb *in, const limb scalar) { 86 static void fscalar_product(limb *output, const limb *in, const limb scalar) {
95 92
96 /* Multiply two numbers: output = in2 * in 93 /* Multiply two numbers: output = in2 * in
97 * 94 *
98 * output must be distinct to both inputs. The inputs are reduced coefficient 95 * output must be distinct to both inputs. The inputs are reduced coefficient
99 * form, the output is not. 96 * form, the output is not.
100 */ 97 *
98 * output[x] <= 14 * the largest product of the input limbs. */
101 static void fproduct(limb *output, const limb *in2, const limb *in) { 99 static void fproduct(limb *output, const limb *in2, const limb *in) {
102 output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]); 100 output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]);
103 output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) + 101 output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) +
104 ((limb) ((s32) in2[1])) * ((s32) in[0]); 102 ((limb) ((s32) in2[1])) * ((s32) in[0]);
105 output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) + 103 output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) +
199 output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) + 197 output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) +
200 ((limb) ((s32) in2[9])) * ((s32) in[8]); 198 ((limb) ((s32) in2[9])) * ((s32) in[8]);
201 output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]); 199 output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]);
202 } 200 }
203 201
204 /* Reduce a long form to a short form by taking the input mod 2^255 - 19. */ 202 /* Reduce a long form to a short form by taking the input mod 2^255 - 19.
203 *
204 * On entry: |output[i]| < 14*2^54
205 * On exit: |output[0..8]| < 280*2^54 */
205 static void freduce_degree(limb *output) { 206 static void freduce_degree(limb *output) {
206 /* Each of these shifts and adds ends up multiplying the value by 19. */ 207 /* Each of these shifts and adds ends up multiplying the value by 19.
208 *
209 * For output[0..8], the absolute entry value is < 14*2^54 and we add, at
210 * most, 19*14*2^54 thus, on exit, |output[0..8]| < 280*2^54. */
207 output[8] += output[18] << 4; 211 output[8] += output[18] << 4;
208 output[8] += output[18] << 1; 212 output[8] += output[18] << 1;
209 output[8] += output[18]; 213 output[8] += output[18];
210 output[7] += output[17] << 4; 214 output[7] += output[17] << 4;
211 output[7] += output[17] << 1; 215 output[7] += output[17] << 1;
235 239
236 #if (-1 & 3) != 3 240 #if (-1 & 3) != 3
237 #error "This code only works on a two's complement system" 241 #error "This code only works on a two's complement system"
238 #endif 242 #endif
239 243
240 /* return v / 2^26, using only shifts and adds. */ 244 /* return v / 2^26, using only shifts and adds.
245 *
246 * On entry: v can take any value. */
241 static inline limb 247 static inline limb
242 div_by_2_26(const limb v) 248 div_by_2_26(const limb v)
243 { 249 {
244 /* High word of v; no shift needed*/ 250 /* High word of v; no shift needed. */
245 const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); 251 const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
246 /* Set to all 1s if v was negative; else set to 0s. */ 252 /* Set to all 1s if v was negative; else set to 0s. */
247 const int32_t sign = ((int32_t) highword) >> 31; 253 const int32_t sign = ((int32_t) highword) >> 31;
248 /* Set to 0x3ffffff if v was negative; else set to 0. */ 254 /* Set to 0x3ffffff if v was negative; else set to 0. */
249 const int32_t roundoff = ((uint32_t) sign) >> 6; 255 const int32_t roundoff = ((uint32_t) sign) >> 6;
250 /* Should return v / (1<<26) */ 256 /* Should return v / (1<<26) */
251 return (v + roundoff) >> 26; 257 return (v + roundoff) >> 26;
252 } 258 }
253 259
254 /* return v / (2^25), using only shifts and adds. */ 260 /* return v / (2^25), using only shifts and adds.
261 *
262 * On entry: v can take any value. */
255 static inline limb 263 static inline limb
256 div_by_2_25(const limb v) 264 div_by_2_25(const limb v)
257 { 265 {
258 /* High word of v; no shift needed*/ 266 /* High word of v; no shift needed*/
259 const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); 267 const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32);
263 const int32_t roundoff = ((uint32_t) sign) >> 7; 271 const int32_t roundoff = ((uint32_t) sign) >> 7;
264 /* Should return v / (1<<25) */ 272 /* Should return v / (1<<25) */
265 return (v + roundoff) >> 25; 273 return (v + roundoff) >> 25;
266 } 274 }
267 275
268 static inline s32
269 div_s32_by_2_25(const s32 v)
270 {
271 const s32 roundoff = ((uint32_t)(v >> 31)) >> 7;
272 return (v + roundoff) >> 25;
273 }
274
275 /* Reduce all coefficients of the short form input so that |x| < 2^26. 276 /* Reduce all coefficients of the short form input so that |x| < 2^26.
276 * 277 *
277 * On entry: |output[i]| < 2^62 278 * On entry: |output[i]| < 280*2^54 */
278 */
279 static void freduce_coefficients(limb *output) { 279 static void freduce_coefficients(limb *output) {
280 unsigned i; 280 unsigned i;
281 281
282 output[10] = 0; 282 output[10] = 0;
283 283
284 for (i = 0; i < 10; i += 2) { 284 for (i = 0; i < 10; i += 2) {
285 limb over = div_by_2_26(output[i]); 285 limb over = div_by_2_26(output[i]);
286 /* The entry condition (that |output[i]| < 280*2^54) means that over is, at
287 * most, 280*2^28 in the first iteration of this loop. This is added to the
288 * next limb and we can approximate the resulting bound of that limb by
289 * 281*2^54. */
286 output[i] -= over << 26; 290 output[i] -= over << 26;
287 output[i+1] += over; 291 output[i+1] += over;
288 292
293 /* For the first iteration, |output[i+1]| < 281*2^54, thus |over| <
294 * 281*2^29. When this is added to the next limb, the resulting bound can
295 * be approximated as 281*2^54.
296 *
297 * For subsequent iterations of the loop, 281*2^54 remains a conservative
298 * bound and no overflow occurs. */
289 over = div_by_2_25(output[i+1]); 299 over = div_by_2_25(output[i+1]);
290 output[i+1] -= over << 25; 300 output[i+1] -= over << 25;
291 output[i+2] += over; 301 output[i+2] += over;
292 } 302 }
293 /* Now |output[10]| < 2 ^ 38 and all other coefficients are reduced. */ 303 /* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */
294 output[0] += output[10] << 4; 304 output[0] += output[10] << 4;
295 output[0] += output[10] << 1; 305 output[0] += output[10] << 1;
296 output[0] += output[10]; 306 output[0] += output[10];
297 307
298 output[10] = 0; 308 output[10] = 0;
299 309
300 /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19 * 2^38 310 /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29
301 * So |over| will be no more than 77825 */ 311 * So |over| will be no more than 2^16. */
302 { 312 {
303 limb over = div_by_2_26(output[0]); 313 limb over = div_by_2_26(output[0]);
304 output[0] -= over << 26; 314 output[0] -= over << 26;
305 output[1] += over; 315 output[1] += over;
306 } 316 }
307 317
308 /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 77825 318 /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The
309 * So |over| will be no more than 1. */ 319 * bound on |output[1]| is sufficient to meet our needs. */
310 {
311 /* output[1] fits in 32 bits, so we can use div_s32_by_2_25 here. */
312 s32 over32 = div_s32_by_2_25((s32) output[1]);
313 output[1] -= over32 << 25;
314 output[2] += over32;
315 }
316
317 /* Finally, output[0,1,3..9] are reduced, and output[2] is "nearly reduced":
318 * we have |output[2]| <= 2^26. This is good enough for all of our math,
319 * but it will require an extra freduce_coefficients before fcontract. */
320 } 320 }
321 321
322 /* A helpful wrapper around fproduct: output = in * in2. 322 /* A helpful wrapper around fproduct: output = in * in2.
323 * 323 *
324 * output must be distinct to both inputs. The output is reduced degree and 324 * On entry: |in[i]| < 2^27 and |in2[i]| < 2^27.
325 * reduced coefficient. 325 *
326 */ 326 * output must be distinct to both inputs. The output is reduced degree
327 * (indeed, one need only provide storage for 10 limbs) and |output[i]| < 2^26. */
327 static void 328 static void
328 fmul(limb *output, const limb *in, const limb *in2) { 329 fmul(limb *output, const limb *in, const limb *in2) {
329 limb t[19]; 330 limb t[19];
330 fproduct(t, in, in2); 331 fproduct(t, in, in2);
332 /* |t[i]| < 14*2^54 */
331 freduce_degree(t); 333 freduce_degree(t);
332 freduce_coefficients(t); 334 freduce_coefficients(t);
335 /* |t[i]| < 2^26 */
333 memcpy(output, t, sizeof(limb) * 10); 336 memcpy(output, t, sizeof(limb) * 10);
334 } 337 }
335 338
339 /* Square a number: output = in**2
340 *
341 * output must be distinct from the input. The inputs are reduced coefficient
342 * form, the output is not.
343 *
344 * output[x] <= 14 * the largest product of the input limbs. */
336 static void fsquare_inner(limb *output, const limb *in) { 345 static void fsquare_inner(limb *output, const limb *in) {
337 output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]); 346 output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]);
338 output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]); 347 output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]);
339 output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) + 348 output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) +
340 ((limb) ((s32) in[0])) * ((s32) in[2])); 349 ((limb) ((s32) in[0])) * ((s32) in[2]));
389 4 * ((limb) ((s32) in[7])) * ((s32) in[9]); 398 4 * ((limb) ((s32) in[7])) * ((s32) in[9]);
390 output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]); 399 output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]);
391 output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]); 400 output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]);
392 } 401 }
393 402
403 /* fsquare sets output = in^2.
404 *
405 * On entry: The |in| argument is in reduced coefficients form and |in[i]| <
406 * 2^27.
407 *
408 * On exit: The |output| argument is in reduced coefficients form (indeed, one
409 * need only provide storage for 10 limbs) and |out[i]| < 2^26. */
394 static void 410 static void
395 fsquare(limb *output, const limb *in) { 411 fsquare(limb *output, const limb *in) {
396 limb t[19]; 412 limb t[19];
397 fsquare_inner(t, in); 413 fsquare_inner(t, in);
414 /* |t[i]| < 14*2^54 because the largest product of two limbs will be <
415 * 2^(27+27) and fsquare_inner adds together, at most, 14 of those
416 * products. */
398 freduce_degree(t); 417 freduce_degree(t);
399 freduce_coefficients(t); 418 freduce_coefficients(t);
419 /* |t[i]| < 2^26 */
400 memcpy(output, t, sizeof(limb) * 10); 420 memcpy(output, t, sizeof(limb) * 10);
401 } 421 }
402 422
403 /* Take a little-endian, 32-byte number and expand it into polynomial form */ 423 /* Take a little-endian, 32-byte number and expand it into polynomial form */
404 static void 424 static void
415 F(4, 12, 6, 0x3ffffff); 435 F(4, 12, 6, 0x3ffffff);
416 F(5, 16, 0, 0x1ffffff); 436 F(5, 16, 0, 0x1ffffff);
417 F(6, 19, 1, 0x3ffffff); 437 F(6, 19, 1, 0x3ffffff);
418 F(7, 22, 3, 0x1ffffff); 438 F(7, 22, 3, 0x1ffffff);
419 F(8, 25, 4, 0x3ffffff); 439 F(8, 25, 4, 0x3ffffff);
420 F(9, 28, 6, 0x3ffffff); 440 F(9, 28, 6, 0x1ffffff);
421 #undef F 441 #undef F
422 } 442 }
423 443
424 #if (-32 >> 1) != -16 444 #if (-32 >> 1) != -16
425 #error "This code only works when >> does sign-extension on negative numbers" 445 #error "This code only works when >> does sign-extension on negative numbers"
426 #endif 446 #endif
427 447
448 /* s32_eq returns 0xffffffff iff a == b and zero otherwise. */
449 static s32 s32_eq(s32 a, s32 b) {
450 a = ~(a ^ b);
451 a &= a << 16;
452 a &= a << 8;
453 a &= a << 4;
454 a &= a << 2;
455 a &= a << 1;
456 return a >> 31;
457 }
458
459 /* s32_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are
460 * both non-negative. */
461 static s32 s32_gte(s32 a, s32 b) {
462 a -= b;
463 /* a >= 0 iff a >= b. */
464 return ~(a >> 31);
465 }
466
428 /* Take a fully reduced polynomial form number and contract it into a 467 /* Take a fully reduced polynomial form number and contract it into a
429 * little-endian, 32-byte array 468 * little-endian, 32-byte array.
430 */ 469 *
470 * On entry: |input_limbs[i]| < 2^26 */
431 static void 471 static void
432 fcontract(u8 *output, limb *input) { 472 fcontract(u8 *output, limb *input_limbs) {
433 int i; 473 int i;
434 int j; 474 int j;
475 s32 input[10];
476 s32 mask;
477
478 /* |input_limbs[i]| < 2^26, so it's valid to convert to an s32. */
479 for (i = 0; i < 10; i++) {
480 input[i] = input_limbs[i];
481 }
435 482
436 for (j = 0; j < 2; ++j) { 483 for (j = 0; j < 2; ++j) {
437 for (i = 0; i < 9; ++i) { 484 for (i = 0; i < 9; ++i) {
438 if ((i & 1) == 1) { 485 if ((i & 1) == 1) {
439 /* This calculation is a time-invariant way to make input[i] positive 486 /* This calculation is a time-invariant way to make input[i]
440 by borrowing from the next-larger limb. 487 * non-negative by borrowing from the next-larger limb. */
441 */ 488 const s32 mask = input[i] >> 31;
442 const s32 mask = (s32)(input[i]) >> 31; 489 const s32 carry = -((input[i] & mask) >> 25);
443 const s32 carry = -(((s32)(input[i]) & mask) >> 25); 490 input[i] = input[i] + (carry << 25);
444 input[i] = (s32)(input[i]) + (carry << 25); 491 input[i+1] = input[i+1] - carry;
445 input[i+1] = (s32)(input[i+1]) - carry;
446 } else { 492 } else {
447 const s32 mask = (s32)(input[i]) >> 31; 493 const s32 mask = input[i] >> 31;
448 const s32 carry = -(((s32)(input[i]) & mask) >> 26); 494 const s32 carry = -((input[i] & mask) >> 26);
449 input[i] = (s32)(input[i]) + (carry << 26); 495 input[i] = input[i] + (carry << 26);
450 input[i+1] = (s32)(input[i+1]) - carry; 496 input[i+1] = input[i+1] - carry;
451 } 497 }
452 } 498 }
499
500 /* There's no greater limb for input[9] to borrow from, but we can multiply
501 * by 19 and borrow from input[0], which is valid mod 2^255-19. */
453 { 502 {
454 const s32 mask = (s32)(input[9]) >> 31; 503 const s32 mask = input[9] >> 31;
455 const s32 carry = -(((s32)(input[9]) & mask) >> 25); 504 const s32 carry = -((input[9] & mask) >> 25);
456 input[9] = (s32)(input[9]) + (carry << 25); 505 input[9] = input[9] + (carry << 25);
457 input[0] = (s32)(input[0]) - (carry * 19); 506 input[0] = input[0] - (carry * 19);
458 } 507 }
508
509 /* After the first iteration, input[1..9] are non-negative and fit within
510 * 25 or 26 bits, depending on position. However, input[0] may be
511 * negative. */
459 } 512 }
460 513
461 /* The first borrow-propagation pass above ended with every limb 514 /* The first borrow-propagation pass above ended with every limb
462 except (possibly) input[0] non-negative. 515 except (possibly) input[0] non-negative.
463 516
464 Since each input limb except input[0] is decreased by at most 1 517 If input[0] was negative after the first pass, then it was because of a
465 by a borrow-propagation pass, the second borrow-propagation pass 518 carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most,
466 could only have wrapped around to decrease input[0] again if the 519 one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19.
467 first pass left input[0] negative *and* input[1] through input[9] 520
468 were all zero. In that case, input[1] is now 2^25 - 1, and this 521 In the second pass, each limb is decreased by at most one. Thus the second
469 last borrow-propagation step will leave input[1] non-negative. 522 borrow-propagation pass could only have wrapped around to decrease
470 */ 523 input[0] again if the first pass left input[0] negative *and* input[1]
524 through input[9] were all zero. In that case, input[1] is now 2^25 - 1,
525 and this last borrow-propagation step will leave input[1] non-negative. */
471 { 526 {
472 const s32 mask = (s32)(input[0]) >> 31; 527 const s32 mask = input[0] >> 31;
473 const s32 carry = -(((s32)(input[0]) & mask) >> 26); 528 const s32 carry = -((input[0] & mask) >> 26);
474 input[0] = (s32)(input[0]) + (carry << 26); 529 input[0] = input[0] + (carry << 26);
475 input[1] = (s32)(input[1]) - carry; 530 input[1] = input[1] - carry;
476 } 531 }
477 532
478 /* Both passes through the above loop, plus the last 0-to-1 step, are 533 /* All input[i] are now non-negative. However, there might be values between
479 necessary: if input[9] is -1 and input[0] through input[8] are 0, 534 * 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. */
480 negative values will remain in the array until the end. 535 for (j = 0; j < 2; j++) {
481 */ 536 for (i = 0; i < 9; i++) {
537 if ((i & 1) == 1) {
538 const s32 carry = input[i] >> 25;
539 input[i] &= 0x1ffffff;
540 input[i+1] += carry;
541 } else {
542 const s32 carry = input[i] >> 26;
543 input[i] &= 0x3ffffff;
544 input[i+1] += carry;
545 }
546 }
547
548 {
549 const s32 carry = input[9] >> 25;
550 input[9] &= 0x1ffffff;
551 input[0] += 19*carry;
552 }
553 }
554
555 /* If the first carry-chain pass, just above, ended up with a carry from
556 * input[9], and that caused input[0] to be out-of-bounds, then input[0] was
557 * < 2^26 + 2*19, because the carry was, at most, two.
558 *
559 * If the second pass carried from input[9] again then input[0] is < 2*19 and
560 * the input[9] -> input[0] carry didn't push input[0] out of bounds. */
561
562 /* It still remains the case that input might be between 2^255-19 and 2^255.
563 * In this case, input[1..9] must take their maximum value and input[0] must
564 * be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. */
565 mask = s32_gte(input[0], 0x3ffffed);
566 for (i = 1; i < 10; i++) {
567 if ((i & 1) == 1) {
568 mask &= s32_eq(input[i], 0x1ffffff);
569 } else {
570 mask &= s32_eq(input[i], 0x3ffffff);
571 }
572 }
573
574 /* mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus
575 * this conditionally subtracts 2^255-19. */
576 input[0] -= mask & 0x3ffffed;
577
578 for (i = 1; i < 10; i++) {
579 if ((i & 1) == 1) {
580 input[i] -= mask & 0x1ffffff;
581 } else {
582 input[i] -= mask & 0x3ffffff;
583 }
584 }
482 585
483 input[1] <<= 2; 586 input[1] <<= 2;
484 input[2] <<= 3; 587 input[2] <<= 3;
485 input[3] <<= 5; 588 input[3] <<= 5;
486 input[4] <<= 6; 589 input[4] <<= 6;
514 * x2 z3: long form 617 * x2 z3: long form
515 * x3 z3: long form 618 * x3 z3: long form
516 * x z: short form, destroyed 619 * x z: short form, destroyed
517 * xprime zprime: short form, destroyed 620 * xprime zprime: short form, destroyed
518 * qmqp: short form, preserved 621 * qmqp: short form, preserved
519 */ 622 *
623 * On entry and exit, the absolute value of the limbs of all inputs and outputs
624 * are < 2^26. */
520 static void fmonty(limb *x2, limb *z2, /* output 2Q */ 625 static void fmonty(limb *x2, limb *z2, /* output 2Q */
521 limb *x3, limb *z3, /* output Q + Q' */ 626 limb *x3, limb *z3, /* output Q + Q' */
522 limb *x, limb *z, /* input Q */ 627 limb *x, limb *z, /* input Q */
523 limb *xprime, limb *zprime, /* input Q' */ 628 limb *xprime, limb *zprime, /* input Q' */
524 const limb *qmqp /* input Q - Q' */) { 629 const limb *qmqp /* input Q - Q' */) {
525 limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19], 630 limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19],
526 zzprime[19], zzzprime[19], xxxprime[19]; 631 zzprime[19], zzzprime[19], xxxprime[19];
527 632
528 memcpy(origx, x, 10 * sizeof(limb)); 633 memcpy(origx, x, 10 * sizeof(limb));
529 fsum(x, z); 634 fsum(x, z);
635 /* |x[i]| < 2^27 */
530 fdifference(z, origx); /* does x - z */ 636 fdifference(z, origx); /* does x - z */
637 /* |z[i]| < 2^27 */
531 638
532 memcpy(origxprime, xprime, sizeof(limb) * 10); 639 memcpy(origxprime, xprime, sizeof(limb) * 10);
533 fsum(xprime, zprime); 640 fsum(xprime, zprime);
641 /* |xprime[i]| < 2^27 */
534 fdifference(zprime, origxprime); 642 fdifference(zprime, origxprime);
643 /* |zprime[i]| < 2^27 */
535 fproduct(xxprime, xprime, z); 644 fproduct(xxprime, xprime, z);
645 /* |xxprime[i]| < 14*2^54: the largest product of two limbs will be <
646 * 2^(27+27) and fproduct adds together, at most, 14 of those products.
647 * (Approximating that to 2^58 doesn't work out.) */
536 fproduct(zzprime, x, zprime); 648 fproduct(zzprime, x, zprime);
649 /* |zzprime[i]| < 14*2^54 */
537 freduce_degree(xxprime); 650 freduce_degree(xxprime);
538 freduce_coefficients(xxprime); 651 freduce_coefficients(xxprime);
652 /* |xxprime[i]| < 2^26 */
539 freduce_degree(zzprime); 653 freduce_degree(zzprime);
540 freduce_coefficients(zzprime); 654 freduce_coefficients(zzprime);
655 /* |zzprime[i]| < 2^26 */
541 memcpy(origxprime, xxprime, sizeof(limb) * 10); 656 memcpy(origxprime, xxprime, sizeof(limb) * 10);
542 fsum(xxprime, zzprime); 657 fsum(xxprime, zzprime);
658 /* |xxprime[i]| < 2^27 */
543 fdifference(zzprime, origxprime); 659 fdifference(zzprime, origxprime);
660 /* |zzprime[i]| < 2^27 */
544 fsquare(xxxprime, xxprime); 661 fsquare(xxxprime, xxprime);
662 /* |xxxprime[i]| < 2^26 */
545 fsquare(zzzprime, zzprime); 663 fsquare(zzzprime, zzprime);
664 /* |zzzprime[i]| < 2^26 */
546 fproduct(zzprime, zzzprime, qmqp); 665 fproduct(zzprime, zzzprime, qmqp);
666 /* |zzprime[i]| < 14*2^52 */
547 freduce_degree(zzprime); 667 freduce_degree(zzprime);
548 freduce_coefficients(zzprime); 668 freduce_coefficients(zzprime);
669 /* |zzprime[i]| < 2^26 */
549 memcpy(x3, xxxprime, sizeof(limb) * 10); 670 memcpy(x3, xxxprime, sizeof(limb) * 10);
550 memcpy(z3, zzprime, sizeof(limb) * 10); 671 memcpy(z3, zzprime, sizeof(limb) * 10);
551 672
552 fsquare(xx, x); 673 fsquare(xx, x);
674 /* |xx[i]| < 2^26 */
553 fsquare(zz, z); 675 fsquare(zz, z);
676 /* |zz[i]| < 2^26 */
554 fproduct(x2, xx, zz); 677 fproduct(x2, xx, zz);
678 /* |x2[i]| < 14*2^52 */
555 freduce_degree(x2); 679 freduce_degree(x2);
556 freduce_coefficients(x2); 680 freduce_coefficients(x2);
681 /* |x2[i]| < 2^26 */
557 fdifference(zz, xx); /* does zz = xx - zz */ 682 fdifference(zz, xx); /* does zz = xx - zz */
683 /* |zz[i]| < 2^27 */
558 memset(zzz + 10, 0, sizeof(limb) * 9); 684 memset(zzz + 10, 0, sizeof(limb) * 9);
559 fscalar_product(zzz, zz, 121665); 685 fscalar_product(zzz, zz, 121665);
686 /* |zzz[i]| < 2^(27+17) */
560 /* No need to call freduce_degree here: 687 /* No need to call freduce_degree here:
561 fscalar_product doesn't increase the degree of its input. */ 688 fscalar_product doesn't increase the degree of its input. */
562 freduce_coefficients(zzz); 689 freduce_coefficients(zzz);
690 /* |zzz[i]| < 2^26 */
563 fsum(zzz, xx); 691 fsum(zzz, xx);
692 /* |zzz[i]| < 2^27 */
564 fproduct(z2, zz, zzz); 693 fproduct(z2, zz, zzz);
694 /* |z2[i]| < 14*2^(26+27) */
565 freduce_degree(z2); 695 freduce_degree(z2);
566 freduce_coefficients(z2); 696 freduce_coefficients(z2);
697 /* |z2|i| < 2^26 */
567 } 698 }
568 699
569 /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave 700 /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
570 * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid 701 * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid
571 * side-channel attacks. 702 * side-channel attacks.
572 * 703 *
573 * NOTE that this function requires that 'iswap' be 1 or 0; other values give 704 * NOTE that this function requires that 'iswap' be 1 or 0; other values give
574 * wrong results. Also, the two limb arrays must be in reduced-coefficient, 705 * wrong results. Also, the two limb arrays must be in reduced-coefficient,
575 * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped, 706 * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
576 * and all all values in a[0..9],b[0..9] must have magnitude less than 707 * and all all values in a[0..9],b[0..9] must have magnitude less than
577 * INT32_MAX. 708 * INT32_MAX. */
578 */
579 static void 709 static void
580 swap_conditional(limb a[19], limb b[19], limb iswap) { 710 swap_conditional(limb a[19], limb b[19], limb iswap) {
581 unsigned i; 711 unsigned i;
582 const s32 swap = (s32) -iswap; 712 const s32 swap = (s32) -iswap;
583 713
590 720
591 /* Calculates nQ where Q is the x-coordinate of a point on the curve 721 /* Calculates nQ where Q is the x-coordinate of a point on the curve
592 * 722 *
593 * resultx/resultz: the x coordinate of the resulting curve point (short form) 723 * resultx/resultz: the x coordinate of the resulting curve point (short form)
594 * n: a little endian, 32-byte number 724 * n: a little endian, 32-byte number
595 * q: a point of the curve (short form) 725 * q: a point of the curve (short form) */
596 */
597 static void 726 static void
598 cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) { 727 cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) {
599 limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0}; 728 limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
600 limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; 729 limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
601 limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1}; 730 limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
709 /* 2^254 - 2^4 */ fsquare(t0,t1); 838 /* 2^254 - 2^4 */ fsquare(t0,t1);
710 /* 2^255 - 2^5 */ fsquare(t1,t0); 839 /* 2^255 - 2^5 */ fsquare(t1,t0);
711 /* 2^255 - 21 */ fmul(out,t1,z11); 840 /* 2^255 - 21 */ fmul(out,t1,z11);
712 } 841 }
713 842
714 int curve25519_donna(u8 *, const u8 *, const u8 *);
715
716 int 843 int
717 curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) { 844 curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) {
718 limb bp[10], x[10], z[11], zmone[10]; 845 limb bp[10], x[10], z[11], zmone[10];
719 uint8_t e[32]; 846 uint8_t e[32];
720 int i; 847 int i;
726 853
727 fexpand(bp, basepoint); 854 fexpand(bp, basepoint);
728 cmult(x, z, e, bp); 855 cmult(x, z, e, bp);
729 crecip(zmone, z); 856 crecip(zmone, z);
730 fmul(z, x, zmone); 857 fmul(z, x, zmone);
731 freduce_coefficients(z);
732 fcontract(mypublic, z); 858 fcontract(mypublic, z);
733 return 0; 859 return 0;
734 } 860 }