Mercurial > dropbear
comparison curve25519-donna.c @ 1437:871b18fd7065 fuzz
merge from main (libtommath/libtomcrypt/curve25510-donna updates)
author | Matt Johnston <matt@ucc.asn.au> |
---|---|
date | Sat, 24 Jun 2017 22:51:45 +0800 |
parents | 27b9ddb06b09 |
children |
comparison
equal
deleted
inserted
replaced
1432:41dca1e5ea34 | 1437:871b18fd7065 |
---|---|
41 * djb's sample implementation of curve25519 is written in a special assembly | 41 * djb's sample implementation of curve25519 is written in a special assembly |
42 * language called qhasm and uses the floating point registers. | 42 * language called qhasm and uses the floating point registers. |
43 * | 43 * |
44 * This is, almost, a clean room reimplementation from the curve25519 paper. It | 44 * This is, almost, a clean room reimplementation from the curve25519 paper. It |
45 * uses many of the tricks described therein. Only the crecip function is taken | 45 * uses many of the tricks described therein. Only the crecip function is taken |
46 * from the sample implementation. | 46 * from the sample implementation. */ |
47 */ | |
48 | 47 |
49 #include <string.h> | 48 #include <string.h> |
50 #include <stdint.h> | 49 #include <stdint.h> |
51 | 50 |
52 #ifdef _MSC_VER | 51 #ifdef _MSC_VER |
61 * | 60 * |
62 * Field elements are written as an array of signed, 64-bit limbs, least | 61 * Field elements are written as an array of signed, 64-bit limbs, least |
63 * significant first. The value of the field element is: | 62 * significant first. The value of the field element is: |
64 * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ... | 63 * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ... |
65 * | 64 * |
66 * i.e. the limbs are 26, 25, 26, 25, ... bits wide. | 65 * i.e. the limbs are 26, 25, 26, 25, ... bits wide. */ |
67 */ | |
68 | 66 |
69 /* Sum two numbers: output += in */ | 67 /* Sum two numbers: output += in */ |
70 static void fsum(limb *output, const limb *in) { | 68 static void fsum(limb *output, const limb *in) { |
71 unsigned i; | 69 unsigned i; |
72 for (i = 0; i < 10; i += 2) { | 70 for (i = 0; i < 10; i += 2) { |
73 output[0+i] = (output[0+i] + in[0+i]); | 71 output[0+i] = output[0+i] + in[0+i]; |
74 output[1+i] = (output[1+i] + in[1+i]); | 72 output[1+i] = output[1+i] + in[1+i]; |
75 } | 73 } |
76 } | 74 } |
77 | 75 |
78 /* Find the difference of two numbers: output = in - output | 76 /* Find the difference of two numbers: output = in - output |
79 * (note the order of the arguments!) | 77 * (note the order of the arguments!). */ |
80 */ | |
81 static void fdifference(limb *output, const limb *in) { | 78 static void fdifference(limb *output, const limb *in) { |
82 unsigned i; | 79 unsigned i; |
83 for (i = 0; i < 10; ++i) { | 80 for (i = 0; i < 10; ++i) { |
84 output[i] = (in[i] - output[i]); | 81 output[i] = in[i] - output[i]; |
85 } | 82 } |
86 } | 83 } |
87 | 84 |
88 /* Multiply a number by a scalar: output = in * scalar */ | 85 /* Multiply a number by a scalar: output = in * scalar */ |
89 static void fscalar_product(limb *output, const limb *in, const limb scalar) { | 86 static void fscalar_product(limb *output, const limb *in, const limb scalar) { |
95 | 92 |
96 /* Multiply two numbers: output = in2 * in | 93 /* Multiply two numbers: output = in2 * in |
97 * | 94 * |
98 * output must be distinct to both inputs. The inputs are reduced coefficient | 95 * output must be distinct to both inputs. The inputs are reduced coefficient |
99 * form, the output is not. | 96 * form, the output is not. |
100 */ | 97 * |
98 * output[x] <= 14 * the largest product of the input limbs. */ | |
101 static void fproduct(limb *output, const limb *in2, const limb *in) { | 99 static void fproduct(limb *output, const limb *in2, const limb *in) { |
102 output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]); | 100 output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]); |
103 output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) + | 101 output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) + |
104 ((limb) ((s32) in2[1])) * ((s32) in[0]); | 102 ((limb) ((s32) in2[1])) * ((s32) in[0]); |
105 output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) + | 103 output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) + |
199 output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) + | 197 output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) + |
200 ((limb) ((s32) in2[9])) * ((s32) in[8]); | 198 ((limb) ((s32) in2[9])) * ((s32) in[8]); |
201 output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]); | 199 output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]); |
202 } | 200 } |
203 | 201 |
204 /* Reduce a long form to a short form by taking the input mod 2^255 - 19. */ | 202 /* Reduce a long form to a short form by taking the input mod 2^255 - 19. |
203 * | |
204 * On entry: |output[i]| < 14*2^54 | |
205 * On exit: |output[0..8]| < 280*2^54 */ | |
205 static void freduce_degree(limb *output) { | 206 static void freduce_degree(limb *output) { |
206 /* Each of these shifts and adds ends up multiplying the value by 19. */ | 207 /* Each of these shifts and adds ends up multiplying the value by 19. |
208 * | |
209 * For output[0..8], the absolute entry value is < 14*2^54 and we add, at | |
210 * most, 19*14*2^54 thus, on exit, |output[0..8]| < 280*2^54. */ | |
207 output[8] += output[18] << 4; | 211 output[8] += output[18] << 4; |
208 output[8] += output[18] << 1; | 212 output[8] += output[18] << 1; |
209 output[8] += output[18]; | 213 output[8] += output[18]; |
210 output[7] += output[17] << 4; | 214 output[7] += output[17] << 4; |
211 output[7] += output[17] << 1; | 215 output[7] += output[17] << 1; |
235 | 239 |
236 #if (-1 & 3) != 3 | 240 #if (-1 & 3) != 3 |
237 #error "This code only works on a two's complement system" | 241 #error "This code only works on a two's complement system" |
238 #endif | 242 #endif |
239 | 243 |
240 /* return v / 2^26, using only shifts and adds. */ | 244 /* return v / 2^26, using only shifts and adds. |
245 * | |
246 * On entry: v can take any value. */ | |
241 static inline limb | 247 static inline limb |
242 div_by_2_26(const limb v) | 248 div_by_2_26(const limb v) |
243 { | 249 { |
244 /* High word of v; no shift needed*/ | 250 /* High word of v; no shift needed. */ |
245 const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); | 251 const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); |
246 /* Set to all 1s if v was negative; else set to 0s. */ | 252 /* Set to all 1s if v was negative; else set to 0s. */ |
247 const int32_t sign = ((int32_t) highword) >> 31; | 253 const int32_t sign = ((int32_t) highword) >> 31; |
248 /* Set to 0x3ffffff if v was negative; else set to 0. */ | 254 /* Set to 0x3ffffff if v was negative; else set to 0. */ |
249 const int32_t roundoff = ((uint32_t) sign) >> 6; | 255 const int32_t roundoff = ((uint32_t) sign) >> 6; |
250 /* Should return v / (1<<26) */ | 256 /* Should return v / (1<<26) */ |
251 return (v + roundoff) >> 26; | 257 return (v + roundoff) >> 26; |
252 } | 258 } |
253 | 259 |
254 /* return v / (2^25), using only shifts and adds. */ | 260 /* return v / (2^25), using only shifts and adds. |
261 * | |
262 * On entry: v can take any value. */ | |
255 static inline limb | 263 static inline limb |
256 div_by_2_25(const limb v) | 264 div_by_2_25(const limb v) |
257 { | 265 { |
258 /* High word of v; no shift needed*/ | 266 /* High word of v; no shift needed*/ |
259 const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); | 267 const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); |
263 const int32_t roundoff = ((uint32_t) sign) >> 7; | 271 const int32_t roundoff = ((uint32_t) sign) >> 7; |
264 /* Should return v / (1<<25) */ | 272 /* Should return v / (1<<25) */ |
265 return (v + roundoff) >> 25; | 273 return (v + roundoff) >> 25; |
266 } | 274 } |
267 | 275 |
268 static inline s32 | |
269 div_s32_by_2_25(const s32 v) | |
270 { | |
271 const s32 roundoff = ((uint32_t)(v >> 31)) >> 7; | |
272 return (v + roundoff) >> 25; | |
273 } | |
274 | |
275 /* Reduce all coefficients of the short form input so that |x| < 2^26. | 276 /* Reduce all coefficients of the short form input so that |x| < 2^26. |
276 * | 277 * |
277 * On entry: |output[i]| < 2^62 | 278 * On entry: |output[i]| < 280*2^54 */ |
278 */ | |
279 static void freduce_coefficients(limb *output) { | 279 static void freduce_coefficients(limb *output) { |
280 unsigned i; | 280 unsigned i; |
281 | 281 |
282 output[10] = 0; | 282 output[10] = 0; |
283 | 283 |
284 for (i = 0; i < 10; i += 2) { | 284 for (i = 0; i < 10; i += 2) { |
285 limb over = div_by_2_26(output[i]); | 285 limb over = div_by_2_26(output[i]); |
286 /* The entry condition (that |output[i]| < 280*2^54) means that over is, at | |
287 * most, 280*2^28 in the first iteration of this loop. This is added to the | |
288 * next limb and we can approximate the resulting bound of that limb by | |
289 * 281*2^54. */ | |
286 output[i] -= over << 26; | 290 output[i] -= over << 26; |
287 output[i+1] += over; | 291 output[i+1] += over; |
288 | 292 |
293 /* For the first iteration, |output[i+1]| < 281*2^54, thus |over| < | |
294 * 281*2^29. When this is added to the next limb, the resulting bound can | |
295 * be approximated as 281*2^54. | |
296 * | |
297 * For subsequent iterations of the loop, 281*2^54 remains a conservative | |
298 * bound and no overflow occurs. */ | |
289 over = div_by_2_25(output[i+1]); | 299 over = div_by_2_25(output[i+1]); |
290 output[i+1] -= over << 25; | 300 output[i+1] -= over << 25; |
291 output[i+2] += over; | 301 output[i+2] += over; |
292 } | 302 } |
293 /* Now |output[10]| < 2 ^ 38 and all other coefficients are reduced. */ | 303 /* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */ |
294 output[0] += output[10] << 4; | 304 output[0] += output[10] << 4; |
295 output[0] += output[10] << 1; | 305 output[0] += output[10] << 1; |
296 output[0] += output[10]; | 306 output[0] += output[10]; |
297 | 307 |
298 output[10] = 0; | 308 output[10] = 0; |
299 | 309 |
300 /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19 * 2^38 | 310 /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29 |
301 * So |over| will be no more than 77825 */ | 311 * So |over| will be no more than 2^16. */ |
302 { | 312 { |
303 limb over = div_by_2_26(output[0]); | 313 limb over = div_by_2_26(output[0]); |
304 output[0] -= over << 26; | 314 output[0] -= over << 26; |
305 output[1] += over; | 315 output[1] += over; |
306 } | 316 } |
307 | 317 |
308 /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 77825 | 318 /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The |
309 * So |over| will be no more than 1. */ | 319 * bound on |output[1]| is sufficient to meet our needs. */ |
310 { | |
311 /* output[1] fits in 32 bits, so we can use div_s32_by_2_25 here. */ | |
312 s32 over32 = div_s32_by_2_25((s32) output[1]); | |
313 output[1] -= over32 << 25; | |
314 output[2] += over32; | |
315 } | |
316 | |
317 /* Finally, output[0,1,3..9] are reduced, and output[2] is "nearly reduced": | |
318 * we have |output[2]| <= 2^26. This is good enough for all of our math, | |
319 * but it will require an extra freduce_coefficients before fcontract. */ | |
320 } | 320 } |
321 | 321 |
322 /* A helpful wrapper around fproduct: output = in * in2. | 322 /* A helpful wrapper around fproduct: output = in * in2. |
323 * | 323 * |
324 * output must be distinct to both inputs. The output is reduced degree and | 324 * On entry: |in[i]| < 2^27 and |in2[i]| < 2^27. |
325 * reduced coefficient. | 325 * |
326 */ | 326 * output must be distinct to both inputs. The output is reduced degree |
327 * (indeed, one need only provide storage for 10 limbs) and |output[i]| < 2^26. */ | |
327 static void | 328 static void |
328 fmul(limb *output, const limb *in, const limb *in2) { | 329 fmul(limb *output, const limb *in, const limb *in2) { |
329 limb t[19]; | 330 limb t[19]; |
330 fproduct(t, in, in2); | 331 fproduct(t, in, in2); |
332 /* |t[i]| < 14*2^54 */ | |
331 freduce_degree(t); | 333 freduce_degree(t); |
332 freduce_coefficients(t); | 334 freduce_coefficients(t); |
335 /* |t[i]| < 2^26 */ | |
333 memcpy(output, t, sizeof(limb) * 10); | 336 memcpy(output, t, sizeof(limb) * 10); |
334 } | 337 } |
335 | 338 |
339 /* Square a number: output = in**2 | |
340 * | |
341 * output must be distinct from the input. The inputs are reduced coefficient | |
342 * form, the output is not. | |
343 * | |
344 * output[x] <= 14 * the largest product of the input limbs. */ | |
336 static void fsquare_inner(limb *output, const limb *in) { | 345 static void fsquare_inner(limb *output, const limb *in) { |
337 output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]); | 346 output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]); |
338 output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]); | 347 output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]); |
339 output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) + | 348 output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) + |
340 ((limb) ((s32) in[0])) * ((s32) in[2])); | 349 ((limb) ((s32) in[0])) * ((s32) in[2])); |
389 4 * ((limb) ((s32) in[7])) * ((s32) in[9]); | 398 4 * ((limb) ((s32) in[7])) * ((s32) in[9]); |
390 output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]); | 399 output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]); |
391 output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]); | 400 output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]); |
392 } | 401 } |
393 | 402 |
403 /* fsquare sets output = in^2. | |
404 * | |
405 * On entry: The |in| argument is in reduced coefficients form and |in[i]| < | |
406 * 2^27. | |
407 * | |
408 * On exit: The |output| argument is in reduced coefficients form (indeed, one | |
409 * need only provide storage for 10 limbs) and |out[i]| < 2^26. */ | |
394 static void | 410 static void |
395 fsquare(limb *output, const limb *in) { | 411 fsquare(limb *output, const limb *in) { |
396 limb t[19]; | 412 limb t[19]; |
397 fsquare_inner(t, in); | 413 fsquare_inner(t, in); |
414 /* |t[i]| < 14*2^54 because the largest product of two limbs will be < | |
415 * 2^(27+27) and fsquare_inner adds together, at most, 14 of those | |
416 * products. */ | |
398 freduce_degree(t); | 417 freduce_degree(t); |
399 freduce_coefficients(t); | 418 freduce_coefficients(t); |
419 /* |t[i]| < 2^26 */ | |
400 memcpy(output, t, sizeof(limb) * 10); | 420 memcpy(output, t, sizeof(limb) * 10); |
401 } | 421 } |
402 | 422 |
403 /* Take a little-endian, 32-byte number and expand it into polynomial form */ | 423 /* Take a little-endian, 32-byte number and expand it into polynomial form */ |
404 static void | 424 static void |
415 F(4, 12, 6, 0x3ffffff); | 435 F(4, 12, 6, 0x3ffffff); |
416 F(5, 16, 0, 0x1ffffff); | 436 F(5, 16, 0, 0x1ffffff); |
417 F(6, 19, 1, 0x3ffffff); | 437 F(6, 19, 1, 0x3ffffff); |
418 F(7, 22, 3, 0x1ffffff); | 438 F(7, 22, 3, 0x1ffffff); |
419 F(8, 25, 4, 0x3ffffff); | 439 F(8, 25, 4, 0x3ffffff); |
420 F(9, 28, 6, 0x3ffffff); | 440 F(9, 28, 6, 0x1ffffff); |
421 #undef F | 441 #undef F |
422 } | 442 } |
423 | 443 |
424 #if (-32 >> 1) != -16 | 444 #if (-32 >> 1) != -16 |
425 #error "This code only works when >> does sign-extension on negative numbers" | 445 #error "This code only works when >> does sign-extension on negative numbers" |
426 #endif | 446 #endif |
427 | 447 |
448 /* s32_eq returns 0xffffffff iff a == b and zero otherwise. */ | |
449 static s32 s32_eq(s32 a, s32 b) { | |
450 a = ~(a ^ b); | |
451 a &= a << 16; | |
452 a &= a << 8; | |
453 a &= a << 4; | |
454 a &= a << 2; | |
455 a &= a << 1; | |
456 return a >> 31; | |
457 } | |
458 | |
459 /* s32_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are | |
460 * both non-negative. */ | |
461 static s32 s32_gte(s32 a, s32 b) { | |
462 a -= b; | |
463 /* a >= 0 iff a >= b. */ | |
464 return ~(a >> 31); | |
465 } | |
466 | |
428 /* Take a fully reduced polynomial form number and contract it into a | 467 /* Take a fully reduced polynomial form number and contract it into a |
429 * little-endian, 32-byte array | 468 * little-endian, 32-byte array. |
430 */ | 469 * |
470 * On entry: |input_limbs[i]| < 2^26 */ | |
431 static void | 471 static void |
432 fcontract(u8 *output, limb *input) { | 472 fcontract(u8 *output, limb *input_limbs) { |
433 int i; | 473 int i; |
434 int j; | 474 int j; |
475 s32 input[10]; | |
476 s32 mask; | |
477 | |
478 /* |input_limbs[i]| < 2^26, so it's valid to convert to an s32. */ | |
479 for (i = 0; i < 10; i++) { | |
480 input[i] = input_limbs[i]; | |
481 } | |
435 | 482 |
436 for (j = 0; j < 2; ++j) { | 483 for (j = 0; j < 2; ++j) { |
437 for (i = 0; i < 9; ++i) { | 484 for (i = 0; i < 9; ++i) { |
438 if ((i & 1) == 1) { | 485 if ((i & 1) == 1) { |
439 /* This calculation is a time-invariant way to make input[i] positive | 486 /* This calculation is a time-invariant way to make input[i] |
440 by borrowing from the next-larger limb. | 487 * non-negative by borrowing from the next-larger limb. */ |
441 */ | 488 const s32 mask = input[i] >> 31; |
442 const s32 mask = (s32)(input[i]) >> 31; | 489 const s32 carry = -((input[i] & mask) >> 25); |
443 const s32 carry = -(((s32)(input[i]) & mask) >> 25); | 490 input[i] = input[i] + (carry << 25); |
444 input[i] = (s32)(input[i]) + (carry << 25); | 491 input[i+1] = input[i+1] - carry; |
445 input[i+1] = (s32)(input[i+1]) - carry; | |
446 } else { | 492 } else { |
447 const s32 mask = (s32)(input[i]) >> 31; | 493 const s32 mask = input[i] >> 31; |
448 const s32 carry = -(((s32)(input[i]) & mask) >> 26); | 494 const s32 carry = -((input[i] & mask) >> 26); |
449 input[i] = (s32)(input[i]) + (carry << 26); | 495 input[i] = input[i] + (carry << 26); |
450 input[i+1] = (s32)(input[i+1]) - carry; | 496 input[i+1] = input[i+1] - carry; |
451 } | 497 } |
452 } | 498 } |
499 | |
500 /* There's no greater limb for input[9] to borrow from, but we can multiply | |
501 * by 19 and borrow from input[0], which is valid mod 2^255-19. */ | |
453 { | 502 { |
454 const s32 mask = (s32)(input[9]) >> 31; | 503 const s32 mask = input[9] >> 31; |
455 const s32 carry = -(((s32)(input[9]) & mask) >> 25); | 504 const s32 carry = -((input[9] & mask) >> 25); |
456 input[9] = (s32)(input[9]) + (carry << 25); | 505 input[9] = input[9] + (carry << 25); |
457 input[0] = (s32)(input[0]) - (carry * 19); | 506 input[0] = input[0] - (carry * 19); |
458 } | 507 } |
508 | |
509 /* After the first iteration, input[1..9] are non-negative and fit within | |
510 * 25 or 26 bits, depending on position. However, input[0] may be | |
511 * negative. */ | |
459 } | 512 } |
460 | 513 |
461 /* The first borrow-propagation pass above ended with every limb | 514 /* The first borrow-propagation pass above ended with every limb |
462 except (possibly) input[0] non-negative. | 515 except (possibly) input[0] non-negative. |
463 | 516 |
464 Since each input limb except input[0] is decreased by at most 1 | 517 If input[0] was negative after the first pass, then it was because of a |
465 by a borrow-propagation pass, the second borrow-propagation pass | 518 carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most, |
466 could only have wrapped around to decrease input[0] again if the | 519 one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19. |
467 first pass left input[0] negative *and* input[1] through input[9] | 520 |
468 were all zero. In that case, input[1] is now 2^25 - 1, and this | 521 In the second pass, each limb is decreased by at most one. Thus the second |
469 last borrow-propagation step will leave input[1] non-negative. | 522 borrow-propagation pass could only have wrapped around to decrease |
470 */ | 523 input[0] again if the first pass left input[0] negative *and* input[1] |
524 through input[9] were all zero. In that case, input[1] is now 2^25 - 1, | |
525 and this last borrow-propagation step will leave input[1] non-negative. */ | |
471 { | 526 { |
472 const s32 mask = (s32)(input[0]) >> 31; | 527 const s32 mask = input[0] >> 31; |
473 const s32 carry = -(((s32)(input[0]) & mask) >> 26); | 528 const s32 carry = -((input[0] & mask) >> 26); |
474 input[0] = (s32)(input[0]) + (carry << 26); | 529 input[0] = input[0] + (carry << 26); |
475 input[1] = (s32)(input[1]) - carry; | 530 input[1] = input[1] - carry; |
476 } | 531 } |
477 | 532 |
478 /* Both passes through the above loop, plus the last 0-to-1 step, are | 533 /* All input[i] are now non-negative. However, there might be values between |
479 necessary: if input[9] is -1 and input[0] through input[8] are 0, | 534 * 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. */ |
480 negative values will remain in the array until the end. | 535 for (j = 0; j < 2; j++) { |
481 */ | 536 for (i = 0; i < 9; i++) { |
537 if ((i & 1) == 1) { | |
538 const s32 carry = input[i] >> 25; | |
539 input[i] &= 0x1ffffff; | |
540 input[i+1] += carry; | |
541 } else { | |
542 const s32 carry = input[i] >> 26; | |
543 input[i] &= 0x3ffffff; | |
544 input[i+1] += carry; | |
545 } | |
546 } | |
547 | |
548 { | |
549 const s32 carry = input[9] >> 25; | |
550 input[9] &= 0x1ffffff; | |
551 input[0] += 19*carry; | |
552 } | |
553 } | |
554 | |
555 /* If the first carry-chain pass, just above, ended up with a carry from | |
556 * input[9], and that caused input[0] to be out-of-bounds, then input[0] was | |
557 * < 2^26 + 2*19, because the carry was, at most, two. | |
558 * | |
559 * If the second pass carried from input[9] again then input[0] is < 2*19 and | |
560 * the input[9] -> input[0] carry didn't push input[0] out of bounds. */ | |
561 | |
562 /* It still remains the case that input might be between 2^255-19 and 2^255. | |
563 * In this case, input[1..9] must take their maximum value and input[0] must | |
564 * be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. */ | |
565 mask = s32_gte(input[0], 0x3ffffed); | |
566 for (i = 1; i < 10; i++) { | |
567 if ((i & 1) == 1) { | |
568 mask &= s32_eq(input[i], 0x1ffffff); | |
569 } else { | |
570 mask &= s32_eq(input[i], 0x3ffffff); | |
571 } | |
572 } | |
573 | |
574 /* mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus | |
575 * this conditionally subtracts 2^255-19. */ | |
576 input[0] -= mask & 0x3ffffed; | |
577 | |
578 for (i = 1; i < 10; i++) { | |
579 if ((i & 1) == 1) { | |
580 input[i] -= mask & 0x1ffffff; | |
581 } else { | |
582 input[i] -= mask & 0x3ffffff; | |
583 } | |
584 } | |
482 | 585 |
483 input[1] <<= 2; | 586 input[1] <<= 2; |
484 input[2] <<= 3; | 587 input[2] <<= 3; |
485 input[3] <<= 5; | 588 input[3] <<= 5; |
486 input[4] <<= 6; | 589 input[4] <<= 6; |
514 * x2 z3: long form | 617 * x2 z3: long form |
515 * x3 z3: long form | 618 * x3 z3: long form |
516 * x z: short form, destroyed | 619 * x z: short form, destroyed |
517 * xprime zprime: short form, destroyed | 620 * xprime zprime: short form, destroyed |
518 * qmqp: short form, preserved | 621 * qmqp: short form, preserved |
519 */ | 622 * |
623 * On entry and exit, the absolute value of the limbs of all inputs and outputs | |
624 * are < 2^26. */ | |
520 static void fmonty(limb *x2, limb *z2, /* output 2Q */ | 625 static void fmonty(limb *x2, limb *z2, /* output 2Q */ |
521 limb *x3, limb *z3, /* output Q + Q' */ | 626 limb *x3, limb *z3, /* output Q + Q' */ |
522 limb *x, limb *z, /* input Q */ | 627 limb *x, limb *z, /* input Q */ |
523 limb *xprime, limb *zprime, /* input Q' */ | 628 limb *xprime, limb *zprime, /* input Q' */ |
524 const limb *qmqp /* input Q - Q' */) { | 629 const limb *qmqp /* input Q - Q' */) { |
525 limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19], | 630 limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19], |
526 zzprime[19], zzzprime[19], xxxprime[19]; | 631 zzprime[19], zzzprime[19], xxxprime[19]; |
527 | 632 |
528 memcpy(origx, x, 10 * sizeof(limb)); | 633 memcpy(origx, x, 10 * sizeof(limb)); |
529 fsum(x, z); | 634 fsum(x, z); |
635 /* |x[i]| < 2^27 */ | |
530 fdifference(z, origx); /* does x - z */ | 636 fdifference(z, origx); /* does x - z */ |
637 /* |z[i]| < 2^27 */ | |
531 | 638 |
532 memcpy(origxprime, xprime, sizeof(limb) * 10); | 639 memcpy(origxprime, xprime, sizeof(limb) * 10); |
533 fsum(xprime, zprime); | 640 fsum(xprime, zprime); |
641 /* |xprime[i]| < 2^27 */ | |
534 fdifference(zprime, origxprime); | 642 fdifference(zprime, origxprime); |
643 /* |zprime[i]| < 2^27 */ | |
535 fproduct(xxprime, xprime, z); | 644 fproduct(xxprime, xprime, z); |
645 /* |xxprime[i]| < 14*2^54: the largest product of two limbs will be < | |
646 * 2^(27+27) and fproduct adds together, at most, 14 of those products. | |
647 * (Approximating that to 2^58 doesn't work out.) */ | |
536 fproduct(zzprime, x, zprime); | 648 fproduct(zzprime, x, zprime); |
649 /* |zzprime[i]| < 14*2^54 */ | |
537 freduce_degree(xxprime); | 650 freduce_degree(xxprime); |
538 freduce_coefficients(xxprime); | 651 freduce_coefficients(xxprime); |
652 /* |xxprime[i]| < 2^26 */ | |
539 freduce_degree(zzprime); | 653 freduce_degree(zzprime); |
540 freduce_coefficients(zzprime); | 654 freduce_coefficients(zzprime); |
655 /* |zzprime[i]| < 2^26 */ | |
541 memcpy(origxprime, xxprime, sizeof(limb) * 10); | 656 memcpy(origxprime, xxprime, sizeof(limb) * 10); |
542 fsum(xxprime, zzprime); | 657 fsum(xxprime, zzprime); |
658 /* |xxprime[i]| < 2^27 */ | |
543 fdifference(zzprime, origxprime); | 659 fdifference(zzprime, origxprime); |
660 /* |zzprime[i]| < 2^27 */ | |
544 fsquare(xxxprime, xxprime); | 661 fsquare(xxxprime, xxprime); |
662 /* |xxxprime[i]| < 2^26 */ | |
545 fsquare(zzzprime, zzprime); | 663 fsquare(zzzprime, zzprime); |
664 /* |zzzprime[i]| < 2^26 */ | |
546 fproduct(zzprime, zzzprime, qmqp); | 665 fproduct(zzprime, zzzprime, qmqp); |
666 /* |zzprime[i]| < 14*2^52 */ | |
547 freduce_degree(zzprime); | 667 freduce_degree(zzprime); |
548 freduce_coefficients(zzprime); | 668 freduce_coefficients(zzprime); |
669 /* |zzprime[i]| < 2^26 */ | |
549 memcpy(x3, xxxprime, sizeof(limb) * 10); | 670 memcpy(x3, xxxprime, sizeof(limb) * 10); |
550 memcpy(z3, zzprime, sizeof(limb) * 10); | 671 memcpy(z3, zzprime, sizeof(limb) * 10); |
551 | 672 |
552 fsquare(xx, x); | 673 fsquare(xx, x); |
674 /* |xx[i]| < 2^26 */ | |
553 fsquare(zz, z); | 675 fsquare(zz, z); |
676 /* |zz[i]| < 2^26 */ | |
554 fproduct(x2, xx, zz); | 677 fproduct(x2, xx, zz); |
678 /* |x2[i]| < 14*2^52 */ | |
555 freduce_degree(x2); | 679 freduce_degree(x2); |
556 freduce_coefficients(x2); | 680 freduce_coefficients(x2); |
681 /* |x2[i]| < 2^26 */ | |
557 fdifference(zz, xx); /* does zz = xx - zz */ | 682 fdifference(zz, xx); /* does zz = xx - zz */ |
683 /* |zz[i]| < 2^27 */ | |
558 memset(zzz + 10, 0, sizeof(limb) * 9); | 684 memset(zzz + 10, 0, sizeof(limb) * 9); |
559 fscalar_product(zzz, zz, 121665); | 685 fscalar_product(zzz, zz, 121665); |
686 /* |zzz[i]| < 2^(27+17) */ | |
560 /* No need to call freduce_degree here: | 687 /* No need to call freduce_degree here: |
561 fscalar_product doesn't increase the degree of its input. */ | 688 fscalar_product doesn't increase the degree of its input. */ |
562 freduce_coefficients(zzz); | 689 freduce_coefficients(zzz); |
690 /* |zzz[i]| < 2^26 */ | |
563 fsum(zzz, xx); | 691 fsum(zzz, xx); |
692 /* |zzz[i]| < 2^27 */ | |
564 fproduct(z2, zz, zzz); | 693 fproduct(z2, zz, zzz); |
694 /* |z2[i]| < 14*2^(26+27) */ | |
565 freduce_degree(z2); | 695 freduce_degree(z2); |
566 freduce_coefficients(z2); | 696 freduce_coefficients(z2); |
697 /* |z2|i| < 2^26 */ | |
567 } | 698 } |
568 | 699 |
569 /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave | 700 /* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave |
570 * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid | 701 * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid |
571 * side-channel attacks. | 702 * side-channel attacks. |
572 * | 703 * |
573 * NOTE that this function requires that 'iswap' be 1 or 0; other values give | 704 * NOTE that this function requires that 'iswap' be 1 or 0; other values give |
574 * wrong results. Also, the two limb arrays must be in reduced-coefficient, | 705 * wrong results. Also, the two limb arrays must be in reduced-coefficient, |
575 * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped, | 706 * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped, |
576 * and all all values in a[0..9],b[0..9] must have magnitude less than | 707 * and all all values in a[0..9],b[0..9] must have magnitude less than |
577 * INT32_MAX. | 708 * INT32_MAX. */ |
578 */ | |
579 static void | 709 static void |
580 swap_conditional(limb a[19], limb b[19], limb iswap) { | 710 swap_conditional(limb a[19], limb b[19], limb iswap) { |
581 unsigned i; | 711 unsigned i; |
582 const s32 swap = (s32) -iswap; | 712 const s32 swap = (s32) -iswap; |
583 | 713 |
590 | 720 |
591 /* Calculates nQ where Q is the x-coordinate of a point on the curve | 721 /* Calculates nQ where Q is the x-coordinate of a point on the curve |
592 * | 722 * |
593 * resultx/resultz: the x coordinate of the resulting curve point (short form) | 723 * resultx/resultz: the x coordinate of the resulting curve point (short form) |
594 * n: a little endian, 32-byte number | 724 * n: a little endian, 32-byte number |
595 * q: a point of the curve (short form) | 725 * q: a point of the curve (short form) */ |
596 */ | |
597 static void | 726 static void |
598 cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) { | 727 cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) { |
599 limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0}; | 728 limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0}; |
600 limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; | 729 limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; |
601 limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1}; | 730 limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1}; |
709 /* 2^254 - 2^4 */ fsquare(t0,t1); | 838 /* 2^254 - 2^4 */ fsquare(t0,t1); |
710 /* 2^255 - 2^5 */ fsquare(t1,t0); | 839 /* 2^255 - 2^5 */ fsquare(t1,t0); |
711 /* 2^255 - 21 */ fmul(out,t1,z11); | 840 /* 2^255 - 21 */ fmul(out,t1,z11); |
712 } | 841 } |
713 | 842 |
714 int curve25519_donna(u8 *, const u8 *, const u8 *); | |
715 | |
716 int | 843 int |
717 curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) { | 844 curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) { |
718 limb bp[10], x[10], z[11], zmone[10]; | 845 limb bp[10], x[10], z[11], zmone[10]; |
719 uint8_t e[32]; | 846 uint8_t e[32]; |
720 int i; | 847 int i; |
726 | 853 |
727 fexpand(bp, basepoint); | 854 fexpand(bp, basepoint); |
728 cmult(x, z, e, bp); | 855 cmult(x, z, e, bp); |
729 crecip(zmone, z); | 856 crecip(zmone, z); |
730 fmul(z, x, zmone); | 857 fmul(z, x, zmone); |
731 freduce_coefficients(z); | |
732 fcontract(mypublic, z); | 858 fcontract(mypublic, z); |
733 return 0; | 859 return 0; |
734 } | 860 } |