1 // The MIT License (MIT)
2 //
3 // Copyright (c) 2015-2016 the fiat-crypto authors (see the AUTHORS file).
4 //
5 // Permission is hereby granted, free of charge, to any person obtaining a copy
6 // of this software and associated documentation files (the "Software"), to deal
7 // in the Software without restriction, including without limitation the rights
8 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 // copies of the Software, and to permit persons to whom the Software is
10 // furnished to do so, subject to the following conditions:
11 //
12 // The above copyright notice and this permission notice shall be included in all
13 // copies or substantial portions of the Software.
14 //
15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 // SOFTWARE.
22
23 // Some of this code is taken from the ref10 version of Ed25519 in SUPERCOP
24 // 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as
25 // public domain but parts have been replaced with code generated by Fiat
26 // (https://github.com/mit-plv/fiat-crypto), which is MIT licensed.
27 //
28 // The field functions are shared by Ed25519 and X25519 where possible.
29
30 #include <openssl/curve25519.h>
31
32 #include <assert.h>
33 #include <string.h>
34
35 #include <openssl/cpu.h>
36 #include <openssl/mem.h>
37 #include <openssl/rand.h>
38 #include <openssl/sha.h>
39 #include <openssl/type_check.h>
40
41 #include "internal.h"
42 #include "../../crypto/internal.h"
43
44
45 // Various pre-computed constants.
46 #include "./curve25519_tables.h"
47
48 #if defined(BORINGSSL_CURVE25519_64BIT)
49 #include "./curve25519_64.h"
50 #else
51 #include "./curve25519_32.h"
52 #endif // BORINGSSL_CURVE25519_64BIT
53
54
55 // Low-level intrinsic operations
56
load_3(const uint8_t * in)57 static uint64_t load_3(const uint8_t *in) {
58 uint64_t result;
59 result = (uint64_t)in[0];
60 result |= ((uint64_t)in[1]) << 8;
61 result |= ((uint64_t)in[2]) << 16;
62 return result;
63 }
64
load_4(const uint8_t * in)65 static uint64_t load_4(const uint8_t *in) {
66 uint64_t result;
67 result = (uint64_t)in[0];
68 result |= ((uint64_t)in[1]) << 8;
69 result |= ((uint64_t)in[2]) << 16;
70 result |= ((uint64_t)in[3]) << 24;
71 return result;
72 }
73
74
75 // Field operations.
76
77 #if defined(BORINGSSL_CURVE25519_64BIT)
78
79 typedef uint64_t fe_limb_t;
80 #define FE_NUM_LIMBS 5
81
82 // assert_fe asserts that |f| satisfies bounds:
83 //
84 // [[0x0 ~> 0x8cccccccccccc],
85 // [0x0 ~> 0x8cccccccccccc],
86 // [0x0 ~> 0x8cccccccccccc],
87 // [0x0 ~> 0x8cccccccccccc],
88 // [0x0 ~> 0x8cccccccccccc]]
89 //
90 // See comments in curve25519_64.h for which functions use these bounds for
91 // inputs or outputs.
92 #define assert_fe(f) \
93 do { \
94 for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
95 assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc)); \
96 } \
97 } while (0)
98
99 // assert_fe_loose asserts that |f| satisfies bounds:
100 //
101 // [[0x0 ~> 0x1a666666666664],
102 // [0x0 ~> 0x1a666666666664],
103 // [0x0 ~> 0x1a666666666664],
104 // [0x0 ~> 0x1a666666666664],
105 // [0x0 ~> 0x1a666666666664]]
106 //
107 // See comments in curve25519_64.h for which functions use these bounds for
108 // inputs or outputs.
109 #define assert_fe_loose(f) \
110 do { \
111 for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \
112 assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664)); \
113 } \
114 } while (0)
115
116 #else
117
118 typedef uint32_t fe_limb_t;
119 #define FE_NUM_LIMBS 10
120
121 // assert_fe asserts that |f| satisfies bounds:
122 //
123 // [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
124 // [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
125 // [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
126 // [0x0 ~> 0x4666666], [0x0 ~> 0x2333333],
127 // [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]]
128 //
129 // See comments in curve25519_32.h for which functions use these bounds for
130 // inputs or outputs.
131 #define assert_fe(f) \
132 do { \
133 for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
134 assert(f[_assert_fe_i] <= \
135 ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u)); \
136 } \
137 } while (0)
138
139 // assert_fe_loose asserts that |f| satisfies bounds:
140 //
141 // [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
142 // [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
143 // [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
144 // [0x0 ~> 0xd333332], [0x0 ~> 0x6999999],
145 // [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]]
146 //
147 // See comments in curve25519_32.h for which functions use these bounds for
148 // inputs or outputs.
149 #define assert_fe_loose(f) \
150 do { \
151 for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \
152 assert(f[_assert_fe_i] <= \
153 ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u)); \
154 } \
155 } while (0)
156
157 #endif // BORINGSSL_CURVE25519_64BIT
158
159 OPENSSL_STATIC_ASSERT(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS,
160 "fe_limb_t[FE_NUM_LIMBS] is inconsistent with fe");
161
fe_frombytes_strict(fe * h,const uint8_t s[32])162 static void fe_frombytes_strict(fe *h, const uint8_t s[32]) {
163 // |fiat_25519_from_bytes| requires the top-most bit be clear.
164 assert((s[31] & 0x80) == 0);
165 fiat_25519_from_bytes(h->v, s);
166 assert_fe(h->v);
167 }
168
fe_frombytes(fe * h,const uint8_t s[32])169 static void fe_frombytes(fe *h, const uint8_t s[32]) {
170 uint8_t s_copy[32];
171 OPENSSL_memcpy(s_copy, s, 32);
172 s_copy[31] &= 0x7f;
173 fe_frombytes_strict(h, s_copy);
174 }
175
fe_tobytes(uint8_t s[32],const fe * f)176 static void fe_tobytes(uint8_t s[32], const fe *f) {
177 assert_fe(f->v);
178 fiat_25519_to_bytes(s, f->v);
179 }
180
181 // h = 0
fe_0(fe * h)182 static void fe_0(fe *h) {
183 OPENSSL_memset(h, 0, sizeof(fe));
184 }
185
fe_loose_0(fe_loose * h)186 static void fe_loose_0(fe_loose *h) {
187 OPENSSL_memset(h, 0, sizeof(fe_loose));
188 }
189
190 // h = 1
fe_1(fe * h)191 static void fe_1(fe *h) {
192 OPENSSL_memset(h, 0, sizeof(fe));
193 h->v[0] = 1;
194 }
195
fe_loose_1(fe_loose * h)196 static void fe_loose_1(fe_loose *h) {
197 OPENSSL_memset(h, 0, sizeof(fe_loose));
198 h->v[0] = 1;
199 }
200
201 // h = f + g
202 // Can overlap h with f or g.
fe_add(fe_loose * h,const fe * f,const fe * g)203 static void fe_add(fe_loose *h, const fe *f, const fe *g) {
204 assert_fe(f->v);
205 assert_fe(g->v);
206 fiat_25519_add(h->v, f->v, g->v);
207 assert_fe_loose(h->v);
208 }
209
210 // h = f - g
211 // Can overlap h with f or g.
fe_sub(fe_loose * h,const fe * f,const fe * g)212 static void fe_sub(fe_loose *h, const fe *f, const fe *g) {
213 assert_fe(f->v);
214 assert_fe(g->v);
215 fiat_25519_sub(h->v, f->v, g->v);
216 assert_fe_loose(h->v);
217 }
218
fe_carry(fe * h,const fe_loose * f)219 static void fe_carry(fe *h, const fe_loose* f) {
220 assert_fe_loose(f->v);
221 fiat_25519_carry(h->v, f->v);
222 assert_fe(h->v);
223 }
224
fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS],const fe_limb_t in1[FE_NUM_LIMBS],const fe_limb_t in2[FE_NUM_LIMBS])225 static void fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS],
226 const fe_limb_t in1[FE_NUM_LIMBS],
227 const fe_limb_t in2[FE_NUM_LIMBS]) {
228 assert_fe_loose(in1);
229 assert_fe_loose(in2);
230 fiat_25519_carry_mul(out, in1, in2);
231 assert_fe(out);
232 }
233
fe_mul_ltt(fe_loose * h,const fe * f,const fe * g)234 static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) {
235 fe_mul_impl(h->v, f->v, g->v);
236 }
237
fe_mul_llt(fe_loose * h,const fe_loose * f,const fe * g)238 static void fe_mul_llt(fe_loose *h, const fe_loose *f, const fe *g) {
239 fe_mul_impl(h->v, f->v, g->v);
240 }
241
fe_mul_ttt(fe * h,const fe * f,const fe * g)242 static void fe_mul_ttt(fe *h, const fe *f, const fe *g) {
243 fe_mul_impl(h->v, f->v, g->v);
244 }
245
fe_mul_tlt(fe * h,const fe_loose * f,const fe * g)246 static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) {
247 fe_mul_impl(h->v, f->v, g->v);
248 }
249
fe_mul_ttl(fe * h,const fe * f,const fe_loose * g)250 static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g) {
251 fe_mul_impl(h->v, f->v, g->v);
252 }
253
fe_mul_tll(fe * h,const fe_loose * f,const fe_loose * g)254 static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) {
255 fe_mul_impl(h->v, f->v, g->v);
256 }
257
fe_sq_tl(fe * h,const fe_loose * f)258 static void fe_sq_tl(fe *h, const fe_loose *f) {
259 assert_fe_loose(f->v);
260 fiat_25519_carry_square(h->v, f->v);
261 assert_fe(h->v);
262 }
263
fe_sq_tt(fe * h,const fe * f)264 static void fe_sq_tt(fe *h, const fe *f) {
265 assert_fe_loose(f->v);
266 fiat_25519_carry_square(h->v, f->v);
267 assert_fe(h->v);
268 }
269
270 // Replace (f,g) with (g,f) if b == 1;
271 // replace (f,g) with (f,g) if b == 0.
272 //
273 // Preconditions: b in {0,1}.
fe_cswap(fe * f,fe * g,fe_limb_t b)274 static void fe_cswap(fe *f, fe *g, fe_limb_t b) {
275 b = 0-b;
276 for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
277 fe_limb_t x = f->v[i] ^ g->v[i];
278 x &= b;
279 f->v[i] ^= x;
280 g->v[i] ^= x;
281 }
282 }
283
fe_mul121666(fe * h,const fe_loose * f)284 static void fe_mul121666(fe *h, const fe_loose *f) {
285 assert_fe_loose(f->v);
286 fiat_25519_carry_scmul_121666(h->v, f->v);
287 assert_fe(h->v);
288 }
289
290 // h = -f
fe_neg(fe_loose * h,const fe * f)291 static void fe_neg(fe_loose *h, const fe *f) {
292 assert_fe(f->v);
293 fiat_25519_opp(h->v, f->v);
294 assert_fe_loose(h->v);
295 }
296
297 // Replace (f,g) with (g,g) if b == 1;
298 // replace (f,g) with (f,g) if b == 0.
299 //
300 // Preconditions: b in {0,1}.
fe_cmov(fe_loose * f,const fe_loose * g,fe_limb_t b)301 static void fe_cmov(fe_loose *f, const fe_loose *g, fe_limb_t b) {
302 // Silence an unused function warning. |fiat_25519_selectznz| isn't quite the
303 // calling convention the rest of this code wants, so implement it by hand.
304 //
305 // TODO(davidben): Switch to fiat's calling convention, or ask fiat to emit a
306 // different one.
307 (void)fiat_25519_selectznz;
308
309 b = 0-b;
310 for (unsigned i = 0; i < FE_NUM_LIMBS; i++) {
311 fe_limb_t x = f->v[i] ^ g->v[i];
312 x &= b;
313 f->v[i] ^= x;
314 }
315 }
316
317 // h = f
fe_copy(fe * h,const fe * f)318 static void fe_copy(fe *h, const fe *f) {
319 OPENSSL_memmove(h, f, sizeof(fe));
320 }
321
fe_copy_lt(fe_loose * h,const fe * f)322 static void fe_copy_lt(fe_loose *h, const fe *f) {
323 OPENSSL_STATIC_ASSERT(sizeof(fe_loose) == sizeof(fe),
324 "fe and fe_loose mismatch");
325 OPENSSL_memmove(h, f, sizeof(fe));
326 }
327 #if !defined(OPENSSL_SMALL)
fe_copy_ll(fe_loose * h,const fe_loose * f)328 static void fe_copy_ll(fe_loose *h, const fe_loose *f) {
329 OPENSSL_memmove(h, f, sizeof(fe_loose));
330 }
331 #endif // !defined(OPENSSL_SMALL)
332
fe_loose_invert(fe * out,const fe_loose * z)333 static void fe_loose_invert(fe *out, const fe_loose *z) {
334 fe t0;
335 fe t1;
336 fe t2;
337 fe t3;
338 int i;
339
340 fe_sq_tl(&t0, z);
341 fe_sq_tt(&t1, &t0);
342 for (i = 1; i < 2; ++i) {
343 fe_sq_tt(&t1, &t1);
344 }
345 fe_mul_tlt(&t1, z, &t1);
346 fe_mul_ttt(&t0, &t0, &t1);
347 fe_sq_tt(&t2, &t0);
348 fe_mul_ttt(&t1, &t1, &t2);
349 fe_sq_tt(&t2, &t1);
350 for (i = 1; i < 5; ++i) {
351 fe_sq_tt(&t2, &t2);
352 }
353 fe_mul_ttt(&t1, &t2, &t1);
354 fe_sq_tt(&t2, &t1);
355 for (i = 1; i < 10; ++i) {
356 fe_sq_tt(&t2, &t2);
357 }
358 fe_mul_ttt(&t2, &t2, &t1);
359 fe_sq_tt(&t3, &t2);
360 for (i = 1; i < 20; ++i) {
361 fe_sq_tt(&t3, &t3);
362 }
363 fe_mul_ttt(&t2, &t3, &t2);
364 fe_sq_tt(&t2, &t2);
365 for (i = 1; i < 10; ++i) {
366 fe_sq_tt(&t2, &t2);
367 }
368 fe_mul_ttt(&t1, &t2, &t1);
369 fe_sq_tt(&t2, &t1);
370 for (i = 1; i < 50; ++i) {
371 fe_sq_tt(&t2, &t2);
372 }
373 fe_mul_ttt(&t2, &t2, &t1);
374 fe_sq_tt(&t3, &t2);
375 for (i = 1; i < 100; ++i) {
376 fe_sq_tt(&t3, &t3);
377 }
378 fe_mul_ttt(&t2, &t3, &t2);
379 fe_sq_tt(&t2, &t2);
380 for (i = 1; i < 50; ++i) {
381 fe_sq_tt(&t2, &t2);
382 }
383 fe_mul_ttt(&t1, &t2, &t1);
384 fe_sq_tt(&t1, &t1);
385 for (i = 1; i < 5; ++i) {
386 fe_sq_tt(&t1, &t1);
387 }
388 fe_mul_ttt(out, &t1, &t0);
389 }
390
fe_invert(fe * out,const fe * z)391 static void fe_invert(fe *out, const fe *z) {
392 fe_loose l;
393 fe_copy_lt(&l, z);
394 fe_loose_invert(out, &l);
395 }
396
397 // return 0 if f == 0
398 // return 1 if f != 0
fe_isnonzero(const fe_loose * f)399 static int fe_isnonzero(const fe_loose *f) {
400 fe tight;
401 fe_carry(&tight, f);
402 uint8_t s[32];
403 fe_tobytes(s, &tight);
404
405 static const uint8_t zero[32] = {0};
406 return CRYPTO_memcmp(s, zero, sizeof(zero)) != 0;
407 }
408
409 // return 1 if f is in {1,3,5,...,q-2}
410 // return 0 if f is in {0,2,4,...,q-1}
fe_isnegative(const fe * f)411 static int fe_isnegative(const fe *f) {
412 uint8_t s[32];
413 fe_tobytes(s, f);
414 return s[0] & 1;
415 }
416
fe_sq2_tt(fe * h,const fe * f)417 static void fe_sq2_tt(fe *h, const fe *f) {
418 // h = f^2
419 fe_sq_tt(h, f);
420
421 // h = h + h
422 fe_loose tmp;
423 fe_add(&tmp, h, h);
424 fe_carry(h, &tmp);
425 }
426
fe_pow22523(fe * out,const fe * z)427 static void fe_pow22523(fe *out, const fe *z) {
428 fe t0;
429 fe t1;
430 fe t2;
431 int i;
432
433 fe_sq_tt(&t0, z);
434 fe_sq_tt(&t1, &t0);
435 for (i = 1; i < 2; ++i) {
436 fe_sq_tt(&t1, &t1);
437 }
438 fe_mul_ttt(&t1, z, &t1);
439 fe_mul_ttt(&t0, &t0, &t1);
440 fe_sq_tt(&t0, &t0);
441 fe_mul_ttt(&t0, &t1, &t0);
442 fe_sq_tt(&t1, &t0);
443 for (i = 1; i < 5; ++i) {
444 fe_sq_tt(&t1, &t1);
445 }
446 fe_mul_ttt(&t0, &t1, &t0);
447 fe_sq_tt(&t1, &t0);
448 for (i = 1; i < 10; ++i) {
449 fe_sq_tt(&t1, &t1);
450 }
451 fe_mul_ttt(&t1, &t1, &t0);
452 fe_sq_tt(&t2, &t1);
453 for (i = 1; i < 20; ++i) {
454 fe_sq_tt(&t2, &t2);
455 }
456 fe_mul_ttt(&t1, &t2, &t1);
457 fe_sq_tt(&t1, &t1);
458 for (i = 1; i < 10; ++i) {
459 fe_sq_tt(&t1, &t1);
460 }
461 fe_mul_ttt(&t0, &t1, &t0);
462 fe_sq_tt(&t1, &t0);
463 for (i = 1; i < 50; ++i) {
464 fe_sq_tt(&t1, &t1);
465 }
466 fe_mul_ttt(&t1, &t1, &t0);
467 fe_sq_tt(&t2, &t1);
468 for (i = 1; i < 100; ++i) {
469 fe_sq_tt(&t2, &t2);
470 }
471 fe_mul_ttt(&t1, &t2, &t1);
472 fe_sq_tt(&t1, &t1);
473 for (i = 1; i < 50; ++i) {
474 fe_sq_tt(&t1, &t1);
475 }
476 fe_mul_ttt(&t0, &t1, &t0);
477 fe_sq_tt(&t0, &t0);
478 for (i = 1; i < 2; ++i) {
479 fe_sq_tt(&t0, &t0);
480 }
481 fe_mul_ttt(out, &t0, z);
482 }
483
484
485 // Group operations.
486
x25519_ge_tobytes(uint8_t s[32],const ge_p2 * h)487 void x25519_ge_tobytes(uint8_t s[32], const ge_p2 *h) {
488 fe recip;
489 fe x;
490 fe y;
491
492 fe_invert(&recip, &h->Z);
493 fe_mul_ttt(&x, &h->X, &recip);
494 fe_mul_ttt(&y, &h->Y, &recip);
495 fe_tobytes(s, &y);
496 s[31] ^= fe_isnegative(&x) << 7;
497 }
498
ge_p3_tobytes(uint8_t s[32],const ge_p3 * h)499 static void ge_p3_tobytes(uint8_t s[32], const ge_p3 *h) {
500 fe recip;
501 fe x;
502 fe y;
503
504 fe_invert(&recip, &h->Z);
505 fe_mul_ttt(&x, &h->X, &recip);
506 fe_mul_ttt(&y, &h->Y, &recip);
507 fe_tobytes(s, &y);
508 s[31] ^= fe_isnegative(&x) << 7;
509 }
510
x25519_ge_frombytes_vartime(ge_p3 * h,const uint8_t s[32])511 int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) {
512 fe u;
513 fe_loose v;
514 fe v3;
515 fe vxx;
516 fe_loose check;
517
518 fe_frombytes(&h->Y, s);
519 fe_1(&h->Z);
520 fe_sq_tt(&v3, &h->Y);
521 fe_mul_ttt(&vxx, &v3, &d);
522 fe_sub(&v, &v3, &h->Z); // u = y^2-1
523 fe_carry(&u, &v);
524 fe_add(&v, &vxx, &h->Z); // v = dy^2+1
525
526 fe_sq_tl(&v3, &v);
527 fe_mul_ttl(&v3, &v3, &v); // v3 = v^3
528 fe_sq_tt(&h->X, &v3);
529 fe_mul_ttl(&h->X, &h->X, &v);
530 fe_mul_ttt(&h->X, &h->X, &u); // x = uv^7
531
532 fe_pow22523(&h->X, &h->X); // x = (uv^7)^((q-5)/8)
533 fe_mul_ttt(&h->X, &h->X, &v3);
534 fe_mul_ttt(&h->X, &h->X, &u); // x = uv^3(uv^7)^((q-5)/8)
535
536 fe_sq_tt(&vxx, &h->X);
537 fe_mul_ttl(&vxx, &vxx, &v);
538 fe_sub(&check, &vxx, &u);
539 if (fe_isnonzero(&check)) {
540 fe_add(&check, &vxx, &u);
541 if (fe_isnonzero(&check)) {
542 return 0;
543 }
544 fe_mul_ttt(&h->X, &h->X, &sqrtm1);
545 }
546
547 if (fe_isnegative(&h->X) != (s[31] >> 7)) {
548 fe_loose t;
549 fe_neg(&t, &h->X);
550 fe_carry(&h->X, &t);
551 }
552
553 fe_mul_ttt(&h->T, &h->X, &h->Y);
554 return 1;
555 }
556
ge_p2_0(ge_p2 * h)557 static void ge_p2_0(ge_p2 *h) {
558 fe_0(&h->X);
559 fe_1(&h->Y);
560 fe_1(&h->Z);
561 }
562
ge_p3_0(ge_p3 * h)563 static void ge_p3_0(ge_p3 *h) {
564 fe_0(&h->X);
565 fe_1(&h->Y);
566 fe_1(&h->Z);
567 fe_0(&h->T);
568 }
569
ge_cached_0(ge_cached * h)570 static void ge_cached_0(ge_cached *h) {
571 fe_loose_1(&h->YplusX);
572 fe_loose_1(&h->YminusX);
573 fe_loose_1(&h->Z);
574 fe_loose_0(&h->T2d);
575 }
576
ge_precomp_0(ge_precomp * h)577 static void ge_precomp_0(ge_precomp *h) {
578 fe_loose_1(&h->yplusx);
579 fe_loose_1(&h->yminusx);
580 fe_loose_0(&h->xy2d);
581 }
582
583 // r = p
ge_p3_to_p2(ge_p2 * r,const ge_p3 * p)584 static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) {
585 fe_copy(&r->X, &p->X);
586 fe_copy(&r->Y, &p->Y);
587 fe_copy(&r->Z, &p->Z);
588 }
589
590 // r = p
x25519_ge_p3_to_cached(ge_cached * r,const ge_p3 * p)591 void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) {
592 fe_add(&r->YplusX, &p->Y, &p->X);
593 fe_sub(&r->YminusX, &p->Y, &p->X);
594 fe_copy_lt(&r->Z, &p->Z);
595 fe_mul_ltt(&r->T2d, &p->T, &d2);
596 }
597
598 // r = p
x25519_ge_p1p1_to_p2(ge_p2 * r,const ge_p1p1 * p)599 void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) {
600 fe_mul_tll(&r->X, &p->X, &p->T);
601 fe_mul_tll(&r->Y, &p->Y, &p->Z);
602 fe_mul_tll(&r->Z, &p->Z, &p->T);
603 }
604
605 // r = p
x25519_ge_p1p1_to_p3(ge_p3 * r,const ge_p1p1 * p)606 void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) {
607 fe_mul_tll(&r->X, &p->X, &p->T);
608 fe_mul_tll(&r->Y, &p->Y, &p->Z);
609 fe_mul_tll(&r->Z, &p->Z, &p->T);
610 fe_mul_tll(&r->T, &p->X, &p->Y);
611 }
612
613 // r = p
ge_p1p1_to_cached(ge_cached * r,const ge_p1p1 * p)614 static void ge_p1p1_to_cached(ge_cached *r, const ge_p1p1 *p) {
615 ge_p3 t;
616 x25519_ge_p1p1_to_p3(&t, p);
617 x25519_ge_p3_to_cached(r, &t);
618 }
619
620 // r = 2 * p
ge_p2_dbl(ge_p1p1 * r,const ge_p2 * p)621 static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) {
622 fe trX, trZ, trT;
623 fe t0;
624
625 fe_sq_tt(&trX, &p->X);
626 fe_sq_tt(&trZ, &p->Y);
627 fe_sq2_tt(&trT, &p->Z);
628 fe_add(&r->Y, &p->X, &p->Y);
629 fe_sq_tl(&t0, &r->Y);
630
631 fe_add(&r->Y, &trZ, &trX);
632 fe_sub(&r->Z, &trZ, &trX);
633 fe_carry(&trZ, &r->Y);
634 fe_sub(&r->X, &t0, &trZ);
635 fe_carry(&trZ, &r->Z);
636 fe_sub(&r->T, &trT, &trZ);
637 }
638
639 // r = 2 * p
ge_p3_dbl(ge_p1p1 * r,const ge_p3 * p)640 static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) {
641 ge_p2 q;
642 ge_p3_to_p2(&q, p);
643 ge_p2_dbl(r, &q);
644 }
645
646 // r = p + q
ge_madd(ge_p1p1 * r,const ge_p3 * p,const ge_precomp * q)647 static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
648 fe trY, trZ, trT;
649
650 fe_add(&r->X, &p->Y, &p->X);
651 fe_sub(&r->Y, &p->Y, &p->X);
652 fe_mul_tll(&trZ, &r->X, &q->yplusx);
653 fe_mul_tll(&trY, &r->Y, &q->yminusx);
654 fe_mul_tlt(&trT, &q->xy2d, &p->T);
655 fe_add(&r->T, &p->Z, &p->Z);
656 fe_sub(&r->X, &trZ, &trY);
657 fe_add(&r->Y, &trZ, &trY);
658 fe_carry(&trZ, &r->T);
659 fe_add(&r->Z, &trZ, &trT);
660 fe_sub(&r->T, &trZ, &trT);
661 }
662
663 // r = p - q
ge_msub(ge_p1p1 * r,const ge_p3 * p,const ge_precomp * q)664 static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) {
665 fe trY, trZ, trT;
666
667 fe_add(&r->X, &p->Y, &p->X);
668 fe_sub(&r->Y, &p->Y, &p->X);
669 fe_mul_tll(&trZ, &r->X, &q->yminusx);
670 fe_mul_tll(&trY, &r->Y, &q->yplusx);
671 fe_mul_tlt(&trT, &q->xy2d, &p->T);
672 fe_add(&r->T, &p->Z, &p->Z);
673 fe_sub(&r->X, &trZ, &trY);
674 fe_add(&r->Y, &trZ, &trY);
675 fe_carry(&trZ, &r->T);
676 fe_sub(&r->Z, &trZ, &trT);
677 fe_add(&r->T, &trZ, &trT);
678 }
679
680 // r = p + q
x25519_ge_add(ge_p1p1 * r,const ge_p3 * p,const ge_cached * q)681 void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
682 fe trX, trY, trZ, trT;
683
684 fe_add(&r->X, &p->Y, &p->X);
685 fe_sub(&r->Y, &p->Y, &p->X);
686 fe_mul_tll(&trZ, &r->X, &q->YplusX);
687 fe_mul_tll(&trY, &r->Y, &q->YminusX);
688 fe_mul_tlt(&trT, &q->T2d, &p->T);
689 fe_mul_ttl(&trX, &p->Z, &q->Z);
690 fe_add(&r->T, &trX, &trX);
691 fe_sub(&r->X, &trZ, &trY);
692 fe_add(&r->Y, &trZ, &trY);
693 fe_carry(&trZ, &r->T);
694 fe_add(&r->Z, &trZ, &trT);
695 fe_sub(&r->T, &trZ, &trT);
696 }
697
698 // r = p - q
x25519_ge_sub(ge_p1p1 * r,const ge_p3 * p,const ge_cached * q)699 void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) {
700 fe trX, trY, trZ, trT;
701
702 fe_add(&r->X, &p->Y, &p->X);
703 fe_sub(&r->Y, &p->Y, &p->X);
704 fe_mul_tll(&trZ, &r->X, &q->YminusX);
705 fe_mul_tll(&trY, &r->Y, &q->YplusX);
706 fe_mul_tlt(&trT, &q->T2d, &p->T);
707 fe_mul_ttl(&trX, &p->Z, &q->Z);
708 fe_add(&r->T, &trX, &trX);
709 fe_sub(&r->X, &trZ, &trY);
710 fe_add(&r->Y, &trZ, &trY);
711 fe_carry(&trZ, &r->T);
712 fe_sub(&r->Z, &trZ, &trT);
713 fe_add(&r->T, &trZ, &trT);
714 }
715
equal(signed char b,signed char c)716 static uint8_t equal(signed char b, signed char c) {
717 uint8_t ub = b;
718 uint8_t uc = c;
719 uint8_t x = ub ^ uc; // 0: yes; 1..255: no
720 uint32_t y = x; // 0: yes; 1..255: no
721 y -= 1; // 4294967295: yes; 0..254: no
722 y >>= 31; // 1: yes; 0: no
723 return y;
724 }
725
cmov(ge_precomp * t,const ge_precomp * u,uint8_t b)726 static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) {
727 fe_cmov(&t->yplusx, &u->yplusx, b);
728 fe_cmov(&t->yminusx, &u->yminusx, b);
729 fe_cmov(&t->xy2d, &u->xy2d, b);
730 }
731
x25519_ge_scalarmult_small_precomp(ge_p3 * h,const uint8_t a[32],const uint8_t precomp_table[15* 2* 32])732 void x25519_ge_scalarmult_small_precomp(
733 ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) {
734 // precomp_table is first expanded into matching |ge_precomp|
735 // elements.
736 ge_precomp multiples[15];
737
738 unsigned i;
739 for (i = 0; i < 15; i++) {
740 // The precomputed table is assumed to already clear the top bit, so
741 // |fe_frombytes_strict| may be used directly.
742 const uint8_t *bytes = &precomp_table[i*(2 * 32)];
743 fe x, y;
744 fe_frombytes_strict(&x, bytes);
745 fe_frombytes_strict(&y, bytes + 32);
746
747 ge_precomp *out = &multiples[i];
748 fe_add(&out->yplusx, &y, &x);
749 fe_sub(&out->yminusx, &y, &x);
750 fe_mul_ltt(&out->xy2d, &x, &y);
751 fe_mul_llt(&out->xy2d, &out->xy2d, &d2);
752 }
753
754 // See the comment above |k25519SmallPrecomp| about the structure of the
755 // precomputed elements. This loop does 64 additions and 64 doublings to
756 // calculate the result.
757 ge_p3_0(h);
758
759 for (i = 63; i < 64; i--) {
760 unsigned j;
761 signed char index = 0;
762
763 for (j = 0; j < 4; j++) {
764 const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7));
765 index |= (bit << j);
766 }
767
768 ge_precomp e;
769 ge_precomp_0(&e);
770
771 for (j = 1; j < 16; j++) {
772 cmov(&e, &multiples[j-1], equal(index, j));
773 }
774
775 ge_cached cached;
776 ge_p1p1 r;
777 x25519_ge_p3_to_cached(&cached, h);
778 x25519_ge_add(&r, h, &cached);
779 x25519_ge_p1p1_to_p3(h, &r);
780
781 ge_madd(&r, h, &e);
782 x25519_ge_p1p1_to_p3(h, &r);
783 }
784 }
785
786 #if defined(OPENSSL_SMALL)
787
x25519_ge_scalarmult_base(ge_p3 * h,const uint8_t a[32])788 void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) {
789 x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp);
790 }
791
792 #else
793
negative(signed char b)794 static uint8_t negative(signed char b) {
795 uint32_t x = b;
796 x >>= 31; // 1: yes; 0: no
797 return x;
798 }
799
table_select(ge_precomp * t,int pos,signed char b)800 static void table_select(ge_precomp *t, int pos, signed char b) {
801 ge_precomp minust;
802 uint8_t bnegative = negative(b);
803 uint8_t babs = b - ((uint8_t)((-bnegative) & b) << 1);
804
805 ge_precomp_0(t);
806 cmov(t, &k25519Precomp[pos][0], equal(babs, 1));
807 cmov(t, &k25519Precomp[pos][1], equal(babs, 2));
808 cmov(t, &k25519Precomp[pos][2], equal(babs, 3));
809 cmov(t, &k25519Precomp[pos][3], equal(babs, 4));
810 cmov(t, &k25519Precomp[pos][4], equal(babs, 5));
811 cmov(t, &k25519Precomp[pos][5], equal(babs, 6));
812 cmov(t, &k25519Precomp[pos][6], equal(babs, 7));
813 cmov(t, &k25519Precomp[pos][7], equal(babs, 8));
814 fe_copy_ll(&minust.yplusx, &t->yminusx);
815 fe_copy_ll(&minust.yminusx, &t->yplusx);
816
817 // NOTE: the input table is canonical, but types don't encode it
818 fe tmp;
819 fe_carry(&tmp, &t->xy2d);
820 fe_neg(&minust.xy2d, &tmp);
821
822 cmov(t, &minust, bnegative);
823 }
824
825 // h = a * B
826 // where a = a[0]+256*a[1]+...+256^31 a[31]
827 // B is the Ed25519 base point (x,4/5) with x positive.
828 //
829 // Preconditions:
830 // a[31] <= 127
x25519_ge_scalarmult_base(ge_p3 * h,const uint8_t * a)831 void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t *a) {
832 signed char e[64];
833 signed char carry;
834 ge_p1p1 r;
835 ge_p2 s;
836 ge_precomp t;
837 int i;
838
839 for (i = 0; i < 32; ++i) {
840 e[2 * i + 0] = (a[i] >> 0) & 15;
841 e[2 * i + 1] = (a[i] >> 4) & 15;
842 }
843 // each e[i] is between 0 and 15
844 // e[63] is between 0 and 7
845
846 carry = 0;
847 for (i = 0; i < 63; ++i) {
848 e[i] += carry;
849 carry = e[i] + 8;
850 carry >>= 4;
851 e[i] -= carry << 4;
852 }
853 e[63] += carry;
854 // each e[i] is between -8 and 8
855
856 ge_p3_0(h);
857 for (i = 1; i < 64; i += 2) {
858 table_select(&t, i / 2, e[i]);
859 ge_madd(&r, h, &t);
860 x25519_ge_p1p1_to_p3(h, &r);
861 }
862
863 ge_p3_dbl(&r, h);
864 x25519_ge_p1p1_to_p2(&s, &r);
865 ge_p2_dbl(&r, &s);
866 x25519_ge_p1p1_to_p2(&s, &r);
867 ge_p2_dbl(&r, &s);
868 x25519_ge_p1p1_to_p2(&s, &r);
869 ge_p2_dbl(&r, &s);
870 x25519_ge_p1p1_to_p3(h, &r);
871
872 for (i = 0; i < 64; i += 2) {
873 table_select(&t, i / 2, e[i]);
874 ge_madd(&r, h, &t);
875 x25519_ge_p1p1_to_p3(h, &r);
876 }
877 }
878
879 #endif
880
cmov_cached(ge_cached * t,ge_cached * u,uint8_t b)881 static void cmov_cached(ge_cached *t, ge_cached *u, uint8_t b) {
882 fe_cmov(&t->YplusX, &u->YplusX, b);
883 fe_cmov(&t->YminusX, &u->YminusX, b);
884 fe_cmov(&t->Z, &u->Z, b);
885 fe_cmov(&t->T2d, &u->T2d, b);
886 }
887
888 // r = scalar * A.
889 // where a = a[0]+256*a[1]+...+256^31 a[31].
x25519_ge_scalarmult(ge_p2 * r,const uint8_t * scalar,const ge_p3 * A)890 void x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, const ge_p3 *A) {
891 ge_p2 Ai_p2[8];
892 ge_cached Ai[16];
893 ge_p1p1 t;
894
895 ge_cached_0(&Ai[0]);
896 x25519_ge_p3_to_cached(&Ai[1], A);
897 ge_p3_to_p2(&Ai_p2[1], A);
898
899 unsigned i;
900 for (i = 2; i < 16; i += 2) {
901 ge_p2_dbl(&t, &Ai_p2[i / 2]);
902 ge_p1p1_to_cached(&Ai[i], &t);
903 if (i < 8) {
904 x25519_ge_p1p1_to_p2(&Ai_p2[i], &t);
905 }
906 x25519_ge_add(&t, A, &Ai[i]);
907 ge_p1p1_to_cached(&Ai[i + 1], &t);
908 if (i < 7) {
909 x25519_ge_p1p1_to_p2(&Ai_p2[i + 1], &t);
910 }
911 }
912
913 ge_p2_0(r);
914 ge_p3 u;
915
916 for (i = 0; i < 256; i += 4) {
917 ge_p2_dbl(&t, r);
918 x25519_ge_p1p1_to_p2(r, &t);
919 ge_p2_dbl(&t, r);
920 x25519_ge_p1p1_to_p2(r, &t);
921 ge_p2_dbl(&t, r);
922 x25519_ge_p1p1_to_p2(r, &t);
923 ge_p2_dbl(&t, r);
924 x25519_ge_p1p1_to_p3(&u, &t);
925
926 uint8_t index = scalar[31 - i/8];
927 index >>= 4 - (i & 4);
928 index &= 0xf;
929
930 unsigned j;
931 ge_cached selected;
932 ge_cached_0(&selected);
933 for (j = 0; j < 16; j++) {
934 cmov_cached(&selected, &Ai[j], equal(j, index));
935 }
936
937 x25519_ge_add(&t, &u, &selected);
938 x25519_ge_p1p1_to_p2(r, &t);
939 }
940 }
941
slide(signed char * r,const uint8_t * a)942 static void slide(signed char *r, const uint8_t *a) {
943 int i;
944 int b;
945 int k;
946
947 for (i = 0; i < 256; ++i) {
948 r[i] = 1 & (a[i >> 3] >> (i & 7));
949 }
950
951 for (i = 0; i < 256; ++i) {
952 if (r[i]) {
953 for (b = 1; b <= 6 && i + b < 256; ++b) {
954 if (r[i + b]) {
955 if (r[i] + (r[i + b] << b) <= 15) {
956 r[i] += r[i + b] << b;
957 r[i + b] = 0;
958 } else if (r[i] - (r[i + b] << b) >= -15) {
959 r[i] -= r[i + b] << b;
960 for (k = i + b; k < 256; ++k) {
961 if (!r[k]) {
962 r[k] = 1;
963 break;
964 }
965 r[k] = 0;
966 }
967 } else {
968 break;
969 }
970 }
971 }
972 }
973 }
974 }
975
976 // r = a * A + b * B
977 // where a = a[0]+256*a[1]+...+256^31 a[31].
978 // and b = b[0]+256*b[1]+...+256^31 b[31].
979 // B is the Ed25519 base point (x,4/5) with x positive.
ge_double_scalarmult_vartime(ge_p2 * r,const uint8_t * a,const ge_p3 * A,const uint8_t * b)980 static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a,
981 const ge_p3 *A, const uint8_t *b) {
982 signed char aslide[256];
983 signed char bslide[256];
984 ge_cached Ai[8]; // A,3A,5A,7A,9A,11A,13A,15A
985 ge_p1p1 t;
986 ge_p3 u;
987 ge_p3 A2;
988 int i;
989
990 slide(aslide, a);
991 slide(bslide, b);
992
993 x25519_ge_p3_to_cached(&Ai[0], A);
994 ge_p3_dbl(&t, A);
995 x25519_ge_p1p1_to_p3(&A2, &t);
996 x25519_ge_add(&t, &A2, &Ai[0]);
997 x25519_ge_p1p1_to_p3(&u, &t);
998 x25519_ge_p3_to_cached(&Ai[1], &u);
999 x25519_ge_add(&t, &A2, &Ai[1]);
1000 x25519_ge_p1p1_to_p3(&u, &t);
1001 x25519_ge_p3_to_cached(&Ai[2], &u);
1002 x25519_ge_add(&t, &A2, &Ai[2]);
1003 x25519_ge_p1p1_to_p3(&u, &t);
1004 x25519_ge_p3_to_cached(&Ai[3], &u);
1005 x25519_ge_add(&t, &A2, &Ai[3]);
1006 x25519_ge_p1p1_to_p3(&u, &t);
1007 x25519_ge_p3_to_cached(&Ai[4], &u);
1008 x25519_ge_add(&t, &A2, &Ai[4]);
1009 x25519_ge_p1p1_to_p3(&u, &t);
1010 x25519_ge_p3_to_cached(&Ai[5], &u);
1011 x25519_ge_add(&t, &A2, &Ai[5]);
1012 x25519_ge_p1p1_to_p3(&u, &t);
1013 x25519_ge_p3_to_cached(&Ai[6], &u);
1014 x25519_ge_add(&t, &A2, &Ai[6]);
1015 x25519_ge_p1p1_to_p3(&u, &t);
1016 x25519_ge_p3_to_cached(&Ai[7], &u);
1017
1018 ge_p2_0(r);
1019
1020 for (i = 255; i >= 0; --i) {
1021 if (aslide[i] || bslide[i]) {
1022 break;
1023 }
1024 }
1025
1026 for (; i >= 0; --i) {
1027 ge_p2_dbl(&t, r);
1028
1029 if (aslide[i] > 0) {
1030 x25519_ge_p1p1_to_p3(&u, &t);
1031 x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]);
1032 } else if (aslide[i] < 0) {
1033 x25519_ge_p1p1_to_p3(&u, &t);
1034 x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]);
1035 }
1036
1037 if (bslide[i] > 0) {
1038 x25519_ge_p1p1_to_p3(&u, &t);
1039 ge_madd(&t, &u, &Bi[bslide[i] / 2]);
1040 } else if (bslide[i] < 0) {
1041 x25519_ge_p1p1_to_p3(&u, &t);
1042 ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]);
1043 }
1044
1045 x25519_ge_p1p1_to_p2(r, &t);
1046 }
1047 }
1048
1049 // int64_lshift21 returns |a << 21| but is defined when shifting bits into the
1050 // sign bit. This works around a language flaw in C.
int64_lshift21(int64_t a)1051 static inline int64_t int64_lshift21(int64_t a) {
1052 return (int64_t)((uint64_t)a << 21);
1053 }
1054
1055 // The set of scalars is \Z/l
1056 // where l = 2^252 + 27742317777372353535851937790883648493.
1057
1058 // Input:
1059 // s[0]+256*s[1]+...+256^63*s[63] = s
1060 //
1061 // Output:
1062 // s[0]+256*s[1]+...+256^31*s[31] = s mod l
1063 // where l = 2^252 + 27742317777372353535851937790883648493.
1064 // Overwrites s in place.
x25519_sc_reduce(uint8_t s[64])1065 void x25519_sc_reduce(uint8_t s[64]) {
1066 int64_t s0 = 2097151 & load_3(s);
1067 int64_t s1 = 2097151 & (load_4(s + 2) >> 5);
1068 int64_t s2 = 2097151 & (load_3(s + 5) >> 2);
1069 int64_t s3 = 2097151 & (load_4(s + 7) >> 7);
1070 int64_t s4 = 2097151 & (load_4(s + 10) >> 4);
1071 int64_t s5 = 2097151 & (load_3(s + 13) >> 1);
1072 int64_t s6 = 2097151 & (load_4(s + 15) >> 6);
1073 int64_t s7 = 2097151 & (load_3(s + 18) >> 3);
1074 int64_t s8 = 2097151 & load_3(s + 21);
1075 int64_t s9 = 2097151 & (load_4(s + 23) >> 5);
1076 int64_t s10 = 2097151 & (load_3(s + 26) >> 2);
1077 int64_t s11 = 2097151 & (load_4(s + 28) >> 7);
1078 int64_t s12 = 2097151 & (load_4(s + 31) >> 4);
1079 int64_t s13 = 2097151 & (load_3(s + 34) >> 1);
1080 int64_t s14 = 2097151 & (load_4(s + 36) >> 6);
1081 int64_t s15 = 2097151 & (load_3(s + 39) >> 3);
1082 int64_t s16 = 2097151 & load_3(s + 42);
1083 int64_t s17 = 2097151 & (load_4(s + 44) >> 5);
1084 int64_t s18 = 2097151 & (load_3(s + 47) >> 2);
1085 int64_t s19 = 2097151 & (load_4(s + 49) >> 7);
1086 int64_t s20 = 2097151 & (load_4(s + 52) >> 4);
1087 int64_t s21 = 2097151 & (load_3(s + 55) >> 1);
1088 int64_t s22 = 2097151 & (load_4(s + 57) >> 6);
1089 int64_t s23 = (load_4(s + 60) >> 3);
1090 int64_t carry0;
1091 int64_t carry1;
1092 int64_t carry2;
1093 int64_t carry3;
1094 int64_t carry4;
1095 int64_t carry5;
1096 int64_t carry6;
1097 int64_t carry7;
1098 int64_t carry8;
1099 int64_t carry9;
1100 int64_t carry10;
1101 int64_t carry11;
1102 int64_t carry12;
1103 int64_t carry13;
1104 int64_t carry14;
1105 int64_t carry15;
1106 int64_t carry16;
1107
1108 s11 += s23 * 666643;
1109 s12 += s23 * 470296;
1110 s13 += s23 * 654183;
1111 s14 -= s23 * 997805;
1112 s15 += s23 * 136657;
1113 s16 -= s23 * 683901;
1114 s23 = 0;
1115
1116 s10 += s22 * 666643;
1117 s11 += s22 * 470296;
1118 s12 += s22 * 654183;
1119 s13 -= s22 * 997805;
1120 s14 += s22 * 136657;
1121 s15 -= s22 * 683901;
1122 s22 = 0;
1123
1124 s9 += s21 * 666643;
1125 s10 += s21 * 470296;
1126 s11 += s21 * 654183;
1127 s12 -= s21 * 997805;
1128 s13 += s21 * 136657;
1129 s14 -= s21 * 683901;
1130 s21 = 0;
1131
1132 s8 += s20 * 666643;
1133 s9 += s20 * 470296;
1134 s10 += s20 * 654183;
1135 s11 -= s20 * 997805;
1136 s12 += s20 * 136657;
1137 s13 -= s20 * 683901;
1138 s20 = 0;
1139
1140 s7 += s19 * 666643;
1141 s8 += s19 * 470296;
1142 s9 += s19 * 654183;
1143 s10 -= s19 * 997805;
1144 s11 += s19 * 136657;
1145 s12 -= s19 * 683901;
1146 s19 = 0;
1147
1148 s6 += s18 * 666643;
1149 s7 += s18 * 470296;
1150 s8 += s18 * 654183;
1151 s9 -= s18 * 997805;
1152 s10 += s18 * 136657;
1153 s11 -= s18 * 683901;
1154 s18 = 0;
1155
1156 carry6 = (s6 + (1 << 20)) >> 21;
1157 s7 += carry6;
1158 s6 -= int64_lshift21(carry6);
1159 carry8 = (s8 + (1 << 20)) >> 21;
1160 s9 += carry8;
1161 s8 -= int64_lshift21(carry8);
1162 carry10 = (s10 + (1 << 20)) >> 21;
1163 s11 += carry10;
1164 s10 -= int64_lshift21(carry10);
1165 carry12 = (s12 + (1 << 20)) >> 21;
1166 s13 += carry12;
1167 s12 -= int64_lshift21(carry12);
1168 carry14 = (s14 + (1 << 20)) >> 21;
1169 s15 += carry14;
1170 s14 -= int64_lshift21(carry14);
1171 carry16 = (s16 + (1 << 20)) >> 21;
1172 s17 += carry16;
1173 s16 -= int64_lshift21(carry16);
1174
1175 carry7 = (s7 + (1 << 20)) >> 21;
1176 s8 += carry7;
1177 s7 -= int64_lshift21(carry7);
1178 carry9 = (s9 + (1 << 20)) >> 21;
1179 s10 += carry9;
1180 s9 -= int64_lshift21(carry9);
1181 carry11 = (s11 + (1 << 20)) >> 21;
1182 s12 += carry11;
1183 s11 -= int64_lshift21(carry11);
1184 carry13 = (s13 + (1 << 20)) >> 21;
1185 s14 += carry13;
1186 s13 -= int64_lshift21(carry13);
1187 carry15 = (s15 + (1 << 20)) >> 21;
1188 s16 += carry15;
1189 s15 -= int64_lshift21(carry15);
1190
1191 s5 += s17 * 666643;
1192 s6 += s17 * 470296;
1193 s7 += s17 * 654183;
1194 s8 -= s17 * 997805;
1195 s9 += s17 * 136657;
1196 s10 -= s17 * 683901;
1197 s17 = 0;
1198
1199 s4 += s16 * 666643;
1200 s5 += s16 * 470296;
1201 s6 += s16 * 654183;
1202 s7 -= s16 * 997805;
1203 s8 += s16 * 136657;
1204 s9 -= s16 * 683901;
1205 s16 = 0;
1206
1207 s3 += s15 * 666643;
1208 s4 += s15 * 470296;
1209 s5 += s15 * 654183;
1210 s6 -= s15 * 997805;
1211 s7 += s15 * 136657;
1212 s8 -= s15 * 683901;
1213 s15 = 0;
1214
1215 s2 += s14 * 666643;
1216 s3 += s14 * 470296;
1217 s4 += s14 * 654183;
1218 s5 -= s14 * 997805;
1219 s6 += s14 * 136657;
1220 s7 -= s14 * 683901;
1221 s14 = 0;
1222
1223 s1 += s13 * 666643;
1224 s2 += s13 * 470296;
1225 s3 += s13 * 654183;
1226 s4 -= s13 * 997805;
1227 s5 += s13 * 136657;
1228 s6 -= s13 * 683901;
1229 s13 = 0;
1230
1231 s0 += s12 * 666643;
1232 s1 += s12 * 470296;
1233 s2 += s12 * 654183;
1234 s3 -= s12 * 997805;
1235 s4 += s12 * 136657;
1236 s5 -= s12 * 683901;
1237 s12 = 0;
1238
1239 carry0 = (s0 + (1 << 20)) >> 21;
1240 s1 += carry0;
1241 s0 -= int64_lshift21(carry0);
1242 carry2 = (s2 + (1 << 20)) >> 21;
1243 s3 += carry2;
1244 s2 -= int64_lshift21(carry2);
1245 carry4 = (s4 + (1 << 20)) >> 21;
1246 s5 += carry4;
1247 s4 -= int64_lshift21(carry4);
1248 carry6 = (s6 + (1 << 20)) >> 21;
1249 s7 += carry6;
1250 s6 -= int64_lshift21(carry6);
1251 carry8 = (s8 + (1 << 20)) >> 21;
1252 s9 += carry8;
1253 s8 -= int64_lshift21(carry8);
1254 carry10 = (s10 + (1 << 20)) >> 21;
1255 s11 += carry10;
1256 s10 -= int64_lshift21(carry10);
1257
1258 carry1 = (s1 + (1 << 20)) >> 21;
1259 s2 += carry1;
1260 s1 -= int64_lshift21(carry1);
1261 carry3 = (s3 + (1 << 20)) >> 21;
1262 s4 += carry3;
1263 s3 -= int64_lshift21(carry3);
1264 carry5 = (s5 + (1 << 20)) >> 21;
1265 s6 += carry5;
1266 s5 -= int64_lshift21(carry5);
1267 carry7 = (s7 + (1 << 20)) >> 21;
1268 s8 += carry7;
1269 s7 -= int64_lshift21(carry7);
1270 carry9 = (s9 + (1 << 20)) >> 21;
1271 s10 += carry9;
1272 s9 -= int64_lshift21(carry9);
1273 carry11 = (s11 + (1 << 20)) >> 21;
1274 s12 += carry11;
1275 s11 -= int64_lshift21(carry11);
1276
1277 s0 += s12 * 666643;
1278 s1 += s12 * 470296;
1279 s2 += s12 * 654183;
1280 s3 -= s12 * 997805;
1281 s4 += s12 * 136657;
1282 s5 -= s12 * 683901;
1283 s12 = 0;
1284
1285 carry0 = s0 >> 21;
1286 s1 += carry0;
1287 s0 -= int64_lshift21(carry0);
1288 carry1 = s1 >> 21;
1289 s2 += carry1;
1290 s1 -= int64_lshift21(carry1);
1291 carry2 = s2 >> 21;
1292 s3 += carry2;
1293 s2 -= int64_lshift21(carry2);
1294 carry3 = s3 >> 21;
1295 s4 += carry3;
1296 s3 -= int64_lshift21(carry3);
1297 carry4 = s4 >> 21;
1298 s5 += carry4;
1299 s4 -= int64_lshift21(carry4);
1300 carry5 = s5 >> 21;
1301 s6 += carry5;
1302 s5 -= int64_lshift21(carry5);
1303 carry6 = s6 >> 21;
1304 s7 += carry6;
1305 s6 -= int64_lshift21(carry6);
1306 carry7 = s7 >> 21;
1307 s8 += carry7;
1308 s7 -= int64_lshift21(carry7);
1309 carry8 = s8 >> 21;
1310 s9 += carry8;
1311 s8 -= int64_lshift21(carry8);
1312 carry9 = s9 >> 21;
1313 s10 += carry9;
1314 s9 -= int64_lshift21(carry9);
1315 carry10 = s10 >> 21;
1316 s11 += carry10;
1317 s10 -= int64_lshift21(carry10);
1318 carry11 = s11 >> 21;
1319 s12 += carry11;
1320 s11 -= int64_lshift21(carry11);
1321
1322 s0 += s12 * 666643;
1323 s1 += s12 * 470296;
1324 s2 += s12 * 654183;
1325 s3 -= s12 * 997805;
1326 s4 += s12 * 136657;
1327 s5 -= s12 * 683901;
1328 s12 = 0;
1329
1330 carry0 = s0 >> 21;
1331 s1 += carry0;
1332 s0 -= int64_lshift21(carry0);
1333 carry1 = s1 >> 21;
1334 s2 += carry1;
1335 s1 -= int64_lshift21(carry1);
1336 carry2 = s2 >> 21;
1337 s3 += carry2;
1338 s2 -= int64_lshift21(carry2);
1339 carry3 = s3 >> 21;
1340 s4 += carry3;
1341 s3 -= int64_lshift21(carry3);
1342 carry4 = s4 >> 21;
1343 s5 += carry4;
1344 s4 -= int64_lshift21(carry4);
1345 carry5 = s5 >> 21;
1346 s6 += carry5;
1347 s5 -= int64_lshift21(carry5);
1348 carry6 = s6 >> 21;
1349 s7 += carry6;
1350 s6 -= int64_lshift21(carry6);
1351 carry7 = s7 >> 21;
1352 s8 += carry7;
1353 s7 -= int64_lshift21(carry7);
1354 carry8 = s8 >> 21;
1355 s9 += carry8;
1356 s8 -= int64_lshift21(carry8);
1357 carry9 = s9 >> 21;
1358 s10 += carry9;
1359 s9 -= int64_lshift21(carry9);
1360 carry10 = s10 >> 21;
1361 s11 += carry10;
1362 s10 -= int64_lshift21(carry10);
1363
1364 s[0] = s0 >> 0;
1365 s[1] = s0 >> 8;
1366 s[2] = (s0 >> 16) | (s1 << 5);
1367 s[3] = s1 >> 3;
1368 s[4] = s1 >> 11;
1369 s[5] = (s1 >> 19) | (s2 << 2);
1370 s[6] = s2 >> 6;
1371 s[7] = (s2 >> 14) | (s3 << 7);
1372 s[8] = s3 >> 1;
1373 s[9] = s3 >> 9;
1374 s[10] = (s3 >> 17) | (s4 << 4);
1375 s[11] = s4 >> 4;
1376 s[12] = s4 >> 12;
1377 s[13] = (s4 >> 20) | (s5 << 1);
1378 s[14] = s5 >> 7;
1379 s[15] = (s5 >> 15) | (s6 << 6);
1380 s[16] = s6 >> 2;
1381 s[17] = s6 >> 10;
1382 s[18] = (s6 >> 18) | (s7 << 3);
1383 s[19] = s7 >> 5;
1384 s[20] = s7 >> 13;
1385 s[21] = s8 >> 0;
1386 s[22] = s8 >> 8;
1387 s[23] = (s8 >> 16) | (s9 << 5);
1388 s[24] = s9 >> 3;
1389 s[25] = s9 >> 11;
1390 s[26] = (s9 >> 19) | (s10 << 2);
1391 s[27] = s10 >> 6;
1392 s[28] = (s10 >> 14) | (s11 << 7);
1393 s[29] = s11 >> 1;
1394 s[30] = s11 >> 9;
1395 s[31] = s11 >> 17;
1396 }
1397
1398 // Input:
1399 // a[0]+256*a[1]+...+256^31*a[31] = a
1400 // b[0]+256*b[1]+...+256^31*b[31] = b
1401 // c[0]+256*c[1]+...+256^31*c[31] = c
1402 //
1403 // Output:
1404 // s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l
1405 // where l = 2^252 + 27742317777372353535851937790883648493.
sc_muladd(uint8_t * s,const uint8_t * a,const uint8_t * b,const uint8_t * c)1406 static void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b,
1407 const uint8_t *c) {
1408 int64_t a0 = 2097151 & load_3(a);
1409 int64_t a1 = 2097151 & (load_4(a + 2) >> 5);
1410 int64_t a2 = 2097151 & (load_3(a + 5) >> 2);
1411 int64_t a3 = 2097151 & (load_4(a + 7) >> 7);
1412 int64_t a4 = 2097151 & (load_4(a + 10) >> 4);
1413 int64_t a5 = 2097151 & (load_3(a + 13) >> 1);
1414 int64_t a6 = 2097151 & (load_4(a + 15) >> 6);
1415 int64_t a7 = 2097151 & (load_3(a + 18) >> 3);
1416 int64_t a8 = 2097151 & load_3(a + 21);
1417 int64_t a9 = 2097151 & (load_4(a + 23) >> 5);
1418 int64_t a10 = 2097151 & (load_3(a + 26) >> 2);
1419 int64_t a11 = (load_4(a + 28) >> 7);
1420 int64_t b0 = 2097151 & load_3(b);
1421 int64_t b1 = 2097151 & (load_4(b + 2) >> 5);
1422 int64_t b2 = 2097151 & (load_3(b + 5) >> 2);
1423 int64_t b3 = 2097151 & (load_4(b + 7) >> 7);
1424 int64_t b4 = 2097151 & (load_4(b + 10) >> 4);
1425 int64_t b5 = 2097151 & (load_3(b + 13) >> 1);
1426 int64_t b6 = 2097151 & (load_4(b + 15) >> 6);
1427 int64_t b7 = 2097151 & (load_3(b + 18) >> 3);
1428 int64_t b8 = 2097151 & load_3(b + 21);
1429 int64_t b9 = 2097151 & (load_4(b + 23) >> 5);
1430 int64_t b10 = 2097151 & (load_3(b + 26) >> 2);
1431 int64_t b11 = (load_4(b + 28) >> 7);
1432 int64_t c0 = 2097151 & load_3(c);
1433 int64_t c1 = 2097151 & (load_4(c + 2) >> 5);
1434 int64_t c2 = 2097151 & (load_3(c + 5) >> 2);
1435 int64_t c3 = 2097151 & (load_4(c + 7) >> 7);
1436 int64_t c4 = 2097151 & (load_4(c + 10) >> 4);
1437 int64_t c5 = 2097151 & (load_3(c + 13) >> 1);
1438 int64_t c6 = 2097151 & (load_4(c + 15) >> 6);
1439 int64_t c7 = 2097151 & (load_3(c + 18) >> 3);
1440 int64_t c8 = 2097151 & load_3(c + 21);
1441 int64_t c9 = 2097151 & (load_4(c + 23) >> 5);
1442 int64_t c10 = 2097151 & (load_3(c + 26) >> 2);
1443 int64_t c11 = (load_4(c + 28) >> 7);
1444 int64_t s0;
1445 int64_t s1;
1446 int64_t s2;
1447 int64_t s3;
1448 int64_t s4;
1449 int64_t s5;
1450 int64_t s6;
1451 int64_t s7;
1452 int64_t s8;
1453 int64_t s9;
1454 int64_t s10;
1455 int64_t s11;
1456 int64_t s12;
1457 int64_t s13;
1458 int64_t s14;
1459 int64_t s15;
1460 int64_t s16;
1461 int64_t s17;
1462 int64_t s18;
1463 int64_t s19;
1464 int64_t s20;
1465 int64_t s21;
1466 int64_t s22;
1467 int64_t s23;
1468 int64_t carry0;
1469 int64_t carry1;
1470 int64_t carry2;
1471 int64_t carry3;
1472 int64_t carry4;
1473 int64_t carry5;
1474 int64_t carry6;
1475 int64_t carry7;
1476 int64_t carry8;
1477 int64_t carry9;
1478 int64_t carry10;
1479 int64_t carry11;
1480 int64_t carry12;
1481 int64_t carry13;
1482 int64_t carry14;
1483 int64_t carry15;
1484 int64_t carry16;
1485 int64_t carry17;
1486 int64_t carry18;
1487 int64_t carry19;
1488 int64_t carry20;
1489 int64_t carry21;
1490 int64_t carry22;
1491
1492 s0 = c0 + a0 * b0;
1493 s1 = c1 + a0 * b1 + a1 * b0;
1494 s2 = c2 + a0 * b2 + a1 * b1 + a2 * b0;
1495 s3 = c3 + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0;
1496 s4 = c4 + a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0;
1497 s5 = c5 + a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0;
1498 s6 = c6 + a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 + a6 * b0;
1499 s7 = c7 + a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 +
1500 a6 * b1 + a7 * b0;
1501 s8 = c8 + a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 +
1502 a6 * b2 + a7 * b1 + a8 * b0;
1503 s9 = c9 + a0 * b9 + a1 * b8 + a2 * b7 + a3 * b6 + a4 * b5 + a5 * b4 +
1504 a6 * b3 + a7 * b2 + a8 * b1 + a9 * b0;
1505 s10 = c10 + a0 * b10 + a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 +
1506 a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1 + a10 * b0;
1507 s11 = c11 + a0 * b11 + a1 * b10 + a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 +
1508 a6 * b5 + a7 * b4 + a8 * b3 + a9 * b2 + a10 * b1 + a11 * b0;
1509 s12 = a1 * b11 + a2 * b10 + a3 * b9 + a4 * b8 + a5 * b7 + a6 * b6 + a7 * b5 +
1510 a8 * b4 + a9 * b3 + a10 * b2 + a11 * b1;
1511 s13 = a2 * b11 + a3 * b10 + a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 + a8 * b5 +
1512 a9 * b4 + a10 * b3 + a11 * b2;
1513 s14 = a3 * b11 + a4 * b10 + a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 + a9 * b5 +
1514 a10 * b4 + a11 * b3;
1515 s15 = a4 * b11 + a5 * b10 + a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6 + a10 * b5 +
1516 a11 * b4;
1517 s16 = a5 * b11 + a6 * b10 + a7 * b9 + a8 * b8 + a9 * b7 + a10 * b6 + a11 * b5;
1518 s17 = a6 * b11 + a7 * b10 + a8 * b9 + a9 * b8 + a10 * b7 + a11 * b6;
1519 s18 = a7 * b11 + a8 * b10 + a9 * b9 + a10 * b8 + a11 * b7;
1520 s19 = a8 * b11 + a9 * b10 + a10 * b9 + a11 * b8;
1521 s20 = a9 * b11 + a10 * b10 + a11 * b9;
1522 s21 = a10 * b11 + a11 * b10;
1523 s22 = a11 * b11;
1524 s23 = 0;
1525
1526 carry0 = (s0 + (1 << 20)) >> 21;
1527 s1 += carry0;
1528 s0 -= int64_lshift21(carry0);
1529 carry2 = (s2 + (1 << 20)) >> 21;
1530 s3 += carry2;
1531 s2 -= int64_lshift21(carry2);
1532 carry4 = (s4 + (1 << 20)) >> 21;
1533 s5 += carry4;
1534 s4 -= int64_lshift21(carry4);
1535 carry6 = (s6 + (1 << 20)) >> 21;
1536 s7 += carry6;
1537 s6 -= int64_lshift21(carry6);
1538 carry8 = (s8 + (1 << 20)) >> 21;
1539 s9 += carry8;
1540 s8 -= int64_lshift21(carry8);
1541 carry10 = (s10 + (1 << 20)) >> 21;
1542 s11 += carry10;
1543 s10 -= int64_lshift21(carry10);
1544 carry12 = (s12 + (1 << 20)) >> 21;
1545 s13 += carry12;
1546 s12 -= int64_lshift21(carry12);
1547 carry14 = (s14 + (1 << 20)) >> 21;
1548 s15 += carry14;
1549 s14 -= int64_lshift21(carry14);
1550 carry16 = (s16 + (1 << 20)) >> 21;
1551 s17 += carry16;
1552 s16 -= int64_lshift21(carry16);
1553 carry18 = (s18 + (1 << 20)) >> 21;
1554 s19 += carry18;
1555 s18 -= int64_lshift21(carry18);
1556 carry20 = (s20 + (1 << 20)) >> 21;
1557 s21 += carry20;
1558 s20 -= int64_lshift21(carry20);
1559 carry22 = (s22 + (1 << 20)) >> 21;
1560 s23 += carry22;
1561 s22 -= int64_lshift21(carry22);
1562
1563 carry1 = (s1 + (1 << 20)) >> 21;
1564 s2 += carry1;
1565 s1 -= int64_lshift21(carry1);
1566 carry3 = (s3 + (1 << 20)) >> 21;
1567 s4 += carry3;
1568 s3 -= int64_lshift21(carry3);
1569 carry5 = (s5 + (1 << 20)) >> 21;
1570 s6 += carry5;
1571 s5 -= int64_lshift21(carry5);
1572 carry7 = (s7 + (1 << 20)) >> 21;
1573 s8 += carry7;
1574 s7 -= int64_lshift21(carry7);
1575 carry9 = (s9 + (1 << 20)) >> 21;
1576 s10 += carry9;
1577 s9 -= int64_lshift21(carry9);
1578 carry11 = (s11 + (1 << 20)) >> 21;
1579 s12 += carry11;
1580 s11 -= int64_lshift21(carry11);
1581 carry13 = (s13 + (1 << 20)) >> 21;
1582 s14 += carry13;
1583 s13 -= int64_lshift21(carry13);
1584 carry15 = (s15 + (1 << 20)) >> 21;
1585 s16 += carry15;
1586 s15 -= int64_lshift21(carry15);
1587 carry17 = (s17 + (1 << 20)) >> 21;
1588 s18 += carry17;
1589 s17 -= int64_lshift21(carry17);
1590 carry19 = (s19 + (1 << 20)) >> 21;
1591 s20 += carry19;
1592 s19 -= int64_lshift21(carry19);
1593 carry21 = (s21 + (1 << 20)) >> 21;
1594 s22 += carry21;
1595 s21 -= int64_lshift21(carry21);
1596
1597 s11 += s23 * 666643;
1598 s12 += s23 * 470296;
1599 s13 += s23 * 654183;
1600 s14 -= s23 * 997805;
1601 s15 += s23 * 136657;
1602 s16 -= s23 * 683901;
1603 s23 = 0;
1604
1605 s10 += s22 * 666643;
1606 s11 += s22 * 470296;
1607 s12 += s22 * 654183;
1608 s13 -= s22 * 997805;
1609 s14 += s22 * 136657;
1610 s15 -= s22 * 683901;
1611 s22 = 0;
1612
1613 s9 += s21 * 666643;
1614 s10 += s21 * 470296;
1615 s11 += s21 * 654183;
1616 s12 -= s21 * 997805;
1617 s13 += s21 * 136657;
1618 s14 -= s21 * 683901;
1619 s21 = 0;
1620
1621 s8 += s20 * 666643;
1622 s9 += s20 * 470296;
1623 s10 += s20 * 654183;
1624 s11 -= s20 * 997805;
1625 s12 += s20 * 136657;
1626 s13 -= s20 * 683901;
1627 s20 = 0;
1628
1629 s7 += s19 * 666643;
1630 s8 += s19 * 470296;
1631 s9 += s19 * 654183;
1632 s10 -= s19 * 997805;
1633 s11 += s19 * 136657;
1634 s12 -= s19 * 683901;
1635 s19 = 0;
1636
1637 s6 += s18 * 666643;
1638 s7 += s18 * 470296;
1639 s8 += s18 * 654183;
1640 s9 -= s18 * 997805;
1641 s10 += s18 * 136657;
1642 s11 -= s18 * 683901;
1643 s18 = 0;
1644
1645 carry6 = (s6 + (1 << 20)) >> 21;
1646 s7 += carry6;
1647 s6 -= int64_lshift21(carry6);
1648 carry8 = (s8 + (1 << 20)) >> 21;
1649 s9 += carry8;
1650 s8 -= int64_lshift21(carry8);
1651 carry10 = (s10 + (1 << 20)) >> 21;
1652 s11 += carry10;
1653 s10 -= int64_lshift21(carry10);
1654 carry12 = (s12 + (1 << 20)) >> 21;
1655 s13 += carry12;
1656 s12 -= int64_lshift21(carry12);
1657 carry14 = (s14 + (1 << 20)) >> 21;
1658 s15 += carry14;
1659 s14 -= int64_lshift21(carry14);
1660 carry16 = (s16 + (1 << 20)) >> 21;
1661 s17 += carry16;
1662 s16 -= int64_lshift21(carry16);
1663
1664 carry7 = (s7 + (1 << 20)) >> 21;
1665 s8 += carry7;
1666 s7 -= int64_lshift21(carry7);
1667 carry9 = (s9 + (1 << 20)) >> 21;
1668 s10 += carry9;
1669 s9 -= int64_lshift21(carry9);
1670 carry11 = (s11 + (1 << 20)) >> 21;
1671 s12 += carry11;
1672 s11 -= int64_lshift21(carry11);
1673 carry13 = (s13 + (1 << 20)) >> 21;
1674 s14 += carry13;
1675 s13 -= int64_lshift21(carry13);
1676 carry15 = (s15 + (1 << 20)) >> 21;
1677 s16 += carry15;
1678 s15 -= int64_lshift21(carry15);
1679
1680 s5 += s17 * 666643;
1681 s6 += s17 * 470296;
1682 s7 += s17 * 654183;
1683 s8 -= s17 * 997805;
1684 s9 += s17 * 136657;
1685 s10 -= s17 * 683901;
1686 s17 = 0;
1687
1688 s4 += s16 * 666643;
1689 s5 += s16 * 470296;
1690 s6 += s16 * 654183;
1691 s7 -= s16 * 997805;
1692 s8 += s16 * 136657;
1693 s9 -= s16 * 683901;
1694 s16 = 0;
1695
1696 s3 += s15 * 666643;
1697 s4 += s15 * 470296;
1698 s5 += s15 * 654183;
1699 s6 -= s15 * 997805;
1700 s7 += s15 * 136657;
1701 s8 -= s15 * 683901;
1702 s15 = 0;
1703
1704 s2 += s14 * 666643;
1705 s3 += s14 * 470296;
1706 s4 += s14 * 654183;
1707 s5 -= s14 * 997805;
1708 s6 += s14 * 136657;
1709 s7 -= s14 * 683901;
1710 s14 = 0;
1711
1712 s1 += s13 * 666643;
1713 s2 += s13 * 470296;
1714 s3 += s13 * 654183;
1715 s4 -= s13 * 997805;
1716 s5 += s13 * 136657;
1717 s6 -= s13 * 683901;
1718 s13 = 0;
1719
1720 s0 += s12 * 666643;
1721 s1 += s12 * 470296;
1722 s2 += s12 * 654183;
1723 s3 -= s12 * 997805;
1724 s4 += s12 * 136657;
1725 s5 -= s12 * 683901;
1726 s12 = 0;
1727
1728 carry0 = (s0 + (1 << 20)) >> 21;
1729 s1 += carry0;
1730 s0 -= int64_lshift21(carry0);
1731 carry2 = (s2 + (1 << 20)) >> 21;
1732 s3 += carry2;
1733 s2 -= int64_lshift21(carry2);
1734 carry4 = (s4 + (1 << 20)) >> 21;
1735 s5 += carry4;
1736 s4 -= int64_lshift21(carry4);
1737 carry6 = (s6 + (1 << 20)) >> 21;
1738 s7 += carry6;
1739 s6 -= int64_lshift21(carry6);
1740 carry8 = (s8 + (1 << 20)) >> 21;
1741 s9 += carry8;
1742 s8 -= int64_lshift21(carry8);
1743 carry10 = (s10 + (1 << 20)) >> 21;
1744 s11 += carry10;
1745 s10 -= int64_lshift21(carry10);
1746
1747 carry1 = (s1 + (1 << 20)) >> 21;
1748 s2 += carry1;
1749 s1 -= int64_lshift21(carry1);
1750 carry3 = (s3 + (1 << 20)) >> 21;
1751 s4 += carry3;
1752 s3 -= int64_lshift21(carry3);
1753 carry5 = (s5 + (1 << 20)) >> 21;
1754 s6 += carry5;
1755 s5 -= int64_lshift21(carry5);
1756 carry7 = (s7 + (1 << 20)) >> 21;
1757 s8 += carry7;
1758 s7 -= int64_lshift21(carry7);
1759 carry9 = (s9 + (1 << 20)) >> 21;
1760 s10 += carry9;
1761 s9 -= int64_lshift21(carry9);
1762 carry11 = (s11 + (1 << 20)) >> 21;
1763 s12 += carry11;
1764 s11 -= int64_lshift21(carry11);
1765
1766 s0 += s12 * 666643;
1767 s1 += s12 * 470296;
1768 s2 += s12 * 654183;
1769 s3 -= s12 * 997805;
1770 s4 += s12 * 136657;
1771 s5 -= s12 * 683901;
1772 s12 = 0;
1773
1774 carry0 = s0 >> 21;
1775 s1 += carry0;
1776 s0 -= int64_lshift21(carry0);
1777 carry1 = s1 >> 21;
1778 s2 += carry1;
1779 s1 -= int64_lshift21(carry1);
1780 carry2 = s2 >> 21;
1781 s3 += carry2;
1782 s2 -= int64_lshift21(carry2);
1783 carry3 = s3 >> 21;
1784 s4 += carry3;
1785 s3 -= int64_lshift21(carry3);
1786 carry4 = s4 >> 21;
1787 s5 += carry4;
1788 s4 -= int64_lshift21(carry4);
1789 carry5 = s5 >> 21;
1790 s6 += carry5;
1791 s5 -= int64_lshift21(carry5);
1792 carry6 = s6 >> 21;
1793 s7 += carry6;
1794 s6 -= int64_lshift21(carry6);
1795 carry7 = s7 >> 21;
1796 s8 += carry7;
1797 s7 -= int64_lshift21(carry7);
1798 carry8 = s8 >> 21;
1799 s9 += carry8;
1800 s8 -= int64_lshift21(carry8);
1801 carry9 = s9 >> 21;
1802 s10 += carry9;
1803 s9 -= int64_lshift21(carry9);
1804 carry10 = s10 >> 21;
1805 s11 += carry10;
1806 s10 -= int64_lshift21(carry10);
1807 carry11 = s11 >> 21;
1808 s12 += carry11;
1809 s11 -= int64_lshift21(carry11);
1810
1811 s0 += s12 * 666643;
1812 s1 += s12 * 470296;
1813 s2 += s12 * 654183;
1814 s3 -= s12 * 997805;
1815 s4 += s12 * 136657;
1816 s5 -= s12 * 683901;
1817 s12 = 0;
1818
1819 carry0 = s0 >> 21;
1820 s1 += carry0;
1821 s0 -= int64_lshift21(carry0);
1822 carry1 = s1 >> 21;
1823 s2 += carry1;
1824 s1 -= int64_lshift21(carry1);
1825 carry2 = s2 >> 21;
1826 s3 += carry2;
1827 s2 -= int64_lshift21(carry2);
1828 carry3 = s3 >> 21;
1829 s4 += carry3;
1830 s3 -= int64_lshift21(carry3);
1831 carry4 = s4 >> 21;
1832 s5 += carry4;
1833 s4 -= int64_lshift21(carry4);
1834 carry5 = s5 >> 21;
1835 s6 += carry5;
1836 s5 -= int64_lshift21(carry5);
1837 carry6 = s6 >> 21;
1838 s7 += carry6;
1839 s6 -= int64_lshift21(carry6);
1840 carry7 = s7 >> 21;
1841 s8 += carry7;
1842 s7 -= int64_lshift21(carry7);
1843 carry8 = s8 >> 21;
1844 s9 += carry8;
1845 s8 -= int64_lshift21(carry8);
1846 carry9 = s9 >> 21;
1847 s10 += carry9;
1848 s9 -= int64_lshift21(carry9);
1849 carry10 = s10 >> 21;
1850 s11 += carry10;
1851 s10 -= int64_lshift21(carry10);
1852
1853 s[0] = s0 >> 0;
1854 s[1] = s0 >> 8;
1855 s[2] = (s0 >> 16) | (s1 << 5);
1856 s[3] = s1 >> 3;
1857 s[4] = s1 >> 11;
1858 s[5] = (s1 >> 19) | (s2 << 2);
1859 s[6] = s2 >> 6;
1860 s[7] = (s2 >> 14) | (s3 << 7);
1861 s[8] = s3 >> 1;
1862 s[9] = s3 >> 9;
1863 s[10] = (s3 >> 17) | (s4 << 4);
1864 s[11] = s4 >> 4;
1865 s[12] = s4 >> 12;
1866 s[13] = (s4 >> 20) | (s5 << 1);
1867 s[14] = s5 >> 7;
1868 s[15] = (s5 >> 15) | (s6 << 6);
1869 s[16] = s6 >> 2;
1870 s[17] = s6 >> 10;
1871 s[18] = (s6 >> 18) | (s7 << 3);
1872 s[19] = s7 >> 5;
1873 s[20] = s7 >> 13;
1874 s[21] = s8 >> 0;
1875 s[22] = s8 >> 8;
1876 s[23] = (s8 >> 16) | (s9 << 5);
1877 s[24] = s9 >> 3;
1878 s[25] = s9 >> 11;
1879 s[26] = (s9 >> 19) | (s10 << 2);
1880 s[27] = s10 >> 6;
1881 s[28] = (s10 >> 14) | (s11 << 7);
1882 s[29] = s11 >> 1;
1883 s[30] = s11 >> 9;
1884 s[31] = s11 >> 17;
1885 }
1886
ED25519_keypair(uint8_t out_public_key[32],uint8_t out_private_key[64])1887 void ED25519_keypair(uint8_t out_public_key[32], uint8_t out_private_key[64]) {
1888 uint8_t seed[32];
1889 RAND_bytes(seed, 32);
1890 ED25519_keypair_from_seed(out_public_key, out_private_key, seed);
1891 }
1892
ED25519_sign(uint8_t out_sig[64],const uint8_t * message,size_t message_len,const uint8_t private_key[64])1893 int ED25519_sign(uint8_t out_sig[64], const uint8_t *message,
1894 size_t message_len, const uint8_t private_key[64]) {
1895 // NOTE: The documentation on this function says that it returns zero on
1896 // allocation failure. While that can't happen with the current
1897 // implementation, we want to reserve the ability to allocate in this
1898 // implementation in the future.
1899
1900 uint8_t az[SHA512_DIGEST_LENGTH];
1901 SHA512(private_key, 32, az);
1902
1903 az[0] &= 248;
1904 az[31] &= 63;
1905 az[31] |= 64;
1906
1907 SHA512_CTX hash_ctx;
1908 SHA512_Init(&hash_ctx);
1909 SHA512_Update(&hash_ctx, az + 32, 32);
1910 SHA512_Update(&hash_ctx, message, message_len);
1911 uint8_t nonce[SHA512_DIGEST_LENGTH];
1912 SHA512_Final(nonce, &hash_ctx);
1913
1914 x25519_sc_reduce(nonce);
1915 ge_p3 R;
1916 x25519_ge_scalarmult_base(&R, nonce);
1917 ge_p3_tobytes(out_sig, &R);
1918
1919 SHA512_Init(&hash_ctx);
1920 SHA512_Update(&hash_ctx, out_sig, 32);
1921 SHA512_Update(&hash_ctx, private_key + 32, 32);
1922 SHA512_Update(&hash_ctx, message, message_len);
1923 uint8_t hram[SHA512_DIGEST_LENGTH];
1924 SHA512_Final(hram, &hash_ctx);
1925
1926 x25519_sc_reduce(hram);
1927 sc_muladd(out_sig + 32, hram, az, nonce);
1928
1929 return 1;
1930 }
1931
ED25519_verify(const uint8_t * message,size_t message_len,const uint8_t signature[64],const uint8_t public_key[32])1932 int ED25519_verify(const uint8_t *message, size_t message_len,
1933 const uint8_t signature[64], const uint8_t public_key[32]) {
1934 ge_p3 A;
1935 if ((signature[63] & 224) != 0 ||
1936 !x25519_ge_frombytes_vartime(&A, public_key)) {
1937 return 0;
1938 }
1939
1940 fe_loose t;
1941 fe_neg(&t, &A.X);
1942 fe_carry(&A.X, &t);
1943 fe_neg(&t, &A.T);
1944 fe_carry(&A.T, &t);
1945
1946 uint8_t pkcopy[32];
1947 OPENSSL_memcpy(pkcopy, public_key, 32);
1948 uint8_t rcopy[32];
1949 OPENSSL_memcpy(rcopy, signature, 32);
1950 union {
1951 uint64_t u64[4];
1952 uint8_t u8[32];
1953 } scopy;
1954 OPENSSL_memcpy(&scopy.u8[0], signature + 32, 32);
1955
1956 // https://tools.ietf.org/html/rfc8032#section-5.1.7 requires that s be in
1957 // the range [0, order) in order to prevent signature malleability.
1958
1959 // kOrder is the order of Curve25519 in little-endian form.
1960 static const uint64_t kOrder[4] = {
1961 UINT64_C(0x5812631a5cf5d3ed),
1962 UINT64_C(0x14def9dea2f79cd6),
1963 0,
1964 UINT64_C(0x1000000000000000),
1965 };
1966 for (size_t i = 3;; i--) {
1967 if (scopy.u64[i] > kOrder[i]) {
1968 return 0;
1969 } else if (scopy.u64[i] < kOrder[i]) {
1970 break;
1971 } else if (i == 0) {
1972 return 0;
1973 }
1974 }
1975
1976 SHA512_CTX hash_ctx;
1977 SHA512_Init(&hash_ctx);
1978 SHA512_Update(&hash_ctx, signature, 32);
1979 SHA512_Update(&hash_ctx, public_key, 32);
1980 SHA512_Update(&hash_ctx, message, message_len);
1981 uint8_t h[SHA512_DIGEST_LENGTH];
1982 SHA512_Final(h, &hash_ctx);
1983
1984 x25519_sc_reduce(h);
1985
1986 ge_p2 R;
1987 ge_double_scalarmult_vartime(&R, h, &A, scopy.u8);
1988
1989 uint8_t rcheck[32];
1990 x25519_ge_tobytes(rcheck, &R);
1991
1992 return CRYPTO_memcmp(rcheck, rcopy, sizeof(rcheck)) == 0;
1993 }
1994
ED25519_keypair_from_seed(uint8_t out_public_key[32],uint8_t out_private_key[64],const uint8_t seed[32])1995 void ED25519_keypair_from_seed(uint8_t out_public_key[32],
1996 uint8_t out_private_key[64],
1997 const uint8_t seed[32]) {
1998 uint8_t az[SHA512_DIGEST_LENGTH];
1999 SHA512(seed, 32, az);
2000
2001 az[0] &= 248;
2002 az[31] &= 127;
2003 az[31] |= 64;
2004
2005 ge_p3 A;
2006 x25519_ge_scalarmult_base(&A, az);
2007 ge_p3_tobytes(out_public_key, &A);
2008
2009 OPENSSL_memcpy(out_private_key, seed, 32);
2010 OPENSSL_memcpy(out_private_key + 32, out_public_key, 32);
2011 }
2012
2013
x25519_scalar_mult_generic(uint8_t out[32],const uint8_t scalar[32],const uint8_t point[32])2014 static void x25519_scalar_mult_generic(uint8_t out[32],
2015 const uint8_t scalar[32],
2016 const uint8_t point[32]) {
2017 fe x1, x2, z2, x3, z3, tmp0, tmp1;
2018 fe_loose x2l, z2l, x3l, tmp0l, tmp1l;
2019
2020 uint8_t e[32];
2021 OPENSSL_memcpy(e, scalar, 32);
2022 e[0] &= 248;
2023 e[31] &= 127;
2024 e[31] |= 64;
2025
2026 // The following implementation was transcribed to Coq and proven to
2027 // correspond to unary scalar multiplication in affine coordinates given that
2028 // x1 != 0 is the x coordinate of some point on the curve. It was also checked
2029 // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2
2030 // = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the
2031 // underlying field, so it applies to Curve25519 itself and the quadratic
2032 // twist of Curve25519. It was not proven in Coq that prime-field arithmetic
2033 // correctly simulates extension-field arithmetic on prime-field values.
2034 // The decoding of the byte array representation of e was not considered.
2035 // Specification of Montgomery curves in affine coordinates:
2036 // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27>
2037 // Proof that these form a group that is isomorphic to a Weierstrass curve:
2038 // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35>
2039 // Coq transcription and correctness proof of the loop (where scalarbits=255):
2040 // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118>
2041 // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278>
2042 // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0
2043 fe_frombytes(&x1, point);
2044 fe_1(&x2);
2045 fe_0(&z2);
2046 fe_copy(&x3, &x1);
2047 fe_1(&z3);
2048
2049 unsigned swap = 0;
2050 int pos;
2051 for (pos = 254; pos >= 0; --pos) {
2052 // loop invariant as of right before the test, for the case where x1 != 0:
2053 // pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero
2054 // let r := e >> (pos+1) in the following equalities of projective points:
2055 // to_xz (r*P) === if swap then (x3, z3) else (x2, z2)
2056 // to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
2057 // x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P)
2058 unsigned b = 1 & (e[pos / 8] >> (pos & 7));
2059 swap ^= b;
2060 fe_cswap(&x2, &x3, swap);
2061 fe_cswap(&z2, &z3, swap);
2062 swap = b;
2063 // Coq transcription of ladderstep formula (called from transcribed loop):
2064 // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89>
2065 // <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131>
2066 // x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217>
2067 // x1 = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147>
2068 fe_sub(&tmp0l, &x3, &z3);
2069 fe_sub(&tmp1l, &x2, &z2);
2070 fe_add(&x2l, &x2, &z2);
2071 fe_add(&z2l, &x3, &z3);
2072 fe_mul_tll(&z3, &tmp0l, &x2l);
2073 fe_mul_tll(&z2, &z2l, &tmp1l);
2074 fe_sq_tl(&tmp0, &tmp1l);
2075 fe_sq_tl(&tmp1, &x2l);
2076 fe_add(&x3l, &z3, &z2);
2077 fe_sub(&z2l, &z3, &z2);
2078 fe_mul_ttt(&x2, &tmp1, &tmp0);
2079 fe_sub(&tmp1l, &tmp1, &tmp0);
2080 fe_sq_tl(&z2, &z2l);
2081 fe_mul121666(&z3, &tmp1l);
2082 fe_sq_tl(&x3, &x3l);
2083 fe_add(&tmp0l, &tmp0, &z3);
2084 fe_mul_ttt(&z3, &x1, &z2);
2085 fe_mul_tll(&z2, &tmp1l, &tmp0l);
2086 }
2087 // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2)
2088 fe_cswap(&x2, &x3, swap);
2089 fe_cswap(&z2, &z3, swap);
2090
2091 fe_invert(&z2, &z2);
2092 fe_mul_ttt(&x2, &x2, &z2);
2093 fe_tobytes(out, &x2);
2094 }
2095
x25519_scalar_mult(uint8_t out[32],const uint8_t scalar[32],const uint8_t point[32])2096 static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32],
2097 const uint8_t point[32]) {
2098 #if defined(BORINGSSL_X25519_NEON)
2099 if (CRYPTO_is_NEON_capable()) {
2100 x25519_NEON(out, scalar, point);
2101 return;
2102 }
2103 #endif
2104
2105 x25519_scalar_mult_generic(out, scalar, point);
2106 }
2107
X25519_keypair(uint8_t out_public_value[32],uint8_t out_private_key[32])2108 void X25519_keypair(uint8_t out_public_value[32], uint8_t out_private_key[32]) {
2109 RAND_bytes(out_private_key, 32);
2110
2111 // All X25519 implementations should decode scalars correctly (see
2112 // https://tools.ietf.org/html/rfc7748#section-5). However, if an
2113 // implementation doesn't then it might interoperate with random keys a
2114 // fraction of the time because they'll, randomly, happen to be correctly
2115 // formed.
2116 //
2117 // Thus we do the opposite of the masking here to make sure that our private
2118 // keys are never correctly masked and so, hopefully, any incorrect
2119 // implementations are deterministically broken.
2120 //
2121 // This does not affect security because, although we're throwing away
2122 // entropy, a valid implementation of scalarmult should throw away the exact
2123 // same bits anyway.
2124 out_private_key[0] |= ~248;
2125 out_private_key[31] &= ~64;
2126 out_private_key[31] |= ~127;
2127
2128 X25519_public_from_private(out_public_value, out_private_key);
2129 }
2130
X25519(uint8_t out_shared_key[32],const uint8_t private_key[32],const uint8_t peer_public_value[32])2131 int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32],
2132 const uint8_t peer_public_value[32]) {
2133 static const uint8_t kZeros[32] = {0};
2134 x25519_scalar_mult(out_shared_key, private_key, peer_public_value);
2135 // The all-zero output results when the input is a point of small order.
2136 return CRYPTO_memcmp(kZeros, out_shared_key, 32) != 0;
2137 }
2138
X25519_public_from_private(uint8_t out_public_value[32],const uint8_t private_key[32])2139 void X25519_public_from_private(uint8_t out_public_value[32],
2140 const uint8_t private_key[32]) {
2141 #if defined(BORINGSSL_X25519_NEON)
2142 if (CRYPTO_is_NEON_capable()) {
2143 static const uint8_t kMongomeryBasePoint[32] = {9};
2144 x25519_NEON(out_public_value, private_key, kMongomeryBasePoint);
2145 return;
2146 }
2147 #endif
2148
2149 uint8_t e[32];
2150 OPENSSL_memcpy(e, private_key, 32);
2151 e[0] &= 248;
2152 e[31] &= 127;
2153 e[31] |= 64;
2154
2155 ge_p3 A;
2156 x25519_ge_scalarmult_base(&A, e);
2157
2158 // We only need the u-coordinate of the curve25519 point. The map is
2159 // u=(y+1)/(1-y). Since y=Y/Z, this gives u=(Z+Y)/(Z-Y).
2160 fe_loose zplusy, zminusy;
2161 fe zminusy_inv;
2162 fe_add(&zplusy, &A.Z, &A.Y);
2163 fe_sub(&zminusy, &A.Z, &A.Y);
2164 fe_loose_invert(&zminusy_inv, &zminusy);
2165 fe_mul_tlt(&zminusy_inv, &zplusy, &zminusy_inv);
2166 fe_tobytes(out_public_value, &zminusy_inv);
2167 }
2168