1 /*
2 * This file is part of the openHiTLS project.
3 *
4 * openHiTLS is licensed under the Mulan PSL v2.
5 * You can use this software according to the terms and conditions of the Mulan PSL v2.
6 * You may obtain a copy of Mulan PSL v2 at:
7 *
8 * http://license.coscl.org.cn/MulanPSL2
9 *
10 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13 * See the Mulan PSL v2 for more details.
14 */
15 /* Some of these codes are adapted from https://ed25519.cr.yp.to/software.html */
16
17 #include "hitls_build.h"
18 #ifdef HITLS_CRYPTO_CURVE25519
19
20 #include <stdbool.h>
21 #include "securec.h"
22 #include "curve25519_local.h"
23 #include "bsl_sal.h"
24 #ifdef HITLS_CRYPTO_ED25519
25 #define CRYPT_CURVE25519_OPTLEN 32
26 #endif
27 #define CONDITION_COPY(dst, src, indicate) \
28 (int32_t)((uint32_t)(dst) ^ (((uint32_t)(dst) ^ (uint32_t)(src)) & (indicate)))
29
30 // process Fp multiplication carry
31 #define FP_PROCESS_CARRY(h) \
32 do { \
33 int64_t carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9; \
34 carry0 = h##0 + (1 << 25); h##1 += carry0 >> 26; h##0 -= carry0 & CURVE25519_MASK_HIGH_38; \
35 carry4 = h##4 + (1 << 25); h##5 += carry4 >> 26; h##4 -= carry4 & CURVE25519_MASK_HIGH_38; \
36 carry1 = h##1 + (1 << 24); h##2 += carry1 >> 25; h##1 -= carry1 & CURVE25519_MASK_HIGH_39; \
37 carry5 = h##5 + (1 << 24); h##6 += carry5 >> 25; h##5 -= carry5 & CURVE25519_MASK_HIGH_39; \
38 carry2 = h##2 + (1 << 25); h##3 += carry2 >> 26; h##2 -= carry2 & CURVE25519_MASK_HIGH_38; \
39 carry6 = h##6 + (1 << 25); h##7 += carry6 >> 26; h##6 -= carry6 & CURVE25519_MASK_HIGH_38; \
40 carry3 = h##3 + (1 << 24); h##4 += carry3 >> 25; h##3 -= carry3 & CURVE25519_MASK_HIGH_39; \
41 carry7 = h##7 + (1 << 24); h##8 += carry7 >> 25; h##7 -= carry7 & CURVE25519_MASK_HIGH_39; \
42 carry4 = h##4 + (1 << 25); h##5 += carry4 >> 26; h##4 -= carry4 & CURVE25519_MASK_HIGH_38; \
43 carry8 = h##8 + (1 << 25); h##9 += carry8 >> 26; h##8 -= carry8 & CURVE25519_MASK_HIGH_38; \
44 carry9 = h##9 + (1 << 24); h##0 += (carry9 >> 25) * 19; h##9 -= carry9 & CURVE25519_MASK_HIGH_39; \
45 carry0 = h##0 + (1 << 25); h##1 += carry0 >> 26; h##0 -= carry0 & CURVE25519_MASK_HIGH_38; \
46 } while (0)
47
48 // h0...h9 to Fp25
49 #define INT64_2_FP25(h, out) \
50 do { \
51 (out)[0] = (int32_t)h##0; \
52 (out)[1] = (int32_t)h##1; \
53 (out)[2] = (int32_t)h##2; \
54 (out)[3] = (int32_t)h##3; \
55 (out)[4] = (int32_t)h##4; \
56 (out)[5] = (int32_t)h##5; \
57 (out)[6] = (int32_t)h##6; \
58 (out)[7] = (int32_t)h##7; \
59 (out)[8] = (int32_t)h##8; \
60 (out)[9] = (int32_t)h##9; \
61 } while (0)
62
63 #define FP25_2_INT32(in, out) \
64 do { \
65 out##0 = (in)[0]; \
66 out##1 = (in)[1]; \
67 out##2 = (in)[2]; \
68 out##3 = (in)[3]; \
69 out##4 = (in)[4]; \
70 out##5 = (in)[5]; \
71 out##6 = (in)[6]; \
72 out##7 = (in)[7]; \
73 out##8 = (in)[8]; \
74 out##9 = (in)[9]; \
75 } while (0)
76
77 /* out = f * g */
FpMul(Fp25 out,const Fp25 f,const Fp25 g)78 void FpMul(Fp25 out, const Fp25 f, const Fp25 g)
79 {
80 int32_t f0, f1, f2, f3, f4, f5, f6, f7, f8, f9;
81 int32_t g0, g1, g2, g3, g4, g5, g6, g7, g8, g9;
82 int64_t h0, h1, h2, h3, h4, h5, h6, h7, h8, h9;
83
84 FP25_2_INT32(f, f);
85 FP25_2_INT32(g, g);
86
87 int32_t f1_2 = f1 * 2;
88 int32_t f3_2 = f3 * 2;
89 int32_t f5_2 = f5 * 2;
90 int32_t f7_2 = f7 * 2;
91 int32_t f9_2 = f9 * 2;
92
93 int32_t g1_19 = g1 * 19;
94 int32_t g2_19 = g2 * 19;
95 int32_t g3_19 = g3 * 19;
96 int32_t g4_19 = g4 * 19;
97 int32_t g5_19 = g5 * 19;
98 int32_t g6_19 = g6 * 19;
99 int32_t g7_19 = g7 * 19;
100 int32_t g8_19 = g8 * 19;
101 int32_t g9_19 = g9 * 19;
102
103 /* h0 = f0g0 + 38f1g9 + 19f2g8 + 38f3g7 + 19f4g6 + 38f5g5 + 19f6g4 + 38f7g3 + 19f8g2 + 38f9g1
104 h1 = f0g1 + f1g0 + 19f2g9 + 19f3g8 + 19f4g7 + 19f5g6 + 19f6g5 + 19f7g4 + 19f8g3 + 19f9g2
105 h2 = f0g2 + 2f1g1 + f2g0 + 38f3g9 + 19f4g8 + 38f5g7 + 19f6g6 + 38f7g5 + 19f8g4 + 38f9g2
106 h3 = f0g3 + f1g2 + f2g1 + f3g0 + 19f4g9 + 19f5g8 + 19f6g7 + 19f7g6 + 19f8g5 + 19f9g4
107 h4 = f0g4 + 2f1g3 + f2g2 + 2f3g1 + f4g0 + 38f5g9 + 19f6g8 + 38f7g7 + 19f8g6 + 38f9g5
108 h5 = f0g5 + f1g4 + f2g3 + f3g2 + f4g1 + f5g0 + 19f6g9 + 19f7g8 + 19f8g7 + 19f9g6
109 h6 = f0g6 + 2f1g5 + f2g4 + 2f3g3 + f4g2 + 2f5g1 + f6g0 + 38f7g9 + 19f8g8 + 38f9g7
110 h7 = f0g7 + f1g6 + f2g5 + f3g4 + f4g3 + f5g2 + f6g1 + f7g0 + 19f8g9 + 19f9g8
111 h8 = f0g8 + 2f1g7 + f2g6 + 2f3g5 + f4g4 + 2f5g3 + f6g2 + 2f7g1 + f8g0 + 38f9g9
112 h9 = f0g9 + f1g8 + f2g7 + f3g6 + f4g5 + f5g4 + f6g3 + f7g2 + f8g1 + f9g0
113 The calculation is performed by column. */
114
115 h0 = (int64_t)f0 * g0;
116 h1 = (int64_t)f0 * g1;
117 h2 = (int64_t)f0 * g2;
118 h3 = (int64_t)f0 * g3;
119 h4 = (int64_t)f0 * g4;
120 h5 = (int64_t)f0 * g5;
121 h6 = (int64_t)f0 * g6;
122 h7 = (int64_t)f0 * g7;
123 h8 = (int64_t)f0 * g8;
124 h9 = (int64_t)f0 * g9;
125
126 h0 += (int64_t)f1_2 * g9_19;
127 h1 += (int64_t)f1 * g0;
128 h2 += (int64_t)f1_2 * g1;
129 h3 += (int64_t)f1 * g2;
130 h4 += (int64_t)f1_2 * g3;
131 h5 += (int64_t)f1 * g4;
132 h6 += (int64_t)f1_2 * g5;
133 h7 += (int64_t)f1 * g6;
134 h8 += (int64_t)f1_2 * g7;
135 h9 += (int64_t)f1 * g8;
136
137 h0 += (int64_t)f2 * g8_19;
138 h1 += (int64_t)f2 * g9_19;
139 h2 += (int64_t)f2 * g0;
140 h3 += (int64_t)f2 * g1;
141 h4 += (int64_t)f2 * g2;
142 h5 += (int64_t)f2 * g3;
143 h6 += (int64_t)f2 * g4;
144 h7 += (int64_t)f2 * g5;
145 h8 += (int64_t)f2 * g6;
146 h9 += (int64_t)f2 * g7;
147
148 h0 += (int64_t)f3_2 * g7_19;
149 h1 += (int64_t)f3 * g8_19;
150 h2 += (int64_t)f3_2 * g9_19;
151 h3 += (int64_t)f3 * g0;
152 h4 += (int64_t)f3_2 * g1;
153 h5 += (int64_t)f3 * g2;
154 h6 += (int64_t)f3_2 * g3;
155 h7 += (int64_t)f3 * g4;
156 h8 += (int64_t)f3_2 * g5;
157 h9 += (int64_t)f3 * g6;
158
159 h0 += (int64_t)f4 * g6_19;
160 h1 += (int64_t)f4 * g7_19;
161 h2 += (int64_t)f4 * g8_19;
162 h3 += (int64_t)f4 * g9_19;
163 h4 += (int64_t)f4 * g0;
164 h5 += (int64_t)f4 * g1;
165 h6 += (int64_t)f4 * g2;
166 h7 += (int64_t)f4 * g3;
167 h8 += (int64_t)f4 * g4;
168 h9 += (int64_t)f4 * g5;
169
170 h0 += (int64_t)f5_2 * g5_19;
171 h1 += (int64_t)f5 * g6_19;
172 h2 += (int64_t)f5_2 * g7_19;
173 h3 += (int64_t)f5 * g8_19;
174 h4 += (int64_t)f5_2 * g9_19;
175 h5 += (int64_t)f5 * g0;
176 h6 += (int64_t)f5_2 * g1;
177 h7 += (int64_t)f5 * g2;
178 h8 += (int64_t)f5_2 * g3;
179 h9 += (int64_t)f5 * g4;
180
181 h0 += (int64_t)f6 * g4_19;
182 h1 += (int64_t)f6 * g5_19;
183 h2 += (int64_t)f6 * g6_19;
184 h3 += (int64_t)f6 * g7_19;
185 h4 += (int64_t)f6 * g8_19;
186 h5 += (int64_t)f6 * g9_19;
187 h6 += (int64_t)f6 * g0;
188 h7 += (int64_t)f6 * g1;
189 h8 += (int64_t)f6 * g2;
190 h9 += (int64_t)f6 * g3;
191
192 h0 += (int64_t)f7_2 * g3_19;
193 h1 += (int64_t)f7 * g4_19;
194 h2 += (int64_t)f7_2 * g5_19;
195 h3 += (int64_t)f7 * g6_19;
196 h4 += (int64_t)f7_2 * g7_19;
197 h5 += (int64_t)f7 * g8_19;
198 h6 += (int64_t)f7_2 * g9_19;
199 h7 += (int64_t)f7 * g0;
200 h8 += (int64_t)f7_2 * g1;
201 h9 += (int64_t)f7 * g2;
202
203 h0 += (int64_t)f8 * g2_19;
204 h1 += (int64_t)f8 * g3_19;
205 h2 += (int64_t)f8 * g4_19;
206 h3 += (int64_t)f8 * g5_19;
207 h4 += (int64_t)f8 * g6_19;
208 h5 += (int64_t)f8 * g7_19;
209 h6 += (int64_t)f8 * g8_19;
210 h7 += (int64_t)f8 * g9_19;
211 h8 += (int64_t)f8 * g0;
212 h9 += (int64_t)f8 * g1;
213
214 h0 += (int64_t)f9_2 * g1_19;
215 h1 += (int64_t)f9 * g2_19;
216 h2 += (int64_t)f9_2 * g3_19;
217 h3 += (int64_t)f9 * g4_19;
218 h4 += (int64_t)f9_2 * g5_19;
219 h5 += (int64_t)f9 * g6_19;
220 h6 += (int64_t)f9_2 * g7_19;
221 h7 += (int64_t)f9 * g8_19;
222 h8 += (int64_t)f9_2 * g9_19;
223 h9 += (int64_t)f9 * g0;
224
225 FP_PROCESS_CARRY(h);
226
227 INT64_2_FP25(h, out);
228 }
229
FpSquareDoubleCore(Fp25 out,const Fp25 in,bool doDouble)230 void FpSquareDoubleCore(Fp25 out, const Fp25 in, bool doDouble)
231 {
232 int64_t h0, h1, h2, h3, h4, h5, h6, h7, h8, h9;
233 int32_t f0, f1, f2, f3, f4, f5, f6, f7, f8, f9;
234
235 FP25_2_INT32(in, f);
236
237 int32_t f0_2 = f0 * 2;
238 int32_t f1_2 = f1 * 2;
239 int32_t f2_2 = f2 * 2;
240 int32_t f3_2 = f3 * 2;
241 int32_t f4_2 = f4 * 2;
242 int32_t f5_2 = f5 * 2;
243 int32_t f6_2 = f6 * 2;
244 int32_t f7_2 = f7 * 2;
245
246 int32_t f9_38 = f9 * 38;
247 int32_t f8_19 = f8 * 19;
248 int32_t f7_38 = f7 * 38;
249 int32_t f6_19 = f6 * 19;
250 int32_t f5_19 = f5 * 19;
251
252 h0 = (int64_t)f0 * f0;
253 h1 = (int64_t)f0_2 * f1;
254 h2 = (int64_t)f0_2 * f2;
255 h3 = (int64_t)f0_2 * f3;
256 h4 = (int64_t)f0_2 * f4;
257 h5 = (int64_t)f0_2 * f5;
258 h6 = (int64_t)f0_2 * f6;
259 h7 = (int64_t)f0_2 * f7;
260 h8 = (int64_t)f0_2 * f8;
261 h9 = (int64_t)f0_2 * f9;
262
263 h0 += (int64_t)f1_2 * f9_38;
264 h1 += (int64_t)f2 * f9_38;
265 h2 += (int64_t)f1_2 * f1;
266 h3 += (int64_t)f1_2 * f2;
267 h4 += (int64_t)f1_2 * f3_2;
268 h5 += (int64_t)f1_2 * f4;
269 h6 += (int64_t)f1_2 * f5_2;
270 h7 += (int64_t)f1_2 * f6;
271 h8 += (int64_t)f1_2 * f7_2;
272 h9 += (int64_t)f1_2 * f8;
273
274 h0 += (int64_t)f2_2 * f8_19;
275 h1 += (int64_t)f3_2 * f8_19;
276 h2 += (int64_t)f3_2 * f9_38;
277 h3 += (int64_t)f4 * f9_38;
278 h4 += (int64_t)f2 * f2;
279 h5 += (int64_t)f2_2 * f3;
280 h6 += (int64_t)f2_2 * f4;
281 h7 += (int64_t)f2_2 * f5;
282 h8 += (int64_t)f2_2 * f6;
283 h9 += (int64_t)f2_2 * f7;
284
285 h0 += (int64_t)f3_2 * f7_38;
286 h1 += (int64_t)f4 * f7_38;
287 h2 += (int64_t)f4_2 * f8_19;
288 h3 += (int64_t)f5_2 * f8_19;
289 h4 += (int64_t)f5_2 * f9_38;
290 h5 += (int64_t)f6 * f9_38;
291 h6 += (int64_t)f3_2 * f3;
292 h7 += (int64_t)f3_2 * f4;
293 h8 += (int64_t)f3_2 * f5_2;
294 h9 += (int64_t)f3_2 * f6;
295
296 h0 += (int64_t)f4_2 * f6_19;
297 h1 += (int64_t)f5_2 * f6_19;
298 h2 += (int64_t)f5_2 * f7_38;
299 h3 += (int64_t)f6 * f7_38;
300 h4 += (int64_t)f6_2 * f8_19;
301 h5 += (int64_t)f7_2 * f8_19;
302 h6 += (int64_t)f7_2 * f9_38;
303 h7 += (int64_t)f8 * f9_38;
304 h8 += (int64_t)f4 * f4;
305 h9 += (int64_t)f4_2 * f5;
306
307 h0 += (int64_t)f5_2 * f5_19;
308 h2 += (int64_t)f6 * f6_19;
309 h4 += (int64_t)f7 * f7_38;
310 h6 += (int64_t)f8 * f8_19;
311 h8 += (int64_t)f9 * f9_38;
312
313 if (doDouble) {
314 h0 *= 2;
315 h1 *= 2;
316 h2 *= 2;
317 h3 *= 2;
318 h4 *= 2;
319 h5 *= 2;
320 h6 *= 2;
321 h7 *= 2;
322 h8 *= 2;
323 h9 *= 2;
324 }
325
326 FP_PROCESS_CARRY(h);
327
328 INT64_2_FP25(h, out);
329 }
330
331 /* out = in1 ^ (4 * 2 ^ (2 * times)) * in2 */
FpMultiSquare(Fp25 in1,Fp25 in2,Fp25 out,int32_t times)332 static void FpMultiSquare(Fp25 in1, Fp25 in2, Fp25 out, int32_t times)
333 {
334 int32_t i;
335 Fp25 temp1, temp2;
336 FpSquareDoubleCore(temp1, in1, false);
337 FpSquareDoubleCore(temp2, temp1, false);
338 for (i = 0; i < times; i++) {
339 FpSquareDoubleCore(temp1, temp2, false);
340 FpSquareDoubleCore(temp2, temp1, false);
341 }
342 FpMul(out, in2, temp2);
343 }
344
345 /* out = a ^ -1 */
FpInvert(Fp25 out,const Fp25 a)346 void FpInvert(Fp25 out, const Fp25 a)
347 {
348 int32_t i;
349 Fp25 a0; /* save a^1 */
350 Fp25 a1; /* save a^2 */
351 Fp25 a2; /* save a^11 */
352 Fp25 a3; /* save a^(2^5-1) */
353 Fp25 a4; /* save a^(2^10-1) */
354 Fp25 a5; /* save a^(2^20-1) */
355 Fp25 a6; /* save a^(2^40-1) */
356 Fp25 a7; /* save a^(2^50-1) */
357 Fp25 a8; /* save a^(2^100-1) */
358 Fp25 a9; /* save a^(2^200-1) */
359 Fp25 a10; /* save a^(2^250-1) */
360 Fp25 temp1, temp2;
361
362 /* We know a×b=1(mod p), then a and b are inverses of mod p, i.e. a=b^(-1), b=a^(-1);
363 * According to Fermat's little theorem a^(p-1)=1(mod p), so a*a^(p-2)=1(mod p);
364 * So the inverse element of a is a^(-1) = a^(p-2)(mod p)
365 * Here it is, p=2^255-19, thus we need to compute a^(2^255-21)(mod(2^255-19))
366 */
367
368 /* a^1 */
369 CURVE25519_FP_COPY(a0, a);
370
371 /* a^2 */
372 FpSquareDoubleCore(a1, a0, false);
373
374 /* a^4 */
375 FpSquareDoubleCore(temp1, a1, false);
376
377 /* a^8 */
378 FpSquareDoubleCore(temp2, temp1, false);
379
380 /* a^9 */
381 FpMul(temp1, a0, temp2);
382
383 /* a^11 */
384 FpMul(a2, a1, temp1);
385
386 /* a^22 */
387 FpSquareDoubleCore(temp2, a2, false);
388
389 /* a^(2^5-1) = a^(9+22) */
390 FpMul(a3, temp1, temp2);
391
392 /* a^(2^10-1) = a^(2^10-2^5) * a^(2^5-1) */
393 FpSquareDoubleCore(temp1, a3, false);
394 for (i = 0; i < 2; i++) { // (2 * 2)^2
395 FpSquareDoubleCore(temp2, temp1, false);
396 FpSquareDoubleCore(temp1, temp2, false);
397 }
398 FpMul(a4, a3, temp1);
399
400 /* a^(2^20-1) = a^(2^20-2^10) * a^(2^10-1) */
401 FpMultiSquare(a4, a4, a5, 4); // (2 * 2) ^ 4
402
403 /* a^(2^40-1) = a^(2^40-2^20) * a^(2^20-1) */
404 FpMultiSquare(a5, a5, a6, 9); // (2 * 2) ^ 9
405
406 /* a^(2^50-1) = a^(2^50-2^10) * a^(2^10-1) */
407 FpMultiSquare(a6, a4, a7, 4); // (2 * 2) ^ 4
408
409 /* a^(2^100-1) = a^(2^100-2^50) * a^(2^50-1) */
410 FpMultiSquare(a7, a7, a8, 24); // (2 * 2) ^ 24
411
412 /* a^(2^200-1) = a^(2^200-2^100) * a^(2^100-1) */
413 FpMultiSquare(a8, a8, a9, 49); // (2 * 2) ^ 49
414
415 /* a^(2^250-1) = a^(2^250-2^50) * a^(2^50-1) */
416 FpMultiSquare(a9, a7, a10, 24); // (2 * 2) ^ 24
417
418 /* a^(2^5*(2^250-1)) = (a^(2^250-1))^5 */
419 FpSquareDoubleCore(temp1, a10, false);
420 FpSquareDoubleCore(temp2, temp1, false);
421 FpSquareDoubleCore(temp1, temp2, false);
422 FpSquareDoubleCore(temp2, temp1, false);
423 FpSquareDoubleCore(temp1, temp2, false);
424
425 /* The output:a^(2^255-21) = a(2^5*(2^250-1)+11) = a^(2^5*(2^250-1)) * a^11 */
426 FpMul(out, a2, temp1);
427 }
428
429 #ifdef HITLS_CRYPTO_ED25519
430 /* out = in ^ ((q - 5) / 8) */
FpPowq58(Fp25 out,Fp25 in)431 static void FpPowq58(Fp25 out, Fp25 in)
432 {
433 Fp25 a, b, c;
434 int32_t i;
435 FpSquareDoubleCore(a, in, false);
436 FpSquareDoubleCore(b, a, false);
437 FpSquareDoubleCore(b, b, false);
438 FpMul(b, in, b);
439 FpMul(a, a, b);
440 FpSquareDoubleCore(a, a, false);
441 FpMul(a, b, a);
442 FpSquareDoubleCore(b, a, false);
443 // b = a ^ (2^5)
444 for (i = 1; i < 5; i++) {
445 FpSquareDoubleCore(b, b, false);
446 }
447 FpMul(a, b, a);
448 FpSquareDoubleCore(b, a, false);
449 // b = a ^ (2^10)
450 for (i = 1; i < 10; i++) {
451 FpSquareDoubleCore(b, b, false);
452 }
453 FpMul(b, b, a);
454 FpSquareDoubleCore(c, b, false);
455
456 // c = b ^ (2^20)
457 for (i = 1; i < 20; i++) {
458 FpSquareDoubleCore(c, c, false);
459 }
460 FpMul(b, c, b);
461
462 // b = b ^ (2^10)
463 for (i = 0; i < 10; i++) {
464 FpSquareDoubleCore(b, b, false);
465 }
466
467 FpMul(a, b, a);
468 FpSquareDoubleCore(b, a, false);
469
470 // b = a ^ (2^50)
471 for (i = 1; i < 50; i++) {
472 FpSquareDoubleCore(b, b, false);
473 }
474 FpMul(b, b, a);
475 FpSquareDoubleCore(c, b, false);
476
477 // c = b ^ (2 ^ 100)
478 for (i = 1; i < 100; i++) {
479 FpSquareDoubleCore(c, c, false);
480 }
481 FpMul(b, c, b);
482
483 // b = b ^ (2^50)
484 for (i = 0; i < 50; i++) {
485 FpSquareDoubleCore(b, b, false);
486 }
487 FpMul(a, b, a);
488 FpSquareDoubleCore(a, a, false);
489 FpSquareDoubleCore(a, a, false);
490 FpMul(out, a, in);
491 }
492 #endif
493
PaddingUnload(uint8_t out[32],Fp25 pFp25)494 static void PaddingUnload(uint8_t out[32], Fp25 pFp25)
495 {
496 int32_t *p = (int32_t *)pFp25;
497
498 /* Take a polynomial form number into a 32-byte array */
499 CURVE25519_BYTES4_PADDING_UNLOAD(out, 2, p); /* p0 unload 4 bytes on out[0] expand 2 */
500 CURVE25519_BYTES3_PADDING_UNLOAD(out + 4, 2, 3, p + 1); /* p1 unload 3 bytes on out[4] shift 2 expand 3 */
501 CURVE25519_BYTES3_PADDING_UNLOAD(out + 7, 3, 5, p + 2); /* p2 unload 3 bytes on out[7] shift 3 expand 5 */
502 CURVE25519_BYTES3_PADDING_UNLOAD(out + 10, 5, 6, p + 3); /* p3 unload 3 bytes on out[10] shift 5 expand 6 */
503 CURVE25519_BYTES3_UNLOAD(out + 13, 6, p + 4); /* p4 unload 3 bytes on out[13] shift 6 */
504
505 CURVE25519_BYTES4_PADDING_UNLOAD(out + 16, 1, p + 5); /* p5 unload 4 bytes on out[16] expand 1 */
506 CURVE25519_BYTES3_PADDING_UNLOAD(out + 20, 1, 3, p + 6); /* p6 unload 3 bytes on out[20] shift 1 expand 3 */
507 CURVE25519_BYTES3_PADDING_UNLOAD(out + 23, 3, 4, p + 7); /* p7 unload 3 bytes on out[23] shift 3 expand 4 */
508 CURVE25519_BYTES3_PADDING_UNLOAD(out + 26, 4, 6, p + 8); /* p8 unload 3 bytes on out[26] shift 4 expand 6 */
509 CURVE25519_BYTES3_UNLOAD(out + 29, 6, p + 9); /* p9 unload 3 bytes on out[29] shift 6 */
510 }
511
PolynomialToData(uint8_t out[32],const Fp25 polynomial)512 void PolynomialToData(uint8_t out[32], const Fp25 polynomial)
513 {
514 Fp25 pFp25;
515 uint32_t pos;
516 uint32_t over;
517 uint32_t mul19;
518 uint32_t signMask;
519
520 CURVE25519_FP_COPY(pFp25, polynomial);
521
522 /* First process, all the carry transport to pFp25[0] */
523 mul19 = (uint32_t)pFp25[9] * 19; // mul 19 for mod
524 over = mul19 + (1 << 24); // plus 1 << 24 for carry
525 // restricted to 25 bits, shift 31 for sign
526 signMask = (-(over >> 31)) & MASK_HIGH32(25);
527 over = (over >> 25) | signMask; // 25 bits
528 pos = 0;
529 do {
530 over = (uint32_t)pFp25[pos] + over;
531 // first carry is restricted to 25 bits, shift 31 for sign
532 signMask = (-(over >> 31)) & MASK_HIGH32(25);
533 over = (over >> 25) | signMask; // 25 bits
534 pos++;
535
536 over = (uint32_t)pFp25[pos] + over;
537 // second carry is restricted to 26 bits, shift 31 for sign
538 signMask = (-(over >> 31)) & MASK_HIGH32(26);
539 over = (over >> 26) | signMask; // 26 bits
540 pos++;
541 } while (pos < 10); // process from 0 to 9, pos < 10
542 mul19 = over * 19; // mul 19 for mod
543 pFp25[0] += (int32_t)mul19;
544
545 /* We subtracted 2^255-19 and get the result
546 * all polynomial[i] is restricted to 25 bits or 26 bits
547 */
548 pos = 0;
549 do {
550 // first polynomial is restricted to 26 bits, shift 31 for sign
551 signMask = (-((uint32_t)pFp25[pos] >> 31)) & MASK_HIGH32(26);
552 over = ((uint32_t)pFp25[pos] >> 26) | signMask; // 26 bits
553 pFp25[pos] = (int32_t)((uint32_t)pFp25[pos] & MASK_LOW32(26)); // 26 bits
554 pos++;
555 pFp25[pos] += (int32_t)over;
556
557 // second polynomial is restricted to 25 bits, shift 31 for sign
558 signMask = (-((uint32_t)pFp25[pos] >> 31)) & MASK_HIGH32(25);
559 over = ((uint32_t)pFp25[pos] >> 25) | signMask; // 25 bits
560 pFp25[pos] = (int32_t)((uint32_t)pFp25[pos] & MASK_LOW32(25));
561 pos++;
562 pFp25[pos] += (int32_t)over;
563 } while (pos < 8); // process form 0 to 7, pos < 8
564
565 // process pFp25[8], restricted to 26 bits, shift 31 for sign
566 signMask = (-((uint32_t)pFp25[pos] >> 31)) & MASK_HIGH32(26);
567 over = ((uint32_t)pFp25[pos] >> 26) | signMask; // 26 bits
568 pFp25[pos] = (int32_t)((uint32_t)pFp25[pos] & MASK_LOW32(26)); // 26 bits
569 pos++;
570 // process pFp25[9]
571 pFp25[pos] += (int32_t)over;
572 pFp25[pos] = (int32_t)((uint32_t)pFp25[pos] & MASK_LOW32(25)); // pFp25[9] is restricted to 25 bits
573
574 PaddingUnload(out, pFp25);
575 }
576
577 /* unified addition in Extended twist Edwards Coordinate */
578 /* out = out + tableElement */
GeAdd(GeE * out,const GePre * tableElement)579 static void GeAdd(GeE *out, const GePre *tableElement)
580 {
581 Fp25 a;
582 Fp25 b;
583 Fp25 c;
584 Fp25 d;
585 Fp25 e;
586 Fp25 f;
587 Fp25 g;
588 Fp25 h;
589 /* a = (Y1 − X1) * (Y2 − X2), b = (Y1 + X1) * (Y2 + X2)
590 * c = 2 * d * T1 * X2 * Y2, d = 2 * Z1
591 * e = b − a, f = d − c, g = d + c, h = b + a
592 * X3 = e * f, Y3 = g * h, T3 = e * h, Z3 = f * g
593 */
594 CURVE25519_FP_ADD(e, out->y, out->x);
595 CURVE25519_FP_SUB(f, out->y, out->x);
596 FpMul(b, e, tableElement->yplusx);
597 FpMul(a, f, tableElement->yminusx);
598 FpMul(c, out->t, tableElement->xy2d);
599 CURVE25519_FP_ADD(d, out->z, out->z);
600 CURVE25519_FP_SUB(e, b, a);
601 CURVE25519_FP_SUB(f, d, c);
602 CURVE25519_FP_ADD(g, d, c);
603 CURVE25519_FP_ADD(h, b, a);
604 FpMul(out->x, e, f);
605 FpMul(out->y, h, g);
606 FpMul(out->z, g, f);
607 FpMul(out->t, e, h);
608 }
609
610 #ifdef HITLS_CRYPTO_ED25519
611 /* out = out - tableElement */
GeSub(GeE * out,const GePre * tableElement)612 static void GeSub(GeE *out, const GePre *tableElement)
613 {
614 Fp25 a;
615 Fp25 b;
616 Fp25 c;
617 Fp25 d;
618 Fp25 e;
619 Fp25 f;
620 Fp25 g;
621 Fp25 h;
622
623 CURVE25519_FP_ADD(e, out->y, out->x);
624 CURVE25519_FP_SUB(f, out->y, out->x);
625 FpMul(b, e, tableElement->yminusx);
626 FpMul(a, f, tableElement->yplusx);
627 FpMul(c, out->t, tableElement->xy2d);
628 CURVE25519_FP_ADD(d, out->z, out->z);
629 CURVE25519_FP_SUB(e, b, a);
630 CURVE25519_FP_ADD(f, d, c);
631 CURVE25519_FP_SUB(g, d, c);
632 CURVE25519_FP_ADD(h, b, a);
633 FpMul(out->x, e, f);
634 FpMul(out->y, h, g);
635 FpMul(out->z, g, f);
636 FpMul(out->t, e, h);
637 }
638 #endif
639
640 /* double in Projective twist Edwards Coordinate */
ProjectiveDouble(GeC * complete,const GeP * projective)641 static void ProjectiveDouble(GeC *complete, const GeP *projective)
642 {
643 Fp25 tmp;
644 FpSquareDoubleCore((complete->x), (projective->x), false);
645 FpSquareDoubleCore((complete->z), (projective->y), false);
646 // T = 2 * Z^2
647 FpSquareDoubleCore(complete->t, projective->z, true);
648 CURVE25519_FP_ADD(complete->y, projective->x, projective->y);
649 FpSquareDoubleCore(tmp, complete->y, false);
650 // tmp = (X1 + Y1) ^ 2, T = 2 * Z^2, X = X1 ^ 2, Y = Z1 ^ 2, Z = Y1 ^ 2
651 CURVE25519_FP_ADD(complete->y, complete->z, complete->x);
652 CURVE25519_FP_SUB(complete->z, complete->z, complete->x);
653 CURVE25519_FP_SUB(complete->x, tmp, complete->y);
654 CURVE25519_FP_SUB(complete->t, complete->t, complete->z);
655 }
656
657 /* Convert complete coordinate to projective coordinate */
GeCompleteToProjective(GeP * out,const GeC * complete)658 static void GeCompleteToProjective(GeP *out, const GeC *complete)
659 {
660 FpMul(out->x, complete->t, complete->x);
661 FpMul(out->y, complete->z, complete->y);
662 FpMul(out->z, complete->t, complete->z);
663 }
664
665 /* p1 = 16 * p1 */
P1DoubleFourTimes(GeE * p1)666 static void P1DoubleFourTimes(GeE *p1)
667 {
668 GeP p;
669 GeC c;
670 // From extended coordinate to projective coordinate, just ignore T
671 CURVE25519_FP_COPY(p.x, p1->x);
672 CURVE25519_FP_COPY(p.y, p1->y);
673 CURVE25519_FP_COPY(p.z, p1->z);
674 // double 4 times to get 16p1
675 ProjectiveDouble(&c, &p);
676 GeCompleteToProjective(&p, &c);
677 ProjectiveDouble(&c, &p);
678 GeCompleteToProjective(&p, &c);
679 ProjectiveDouble(&c, &p);
680 GeCompleteToProjective(&p, &c);
681 ProjectiveDouble(&c, &p);
682 FpMul(p1->x, c.x, c.t);
683 FpMul(p1->y, c.y, c.z);
684 FpMul(p1->z, c.z, c.t);
685 FpMul(p1->t, c.x, c.y);
686 }
687
SetExtendedBasePoint(GeE * out)688 static void SetExtendedBasePoint(GeE *out)
689 {
690 CURVE25519_FP_SET(out->x, 0);
691 CURVE25519_FP_SET(out->y, 1);
692 CURVE25519_FP_SET(out->t, 0);
693 CURVE25519_FP_SET(out->z, 1);
694 }
695
696 /* Multiple with Base point, see paper: High-speed high-security signatures */
ScalarMultiBase(GeE * out,const uint8_t in[CRYPT_CURVE25519_KEYLEN])697 void ScalarMultiBase(GeE *out, const uint8_t in[CRYPT_CURVE25519_KEYLEN])
698 {
699 uint8_t carry;
700 // inLen is always 32, buffer needs 32 * 2 = 64
701 uint8_t privateKey[64];
702 int32_t i;
703 GePre preCompute;
704
705 // split 32 8bits input into 64 4bits-based number
706 for (i = 0; i < 32; i++) {
707 privateKey[i * 2] = in[i] & 15; // and 15 to get low 4 bits, stored in 2i
708 privateKey[i * 2 + 1] = (in[i] >> 4) & 15; // shift 4 then and 15 to get upper 4 bits, stored in 2i+1
709 }
710 carry = 0;
711 /**
712 * change from 0 - 15 to -8 - 7, if privateKey[i] >= 8, carry = 1, privateKey[i] -= 16
713 * if privateKey[i] < 8, privateKey[i] = privateKey[i]
714 */
715 for (i = 0; i < 63; i++) { // 0 to 63
716 privateKey[i] += carry;
717 carry = (privateKey[i] + 8) >> 4; // plus 8 then shit 4 to get carry
718 privateKey[i] -= carry << 4; // left shift 4
719 }
720 // never overflow since we set first bit to 0 of private key
721 privateKey[63] += carry; // last one is 63
722 // set base point X:Y:T:Z -> 0:1:0:1
723 SetExtendedBasePoint(out);
724 for (i = 1; i < 64; i += 2) { // form 1 to 63, process all odd element, increment by 2, i < 64
725 TableLookup(&preCompute, i / 2, (int8_t)privateKey[i]); // position goes from 0 to 31, i / 2 = pos
726 // Fit with paper: Twisted Edwards Curves Revisited
727 GeAdd(out, &preCompute);
728 }
729 // now we have P1, double it four times we have 16P1, P1 is in Extended now, we do double in projective coordinate
730 P1DoubleFourTimes(out);
731 // Add P0 with precomute
732 for (i = 0; i < 64; i += 2) { // form 0 to 62, process all even element, increment by 2, i < 64
733 TableLookup(&preCompute, i / 2, (int8_t)privateKey[i]); // position goes from 0 to 31, i / 2 = pos
734 GeAdd(out, &preCompute);
735 }
736 // clean up private key information
737 BSL_SAL_CleanseData(privateKey, sizeof(privateKey));
738 }
739
740 #ifdef HITLS_CRYPTO_ED25519
PointEncoding(const GeE * point,uint8_t * output,uint32_t outputLen)741 void PointEncoding(const GeE *point, uint8_t *output, uint32_t outputLen)
742 {
743 Fp25 zInvert;
744 Fp25 x;
745 Fp25 y;
746 uint8_t xData[CRYPT_CURVE25519_KEYLEN];
747 /* x = X / Z, y = Y / Z */
748 (void)outputLen;
749 FpInvert(zInvert, point->z);
750 FpMul(x, point->x, zInvert);
751 FpMul(y, point->y, zInvert);
752 PolynomialToData(output, y);
753 PolynomialToData(xData, x);
754 // PointEcoding writes only 32 bytes data, therefore output[31] is the last one
755 output[31] ^= (xData[0] & 0x1) << 7; // last one is output[31], get only last bit then shift 7
756 }
757 #endif
758
FeCmove(Fp25 dst,const Fp25 src,const uint32_t indicator)759 static void FeCmove(Fp25 dst, const Fp25 src, const uint32_t indicator)
760 {
761 // if indicator = 1, now it will be 111111111111b....
762 const uint32_t indicate = 0 - indicator;
763 /* des become source if dst->data[i] ^ src->data[i] is in 1111....b, or it does not change if
764 (dst->data[i] ^ src->data[i]) & indicate is all 0 */
765 dst[0] = CONDITION_COPY(dst[0], src[0], indicate); // 0
766 dst[1] = CONDITION_COPY(dst[1], src[1], indicate); // 1
767 dst[2] = CONDITION_COPY(dst[2], src[2], indicate); // 2
768 dst[3] = CONDITION_COPY(dst[3], src[3], indicate); // 3
769 dst[4] = CONDITION_COPY(dst[4], src[4], indicate); // 4
770 dst[5] = CONDITION_COPY(dst[5], src[5], indicate); // 5
771 dst[6] = CONDITION_COPY(dst[6], src[6], indicate); // 6
772 dst[7] = CONDITION_COPY(dst[7], src[7], indicate); // 7
773 dst[8] = CONDITION_COPY(dst[8], src[8], indicate); // 8
774 dst[9] = CONDITION_COPY(dst[9], src[9], indicate); // 9
775 }
776
ConditionalMove(GePre * preCompute,const GePre * tableElement,uint32_t indicator)777 void ConditionalMove(GePre *preCompute, const GePre *tableElement, uint32_t indicator)
778 {
779 FeCmove(preCompute->yplusx, tableElement->yplusx, indicator);
780 FeCmove(preCompute->yminusx, tableElement->yminusx, indicator);
781 FeCmove(preCompute->xy2d, tableElement->xy2d, indicator);
782 }
783
DataToPolynomial(Fp25 out,const uint8_t data[32])784 void DataToPolynomial(Fp25 out, const uint8_t data[32])
785 {
786 const uint8_t *t = data;
787 uint64_t p[10];
788 uint64_t over;
789 int32_t i;
790 uint64_t signMask;
791
792 /* f0, load 32 bits */
793 CURVE25519_BYTES4_LOAD(p, t);
794 /* f1, load 24 bits from t4, shift bits: 26 - 24 - (8 - x) = 0 -> x = 6 */
795 CURVE25519_BYTES3_LOAD_PADDING(p + 1, 6, t + 4);
796 /* f2, load 24 bits from t7, shift bits: 51 - 48 - (8 - x) = 0 -> x = 5 */
797 CURVE25519_BYTES3_LOAD_PADDING(p + 2, 5, t + 7);
798 /* f3, load 24 bits from t10, shift bits: 77 - 72 - (8 - x) = 0 -> x = 3 */
799 CURVE25519_BYTES3_LOAD_PADDING(p + 3, 3, t + 10);
800 /* f4, load 24 bits from t13, shift bits: 102 - 96 - (8 - x) = 0 -> x = 2 */
801 CURVE25519_BYTES3_LOAD_PADDING(p + 4, 2, t + 13);
802 /* f5, load 32 bits from t16 */
803 CURVE25519_BYTES4_LOAD(p + 5, t + 16);
804 /* f6, load 24 bits from t20, shift bits: 153 - 152 - (8 - x) = 0 -> x = 7 */
805 CURVE25519_BYTES3_LOAD_PADDING(p + 6, 7, t + 20);
806 /* f7, load 24 bits from t23, shift bits: 179 - 176 - (8 - x) = 0 -> x = 5 */
807 CURVE25519_BYTES3_LOAD_PADDING(p + 7, 5, t + 23);
808 /* f8, load 24 bits from t26, shift bits: 204 - 200 - (8 - x) = 0 -> x = 4 */
809 CURVE25519_BYTES3_LOAD_PADDING(p + 8, 4, t + 26);
810 /* f9, load 24 bits from t29, shift bits: 230 - 224 - (8 - x) = 0 -> x = 2 */
811 CURVE25519_BYTES3_LOAD(p + 9, t + 29);
812 p[9] = (p[9] & 0x7fffff) << 2; /* p9 is 25 bits, left shift 2 */
813
814 /* Limiting the number of bits, exchange 2^1 to 2^25.5, turn into polynomial representation */
815 /* f9->f0, shift 24 for carry */
816 over = p[9] + (1 << 24);
817 signMask = MASK_HIGH64(25) & (-((over) >> 63)); // shift 63 bits for sign, mask 25 bits
818 p[0] += ((over >> 25) | signMask) * 19; // 24 bits plus sign is 25, mul 19 for mod
819 p[9] -= MASK_HIGH64(39) & over; // 64 - 25 = 39 bits mask
820
821 /* f1->f2, restricted to 24 bits */
822 PROCESS_CARRY(p[1], p[2], signMask, over, 24);
823 /* f3->f4, restricted to 24 bits */
824 PROCESS_CARRY(p[3], p[4], signMask, over, 24);
825 /* f5->f6, restricted to 24 bits */
826 PROCESS_CARRY(p[5], p[6], signMask, over, 24);
827 /* f7->f8, restricted to 24 bits */
828 PROCESS_CARRY(p[7], p[8], signMask, over, 24);
829
830 /* f0->f1, restricted to 25 bits */
831 PROCESS_CARRY(p[0], p[1], signMask, over, 25);
832 /* f2->f3, restricted to 25 bits */
833 PROCESS_CARRY(p[2], p[3], signMask, over, 25);
834 /* f4->f5, restricted to 25 bits */
835 PROCESS_CARRY(p[4], p[5], signMask, over, 25);
836 /* f6->f7, restricted to 25 bits */
837 PROCESS_CARRY(p[6], p[7], signMask, over, 25);
838 /* f8->f9, restricted to 25 bits */
839 PROCESS_CARRY(p[8], p[9], signMask, over, 25);
840
841 /* After process carry, polynomial every term would not exceed 32 bits, convert form 0 to 9, i < 10 */
842 for (i = 0; i < 10; i++) {
843 out[i] = (int32_t)p[i];
844 }
845 }
846
847 #ifdef HITLS_CRYPTO_ED25519
CheckZero(Fp25 x)848 static bool CheckZero(Fp25 x)
849 {
850 uint8_t tmp[32];
851 const uint8_t zero[32] = {0};
852 PolynomialToData(tmp, x);
853 if (memcmp(tmp, zero, sizeof(zero)) == 0) {
854 return true;
855 } else {
856 return false;
857 }
858 }
859
GetXBit(Fp25 in)860 static uint8_t GetXBit(Fp25 in)
861 {
862 uint8_t tmp[32];
863 PolynomialToData(tmp, in);
864
865 return tmp[0] & 0x1;
866 }
867
868 static const Fp25 SQRTM1 = {-32595792, -7943725, 9377950, 3500415, 12389472, -272473,
869 -25146209, -2005654, 326686, 11406482};
870 static const Fp25 D = {-10913610, 13857413, -15372611, 6949391, 114729, -8787816,
871 -6275908, -3247719, -18696448, -12055116};
872
PointDecoding(GeE * point,const uint8_t in[CRYPT_CURVE25519_KEYLEN])873 int32_t PointDecoding(GeE *point, const uint8_t in[CRYPT_CURVE25519_KEYLEN])
874 {
875 Fp25 u, v, v3, x2, result;
876 // get the last block (31), shift 7 for first bit
877 uint8_t x0 = in[31] >> 7;
878 DataToPolynomial(point->y, in);
879
880 CURVE25519_FP_SET(point->z, 1);
881 FpSquareDoubleCore(u, point->y, false);
882 FpMul(v, u, D);
883 CURVE25519_FP_SUB(u, u, point->z);
884 CURVE25519_FP_ADD(v, v, point->z);
885
886 FpSquareDoubleCore(v3, v, false);
887 FpMul(v3, v3, v);
888 FpSquareDoubleCore(point->x, v3, false);
889 FpMul(point->x, point->x, v);
890 FpMul(point->x, point->x, u);
891
892 /* x = x ^ ((q - 5) / 8) */
893 FpPowq58(point->x, point->x);
894
895 FpMul(point->x, point->x, v3);
896 FpMul(point->x, point->x, u);
897 FpSquareDoubleCore(x2, point->x, false);
898 FpMul(x2, x2, v);
899 CURVE25519_FP_SUB(result, x2, u);
900
901 if (CheckZero(result) == false) {
902 CURVE25519_FP_ADD(result, x2, u);
903 if (CheckZero(result) == false) {
904 return 1;
905 }
906 FpMul(point->x, point->x, SQRTM1);
907 }
908 uint8_t bit = GetXBit(point->x);
909 if (bit != x0) {
910 CURVE25519_FP_NEGATE(point->x, point->x);
911 }
912 FpMul(point->t, point->x, point->y);
913
914 return 0;
915 }
916
ScalarMulAddPreLoad(const uint8_t in[CRYPT_CURVE25519_KEYLEN],uint64_t out[UINT8_32_21BITS_BLOCKNUM])917 static void ScalarMulAddPreLoad(const uint8_t in[CRYPT_CURVE25519_KEYLEN],
918 uint64_t out[UINT8_32_21BITS_BLOCKNUM])
919 {
920 CURVE25519_BYTES3_LOAD(&out[0], in);
921 out[0] = out[0] & MASK_64_LOW21;
922
923 CURVE25519_BYTES4_LOAD(&out[1], in + 2); // 1: load 4 bytes form position 2
924 out[1] = MASK_64_LOW21 & (out[1] >> 5); // 1: 8 - ((3 * 8) mod 21) mod 8 = 5
925
926 CURVE25519_BYTES3_LOAD(&out[2], in + 5); // 2: load 3 bytes form position 5
927 out[2] = MASK_64_LOW21 & (out[2] >> 2); // 2: 8 - ((6 * 8) mod 21) mod 8 = 2
928
929 CURVE25519_BYTES4_LOAD(&out[3], in + 7); // 3: load 4 bytes form position 7
930 out[3] = MASK_64_LOW21 & (out[3] >> 7); // 3: 8 - ((8 * 8) mod 21) mod 8 = 7
931
932 CURVE25519_BYTES4_LOAD(&out[4], in + 10); // 4: load 4 bytes form position 10
933 out[4] = MASK_64_LOW21 & (out[4] >> 4); // 4: 8 - ((11 * 8) mod 21) mod 8 = 4
934
935 CURVE25519_BYTES3_LOAD(&out[5], in + 13); // 5: load 3 bytes form position 13
936 out[5] = MASK_64_LOW21 & (out[5] >> 1); // 5: 8 - ((14 * 8) mod 21) mod 8 = 1
937
938 CURVE25519_BYTES4_LOAD(&out[6], in + 15); // 6: load 4 bytes form position 15
939 out[6] = MASK_64_LOW21 & (out[6] >> 6); // 6: 8 - ((16 * 8) mod 21) mod 8 = 6
940
941 CURVE25519_BYTES3_LOAD(&out[7], in + 18); // 7: load 3 bytes form position 18
942 out[7] = MASK_64_LOW21 & (out[7] >> 3); // 7: 8 - ((19 * 8) mod 21) mod 8 = 3
943
944 CURVE25519_BYTES3_LOAD(&out[8], in + 21); // 8: load 3 bytes form position 21
945 out[8] = MASK_64_LOW21 & out[8]; // 8: ((22 * 8) mod 21) mod 8 = 0
946
947 CURVE25519_BYTES4_LOAD(&out[9], in + 23); // 9: load 4 bytes form position 23
948 out[9] = MASK_64_LOW21 & (out[9] >> 5); // 9: 8 - ((24 * 8) mod 21) mod 8 = 5
949
950 CURVE25519_BYTES3_LOAD(&out[10], in + 26); // 10: load 3 bytes form position 26
951 out[10] = MASK_64_LOW21 & (out[10] >> 2); // 10: 8 - ((27 * 8) mod 21) mod 8 = 2
952
953 CURVE25519_BYTES4_LOAD(&out[11], in + 28); // 11: load 4 bytes form position 28
954 out[11] = (out[11] >> 7); // 11: 8 - ((29 * 8) mod 21) mod 8 = 7
955 }
956
ModuloLPreLoad(const uint8_t s[CRYPT_CURVE25519_SIGNLEN],uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM])957 static void ModuloLPreLoad(const uint8_t s[CRYPT_CURVE25519_SIGNLEN], uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM])
958 {
959 CURVE25519_BYTES3_LOAD(&s21Bits[0], s);
960 s21Bits[0] = s21Bits[0] & MASK_64_LOW21;
961
962 CURVE25519_BYTES4_LOAD(&s21Bits[1], s + 2); // 1: load 4 bytes form position 2
963 s21Bits[1] = MASK_64_LOW21 & (s21Bits[1] >> 5); // 1: 8 - ((3 * 8) mod 21) mod 8 = 5
964
965 CURVE25519_BYTES3_LOAD(&s21Bits[2], s + 5); // 2: load 3 bytes form position 5
966 s21Bits[2] = MASK_64_LOW21 & (s21Bits[2] >> 2); // 2: 8 - ((6 * 8) mod 21) mod 8 = 2
967
968 CURVE25519_BYTES4_LOAD(&s21Bits[3], s + 7); // 3: load 4 bytes form position 7
969 s21Bits[3] = MASK_64_LOW21 & (s21Bits[3] >> 7); // 3: 8 - ((8 * 8) mod 21) mod 8 = 7
970
971 CURVE25519_BYTES4_LOAD(&s21Bits[4], s + 10); // 4: load 4 bytes form position 10
972 s21Bits[4] = MASK_64_LOW21 & (s21Bits[4] >> 4); // 4: 8 - ((11 * 8) mod 21) mod 8 = 4
973
974 CURVE25519_BYTES3_LOAD(&s21Bits[5], s + 13); // 5: load 3 bytes form position 13
975 s21Bits[5] = MASK_64_LOW21 & (s21Bits[5] >> 1); // 5: 8 - ((14 * 8) mod 21) mod 8 = 1
976
977 CURVE25519_BYTES4_LOAD(&s21Bits[6], s + 15); // 6: load 4 bytes form position 15
978 s21Bits[6] = MASK_64_LOW21 & (s21Bits[6] >> 6); // 6: 8 - ((16 * 8) mod 21) mod 8 = 6
979
980 CURVE25519_BYTES3_LOAD(&s21Bits[7], s + 18); // 7: load 3 bytes form position 18
981 s21Bits[7] = MASK_64_LOW21 & (s21Bits[7] >> 3); // 7: 8 - ((19 * 8) mod 21) mod 8 = 3
982
983 CURVE25519_BYTES3_LOAD(&s21Bits[8], s + 21); // 8: load 3 bytes form position 21
984 s21Bits[8] = MASK_64_LOW21 & s21Bits[8]; // 8: ((22 * 8) mod 21) mod 8 = 0
985
986 CURVE25519_BYTES4_LOAD(&s21Bits[9], s + 23); // 9: load 4 bytes form position 23
987 s21Bits[9] = MASK_64_LOW21 & (s21Bits[9] >> 5); // 9: 8 - ((24 * 8) mod 21) mod 8 = 5
988
989 CURVE25519_BYTES3_LOAD(&s21Bits[10], s + 26); // 10: load 3 bytes form position 26
990 s21Bits[10] = MASK_64_LOW21 & (s21Bits[10] >> 2); // 10: 8 - ((27 * 8) mod 21) mod 8 = 2
991
992 CURVE25519_BYTES4_LOAD(&s21Bits[11], s + 28); // 11: load 4 bytes form position 28
993 s21Bits[11] = MASK_64_LOW21 & (s21Bits[11] >> 7); // 11: 8 - ((29 * 8) mod 21) mod 8 = 7
994
995 CURVE25519_BYTES4_LOAD(&s21Bits[12], s + 31); // 12: load 4 bytes form position 31
996 s21Bits[12] = MASK_64_LOW21 & (s21Bits[12] >> 4); // 12: 8 - ((32 * 8) mod 21) mod 8 = 4
997
998 CURVE25519_BYTES3_LOAD(&s21Bits[13], s + 34); // 13: load 3 bytes form position 34
999 s21Bits[13] = MASK_64_LOW21 & (s21Bits[13] >> 1); // 13: 8 - ((35 * 8) mod 21) mod 8 = 1
1000
1001 CURVE25519_BYTES4_LOAD(&s21Bits[14], s + 36); // 14: load 4 bytes form position 36
1002 s21Bits[14] = MASK_64_LOW21 & (s21Bits[14] >> 6); // 14: 8 - ((37 * 8) mod 21) mod 8 = 6
1003
1004 CURVE25519_BYTES3_LOAD(&s21Bits[15], s + 39); // 15: load 3 bytes form position 39
1005 s21Bits[15] = MASK_64_LOW21 & (s21Bits[15] >> 3); // 15: 8 - ((40 * 8) mod 21) mod 8 = 3
1006
1007 CURVE25519_BYTES3_LOAD(&s21Bits[16], s + 42); // 16: load 3 bytes form position 42
1008 s21Bits[16] = MASK_64_LOW21 & s21Bits[16]; // 16: ((43 * 8) mod 21) mod 8 = 0
1009
1010 CURVE25519_BYTES4_LOAD(&s21Bits[17], s + 44); // 17: load 4 bytes form position 44
1011 s21Bits[17] = MASK_64_LOW21 & (s21Bits[17] >> 5); // 17: 8 - ((45 * 8) mod 21) mod 8 = 5
1012
1013 CURVE25519_BYTES3_LOAD(&s21Bits[18], s + 47); // 18: load 3 bytes form position 47
1014 s21Bits[18] = MASK_64_LOW21 & (s21Bits[18] >> 2); // 18: 8 - ((48 * 8) mod 21) mod 8 = 2
1015
1016 CURVE25519_BYTES4_LOAD(&s21Bits[19], s + 49); // 19: load 4 bytes form position 49
1017 s21Bits[19] = MASK_64_LOW21 & (s21Bits[19] >> 7); // 19: 8 - ((50 * 8) mod 21) mod 8 = 7
1018
1019 CURVE25519_BYTES4_LOAD(&s21Bits[20], s + 52); // 20: load 4 bytes form position 52
1020 s21Bits[20] = MASK_64_LOW21 & (s21Bits[20] >> 4); // 20: 8 - ((53 * 8) mod 21) mod 8 = 4
1021
1022 CURVE25519_BYTES3_LOAD(&s21Bits[21], s + 55); // 21: load 3 bytes form position 55
1023 s21Bits[21] = MASK_64_LOW21 & (s21Bits[21] >> 1); // 21: 8 - ((56 * 8) mod 21) mod 8 = 1
1024
1025 CURVE25519_BYTES4_LOAD(&s21Bits[22], s + 57); // 22: load 4 bytes form position 57
1026 s21Bits[22] = MASK_64_LOW21 & (s21Bits[22] >> 6); // 22: 8 - ((58 * 8) mod 21) mod 8 = 6
1027
1028 CURVE25519_BYTES4_LOAD(&s21Bits[23], s + 60); // 23: load 4 bytes form position 60
1029 s21Bits[23] = s21Bits[23] >> 3; // 23: 8 - ((61 * 8) mod 21) mod 8 = 3
1030 }
1031
UnloadTo8Bits(uint8_t s8Bits[CRYPT_CURVE25519_OPTLEN],uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM])1032 static void UnloadTo8Bits(uint8_t s8Bits[CRYPT_CURVE25519_OPTLEN], uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM])
1033 {
1034 s8Bits[0] = (uint8_t)s21Bits[0];
1035
1036 // 1: load from 8 on block 0
1037 s8Bits[1] = (uint8_t)(s21Bits[0] >> 8);
1038
1039 // 2: load from (16 + 1) to 21 on block 0 and 1 to 3 on block 1, 8 - 3 = 5
1040 s8Bits[2] = (uint8_t)((s21Bits[0] >> 16) | (s21Bits[1] << 5));
1041
1042 // 3: load from (3 + 1) on block 1
1043 s8Bits[3] = (uint8_t)(s21Bits[1] >> 3);
1044
1045 // 4: load from (11 + 1) on block 1
1046 s8Bits[4] = (uint8_t)(s21Bits[1] >> 11);
1047
1048 // 5: load from (19 + 1) to 21 on block 1 and 1 to 6 on block 2, 8 - 6 = 2
1049 s8Bits[5] = (uint8_t)((s21Bits[1] >> 19) | (s21Bits[2] << 2));
1050
1051 // 6: load from (6 + 1) on block 2
1052 s8Bits[6] = (uint8_t)(s21Bits[2] >> 6);
1053
1054 // 7: load from (14 + 1) to 21 on block 2 and 1 on block 3, 8 - 7 = 1
1055 s8Bits[7] = (uint8_t)((s21Bits[2] >> 14) | (s21Bits[3] << 7));
1056
1057 // 8: load from (1 + 1) on block 3
1058 s8Bits[8] = (uint8_t)(s21Bits[3] >> 1);
1059
1060 // 9: load from (9 + 1) on block 3
1061 s8Bits[9] = (uint8_t)(s21Bits[3] >> 9);
1062
1063 // 10: load from (17 + 1) to 21 on block 3 and 1 to 4 on block 4, 8 - 4 = 4
1064 s8Bits[10] = (uint8_t)((s21Bits[3] >> 17) | (s21Bits[4] << 4));
1065
1066 // 11: load from (4 + 1) on block 4
1067 s8Bits[11] = (uint8_t)(s21Bits[4] >> 4);
1068
1069 // 12: load from (12 + 1) on block 4
1070 s8Bits[12] = (uint8_t)(s21Bits[4] >> 12);
1071
1072 // 13: load from (20 + 1) on block 4 and 1 to 7 on block 5, 8 - 7 = 1
1073 s8Bits[13] = (uint8_t)((s21Bits[4] >> 20) | (s21Bits[5] << 1));
1074
1075 // 14: load from (7 + 1) on block 5
1076 s8Bits[14] = (uint8_t)(s21Bits[5] >> 7);
1077
1078 // 15: load from (15 + 1) to 21 on block 5 and 1 to 2 on block 6, 8 - 2 = 6
1079 s8Bits[15] = (uint8_t)((s21Bits[5] >> 15) | (s21Bits[6] << 6));
1080
1081 // 16: load from (2 + 1) on block 6
1082 s8Bits[16] = (uint8_t)(s21Bits[6] >> 2);
1083
1084 // 17: load from (10 + 1) on block 6
1085 s8Bits[17] = (uint8_t)(s21Bits[6] >> 10);
1086
1087 // 18: load from (18 + 1) to 21 on block 6 and 1 to 5 on block 7, 8 - 5 = 3
1088 s8Bits[18] = (uint8_t)((s21Bits[6] >> 18) | (s21Bits[7] << 3));
1089
1090 // 19: load from (5 + 1) on block 7
1091 s8Bits[19] = (uint8_t)(s21Bits[7] >> 5);
1092
1093 // 20: load from (13 + 1) on block 7
1094 s8Bits[20] = (uint8_t)(s21Bits[7] >> 13);
1095
1096 // 21: load 8bits on block 8
1097 s8Bits[21] = (uint8_t)s21Bits[8];
1098
1099 // 22: load from (8 + 1) on block 8
1100 s8Bits[22] = (uint8_t)(s21Bits[8] >> 8);
1101
1102 // 23: load from (16 + 1) to 21 on block 8 and 1 to 3 on block 9, 8 - 3 = 5
1103 s8Bits[23] = (uint8_t)((s21Bits[8] >> 16) | (s21Bits[9] << 5));
1104
1105 // 24: load from (3 + 1) on block 9
1106 s8Bits[24] = (uint8_t)(s21Bits[9] >> 3);
1107
1108 // 25: load from (11 + 1) on block 9
1109 s8Bits[25] = (uint8_t)(s21Bits[9] >> 11);
1110
1111 // 26: load from (19 + 1) to 21 on block 9 and 1 to 6 on block 10, 8 - 6 = 2
1112 s8Bits[26] = (uint8_t)((s21Bits[9] >> 19) | (s21Bits[10] << 2));
1113
1114 // 27: load from (6 + 1) on block 10
1115 s8Bits[27] = (uint8_t)(s21Bits[10] >> 6);
1116
1117 // 28: load from (14 + 1) to 21 on block 10 and 1 on block 11, 8 - 7 = 1
1118 s8Bits[28] = (uint8_t)((s21Bits[10] >> 14) | (s21Bits[11] << 7));
1119
1120 // 29: load from (1 + 1) on block 11
1121 s8Bits[29] = (uint8_t)(s21Bits[11] >> 1);
1122
1123 // 30: load from (9 + 1) on block 11
1124 s8Bits[30] = (uint8_t)(s21Bits[11] >> 9);
1125
1126 // 31: load from (17 + 1) on block 11
1127 s8Bits[31] = (uint8_t)(s21Bits[11] >> 17);
1128 }
1129
ModuloLCore(uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM])1130 static void ModuloLCore(uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM])
1131 {
1132 int32_t i;
1133 uint64_t signMask1, signMask2;
1134 uint64_t carry1, carry2;
1135
1136 // multiply by l0, start with {11, 12, 13, 14, 15, 16} to {6, 7, 8, 9, 10, 11, 12}
1137 CURVE25519_MULTI_BY_L0(s21Bits, 11);
1138 CURVE25519_MULTI_BY_L0(s21Bits, 10);
1139 CURVE25519_MULTI_BY_L0(s21Bits, 9);
1140 CURVE25519_MULTI_BY_L0(s21Bits, 8);
1141 CURVE25519_MULTI_BY_L0(s21Bits, 7);
1142 CURVE25519_MULTI_BY_L0(s21Bits, 6);
1143
1144 // need to process carry to prevent overflow, process carry from 6->7, 8->9 ... 16->17, increment by 2
1145 for (i = 6; i <= 16; i += 2) {
1146 // 21 bits minus sign is 20 bits
1147 PROCESS_CARRY(s21Bits[i], s21Bits[i + 1], signMask1, carry1, 20);
1148 }
1149
1150 // process carry from 7->8, 9->10 ... 15->16, increment by 2
1151 for (i = 7; i <= 15; i += 2) {
1152 // 21 bits minus sign bit is 20 bits
1153 PROCESS_CARRY(s21Bits[i], s21Bits[i + 1], signMask2, carry2, 20);
1154 }
1155
1156 // {5, 6, 7, 8, 9, 10} to {0, 1, 2, 3, 4, 5, 6}
1157 CURVE25519_MULTI_BY_L0(s21Bits, 5);
1158 CURVE25519_MULTI_BY_L0(s21Bits, 4);
1159 CURVE25519_MULTI_BY_L0(s21Bits, 3);
1160 CURVE25519_MULTI_BY_L0(s21Bits, 2);
1161 CURVE25519_MULTI_BY_L0(s21Bits, 1);
1162 CURVE25519_MULTI_BY_L0(s21Bits, 0);
1163
1164 // process carry again, from 0->1, 2->3 ... 10->11, increment by 2
1165 for (i = 0; i <= 10; i += 2) {
1166 // 21 bits minus sign bit is 20 bits
1167 PROCESS_CARRY(s21Bits[i], s21Bits[i + 1], signMask1, carry1, 20);
1168 }
1169
1170 // from 1->2, 3->4 ... 11->12, increment by 2
1171 for (i = 1; i <= 11; i += 2) {
1172 // 21 bits minus sign is 20 bits
1173 PROCESS_CARRY(s21Bits[i], s21Bits[i + 1], signMask2, carry2, 20);
1174 }
1175
1176 CURVE25519_MULTI_BY_L0(s21Bits, 0);
1177
1178 // process carry from 0 to 11
1179 for (i = 0; i <= 11; i++) {
1180 PROCESS_CARRY_UNSIGN(s21Bits[i], s21Bits[i + 1], signMask1, carry1, 21); // s21Bits is 21 bits
1181 }
1182
1183 CURVE25519_MULTI_BY_L0(s21Bits, 0);
1184
1185 // from 0 to 10
1186 for (i = 0; i <= 10; i++) {
1187 PROCESS_CARRY_UNSIGN(s21Bits[i], s21Bits[i + 1], signMask1, carry1, 21); // s21Bits is 21 bits
1188 }
1189 }
1190
ModuloL(uint8_t s[CRYPT_CURVE25519_SIGNLEN])1191 void ModuloL(uint8_t s[CRYPT_CURVE25519_SIGNLEN])
1192 {
1193 // 24 of 21 bits block
1194 uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM] = {0};
1195
1196 ModuloLPreLoad(s, s21Bits);
1197
1198 ModuloLCore(s21Bits);
1199
1200 UnloadTo8Bits(s, s21Bits);
1201 }
1202
MulAdd(uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM],const uint64_t a21Bits[UINT8_32_21BITS_BLOCKNUM],const uint64_t b21Bits[UINT8_32_21BITS_BLOCKNUM],const uint64_t c21Bits[UINT8_32_21BITS_BLOCKNUM])1203 static void MulAdd(uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM], const uint64_t a21Bits[UINT8_32_21BITS_BLOCKNUM],
1204 const uint64_t b21Bits[UINT8_32_21BITS_BLOCKNUM], const uint64_t c21Bits[UINT8_32_21BITS_BLOCKNUM])
1205 {
1206 // s0 = c0 + a0b0
1207 s21Bits[0] = c21Bits[0] + a21Bits[0] * b21Bits[0];
1208
1209 // s1 = c1 + a0b1 + a1b0
1210 s21Bits[1] = c21Bits[1] + a21Bits[0] * b21Bits[1] + a21Bits[1] * b21Bits[0];
1211
1212 // s2 = c2 + a0b2 + b1a1 + a2b0
1213 s21Bits[2] = c21Bits[2] + a21Bits[0] * b21Bits[2] + a21Bits[1] * b21Bits[1] + a21Bits[2] * b21Bits[0];
1214
1215 // s3 = c3 + a0b3 + a1b2 + a2b1 + a3b0
1216 s21Bits[3] = c21Bits[3] + a21Bits[0] * b21Bits[3] + a21Bits[1] * b21Bits[2] +
1217 a21Bits[2] * b21Bits[1] + a21Bits[3] * b21Bits[0]; // a2b1 + a3b0
1218
1219 // s4 = c4 + a0b4 +a1b3 + a2b2 + a3b1 + a4b0
1220 s21Bits[4] = c21Bits[4] + a21Bits[0] * b21Bits[4] + a21Bits[1] * b21Bits[3] +
1221 a21Bits[2] * b21Bits[2] + a21Bits[3] * b21Bits[1] + a21Bits[4] * b21Bits[0]; // a2b2 + a3b1 + a4b0
1222
1223 // s5 = c5 + a0b5 + a1b4 + a2b3 + a3b2 + a4b1 + a5b0
1224 s21Bits[5] = c21Bits[5] + a21Bits[0] * b21Bits[5] + a21Bits[1] * b21Bits[4] + a21Bits[2] * b21Bits[3] +
1225 a21Bits[3] * b21Bits[2] + a21Bits[4] * b21Bits[1] + a21Bits[5] * b21Bits[0]; // a3b2 + a4b1 + a5b0
1226
1227 // s6 = c6 + a0b6 + a1b5 + a2b4 + a3b3 + a2b4 + a5b1 + a6b0
1228 s21Bits[6] = c21Bits[6] + a21Bits[0] * b21Bits[6] + a21Bits[1] * b21Bits[5] + a21Bits[2] * b21Bits[4] +
1229 a21Bits[3] * b21Bits[3] + a21Bits[4] * b21Bits[2] + a21Bits[5] * b21Bits[1] + // a3b3 + a2b4 + a5b1
1230 a21Bits[6] * b21Bits[0]; // a6b0
1231
1232 // s7 = c7 + a0b7 + a1b6 + a2b5 + a3b4 + a4b3 + a5b2 + a6b1 + a7b0
1233 s21Bits[7] = c21Bits[7] + a21Bits[0] * b21Bits[7] + a21Bits[1] * b21Bits[6] + a21Bits[2] * b21Bits[5] +
1234 a21Bits[3] * b21Bits[4] + a21Bits[4] * b21Bits[3] + a21Bits[5] * b21Bits[2] + // a3b4 + a4b3 + a5b2
1235 a21Bits[6] * b21Bits[1] + a21Bits[7] * b21Bits[0]; // a6b1 + a7b0
1236
1237 // s8 = c8 + a0b8 + a1b7 + a2b6 + a3b5 + a4b4 + a5b3 + a6b2 + a7b1 + a8b0
1238 s21Bits[8] = c21Bits[8] + a21Bits[0] * b21Bits[8] + a21Bits[1] * b21Bits[7] + a21Bits[2] * b21Bits[6] +
1239 a21Bits[3] * b21Bits[5] + a21Bits[4] * b21Bits[4] + a21Bits[5] * b21Bits[3] + // a3b5 + a4b4 + a5b3
1240 a21Bits[6] * b21Bits[2] + a21Bits[7] * b21Bits[1] + a21Bits[8] * b21Bits[0]; // a6b2 + a7b1 + a8b0
1241
1242 // s9 = c9 + a0b9 + a1b8 + a2b7 + a3b6 + a4b5 + a5b4 + a6b3 + a7b2 + a8b1 + a9b0
1243 s21Bits[9] = c21Bits[9] + a21Bits[0] * b21Bits[9] + a21Bits[1] * b21Bits[8] + a21Bits[2] * b21Bits[7] +
1244 a21Bits[3] * b21Bits[6] + a21Bits[4] * b21Bits[5] + a21Bits[5] * b21Bits[4] + // a3b6 + a4b5 + a5b4
1245 a21Bits[6] * b21Bits[3] + a21Bits[7] * b21Bits[2] + a21Bits[8] * b21Bits[1] + // a6b3 + a7b2 + a8b1
1246 a21Bits[9] * b21Bits[0]; // a9b0
1247
1248 // s10 = c10 + a0b10 + a1b9 + a2b8 + a3b7 + a4b6 + a5b5 + a6b4 + a7b3 + a8b2 + a9b1 + a10b0
1249 s21Bits[10] = c21Bits[10] + a21Bits[0] * b21Bits[10] + a21Bits[1] * b21Bits[9] + a21Bits[2] * b21Bits[8] +
1250 a21Bits[3] * b21Bits[7] + a21Bits[4] * b21Bits[6] + a21Bits[5] * b21Bits[5] + // a3b7 + a4b6 + a5b5
1251 a21Bits[6] * b21Bits[4] + a21Bits[7] * b21Bits[3] + a21Bits[8] * b21Bits[2] + // a6b4 + a7b3 + a8b2
1252 a21Bits[9] * b21Bits[1] + a21Bits[10] * b21Bits[0]; // a9b1 + a10b0
1253
1254 // s11 = c11 + a0b11 + a1b10 + a2b9 + a3b8 + a4b7 + a5b6 + a6b5 + a7b4 + a8b3 + a9b2 + a10b1 + a11b0
1255 s21Bits[11] = c21Bits[11] + a21Bits[0] * b21Bits[11] + a21Bits[1] * b21Bits[10] + a21Bits[2] * b21Bits[9] +
1256 a21Bits[3] * b21Bits[8] + a21Bits[4] * b21Bits[7] + a21Bits[5] * b21Bits[6] + // a3b8 + a4b7 + a5b6
1257 a21Bits[6] * b21Bits[5] + a21Bits[7] * b21Bits[4] + a21Bits[8] * b21Bits[3] + // a6b5 + a7b4 + a8b3
1258 a21Bits[9] * b21Bits[2] + a21Bits[10] * b21Bits[1] + a21Bits[11] * b21Bits[0]; // a9b2 + a10b1 + a11b0
1259
1260 // s12 = a1b11 + a2b10 + a3b9 + a4b8 + a5b7 + a6b6 + a7b5 + a8b4 + a9b3 + a10b2 + a11b1
1261 s21Bits[12] = a21Bits[1] * b21Bits[11] + a21Bits[2] * b21Bits[10] + a21Bits[3] * b21Bits[9] +
1262 a21Bits[4] * b21Bits[8] + a21Bits[5] * b21Bits[7] + a21Bits[6] * b21Bits[6] + // a4b8 + a5b7 + a6b6
1263 a21Bits[7] * b21Bits[5] + a21Bits[8] * b21Bits[4] + a21Bits[9] * b21Bits[3] + // a7b5 + a8b4 + a9b3
1264 a21Bits[10] * b21Bits[2] + a21Bits[11] * b21Bits[1]; // a10b2 + a11b1
1265
1266 // s13 = a2b11 + a3b10 + a4b9 + a5b8 + a6b7 + a7b6 + a8b5 + a9b4 + a10b3 + a11b2
1267 s21Bits[13] = a21Bits[2] * b21Bits[11] + a21Bits[3] * b21Bits[10] + a21Bits[4] * b21Bits[9] +
1268 a21Bits[5] * b21Bits[8] + a21Bits[6] * b21Bits[7] + a21Bits[7] * b21Bits[6] + // a5b8 + a6b7 + a7b6
1269 a21Bits[8] * b21Bits[5] + a21Bits[9] * b21Bits[4] + a21Bits[10] * b21Bits[3] + // a8b5 + a9b4 + a10b3
1270 a21Bits[11] * b21Bits[2]; // a11b2
1271
1272 // s14 = a3b11 + a4b10 + a5b9 + a6b8 + a7b7 + a8b6 + a9b5 + a10b4 + a11b3
1273 s21Bits[14] = a21Bits[3] * b21Bits[11] + a21Bits[4] * b21Bits[10] + a21Bits[5] * b21Bits[9] +
1274 a21Bits[6] * b21Bits[8] + a21Bits[7] * b21Bits[7] + a21Bits[8] * b21Bits[6] + // a6b8 + a7b7 + a8b6
1275 a21Bits[9] * b21Bits[5] + a21Bits[10] * b21Bits[4] + a21Bits[11] * b21Bits[3]; // a9b5 + a10b4 + a11b3
1276
1277 // s15 = a4b11 + a5b10 + a6b9 + a7b8 + a8b7 + a9b6 + a10b5 + a11b4
1278 s21Bits[15] = a21Bits[4] * b21Bits[11] + a21Bits[5] * b21Bits[10] + a21Bits[6] * b21Bits[9] +
1279 a21Bits[7] * b21Bits[8] + a21Bits[8] * b21Bits[7] + a21Bits[9] * b21Bits[6] + // a7b8 + a8b7 + a9b6
1280 a21Bits[10] * b21Bits[5] + a21Bits[11] * b21Bits[4]; // a10b5 + a11b4
1281
1282 // s16 = a5b11 + a6b10 + a7b9 + a8b8 + a9b7 + a10b6 + a11b5
1283 s21Bits[16] = a21Bits[5] * b21Bits[11] + a21Bits[6] * b21Bits[10] + a21Bits[7] * b21Bits[9] +
1284 a21Bits[8] * b21Bits[8] + a21Bits[9] * b21Bits[7] + a21Bits[10] * b21Bits[6] + // a8b8 + a9b7 + a10b6
1285 a21Bits[11] * b21Bits[5]; // a11b5
1286
1287 // s17 = a6b11 + a7b10 + a8b9 + a9b8 + a10b7 + a11b6
1288 s21Bits[17] = a21Bits[6] * b21Bits[11] + a21Bits[7] * b21Bits[10] + a21Bits[8] * b21Bits[9] +
1289 a21Bits[9] * b21Bits[8] + a21Bits[10] * b21Bits[7] + a21Bits[11] * b21Bits[6]; // a9b8 + a10b7 + a11b6
1290
1291 // s18 = a7b11 + a8b10 + a9b9 + a10b8 + a11b7
1292 s21Bits[18] = a21Bits[7] * b21Bits[11] + a21Bits[8] * b21Bits[10] + a21Bits[9] * b21Bits[9] +
1293 a21Bits[10] * b21Bits[8] + a21Bits[11] * b21Bits[7]; // a10b8 + a11b7
1294
1295 // s19 = a8b11 + a9b10 + a10b9 + a11b8
1296 s21Bits[19] = a21Bits[8] * b21Bits[11] + a21Bits[9] * b21Bits[10] + a21Bits[10] * b21Bits[9] +
1297 a21Bits[11] * b21Bits[8]; // a11b8
1298
1299 // s20 = a9b11 + a10b10 + a11b9
1300 s21Bits[20] = a21Bits[9] * b21Bits[11] + a21Bits[10] * b21Bits[10] + a21Bits[11] * b21Bits[9];
1301
1302 // s21 = a10b11 + a11b10
1303 s21Bits[21] = a21Bits[10] * b21Bits[11] + a21Bits[11] * b21Bits[10];
1304
1305 // s22 = a11b11
1306 s21Bits[22] = a21Bits[11] * b21Bits[11];
1307
1308 // s23 = 0
1309 s21Bits[23] = 0;
1310 }
1311
ScalarMulAdd(uint8_t s[CRYPT_CURVE25519_KEYLEN],const uint8_t a[CRYPT_CURVE25519_KEYLEN],const uint8_t b[CRYPT_CURVE25519_KEYLEN],const uint8_t c[CRYPT_CURVE25519_KEYLEN])1312 void ScalarMulAdd(uint8_t s[CRYPT_CURVE25519_KEYLEN], const uint8_t a[CRYPT_CURVE25519_KEYLEN],
1313 const uint8_t b[CRYPT_CURVE25519_KEYLEN], const uint8_t c[CRYPT_CURVE25519_KEYLEN])
1314 {
1315 uint64_t a21Bits[UINT8_32_21BITS_BLOCKNUM];
1316 uint64_t b21Bits[UINT8_32_21BITS_BLOCKNUM];
1317 uint64_t c21Bits[UINT8_32_21BITS_BLOCKNUM];
1318
1319 ScalarMulAddPreLoad(a, a21Bits);
1320 ScalarMulAddPreLoad(b, b21Bits);
1321 ScalarMulAddPreLoad(c, c21Bits);
1322
1323 uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM];
1324
1325 MulAdd(s21Bits, a21Bits, b21Bits, c21Bits);
1326
1327 int32_t i;
1328 uint64_t signMask1, signMask2;
1329 uint64_t carryA, carryB;
1330
1331 // process carry 0->1, 2->3 ... 22->23
1332 for (i = 0; i <= 22; i += 2) {
1333 // 21 bits minus sign bit is 20 bits
1334 PROCESS_CARRY(s21Bits[i], s21Bits[i + 1], signMask1, carryA, 20);
1335 }
1336
1337 // process carry 1->2, 3->4 ... 21->22
1338 for (i = 1; i <= 21; i += 2) {
1339 // 21 bits minus sign bit is 20 bits
1340 PROCESS_CARRY(s21Bits[i], s21Bits[i + 1], signMask2, carryB, 20);
1341 }
1342
1343 ModuloLCore(s21Bits);
1344
1345 UnloadTo8Bits(s, s21Bits);
1346 }
1347
1348 /* RFC8032, out = a + b */
PointAdd(GeE * out,GeE * greA,GeE * greB)1349 static void PointAdd(GeE *out, GeE *greA, GeE *greB)
1350 {
1351 const Fp25 d2 = {-21827239, -5839606, -30745221, 13898782, 229458,
1352 15978800, -12551817, -6495438, 29715968, 9444199};
1353 Fp25 a, b, c, d, e, f, g, h;
1354 CURVE25519_FP_SUB(e, greA->y, greA->x);
1355 CURVE25519_FP_SUB(f, greB->y, greB->x);
1356 CURVE25519_FP_ADD(g, greA->y, greA->x);
1357 CURVE25519_FP_ADD(h, greB->y, greB->x);
1358 FpMul(a, e, f);
1359 FpMul(b, g, h);
1360 FpMul(c, greA->t, greB->t);
1361 FpMul(c, c, d2);
1362 FpMul(d, greA->z, greB->z);
1363 CURVE25519_FP_ADD(d, d, d);
1364 CURVE25519_FP_SUB(e, b, a);
1365 CURVE25519_FP_SUB(f, d, c);
1366 CURVE25519_FP_ADD(g, d, c);
1367 CURVE25519_FP_ADD(h, b, a);
1368 FpMul(out->x, e, f);
1369 FpMul(out->y, g, h);
1370 FpMul(out->z, f, g);
1371 FpMul(out->t, e, h);
1372 }
1373
PointAddPrecompute(GeE * out,GeE * greA,GeEPre * greB)1374 static void PointAddPrecompute(GeE *out, GeE *greA, GeEPre *greB)
1375 {
1376 Fp25 a, b, c, d, e, f, g, h;
1377 CURVE25519_FP_SUB(e, greA->y, greA->x);
1378 CURVE25519_FP_ADD(g, greA->y, greA->x);
1379 FpMul(a, e, greB->yminusx);
1380 FpMul(b, g, greB->yplusx);
1381 FpMul(c, greA->t, greB->t2z);
1382 FpMul(d, greA->z, greB->z);
1383 CURVE25519_FP_ADD(d, d, d);
1384 CURVE25519_FP_SUB(e, b, a);
1385 CURVE25519_FP_SUB(f, d, c);
1386 CURVE25519_FP_ADD(g, d, c);
1387 CURVE25519_FP_ADD(h, b, a);
1388 FpMul(out->x, e, f);
1389 FpMul(out->y, g, h);
1390 FpMul(out->z, f, g);
1391 FpMul(out->t, e, h);
1392 }
1393
PointSubPrecompute(GeE * out,GeE * greA,GeEPre * greB)1394 static void PointSubPrecompute(GeE *out, GeE *greA, GeEPre *greB)
1395 {
1396 Fp25 a, b, c, d, e, f, g, h;
1397 CURVE25519_FP_SUB(e, greA->y, greA->x);
1398 CURVE25519_FP_ADD(g, greA->y, greA->x);
1399 FpMul(a, e, greB->yplusx);
1400 FpMul(b, g, greB->yminusx);
1401 FpMul(c, greA->t, greB->t2z);
1402 FpMul(d, greA->z, greB->z);
1403 CURVE25519_FP_ADD(d, d, d);
1404 CURVE25519_FP_SUB(e, b, a);
1405 CURVE25519_FP_ADD(f, d, c);
1406 CURVE25519_FP_SUB(g, d, c);
1407 CURVE25519_FP_ADD(h, b, a);
1408 FpMul(out->x, e, f);
1409 FpMul(out->y, g, h);
1410 FpMul(out->z, f, g);
1411 FpMul(out->t, e, h);
1412 }
1413
P1DoubleN(GeE * p1,int32_t n)1414 static void P1DoubleN(GeE *p1, int32_t n)
1415 {
1416 GeP p;
1417 GeC c;
1418 int32_t i;
1419 // From extended coordinate to projective coordinate, just ignore T
1420 CURVE25519_FP_COPY(p.x, p1->x);
1421 CURVE25519_FP_COPY(p.y, p1->y);
1422 CURVE25519_FP_COPY(p.z, p1->z);
1423
1424 ProjectiveDouble(&c, &p);
1425 for (i = 1; i < n; i++) {
1426 GeCompleteToProjective(&p, &c);
1427 ProjectiveDouble(&c, &p);
1428 }
1429
1430 FpMul(p1->x, c.t, c.x);
1431 FpMul(p1->y, c.z, c.y);
1432 FpMul(p1->z, c.t, c.z);
1433 FpMul(p1->t, c.y, c.x);
1434 }
1435
PointToPrecompute(GeEPre * out,GeE * in)1436 static void PointToPrecompute(GeEPre *out, GeE *in)
1437 {
1438 const Fp25 d2 = {-21827239, -5839606, -30745221, 13898782, 229458,
1439 15978800, -12551817, -6495438, 29715968, 9444199};
1440
1441 CURVE25519_FP_ADD(out->yplusx, in->y, in->x);
1442 CURVE25519_FP_SUB(out->yminusx, in->y, in->x);
1443 CURVE25519_FP_COPY(out->z, in->z);
1444 FpMul(out->t2z, in->t, d2);
1445 }
1446
FlipK(int8_t slide[256],uint32_t start)1447 static void FlipK(int8_t slide[256], uint32_t start)
1448 {
1449 uint32_t k;
1450 for (k = start; k < 256; k++) {
1451 if (slide[k] == 0) {
1452 slide[k] = 1;
1453 break;
1454 } else {
1455 slide[k] = 0;
1456 }
1457 }
1458 }
1459
SlideReduce(int8_t * out,uint32_t outLen,const uint8_t * in,uint32_t inLen)1460 static void SlideReduce(int8_t *out, uint32_t outLen, const uint8_t *in, uint32_t inLen)
1461 {
1462 uint32_t i, j;
1463 int8_t tmp;
1464 (void)outLen;
1465 (void)inLen;
1466 // 32 * 8 = 256
1467 for (i = 0; i < 256; i++) {
1468 // turn 32 8bits to 256 1bit, block: in[i >> 3], bit: (i & 7)
1469 out[i] = (int8_t)((in[i >> 3] >> (i & 7)) & 1);
1470 }
1471 // 32 * 8 = 256
1472 for (i = 0; i < 256; i++) {
1473 if (out[i] == 0) {
1474 continue;
1475 }
1476 for (j = 1; j <= 6 && (i + j) < 256; j++) { // check next 6 since 2^6 - 2^5 = 16 > 15, 256 is array length
1477 if (out[i + j] == 0) {
1478 continue;
1479 }
1480 // range 15 to -15
1481 tmp = (int8_t)((uint8_t)(out[i + j]) << j);
1482 if (out[i] + tmp <= 15) { // max 15, 0x1, 0x1, 0x1, 0x1 , 0 -> 0x1111, 0, 0, 0, 0
1483 out[i] += tmp;
1484 out[i + j] = 0;
1485 } else if (out[i] - tmp >= -15) { // min -15, 0x1111, 0, 0, 0, 1, 1 -> -1, 0, 0, 0, 0, 1
1486 out[i] -= tmp;
1487 FlipK(out, i + j);
1488 } else {
1489 break;
1490 }
1491 }
1492 }
1493 }
1494
1495 // Base on article "High-speed high-security signatures"
1496 // stores B, 3B, 5B, 7B, 9B, 11B, 13B, 15B, with B as ed25519 base point
1497 static const GePre g_precomputedB[8] = {
1498 {
1499 {25967493, -14356035, 29566456, 3660896, -12694345, 4014787, 27544626,
1500 -11754271, -6079156, 2047605},
1501 {-12545711, 934262, -2722910, 3049990, -727428, 9406986, 12720692,
1502 5043384, 19500929, -15469378},
1503 {-8738181, 4489570, 9688441, -14785194, 10184609, -12363380, 29287919,
1504 11864899, -24514362, -4438546},
1505 },
1506 {
1507 {15636291, -9688557, 24204773, -7912398, 616977, -16685262, 27787600,
1508 -14772189, 28944400, -1550024},
1509 {16568933, 4717097, -11556148, -1102322, 15682896, -11807043, 16354577,
1510 -11775962, 7689662, 11199574},
1511 {30464156, -5976125, -11779434, -15670865, 23220365, 15915852, 7512774,
1512 10017326, -17749093, -9920357},
1513 },
1514 {
1515 {10861363, 11473154, 27284546, 1981175, -30064349, 12577861, 32867885,
1516 14515107, -15438304, 10819380},
1517 {4708026, 6336745, 20377586, 9066809, -11272109, 6594696, -25653668,
1518 12483688, -12668491, 5581306},
1519 {19563160, 16186464, -29386857, 4097519, 10237984, -4348115, 28542350,
1520 13850243, -23678021, -15815942},
1521 },
1522 {
1523 {5153746, 9909285, 1723747, -2777874, 30523605, 5516873, 19480852,
1524 5230134, -23952439, -15175766},
1525 {-30269007, -3463509, 7665486, 10083793, 28475525, 1649722, 20654025,
1526 16520125, 30598449, 7715701},
1527 {28881845, 14381568, 9657904, 3680757, -20181635, 7843316, -31400660,
1528 1370708, 29794553, -1409300},
1529 },
1530 {
1531 {-22518993, -6692182, 14201702, -8745502, -23510406, 8844726, 18474211,
1532 -1361450, -13062696, 13821877},
1533 {-6455177, -7839871, 3374702, -4740862, -27098617, -10571707, 31655028,
1534 -7212327, 18853322, -14220951},
1535 {4566830, -12963868, -28974889, -12240689, -7602672, -2830569, -8514358,
1536 -10431137, 2207753, -3209784},
1537 },
1538 {
1539 {-25154831, -4185821, 29681144, 7868801, -6854661, -9423865, -12437364,
1540 -663000, -31111463, -16132436},
1541 {25576264, -2703214, 7349804, -11814844, 16472782, 9300885, 3844789,
1542 15725684, 171356, 6466918},
1543 {23103977, 13316479, 9739013, -16149481, 817875, -15038942, 8965339,
1544 -14088058, -30714912, 16193877},
1545 },
1546 {
1547 {-33521811, 3180713, -2394130, 14003687, -16903474, -16270840, 17238398,
1548 4729455, -18074513, 9256800},
1549 {-25182317, -4174131, 32336398, 5036987, -21236817, 11360617, 22616405,
1550 9761698, -19827198, 630305},
1551 {-13720693, 2639453, -24237460, -7406481, 9494427, -5774029, -6554551,
1552 -15960994, -2449256, -14291300},
1553 },
1554 {
1555 {-3151181, -5046075, 9282714, 6866145, -31907062, -863023, -18940575,
1556 15033784, 25105118, -7894876},
1557 {-24326370, 15950226, -31801215, -14592823, -11662737, -5090925,
1558 1573892, -2625887, 2198790, -15804619},
1559 {-3099351, 10324967, -2241613, 7453183, -5446979, -2735503, -13812022,
1560 -16236442, -32461234, -12290683},
1561 },
1562 };
1563
1564 /* out = hash * p + s * B */
KAMulPlusMulBase(GeE * out,const uint8_t hash[CRYPT_CURVE25519_KEYLEN],const GeE * p,const uint8_t s[CRYPT_CURVE25519_KEYLEN])1565 void KAMulPlusMulBase(GeE *out, const uint8_t hash[CRYPT_CURVE25519_KEYLEN],
1566 const GeE *p, const uint8_t s[CRYPT_CURVE25519_KEYLEN])
1567 {
1568 SetExtendedBasePoint(out);
1569 GeE tmpP[8]; // stores p, 3p, 5p, 7p, 9p, 11p, 13p, 15p
1570 GeE doubleP;
1571 GeEPre preComputedP[8]; // stores p, 3p, 5p, 7p, 9p, 11p, 13p, 15p
1572 int8_t slideP[256];
1573 int8_t slideS[256];
1574 int32_t i;
1575
1576 SlideReduce(slideP, 256, hash, CRYPT_CURVE25519_KEYLEN);
1577 SlideReduce(slideS, 256, s, CRYPT_CURVE25519_KEYLEN);
1578
1579 CURVE25519_GE_COPY(tmpP[0], *p);
1580 CURVE25519_GE_COPY(doubleP, *p);
1581
1582 PointToPrecompute(&preComputedP[0], &tmpP[0]);
1583 P1DoubleN(&doubleP, 1);
1584
1585 for (i = 1; i < 8; i += 1) { // p, 3p, ....., 13p, 15p, total 8
1586 PointAdd(&tmpP[i], &tmpP[i - 1], &doubleP);
1587 PointToPrecompute(&preComputedP[i], &tmpP[i]);
1588 }
1589
1590 int32_t zeroCount = 0;
1591 i = 255; // 255 to 0
1592 while (i >= 0 && slideP[i] == 0 && slideS[i] == 0) {
1593 i--;
1594 }
1595 for (; i >= 0; i--) {
1596 while (i >= 0 && slideP[i] == 0 && slideS[i] == 0) {
1597 zeroCount++;
1598 i--;
1599 }
1600 if (i < 0) {
1601 P1DoubleN(out, zeroCount);
1602 break;
1603 } else {
1604 P1DoubleN(out, zeroCount + 1);
1605 }
1606 zeroCount = 0;
1607 if (slideP[i] > 0) {
1608 PointAddPrecompute(out, out, &preComputedP[slideP[i] / 2]); // preComputedP[i] = (i * 2 + 1)P
1609 } else if (slideP[i] < 0) {
1610 PointSubPrecompute(out, out, &preComputedP[(-slideP[i]) / 2]); // preComputedP[i] = (i * 2 + 1)P
1611 }
1612 if (slideS[i] > 0) {
1613 GeAdd(out, &g_precomputedB[slideS[i] / 2]); // g_precomputedB[i] = (i * 2 + 1)P
1614 } else if (slideS[i] < 0) {
1615 GeSub(out, &g_precomputedB[(-slideS[i]) / 2]); // g_precomputedB[i] = (i * 2 + 1)P
1616 }
1617 }
1618 }
1619 #endif /* HITLS_CRYPTO_ED25519 */
1620
1621 #endif /* HITLS_CRYPTO_CURVE25519 */
1622