• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * This file is part of the openHiTLS project.
3  *
4  * openHiTLS is licensed under the Mulan PSL v2.
5  * You can use this software according to the terms and conditions of the Mulan PSL v2.
6  * You may obtain a copy of Mulan PSL v2 at:
7  *
8  *     http://license.coscl.org.cn/MulanPSL2
9  *
10  * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11  * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12  * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13  * See the Mulan PSL v2 for more details.
14  */
15 /* Some of these codes are adapted from https://ed25519.cr.yp.to/software.html */
16 
17 #include "hitls_build.h"
18 #ifdef HITLS_CRYPTO_CURVE25519
19 
20 #include <stdbool.h>
21 #include "securec.h"
22 #include "curve25519_local.h"
23 #include "bsl_sal.h"
24 #ifdef HITLS_CRYPTO_ED25519
25 #define CRYPT_CURVE25519_OPTLEN 32
26 #endif
27 #define CONDITION_COPY(dst, src, indicate)     \
28     (int32_t)((uint32_t)(dst) ^ (((uint32_t)(dst) ^ (uint32_t)(src)) & (indicate)))
29 
30 // process Fp multiplication carry
31 #define FP_PROCESS_CARRY(h)                           \
32 do {                                                  \
33     int64_t carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9;            \
34     carry0 = h##0 + (1 << 25); h##1 += carry0 >> 26; h##0 -= carry0 & CURVE25519_MASK_HIGH_38;         \
35     carry4 = h##4 + (1 << 25); h##5 += carry4 >> 26; h##4 -= carry4 & CURVE25519_MASK_HIGH_38;         \
36     carry1 = h##1 + (1 << 24); h##2 += carry1 >> 25; h##1 -= carry1 & CURVE25519_MASK_HIGH_39;         \
37     carry5 = h##5 + (1 << 24); h##6 += carry5 >> 25; h##5 -= carry5 & CURVE25519_MASK_HIGH_39;         \
38     carry2 = h##2 + (1 << 25); h##3 += carry2 >> 26; h##2 -= carry2 & CURVE25519_MASK_HIGH_38;         \
39     carry6 = h##6 + (1 << 25); h##7 += carry6 >> 26; h##6 -= carry6 & CURVE25519_MASK_HIGH_38;         \
40     carry3 = h##3 + (1 << 24); h##4 += carry3 >> 25; h##3 -= carry3 & CURVE25519_MASK_HIGH_39;         \
41     carry7 = h##7 + (1 << 24); h##8 += carry7 >> 25; h##7 -= carry7 & CURVE25519_MASK_HIGH_39;         \
42     carry4 = h##4 + (1 << 25); h##5 += carry4 >> 26; h##4 -= carry4 & CURVE25519_MASK_HIGH_38;         \
43     carry8 = h##8 + (1 << 25); h##9 += carry8 >> 26; h##8 -= carry8 & CURVE25519_MASK_HIGH_38;         \
44     carry9 = h##9 + (1 << 24); h##0 += (carry9 >> 25) * 19; h##9 -= carry9 & CURVE25519_MASK_HIGH_39;  \
45     carry0 = h##0 + (1 << 25); h##1 += carry0 >> 26; h##0 -= carry0 & CURVE25519_MASK_HIGH_38;         \
46 } while (0)
47 
48 // h0...h9 to Fp25
49 #define INT64_2_FP25(h, out)                  \
50 do {                                    \
51     (out)[0] = (int32_t)h##0;     \
52     (out)[1] = (int32_t)h##1;     \
53     (out)[2] = (int32_t)h##2;     \
54     (out)[3] = (int32_t)h##3;     \
55     (out)[4] = (int32_t)h##4;     \
56     (out)[5] = (int32_t)h##5;     \
57     (out)[6] = (int32_t)h##6;     \
58     (out)[7] = (int32_t)h##7;     \
59     (out)[8] = (int32_t)h##8;     \
60     (out)[9] = (int32_t)h##9;     \
61 } while (0)
62 
63 #define FP25_2_INT32(in, out)    \
64 do {                        \
65     out##0 = (in)[0];  \
66     out##1 = (in)[1];  \
67     out##2 = (in)[2];  \
68     out##3 = (in)[3];  \
69     out##4 = (in)[4];  \
70     out##5 = (in)[5];  \
71     out##6 = (in)[6];  \
72     out##7 = (in)[7];  \
73     out##8 = (in)[8];  \
74     out##9 = (in)[9];  \
75 } while (0)
76 
77 /* out = f * g */
FpMul(Fp25 out,const Fp25 f,const Fp25 g)78 void FpMul(Fp25 out, const Fp25 f, const Fp25 g)
79 {
80     int32_t f0, f1, f2, f3, f4, f5, f6, f7, f8, f9;
81     int32_t g0, g1, g2, g3, g4, g5, g6, g7, g8, g9;
82     int64_t h0, h1, h2, h3, h4, h5, h6, h7, h8, h9;
83 
84     FP25_2_INT32(f, f);
85     FP25_2_INT32(g, g);
86 
87     int32_t f1_2 = f1 * 2;
88     int32_t f3_2 = f3 * 2;
89     int32_t f5_2 = f5 * 2;
90     int32_t f7_2 = f7 * 2;
91     int32_t f9_2 = f9 * 2;
92 
93     int32_t g1_19 = g1 * 19;
94     int32_t g2_19 = g2 * 19;
95     int32_t g3_19 = g3 * 19;
96     int32_t g4_19 = g4 * 19;
97     int32_t g5_19 = g5 * 19;
98     int32_t g6_19 = g6 * 19;
99     int32_t g7_19 = g7 * 19;
100     int32_t g8_19 = g8 * 19;
101     int32_t g9_19 = g9 * 19;
102 
103     /*  h0  =  f0g0 + 38f1g9 + 19f2g8 + 38f3g7 + 19f4g6 + 38f5g5 + 19f6g4 + 38f7g3 + 19f8g2 + 38f9g1
104         h1  =  f0g1 + f1g0   + 19f2g9 + 19f3g8 + 19f4g7 + 19f5g6 + 19f6g5 + 19f7g4 + 19f8g3 + 19f9g2
105         h2  =  f0g2 + 2f1g1  + f2g0   + 38f3g9 + 19f4g8 + 38f5g7 + 19f6g6 + 38f7g5 + 19f8g4 + 38f9g2
106         h3  =  f0g3 + f1g2   + f2g1   + f3g0   + 19f4g9 + 19f5g8 + 19f6g7 + 19f7g6 + 19f8g5 + 19f9g4
107         h4  =  f0g4 + 2f1g3  + f2g2   + 2f3g1  + f4g0   + 38f5g9 + 19f6g8 + 38f7g7 + 19f8g6 + 38f9g5
108         h5  =  f0g5 + f1g4   + f2g3   + f3g2   + f4g1   + f5g0   + 19f6g9 + 19f7g8 + 19f8g7 + 19f9g6
109         h6  =  f0g6 + 2f1g5  + f2g4   + 2f3g3  + f4g2   + 2f5g1  + f6g0   + 38f7g9 + 19f8g8 + 38f9g7
110         h7  =  f0g7 + f1g6   + f2g5   + f3g4   + f4g3   + f5g2   + f6g1   + f7g0   + 19f8g9 + 19f9g8
111         h8  =  f0g8 + 2f1g7  + f2g6   + 2f3g5  + f4g4   + 2f5g3  + f6g2   + 2f7g1  + f8g0   + 38f9g9
112         h9  =  f0g9 + f1g8   + f2g7   + f3g6   + f4g5   + f5g4   + f6g3   + f7g2   + f8g1   + f9g0
113         The calculation is performed by column. */
114 
115     h0 = (int64_t)f0 * g0;
116     h1 = (int64_t)f0 * g1;
117     h2 = (int64_t)f0 * g2;
118     h3 = (int64_t)f0 * g3;
119     h4 = (int64_t)f0 * g4;
120     h5 = (int64_t)f0 * g5;
121     h6 = (int64_t)f0 * g6;
122     h7 = (int64_t)f0 * g7;
123     h8 = (int64_t)f0 * g8;
124     h9 = (int64_t)f0 * g9;
125 
126     h0 += (int64_t)f1_2 * g9_19;
127     h1 += (int64_t)f1 * g0;
128     h2 += (int64_t)f1_2 * g1;
129     h3 += (int64_t)f1 * g2;
130     h4 += (int64_t)f1_2 * g3;
131     h5 += (int64_t)f1 * g4;
132     h6 += (int64_t)f1_2 * g5;
133     h7 += (int64_t)f1 * g6;
134     h8 += (int64_t)f1_2 * g7;
135     h9 += (int64_t)f1 * g8;
136 
137     h0 += (int64_t)f2 * g8_19;
138     h1 += (int64_t)f2 * g9_19;
139     h2 += (int64_t)f2 * g0;
140     h3 += (int64_t)f2 * g1;
141     h4 += (int64_t)f2 * g2;
142     h5 += (int64_t)f2 * g3;
143     h6 += (int64_t)f2 * g4;
144     h7 += (int64_t)f2 * g5;
145     h8 += (int64_t)f2 * g6;
146     h9 += (int64_t)f2 * g7;
147 
148     h0 += (int64_t)f3_2 * g7_19;
149     h1 += (int64_t)f3 * g8_19;
150     h2 += (int64_t)f3_2 * g9_19;
151     h3 += (int64_t)f3 * g0;
152     h4 += (int64_t)f3_2 * g1;
153     h5 += (int64_t)f3 * g2;
154     h6 += (int64_t)f3_2 * g3;
155     h7 += (int64_t)f3 * g4;
156     h8 += (int64_t)f3_2 * g5;
157     h9 += (int64_t)f3 * g6;
158 
159     h0 += (int64_t)f4 * g6_19;
160     h1 += (int64_t)f4 * g7_19;
161     h2 += (int64_t)f4 * g8_19;
162     h3 += (int64_t)f4 * g9_19;
163     h4 += (int64_t)f4 * g0;
164     h5 += (int64_t)f4 * g1;
165     h6 += (int64_t)f4 * g2;
166     h7 += (int64_t)f4 * g3;
167     h8 += (int64_t)f4 * g4;
168     h9 += (int64_t)f4 * g5;
169 
170     h0 += (int64_t)f5_2 * g5_19;
171     h1 += (int64_t)f5 * g6_19;
172     h2 += (int64_t)f5_2 * g7_19;
173     h3 += (int64_t)f5 * g8_19;
174     h4 += (int64_t)f5_2 * g9_19;
175     h5 += (int64_t)f5 * g0;
176     h6 += (int64_t)f5_2 * g1;
177     h7 += (int64_t)f5 * g2;
178     h8 += (int64_t)f5_2 * g3;
179     h9 += (int64_t)f5 * g4;
180 
181     h0 += (int64_t)f6 * g4_19;
182     h1 += (int64_t)f6 * g5_19;
183     h2 += (int64_t)f6 * g6_19;
184     h3 += (int64_t)f6 * g7_19;
185     h4 += (int64_t)f6 * g8_19;
186     h5 += (int64_t)f6 * g9_19;
187     h6 += (int64_t)f6 * g0;
188     h7 += (int64_t)f6 * g1;
189     h8 += (int64_t)f6 * g2;
190     h9 += (int64_t)f6 * g3;
191 
192     h0 += (int64_t)f7_2 * g3_19;
193     h1 += (int64_t)f7 * g4_19;
194     h2 += (int64_t)f7_2 * g5_19;
195     h3 += (int64_t)f7 * g6_19;
196     h4 += (int64_t)f7_2 * g7_19;
197     h5 += (int64_t)f7 * g8_19;
198     h6 += (int64_t)f7_2 * g9_19;
199     h7 += (int64_t)f7 * g0;
200     h8 += (int64_t)f7_2 * g1;
201     h9 += (int64_t)f7 * g2;
202 
203     h0 += (int64_t)f8 * g2_19;
204     h1 += (int64_t)f8 * g3_19;
205     h2 += (int64_t)f8 * g4_19;
206     h3 += (int64_t)f8 * g5_19;
207     h4 += (int64_t)f8 * g6_19;
208     h5 += (int64_t)f8 * g7_19;
209     h6 += (int64_t)f8 * g8_19;
210     h7 += (int64_t)f8 * g9_19;
211     h8 += (int64_t)f8 * g0;
212     h9 += (int64_t)f8 * g1;
213 
214     h0 += (int64_t)f9_2 * g1_19;
215     h1 += (int64_t)f9 * g2_19;
216     h2 += (int64_t)f9_2 * g3_19;
217     h3 += (int64_t)f9 * g4_19;
218     h4 += (int64_t)f9_2 * g5_19;
219     h5 += (int64_t)f9 * g6_19;
220     h6 += (int64_t)f9_2 * g7_19;
221     h7 += (int64_t)f9 * g8_19;
222     h8 += (int64_t)f9_2 * g9_19;
223     h9 += (int64_t)f9 * g0;
224 
225     FP_PROCESS_CARRY(h);
226 
227     INT64_2_FP25(h, out);
228 }
229 
FpSquareDoubleCore(Fp25 out,const Fp25 in,bool doDouble)230 void FpSquareDoubleCore(Fp25 out, const Fp25 in, bool doDouble)
231 {
232     int64_t h0, h1, h2, h3, h4, h5, h6, h7, h8, h9;
233     int32_t f0, f1, f2, f3, f4, f5, f6, f7, f8, f9;
234 
235     FP25_2_INT32(in, f);
236 
237     int32_t f0_2 = f0 * 2;
238     int32_t f1_2 = f1 * 2;
239     int32_t f2_2 = f2 * 2;
240     int32_t f3_2 = f3 * 2;
241     int32_t f4_2 = f4 * 2;
242     int32_t f5_2 = f5 * 2;
243     int32_t f6_2 = f6 * 2;
244     int32_t f7_2 = f7 * 2;
245 
246     int32_t f9_38 = f9 * 38;
247     int32_t f8_19 = f8 * 19;
248     int32_t f7_38 = f7 * 38;
249     int32_t f6_19 = f6 * 19;
250     int32_t f5_19 = f5 * 19;
251 
252     h0 = (int64_t)f0 * f0;
253     h1 = (int64_t)f0_2 * f1;
254     h2 = (int64_t)f0_2 * f2;
255     h3 = (int64_t)f0_2 * f3;
256     h4 = (int64_t)f0_2 * f4;
257     h5 = (int64_t)f0_2 * f5;
258     h6 = (int64_t)f0_2 * f6;
259     h7 = (int64_t)f0_2 * f7;
260     h8 = (int64_t)f0_2 * f8;
261     h9 = (int64_t)f0_2 * f9;
262 
263     h0 += (int64_t)f1_2 * f9_38;
264     h1 += (int64_t)f2 * f9_38;
265     h2 += (int64_t)f1_2 * f1;
266     h3 += (int64_t)f1_2 * f2;
267     h4 += (int64_t)f1_2 * f3_2;
268     h5 += (int64_t)f1_2 * f4;
269     h6 += (int64_t)f1_2 * f5_2;
270     h7 += (int64_t)f1_2 * f6;
271     h8 += (int64_t)f1_2 * f7_2;
272     h9 += (int64_t)f1_2 * f8;
273 
274     h0 += (int64_t)f2_2 * f8_19;
275     h1 += (int64_t)f3_2 * f8_19;
276     h2 += (int64_t)f3_2 * f9_38;
277     h3 += (int64_t)f4 * f9_38;
278     h4 += (int64_t)f2 * f2;
279     h5 += (int64_t)f2_2 * f3;
280     h6 += (int64_t)f2_2 * f4;
281     h7 += (int64_t)f2_2 * f5;
282     h8 += (int64_t)f2_2 * f6;
283     h9 += (int64_t)f2_2 * f7;
284 
285     h0 += (int64_t)f3_2 * f7_38;
286     h1 += (int64_t)f4 * f7_38;
287     h2 += (int64_t)f4_2 * f8_19;
288     h3 += (int64_t)f5_2 * f8_19;
289     h4 += (int64_t)f5_2 * f9_38;
290     h5 += (int64_t)f6 * f9_38;
291     h6 += (int64_t)f3_2 * f3;
292     h7 += (int64_t)f3_2 * f4;
293     h8 += (int64_t)f3_2 * f5_2;
294     h9 += (int64_t)f3_2 * f6;
295 
296     h0 += (int64_t)f4_2 * f6_19;
297     h1 += (int64_t)f5_2 * f6_19;
298     h2 += (int64_t)f5_2 * f7_38;
299     h3 += (int64_t)f6 * f7_38;
300     h4 += (int64_t)f6_2 * f8_19;
301     h5 += (int64_t)f7_2 * f8_19;
302     h6 += (int64_t)f7_2 * f9_38;
303     h7 += (int64_t)f8 * f9_38;
304     h8 += (int64_t)f4 * f4;
305     h9 += (int64_t)f4_2 * f5;
306 
307     h0 += (int64_t)f5_2 * f5_19;
308     h2 += (int64_t)f6 * f6_19;
309     h4 += (int64_t)f7 * f7_38;
310     h6 += (int64_t)f8 * f8_19;
311     h8 += (int64_t)f9 * f9_38;
312 
313     if (doDouble) {
314         h0 *= 2;
315         h1 *= 2;
316         h2 *= 2;
317         h3 *= 2;
318         h4 *= 2;
319         h5 *= 2;
320         h6 *= 2;
321         h7 *= 2;
322         h8 *= 2;
323         h9 *= 2;
324     }
325 
326     FP_PROCESS_CARRY(h);
327 
328     INT64_2_FP25(h, out);
329 }
330 
331 /* out = in1 ^ (4 * 2 ^ (2 * times)) * in2 */
FpMultiSquare(Fp25 in1,Fp25 in2,Fp25 out,int32_t times)332 static void FpMultiSquare(Fp25 in1, Fp25 in2, Fp25 out, int32_t times)
333 {
334     int32_t i;
335     Fp25 temp1, temp2;
336     FpSquareDoubleCore(temp1, in1, false);
337     FpSquareDoubleCore(temp2, temp1, false);
338     for (i = 0; i < times; i++) {
339         FpSquareDoubleCore(temp1, temp2, false);
340         FpSquareDoubleCore(temp2, temp1, false);
341     }
342     FpMul(out, in2, temp2);
343 }
344 
345 /* out = a ^ -1 */
FpInvert(Fp25 out,const Fp25 a)346 void FpInvert(Fp25 out, const Fp25 a)
347 {
348     int32_t i;
349     Fp25 a0;    /* save a^1         */
350     Fp25 a1;    /* save a^2         */
351     Fp25 a2;    /* save a^11        */
352     Fp25 a3;    /* save a^(2^5-1)   */
353     Fp25 a4;    /* save a^(2^10-1)  */
354     Fp25 a5;    /* save a^(2^20-1)  */
355     Fp25 a6;    /* save a^(2^40-1)  */
356     Fp25 a7;    /* save a^(2^50-1)  */
357     Fp25 a8;    /* save a^(2^100-1) */
358     Fp25 a9;    /* save a^(2^200-1) */
359     Fp25 a10;   /* save a^(2^250-1) */
360     Fp25 temp1, temp2;
361 
362     /* We know a×b=1(mod p), then a and b are inverses of mod p, i.e. a=b^(-1), b=a^(-1);
363      * According to Fermat's little theorem a^(p-1)=1(mod p), so a*a^(p-2)=1(mod p);
364      * So the inverse element of a is a^(-1) = a^(p-2)(mod p)
365      * Here it is, p=2^255-19, thus we need to compute a^(2^255-21)(mod(2^255-19))
366      */
367 
368     /* a^1 */
369     CURVE25519_FP_COPY(a0, a);
370 
371     /* a^2 */
372     FpSquareDoubleCore(a1, a0, false);
373 
374     /* a^4 */
375     FpSquareDoubleCore(temp1, a1, false);
376 
377     /* a^8 */
378     FpSquareDoubleCore(temp2, temp1, false);
379 
380     /* a^9 */
381     FpMul(temp1, a0, temp2);
382 
383     /* a^11 */
384     FpMul(a2, a1, temp1);
385 
386     /* a^22 */
387     FpSquareDoubleCore(temp2, a2, false);
388 
389     /* a^(2^5-1) = a^(9+22) */
390     FpMul(a3, temp1, temp2);
391 
392     /* a^(2^10-1) = a^(2^10-2^5) * a^(2^5-1) */
393     FpSquareDoubleCore(temp1, a3, false);
394     for (i = 0; i < 2; i++) { // (2 * 2)^2
395         FpSquareDoubleCore(temp2, temp1, false);
396         FpSquareDoubleCore(temp1, temp2, false);
397     }
398     FpMul(a4, a3, temp1);
399 
400     /* a^(2^20-1) = a^(2^20-2^10) * a^(2^10-1) */
401     FpMultiSquare(a4, a4, a5, 4); // (2 * 2) ^ 4
402 
403     /* a^(2^40-1) = a^(2^40-2^20) * a^(2^20-1) */
404     FpMultiSquare(a5, a5, a6, 9); // (2 * 2) ^ 9
405 
406     /* a^(2^50-1) = a^(2^50-2^10) * a^(2^10-1) */
407     FpMultiSquare(a6, a4, a7, 4); // (2 * 2) ^ 4
408 
409     /* a^(2^100-1) = a^(2^100-2^50) * a^(2^50-1) */
410     FpMultiSquare(a7, a7, a8, 24); // (2 * 2) ^ 24
411 
412     /* a^(2^200-1) = a^(2^200-2^100) * a^(2^100-1) */
413     FpMultiSquare(a8, a8, a9, 49); // (2 * 2) ^ 49
414 
415     /* a^(2^250-1) = a^(2^250-2^50) * a^(2^50-1) */
416     FpMultiSquare(a9, a7, a10, 24); // (2 * 2) ^ 24
417 
418     /* a^(2^5*(2^250-1)) = (a^(2^250-1))^5 */
419     FpSquareDoubleCore(temp1, a10, false);
420     FpSquareDoubleCore(temp2, temp1, false);
421     FpSquareDoubleCore(temp1, temp2, false);
422     FpSquareDoubleCore(temp2, temp1, false);
423     FpSquareDoubleCore(temp1, temp2, false);
424 
425     /* The output:a^(2^255-21) = a(2^5*(2^250-1)+11) = a^(2^5*(2^250-1)) * a^11 */
426     FpMul(out, a2, temp1);
427 }
428 
429 #ifdef HITLS_CRYPTO_ED25519
430 /* out = in ^ ((q - 5) / 8) */
FpPowq58(Fp25 out,Fp25 in)431 static void FpPowq58(Fp25 out, Fp25 in)
432 {
433     Fp25 a, b, c;
434     int32_t i;
435     FpSquareDoubleCore(a, in, false);
436     FpSquareDoubleCore(b, a, false);
437     FpSquareDoubleCore(b, b, false);
438     FpMul(b, in, b);
439     FpMul(a, a, b);
440     FpSquareDoubleCore(a, a, false);
441     FpMul(a, b, a);
442     FpSquareDoubleCore(b, a, false);
443     // b = a ^ (2^5)
444     for (i = 1; i < 5; i++) {
445         FpSquareDoubleCore(b, b, false);
446     }
447     FpMul(a, b, a);
448     FpSquareDoubleCore(b, a, false);
449     // b = a ^ (2^10)
450     for (i = 1; i < 10; i++) {
451         FpSquareDoubleCore(b, b, false);
452     }
453     FpMul(b, b, a);
454     FpSquareDoubleCore(c, b, false);
455 
456     // c = b ^ (2^20)
457     for (i = 1; i < 20; i++) {
458         FpSquareDoubleCore(c, c, false);
459     }
460     FpMul(b, c, b);
461 
462     // b = b ^ (2^10)
463     for (i = 0; i < 10; i++) {
464         FpSquareDoubleCore(b, b, false);
465     }
466 
467     FpMul(a, b, a);
468     FpSquareDoubleCore(b, a, false);
469 
470     // b = a ^ (2^50)
471     for (i = 1; i < 50; i++) {
472         FpSquareDoubleCore(b, b, false);
473     }
474     FpMul(b, b, a);
475     FpSquareDoubleCore(c, b, false);
476 
477     // c = b ^ (2 ^ 100)
478     for (i = 1; i < 100; i++) {
479         FpSquareDoubleCore(c, c, false);
480     }
481     FpMul(b, c, b);
482 
483     // b = b ^ (2^50)
484     for (i = 0; i < 50; i++) {
485         FpSquareDoubleCore(b, b, false);
486     }
487     FpMul(a, b, a);
488     FpSquareDoubleCore(a, a, false);
489     FpSquareDoubleCore(a, a, false);
490     FpMul(out, a, in);
491 }
492 #endif
493 
PaddingUnload(uint8_t out[32],Fp25 pFp25)494 static void PaddingUnload(uint8_t out[32], Fp25 pFp25)
495 {
496     int32_t *p = (int32_t *)pFp25;
497 
498     /* Take a polynomial form number into a 32-byte array */
499     CURVE25519_BYTES4_PADDING_UNLOAD(out, 2, p);                /* p0 unload 4 bytes on out[0] expand 2 */
500     CURVE25519_BYTES3_PADDING_UNLOAD(out + 4, 2, 3, p + 1);     /* p1 unload 3 bytes on out[4] shift 2 expand 3 */
501     CURVE25519_BYTES3_PADDING_UNLOAD(out + 7, 3, 5, p + 2);     /* p2 unload 3 bytes on out[7] shift 3 expand 5 */
502     CURVE25519_BYTES3_PADDING_UNLOAD(out + 10, 5, 6, p + 3);    /* p3 unload 3 bytes on out[10] shift 5 expand 6 */
503     CURVE25519_BYTES3_UNLOAD(out + 13, 6, p + 4);               /* p4 unload 3 bytes on out[13] shift 6 */
504 
505     CURVE25519_BYTES4_PADDING_UNLOAD(out + 16, 1, p + 5);       /* p5 unload 4 bytes on out[16] expand 1 */
506     CURVE25519_BYTES3_PADDING_UNLOAD(out + 20, 1, 3, p + 6);    /* p6 unload 3 bytes on out[20] shift 1 expand 3 */
507     CURVE25519_BYTES3_PADDING_UNLOAD(out + 23, 3, 4, p + 7);    /* p7 unload 3 bytes on out[23] shift 3 expand 4 */
508     CURVE25519_BYTES3_PADDING_UNLOAD(out + 26, 4, 6, p + 8);    /* p8 unload 3 bytes on out[26] shift 4 expand 6 */
509     CURVE25519_BYTES3_UNLOAD(out + 29, 6, p + 9);               /* p9 unload 3 bytes on out[29] shift 6 */
510 }
511 
PolynomialToData(uint8_t out[32],const Fp25 polynomial)512 void PolynomialToData(uint8_t out[32], const Fp25 polynomial)
513 {
514     Fp25 pFp25;
515     uint32_t pos;
516     uint32_t over;
517     uint32_t mul19;
518     uint32_t signMask;
519 
520     CURVE25519_FP_COPY(pFp25, polynomial);
521 
522     /* First process, all the carry transport to pFp25[0] */
523     mul19 = (uint32_t)pFp25[9] * 19; // mul 19 for mod
524     over = mul19 + (1 << 24); // plus 1 << 24 for carry
525     // restricted to 25 bits, shift 31 for sign
526     signMask = (-(over >> 31)) & MASK_HIGH32(25);
527     over = (over >> 25) | signMask; // 25 bits
528     pos = 0;
529     do {
530         over = (uint32_t)pFp25[pos] + over;
531         // first carry is restricted to 25 bits, shift 31 for sign
532         signMask = (-(over >> 31)) & MASK_HIGH32(25);
533         over = (over >> 25) | signMask; // 25 bits
534         pos++;
535 
536         over = (uint32_t)pFp25[pos] + over;
537         // second carry is restricted to 26 bits, shift 31 for sign
538         signMask = (-(over >> 31)) & MASK_HIGH32(26);
539         over = (over >> 26) | signMask; // 26 bits
540         pos++;
541     } while (pos < 10); // process from 0 to 9, pos < 10
542     mul19 = over * 19; // mul 19 for mod
543     pFp25[0] += (int32_t)mul19;
544 
545     /* We subtracted 2^255-19 and get the result
546      * all polynomial[i] is restricted to 25 bits or 26 bits
547      */
548     pos = 0;
549     do {
550         // first polynomial is restricted to 26 bits, shift 31 for sign
551         signMask = (-((uint32_t)pFp25[pos] >> 31)) & MASK_HIGH32(26);
552         over = ((uint32_t)pFp25[pos] >> 26) | signMask; // 26 bits
553         pFp25[pos] = (int32_t)((uint32_t)pFp25[pos] & MASK_LOW32(26)); // 26 bits
554         pos++;
555         pFp25[pos] += (int32_t)over;
556 
557         // second polynomial is restricted to 25 bits, shift 31 for sign
558         signMask = (-((uint32_t)pFp25[pos] >> 31)) & MASK_HIGH32(25);
559         over = ((uint32_t)pFp25[pos] >> 25) | signMask; // 25 bits
560         pFp25[pos] = (int32_t)((uint32_t)pFp25[pos] & MASK_LOW32(25));
561         pos++;
562         pFp25[pos] += (int32_t)over;
563     } while (pos < 8); // process form 0 to 7, pos < 8
564 
565     // process pFp25[8], restricted to 26 bits, shift 31 for sign
566     signMask = (-((uint32_t)pFp25[pos] >> 31)) & MASK_HIGH32(26);
567     over = ((uint32_t)pFp25[pos] >> 26) | signMask; // 26 bits
568     pFp25[pos] = (int32_t)((uint32_t)pFp25[pos] & MASK_LOW32(26)); // 26 bits
569     pos++;
570     // process pFp25[9]
571     pFp25[pos] += (int32_t)over;
572     pFp25[pos] = (int32_t)((uint32_t)pFp25[pos] & MASK_LOW32(25)); // pFp25[9] is restricted to 25 bits
573 
574     PaddingUnload(out, pFp25);
575 }
576 
577 /* unified addition in Extended twist Edwards Coordinate */
578 /* out = out + tableElement */
GeAdd(GeE * out,const GePre * tableElement)579 static void GeAdd(GeE *out, const GePre *tableElement)
580 {
581     Fp25 a;
582     Fp25 b;
583     Fp25 c;
584     Fp25 d;
585     Fp25 e;
586     Fp25 f;
587     Fp25 g;
588     Fp25 h;
589     /* a = (Y1 − X1) * (Y2 − X2), b = (Y1 + X1) * (Y2 + X2)
590      * c = 2 * d * T1 * X2 * Y2, d = 2 * Z1
591      * e = b − a, f = d − c, g = d + c, h = b + a
592      * X3 = e * f, Y3 = g * h, T3 = e * h, Z3 = f * g
593      */
594     CURVE25519_FP_ADD(e, out->y, out->x);
595     CURVE25519_FP_SUB(f, out->y, out->x);
596     FpMul(b, e, tableElement->yplusx);
597     FpMul(a, f, tableElement->yminusx);
598     FpMul(c, out->t, tableElement->xy2d);
599     CURVE25519_FP_ADD(d, out->z, out->z);
600     CURVE25519_FP_SUB(e, b, a);
601     CURVE25519_FP_SUB(f, d, c);
602     CURVE25519_FP_ADD(g, d, c);
603     CURVE25519_FP_ADD(h, b, a);
604     FpMul(out->x, e, f);
605     FpMul(out->y, h, g);
606     FpMul(out->z, g, f);
607     FpMul(out->t, e, h);
608 }
609 
610 #ifdef HITLS_CRYPTO_ED25519
611 /* out = out - tableElement */
GeSub(GeE * out,const GePre * tableElement)612 static void GeSub(GeE *out, const GePre *tableElement)
613 {
614     Fp25 a;
615     Fp25 b;
616     Fp25 c;
617     Fp25 d;
618     Fp25 e;
619     Fp25 f;
620     Fp25 g;
621     Fp25 h;
622 
623     CURVE25519_FP_ADD(e, out->y, out->x);
624     CURVE25519_FP_SUB(f, out->y, out->x);
625     FpMul(b, e, tableElement->yminusx);
626     FpMul(a, f, tableElement->yplusx);
627     FpMul(c, out->t, tableElement->xy2d);
628     CURVE25519_FP_ADD(d, out->z, out->z);
629     CURVE25519_FP_SUB(e, b, a);
630     CURVE25519_FP_ADD(f, d, c);
631     CURVE25519_FP_SUB(g, d, c);
632     CURVE25519_FP_ADD(h, b, a);
633     FpMul(out->x, e, f);
634     FpMul(out->y, h, g);
635     FpMul(out->z, g, f);
636     FpMul(out->t, e, h);
637 }
638 #endif
639 
640 /* double in Projective twist Edwards Coordinate */
ProjectiveDouble(GeC * complete,const GeP * projective)641 static void ProjectiveDouble(GeC *complete, const GeP *projective)
642 {
643     Fp25 tmp;
644     FpSquareDoubleCore((complete->x), (projective->x), false);
645     FpSquareDoubleCore((complete->z), (projective->y), false);
646     // T = 2 * Z^2
647     FpSquareDoubleCore(complete->t, projective->z, true);
648     CURVE25519_FP_ADD(complete->y, projective->x, projective->y);
649     FpSquareDoubleCore(tmp, complete->y, false);
650     // tmp = (X1 + Y1) ^ 2, T = 2 * Z^2, X = X1 ^ 2, Y = Z1 ^ 2, Z = Y1 ^ 2
651     CURVE25519_FP_ADD(complete->y, complete->z, complete->x);
652     CURVE25519_FP_SUB(complete->z, complete->z, complete->x);
653     CURVE25519_FP_SUB(complete->x, tmp, complete->y);
654     CURVE25519_FP_SUB(complete->t, complete->t, complete->z);
655 }
656 
657 /* Convert complete coordinate to projective coordinate */
GeCompleteToProjective(GeP * out,const GeC * complete)658 static void GeCompleteToProjective(GeP *out, const GeC *complete)
659 {
660     FpMul(out->x, complete->t, complete->x);
661     FpMul(out->y, complete->z, complete->y);
662     FpMul(out->z, complete->t, complete->z);
663 }
664 
665 /* p1 = 16 * p1 */
P1DoubleFourTimes(GeE * p1)666 static void P1DoubleFourTimes(GeE *p1)
667 {
668     GeP p;
669     GeC c;
670     // From extended coordinate to projective coordinate, just ignore T
671     CURVE25519_FP_COPY(p.x, p1->x);
672     CURVE25519_FP_COPY(p.y, p1->y);
673     CURVE25519_FP_COPY(p.z, p1->z);
674     // double 4 times to get 16p1
675     ProjectiveDouble(&c, &p);
676     GeCompleteToProjective(&p, &c);
677     ProjectiveDouble(&c, &p);
678     GeCompleteToProjective(&p, &c);
679     ProjectiveDouble(&c, &p);
680     GeCompleteToProjective(&p, &c);
681     ProjectiveDouble(&c, &p);
682     FpMul(p1->x, c.x, c.t);
683     FpMul(p1->y, c.y, c.z);
684     FpMul(p1->z, c.z, c.t);
685     FpMul(p1->t, c.x, c.y);
686 }
687 
SetExtendedBasePoint(GeE * out)688 static void SetExtendedBasePoint(GeE *out)
689 {
690     CURVE25519_FP_SET(out->x, 0);
691     CURVE25519_FP_SET(out->y, 1);
692     CURVE25519_FP_SET(out->t, 0);
693     CURVE25519_FP_SET(out->z, 1);
694 }
695 
696 /* Multiple with Base point, see paper: High-speed high-security signatures */
ScalarMultiBase(GeE * out,const uint8_t in[CRYPT_CURVE25519_KEYLEN])697 void ScalarMultiBase(GeE *out, const uint8_t in[CRYPT_CURVE25519_KEYLEN])
698 {
699     uint8_t carry;
700     // inLen is always 32, buffer needs 32 * 2 = 64
701     uint8_t privateKey[64];
702     int32_t i;
703     GePre preCompute;
704 
705     // split 32 8bits input into 64 4bits-based number
706     for (i = 0; i < 32; i++) {
707         privateKey[i * 2] = in[i] & 15;            // and 15 to get low 4 bits, stored in 2i
708         privateKey[i * 2 + 1] = (in[i] >> 4) & 15; // shift 4 then and 15 to get upper 4 bits, stored in 2i+1
709     }
710     carry = 0;
711     /**
712      * change from 0 - 15 to -8 - 7, if privateKey[i] >= 8, carry = 1, privateKey[i] -= 16
713      * if privateKey[i] < 8, privateKey[i] = privateKey[i]
714      */
715     for (i = 0; i < 63; i++) { // 0 to 63
716         privateKey[i] += carry;
717         carry = (privateKey[i] + 8) >> 4; // plus 8 then shit 4 to get carry
718         privateKey[i] -= carry << 4;      // left shift 4
719     }
720     // never overflow since we set first bit to 0 of private key
721     privateKey[63] += carry; // last one is 63
722     // set base point X:Y:T:Z -> 0:1:0:1
723     SetExtendedBasePoint(out);
724     for (i = 1; i < 64; i += 2) { // form 1 to 63, process all odd element, increment by 2, i < 64
725         TableLookup(&preCompute, i / 2, (int8_t)privateKey[i]); // position goes from 0 to 31, i / 2 = pos
726         // Fit with paper: Twisted Edwards Curves Revisited
727         GeAdd(out, &preCompute);
728     }
729     // now we have P1, double it four times we have 16P1, P1 is in Extended now, we do double in projective coordinate
730     P1DoubleFourTimes(out);
731     // Add P0 with precomute
732     for (i = 0; i < 64; i += 2) { // form 0 to 62, process all even element, increment by 2, i < 64
733         TableLookup(&preCompute, i / 2, (int8_t)privateKey[i]); // position goes from 0 to 31, i / 2 = pos
734         GeAdd(out, &preCompute);
735     }
736     // clean up private key information
737     BSL_SAL_CleanseData(privateKey, sizeof(privateKey));
738 }
739 
740 #ifdef HITLS_CRYPTO_ED25519
PointEncoding(const GeE * point,uint8_t * output,uint32_t outputLen)741 void PointEncoding(const GeE *point, uint8_t *output, uint32_t outputLen)
742 {
743     Fp25 zInvert;
744     Fp25 x;
745     Fp25 y;
746     uint8_t xData[CRYPT_CURVE25519_KEYLEN];
747     /* x = X / Z, y = Y / Z */
748     (void)outputLen;
749     FpInvert(zInvert, point->z);
750     FpMul(x, point->x, zInvert);
751     FpMul(y, point->y, zInvert);
752     PolynomialToData(output, y);
753     PolynomialToData(xData, x);
754     // PointEcoding writes only 32 bytes data, therefore output[31] is the last one
755     output[31] ^= (xData[0] & 0x1) << 7; // last one is output[31], get only last bit then shift 7
756 }
757 #endif
758 
FeCmove(Fp25 dst,const Fp25 src,const uint32_t indicator)759 static void FeCmove(Fp25 dst, const Fp25 src, const uint32_t indicator)
760 {
761     // if indicator = 1, now it will be 111111111111b....
762     const uint32_t indicate = 0 - indicator;
763     /* des become source if dst->data[i] ^ src->data[i] is in 1111....b, or it does not change if
764     (dst->data[i] ^ src->data[i]) & indicate is all 0 */
765     dst[0] = CONDITION_COPY(dst[0], src[0], indicate); // 0
766     dst[1] = CONDITION_COPY(dst[1], src[1], indicate); // 1
767     dst[2] = CONDITION_COPY(dst[2], src[2], indicate); // 2
768     dst[3] = CONDITION_COPY(dst[3], src[3], indicate); // 3
769     dst[4] = CONDITION_COPY(dst[4], src[4], indicate); // 4
770     dst[5] = CONDITION_COPY(dst[5], src[5], indicate); // 5
771     dst[6] = CONDITION_COPY(dst[6], src[6], indicate); // 6
772     dst[7] = CONDITION_COPY(dst[7], src[7], indicate); // 7
773     dst[8] = CONDITION_COPY(dst[8], src[8], indicate); // 8
774     dst[9] = CONDITION_COPY(dst[9], src[9], indicate); // 9
775 }
776 
ConditionalMove(GePre * preCompute,const GePre * tableElement,uint32_t indicator)777 void ConditionalMove(GePre *preCompute, const GePre *tableElement, uint32_t indicator)
778 {
779     FeCmove(preCompute->yplusx, tableElement->yplusx, indicator);
780     FeCmove(preCompute->yminusx, tableElement->yminusx, indicator);
781     FeCmove(preCompute->xy2d, tableElement->xy2d, indicator);
782 }
783 
DataToPolynomial(Fp25 out,const uint8_t data[32])784 void DataToPolynomial(Fp25 out, const uint8_t data[32])
785 {
786     const uint8_t *t = data;
787     uint64_t p[10];
788     uint64_t over;
789     int32_t i;
790     uint64_t signMask;
791 
792     /* f0, load 32 bits */
793     CURVE25519_BYTES4_LOAD(p, t);
794     /* f1, load 24 bits from t4, shift bits: 26 - 24 - (8 - x) = 0 -> x = 6 */
795     CURVE25519_BYTES3_LOAD_PADDING(p + 1, 6, t + 4);
796     /* f2, load 24 bits from t7, shift bits: 51 - 48 - (8 - x) = 0 -> x = 5 */
797     CURVE25519_BYTES3_LOAD_PADDING(p + 2, 5, t + 7);
798     /* f3, load 24 bits from t10, shift bits: 77 - 72 - (8 - x) = 0 -> x = 3 */
799     CURVE25519_BYTES3_LOAD_PADDING(p + 3, 3, t + 10);
800     /* f4, load 24 bits from t13, shift bits: 102 - 96 - (8 - x) = 0 -> x = 2 */
801     CURVE25519_BYTES3_LOAD_PADDING(p + 4, 2, t + 13);
802     /* f5, load 32 bits from t16 */
803     CURVE25519_BYTES4_LOAD(p + 5, t + 16);
804     /* f6, load 24 bits from t20, shift bits: 153 - 152 - (8 - x) = 0 -> x = 7 */
805     CURVE25519_BYTES3_LOAD_PADDING(p + 6, 7, t + 20);
806     /* f7, load 24 bits from t23, shift bits: 179 - 176 - (8 - x) = 0 -> x = 5 */
807     CURVE25519_BYTES3_LOAD_PADDING(p + 7, 5, t + 23);
808     /* f8, load 24 bits from t26, shift bits: 204 - 200 - (8 - x) = 0 -> x = 4 */
809     CURVE25519_BYTES3_LOAD_PADDING(p + 8, 4, t + 26);
810     /* f9, load 24 bits from t29, shift bits: 230 - 224 - (8 - x) = 0 -> x = 2 */
811     CURVE25519_BYTES3_LOAD(p + 9, t + 29);
812     p[9] = (p[9] & 0x7fffff) << 2; /* p9 is 25 bits, left shift 2 */
813 
814     /* Limiting the number of bits, exchange 2^1 to 2^25.5, turn into polynomial representation */
815     /* f9->f0, shift 24 for carry */
816     over = p[9] + (1 << 24);
817     signMask = MASK_HIGH64(25) & (-((over) >> 63)); // shift 63 bits for sign, mask 25 bits
818     p[0] += ((over >> 25) | signMask) * 19; // 24 bits plus sign is 25, mul 19 for mod
819     p[9] -= MASK_HIGH64(39) & over; // 64 - 25 = 39 bits mask
820 
821     /* f1->f2, restricted to 24 bits */
822     PROCESS_CARRY(p[1], p[2], signMask, over, 24);
823     /* f3->f4, restricted to 24 bits */
824     PROCESS_CARRY(p[3], p[4], signMask, over, 24);
825     /* f5->f6, restricted to 24 bits */
826     PROCESS_CARRY(p[5], p[6], signMask, over, 24);
827     /* f7->f8, restricted to 24 bits */
828     PROCESS_CARRY(p[7], p[8], signMask, over, 24);
829 
830     /* f0->f1, restricted to 25 bits */
831     PROCESS_CARRY(p[0], p[1], signMask, over, 25);
832     /* f2->f3, restricted to 25 bits */
833     PROCESS_CARRY(p[2], p[3], signMask, over, 25);
834     /* f4->f5, restricted to 25 bits */
835     PROCESS_CARRY(p[4], p[5], signMask, over, 25);
836     /* f6->f7, restricted to 25 bits */
837     PROCESS_CARRY(p[6], p[7], signMask, over, 25);
838     /* f8->f9, restricted to 25 bits */
839     PROCESS_CARRY(p[8], p[9], signMask, over, 25);
840 
841     /* After process carry, polynomial every term would not exceed 32 bits, convert form 0 to 9, i < 10 */
842     for (i = 0; i < 10; i++) {
843         out[i] = (int32_t)p[i];
844     }
845 }
846 
847 #ifdef HITLS_CRYPTO_ED25519
CheckZero(Fp25 x)848 static bool CheckZero(Fp25 x)
849 {
850     uint8_t tmp[32];
851     const uint8_t zero[32] = {0};
852     PolynomialToData(tmp, x);
853     if (memcmp(tmp, zero, sizeof(zero)) == 0) {
854         return true;
855     } else {
856         return false;
857     }
858 }
859 
GetXBit(Fp25 in)860 static uint8_t GetXBit(Fp25 in)
861 {
862     uint8_t tmp[32];
863     PolynomialToData(tmp, in);
864 
865     return tmp[0] & 0x1;
866 }
867 
868 static const Fp25 SQRTM1 = {-32595792, -7943725, 9377950, 3500415, 12389472, -272473,
869     -25146209, -2005654, 326686, 11406482};
870 static const Fp25 D = {-10913610, 13857413, -15372611, 6949391, 114729, -8787816,
871     -6275908, -3247719, -18696448, -12055116};
872 
PointDecoding(GeE * point,const uint8_t in[CRYPT_CURVE25519_KEYLEN])873 int32_t PointDecoding(GeE *point, const uint8_t in[CRYPT_CURVE25519_KEYLEN])
874 {
875     Fp25 u, v, v3, x2, result;
876     // get the last block (31), shift 7 for first bit
877     uint8_t x0 = in[31] >> 7;
878     DataToPolynomial(point->y, in);
879 
880     CURVE25519_FP_SET(point->z, 1);
881     FpSquareDoubleCore(u, point->y, false);
882     FpMul(v, u, D);
883     CURVE25519_FP_SUB(u, u, point->z);
884     CURVE25519_FP_ADD(v, v, point->z);
885 
886     FpSquareDoubleCore(v3, v, false);
887     FpMul(v3, v3, v);
888     FpSquareDoubleCore(point->x, v3, false);
889     FpMul(point->x, point->x, v);
890     FpMul(point->x, point->x, u);
891 
892     /* x = x ^ ((q - 5) / 8) */
893     FpPowq58(point->x, point->x);
894 
895     FpMul(point->x, point->x, v3);
896     FpMul(point->x, point->x, u);
897     FpSquareDoubleCore(x2, point->x, false);
898     FpMul(x2, x2, v);
899     CURVE25519_FP_SUB(result, x2, u);
900 
901     if (CheckZero(result) == false) {
902         CURVE25519_FP_ADD(result, x2, u);
903         if (CheckZero(result) == false) {
904             return 1;
905         }
906         FpMul(point->x, point->x, SQRTM1);
907     }
908     uint8_t bit = GetXBit(point->x);
909     if (bit != x0) {
910         CURVE25519_FP_NEGATE(point->x, point->x);
911     }
912     FpMul(point->t, point->x, point->y);
913 
914     return 0;
915 }
916 
ScalarMulAddPreLoad(const uint8_t in[CRYPT_CURVE25519_KEYLEN],uint64_t out[UINT8_32_21BITS_BLOCKNUM])917 static void ScalarMulAddPreLoad(const uint8_t in[CRYPT_CURVE25519_KEYLEN],
918     uint64_t out[UINT8_32_21BITS_BLOCKNUM])
919 {
920     CURVE25519_BYTES3_LOAD(&out[0], in);
921     out[0] = out[0] & MASK_64_LOW21;
922 
923     CURVE25519_BYTES4_LOAD(&out[1], in + 2); // 1: load 4 bytes form position 2
924     out[1]  = MASK_64_LOW21 & (out[1] >> 5); // 1: 8 - ((3 * 8) mod 21) mod 8 = 5
925 
926     CURVE25519_BYTES3_LOAD(&out[2], in + 5); // 2: load 3 bytes form position 5
927     out[2]  = MASK_64_LOW21 & (out[2] >> 2); // 2: 8 - ((6 * 8) mod 21) mod 8 = 2
928 
929     CURVE25519_BYTES4_LOAD(&out[3], in + 7); // 3: load 4 bytes form position 7
930     out[3]  = MASK_64_LOW21 & (out[3] >> 7); // 3: 8 - ((8 * 8) mod 21) mod 8 = 7
931 
932     CURVE25519_BYTES4_LOAD(&out[4], in + 10); // 4: load 4 bytes form position 10
933     out[4]  = MASK_64_LOW21 & (out[4] >> 4); // 4: 8 - ((11 * 8) mod 21) mod 8 = 4
934 
935     CURVE25519_BYTES3_LOAD(&out[5], in + 13); // 5: load 3 bytes form position 13
936     out[5]  = MASK_64_LOW21 & (out[5] >> 1); // 5: 8 - ((14 * 8) mod 21) mod 8 = 1
937 
938     CURVE25519_BYTES4_LOAD(&out[6], in + 15); // 6: load 4 bytes form position 15
939     out[6]  = MASK_64_LOW21 & (out[6] >> 6); // 6: 8 - ((16 * 8) mod 21) mod 8 = 6
940 
941     CURVE25519_BYTES3_LOAD(&out[7], in + 18); // 7: load 3 bytes form position 18
942     out[7]  = MASK_64_LOW21 & (out[7] >> 3); // 7: 8 - ((19 * 8) mod 21) mod 8 = 3
943 
944     CURVE25519_BYTES3_LOAD(&out[8], in + 21); // 8: load 3 bytes form position 21
945     out[8]  = MASK_64_LOW21 & out[8]; // 8: ((22 * 8) mod 21) mod 8 = 0
946 
947     CURVE25519_BYTES4_LOAD(&out[9], in + 23); // 9: load 4 bytes form position 23
948     out[9]  = MASK_64_LOW21 & (out[9] >> 5); // 9: 8 - ((24 * 8) mod 21) mod 8 = 5
949 
950     CURVE25519_BYTES3_LOAD(&out[10], in + 26); // 10: load 3 bytes form position 26
951     out[10] = MASK_64_LOW21 & (out[10] >> 2); // 10: 8 - ((27 * 8) mod 21) mod 8 = 2
952 
953     CURVE25519_BYTES4_LOAD(&out[11], in + 28); // 11: load 4 bytes form position 28
954     out[11] = (out[11] >> 7); // 11: 8 - ((29 * 8) mod 21) mod 8 = 7
955 }
956 
ModuloLPreLoad(const uint8_t s[CRYPT_CURVE25519_SIGNLEN],uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM])957 static void ModuloLPreLoad(const uint8_t s[CRYPT_CURVE25519_SIGNLEN], uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM])
958 {
959     CURVE25519_BYTES3_LOAD(&s21Bits[0], s);
960     s21Bits[0] = s21Bits[0] & MASK_64_LOW21;
961 
962     CURVE25519_BYTES4_LOAD(&s21Bits[1], s + 2); // 1: load 4 bytes form position 2
963     s21Bits[1]  = MASK_64_LOW21 & (s21Bits[1] >> 5); // 1: 8 - ((3 * 8) mod 21) mod 8 = 5
964 
965     CURVE25519_BYTES3_LOAD(&s21Bits[2], s + 5); // 2: load 3 bytes form position 5
966     s21Bits[2]  = MASK_64_LOW21 & (s21Bits[2] >> 2); // 2: 8 - ((6 * 8) mod 21) mod 8 = 2
967 
968     CURVE25519_BYTES4_LOAD(&s21Bits[3], s + 7); // 3: load 4 bytes form position 7
969     s21Bits[3]  = MASK_64_LOW21 & (s21Bits[3] >> 7); // 3: 8 - ((8 * 8) mod 21) mod 8 = 7
970 
971     CURVE25519_BYTES4_LOAD(&s21Bits[4], s + 10); // 4: load 4 bytes form position 10
972     s21Bits[4]  = MASK_64_LOW21 & (s21Bits[4] >> 4); // 4: 8 - ((11 * 8) mod 21) mod 8 = 4
973 
974     CURVE25519_BYTES3_LOAD(&s21Bits[5], s + 13); // 5: load 3 bytes form position 13
975     s21Bits[5]  = MASK_64_LOW21 & (s21Bits[5] >> 1); // 5: 8 - ((14 * 8) mod 21) mod 8 = 1
976 
977     CURVE25519_BYTES4_LOAD(&s21Bits[6], s + 15); // 6: load 4 bytes form position 15
978     s21Bits[6]  = MASK_64_LOW21 & (s21Bits[6] >> 6); // 6: 8 - ((16 * 8) mod 21) mod 8 = 6
979 
980     CURVE25519_BYTES3_LOAD(&s21Bits[7], s + 18); // 7: load 3 bytes form position 18
981     s21Bits[7]  = MASK_64_LOW21 & (s21Bits[7] >> 3); // 7: 8 - ((19 * 8) mod 21) mod 8 = 3
982 
983     CURVE25519_BYTES3_LOAD(&s21Bits[8], s + 21); // 8: load 3 bytes form position 21
984     s21Bits[8]  = MASK_64_LOW21 & s21Bits[8]; // 8: ((22 * 8) mod 21) mod 8 = 0
985 
986     CURVE25519_BYTES4_LOAD(&s21Bits[9], s + 23); // 9: load 4 bytes form position 23
987     s21Bits[9]  = MASK_64_LOW21 & (s21Bits[9] >> 5); // 9: 8 - ((24 * 8) mod 21) mod 8 = 5
988 
989     CURVE25519_BYTES3_LOAD(&s21Bits[10], s + 26); // 10: load 3 bytes form position 26
990     s21Bits[10] = MASK_64_LOW21 & (s21Bits[10] >> 2); // 10: 8 - ((27 * 8) mod 21) mod 8 = 2
991 
992     CURVE25519_BYTES4_LOAD(&s21Bits[11], s + 28); // 11: load 4 bytes form position 28
993     s21Bits[11] = MASK_64_LOW21 & (s21Bits[11] >> 7); // 11: 8 - ((29 * 8) mod 21) mod 8 = 7
994 
995     CURVE25519_BYTES4_LOAD(&s21Bits[12], s + 31); // 12: load 4 bytes form position 31
996     s21Bits[12] = MASK_64_LOW21 & (s21Bits[12] >> 4); // 12: 8 - ((32 * 8) mod 21) mod 8 = 4
997 
998     CURVE25519_BYTES3_LOAD(&s21Bits[13], s + 34); // 13: load 3 bytes form position 34
999     s21Bits[13] = MASK_64_LOW21 & (s21Bits[13] >> 1); // 13: 8 - ((35 * 8) mod 21) mod 8 = 1
1000 
1001     CURVE25519_BYTES4_LOAD(&s21Bits[14], s + 36); // 14: load 4 bytes form position 36
1002     s21Bits[14] = MASK_64_LOW21 & (s21Bits[14] >> 6); // 14: 8 - ((37 * 8) mod 21) mod 8 = 6
1003 
1004     CURVE25519_BYTES3_LOAD(&s21Bits[15], s + 39); // 15: load 3 bytes form position 39
1005     s21Bits[15] = MASK_64_LOW21 & (s21Bits[15] >> 3); // 15: 8 - ((40 * 8) mod 21) mod 8 = 3
1006 
1007     CURVE25519_BYTES3_LOAD(&s21Bits[16], s + 42); // 16: load 3 bytes form position 42
1008     s21Bits[16] = MASK_64_LOW21 & s21Bits[16]; // 16: ((43 * 8) mod 21) mod 8 = 0
1009 
1010     CURVE25519_BYTES4_LOAD(&s21Bits[17], s + 44); // 17: load 4 bytes form position 44
1011     s21Bits[17] = MASK_64_LOW21 & (s21Bits[17] >> 5); // 17: 8 - ((45 * 8) mod 21) mod 8 = 5
1012 
1013     CURVE25519_BYTES3_LOAD(&s21Bits[18], s + 47); // 18: load 3 bytes form position 47
1014     s21Bits[18] = MASK_64_LOW21 & (s21Bits[18] >> 2); // 18: 8 - ((48 * 8) mod 21) mod 8 = 2
1015 
1016     CURVE25519_BYTES4_LOAD(&s21Bits[19], s + 49); // 19: load 4 bytes form position 49
1017     s21Bits[19] = MASK_64_LOW21 & (s21Bits[19] >> 7); // 19: 8 - ((50 * 8) mod 21) mod 8 = 7
1018 
1019     CURVE25519_BYTES4_LOAD(&s21Bits[20], s + 52); // 20: load 4 bytes form position 52
1020     s21Bits[20] = MASK_64_LOW21 & (s21Bits[20] >> 4); // 20: 8 - ((53 * 8) mod 21) mod 8 = 4
1021 
1022     CURVE25519_BYTES3_LOAD(&s21Bits[21], s + 55); // 21: load 3 bytes form position 55
1023     s21Bits[21] = MASK_64_LOW21 & (s21Bits[21] >> 1); // 21: 8 - ((56 * 8) mod 21) mod 8 = 1
1024 
1025     CURVE25519_BYTES4_LOAD(&s21Bits[22], s + 57); // 22: load 4 bytes form position 57
1026     s21Bits[22] = MASK_64_LOW21 & (s21Bits[22] >> 6); // 22: 8 - ((58 * 8) mod 21) mod 8 = 6
1027 
1028     CURVE25519_BYTES4_LOAD(&s21Bits[23], s + 60); // 23: load 4 bytes form position 60
1029     s21Bits[23] = s21Bits[23] >> 3;  // 23: 8 - ((61 * 8) mod 21) mod 8 = 3
1030 }
1031 
UnloadTo8Bits(uint8_t s8Bits[CRYPT_CURVE25519_OPTLEN],uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM])1032 static void UnloadTo8Bits(uint8_t s8Bits[CRYPT_CURVE25519_OPTLEN], uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM])
1033 {
1034     s8Bits[0] = (uint8_t)s21Bits[0];
1035 
1036     // 1: load from 8 on block 0
1037     s8Bits[1] = (uint8_t)(s21Bits[0] >> 8);
1038 
1039     // 2: load from (16 + 1) to 21 on block 0 and 1 to 3 on block 1, 8 - 3 = 5
1040     s8Bits[2] = (uint8_t)((s21Bits[0] >> 16) | (s21Bits[1] << 5));
1041 
1042     // 3: load from (3 + 1) on block 1
1043     s8Bits[3] = (uint8_t)(s21Bits[1] >> 3);
1044 
1045     // 4: load from (11 + 1) on block 1
1046     s8Bits[4] = (uint8_t)(s21Bits[1] >> 11);
1047 
1048     // 5: load from (19 + 1) to 21 on block 1 and 1 to 6 on block 2, 8 - 6 = 2
1049     s8Bits[5] = (uint8_t)((s21Bits[1] >> 19) | (s21Bits[2] << 2));
1050 
1051     // 6: load from (6 + 1) on block 2
1052     s8Bits[6] = (uint8_t)(s21Bits[2] >> 6);
1053 
1054     // 7: load from (14 + 1) to 21 on block 2 and 1 on block 3, 8 - 7 = 1
1055     s8Bits[7] = (uint8_t)((s21Bits[2] >> 14) | (s21Bits[3] << 7));
1056 
1057     // 8: load from (1 + 1) on block 3
1058     s8Bits[8] = (uint8_t)(s21Bits[3] >> 1);
1059 
1060     // 9: load from (9 + 1) on block 3
1061     s8Bits[9] = (uint8_t)(s21Bits[3] >> 9);
1062 
1063     // 10: load from (17 + 1) to 21 on block 3 and 1 to 4 on block 4, 8 - 4 = 4
1064     s8Bits[10] = (uint8_t)((s21Bits[3] >> 17) | (s21Bits[4] << 4));
1065 
1066     // 11: load from (4 + 1) on block 4
1067     s8Bits[11] = (uint8_t)(s21Bits[4] >> 4);
1068 
1069     // 12: load from (12 + 1) on block 4
1070     s8Bits[12] = (uint8_t)(s21Bits[4] >> 12);
1071 
1072     // 13: load from (20 + 1) on block 4 and 1 to 7 on block 5, 8 - 7 = 1
1073     s8Bits[13] = (uint8_t)((s21Bits[4] >> 20) | (s21Bits[5] << 1));
1074 
1075     // 14: load from (7 + 1) on block 5
1076     s8Bits[14] = (uint8_t)(s21Bits[5] >> 7);
1077 
1078     // 15: load from (15 + 1) to 21 on block 5 and 1 to 2 on block 6, 8 - 2 = 6
1079     s8Bits[15] = (uint8_t)((s21Bits[5] >> 15) | (s21Bits[6] << 6));
1080 
1081     // 16: load from (2 + 1) on block 6
1082     s8Bits[16] = (uint8_t)(s21Bits[6] >> 2);
1083 
1084     // 17: load from (10 + 1) on block 6
1085     s8Bits[17] = (uint8_t)(s21Bits[6] >> 10);
1086 
1087     // 18: load from (18 + 1) to 21 on block 6 and 1 to 5 on block 7, 8 - 5 = 3
1088     s8Bits[18] = (uint8_t)((s21Bits[6] >> 18) | (s21Bits[7] << 3));
1089 
1090     // 19: load from (5 + 1) on block 7
1091     s8Bits[19] = (uint8_t)(s21Bits[7] >> 5);
1092 
1093     // 20: load from (13 + 1) on block 7
1094     s8Bits[20] = (uint8_t)(s21Bits[7] >> 13);
1095 
1096     // 21: load 8bits on block 8
1097     s8Bits[21] = (uint8_t)s21Bits[8];
1098 
1099     // 22: load from (8 + 1) on block 8
1100     s8Bits[22] = (uint8_t)(s21Bits[8] >> 8);
1101 
1102     // 23: load from (16 + 1) to 21 on block 8 and 1 to 3 on block 9, 8 - 3 = 5
1103     s8Bits[23] = (uint8_t)((s21Bits[8] >> 16) | (s21Bits[9] << 5));
1104 
1105     // 24: load from (3 + 1) on block 9
1106     s8Bits[24] = (uint8_t)(s21Bits[9] >> 3);
1107 
1108     // 25: load from (11 + 1) on block 9
1109     s8Bits[25] = (uint8_t)(s21Bits[9] >> 11);
1110 
1111     // 26: load from (19 + 1) to 21 on block 9 and 1 to 6 on block 10, 8 - 6 = 2
1112     s8Bits[26] = (uint8_t)((s21Bits[9] >> 19) | (s21Bits[10] <<  2));
1113 
1114     // 27: load from (6 + 1) on block 10
1115     s8Bits[27] = (uint8_t)(s21Bits[10] >> 6);
1116 
1117     // 28: load from (14 + 1) to 21 on block 10 and 1 on block 11, 8 - 7 = 1
1118     s8Bits[28] = (uint8_t)((s21Bits[10] >> 14) | (s21Bits[11] << 7));
1119 
1120     // 29: load from (1 + 1) on block 11
1121     s8Bits[29] = (uint8_t)(s21Bits[11] >> 1);
1122 
1123     // 30: load from (9 + 1) on block 11
1124     s8Bits[30] = (uint8_t)(s21Bits[11] >> 9);
1125 
1126     // 31: load from (17 + 1) on block 11
1127     s8Bits[31] = (uint8_t)(s21Bits[11] >> 17);
1128 }
1129 
ModuloLCore(uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM])1130 static void ModuloLCore(uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM])
1131 {
1132     int32_t i;
1133     uint64_t signMask1, signMask2;
1134     uint64_t carry1, carry2;
1135 
1136     // multiply by l0, start with {11, 12, 13, 14, 15, 16} to {6, 7, 8, 9, 10, 11, 12}
1137     CURVE25519_MULTI_BY_L0(s21Bits, 11);
1138     CURVE25519_MULTI_BY_L0(s21Bits, 10);
1139     CURVE25519_MULTI_BY_L0(s21Bits, 9);
1140     CURVE25519_MULTI_BY_L0(s21Bits, 8);
1141     CURVE25519_MULTI_BY_L0(s21Bits, 7);
1142     CURVE25519_MULTI_BY_L0(s21Bits, 6);
1143 
1144     // need to process carry to prevent overflow, process carry from 6->7, 8->9 ... 16->17, increment by 2
1145     for (i = 6; i <= 16; i += 2) {
1146         // 21 bits minus sign is 20 bits
1147         PROCESS_CARRY(s21Bits[i], s21Bits[i + 1], signMask1, carry1, 20);
1148     }
1149 
1150     // process carry from 7->8, 9->10 ... 15->16, increment by 2
1151     for (i = 7; i <= 15; i += 2) {
1152         // 21 bits minus sign bit is 20 bits
1153         PROCESS_CARRY(s21Bits[i], s21Bits[i + 1], signMask2, carry2, 20);
1154     }
1155 
1156     // {5, 6, 7, 8, 9, 10} to {0, 1, 2, 3, 4, 5, 6}
1157     CURVE25519_MULTI_BY_L0(s21Bits, 5);
1158     CURVE25519_MULTI_BY_L0(s21Bits, 4);
1159     CURVE25519_MULTI_BY_L0(s21Bits, 3);
1160     CURVE25519_MULTI_BY_L0(s21Bits, 2);
1161     CURVE25519_MULTI_BY_L0(s21Bits, 1);
1162     CURVE25519_MULTI_BY_L0(s21Bits, 0);
1163 
1164     // process carry again, from 0->1, 2->3 ... 10->11, increment by 2
1165     for (i = 0; i <= 10; i += 2) {
1166         // 21 bits minus sign bit is 20 bits
1167         PROCESS_CARRY(s21Bits[i], s21Bits[i + 1], signMask1, carry1, 20);
1168     }
1169 
1170     // from 1->2, 3->4 ... 11->12, increment by 2
1171     for (i = 1; i <= 11; i += 2) {
1172         // 21 bits minus sign is 20 bits
1173         PROCESS_CARRY(s21Bits[i], s21Bits[i + 1], signMask2, carry2, 20);
1174     }
1175 
1176     CURVE25519_MULTI_BY_L0(s21Bits, 0);
1177 
1178     // process carry from 0 to 11
1179     for (i = 0; i <= 11; i++) {
1180         PROCESS_CARRY_UNSIGN(s21Bits[i], s21Bits[i + 1], signMask1, carry1, 21); // s21Bits is 21 bits
1181     }
1182 
1183     CURVE25519_MULTI_BY_L0(s21Bits, 0);
1184 
1185     // from 0 to 10
1186     for (i = 0; i <= 10; i++) {
1187         PROCESS_CARRY_UNSIGN(s21Bits[i], s21Bits[i + 1], signMask1, carry1, 21); // s21Bits is 21 bits
1188     }
1189 }
1190 
ModuloL(uint8_t s[CRYPT_CURVE25519_SIGNLEN])1191 void ModuloL(uint8_t s[CRYPT_CURVE25519_SIGNLEN])
1192 {
1193     // 24 of 21 bits block
1194     uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM] = {0};
1195 
1196     ModuloLPreLoad(s, s21Bits);
1197 
1198     ModuloLCore(s21Bits);
1199 
1200     UnloadTo8Bits(s, s21Bits);
1201 }
1202 
MulAdd(uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM],const uint64_t a21Bits[UINT8_32_21BITS_BLOCKNUM],const uint64_t b21Bits[UINT8_32_21BITS_BLOCKNUM],const uint64_t c21Bits[UINT8_32_21BITS_BLOCKNUM])1203 static void MulAdd(uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM], const uint64_t a21Bits[UINT8_32_21BITS_BLOCKNUM],
1204     const uint64_t b21Bits[UINT8_32_21BITS_BLOCKNUM], const uint64_t c21Bits[UINT8_32_21BITS_BLOCKNUM])
1205 {
1206     // s0 = c0 + a0b0
1207     s21Bits[0] = c21Bits[0] + a21Bits[0] * b21Bits[0];
1208 
1209     // s1 = c1 + a0b1 + a1b0
1210     s21Bits[1] = c21Bits[1] + a21Bits[0] * b21Bits[1] + a21Bits[1] * b21Bits[0];
1211 
1212     // s2 = c2 + a0b2 + b1a1 + a2b0
1213     s21Bits[2] = c21Bits[2] + a21Bits[0] * b21Bits[2] + a21Bits[1] * b21Bits[1] + a21Bits[2] * b21Bits[0];
1214 
1215     // s3 = c3 + a0b3 + a1b2 + a2b1 + a3b0
1216     s21Bits[3] = c21Bits[3] + a21Bits[0] * b21Bits[3] + a21Bits[1] * b21Bits[2] +
1217                  a21Bits[2] * b21Bits[1] + a21Bits[3] * b21Bits[0]; // a2b1 + a3b0
1218 
1219     // s4 = c4 + a0b4 +a1b3 + a2b2 + a3b1 + a4b0
1220     s21Bits[4] = c21Bits[4] + a21Bits[0] * b21Bits[4] + a21Bits[1] * b21Bits[3] +
1221                  a21Bits[2] * b21Bits[2] + a21Bits[3] * b21Bits[1] + a21Bits[4] * b21Bits[0]; // a2b2 + a3b1 + a4b0
1222 
1223     // s5 = c5 + a0b5 + a1b4 + a2b3 + a3b2 + a4b1 + a5b0
1224     s21Bits[5] = c21Bits[5] + a21Bits[0] * b21Bits[5] + a21Bits[1] * b21Bits[4] + a21Bits[2] * b21Bits[3] +
1225                  a21Bits[3] * b21Bits[2] + a21Bits[4] * b21Bits[1] + a21Bits[5] * b21Bits[0]; // a3b2 + a4b1 + a5b0
1226 
1227     // s6 = c6 + a0b6 + a1b5 + a2b4 + a3b3 + a2b4 + a5b1 + a6b0
1228     s21Bits[6] = c21Bits[6] + a21Bits[0] * b21Bits[6] + a21Bits[1] * b21Bits[5] + a21Bits[2] * b21Bits[4] +
1229                  a21Bits[3] * b21Bits[3] + a21Bits[4] * b21Bits[2] + a21Bits[5] * b21Bits[1] + // a3b3 + a2b4 + a5b1
1230                  a21Bits[6] * b21Bits[0]; // a6b0
1231 
1232     // s7 = c7 + a0b7 + a1b6 + a2b5 + a3b4 + a4b3 + a5b2 + a6b1 + a7b0
1233     s21Bits[7] = c21Bits[7] + a21Bits[0] * b21Bits[7] + a21Bits[1] * b21Bits[6] + a21Bits[2] * b21Bits[5] +
1234                  a21Bits[3] * b21Bits[4] + a21Bits[4] * b21Bits[3] + a21Bits[5] * b21Bits[2] + // a3b4 + a4b3 + a5b2
1235                  a21Bits[6] * b21Bits[1] + a21Bits[7] * b21Bits[0]; // a6b1 + a7b0
1236 
1237     // s8 = c8 + a0b8 + a1b7 + a2b6 + a3b5 + a4b4 + a5b3 + a6b2 + a7b1 + a8b0
1238     s21Bits[8] = c21Bits[8] + a21Bits[0] * b21Bits[8] + a21Bits[1] * b21Bits[7] + a21Bits[2] * b21Bits[6] +
1239                  a21Bits[3] * b21Bits[5] + a21Bits[4] * b21Bits[4] + a21Bits[5] * b21Bits[3] + // a3b5 + a4b4 + a5b3
1240                  a21Bits[6] * b21Bits[2] + a21Bits[7] * b21Bits[1] + a21Bits[8] * b21Bits[0]; // a6b2 + a7b1 + a8b0
1241 
1242     // s9 = c9 + a0b9 + a1b8 + a2b7 + a3b6 + a4b5 + a5b4 + a6b3 + a7b2 + a8b1 + a9b0
1243     s21Bits[9] = c21Bits[9] + a21Bits[0] * b21Bits[9] + a21Bits[1] * b21Bits[8] + a21Bits[2] * b21Bits[7] +
1244                  a21Bits[3] * b21Bits[6] + a21Bits[4] * b21Bits[5] + a21Bits[5] * b21Bits[4] + // a3b6 + a4b5 + a5b4
1245                  a21Bits[6] * b21Bits[3] + a21Bits[7] * b21Bits[2] + a21Bits[8] * b21Bits[1] + // a6b3 + a7b2 + a8b1
1246                  a21Bits[9] * b21Bits[0]; // a9b0
1247 
1248     // s10 = c10 + a0b10 + a1b9 + a2b8 + a3b7 + a4b6 + a5b5 + a6b4 + a7b3 + a8b2 + a9b1 + a10b0
1249     s21Bits[10] = c21Bits[10] + a21Bits[0] * b21Bits[10] + a21Bits[1] * b21Bits[9] + a21Bits[2] * b21Bits[8] +
1250                   a21Bits[3] * b21Bits[7] + a21Bits[4] * b21Bits[6] + a21Bits[5] * b21Bits[5] + // a3b7 + a4b6 + a5b5
1251                   a21Bits[6] * b21Bits[4] + a21Bits[7] * b21Bits[3] + a21Bits[8] * b21Bits[2] + // a6b4 + a7b3 + a8b2
1252                   a21Bits[9] * b21Bits[1] + a21Bits[10] * b21Bits[0]; // a9b1 + a10b0
1253 
1254     // s11 = c11 + a0b11 + a1b10 + a2b9 + a3b8 + a4b7 + a5b6 + a6b5 + a7b4 + a8b3 + a9b2 + a10b1 + a11b0
1255     s21Bits[11] = c21Bits[11] + a21Bits[0] * b21Bits[11] + a21Bits[1] * b21Bits[10] + a21Bits[2] * b21Bits[9] +
1256                   a21Bits[3] * b21Bits[8] + a21Bits[4] * b21Bits[7] + a21Bits[5] * b21Bits[6] + // a3b8 + a4b7 + a5b6
1257                   a21Bits[6] * b21Bits[5] + a21Bits[7] * b21Bits[4] + a21Bits[8] * b21Bits[3] + // a6b5 + a7b4 + a8b3
1258                   a21Bits[9] * b21Bits[2] + a21Bits[10] * b21Bits[1] + a21Bits[11] * b21Bits[0]; // a9b2 + a10b1 + a11b0
1259 
1260     // s12 = a1b11 + a2b10 + a3b9 + a4b8 + a5b7 + a6b6 + a7b5 + a8b4 + a9b3 + a10b2 + a11b1
1261     s21Bits[12] = a21Bits[1] * b21Bits[11] + a21Bits[2] * b21Bits[10] + a21Bits[3] * b21Bits[9] +
1262                   a21Bits[4] * b21Bits[8] + a21Bits[5] * b21Bits[7] + a21Bits[6] * b21Bits[6] + // a4b8 + a5b7 + a6b6
1263                   a21Bits[7] * b21Bits[5] + a21Bits[8] * b21Bits[4] + a21Bits[9] * b21Bits[3] + // a7b5 + a8b4 + a9b3
1264                   a21Bits[10] * b21Bits[2] + a21Bits[11] * b21Bits[1]; // a10b2 + a11b1
1265 
1266     // s13 = a2b11 + a3b10 + a4b9 + a5b8 + a6b7 + a7b6 + a8b5 + a9b4 + a10b3 + a11b2
1267     s21Bits[13] = a21Bits[2] * b21Bits[11] + a21Bits[3] * b21Bits[10] + a21Bits[4] * b21Bits[9] +
1268                   a21Bits[5] * b21Bits[8] + a21Bits[6] * b21Bits[7] + a21Bits[7] * b21Bits[6] + // a5b8 + a6b7 + a7b6
1269                   a21Bits[8] * b21Bits[5] + a21Bits[9] * b21Bits[4] + a21Bits[10] * b21Bits[3] + // a8b5 + a9b4 + a10b3
1270                   a21Bits[11] * b21Bits[2]; // a11b2
1271 
1272     // s14 = a3b11 + a4b10 + a5b9 + a6b8 + a7b7 + a8b6 + a9b5 + a10b4 + a11b3
1273     s21Bits[14] = a21Bits[3] * b21Bits[11] + a21Bits[4] * b21Bits[10] + a21Bits[5] * b21Bits[9] +
1274                   a21Bits[6] * b21Bits[8] + a21Bits[7] * b21Bits[7] + a21Bits[8] * b21Bits[6] + // a6b8 + a7b7 + a8b6
1275                   a21Bits[9] * b21Bits[5] + a21Bits[10] * b21Bits[4] + a21Bits[11] * b21Bits[3]; // a9b5 + a10b4 + a11b3
1276 
1277     // s15 = a4b11 + a5b10 + a6b9 + a7b8 + a8b7 + a9b6 + a10b5 + a11b4
1278     s21Bits[15] = a21Bits[4] * b21Bits[11] + a21Bits[5] * b21Bits[10] + a21Bits[6] * b21Bits[9] +
1279                   a21Bits[7] * b21Bits[8] + a21Bits[8] * b21Bits[7] + a21Bits[9] * b21Bits[6] + // a7b8 + a8b7 + a9b6
1280                   a21Bits[10] * b21Bits[5] + a21Bits[11] * b21Bits[4]; // a10b5 + a11b4
1281 
1282     // s16 = a5b11 + a6b10 + a7b9 + a8b8 + a9b7 + a10b6 + a11b5
1283     s21Bits[16] = a21Bits[5] * b21Bits[11] + a21Bits[6] * b21Bits[10] + a21Bits[7] * b21Bits[9] +
1284                   a21Bits[8] * b21Bits[8] + a21Bits[9] * b21Bits[7] + a21Bits[10] * b21Bits[6] + // a8b8 + a9b7 + a10b6
1285                   a21Bits[11] * b21Bits[5]; // a11b5
1286 
1287     // s17 = a6b11 + a7b10 + a8b9 + a9b8 + a10b7 + a11b6
1288     s21Bits[17] = a21Bits[6] * b21Bits[11] + a21Bits[7] * b21Bits[10] + a21Bits[8] * b21Bits[9] +
1289                   a21Bits[9] * b21Bits[8] + a21Bits[10] * b21Bits[7] + a21Bits[11] * b21Bits[6]; // a9b8 + a10b7 + a11b6
1290 
1291     // s18 = a7b11 + a8b10 + a9b9 + a10b8 + a11b7
1292     s21Bits[18] = a21Bits[7] * b21Bits[11] + a21Bits[8] * b21Bits[10] + a21Bits[9] * b21Bits[9] +
1293                   a21Bits[10] * b21Bits[8] + a21Bits[11] * b21Bits[7]; // a10b8 + a11b7
1294 
1295     // s19 = a8b11 + a9b10 + a10b9 + a11b8
1296     s21Bits[19] = a21Bits[8] * b21Bits[11] + a21Bits[9] * b21Bits[10] + a21Bits[10] * b21Bits[9] +
1297                   a21Bits[11] * b21Bits[8]; // a11b8
1298 
1299     // s20 = a9b11 + a10b10 + a11b9
1300     s21Bits[20] = a21Bits[9] * b21Bits[11] + a21Bits[10] * b21Bits[10] + a21Bits[11] * b21Bits[9];
1301 
1302     // s21 = a10b11 + a11b10
1303     s21Bits[21] = a21Bits[10] * b21Bits[11] + a21Bits[11] * b21Bits[10];
1304 
1305     // s22 = a11b11
1306     s21Bits[22] = a21Bits[11] * b21Bits[11];
1307 
1308     // s23 = 0
1309     s21Bits[23] = 0;
1310 }
1311 
ScalarMulAdd(uint8_t s[CRYPT_CURVE25519_KEYLEN],const uint8_t a[CRYPT_CURVE25519_KEYLEN],const uint8_t b[CRYPT_CURVE25519_KEYLEN],const uint8_t c[CRYPT_CURVE25519_KEYLEN])1312 void ScalarMulAdd(uint8_t s[CRYPT_CURVE25519_KEYLEN], const uint8_t a[CRYPT_CURVE25519_KEYLEN],
1313     const uint8_t b[CRYPT_CURVE25519_KEYLEN], const uint8_t c[CRYPT_CURVE25519_KEYLEN])
1314 {
1315     uint64_t a21Bits[UINT8_32_21BITS_BLOCKNUM];
1316     uint64_t b21Bits[UINT8_32_21BITS_BLOCKNUM];
1317     uint64_t c21Bits[UINT8_32_21BITS_BLOCKNUM];
1318 
1319     ScalarMulAddPreLoad(a, a21Bits);
1320     ScalarMulAddPreLoad(b, b21Bits);
1321     ScalarMulAddPreLoad(c, c21Bits);
1322 
1323     uint64_t s21Bits[UINT8_64_21BITS_BLOCKNUM];
1324 
1325     MulAdd(s21Bits, a21Bits, b21Bits, c21Bits);
1326 
1327     int32_t i;
1328     uint64_t signMask1, signMask2;
1329     uint64_t carryA, carryB;
1330 
1331     // process carry 0->1, 2->3 ... 22->23
1332     for (i = 0; i <= 22; i += 2) {
1333         // 21 bits minus sign bit is 20 bits
1334         PROCESS_CARRY(s21Bits[i], s21Bits[i + 1], signMask1, carryA, 20);
1335     }
1336 
1337     // process carry 1->2, 3->4 ... 21->22
1338     for (i = 1; i <= 21; i += 2) {
1339         // 21 bits minus sign bit is 20 bits
1340         PROCESS_CARRY(s21Bits[i], s21Bits[i + 1], signMask2, carryB, 20);
1341     }
1342 
1343     ModuloLCore(s21Bits);
1344 
1345     UnloadTo8Bits(s, s21Bits);
1346 }
1347 
1348 /* RFC8032, out = a + b */
PointAdd(GeE * out,GeE * greA,GeE * greB)1349 static void PointAdd(GeE *out, GeE *greA, GeE *greB)
1350 {
1351     const Fp25 d2 = {-21827239, -5839606, -30745221, 13898782, 229458,
1352         15978800, -12551817, -6495438, 29715968, 9444199};
1353     Fp25 a, b, c, d, e, f, g, h;
1354     CURVE25519_FP_SUB(e, greA->y, greA->x);
1355     CURVE25519_FP_SUB(f, greB->y, greB->x);
1356     CURVE25519_FP_ADD(g, greA->y, greA->x);
1357     CURVE25519_FP_ADD(h, greB->y, greB->x);
1358     FpMul(a, e, f);
1359     FpMul(b, g, h);
1360     FpMul(c, greA->t, greB->t);
1361     FpMul(c, c, d2);
1362     FpMul(d, greA->z, greB->z);
1363     CURVE25519_FP_ADD(d, d, d);
1364     CURVE25519_FP_SUB(e, b, a);
1365     CURVE25519_FP_SUB(f, d, c);
1366     CURVE25519_FP_ADD(g, d, c);
1367     CURVE25519_FP_ADD(h, b, a);
1368     FpMul(out->x, e, f);
1369     FpMul(out->y, g, h);
1370     FpMul(out->z, f, g);
1371     FpMul(out->t, e, h);
1372 }
1373 
PointAddPrecompute(GeE * out,GeE * greA,GeEPre * greB)1374 static void PointAddPrecompute(GeE *out, GeE *greA, GeEPre *greB)
1375 {
1376     Fp25 a, b, c, d, e, f, g, h;
1377     CURVE25519_FP_SUB(e, greA->y, greA->x);
1378     CURVE25519_FP_ADD(g, greA->y, greA->x);
1379     FpMul(a, e, greB->yminusx);
1380     FpMul(b, g, greB->yplusx);
1381     FpMul(c, greA->t, greB->t2z);
1382     FpMul(d, greA->z, greB->z);
1383     CURVE25519_FP_ADD(d, d, d);
1384     CURVE25519_FP_SUB(e, b, a);
1385     CURVE25519_FP_SUB(f, d, c);
1386     CURVE25519_FP_ADD(g, d, c);
1387     CURVE25519_FP_ADD(h, b, a);
1388     FpMul(out->x, e, f);
1389     FpMul(out->y, g, h);
1390     FpMul(out->z, f, g);
1391     FpMul(out->t, e, h);
1392 }
1393 
PointSubPrecompute(GeE * out,GeE * greA,GeEPre * greB)1394 static void PointSubPrecompute(GeE *out, GeE *greA, GeEPre *greB)
1395 {
1396     Fp25 a, b, c, d, e, f, g, h;
1397     CURVE25519_FP_SUB(e, greA->y, greA->x);
1398     CURVE25519_FP_ADD(g, greA->y, greA->x);
1399     FpMul(a, e, greB->yplusx);
1400     FpMul(b, g, greB->yminusx);
1401     FpMul(c, greA->t, greB->t2z);
1402     FpMul(d, greA->z, greB->z);
1403     CURVE25519_FP_ADD(d, d, d);
1404     CURVE25519_FP_SUB(e, b, a);
1405     CURVE25519_FP_ADD(f, d, c);
1406     CURVE25519_FP_SUB(g, d, c);
1407     CURVE25519_FP_ADD(h, b, a);
1408     FpMul(out->x, e, f);
1409     FpMul(out->y, g, h);
1410     FpMul(out->z, f, g);
1411     FpMul(out->t, e, h);
1412 }
1413 
P1DoubleN(GeE * p1,int32_t n)1414 static void P1DoubleN(GeE *p1, int32_t n)
1415 {
1416     GeP p;
1417     GeC c;
1418     int32_t i;
1419     // From extended coordinate to projective coordinate, just ignore T
1420     CURVE25519_FP_COPY(p.x, p1->x);
1421     CURVE25519_FP_COPY(p.y, p1->y);
1422     CURVE25519_FP_COPY(p.z, p1->z);
1423 
1424     ProjectiveDouble(&c, &p);
1425     for (i = 1; i < n; i++) {
1426         GeCompleteToProjective(&p, &c);
1427         ProjectiveDouble(&c, &p);
1428     }
1429 
1430     FpMul(p1->x, c.t, c.x);
1431     FpMul(p1->y, c.z, c.y);
1432     FpMul(p1->z, c.t, c.z);
1433     FpMul(p1->t, c.y, c.x);
1434 }
1435 
PointToPrecompute(GeEPre * out,GeE * in)1436 static void PointToPrecompute(GeEPre *out, GeE *in)
1437 {
1438     const Fp25 d2 = {-21827239, -5839606, -30745221, 13898782, 229458,
1439         15978800, -12551817, -6495438, 29715968, 9444199};
1440 
1441     CURVE25519_FP_ADD(out->yplusx, in->y, in->x);
1442     CURVE25519_FP_SUB(out->yminusx, in->y, in->x);
1443     CURVE25519_FP_COPY(out->z, in->z);
1444     FpMul(out->t2z, in->t, d2);
1445 }
1446 
FlipK(int8_t slide[256],uint32_t start)1447 static void FlipK(int8_t slide[256], uint32_t start)
1448 {
1449     uint32_t k;
1450     for (k = start; k < 256; k++) {
1451         if (slide[k] == 0) {
1452             slide[k] = 1;
1453             break;
1454         } else {
1455             slide[k] = 0;
1456         }
1457     }
1458 }
1459 
SlideReduce(int8_t * out,uint32_t outLen,const uint8_t * in,uint32_t inLen)1460 static void SlideReduce(int8_t *out, uint32_t outLen, const uint8_t *in, uint32_t inLen)
1461 {
1462     uint32_t i, j;
1463     int8_t tmp;
1464     (void)outLen;
1465     (void)inLen;
1466     // 32 * 8 = 256
1467     for (i = 0; i < 256; i++) {
1468         // turn 32 8bits to 256 1bit, block: in[i >> 3], bit: (i & 7)
1469         out[i] = (int8_t)((in[i >> 3] >> (i & 7)) & 1);
1470     }
1471     // 32 * 8 = 256
1472     for (i = 0; i < 256; i++) {
1473         if (out[i] == 0) {
1474             continue;
1475         }
1476         for (j = 1; j <= 6 && (i + j) < 256; j++) { // check next 6 since 2^6 - 2^5 = 16 > 15, 256 is array length
1477             if (out[i + j] == 0) {
1478                 continue;
1479             }
1480             // range 15 to -15
1481             tmp = (int8_t)((uint8_t)(out[i + j]) << j);
1482             if (out[i] + tmp <= 15) { // max 15,  0x1, 0x1, 0x1, 0x1 , 0 -> 0x1111, 0, 0, 0, 0
1483                 out[i] += tmp;
1484                 out[i + j] = 0;
1485             } else if (out[i] - tmp >= -15) { // min -15, 0x1111, 0, 0, 0, 1, 1 -> -1, 0, 0, 0, 0, 1
1486                 out[i] -= tmp;
1487                 FlipK(out, i + j);
1488             } else {
1489                 break;
1490             }
1491         }
1492     }
1493 }
1494 
1495 // Base on article "High-speed high-security signatures"
1496 // stores B, 3B, 5B, 7B, 9B, 11B, 13B, 15B, with B as ed25519 base point
1497 static const GePre g_precomputedB[8] = {
1498     {
1499         {25967493, -14356035, 29566456, 3660896, -12694345, 4014787, 27544626,
1500             -11754271, -6079156, 2047605},
1501         {-12545711, 934262, -2722910, 3049990, -727428, 9406986, 12720692,
1502             5043384, 19500929, -15469378},
1503         {-8738181, 4489570, 9688441, -14785194, 10184609, -12363380, 29287919,
1504             11864899, -24514362, -4438546},
1505     },
1506     {
1507         {15636291, -9688557, 24204773, -7912398, 616977, -16685262, 27787600,
1508             -14772189, 28944400, -1550024},
1509         {16568933, 4717097, -11556148, -1102322, 15682896, -11807043, 16354577,
1510             -11775962, 7689662, 11199574},
1511         {30464156, -5976125, -11779434, -15670865, 23220365, 15915852, 7512774,
1512             10017326, -17749093, -9920357},
1513     },
1514     {
1515         {10861363, 11473154, 27284546, 1981175, -30064349, 12577861, 32867885,
1516             14515107, -15438304, 10819380},
1517         {4708026, 6336745, 20377586, 9066809, -11272109, 6594696, -25653668,
1518             12483688, -12668491, 5581306},
1519         {19563160, 16186464, -29386857, 4097519, 10237984, -4348115, 28542350,
1520             13850243, -23678021, -15815942},
1521     },
1522     {
1523         {5153746, 9909285, 1723747, -2777874, 30523605, 5516873, 19480852,
1524             5230134, -23952439, -15175766},
1525         {-30269007, -3463509, 7665486, 10083793, 28475525, 1649722, 20654025,
1526             16520125, 30598449, 7715701},
1527         {28881845, 14381568, 9657904, 3680757, -20181635, 7843316, -31400660,
1528             1370708, 29794553, -1409300},
1529     },
1530     {
1531         {-22518993, -6692182, 14201702, -8745502, -23510406, 8844726, 18474211,
1532             -1361450, -13062696, 13821877},
1533         {-6455177, -7839871, 3374702, -4740862, -27098617, -10571707, 31655028,
1534             -7212327, 18853322, -14220951},
1535         {4566830, -12963868, -28974889, -12240689, -7602672, -2830569, -8514358,
1536             -10431137, 2207753, -3209784},
1537     },
1538     {
1539         {-25154831, -4185821, 29681144, 7868801, -6854661, -9423865, -12437364,
1540             -663000, -31111463, -16132436},
1541         {25576264, -2703214, 7349804, -11814844, 16472782, 9300885, 3844789,
1542             15725684, 171356, 6466918},
1543         {23103977, 13316479, 9739013, -16149481, 817875, -15038942, 8965339,
1544             -14088058, -30714912, 16193877},
1545     },
1546     {
1547         {-33521811, 3180713, -2394130, 14003687, -16903474, -16270840, 17238398,
1548             4729455, -18074513, 9256800},
1549         {-25182317, -4174131, 32336398, 5036987, -21236817, 11360617, 22616405,
1550             9761698, -19827198, 630305},
1551         {-13720693, 2639453, -24237460, -7406481, 9494427, -5774029, -6554551,
1552             -15960994, -2449256, -14291300},
1553     },
1554     {
1555         {-3151181, -5046075, 9282714, 6866145, -31907062, -863023, -18940575,
1556             15033784, 25105118, -7894876},
1557         {-24326370, 15950226, -31801215, -14592823, -11662737, -5090925,
1558             1573892, -2625887, 2198790, -15804619},
1559         {-3099351, 10324967, -2241613, 7453183, -5446979, -2735503, -13812022,
1560             -16236442, -32461234, -12290683},
1561     },
1562 };
1563 
1564 /* out = hash * p + s * B */
KAMulPlusMulBase(GeE * out,const uint8_t hash[CRYPT_CURVE25519_KEYLEN],const GeE * p,const uint8_t s[CRYPT_CURVE25519_KEYLEN])1565 void KAMulPlusMulBase(GeE *out, const uint8_t hash[CRYPT_CURVE25519_KEYLEN],
1566     const GeE *p, const uint8_t s[CRYPT_CURVE25519_KEYLEN])
1567 {
1568     SetExtendedBasePoint(out);
1569     GeE tmpP[8]; // stores p, 3p, 5p, 7p, 9p, 11p, 13p, 15p
1570     GeE doubleP;
1571     GeEPre preComputedP[8]; // stores p, 3p, 5p, 7p, 9p, 11p, 13p, 15p
1572     int8_t slideP[256];
1573     int8_t slideS[256];
1574     int32_t i;
1575 
1576     SlideReduce(slideP, 256, hash, CRYPT_CURVE25519_KEYLEN);
1577     SlideReduce(slideS, 256, s, CRYPT_CURVE25519_KEYLEN);
1578 
1579     CURVE25519_GE_COPY(tmpP[0], *p);
1580     CURVE25519_GE_COPY(doubleP, *p);
1581 
1582     PointToPrecompute(&preComputedP[0], &tmpP[0]);
1583     P1DoubleN(&doubleP, 1);
1584 
1585     for (i = 1; i < 8; i += 1) { // p, 3p, ....., 13p, 15p, total 8
1586         PointAdd(&tmpP[i], &tmpP[i - 1], &doubleP);
1587         PointToPrecompute(&preComputedP[i], &tmpP[i]);
1588     }
1589 
1590     int32_t zeroCount = 0;
1591     i = 255; // 255 to 0
1592     while (i >= 0 && slideP[i] == 0 && slideS[i] == 0) {
1593         i--;
1594     }
1595     for (; i >= 0; i--) {
1596         while (i >= 0 && slideP[i] == 0 && slideS[i] == 0) {
1597             zeroCount++;
1598             i--;
1599         }
1600         if (i < 0) {
1601             P1DoubleN(out, zeroCount);
1602             break;
1603         } else {
1604             P1DoubleN(out, zeroCount + 1);
1605         }
1606         zeroCount = 0;
1607         if (slideP[i] > 0) {
1608             PointAddPrecompute(out, out, &preComputedP[slideP[i] / 2]); // preComputedP[i] = (i * 2 + 1)P
1609         } else if (slideP[i] < 0) {
1610             PointSubPrecompute(out, out, &preComputedP[(-slideP[i]) / 2]); // preComputedP[i] = (i * 2 + 1)P
1611         }
1612         if (slideS[i] > 0) {
1613             GeAdd(out, &g_precomputedB[slideS[i] / 2]); // g_precomputedB[i] = (i * 2 + 1)P
1614         } else if (slideS[i] < 0) {
1615             GeSub(out, &g_precomputedB[(-slideS[i]) / 2]); // g_precomputedB[i] = (i * 2 + 1)P
1616         }
1617     }
1618 }
1619 #endif /* HITLS_CRYPTO_ED25519 */
1620 
1621 #endif /* HITLS_CRYPTO_CURVE25519 */
1622