1 /*
2 * This file is part of the openHiTLS project.
3 *
4 * openHiTLS is licensed under the Mulan PSL v2.
5 * You can use this software according to the terms and conditions of the Mulan PSL v2.
6 * You may obtain a copy of Mulan PSL v2 at:
7 *
8 * http://license.coscl.org.cn/MulanPSL2
9 *
10 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13 * See the Mulan PSL v2 for more details.
14 */
15
16 #include "hitls_build.h"
17 #ifdef HITLS_CRYPTO_X25519
18
19 #include "securec.h"
20 #include "curve25519_local.h"
21
// X25519 alternative implementation: faster, but requires a 128-bit integer type (__int128)
23 #if (defined(__SIZEOF_INT128__) && (__SIZEOF_INT128__ == 16))
24 #define CURVE25519_51BITS_MASK 0x7ffffffffffff
25 #define CURVE25519_51BITS 51
26
/*
 * Convert a 32-byte string into a field element held as five 51-bit limbs.
 *
 * out: receives the five limbs in out->data[0..4]
 * in:  32 input bytes
 *
 * Five byte windows are loaded at offsets 0, 7, 13, 20 and 26 and shifted so
 * that, after the spill-over of each limb is OR-ed into the next one, every
 * limb carries exactly 51 bits. The window at offset 26 is masked to 47 bits,
 * so the most significant bit of the 256-bit input is dropped.
 * NOTE(review): CURVE25519_BYTES6_LOAD / CURVE25519_BYTES7_LOAD are defined
 * elsewhere; presumably they read 6 / 7 bytes in little-endian order - confirm.
 */
static void Fp51DataToPoly(Fp51 *out, const uint8_t in[32])
{
    uint64_t limb[5];
    uint32_t i;

    CURVE25519_BYTES7_LOAD(limb, in);

    CURVE25519_BYTES6_LOAD(limb + 1, in + 7);
    limb[1] <<= 5; /* leave room for the 5 bits spilling over from limb[0] */

    CURVE25519_BYTES7_LOAD(limb + 2, in + 13);
    limb[2] <<= 2; /* leave room for the 2 bits spilling over from limb[1] */

    CURVE25519_BYTES6_LOAD(limb + 3, in + 20);
    limb[3] <<= 7; /* leave room for the 7 bits spilling over from limb[2] */

    CURVE25519_BYTES6_LOAD(limb + 4, in + 26);
    limb[4] &= 0x7fffffffffff; /* 47-bit mask: discards the top bit of the window */
    limb[4] <<= 4; /* leave room for the 4 bits spilling over from limb[3] */

    /* Move everything above bit 50 of a limb into its successor, lowest limb first. */
    for (i = 0; i < 4; i++) {
        limb[i + 1] |= limb[i] >> CURVE25519_51BITS;
        limb[i] &= CURVE25519_51BITS_MASK;
    }

    for (i = 0; i < 5; i++) {
        out->data[i] = limb[i];
    }
}
64
/*
 * Pack five limbs (radix 2^51) into 32 bytes, least significant byte first.
 *
 * out: receives the 32 packed bytes
 * h:   five limbs; h[0] is the least significant one
 *
 * Limb k starts at bit 51 * k of the output, so four output bytes (6, 12, 19
 * and 25) straddle two limbs and are assembled from both neighbours. The limbs
 * are not masked here: bits above position 50 of a limb would be OR-ed into
 * the straddling byte, so callers pass limbs that are already below 2^51.
 */
static void Fp51UnloadTo8Bits(uint8_t out[32], uint64_t h[5])
{
    uint32_t i;

    /* bytes 0..5: bits 0..47 of h[0] */
    for (i = 0; i < 6; i++) {
        out[i] = (uint8_t)(h[0] >> (8 * i));
    }
    /* byte 6: h[0] from bit 48 upward (3 bits of a 51-bit limb) + low 5 bits of h[1] */
    out[6] = (uint8_t)((h[0] >> 48) | (h[1] << 3));

    /* bytes 7..11: bits 5..44 of h[1] */
    for (i = 0; i < 5; i++) {
        out[7 + i] = (uint8_t)(h[1] >> (5 + 8 * i));
    }
    /* byte 12: h[1] from bit 45 upward (6 bits) + low 2 bits of h[2] */
    out[12] = (uint8_t)((h[1] >> 45) | (h[2] << 6));

    /* bytes 13..18: bits 2..49 of h[2] */
    for (i = 0; i < 6; i++) {
        out[13 + i] = (uint8_t)(h[2] >> (2 + 8 * i));
    }
    /* byte 19: h[2] from bit 50 upward (1 bit) + low 7 bits of h[3] */
    out[19] = (uint8_t)((h[2] >> 50) | (h[3] << 1));

    /* bytes 20..24: bits 7..46 of h[3] */
    for (i = 0; i < 5; i++) {
        out[20 + i] = (uint8_t)(h[3] >> (7 + 8 * i));
    }
    /* byte 25: h[3] from bit 47 upward (4 bits) + low 4 bits of h[4] */
    out[25] = (uint8_t)((h[3] >> 47) | (h[4] << 4));

    /* bytes 26..31: bits 4..51 of h[4] */
    for (i = 0; i < 6; i++) {
        out[26 + i] = (uint8_t)(h[4] >> (4 + 8 * i));
    }
}
105
Fp51PolyToData(const Fp51 * in,uint8_t out[32])106 static void Fp51PolyToData(const Fp51 *in, uint8_t out[32])
107 {
108 uint64_t h[5];
109 h[0] = in->data[0];
110 h[1] = in->data[1];
111 h[2] = in->data[2];
112 h[3] = in->data[3];
113 h[4] = in->data[4];
114 uint64_t carry;
115
116 carry = (h[0] + 19) >> CURVE25519_51BITS; // plus 19 then calculate carry
117 carry = (h[1] + carry) >> CURVE25519_51BITS;
118 carry = (h[2] + carry) >> CURVE25519_51BITS;
119 carry = (h[3] + carry) >> CURVE25519_51BITS;
120 carry = (h[4] + carry) >> CURVE25519_51BITS;
121
122 h[0] += 19 * carry; // process carry h[4] -> h[0], h[0] += 19 * carry
123 h[1] += h[0] >> CURVE25519_51BITS;
124 h[0] &= CURVE25519_51BITS_MASK;
125 h[2] += h[1] >> CURVE25519_51BITS;
126 h[1] &= CURVE25519_51BITS_MASK;
127 h[3] += h[2] >> CURVE25519_51BITS;
128 h[2] &= CURVE25519_51BITS_MASK;
129 h[4] += h[3] >> CURVE25519_51BITS;
130 h[3] &= CURVE25519_51BITS_MASK;
131 h[4] &= CURVE25519_51BITS_MASK;
132
133 Fp51UnloadTo8Bits(out, h);
134 }
135
Fp51ProcessCarry(__uint128_t in[5])136 void Fp51ProcessCarry(__uint128_t in[5])
137 {
138 in[1] += (uint64_t)(in[0] >> CURVE25519_51BITS);
139 in[0] = (uint64_t)in[0] & CURVE25519_51BITS_MASK;
140
141 in[2] += (uint64_t)(in[1] >> CURVE25519_51BITS);
142 in[1] = (uint64_t)in[1] & CURVE25519_51BITS_MASK;
143
144 in[3] += (uint64_t)(in[2] >> CURVE25519_51BITS);
145 in[2] = (uint64_t)in[2] & CURVE25519_51BITS_MASK;
146
147 in[4] += (uint64_t)(in[3] >> CURVE25519_51BITS);
148 in[3] = (uint64_t)in[3] & CURVE25519_51BITS_MASK;
149
150 in[0] += (uint64_t)(in[4] >> CURVE25519_51BITS) * 19;
151 in[4] = (uint64_t)in[4] & CURVE25519_51BITS_MASK;
152
153 in[1] += in[0] >> CURVE25519_51BITS;
154 in[0] &= CURVE25519_51BITS_MASK;
155 }
156
/*
 * Field multiplication: out = f * g, limbs in radix 2^51.
 *
 * out: receives the product; it is written only after all inputs have been
 *      read, so it may point to the same object as f or g
 * f:   first factor
 * g:   second factor
 *
 * Schoolbook multiplication of the two 5-limb polynomials. Partial products
 * whose limb indices sum to 5 or more wrap around and are weighted by 19.
 * NOTE(review): assumes the input limbs are small enough that no accumulator
 * overflows 128 bits - confirm the bound with the callers.
 */
void Fp51Mul(Fp51 *out, const Fp51 *f, const Fp51 *g)
{
    const uint64_t f0 = f->data[0];
    const uint64_t f1 = f->data[1];
    const uint64_t f2 = f->data[2];
    const uint64_t f3 = f->data[3];
    const uint64_t f4 = f->data[4];
    const uint64_t g0 = g->data[0];
    const uint64_t g1 = g->data[1];
    const uint64_t g2 = g->data[2];
    const uint64_t g3 = g->data[3];
    const uint64_t g4 = g->data[4];
    __uint128_t acc[5];
    uint32_t i;

    /* acc[0] = f0g0 + 19 * (f1g4 + f2g3 + f3g2 + f4g1) */
    acc[0] = (__uint128_t)f0 * g0 +
        19 * ((__uint128_t)f1 * g4 + (__uint128_t)f2 * g3 + (__uint128_t)f3 * g2 + (__uint128_t)f4 * g1);
    /* acc[1] = f0g1 + f1g0 + 19 * (f2g4 + f3g3 + f4g2) */
    acc[1] = (__uint128_t)f0 * g1 + (__uint128_t)f1 * g0 +
        19 * ((__uint128_t)f2 * g4 + (__uint128_t)f3 * g3 + (__uint128_t)f4 * g2);
    /* acc[2] = f0g2 + f1g1 + f2g0 + 19 * (f3g4 + f4g3) */
    acc[2] = (__uint128_t)f0 * g2 + (__uint128_t)f1 * g1 + (__uint128_t)f2 * g0 +
        19 * ((__uint128_t)f3 * g4 + (__uint128_t)f4 * g3);
    /* acc[3] = f0g3 + f1g2 + f2g1 + f3g0 + 19 * f4g4 */
    acc[3] = (__uint128_t)f0 * g3 + (__uint128_t)f1 * g2 + (__uint128_t)f2 * g1 + (__uint128_t)f3 * g0 +
        19 * ((__uint128_t)f4 * g4);
    /* acc[4] = f0g4 + f1g3 + f2g2 + f3g1 + f4g0 */
    acc[4] = (__uint128_t)f0 * g4 + (__uint128_t)f1 * g3 + (__uint128_t)f2 * g2 + (__uint128_t)f3 * g1 +
        (__uint128_t)f4 * g0;

    Fp51ProcessCarry(acc);

    for (i = 0; i < 5; i++) {
        out->data[i] = (uint64_t)acc[i];
    }
}
189
/*
 * Field squaring: out = in * in, limbs in radix 2^51.
 *
 * out: receives the square; it is written only after the input has been read,
 *      so it may point to the same object as in
 * in:  value to square
 *
 * Symmetric cross terms are computed once with a doubled limb, and terms that
 * wrap around the top limb use a limb pre-multiplied by 19. The doubling and
 * the multiplication by 19 are done in 64 bits.
 * NOTE(review): assumes the limbs are small enough for those 64-bit
 * pre-multiplications not to overflow - confirm the bound with the callers.
 */
void Fp51Square(Fp51 *out, const Fp51 *in)
{
    const uint64_t a0 = in->data[0];
    const uint64_t a1 = in->data[1];
    const uint64_t a2 = in->data[2];
    const uint64_t a3 = in->data[3];
    const uint64_t a4 = in->data[4];
    const uint64_t a0x2 = a0 * 2;
    const uint64_t a1x2 = a1 * 2;
    const uint64_t a2x2 = a2 * 2;
    const uint64_t a3x2 = a3 * 2;
    const uint64_t a3x19 = a3 * 19;
    const uint64_t a4x19 = a4 * 19;
    __uint128_t acc[5];
    uint32_t i;

    /* acc[0] = a0^2 + 38 * a1 * a4 + 38 * a2 * a3 */
    acc[0] = (__uint128_t)a0 * a0 + (__uint128_t)a1x2 * a4x19 + (__uint128_t)a2x2 * a3x19;
    /* acc[1] = 2 * a0 * a1 + 19 * a3^2 + 38 * a2 * a4 */
    acc[1] = (__uint128_t)a0x2 * a1 + (__uint128_t)a3 * a3x19 + (__uint128_t)a2x2 * a4x19;
    /* acc[2] = 2 * a0 * a2 + a1^2 + 38 * a3 * a4 */
    acc[2] = (__uint128_t)a0x2 * a2 + (__uint128_t)a1 * a1 + (__uint128_t)a3x2 * a4x19;
    /* acc[3] = 2 * a0 * a3 + 19 * a4^2 + 2 * a1 * a2 */
    acc[3] = (__uint128_t)a0x2 * a3 + (__uint128_t)a4 * a4x19 + (__uint128_t)a1x2 * a2;
    /* acc[4] = 2 * a0 * a4 + 2 * a1 * a3 + a2^2 */
    acc[4] = (__uint128_t)a0x2 * a4 + (__uint128_t)a1x2 * a3 + (__uint128_t)a2 * a2;

    Fp51ProcessCarry(acc);

    for (i = 0; i < 5; i++) {
        out->data[i] = (uint64_t)acc[i];
    }
}
223
/*
 * Multiply a field element by a small constant: out = in * scalar.
 *
 * out:    receives the product; may point to the same object as in
 * in:     field element, five limbs in radix 2^51
 * scalar: 32-bit multiplier
 *
 * Every limb is multiplied in 128-bit precision and the result is then
 * carry-reduced by Fp51ProcessCarry.
 */
void Fp51MulScalar(Fp51 *out, const Fp51 *in, const uint32_t scalar)
{
    __uint128_t acc[5];
    uint32_t i;

    for (i = 0; i < 5; i++) {
        acc[i] = in->data[i] * (__uint128_t)scalar;
    }

    Fp51ProcessCarry(acc);

    for (i = 0; i < 5; i++) {
        out->data[i] = (uint64_t)acc[i];
    }
}
241
242 /* out = in1 ^ (4 * 2 ^ (2 * times)) * in2 */
Fp51MultiSquare(Fp51 * in1,Fp51 * in2,Fp51 * out,int32_t times)243 static inline void Fp51MultiSquare(Fp51 *in1, Fp51 *in2, Fp51 *out, int32_t times)
244 {
245 int32_t i;
246 Fp51 temp1, temp2;
247 Fp51Square(&temp1, in1);
248 Fp51Square(&temp2, &temp1);
249 for (i = 0; i < times; i++) {
250 Fp51Square(&temp1, &temp2);
251 Fp51Square(&temp2, &temp1);
252 }
253 Fp51Mul(out, in2, &temp2);
254 }
255
256 /* out = a ^ -1 */
Fp51Invert(Fp51 * out,const Fp51 * a)257 static void Fp51Invert(Fp51 *out, const Fp51 *a)
258 {
259 Fp51 a0; /* save a^1 */
260 Fp51 a1; /* save a^2 */
261 Fp51 a2; /* save a^11 */
262 Fp51 a3; /* save a^(2^5-1) */
263 Fp51 a4; /* save a^(2^10-1) */
264 Fp51 a5; /* save a^(2^20-1) */
265 Fp51 a6; /* save a^(2^40-1) */
266 Fp51 a7; /* save a^(2^50-1) */
267 Fp51 a8; /* save a^(2^100-1) */
268 Fp51 a9; /* save a^(2^200-1) */
269 Fp51 a10; /* save a^(2^250-1) */
270 Fp51 temp1, temp2;
271
272 /* We know a×b=1(mod p), then a and b are inverses of mod p, i.e. a=b^(-1), b=a^(-1);
273 * According to Fermat's little theorem a^(p-1)=1(mod p), so a*a^(p-2)=1(mod p);
274 * So the inverse element of a is a^(-1) = a^(p-2)(mod p)
275 * Here it is, p=2^255-19, thus we need to compute a^(2^255-21)(mod(2^255-19))
276 */
277
278 /* a^1 */
279 CURVE25519_FP51_COPY(a0.data, a->data);
280
281 /* a^2 */
282 Fp51Square(&a1, &a0);
283
284 /* a^4 */
285 Fp51Square(&temp1, &a1);
286
287 /* a^8 */
288 Fp51Square(&temp2, &temp1);
289
290 /* a^9 */
291 Fp51Mul(&temp1, &a0, &temp2);
292
293 /* a^11 */
294 Fp51Mul(&a2, &a1, &temp1);
295
296 /* a^22 */
297 Fp51Square(&temp2, &a2);
298
299 /* a^(2^5-1) = a^(9+22) */
300 Fp51Mul(&a3, &temp1, &temp2);
301
302 /* a^(2^10-1) = a^(2^10-2^5) * a^(2^5-1) */
303 Fp51Square(&temp1, &a3);
304 Fp51Square(&temp2, &temp1);
305 Fp51Square(&temp1, &temp2);
306 Fp51Square(&temp2, &temp1);
307 Fp51Square(&temp1, &temp2);
308 Fp51Mul(&a4, &a3, &temp1);
309
310 /* a^(2^20-1) = a^(2^20-2^10) * a^(2^10-1) */
311 Fp51MultiSquare(&a4, &a4, &a5, 4); // (2 * 2) ^ 4
312
313 /* a^(2^40-1) = a^(2^40-2^20) * a^(2^20-1) */
314 Fp51MultiSquare(&a5, &a5, &a6, 9); // (2 * 2) ^ 9
315
316 /* a^(2^50-1) = a^(2^50-2^10) * a^(2^10-1) */
317 Fp51MultiSquare(&a6, &a4, &a7, 4); // (2 * 2) ^ 4
318
319 /* a^(2^100-1) = a^(2^100-2^50) * a^(2^50-1) */
320 Fp51MultiSquare(&a7, &a7, &a8, 24); // (2 * 2) ^ 24
321
322 /* a^(2^200-1) = a^(2^200-2^100) * a^(2^100-1) */
323 Fp51MultiSquare(&a8, &a8, &a9, 49); // (2 * 2) ^ 49
324
325 /* a^(2^250-1) = a^(2^250-2^50) * a^(2^50-1) */
326 Fp51MultiSquare(&a9, &a7, &a10, 24); // (2 * 2) ^ 24
327
328 /* a^(2^5*(2^250-1)) = (a^(2^250-1))^5 */
329 Fp51Square(&temp1, &a10);
330 Fp51Square(&temp2, &temp1);
331 Fp51Square(&temp1, &temp2);
332 Fp51Square(&temp2, &temp1);
333 Fp51Square(&temp1, &temp2);
334
335 /* The output: a^(2^255-21) = a(2^5*(2^250-1)+11) = a^(2^5*(2^250-1)) * a^11 */
336 Fp51Mul(out, &a2, &temp1);
337 }
338
ScalarMultiPoint(uint8_t out[32],const uint8_t scalar[32],const uint8_t point[32])339 void ScalarMultiPoint(uint8_t out[32], const uint8_t scalar[32], const uint8_t point[32])
340 {
341 uint8_t k[32];
342 const uint8_t *u = point;
343 int32_t t;
344 uint32_t swap;
345 uint32_t kTemp;
346 Fp51 x1, x2, x3;
347 Fp51 z2, z3;
348 Fp51 t1, t2;
349
350 /* Decord the scalar into k */
351 CURVE25519_DECODE_LITTLE_ENDIAN(k, scalar);
352
353 /* Reference RFC 7748 section 5: The constant a24 is (486662 - 2) / 4 = 121665 for curve25519/X25519 */
354 Fp51DataToPoly(&x1, u);
355 CURVE25519_FP51_SET(x2.data, 1);
356 CURVE25519_FP51_SET(z2.data, 0);
357 CURVE25519_FP51_COPY(x3.data, x1.data);
358 CURVE25519_FP51_SET(z3.data, 1);
359 swap = 0;
360
361 /* "bits" parameter set to 255 for x25519 */ /* For t = bits-1(254) down to 0: */
362 for (t = 254; t >= 0; t--) {
363 /* t >> 3: calculation the index of bit; t & 7: Obtains the corresponding bit in the byte */
364 kTemp = (k[(uint32_t)t >> 3] >> ((uint32_t)t & 7)) & 1; /* kTemp = (k >> t) & 1 */
365 swap ^= kTemp; /* swap ^= kTemp */
366 CURVE25519_FP51_CSWAP(swap, x2.data, x3.data); /* (x_2, x_3) = cswap(swap, x_2, x_3) */
367 CURVE25519_FP51_CSWAP(swap, z2.data, z3.data); /* (z_2, z_3) = cswap(swap, z_2, z_3) */
368 swap = kTemp; /* swap = kTemp */
369 CURVE25519_FP51_SUB(t1.data, x3.data, z3.data); /* x3 = D */
370 CURVE25519_FP51_SUB(t2.data, x2.data, z2.data); /* t2 = B */
371 CURVE25519_FP51_ADD(x2.data, x2.data, z2.data); /* t1 = A */
372 CURVE25519_FP51_ADD(z2.data, x3.data, z3.data); /* x2 = C */
373
374 Fp51Mul(&z3, &t1, &x2);
375 Fp51Mul(&z2, &z2, &t2);
376 Fp51Square(&t1, &t2);
377 Fp51Square(&t2, &x2);
378
379 CURVE25519_FP51_ADD(x3.data, z3.data, z2.data);
380 CURVE25519_FP51_SUB(z2.data, z3.data, z2.data);
381 Fp51Mul(&x2, &t2, &t1);
382 CURVE25519_FP51_SUB(t2.data, t2.data, t1.data);
383 Fp51Square(&z2, &z2);
384 Fp51MulScalar(&z3, &t2, 121666); // z2 *= 121665 + 1 = 121666
385 Fp51Square(&x3, &x3);
386 CURVE25519_FP51_ADD(t1.data, t1.data, z3.data);
387 Fp51Mul(&z3, &x1, &z2);
388 Fp51Mul(&z2, &t2, &t1);
389 }
390
391 CURVE25519_FP51_CSWAP(swap, x2.data, x3.data);
392 CURVE25519_FP51_CSWAP(swap, z2.data, z3.data);
393 /* Return x2 * (z2 ^ (p - 2)) */
394 Fp51Invert(&t1, &z2);
395 Fp51Mul(&t2, &x2, &t1);
396 Fp51PolyToData(&t2, out);
397 BSL_SAL_CleanseData(k, sizeof(k));
398 }
399
400 #else
401
/*
 * Multiply a field element in the radix-2^25.5 representation by a scalar and
 * carry-reduce the result: out = p * scalar.
 *
 * out:    receives the ten reduced limbs
 * p:      field element to multiply
 * scalar: multiplier
 *
 * Radix-2^25.5 representation:
 *   f0 + 2^26*f1 + 2^51*f2 + 2^77*f3 + 2^102*f4 + 2^128*f5 + 2^153*f6 +
 *   2^179*f7 + 2^204*f8 + 2^230*f9
 * so the odd limbs are 25 bits wide and the even limbs 26 bits wide.
 * NOTE(review): CURVE25519_FP_MUL_SCALAR, PROCESS_CARRY, MASK_HIGH64 and
 * CURVE25519_MASK_HIGH_39 are defined elsewhere. PROCESS_CARRY is handed
 * signMask1 / signMask2 / over as scratch variables and presumably assigns
 * them before reading - confirm against the macro definition.
 */
void FpMulScalar(Fp25 out, const Fp25 p, const int32_t scalar)
{
    int64_t s = (int64_t)scalar;
    uint64_t result[10];
    uint64_t over;
    uint64_t t1;
    uint64_t mul19;
    uint64_t signMask1;
    uint64_t signMask2;
    uint32_t idx;

    /* The products may exceed 32 bits but do not exceed 64 bits */
    CURVE25519_FP_MUL_SCALAR(result, p, s);

    /* Limb 9 -> limb 0: the overflow of the top limb wraps around multiplied by 19. */
    over = result[9] + (1 << 24);                    /* add half of 2^25 before shifting */
    signMask1 = MASK_HIGH64(25) & (-((over) >> 63)); /* bit 63 is the sign: extend it after the shift */
    t1 = (over >> 25) | signMask1;
    mul19 = (t1 + (t1 << 1) + (t1 << 4));            /* 19 = 1 + 2^1 + 2^4 */
    result[0] += mul19;
    result[9] -= CURVE25519_MASK_HIGH_39 & over;

    /* 25-bit limbs carry into their successor (offset 1 << 24): 1->2, 3->4, 5->6, 7->8 */
    PROCESS_CARRY(result[1], result[2], signMask1, over, 24);
    PROCESS_CARRY(result[3], result[4], signMask1, over, 24);
    PROCESS_CARRY(result[5], result[6], signMask1, over, 24);
    PROCESS_CARRY(result[7], result[8], signMask1, over, 24);

    /* 26-bit limbs carry into their successor (offset 1 << 25): 0->1, 2->3, 4->5, 6->7, 8->9 */
    PROCESS_CARRY(result[0], result[1], signMask2, over, 25);
    PROCESS_CARRY(result[2], result[3], signMask2, over, 25);
    PROCESS_CARRY(result[4], result[5], signMask2, over, 25);
    PROCESS_CARRY(result[6], result[7], signMask2, over, 25);
    PROCESS_CARRY(result[8], result[9], signMask2, over, 25);

    /* After the carry chain each limb is expected to fit in 32 bits */
    for (idx = 0; idx < 10; idx++) {
        out[idx] = (int32_t)result[idx];
    }

    /* do not leave the intermediate products on the stack */
    (void)memset_s(result, sizeof(result), 0, sizeof(result));
}
475
ScalarMultiPoint(uint8_t out[32],const uint8_t scalar[32],const uint8_t point[32])476 void ScalarMultiPoint(uint8_t out[32], const uint8_t scalar[32], const uint8_t point[32])
477 {
478 uint8_t k[32];
479 const uint8_t *u = point;
480 int32_t t;
481 uint32_t swap;
482 uint32_t kTemp;
483 Fp25 x1, x2, x3, z2, z3, t1, t2, t3;
484
485 /* Decord the scalar into k */
486 CURVE25519_DECODE_LITTLE_ENDIAN(k, scalar);
487
488 /* Reference RFC 7748 section 5:The constant a24 is (486662 - 2) / 4 = 121665 for curve25519/X25519 */
489 DataToPolynomial(x1, u);
490 CURVE25519_FP_SET(x2, 1);
491 CURVE25519_FP_SET(z2, 0);
492 CURVE25519_FP_COPY(x3, x1);
493 CURVE25519_FP_SET(z3, 1);
494 swap = 0;
495
496 /* "bits" parameter set to 255 for x25519 */ /* For t = bits-1(254) down to 0: */
497 for (t = 254; t >= 0; t--) {
498 /* t >> 3: calculation the index of bit; t & 7: Obtains the corresponding bit in the byte */
499 kTemp = (k[(uint32_t)t >> 3] >> ((uint32_t)t & 7)) & 1; /* kTemp = (k >> t) & 1 */
500 swap ^= kTemp; /* swap ^= kTemp */
501 CURVE25519_FP_CSWAP(swap, x2, x3); /* (x_2, x_3) = cswap(swap, x_2, x_3) */
502 CURVE25519_FP_CSWAP(swap, z2, z3); /* (z_2, z_3) = cswap(swap, z_2, z_3) */
503 swap = kTemp; /* swap = kTemp */
504 CURVE25519_FP_ADD(t1, x2, z2); /* t1 = A */
505 CURVE25519_FP_SUB(t2, x2, z2); /* t2 = B */
506 CURVE25519_FP_ADD(x2, x3, z3); /* x2 = C */
507 CURVE25519_FP_SUB(x3, x3, z3); /* x3 = D */
508 FpMul(z2, x3, t1); /* z2 = DA */
509 FpMul(z3, x2, t2); /* z3 = CB */
510 FpSquareDoubleCore(t1, t1, false); /* t1 = AA */
511 FpSquareDoubleCore(t2, t2, false); /* t2 = BB */
512 CURVE25519_FP_SUB(t3, t1, t2); /* t3 = E = AA - BB */
513 CURVE25519_FP_ADD(x3, z2, z3); /* x3 = DA + CB */
514 FpSquareDoubleCore(x3, x3, false); /* x3 = (DA + CB)^2 */
515 CURVE25519_FP_SUB(z3, z2, z3); /* z3 = DA - CB */
516 FpSquareDoubleCore(z3, z3, false); /* z3 = (DA - CB)^2 */
517 FpMul(z3, x1, z3); /* z3 = x1 * (DA - CB)^2 */
518 FpMul(x2, t1, t2); /* x2 = AA * BB */
519 FpMul(t1, t3, t1); /* t1 = E * AA */
520 FpSquareDoubleCore(z2, t3, false); /* z2 = E^2 */
521 /* Reference RFC 7748 section 5:The constant a24 is (486662 - 2) / 4 = 121665 for curve25519/X25519 */
522 FpMulScalar(z2, z2, 121665); /* z2 = a24 * E^2 */
523 CURVE25519_FP_ADD(z2, t1, z2); /* z2 = E * (AA + a24 * E) */
524 }
525
526 CURVE25519_FP_CSWAP(swap, x2, x3);
527 CURVE25519_FP_CSWAP(swap, z2, z3);
528 /* Return x2 * (z2 ^ (p - 2)) */
529 FpInvert(t1, z2);
530 FpMul(t2, x2, t1);
531 PolynomialToData(out, t2);
532 }
533 #endif // uint128
534 #endif /* HITLS_CRYPTO_X25519 */
535