• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * This file is part of the openHiTLS project.
3  *
4  * openHiTLS is licensed under the Mulan PSL v2.
5  * You can use this software according to the terms and conditions of the Mulan PSL v2.
6  * You may obtain a copy of Mulan PSL v2 at:
7  *
8  *     http://license.coscl.org.cn/MulanPSL2
9  *
10  * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11  * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12  * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13  * See the Mulan PSL v2 for more details.
14  */
15 
16 #include "hitls_build.h"
17 #if defined(HITLS_CRYPTO_CURVE_NISTP224) && defined(HITLS_CRYPTO_NIST_USE_ACCEL)
18 
19 #include <stdbool.h>
20 #include "securec.h"
21 #include "bsl_err_internal.h"
22 #include "crypt_errno.h"
23 #include "crypt_utils.h"
24 #include "crypt_bn.h"
25 #include "crypt_ecc.h"
26 #include "ecc_local.h"
27 #include "ecc_utils.h"
28 #include "bsl_util_internal.h"
29 
30 #ifndef __SIZEOF_INT128__
31 #error "This nistp224 implementation require the compiler support 128-bits integer."
32 #endif
33 
34 /*  field element definition */
35 #define FELEM_BITS      224
36 #define FELEM_BYTES     28
37 #define LIMB_BITS       (sizeof(uint64_t) << 3)
38 #define LIMB_NUM        4
39 #define BASE_BITS       56
40 #define BASE_MASK       0x00ffffffffffffff
41 /* The pre-calculation table of the G table has 16 points. */
42 #define TABLE_G_SIZE    16
43 /* The pre-calculation table of the P table has 17 points. */
44 #define TABLE_P_SIZE    17
45 
46 /*
47  * field elements, stored as arrays, all represented in little endian.
48  * Each element of the array is called a digit, and each digit represents an extended 2^56-number-system digit,
49  * that is, Felem can be expressed as:
50  * f_0 + f_1 * 2^56 + f_2 * 2^112 + f_3 * 2^168
51  * LongFelem is the same, but twice the width of Felem.
52  * It is used to store the result of multiplication of field elements.
53  * Point is a point represented as a Jacobian coordinate
54  */
55 typedef struct {
56     uint64_t data[LIMB_NUM];
57 } Felem;
58 
59 typedef struct {
60     uint128_t data[LIMB_NUM * 2 - 1];
61 } LongFelem;
62 
63 typedef struct {
64     Felem x;
65     Felem y;
66     Felem z;
67 } Point;
68 
69 /* ------------------------------------------------------------ */
70 /* ECP224 field order p, the value is 2^224 - 2^96 + 1, little endian. */
71 static const Felem FIELD_ORDER = {
72     {0x0000000000000001, 0x00ffff0000000000, 0x00ffffffffffffff, 0x00ffffffffffffff}
73 };
74 
75 /*
76  * A pre-calculated table of the base point G, which contains the (X, Y, Z) coordinates of k*G
77  *
78  * PRE_MUL_G divides all bits into four equal parts.
79  * index      Corresponding bit             value of k
80  *   0             0 0 0 0         0     + 0     + 0     + 0
81  *   1             0 0 0 1         0     + 0     + 0     + 1
82  *   2             0 0 1 0         0     + 0     + 2^56  + 0
83  *   3             0 0 1 1         0     + 0     + 2^56  + 1
84  *   4             0 1 0 0         0     + 2^112 + 0     + 0
85  *   5             0 1 0 1         0     + 2^112 + 0     + 1
86  *   6             0 1 1 0         0     + 2^112 + 2^56  + 0
87  *   7             0 1 1 1         0     + 2^112 + 2^56  + 1
88  *   8             1 0 0 0         2^168 + 0     + 0     + 0
89  *   9             1 0 0 1         2^168 + 0     + 0     + 1
90  *  10             1 0 1 0         2^168 + 0     + 2^56  + 0
91  *  11             1 0 1 1         2^168 + 0     + 2^56  + 1
92  *  12             1 1 0 0         2^168 + 2^112 + 0     + 0
93  *  13             1 1 0 1         2^168 + 2^112 + 0     + 1
94  *  14             1 1 1 0         2^168 + 2^112 + 2^56  + 0
95  *  15             1 1 1 1         2^168 + 2^112 + 2^56  + 1
96  */
97 static const Point PRE_MUL_G[TABLE_G_SIZE] = {
98     {
99         {{0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}},
100         {{0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}},
101         {{0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
102     }, {
103         {{0x003280d6115c1d21, 0x00c1d356c2112234, 0x007f321390b94a03, 0x00b70e0cbd6bb4bf}},
104         {{0x00d5819985007e34, 0x0075a05a07476444, 0x00fb4c22dfe6cd43, 0x00bd376388b5f723}},
105         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
106     }, {
107         {{0x00fd9675666ebbe9, 0x00bca7664d40ce5e, 0x002242df8d8a2a43, 0x001f49bbb0f99bc5}},
108         {{0x0029e0b892dc9c43, 0x00ece8608436e662, 0x00dc858f185310d0, 0x009812dd4eb8d321}},
109         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
110     }, {
111         {{0x006d3e678d5d8eb8, 0x00559eed1cb362f1, 0x0016e9a3bbce8a3f, 0x00eedcccd8c2a748}},
112         {{0x00f19f90ed50266d, 0x00abf2b4bf65f9df, 0x00313865468fafec, 0x005cb379ba910a17}},
113         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
114     }, {
115         {{0x000641966cab26e3, 0x0091fb2991fab0a0, 0x00efec27a4e13a0b, 0x000499aa8a5f8ebe}},
116         {{0x007510407766af5d, 0x0084d929610d5450, 0x0081d77aae82f706, 0x006916f6d4338c5b}},
117         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
118     }, {
119         {{0x00ea95ac3b1f15c6, 0x00086000905e82d4, 0x00dd323ae4d1c8b1, 0x00932b56be7685a3}},
120         {{0x009ef93dea25dbbf, 0x0041665960f390f0, 0x00fdec76dbe2a8a7, 0x00523e80f019062a}},
121         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
122     }, {
123         {{0x00822fdd26732c73, 0x00a01c83531b5d0f, 0x00363f37347c1ba4, 0x00c391b45c84725c}},
124         {{0x00bbd5e1b2d6ad24, 0x00ddfbcde19dfaec, 0x00c393da7e222a7f, 0x001efb7890ede244}},
125         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
126     }, {
127         {{0x004c9e90ca217da1, 0x00d11beca79159bb, 0x00ff8d33c2c98b7c, 0x002610b39409f849}},
128         {{0x0044d1352ac64da0, 0x00cdbb7b2c46b4fb, 0x00966c079b753c89, 0x00fe67e4e820b112}},
129         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
130     }, {
131         {{0x00e28cae2df5312d, 0x00c71b61d16f5c6e, 0x0079b7619a3e7c4c, 0x0005c73240899b47}},
132         {{0x009f7f6382c73e3a, 0x0018615165c56bda, 0x00641fab2116fd56, 0x0072855882b08394}},
133         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
134     }, {
135         {{0x000469182f161c09, 0x0074a98ca8d00fb5, 0x00b89da93489a3e0, 0x0041c98768fb0c1d}},
136         {{0x00e5ea05fb32da81, 0x003dce9ffbca6855, 0x001cfe2d3fbf59e6, 0x000e5e03408738a7}},
137         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
138     }, {
139         {{0x00dab22b2333e87f, 0x004430137a5dd2f6, 0x00e03ab9f738beb8, 0x00cb0c5d0dc34f24}},
140         {{0x00764a7df0c8fda5, 0x00185ba5c3fa2044, 0x009281d688bcbe50, 0x00c40331df893881}},
141         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
142     }, {
143         {{0x00b89530796f0f60, 0x00ade92bd26909a3, 0x001a0c83fb4884da, 0x001765bf22a5a984}},
144         {{0x00772a9ee75db09e, 0x0023bc6c67cec16f, 0x004c1edba8b14e2f, 0x00e2a215d9611369}},
145         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
146     }, {
147         {{0x00571e509fb5efb3, 0x00ade88696410552, 0x00c8ae85fada74fe, 0x006c7e4be83bbde3}},
148         {{0x00ff9f51160f4652, 0x00b47ce2495a6539, 0x00a2946c53b582f4, 0x00286d2db3ee9a60}},
149         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
150     }, {
151         {{0x0040bbd5081a44af, 0x000995183b13926c, 0x00bcefba6f47f6d0, 0x00215619e9cc0057}},
152         {{0x008bc94d3b0df45e, 0x00f11c54a3694f6f, 0x008631b93cdfe8b5, 0x00e7e3f4b0982db9}},
153         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
154     }, {
155         {{0x00b17048ab3e1c7b, 0x00ac38f36ff8a1d8, 0x001c29819435d2c6, 0x00c813132f4c07e9}},
156         {{0x002891425503b11f, 0x0008781030579fea, 0x00f5426ba5cc9674, 0x001e28ebf18562bc}},
157         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
158     }, {
159         {{0x009f31997cc864eb, 0x0006cd91d28b5e4c, 0x00ff17036691a973, 0x00f1aef351497c58}},
160         {{0x00dd1f2d600564ff, 0x00dead073b1402db, 0x0074a684435bd693, 0x00eea7471f962558}},
161         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
162     }
163 };
164 
165 /*
166  * A pre-calculated table of the base point G, which contains the (X, Y, Z) coordinates of k*G
167  *
168  * PRE_MUL_G2[] = PRE_MUL_G[] * 2^28
169  * index      Corresponding bit                value of k
170  *   0            0 0 0 0              0     + 0     + 0     + 0
171  *   1            0 0 0 1              0     + 0     + 0     + 2^28
172  *   2            0 0 1 0              0     + 0     + 2^84  + 0
173  *   3            0 0 1 1              0     + 0     + 2^84  + 2^28
174  *   4            0 1 0 0              0     + 2^140 + 0     + 0
175  *   5            0 1 0 1              0     + 2^140 + 0     + 2^28
176  *   6            0 1 1 0              0     + 2^140 + 2^84  + 0
177  *   7            0 1 1 1              0     + 2^140 + 2^84  + 2^28
178  *   8            1 0 0 0              2^196 + 0     + 0     + 0
179  *   9            1 0 0 1              2^196 + 0     + 0     + 2^28
180  *  10            1 0 1 0              2^196 + 0     + 2^84  + 0
181  *  11            1 0 1 1              2^196 + 0     + 2^84  + 2^28
182  *  12            1 1 0 0              2^196 + 2^140 + 0     + 0
183  *  13            1 1 0 1              2^196 + 2^140 + 0     + 2^28
184  *  14            1 1 1 0              2^196 + 2^140 + 2^84  + 0
185  *  15            1 1 1 1              2^196 + 2^140 + 2^84  + 2^28
186  */
187 static const Point PRE_MUL_G2[TABLE_G_SIZE] = {
188     {
189         {{0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}},
190         {{0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}},
191         {{0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
192     }, {
193         {{0x009665266dddf554, 0x009613d78b60ef2d, 0x00ce27a34cdba417, 0x00d35ab74d6afc31}},
194         {{0x0085ccdd22deb15e, 0x002137e5783a6aab, 0x00a141cffd8c93c6, 0x00355a1830e90f2d}},
195         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
196     }, {
197         {{0x001a494eadaade65, 0x00d6da4da77fe53c, 0x00e7992996abec86, 0x0065c3553c6090e3}},
198         {{0x00fa610b1fb09346, 0x00f1c6540b8a4aaf, 0x00c51a13ccd3cbab, 0x0002995b1b18c28a}},
199         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
200     }, {
201         {{0x007874568e7295ef, 0x0086b419fbe38d04, 0x00dc0690a7550d9a, 0x00d3966a44beac33}},
202         {{0x002b7280ec29132f, 0x00beaa3b6a032df3, 0x00dc7dd88ae41200, 0x00d25e2513e3a100}},
203         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
204     }, {
205         {{0x00924857eb2efafd, 0x00ac2bce41223190, 0x008edaa1445553fc, 0x00825800fd3562d5}},
206         {{0x008d79148ea96621, 0x0023a01c3dd9ed8d, 0x00af8b219f9416b5, 0x00d8db0cc277daea}},
207         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
208     }, {
209         {{0x0076a9c3b1a700f0, 0x00e9acd29bc7e691, 0x0069212d1a6b0327, 0x006322e97fe154be}},
210         {{0x00469fc5465d62aa, 0x008d41ed18883b05, 0x001f8eae66c52b88, 0x00e4fcbe9325be51}},
211         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
212     }, {
213         {{0x00825fdf583cac16, 0x00020b857c7b023a, 0x00683c17744b0165, 0x0014ffd0a2daf2f1}},
214         {{0x00323b36184218f9, 0x004944ec4e3b47d4, 0x00c15b3080841acf, 0x000bced4b01a28bb}},
215         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
216     }, {
217         {{0x0092ac22230df5c4, 0x0052f33b4063eda8, 0x00cb3f19870c0c93, 0x0040064f2ba65233}},
218         {{0x00fe16f0924f8992, 0x00012da25af5b517, 0x001a57bb24f723a6, 0x0006f8bc76760def}},
219         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
220     }, {
221         {{0x004a7084f7817cb9, 0x00bcab0738ee9a78, 0x003ec11e11d9c326, 0x00dc0fe90e0f1aae}},
222         {{0x00cf639ea5f98390, 0x005c350aa22ffb74, 0x009afae98a4047b7, 0x00956ec2d617fc45}},
223         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
224     }, {
225         {{0x004306d648c1be6a, 0x009247cd8bc9a462, 0x00f5595e377d2f2e, 0x00bd1c3caff1a52e}},
226         {{0x00045e14472409d0, 0x0029f3e17078f773, 0x00745a602b2d4f7d, 0x00191837685cdfbb}},
227         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
228     }, {
229         {{0x005b6ee254a8cb79, 0x004953433f5e7026, 0x00e21faeb1d1def4, 0x00c4c225785c09de}},
230         {{0x00307ce7bba1e518, 0x0031b125b1036db8, 0x0047e91868839e8f, 0x00c765866e33b9f3}},
231         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
232     }, {
233         {{0x003bfece24f96906, 0x004794da641e5093, 0x00de5df64f95db26, 0x00297ecd89714b05}},
234         {{0x00701bd3ebb2c3aa, 0x007073b4f53cb1d5, 0x0013c5665658af16, 0x009895089d66fe58}},
235         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
236     }, {
237         {{0x000fef05f78c4790, 0x002d773633b05d2e, 0x0094229c3a951c94, 0x00bbbd70df4911bb}},
238         {{0x00b2c6963d2c1168, 0x00105f47a72b0d73, 0x009fdf6111614080, 0x007b7e94b39e67b0}},
239         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
240     }, {
241         {{0x00ad1a7d6efbe2b3, 0x00f012482c0da69d, 0x006b3bdf12438345, 0x0040d7558d7aa4d9}},
242         {{0x008a09fffb5c6d3d, 0x009a356e5d9ffd38, 0x005973f15f4f9b1c, 0x00dcd5f59f63c3ea}},
243         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
244     }, {
245         {{0x00acf39f4c5ca7ab, 0x004c8071cc5fd737, 0x00c64e3602cd1184, 0x000acd4644c9abba}},
246         {{0x006c011a36d8bf6e, 0x00fecd87ba24e32a, 0x0019f6f56574fad8, 0x00050b204ced9405}},
247         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
248     }, {
249         {{0x00ed4f1cae7d9a96, 0x005ceef7ad94c40a, 0x00778e4a3bf3ef9b, 0x007405783dc3b55e}},
250         {{0x0032477c61b6e8c6, 0x00b46a97570f018b, 0x0091176d0a7e95d1, 0x003df90fbc4c7d0e}},
251         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
252     }
253 };
254 
255 /* --------------------------helper function-------------------------- */
256 /*
257  * Convert big endian byte stream to Felem
258  */
Bin2Felem(Felem * out,const uint8_t in[FELEM_BYTES])259 static void Bin2Felem(Felem *out, const uint8_t in[FELEM_BYTES])
260 {
261     int32_t offset;
262     for (int32_t i = LIMB_NUM - 1; i >= 0; --i) {
263         offset = 7 * (LIMB_NUM - 1 - i);  // 56bits occupy 7 bytes.
264         out->data[i] = ((uint64_t)in[offset + 0] << 48) |  // Byte 0 is shifted by 48 bits to the left.
265                        ((uint64_t)in[offset + 1] << 40) |  // The 1st byte is shifted leftward by 40 bits.
266                        ((uint64_t)in[offset + 2] << 32) |  // The 2nd byte is shifted leftward by 32 bits.
267                        ((uint64_t)in[offset + 3] << 24) |  // The 3rd byte is shifted leftward by 24 bits.
268                        ((uint64_t)in[offset + 4] << 16) |  // The 4th byte is shifted leftward by 16 bits.
269                        ((uint64_t)in[offset + 5] << 8) |   // The 5th byte is shifted leftward by 8 bits.
270                        ((uint64_t)in[offset + 6]);         // No shift is required for the 6th byte.
271     }
272 }
273 
274 /*
275  * Convert Felem to big-endian byte stream
276  * Input:
277  *      in[] < 2^56
278  * Output:
279  *      the length of out is 28
280  */
Felem2Bin(uint8_t out[FELEM_BYTES],const Felem * in)281 static void Felem2Bin(uint8_t out[FELEM_BYTES], const Felem *in)
282 {
283     int32_t offset;
284     for (int32_t i = LIMB_NUM - 1; i >= 0; --i) {
285         offset = 7 * (LIMB_NUM - 1 - i);  // 56bits occupy 7 bytes.
286         out[offset + 0] = (uint8_t)(in->data[i] >> 48);  // out[i + 0] get 48~55 bits
287         out[offset + 1] = (uint8_t)(in->data[i] >> 40);  // out[i + 1] get 40~47 bits
288         out[offset + 2] = (uint8_t)(in->data[i] >> 32);  // out[i + 2] get 32~39 bits
289         out[offset + 3] = (uint8_t)(in->data[i] >> 24);  // out[i + 3] get 24~31 bits
290         out[offset + 4] = (uint8_t)(in->data[i] >> 16);  // out[i + 4] get 16~23 bits
291         out[offset + 5] = (uint8_t)(in->data[i] >> 8);   // out[i + 5] get 8~15 bits
292         out[offset + 6] = (uint8_t)in->data[i];          // out[i + 6] get 0~7 bits
293     }
294 }
295 
296 /*
297  * Convert BN to Felem
298  * Output:
299  *      out[] < 2^56
300  */
BN2Felem(Felem * out,const BN_BigNum * in)301 static int32_t BN2Felem(Felem *out, const BN_BigNum *in)
302 {
303     int32_t retVal;
304     uint8_t bin[FELEM_BYTES];
305     uint32_t len = FELEM_BYTES;
306 
307     GOTO_ERR_IF(BN_Bn2Bin(in, bin, &len), retVal);
308 
309     for (uint32_t i = 0; i < FELEM_BYTES; ++i) {
310         bin[FELEM_BYTES - 1 - i] = i < len ? bin[len - 1 - i] : 0;
311     }
312 
313     Bin2Felem(out, bin);
314 ERR:
315     return retVal;
316 }
317 
318 /*
319  * Convert Felem to BN
320  * Input:
321  *      in[] < 2^56
322  */
Felem2BN(BN_BigNum * out,const Felem * in)323 static int32_t Felem2BN(BN_BigNum *out, const Felem *in)
324 {
325     int32_t retVal;
326     uint8_t bin[FELEM_BYTES];
327 
328     Felem2Bin(bin, in);
329 
330     GOTO_ERR_IF(BN_Bin2Bn(out, bin, FELEM_BYTES), retVal);
331 ERR:
332     return retVal;
333 }
334 
335 /* ---------------------------field operation--------------------------- */
336 /*
337  * Assignment
338  */
FelemAssign(Felem * out,const Felem * in)339 static inline void FelemAssign(Felem *out, const Felem *in)
340 {
341     out->data[0] = in->data[0];  // out->data[0] takes the value
342     out->data[1] = in->data[1];  // out->data[1] takes the value
343     out->data[2] = in->data[2];  // out->data[2] takes the value
344     out->data[3] = in->data[3];  // out->data[3] takes the value
345 }
346 
347 /*
348  * Copy bits by mask. If the corresponding bit is 1, copy the bit. If the corresponding bit is 0, retain the bit.
349  */
FelemAssignWithMask(Felem * out,const Felem * in,const uint64_t mask)350 static inline void FelemAssignWithMask(Felem *out, const Felem *in, const uint64_t mask)
351 {
352     uint64_t rmask = ~mask;
353     out->data[0] = (in->data[0] & mask) | (out->data[0] & rmask);  // out->data[0] get a new value or remain unchanged.
354     out->data[1] = (in->data[1] & mask) | (out->data[1] & rmask);  // out->data[1] get a new value or remain unchanged.
355     out->data[2] = (in->data[2] & mask) | (out->data[2] & rmask);  // out->data[2] get a new value or remain unchanged.
356     out->data[3] = (in->data[3] & mask) | (out->data[3] & rmask);  // out->data[3] get a new value or remain unchanged.
357 }
358 
359 /*
360  * Set the lowest digit
361  */
FelemSetLimb(Felem * out,const uint64_t in)362 static inline void FelemSetLimb(Felem *out, const uint64_t in)
363 {
364     out->data[0] = in;  // out->data[0] takes the value
365     out->data[1] = 0;   // out->data[1] clear to 0
366     out->data[2] = 0;   // out->data[2] clear to 0
367     out->data[3] = 0;   // out->data[3] clear to 0
368 }
369 
370 /*
371  * Zero judgment: is input less than(2^224 + 2^(16 + 192)). Only 0 and p need to be judged.
372  * Input:
373  *      in[] < 2^56 + 2^16
374  *      - in[0] < 2^56
375  *      - in[1] < 2^56
376  *      - in[2] < 2^56
377  *      - in[3] < 2^56 + 2^16
378  */
FelemIsZero(const Felem * in)379 static inline uint64_t FelemIsZero(const Felem *in)
380 {
381     uint64_t isZero, isP;
382 
383     // Check whether digits 0, 1, 2, and 3 of in are all 0.
384     isZero = in->data[0] | in->data[1] | in->data[2] | in->data[3];
385     isZero -= 1;  // If in == 0, the most significant bit is 1.
386 
387     // Determine that in is equal to the field order.
388     isP = (in->data[0] ^ FIELD_ORDER.data[0]) |     // Determines whether the digits 0 are equal.
389           (in->data[1] ^ FIELD_ORDER.data[1]) |     // Determines whether the digits 1 are equal.
390           (in->data[2] ^ FIELD_ORDER.data[2]) |     // Determines whether the digits 2 are equal.
391           (in->data[3] ^ FIELD_ORDER.data[3]);      // Determines whether the digits 3 are equal.
392     isP -= 1;  // If in == p, the most significant bit is 1
393 
394     return (isZero | isP) >> (LIMB_BITS - 1);
395 }
396 
397 /*
398  * Obtain the bit string whose length is len at idx in Felem.
399  * Input:
400  *      in[i] < 2^56
401  *      0 < len <= 64
402  */
FelemGetBits(const Felem * in,int32_t idx,uint32_t len)403 static uint64_t FelemGetBits(const Felem *in, int32_t idx, uint32_t len)
404 {
405     uint64_t ret;
406     uint32_t lower, upper;
407     uint64_t mask;
408 
409     lower = (uint32_t)idx;
410     // when 0 <= lower < 224, the most significant bit is 1. Obtain the most significant bit by right shifted by 31 bits
411     mask = (uint64_t)0 - ((~lower & (lower - 224)) >> 31);
412     ret = (in->data[(lower / BASE_BITS) & mask] & BASE_MASK & mask) >> (lower % BASE_BITS);
413 
414     upper = (uint32_t)idx + BASE_BITS;  // Next Unary Block
415     // when 0 <= upper < 224, the most significant bit is 1. Obtain the most significant bit by right shifted by 31 bits
416     mask = (uint64_t)0 - ((~upper & (upper - 224)) >> 31);
417     ret |= (in->data[(upper / BASE_BITS) & mask] & BASE_MASK & mask) << (BASE_BITS - upper % BASE_BITS);
418 
419     ret &= ((uint64_t)1 << len) - 1;  // All 1s of the len length
420 
421     return ret;
422 }
423 
424 /*
425  * field element reduction, retain the lower 56 bits of each Limb in the 4-ary Felem,
426  * perform reduction on the upper bits, and perform modulo operation to reduce all Limbs to [0, 2 ^ 56 + 2 ^ 7).
427  * Input:
428  *      in[] < 2^63
429  * Output:
430  *      out[] < 2^56 + 2^7
431  *      - out[0] < 2^56
432  *      - out[1] < 2^56
433  *      - out[2] < 2^56
434  *      - out[3] < 2^56 + 2^7
435  */
FelemReduce(Felem * out,const Felem * in)436 static void FelemReduce(Felem *out, const Felem *in)
437 {
438     uint64_t carryLimb = in->data[3] >> BASE_BITS;
439     const uint64_t borrow = (uint64_t)1 << 56;
440     const uint64_t lend = 1;
441 
442     // reduction carryLimb * (2^96 - 1), because it's the non-negative number, it must can be borrowed.
443     // Calculate out->data[0]
444     out->data[0] = in->data[0] + borrow - carryLimb;                                        // carryLimb * (-1)
445     // Calculate out->data[1]
446     out->data[1] = in->data[1] + (out->data[0] >> BASE_BITS) + (carryLimb << 40) - lend;    // carryLimb * 2^(56 + 40)
447     // Calculate out->data[2]
448     out->data[2] = in->data[2] + (out->data[1] >> BASE_BITS);
449     // Calculate out->data[3]
450     out->data[3] = (in->data[3] & BASE_MASK) + (out->data[2] >> BASE_BITS);                 // < 2^56 + 2^7
451 
452     out->data[0] &= BASE_MASK;  // out->data[0] Take the lower bits.
453     out->data[1] &= BASE_MASK;  // out->data[1] Take the lower bits.
454     out->data[2] &= BASE_MASK;  // out->data[2] Take the lower bits.
455 }
456 
457 /*
458  * field element reduction, convert 7-ary LongFelem to 4-ary Felem,
459  * and modulo reduction to all Limbs to [0, 2 ^ 56 + 2 ^ 16)
460  * Input:
461  *      in[] < 2^126
462  * Output:
463  *      out[] < 2^56 + 2^16
464  *      - out[0] < 2^56
465  *      - out[1] < 2^56
466  *      - out[2] < 2^56
467  *      - out[3] < 2^56 + 2^16
468  */
LongFelemReduce(Felem * out,const LongFelem * in)469 static void LongFelemReduce(Felem *out, const LongFelem *in)
470 {
471     const uint128_t *pi = in->data;
472     // p shifts left by 15
473     static const LongFelem zeroBase = {
474         {
475             (((uint128_t)1 << 112) - ((uint128_t)1 << 96) + 1)    << 15,  // 2^127 - 2^111 + 2^15
476             (((uint128_t)1 << 112) - ((uint128_t)1 << 56))        << 15,  // 2^127 - 2^71
477             (((uint128_t)1 << 112) - ((uint128_t)1 << 56))        << 15,  // 2^127 - 2^71
478             0,
479             0,
480             0,
481             0,
482         }
483     };
484 
485     uint128_t lout[4] = {
486         zeroBase.data[0] + pi[0],    // < 2^127 + 2^126 - 2^111 + 2^15
487         zeroBase.data[1] + pi[1],    // < 2^127 + 2^126 - 2^71
488         zeroBase.data[2] + pi[2],    // < 2^127 + 2^126 - 2^71
489         pi[3]                        // < 2^126
490     };
491     uint128_t carryLimb = pi[4];      // < 2^126
492 
493     // n = n / a * (a - b) ≡ n / a * b (mod (a - b))
494     // It can be obtained from the above formula:
495     // 2^n = 2^(n - 128) - 2^(n - 224) (mod (2^224 - 2^96 + 1))
496     //
497     // The following equation can be listed:
498     //   2^224 mod p
499     // = 2^96 - 1
500     //
501     //   2^280 mod p
502     // = 2^152 - 2^56
503     //
504     //   2^336 mod p
505     // = 2^208 - 2^112
506 
507     // reduce pi[6] and obtain the part higher 16 bits.
508     carryLimb += pi[6] >> 16;
509     // lout[3], reduce pi[6], shift leftwards by 40 bits then truncate; reduce pi[5] and obtain the part higher 16 bits.
510     lout[3] += ((pi[6] << 40) & BASE_MASK) + (pi[5] >> 16);
511     // lout[2], reduce pi[5], leftshift by 40bit then truncate; reduce carryLimb, obtain the higher 16bits. reduce pi[6]
512     lout[2] += ((pi[5] << 40) & BASE_MASK) + (carryLimb >> 16) - pi[6];
513     // lout[1], reduce carryLimb, shift leftwards by 40 bits then truncate; reduce pi[5]
514     lout[1] += ((carryLimb << 40) & BASE_MASK) - pi[5];
515     // lout[0], reduce carryLimb
516     lout[0] -= carryLimb;
517 
518     // Range after reduction:
519     // carryLimb < 2^126 + 2^110
520     // lout[3] < 2^126 + 2^56 + 2^110 < 2^127
521     // lout[2] < (2^127 + 2^126 - 2^71) + 2^56 + (2^110 + 2^94) < 2^128
522     // lout[1] < (2^127 + 2^126 - 2^71) + 2^56
523     // lout[0] < 2^127 + 2^126 - 2^111 + 2^15 < 2^128
524 
525     // carry
526     lout[3] += lout[2] >> BASE_BITS;   // lout[3] < 2^127 + 2^72 < 2^128
527     carryLimb = lout[3] >> BASE_BITS;  // carryLimb < 2^72
528 
529     lout[2] &= BASE_MASK;   // lout[2] < 2^56
530     lout[3] &= BASE_MASK;   // lout[3] < 2^56
531 
532     // reduce carryLimb
533     // lout[2],reduce carryLimb and obtain the part higher 16 bits.
534     lout[2] += carryLimb >> 16;
535     // lout[1],reduce carryLimb, shift leftwards by 40 bits then truncate;
536     lout[1] += (carryLimb << 40) & BASE_MASK;
537     // lout[0],reduce carryLimb
538     lout[0] -= carryLimb;
539 
540     // Range after reduction:
541     // lout[2] < 2^56 + 2^56
542     // lout[1] < (2^127 + 2^126 - 2^71 + 2^56) + 2^56
543     // lout[0] < 2^128
544 
545     // carry
546     lout[1] += lout[0] >> BASE_BITS;            // lout[1] < (2^127 + 2^126 - 2^71 + 2^57 - 2^41) + 2^72
547     lout[2] += lout[1] >> BASE_BITS;            // lout[2] < 2^57 + (2^71 + 2^70 + 2^16 - 2^15 + 1) < 2^72
548 
549     out->data[0] = (uint64_t)lout[0] & BASE_MASK;                         // out->data[0] < 2^56
550     out->data[1] = (uint64_t)lout[1] & BASE_MASK;                         // out->data[1] < 2^56
551     out->data[2] = (uint64_t)lout[2] & BASE_MASK;                         // out->data[2] < 2^56
552     out->data[3] = (uint64_t)lout[3] + (uint64_t)(lout[2] >> BASE_BITS);  // out->data[3] < 2^56 + 2^16
553 }
554 
555 /*
556  * field element modulo in [0, p), call FelemReduce or LongFelemReduce in advance.
557  * Input:
558  *      in[] < 2^56 + 2^16
559  *      - in[0] < 2^56
560  *      - in[1] < 2^56
561  *      - in[2] < 2^56
562  *      - in[3] < 2^56 + 2^16
563  * Output:
564  *      out < p, out[i] < 2^56
565  */
FelemContract(Felem * out,const Felem * in)566 static void FelemContract(Felem *out, const Felem *in)
567 {
568     Felem tmp = {{in->data[0], in->data[1], in->data[2], in->data[3] & BASE_MASK}};  // tmp[] < 2^56
569     uint64_t carryLimb = in->data[3] >> BASE_BITS;                  // 0 or 1, because of in[3] < 2^56 + 2^16
570     uint64_t mask;                                                  // Check the mask greater than or equal to p.
571 
572     const uint64_t lower40Mask = 0x000000ffffffffff;                // Lower 40-bit mask
573     const uint64_t borrow = (uint64_t)1 << BASE_BITS;
574     const uint64_t lend = 1;
575 
576     // Check whether tmp is greater than or equal to p. The upper 128 bits of p are all 1s and the lower 96 bits are 1.
577     // If the upper 128 bits (digit 3, 2, and 1) are all 1s, the value is 1. Otherwise, the value is 0.
578     mask = ((tmp.data[3] & tmp.data[2] & (tmp.data[1] | lower40Mask)) + 1) >> BASE_BITS;
579     // If the lower 96 bits are not zero, the value is 1. Otherwise, the value is 0.
580     mask &= (0 - ((tmp.data[1] & lower40Mask) | tmp.data[0])) >> (LIMB_BITS - 1);
581     mask = 0 - mask;  // If tmp is greater than or equal to p, the value is 1. Otherwise, the value is 0.
582 
583     // reduce carryLimb or subtract p. Note that carryLimb and mask cannot be true at the same time.
584     // p_0 = 0x0000000000000001
585     tmp.data[0] = (tmp.data[0] ^ borrow) - carryLimb - (mask & FIELD_ORDER.data[0]);
586     // p_1 = 0x00ffff0000000000
587     tmp.data[1] += carryLimb << 40;      // reduction carryLimb * 2^(56 + 40)
588     // If the value is less than p, remains unchanged. If it's greater than or equal to p, subtract 0x00ffff0000000000.
589     // That is, the lower 40 bits remain unchanged, and other bits are set to 0.
590     tmp.data[1] &= ~mask | lower40Mask;
591     // Lend to the low bit: if reduce it, then 2^96 - 1 > 0, if minus p, then tmp[1] + tmp[0] > p_1 + p_2
592     tmp.data[1] -= lend;
593 
594     // If the value is less than p, remains unchanged. If it's greater than or equal to p, subtract 0x00ffffffffffffff.
595     // That is, set to 0.
596     tmp.data[2] &= ~mask; // p_2 = 0x00ffffffffffffff
597 
598     // If the value is less than p, remains unchanged. If it's greater than or equal to p, subtract 0x00ffffffffffffff.
599     // That is, set to 0.
600     tmp.data[3] &= ~mask; // p_3 = 0x00ffffffffffffff
601 
602     // carry 0 -> 1 -> 2 -> 3
603     out->data[0] = tmp.data[0] & BASE_MASK;                               // tmp.data[0] get the lower bits.
604     out->data[1] = (tmp.data[1] += tmp.data[0] >> BASE_BITS) & BASE_MASK; // tmp.data[1] plus carry & get the lower bits
605     out->data[2] = (tmp.data[2] += tmp.data[1] >> BASE_BITS) & BASE_MASK; // tmp.data[2] plus carry & get the lower bits
606     out->data[3] = (tmp.data[3] += tmp.data[2] >> BASE_BITS) & BASE_MASK; // tmp.data[3] plus carry & get the lower bits
607 }
608 
609 /*
610  * field Addition
611  */
FelemAdd(Felem * out,const Felem * a,const Felem * b)612 static inline void FelemAdd(Felem *out, const Felem *a, const Felem *b)
613 {
614     out->data[0] = a->data[0] + b->data[0];  // out->data[0] takes the value
615     out->data[1] = a->data[1] + b->data[1];  // out->data[1] takes the value
616     out->data[2] = a->data[2] + b->data[2];  // out->data[2] takes the value
617     out->data[3] = a->data[3] + b->data[3];  // out->data[3] takes the value
618 }
619 
620 /*
621  * field element negation(NOT)
622  * Input:
623  *      in[] <= 2^57 - 2^41 - 2^1
624  * Output:
625  *      out[] <= 2^57 + 2^1 + in[] < 2^64
626  */
FelemNeg(Felem * out,const Felem * in)627 static inline void FelemNeg(Felem *out, const Felem *in)
628 {
629     static const Felem zeroBase = {{
630         (((uint64_t)1 << 56) + 1)                       << 1,
631         (((uint64_t)1 << 56) - ((uint64_t)1 << 40) - 1) << 1,
632         (((uint64_t)1 << 56) - 1)                       << 1,
633         (((uint64_t)1 << 56) - 1)                       << 1,
634     }};
635 
636     out->data[0] = zeroBase.data[0] - in->data[0];  // out->data[0] takes the value
637     out->data[1] = zeroBase.data[1] - in->data[1];  // out->data[1] takes the value
638     out->data[2] = zeroBase.data[2] - in->data[2];  // out->data[2] takes the value
639     out->data[3] = zeroBase.data[3] - in->data[3];  // out->data[3] takes the value
640 }
641 
642 /*
643  * field subtraction
644  * Input:
645  *      a[] < 2^64 - 2^60 - 2^4,b[] <= 2^60 - 2^44 - 2^4
646  * Output:
647  *      out[] <= 2^60 + 2^4 + a[] < 2^64
648  */
FelemSub(Felem * out,const Felem * a,const Felem * b)649 static inline void FelemSub(Felem *out, const Felem *a, const Felem *b)
650 {
651     static const Felem zeroBase = {{
652         (((uint64_t)1 << 56) + 1)                       << 4,
653         (((uint64_t)1 << 56) - ((uint64_t)1 << 40) - 1) << 4,
654         (((uint64_t)1 << 56) - 1)                       << 4,
655         (((uint64_t)1 << 56) - 1)                       << 4,
656     }};
657 
658     out->data[0] = zeroBase.data[0] + a->data[0] - b->data[0];  // out->data[0] takes the value
659     out->data[1] = zeroBase.data[1] + a->data[1] - b->data[1];  // out->data[1] takes the value
660     out->data[2] = zeroBase.data[2] + a->data[2] - b->data[2];  // out->data[2] takes the value
661     out->data[3] = zeroBase.data[3] + a->data[3] - b->data[3];  // out->data[3] takes the value
662 }
663 
664 /*
665  * field subtraction, input LongFelem directly.
666  * Input:
667  *      a[] < 2^128 - 2^124,b[] <= 2^124 - 2^68 - 2^52
668  * Output:
669  *      out[] <= 2^124 + a[] < 2^128
670  */
LongFelemSub(LongFelem * out,const LongFelem * a,const LongFelem * b)671 static void LongFelemSub(LongFelem *out, const LongFelem *a, const LongFelem *b)
672 {
673     // p shift left (8 + 56 * 3)
674     static const LongFelem zeroBase = {{
675         ((uint128_t)1 << 112)                                                   << 12,
676         (((uint128_t)1 << 112) - ((uint128_t)1 << 56))                          << 12,
677         (((uint128_t)1 << 112) - ((uint128_t)1 << 56))                          << 12,
678         (((uint128_t)1 << 112) - ((uint128_t)1 << 56))                          << 12,
679         (((uint128_t)1 << 112) - ((uint128_t)1 << 56) + 1)                      << 12,  // 1
680         (((uint128_t)1 << 112) - ((uint128_t)1 << 56) - ((uint128_t)1 << 40))   << 12,  // -2^96
681         (((uint128_t)1 << 112) - ((uint128_t)1 << 56))                          << 12,  // 2^224
682     }};
683 
684     out->data[0] = zeroBase.data[0] + a->data[0] - b->data[0];  // out->data[0] takes the value
685     out->data[1] = zeroBase.data[1] + a->data[1] - b->data[1];  // out->data[1] takes the value
686     out->data[2] = zeroBase.data[2] + a->data[2] - b->data[2];  // out->data[2] takes the value
687     out->data[3] = zeroBase.data[3] + a->data[3] - b->data[3];  // out->data[3] takes the value
688     out->data[4] = zeroBase.data[4] + a->data[4] - b->data[4];  // out->data[4] takes the value
689     out->data[5] = zeroBase.data[5] + a->data[5] - b->data[5];  // out->data[5] takes the value
690     out->data[6] = zeroBase.data[6] + a->data[6] - b->data[6];  // out->data[6] takes the value
691 }
692 
693 /*
694  * field element magnification
695  * Use only a small magnification factor to ensure that in[]*scalar does not overflow.
696  */
FelemScale(Felem * out,const Felem * in,const uint32_t scalar)697 static inline void FelemScale(Felem *out, const Felem *in, const uint32_t scalar)
698 {
699     out->data[0] = in->data[0] * scalar;  // out->data[0] takes the value
700     out->data[1] = in->data[1] * scalar;  // out->data[1] takes the value
701     out->data[2] = in->data[2] * scalar;  // out->data[2] takes the value
702     out->data[3] = in->data[3] * scalar;  // out->data[3] takes the value
703 }
704 
705 /*
706  * field element magnification, input LongFelem directly.
707  * Use only a small magnification factor to ensure that in[]*scalar does not overflow.
708  */
LongFelemScale(LongFelem * out,const LongFelem * in,const uint32_t scalar)709 static inline void LongFelemScale(LongFelem *out, const LongFelem *in, const uint32_t scalar)
710 {
711     out->data[0] = in->data[0] * scalar;  // out->data[0] takes the value
712     out->data[1] = in->data[1] * scalar;  // out->data[1] takes the value
713     out->data[2] = in->data[2] * scalar;  // out->data[2] takes the value
714     out->data[3] = in->data[3] * scalar;  // out->data[3] takes the value
715     out->data[4] = in->data[4] * scalar;  // out->data[4] takes the value
716     out->data[5] = in->data[5] * scalar;  // out->data[5] takes the value
717     out->data[6] = in->data[6] * scalar;  // out->data[6] takes the value
718 }
719 
720 /*
721  * field Multiplication
722  * Input:
723  *      a[] < 2^62, b[] < 2^62
724  * output:
725  *      out[] < a[] * b[] * 4 < 2^126
726  *      - out[0] < 2^124
727  *      - out[1] < 2^124 * 2
728  *      - out[2] < 2^124 * 3
729  *      - out[3] < 2^124 * 4
730  *      - out[4] < 2^124 * 3
731  *      - out[5] < 2^124 * 2
732  *      - out[6] < 2^124
733  */
FelemMul(LongFelem * out,const Felem * a,const Felem * b)734 static void FelemMul(LongFelem *out, const Felem *a, const Felem *b)
735 {
736     // out[0] = a[0]*b[0]
737     // out[1] = a[0]*b[1] + a[1]*b[0]
738     // out[2] = a[0]*b[2] + a[1]*b[1] + a[2]*b[0]
739     // out[3] = a[0]*b[3] + a[1]*b[2] + a[2]*b[1] + a[3]*b[0]
740     // out[4] =             a[1]*b[3] + a[2]*b[2] + a[3]*b[1]
741     // out[5] =                         a[2]*b[3] + a[3]*b[2]
742     // out[6] =                                     a[3]*b[3]
743 
744     uint128_t *po = out->data;
745     const uint64_t *pa = a->data;
746     const uint64_t *pb = b->data;
747 
748     po[0] = (uint128_t)pa[0] * pb[0];
749     po[1] = (uint128_t)pa[0] * pb[1] + (uint128_t)pa[1] * pb[0];
750     po[2] = (uint128_t)pa[0] * pb[2] + (uint128_t)pa[1] * pb[1] + (uint128_t)pa[2] * pb[0];
751     po[3] = (uint128_t)pa[0] * pb[3] + (uint128_t)pa[1] * pb[2] + (uint128_t)pa[2] * pb[1] + (uint128_t)pa[3] * pb[0];
752     po[4] =                            (uint128_t)pa[1] * pb[3] + (uint128_t)pa[2] * pb[2] + (uint128_t)pa[3] * pb[1];
753     po[5] =                                                       (uint128_t)pa[2] * pb[3] + (uint128_t)pa[3] * pb[2];
754     po[6] =                                                                                  (uint128_t)pa[3] * pb[3];
755 }
756 
757 /*
758  * field Square
759  * Input:
760  *      a[] < 2^62
761  * output:
762  *      out[] < a[] * a[] * 4 < 2^126
763  *      - out[0] < 2^124
764  *      - out[1] < 2^124 * 2
765  *      - out[2] < 2^124 * 3
766  *      - out[3] < 2^124 * 4
767  *      - out[4] < 2^124 * 3
768  *      - out[5] < 2^124 * 2
769  *      - out[6] < 2^124
770  */
FelemSqr(LongFelem * out,const Felem * a)771 static void FelemSqr(LongFelem *out, const Felem *a)
772 {
773     // out[0] = a[0]*a[0]
774     // out[1] = a[0]*a[1]*2
775     // out[2] = a[0]*a[2]*2 + a[1]*a[1]
776     // out[3] = a[0]*a[3]*2 + a[1]*a[2]*2
777     // out[4] = a[1]*a[3]*2 + a[2]*a[2]
778     // out[5] = a[2]*a[3]*2
779     // out[6] = a[3]*a[3]
780 
781     const uint64_t *pa = a->data;
782 
783     uint64_t a1x2 = pa[1] << 1;
784     uint64_t a2x2 = pa[2] << 1;
785 
786     out->data[0] = (uint128_t)pa[0] * pa[0];
787     out->data[1] = (uint128_t)pa[0] * a1x2;
788     out->data[2] = (uint128_t)pa[0] * a2x2 + (uint128_t)pa[1] * pa[1];
789     out->data[3] = ((uint128_t)pa[0] * pa[3] + (uint128_t)pa[1] * pa[2]) << 1;
790     out->data[4] = (uint128_t)pa[3] * a1x2 + (uint128_t)pa[2] * pa[2];
791     out->data[5] = (uint128_t)pa[3] * a2x2;
792     out->data[6] = (uint128_t)pa[3] * pa[3];
793 }
794 
FelemMulReduce(Felem * out,const Felem * a,const Felem * b)795 static inline void FelemMulReduce(Felem *out, const Felem *a, const Felem *b)
796 {
797     LongFelem ltmp;
798     FelemMul(&ltmp, a, b);
799     LongFelemReduce(out, &ltmp);
800 }
801 
FelemSqrReduce(Felem * out,const Felem * in)802 static inline void FelemSqrReduce(Felem *out, const Felem *in)
803 {
804     LongFelem ltmp;
805     FelemSqr(&ltmp, in);
806     LongFelemReduce(out, &ltmp);
807 }
808 
809 /*
810  * field element inversion
811  * From Fermat's little theorem, in^(p - 2) = in^(-1) (mod p)
812  * in^(-1) = in^(2^224 - 2^96 + 1 - 2) (mod p)
813  * Input:
814  *      in[i] < 2^63
815  * Output:
816  *      reduce(out[i])
817  */
FelemInv(Felem * out,const Felem * in)818 static void FelemInv(Felem *out, const Felem *in)
819 {
820     Felem inE1, inE6, inE24, inE96;  //  inEx indicates in^(2^(0)+2^(1)+'''+2^(x-1))
821     uint32_t i;
822 
823     //  Construct inE1
824     FelemReduce(out, in);
825     FelemAssign(&inE1, out);
826 
827     //  Construct inE2
828     FelemSqrReduce(out, out);
829     FelemMulReduce(out, out, &inE1);
830 
831     //  Construct inE4, and store it in inE6
832     FelemAssign(&inE6, out);
833     FelemSqrReduce(&inE6, &inE6);
834     FelemSqrReduce(&inE6, &inE6);
835     FelemMulReduce(&inE6, out, &inE6); //  inE6 is temporarily stored in inE4
836 
837     //  Construct in^6
838     FelemSqrReduce(&inE6, &inE6);
839     FelemSqrReduce(&inE6, &inE6);
840     FelemMulReduce(&inE6, out, &inE6);
841 
842     //  Construct inE12
843     FelemAssign(out, &inE6);
844     for (i = 0; i < 6; ++i) {     //  Moves the out by 6 digits to the left.
845         FelemSqrReduce(out, out);
846     }
847     FelemMulReduce(out, &inE6, out);
848 
849     //  Construct inE24
850     FelemAssign(&inE96, out);
851     for (i = 0; i < 12; ++i) {     //  Moves the out by 12 digits to the left.
852         FelemSqrReduce(out, out);
853     }
854     FelemMulReduce(&inE24, &inE96, out);
855 
856     //  Construct inE48
857     FelemAssign(out, &inE24);
858     for (i = 0; i < 24; ++i) {     //  Moves the out by 24 digits to the left.
859         FelemSqrReduce(out, out);
860     }
861     FelemMulReduce(out, &inE24, out);
862 
863     //  Construct inE96
864     FelemAssign(&inE96, out);
865     for (i = 0; i < 48; ++i) {    //  Moves the out by 48 digits to the left.
866         FelemSqrReduce(out, out);
867     }
868     FelemMulReduce(&inE96, &inE96, out);
869 
870     //  Construct inE7, and store it in inE6
871     FelemSqrReduce(&inE6, &inE6);
872     FelemMulReduce(&inE6, &inE6, &inE1);
873 
874     //  Construct inE31, and store it in inE24
875     for (i = 0; i < 7; ++i) {     //  Moves the inE24 by 7 digits to the left.
876         FelemSqrReduce(&inE24, &inE24);
877     }
878     FelemMulReduce(&inE24, &inE6, &inE24);
879 
880     //  Construct inE127, and store it in inE24
881     FelemAssign(out, &inE96);
882     for (i = 0; i < 31; ++i) {    //  Moves the out by 31 digits to the left.
883         FelemSqrReduce(out, out);
884     }
885     FelemMulReduce(&inE24, &inE24, out);
886 
887     // Move inE127 by 97 bits to the left and add inE97 to obtain inE(2^224 - 2^96 + 1 - 2)
888     for (i = 0; i < 97; ++i) {    //  shifts inE24 to the left by 97 bits, and inE24 stores inE127.
889         FelemSqrReduce(&inE24, &inE24);
890     }
891     FelemMulReduce(out, &inE96, &inE24);
892 }
893 
894 /* --------------------------Point group operation-------------------------- */
PtAssign(Point * out,const Point * in)895 static inline void PtAssign(Point *out, const Point *in)
896 {
897     FelemAssign(&out->x, &in->x);
898     FelemAssign(&out->y, &in->y);
899     FelemAssign(&out->z, &in->z);
900 }
901 
PtAssignWithMask(Point * out,const Point * in,const uint64_t mask)902 static inline void PtAssignWithMask(Point *out, const Point *in, const uint64_t mask)
903 {
904     FelemAssignWithMask(&out->x, &in->x, mask);
905     FelemAssignWithMask(&out->y, &in->y, mask);
906     FelemAssignWithMask(&out->z, &in->z, mask);
907 }
908 
909 /*
910  * double the point
911  * Algorithm reference: http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
912  * Number of field operations: 3M + 5S
913  *      delta = Z^2
914  *      gamma = Y^2
915  *      beta = X * gamma
916  *      alpha = 3 * (X - delta) * (X + delta)
917  *      X' = alpha^2 - 8 * beta
918  *      Z' = (Y + Z)^2 - gamma - delta
919  *      Y' = alpha * (4 * beta - X') - 8 * gamma^2
920  * Input:
921  *      in->x[] < 2^59, in->y[] < 2^61, in->z[] < 2^61
922  * Output:
923  *      reduce(out->x), reduce(out->y), out->z[] < 2^61
924  */
PtDouble(Point * out,const Point * in)925 static void PtDouble(Point *out, const Point *in)
926 {
927     Felem delta, gamma, beta, alpha;
928     Felem tmp, tmp2;
929     LongFelem ltmp, ltmp2;
930 
931     // delta = Z^2
932     FelemSqrReduce(&delta, &in->z);
933 
934     // gamma = Y^2
935     FelemSqrReduce(&gamma, &in->y);
936 
937     // beta = X * gamma
938     FelemMulReduce(&beta, &in->x, &gamma);
939 
940     // alpha = 3 * (X - delta) * (X + delta)
941     FelemAdd(&tmp, &in->x, &delta);         // tmp[] < 2^59 + 2^57 < 2^60
942     FelemScale(&tmp, &tmp, 3);              // tmp[] < 2^60 * 3 < 2^62
943     FelemSub(&tmp2, &in->x, &delta);        // tmp2[] < 2^60 + 2^4 + 2^59 < 2^61
944     FelemMulReduce(&alpha, &tmp, &tmp2);
945 
946     // X' = alpha^2 - 8 * beta
947     FelemSqrReduce(&tmp, &alpha);           // alpha^2, tmp[] < 2^56 + 2^16
948     FelemScale(&tmp2, &beta, 8);            // tmp2[] < (2^56 + 2^16) * 8 = 2^59 + 2^19
949     FelemSub(&out->x, &tmp, &tmp2);         // xout[] < 2^60 + 2^4 + 2^56 + 2^16 < 2^61
950     FelemReduce(&out->x, &out->x);          // xout[] < 2^56 + 2^7
951 
952     // Z' = (Y + Z)^2 - gamma - delta
953     FelemAdd(&tmp, &in->y, &in->z);         // < 2^61 + 2^61 = 2^62
954     FelemSqrReduce(&tmp, &tmp);             // (Y + Z)^2, tmp[] < 2^56 + 2^16
955     FelemAdd(&tmp2, &gamma, &delta);        // (gamma + delta), tmp2[] < (2^56 + 2^16) * 2 = 2^57 + 2^17
956     FelemSub(&out->z, &tmp, &tmp2);         // zout[] < 2^60 + 2^4 + 2^56 + 2^16 < 2^61
957 
958     // Y' = alpha * (4 * beta - X') - 8 * gamma^2
959     FelemScale(&tmp, &beta, 4);             // tmp[] < (2^56 + 2^16) * 4 = 2^58 + 2^18
960     FelemSub(&tmp, &tmp, &out->x);          // tmp[] < 2^60 + 2^4 + 2^58 + 2^18 < 2^61
961     FelemMul(&ltmp, &alpha, &tmp);          // alpha * (4 * beta - X'), ltmp[] < 2^(61 * 2) * 4 = 2^124
962     FelemSqr(&ltmp2, &gamma);               // gamma < 2^57, ltmp2[] < 2^(57 * 2) * 4 = 2^116
963     LongFelemScale(&ltmp2, &ltmp2, 8);      // 8 * gamma^2, ltmp2[] < 2^116 * 8 = 2^119
964     LongFelemSub(&ltmp, &ltmp, &ltmp2);     // ltmp[] < 2^124 + 2^124 < 2^125
965     LongFelemReduce(&out->y, &ltmp);        // yout[] < 2^56 + 2^16
966 }
967 
968 /*
969  * point addition
970  * Algorithm reference: http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
971  * Infinity point calculation is not supported.
972  * Number of field operations: 11M + 5S
973  *      Z1Z1 = Z1^2
974  *      Z2Z2 = Z2^2
975  *      U1 = X1 * Z2Z2
976  *      U2 = X2 * Z1Z1
977  *      S1 = Y1 * Z2 * Z2Z2
978  *      S2 = Y2 * Z1 * Z1Z1
979  *      H = U2 - U1
980  *      I = (2 * H)^2
981  *      J = H * I
982  *      r = 2 * (S2 - S1)
983  *      V = U1 * I
984  *      X3 = r^2 - J - 2 * V
985  *      Y3 = r * (V - X3) - 2 * S1 * J
986  *      Z3 = ((Z1 + Z2)^2 - Z1Z1 - Z2Z2) * H
987  * Input:
988  *      a->x[] < 2^64, a->y[] < 2^64, a->z[] < 2^61
989  *      reduce(b->x), b->y[] < 2^58, reduce(b->z)
990  * Output:
991  *      out->x[] < max(reduce(out->x), a->x[])
992  *      out->y[] < max(reduce(out->y), a->y[], b->y[])
993  *      out->z[] < max(reduce(out->z), a->z[])
994  */
PtAdd(Point * out,const Point * a,const Point * b)995 static void PtAdd(Point *out, const Point *a, const Point *b)
996 {
997     Point result;
998     Felem z1sqr, z2sqr, u1, u2, s1, s2, h, i, j, r, v;
999     Felem tmp;
1000     LongFelem ltmp, ltmp2;
1001     uint64_t isZ1Zero, isZ2Zero;
1002 
1003     // Z1Z1 = Z1^2
1004     FelemSqrReduce(&z1sqr, &a->z);
1005 
1006     // Z2Z2 = Z2^2
1007     FelemSqrReduce(&z2sqr, &b->z);
1008 
1009     isZ1Zero = 0 - FelemIsZero(&z1sqr);
1010     isZ2Zero = 0 - FelemIsZero(&z2sqr);
1011 
1012     // U1 = X1 * Z2Z2
1013     FelemMulReduce(&u1, &a->x, &z2sqr);
1014 
1015     // U2 = X2 * Z1Z1
1016     FelemMulReduce(&u2, &b->x, &z1sqr);
1017 
1018     // S1 = Y1 * Z2 * Z2Z2
1019     FelemMulReduce(&s1, &b->z, &z2sqr);  // Z2 * Z2Z2
1020     FelemMulReduce(&s1, &a->y, &s1);
1021 
1022     // S2 = Y2 * Z1 * Z1Z1
1023     FelemMulReduce(&s2, &a->z, &z1sqr);  // Z1 * Z1Z1
1024     FelemMulReduce(&s2, &b->y, &s2);
1025 
1026     // H = U2 - U1
1027     FelemSub(&h, &u2, &u1);
1028     FelemReduce(&h, &h);
1029 
1030     // r = 2 * (S2 - S1)
1031     FelemSub(&r, &s2, &s1);
1032     FelemScale(&r, &r, 2);  // 2 * (S2 - S1)
1033     FelemReduce(&r, &r);
1034 
1035     // H and r can determine whether x and y of the affine coordinates of two points are equal.
1036     // If the values are equal, double the values.
1037     if (isZ1Zero == 0 && isZ2Zero == 0 && FelemIsZero(&h) != 0 && FelemIsZero(&r) != 0) {
1038         // Use the smaller b point
1039         PtDouble(out, b);
1040         return;
1041     }
1042 
1043     // I = (2 * H)^2
1044     FelemScale(&tmp, &h, 2);
1045     FelemSqrReduce(&i, &tmp);
1046 
1047     // J = H * I
1048     FelemMulReduce(&j, &h, &i);
1049 
1050     // V = U1 * I
1051     FelemMulReduce(&v, &u1, &i);
1052 
1053     // X3 = r^2 - (J + 2 * V)
1054     FelemSqrReduce(&result.x, &r);              // r^2
1055     FelemScale(&tmp, &v, 2);                    // tmp[] < (2^56 + 2^16) * 2 = 2^57 + 2^17
1056     FelemAdd(&tmp, &tmp, &j);                   // J + 2 * V, tmp[] < (2^57 + 2^17) + (2^56 + 2^16) < 2^58
1057     FelemSub(&result.x, &result.x, &tmp);       // result.x[] < (2^60 + 2^4) + (2^56 + 2^16) < 2^61
1058     FelemReduce(&result.x, &result.x);          // result.x[] < 2^56 + 2^7
1059 
1060     // Y3 = r * (V - X3) - 2 * S1 * J
1061     FelemSub(&tmp, &v, &result.x);              // tmp < (2^60 + 2^4) + (2^56 + 2^16) < 2^61
1062     FelemMul(&ltmp, &r, &tmp);                  // r * (V - X3), ltmp[] < 2^57 * 2^61 * 4 = 2^120
1063     FelemMul(&ltmp2, &s1, &j);                  // ltmp2[] < 2^57 * 2^57 * 4 = 2^116
1064     LongFelemScale(&ltmp2, &ltmp2, 2);          // 2 * S1 * J, ltmp2[] < 2^117
1065     LongFelemSub(&ltmp, &ltmp, &ltmp2);         // ltmp[] < 2^124 + 2^120 < 2^125
1066     LongFelemReduce(&result.y, &ltmp);          // result.y[] < 2^56 + 2^16
1067 
1068     // Z3 = ((Z1 + Z2)^2 - Z1Z1 - Z2Z2) * H
1069     FelemAdd(&result.z, &a->z, &b->z);          // Z1 + Z2, result.z[] < 2^61 + 2^57 < 2^62
1070     FelemSqrReduce(&result.z, &result.z);       // (Z1 + Z2)^2
1071     FelemAdd(&tmp, &z1sqr, &z2sqr);             // Z1Z1 + Z2Z2
1072     FelemSub(&result.z, &result.z, &tmp);       // ((Z1 + Z2)^2 - Z1Z1 - Z2Z2)
1073     FelemMulReduce(&result.z, &result.z, &h);   // result.z[] < 2^56 + 2^16
1074 
1075     // Special case processing for infinity points
1076     PtAssignWithMask(&result, a, isZ2Zero);
1077     PtAssignWithMask(&result, b, isZ1Zero);
1078     PtAssign(out, &result);
1079 }
1080 
1081 /*
1082  * mixed addition of point
1083  * Algorithm reference: http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-madd-2007-bl
1084  * Infinity point calculation is not supported.
1085  * Number of field operations: 7M + 4S
1086  *      Z1Z1 = Z1^2
1087  *      U2 = X2 * Z1Z1
1088  *      S2 = Y2 * Z1 * Z1Z1
1089  *      H = U2 - X1
1090  *      HH = H^2
1091  *      I = 4 * HH
1092  *      J = H * I
1093  *      r = 2 * (S2 - Y1)
1094  *      V = X1 * I
1095  *      X3 = r^2 - J - 2 * V
1096  *      Y3 = r * (V - X3) - 2 * Y1 * J
1097  *      Z3 = (Z1 + H)^2 - Z1Z1 - HH
1098  * Input:
1099  *      a->x[] <= 2^60 - 2^44 - 2^4, a->y[] <= 2^60 - 2^44 - 2^4, a->z[] < 2^61
1100  *      b->x < p, b->y < p, b->z = 0 or 1
1101  * Output:
1102  *      out->x[] < max(reduce(out->x), a->x[])
1103  *      out->y[] < max(reduce(out->y), a->y[])
1104  *      out->z[] < 2^61
1105  */
PtAddMixed(Point * out,const Point * a,const Point * b)1106 static void PtAddMixed(Point *out, const Point *a, const Point *b)
1107 {
1108     Point result;
1109     Felem z1sqr, u2, s2, h, hsqr, i, j, r, v;
1110     Felem tmp;
1111     LongFelem ltmp, ltmp2;
1112     uint64_t isZ1Zero, isZ2Zero;
1113 
1114     // Z1Z1 = Z1^2
1115     FelemSqrReduce(&z1sqr, &a->z);
1116 
1117     isZ1Zero = 0 - FelemIsZero(&z1sqr);
1118     // The Z coordinate of point b can only be 0 or 1, that is, the bit 0 can only be 0 or 1, and the other bits are 0.
1119     isZ2Zero = b->z.data[0] - 1;
1120 
1121     // U2 = X2 * Z1Z1
1122     FelemMulReduce(&u2, &b->x, &z1sqr);
1123 
1124     // S2 = Y2 * Z1 * Z1Z1
1125     FelemMulReduce(&s2, &a->z, &z1sqr);  // Z1 * Z1Z1
1126     FelemMulReduce(&s2, &b->y, &s2);
1127 
1128     // H = U2 - X1
1129     FelemSub(&h, &u2, &a->x);
1130     FelemReduce(&h, &h);
1131 
1132     // r = 2 * (S2 - Y1)
1133     FelemSub(&r, &s2, &a->y);  // r[] < (2^60 + 2^4) + (2^56 + 2^16) < 2^61
1134     FelemScale(&r, &r, 2);     // r[] < 2^62
1135     FelemReduce(&r, &r);
1136 
1137     // H and r can determine whether x and y of the affine coordinates of two points are equal.
1138     // If they are equal, double the point.
1139     if (isZ1Zero == 0 && isZ2Zero == 0 && FelemIsZero(&h) != 0 && FelemIsZero(&r) != 0) {
1140         // Use the smaller b point
1141         PtDouble(out, b);
1142         return;
1143     }
1144 
1145     // HH = H^2
1146     FelemSqrReduce(&hsqr, &h);
1147 
1148     // I = 4 * HH
1149     FelemScale(&i, &hsqr, 4);  // i[] < (2^56 + 2^16) * 4 = 2^58 + 2^18
1150 
1151     // J = H * I
1152     FelemMulReduce(&j, &h, &i);
1153 
1154     // V = X1 * I
1155     FelemMulReduce(&v, &a->x, &i);
1156 
1157     // X3 = r^2 - J - 2 * V
1158     FelemSqrReduce(&result.x, &r);          // r^2
1159     FelemScale(&tmp, &v, 2);                // 2 * V, tmp[] < (2^56 + 2^16) * 2 = 2^57 + 2^17
1160     FelemAdd(&tmp, &j, &tmp);               // J + 2 * V, tmp[] < (2^56 + 2^16) + (2^57 + 2^17) < 2^58
1161     FelemSub(&result.x, &result.x, &tmp);   // result.x[] < (2^60 + 2^4) + (2^56 + 2^16) < 2^61
1162     FelemReduce(&result.x, &result.x);      // result.x[] < 2^56 + 2^7
1163 
1164     // Y3 = r * (V - X3) - 2 * Y1 * J
1165     FelemSub(&tmp, &v, &result.x);          // V - X3, tmp[] < (2^60 + 2^4) + (2^56 + 2^16) < 2^61
1166     FelemMul(&ltmp, &r, &tmp);              // r * (V - X3), ltmp[] < 2^57 * 2^61 * 4 = 2^120
1167     FelemMul(&ltmp2, &a->y, &j);            // Y1 * J, ltmp2[] < 2^61 * 2^57 * 4 = 2^120
1168     LongFelemScale(&ltmp2, &ltmp2, 2);      // 2 * Y1 * J, ltmp2[] < 2^120 * 2 = 2^121
1169     LongFelemSub(&ltmp, &ltmp, &ltmp2);     // ltmp[] < 2^124 + 2^120 < 2^125
1170     LongFelemReduce(&result.y, &ltmp);      // result.y[] < 2^56 + 2^16
1171 
1172     // Z3 = (Z1 + H)^2 - Z1Z1 - HH
1173     FelemAdd(&result.z, &a->z, &h);         // Z1 + H, result.z[] < 2^61 + (2^56 + 2^7) = 2^62
1174     FelemSqrReduce(&result.z, &result.z);
1175     FelemAdd(&tmp, &z1sqr, &hsqr);          // Z1Z1 + HH, tmp[] < (2^56 + 2^16) + (2^56 + 2^16) < 2^58
1176     FelemSub(&result.z, &result.z, &tmp);   // result.z[] < (2^60 + 2^4) + 2^58 < 2^61
1177 
1178     // Special case processing for infinity points
1179     PtAssignWithMask(&result, a, isZ2Zero);
1180     PtAssignWithMask(&result, b, isZ1Zero);
1181     PtAssign(out, &result);
1182 }
1183 
1184 /* Select the point that subscript is index in the table and place it in the Point *point.
1185    The anti-side channel processing exists. */
GetPointFromTable(Point * point,const Point table[],uint32_t pointNum,const uint32_t index)1186 static inline void GetPointFromTable(Point *point, const Point table[], uint32_t pointNum, const uint32_t index)
1187 {
1188     uint64_t mask;
1189     for (uint32_t i = 0; i < pointNum; i++) {
1190         /* If i is equal to index, the last mask is all Fs. Otherwise, the last mask is all 0s. */
1191         mask = (0 - (i ^ index)) >> 31;  // shifted rightwards by 31 bits to get the most significant bit
1192         mask--;
1193         /* Conditional value assignment, valid only when i == index */
1194         PtAssignWithMask(point, &table[i], mask);
1195     }
1196 }
1197 
1198 /*
1199  * Input:
1200  *      k1 < n
1201  *      0 <= i < 28
1202  * Output:
1203  *      out->x < p, out->y < p, out->z = 0 OR 1
1204  */
GetUpperPrecomputePtOfG(Point * out,const Felem * k1,int32_t curBit)1205 static inline void GetUpperPrecomputePtOfG(Point *out, const Felem *k1, int32_t curBit)
1206 {
1207     uint32_t bits;
1208     uint32_t i = (uint32_t)curBit;
1209 
1210     // The i bit of the upper half of digit 0. (BASE_BITS/2) is half-wide.
1211     bits = (uint32_t)(k1->data[0] >> (i + BASE_BITS / 2)) & 1;
1212     // The i bit of the upper half of digit 1. (BASE_BITS/2) is half-wide.
1213     bits |= ((uint32_t)(k1->data[1] >> (i + BASE_BITS / 2)) & 1) << 1;
1214     // The i bit of the upper half of digit 2. (BASE_BITS/2) is half-wide.
1215     bits |= ((uint32_t)(k1->data[2] >> (i + BASE_BITS / 2)) & 1) << 2;
1216     // The i bit of the upper half of digit 3. (BASE_BITS/2) is half-wide.
1217     bits |= ((uint32_t)(k1->data[3] >> (i + BASE_BITS / 2)) & 1) << 3;
1218 
1219     GetPointFromTable(out, PRE_MUL_G2, TABLE_G_SIZE, bits);
1220 }
1221 
1222 /*
1223  * Input:
1224  *      k1 < n
1225  *      0 <= i < 28
1226  * Output:
1227  *      out->x < p, out->y < p, out->z = 0 or 1
1228  */
GetLowerPrecomputePtOfG(Point * out,const Felem * k1,int32_t curBit)1229 static inline void GetLowerPrecomputePtOfG(Point *out, const Felem *k1, int32_t curBit)
1230 {
1231     uint32_t bits;
1232     uint32_t i = (uint32_t)curBit;
1233 
1234     bits = (uint32_t)(k1->data[0] >> i) & 1;          // The i bit of the lower half of digit 0.
1235     bits |= ((uint32_t)(k1->data[1] >> i) & 1) << 1;  // The i bit of the lower half of digit 1.
1236     bits |= ((uint32_t)(k1->data[2] >> i) & 1) << 2;  // The i bit of the lower half of digit 2.
1237     bits |= ((uint32_t)(k1->data[3] >> i) & 1) << 3;  // The i bit of the lower half of digit 3.
1238 
1239     GetPointFromTable(out, PRE_MUL_G, TABLE_G_SIZE, bits);
1240 }
1241 
1242 /*
1243  * Input:
1244  *      k2 < n
1245  *      0 <= i <= 220
1246  *      The coordinates of each point of preMulPt are reduced.
1247  * Output:
1248  *      reduce(out->x)
1249  *      out->y[] < max(reduce(out->y), negY)
1250  *      reduce(out->z)
1251  */
GetPrecomputePtOfP(Point * out,const Felem * k2,int32_t curBit,const Point preMulPt[TABLE_P_SIZE])1252 static inline void GetPrecomputePtOfP(Point *out, const Felem *k2, int32_t curBit, const Point preMulPt[TABLE_P_SIZE])
1253 {
1254     uint32_t bits;
1255     uint32_t sign, value;  // the grouping sign and actual value.
1256     Felem negY;
1257     // Obtain the 5-bit signed code and read the sign bits of the next group of numbers
1258     // to determine whether there is a carry. The total length is 6.
1259     bits = (uint32_t)FelemGetBits(k2, curBit - 1, WINDOW_SIZE + 1);
1260     DecodeScalarCode(&sign, &value, bits);
1261 
1262     GetPointFromTable(out, preMulPt, TABLE_P_SIZE, value);
1263 
1264     // out->y < 2^56 + 2^16
1265     FelemNeg(&negY, &out->y); // negY[] < (2^57 + 2^1) + (2^56 + 2^16) < 2^58
1266     FelemAssignWithMask(&out->y, &negY, (uint64_t)0 - sign);
1267 }
1268 
1269 /*
1270  * Calculate k1 * G + k2 * P
1271  * Input:
1272  *      k1 < n
1273  *      k2 < n
1274  *      The coordinates of each point of preMulPt are reduced.
1275  * Output:
1276  *      out->x < p, out->y < p, out->z < p
1277  */
PtMul(Point * out,const Felem * k1,const Felem * k2,const Point preMulPt[TABLE_P_SIZE])1278 static void PtMul(Point *out, const Felem *k1, const Felem *k2, const Point preMulPt[TABLE_P_SIZE])
1279 {
1280     Point ptQ = {};  // ptQ stores result
1281     Point ptPre = {};  // ptPre stores the points obtained from the table.
1282     bool isGMul = k1 != NULL;
1283     bool isPtMul = k2 != NULL && preMulPt != NULL;
1284     int32_t curBit;
1285 
1286     // Initialize the Q point.
1287     if (isPtMul) {
1288         curBit = 220;  // Start from 220th bit.
1289         // From k2's (_, 223, 222, 221, 220, 219) bit coding to select the initial point
1290         GetPrecomputePtOfP(&ptQ, k2, curBit, preMulPt);
1291     } else if (isGMul) {
1292         curBit = 27;  // Start from 27th.
1293         // From k1's (195, 139, 83, 27) bit coding to select the initial point
1294         GetLowerPrecomputePtOfG(&ptQ, k1, curBit);
1295         // From k1's (195 + 28, 139 + 28, 83 + 28, 27 + 28) bit to select the pre-calculation point
1296         // and adds it to the point Q.
1297         GetUpperPrecomputePtOfG(&ptPre, k1, curBit);
1298         PtAddMixed(&ptQ, &ptQ, &ptPre);
1299     } else {
1300         // k1 and k2 are all NULL, and the infinite point is output.
1301         (void)memset_s((void *)out, sizeof(Point), 0, sizeof(Point));
1302         return;
1303     }
1304 
1305     // Operation chain:                     point Q output range:
1306     //                                        x[]         y[]         z[]
1307     // Initialization the value             reduced     reduced     < 2^61
1308     //     ↓
1309     //  Double           ←↑                 reduced     reduced     < 2^61
1310     //     ↓              ↑
1311     //  mixed add         ↑                 reduced     reduced     < 2^61
1312     //     ↓              ↑
1313     //  mixed add        →↑                 reduced     reduced     < 2^61
1314     //     ↓              ↑
1315     // negation of Y      ↑                 reduced     < 2^58      < 2^61
1316     //     ↓              ↑
1317     //    add            →↑                 reduced     < 2^58      < 2^61
1318 
1319     while (--curBit >= 0) {
1320         // Start to shift right bit by bit. Due to the initialization of the most significant bit,
1321         // common point multiplication starts from 219th bit and base point multiplication starts from 26th bit.
1322         // Whether G-point multiplication is performed in the current cycle,
1323         // calculated once in each cycle starting from bit 27.
1324         bool isStepGMul = curBit <= 27;
1325         // Whether the current cycle is a common point multiplication, calculated once every 5 cycles
1326         bool isStepPtMul = curBit % WINDOW_SIZE == 0;
1327 
1328         PtDouble(&ptQ, &ptQ);
1329 
1330         // Generator G multiplication part
1331         // Divide k1 into eight segments, from high bits to low bits,
1332         // select bits from each segment and combine them together, and read the pre-computation table.
1333         // To reduce the precomputation table,
1334         // the divided eight segments are combined according to the upper half and the lower half
1335         if (isGMul && isStepGMul) {
1336             // Add the point multiplication result of the current bit of the eight-segment to the point Q
1337             GetLowerPrecomputePtOfG(&ptPre, k1, curBit);
1338             PtAddMixed(&ptQ, &ptQ, &ptPre);
1339 
1340             GetUpperPrecomputePtOfG(&ptPre, k1, curBit);
1341             PtAddMixed(&ptQ, &ptQ, &ptPre);
1342         }
1343 
1344         // Common point multiplication part
1345         // Use the sliding window signed encoding method
1346         // to group the most significant bits to the least significant bits every five bits.
1347         // Each group of numbers is regarded as a signed number. 00000 to 01111 are decimal numbers 0 to 15,
1348         // and 10000 to 11111 are decimal numbers (32-16) to (32-1).
1349         // This is equivalent to the set the number in complement form after the number carries 1.
1350         // for example
1351         // 11011(Complement) = 100000 - 00101
1352         if (isPtMul && isStepPtMul) {
1353             // Add the point multiplication result of the current group to the point Q.
1354             GetPrecomputePtOfP(&ptPre, k2, curBit, preMulPt);
1355             PtAdd(&ptQ, &ptQ, &ptPre);
1356         }
1357     }
1358 
1359     // Refer to the output range of the operation chain. Reduce the Y and Z coordinates.
1360     FelemReduce(&ptQ.y, &ptQ.y);
1361     FelemReduce(&ptQ.z, &ptQ.z);
1362     // do the modulo operation then output.
1363     FelemContract(&ptQ.x, &ptQ.x);
1364     FelemContract(&ptQ.y, &ptQ.y);
1365     FelemContract(&ptQ.z, &ptQ.z);
1366 
1367     PtAssign(out, &ptQ);
1368 }
1369 
1370 /*
1371  * Convert Jacobian coordinates to affine coordinates by a given module inverse
1372  */
PtMakeAffineWithInv(Point * out,const Point * in,const Felem * zInv)1373 static void PtMakeAffineWithInv(Point *out, const Point *in, const Felem *zInv)
1374 {
1375     Felem tmp;
1376 
1377     // 1/Z^2
1378     FelemSqrReduce(&tmp, zInv);
1379 
1380     // X/Z^2
1381     FelemMulReduce(&out->x, &in->x, &tmp);
1382     FelemContract(&out->x, &out->x);
1383 
1384     // 1/Z^3
1385     FelemMulReduce(&tmp, &tmp, zInv);
1386 
1387     // Y/Z^3
1388     FelemMulReduce(&out->y, &in->y, &tmp);
1389     FelemContract(&out->y, &out->y);
1390 
1391     FelemSetLimb(&out->z, 1);
1392 }
1393 
1394 /*
1395  * Obtain the pre-multiplication table of the input point pt, including 0pt-16pt.
1396  * The coordinates of all points are reduced.
1397  */
GetPreMulPt(Point preMulPt[TABLE_P_SIZE],const ECC_Point * pt)1398 static int32_t GetPreMulPt(Point preMulPt[TABLE_P_SIZE], const ECC_Point *pt)
1399 {
1400     int32_t ret;
1401 
1402     // 0pt
1403     (void)memset_s((void *)&preMulPt[0], sizeof(Point), 0, sizeof(Point));
1404     // 1pt
1405     GOTO_ERR_IF_EX(BN2Felem(&preMulPt[1].x, pt->x), ret);
1406     GOTO_ERR_IF_EX(BN2Felem(&preMulPt[1].y, pt->y), ret);
1407     GOTO_ERR_IF_EX(BN2Felem(&preMulPt[1].z, pt->z), ret);
1408     // 2pt ~ 15pt
1409     for (uint32_t i = 2; i < 15; i += 2) {
1410         PtDouble(&preMulPt[i], &preMulPt[i >> 1]);
1411         // Z coordinate after the doubled point is reduced.
1412         FelemReduce(&preMulPt[i].z, &preMulPt[i].z);
1413 
1414         PtAdd(&preMulPt[i + 1], &preMulPt[i], &preMulPt[1]);
1415     }
1416     // 16pt
1417     PtDouble(&preMulPt[16], &preMulPt[16 >> 1]);
1418     //  Z coordinate of the 16pt after the doubled point 16pt is reduced.
1419     FelemReduce(&preMulPt[16].z, &preMulPt[16].z);
1420 ERR:
1421     return ret;
1422 }
1423 
ECP224_PointMulAdd(ECC_Para * para,ECC_Point * r,const BN_BigNum * k1,const BN_BigNum * k2,const ECC_Point * pt)1424 int32_t ECP224_PointMulAdd(
1425     ECC_Para *para, ECC_Point *r, const BN_BigNum *k1, const BN_BigNum *k2, const ECC_Point *pt)
1426 {
1427     int32_t retVal;
1428     Felem fK1;
1429     Felem fK2;
1430     Point preMulPt[TABLE_P_SIZE];
1431     Point out;
1432 
1433     // Check the input parameters.
1434     GOTO_ERR_IF(CheckParaValid(para, CRYPT_ECC_NISTP224), retVal);
1435     GOTO_ERR_IF(CheckPointValid(r, CRYPT_ECC_NISTP224), retVal);
1436     GOTO_ERR_IF(CheckBnValid(k1, FELEM_BITS), retVal);
1437     GOTO_ERR_IF(CheckBnValid(k2, FELEM_BITS), retVal);
1438     GOTO_ERR_IF(CheckPointValid(pt, CRYPT_ECC_NISTP224), retVal);
1439     // Special treatment of infinity points
1440     if (BN_IsZero(pt->z)) {
1441         BSL_ERR_PUSH_ERROR(CRYPT_ECC_POINT_AT_INFINITY);
1442         return CRYPT_ECC_POINT_AT_INFINITY;
1443     }
1444 
1445     GOTO_ERR_IF_EX(BN2Felem(&fK1, k1), retVal);
1446     GOTO_ERR_IF_EX(BN2Felem(&fK2, k2), retVal);
1447     GOTO_ERR_IF_EX(GetPreMulPt(preMulPt, pt), retVal);
1448 
1449     PtMul(&out, &fK1, &fK2, preMulPt);
1450     GOTO_ERR_IF_EX(Felem2BN(r->x, &out.x), retVal);
1451     GOTO_ERR_IF_EX(Felem2BN(r->y, &out.y), retVal);
1452     GOTO_ERR_IF_EX(Felem2BN(r->z, &out.z), retVal);
1453 ERR:
1454     return retVal;
1455 }
1456 
ECP224_PointMul(ECC_Para * para,ECC_Point * r,const BN_BigNum * k,const ECC_Point * pt)1457 int32_t ECP224_PointMul(ECC_Para *para, ECC_Point *r, const BN_BigNum *k, const ECC_Point *pt)
1458 {
1459     int32_t retVal;
1460     Felem felemK;
1461     Point preMulPt[TABLE_P_SIZE];
1462     Point out;
1463 
1464     // Check the input parameters.
1465     GOTO_ERR_IF(CheckParaValid(para, CRYPT_ECC_NISTP224), retVal);
1466     GOTO_ERR_IF(CheckPointValid(r, CRYPT_ECC_NISTP224), retVal);
1467     GOTO_ERR_IF(CheckBnValid(k, FELEM_BITS), retVal);
1468     if (pt != NULL) {
1469         GOTO_ERR_IF(CheckPointValid(pt, CRYPT_ECC_NISTP224), retVal);
1470         // Special treatment of infinity points
1471         if (BN_IsZero(pt->z)) {
1472             BSL_ERR_PUSH_ERROR(CRYPT_ECC_POINT_AT_INFINITY);
1473             return CRYPT_ECC_POINT_AT_INFINITY;
1474         }
1475     }
1476 
1477     GOTO_ERR_IF_EX(BN2Felem(&felemK, k), retVal);
1478     //  When pt is NULL, r = k * G
1479     if (pt == NULL) {
1480         PtMul(&out, &felemK, NULL, NULL);
1481     } else {   //  If pt is not null, r = k * pt
1482         GOTO_ERR_IF_EX(GetPreMulPt(preMulPt, pt), retVal);
1483         PtMul(&out, NULL, &felemK, preMulPt);
1484     }
1485 
1486     GOTO_ERR_IF_EX(Felem2BN(r->x, &out.x), retVal);
1487     GOTO_ERR_IF_EX(Felem2BN(r->y, &out.y), retVal);
1488     GOTO_ERR_IF_EX(Felem2BN(r->z, &out.z), retVal);
1489 ERR:
1490     return retVal;
1491 }
1492 
ECP224_Point2Affine(const ECC_Para * para,ECC_Point * r,const ECC_Point * pt)1493 int32_t ECP224_Point2Affine(const ECC_Para *para, ECC_Point *r, const ECC_Point *pt)
1494 {
1495     int32_t retVal;
1496     Point out;
1497     Felem zInv;
1498 
1499     // Check the input parameters.
1500     GOTO_ERR_IF(CheckParaValid(para, CRYPT_ECC_NISTP224), retVal);
1501     GOTO_ERR_IF(CheckPointValid(r, CRYPT_ECC_NISTP224), retVal);
1502     GOTO_ERR_IF(CheckPointValid(pt, CRYPT_ECC_NISTP224), retVal);
1503     // Special treatment of infinity points
1504     if (BN_IsZero(pt->z)) {
1505         BSL_ERR_PUSH_ERROR(CRYPT_ECC_POINT_AT_INFINITY);
1506         return CRYPT_ECC_POINT_AT_INFINITY;
1507     }
1508 
1509     GOTO_ERR_IF_EX(BN2Felem(&out.x, pt->x), retVal);
1510     GOTO_ERR_IF_EX(BN2Felem(&out.y, pt->y), retVal);
1511     GOTO_ERR_IF_EX(BN2Felem(&out.z, pt->z), retVal);
1512 
1513     FelemInv(&zInv, &out.z);
1514     PtMakeAffineWithInv(&out, &out, &zInv);
1515 
1516     GOTO_ERR_IF_EX(Felem2BN(r->x, &out.x), retVal);
1517     GOTO_ERR_IF_EX(Felem2BN(r->y, &out.y), retVal);
1518     GOTO_ERR_IF_EX(Felem2BN(r->z, &out.z), retVal);
1519 ERR:
1520     return retVal;
1521 }
1522 
1523 
1524 #endif /* defined(HITLS_CRYPTO_CURVE_NISTP224) && defined(HITLS_CRYPTO_NIST_USE_ACCEL) */
1525