• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * This file is part of the openHiTLS project.
3  *
4  * openHiTLS is licensed under the Mulan PSL v2.
5  * You can use this software according to the terms and conditions of the Mulan PSL v2.
6  * You may obtain a copy of Mulan PSL v2 at:
7  *
8  *     http://license.coscl.org.cn/MulanPSL2
9  *
10  * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11  * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12  * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13  * See the Mulan PSL v2 for more details.
14  */
15 #include "hitls_build.h"
16 #if defined(HITLS_CRYPTO_CURVE_NISTP256) && defined(HITLS_CRYPTO_NIST_USE_ACCEL)
17 
18 #include <stdbool.h>
19 #include "securec.h"
20 #include "bsl_err_internal.h"
21 #include "crypt_errno.h"
22 #include "crypt_utils.h"
23 #include "crypt_bn.h"
24 #include "crypt_ecc.h"
25 #include "ecc_local.h"
26 #include "ecc_utils.h"
27 #include "bsl_util_internal.h"
28 
29 #ifndef __SIZEOF_INT128__
30 #error "This nistp256 implementation require the compiler support 128-bits integer."
31 #endif
32 
33 /* field element definition */
34 #define FELEM_BITS      256
35 #define FELEM_BYTES     32
36 #define LIMB_BITS       (sizeof(uint128_t) << 3)
37 #define LIMB_NUM        4
38 #define BASE_BITS       64
39 /* The pre-calculation table of the G table has 16 points. */
40 #define TABLE_G_SIZE    16
41 /* The pre-calculation table of the P table has 17 points. */
42 #define TABLE_P_SIZE    17
43 
44 /*
45  * Field elements, stored as arrays, all represented in little endian.
46  * Each element of the array is called a digit. Each digit represents an extended 2 ^ 64-bit.
47  * That is, Felem can be expressed as:
48  * f_0 + f_1 * 2^64 + f_2 * 2^128 + f_3 * 2^192
49  * LongFelem is used to store the result of multiplication of field elements and is twice the width of Felem.
50  * Point is a point represented as a Jacobian coordinate
51  */
52 typedef struct {
53     uint128_t data[LIMB_NUM];
54 } Felem;
55 
56 typedef struct {
57     uint128_t data[LIMB_NUM * 2];
58 } LongFelem;
59 
60 typedef struct {
61     Felem x;
62     Felem y;
63     Felem z;
64 } Point;
65 
66 /* ------------------------------------------------------------ */
67 /* ECP256 field order p. The value is 2^256 - 2^224 + 2^192 + 2^96 - 1, little endian */
68 static const Felem FIELD_ORDER = {
69     {0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001}
70 };
71 
72 /*
73  * Pre-computation table of the base point G, which contains the (X, Y, Z) coordinates of k*G
74  *
75  * PRE_MUL_G divides all bits into four equal parts.
76  * index       corresponding bit                         value of k
77  *   0              0 0 0 0                       0     + 0     + 0     + 0
78  *   1              0 0 0 1                       0     + 0     + 0     + 1
79  *   2              0 0 1 0                       0     + 0     + 2^64  + 0
80  *   3              0 0 1 1                       0     + 0     + 2^64  + 1
81  *   4              0 1 0 0                       0     + 2^128 + 0     + 0
82  *   5              0 1 0 1                       0     + 2^128 + 0     + 1
83  *   6              0 1 1 0                       0     + 2^128 + 2^64  + 0
84  *   7              0 1 1 1                       0     + 2^128 + 2^64  + 1
85  *   8              1 0 0 0                       2^192 + 0     + 0     + 0
86  *   9              1 0 0 1                       2^192 + 0     + 0     + 1
87  *  10              1 0 1 0                       2^192 + 0     + 2^64  + 0
88  *  11              1 0 1 1                       2^192 + 0     + 2^64  + 1
89  *  12              1 1 0 0                       2^192 + 2^128 + 0     + 0
90  *  13              1 1 0 1                       2^192 + 2^128 + 0     + 1
91  *  14              1 1 1 0                       2^192 + 2^128 + 2^64  + 0
92  *  15              1 1 1 1                       2^192 + 2^128 + 2^64  + 1
93  *
94  */
95 static const Point PRE_MUL_G[TABLE_G_SIZE] = {
96     {
97         {{0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}},
98         {{0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}},
99         {{0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
100     }, {
101         {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2, 0x6b17d1f2e12c4247}},
102         {{0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16, 0x4fe342e2fe1a7f9b}},
103         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
104     }, {
105         {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de, 0x0fa822bc2811aaa5}},
106         {{0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b, 0xbff44ae8f5dba80d}},
107         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
108     }, {
109         {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789, 0x300a4bbc89d6726f}},
110         {{0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f, 0x72aac7e0d09b4644}},
111         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
112     }, {
113         {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e, 0x447d739beedb5e67}},
114         {{0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7, 0x2d4825ab834131ee}},
115         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
116     }, {
117         {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60, 0xef9519328a9c72ff}},
118         {{0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c, 0x611e9fc37dbb2c9b}},
119         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
120     }, {
121         {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf, 0x550663797b51f5d8}},
122         {{0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5, 0x157164848aecb851}},
123         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
124     }, {
125         {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391, 0xeb5d7745b21141ea}},
126         {{0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee, 0xeafd72ebdbecc17b}},
127         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
128     }, {
129         {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5, 0xa6d39677a7849276}},
130         {{0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf, 0x674f84749b0b8816}},
131         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
132     }, {
133         {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb, 0x4e769e7672c9ddad}},
134         {{0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281, 0x42b99082de830663}},
135         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
136     }, {
137         {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478, 0x78878ef61c6ce04d}},
138         {{0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def, 0xb6cb3f5d7b72c321}},
139         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
140     }, {
141         {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae, 0x0c88bc4d716b1287}},
142         {{0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa, 0xdd5ddea3f3901dc6}},
143         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
144     }, {
145         {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3, 0x68f344af6b317466}},
146         {{0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3, 0x31b9c405f8540a20}},
147         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
148     }, {
149         {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0, 0x4052bf4b6f461db9}},
150         {{0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8, 0xfecf4d5190b0fc61}},
151         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
152     }, {
153         {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a, 0x1eddbae2c802e41a}},
154         {{0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0, 0x43104d86560ebcfc}},
155         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
156     }, {
157         {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a, 0xb48e26b484f7a21c}},
158         {{0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668, 0xfac015404d4d3dab}},
159         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
160     }
161 };
162 
163 /*
164  * Pre-computation table of the base point G, which contains the (X, Y, Z) coordinates of k*G
165  *
166  * PRE_MUL_G2[] = PRE_MUL_G[] * 2^32
167  * index       corresponding bit                       value of k
168  *   0              0 0 0 0                    0     + 0     + 0     + 0
169  *   1              0 0 0 1                    0     + 0     + 0     + 2^32
170  *   2              0 0 1 0                    0     + 0     + 2^96  + 0
171  *   3              0 0 1 1                    0     + 0     + 2^96  + 2^32
172  *   4              0 1 0 0                    0     + 2^160 + 0     + 0
173  *   5              0 1 0 1                    0     + 2^160 + 0     + 2^32
174  *   6              0 1 1 0                    0     + 2^160 + 2^96  + 0
175  *   7              0 1 1 1                    0     + 2^160 + 2^96  + 2^32
176  *   8              1 0 0 0                    2^224 + 0     + 0     + 0
177  *   9              1 0 0 1                    2^224 + 0     + 0     + 2^32
178  *  10              1 0 1 0                    2^224 + 0     + 2^96  + 0
179  *  11              1 0 1 1                    2^224 + 0     + 2^96  + 2^32
180  *  12              1 1 0 0                    2^224 + 2^160 + 0     + 0
181  *  13              1 1 0 1                    2^224 + 2^160 + 0     + 2^32
182  *  14              1 1 1 0                    2^224 + 2^160 + 2^96  + 0
183  *  15              1 1 1 1                    2^224 + 2^160 + 2^96  + 2^32
184  */
185 static const Point PRE_MUL_G2[TABLE_G_SIZE] = {
186     {
187         {{0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}},
188         {{0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}},
189         {{0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
190     }, {
191         {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da, 0x7fe36b40af22af89}},
192         {{0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1, 0xe697d45825b63624}},
193         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
194     }, {
195         {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902, 0x4a5b506612a677a6}},
196         {{0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40, 0xeb13461ceac089f1}},
197         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
198     }, {
199         {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857, 0x0781b8291c6a220a}},
200         {{0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434, 0x690cde8df0151593}},
201         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
202     }, {
203         {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326, 0x8a535f566ec73617}},
204         {{0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf, 0x0455c08468b08bd7}},
205         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
206     }, {
207         {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279, 0x06bada7ab77f8276}},
208         {{0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70, 0x5b476dfd0e6cb18a}},
209         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
210     }, {
211         {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8, 0x3e29864e8a2ec908}},
212         {{0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed, 0x239b90ea3dc31e7e}},
213         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
214     }, {
215         {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4, 0x820f4dd949f72ff7}},
216         {{0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3, 0x140406ec783a05ec}},
217         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
218     }, {
219         {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe, 0x68f6b8542783dfee}},
220         {{0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028, 0xcbe1feba92e40ce6}},
221         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
222     }, {
223         {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927, 0xd0b2f94d2f420109}},
224         {{0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a, 0x971459828b0719e5}},
225         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
226     }, {
227         {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687, 0x961610004a866aba}},
228         {{0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c, 0x7acb9fadcee75e44}},
229         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
230     }, {
231         {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea, 0x24eb9acca333bf5b}},
232         {{0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d, 0x69f891c5acd079cc}},
233         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
234     }, {
235         {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514, 0xe51f547c5972a107}},
236         {{0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06, 0x1c309a2b25bb1387}},
237         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
238     }, {
239         {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828, 0x20b87b8aa2c4e503}},
240         {{0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044, 0xf5c6fa49919776be}},
241         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
242     }, {
243         {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56, 0x1ed7d1b9332010b9}},
244         {{0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24, 0x3a2b03f03217257a}},
245         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
246     }, {
247         {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b, 0x15fee545c78dd9f6}},
248         {{0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb, 0x4ab5b6b2b8753f81}},
249         {{0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}}
250     }
251 };
252 
253 /* --------------------------helper function-------------------------- */
254 /*
255  * Convert big-endian byte stream to Felem
256  */
Bin2Felem(Felem * out,const uint8_t in[FELEM_BYTES])257 static inline void Bin2Felem(Felem *out, const uint8_t in[FELEM_BYTES])
258 {
259     // Write the input data to 128-bit digits every 64 bits.
260     out->data[3] = (uint128_t)Uint64FromBeBytes(in);        // Index 3 read 0~7   bytes
261     out->data[2] = (uint128_t)Uint64FromBeBytes(in + 8);    // Index 2 read 8~15  bytes
262     out->data[1] = (uint128_t)Uint64FromBeBytes(in + 16);   // Index 1 read 16~23 bytes
263     out->data[0] = (uint128_t)Uint64FromBeBytes(in + 24);   // Index 0 read 24~31 bytes
264 }
265 
266 /*
267  * Convert Felem to big-endian byte stream
268  * Input:
269  *      in[] < 2^64
270  * Output:
271  *      out length is 32
272  */
Felem2Bin(uint8_t out[FELEM_BYTES],const Felem * in)273 static inline void Felem2Bin(uint8_t out[FELEM_BYTES], const Felem *in)
274 {
275     Uint64ToBeBytes((uint64_t)in->data[3], out);          // Index 3 write 0~7   bytes
276     Uint64ToBeBytes((uint64_t)in->data[2], out + 8);      // Index 2 write 8~15  bytes
277     Uint64ToBeBytes((uint64_t)in->data[1], out + 16);     // Index 1 write 16~23 bytes
278     Uint64ToBeBytes((uint64_t)in->data[0], out + 24);     // Index 0 write 24~31 bytes
279 }
280 
281 /*
282  * Convert BN to Felem
283  * Output:
284  *      out[] < 2^64
285  */
BN2Felem(Felem * out,const BN_BigNum * in)286 static int32_t BN2Felem(Felem *out, const BN_BigNum *in)
287 {
288     int32_t ret;
289     uint8_t bin[FELEM_BYTES];
290     uint32_t len = FELEM_BYTES;
291 
292     GOTO_ERR_IF(BN_Bn2Bin(in, bin, &len), ret);
293 
294     for (uint32_t i = 0; i < FELEM_BYTES; ++i) {
295         bin[FELEM_BYTES - 1 - i] = i < len ? bin[len - 1 - i] : 0;
296     }
297 
298     Bin2Felem(out, bin);
299 ERR:
300     return ret;
301 }
302 
303 /*
304  * Convert Felem to BN
305  * Input:
306  *      in[] < 2^64
307  */
Felem2BN(BN_BigNum * out,const Felem * in)308 static int32_t Felem2BN(BN_BigNum *out, const Felem *in)
309 {
310     int32_t ret;
311     uint8_t bin[FELEM_BYTES];
312 
313     Felem2Bin(bin, in);
314 
315     GOTO_ERR_IF(BN_Bin2Bn(out, bin, FELEM_BYTES), ret);
316 ERR:
317     return ret;
318 }
319 
320 /* ---------------------------field operation--------------------------- */
321 /*
322  * Assignment
323  */
FelemAssign(Felem * out,const Felem * in)324 static inline void FelemAssign(Felem *out, const Felem *in)
325 {
326     out->data[0] = in->data[0];     // out->data[0] get the value
327     out->data[1] = in->data[1];     // out->data[1] get the value
328     out->data[2] = in->data[2];     // out->data[2] get the value
329     out->data[3] = in->data[3];     // out->data[3] get the value
330 }
331 
332 /*
333  * Copy each digit by mask. If the corresponding bit is 1, copy it. If the corresponding bit is 0, retain it.
334  */
FelemAssignWithMask(Felem * out,const Felem * in,const uint128_t mask)335 static inline void FelemAssignWithMask(Felem *out, const Felem *in, const uint128_t mask)
336 {
337     uint128_t rmask = ~mask;
338     // The value of out->data[0] is changed or remains unchanged.
339     out->data[0] = (in->data[0] & mask) | (out->data[0] & rmask);
340     // The value of out->data[1] is changed or remains unchanged.
341     out->data[1] = (in->data[1] & mask) | (out->data[1] & rmask);
342     // The value of out->data[2] is changed or remains unchanged.
343     out->data[2] = (in->data[2] & mask) | (out->data[2] & rmask);
344     // The value of out->data[3] is changed or remains unchanged.
345     out->data[3] = (in->data[3] & mask) | (out->data[3] & rmask);
346 }
347 
348 /*
349  * Set the lowest digit
350  */
FelemSetLimb(Felem * out,const uint128_t in)351 static inline void FelemSetLimb(Felem *out, const uint128_t in)
352 {
353     out->data[0] = in;  // out->data[0] get the value
354     out->data[1] = 0;   // out->data[1] clear to 0
355     out->data[2] = 0;   // out->data[2] clear to 0
356     out->data[3] = 0;   // out->data[3] clear to 0
357 }
358 
359 /*
360  * Zero judgment: input less than 2 ^ 256, only 0 and p need to be judged.
361  * Input:
362  *      in[] < 2^64
363  */
FelemIsZero(const Felem * in)364 static inline uint128_t FelemIsZero(const Felem *in)
365 {
366     uint128_t isZero, isP;
367 
368     // Check whether digits 0, 1, 2, and 3 are all 0.
369     isZero = in->data[0] | in->data[1] | in->data[2] | in->data[3];
370     isZero -= 1;  // If in == 0, the most significant bit is 1.
371 
372     // Determine that in is equal to the field order.
373     isP = (in->data[0] ^ FIELD_ORDER.data[0]) |   // Determine whether the digits 0 is equal to the order
374           (in->data[1] ^ FIELD_ORDER.data[1]) |   // Determine whether the digits 1 is equal to the order
375           (in->data[2] ^ FIELD_ORDER.data[2]) |   // Determine whether the digits 2 is equal to the order
376           (in->data[3] ^ FIELD_ORDER.data[3]);    // Determine whether the digits 3 is equal to the order
377     isP -= 1;  // If in == p, the most significant bit is 1.
378 
379     return (isZero | isP) >> (LIMB_BITS - 1);
380 }
381 
382 /*
383  * Obtain the bit string whose length is len at idx in Felem.
384  * Input:
385  *      in[] < 2^64
386  *      0 < len <= 64
387  */
FelemGetBits(const Felem * in,int32_t idx,uint32_t len)388 static uint64_t FelemGetBits(const Felem *in, int32_t idx, uint32_t len)
389 {
390     uint128_t ret;
391     uint32_t lower, upper;
392     uint64_t mask;
393 
394     lower = (uint32_t)idx;
395     // When 0 <= lower < 256, the most significant bit is 1, obtain the most significant bit by right shifted by 31 bits
396     mask = (uint64_t)0 - ((~lower & (lower - 256)) >> 31);
397     ret = (uint64_t)in->data[(lower / BASE_BITS) & mask] & mask;
398 
399     upper = (uint32_t)idx + BASE_BITS;  // next unary block
400     // When 0 <= upper < 256, the most significant bit is 1, obtain the most significant bit by right shifted by 31 bits
401     mask = (uint64_t)0 - ((~upper & (upper - 256)) >> 31);
402     ret |= (uint128_t)((uint64_t)in->data[(upper / BASE_BITS) & mask] & mask) << BASE_BITS;
403 
404     // Take the lower six bits, that is, mod 64, regardless of the positive and negative values of "lower".
405     ret >>= lower & (BASE_BITS - 1);
406     ret &= ((uint64_t)1 << len) - 1;  // All 1-bit string with the len length
407 
408     return (uint64_t)ret;
409 }
410 
411 /*
412  * Field element reduction, retains the base bit (64 bits) of each digit in the 4-ary Felem, and reduce the high bit.
413  * Should be called before modular multiplication, modulus square, or modulus
414  * Input:
415  *      in[] < 2^127
416  * Output:
417  *      out[] < 2^64
418  */
FelemReduce(Felem * out,const Felem * in)419 static void FelemReduce(Felem *out, const Felem *in)
420 {
421     uint128_t *po = out->data;
422     const uint128_t *pi = in->data;
423     const uint128_t borrow = (uint128_t)1 << 96;    // 2 ^ 96 borrowed from low to high
424     const uint128_t lend = (uint128_t)1 << 32;      // 2 ^ 32 lent from high to low
425     uint128_t carryLimb, carryLimbTotal;
426 
427     // Process the carry of each digit first: 0 -> 1 -> 2 -> 3
428     po[1] = pi[1] + (pi[0] >> BASE_BITS);   // po[1] takes the input value and adds the carry of digit 0.
429     po[2] = pi[2] + (po[1] >> BASE_BITS);   // po[2] takes the input value and adds the carry of digit 1.
430     po[3] = pi[3] + (po[2] >> BASE_BITS);   // po[3] takes the input value and adds the carry of digit 2.
431 
432     po[0] = (uint64_t)pi[0];                // po[0] takes the basic digit.
433     po[1] = (uint64_t)po[1];                // po[1] takes the basic digit.
434     po[2] = (uint64_t)po[2];                // po[2] takes the basic digit.
435 
436     // Now reduce the highest digit. Only need to reduce the carry of the highest digit three times.
437     // Note the carry bit of out[3] as carryLimb.
438     // It can be known from the equation:
439     // carryLimb * 2^256 = carryLimb * (2^224 - 2^192 - 2^96 + 1) (mod 2^256 - 2^224 + 2^192 + 2^96 - 1)
440     // The part of carryLimb * (2^224 - 2^192)  needs to be placed in out[3]
441     //
442     // First reduction: po[3] < 2^128 ---> po[3] < 2^96
443     // Assume that      po[3] = 0x ffffffffffffffff ffffffffffffffff
444     //      0x 0000000000000000 ffffffffffffffff    -> po[3] basic digit(unit)
445     // +    0x 00000000ffffffff ffffffff00000000    -> carryLimb * 2^32
446     // -    0x 0000000000000000 ffffffffffffffff    -> carryLimb
447     // -----------------------------------------
448     // =    0x 00000000ffffffff ffffffff00000000    -> po[3] <= 2^96 - 2^32 < 2^96
449     // Second reduction: po[3] < 2^96 ---> po[3] < 2^65 - 2^33
450     // Assume that       po[3] = 0x 00000000ffffffff ffffffffffffffff
451     //      0x 0000000000000000 ffffffffffffffff
452     // +    0x 0000000000000000 ffffffff00000000
453     // -    0x 0000000000000000 00000000ffffffff
454     // -----------------------------------------
455     // =    0x 0000000000000001 fffffffe00000000    -> po[3] <= 2^65 - 2^33 < 2^65 - 2^33
456     // Third reduction: po[3] < 2^65 - 2^32 ---> po[3] < 2^64
457     // Assume that      po[3] = 0x 0000000000000001 ffffffff00000000
458     //      0x 0000000000000000 ffffffff00000000
459     // +    0x 0000000000000000 0000000100000000
460     // -    0x 0000000000000000 0000000000000001
461     // -----------------------------------------
462     // =    0x 0000000000000000 ffffffffffffffff    -> po[3] < 2^64
463     //
464     // In addition, use carryLimbTotal to store the sum of carry bits.
465     // In this way, carryLimbTotal*(-2^96 + 1) can be calculated at a time.
466     // The simple conclusion is that when out[3] is the maximum, carryLimbTotal is the maximum.
467     // Consider the maximum value of out[3].
468     //      0x 7fffffffffffffff ffffffffffffffff    -> Maximum value of pi[3] (pi[3] < 2^127)
469     // +    0x 0000000000000000 8000000000000000    -> Maximum value of carry of pi[2]
470     // -----------------------------------------
471     // =    0x 8000000000000000 7fffffffffffffff    -> Maximum value of po[3]
472     // Use the value to perform three reduction:
473     // carryLimbTotal = 0x8000000000000000 + 0x7fffffff + 0x1 = 0x8000000080000000 < 2^64
474     carryLimbTotal = carryLimb = po[3] >> BASE_BITS;            // Reduce carryLimb and take the carry of po[3].
475     po[3] = (uint64_t)po[3] + (carryLimb << 32) - carryLimb;    // The reduction factor on po[3] is (2 ^ 32 - 1).
476     carryLimbTotal += (carryLimb = po[3] >> BASE_BITS);         // Reduce carryLimb and take the carry of po[3].
477     po[3] = (uint64_t)po[3] + (carryLimb << 32) - carryLimb;    // The reduction factor on po[3] is (2 ^ 32 - 1).
478     carryLimbTotal += (carryLimb = po[3] >> BASE_BITS);         // Reduce carryLimb and take the carry of po[3].
479     po[3] = (uint64_t)po[3] + (carryLimb << 32) - carryLimb;    // The reduction factor on po[3] is (2 ^ 32 - 1).
480 
481     // Calculate the remaining carryLimbTotal * (-2^96 + 1) <= 0
482     // In this case, it is impossible to carry out[4]. Therefore, po[3] < 2^64.
483     // If carryLimbTotal > 0, po[3] must be equal to at least 2 ^ 32 - 1. (i.e., po[3] = 2^64 in the last step)
484     // In this case, it is obvious that (2^32 - 1) * 2^192 > |carryLimbTotal * (-2^96 + 1)|
485     po[0] += carryLimbTotal;  // reduction of carryLimbTotal which the coefficient is 1
486     // po[1] + po[0] carry + Borrowed Bits - Reduction Factor 2^32
487     po[1] += ((po[0] >> BASE_BITS) ^ borrow) - (carryLimbTotal << 32);
488     // po[2] + po[1] carry + Borrowing Bits - Lending Bits
489     po[2] += ((po[1] >> BASE_BITS) ^ borrow) - lend;
490     // po[3] + po[2] carry - Lending Bits
491     po[3] += (po[2] >> BASE_BITS) - lend;
492 
493     po[0] = (uint64_t)po[0];  // po[0] takes the basic digit.
494     po[1] = (uint64_t)po[1];  // po[1] takes the basic digit.
495     po[2] = (uint64_t)po[2];  // po[2] takes the basic digit.
496 }
497 
498 /*
499  * Field element reduction, converting 8-ary LongFelem to 4-ary Felem
500  * Input:
501  *      in[] < 2^80
502  * Output:
503  *      out[] < 2^115
504  *      - out[0] < (2^114 + 2^82 - 2^50) + 2^(80 + 32) + 2 * 2^80   < 2^115
505  *      - out[1] < (2^114 + 2^82 - 2^50) + 2^(80 + 33) + 2 * 2^80   < 2^115
506  *      - out[2] < (2^114 + 2^82 - 2^50) + 2^(80 + 33) + 4 * 2^80   < 2^115
507  *      - out[3] < (2^114 + 2^82 - 2^50) + 2^(80 + 32) + 4 * 2^80   < 2^115
508  *      out[] > 0
509  *      - out[0] > (2^114 - 2^82) - 2 * 2^(80 + 32) - 2 * 2^80      > 0
510  *      - out[1] > (2^114 - 2^82) - 2^(80 + 32) - 2^80              > 0
511  *      - out[2] > (2^114 - 2^82) - 2^(80 + 32) - 2^80              > 0
512  *      - out[3] > (2^114 - 2^82) - 2 * 2^(80 + 32) - 2^80          > 0
513  */
LongFelemReduce(Felem * out,const LongFelem * in)514 static void LongFelemReduce(Felem *out, const LongFelem *in)
515 {
516     // n ≡ n / a * (a - b) ≡ n / a * b (mod (a - b))
517     // The following formula can be obtained:
518     // 2^n ≡ 2^(n - 256) * (2^224 - 2^192 - 2^96 + 1) (mod (2^256 - 2^224 + 2^192 + 2^96 - 1))
519     //
520     //   2^256 mod p
521     // = 2^224 - 2^192 - 2^96 + 1
522     //
523     //   2^288 mod p
524     // = 2^256 - 2^224 - 2^128 + 2^32
525     // = -2^192 - 2^128 - 2^96 + 2^32 + 1
526     //
527     //   2^320 mod p
528     // = 2^288 - 2^256 - 2^160 + 2^64
529     // = -2^224 - 2^160 - 2^128 + 2^64 + 2^32
530     //
531     //   2^352 mod p
532     // = 2^320 - 2^288 - 2^192 + 2^96
533     // = -2^224 - 2^160 + 2 * 2^96 + 2^64 - 1
534     //
535     //   2^384 mod p
536     // = 2^352 - 2^320 - 2^224 + 2^128
537     // = -2^224 + 2 * 2^128 + 2 * 2^96 - 2^32 - 1
538     //
539     //   2^416 mod p
540     // = 2^384 - 2^352 - 2^256 + 2^160
541     // = -2^224 + 2^192 + 2 * 2^160 + 2 * 2^128 + 2^96 - 2^64 - 2^32 - 1
542     //
543     //   2^448 mod p
544     // = 2^416 - 2^384 - 2^288 + 2^192
545     // = 3 * 2^192 + 2 * 2^160 + 2^128 - 2^64 - 2^32 - 1
546     //
547     //   2^480 mod p
548     // = 2^448 - 2^416 - 2^320 + 2^224
549     // = 3 * 2^224 + 2 * 2^192 + 2^160 - 2^96 - 2^64 - 2^32
550 
551     static const Felem zeroBase = {
552         {
553             (((uint128_t)1 << 64) - 1)                        << 50,
554             (((uint128_t)1 << 64) + ((uint128_t)1 << 32) - 1) << 50,
555             (((uint128_t)1 << 64) - 1)                        << 50,
556             (((uint128_t)1 << 64) - ((uint128_t)1 << 32))     << 50,
557         }
558     };
559 
560     uint128_t *po = out->data;
561     const uint128_t *pi = in->data;
562     // Add the term reduced by pi[4]*2^256, pi[5]*2^320, pi[6]*2^384, pi[7]*2^448 to the corresponding digit.
563     // Assign a value after zero adjustment.
564     // Adjust to zero of digit 0. Take the input value pi[0].
565     // The reduction item list is (pi[4], pi[5]*2^32, -pi[6] - pi[6]*2^32, -pi[7] - pi[7]*2^32)
566     po[0] = zeroBase.data[0] + pi[0] + pi[4] + (pi[5] << 32) - pi[6] - (pi[6] << 32) - pi[7] - (pi[7] << 32);
567     // Adjust to zero of digit 1. Take the input value pi[1].
568     // The reduction item list is (-pi[4]*2^32, pi[5], pi[6]*2^33, -pi[7])
569     po[1] = zeroBase.data[1] + pi[1] - (pi[4] << 32) + pi[5] + (pi[6] << 33) - pi[7];
570     // Adjust to zero of digit 2. Take the input value pi[2].
571     // The reduction item list is (0, -pi[5] - pi[5]*2^32, -pi[6]*2, pi[7] + pi[7]*2^33)
572     po[2] = zeroBase.data[2] + pi[2] - pi[5] - (pi[5] << 32) + (pi[6] << 1) + pi[7] + (pi[7] << 33);
573     // Adjust to zero of digit 3. Take the input value pi[3].
574     // The reduction item list is (-pi[4] + pi[4]*2^32, -pi[5]*2^32, -pi[6]*2^32, pi[7]*3)
575     po[3] = zeroBase.data[3] + pi[3] - pi[4] + (pi[4] << 32) - (pi[5] << 32) - (pi[6] << 32) + (pi[7] * 3);
576 }
577 
578 /*
579  * field element modulo: convert the field element to a unique value within [0, p)
580  * Input:
581  *      in[] < 2^64
582  * Output:
583  *      out < p, out[] < 2^64
584  */
FelemContract(Felem * out,const Felem * in)585 static void FelemContract(Felem *out, const Felem *in)
586 {
587     uint128_t *po = out->data;
588     const uint128_t *pi = in->data;
589     const uint128_t borrow = (uint128_t)1 << BASE_BITS;
590     const uint128_t lend = 1;
591 
592     bool isGreaterOrEqual = true;
593     for (int32_t i = 3; i >= 0; i--) {
594         if (pi[i] > FIELD_ORDER.data[i]) {
595             break;
596         } else if (pi[i] < FIELD_ORDER.data[i]) {
597             isGreaterOrEqual = false;
598             break;
599         }
600     }
601 
602     if (isGreaterOrEqual) {
603         // p_3 = 0xffffffff00000001
604         po[3] = (pi[3] - lend) - FIELD_ORDER.data[3];
605         // p_2 = 0x0000000000000000
606         po[2] = (borrow ^ pi[2]) - lend;
607         // p_1 = 0x00000000ffffffff
608         po[1] = ((borrow ^ pi[1]) - lend) - FIELD_ORDER.data[1];
609         // p_0 = 0xffffffffffffffff
610         po[0] = (borrow ^ pi[0]) - FIELD_ORDER.data[0];
611     }
612 
613     // Process carry 0 -> 1 -> 2 -> 3
614     po[1] += po[0] >> BASE_BITS;  // po[1] + po[0] carry
615     po[2] += po[1] >> BASE_BITS;  // po[2] + po[1] carry
616     po[3] += po[2] >> BASE_BITS;  // po[3] + po[2] carry
617 
618     po[0] = (uint64_t)po[0];        // po[0] takes the basic digit.
619     po[1] = (uint64_t)po[1];        // po[1] takes the basic digit.
620     po[2] = (uint64_t)po[2];        // po[2] takes the basic digit.
621 }
622 
623 /*
624  * Field Addition
625  */
FelemAdd(Felem * out,const Felem * a,const Felem * b)626 static inline void FelemAdd(Felem *out, const Felem *a, const Felem *b)
627 {
628     out->data[0] = a->data[0] + b->data[0];  // out->data[0] get the value
629     out->data[1] = a->data[1] + b->data[1];  // out->data[1] get the value
630     out->data[2] = a->data[2] + b->data[2];  // out->data[2] get the value
631     out->data[3] = a->data[3] + b->data[3];  // out->data[3] get the value
632 }
633 
634 /*
635  * Field element negation
636  * Input:
637  *      in[] <= 2^124 - 2^92
638  * Output:
639  *      out[] <= 2^124 + 2^92 - 2^60 + in[]
640  */
FelemNeg(Felem * out,const Felem * in)641 static inline void FelemNeg(Felem *out, const Felem *in)
642 {
643     static const Felem zeroBase = {{
644         (((uint128_t)1 << 64) - 1)                        << 60,
645         (((uint128_t)1 << 64) + ((uint128_t)1 << 32) - 1) << 60,
646         (((uint128_t)1 << 64) - 1)                        << 60,
647         (((uint128_t)1 << 64) - ((uint128_t)1 << 32))     << 60,
648     }};
649 
650     out->data[0] = zeroBase.data[0] - in->data[0];  // out->data[0] get the value
651     out->data[1] = zeroBase.data[1] - in->data[1];  // out->data[1] get the value
652     out->data[2] = zeroBase.data[2] - in->data[2];  // out->data[2] get the value
653     out->data[3] = zeroBase.data[3] - in->data[3];  // out->data[3] get the value
654 }
655 
656 /*
657  * Field subtraction
658  * Input:
659  *      a[] < 2^128 - 2^124 - 2^92 + 2^60,b[] <= 2^124 - 2^92
660  * Output:
661  *      out[] <= 2^124 + 2^92 - 2^60 + a[] < 2^128
662  */
FelemSub(Felem * out,const Felem * a,const Felem * b)663 static inline void FelemSub(Felem *out, const Felem *a, const Felem *b)
664 {
665     static const Felem zeroBase = {{
666         (((uint128_t)1 << 64) - 1)                        << 60,
667         (((uint128_t)1 << 64) + ((uint128_t)1 << 32) - 1) << 60,
668         (((uint128_t)1 << 64) - 1)                        << 60,
669         (((uint128_t)1 << 64) - ((uint128_t)1 << 32))     << 60,
670     }};
671 
672     out->data[0] = zeroBase.data[0] + a->data[0] - b->data[0];  // out->data[0] get the value
673     out->data[1] = zeroBase.data[1] + a->data[1] - b->data[1];  // out->data[1] get the value
674     out->data[2] = zeroBase.data[2] + a->data[2] - b->data[2];  // out->data[2] get the value
675     out->data[3] = zeroBase.data[3] + a->data[3] - b->data[3];  // out->data[3] get the value
676 }
677 
678 /*
679  * Field subtraction. Input LongFelem directly.
680  * Input:
681  *      a[] < 2^128 - 2^74 - 2^42 + 2^10,b[] <= 2^74 - 2^42
682  * Output:
683  *      out[] <= 2^74 + 2^42 - 2^10 + a[] < 2^128
684  */
LongFelemSub(LongFelem * out,const LongFelem * a,const LongFelem * b)685 static void LongFelemSub(LongFelem *out, const LongFelem *a, const LongFelem *b)
686 {
687     static const LongFelem zeroBase = {{
688         ((uint128_t)1 << 64)                              << 10,
689         (((uint128_t)1 << 64) - 1)                        << 10,
690         (((uint128_t)1 << 64) - 1)                        << 10,
691         (((uint128_t)1 << 64) - 1)                        << 10,
692         (((uint128_t)1 << 64) - 2)                        << 10,
693         (((uint128_t)1 << 64) + ((uint128_t)1 << 32) - 1) << 10,
694         (((uint128_t)1 << 64) - 1)                        << 10,
695         (((uint128_t)1 << 64) - ((uint128_t)1 << 32))     << 10,
696     }};
697 
698     out->data[0] = zeroBase.data[0] + a->data[0] - b->data[0];  // out->data[0] get the value
699     out->data[1] = zeroBase.data[1] + a->data[1] - b->data[1];  // out->data[1] get the value
700     out->data[2] = zeroBase.data[2] + a->data[2] - b->data[2];  // out->data[2] get the value
701     out->data[3] = zeroBase.data[3] + a->data[3] - b->data[3];  // out->data[3] get the value
702     out->data[4] = zeroBase.data[4] + a->data[4] - b->data[4];  // out->data[4] get the value
703     out->data[5] = zeroBase.data[5] + a->data[5] - b->data[5];  // out->data[5] get the value
704     out->data[6] = zeroBase.data[6] + a->data[6] - b->data[6];  // out->data[6] get the value
705     out->data[7] = zeroBase.data[7] + a->data[7] - b->data[7];  // out->data[7] get the value
706 }
707 
708 /*
709  * Scale the field element
710  * Use only a small magnification(scale) factor to ensure that in[] * scalar does not overflow.
711  */
FelemScale(Felem * out,const Felem * in,const uint32_t scalar)712 static inline void FelemScale(Felem *out, const Felem *in, const uint32_t scalar)
713 {
714     out->data[0] = in->data[0] * scalar;  // out->data[0] get the value
715     out->data[1] = in->data[1] * scalar;  // out->data[1] get the value
716     out->data[2] = in->data[2] * scalar;  // out->data[2] get the value
717     out->data[3] = in->data[3] * scalar;  // out->data[3] get the value
718 }
719 
720 /*
721  * Scale the field element. Input LongFelem directly.
722  * Use only a small magnification(scale) factor to ensure that in[] * scalar does not overflow.
723  */
LongFelemScale(LongFelem * out,const LongFelem * in,const uint32_t scalar)724 static inline void LongFelemScale(LongFelem *out, const LongFelem *in, const uint32_t scalar)
725 {
726     out->data[0] = in->data[0] * scalar;  // out->data[0] get the value
727     out->data[1] = in->data[1] * scalar;  // out->data[1] get the value
728     out->data[2] = in->data[2] * scalar;  // out->data[2] get the value
729     out->data[3] = in->data[3] * scalar;  // out->data[3] get the value
730     out->data[4] = in->data[4] * scalar;  // out->data[4] get the value
731     out->data[5] = in->data[5] * scalar;  // out->data[5] get the value
732     out->data[6] = in->data[6] * scalar;  // out->data[6] get the value
733     out->data[7] = in->data[7] * scalar;  // out->data[7] get the value
734 }
735 
736 /*
737  * Field Multiplication
738  * Input:
739  *      a[] < 2^64, b[] < 2^64
740  * Output:
741  *      out[] < 2^67
742  *      - out[0] < 2^64
743  *      - out[1] < 2^64 * 3
744  *      - out[2] < 2^64 * 5
745  *      - out[3] < 2^64 * 7
746  *      - out[4] < 2^64 * 7
747  *      - out[5] < 2^64 * 5
748  *      - out[6] < 2^64 * 3
749  *      - out[7] < 2^64
750  */
FelemMul(LongFelem * out,const Felem * a,const Felem * b)751 static void FelemMul(LongFelem *out, const Felem *a, const Felem *b)
752 {
753     // out[0] = a[0]*b[0]
754     // out[1] = a[0]*b[1] + a[1]*b[0]
755     // out[2] = a[0]*b[2] + a[1]*b[1] + a[2]*b[0]
756     // out[3] = a[0]*b[3] + a[1]*b[2] + a[2]*b[1] + a[3]*b[0]
757     // out[4] =             a[1]*b[3] + a[2]*b[2] + a[3]*b[1]
758     // out[5] =                         a[2]*b[3] + a[3]*b[2]
759     // out[6] =                                     a[3]*b[3]
760 
761     const uint64_t a64[4] = {(uint64_t)a->data[0], (uint64_t)a->data[1], (uint64_t)a->data[2], (uint64_t)a->data[3]};
762     const uint64_t b64[4] = {(uint64_t)b->data[0], (uint64_t)b->data[1], (uint64_t)b->data[2], (uint64_t)b->data[3]};
763     uint128_t limbMul;
764 
765     // out[0] = a[0]*b[0]
766     limbMul = (uint128_t)a64[0] * b64[0];   // a[0] * b[0]
767     out->data[0] = (uint64_t)limbMul;       // Digit 0 plus the basic digit
768     out->data[1] = limbMul >> BASE_BITS;    // Digit 1 plus the carry
769 
770     // out[1] = a[0]*b[1] + a[1]*b[0]
771     limbMul = (uint128_t)a64[0] * b64[1];   // a[0] * b[1]
772     out->data[1] += (uint64_t)limbMul;      // Digit 1 plus the basic digit
773     out->data[2] = limbMul >> BASE_BITS;    // Digit 2 plus the carry
774 
775     limbMul = (uint128_t)a64[1] * b64[0];   // a[1] * b[0]
776     out->data[1] += (uint64_t)limbMul;      // Digit 1 plus the basic digit
777     out->data[2] += limbMul >> BASE_BITS;   // Digit 2 plus the carry
778 
779     // out[2] = a[0]*b[2] + a[1]*b[1] + a[2]*b[0]
780     limbMul = (uint128_t)a64[0] * b64[2];   // a[0] * b[2]
781     out->data[2] += (uint64_t)limbMul;      // Digit 2 plus the basic digit
782     out->data[3] = limbMul >> BASE_BITS;    // Digit 3 plus the carry
783 
784     limbMul = (uint128_t)a64[1] * b64[1];   // a[1] * a[1]
785     out->data[2] += (uint64_t)limbMul;      // Digit 2 plus the basic digit
786     out->data[3] += limbMul >> BASE_BITS;   // Digit 3 plus the carry
787 
788     limbMul = (uint128_t)a64[2] * b64[0];   // a[2] * b[0]
789     out->data[2] += (uint64_t)limbMul;      // Digit 2 plus the basic digit
790     out->data[3] += limbMul >> BASE_BITS;   // Digit 3 plus the carry
791 
792     // out[3] = a[0]*b[3] + a[1]*b[2] + a[2]*b[1] + a[3]*b[0]
793     limbMul = (uint128_t)a64[0] * b64[3];   // a[0] * b[3]
794     out->data[3] += (uint64_t)limbMul;      // Digit 3 plus the basic digit
795     out->data[4] = limbMul >> BASE_BITS;    // Digit 4 plus the carry
796 
797     limbMul = (uint128_t)a64[1] * b64[2];   // a[1] * b[2]
798     out->data[3] += (uint64_t)limbMul;      // Digit 3 plus the basic digit
799     out->data[4] += limbMul >> BASE_BITS;   // Digit 4 plus the carry
800 
801     limbMul = (uint128_t)a64[2] * b64[1];   // a[2] * b[1]
802     out->data[3] += (uint64_t)limbMul;      // Digit 3 plus the basic digit
803     out->data[4] += limbMul >> BASE_BITS;   // Digit 4 plus the carry
804 
805     limbMul = (uint128_t)a64[3] * b64[0];   // a[3] * b[0]
806     out->data[3] += (uint64_t)limbMul;      // Digit 3 plus the basic digit
807     out->data[4] += limbMul >> BASE_BITS;   // Digit 4 plus the carry
808 
809     // out[4] = a[1]*b[3] + a[2]*b[2] + a[3]*b[1]
810     limbMul = (uint128_t)a64[1] * b64[3];   // a[1] * b[3]
811     out->data[4] += (uint64_t)limbMul;      // Digit 4 plus the basic digit
812     out->data[5] = limbMul >> BASE_BITS;    // Digit 5 plus the carry
813 
814     limbMul = (uint128_t)a64[2] * b64[2];   // a[2] * b[2]
815     out->data[4] += (uint64_t)limbMul;      // Digit 4 plus the basic digit
816     out->data[5] += limbMul >> BASE_BITS;   // Digit 5 plus the carry
817 
818     limbMul = (uint128_t)a64[3] * b64[1];   // a[3] * b[1]
819     out->data[4] += (uint64_t)limbMul;      // Digit 4 plus the basic digit
820     out->data[5] += limbMul >> BASE_BITS;   // Digit 5 plus the carry
821 
822     // out[5] = a[2]*b[3] + a[3]*b[2]
823     limbMul = (uint128_t)a64[2] * b64[3];   // a[2] * b[3]
824     out->data[5] += (uint64_t)limbMul;      // Digit 5 plus the basic digit
825     out->data[6] = limbMul >> BASE_BITS;    // Digit 6 plus the carry
826 
827     limbMul = (uint128_t)a64[3] * b64[2];   // a[3] * b[2]
828     out->data[5] += (uint64_t)limbMul;      // Digit 5 plus the basic digit
829     out->data[6] += limbMul >> BASE_BITS;   // Digit 6 plus the carry
830 
831     // out[6] = a[3]*b[3]
832     limbMul = (uint128_t)a64[3] * b64[3];   // a[3] * b[3]
833     out->data[6] += (uint64_t)limbMul;      // Digit 6 plus the basic digit
834     out->data[7] = limbMul >> BASE_BITS;    // Digit 7 plus the carry
835 }
836 
837 /*
838  * Field square
839  * Input:
840  *      a[] < 2^64
841  * Output:
842  *      out[] < 2^67
843  *      - out[0] < 2^64
844  *      - out[1] < 2^64 * 2
845  *      - out[2] < 2^64 * 4
846  *      - out[3] < 2^64 * 5
847  *      - out[4] < 2^64 * 6
848  *      - out[5] < 2^64 * 4
849  *      - out[6] < 2^64 * 3
850  *      - out[7] < 2^64
851  */
FelemSqr(LongFelem * out,const Felem * a)852 static void FelemSqr(LongFelem *out, const Felem *a)
853 {
854     // out[0] = a[0]*a[0]
855     // out[1] = a[0]*a[1]*2
856     // out[2] = a[0]*a[2]*2 + a[1]*a[1]
857     // out[3] = a[0]*a[3]*2 + a[1]*a[2]*2
858     // out[4] = a[1]*a[3]*2 + a[2]*a[2]
859     // out[5] = a[2]*a[3]*2
860     // out[6] = a[3]*a[3]
861 
862     const uint64_t a64[4] = {(uint64_t)a->data[0], (uint64_t)a->data[1], (uint64_t)a->data[2], (uint64_t)a->data[3]};
863     uint128_t limbMul;
864 
865     // out[0] = a[0]*a[0]
866     limbMul = (uint128_t)a64[0] * a64[0];       // a[0] * a[0]
867     out->data[0] = (uint64_t)limbMul;           // Digit 0 plus the basic digit
868     out->data[1] = limbMul >> BASE_BITS;        // Digit 1 plus the carry
869 
870     // out[1] = a[0]*a[1]*2
871     limbMul = (uint128_t)a64[0] * a64[1];       // a[0] * a[1]
872     out->data[1] += (uint64_t)limbMul << 1;     // basic digit after the product is left shifted by 1bit, add to digit 1
873     out->data[2] = limbMul >> (BASE_BITS - 1);  // carry after product shift, add to digit 2
874 
875     // out[2] = a[0]*a[2]*2 + a[1]*a[1]
876     limbMul = (uint128_t)a64[0] * a64[2];       // a[0] * a[2]
877     out->data[2] += (uint64_t)limbMul << 1;     // basic digit after the product is left shifted by 1bit, add to digit 2
878     out->data[3] = limbMul >> (BASE_BITS - 1);  // carry after product shift, add to digit 3
879 
880     limbMul = (uint128_t)a64[1] * a64[1];       // a[1] * a[1]
881     out->data[2] += (uint64_t)limbMul;          // Digit 2 plus the basic digit
882     out->data[3] += limbMul >> BASE_BITS;       // Digit 3 plus the carry
883 
884     // out[3] = a[0]*a[3]*2 + a[1]*a[2]*2
885     limbMul = (uint128_t)a64[0] * a64[3];       // a[0] * a[3]
886     out->data[3] += (uint64_t)limbMul << 1;     // basic digit after the product is left shifted by 1bit, add to digit 3
887     out->data[4] = limbMul >> (BASE_BITS - 1);  // carry after product shift, add to digit 4
888 
889     limbMul = (uint128_t)a64[1] * a64[2];       // a[1] * a[2]
890     out->data[3] += (uint64_t)limbMul << 1;     // basic digit after the product is left shifted by 1bit, add to digit 3
891     out->data[4] += limbMul >> (BASE_BITS - 1); // carry after product shift, add to digit 4
892 
893     // out[4] = a[1]*a[3]*2 + a[2]*a[2]
894     limbMul = (uint128_t)a64[1] * a64[3];       // a[1] * a[3]
895     out->data[4] += (uint64_t)limbMul << 1;     // basic digit after the product is left shifted by 1bit, add to digit 4
896     out->data[5] = limbMul >> (BASE_BITS - 1);  // carry after product shift, add to digit 5
897 
898     limbMul = (uint128_t)a64[2] * a64[2];       // a[2] * a[2]
899     out->data[4] += (uint64_t)limbMul;          // Digit 4 plus the basic digit
900     out->data[5] += limbMul >> BASE_BITS;       // Digit 5 plus the carry
901 
902     // out[5] = a[2]*a[3]*2
903     limbMul = (uint128_t)a64[2] * a64[3];       // a[2] * a[3]
904     out->data[5] += (uint64_t)limbMul << 1;     // basic digit after the product is left shifted by 1bit, add to digit 5
905     out->data[6] = limbMul >> (BASE_BITS - 1);  // carry after product shift, add to digit 6
906 
907     // out[6] = a[3]*a[3]
908     limbMul = (uint128_t)a64[3] * a64[3];       // a[3] * a[3]
909     out->data[6] += (uint64_t)limbMul;          // Digit 6 plus the basic digit
910     out->data[7] = limbMul >> BASE_BITS;        // Digit 7 plus the carry
911 }
912 
FelemMulReduce(Felem * out,const Felem * a,const Felem * b)913 static inline void FelemMulReduce(Felem *out, const Felem *a, const Felem *b)
914 {
915     LongFelem ltmp;
916     FelemMul(&ltmp, a, b);
917     LongFelemReduce(out, &ltmp);
918 }
919 
FelemSqrReduce(Felem * out,const Felem * in)920 static inline void FelemSqrReduce(Felem *out, const Felem *in)
921 {
922     LongFelem ltmp;
923     FelemSqr(&ltmp, in);
924     LongFelemReduce(out, &ltmp);
925 }
926 
FelemMulReduceToBase(Felem * out,const Felem * a,const Felem * b)927 static inline void FelemMulReduceToBase(Felem *out, const Felem *a, const Felem *b)
928 {
929     LongFelem ltmp;
930     FelemMul(&ltmp, a, b);
931     LongFelemReduce(out, &ltmp);
932     FelemReduce(out, out);
933 }
934 
FelemSqrReduceToBase(Felem * out,const Felem * in)935 static inline void FelemSqrReduceToBase(Felem *out, const Felem *in)
936 {
937     LongFelem ltmp;
938     FelemSqr(&ltmp, in);
939     LongFelemReduce(out, &ltmp);
940     FelemReduce(out, out);
941 }
942 
943 /*
944  * Field element inversion
945  * From Fermat's little theorem, in^(p - 2) = in^(-1) (mod p)
946  * in^(-1) = in^(2^256 - 2^224 + 2^192 + 2^96 - 1 - 2) (mod p)
947  * Input:
948  *      in[] < 2^64
949  * Output:
950  *      out[] < 2^64
951  */
FelemInv(Felem * out,const Felem * in)952 static void FelemInv(Felem *out, const Felem *in)
953 {
954     Felem inE3, inEf, inEff, inEffff, inEffffffff, inEfx16Lsh32, inEfffffffd;
955 
956     // Construct in^(p - 2) by moving left and adding.
957     // The value of p - 2 is as follows:
958     // ffffffff 00000001
959     // 00000000 00000000
960     // 00000000 ffffffff
961     // ffffffff fffffffd
962 
963     // Construct the {1, 3, f, ff, ffff, ffffffff} power of in by left shift and addition.
964     // Construct in^1
965     FelemAssign(out, in);  // in^1
966     // Construct in^3
967     FelemSqrReduceToBase(out, out);  // in^2
968     FelemMulReduceToBase(out, out, in);  // in^3
969     FelemAssign(&inE3, out);
970     // Construct in^f
971     FelemSqrReduceToBase(out, out);  // in^6
972     FelemSqrReduceToBase(out, out);  // in^c
973     FelemMulReduceToBase(&inEfffffffd, out, in);  // inEfffffffd = in^d
974     FelemMulReduceToBase(out, out, &inE3);  // in^f
975     FelemAssign(&inEf, out);
976     // Construct in^ff
977     FelemSqrReduceToBase(out, out);  // in^1e
978     FelemSqrReduceToBase(out, out);  // in^3c
979     FelemSqrReduceToBase(out, out);  // in^78
980     FelemSqrReduceToBase(out, out);  // in^f0
981     FelemMulReduceToBase(&inEfffffffd, out, &inEfffffffd);  // inEfffffffd = in^fd
982     FelemMulReduceToBase(out, out, &inEf);  // in^ff
983     FelemAssign(&inEff, out);
984     // Construct in^ffff and shift ff to the left by 8 bits to obtain ff00
985     for (int32_t i = 0; i < 8; ++i) {
986         FelemSqrReduceToBase(out, out);
987     }  // in^ff00
988     FelemMulReduceToBase(&inEfffffffd, out, &inEfffffffd);  // inEfffffffd = in^fffd
989     FelemMulReduceToBase(out, out, &inEff);  // in^ffff
990     FelemAssign(&inEffff, out);
991     // Construct in^ffffffff and shift ffff to the left by 16 bits to obtain ffff0000
992     for (int32_t i = 0; i < 16; ++i) {
993         FelemSqrReduceToBase(out, out);
994     }  // in^ffff0000
995     FelemMulReduceToBase(&inEfffffffd, out, &inEfffffffd);  // inEfffffffd = in^fffffffd
996     FelemMulReduceToBase(out, out, &inEffff);  // in^ffffffff
997     FelemAssign(&inEffffffff, out);
998 
999     // Construct in^ffffffff ffffffff 00000000
1000     // Obtain in^ffffffff 00000000 and shift ffffffff to the left by 32 bits.
1001     for (int32_t i = 0; i < 32; ++i) {
1002         FelemSqrReduceToBase(out, out);
1003     }  // in^ffffffff 00000000
1004     FelemAssign(&inEfx16Lsh32, out);
1005     // Then obtain ffffffff 00000000 00000000 and shift it leftwards by 32 bits.
1006     for (int32_t i = 0; i < 32; ++i) {
1007         FelemSqrReduceToBase(out, out);
1008     }  // out = in^ffffffff 00000000 00000000
1009     FelemMulReduceToBase(&inEfx16Lsh32, out, &inEfx16Lsh32);  // inEfx16Lsh32 = in^ffffffff ffffffff 00000000
1010 
1011     // Construct in^ffffffff 00000001 00000000
1012     FelemMulReduceToBase(out, out, &inEffffffff);  // in^ffffffff 00000000 ffffffff
1013     FelemMulReduceToBase(out, out, in);  // in^ffffffff 00000001 00000000
1014 
1015     // Shift leftward by 160 bits to the top.
1016     for (int32_t i = 0; i < 160; ++i) {
1017         FelemSqrReduceToBase(out, out);
1018     }  // in^ffffffff 00000001 00000000 00000000 00000000 00000000 00000000 00000000
1019 
1020     // Construct in^ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff 00000000
1021     FelemMulReduceToBase(out, out, &inEfx16Lsh32);
1022     // Construct in^ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff fffffffd
1023     FelemMulReduceToBase(out, out, &inEfffffffd);
1024 }
1025 
1026 /* --------------------------Point group operation-------------------------- */
PtAssign(Point * out,const Point * in)1027 static inline void PtAssign(Point *out, const Point *in)
1028 {
1029     FelemAssign(&out->x, &in->x);
1030     FelemAssign(&out->y, &in->y);
1031     FelemAssign(&out->z, &in->z);
1032 }
1033 
PtAssignWithMask(Point * out,const Point * in,const uint128_t mask)1034 static inline void PtAssignWithMask(Point *out, const Point *in, const uint128_t mask)
1035 {
1036     FelemAssignWithMask(&out->x, &in->x, mask);
1037     FelemAssignWithMask(&out->y, &in->y, mask);
1038     FelemAssignWithMask(&out->z, &in->z, mask);
1039 }
1040 
1041 /*
1042  * point double
1043  * Algorithm reference: http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1044  * Number of field operations: 3M + 5S
1045  *      delta = Z^2
1046  *      gamma = Y^2
1047  *      beta = X * gamma
1048  *      alpha = 3 * (X - delta) * (X + delta)
1049  *      X' = alpha^2 - 8 * beta
1050  *      Z' = (Y + Z)^2 - gamma - delta
1051  *      Y' = alpha * (4 * beta - X') - 8 * gamma^2
1052  * Input:
1053  *      in->x[], in->y[], in->z[] < 2^64
1054  * Output:
1055  *      out->x[], out->y[], out->z[] < 2^64
1056  */
PtDouble(Point * out,const Point * in)1057 static void PtDouble(Point *out, const Point *in)
1058 {
1059     Felem delta, gamma, beta, alpha;
1060     Felem tmp, tmp2;
1061     LongFelem ltmp, ltmp2;
1062 
1063     // delta = Z^2
1064     FelemSqrReduce(&delta, &in->z);  // delta[] < 2^115
1065 
1066     // gamma = Y^2
1067     FelemSqrReduceToBase(&gamma, &in->y);
1068 
1069     // beta = X * gamma
1070     FelemMulReduce(&beta, &in->x, &gamma);  // beta[] < 2^115
1071 
1072     // alpha = 3 * (X - delta) * (X + delta)
1073     FelemAdd(&tmp, &in->x, &delta);     // tmp[] < 2^64 + 2^115
1074     FelemScale(&tmp, &tmp, 3);          // 3 * (X + delta), tmp[] < (2^64 + 2^115) * 3 < 2^117
1075     FelemSub(&tmp2, &in->x, &delta);    // tmp2[] < (2^124 + 2^92 - 2^60) + 2^64 < 2^125
1076     FelemReduce(&tmp, &tmp);
1077     FelemReduce(&tmp2, &tmp2);
1078     FelemMulReduceToBase(&alpha, &tmp, &tmp2);
1079 
1080     // X' = alpha^2 - 8 * beta
1081     FelemSqrReduce(&tmp, &alpha);       // alpha^2, tmp[] < 2^115
1082     FelemScale(&tmp2, &beta, 8);        // 8 * beta, tmp2[] < 2^115 * 8 = 2^119
1083     FelemSub(&out->x, &tmp, &tmp2);     // out->x[] < (2^124 + 2^92 - 2^60) + 2^115 < 2^125
1084     FelemReduce(&out->x, &out->x);      // out->x[] < 2^64
1085 
1086     // Z' = (Y + Z)^2 - gamma - delta
1087     FelemAdd(&tmp, &in->y, &in->z);
1088     FelemReduce(&tmp, &tmp);
1089     FelemSqrReduce(&tmp, &tmp);         // (Y + Z)^2, tmp[] < 2^115
1090     FelemAdd(&tmp2, &gamma, &delta);    // (gamma + delta), tmp2[] < 2^64 + 2^115 < 2^116
1091     FelemSub(&out->z, &tmp, &tmp2);     // out->z[] < (2^124 + 2^92 - 2^60) + 2^115 < 2^125
1092     FelemReduce(&out->z, &out->z);      // out->z[] < 2^64
1093 
1094     // Y' = alpha * (4 * beta - X') - 8 * gamma^2
1095     FelemScale(&beta, &beta, 4);        // beta[] < 2^115 * 4 = 2^117
1096     FelemSub(&tmp, &beta, &out->x);
1097     FelemReduce(&tmp, &tmp);
1098     FelemMul(&ltmp, &alpha, &tmp);      // alpha * (4 * beta - X'), ltmp[] < 2^67
1099     FelemSqr(&ltmp2, &gamma);
1100     LongFelemScale(&ltmp2, &ltmp2, 8);  // 8 * gamma^2, ltmp2[] < 2^67 * 8 < 2^70
1101     LongFelemSub(&ltmp, &ltmp, &ltmp2); // ltmp[] < (2^74 + 2^42 - 2^10) + 2^67 < 2^75
1102     LongFelemReduce(&out->y, &ltmp);    // out->y[] < 2^115
1103     FelemReduce(&out->y, &out->y);      // out->y[] < 2^64
1104 }
1105 
1106 /*
1107  * point addition
1108  * Algorithm reference: http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
1109  * Infinity point calculation is not supported.
1110  * Number of field operations: 11M + 5S
1111  *      Z1Z1 = Z1^2
1112  *      Z2Z2 = Z2^2
1113  *      U1 = X1 * Z2Z2
1114  *      U2 = X2 * Z1Z1
1115  *      S1 = Y1 * Z2 * Z2Z2
1116  *      S2 = Y2 * Z1 * Z1Z1
1117  *      H = U2 - U1
1118  *      I = (2 * H)^2
1119  *      J = H * I
1120  *      r = 2 * (S2 - S1)
1121  *      V = U1 * I
1122  *      X3 = r^2 - J - 2 * V
1123  *      Y3 = r * (V - X3) - 2 * S1 * J
1124  *      Z3 = ((Z1 + Z2)^2 - Z1Z1 - Z2Z2) * H
1125  * Input:
1126  *      a->x[], a->y[], a->z[] < 2^64
1127  *      b->x[], b->y[], b->z[] < 2^64
1128  * Output:
1129  *      out->x[], out->y[], out->z[] < 2^64
1130  */
PtAdd(Point * out,const Point * a,const Point * b)1131 static void PtAdd(Point *out, const Point *a, const Point *b)
1132 {
1133     Point result;
1134     Felem z1sqr, z2sqr, u1, u2, s1, s2, h, i, j, r, v;
1135     Felem tmp;
1136     LongFelem ltmp, ltmp2;
1137     uint128_t isZ1Zero, isZ2Zero;
1138 
1139     // Z1Z1 = Z1^2
1140     FelemSqrReduceToBase(&z1sqr, &a->z);
1141 
1142     // Z2Z2 = Z2^2
1143     FelemSqrReduceToBase(&z2sqr, &b->z);
1144 
1145     isZ1Zero = 0 - FelemIsZero(&z1sqr);
1146     isZ2Zero = 0 - FelemIsZero(&z2sqr);
1147 
1148     // U1 = X1 * Z2Z2
1149     FelemMulReduceToBase(&u1, &a->x, &z2sqr);
1150 
1151     // U2 = X2 * Z1Z1
1152     FelemMulReduce(&u2, &b->x, &z1sqr);  // u2[] < 2^115
1153 
1154     // S1 = Y1 * Z2 * Z2Z2
1155     FelemMulReduceToBase(&s1, &b->z, &z2sqr);
1156     FelemMulReduceToBase(&s1, &a->y, &s1);
1157 
1158     // S2 = Y2 * Z1 * Z1Z1
1159     FelemMulReduceToBase(&s2, &a->z, &z1sqr);
1160     FelemMulReduce(&s2, &b->y, &s2);  // s2[] < 2^115
1161 
1162     // H = U2 - U1
1163     FelemSub(&h, &u2, &u1);  // h[] < (2^124 + 2^92 - 2^60) + 2^115 < 2^125
1164     FelemReduce(&h, &h);
1165 
1166     // r = 2 * (S2 - S1)
1167     FelemSub(&r, &s2, &s1);  // r[] < (2^124 + 2^92 - 2^60) + 2^115 < 2^125
1168     FelemScale(&r, &r, 2);   // r[] < 2^126
1169     FelemReduce(&r, &r);
1170 
1171     // H and r can determine whether x and y of the affine coordinates of two points are equal.
1172     // If the values are equal, use double().
1173     if (isZ1Zero == 0 && isZ2Zero == 0 && FelemIsZero(&h) != 0 && FelemIsZero(&r) != 0) {
1174         // Use a smaller b point
1175         PtDouble(out, b);
1176         return;
1177     }
1178 
1179     // I = (2 * H)^2
1180     FelemSqrReduce(&i, &h);  // H^2, i[] < 2^115
1181     FelemScale(&i, &i, 4);   // 4 * H^2, i[] < 2^117
1182     FelemReduce(&i, &i);
1183 
1184     // J = H * I
1185     FelemMulReduceToBase(&j, &h, &i);
1186 
1187     // V = U1 * I
1188     FelemMulReduce(&v, &u1, &i);  // v[] < 2^115
1189 
1190     // X3 = r^2 - (J + 2 * V)
1191     FelemSqrReduce(&result.x, &r);              // result.x[] < 2^115
1192     FelemScale(&tmp, &v, 2);                    // tmp[] < 2^115 * 2 = 2^116
1193     FelemAdd(&tmp, &j, &tmp);                   // tmp[] < 2^64 + 2^116 < 2^117
1194     FelemSub(&result.x, &result.x, &tmp);       // result.x[] < (2^124 + 2^90 - 2^60) + 2^115 < 2^125
1195     FelemReduce(&result.x, &result.x);          // result.x[] < 2^64
1196 
1197     // Y3 = r * (V - X3) - 2 * S1 * J
1198     FelemSub(&tmp, &v, &result.x);
1199     FelemReduce(&tmp, &tmp);
1200     FelemMul(&ltmp, &r, &tmp);                  // r * (V - X3), ltmp[] < 2^67
1201     FelemMul(&ltmp2, &s1, &j);                  // ltmp2[] < 2^67
1202     LongFelemScale(&ltmp2, &ltmp2, 2);          // 2 * S1 * J, ltmp2[] < 2^68
1203     LongFelemSub(&ltmp, &ltmp, &ltmp2);         // ltmp[] < (2^74 + 2^42 - 2^10) + 2^67 < 2^75
1204     LongFelemReduce(&result.y, &ltmp);          // result.y[] < 2^115
1205     FelemReduce(&result.y, &result.y);          // result.y[] < 2^64
1206 
1207     // Z3 = ((Z1 + Z2)^2 - Z1Z1 - Z2Z2) * H
1208     FelemAdd(&result.z, &a->z, &b->z);
1209     FelemReduce(&result.z, &result.z);
1210     FelemSqrReduce(&result.z, &result.z);       // (Z1 + Z2)^2
1211     FelemAdd(&tmp, &z1sqr, &z2sqr);             // Z1Z1 + Z2Z2
1212     FelemSub(&result.z, &result.z, &tmp);       // ((Z1 + Z2)^2 - Z1Z1 - Z2Z2)
1213     FelemReduce(&result.z, &result.z);
1214     FelemMulReduceToBase(&result.z, &result.z, &h);   // result.z[] < 2^64
1215 
1216     // Special case processing for infinity points
1217     PtAssignWithMask(&result, a, isZ2Zero);
1218     PtAssignWithMask(&result, b, isZ1Zero);
1219     PtAssign(out, &result);
1220 }
1221 
1222 /*
1223  * Mixed point addition
1224  * Algorithm reference: http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-madd-2007-bl
1225  * Infinity point calculation is not supported.
1226  * Number of field operations: 7M + 4S
1227  *      Z1Z1 = Z1^2
1228  *      U2 = X2 * Z1Z1
1229  *      S2 = Y2 * Z1 * Z1Z1
1230  *      H = U2 - X1
1231  *      HH = H^2
1232  *      I = 4 * HH
1233  *      J = H * I
1234  *      r = 2 * (S2 - Y1)
1235  *      V = X1 * I
1236  *      X3 = r^2 - J - 2 * V
1237  *      Y3 = r * (V - X3) - 2 * Y1 * J
1238  *      Z3 = (Z1 + H)^2 - Z1Z1 - HH
1239  * Input:
1240  *      a->x[] < 2^64, a->y[] < 2^64, a->z[] < 2^64
1241  *      b->x < p, b->y < p, b->z = 0 或 1
1242  * Output:
1243  *      out->x[] < 2^64
1244  *      out->y[] < 2^64
1245  *      out->z[] < 2^64
1246  */
PtAddMixed(Point * out,const Point * a,const Point * b)1247 static void PtAddMixed(Point *out, const Point *a, const Point *b)
1248 {
1249     Point result;
1250     Felem z1sqr, u2, s2, h, hsqr, i, j, r, v;
1251     Felem tmp;
1252     LongFelem ltmp, ltmp2;
1253     uint128_t isZ1Zero, isZ2Zero;
1254 
1255     // Z1Z1 = Z1^2
1256     FelemSqrReduceToBase(&z1sqr, &a->z);
1257 
1258     isZ1Zero = 0 - FelemIsZero(&z1sqr);
1259     // The Z coordinate of point b can only be 0 or 1, that is, digit 0 can only be 0 or 1, and the other digits are 0.
1260     isZ2Zero = b->z.data[0] - 1;
1261 
1262     // U2 = X2 * Z1Z1
1263     FelemMulReduce(&u2, &b->x, &z1sqr);  // u2[] < 2^115
1264 
1265     // S2 = Y2 * Z1 * Z1Z1
1266     FelemMulReduceToBase(&s2, &a->z, &z1sqr);
1267     FelemMulReduce(&s2, &b->y, &s2);     // s2[] < 2^115
1268 
1269     // H = U2 - X1
1270     FelemSub(&h, &u2, &a->x);  // h[] < (2^124 + 2^92 - 2^60) + 2^115 < 2^125
1271     FelemReduce(&h, &h);
1272 
1273     // r = 2 * (S2 - Y1)
1274     FelemSub(&r, &s2, &a->y);  // r[] < (2^124 + 2^92 - 2^60) + 2^115 < 2^125
1275     FelemScale(&r, &r, 2);     // r[] < 2^126
1276     FelemReduce(&r, &r);
1277 
1278     // H and r can determine whether x and y of the affine coordinates of two points are equal.
1279     // If the values are equal, use double().
1280     if (isZ1Zero == 0 && isZ2Zero == 0 && FelemIsZero(&h) != 0 && FelemIsZero(&r) != 0) {
1281         // Use a smaller b point
1282         PtDouble(out, b);
1283         return;
1284     }
1285 
1286     // HH = H^2
1287     FelemSqrReduce(&hsqr, &h);  // hsqr[] < 2^115
1288 
1289     // I = 4 * HH
1290     FelemScale(&i, &hsqr, 4);   // i[] < 2^117
1291     FelemReduce(&i, &i);
1292 
1293     // J = H * I
1294     FelemMulReduceToBase(&j, &h, &i);
1295 
1296     // V = X1 * I
1297     FelemMulReduce(&v, &a->x, &i);  // v[] < 2^115
1298 
1299     // X3 = r^2 - J - 2 * V
1300     FelemSqrReduce(&result.x, &r);
1301     FelemScale(&tmp, &v, 2);                // tmp[] < 2^116
1302     FelemAdd(&tmp, &j, &tmp);               // tmp[] < 2^64 + 2^116 < 2^117
1303     FelemSub(&result.x, &result.x, &tmp);   // result.x[] < (2^124 + 2^92 - 2^60) + 2^115 < 2^125
1304     FelemReduce(&result.x, &result.x);      // result.x[] < 2^64
1305 
1306     // Y3 = r * (V - X3) - 2 * Y1 * J
1307     FelemSub(&tmp, &v, &result.x);          // tmp[] < (2^124 + 2^92 - 2^60) + 2^115 < 2^125
1308     FelemReduce(&tmp, &tmp);
1309     FelemMul(&ltmp, &r, &tmp);              // ltmp[] < 2^67
1310     FelemMul(&ltmp2, &a->y, &j);            // ltmp2[] < 2^67
1311     LongFelemScale(&ltmp2, &ltmp2, 2);      // ltmp2[] < 2^68
1312     LongFelemSub(&ltmp, &ltmp, &ltmp2);     // ltmp[] < (2^74 + 2^42 - 2^10) + 2^67 < 2^75
1313     LongFelemReduce(&result.y, &ltmp);      // result.y[] < 2^115
1314     FelemReduce(&result.y, &result.y);      // result.y[] < 2^64
1315 
1316     // Z3 = (Z1 + H)^2 - Z1Z1 - HH
1317     FelemAdd(&result.z, &a->z, &h);         // result.z[] < 2^64 + 2^64 = 2^65
1318     FelemReduce(&result.z, &result.z);
1319     FelemSqrReduce(&result.z, &result.z);   // result.z[] < 2^115
1320     FelemAdd(&tmp, &z1sqr, &hsqr);          // tmp[] < 2^64 + 2^115 < 2^116
1321     FelemSub(&result.z, &result.z, &tmp);   // result.z[] < (2^124 + 2^92 - 2^60) + 2^115 < 2^125
1322     FelemReduce(&result.z, &result.z);      // result.z[] < 2^64
1323 
1324     // Special case processing for infinity points
1325     PtAssignWithMask(&result, a, isZ2Zero);
1326     PtAssignWithMask(&result, b, isZ1Zero);
1327     PtAssign(out, &result);
1328 }
1329 
1330 /* Select the point with subscript index in the table and place it in the point. Anti-side channel process is exists. */
GetPointFromTable(Point * point,const Point table[],uint32_t pointNum,const uint32_t index)1331 static inline void GetPointFromTable(Point *point, const Point table[], uint32_t pointNum, const uint32_t index)
1332 {
1333     uint128_t mask;
1334     for (uint32_t i = 0; i < pointNum; i++) {
1335         /* If i is equal to index, the last mask is all Fs. Otherwise, the last mask is all 0s. */
1336         mask = (0 - (i ^ index)) >> 31;  // shifted rightwards by 31 bits and obtain the most significant bit.
1337         mask--;
1338         /* Conditional value assignment, valid only when i == index */
1339         PtAssignWithMask(point, &table[i], mask);
1340     }
1341 }
1342 
1343 /*
1344  * Input:
1345  *      k1 < n
1346  *      0 <= i < 32
1347  * Output:
1348  *      out->x < p, out->y < p, out->z = 0 或 1
1349  */
GetLowerPrecomputePtOfG(Point * out,const Felem * k1,int32_t curBit)1350 static inline void GetLowerPrecomputePtOfG(Point *out, const Felem *k1, int32_t curBit)
1351 {
1352     uint32_t bits;
1353     uint32_t i = (uint32_t)curBit;
1354 
1355     bits = (uint32_t)(k1->data[0] >> i) & 1;          // i-bit of the lower half of the digit 0
1356     bits |= ((uint32_t)(k1->data[1] >> i) & 1) << 1;  // i-bit of the lower half of the digit 1
1357     bits |= ((uint32_t)(k1->data[2] >> i) & 1) << 2;  // i-bit of the lower half of the digit 2
1358     bits |= ((uint32_t)(k1->data[3] >> i) & 1) << 3;  // i-bit of the lower half of the digit 3
1359 
1360     GetPointFromTable(out, PRE_MUL_G, TABLE_G_SIZE, bits);
1361 }
1362 
1363 /*
1364  * Input:
1365  *      k1 < n
1366  *      0 <= i < 32
1367  * Output:
1368  *      out->x < p, out->y < p, out->z = 0 或 1
1369  */
GetUpperPrecomputePtOfG(Point * out,const Felem * k1,int32_t curBit)1370 static inline void GetUpperPrecomputePtOfG(Point *out, const Felem *k1, int32_t curBit)
1371 {
1372     uint32_t bits;
1373     uint32_t i = (uint32_t)curBit;
1374 
1375     // i-bit of the upper half of the digit 0. (BASE_BITS/2) is the half width.
1376     bits = (uint32_t)(k1->data[0] >> (i + BASE_BITS / 2)) & 1;
1377     // i-bit of the upper half of the digit 1. (BASE_BITS/2) is the half width.
1378     bits |= ((uint32_t)(k1->data[1] >> (i + BASE_BITS / 2)) & 1) << 1;
1379     // i-bit of the upper half of the digit 2. (BASE_BITS/2) is the half width.
1380     bits |= ((uint32_t)(k1->data[2] >> (i + BASE_BITS / 2)) & 1) << 2;
1381     // i-bit of the upper half of the digit 3. (BASE_BITS/2) is the half width.
1382     bits |= ((uint32_t)(k1->data[3] >> (i + BASE_BITS / 2)) & 1) << 3;
1383 
1384     GetPointFromTable(out, PRE_MUL_G2, TABLE_G_SIZE, bits);
1385 }
1386 
1387 /*
1388  * Input:
1389  *      k2 < n
1390  *      0 <= i <= 255
1391  *      The coordinates of each point of preMulPt are reduced.
1392  * Output:
1393  *      out->x[] < 2^64, out->y[] < 2^64, out->z[] < 2^64
1394  */
GetPrecomputePtOfP(Point * out,const Felem * k2,int32_t curBit,const Point preMulPt[TABLE_P_SIZE])1395 static inline void GetPrecomputePtOfP(Point *out, const Felem *k2, int32_t curBit, const Point preMulPt[TABLE_P_SIZE])
1396 {
1397     uint32_t bits;
1398     uint32_t sign, value;  // Indicates the grouping sign and actual value.
1399     Felem negY;
1400     // Obtain the 5-bit signed code. Read the sign bits of the next group of numbers
1401     // to determine whether there is a carry. The total length is 6.
1402     bits = (uint32_t)FelemGetBits(k2, curBit - 1, WINDOW_SIZE + 1);
1403     DecodeScalarCode(&sign, &value, bits);
1404 
1405     GetPointFromTable(out, preMulPt, TABLE_P_SIZE, value);
1406 
1407     FelemNeg(&negY, &out->y);
1408     FelemReduce(&negY, &negY);
1409     FelemAssignWithMask(&out->y, &negY, (uint128_t)0 - sign);
1410 }
1411 
1412 /*
1413  * Calculate k1 * G + k2 * P
1414  * Input:
1415  *      k1 < n
1416  *      k2 < n
1417  *      The coordinates of each point of preMulPt are reduced.
1418  * Output:
1419  *      out->x < p, out->y < p, out->z < p
1420  */
PtMul(Point * out,const Felem * k1,const Felem * k2,const Point preMulPt[TABLE_P_SIZE])1421 static void PtMul(Point *out, const Felem *k1, const Felem *k2, const Point preMulPt[TABLE_P_SIZE])
1422 {
1423     Point ptQ = {};  // ptQ stores the result
1424     Point ptPre = {};  // ptPre stores the points obtained from the table.
1425     bool isGMul = k1 != NULL;
1426     bool isPtMul = k2 != NULL && preMulPt != NULL;
1427     int32_t curBit;
1428 
1429     // Initialize the Q point.
1430     if (isPtMul) {
1431         curBit = 255;  // Start from 255th bit.
1432         // Select the initial point from bit coding (_, _, _, _, 255, 254) of k2
1433         GetPrecomputePtOfP(&ptQ, k2, curBit, preMulPt);
1434     } else if (isGMul) {
1435         curBit = 31;  // Start from 31.
1436         // Select the initial point from bit coding (223, 159, 95, 31) of k1
1437         GetLowerPrecomputePtOfG(&ptQ, k1, curBit);
1438         // Select a precomputation point from the (223 + 32, 159 + 32, 95 + 32, 31 + 32) bit of k1
1439         // and add the precomputation point to the point Q
1440         GetUpperPrecomputePtOfG(&ptPre, k1, curBit);
1441         PtAddMixed(&ptQ, &ptQ, &ptPre);
1442     } else {
1443         // k1 and k2 are NULL, output the infinite point.
1444         (void)memset_s((void *)out, sizeof(Point), 0, sizeof(Point));
1445         return;
1446     }
1447 
1448     //     Operation chain:                                     Q point output range:
1449     //                                                        x[]         y[]         z[]
1450     //        Init value                                    < 2^64      < 2^64      < 2^64
1451     //            ↓
1452     //         double        ←↑                             < 2^64      < 2^64      < 2^64
1453     //            ↓           ↑
1454     //       mixed add        ↑                             < 2^64      < 2^64      < 2^64
1455     //            ↓           ↑
1456     //       mixed add       →↑                             < 2^64      < 2^64      < 2^64
1457     //            ↓           ↑
1458     // Y negation & reduction ↑                             < 2^64      < 2^64      < 2^64
1459     //            ↓           ↑
1460     //          add          →↑                             < 2^64      < 2^64      < 2^64
1461 
1462     while (--curBit >= 0) {
1463         // Start to shift right bit by bit. Because the most significant bit is initialized,
1464         // common point multiplication starts from 254th bit and base point multiplication starts from 30th bit.
1465         // Whether G-point multiplication is performed in the current cycle.
1466         // It is calculated once in each cycle starting from bit 31.
1467         bool isStepGMul = curBit <= 31;
1468         // Whether the current cycle is a common point multiplication, calculated once every 5 cycles
1469         bool isStepPtMul = curBit % WINDOW_SIZE == 0;
1470 
1471         PtDouble(&ptQ, &ptQ);
1472 
1473         // Generator G-point multiplication part.
1474         // Divide k1 into 8 segments, from high bits to low bits, select bits from each segment
1475         // and combine them together, then read the pre-computation table.
1476         // Specially, to shrink the precomputation table, the divided 8 segments are combined
1477         // according to the upper half and the lower half.
1478         if (isGMul && isStepGMul) {
1479             // Add the point multiplication result of the current bit of the eight-segment packet to the point Q
1480             GetLowerPrecomputePtOfG(&ptPre, k1, curBit);
1481             PtAddMixed(&ptQ, &ptQ, &ptPre);
1482 
1483             GetUpperPrecomputePtOfG(&ptPre, k1, curBit);
1484             PtAddMixed(&ptQ, &ptQ, &ptPre);
1485         }
1486 
1487         // Common point multiplication part.
1488         // Use the sliding window signed encoding method
1489         // to group the most significant bits to the least significant bits every five bits.
1490         // Each group of numbers is regarded as a signed number. 00000 to 01111 are decimal numbers 0 to 15,
1491         // and 10000 to 11111 are decimal numbers (32-16) to (32-1).
1492         // This is equivalent to the set of numbers in the complement form after carry 1.
1493         // for example:
1494         // 11011(complement) = 100000 - 00101
1495         if (isPtMul && isStepPtMul) {
1496             // Add the point multiplication result of the current group to point Q.
1497             GetPrecomputePtOfP(&ptPre, k2, curBit, preMulPt);
1498             PtAdd(&ptQ, &ptQ, &ptPre);
1499         }
1500     }
1501 
1502     // Output the modulo operation.
1503     FelemContract(&ptQ.x, &ptQ.x);
1504     FelemContract(&ptQ.y, &ptQ.y);
1505     FelemContract(&ptQ.z, &ptQ.z);
1506 
1507     PtAssign(out, &ptQ);
1508 }
1509 
1510 /*
1511  * Convert Jacobian coordinates to affine coordinates by a given module inverse
1512  * Input:
1513  *      in->x[] < 2^64, in->y[] < 2^64
1514  *      zInv < 2^64
1515  * Output:
1516  *      out->x < p, out->y < p, out->z = 1
1517  */
PtMakeAffineWithInv(Point * out,const Point * in,const Felem * zInv)1518 static void PtMakeAffineWithInv(Point *out, const Point *in, const Felem *zInv)
1519 {
1520     Felem tmp;
1521 
1522     // 1/Z^2
1523     FelemSqrReduceToBase(&tmp, zInv);
1524     // X/Z^2
1525     FelemMulReduceToBase(&out->x, &in->x, &tmp);
1526     FelemContract(&out->x, &out->x);
1527 
1528     // 1/Z^3
1529     FelemMulReduceToBase(&tmp, &tmp, zInv);
1530     // Y/Z^3
1531     FelemMulReduceToBase(&out->y, &in->y, &tmp);
1532     FelemContract(&out->y, &out->y);
1533 
1534     FelemSetLimb(&out->z, 1);
1535 }
1536 
1537 /* --------------------------other functions-------------------------- */
1538 /*
1539  * Obtain the pre-multiplication table of the input point pt, including 0pt-16pt. All points are reduced.
1540  */
GetPreMulPt(Point preMulPt[TABLE_P_SIZE],const ECC_Point * pt)1541 static int32_t GetPreMulPt(Point preMulPt[TABLE_P_SIZE], const ECC_Point *pt)
1542 {
1543     int32_t ret;
1544 
1545     // 0pt
1546     (void)memset_s((void *)&preMulPt[0], sizeof(Point), 0, sizeof(Point));
1547     // 1pt
1548     GOTO_ERR_IF_EX(BN2Felem(&preMulPt[1].x, pt->x), ret);
1549     GOTO_ERR_IF_EX(BN2Felem(&preMulPt[1].y, pt->y), ret);
1550     GOTO_ERR_IF_EX(BN2Felem(&preMulPt[1].z, pt->z), ret);
1551     // 2pt ~ 15pt
1552     for (uint32_t i = 2; i < 15; i += 2) {
1553         PtDouble(&preMulPt[i], &preMulPt[i >> 1]);
1554         PtAdd(&preMulPt[i + 1], &preMulPt[i], &preMulPt[1]);
1555     }
1556     // 16pt
1557     PtDouble(&preMulPt[16], &preMulPt[16 >> 1]);
1558 ERR:
1559     return ret;
1560 }
1561 
ECP256_PointMulAdd(ECC_Para * para,ECC_Point * r,const BN_BigNum * k1,const BN_BigNum * k2,const ECC_Point * pt)1562 int32_t ECP256_PointMulAdd(
1563     ECC_Para *para, ECC_Point *r, const BN_BigNum *k1, const BN_BigNum *k2, const ECC_Point *pt)
1564 {
1565     int32_t ret;
1566     Felem felemK1;
1567     Felem felemK2;
1568     Point preMulPt[TABLE_P_SIZE];
1569     Point out;
1570 
1571     // Check the input parameters.
1572     GOTO_ERR_IF(CheckParaValid(para, CRYPT_ECC_NISTP256), ret);
1573     GOTO_ERR_IF(CheckPointValid(r, CRYPT_ECC_NISTP256), ret);
1574     GOTO_ERR_IF(CheckBnValid(k1, FELEM_BITS), ret);
1575     GOTO_ERR_IF(CheckBnValid(k2, FELEM_BITS), ret);
1576     GOTO_ERR_IF(CheckPointValid(pt, CRYPT_ECC_NISTP256), ret);
1577     // Special treatment of infinity points
1578     if (BN_IsZero(pt->z)) {
1579         BSL_ERR_PUSH_ERROR(CRYPT_ECC_POINT_AT_INFINITY);
1580         return CRYPT_ECC_POINT_AT_INFINITY;
1581     }
1582 
1583     GOTO_ERR_IF_EX(BN2Felem(&felemK1, k1), ret);
1584     GOTO_ERR_IF_EX(BN2Felem(&felemK2, k2), ret);
1585     GOTO_ERR_IF_EX(GetPreMulPt(preMulPt, pt), ret);
1586 
1587     PtMul(&out, &felemK1, &felemK2, preMulPt);
1588 
1589     GOTO_ERR_IF_EX(Felem2BN(r->x, &out.x), ret);
1590     GOTO_ERR_IF_EX(Felem2BN(r->y, &out.y), ret);
1591     GOTO_ERR_IF_EX(Felem2BN(r->z, &out.z), ret);
1592 ERR:
1593     return ret;
1594 }
1595 
ECP256_PointMul(ECC_Para * para,ECC_Point * r,const BN_BigNum * k,const ECC_Point * pt)1596 int32_t ECP256_PointMul(ECC_Para *para, ECC_Point *r, const BN_BigNum *k, const ECC_Point *pt)
1597 {
1598     int32_t ret;
1599     Felem felemK;
1600     Point preMulPt[TABLE_P_SIZE];
1601     Point out;
1602 
1603     // Check the input parameters.
1604     GOTO_ERR_IF(CheckParaValid(para, CRYPT_ECC_NISTP256), ret);
1605     GOTO_ERR_IF(CheckPointValid(r, CRYPT_ECC_NISTP256), ret);
1606     GOTO_ERR_IF(CheckBnValid(k, FELEM_BITS), ret);
1607     if (pt != NULL) {
1608         GOTO_ERR_IF(CheckPointValid(pt, CRYPT_ECC_NISTP256), ret);
1609         // Special treatment of infinity points
1610         if (BN_IsZero(pt->z)) {
1611             BSL_ERR_PUSH_ERROR(CRYPT_ECC_POINT_AT_INFINITY);
1612             return CRYPT_ECC_POINT_AT_INFINITY;
1613         }
1614     }
1615 
1616     GOTO_ERR_IF_EX(BN2Felem(&felemK, k), ret);
1617     if (pt != NULL) {
1618         GOTO_ERR_IF_EX(GetPreMulPt(preMulPt, pt), ret);
1619         PtMul(&out, NULL, &felemK, preMulPt);
1620     } else {
1621         PtMul(&out, &felemK, NULL, NULL);
1622     }
1623 
1624     GOTO_ERR_IF_EX(Felem2BN(r->x, &out.x), ret);
1625     GOTO_ERR_IF_EX(Felem2BN(r->y, &out.y), ret);
1626     GOTO_ERR_IF_EX(Felem2BN(r->z, &out.z), ret);
1627 ERR:
1628     return ret;
1629 }
1630 
ECP256_Point2Affine(const ECC_Para * para,ECC_Point * r,const ECC_Point * pt)1631 int32_t ECP256_Point2Affine(const ECC_Para *para, ECC_Point *r, const ECC_Point *pt)
1632 {
1633     int32_t ret;
1634     Point out;
1635     Felem zInv;
1636 
1637     // Check the input parameters.
1638     GOTO_ERR_IF(CheckParaValid(para, CRYPT_ECC_NISTP256), ret);
1639     GOTO_ERR_IF(CheckPointValid(r, CRYPT_ECC_NISTP256), ret);
1640     GOTO_ERR_IF(CheckPointValid(pt, CRYPT_ECC_NISTP256), ret);
1641     // Special treatment of infinity points
1642     if (BN_IsZero(pt->z)) {
1643         BSL_ERR_PUSH_ERROR(CRYPT_ECC_POINT_AT_INFINITY);
1644         return CRYPT_ECC_POINT_AT_INFINITY;
1645     }
1646 
1647     GOTO_ERR_IF_EX(BN2Felem(&out.x, pt->x), ret);
1648     GOTO_ERR_IF_EX(BN2Felem(&out.y, pt->y), ret);
1649     GOTO_ERR_IF_EX(BN2Felem(&out.z, pt->z), ret);
1650 
1651     FelemInv(&zInv, &out.z);
1652     PtMakeAffineWithInv(&out, &out, &zInv);
1653 
1654     GOTO_ERR_IF_EX(Felem2BN(r->x, &out.x), ret);
1655     GOTO_ERR_IF_EX(Felem2BN(r->y, &out.y), ret);
1656     GOTO_ERR_IF_EX(Felem2BN(r->z, &out.z), ret);
1657 ERR:
1658     return ret;
1659 }
1660 
ECP256_ModOrderInv(const ECC_Para * para,BN_BigNum * r,const BN_BigNum * a)1661 int32_t ECP256_ModOrderInv(const ECC_Para *para, BN_BigNum *r, const BN_BigNum *a)
1662 {
1663     if (para == NULL || r == NULL || a == NULL) {
1664         BSL_ERR_PUSH_ERROR(CRYPT_NULL_INPUT);
1665         return CRYPT_NULL_INPUT;
1666     }
1667 
1668     if (para->id != CRYPT_ECC_NISTP256) {
1669         BSL_ERR_PUSH_ERROR(CRYPT_ECC_POINT_ERR_CURVE_ID);
1670         return CRYPT_ECC_POINT_ERR_CURVE_ID;
1671     }
1672 
1673     if (BN_IsZero(a)) {
1674         BSL_ERR_PUSH_ERROR(CRYPT_ECC_INVERSE_INPUT_ZERO);
1675         return CRYPT_ECC_INVERSE_INPUT_ZERO;
1676     }
1677     return ECP_ModOrderInv(para, r, a);
1678 }
1679 
1680 #endif /* defined(HITLS_CRYPTO_CURVE_NISTP256) && defined(HITLS_CRYPTO_NIST_USE_ACCEL) */
1681