• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * This file is part of the openHiTLS project.
3 *
4 * openHiTLS is licensed under the Mulan PSL v2.
5 * You can use this software according to the terms and conditions of the Mulan PSL v2.
6 * You may obtain a copy of Mulan PSL v2 at:
7 *
8 *     http://license.coscl.org.cn/MulanPSL2
9 *
10 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13 * See the Mulan PSL v2 for more details.
14 */
15
16#include "hitls_build.h"
17#ifdef HITLS_CRYPTO_CURVE_SM2
18#include "crypt_arm.h"
19.file	"ecp_sm2_armv8.S"
20
21#define	s0 x7
22#define	s1 x8
23#define	s2 x9
24#define	s3 x10
25#define	s4 x11
26#define	s5 x12
27#define	s6 x13
28#define	s7 x14
29
30.section .rodata
31# The polynomial
32.align	4
33.Lpoly:
34.quad	0xffffffffffffffff, 0xffffffff00000000, 0xffffffffffffffff, 0xfffffffeffffffff
35# The order of polynomial
36.Lord:
37.quad	0x53bbf40939d54123, 0x7203df6b21c6052b, 0xffffffffffffffff, 0xfffffffeffffffff
38
39.Lpoly_div_2:
40.quad	0x8000000000000000, 0xffffffff80000000, 0xffffffffffffffff, 0x7fffffff7fffffff
41.Lord_div_2:
42.quad	0xa9ddfa049ceaa092, 0xb901efb590e30295, 0xffffffffffffffff, 0x7fffffff7fffffff
43
44.Lzero:
45.quad	0, 0, 0, 0
46.Lord_1div4:
47.quad	0xd4eefd024e755049, 0xdc80f7dac871814a, 0xffffffffffffffff, 0x3fffffffbfffffff
48.Lord_2div4:
49.quad	0xa9ddfa049ceaa092, 0xb901efb590e30295, 0xffffffffffffffff, 0x7fffffff7fffffff
50.Lord_3div4:
51.quad	0x7eccf706eb5ff0db, 0x9582e790595483e0, 0xffffffffffffffff, 0xbfffffff3fffffff
52
53.Lpoly_1div4:
54.quad	0x4000000000000000, 0xffffffffc0000000, 0xffffffffffffffff, 0x3fffffffbfffffff
55.Lpoly_2div4:
56.quad	0x8000000000000000, 0xffffffff80000000, 0xffffffffffffffff, 0x7fffffff7fffffff
57.Lpoly_3div4:
58.quad	0xc000000000000000, 0xffffffff40000000, 0xffffffffffffffff, 0xbfffffff3fffffff
59
60.LRR:	//	2^512 mod P precomputed for sm2 polynomial
61.quad	0x0000000200000003, 0x00000002ffffffff, 0x0000000100000001, 0x0000000400000002
62.Lone_mont:
63.quad	0x0000000000000001, 0x00000000ffffffff, 0x0000000000000000, 0x0000000100000000
64.Lone:
65.quad	1,0,0,0
66
67.text
68### Right shift: in >> 1 ###
69# void ECP_Sm2Div2(BN_UINT *r, BN_UINT *a);
70# 1-bit right shift
71.globl	ECP_Sm2Div2
72.type	ECP_Sm2Div2, %function
73.align	4
74ECP_Sm2Div2:
75AARCH64_PACIASP
76	# Load inputs
77	ldp	x9, x10, [x1]
78	ldp	x11, x12, [x1, #16]
79
80	# Right shift
81	extr	x9, x10, x9, #1
82	extr	x10, x11, x10, #1
83	extr	x11, x12, x11, #1
84	lsr	x12, x12, #1
85
86	# Store results
87	stp	x9, x10, [x0]
88	stp	x11, x12, [x0, #16]
89AARCH64_AUTIASP
90	ret
91.size	ECP_Sm2Div2, .-ECP_Sm2Div2
92
93### Right shift: in >> 2 ###
94
95# void ECP_Sm2Div4(BN_UINT *r, BN_UINT *a);
96# 2-bit right shift
97.globl	ECP_Sm2Div4
98.type	ECP_Sm2Div4, %function
99.align	4
100ECP_Sm2Div4:
101AARCH64_PACIASP
102	# Load inputs
103	ldp	x7, x8, [x1]
104	ldp	x9, x10, [x1, #16]
105
106	# Right shift
107	extr	x7, x8, x7, #2
108	extr	x8, x9, x8, #2
109	extr	x9, x10, x9, #2
110	lsr	x10, x10, #2
111
112	# Store results
113	stp	x7, x8, [x0]
114	stp	x9, x10, [x0, #16]
115AARCH64_AUTIASP
116	ret
117.size	ECP_Sm2Div4, .-ECP_Sm2Div4
118
119### Sub: r = a-b ###
120.globl	ECP_Sm2BnSub
121.type	ECP_Sm2BnSub, %function
122.align	4
123ECP_Sm2BnSub:
124AARCH64_PACIASP
125	# Load inputs
126	ldp	x7, x8, [x1]
127	ldp	x11, x12, [x2]
128	ldp	x9, x10, [x1, #16]
129	ldp	x13, x14, [x2, #16]
130
131	# Sub
132	subs	x7,x7,x11
133	sbcs	x8,x8,x12
134	sbcs	x9,x9,x13
135	sbc	x10,x10,x14
136
137	# Store results
138	stp	x7, x8, [x0]
139	stp	x9, x10, [x0, #16]
140AARCH64_AUTIASP
141	ret
142.size	ECP_Sm2BnSub, .-ECP_Sm2BnSub
143
144### Add: r = a+b ###
145.globl	ECP_Sm2BnAdd
146.type	ECP_Sm2BnAdd, %function
147.align	4
148ECP_Sm2BnAdd:
149AARCH64_PACIASP
150	# Load inputs
151	ldp	x7, x8, [x1]
152	ldp	x11, x12, [x2]
153	ldp	x9, x10, [x1, #16]
154	ldp	x13, x14, [x2, #16]
155
156	# Add
157	adds	x7,x7,x11
158	adcs	x8,x8,x12
159	adcs	x9,x9,x13
160	adc	x10,x10,x14
161
162	# Store results
163	stp	x7, x8, [x0]
164	stp	x9, x10, [x0, #16]
165AARCH64_AUTIASP
166	ret
167.size	ECP_Sm2BnAdd, .-ECP_Sm2BnAdd
168
169### Modular div by 2: res = in/2 mod p ###
170# void ECP_Sm2Div2ModP(BN_UINT *r, BN_UINT *a)
171.globl	ECP_Sm2Div2ModP
172.type	ECP_Sm2Div2ModP, %function
173.align	4
174ECP_Sm2Div2ModP:
175AARCH64_PACIASP
176	# Load inputs
177	ldp	x3, x4, [x1]
178	ldp	x5, x6, [x1, #16]
179
180	# Save last bit
181	mov	x11, x3
182
183	# Right shift 1
184	extr	x3, x4, x3, #1
185	extr	x4, x5, x4, #1
186	extr	x5, x6, x5, #1
187	lsr	x6, x6, #1
188
189	# Load polynomial
190    adrp    x1, .Lpoly_div_2
191    add 	x1,x1,:lo12:.Lpoly_div_2
192
193	ldp	x7, x8, [x1]
194	ldp	x9, x10, [x1, #16]
195
196	# Parity check
197	tst	x11, #1
198	csel	x7,xzr,x7,eq
199	csel	x8,xzr,x8,eq
200	csel	x9,xzr,x9,eq
201	csel	x10,xzr,x10,eq
202
203	# Add
204	adds	x3,x3,x7
205	adcs	x4,x4,x8
206	adcs	x5,x5,x9
207	adc	x6,x6,x10
208
209	# Store results
210	stp	x3, x4, [x0]
211	stp	x5, x6, [x0, #16]
212AARCH64_AUTIASP
213	ret
214.size	ECP_Sm2Div2ModP, .-ECP_Sm2Div2ModP
215
216### Modular div by 2: res = in/2 mod n, where n = ord(p) ###
217# void ECP_Sm2Div2ModOrd(BN_UINT *r, BN_UINT *a)
218.globl	ECP_Sm2Div2ModOrd
219.type	ECP_Sm2Div2ModOrd, %function
220.align	4
221ECP_Sm2Div2ModOrd:
222AARCH64_PACIASP
223	# Load inputs
224	ldp	x3, x4, [x1]
225	ldp	x5, x6, [x1, #16]
226
227	# Save last bit
228	mov	x11, x3
229
230	# Right shift 1
231	extr	x3, x4, x3, #1
232	extr	x4, x5, x4, #1
233	extr	x5, x6, x5, #1
234	lsr	x6, x6, #1
235
236	# Load polynomial
237    adrp    x1, .Lord_div_2
238    add 	x1,x1,:lo12:.Lord_div_2
239	ldp	x7, x8, [x1]
240	ldp	x9, x10, [x1, #16]
241
242	# Parity check
243	tst	x11, #1
244	csel	x7,xzr,x7,eq
245	csel	x8,xzr,x8,eq
246	csel	x9,xzr,x9,eq
247	csel	x10,xzr,x10,eq
248
249	# Add
250	adds	x3,x3,x7
251	adcs	x4,x4,x8
252	adcs	x5,x5,x9
253	adc	x6,x6,x10
254
255	# Store results
256	stp	x3, x4, [x0]
257	stp	x5, x6, [x0, #16]
258AARCH64_AUTIASP
259	ret
260.size	ECP_Sm2Div2ModOrd, .-ECP_Sm2Div2ModOrd
261
262### Modular div by 4: res = in/4 mod p ###
263# void ECP_Sm2Div4ModP(BN_UINT *r, BN_UINT *a)
264.globl	ECP_Sm2Div4ModP
265.type	ECP_Sm2Div4ModP, %function
266.align	4
267
268ECP_Sm2Div4ModP:
269AARCH64_PACIASP
270	# Load inputs
271	ldp	x3, x4, [x1]
272	ldp	x5, x6, [x1, #16]
273
274	# Save last 2 bits
275	and	x11, x3, 0x3
276
277	# Right shift 2
278	extr	x3, x4, x3, #2
279	extr	x4, x5, x4, #2
280	extr	x5, x6, x5, #2
281	lsr	x6, x6, #2
282
283	# Load polynomial
284    adrp    x12, .Lzero
285    add     x12,x12,:lo12:.Lzero
286    adrp    x13, .Lpoly_1div4
287    add     x13,x13,:lo12:.Lpoly_1div4
288    adrp    x14, .Lpoly_2div4
289    add     x14,x14,:lo12:.Lpoly_2div4
290    adrp    x15, .Lpoly_3div4
291    add     x15,x15,:lo12:.Lpoly_3div4
292	cmp	x11, #1
293	csel	x1,x12,x13,cc
294	cmp	x11, #2
295	csel	x1,x1,x14,cc
296	cmp	x11, #3
297	csel	x1,x1,x15,cc
298
299	ldp	x7, x8, [x1]
300	ldp	x9, x10, [x1, #16]
301
302	# Add
303	adds	x3,x3,x7
304	adcs	x4,x4,x8
305	adcs	x5,x5,x9
306	adc	x6,x6,x10
307
308	# Store results
309	stp	x3, x4, [x0]
310	stp	x5, x6, [x0, #16]
311AARCH64_AUTIASP
312	ret
313.size	ECP_Sm2Div4ModP, .-ECP_Sm2Div4ModP
314
315### Modular div by 4: res = in/4 mod n, where n = ord(p) ###
316# void ECP_Sm2Div4ModOrd(BN_UINT *r, BN_UINT *a)
317.globl	ECP_Sm2Div4ModOrd
318.type	ECP_Sm2Div4ModOrd, %function
319.align	4
320
321ECP_Sm2Div4ModOrd:
322AARCH64_PACIASP
323	# Load inputs
324	ldp	x3, x4, [x1]
325	ldp	x5, x6, [x1, #16]
326
327	# Save last 2 bits
328	and	x11, x3, 0x3
329
330	# Right shift 2
331	extr	x3, x4, x3, #2
332	extr	x4, x5, x4, #2
333	extr	x5, x6, x5, #2
334	lsr	x6, x6, #2
335
336	# Load polynomial
337    adrp    x12, .Lzero
338    add     x12,x12,:lo12:.Lzero
339    adrp    x13, .Lord_1div4
340    add     x13,x13,:lo12:.Lord_1div4
341    adrp    x14, .Lord_2div4
342    add     x14,x14,:lo12:.Lord_2div4
343    adrp    x15, .Lord_3div4
344    add     x15,x15,:lo12:.Lord_3div4
345	cmp	x11, #1
346	csel	x1,x12,x13,cc
347	cmp	x11, #2
348	csel	x1,x1,x14,cc
349	cmp	x11, #3
350	csel	x1,x1,x15,cc
351
352	ldp	x7, x8, [x1]
353	ldp	x9, x10, [x1, #16]
354
355	# Add
356	adds	x3,x3,x7
357	adcs	x4,x4,x8
358	adcs	x5,x5,x9
359	adc	x6,x6,x10
360
361	# Store results
362	stp	x3, x4, [x0]
363	stp	x5, x6, [x0, #16]
364AARCH64_AUTIASP
365	ret
366.size	ECP_Sm2Div4ModOrd, .-ECP_Sm2Div4ModOrd
367
368#define	bn_mod_add(mod)			\
369	/* Load inputs */			\
370	ldp		x3,x4,[x1];			\
371	ldp		x5,x6,[x1,#0x10];	\
372	/* Addition */				\
373	ldp		x7,x8,[x2];			\
374	ldp		x9,x10,[x2,#0x10];	\
375	adds	x3,x3,x7;			\
376	adcs	x4,x4,x8;			\
377	adcs	x5,x5,x9;			\
378	adcs	x6,x6,x10;			\
379	adc 	x15,xzr,xzr;		\
380	mov		x11,x3;				\
381	mov		x12,x4;				\
382	mov		x13,x5;				\
383	mov		x14,x6;				\
384	/* Sub polynomial */		\
385	adrp    x2, mod;            \
386    add	    x2, x2, :lo12:mod;  \
387	ldp		x7,x8,[x2];			\
388	ldp		x9,x10,[x2,#0x10];	\
389	subs	x11,x11,x7;			\
390	sbcs	x12,x12,x8;			\
391	sbcs	x13,x13,x9;			\
392	sbcs	x14,x14,x10;		\
393	sbcs	x15,x15,xzr;		\
394	csel	x3,x3,x11,cc;		\
395	csel	x4,x4,x12,cc;		\
396	csel	x5,x5,x13,cc;		\
397	csel	x6,x6,x14,cc;		\
398	/* Store results */			\
399	stp		x3,x4,[x0];			\
400	stp		x5,x6,[x0,#0x10];	\
401
402#define	bn_mod_sub(mod)			\
403	/* Load inputs */			\
404	ldp		x3,x4,[x1];			\
405	ldp		x5,x6,[x1,#0x10];	\
406	/* Addition */				\
407	ldp		x7,x8,[x2];			\
408	ldp		x9,x10,[x2,#0x10];	\
409	subs	x3,x3,x7;			\
410	sbcs	x4,x4,x8;			\
411	sbcs	x5,x5,x9;			\
412	sbcs	x6,x6,x10;			\
413	sbc 	x15,xzr,xzr;		\
414	mov		x11,x3;				\
415	mov		x12,x4;				\
416	mov		x13,x5;				\
417	mov		x14,x6;				\
418	/* Add polynomial */		\
419	adrp    x2, mod;            \
420    add	    x2, x2, :lo12:mod;  \
421	ldp		x7,x8,[x2];			\
422	ldp		x9,x10,[x2,#0x10];	\
423	adds	x11,x11,x7;			\
424	adcs	x12,x12,x8;			\
425	adcs	x13,x13,x9;			\
426	adcs	x14,x14,x10;		\
427	tst		x15,x15;			\
428	csel	x3,x3,x11,eq;		\
429	csel	x4,x4,x12,eq;		\
430	csel	x5,x5,x13,eq;		\
431	csel	x6,x6,x14,eq;		\
432	/* Store results */			\
433	stp		x3,x4,[x0];			\
434	stp		x5,x6,[x0,#0x10];	\
435
436### Modular add: r = a+b mod p ###
437.globl	ECP_Sm2AddModP
438.type	ECP_Sm2AddModP, @function
439.align	4
440
441ECP_Sm2AddModP:
442
443AARCH64_PACIASP
444	bn_mod_add(.Lpoly);
445AARCH64_AUTIASP
446	ret
447.size	ECP_Sm2AddModP, .-ECP_Sm2AddModP
448
449### Modular sub: r = p - r ###
450.globl	ECP_Sm2Neg
451.type	ECP_Sm2Neg, @function
452.align	4
453
454ECP_Sm2Neg:
455AARCH64_PACIASP
456	ldp	x11, x12, [x1]
457    mov     x7, #0xffffffff00000000
458	ldp	x13, x14, [x1, #16]
459    mov     x8, #0xfffffffeffffffff
460
461	mov		x10, #-1
462	subs	x9,x10,x11
463	sbcs	x7,x7,x12
464	sbcs	x10,x10,x13
465	sbc	x8,x8,x14
466	stp	x9, x7, [x0]
467	stp	x10, x8, [x0, #16]
468	ret
469AARCH64_AUTIASP
470	ret
471.size	ECP_Sm2Neg, .-ECP_Sm2Neg
472
473### Modular sub: r = a-b mod p ###
474.globl	ECP_Sm2SubModP
475.type	ECP_Sm2SubModP, @function
476.align	4
477
478ECP_Sm2SubModP:
479
480AARCH64_PACIASP
481	bn_mod_sub(.Lpoly);
482AARCH64_AUTIASP
483	ret
484.size	ECP_Sm2SubModP, .-ECP_Sm2SubModP
485
486### Modular add: r = a+b mod n/p, where n = ord(p) ###
487.globl	ECP_Sm2AddModOrd
488.type	ECP_Sm2AddModOrd, @function
489.align	4
490ECP_Sm2AddModOrd:
491
492AARCH64_PACIASP
493	bn_mod_add(.Lord);
494AARCH64_AUTIASP
495	ret
496.size ECP_Sm2AddModOrd, .-ECP_Sm2AddModOrd
497
498### Modular sub: r = a-b mod n/p, where n = ord(p) ###
499.globl	ECP_Sm2SubModOrd
500.type	ECP_Sm2SubModOrd, @function
501.align	4
502ECP_Sm2SubModOrd:
503
504AARCH64_PACIASP
505	bn_mod_sub(.Lord);
506AARCH64_AUTIASP
507	ret
508.size ECP_Sm2SubModOrd, .-ECP_Sm2SubModOrd
509
510.macro	RDC
511	# registers map
512	# x3  x4  x5  x6  x15
513	# rsi rax rcx rdx rbx
514
515	# r = a mod sm2
516	# a = a15 | a14 | ... | a0, where ai are 32bit quantities
517	# | a7  | a6  | a5  | a4  | a3  | a2  | a1  | a0  | (+)
518	# | a8  | a11 | a10 | a9  | a8  |   0 | a9  | a8  | (+)
519	# | a9  | a14 | a13 | a12 | a11 |   0 | a10 | a9  | (+)
520	# | a10 | a15 | a14 | a13 | a12 |   0 | a11 | a10 | (+)
521	# | a11 |   0 | a15 | a14 | a13 |   0 | a12 | a11 | (+)
522	# | a12 |   0 | a15 | a14 | a13 |   0 | a13 | a12 | (+)
523	# | a12 |   0 |   0 | a15 | a14 |   0 | a14 | a13 | (+)
524	# | a13 |   0 |   0 |   0 | a15 |   0 | a14 | a13 | (+)
525	# | a13 |   0 |   0 |   0 |   0 |   0 | a15 | a14 | (+)
526	# | a14 |   0 |   0 |   0 |   0 |   0 | a15 | a14 | (+)
527	# | a14 |   0 |   0 |   0 |   0 |   0 |   0 | a15 | (+)
528	# | a15 |   0 |   0 |   0 |   0 |   0 |   0 | a15 | (+)
529	# | a15 |   0 |   0 |   0 |   0 |   0 |   0 |   0 | (+)
530	# | a15 |   0 |   0 |   0 |   0 |   0 |   0 |   0 | (+)
531	# |   0 |   0 |   0 |   0 |   0 | a8  |   0 |   0 | (-)
532	# |   0 |   0 |   0 |   0 |   0 | a9  |   0 |   0 | (-)
533	# |   0 |   0 |   0 |   0 |   0 | a13 |   0 |   0 | (-)
534	# |   0 |   0 |   0 |   0 |   0 | a14 |   0 |   0 | (-)
535	# | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
536	# |    V[3]   |    V[2]   |   V[1]    |    V[0]   |
537	# until r < sm2
538	# s7 (a15|a14), s6 (a13|a12), s5 (a11|a10), s4 (a9|a8)
539	# s3 (a7|a6), s2 (a5|a4), s1 (a3|a2), s0 (a1|a0)
540
541	# 1. 64-bit addition
542	eor x3, x3, x3			// to store all carry
543	eor x4, x4, x4
544	mov x5, s6				// rcx <- s6
545	mov x6, s4				// rdx <- s4
546	# a13 | a12
547	adds x5, x5, s7			// rcx <- s6 + s7
548	adcs x4, xzr, xzr		// rax <- carry(s6+s7)
549	adds x5, x5, s7			// rcx <- s6 + 2*s7
550	adcs x4, x4, xzr
551	# a9 | a8
552	mov x15, x4				// rbx <- carry (rax)
553	adds x6, x6, x5			// rdx <- s4 + s6 + 2*s7
554	adcs x15, x15, xzr
555	adds x6, x6, s5			// rdx <- s4 + s5 + s6 + 2*s7
556	adcs x15, x15, xzr
557	# sum
558	adds s0, s0, x6			// s0 <- s0 + s4 + s5 + s6 + 2*s7
559	adcs s1, s1, x15		// s1 <- s1 + rbx + carry
560	adcs s2, s2, x5			// s2 <- s2 + s6 + 2*s7 + carry
561	adcs s3, s3, s7
562	adcs x3, xzr, xzr
563	# add carry
564	adds s3, s3, x4
565	adcs x3, x3, xzr		// all carry
566
567	stp s0, s1, [sp, #32]
568	stp s2, s3, [sp, #48]
569	# 2. 4 -> 8  64-bit to 32-bit spread
570	mov x4, #0xffffffff
571	mov s0, s4
572	mov s1, s5
573	mov s2, s6
574	mov s3, s7
575	and s0, s0, x4		// a8
576	and s1, s1, x4		// a10
577	and s2, s2, x4		// a12
578	and s3, s3, x4		// a14
579	lsr s4, s4, #32		// a9
580	lsr s5, s5, #32		// a11
581	lsr s6, s6, #32		// a13
582	lsr s7, s7, #32		// a15
583	# 3. 32-bit addition
584	mov x4, s3
585	add x4, x4, s2		// rax <- a12 + a14
586	mov x15, s3
587	add	x15, x15, s1	// rbx <- a10 + a14
588	mov x5, s7
589	add x5, x5, s6		// rcx <- a13 + a15
590	mov x6, s0
591	add x6, x6, s4		// rdx <- a8 + a9
592	add s7, s7, s5		// s7 <-  a11 + a15
593	mov s2, x5			// s2 <- a13 + a15
594	add s2, s2, x4		// s2 <- a12 + a13 + a14 + a15
595	add s1, s1, s2		// s1 <- a10 + a12 + a13 + a14 + a15
596	add s1, s1, s2		// s1 <- a10 + 2*(a12 + a13 + a14 + a15)
597	add s1, s1, x6		// s1 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
598	add s1, s1, s5		// s1 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
599	add s2, s2, s6		// s2 <- a12 + 2*a13 + a14 + a15
600	add s2, s2, s5		// s2 <- a11 + a12 + 2*a13 + a14 + a15
601	add s2, s2, s0		// s2 <- a8 + a11 + a12 + 2*a13 + a14 + a15
602	add x6, x6, s3		// rdx <- a8 + a9 + a14
603	add x6, x6, s6		// rdx <- a8 + a9 + a13 + a14
604	add s4, s4, x5		// s4 <- a9 + a13 + a15
605	add s5, s5, s4		// s5 <- a9 + a11 + a13 + a15
606	add s5, s5, x5		// s5 <- a9 + a11 + 2*(a13 + a15)
607	add x4, x4, x15		// rax <- a10 + a12 + 2*a14
608
609	# U[0]	s5		a9 + a11 + 2*(a13 + a15)
610	# U[1]	%rax	a10 + a12 + 2*a14
611	# U[2]
612	# U[3]	s2		a8 + a11 + a12 + 2*a13 + a14 + a15
613	# U[4]	s4		a9 + a13 + a15
614	# U[5]	%rbx	a10 + a14
615	# U[6]	s7		a11 + a15
616	# U[7]	s1		a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
617	# sub	%rdx	a8 + a9 + a13 + a14
618
619	# s0 s3 s6  %rcx
620
621	# 4. 8 -> 4  32-bit to 64-bit
622	# sub %rdx
623	mov s0, x4
624	lsl s0, s0, #32
625	extr x4, s2, x4, #32
626	extr s2, x15, s2, #32
627	extr x15, s1, x15, #32
628	lsr s1, s1, #32
629
630	# 5. 64-bit addition
631	adds s5, s5, s0
632	adcs x4, x4, xzr
633	adcs s4, s4, s2
634	adcs s7, s7, x15
635	adcs x3, x3, s1
636
637	# V[0] s5
638	# V[1] %rax
639	# V[2] s4
640	# V[3] s7
641	# carry %rsi
642	# sub %rdx
643
644	# 5. ADD & SUB
645	ldp s0, s1, [sp, #32]
646	ldp s2, s3, [sp, #48]
647
648	# ADD
649	adds s0, s0, s5
650	adcs s1, s1, x4
651	adcs s2, s2, s4
652	adcs s3, s3, s7
653	adcs x3, x3, xzr
654	# SUB
655	subs s1, s1, x6
656	sbcs s2, s2, xzr
657	sbcs s3, s3, xzr
658	sbcs x3, x3, xzr
659
660	# 6. MOD
661	# First Mod
662	mov x4, x3
663	lsl x4, x4, #32
664	mov x5, x4
665	subs x4, x4, x3
666
667	adds s0, s0, x3
668	adcs s1, s1, x4
669	adcs s2, s2, xzr
670	adcs s3, s3, x5
671
672	# Last Mod
673	# return y - p if y > p else y
674	mov s4, s0
675	mov s5, s1
676	mov s6, s2
677	mov s7, s3
678
679	adrp x3, .Lpoly
680    add	x3, x3, :lo12:.Lpoly
681	ldp x4, x15, [x3]
682	ldp x16, x17, [x3, #16]
683
684	eor x5, x5, x5
685	adcs x5, xzr, xzr
686
687	subs s0, s0, x4
688	sbcs s1, s1, x15
689	sbcs s2, s2, x16
690	sbcs s3, s3, x17
691	sbcs x5, x5, xzr
692
693	csel s0, s0, s4, cs
694	csel s1, s1, s5, cs
695	csel s2, s2, s6, cs
696	csel s3, s3, s7, cs
697
698	stp s0, s1, [x0]
699	stp s2, s3, [x0, #16]
700.endm
701
702### Modular mul: r = a*b mod p ###
703# void ECP_Sm2Mul(uint64_t *r, const uint64_t *a, const uint64_t *b)
704# 256-bit modular multiplication in SM2
705# r		%rdi
706# a		%rsi
707# b		%rdx
708# registers map
709# s0  s1  s2  s3  s4  s5  s6  s7
710# x7  x8  x9  x10 x11 x12 x13 x14 x3  x4  x5  x6  x15
711# r8  r9  r10 r11 r12 r13 r14 r15 rax rdx rbx rcx rsi
712.globl	ECP_Sm2Mul
713.type	ECP_Sm2Mul, @function
714.align	4
715ECP_Sm2Mul:
716AARCH64_PACIASP
717	# Store scalar registers
718	stp     x29, x30, [sp, #-80]!
719    add     x29, sp, #0
720    stp     x16, x17, [sp, #16]
721	stp     x18, x19, [sp, #64]
722
723	# Load inputs
724	ldp s0, s1, [x1]
725	ldp s2, s3, [x1, #16]
726	ldp s4, s5, [x2]
727	ldp s6, s7, [x2, #16]
728
729### multiplication ###
730
731	# ========================
732	#             s7 s6 s5 s4
733	# *           s3 s2 s1 s0
734	# ------------------------
735	# +           s0 s0 s0 s0
736	#              *  *  *  *
737	#             s7 s6 s5 s4
738	#          s1 s1 s1 s1
739	#           *  *  *  *
740	#          s7 s6 s5 s4
741	#       s2 s2 s2 s2
742	#        *  *  *  *
743	#       s7 s6 s5 s4
744	#    s3 s3 s3 s3
745	#     *  *  *  *
746	#    s7 s6 s5 s4
747	# ------------------------
748	# s7 s6 s5 s4 s3 s2 s1 s0
749	# ========================
750
751### s0*s4 ###
752	mul x16, s0, s4
753	umulh x5, s0, s4
754	eor x6, x6, x6
755
756### s1*s4 + s0*s5 ###
757	mul x3, s1, s4
758	umulh x4, s1, s4
759	adds x5, x5, x3
760	adcs x6, x6, x4
761	eor x15, x15, x15
762
763	mul x3, s0, s5
764	umulh x4, s0, s5
765	adds x5, x5, x3
766	adcs x6, x6, x4
767	adcs x15, x15, xzr
768	mov x17, x5
769	eor x5, x5, x5
770
771### s2 * s4 + s1 * s5 + s0 *s6 ###
772	mul x3, s2, s4
773	umulh x4, s2, s4
774	adds x6, x6, x3
775	adcs x15, x15, x4
776
777	mul x3, s1, s5
778	umulh x4, s1, s5
779	adds x6, x6, x3
780	adcs x15, x15, x4
781	adcs x5, x5, xzr
782
783	mul x3, s0, s6
784	umulh x4, s0, s6
785	adds x6, x6, x3
786	adcs x15, x15, x4
787	adcs x5, x5, xzr
788	mov x18, x6
789	eor x6, x6, x6
790
791### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
792	mul x3, s3, s4
793	umulh x4, s3, s4
794	adds x15, x15, x3
795	adcs x5, x5, x4
796	adcs x6, x6, xzr
797
798	mul x3, s2, s5
799	umulh x4, s2, s5
800	adds x15, x15, x3
801	adcs x5, x5, x4
802	adcs x6, x6, xzr
803
804	mul x3, s1, s6
805	umulh x4, s1, s6
806	adds x15, x15, x3
807	adcs x5, x5, x4
808	adcs x6, x6, xzr
809
810	mul x3, s0 ,s7
811	umulh x4, s0, s7
812	adds x15, x15, x3
813	adcs x5, x5, x4
814	adcs x6, x6, xzr
815	mov x19, x15
816	eor x15, x15, x15
817
818### s3*s5 + s2*s6 + s1*s7 ###
819	mul x3, s3, s5
820	umulh x4, s3, s5
821	adds x5, x5, x3
822	adcs x6, x6, x4
823	# carry
824	adcs x15, x15, xzr
825
826	mul x3, s2, s6
827	umulh x4, s2, s6
828	adds x5, x5, x3
829	adcs x6, x6, x4
830	adcs x15, x15, xzr
831
832	mul x3, s1, s7
833	umulh x4, s1, s7
834	adds x5, x5, x3
835	adcs x6, x6, x4
836	adcs x15, x15, xzr
837	mov s4, x5
838	eor x5, x5, x5
839
840### s3*s6 + s2*s7 ###
841	mul x3, s3, s6
842	umulh x4, s3, s6
843	adds x6, x6, x3
844	adcs x15, x15, x4
845	adcs x5, x5, xzr
846
847	mul x3, s2, s7
848	umulh x4, s2, s7
849	adds x6, x6, x3
850	adcs x15, x15, x4
851	adcs x5, x5, xzr
852	mov s5, x6
853
854### s3*s7 ###
855	mul x3, s3, s7
856	umulh x4, s3, s7
857	adds x15, x15, x3
858	adcs x5, x5, x4
859	mov s6, x15
860	mov s7, x5
861
862	mov s0, x16
863	mov s1, x17
864	mov s2, x18
865	mov s3, x19
866
867	# result of mul: s7 s6 s5 s4 s3 s2 s1 s0
868
869### Reduction ###
870	RDC
871
872	# Restore scalar registers
873	ldp x16, x17, [sp, #16]
874	ldp x18, x19, [sp, #64]
875	ldp x29, x30, [sp], #80
876AARCH64_AUTIASP
877	ret
878.size ECP_Sm2Mul, .-ECP_Sm2Mul
879
880### Modular sqr: r = a^2 mod p ###
881# void ECP_Sm2Sqr(uint64_t *r, const uint64_t *a)
882# 256-bit modular multiplication in SM2 ###
883# r 	%rdi
884# a 	%rsi
885# registers map
886# s0  s1  s2  s3  s4  s5  s6  s7
887# x7  x8  x9  x10 x11 x12 x13 x14 x3  x4  x5  x6  x15 x16 x17
888# r8  r9  r10 r11 r12 r13 r14 r15 rax rdx rbx rcx rsi rbp rdi
889.globl	ECP_Sm2Sqr
890.type	ECP_Sm2Sqr, @function
891.align	4
892ECP_Sm2Sqr:
893AARCH64_PACIASP
894	# Store scalar registers
895	stp     x29, x30, [sp, #-64]!
896    add     x29, sp, #0
897    stp     x16, x17, [sp, #16]
898
899	# Load inputs
900	ldp s4, s5, [x1]
901	ldp s6, s7, [x1, #16]
902
903### square ###
904
905	# ========================
906	#             s7 s6 s5 s4
907	# *           s7 s6 s5 s4
908	# ------------------------
909	# +           s4 s4 s4 s4
910	#              *  *  *  *
911	#             s7 s6 s5 s4
912	#          s5 s5 s5 s5
913	#           *  *  *  *
914	#          s7 s6 s5 s4
915	#       s6 s6 s6 s6
916	#        *  *  *  *
917	#       s7 s6 s5 s4
918	#    s7 s7 s7 s7
919	#     *  *  *  *
920	#    s7 s6 s5 s4
921	# ------------------------
922	# s7 s6 s5 s4 s3 s2 s1 s0
923	# ========================
924
925### s1 <- s4*s5, s2 <- carry ###
926	mul s1, s4, s5
927	umulh s2, s4, s5
928	eor s3, s3, s3
929
930### s2 <- s4*s6 + carry(s2), s3 <- carry ###
931	mul x3, s6, s4
932	umulh s3, s6, s4
933	adds s2, s2, x3
934	adcs s3, s3, xzr
935	eor s0, s0, s0
936
937### s3 <- s4*s7 + s5*s6 + carry(s3), s0 <- carry ###
938	mul x3, s7, s4
939	umulh x4, s7, s4
940	adds s3, s3, x3
941	adcs s0, s0, x4
942	eor x5, x5, x5
943
944	mul x3, s6, s5
945	umulh x4, s6, s5
946	adds s3, s3, x3
947	adcs s0, s0, x4
948	adcs x5, xzr, xzr
949
950### s0 <- s5*s7 + carry(s0), rbx <- carry ###
951	mul x3, s7, s5
952	umulh x4, s7, s5
953	adds s0, s0, x3
954	adcs x5, x5, x4
955	eor x6, x6, x6
956
957### rbx <- s6*s7 + carry(rbx), rcx <- carry ###
958	mul x3, s7, s6
959	umulh x4, s7, s6
960	adds x5, x5, x3
961	adcs x6, x6, x4
962	eor x15, x15, x15
963
964### 2*s0|1|2|3 ###
965	adds s1, s1, s1
966	adcs s2, s2, s2
967	adcs s3, s3, s3
968	adcs s0, s0, s0
969	adcs x5, x5, x5
970	# update carry
971	adcs x6, x6, x6
972	adcs x15, xzr, xzr
973
974### rbp <- s4*s4, carry <- rdi ###
975	mul x16, s4, s4
976	umulh x17, s4, s4
977
978### s4 <- s5*s5, carry <- s5 ###
979	mul s4, s5, s5
980	umulh s5, s5, s5
981
982### s6*s6 ###
983	mul x3, s6, s6
984	umulh x4, s6, s6
985
986	# s1 += carry(s4*s4)
987	adds s1, s1, x17
988	# s2 += s5*s5
989	adcs s2, s2, s4
990	# s3 += carry(s5*s5)
991	adcs s3, s3, s5
992	# s4(s0) += s6*s6
993	adcs s0, s0, x3
994	# s5(rbx) += carry(s6*s6)
995	adcs x5, x5, x4
996	adcs x6, x6, xzr
997	adcs x15, x15, xzr
998
999### s7*s7 ###
1000	mul x3, s7, s7
1001	umulh x4, s7, s7
1002	# s6(rcx) += s7*s7
1003	adds x6, x6, x3
1004	# s7(rsi) += carry(s7*s7)
1005	adcs x15, x15, x4
1006
1007	mov s4, s0
1008	mov s0, x16
1009	mov s5, x5
1010	mov s6, x6
1011	mov s7, x15
1012	# result of mul: s7 s6 s5 s4 s3 s2 s1 s0=
1013### Reduction ###
1014	RDC
1015
1016	# Restore scalar registers
1017	ldp x16, x17, [sp, #16]
1018	ldp x29, x30, [sp], #64
1019AARCH64_AUTIASP
1020	ret
1021.size ECP_Sm2Sqr, .-ECP_Sm2Sqr
1022
1023.globl	ECP_Sm2ToMont
1024.type	ECP_Sm2ToMont,%function
1025.align	4
1026ECP_Sm2ToMont:
1027AARCH64_PACIASP
1028	stp		x29, x30,[sp, #-32]!
1029	add		x29,sp, #0
1030	stp		x19, x20,[sp, #16]
1031
1032    adrp    x3, .LRR		// bp[0]
1033    add 	x3, x3,:lo12:.LRR
1034	ldr	   	x3,[x3]
1035
1036	ldp		x4, x5,[x1]
1037	ldp		x6, x7,[x1, #16]
1038
1039    adrp    x14, .Lpoly+8
1040    add 	x14, x14,:lo12:.Lpoly+8
1041	ldr	   	x14,[x14]
1042
1043    adrp    x15, .Lpoly+24
1044    add 	x15, x15,:lo12:.Lpoly+24
1045	ldr	   	x15,[x15]
1046
1047    adrp    x2, .LRR
1048    add 	x2, x2,:lo12:.LRR
1049
1050	bl		ECP_Sm2MulMont
1051
1052	ldp		x19, x20,[sp, #16]
1053	ldp		x29, x30,[sp], #32
1054AARCH64_AUTIASP
1055	ret
1056.size	ECP_Sm2ToMont,.-ECP_Sm2ToMont
1057
1058.globl	ECP_Sm2FromMont
1059.type	ECP_Sm2FromMont,%function
1060.align	4
1061ECP_Sm2FromMont:
1062AARCH64_PACIASP
1063	stp		x29, x30,[sp, #-32]!
1064	add		x29,sp, #0
1065	stp		x19, x20,[sp, #16]
1066
1067    adrp    x2, .Lone
1068    add 	x2, x2,:lo12:.Lone
1069	ldr	   	x3, [x2]
1070
1071	ldp		x4, x5,[x1]
1072	ldp		x6, x7,[x1, #16]
1073
1074    adrp    x14, .Lpoly+8
1075    add 	x14, x14,:lo12:.Lpoly+8
1076	ldr	   	x14, [x14]
1077
1078    adrp    x15, .Lpoly+24
1079    add 	x15, x15,:lo12:.Lpoly+24
1080	ldr	   	x15, [x15]
1081
1082	bl		ECP_Sm2MulMont
1083
1084	ldp		x19, x20,[sp, #16]
1085	ldp		x29, x30,[sp], #32
1086AARCH64_AUTIASP
1087	ret
1088.size	ECP_Sm2FromMont,.-ECP_Sm2FromMont
1089
1090.type	ECP_Sm2MulMont,%function
1091.align	4
1092ECP_Sm2MulMont:
1093AARCH64_PACIASP
1094
1095	// a[0~3] * b[0]
1096	mul		x8, x4, x3
1097	umulh	x16, x4, x3
1098	mul		x9, x5, x3
1099	umulh	x17, x5, x3
1100	mul		x10, x6, x3
1101	umulh	x19, x6, x3
1102	mul		x11, x7, x3
1103	umulh	x20, x7, x3
1104
1105	adds	x9, x9, x16
1106	adcs	x10, x10, x17
1107	adcs	x11, x11, x19
1108	adc		x12, xzr, x20
1109	ldr		x3,	[x2, #8] // get b[1]
1110
1111	// begin 1st reduce
1112	lsl		x19, x8, #32
1113	lsr		x20, x8, #32
1114
1115	subs	x16, x8, x19
1116	sbcs	x17, xzr, x20
1117	sbcs	x19, xzr, x19
1118	sbc		x20, x8, x20
1119
1120	mov		x13, xzr
1121	adds	x8, x9, x16
1122	adcs	x9, x10, x17
1123	adcs	x10, x11, x19
1124	adcs	x11, x12, x20
1125	adc		x12, x13, xzr
1126
1127	// lo(a[0~3]) * b[1]
1128	mul		x16, x4, x3
1129	mul		x17, x5, x3
1130	mul		x19, x6, x3
1131	mul		x20, x7, x3
1132
1133	adds	x8, x8, x16
1134	adcs	x9, x9, x17
1135	adcs	x10, x10, x19
1136	adcs	x11, x11, x20
1137	adc		x12, x12, xzr
1138
1139	// hi(a[0~3]) * b[1]
1140	umulh	x16, x4, x3
1141	umulh	x17, x5, x3
1142	umulh	x19, x6, x3
1143	umulh	x20, x7, x3
1144
1145	adds	x9, x9, x16
1146	adcs	x10, x10, x17
1147	adcs	x11, x11, x19
1148	adcs	x12, x12, x20
1149	adc		x13, xzr, xzr
1150
1151	ldr		x3,	[x2, #8*2] // get b[2]
1152
1153	// begin 2st reduce
1154	lsl		x19, x8, #32
1155	lsr		x20, x8, #32
1156	subs	x16, x8, x19
1157	sbcs	x17, xzr, x20
1158	sbcs	x19, xzr, x19
1159	sbc		x20, x8, x20
1160
1161	adds	x8, x9, x16
1162	adcs	x9, x10, x17
1163	adcs	x10, x11, x19
1164	adcs	x11, x12, x20
1165	adc		x12, x13, xzr
1166
1167    // lo(a[0~3] * b[2])
1168	mul		x16, x4, x3
1169	mul		x17, x5, x3
1170	mul		x19, x6, x3
1171	mul		x20, x7, x3
1172
1173	adds	x8, x8, x16
1174	adcs	x9, x9, x17
1175	adcs	x10, x10, x19
1176	adcs	x11, x11, x20
1177	adc		x12, x12, xzr
1178
1179    // hi(a[0~3] * b[2])
1180	umulh	x16, x4, x3
1181	umulh	x17, x5, x3
1182	umulh	x19, x6, x3
1183	umulh	x20, x7, x3
1184
1185	adds	x9, x9, x16
1186	adcs	x10, x10, x17
1187	adcs	x11, x11, x19
1188	adcs	x12, x12, x20
1189	adc		x13, xzr, xzr
1190
1191	ldr		x3,[x2, #8*3]       // get b[3]
1192
1193	// begin 3st reduce
1194	lsl		x19, x8, #32
1195	lsr		x20, x8, #32
1196	subs	x16, x8, x19
1197	sbcs	x17, xzr, x20
1198	sbcs	x19, xzr, x19
1199	sbc		x20, x8, x20
1200
1201	adds	x8, x9, x16
1202	adcs	x9, x10, x17
1203	adcs	x10, x11, x19
1204	adcs	x11, x12, x20
1205	adc		x12, x13, xzr
1206
1207    // lo(a[0~3] * b[3])
1208	mul		x16, x4, x3
1209	mul		x17, x5, x3
1210	mul		x19, x6, x3
1211	mul		x20, x7, x3
1212
1213	adds	x8, x8, x16
1214	adcs	x9, x9, x17
1215	adcs	x10, x10, x19
1216	adcs	x11, x11, x20
1217	adc		x12, x12, xzr
1218
1219    // hi(a[0~3] * b[3])
1220	umulh	x16, x4, x3
1221	umulh	x17, x5, x3
1222	umulh	x19, x6, x3
1223	umulh	x20, x7, x3
1224
1225	adds	x9, x9, x16
1226	adcs	x10, x10, x17
1227	adcs	x11, x11, x19
1228	adcs	x12, x12, x20
1229	adc		x13, xzr, xzr
1230
1231	lsl		x19, x8, #32
1232	lsr		x20, x8, #32
1233
1234	// begin 4st reduce
1235	subs	x16, x8, x19
1236	sbcs	x17, xzr, x20
1237	sbcs	x19, xzr, x19
1238	sbc		x20, x8, x20
1239
1240	adds	x8, x9, x16
1241	adcs	x9, x10, x17
1242	adcs	x10, x11, x19
1243	adcs	x11, x12, x20
1244	adc		x12, x13, xzr
1245
1246	// for cal res - p
1247	adds	x16, x8, #1 // - (0xffffffffffffffff) = (+1)
1248	sbcs	x17, x9, x14
1249	adcs	x19, x10, xzr
1250	sbcs	x20, x11, x15
1251	sbcs	xzr, x12, xzr
1252
1253	csel	x8, x8, x16, lo
1254	csel	x9, x9, x17, lo
1255	csel	x10, x10, x19, lo
1256	csel	x11, x11, x20, lo
1257	stp		x8, x9,[x0]
1258	stp		x10, x11,[x0, #8*2]
1259
1260AARCH64_AUTIASP
1261	ret
1262.size	ECP_Sm2MulMont,.-ECP_Sm2MulMont
1263
1264.type	ECP_Sm2SqrMont,%function
1265.align	4
1266ECP_Sm2SqrMont:
1267AARCH64_PACIASP
1268
1269	// a[1~3] * a[0]
1270	mul		x9, x5, x4
1271	umulh	x17, x5, x4
1272	mul		x10, x6, x4
1273	umulh	x19, x6, x4
1274	mul		x11, x7, x4
1275	umulh	x12, x7, x4
1276
1277	adds	x10, x10, x17
1278	adcs	x11, x11, x19
1279	adc		x12, x12, xzr
1280
1281	// a[2~3] * a[1]
1282	mul		x16, x6, x5
1283	umulh	x17, x6, x5
1284	mul		x19, x7, x5
1285	umulh	x20, x7, x5
1286
1287	// a[3] * a[2]
1288	mul		x13, x7, x6
1289	umulh	x1, x7, x6
1290
1291	adds	x17, x17, x19
1292	adc		x19, x20, xzr
1293
1294	adds	x11, x11, x16
1295	adcs	x12, x12, x17
1296	adcs	x13, x13, x19
1297	adc		x1, x1, xzr
1298
1299	// a[0] * a[0]
1300	mul		x8, x4, x4
1301	umulh	x4, x4, x4
1302	// a[1] * a[1]
1303	mul		x17, x5, x5
1304	umulh	x5, x5, x5
1305
1306	adds	x9, x9, x9
1307	adcs	x10, x10, x10
1308
1309	adcs	x11, x11, x11
1310	adcs	x12, x12, x12
1311	adcs	x13, x13, x13
1312	adcs	x1, x1, x1
1313	adc		x2, xzr, xzr
1314
1315	// a[2] * a[2]
1316	mul		x19, x6, x6
1317	umulh	x6, x6, x6
1318	// a[3] * a[3]
1319	mul		x20, x7, x7
1320	umulh	x7, x7, x7
1321
1322	adds	x9, x9, x4
1323	adcs	x10, x10, x17
1324	adcs	x11, x11, x5
1325	adcs	x12, x12, x19
1326	adcs	x13, x13, x6
1327	adcs	x1, x1, x20
1328	adc		x2, x2, x7
1329
1330	// begin 1st reduce
1331	lsl		x19, x8, #32
1332	lsr		x20, x8, #32
1333	subs	x16, x8, x19
1334	sbcs	x17, xzr, x20
1335	sbcs	x19, xzr, x19
1336	sbc		x20, x8, x20
1337
1338	adds	x8, x9, x16
1339	adcs	x9, x10, x17
1340	adcs	x10, x11, x19
1341	adc		x11, xzr, x20
1342
1343	// begin 2st reduce
1344	lsl		x19, x8, #32
1345	lsr		x20, x8, #32
1346	subs	x16, x8, x19
1347	sbcs	x17, xzr, x20
1348	sbcs	x19, xzr, x19
1349	sbc		x20, x8, x20
1350
1351	adds	x8, x9, x16
1352	adcs	x9, x10, x17
1353	adcs	x10, x11, x19
1354	adc		x11, xzr, x20
1355
1356	// begin 3st reduce
1357	lsl		x19, x8, #32
1358	lsr		x20, x8, #32
1359	subs	x16, x8, x19
1360	sbcs	x17, xzr, x20
1361	sbcs	x19, xzr, x19
1362	sbc		x20, x8, x20
1363
1364	adds	x8, x9, x16
1365	adcs	x9, x10, x17
1366	adcs	x10, x11, x19
1367	adc		x11, xzr, x20
1368
1369	// begin 4st reduce
1370	lsl		x19, x8, #32
1371	lsr		x20, x8, #32
1372	subs	x16, x8, x19
1373	sbcs	x17, xzr, x20
1374	sbcs	x19, xzr, x19
1375	sbc		x20, x8, x20
1376
1377	adds	x8, x9, x16
1378	adcs	x9, x10, x17
1379	adcs	x10, x11, x19
1380	adc		x11, xzr, x20
1381
1382	adds	x8, x8, x12
1383	adcs	x9, x9, x13
1384	adcs	x10, x10, x1
1385	adcs	x11, x11, x2
1386	adc		x12, xzr, xzr
1387
1388	// for cal res - p
1389	adds	x16, x8, #1
1390	sbcs	x17, x9, x14
1391	adcs	x19, x10, xzr
1392	sbcs	x20, x11, x15
1393	sbcs	xzr, x12, xzr
1394
1395	csel	x8, x8, x16, lo
1396	csel	x9, x9, x17, lo
1397	csel	x10, x10, x19, lo
1398	csel	x11, x11, x20, lo
1399	stp		x8, x9,[x0]
1400	stp		x10, x11,[x0, #16]
1401AARCH64_AUTIASP
1402	ret
1403.size	ECP_Sm2SqrMont,.-ECP_Sm2SqrMont
1404
1405.type	ECP_Sm2AddCore,%function
1406.align	4
1407ECP_Sm2AddCore:
1408AARCH64_PACIASP
1409	adds	x8, x8, x16
1410	adcs	x9, x9, x17
1411	adcs	x10, x10, x19
1412	adcs	x11, x11, x20
1413	adc		x1, xzr, xzr
1414
1415	// sum - p
1416	adds	x16, x8, #1
1417	sbcs	x17, x9, x14  // x14 = 0xffffffff00000000
1418	adcs	x19, x10, xzr
1419	sbcs	x20, x11, x15 // x15 = 0xfffffffeffffffff
1420	sbcs	xzr, x1, xzr
1421
1422	csel	x8, x8, x16,lo
1423	csel	x9, x9, x17,lo
1424	csel	x10, x10, x19,lo
1425	csel	x11, x11, x20,lo
1426	stp		x8, x9, [x0]
1427	stp		x10, x11, [x0, #16]
1428AARCH64_AUTIASP
1429	ret
1430.size	ECP_Sm2AddCore,.-ECP_Sm2AddCore
1431
1432.type	ECP_Sm2DivBy2Core,%function
1433.align	4
1434ECP_Sm2DivBy2Core:
1435AARCH64_PACIASP
1436	subs	x16, x8, #1
1437	adcs	x17, x9, x14
1438	sbcs	x19, x10, xzr
1439	adcs	x20, x11, x15
1440	adc		x1, xzr, xzr
1441	tst		x8, #1
1442
1443	csel	x8, x8, x16, eq
1444	csel	x9, x9, x17, eq
1445	csel	x10, x10, x19, eq
1446	csel	x11, x11, x20 ,eq
1447	csel	x1, xzr, x1, eq
1448
1449	lsr		x8, x8, #1
1450	orr		x8, x8, x9, lsl#63
1451	lsr		x9, x9, #1
1452	orr		x9, x9, x10, lsl#63
1453	lsr		x10, x10, #1
1454	orr		x10, x10, x11, lsl#63
1455	lsr		x11, x11, #1
1456	orr		x11, x11, x1, lsl#63
1457	stp		x8, x9,[x0]
1458	stp		x10, x11, [x0, #16]
1459AARCH64_AUTIASP
1460	ret
1461.size	ECP_Sm2DivBy2Core,.-ECP_Sm2DivBy2Core
1462
1463.type	ECP_Sm2SubAB,%function
1464.align	4
1465ECP_Sm2SubAB:
1466
1467AARCH64_PACIASP
1468	ldp		x16, x17,[x2]
1469	ldp		x19, x20,[x2, #16]
1470	subs	x8, x8, x16
1471	sbcs	x9, x9, x17
1472	sbcs	x10, x10, x19
1473	sbcs	x11, x11, x20
1474	csetm	x16, cc
1475
1476	adds	x8, x8, x16
1477	and		x17, x16, x14
1478	adcs	x9, x9, x17
1479	adcs	x10, x10, x16
1480	and		x19, x16, x15
1481	adc		x11, x11, x19
1482	stp		x8, x9,[x0]
1483	stp		x10, x11,[x0, #16]
1484AARCH64_AUTIASP
1485	ret
1486.size	ECP_Sm2SubAB,.-ECP_Sm2SubAB
1487
1488.type	ECP_Sm2SubBA,%function
1489.align	4
1490ECP_Sm2SubBA:
1491AARCH64_PACIASP
1492	ldp		x16, x17,[x2]
1493	ldp		x19, x20,[x2, #16]
1494	subs	x8, x16, x8
1495	sbcs	x9, x17, x9
1496	sbcs	x10, x19, x10
1497	sbcs	x11, x20, x11
1498	csetm	x16, cc
1499
1500	adds	x8, x8, x16
1501	and		x17, x16, x14
1502	adcs	x9, x9, x17
1503	adcs	x10, x10, x16
1504	and		x19, x16, x15
1505	adc		x11, x11, x19
1506	stp		x8, x9,[x0]
1507	stp		x10, x11,[x0, #16]
1508AARCH64_AUTIASP
1509	ret
1510.size	ECP_Sm2SubBA,.-ECP_Sm2SubBA
1511
1512# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1513# Deal process:
1514#     delta = Z12
1515#     gamma = Y12
1516#     beta = X1*gamma
1517#     alpha = 3*(X1-delta)*(X1+delta)
1518#     X3 = alpha2-8*beta
1519#     Z3 = (Y1+Z1)2-gamma-delta
1520#     Y3 = alpha*(4*beta-X3)-8*gamma2
1521.globl	ECP_Sm2PointDoubleMont
1522.type	ECP_Sm2PointDoubleMont,%function
1523.align	4
1524ECP_Sm2PointDoubleMont:
1525AARCH64_PACIASP
1526	stp		x29, x30,[sp, #-80]!
1527	mov		x29, sp
1528	stp		x19, x20,[sp, #16]
1529	stp		x21, x22,[sp, #32]
1530	sub		sp, sp, #32*4
1531
1532.Lpoint_double:
1533	ldp		x8, x9,[x1, #32]        // a->y
1534	ldp		x10, x11,[x1, #32+16]
1535
1536	mov		x21, x0
1537	mov		x22, x1 // backup point a
1538
1539    adrp    x14, .Lpoly+8
1540    add 	x14, x14,:lo12:.Lpoly+8 // p[1]
1541	ldr	   	x14, [x14]
1542
1543    adrp    x15, .Lpoly+24
1544    add 	x15, x15,:lo12:.Lpoly+24 // p[3]
1545	ldr	   	x15, [x15]
1546
1547	mov		x16, x8
1548	mov		x17, x9
1549	mov		x19, x10
1550	mov		x20, x11
1551	ldp		x4, x5, [x22, #64]        // a->z
1552	ldp		x6, x7, [x22, #64+16]
1553	mov		x0, sp
1554	bl		ECP_Sm2AddCore        // s = 2 * a->y
1555
1556	add		x0, sp, #64
1557	bl		ECP_Sm2SqrMont        // zsqr = (a->z)^2
1558
1559	ldp		x16, x17, [x22]
1560	ldp		x19, x20, [x22, #16]
1561	mov		x4, x8
1562	mov		x5, x9
1563	mov		x6, x10
1564	mov		x7, x11
1565	add		x0, sp, #32
1566	bl		ECP_Sm2AddCore        // m = a->x + zsqr
1567
1568	add		x2, x22, #0
1569	mov		x8, x4
1570	mov		x9, x5
1571	ldp		x4, x5,[sp, #0]
1572	mov		x10, x6
1573	mov		x11, x7
1574	ldp		x6, x7,[sp, #16]
1575	add		x0, sp, #64
1576	bl		ECP_Sm2SubBA       // zsqr = a->x - zsqr
1577
1578	add		x0, sp, #0
1579	bl		ECP_Sm2SqrMont        // s = s^2
1580
1581	ldr		x3, [x22, #32]
1582	ldp		x4, x5,[x22, #64]
1583	ldp		x6, x7,[x22, #64+16]
1584	add		x2, x22, #32               // a->y
1585	add		x0, sp, #96
1586	bl		ECP_Sm2MulMont        // res_z = a->z * a->y
1587
1588	mov		x16, x8
1589	mov		x17, x9
1590	ldp		x4, x5, [sp, #0]
1591	mov		x19, x10
1592	mov		x20, x11
1593	ldp		x6, x7, [sp, #16]
1594	add		x0, x21, #64
1595	bl		ECP_Sm2AddCore        // res_z = 2 * res_z
1596
1597	add		x0, sp, #96
1598	bl		ECP_Sm2SqrMont        // res_y = s^2
1599
1600	ldr		x3, [sp, #64]
1601	ldp		x4, x5, [sp, #32]
1602	ldp		x6, x7, [sp, #32+16]
1603	add		x0, x21, #32
1604	bl		ECP_Sm2DivBy2Core     // res_y = res_y / 2
1605
1606	add		x2, sp, #64
1607	add		x0, sp, #32
1608	bl		ECP_Sm2MulMont        // m = m * zsqr
1609
1610	mov		x16, x8
1611	mov		x17, x9
1612	mov		x19, x10
1613	mov		x20, x11
1614	mov		x4, x8
1615	mov		x5, x9
1616	mov		x6, x10
1617	mov		x7, x11
1618	add		x0, sp, #32
1619	bl		ECP_Sm2AddCore
1620	mov		x16, x4
1621	mov		x17, x5
1622	ldr		x3, [x22]
1623	mov		x19, x6
1624	ldp		x4, x5, [sp, #0]
1625	mov		x20, x7
1626	ldp		x6, x7, [sp, #16]
1627	bl		ECP_Sm2AddCore        // m = 3 * m
1628
1629	mov		x2, x22
1630	add		x0, sp, #0
1631	bl		ECP_Sm2MulMont        // s = s * a->x
1632
1633	mov		x16, x8
1634	mov		x17, x9
1635	ldp		x4, x5, [sp, #32]
1636	mov		x19, x10
1637	mov		x20, x11
1638	ldp		x6, x7, [sp, #32+16]
1639	add		x0, sp, #96
1640	bl		ECP_Sm2AddCore        // tmp = 2 * s
1641
1642	mov		x0, x21
1643	bl		ECP_Sm2SqrMont        // res_x = m^2
1644
1645	add		x2, sp, #96
1646	bl		ECP_Sm2SubAB       // res_x = res_x - tmp
1647
1648	add		x2, sp, #0
1649	add		x0, sp, #0
1650	bl		ECP_Sm2SubBA       // s = s - res_x
1651
1652	ldr		x3, [sp, #32]
1653	mov		x4, x8
1654	mov		x5, x9
1655	mov		x6, x10
1656	mov		x7, x11
1657	add		x2, sp, #32
1658	bl		ECP_Sm2MulMont        // s = s * m
1659
1660	add		x2, x21, #32
1661	add		x0, x21, #32
1662	bl		ECP_Sm2SubAB       // res_y = s - res_y
1663
1664	mov		sp, x29
1665	ldp		x19, x20,[x29, #16]
1666	ldp		x21, x22,[x29, #32]
1667	ldp		x29, x30,[sp], #80
1668AARCH64_AUTIASP
1669	ret
1670.size	ECP_Sm2PointDoubleMont,.-ECP_Sm2PointDoubleMont
1671
1672# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo
1673# Deal process:
1674#     U1 = X1*Z22
1675#     U2 = X2*Z12
1676#     S1 = Y1*Z23
1677#     S2 = Y2*Z13
1678#     H = U2-U1
1679#     r = S2-S1
1680#     X3 = r2-H3-2*U1*H2
1681#     Y3 = r*(U1*H2-X3)-S1*H3
1682#     Z3 = Z1*Z2*H
1683.globl	ECP_Sm2PointAddMont
1684.type	ECP_Sm2PointAddMont,%function
1685.align	4
1686ECP_Sm2PointAddMont:
1687AARCH64_PACIASP
1688	stp		x29, x30,[sp, #-80]!
1689	mov		x29, sp
1690	stp		x19, x20,[sp, #16]
1691	stp		x21, x22,[sp, #32]
1692	stp		x23, x24,[sp, #48]
1693	stp		x25, x26,[sp, #64]
1694	sub		sp,sp, #32*12
1695
1696	ldp		x4, x5,[x2, #64]
1697	ldp		x6, x7,[x2, #64+16]
1698	mov		x21, x0
1699	mov		x22, x1 // backup points
1700	mov		x23, x2
1701
1702    adrp    x14, .Lpoly+8
1703    add 	x14, x14,:lo12:.Lpoly+8                // p[1]
1704	ldr	   	x14, [x14]
1705
1706    adrp    x15, .Lpoly+24
1707    add 	x15, x15,:lo12:.Lpoly+24               // p[3]
1708	ldr	   	x15, [x15]
1709
1710
1711	orr		x16, x4, x5
1712	orr		x19, x6, x7
1713	orr		x25, x16, x19
1714	cmp		x25, #0
1715	csetm	x25, ne 				// check the point is(x, y, 0)
1716	add		x0, sp, #128
1717	bl		ECP_Sm2SqrMont     // z1sqr = z1^2
1718
1719	ldp		x4, x5,[x22, #64]
1720	ldp		x6, x7,[x22, #64+16]
1721	orr		x16, x4, x5
1722	orr		x19, x6, x7
1723	orr		x24, x16, x19
1724	cmp		x24, #0
1725	csetm	x24, ne 				// check the point is(x, y, 0)
1726
1727	add		x0, sp, #224
1728	bl		ECP_Sm2SqrMont     // z2sqr = z2^2
1729
1730	ldr		x3, [x23, #64]
1731	ldp		x4, x5,[sp, #128]
1732	ldp		x6, x7,[sp, #128+16]
1733	add		x2, x23, #64
1734	add		x0,sp, #320
1735	bl		ECP_Sm2MulMont        // s2 = z1^3
1736
1737	ldr		x3,[x22, #64]
1738	ldp		x4, x5,[sp, #224]
1739	ldp		x6, x7,[sp, #224+16]
1740	add		x2, x22, #64
1741	add		x0,sp, #352
1742	bl		ECP_Sm2MulMont        // s2 = y2 * z1^3
1743
1744	ldr		x3,[x22, #32]
1745	ldp		x4, x5,[sp, #320]
1746	ldp		x6, x7,[sp, #320+16]
1747	add		x2, x22, #32
1748	add		x0,sp, #320
1749	bl		ECP_Sm2MulMont
1750
1751	ldr		x3,[x23, #32]
1752	ldp		x4, x5,[sp, #352]
1753	ldp		x6, x7,[sp, #352+16]
1754	add		x2, x23, #32
1755	add		x0,sp, #352
1756	bl		ECP_Sm2MulMont
1757
1758	add		x2,sp, #320
1759	ldr		x3,[sp, #128]
1760	ldp		x4, x5,[x22]
1761	ldp		x6, x7,[x22, #16]
1762	add		x0,sp, #96
1763	bl		ECP_Sm2SubAB
1764
1765	orr		x8, x8, x9
1766	orr		x10, x10, x11
1767	orr		x26, x8, x10
1768
1769	add		x2,sp, #128
1770	add		x0,sp, #256
1771	bl		ECP_Sm2MulMont
1772
1773	ldr		x3,[sp, #224]
1774	ldp		x4, x5,[x23]
1775	ldp		x6, x7,[x23, #16]
1776	add		x2,sp, #224
1777	add		x0,sp, #288
1778	bl		ECP_Sm2MulMont
1779
1780	add		x2,sp, #256
1781	ldp		x4, x5,[sp, #96]
1782	ldp		x6, x7,[sp, #96+16]
1783	add		x0,sp, #192
1784	bl		ECP_Sm2SubAB
1785
1786	orr		x8, x8, x9
1787	orr		x10, x10, x11
1788	orr		x8, x8, x10
1789	tst		x8, x8
1790	b.ne	.Ladd_proceed
1791
1792	tst		x24, x25
1793	b.eq	.Ladd_proceed
1794
1795	tst		x26, x26
1796	b.eq	.Ladd_double
1797
1798	stp		xzr, xzr,[x21]
1799	stp		xzr, xzr,[x21, #16]
1800	stp		xzr, xzr,[x21, #32]
1801	stp		xzr, xzr,[x21, #48]
1802	stp		xzr, xzr,[x21, #64]
1803	stp		xzr, xzr,[x21, #80]
1804	b	.Ladd_done
1805
1806.align	4
1807.Ladd_double:
1808	mov		x1, x22
1809	mov		x0, x21
1810	ldp		x23, x24,[x29, #48]
1811	ldp		x25, x26,[x29, #64]
1812	add		sp,sp, #32*(12-4)
1813	b		.Lpoint_double
1814
1815.align	4
1816.Ladd_proceed:
1817	add		x0,sp, #128
1818	bl		ECP_Sm2SqrMont
1819
1820	ldr		x3,[x22, #64]
1821	ldp		x4, x5,[sp, #192]
1822	ldp		x6, x7,[sp, #192+16]
1823	add		x2, x22, #64
1824	add		x0,sp, #64
1825	bl		ECP_Sm2MulMont
1826
1827	ldp		x4, x5,[sp, #192]
1828	ldp		x6, x7,[sp, #192+16]
1829	add		x0,sp, #224
1830	bl		ECP_Sm2SqrMont
1831
1832	ldr		x3,[x23, #64]
1833	ldp		x4, x5,[sp, #64]
1834	ldp		x6, x7,[sp, #64+16]
1835	add		x2, x23, #64
1836	add		x0,sp, #64
1837	bl		ECP_Sm2MulMont
1838
1839	ldr		x3,[sp, #192]
1840	ldp		x4, x5,[sp, #224]
1841	ldp		x6, x7,[sp, #224+16]
1842	add		x2,sp, #192
1843	add		x0,sp, #160
1844	bl		ECP_Sm2MulMont
1845
1846	ldr		x3,[sp, #224]
1847	ldp		x4, x5,[sp, #256]
1848	ldp		x6, x7,[sp, #256+16]
1849	add		x2,sp, #224
1850	add		x0,sp, #288
1851	bl		ECP_Sm2MulMont
1852
1853	mov		x16, x8
1854	mov		x17, x9
1855	mov		x19, x10
1856	mov		x20, x11
1857	add		x0,sp, #224
1858	bl		ECP_Sm2AddCore
1859
1860	add		x2,sp, #128
1861	add		x0,sp, #0
1862	bl		ECP_Sm2SubBA
1863
1864	add		x2,sp, #160
1865	bl		ECP_Sm2SubAB
1866
1867	add		x2,sp, #288
1868	ldr		x3,[sp, #160]
1869	ldp		x4, x5,[sp, #320]
1870	ldp		x6, x7,[sp, #320+16]
1871	add		x0,sp, #32
1872	bl		ECP_Sm2SubBA
1873
1874	add		x2,sp, #160
1875	add		x0,sp, #352
1876	bl		ECP_Sm2MulMont
1877
1878	ldr		x3,[sp, #96]
1879	ldp		x4, x5,[sp, #32]
1880	ldp		x6, x7,[sp, #32+16]
1881	add		x2,sp, #96
1882	add		x0,sp, #32
1883	bl		ECP_Sm2MulMont
1884
1885	add		x2,sp, #352
1886	bl		ECP_Sm2SubAB
1887
1888	ldp		x4, x5,[sp, #0]
1889	ldp		x6, x7,[sp, #16]
1890	ldp		x16, x17,[x23]
1891	ldp		x19, x20,[x23, #16]
1892	ldp		x8, x9,[x22, #0]
1893
1894	cmp		x24, #0
1895	csel	x16, x4, x16,ne
1896	csel	x17, x5, x17,ne
1897	csel	x19, x6, x19,ne
1898	csel	x20, x7, x20,ne
1899
1900	cmp		x25, #0
1901	csel	x8, x16, x8,ne
1902	csel	x9, x17, x9,ne
1903	csel	x10, x19, x10,ne
1904	csel	x11, x20, x11,ne
1905
1906	stp		x8, x9,[x21, #0]
1907	stp		x10, x11,[x21, #16]
1908
1909	ldp		x10, x11,[x22, #16]
1910	ldp		x4, x5,[sp, #32]
1911	ldp		x6, x7,[sp, #48]
1912	ldp		x16, x17,[x23, #32]
1913	ldp		x19, x20,[x23, #48]
1914	ldp		x8, x9,[x22, #32]
1915
1916	cmp		x24, #0
1917	csel	x16, x4, x16,ne
1918	csel	x17, x5, x17,ne
1919	csel	x19, x6, x19,ne
1920	csel	x20, x7, x20,ne
1921
1922	cmp		x25, #0
1923	csel	x8, x16, x8,ne
1924	csel	x9, x17, x9,ne
1925	csel	x10, x19, x10,ne
1926	csel	x11, x20, x11,ne
1927
1928	stp		x8, x9,[x21, #32]
1929	stp		x10, x11,[x21, #32+16]
1930
1931	ldp		x10, x11,[x22, #32+16]
1932	ldp		x8, x9,  [x22, #64]
1933
1934	ldp		x16, x17,[x23, #32+32]
1935	ldp		x19, x20,[x23, #32+48]
1936	ldp		x4, x5,[sp, #32+32]
1937	ldp		x6, x7,[sp, #32+48]
1938
1939	cmp		x24, #0
1940	ldp		x10, x11,[x22, #64+16]
1941	csel	x16, x4, x16,ne
1942	csel	x17, x5, x17,ne
1943	csel	x19, x6, x19,ne
1944	csel	x20, x7, x20,ne
1945
1946	cmp		x25, #0
1947	csel	x8, x16, x8, ne
1948	csel	x9, x17, x9, ne
1949	csel	x10, x19, x10, ne
1950	csel	x11, x20, x11, ne
1951
1952	stp		x8, x9,[x21, #64]
1953	stp		x10, x11,[x21, #64+16]
1954
1955.Ladd_done:
1956	mov		sp, x29
1957	ldp		x19, x20,[x29, #16]
1958	ldp		x21, x22,[x29, #32]
1959	ldp		x23, x24,[x29, #48]
1960	ldp		x25, x26,[x29, #64]
1961	ldp		x29, x30,[sp], #80
1962AARCH64_AUTIASP
1963	ret
1964.size	ECP_Sm2PointAddMont,.-ECP_Sm2PointAddMont
1965
1966# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-madd-2007-bl
1967# Deal process:
1968#     Z1Z1 = Z12
1969#     U2 = X2*Z1Z1
1970#     S2 = Y2*Z1*Z1Z1
1971#     H = U2-X1
1972#     HH = H2
1973#     I = 4*HH
1974#     J = H*I
1975#     r = 2*(S2-Y1)
1976#     V = X1*I
1977#     X3 = r2-J-2*V
1978#     Y3 = r*(V-X3)-2*Y1*J
1979#     Z3 = (Z1+H)2-Z1Z1-HH
1980.globl	ECP_Sm2PointAddAffineMont
1981.type	ECP_Sm2PointAddAffineMont,%function
1982.align	4
1983ECP_Sm2PointAddAffineMont:
1984AARCH64_PACIASP
1985	stp		x29, x30,[sp, #-80]!
1986	mov		x29, sp
1987	stp		x19, x20,[sp, #16]
1988	stp		x21, x22,[sp, #32]
1989	stp		x23, x24,[sp, #48]
1990	stp		x25, x26,[sp, #64]
1991	sub		sp, sp, #32*10
1992
1993	mov		x21, x0	 // backup r
1994	mov		x22, x1	 // point a
1995	mov		x23, x2	 // point b
1996
1997    adrp    x14, .Lpoly+8
1998    add 	x14, x14,:lo12:.Lpoly+8
1999	ldr	   	x14,[x14]
2000
2001    adrp    x15, .Lpoly+24
2002    add 	x15, x15,:lo12:.Lpoly+24
2003	ldr	   	x15,[x15]
2004
2005	ldp		x4, x5,[x1, #64]	// &(a->z[0]), a->z[0] is marked as z1[0]
2006	ldp		x6, x7,[x1, #64+16] // &(a->z[2])
2007
2008	orr		x16, x4, x5
2009	orr		x19, x6, x7
2010	orr		x24, x16, x19
2011	cmp		x24, #0
2012	csetm	x24, ne   // check is (x, y 0)
2013
2014	ldp		x8, x9, [x2]         // &(b->x[0])
2015	ldp		x10, x11, [x2, #16]	 // &(b->x[2])
2016	ldp		x16, x17, [x2, #32]	 // &(b->y[0])
2017	ldp		x19, x20, [x2, #48]	 // &(b->y[2])
2018
2019	orr		x8, x8, x9
2020	orr		x10, x10, x11
2021	orr		x16, x16, x17
2022	orr		x19, x19, x20
2023	orr		x8, x8, x10
2024	orr		x16, x16, x19
2025	orr		x25, x8, x16
2026	cmp		x25, #0
2027	csetm	x25, ne   // check is (x, y 0)
2028
2029	add		x0, sp, #128
2030	bl		ECP_Sm2SqrMont     // zsqr = z1^2
2031
2032	mov		x4, x8
2033	mov		x5, x9
2034	mov		x6, x10
2035	mov		x7, x11
2036
2037	ldr		x3, [x23]
2038	mov		x2, x23
2039	add		x0, sp, #96
2040	bl		ECP_Sm2MulMont        // u2 = z1^2 * x2
2041
2042	mov		x2, x22
2043	ldr		x3, [x22, #64]
2044	ldp		x4, x5, [sp, #128]
2045	ldp		x6, x7, [sp, #128+16]
2046	add		x0,sp, #160
2047	bl		ECP_Sm2SubAB
2048
2049	add		x2, x22, #64
2050	add		x0,sp, #128
2051	bl		ECP_Sm2MulMont
2052
2053	ldr		x3,[x22, #64]
2054	ldp		x4, x5,[sp, #160]
2055	ldp		x6, x7,[sp, #160+16]
2056	add		x2, x22, #64
2057	add		x0,sp, #64
2058	bl		ECP_Sm2MulMont
2059
2060	ldr		x3,[x23, #32]
2061	ldp		x4, x5,[sp, #128]
2062	ldp		x6, x7,[sp, #128+16]
2063	add		x2, x23, #32
2064	add		x0,sp, #128
2065	bl		ECP_Sm2MulMont
2066
2067	add		x2, x22, #32
2068	ldp		x4, x5,[sp, #160]
2069	ldp		x6, x7,[sp, #160+16]
2070	add		x0,sp, #192
2071	bl		ECP_Sm2SubAB
2072
2073	add		x0,sp, #224
2074	bl		ECP_Sm2SqrMont
2075
2076	ldp		x4, x5,[sp, #192]
2077	ldp		x6, x7,[sp, #192+16]
2078	add		x0,sp, #288
2079	bl		ECP_Sm2SqrMont
2080
2081	ldr		x3,[sp, #160]
2082	ldp		x4, x5,[sp, #224]
2083	ldp		x6, x7,[sp, #224+16]
2084	add		x2,sp, #160
2085	add		x0,sp, #256
2086	bl		ECP_Sm2MulMont
2087
2088	ldr		x3,[x22]
2089	ldp		x4, x5,[sp, #224]
2090	ldp		x6, x7,[sp, #224+16]
2091	mov		x2, x22
2092	add		x0,sp, #96
2093	bl		ECP_Sm2MulMont
2094
2095	mov		x16, x8
2096	mov		x17, x9
2097	mov		x19, x10
2098	mov		x20, x11
2099	add		x0,sp, #224
2100	bl		ECP_Sm2AddCore
2101
2102	add		x2,sp, #288
2103	add		x0,sp, #0
2104	bl		ECP_Sm2SubBA
2105
2106	add		x2,sp, #256
2107	bl		ECP_Sm2SubAB
2108
2109	add		x2,sp, #96
2110	ldr		x3,[x22, #32]
2111	ldp		x4, x5,[sp, #256]
2112	ldp		x6, x7,[sp, #256+16]
2113	add		x0,sp, #32
2114	bl		ECP_Sm2SubBA
2115
2116	add		x2, x22, #32
2117	add		x0,sp, #128
2118	bl		ECP_Sm2MulMont
2119
2120	ldr		x3,[sp, #192]
2121	ldp		x4, x5,[sp, #32]
2122	ldp		x6, x7,[sp, #32+16]
2123	add		x2,sp, #192
2124	add		x0,sp, #32
2125	bl		ECP_Sm2MulMont
2126
2127	add		x2,sp, #128
2128	bl		ECP_Sm2SubAB
2129
2130	ldp		x4, x5,[sp, #0]
2131	ldp		x6, x7,[sp, #16]
2132	ldp		x16, x17,[x23]
2133	ldp		x19, x20,[x23, #16]
2134
2135	ldp		x8, x9,[x22, #0]
2136	cmp		x24, #0
2137	ldp		x10, x11,[x22, #16]
2138
2139	csel	x16, x4, x16, ne
2140	csel	x17, x5, x17, ne
2141	csel	x19, x6, x19, ne
2142	csel	x20, x7, x20, ne
2143
2144	cmp		x25, #0
2145	csel	x8, x16, x8,ne
2146	csel	x9, x17, x9,ne
2147	csel	x10, x19, x10,ne
2148	csel	x11, x20, x11,ne
2149
2150	ldp		x4, x5,[sp, #32]
2151	ldp		x6, x7,[sp, #48]
2152	ldp		x16, x17,[x23, #32]
2153	ldp		x19, x20,[x23, #48]
2154	stp		x8, x9,[x21, #0]
2155	stp		x10, x11,[x21, #16]
2156
2157
2158	ldp		x8, x9,[x22, #32]
2159	cmp		x24, #0
2160	ldp		x10, x11,[x22, #32+16]
2161	csel	x16, x4, x16,ne
2162	csel	x17, x5, x17,ne
2163	csel	x19, x6, x19,ne
2164	csel	x20, x7, x20,ne
2165
2166	cmp		x25, #0
2167	csel	x8, x16, x8,ne
2168	csel	x9, x17, x9,ne
2169	csel	x10, x19, x10,ne
2170	csel	x11, x20, x11,ne
2171	stp		x8, x9,[x21, #32]
2172	stp		x10, x11,[x21, #32+16]
2173
2174	ldp		x4, x5,[sp, #32+32]
2175	ldp		x6, x7,[sp, #32+48]
2176
2177    adrp    x23, .Lone_mont
2178    add 	x23,x23,:lo12:.Lone_mont
2179	ldp		x16, x17,[x23]
2180	ldp		x19, x20,[x23, #16]
2181
2182	ldp		x8, x9,[x22, #64]
2183	ldp		x10, x11,[x22, #64+16]
2184
2185	cmp		x24, #0
2186	csel	x16, x4, x16, ne
2187	csel	x17, x5, x17, ne
2188	csel	x19, x6, x19, ne
2189	csel	x20, x7, x20, ne
2190
2191	cmp		x25, #0
2192	csel	x8, x16, x8, ne
2193	csel	x9, x17, x9, ne
2194	csel	x10, x19, x10, ne
2195	csel	x11, x20, x11, ne
2196
2197	stp		x8, x9, [x21, #64]
2198	stp		x10, x11, [x21, #64+16]
2199
2200	mov		sp, x29
2201	ldp		x19, x20,[x29, #16]
2202	ldp		x21, x22,[x29, #32]
2203	ldp		x23, x24,[x29, #48]
2204	ldp		x25, x26,[x29, #64]
2205	ldp		x29, x30,[sp], #80
2206AARCH64_AUTIASP
2207	ret
2208.size	ECP_Sm2PointAddAffineMont,.-ECP_Sm2PointAddAffineMont
2209#endif
2210