1/*
2 * This file is part of the openHiTLS project.
3 *
4 * openHiTLS is licensed under the Mulan PSL v2.
5 * You can use this software according to the terms and conditions of the Mulan PSL v2.
6 * You may obtain a copy of Mulan PSL v2 at:
7 *
8 *     http://license.coscl.org.cn/MulanPSL2
9 *
10 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13 * See the Mulan PSL v2 for more details.
14 */
15
16#include "hitls_build.h"
17#ifdef HITLS_CRYPTO_CURVE_SM2
18
19.file	"ecp_sm2_x86_64.S"
20.text
21
22.set	s0,%r8
23.set	s1,%r9
24.set	s2,%r10
25.set	s3,%r11
26.set	s4,%r12
27.set	s5,%r13
28.set	s6,%r14
29.set	s7,%r15
30
31.macro	REGISTER_SAVE
32	pushq	%r12
33	pushq	%r13
34	pushq	%r14
35	pushq	%r15
36	pushq	%rbx
37	pushq	%rbp
38.endm
39
40.macro	REGISTER_POP
41	popq	%rbp
42	popq	%rbx
43	popq	%r15
44	popq	%r14
45	popq	%r13
46	popq	%r12
47.endm
48
49# The SM2 field prime p
50.align	64
51.Lpoly:
52.quad	0xffffffffffffffff, 0xffffffff00000000, 0xffffffffffffffff, 0xfffffffeffffffff
53# The group order n of the SM2 curve
54.Lord:
55.quad	0x53bbf40939d54123, 0x7203df6b21c6052b, 0xffffffffffffffff, 0xfffffffeffffffff
56
57.Lpoly_div_2:
58.quad	0x8000000000000000, 0xffffffff80000000, 0xffffffffffffffff, 0x7fffffff7fffffff
59.Lord_div_2:
60.quad	0xa9ddfa049ceaa092, 0xb901efb590e30295, 0xffffffffffffffff, 0x7fffffff7fffffff
61
62.Lzero:
63.quad	0, 0, 0, 0
64.Lord_1div4:
65.quad	0xd4eefd024e755049, 0xdc80f7dac871814a, 0xffffffffffffffff, 0x3fffffffbfffffff
66.Lord_2div4:
67.quad	0xa9ddfa049ceaa092, 0xb901efb590e30295, 0xffffffffffffffff, 0x7fffffff7fffffff
68.Lord_3div4:
69.quad	0x7eccf706eb5ff0db, 0x9582e790595483e0, 0xffffffffffffffff, 0xbfffffff3fffffff
70
71.Lpoly_1div4:
72.quad	0x4000000000000000, 0xffffffffc0000000, 0xffffffffffffffff, 0x3fffffffbfffffff
73.Lpoly_2div4:
74.quad	0x8000000000000000, 0xffffffff80000000, 0xffffffffffffffff, 0x7fffffff7fffffff
75.Lpoly_3div4:
76.quad	0xc000000000000000, 0xffffffff40000000, 0xffffffffffffffff, 0xbfffffff3fffffff
77
78.LRR:// RR = 2^512 mod p, precomputed for the SM2 prime (Montgomery conversion constant)
79.quad	0x0000000200000003, 0x00000002ffffffff, 0x0000000100000001, 0x0000000400000002
80.Lone_mont:
81.quad	0x0000000000000001, 0x00000000ffffffff, 0x0000000000000000, 0x0000000100000000
82.Lone:
83.quad	1,0,0,0
84.LOne:
85.long	1,1,1,1,1,1,1,1
86
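	# void ECP_Sm2Div2(uint64_t *a)          (prototype inferred from register usage)
	# Halve the 256-bit value at a in place (a >>= 1); the shifted-out low bit is discarded.
	# a		%rdi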
87.globl	ECP_Sm2Div2
88.type	ECP_Sm2Div2,@function
89.align	64
90
91ECP_Sm2Div2:
92
93	movq	(%rdi),%r8
94	movq	8(%rdi),%r9
95	movq	16(%rdi),%r10
96	movq	24(%rdi),%r11
97
98	shrdq	$1,%r9,%r8
99	shrdq	$1,%r10,%r9
100	shrdq	$1,%r11,%r10
101	shrq	$1,%r11
102
103	movq	%r8,(%rdi)
104	movq	%r9,8(%rdi)
105	movq	%r10,16(%rdi)
106	movq	%r11,24(%rdi)
107
108	ret
109.size	ECP_Sm2Div2, .-ECP_Sm2Div2
110
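	# void ECP_Sm2Div4(uint64_t *a)          (prototype inferred from register usage)
	# Divide the 256-bit value at a by 4 in place (a >>= 2); the shifted-out bits are discarded.
	# a		%rdi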
111.globl	ECP_Sm2Div4
112.type	ECP_Sm2Div4,@function
113.align	64
114
115ECP_Sm2Div4:
116
117	movq	(%rdi),%r8
118	movq	8(%rdi),%r9
119	movq	16(%rdi),%r10
120	movq	24(%rdi),%r11
121
122	shrdq	$2,%r9,%r8
123	shrdq	$2,%r10,%r9
124	shrdq	$2,%r11,%r10
125	shrq	$2,%r11
126	movq	%r8,(%rdi)
127	movq	%r9,8(%rdi)
128	movq	%r10,16(%rdi)
129	movq	%r11,24(%rdi)
130
131	ret
132.size	ECP_Sm2Div4, .-ECP_Sm2Div4
133
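	# void ECP_Sm2Neg(uint64_t *r, const uint64_t *a)   (prototype inferred from register usage)
	# r = p - a, where p is the SM2 field prime; no further reduction is performed.
	# r		%rdi
	# a		%rsi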
134.globl	ECP_Sm2Neg
135.type	ECP_Sm2Neg,@function
136.align	64
137
138ECP_Sm2Neg:
139	movq	(%rdi),%r8
140	xorq	%rax,%rax
141
142	movq	$-1,%r8
143	movq	$0xffffffff00000000,%r9
144	movq	$0xfffffffeffffffff,%r11
145	movq	$-1,%r10
146
147	subq	0(%rsi),%r8
148	sbbq	8(%rsi),%r9
149	sbbq	16(%rsi),%r10
150	sbbq	24(%rsi),%r11
151
152	movq	%r8,(%rdi)
153	movq	%r9,8(%rdi)
154	movq	%r10,16(%rdi)
155	movq	%r11,24(%rdi)
156
157	ret
158.size	ECP_Sm2Neg, .-ECP_Sm2Neg
159
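	# void ECP_Sm2BnSub(uint64_t *r, const uint64_t *a, const uint64_t *b)   (prototype inferred)
	# Plain 256-bit subtraction r = a - b; the final borrow is discarded (no modular reduction).
	# r		%rdi
	# a		%rsi
	# b		%rdx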
160.globl	ECP_Sm2BnSub
161.type	ECP_Sm2BnSub,@function
162.align	64
163
164ECP_Sm2BnSub:
165
166	movq	(%rsi),%r8
167	movq	8(%rsi),%r9
168	movq	16(%rsi),%r10
169	movq	24(%rsi),%r11
170
171	subq	(%rdx),%r8
172	sbbq	8(%rdx),%r9
173	sbbq	16(%rdx),%r10
174	sbbq	24(%rdx),%r11
175
176	movq	%r8,(%rdi)
177	movq	%r9,8(%rdi)
178	movq	%r10,16(%rdi)
179	movq	%r11,24(%rdi)
180	ret
181.size	ECP_Sm2BnSub, .-ECP_Sm2BnSub
182
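	# void ECP_Sm2BnAdd(uint64_t *r, const uint64_t *a, const uint64_t *b)   (prototype inferred)
	# Plain 256-bit addition r = a + b; the final carry is discarded (no modular reduction).
	# r		%rdi
	# a		%rsi
	# b		%rdx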
183.globl	ECP_Sm2BnAdd
184.type	ECP_Sm2BnAdd,@function
185.align	64
186
187ECP_Sm2BnAdd:
188
189	movq	(%rsi),%r8
190	movq	8(%rsi),%r9
191	movq	16(%rsi),%r10
192	movq	24(%rsi),%r11
193
194	addq	(%rdx),%r8
195	adcq	8(%rdx),%r9
196	adcq	16(%rdx),%r10
197	adcq	24(%rdx),%r11
198
199	movq	%r8,(%rdi)
200	movq	%r9,8(%rdi)
201	movq	%r10,16(%rdi)
202	movq	%r11,24(%rdi)
203	ret
204.size	ECP_Sm2BnAdd, .-ECP_Sm2BnAdd
205
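	# void ECP_Sm2Div2ModP(uint64_t *r, const uint64_t *a)   (prototype inferred from register usage)
	# r = a / 2 mod p: shift right by one and, if a was odd, add (p+1)/2 (.Lpoly_div_2).
	# r		%rdi
	# a		%rsi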
206.globl	ECP_Sm2Div2ModP
207.type	ECP_Sm2Div2ModP,@function
208.align	64
209
210ECP_Sm2Div2ModP:
211
212	subq	$24,%rsp
213	movq	%rbx,(%rsp)
214	movq	%r12,8(%rsp)
215	movq	%r13,16(%rsp)
216	xorq	%r12,%r12
217
218	movq	(%rsi),%r8
219	movq	8(%rsi),%r9
220	movq	16(%rsi),%r10
221	movq	24(%rsi),%r11
222
223	movq	%r8,%r13
224	andq	$1,%r13
225	shrdq	$1,%r9,%r8
226	shrdq	$1,%r10,%r9
227	shrdq	$1,%r11,%r10
228	shrdq	$1,%r12,%r11
229
230	leaq	.Lzero(%rip),%rax
231	leaq	.Lpoly_div_2(%rip),%rbx
232	cmpq	$1,%r13
233	cmoveq	%rbx,%rax
234
235	addq	(%rax),%r8
236	adcq	8(%rax),%r9
237	adcq	16(%rax),%r10
238	adcq	24(%rax),%r11
239
240	movq	%r8,(%rdi)
241	movq	%r9,8(%rdi)
242	movq	%r10,16(%rdi)
243	movq	%r11,24(%rdi)
244
245	movq	(%rsp),%rbx
246	movq	8(%rsp),%r12
247	movq	16(%rsp),%r13
248	addq	$24,%rsp
249	ret
250.size	ECP_Sm2Div2ModP, .-ECP_Sm2Div2ModP
251
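	# void ECP_Sm2Div2ModOrd(uint64_t *r, const uint64_t *a)   (prototype inferred from register usage)
	# r = a / 2 mod n (the group order): shift right by one and, if a was odd, add (n+1)/2 (.Lord_div_2).
	# r		%rdi
	# a		%rsi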
252.globl	ECP_Sm2Div2ModOrd
253.type	ECP_Sm2Div2ModOrd,@function
254.align	64
255
256ECP_Sm2Div2ModOrd:
257
258	subq	$24,%rsp
259	movq	%rbx,(%rsp)
260	movq	%r12,8(%rsp)
261	movq	%r13,16(%rsp)
262	xorq	%r12,%r12
263
264	movq	(%rsi),%r8
265	movq	8(%rsi),%r9
266	movq	16(%rsi),%r10
267	movq	24(%rsi),%r11
268
269	movq	%r8,%r13
270	andq	$1,%r13
271	shrdq	$1,%r9,%r8
272	shrdq	$1,%r10,%r9
273	shrdq	$1,%r11,%r10
274	shrdq	$1,%r12,%r11
275
276	leaq	.Lzero(%rip),%rax
277	leaq	.Lord_div_2(%rip),%rbx
278	cmpq	$1,%r13
279	cmoveq	%rbx,%rax
280
281	addq	(%rax),%r8
282	adcq	8(%rax),%r9
283	adcq	16(%rax),%r10
284	adcq	24(%rax),%r11
285
286	movq	%r8,(%rdi)
287	movq	%r9,8(%rdi)
288	movq	%r10,16(%rdi)
289	movq	%r11,24(%rdi)
290
291	movq	(%rsp),%rbx
292	movq	8(%rsp),%r12
293	movq	16(%rsp),%r13
294	addq	$24,%rsp
295	ret
296.size	ECP_Sm2Div2ModOrd, .-ECP_Sm2Div2ModOrd
297
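	# void ECP_Sm2Div4ModP(uint64_t *r, const uint64_t *a)   (prototype inferred from register usage)
	# r = a / 4 mod p: shift right by two, then add k*(p+1)/4 where k = a mod 4
	# (correction constant selected from .Lpoly_1div4/.Lpoly_2div4/.Lpoly_3div4).
	# r		%rdi
	# a		%rsi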
298.globl	ECP_Sm2Div4ModP
299.type	ECP_Sm2Div4ModP,@function
300.align	64
301
302ECP_Sm2Div4ModP:
303
304	subq	$24,%rsp
305	movq	%rbx,(%rsp)
306	movq	%r12,8(%rsp)
307	movq	%r13,16(%rsp)
308	xorq	%r12,%r12
309
310	movq	(%rsi),%r8
311	movq	8(%rsi),%r9
312	movq	16(%rsi),%r10
313	movq	24(%rsi),%r11
314
315	movq	%r8,%r13
316	andq	$3,%r13
317	shrdq	$2,%r9,%r8
318	shrdq	$2,%r10,%r9
319	shrdq	$2,%r11,%r10
320	shrdq	$2,%r12,%r11
321
322	leaq	.Lzero(%rip),%rax
323	leaq	.Lpoly_1div4(%rip),%rbx
324	leaq	.Lpoly_2div4(%rip),%rcx
325	leaq	.Lpoly_3div4(%rip),%rdx
326
327	cmpq	$1,%r13
328	cmoveq	%rbx,%rax
329	cmpq	$2,%r13
330	cmoveq	%rcx,%rax
331	cmpq	$3,%r13
332	cmoveq	%rdx,%rax
333
334
335	addq	(%rax),%r8
336	adcq	8(%rax),%r9
337	adcq	16(%rax),%r10
338	adcq	24(%rax),%r11
339
340
341	movq	%r8,(%rdi)
342	movq	%r9,8(%rdi)
343	movq	%r10,16(%rdi)
344	movq	%r11,24(%rdi)
345
346	movq	(%rsp),%rbx
347	movq	8(%rsp),%r12
348	movq	16(%rsp),%r13
349	addq	$24,%rsp
350	ret
351.size	ECP_Sm2Div4ModP, .-ECP_Sm2Div4ModP
352
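	# void ECP_Sm2Div4ModOrd(uint64_t *r, const uint64_t *a)   (prototype inferred from register usage)
	# r = a / 4 mod n: shift right by two, then add k*(n+1)/4 where k = a mod 4
	# (correction constant selected from .Lord_1div4/.Lord_2div4/.Lord_3div4).
	# r		%rdi
	# a		%rsi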
353.globl	ECP_Sm2Div4ModOrd
354.type	ECP_Sm2Div4ModOrd,@function
355.align	64
356
357ECP_Sm2Div4ModOrd:
358
359	subq	$24,%rsp
360	movq	%rbx,(%rsp)
361	movq	%r12,8(%rsp)
362	movq	%r13,16(%rsp)
363	xorq	%r12,%r12
364
365
366	movq	(%rsi),%r8
367	movq	8(%rsi),%r9
368	movq	16(%rsi),%r10
369	movq	24(%rsi),%r11
370
371	movq	%r8,%r13
372	andq	$3,%r13
373	shrdq	$2,%r9,%r8
374	shrdq	$2,%r10,%r9
375	shrdq	$2,%r11,%r10
376	shrdq	$2,%r12,%r11
377
378	leaq	.Lzero(%rip),%rax
379	leaq	.Lord_1div4(%rip),%rbx
380	leaq	.Lord_2div4(%rip),%rcx
381	leaq	.Lord_3div4(%rip),%rdx
382
383	cmpq	$1,%r13
384	cmoveq	%rbx,%rax
385	cmpq	$2,%r13
386	cmoveq	%rcx,%rax
387	cmpq	$3,%r13
388	cmoveq	%rdx,%rax
389
390	addq	(%rax),%r8
391	adcq	8(%rax),%r9
392	adcq	16(%rax),%r10
393	adcq	24(%rax),%r11
394
395	movq	%r8,(%rdi)
396	movq	%r9,8(%rdi)
397	movq	%r10,16(%rdi)
398	movq	%r11,24(%rdi)
399
400	movq	(%rsp),%rbx
401	movq	8(%rsp),%r12
402	movq	16(%rsp),%r13
403	addq	$24,%rsp
404	ret
405.size	ECP_Sm2Div4ModOrd, .-ECP_Sm2Div4ModOrd
406
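	# bn_mod_add(mod): r(%rdi) = a(%rsi) + b(%rdx) mod `mod`, assuming a, b < mod.
	# The sum is computed with an explicit carry, `mod` is subtracted once, and the
	# pre-subtraction value is restored via cmov when the subtraction borrows, so the
	# reduction is branch-free.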
407#define	bn_mod_add(mod)				\
408	/* Store scalar registers */	\
409	subq	$32, %rsp;				\
410	movq	%r12, (%rsp);			\
411	movq	%r13, 8(%rsp);			\
412	movq	%r14, 16(%rsp);			\
413	movq	%r15, 24(%rsp);			\
414	xorq	%rax, %rax;				\
415	/* Load inputs */				\
416	movq	(%rsi), %r8;			\
417	movq	8(%rsi), %r9;			\
418	movq	16(%rsi), %r10;			\
419	movq	24(%rsi), %r11;			\
420	/* Addition */					\
421	addq	(%rdx), %r8;			\
422	adcq	8(%rdx), %r9;			\
423	adcq	16(%rdx), %r10;			\
424	adcq	24(%rdx), %r11;			\
425	/* Store carry */				\
426	adcq	$0, %rax;				\
427	movq	%r8, %r12;				\
428	movq	%r9, %r13;				\
429	movq	%r10, %r14;				\
430	movq	%r11, %r15;				\
431	/* Sub polynomial */			\
432	leaq	mod, %rsi;				\
433	subq	0(%rsi), %r8;			\
434	sbbq	8(%rsi), %r9;			\
435	sbbq	16(%rsi), %r10;			\
436	sbbq	24(%rsi), %r11;			\
437	sbbq	$0, %rax;				\
438	cmovcq	%r12, %r8;				\
439	cmovcq	%r13, %r9;				\
440	cmovcq	%r14, %r10;				\
441	cmovcq	%r15, %r11;				\
442	/* Store results */				\
443	movq	%r8, (%rdi);			\
444	movq	%r9, 8(%rdi);			\
445	movq	%r10, 16(%rdi);			\
446	movq	%r11, 24(%rdi);			\
447	/* Restore scalar registers */	\
448	movq	(%rsp), %r12;			\
449	movq	8(%rsp), %r13;			\
450	movq	16(%rsp), %r14;			\
451	movq	24(%rsp), %r15;			\
452	addq	$32, %rsp;				\
453
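	# bn_mod_sub(mod): r(%rdi) = a(%rsi) - b(%rdx) mod `mod`, assuming a, b < mod.
	# `mod` is added back after the subtraction, and the un-adjusted value is kept
	# via cmovz when no borrow occurred, so the reduction is branch-free.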
454#define	bn_mod_sub(mod)				\
455	/* Store scalar registers */	\
456	subq	$32, %rsp;				\
457	movq	%r12, (%rsp);			\
458	movq	%r13, 8(%rsp);			\
459	movq	%r14, 16(%rsp);			\
460	movq	%r15, 24(%rsp);			\
461	xorq	%rax, %rax;				\
462	/* Load inputs */				\
463	movq	(%rsi), %r8;			\
464	movq	8(%rsi), %r9;			\
465	movq	16(%rsi), %r10;			\
466	movq	24(%rsi), %r11;			\
467	/* Subtraction */				\
468	subq	(%rdx), %r8;			\
469	sbbq	8(%rdx), %r9;			\
470	sbbq	16(%rdx), %r10;			\
471	sbbq	24(%rdx), %r11;			\
472	sbbq	$0, %rax;				\
473	movq	%r8, %r12;				\
474	movq	%r9, %r13;				\
475	movq	%r10, %r14;				\
476	movq	%r11, %r15;				\
477	/* Add polynomial */			\
478	leaq	mod, %rsi;				\
479	addq	0(%rsi), %r8;			\
480	adcq	8(%rsi), %r9;			\
481	adcq	16(%rsi), %r10;			\
482	adcq	24(%rsi), %r11;			\
483	testq	%rax, %rax;				\
484	cmovzq	%r12, %r8;				\
485	cmovzq	%r13, %r9;				\
486	cmovzq	%r14, %r10;				\
487	cmovzq	%r15, %r11;				\
488	/* Store results */				\
489	movq	%r8, (%rdi);			\
490	movq	%r9, 8(%rdi);			\
491	movq	%r10, 16(%rdi);			\
492	movq	%r11, 24(%rdi);			\
493	/* Restore scalar registers */	\
494	movq	(%rsp), %r12;			\
495	movq	8(%rsp), %r13;			\
496	movq	16(%rsp), %r14;			\
497	movq	24(%rsp), %r15;			\
498	addq	$32, %rsp;				\
499
500### Modular add: r = a+b mod m, where m is the field prime p or the group order n ###
501	# void ECP_Sm2AddModP(uint64_t *r, const uint64_t *a, const uint64_t *b)
502	# Modular poly add
503	# r		%rdi
504	# a		%rsi
505	# b		%rdx
506	.globl	ECP_Sm2AddModP
507	.type	ECP_Sm2AddModP, @function
508	.align	64
509
510ECP_Sm2AddModP:
511
512	bn_mod_add(.Lpoly(%rip))
513
514	ret
515	.size ECP_Sm2AddModP, .-ECP_Sm2AddModP
516
517	# void ECP_Sm2AddModOrd(uint64_t *r, const uint64_t *a, const uint64_t *b)
518	# Modular order add
519	# r		%rdi
520	# a		%rsi
521	# b		%rdx
522	.globl	ECP_Sm2AddModOrd
523	.type	ECP_Sm2AddModOrd, @function
524	.align	64
525
526ECP_Sm2AddModOrd:
527
528	bn_mod_add(.Lord(%rip))
529
530	ret
531	.size ECP_Sm2AddModOrd, .-ECP_Sm2AddModOrd
532
533### Modular sub: r = a-b mod m, where m is the field prime p or the group order n ###
534	# void ECP_Sm2SubModP(uint64_t *r, const uint64_t *a, const uint64_t *b)
535	# Modular poly sub
536	# r		%rdi
537	# a		%rsi
538	# b		%rdx
539	.globl	ECP_Sm2SubModP
540	.type	ECP_Sm2SubModP, @function
541	.align	64
542
543ECP_Sm2SubModP:
544
545	bn_mod_sub(.Lpoly(%rip))
546
547	ret
548	.size ECP_Sm2SubModP, .-ECP_Sm2SubModP
549
550	# void ECP_Sm2SubModOrd(uint64_t *r, const uint64_t *a, const uint64_t *b)
551	# Modular order sub
552	# r		%rdi
553	# a		%rsi
554	# b		%rdx
555	.globl	ECP_Sm2SubModOrd
556	.type	ECP_Sm2SubModOrd, @function
557	.align	64
558
559ECP_Sm2SubModOrd:
560
561	bn_mod_sub(.Lord(%rip))
562
563	ret
564	.size ECP_Sm2SubModOrd, .-ECP_Sm2SubModOrd
565
566.macro	RDC
567	# r = a mod p, where p is the 256-bit SM2 field prime
568	# a = a15 | a14 | ... | a0, where the ai are 32-bit words
569	# |  a7 |  a6 |  a5 |  a4 |  a3 |  a2 |  a1 |  a0 | (+)
570	# |  a8 | a11 | a10 |  a9 |  a8 |   0 |  a9 |  a8 | (+)
571	# |  a9 | a14 | a13 | a12 | a11 |   0 | a10 |  a9 | (+)
572	# | a10 | a15 | a14 | a13 | a12 |   0 | a11 | a10 | (+)
573	# | a11 |   0 | a15 | a14 | a13 |   0 | a12 | a11 | (+)
574	# | a12 |   0 | a15 | a14 | a13 |   0 | a13 | a12 | (+)
575	# | a12 |   0 |   0 | a15 | a14 |   0 | a14 | a13 | (+)
576	# | a13 |   0 |   0 |   0 | a15 |   0 | a14 | a13 | (+)
577	# | a13 |   0 |   0 |   0 |   0 |   0 | a15 | a14 | (+)
578	# | a14 |   0 |   0 |   0 |   0 |   0 | a15 | a14 | (+)
579	# | a14 |   0 |   0 |   0 |   0 |   0 |   0 | a15 | (+)
580	# | a15 |   0 |   0 |   0 |   0 |   0 |   0 | a15 | (+)
581	# | a15 |   0 |   0 |   0 |   0 |   0 |   0 |   0 | (+)
582	# | a15 |   0 |   0 |   0 |   0 |   0 |   0 |   0 | (+)
583	# |   0 |   0 |   0 |   0 |   0 | a8  |   0 |   0 | (-)
584	# |   0 |   0 |   0 |   0 |   0 | a9  |   0 |   0 | (-)
585	# |   0 |   0 |   0 |   0 |   0 | a13 |   0 |   0 | (-)
586	# |   0 |   0 |   0 |   0 |   0 | a14 |   0 |   0 | (-)
587	# | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
588	# |    V[3]   |    V[2]   |   V[1]    |    V[0]   |
589	# reduce until r < p
590	# s7 (a15|a14), s6 (a13|a12), s5 (a11|a10), s4 (a9|a8)
591	# s3 (a7|a6), s2 (a5|a4), s1 (a3|a2), s0 (a1|a0)
592
593	# 1. 64-bit addition
594	xorq	%rsi, %rsi		# to store all carry
595	xorq	%rax, %rax
596	movq	s6, %rcx		# rcx <- s6
597	movq	s4, %rdx		# rdx <- s4
598	# a13 | a12
599	addq	s7, %rcx		# rcx <- s6 + s7
600	adcq	$0, %rax		# rax <- carry(s6+s7)
601	addq	s7, %rcx		# rcx <- s6 + 2*s7
602	adcq	$0, %rax
603	# a9 | a8
604	movq	%rax, %rbx		# rbx <- carry (rax)
605	addq	%rcx, %rdx		# rdx <- s4 + s6 + 2*s7
606	adcq	$0, %rbx
607	addq	s5, %rdx		# rdx <- s4 + s5 + s6 + 2*s7
608	adcq	$0, %rbx
609	# sum
610	addq	%rdx, s0		# s0 <- s0 + s4 + s5 + s6 + 2*s7
611	adcq	%rbx, s1		# s1 <- s1 + rbx + carry
612	adcq	%rcx, s2		# s2 <- s2 + s6 + 2*s7 + carry
613	adcq	s7, s3			# s3 <- s3 + s7 + carry
614	adcq	$0, %rsi
615	# add carry
616	addq	%rax, s3
617	adcq	$0, %rsi		# rsi <- carry
618	# store registers
619	movq	s0, (%rsp)
620	movq	s1, 8(%rsp)
621	movq	s2, 16(%rsp)
622	movq	s3, 24(%rsp)
623	# 2. 4 -> 8  64-bit to 32-bit spread
624	movq	$0xffffffff, %rax
625	movq	s4, s0
626	movq	s5, s1
627	movq	s6, s2
628	movq	s7, s3
629	andq	%rax, s0	# a8
630	andq	%rax, s1	# a10
631	andq	%rax, s2	# a12
632	andq	%rax, s3	# a14
633	shrq	$32, s4		# a9
634	shrq	$32, s5		# a11
635	shrq	$32, s6		# a13
636	shrq	$32, s7		# a15
637	# 3. 32-bit addition
638	movq	s3, %rax
639	addq	s2, %rax	# rax <- a12 + a14
640	movq	s3, %rbx
641	addq	s1, %rbx	# rbx <- a10 + a14
642	movq	s7, %rcx
643	addq	s6, %rcx	# rcx <- a13 + a15
644	movq	s0, %rdx
645	addq	s4, %rdx	# rdx <- a8 + a9
646	addq	s5, s7		# s7 <-  a11 + a15
647	movq	%rcx, s2	# s2 <- a13 + a15
648	addq	%rax, s2	# s2 <- a12 + a13 + a14 + a15
649	addq	s2, s1		# s1 <- a10 + a12 + a13 + a14 + a15
650	addq	s2, s1		# s1 <- a10 + 2*(a12 + a13 + a14 + a15)
651	addq	%rdx, s1	# s1 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
652	addq	s5, s1		# s1 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
653	addq	s6, s2		# s2 <- a12 + 2*a13 + a14 + a15
654	addq	s5, s2		# s2 <- a11 + a12 + 2*a13 + a14 + a15
655	addq	s0, s2		# s2 <- a8 + a11 + a12 + 2*a13 + a14 + a15
656	addq	s3, %rdx	# rdx <- a8 + a9 + a14
657	addq	s6, %rdx	# rdx <- a8 + a9 + a13 + a14
658	addq	%rcx, s4	# s4 <- a9 + a13 + a15
659	addq	s4, s5		# s5 <- a9 + a11 + a13 + a15
660	addq	%rcx, s5	# s5 <- a9 + a11 + 2*(a13 + a15)
661	addq	%rbx, %rax	# rax <- a10 + a12 + 2*a14
662
663	# U[0]	s5		a9 + a11 + 2*(a13 + a15)
664	# U[1]	%rax	a10 + a12 + 2*a14
665	# U[2]
666	# U[3]	s2		a8 + a11 + a12 + 2*a13 + a14 + a15
667	# U[4]	s4		a9 + a13 + a15
668	# U[5]	%rbx	a10 + a14
669	# U[6]	s7		a11 + a15
670	# U[7]	s1		a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
671	# sub	%rdx	a8 + a9 + a13 + a14
672
673	# vacant registers: s0 s3 s6  %rcx
674
675	# 4. 8 -> 4  32-bit to 64-bit
676	# sub %rdx
677	movq	%rax, s0
678	shlq	$32, s0			# U[1]'(s0) <- U[1] << 32
679	shrd	$32, s2, %rax	# U[3]'(%rax) <- U[3]U[1] >> 32
680	shrd	$32, %rbx, s2	# U[5]'(s2) <- U[5]U[3] >> 32
681	shrd	$32, s1, %rbx	# U[7]'(%rbx) <- U[7]U[5] >> 32
682	shrq	$32, s1			# U[7](s1) <- U[7] >> 32 (carry)
683
684	# 5. 64-bit addition
685	addq	s0, s5			# U[0] <- U[1]' + U[0]
686	adcq	$0, %rax		# U[3]' <- 0 + U[3]'
687	adcq	s2, s4			# U[4] <- U[5]' + U[4]
688	adcq	%rbx, s7		# U[6] <- U[7]' + U[6]
689	adcq	s1, %rsi		# rsi <- U[7]carry + carry
690
691	# V[0] s5
692	# V[1] %rax
693	# V[2] s4
694	# V[3] s7
695	# carry %rsi
696	# sub %rdx
697
698	# 6. ADD & SUB
699	movq	(%rsp), s0
700	movq	8(%rsp), s1
701	movq	16(%rsp), s2
702	movq	24(%rsp), s3
703	# ADD
704	addq s5, s0
705	adcq %rax, s1
706	adcq s4, s2
707	adcq s7, s3
708	adcq $0, %rsi
709	# SUB
710	subq %rdx, s1
711	sbbq $0, s2
712	sbbq $0, s3
713	sbbq $0, %rsi
714
715	# 7. MOD
716	# First Mod
717	movq %rsi, %rax		# rax <- carry (rsi)			+out[0]
718	shlq $32, %rax		# rax <- carry << 32
719	movq %rax, %rcx		# rcx <- rax					+out[3]
720	subq %rsi, %rax		# rax <- carry << 32 - carry	+out[1]
721
722	addq %rsi, s0
723	adcq %rax, s1
724	adcq $0, s2
725	adcq %rcx, s3
726
727	# Last Mod
728	# return r - p if r > p else r
729	movq	s0, s4
730	movq	s1, s5
731	movq	s2, s6
732	movq	s3, s7
733
734	leaq	.Lpoly(%rip), %rsi
735
736	movq	$0, %rcx
737	adcq	$0, %rcx
738
739	subq	0(%rsi), s0
740	sbbq	8(%rsi), s1
741	sbbq	16(%rsi), s2
742	sbbq	24(%rsi), s3
743	sbbq	$0, %rcx
744
745	cmovcq	s4, s0
746	cmovcq	s5, s1
747	cmovcq	s6, s2
748	cmovcq	s7, s3
749
750	movq	s0, (%rdi)
751	movq	s1, 8(%rdi)
752	movq	s2, 16(%rdi)
753	movq	s3, 24(%rdi)
754.endm
755
756### Modular mul: r = a*b mod p ###
757	# void ECP_Sm2Mul(uint64_t *r, const uint64_t *a, const uint64_t *b)
758	# 256-bit modular multiplication in SM2
759	# r		%rdi
760	# a		%rsi
761	# b		%rdx
762	.globl	ECP_Sm2Mul
763	.type	ECP_Sm2Mul, @function
764	.align	64
765
766ECP_Sm2Mul:
767
768	# Store scalar registers
769	subq	$72, %rsp
770	movq	%rbx, 32(%rsp)
771	movq	%r12, 40(%rsp)
772	movq	%r13, 48(%rsp)
773	movq	%r14, 56(%rsp)
774	movq	%r15, 64(%rsp)
775
776	# Load inputs
777	movq	(%rsi), s0
778	movq	8(%rsi), s1
779	movq	16(%rsi), s2
780	movq	24(%rsi), s3
781	movq	(%rdx), s4
782	movq	8(%rdx), s5
783	movq	16(%rdx), s6
784	movq	24(%rdx), s7
785
786### multiplication ###
787
788	# ========================
789	#             s7 s6 s5 s4
790	# *           s3 s2 s1 s0
791	# ------------------------
792	# +           s0 s0 s0 s0
793	#              *  *  *  *
794	#             s7 s6 s5 s4
795	#          s1 s1 s1 s1
796	#           *  *  *  *
797	#          s7 s6 s5 s4
798	#       s2 s2 s2 s2
799	#        *  *  *  *
800	#       s7 s6 s5 s4
801	#    s3 s3 s3 s3
802	#     *  *  *  *
803	#    s7 s6 s5 s4
804	# ------------------------
805	# s7 s6 s5 s4 s3 s2 s1 s0
806	# ========================
807
808### s0*s4 ###
809	movq	s0, %rax
810	mulq	s4
811	movq	%rax, (%rsp)
812	movq	%rdx, %rbx
813	xorq	%rcx, %rcx
814
815### s1*s4 + s0*s5 ###
816	movq	s1, %rax
817	mulq	s4
818	addq	%rax, %rbx
819	adcq	%rdx, %rcx
820	xorq	%rsi, %rsi
821
822	movq	s0, %rax
823	mulq	s5
824	addq	%rax, %rbx
825	adcq	%rdx, %rcx
826	adcq	$0, %rsi
827	movq	%rbx, 8(%rsp)
828	xorq	%rbx, %rbx
829
830### s2 * s4 + s1 * s5 + s0 *s6 ###
831	movq	s2, %rax
832	mulq	s4
833	addq	%rax, %rcx
834	adcq	%rdx, %rsi
835
836	movq	s1, %rax
837	mulq	s5
838	addq	%rax, %rcx
839	adcq	%rdx, %rsi
840	adcq	$0, %rbx
841
842	movq	s0, %rax
843	mulq	s6
844	addq	%rax, %rcx
845	adcq	%rdx, %rsi
846	adcq	$0, %rbx
847	movq	%rcx, 16(%rsp)
848	xorq	%rcx, %rcx
849
850### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
851	movq	s3, %rax
852	mulq	s4
853	addq	%rax, %rsi
854	adcq	%rdx, %rbx
855	adcq	$0, %rcx
856
857	movq	s2, %rax
858	mulq	s5
859	addq	%rax, %rsi
860	adcq	%rdx, %rbx
861	adcq	$0, %rcx
862
863	movq	s1, %rax
864	mulq	s6
865	addq	%rax, %rsi
866	adcq	%rdx, %rbx
867	adcq	$0, %rcx
868
869	movq	s0, %rax
870	mulq	s7
871	addq	%rax, %rsi
872	adcq	%rdx, %rbx
873	adcq	$0, %rcx
874	movq	%rsi, 24(%rsp)
875	xorq	%rsi, %rsi
876
877### s3*s5 + s2*s6 + s1*s7 ###
878	movq	s3, %rax
879	mulq	s5
880	addq	%rax, %rbx
881	adcq	%rdx, %rcx
882	# carry
883	adcq	$0, %rsi
884
885	movq	s2, %rax
886	mulq	s6
887	addq	%rax, %rbx
888	adcq	%rdx, %rcx
889	adcq	$0, %rsi
890
891	movq	s1, %rax
892	mulq	s7
893	addq	%rax, %rbx
894	adcq	%rdx, %rcx
895	adcq	$0, %rsi
896	movq	%rbx, s4
897	xorq	%rbx, %rbx
898
899### s3*s6 + s2*s7 ###
900	movq	s3, %rax
901	mulq	s6
902	addq	%rax, %rcx
903	adcq	%rdx, %rsi
904	# carry
905	adcq $0, %rbx
906
907	movq	s2, %rax
908	mulq	s7
909	addq	%rax, %rcx
910	adcq	%rdx, %rsi
911	adcq	$0, %rbx
912	movq	%rcx, s5
913
914### s3*s7 ###
915	movq	s3, %rax
916	mulq	s7
917	addq	%rax, %rsi
918	adcq	%rdx, %rbx
919	movq	%rsi, s6
920	movq	%rbx, s7
921
922	movq	(%rsp), s0
923	movq	8(%rsp), s1
924	movq	16(%rsp), s2
925	movq	24(%rsp), s3
926
927	# result of mul: s7 s6 s5 s4 s3 s2 s1 s0
928
929### Reduction ###
930	RDC
931
932	# Restore scalar registers
933	movq	32(%rsp), %rbx
934	movq	40(%rsp), %r12
935	movq	48(%rsp), %r13
936	movq	56(%rsp), %r14
937	movq	64(%rsp), %r15
938	addq	$72, %rsp
939
940	ret
941	.size ECP_Sm2Mul, .-ECP_Sm2Mul
942
943### Modular sqr: r = a^2 mod p ###
944	# void ECP_Sm2Sqr(uint64_t *r, const uint64_t *a)
945	# 256-bit modular squaring in SM2
946	# r 	%rdi
947	# a 	%rsi
948	.globl	ECP_Sm2Sqr
949	.type	ECP_Sm2Sqr, @function
950	.align	64
951
952ECP_Sm2Sqr:
953
954	# Store scalar registers
955	subq	$88, %rsp
956	movq	%rbx, 32(%rsp)
957	movq	%r12, 40(%rsp)
958	movq	%r13, 48(%rsp)
959	movq	%r14, 56(%rsp)
960	movq	%r15, 64(%rsp)
961	movq	%rbp, 72(%rsp)
962	movq	%rdi, 80(%rsp)
963
964	# Load inputs
965	movq	(%rsi), s4
966	movq	8(%rsi), s5
967	movq	16(%rsi), s6
968	movq	24(%rsi), s7
969
970### square ###
971
972	# ========================
973	#             s7 s6 s5 s4
974	# *           s7 s6 s5 s4
975	# ------------------------
976	# +           s4 s4 s4 s4
977	#              *  *  *  *
978	#             s7 s6 s5 s4
979	#          s5 s5 s5 s5
980	#           *  *  *  *
981	#          s7 s6 s5 s4
982	#       s6 s6 s6 s6
983	#        *  *  *  *
984	#       s7 s6 s5 s4
985	#    s7 s7 s7 s7
986	#     *  *  *  *
987	#    s7 s6 s5 s4
988	# ------------------------
989	# s7 s6 s5 s4 s3 s2 s1 s0
990	# ========================
991
992### s1 <- s4*s5, s2 <- carry ###
993	movq	s5, %rax
994	mulq	s4
995	movq	%rax, s1
996	movq	%rdx, s2
997	xorq	s3, s3
998
999### s2 <- s4*s6 + carry(s2), s3 <- carry ###
1000	movq	s6, %rax
1001	mulq	s4
1002	addq	%rax, s2
1003	adcq	%rdx, s3
1004	xorq	s0, s0
1005
1006### s3 <- s4*s7 + s5*s6 + carry(s3), s0 <- carry ###
1007	movq	s7, %rax
1008	mulq	s4
1009	addq	%rax, s3
1010	adcq	%rdx, s0
1011	xorq	%rbx, %rbx
1012
1013	movq	s6, %rax
1014	mulq	s5
1015	addq	%rax, s3
1016	adcq	%rdx, s0
1017	adcq	$0, %rbx
1018
1019### s0 <- s5*s7 + carry(s0), rbx <- carry ###
1020	movq	s7, %rax
1021	mulq	s5
1022	addq	%rax, s0
1023	adcq	%rdx, %rbx
1024	xorq	%rcx, %rcx
1025
1026### rbx <- s6*s7 + carry(rbx), rcx <- carry ###
1027	movq	s7, %rax
1028	mulq	s6
1029	addq	%rax, %rbx
1030	adcq	%rdx, %rcx
1031	xorq	%rsi, %rsi
1032
1033### 2*s0|1|2|3 ###
1034	addq	s1, s1
1035	adcq	s2, s2
1036	adcq	s3, s3
1037	adcq	s0, s0
1038	adcq	%rbx, %rbx
1039	# update carry
1040	adcq	%rcx, %rcx
1041	adcq	$0, %rsi
1042### rbp <- s4*s4, rdi <- carry ###
1043	movq	s4, %rax
1044	mulq	s4
1045	movq	%rax, %rbp
1046	movq	%rdx, %rdi
1047
1048### s4 <- s5*s5, s5 <- carry ###
1049	movq	s5, %rax
1050	mulq	s5
1051	movq	%rax, s4
1052	movq	%rdx, s5
1053
1054### s6*s6 ###
1055	movq	s6, %rax
1056	mulq	s6
1057
1058	# s1 += carry(s4*s4)
1059	addq	%rdi, s1
1060	# s2 += s5*s5
1061	adcq	s4, s2
1062	# s3 += carry(s5*s5)
1063	adcq	s5, s3
1064	# s4(s0) += s6*s6
1065	adcq	%rax, s0
1066	# s5(rbx) += carry(s6*s6)
1067	adcq	%rdx, %rbx
1068	adcq	$0, %rcx
1069	adcq	$0, %rsi
1070
1071### s7*s7 ###
1072	movq	s7, %rax
1073	mulq	s7
1074	# s6(rcx) += s7*s7
1075	addq	%rax, %rcx
1076	# s7(rsi) += carry(s7*s7)
1077	adcq	%rdx, %rsi
1078
1079	movq	s0, s4
1080	movq	%rbp, s0
1081	movq	%rbx, s5
1082	movq	%rcx, s6
1083	movq	%rsi, s7
1084
1085	# Restore rdi
1086	movq	80(%rsp), %rdi
1087
1088	# result of sqr: s7 s6 s5 s4 s3 s2 s1 s0
1089
1090### Reduction ###
1091	RDC
1092
1093	# Restore scalar registers
1094	movq	32(%rsp), %rbx
1095	movq	40(%rsp), %r12
1096	movq	48(%rsp), %r13
1097	movq	56(%rsp), %r14
1098	movq	64(%rsp), %r15
1099	movq	72(%rsp), %rbp
1100	addq	$88, %rsp
1101
1102	ret
1103	.size ECP_Sm2Sqr, .-ECP_Sm2Sqr
1104
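	# void ECP_Sm2ToMont(uint64_t *r, const uint64_t *a)   (prototype inferred from register usage)
	# Convert a into the Montgomery domain: r = a * 2^256 mod p, computed as
	# MontMul(a, RR) with RR = 2^512 mod p (.LRR).
	# r		%rdi
	# a		%rsi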
1105.globl	ECP_Sm2ToMont
1106.type	ECP_Sm2ToMont,@function
1107.align	32
1108ECP_Sm2ToMont:
1109	leaq	.LRR(%rip), %rdx
1110	REGISTER_SAVE
1111	movq	0(%rsi), %r9
1112	movq	8(%rsi), %r10
1113	movq	16(%rsi), %r11
1114	movq	24(%rsi), %r12
1115	movq	%rdx, %rbx
1116	movq	0(%rdx), %rax
1117
1118	call	ECP_Sm2MulMont
1119
1120	REGISTER_POP
1121	ret
1122.size	ECP_Sm2ToMont,.-ECP_Sm2ToMont
1123
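	# Internal Montgomery multiplication mod p: result = a * b * 2^-256 mod p.
	# Register contract (inferred from the call sites, not a C ABI function):
	#   %rsi -> a, %rbx -> b, %rax = b[0], %r9..%r12 = a[0..3], %rdi -> output.
	# The result is stored at (%rdi) and also left in %r12, %r13, %r8, %r9; on return
	# %r14 and %r15 hold .Lpoly+8 and .Lpoly+24, which some callers rely on.
	# Because p = -1 (mod 2^64), the per-limb Montgomery factor is the limb itself,
	# so each "reduce" step folds limb*p into the accumulator with shifts and subtractions.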
1124.type	ECP_Sm2MulMont,@function
1125.align	32
1126ECP_Sm2MulMont:
1127
1128	// a[0..3] * b[0]
1129	movq	%rax, %rbp
1130	mulq	%r9
1131	movq	%rax, %r8
1132	movq	%rdx, %r9
1133	movq	%rbp, %rax
1134
1135	mulq	%r10
1136	addq	%rax, %r9
1137	adcq	$0, %rdx
1138	movq	%rbp, %rax
1139	movq	%rdx, %r10
1140
1141	mulq	%r11
1142	addq	%rax, %r10
1143	adcq	$0, %rdx
1144	movq	%rbp, %rax
1145	movq	%rdx, %r11
1146
1147	mulq	%r12
1148	addq	%rax, %r11
1149	adcq	$0, %rdx
1150	movq	%rdx, %r12
1151	movq	%r8, %rax
1152	movq	%r8, %r14
1153	xorq	%r13, %r13
1154
1155	// begin 1st reduce
1156	shlq	$32, %rax
1157	shrq	$32, %r14
1158
1159	movq	%r8, %rcx
1160	subq	%rax, %rcx
1161	movq	$0, %rdx
1162	sbbq	%r14, %rdx
1163	movq	%rdx, %rbp
1164	movq	$0, %rdx
1165	sbbq	%rax, %rdx
1166	movq	%rdx, %rax
1167	sbbq	%r14, %r8
1168	movq	%r8, %rdx
1169
1170	movq	%rcx, %r8
1171	addq	%r9, %r8
1172	movq	%rbp, %r9
1173	adcq	%r10, %r9
1174	movq	%rax, %r10
1175	adcq	%r11, %r10
1176	movq	%rdx, %r11
1177	adcq	%r12, %r11
1178	movq	$0, %r12
1179	adcq	%r13, %r12
1180	movq	8(%rbx), %rax // b[1]
1181
1182	movq	%rax, %rbp
1183	mulq	0(%rsi)
1184	addq	%rax, %r8
1185	adcq	$0, %rdx
1186	movq	%rbp, %rax
1187	movq	%rdx, %rcx
1188
1189	mulq	8(%rsi)
1190	addq	%rcx, %r9
1191	adcq	$0, %rdx
1192	addq	%rax, %r9
1193	adcq	$0, %rdx
1194	movq	%rbp, %rax
1195	movq	%rdx, %rcx
1196
1197	mulq	16(%rsi)
1198	addq	%rcx, %r10
1199	adcq	$0, %rdx
1200	addq	%rax, %r10
1201	adcq	$0, %rdx
1202	movq	%rbp, %rax
1203	movq	%rdx, %rcx
1204
1205	mulq	24(%rsi)
1206	addq	%rcx, %r11
1207	adcq	$0, %rdx
1208	addq	%rax, %r11
1209	movq	%r9, %rax
1210	adcq	%rdx, %r12
1211	adcq	$0, %r13
1212
1213	movq	%r8, %rax
1214	movq	%r8, %r14
1215
1216	// begin 2nd reduce
1217	shlq	$32, %rax
1218	shrq	$32, %r14
1219
1220	movq	%r8, %rcx
1221	subq	%rax, %rcx
1222	movq	$0, %rdx
1223	sbbq	%r14, %rdx
1224	movq	%rdx, %rbp
1225	movq	$0, %rdx
1226	sbbq	%rax, %rdx
1227	movq	%rdx, %rax
1228	sbbq	%r14, %r8
1229	movq	%r8, %rdx
1230
1231	movq	%rcx, %r8
1232	addq	%r9, %r8
1233	movq	%rbp, %r9
1234	adcq	%r10, %r9
1235	movq	%rax, %r10
1236	adcq	%r11, %r10
1237	movq	%rdx, %r11
1238	adcq	%r12, %r11
1239	movq	$0, %r12
1240	adcq	%r13, %r12
1241	movq	16(%rbx), %rax // b[2]
1242
1243	movq	%rax, %rbp
1244	mulq	0(%rsi)
1245	addq	%rax, %r8
1246	movq	%rbp, %rax
1247	adcq	$0, %rdx
1248	movq	%rdx, %rcx
1249
1250	mulq	8(%rsi)
1251	addq	%rcx, %r9
1252	adcq	$0, %rdx
1253	addq	%rax, %r9
1254	adcq	$0, %rdx
1255	movq	%rbp, %rax
1256	movq	%rdx, %rcx
1257
1258	mulq	16(%rsi)
1259	addq	%rcx, %r10
1260	adcq	$0, %rdx
1261	addq	%rax, %r10
1262	adcq	$0, %rdx
1263	movq	%rbp, %rax
1264	movq	%rdx, %rcx
1265
1266	mulq	24(%rsi)
1267	addq	%rcx, %r11
1268	adcq	$0, %rdx
1269	addq	%rax, %r11
1270	movq	%r9, %rax
1271	adcq	%rdx, %r12
1272	adcq	$0, %r13
1273
1274	movq	%r8, %rax
1275	movq	%r8, %r14
1276
1277	// begin 3rd reduce
1278	shlq	$32, %rax
1279	shrq	$32, %r14
1280
1281	movq	%r8, %rcx
1282	movq	$0, %rdx
1283	subq	%rax, %rcx
1284	sbbq	%r14, %rdx
1285	movq	%rdx, %rbp
1286	movq	$0, %rdx
1287	sbbq	%rax, %rdx
1288	sbbq	%r14, %r8
1289	movq	%rdx, %rax
1290	movq	%r8, %rdx
1291
1292	movq	%rcx, %r8
1293	addq	%r9, %r8
1294	movq	%rbp, %r9
1295	adcq	%r10, %r9
1296	movq	%rax, %r10
1297	adcq	%r11, %r10
1298	movq	%rdx, %r11
1299	adcq	%r12, %r11
1300	movq	$0, %r12
1301	adcq	%r13, %r12
1302	movq	24(%rbx), %rax // b[3]
1303
1304	movq	%rax, %rbp
1305	mulq	0(%rsi)
1306	addq	%rax, %r8
1307	adcq	$0, %rdx
1308	movq	%rbp, %rax
1309	movq	%rdx, %rcx
1310
1311	mulq	8(%rsi)
1312	addq	%rcx, %r9
1313	adcq	$0, %rdx
1314	addq	%rax, %r9
1315	adcq	$0, %rdx
1316	movq	%rbp, %rax
1317	movq	%rdx, %rcx
1318
1319	mulq	16(%rsi)
1320	addq	%rcx, %r10
1321	adcq	$0, %rdx
1322	addq	%rax, %r10
1323	adcq	$0, %rdx
1324	movq	%rbp, %rax
1325	movq	%rdx, %rcx
1326
1327	mulq	24(%rsi)
1328	addq	%rcx, %r11
1329	adcq	$0, %rdx
1330	addq	%rax, %r11
1331	adcq	%rdx, %r12
1332	adcq	$0, %r13
1333	movq	%r9, %rax
1334
1335	movq	%r8, %rax
1336	movq	%r8, %r14
1337
1338	// last reduction begin
1339	shlq	$32, %rax
1340	shrq	$32, %r14
1341
1342	movq	%r8, %rcx
1343	subq	%rax, %rcx
1344	movq	$0, %rdx
1345	sbbq	%r14, %rdx
1346	movq	%rdx, %rbp
1347	movq	$0, %rdx
1348	sbbq	%rax, %rdx
1349	movq	%rdx, %rax
1350	sbbq	%r14, %r8
1351	movq	%r8, %rdx
1352	movq	%rcx, %r8
1353
1354	addq	%r9, %r8
1355	movq	%rbp, %r9
1356	adcq	%r10, %r9
1357	movq	%rax, %r10
1358	adcq	%r11, %r10
1359	movq	%rdx, %r11
1360	adcq	%r12, %r11
1361	movq	$0, %rcx
1362	adcq	%r13, %rcx
1363	// last reduction end
1364
1365	// ret - p
1366	movq	%r8, %r12
1367	subq	$-1, %r12
1368	movq	.Lpoly+8(%rip), %r14
1369	movq	%r9, %r13
1370	sbbq	%r14, %r13
1371
1372	movq	%r10, %rbp
1373	sbbq	$-1, %rbp
1374
1375	movq	.Lpoly+24(%rip), %r15
1376	movq	%r11, %rdx
1377	sbbq	%r15, %rdx
1378	sbbq	$0, %rcx
1379
1380	cmovcq	%r8, %r12
1381	cmovcq	%r9, %r13
1382	cmovcq	%r10, %rbp
1383	movq	%r12,(%rdi)
1384	movq	%r13,8(%rdi)
1385	cmovcq	%r11, %rdx
1386	movq	%rbp,16(%rdi)
1387	movq	%rdx,24(%rdi)
1388
1389	movq	%rbp, %r8
1390	movq	%rdx, %r9
1391	ret
1392.size	ECP_Sm2MulMont, .-ECP_Sm2MulMont
1393
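	# void ECP_Sm2FromMont(uint64_t *r, const uint64_t *a)   (prototype inferred from register usage)
	# Convert a out of the Montgomery domain: r = a * 2^-256 mod p, computed as MontMul(a, 1).
	# r		%rdi
	# a		%rsi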
1394.globl	ECP_Sm2FromMont
1395.type	ECP_Sm2FromMont,@function
1396.align	32
1397ECP_Sm2FromMont:
1398
1399	leaq	.Lone(%rip), %rdx
1400	REGISTER_SAVE
1401	movq	%rdx, %rbx
1402	movq	0(%rsi), %r9
1403	movq	8(%rsi), %r10
1404	movq	16(%rsi), %r11
1405	movq	24(%rsi), %r12
1406	movq	0(%rdx), %rax
1407
1408	call	ECP_Sm2MulMont
1409
1410	REGISTER_POP
1411	ret
1412.size	ECP_Sm2FromMont,.-ECP_Sm2FromMont
1413
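	# Internal Montgomery squaring mod p: result = a^2 * 2^-256 mod p.
	# Register contract (inferred from the call sites, not a C ABI function):
	#   %rsi -> a, %rax = a[0], %r14 = a[1], %r15 = a[2], %r8 = a[3], %rdi -> output.
	# The result is stored at (%rdi) and left in %r12..%r15; on return %rsi and %rbp
	# hold .Lpoly+8 and .Lpoly+24, which some callers rely on.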
1414.type	ECP_Sm2SqrMont,@function
1415.align	32
1416ECP_Sm2SqrMont:
1417
1418	movq	%rax, %r13
1419	mulq	%r14		// a[0] * a[1]
1420	movq	%rax, %r9
1421	movq	%rdx, %r10
1422	movq	%r15, %rax
1423
1424	mulq	%r13		// a[0] * a[2]
1425	addq	%rax, %r10
1426	adcq	$0, %rdx
1427	movq	%r8, %rax
1428	movq	%rdx, %r11
1429
1430	mulq	%r13		// a[0] * a[3]
1431	addq	%rax, %r11
1432	adcq	$0, %rdx
1433	movq	%r15, %rax
1434	movq	%rdx, %r12
1435
1436	mulq	%r14		// a[1] * a[2]
1437	addq	%rax, %r11
1438	adcq	$0, %rdx
1439	movq	%r8, %rax
1440	movq	%rdx, %rbp
1441
1442	mulq	%r14		// a[1] * a[3]
1443	addq	%rax, %r12
1444	adcq	$0, %rdx
1445	addq	%rbp, %r12
1446	movq	%rdx, %r13
1447	movq	%r8, %rax
1448	adcq	$0, %r13
1449
1450	mulq	%r15		// a[2] * a[3]
1451	addq	%rax, %r13
1452	movq	(%rsi), %rax
1453	movq	%rdx, %r14
1454	adcq	$0, %r14
1455
1456	movq	$0, %r15
1457	addq	%r9, %r9
1458	adcq	%r10, %r10
1459	adcq	%r11, %r11
1460	adcq	%r12, %r12
1461	adcq	%r13, %r13
1462	adcq	%r14, %r14
1463	adcq	$0, %r15
1464
1465	mulq	%rax		// cal a[0] * a[0]
1466	movq	%rax, %r8
1467	movq	8(%rsi), %rax // get a[1]
1468	movq	%rdx, %rcx
1469
1470	mulq	%rax		// a[1] * a[1]
1471	addq	%rcx, %r9
1472	adcq	%rax, %r10
1473	adcq	$0, %rdx
1474	movq	16(%rsi), %rax
1475	movq	%rdx, %rcx
1476
1477	mulq	%rax		// a[2] * a[2]
1478	addq	%rcx, %r11
1479	adcq	%rax, %r12
1480	adcq	$0, %rdx
1481	movq	24(%rsi), %rax
1482	movq	%rdx, %rcx
1483
1484	mulq	%rax		// a[3] * a[3]
1485	addq	%rcx, %r13
1486	adcq	%rax, %r14
1487	movq	%r8, %rax
1488	adcq	%rdx, %r15
1489
1490	movq	%r8, %rax
1491	movq	%r8, %rsi
1492
1493	// begin 1st reduce
1494	shlq	$32, %rax
1495	shrq	$32, %rsi
1496	movq	%r8, %rcx
1497	subq	%rax, %rcx
1498
1499	movq	$0, %rdx
1500	sbbq	%rsi, %rdx
1501	movq	%rdx, %rbp
1502	movq	$0, %rdx
1503	sbbq	%rax, %rdx
1504	movq	%rdx, %rax
1505	sbbq	%rsi, %r8
1506	movq	%r8, %rdx
1507
1508	movq	%rcx, %r8
1509	addq	%r9, %r8
1510	movq	%rbp, %r9
1511	adcq	%r10, %r9
1512	movq	%rax, %r10
1513	adcq	%r11, %r10
1514	movq	%rdx, %r11
1515	adcq	$0, %r11
1516
1517	movq	%r8, %rax
1518	movq	%r8, %rsi
1519
1520	// begin 2nd reduce
1521	shlq	$32, %rax
1522	shrq	$32, %rsi
1523	movq	%r8, %rcx
1524	subq	%rax, %rcx
1525
1526	movq	$0, %rdx
1527	sbbq	%rsi, %rdx
1528	movq	%rdx, %rbp
1529	movq	$0, %rdx
1530	sbbq	%rax, %rdx
1531	movq	%rdx, %rax
1532	sbbq	%rsi, %r8
1533	movq	%r8, %rdx
1534
1535	movq	%rcx, %r8
1536	addq	%r9, %r8
1537	movq	%rbp, %r9
1538	adcq	%r10, %r9
1539	movq	%rax, %r10
1540	adcq	%r11, %r10
1541	movq	%rdx, %r11
1542	adcq	$0, %r11
1543
1544	movq	%r8, %rax
1545	movq	%r8, %rsi
1546
1547	// begin 3rd reduce
1548	shlq	$32, %rax
1549	shrq	$32, %rsi
1550	movq	%r8, %rcx
1551	subq	%rax, %rcx
1552
1553	movq	$0, %rdx
1554	sbbq	%rsi, %rdx
1555	movq	%rdx, %rbp
1556	movq	$0, %rdx
1557	sbbq	%rax, %rdx
1558	movq	%rdx, %rax
1559	sbbq	%rsi, %r8
1560	movq	%r8, %rdx
1561
1562	movq	%rcx, %r8
1563	addq	%r9, %r8
1564	movq	%rbp, %r9
1565	adcq	%r10, %r9
1566	movq	%rax, %r10
1567	adcq	%r11, %r10
1568	movq	%rdx, %r11
1569	adcq	$0, %r11
1570
1571	movq	%r8, %rax
1572	movq	%r8, %rsi
1573
1574	// begin 4th reduce
1575	shlq	$32, %rax
1576	shrq	$32, %rsi
1577	movq	%r8, %rcx
1578	subq	%rax, %rcx
1579
1580	movq	$0, %rdx
1581	sbbq	%rsi, %rdx
1582	movq	%rdx, %rbp
1583	movq	$0, %rdx
1584	sbbq	%rax, %rdx
1585	movq	%rdx, %rax
1586	sbbq	%rsi, %r8
1587	movq	%r8, %rdx
1588
1589	movq	%rcx, %r8
1590	addq	%r9, %r8
1591	movq	%rbp, %r9
1592	adcq	%r10, %r9
1593	movq	%rax, %r10
1594	adcq	%r11, %r10
1595	movq	%rdx, %r11
1596	adcq	$0, %r11
1597
1598	movq	.Lpoly+8(%rip), %rsi
1599	movq	.Lpoly+24(%rip), %rbp
1600
1601	addq	%r8, %r12
1602	adcq	%r9, %r13
1603	adcq	%r10, %r14
1604	adcq	%r11, %r15
1605	movq	$0, %r11
1606	adcq	$0, %r11
1607
1608	// ret - p
1609	movq	%r12, %rax
1610	subq	$-1, %rax
1611	movq	%r13, %rcx
1612	sbbq	%rsi, %rcx
1613	movq	%r14, %r8
1614	sbbq	$-1, %r8
1615	movq	%r15, %rdx
1616	sbbq	%rbp, %rdx
1617	sbbq	$0, %r11
1618
1619	cmovncq	%rax, %r12
1620	cmovncq	%rcx, %r13
1621	cmovncq	%r8, %r14
1622	movq	%r12,(%rdi)
1623	movq	%r13,8(%rdi)
1624	cmovncq	%rdx, %r15
1625	movq	%r14,16(%rdi)
1626	movq	%r15,24(%rdi)
1627	ret
1628.size	ECP_Sm2SqrMont,.-ECP_Sm2SqrMont
1629
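	# Internal helper: (%r12,%r13,%r8,%r9) = (%r12,%r13,%r8,%r9) + mem256(%rbx) mod p.
	# Expects %r14 = .Lpoly+8 and %r15 = .Lpoly+24; the result is also stored at (%rdi).
	# (Register contract inferred from the call sites.)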
1630.type	ECP_Sm2AddCore,@function
1631.align	32
1632ECP_Sm2AddCore:
1633
1634	addq	(%rbx), %r12
1635	adcq	8(%rbx), %r13
1636	movq	%r12, %rcx
1637	adcq	16(%rbx), %r8
1638	adcq	24(%rbx), %r9
1639	movq	$0, %r11
1640	movq	%r13, %rbp
1641	adcq	$0, %r11
1642
1643	subq	$-1, %r12	// subtract p[0] = 0xffffffffffffffff (encoded as -1)
1644	movq	%r8, %rax
1645	sbbq	%r14, %r13
1646	sbbq	$-1, %r8
1647	movq	%r9, %r10
1648	sbbq	%r15, %r9
1649	sbbq	$0, %r11
1650
1651	cmovcq	%rcx, %r12
1652	cmovcq	%rbp, %r13
1653	movq	%r12, 0(%rdi)
1654	cmovcq	%rax, %r8
1655	movq	%r13, 8(%rdi)
1656	cmovcq	%r10, %r9
1657	movq	%r8, 16(%rdi)
1658	movq	%r9, 24(%rdi)
1659
1660	ret
1661.size	ECP_Sm2AddCore,.-ECP_Sm2AddCore
1662
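	# Internal helper: (%r12,%r13,%r8,%r9) = (%rcx,%rbp,%rax,%r10) - (%r12,%r13,%r8,%r9) mod p.
	# Expects %r14 = .Lpoly+8 and %r15 = .Lpoly+24; the result is returned in registers only.
	# (Register contract inferred from the call sites.)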
1663.type	ECP_Sm2SubBA,@function
1664.align	32
1665ECP_Sm2SubBA:
1666	subq	%r12, %rcx
1667	sbbq	%r13, %rbp
1668	movq	%rcx, %r12
1669	sbbq	%r8, %rax
1670	sbbq	%r9, %r10
1671	movq	%rbp, %r13
1672	sbbq	%r11, %r11
1673
1674	addq	$-1, %rcx
1675	movq	%rax, %r8
1676	adcq	%r14, %rbp
1677	adcq	$-1, %rax
1678	movq	%r10, %r9
1679	adcq	%r15, %r10
1680	testq	%r11, %r11
1681
1682	cmovnzq	%rcx, %r12
1683	cmovnzq	%rbp, %r13
1684	cmovnzq	%rax, %r8
1685	cmovnzq	%r10, %r9
1686	ret
1687.size	ECP_Sm2SubBA,.-ECP_Sm2SubBA
1688
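	# Internal helper: (%r12,%r13,%r8,%r9) = (%r12,%r13,%r8,%r9) - mem256(%rbx) mod p.
	# Expects %r14 = .Lpoly+8; the result is stored at (%rdi) and left in registers.
	# (Register contract inferred from the call sites.)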
1689.type	ECP_Sm2SubAB,@function
1690.align	32
1691ECP_Sm2SubAB:
1692	subq	0(%rbx), %r12
1693	sbbq	8(%rbx), %r13
1694	sbbq	16(%rbx), %r8
1695	sbbq	24(%rbx), %r9
1696	sbbq	%r11, %r11
1697
1698	movq	%r14, %rbp
1699	andq	%r11, %rbp
1700	movq	%r11, %rax
1701	btrq	$32, %rax
1702
1703	addq	%r11, %r12
1704	adcq	%rbp, %r13
1705	adcq	%r11, %r8
1706	adcq	%rax, %r9
1707
1708	movq	%r12, (%rdi)
1709	movq	%r13, 8(%rdi)
1710	movq	%r8, 16(%rdi)
1711	movq	%r9, 24(%rdi)
1712
1713	ret
1714.size	ECP_Sm2SubAB,.-ECP_Sm2SubAB
1715
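	# Internal helper: (%r12,%r13,%r8,%r9) = 2 * (%r12,%r13,%r8,%r9) mod p.
	# Expects %r14 = .Lpoly+8 and %r15 = .Lpoly+24; the result is stored at (%rdi)
	# and left in registers.  (Register contract inferred from the call sites.)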
1716.type	ECP_Sm2MulBy2Core,@function
1717.align	32
1718ECP_Sm2MulBy2Core:
1719	addq	%r12, %r12
1720	adcq	%r13, %r13
1721	movq	%r12, %rcx
1722	adcq	%r8, %r8
1723	adcq	%r9, %r9
1724	movq	$0, %r11
1725	movq	%r13, %rbp
1726	adcq	$0, %r11
1727
1728	subq	$-1, %r12	// subtract p[0] = 0xffffffffffffffff (encoded as -1)
1729	movq	%r8, %rax
1730	sbbq	%r14, %r13
1731	sbbq	$-1, %r8
1732	movq	%r9, %r10
1733	sbbq	%r15, %r9
1734	sbbq	$0, %r11
1735
1736	cmovcq	%rcx, %r12
1737	cmovcq	%rbp, %r13
1738	cmovcq	%rax, %r8
1739	cmovcq	%r10, %r9
1740
1741	movq	%r12, (%rdi)
1742	movq	%r13, 8(%rdi)
1743	movq	%r8, 16(%rdi)
1744	movq	%r9, 24(%rdi)
1745	ret
1746.size	ECP_Sm2MulBy2Core,.-ECP_Sm2MulBy2Core
1747
1748# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1749# Computation:
1750#     delta = Z1^2
1751#     gamma = Y1^2
1752#     beta = X1*gamma
1753#     alpha = 3*(X1-delta)*(X1+delta)
1754#     X3 = alpha^2-8*beta
1755#     Z3 = (Y1+Z1)^2-gamma-delta
1756#     Y3 = alpha*(4*beta-X3)-8*gamma^2
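	# void ECP_Sm2PointDoubleMont(uint64_t r[12], const uint64_t a[12])   (prototype inferred
	# from register usage; the C-level point type is not visible in this file)
	# Jacobian point doubling r = 2*a.  A point is (X, Y, Z): three 256-bit coordinates in
	# Montgomery form at offsets 0, 32 and 64.
	# r		%rdi
	# a		%rsi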
1757.globl	ECP_Sm2PointDoubleMont
1758.type	ECP_Sm2PointDoubleMont,@function
1759.align	32
1760ECP_Sm2PointDoubleMont:
1761	REGISTER_SAVE
1762	subq	$168, %rsp
1763
1764.Lpoint_double:
1765	vmovdqu	0(%rsi), %xmm0
1766	vmovdqu	16(%rsi), %xmm1
1767	vmovdqa	%xmm0,96(%rsp)
1768	vmovdqa	%xmm1,96+16(%rsp)
1769
1770	movq	%rsi, %rbx
1771	leaq	32(%rdi), %r10
1772	leaq	64(%rdi), %r11
1773	vmovq	%rdi, %xmm0
1774	vmovq	%r10, %xmm1
1775	vmovq	%r11, %xmm2
1776
1777	movq	32(%rsi), %r12
1778	movq	40(%rsi), %r13
1779	movq	48(%rsi), %r8
1780	movq	56(%rsi), %r9
1781
1782	movq	.Lpoly+8(%rip), %r14
1783	movq	.Lpoly+24(%rip), %r15
1784	leaq	(%rsp), %rdi
1785	call	ECP_Sm2MulBy2Core
1786
1787	movq	64(%rsi), %rax
1788	movq	72(%rsi), %r14
1789	movq	80(%rsi), %r15
1790	movq	88(%rsi), %r8
1791
1792	leaq	64(%rsi), %rsi              // set the input pointer
1793	leaq	64(%rsp), %rdi              // store the result
1794	call	ECP_Sm2SqrMont
1795
1796	movq	(%rsp), %rax
1797	movq	8(%rsp), %r14
1798	movq	16(%rsp), %r15
1799	movq	24(%rsp), %r8
1800	leaq	(%rsp), %rsi
1801	leaq	(%rsp), %rdi
1802	call	ECP_Sm2SqrMont
1803
1804	movq	32(%rbx), %rax
1805	movq	64(%rbx), %r9
1806	movq	72(%rbx), %r10
1807	movq	80(%rbx), %r11
1808	movq	88(%rbx), %r12
1809
1810	leaq	64(%rbx), %rsi
1811	leaq	32(%rbx), %rbx
1812	vmovq	%xmm2, %rdi
1813	call	ECP_Sm2MulMont
1814	call	ECP_Sm2MulBy2Core
1815
1816	movq	96(%rsp), %r12
1817	movq	104(%rsp), %r13
1818	movq	112(%rsp), %r8
1819	movq	120(%rsp), %r9
1820
1821	leaq	32(%rsp), %rdi
1822	leaq	64(%rsp), %rbx
1823	call	ECP_Sm2AddCore
1824
1825	movq	96(%rsp), %r12
1826	movq	104(%rsp), %r13
1827	movq	112(%rsp), %r8
1828	movq	120(%rsp), %r9
1829
1830	leaq	64(%rsp), %rbx  // input
1831	leaq	64(%rsp), %rdi  // output
1832	call	ECP_Sm2SubAB
1833
1834	movq	(%rsp), %rax
1835	movq	8(%rsp), %r14
1836	movq	16(%rsp), %r15
1837	movq	24(%rsp), %r8
1838	leaq	(%rsp), %rsi
1839	vmovq	%xmm1, %rdi
1840
1841	call	ECP_Sm2SqrMont
1842
1843	movq	%r12, %rcx
1844	addq	$-1, %r12
1845	movq	%r13, %r10
1846	adcq	%rsi, %r13
1847	movq	%r14, %rax
1848	adcq	$-1, %r14
1849	movq	$0, %r9
1850	movq	%r15, %r8
1851	adcq	%rbp, %r15
1852	adcq	$0, %r9
1853	xorq	%rsi, %rsi
1854	testq	$1, %rcx
1855
1856	cmovzq	%rcx, %r12
1857	cmovzq	%r10, %r13
1858	cmovzq	%rax, %r14
1859	cmovzq	%r8, %r15
1860	cmovzq	%rsi, %r9
1861
1862	movq	%r13, %rcx
1863	shrq	$1, %r12
1864	shlq	$63, %rcx
1865	shrq	$1, %r13
1866	movq	%r14, %r10
1867	orq		%rcx, %r12
1868	shlq	$63, %r10
1869	movq	%r15, %rax
1870	shrq	$1, %r14
1871	orq		%r10, %r13
1872	shlq	$63, %rax
1873	movq	%r12,0(%rdi)
1874	shrq	$1, %r15
1875	movq	%r13,8(%rdi)
1876	shlq	$63, %r9
1877	orq		%rax, %r14
1878	orq		%r9, %r15
1879
1880	movq	%r14,16(%rdi)
1881	movq	%r15,24(%rdi)
1882
1883	movq	64(%rsp), %rax
1884	leaq	64(%rsp), %rbx
1885	movq	32(%rsp), %r9
1886	movq	40(%rsp), %r10
1887	leaq	32(%rsp), %rsi
1888	movq	48(%rsp), %r11
1889	movq	56(%rsp), %r12
1890	leaq	32(%rsp), %rdi
1891	call	ECP_Sm2MulMont
1892
1893	leaq	128(%rsp), %rdi
1894	call	ECP_Sm2MulBy2Core
1895
1896	leaq	32(%rsp), %rbx
1897	leaq	32(%rsp), %rdi
1898	call	ECP_Sm2AddCore
1899
1900	movq	96(%rsp), %rax
1901	leaq	96(%rsp), %rbx
1902	movq	(%rsp), %r9
1903	movq	8(%rsp), %r10
1904	leaq	(%rsp), %rsi
1905	movq	16(%rsp), %r11
1906	movq	24(%rsp), %r12
1907	leaq	0(%rsp), %rdi
1908	call	ECP_Sm2MulMont
1909
1910	leaq	128(%rsp), %rdi
1911	call	ECP_Sm2MulBy2Core
1912
1913	movq	32(%rsp), %rax
1914	movq	40(%rsp), %r14
1915	leaq	32(%rsp), %rsi
1916	movq	48(%rsp), %r15
1917	movq	56(%rsp), %r8
1918	vmovq	%xmm0, %rdi
1919	call	ECP_Sm2SqrMont
1920
1921	leaq	128(%rsp), %rbx
1922	movq	%r14, %r8
1923	movq	%r15, %r9
1924	movq	%rsi, %r14
1925	movq	%rbp, %r15
1926	call	ECP_Sm2SubAB
1927
1928	movq	(%rsp), %rcx
1929	movq	8(%rsp), %rbp
1930	movq	16(%rsp), %rax
1931	movq	24(%rsp), %r10
1932	leaq	0(%rsp), %rdi
1933	call	ECP_Sm2SubBA
1934
1935	movq	32(%rsp), %rax
1936	leaq	32(%rsp), %rbx
1937	movq	%r12, %r14
1938	xorl	%ecx, %ecx
1939	movq	%r12,(%rsp)
1940	movq	%r13, %r10
1941	movq	%r13,8(%rsp)
1942	cmovzq	%r8, %r11
1943	movq	%r8,16(%rsp)
1944	cmovzq	%r9, %r12
1945	movq	%r9,24(%rsp)
1946	movq	%r14, %r9
1947
1948	leaq	0(%rsp), %rsi
1949	leaq	0(%rsp), %rdi
1950	call	ECP_Sm2MulMont
1951
1952	vmovq	%xmm1, %rbx
1953	vmovq	%xmm1, %rdi
1954	call	ECP_Sm2SubAB
1955
1956	leaq	168(%rsp), %rsp
1957	REGISTER_POP
1958	ret
1959.size	ECP_Sm2PointDoubleMont,.-ECP_Sm2PointDoubleMont
1960
1961# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo
1962# Computation:
1963#     U1 = X1*Z2^2
1964#     U2 = X2*Z1^2
1965#     S1 = Y1*Z2^3
1966#     S2 = Y2*Z1^3
1967#     H = U2-U1
1968#     r = S2-S1
1969#     X3 = r^2-H^3-2*U1*H^2
1970#     Y3 = r*(U1*H^2-X3)-S1*H^3
1971#     Z3 = Z1*Z2*H
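	# void ECP_Sm2PointAddMont(uint64_t r[12], const uint64_t a[12], const uint64_t b[12])
	# (prototype inferred from register usage; the C-level point type is not visible here)
	# Jacobian point addition r = a + b, coordinates in Montgomery form.  When the inputs are
	# detected to be the same point, the code branches to the doubling path (.Lpoint_double).
	# r		%rdi
	# a		%rsi
	# b		%rdx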
1972.globl	ECP_Sm2PointAddMont
1973.type	ECP_Sm2PointAddMont,@function
1974.align	32
1975ECP_Sm2PointAddMont:
1976	REGISTER_SAVE
1977	subq	$584, %rsp
1978
1979	vmovdqu	0(%rsi), %xmm0
1980	vmovdqu	16(%rsi), %xmm1
1981	vmovdqu	32(%rsi), %xmm2
1982	vmovdqu	48(%rsi), %xmm3
1983	vmovdqu	64(%rsi), %xmm4
1984	vmovdqu	80(%rsi), %xmm5
1985	movq	%rsi, %rbx
1986	movq	%rdx, %rsi
1987	vmovdqa	%xmm0,384(%rsp)
1988	vmovdqa	%xmm1,384+16(%rsp)
1989	vmovdqa	%xmm2,416(%rsp)
1990	vmovdqa	%xmm3,416+16(%rsp)
1991	vmovdqa	%xmm4,448(%rsp)
1992	vmovdqa	%xmm5,448+16(%rsp)
1993	vpor     %xmm4, %xmm5, %xmm5
1994
1995	vmovdqu	0(%rsi), %xmm0
1996	vpshufd	$0xb1, %xmm5, %xmm3
1997	vmovdqu	16(%rsi), %xmm1
1998	vmovdqu	32(%rsi), %xmm2
1999	vpor		%xmm3, %xmm5, %xmm5
2000	vmovdqu	48(%rsi), %xmm3
2001
2002	movq	64(%rsi), %rax
2003	movq	72(%rsi), %r14
2004	movq	80(%rsi), %r15
2005	movq	88(%rsi), %r8
2006
2007	vmovdqa	%xmm0,480(%rsp)
2008	vpshufd	$0x1e, %xmm5, %xmm4
2009	vmovdqa	%xmm1,480+16(%rsp)
2010	vmovdqu	64(%rsi), %xmm0
2011	vmovdqu	80(%rsi), %xmm1
2012	vmovdqa	%xmm2,512(%rsp)
2013	vmovdqa	%xmm3,512+16(%rsp)
2014	vpor	%xmm4, %xmm5, %xmm5
2015	vpxor	%xmm4, %xmm4, %xmm4
2016	vpor	%xmm0, %xmm1, %xmm1
2017	vmovq	%rdi, %xmm0
2018
2019	leaq	64(%rsi), %rsi
2020	movq	%rax,544(%rsp)
2021	movq	%r14,544+8(%rsp)
2022	movq	%r15,544+16(%rsp)
2023	movq	%r8,544+24(%rsp)
2024	leaq	96(%rsp), %rdi
2025	call	ECP_Sm2SqrMont
2026
2027	vpcmpeqd	%xmm4, %xmm5, %xmm5
2028	vpshufd		$0xb1, %xmm1, %xmm4
2029	vpor		%xmm1, %xmm4, %xmm4
2030	vpshufd		$0, %xmm5, %xmm5
2031	vpshufd		$0x1e, %xmm4, %xmm3
2032	vpor		%xmm3, %xmm4, %xmm4
2033	vpxor		%xmm3, %xmm3, %xmm3
2034	vpcmpeqd	%xmm3, %xmm4, %xmm4
2035	vpshufd		$0, %xmm4, %xmm4
2036
2037	movq		64(%rbx), %rax
2038	movq		72(%rbx), %r14
2039	movq		80(%rbx), %r15
2040	movq		88(%rbx), %r8
2041	vmovq		%rbx, %xmm1
2042
2043	leaq	64(%rbx), %rsi
2044	leaq	32(%rsp), %rdi
2045	call	ECP_Sm2SqrMont
2046
2047	movq	544(%rsp), %rax
2048	leaq	544(%rsp), %rbx
2049	movq	96(%rsp), %r9
2050	movq	104(%rsp), %r10
2051	leaq	96(%rsp), %rsi
2052	movq	112(%rsp), %r11
2053	movq	120(%rsp), %r12
2054	leaq	224(%rsp), %rdi
2055	call	ECP_Sm2MulMont
2056
2057	movq	448(%rsp), %rax
2058	leaq	448(%rsp), %rbx
2059	movq	32(%rsp), %r9
2060	movq	40(%rsp), %r10
2061	leaq	32(%rsp), %rsi
2062	movq	48(%rsp), %r11
2063	movq	56(%rsp), %r12
2064	leaq	256(%rsp), %rdi
2065	call	ECP_Sm2MulMont
2066
2067	movq	416(%rsp), %rax
2068	leaq	416(%rsp), %rbx
2069	movq	224(%rsp), %r9
2070	movq	232(%rsp), %r10
2071	leaq	224(%rsp), %rsi
2072	movq	240(%rsp), %r11
2073	movq	248(%rsp), %r12
2074	leaq	224(%rsp), %rdi
2075	call	ECP_Sm2MulMont
2076
2077	movq	512(%rsp), %rax
2078	leaq	512(%rsp), %rbx
2079	movq	256(%rsp), %r9
2080	movq	264(%rsp), %r10
2081	leaq	256(%rsp), %rsi
2082	movq	272(%rsp), %r11
2083	movq	280(%rsp), %r12
2084	leaq	256(%rsp), %rdi
2085	call	ECP_Sm2MulMont
2086
2087	leaq	224(%rsp), %rbx
2088	leaq	64(%rsp), %rdi
2089	call	ECP_Sm2SubAB
2090
2091	orq		%r13, %r12
2092	vmovdqa	%xmm4, %xmm2
2093	orq		%r8, %r12
2094	orq		%r9, %r12
2095	vpor	%xmm5, %xmm2, %xmm2
2096	vmovq	%r12, %xmm3
2097
2098	movq	384(%rsp), %rax
2099	leaq	384(%rsp), %rbx
2100	movq	96(%rsp), %r9
2101	movq	104(%rsp), %r10
2102	leaq	96(%rsp), %rsi
2103	movq	112(%rsp), %r11
2104	movq	120(%rsp), %r12
2105	leaq	160(%rsp), %rdi
2106	call	ECP_Sm2MulMont
2107
2108	movq	480(%rsp), %rax
2109	leaq	480(%rsp), %rbx
2110	movq	32(%rsp), %r9
2111	movq	40(%rsp), %r10
2112	leaq	32(%rsp), %rsi
2113	movq	48(%rsp), %r11
2114	movq	56(%rsp), %r12
2115	leaq	192(%rsp), %rdi
2116	call	ECP_Sm2MulMont
2117
2118	leaq	160(%rsp), %rbx
2119	leaq	0(%rsp), %rdi
2120	call	ECP_Sm2SubAB
2121
2122	orq		%r13, %r12
2123	orq		%r8, %r12
2124	orq		%r9, %r12
2125
2126	vmovq	%xmm2, %r8
2127	vmovq	%xmm3, %r9
2128
2129	orq		%r8, %r12
2130	orq		%r9, %r12
2131	jnz		.Lpoint_add
2132
2133.Ladd_double:
2134	vmovq	%xmm1, %rsi
2135	vmovq	%xmm0, %rdi
2136	addq	$416, %rsp
2137	jmp	.Lpoint_double
2138
2139.align	32
2140.Lpoint_add:
2141	movq	64(%rsp), %rax
2142	movq	72(%rsp), %r14
2143	leaq	64(%rsp), %rsi
2144	movq	80(%rsp), %r15
2145	movq	88(%rsp), %r8
2146	leaq	96(%rsp), %rdi
2147	call	ECP_Sm2SqrMont
2148
2149	movq	448(%rsp), %rax
2150	leaq	448(%rsp), %rbx
2151	movq	(%rsp), %r9
2152	movq	8(%rsp), %r10
2153	leaq	(%rsp), %rsi
2154	movq	16(%rsp), %r11
2155	movq	24(%rsp), %r12
2156	leaq	352(%rsp), %rdi
2157	call	ECP_Sm2MulMont
2158
2159	movq	(%rsp), %rax
2160	movq	8(%rsp), %r14
2161	leaq	(%rsp), %rsi
2162	movq	16(%rsp), %r15
2163	movq	24(%rsp), %r8
2164	leaq	32(%rsp), %rdi
2165	call	ECP_Sm2SqrMont
2166
2167	movq	544(%rsp), %rax
2168	leaq	544(%rsp), %rbx
2169	movq	352(%rsp), %r9
2170	movq	360(%rsp), %r10
2171	leaq	352(%rsp), %rsi
2172	movq	368(%rsp), %r11
2173	movq	24+352(%rsp), %r12
2174	leaq	352(%rsp), %rdi
2175	call	ECP_Sm2MulMont
2176
2177	movq	(%rsp), %rax
2178	leaq	(%rsp), %rbx
2179	movq	32(%rsp), %r9
2180	movq	40(%rsp), %r10
2181	leaq	32(%rsp), %rsi
2182	movq	48(%rsp), %r11
2183	movq	56(%rsp), %r12
2184	leaq	128(%rsp), %rdi
2185	call	ECP_Sm2MulMont
2186
2187	movq	160(%rsp), %rax
2188	leaq	160(%rsp), %rbx
2189	movq	32(%rsp), %r9
2190	movq	40(%rsp), %r10
2191	leaq	32(%rsp), %rsi
2192	movq	48(%rsp), %r11
2193	movq	56(%rsp), %r12
2194	leaq	192(%rsp), %rdi
2195	call	ECP_Sm2MulMont
2196
2197	leaq	96(%rsp), %rsi
2198	movq	$0, %r11
2199	addq	%r12, %r12
2200	adcq	%r13, %r13
2201	movq	%r12, %rcx
2202	adcq	%r8, %r8
2203	adcq	%r9, %r9
2204	movq	%r13, %rbp
2205	adcq	$0, %r11
2206
2207	subq	$-1, %r12
2208	movq	%r8, %rax
2209	sbbq	%r14, %r13
2210	sbbq	$-1, %r8
2211	movq	%r9, %r10
2212	sbbq	%r15, %r9
2213	sbbq	$0, %r11
2214
2215	cmovcq	%rcx, %r12
2216	movq	(%rsi), %rcx
2217	cmovcq	%rbp, %r13
2218	movq	8(%rsi), %rbp
2219	cmovcq	%rax, %r8
2220	movq	16(%rsi), %rax
2221	cmovcq	%r10, %r9
2222	movq	24(%rsi), %r10
2223
2224	call	ECP_Sm2SubBA
2225
2226	leaq	128(%rsp), %rbx
2227	leaq	288(%rsp), %rdi
2228	call	ECP_Sm2SubAB
2229
2230	movq	192(%rsp), %rcx
2231	movq	200(%rsp), %rbp
2232	movq	208(%rsp), %rax
2233	movq	216(%rsp), %r10
2234	leaq	320(%rsp), %rdi
2235
2236	call	ECP_Sm2SubBA
2237
2238	movq	%r12,(%rdi)
2239	movq	%r13,8(%rdi)
2240	movq	%r8,16(%rdi)
2241	movq	%r9,24(%rdi)
2242
2243	movq	128(%rsp), %rax
2244	leaq	128(%rsp), %rbx
2245	movq	224(%rsp), %r9
2246	movq	232(%rsp), %r10
2247	leaq	224(%rsp), %rsi
2248	movq	240(%rsp), %r11
2249	movq	248(%rsp), %r12
2250	leaq	256(%rsp), %rdi
2251	call	ECP_Sm2MulMont
2252
2253	movq	320(%rsp), %rax
2254	leaq	320(%rsp), %rbx
2255	movq	64(%rsp), %r9
2256	movq	72(%rsp), %r10
2257	leaq	64(%rsp), %rsi
2258	movq	80(%rsp), %r11
2259	movq	88(%rsp), %r12
2260	leaq	320(%rsp), %rdi
2261	call	ECP_Sm2MulMont
2262
2263	leaq	256(%rsp), %rbx
2264	leaq	320(%rsp), %rdi
2265	call	ECP_Sm2SubAB
2266
2267	vmovq	%xmm0, %rdi
2268	vmovdqa	%xmm5, %xmm0
2269	vmovdqa	%xmm5, %xmm1
2270	vpandn	352(%rsp), %xmm0, %xmm0
2271	vmovdqa	%xmm5, %xmm2
2272	vpandn	368(%rsp), %xmm1, %xmm1
2273	vmovdqa	%xmm5, %xmm3
2274	vpand	544(%rsp), %xmm2, %xmm2
2275	vpand	560(%rsp), %xmm3, %xmm3
2276	vpor	%xmm0, %xmm2, %xmm2
2277	vpor	%xmm1, %xmm3, %xmm3
2278
2279	vmovdqa	%xmm4, %xmm0
2280	vmovdqa	%xmm4, %xmm1
2281	vpandn	%xmm2, %xmm0, %xmm0
2282	vmovdqa	%xmm4, %xmm2
2283	vpandn	%xmm3, %xmm1, %xmm1
2284	vmovdqa	%xmm4, %xmm3
2285	vpand	448(%rsp), %xmm2, %xmm2
2286	vpand	464(%rsp), %xmm3, %xmm3
2287	vpor	%xmm0, %xmm2, %xmm2
2288	vpor	%xmm1, %xmm3, %xmm3
2289	vmovdqu	%xmm2,64(%rdi)
2290	vmovdqu	%xmm3,80(%rdi)
2291
2292	vmovdqa	%xmm5, %xmm0
2293	vmovdqa	%xmm5, %xmm1
2294	vpandn	288(%rsp), %xmm0, %xmm0
2295	vmovdqa	%xmm5, %xmm2
2296	vpandn	304(%rsp), %xmm1, %xmm1
2297	vmovdqa	%xmm5, %xmm3
2298	vpand	480(%rsp), %xmm2, %xmm2
2299	vpand	496(%rsp), %xmm3, %xmm3
2300	vpor	%xmm0, %xmm2, %xmm2
2301	vpor	%xmm1, %xmm3, %xmm3
2302
2303	vmovdqa	%xmm4, %xmm0
2304	vmovdqa	%xmm4, %xmm1
2305	vpandn	%xmm2, %xmm0, %xmm0
2306	vmovdqa	%xmm4, %xmm2
2307	vpandn	%xmm3, %xmm1, %xmm1
2308	vmovdqa	%xmm4, %xmm3
2309	vpand	384(%rsp), %xmm2, %xmm2
2310	vpand	400(%rsp), %xmm3, %xmm3
2311	vpor	%xmm0, %xmm2, %xmm2
2312	vpor	%xmm1, %xmm3, %xmm3
2313	vmovdqu	%xmm2,(%rdi)
2314	vmovdqu	%xmm3,16(%rdi)
2315
2316	vmovdqa	%xmm5, %xmm0
2317	vmovdqa	%xmm5, %xmm1
2318	vpandn	320(%rsp), %xmm0, %xmm0
2319	vmovdqa	%xmm5, %xmm2
2320	vpandn	336(%rsp), %xmm1, %xmm1
2321	vmovdqa	%xmm5, %xmm3
2322	vpand	512(%rsp), %xmm2, %xmm2
2323	vpand	528(%rsp), %xmm3, %xmm3
2324	vpor	%xmm0, %xmm2, %xmm2
2325	vpor	%xmm1, %xmm3, %xmm3
2326
2327	vmovdqa	%xmm4, %xmm0
2328	vmovdqa	%xmm4, %xmm1
2329	vpandn	%xmm2, %xmm0, %xmm0
2330	vmovdqa	%xmm4, %xmm2
2331	vpandn	%xmm3, %xmm1, %xmm1
2332	vmovdqa	%xmm4, %xmm3
2333	vpand	416(%rsp), %xmm2, %xmm2
2334	vpand	432(%rsp), %xmm3, %xmm3
2335	vpor	%xmm0, %xmm2, %xmm2
2336	vpor	%xmm1, %xmm3, %xmm3
2337	vmovdqu	%xmm2,32(%rdi)
2338	vmovdqu	%xmm3,48(%rdi)
2339
2340.Ladd_done:
2341	leaq	584(%rsp), %rsp
2342	REGISTER_POP
2343	ret
2344.size	ECP_Sm2PointAddMont,.-ECP_Sm2PointAddMont
2345
2346# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-madd-2007-bl
2347# Computation:
2348#     Z1Z1 = Z1^2
2349#     U2 = X2*Z1Z1
2350#     S2 = Y2*Z1*Z1Z1
2351#     H = U2-X1
2352#     HH = H^2
2353#     I = 4*HH
2354#     J = H*I
2355#     r = 2*(S2-Y1)
2356#     V = X1*I
2357#     X3 = r^2-J-2*V
2358#     Y3 = r*(V-X3)-2*Y1*J
2359#     Z3 = (Z1+H)^2-Z1Z1-HH
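	# void ECP_Sm2PointAddAffineMont(uint64_t r[12], const uint64_t a[12], const uint64_t b[8])
	# (prototype inferred from register usage; the C-level point type is not visible here)
	# Mixed addition r = a + b of a Jacobian point a and an affine point b (x, y only),
	# all coordinates in Montgomery form.
	# r		%rdi
	# a		%rsi
	# b		%rdx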
2360.globl	ECP_Sm2PointAddAffineMont
2361.type	ECP_Sm2PointAddAffineMont,@function
2362.align	32
2363ECP_Sm2PointAddAffineMont:
2364	REGISTER_SAVE
2365	subq	$488, %rsp
2366	vmovdqu	(%rsi), %xmm0
2367	vmovdqu	16(%rsi), %xmm1
2368	vmovdqu	32(%rsi), %xmm2
2369	vmovdqu	48(%rsi), %xmm3
2370	vmovdqu	64(%rsi), %xmm4
2371	vmovdqu	80(%rsi), %xmm5
2372	movq	%rdx, %rbx
2373	movq	64(%rsi), %rax
2374	movq	72(%rsi), %r14
2375	movq	80(%rsi), %r15
2376	movq	88(%rsi), %r8
2377
2378	vmovdqa	%xmm0,320(%rsp)
2379	vmovdqa	%xmm1,336(%rsp)
2380	vmovdqa	%xmm2,352(%rsp)
2381	vmovdqa	%xmm3,368(%rsp)
2382	vmovdqa	%xmm4,384(%rsp)
2383	vmovdqa	%xmm5,400(%rsp)
2384	vpor	%xmm4, %xmm5, %xmm5
2385
2386	vmovdqu	(%rbx), %xmm0
2387	vpshufd	$0xb1, %xmm5, %xmm3
2388	vmovdqu	16(%rbx), %xmm1
2389	vmovdqu	32(%rbx), %xmm2
2390	vpor	%xmm3, %xmm5, %xmm5
2391	vmovdqu	48(%rbx), %xmm3
2392	vmovdqa	%xmm0, 416(%rsp)
2393	vpshufd	$0x1e, %xmm5, %xmm4
2394	vmovdqa	%xmm1, 416+16(%rsp)
2395	vpor	%xmm0, %xmm1, %xmm1
2396
2397	vmovq	%rdi, %xmm0
2398	vmovdqa	%xmm2, 448(%rsp)
2399	vmovdqa	%xmm3, 464(%rsp)
2400	vpor	%xmm2, %xmm3, %xmm3
2401	vpor	%xmm4, %xmm5, %xmm5
2402	vpxor	%xmm4, %xmm4, %xmm4
2403	vpor	%xmm1, %xmm3, %xmm3
2404
2405	leaq	64(%rsi), %rsi
2406	leaq	32(%rsp), %rdi
2407	call	ECP_Sm2SqrMont
2408
2409	vpcmpeqd	%xmm4, %xmm5, %xmm5
2410	vpshufd		$0xb1, %xmm3, %xmm4
2411	vpor		%xmm3, %xmm4, %xmm4
2412	vpshufd		$0, %xmm5, %xmm5
2413	vpshufd		$0x1e, %xmm4, %xmm3
2414	vpor		%xmm3, %xmm4, %xmm4
2415	vpxor		%xmm3, %xmm3, %xmm3
2416	vpcmpeqd	%xmm3, %xmm4, %xmm4
2417	vpshufd		$0, %xmm4, %xmm4
2418
2419	movq        (%rbx), %rax
2420	movq		%r12, %r9
2421	movq		%r13, %r10
2422	movq		%r14, %r11
2423
2424	leaq	32(%rsp), %rsi
2425	movq	%r15, %r12
2426	leaq	(%rsp), %rdi
2427	call	ECP_Sm2MulMont
2428
2429	leaq	320(%rsp), %rbx
2430	leaq	64(%rsp), %rdi
2431	call	ECP_Sm2SubAB
2432
2433	movq	384(%rsp), %rax
2434	leaq	384(%rsp), %rbx
2435	movq	32(%rsp), %r9
2436	movq	40(%rsp), %r10
2437	leaq	32(%rsp), %rsi
2438	movq	48(%rsp), %r11
2439	movq	56(%rsp), %r12
2440	leaq	32(%rsp), %rdi
2441	call	ECP_Sm2MulMont
2442
2443	movq	384(%rsp), %rax
2444	leaq	384(%rsp), %rbx
2445	movq	64(%rsp), %r9
2446	movq	72(%rsp), %r10
2447	leaq	64(%rsp), %rsi
2448	movq	80(%rsp), %r11
2449	movq	88(%rsp), %r12
2450	leaq	288(%rsp), %rdi
2451	call	ECP_Sm2MulMont
2452
2453	movq	448(%rsp), %rax
2454	leaq	448(%rsp), %rbx
2455	movq	32(%rsp), %r9
2456	movq	40(%rsp), %r10
2457	leaq	32(%rsp), %rsi
2458	movq	48(%rsp), %r11
2459	movq	56(%rsp), %r12
2460	leaq	32(%rsp), %rdi
2461	call	ECP_Sm2MulMont
2462
2463	leaq	352(%rsp), %rbx
2464	leaq	96(%rsp), %rdi
2465	call	ECP_Sm2SubAB
2466
2467	movq	64(%rsp), %rax
2468	movq	72(%rsp), %r14
2469	leaq	64(%rsp), %rsi
2470	movq	80(%rsp), %r15
2471	movq	88(%rsp), %r8
2472	leaq	128(%rsp), %rdi
2473	call	ECP_Sm2SqrMont
2474
2475	movq	96(%rsp), %rax
2476	movq	104(%rsp), %r14
2477	leaq	96(%rsp), %rsi
2478	movq	112(%rsp), %r15
2479	movq	120(%rsp), %r8
2480	leaq	192(%rsp), %rdi
2481	call	ECP_Sm2SqrMont
2482
2483	movq	128(%rsp), %rax
2484	leaq	128(%rsp), %rbx
2485	movq	64(%rsp), %r9
2486	movq	72(%rsp), %r10
2487	leaq	64(%rsp), %rsi
2488	movq	80(%rsp), %r11
2489	movq	88(%rsp), %r12
2490	leaq	160(%rsp), %rdi
2491	call	ECP_Sm2MulMont
2492
2493	movq	320(%rsp), %rax
2494	leaq	320(%rsp), %rbx
2495	movq	128(%rsp), %r9
2496	movq	136(%rsp), %r10
2497	leaq	128(%rsp), %rsi
2498	movq	144(%rsp), %r11
2499	movq	152(%rsp), %r12
2500	leaq	(%rsp), %rdi
2501	call	ECP_Sm2MulMont
2502
2503	leaq	192(%rsp), %rsi
2504	movq	$0, %r11
2505	addq	%r12, %r12
2506	adcq	%r13, %r13
2507	movq	%r12, %rcx
2508	adcq	%r8, %r8
2509	adcq	%r9, %r9
2510	movq	%r13, %rbp
2511	adcq	$0, %r11
2512
2513	subq	$-1, %r12
2514	movq	%r8, %rax
2515	sbbq	%r14, %r13
2516	sbbq	$-1, %r8
2517	movq	%r9, %r10
2518	sbbq	%r15, %r9
2519	sbbq	$0, %r11
2520
2521	cmovcq	%rcx, %r12
2522	movq	(%rsi), %rcx
2523	cmovcq	%rbp, %r13
2524	movq	8(%rsi), %rbp
2525	cmovcq	%rax, %r8
2526	movq	16(%rsi), %rax
2527	cmovcq	%r10, %r9
2528	movq	24(%rsi), %r10
2529
2530	call	ECP_Sm2SubBA
2531
2532	leaq	160(%rsp), %rbx
2533	leaq	224(%rsp), %rdi
2534	call	ECP_Sm2SubAB
2535
2536	movq	(%rsp), %rcx
2537	movq	8(%rsp), %rbp
2538	movq	16(%rsp), %rax
2539	movq	24(%rsp), %r10
2540	leaq	64(%rsp), %rdi
2541
2542	call	ECP_Sm2SubBA
2543
2544	movq	%r12,(%rdi)
2545	movq	%r13,8(%rdi)
2546	movq	%r8,16(%rdi)
2547	movq	%r9,24(%rdi)
2548
2549	movq	352(%rsp), %rax
2550	leaq	352(%rsp), %rbx
2551	movq	160(%rsp), %r9
2552	movq	168(%rsp), %r10
2553	leaq	160(%rsp), %rsi
2554	movq	176(%rsp), %r11
2555	movq	184(%rsp), %r12
2556	leaq	32(%rsp), %rdi
2557	call	ECP_Sm2MulMont
2558
2559	movq	96(%rsp), %rax
2560	leaq	96(%rsp), %rbx
2561	movq	64(%rsp), %r9
2562	movq	72(%rsp), %r10
2563	leaq	64(%rsp), %rsi
2564	movq	80(%rsp), %r11
2565	movq	88(%rsp), %r12
2566	leaq	64(%rsp), %rdi
2567	call	ECP_Sm2MulMont
2568
2569	leaq	32(%rsp), %rbx
2570	leaq	256(%rsp), %rdi
2571	call	ECP_Sm2SubAB
2572
2573	vmovq	%xmm0, %rdi
2574	vmovdqa	%xmm5, %xmm0
2575	vmovdqa	%xmm5, %xmm1
2576	vpandn	288(%rsp), %xmm0, %xmm0
2577	vmovdqa	%xmm5, %xmm2
2578	vpandn	304(%rsp), %xmm1, %xmm1
2579	vmovdqa	%xmm5, %xmm3
2580	vpand	.Lone_mont(%rip), %xmm2, %xmm2
2581	vpand	.Lone_mont+16(%rip), %xmm3, %xmm3
2582	vpor	%xmm0, %xmm2, %xmm2
2583	vpor	%xmm1, %xmm3, %xmm3
2584
2585	vmovdqa	%xmm4, %xmm0
2586	vmovdqa	%xmm4, %xmm1
2587	vpandn	%xmm2, %xmm0, %xmm0
2588	vmovdqa	%xmm4, %xmm2
2589	vpandn	%xmm3, %xmm1, %xmm1
2590	vmovdqa	%xmm4, %xmm3
2591	vpand	384(%rsp), %xmm2, %xmm2
2592	vpand	400(%rsp), %xmm3, %xmm3
2593	vpor	%xmm0, %xmm2, %xmm2
2594	vpor	%xmm1, %xmm3, %xmm3
2595	vmovdqu	%xmm2,64(%rdi)
2596	vmovdqu	%xmm3,80(%rdi)
2597
2598	vmovdqa	%xmm5, %xmm0
2599	vmovdqa	%xmm5, %xmm1
2600	vpandn	224(%rsp), %xmm0, %xmm0
2601	vmovdqa	%xmm5, %xmm2
2602	vpandn	240(%rsp), %xmm1, %xmm1
2603	vmovdqa	%xmm5, %xmm3
2604	vpand	416(%rsp), %xmm2, %xmm2
2605	vpand	432(%rsp), %xmm3, %xmm3
2606	vpor	%xmm0, %xmm2, %xmm2
2607	vpor	%xmm1, %xmm3, %xmm3
2608
2609	vmovdqa	%xmm4, %xmm0
2610	vmovdqa	%xmm4, %xmm1
2611	vpandn	%xmm2, %xmm0, %xmm0
2612	vmovdqa	%xmm4, %xmm2
2613	vpandn	%xmm3, %xmm1, %xmm1
2614	vmovdqa	%xmm4, %xmm3
2615	vpand	320(%rsp), %xmm2, %xmm2
2616	vpand	336(%rsp), %xmm3, %xmm3
2617	vpor	%xmm0, %xmm2, %xmm2
2618	vpor	%xmm1, %xmm3, %xmm3
2619	vmovdqu	%xmm2,(%rdi)
2620	vmovdqu	%xmm3,16(%rdi)
2621
2622	vmovdqa	%xmm5, %xmm0
2623	vmovdqa	%xmm5, %xmm1
2624	vpandn	256(%rsp), %xmm0, %xmm0
2625	vmovdqa	%xmm5, %xmm2
2626	vpandn	272(%rsp), %xmm1, %xmm1
2627	vmovdqa	%xmm5, %xmm3
2628	vpand	448(%rsp), %xmm2, %xmm2
2629	vpand	464(%rsp), %xmm3, %xmm3
2630	vpor	%xmm0, %xmm2, %xmm2
2631	vpor	%xmm1, %xmm3, %xmm3
2632
2633	vmovdqa	%xmm4, %xmm0
2634	vmovdqa	%xmm4, %xmm1
2635	vpandn	%xmm2, %xmm0, %xmm0
2636	vmovdqa	%xmm4, %xmm2
2637	vpandn	%xmm3, %xmm1, %xmm1
2638	vmovdqa	%xmm4, %xmm3
2639	vpand	352(%rsp), %xmm2, %xmm2
2640	vpand	368(%rsp), %xmm3, %xmm3
2641	vpor	%xmm0, %xmm2, %xmm2
2642	vpor	%xmm1, %xmm3, %xmm3
2643	vmovdqu	%xmm2,32(%rdi)
2644	vmovdqu	%xmm3,48(%rdi)
2645
2646	leaq	488(%rsp), %rsp
2647	REGISTER_POP
2648	ret
2649.size	ECP_Sm2PointAddAffineMont,.-ECP_Sm2PointAddAffineMont
2650#endif