1#! /usr/bin/env perl
2# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
4# Copyright (c) 2015 CloudFlare, Inc.
5#
6# Licensed under the OpenSSL license (the "License").  You may not use
7# this file except in compliance with the License.  You can obtain a copy
8# in the file LICENSE in the source distribution or at
9# https://www.openssl.org/source/license.html
10#
11# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
12# (1) Intel Corporation, Israel Development Center, Haifa, Israel
13# (2) University of Haifa, Israel
14# (3) CloudFlare, Inc.
15#
16# Reference:
17# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
18#                          256 Bit Primes"
19
20# Further optimization by <appro@openssl.org>:
21#
22#		this/original	with/without -DECP_NISTZ256_ASM(*)
23# Opteron	+15-49%		+150-195%
24# Bulldozer	+18-45%		+175-240%
25# P4		+24-46%		+100-150%
26# Westmere	+18-34%		+87-160%
27# Sandy Bridge	+14-35%		+120-185%
28# Ivy Bridge	+11-35%		+125-180%
29# Haswell	+10-37%		+160-200%
30# Broadwell	+24-58%		+210-270%
31# Atom		+20-50%		+180-240%
32# VIA Nano	+50-160%	+480-480%
33#
34# (*)	"without -DECP_NISTZ256_ASM" refers to build with
35#	"enable-ec_nistp_64_gcc_128";
36#
37# Ranges denote minimum and maximum improvement coefficients depending on
38# the benchmark. In the "this/original" column the lower coefficient is for
39# ECDSA sign; in the "with/without" column the lower is for ECDH key agreement
40# and the higher for ECDSA sign, the relatively fastest server-side operation.
41# Keep in mind that +100% means 2x improvement.
42
43$flavour = shift;
44$output  = shift;
45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52die "can't locate x86_64-xlate.pl";
53
54open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
55*STDOUT=*OUT;
56
57$avx = 2;
58$addx = 1;
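# $avx gates emission of the AVX2 select routines and $addx the BMI2/ADX
# (mulx/adcx/adox) paths; the generated code still checks OPENSSL_ia32cap_P
# at run time before taking either path.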
59
60$code.=<<___;
61.text
62.extern	OPENSSL_ia32cap_P
63
64# The P-256 prime, p = 2^256 - 2^224 + 2^192 + 2^96 - 1 (little-endian 64-bit limbs)
65.section .rodata
66.align 64
67.Lpoly:
68.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
69
70.LOne:
71.long 1,1,1,1,1,1,1,1
72.LTwo:
73.long 2,2,2,2,2,2,2,2
74.LThree:
75.long 3,3,3,3,3,3,3,3
76.LONE_mont:
77.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
78
79# Constants for computations modulo ord(p256)
80.Lord:
81.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
82.LordK:
83.quad 0xccd1c8aaee00bc4f
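# .Lord holds the group order n of P-256 (little-endian 64-bit limbs) and
# .LordK is the Montgomery constant k0 = -n^-1 mod 2^64 used by the
# ord_mul/ord_sqr reduction steps below.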
84.text
85___
86
87{
88my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
89my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
90my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
91
92$code.=<<___;
93
94################################################################################
95# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
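#
# Computes res = -a mod p in constant time: 0 - a is formed first, p is
# added back, and the pre-add value is kept when a was zero.  A rough C
# sketch of the same idea (illustrative only; the helper name is made up,
# <stdint.h> and unsigned __int128 are assumed, and the ternary stands in
# for the branch-free cmovz used below):
#
#	void neg_ref(uint64_t res[4], const uint64_t a[4]) {
#		static const uint64_t p[4] = { 0xffffffffffffffffULL,
#			0x00000000ffffffffULL, 0, 0xffffffff00000001ULL };
#		uint64_t t[4], r[4], borrow = 0, carry = 0;
#		for (int i = 0; i < 4; i++) {		/* t = 0 - a */
#			unsigned __int128 d = -(unsigned __int128)a[i] - borrow;
#			t[i] = (uint64_t)d; borrow = (uint64_t)(d >> 64) & 1;
#		}
#		for (int i = 0; i < 4; i++) {		/* r = t + p */
#			unsigned __int128 s = (unsigned __int128)t[i] + p[i] + carry;
#			r[i] = (uint64_t)s; carry = (uint64_t)(s >> 64);
#		}
#		for (int i = 0; i < 4; i++) res[i] = borrow ? r[i] : t[i];
#	}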
96.globl	ecp_nistz256_neg
97.type	ecp_nistz256_neg,\@function,2
98.align	32
99ecp_nistz256_neg:
100.cfi_startproc
101	_CET_ENDBR
102	push	%r12
103.cfi_push	%r12
104	push	%r13
105.cfi_push	%r13
106.Lneg_body:
107
108	xor	$a0, $a0
109	xor	$a1, $a1
110	xor	$a2, $a2
111	xor	$a3, $a3
112	xor	$t4, $t4
113
114	sub	8*0($a_ptr), $a0
115	sbb	8*1($a_ptr), $a1
116	sbb	8*2($a_ptr), $a2
117	 mov	$a0, $t0
118	sbb	8*3($a_ptr), $a3
119	lea	.Lpoly(%rip), $a_ptr
120	 mov	$a1, $t1
121	sbb	\$0, $t4
122
123	add	8*0($a_ptr), $a0
124	 mov	$a2, $t2
125	adc	8*1($a_ptr), $a1
126	adc	8*2($a_ptr), $a2
127	 mov	$a3, $t3
128	adc	8*3($a_ptr), $a3
129	test	$t4, $t4
130
131	cmovz	$t0, $a0
132	cmovz	$t1, $a1
133	mov	$a0, 8*0($r_ptr)
134	cmovz	$t2, $a2
135	mov	$a1, 8*1($r_ptr)
136	cmovz	$t3, $a3
137	mov	$a2, 8*2($r_ptr)
138	mov	$a3, 8*3($r_ptr)
139
140	mov	0(%rsp),%r13
141.cfi_restore	%r13
142	mov	8(%rsp),%r12
143.cfi_restore	%r12
144	lea	16(%rsp),%rsp
145.cfi_adjust_cfa_offset	-16
146.Lneg_epilogue:
147	ret
148.cfi_endproc
149.size	ecp_nistz256_neg,.-ecp_nistz256_neg
150___
151}
152{
153my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
154my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
155my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
156my ($poly1,$poly3)=($acc6,$acc7);
157
158$code.=<<___;
159################################################################################
160# void ecp_nistz256_ord_mul_mont(
161#   uint64_t res[4],
162#   uint64_t a[4],
163#   uint64_t b[4]);
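#
# Computes res = a * b * 2^-256 mod n, where n is the group order (.Lord);
# with a and b in the Montgomery domain (x stored as x*2^256 mod n) this is
# plain multiplication in that domain.  Each of the four passes below
# interleaves a schoolbook multiplication by one limb of b with one
# word-wise Montgomery reduction step:
#
#	m    = acc0 * k0 mod 2^64	(k0 = .LordK = -n^-1 mod 2^64)
#	acc += m * n			(the low limb of acc becomes zero)
#	acc >>= 64
#
# so the result ends up in four limbs plus a carry word and a single
# conditional subtraction of n at the end completes the reduction.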
164
165.globl	ecp_nistz256_ord_mul_mont
166.type	ecp_nistz256_ord_mul_mont,\@function,3
167.align	32
168ecp_nistz256_ord_mul_mont:
169.cfi_startproc
170	_CET_ENDBR
171___
172$code.=<<___	if ($addx);
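	# OPENSSL_ia32cap_P[2]: bit 8 is BMI2, bit 19 is ADX; take the
	# mulx/adcx/adox path only when both are present.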
173	leaq	OPENSSL_ia32cap_P(%rip), %rcx
174	mov	8(%rcx), %rcx
175	and	\$0x80100, %ecx
176	cmp	\$0x80100, %ecx
177	je	.Lecp_nistz256_ord_mul_montx
178___
179$code.=<<___;
180	push	%rbp
181.cfi_push	%rbp
182	push	%rbx
183.cfi_push	%rbx
184	push	%r12
185.cfi_push	%r12
186	push	%r13
187.cfi_push	%r13
188	push	%r14
189.cfi_push	%r14
190	push	%r15
191.cfi_push	%r15
192.Lord_mul_body:
193
194	mov	8*0($b_org), %rax
195	mov	$b_org, $b_ptr
196	lea	.Lord(%rip), %r14
197	mov	.LordK(%rip), %r15
198
199	################################# * b[0]
200	mov	%rax, $t0
201	mulq	8*0($a_ptr)
202	mov	%rax, $acc0
203	mov	$t0, %rax
204	mov	%rdx, $acc1
205
206	mulq	8*1($a_ptr)
207	add	%rax, $acc1
208	mov	$t0, %rax
209	adc	\$0, %rdx
210	mov	%rdx, $acc2
211
212	mulq	8*2($a_ptr)
213	add	%rax, $acc2
214	mov	$t0, %rax
215	adc	\$0, %rdx
216
217	 mov	$acc0, $acc5
218	 imulq	%r15,$acc0
219
220	mov	%rdx, $acc3
221	mulq	8*3($a_ptr)
222	add	%rax, $acc3
223	 mov	$acc0, %rax
224	adc	\$0, %rdx
225	mov	%rdx, $acc4
226
227	################################# First reduction step
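	# The imulq above replaced acc0 with m = acc0*k0 mod 2^64 (the original
	# low limb was parked in acc5); adding m*n now cancels that low limb.
	# Since n[2] = 2^64-1 and n[3] = 2^64-2^32, the m*n[2] and m*n[3] terms
	# are folded in with shifts, adds and subtracts instead of two more
	# mulq instructions.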
228	mulq	8*0(%r14)
229	mov	$acc0, $t1
230	add	%rax, $acc5		# guaranteed to be zero
231	mov	$acc0, %rax
232	adc	\$0, %rdx
233	mov	%rdx, $t0
234
235	sub	$acc0, $acc2
236	sbb	\$0, $acc0		# can't borrow
237
238	mulq	8*1(%r14)
239	add	$t0, $acc1
240	adc	\$0, %rdx
241	add	%rax, $acc1
242	mov	$t1, %rax
243	adc	%rdx, $acc2
244	mov	$t1, %rdx
245	adc	\$0, $acc0		# can't overflow
246
247	shl	\$32, %rax
248	shr	\$32, %rdx
249	sub	%rax, $acc3
250	 mov	8*1($b_ptr), %rax
251	sbb	%rdx, $t1		# can't borrow
252
253	add	$acc0, $acc3
254	adc	$t1, $acc4
255	adc	\$0, $acc5
256
257	################################# * b[1]
258	mov	%rax, $t0
259	mulq	8*0($a_ptr)
260	add	%rax, $acc1
261	mov	$t0, %rax
262	adc	\$0, %rdx
263	mov	%rdx, $t1
264
265	mulq	8*1($a_ptr)
266	add	$t1, $acc2
267	adc	\$0, %rdx
268	add	%rax, $acc2
269	mov	$t0, %rax
270	adc	\$0, %rdx
271	mov	%rdx, $t1
272
273	mulq	8*2($a_ptr)
274	add	$t1, $acc3
275	adc	\$0, %rdx
276	add	%rax, $acc3
277	mov	$t0, %rax
278	adc	\$0, %rdx
279
280	 mov	$acc1, $t0
281	 imulq	%r15, $acc1
282
283	mov	%rdx, $t1
284	mulq	8*3($a_ptr)
285	add	$t1, $acc4
286	adc	\$0, %rdx
287	xor	$acc0, $acc0
288	add	%rax, $acc4
289	 mov	$acc1, %rax
290	adc	%rdx, $acc5
291	adc	\$0, $acc0
292
293	################################# Second reduction step
294	mulq	8*0(%r14)
295	mov	$acc1, $t1
296	add	%rax, $t0		# guaranteed to be zero
297	mov	$acc1, %rax
298	adc	%rdx, $t0
299
300	sub	$acc1, $acc3
301	sbb	\$0, $acc1		# can't borrow
302
303	mulq	8*1(%r14)
304	add	$t0, $acc2
305	adc	\$0, %rdx
306	add	%rax, $acc2
307	mov	$t1, %rax
308	adc	%rdx, $acc3
309	mov	$t1, %rdx
310	adc	\$0, $acc1		# can't overflow
311
312	shl	\$32, %rax
313	shr	\$32, %rdx
314	sub	%rax, $acc4
315	 mov	8*2($b_ptr), %rax
316	sbb	%rdx, $t1		# can't borrow
317
318	add	$acc1, $acc4
319	adc	$t1, $acc5
320	adc	\$0, $acc0
321
322	################################## * b[2]
323	mov	%rax, $t0
324	mulq	8*0($a_ptr)
325	add	%rax, $acc2
326	mov	$t0, %rax
327	adc	\$0, %rdx
328	mov	%rdx, $t1
329
330	mulq	8*1($a_ptr)
331	add	$t1, $acc3
332	adc	\$0, %rdx
333	add	%rax, $acc3
334	mov	$t0, %rax
335	adc	\$0, %rdx
336	mov	%rdx, $t1
337
338	mulq	8*2($a_ptr)
339	add	$t1, $acc4
340	adc	\$0, %rdx
341	add	%rax, $acc4
342	mov	$t0, %rax
343	adc	\$0, %rdx
344
345	 mov	$acc2, $t0
346	 imulq	%r15, $acc2
347
348	mov	%rdx, $t1
349	mulq	8*3($a_ptr)
350	add	$t1, $acc5
351	adc	\$0, %rdx
352	xor	$acc1, $acc1
353	add	%rax, $acc5
354	 mov	$acc2, %rax
355	adc	%rdx, $acc0
356	adc	\$0, $acc1
357
358	################################# Third reduction step
359	mulq	8*0(%r14)
360	mov	$acc2, $t1
361	add	%rax, $t0		# guaranteed to be zero
362	mov	$acc2, %rax
363	adc	%rdx, $t0
364
365	sub	$acc2, $acc4
366	sbb	\$0, $acc2		# can't borrow
367
368	mulq	8*1(%r14)
369	add	$t0, $acc3
370	adc	\$0, %rdx
371	add	%rax, $acc3
372	mov	$t1, %rax
373	adc	%rdx, $acc4
374	mov	$t1, %rdx
375	adc	\$0, $acc2		# can't overflow
376
377	shl	\$32, %rax
378	shr	\$32, %rdx
379	sub	%rax, $acc5
380	 mov	8*3($b_ptr), %rax
381	sbb	%rdx, $t1		# can't borrow
382
383	add	$acc2, $acc5
384	adc	$t1, $acc0
385	adc	\$0, $acc1
386
387	################################# * b[3]
388	mov	%rax, $t0
389	mulq	8*0($a_ptr)
390	add	%rax, $acc3
391	mov	$t0, %rax
392	adc	\$0, %rdx
393	mov	%rdx, $t1
394
395	mulq	8*1($a_ptr)
396	add	$t1, $acc4
397	adc	\$0, %rdx
398	add	%rax, $acc4
399	mov	$t0, %rax
400	adc	\$0, %rdx
401	mov	%rdx, $t1
402
403	mulq	8*2($a_ptr)
404	add	$t1, $acc5
405	adc	\$0, %rdx
406	add	%rax, $acc5
407	mov	$t0, %rax
408	adc	\$0, %rdx
409
410	 mov	$acc3, $t0
411	 imulq	%r15, $acc3
412
413	mov	%rdx, $t1
414	mulq	8*3($a_ptr)
415	add	$t1, $acc0
416	adc	\$0, %rdx
417	xor	$acc2, $acc2
418	add	%rax, $acc0
419	 mov	$acc3, %rax
420	adc	%rdx, $acc1
421	adc	\$0, $acc2
422
423	################################# Last reduction step
424	mulq	8*0(%r14)
425	mov	$acc3, $t1
426	add	%rax, $t0		# guaranteed to be zero
427	mov	$acc3, %rax
428	adc	%rdx, $t0
429
430	sub	$acc3, $acc5
431	sbb	\$0, $acc3		# can't borrow
432
433	mulq	8*1(%r14)
434	add	$t0, $acc4
435	adc	\$0, %rdx
436	add	%rax, $acc4
437	mov	$t1, %rax
438	adc	%rdx, $acc5
439	mov	$t1, %rdx
440	adc	\$0, $acc3		# can't overflow
441
442	shl	\$32, %rax
443	shr	\$32, %rdx
444	sub	%rax, $acc0
445	sbb	%rdx, $t1		# can't borrow
446
447	add	$acc3, $acc0
448	adc	$t1, $acc1
449	adc	\$0, $acc2
450
451	################################# Subtract ord
452	 mov	$acc4, $a_ptr
453	sub	8*0(%r14), $acc4
454	 mov	$acc5, $acc3
455	sbb	8*1(%r14), $acc5
456	 mov	$acc0, $t0
457	sbb	8*2(%r14), $acc0
458	 mov	$acc1, $t1
459	sbb	8*3(%r14), $acc1
460	sbb	\$0, $acc2
461
462	cmovc	$a_ptr, $acc4
463	cmovc	$acc3, $acc5
464	cmovc	$t0, $acc0
465	cmovc	$t1, $acc1
466
467	mov	$acc4, 8*0($r_ptr)
468	mov	$acc5, 8*1($r_ptr)
469	mov	$acc0, 8*2($r_ptr)
470	mov	$acc1, 8*3($r_ptr)
471
472	mov	0(%rsp),%r15
473.cfi_restore	%r15
474	mov	8(%rsp),%r14
475.cfi_restore	%r14
476	mov	16(%rsp),%r13
477.cfi_restore	%r13
478	mov	24(%rsp),%r12
479.cfi_restore	%r12
480	mov	32(%rsp),%rbx
481.cfi_restore	%rbx
482	mov	40(%rsp),%rbp
483.cfi_restore	%rbp
484	lea	48(%rsp),%rsp
485.cfi_adjust_cfa_offset	-48
486.Lord_mul_epilogue:
487	ret
488.cfi_endproc
489.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
490
491################################################################################
492# void ecp_nistz256_ord_sqr_mont(
493#   uint64_t res[4],
494#   uint64_t a[4],
495#   uint64_t rep);
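#
# Performs rep Montgomery squarings in a row modulo the group order, i.e.
# res = a^(2^rep) in the Montgomery domain.  This is typically used by the
# scalar-inversion square chains; for example rep = 4 squares the input
# four times before the next multiplication.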
496
497.globl	ecp_nistz256_ord_sqr_mont
498.type	ecp_nistz256_ord_sqr_mont,\@function,3
499.align	32
500ecp_nistz256_ord_sqr_mont:
501.cfi_startproc
502	_CET_ENDBR
503___
504$code.=<<___	if ($addx);
505	leaq	OPENSSL_ia32cap_P(%rip), %rcx
506	mov	8(%rcx), %rcx
507	and	\$0x80100, %ecx
508	cmp	\$0x80100, %ecx
509	je	.Lecp_nistz256_ord_sqr_montx
510___
511$code.=<<___;
512	push	%rbp
513.cfi_push	%rbp
514	push	%rbx
515.cfi_push	%rbx
516	push	%r12
517.cfi_push	%r12
518	push	%r13
519.cfi_push	%r13
520	push	%r14
521.cfi_push	%r14
522	push	%r15
523.cfi_push	%r15
524.Lord_sqr_body:
525
526	mov	8*0($a_ptr), $acc0
527	mov	8*1($a_ptr), %rax
528	mov	8*2($a_ptr), $acc6
529	mov	8*3($a_ptr), $acc7
530	lea	.Lord(%rip), $a_ptr	# pointer to modulus
531	mov	$b_org, $b_ptr
532	jmp	.Loop_ord_sqr
533
534.align	32
535.Loop_ord_sqr:
536	################################# a[1:] * a[0]
537	mov	%rax, $t1		# put aside a[1]
538	mul	$acc0			# a[1] * a[0]
539	mov	%rax, $acc1
540	movq	$t1, %xmm1		# offload a[1]
541	mov	$acc6, %rax
542	mov	%rdx, $acc2
543
544	mul	$acc0			# a[2] * a[0]
545	add	%rax, $acc2
546	mov	$acc7, %rax
547	movq	$acc6, %xmm2		# offload a[2]
548	adc	\$0, %rdx
549	mov	%rdx, $acc3
550
551	mul	$acc0			# a[3] * a[0]
552	add	%rax, $acc3
553	mov	$acc7, %rax
554	movq	$acc7, %xmm3		# offload a[3]
555	adc	\$0, %rdx
556	mov	%rdx, $acc4
557
558	################################# a[3] * a[2]
559	mul	$acc6			# a[3] * a[2]
560	mov	%rax, $acc5
561	mov	$acc6, %rax
562	mov	%rdx, $acc6
563
564	################################# a[2:] * a[1]
565	mul	$t1			# a[2] * a[1]
566	add	%rax, $acc3
567	mov	$acc7, %rax
568	adc	\$0, %rdx
569	mov	%rdx, $acc7
570
571	mul	$t1			# a[3] * a[1]
572	add	%rax, $acc4
573	adc	\$0, %rdx
574
575	add	$acc7, $acc4
576	adc	%rdx, $acc5
577	adc	\$0, $acc6		# can't overflow
578
579	################################# *2
580	xor	$acc7, $acc7
581	mov	$acc0, %rax
582	add	$acc1, $acc1
583	adc	$acc2, $acc2
584	adc	$acc3, $acc3
585	adc	$acc4, $acc4
586	adc	$acc5, $acc5
587	adc	$acc6, $acc6
588	adc	\$0, $acc7
589
590	################################# Missing products
591	mul	%rax			# a[0] * a[0]
592	mov	%rax, $acc0
593	movq	%xmm1, %rax
594	mov	%rdx, $t1
595
596	mul	%rax			# a[1] * a[1]
597	add	$t1, $acc1
598	adc	%rax, $acc2
599	movq	%xmm2, %rax
600	adc	\$0, %rdx
601	mov	%rdx, $t1
602
603	mul	%rax			# a[2] * a[2]
604	add	$t1, $acc3
605	adc	%rax, $acc4
606	movq	%xmm3, %rax
607	adc	\$0, %rdx
608	mov	%rdx, $t1
609
610	 mov	$acc0, $t0
611	 imulq	8*4($a_ptr), $acc0	# *= .LordK
612
613	mul	%rax			# a[3] * a[3]
614	add	$t1, $acc5
615	adc	%rax, $acc6
616	 mov	8*0($a_ptr), %rax	# modulus[0]
617	adc	%rdx, $acc7		# can't overflow
618
619	################################# First reduction step
620	mul	$acc0
621	mov	$acc0, $t1
622	add	%rax, $t0		# guaranteed to be zero
623	mov	8*1($a_ptr), %rax	# modulus[1]
624	adc	%rdx, $t0
625
626	sub	$acc0, $acc2
627	sbb	\$0, $t1		# can't borrow
628
629	mul	$acc0
630	add	$t0, $acc1
631	adc	\$0, %rdx
632	add	%rax, $acc1
633	mov	$acc0, %rax
634	adc	%rdx, $acc2
635	mov	$acc0, %rdx
636	adc	\$0, $t1		# can't overflow
637
638	 mov	$acc1, $t0
639	 imulq	8*4($a_ptr), $acc1	# *= .LordK
640
641	shl	\$32, %rax
642	shr	\$32, %rdx
643	sub	%rax, $acc3
644	 mov	8*0($a_ptr), %rax
645	sbb	%rdx, $acc0		# can't borrow
646
647	add	$t1, $acc3
648	adc	\$0, $acc0		# can't overflow
649
650	################################# Second reduction step
651	mul	$acc1
652	mov	$acc1, $t1
653	add	%rax, $t0		# guaranteed to be zero
654	mov	8*1($a_ptr), %rax
655	adc	%rdx, $t0
656
657	sub	$acc1, $acc3
658	sbb	\$0, $t1		# can't borrow
659
660	mul	$acc1
661	add	$t0, $acc2
662	adc	\$0, %rdx
663	add	%rax, $acc2
664	mov	$acc1, %rax
665	adc	%rdx, $acc3
666	mov	$acc1, %rdx
667	adc	\$0, $t1		# can't overflow
668
669	 mov	$acc2, $t0
670	 imulq	8*4($a_ptr), $acc2	# *= .LordK
671
672	shl	\$32, %rax
673	shr	\$32, %rdx
674	sub	%rax, $acc0
675	 mov	8*0($a_ptr), %rax
676	sbb	%rdx, $acc1		# can't borrow
677
678	add	$t1, $acc0
679	adc	\$0, $acc1		# can't overflow
680
681	################################# Third reduction step
682	mul	$acc2
683	mov	$acc2, $t1
684	add	%rax, $t0		# guaranteed to be zero
685	mov	8*1($a_ptr), %rax
686	adc	%rdx, $t0
687
688	sub	$acc2, $acc0
689	sbb	\$0, $t1		# can't borrow
690
691	mul	$acc2
692	add	$t0, $acc3
693	adc	\$0, %rdx
694	add	%rax, $acc3
695	mov	$acc2, %rax
696	adc	%rdx, $acc0
697	mov	$acc2, %rdx
698	adc	\$0, $t1		# can't overflow
699
700	 mov	$acc3, $t0
701	 imulq	8*4($a_ptr), $acc3	# *= .LordK
702
703	shl	\$32, %rax
704	shr	\$32, %rdx
705	sub	%rax, $acc1
706	 mov	8*0($a_ptr), %rax
707	sbb	%rdx, $acc2		# can't borrow
708
709	add	$t1, $acc1
710	adc	\$0, $acc2		# can't overflow
711
712	################################# Last reduction step
713	mul	$acc3
714	mov	$acc3, $t1
715	add	%rax, $t0		# guaranteed to be zero
716	mov	8*1($a_ptr), %rax
717	adc	%rdx, $t0
718
719	sub	$acc3, $acc1
720	sbb	\$0, $t1		# can't borrow
721
722	mul	$acc3
723	add	$t0, $acc0
724	adc	\$0, %rdx
725	add	%rax, $acc0
726	mov	$acc3, %rax
727	adc	%rdx, $acc1
728	mov	$acc3, %rdx
729	adc	\$0, $t1		# can't overflow
730
731	shl	\$32, %rax
732	shr	\$32, %rdx
733	sub	%rax, $acc2
734	sbb	%rdx, $acc3		# can't borrow
735
736	add	$t1, $acc2
737	adc	\$0, $acc3		# can't overflow
738
739	################################# Add bits [511:256] of the sqr result
740	xor	%rdx, %rdx
741	add	$acc4, $acc0
742	adc	$acc5, $acc1
743	 mov	$acc0, $acc4
744	adc	$acc6, $acc2
745	adc	$acc7, $acc3
746	 mov	$acc1, %rax
747	adc	\$0, %rdx
748
749	################################# Compare to modulus
750	sub	8*0($a_ptr), $acc0
751	 mov	$acc2, $acc6
752	sbb	8*1($a_ptr), $acc1
753	sbb	8*2($a_ptr), $acc2
754	 mov	$acc3, $acc7
755	sbb	8*3($a_ptr), $acc3
756	sbb	\$0, %rdx
757
758	cmovc	$acc4, $acc0
759	cmovnc	$acc1, %rax
760	cmovnc	$acc2, $acc6
761	cmovnc	$acc3, $acc7
762
763	dec	$b_ptr
764	jnz	.Loop_ord_sqr
765
766	mov	$acc0, 8*0($r_ptr)
767	mov	%rax,  8*1($r_ptr)
768	pxor	%xmm1, %xmm1
769	mov	$acc6, 8*2($r_ptr)
770	pxor	%xmm2, %xmm2
771	mov	$acc7, 8*3($r_ptr)
772	pxor	%xmm3, %xmm3
773
774	mov	0(%rsp),%r15
775.cfi_restore	%r15
776	mov	8(%rsp),%r14
777.cfi_restore	%r14
778	mov	16(%rsp),%r13
779.cfi_restore	%r13
780	mov	24(%rsp),%r12
781.cfi_restore	%r12
782	mov	32(%rsp),%rbx
783.cfi_restore	%rbx
784	mov	40(%rsp),%rbp
785.cfi_restore	%rbp
786	lea	48(%rsp),%rsp
787.cfi_adjust_cfa_offset	-48
788.Lord_sqr_epilogue:
789	ret
790.cfi_endproc
791.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
792___
793
794$code.=<<___	if ($addx);
795################################################################################
796.type	ecp_nistz256_ord_mul_montx,\@function,3
797.align	32
798ecp_nistz256_ord_mul_montx:
799.cfi_startproc
800.Lecp_nistz256_ord_mul_montx:
801	push	%rbp
802.cfi_push	%rbp
803	push	%rbx
804.cfi_push	%rbx
805	push	%r12
806.cfi_push	%r12
807	push	%r13
808.cfi_push	%r13
809	push	%r14
810.cfi_push	%r14
811	push	%r15
812.cfi_push	%r15
813.Lord_mulx_body:
814
815	mov	$b_org, $b_ptr
816	mov	8*0($b_org), %rdx
817	mov	8*0($a_ptr), $acc1
818	mov	8*1($a_ptr), $acc2
819	mov	8*2($a_ptr), $acc3
820	mov	8*3($a_ptr), $acc4
821	lea	-128($a_ptr), $a_ptr	# control u-op density
822	lea	.Lord-128(%rip), %r14
823	mov	.LordK(%rip), %r15
824
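	# BMI2/ADX flavour: mulx does not touch the flags, and adcx/adox run
	# two independent carry chains (CF and OF), so each multiplication by
	# a limb of b can be interleaved with the matching Montgomery
	# reduction step without saving or re-deriving carries.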
825	################################# Multiply by b[0]
826	mulx	$acc1, $acc0, $acc1
827	mulx	$acc2, $t0, $acc2
828	mulx	$acc3, $t1, $acc3
829	add	$t0, $acc1
830	mulx	$acc4, $t0, $acc4
831	 mov	$acc0, %rdx
832	 mulx	%r15, %rdx, %rax
833	adc	$t1, $acc2
834	adc	$t0, $acc3
835	adc	\$0, $acc4
836
837	################################# reduction
838	xor	$acc5, $acc5		# $acc5=0, cf=0, of=0
839	mulx	8*0+128(%r14), $t0, $t1
840	adcx	$t0, $acc0		# guaranteed to be zero
841	adox	$t1, $acc1
842
843	mulx	8*1+128(%r14), $t0, $t1
844	adcx	$t0, $acc1
845	adox	$t1, $acc2
846
847	mulx	8*2+128(%r14), $t0, $t1
848	adcx	$t0, $acc2
849	adox	$t1, $acc3
850
851	mulx	8*3+128(%r14), $t0, $t1
852	 mov	8*1($b_ptr), %rdx
853	adcx	$t0, $acc3
854	adox	$t1, $acc4
855	adcx	$acc0, $acc4
856	adox	$acc0, $acc5
857	adc	\$0, $acc5		# cf=0, of=0
858
859	################################# Multiply by b[1]
860	mulx	8*0+128($a_ptr), $t0, $t1
861	adcx	$t0, $acc1
862	adox	$t1, $acc2
863
864	mulx	8*1+128($a_ptr), $t0, $t1
865	adcx	$t0, $acc2
866	adox	$t1, $acc3
867
868	mulx	8*2+128($a_ptr), $t0, $t1
869	adcx	$t0, $acc3
870	adox	$t1, $acc4
871
872	mulx	8*3+128($a_ptr), $t0, $t1
873	 mov	$acc1, %rdx
874	 mulx	%r15, %rdx, %rax
875	adcx	$t0, $acc4
876	adox	$t1, $acc5
877
878	adcx	$acc0, $acc5
879	adox	$acc0, $acc0
880	adc	\$0, $acc0		# cf=0, of=0
881
882	################################# reduction
883	mulx	8*0+128(%r14), $t0, $t1
884	adcx	$t0, $acc1		# guaranteed to be zero
885	adox	$t1, $acc2
886
887	mulx	8*1+128(%r14), $t0, $t1
888	adcx	$t0, $acc2
889	adox	$t1, $acc3
890
891	mulx	8*2+128(%r14), $t0, $t1
892	adcx	$t0, $acc3
893	adox	$t1, $acc4
894
895	mulx	8*3+128(%r14), $t0, $t1
896	 mov	8*2($b_ptr), %rdx
897	adcx	$t0, $acc4
898	adox	$t1, $acc5
899	adcx	$acc1, $acc5
900	adox	$acc1, $acc0
901	adc	\$0, $acc0		# cf=0, of=0
902
903	################################# Multiply by b[2]
904	mulx	8*0+128($a_ptr), $t0, $t1
905	adcx	$t0, $acc2
906	adox	$t1, $acc3
907
908	mulx	8*1+128($a_ptr), $t0, $t1
909	adcx	$t0, $acc3
910	adox	$t1, $acc4
911
912	mulx	8*2+128($a_ptr), $t0, $t1
913	adcx	$t0, $acc4
914	adox	$t1, $acc5
915
916	mulx	8*3+128($a_ptr), $t0, $t1
917	 mov	$acc2, %rdx
918	 mulx	%r15, %rdx, %rax
919	adcx	$t0, $acc5
920	adox	$t1, $acc0
921
922	adcx	$acc1, $acc0
923	adox	$acc1, $acc1
924	adc	\$0, $acc1		# cf=0, of=0
925
926	################################# reduction
927	mulx	8*0+128(%r14), $t0, $t1
928	adcx	$t0, $acc2		# guaranteed to be zero
929	adox	$t1, $acc3
930
931	mulx	8*1+128(%r14), $t0, $t1
932	adcx	$t0, $acc3
933	adox	$t1, $acc4
934
935	mulx	8*2+128(%r14), $t0, $t1
936	adcx	$t0, $acc4
937	adox	$t1, $acc5
938
939	mulx	8*3+128(%r14), $t0, $t1
940	 mov	8*3($b_ptr), %rdx
941	adcx	$t0, $acc5
942	adox	$t1, $acc0
943	adcx	$acc2, $acc0
944	adox	$acc2, $acc1
945	adc	\$0, $acc1		# cf=0, of=0
946
947	################################# Multiply by b[3]
948	mulx	8*0+128($a_ptr), $t0, $t1
949	adcx	$t0, $acc3
950	adox	$t1, $acc4
951
952	mulx	8*1+128($a_ptr), $t0, $t1
953	adcx	$t0, $acc4
954	adox	$t1, $acc5
955
956	mulx	8*2+128($a_ptr), $t0, $t1
957	adcx	$t0, $acc5
958	adox	$t1, $acc0
959
960	mulx	8*3+128($a_ptr), $t0, $t1
961	 mov	$acc3, %rdx
962	 mulx	%r15, %rdx, %rax
963	adcx	$t0, $acc0
964	adox	$t1, $acc1
965
966	adcx	$acc2, $acc1
967	adox	$acc2, $acc2
968	adc	\$0, $acc2		# cf=0, of=0
969
970	################################# reduction
971	mulx	8*0+128(%r14), $t0, $t1
972	adcx	$t0, $acc3		# guaranteed to be zero
973	adox	$t1, $acc4
974
975	mulx	8*1+128(%r14), $t0, $t1
976	adcx	$t0, $acc4
977	adox	$t1, $acc5
978
979	mulx	8*2+128(%r14), $t0, $t1
980	adcx	$t0, $acc5
981	adox	$t1, $acc0
982
983	mulx	8*3+128(%r14), $t0, $t1
984	lea	128(%r14),%r14
985	 mov	$acc4, $t2
986	adcx	$t0, $acc0
987	adox	$t1, $acc1
988	 mov	$acc5, $t3
989	adcx	$acc3, $acc1
990	adox	$acc3, $acc2
991	adc	\$0, $acc2
992
993	#################################
994	# Branch-less conditional subtraction of the group order
995	 mov	$acc0, $t0
996	sub	8*0(%r14), $acc4
997	sbb	8*1(%r14), $acc5
998	sbb	8*2(%r14), $acc0
999	 mov	$acc1, $t1
1000	sbb	8*3(%r14), $acc1
1001	sbb	\$0, $acc2
1002
1003	cmovc	$t2, $acc4
1004	cmovc	$t3, $acc5
1005	cmovc	$t0, $acc0
1006	cmovc	$t1, $acc1
1007
1008	mov	$acc4, 8*0($r_ptr)
1009	mov	$acc5, 8*1($r_ptr)
1010	mov	$acc0, 8*2($r_ptr)
1011	mov	$acc1, 8*3($r_ptr)
1012
1013	mov	0(%rsp),%r15
1014.cfi_restore	%r15
1015	mov	8(%rsp),%r14
1016.cfi_restore	%r14
1017	mov	16(%rsp),%r13
1018.cfi_restore	%r13
1019	mov	24(%rsp),%r12
1020.cfi_restore	%r12
1021	mov	32(%rsp),%rbx
1022.cfi_restore	%rbx
1023	mov	40(%rsp),%rbp
1024.cfi_restore	%rbp
1025	lea	48(%rsp),%rsp
1026.cfi_adjust_cfa_offset	-48
1027.Lord_mulx_epilogue:
1028	ret
1029.cfi_endproc
1030.size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
1031
1032.type	ecp_nistz256_ord_sqr_montx,\@function,3
1033.align	32
1034ecp_nistz256_ord_sqr_montx:
1035.cfi_startproc
1036.Lecp_nistz256_ord_sqr_montx:
1037	push	%rbp
1038.cfi_push	%rbp
1039	push	%rbx
1040.cfi_push	%rbx
1041	push	%r12
1042.cfi_push	%r12
1043	push	%r13
1044.cfi_push	%r13
1045	push	%r14
1046.cfi_push	%r14
1047	push	%r15
1048.cfi_push	%r15
1049.Lord_sqrx_body:
1050
1051	mov	$b_org, $b_ptr
1052	mov	8*0($a_ptr), %rdx
1053	mov	8*1($a_ptr), $acc6
1054	mov	8*2($a_ptr), $acc7
1055	mov	8*3($a_ptr), $acc0
1056	lea	.Lord(%rip), $a_ptr
1057	jmp	.Loop_ord_sqrx
1058
1059.align	32
1060.Loop_ord_sqrx:
1061	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
1062	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
1063	 mov	%rdx, %rax		# offload a[0]
1064	 movq	$acc6, %xmm1		# offload a[1]
1065	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
1066	 mov	$acc6, %rdx
1067	add	$t0, $acc2
1068	 movq	$acc7, %xmm2		# offload a[2]
1069	adc	$t1, $acc3
1070	adc	\$0, $acc4
1071	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
1072	#################################
1073	mulx	$acc7, $t0, $t1		# a[1]*a[2]
1074	adcx	$t0, $acc3
1075	adox	$t1, $acc4
1076
1077	mulx	$acc0, $t0, $t1		# a[1]*a[3]
1078	 mov	$acc7, %rdx
1079	adcx	$t0, $acc4
1080	adox	$t1, $acc5
1081	adc	\$0, $acc5
1082	#################################
1083	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
1084	mov	%rax, %rdx
1085	 movq	$acc0, %xmm3		# offload a[3]
1086	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
1087	 adcx	$acc1, $acc1		# acc1:6<<1
1088	adox	$t0, $acc5
1089	 adcx	$acc2, $acc2
1090	adox	$acc7, $acc6		# of=0
1091
1092	################################# a[i]*a[i]
1093	mulx	%rdx, $acc0, $t1
1094	movq	%xmm1, %rdx
1095	 adcx	$acc3, $acc3
1096	adox	$t1, $acc1
1097	 adcx	$acc4, $acc4
1098	mulx	%rdx, $t0, $t4
1099	movq	%xmm2, %rdx
1100	 adcx	$acc5, $acc5
1101	adox	$t0, $acc2
1102	 adcx	$acc6, $acc6
1103	mulx	%rdx, $t0, $t1
1104	.byte	0x67
1105	movq	%xmm3, %rdx
1106	adox	$t4, $acc3
1107	 adcx	$acc7, $acc7
1108	adox	$t0, $acc4
1109	adox	$t1, $acc5
1110	mulx	%rdx, $t0, $t4
1111	adox	$t0, $acc6
1112	adox	$t4, $acc7
1113
1114	################################# reduction
1115	mov	$acc0, %rdx
1116	mulx	8*4($a_ptr), %rdx, $t0
1117
1118	xor	%rax, %rax		# cf=0, of=0
1119	mulx	8*0($a_ptr), $t0, $t1
1120	adcx	$t0, $acc0		# guaranteed to be zero
1121	adox	$t1, $acc1
1122	mulx	8*1($a_ptr), $t0, $t1
1123	adcx	$t0, $acc1
1124	adox	$t1, $acc2
1125	mulx	8*2($a_ptr), $t0, $t1
1126	adcx	$t0, $acc2
1127	adox	$t1, $acc3
1128	mulx	8*3($a_ptr), $t0, $t1
1129	adcx	$t0, $acc3
1130	adox	$t1, $acc0		# of=0
1131	adcx	%rax, $acc0		# cf=0
1132
1133	#################################
1134	mov	$acc1, %rdx
1135	mulx	8*4($a_ptr), %rdx, $t0
1136
1137	mulx	8*0($a_ptr), $t0, $t1
1138	adox	$t0, $acc1		# guaranteed to be zero
1139	adcx	$t1, $acc2
1140	mulx	8*1($a_ptr), $t0, $t1
1141	adox	$t0, $acc2
1142	adcx	$t1, $acc3
1143	mulx	8*2($a_ptr), $t0, $t1
1144	adox	$t0, $acc3
1145	adcx	$t1, $acc0
1146	mulx	8*3($a_ptr), $t0, $t1
1147	adox	$t0, $acc0
1148	adcx	$t1, $acc1		# cf=0
1149	adox	%rax, $acc1		# of=0
1150
1151	#################################
1152	mov	$acc2, %rdx
1153	mulx	8*4($a_ptr), %rdx, $t0
1154
1155	mulx	8*0($a_ptr), $t0, $t1
1156	adcx	$t0, $acc2		# guaranteed to be zero
1157	adox	$t1, $acc3
1158	mulx	8*1($a_ptr), $t0, $t1
1159	adcx	$t0, $acc3
1160	adox	$t1, $acc0
1161	mulx	8*2($a_ptr), $t0, $t1
1162	adcx	$t0, $acc0
1163	adox	$t1, $acc1
1164	mulx	8*3($a_ptr), $t0, $t1
1165	adcx	$t0, $acc1
1166	adox	$t1, $acc2		# of=0
1167	adcx	%rax, $acc2		# cf=0
1168
1169	#################################
1170	mov	$acc3, %rdx
1171	mulx	8*4($a_ptr), %rdx, $t0
1172
1173	mulx	8*0($a_ptr), $t0, $t1
1174	adox	$t0, $acc3		# guaranteed to be zero
1175	adcx	$t1, $acc0
1176	mulx	8*1($a_ptr), $t0, $t1
1177	adox	$t0, $acc0
1178	adcx	$t1, $acc1
1179	mulx	8*2($a_ptr), $t0, $t1
1180	adox	$t0, $acc1
1181	adcx	$t1, $acc2
1182	mulx	8*3($a_ptr), $t0, $t1
1183	adox	$t0, $acc2
1184	adcx	$t1, $acc3
1185	adox	%rax, $acc3
1186
1187	################################# accumulate upper half
1188	add	$acc0, $acc4		# add	$acc4, $acc0
1189	adc	$acc5, $acc1
1190	 mov	$acc4, %rdx
1191	adc	$acc6, $acc2
1192	adc	$acc7, $acc3
1193	 mov	$acc1, $acc6
1194	adc	\$0, %rax
1195
1196	################################# compare to modulus
1197	sub	8*0($a_ptr), $acc4
1198	 mov	$acc2, $acc7
1199	sbb	8*1($a_ptr), $acc1
1200	sbb	8*2($a_ptr), $acc2
1201	 mov	$acc3, $acc0
1202	sbb	8*3($a_ptr), $acc3
1203	sbb	\$0, %rax
1204
1205	cmovnc	$acc4, %rdx
1206	cmovnc	$acc1, $acc6
1207	cmovnc	$acc2, $acc7
1208	cmovnc	$acc3, $acc0
1209
1210	dec	$b_ptr
1211	jnz	.Loop_ord_sqrx
1212
1213	mov	%rdx, 8*0($r_ptr)
1214	mov	$acc6, 8*1($r_ptr)
1215	pxor	%xmm1, %xmm1
1216	mov	$acc7, 8*2($r_ptr)
1217	pxor	%xmm2, %xmm2
1218	mov	$acc0, 8*3($r_ptr)
1219	pxor	%xmm3, %xmm3
1220
1221	mov	0(%rsp),%r15
1222.cfi_restore	%r15
1223	mov	8(%rsp),%r14
1224.cfi_restore	%r14
1225	mov	16(%rsp),%r13
1226.cfi_restore	%r13
1227	mov	24(%rsp),%r12
1228.cfi_restore	%r12
1229	mov	32(%rsp),%rbx
1230.cfi_restore	%rbx
1231	mov	40(%rsp),%rbp
1232.cfi_restore	%rbp
1233	lea	48(%rsp),%rsp
1234.cfi_adjust_cfa_offset	-48
1235.Lord_sqrx_epilogue:
1236	ret
1237.cfi_endproc
1238.size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
1239___
1240
1241$code.=<<___;
1242################################################################################
1243# void ecp_nistz256_mul_mont(
1244#   uint64_t res[4],
1245#   uint64_t a[4],
1246#   uint64_t b[4]);
1247
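#
# Computes res = a * b * 2^-256 mod p; for operands in the Montgomery
# domain (x stored as x*2^256 mod p) this gives the product in the same
# domain.  Because p = -1 mod 2^64, the Montgomery constant -p^-1 mod 2^64
# is simply 1, so the reduction multiplier is the low accumulator limb
# itself and no analogue of .LordK is needed here.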
1248.globl	ecp_nistz256_mul_mont
1249.type	ecp_nistz256_mul_mont,\@function,3
1250.align	32
1251ecp_nistz256_mul_mont:
1252.cfi_startproc
1253	_CET_ENDBR
1254___
1255$code.=<<___	if ($addx);
1256	leaq	OPENSSL_ia32cap_P(%rip), %rcx
1257	mov	8(%rcx), %rcx
1258	and	\$0x80100, %ecx
1259___
1260$code.=<<___;
1261.Lmul_mont:
1262	push	%rbp
1263.cfi_push	%rbp
1264	push	%rbx
1265.cfi_push	%rbx
1266	push	%r12
1267.cfi_push	%r12
1268	push	%r13
1269.cfi_push	%r13
1270	push	%r14
1271.cfi_push	%r14
1272	push	%r15
1273.cfi_push	%r15
1274.Lmul_body:
1275___
1276$code.=<<___	if ($addx);
1277	cmp	\$0x80100, %ecx
1278	je	.Lmul_montx
1279___
1280$code.=<<___;
1281	mov	$b_org, $b_ptr
1282	mov	8*0($b_org), %rax
1283	mov	8*0($a_ptr), $acc1
1284	mov	8*1($a_ptr), $acc2
1285	mov	8*2($a_ptr), $acc3
1286	mov	8*3($a_ptr), $acc4
1287
1288	call	__ecp_nistz256_mul_montq
1289___
1290$code.=<<___	if ($addx);
1291	jmp	.Lmul_mont_done
1292
1293.align	32
1294.Lmul_montx:
1295	mov	$b_org, $b_ptr
1296	mov	8*0($b_org), %rdx
1297	mov	8*0($a_ptr), $acc1
1298	mov	8*1($a_ptr), $acc2
1299	mov	8*2($a_ptr), $acc3
1300	mov	8*3($a_ptr), $acc4
1301	lea	-128($a_ptr), $a_ptr	# control u-op density
1302
1303	call	__ecp_nistz256_mul_montx
1304___
1305$code.=<<___;
1306.Lmul_mont_done:
1307	mov	0(%rsp),%r15
1308.cfi_restore	%r15
1309	mov	8(%rsp),%r14
1310.cfi_restore	%r14
1311	mov	16(%rsp),%r13
1312.cfi_restore	%r13
1313	mov	24(%rsp),%r12
1314.cfi_restore	%r12
1315	mov	32(%rsp),%rbx
1316.cfi_restore	%rbx
1317	mov	40(%rsp),%rbp
1318.cfi_restore	%rbp
1319	lea	48(%rsp),%rsp
1320.cfi_adjust_cfa_offset	-48
1321.Lmul_epilogue:
1322	ret
1323.cfi_endproc
1324.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
1325
1326.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
1327.align	32
1328__ecp_nistz256_mul_montq:
1329.cfi_startproc
1330	########################################################################
1331	# Multiply a by b[0]
1332	mov	%rax, $t1
1333	mulq	$acc1
1334	mov	.Lpoly+8*1(%rip),$poly1
1335	mov	%rax, $acc0
1336	mov	$t1, %rax
1337	mov	%rdx, $acc1
1338
1339	mulq	$acc2
1340	mov	.Lpoly+8*3(%rip),$poly3
1341	add	%rax, $acc1
1342	mov	$t1, %rax
1343	adc	\$0, %rdx
1344	mov	%rdx, $acc2
1345
1346	mulq	$acc3
1347	add	%rax, $acc2
1348	mov	$t1, %rax
1349	adc	\$0, %rdx
1350	mov	%rdx, $acc3
1351
1352	mulq	$acc4
1353	add	%rax, $acc3
1354	 mov	$acc0, %rax
1355	adc	\$0, %rdx
1356	xor	$acc5, $acc5
1357	mov	%rdx, $acc4
1358
1359	########################################################################
1360	# First reduction step
1361	# Basically now we want to multiply acc[0] by p256,
1362	# and add the result to the acc.
1363	# Due to the special form of p256 we do some optimizations
1364	#
1365	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
1366	# then we add acc[0] and get acc[0] x 2^96
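	# (The reduction multiplier is acc[0] itself because -p^-1 mod 2^64 = 1.
	#  With p = 2^256 - 2^224 + 2^192 + 2^96 - 1,
	#  acc[0] x p = acc[0] x 2^256 - acc[0] x 2^224 + acc[0] x 2^192
	#             + acc[0] x 2^96 - acc[0];
	#  the 2^96 term is added via the two 32-bit shifts below and the three
	#  upper terms via the single mulq against .Lpoly[3].)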
1367
1368	mov	$acc0, $t1
1369	shl	\$32, $acc0
1370	mulq	$poly3
1371	shr	\$32, $t1
1372	add	$acc0, $acc1		# +=acc[0]<<96
1373	adc	$t1, $acc2
1374	adc	%rax, $acc3
1375	 mov	8*1($b_ptr), %rax
1376	adc	%rdx, $acc4
1377	adc	\$0, $acc5
1378	xor	$acc0, $acc0
1379
1380	########################################################################
1381	# Multiply by b[1]
1382	mov	%rax, $t1
1383	mulq	8*0($a_ptr)
1384	add	%rax, $acc1
1385	mov	$t1, %rax
1386	adc	\$0, %rdx
1387	mov	%rdx, $t0
1388
1389	mulq	8*1($a_ptr)
1390	add	$t0, $acc2
1391	adc	\$0, %rdx
1392	add	%rax, $acc2
1393	mov	$t1, %rax
1394	adc	\$0, %rdx
1395	mov	%rdx, $t0
1396
1397	mulq	8*2($a_ptr)
1398	add	$t0, $acc3
1399	adc	\$0, %rdx
1400	add	%rax, $acc3
1401	mov	$t1, %rax
1402	adc	\$0, %rdx
1403	mov	%rdx, $t0
1404
1405	mulq	8*3($a_ptr)
1406	add	$t0, $acc4
1407	adc	\$0, %rdx
1408	add	%rax, $acc4
1409	 mov	$acc1, %rax
1410	adc	%rdx, $acc5
1411	adc	\$0, $acc0
1412
1413	########################################################################
1414	# Second reduction step
1415	mov	$acc1, $t1
1416	shl	\$32, $acc1
1417	mulq	$poly3
1418	shr	\$32, $t1
1419	add	$acc1, $acc2
1420	adc	$t1, $acc3
1421	adc	%rax, $acc4
1422	 mov	8*2($b_ptr), %rax
1423	adc	%rdx, $acc5
1424	adc	\$0, $acc0
1425	xor	$acc1, $acc1
1426
1427	########################################################################
1428	# Multiply by b[2]
1429	mov	%rax, $t1
1430	mulq	8*0($a_ptr)
1431	add	%rax, $acc2
1432	mov	$t1, %rax
1433	adc	\$0, %rdx
1434	mov	%rdx, $t0
1435
1436	mulq	8*1($a_ptr)
1437	add	$t0, $acc3
1438	adc	\$0, %rdx
1439	add	%rax, $acc3
1440	mov	$t1, %rax
1441	adc	\$0, %rdx
1442	mov	%rdx, $t0
1443
1444	mulq	8*2($a_ptr)
1445	add	$t0, $acc4
1446	adc	\$0, %rdx
1447	add	%rax, $acc4
1448	mov	$t1, %rax
1449	adc	\$0, %rdx
1450	mov	%rdx, $t0
1451
1452	mulq	8*3($a_ptr)
1453	add	$t0, $acc5
1454	adc	\$0, %rdx
1455	add	%rax, $acc5
1456	 mov	$acc2, %rax
1457	adc	%rdx, $acc0
1458	adc	\$0, $acc1
1459
1460	########################################################################
1461	# Third reduction step
1462	mov	$acc2, $t1
1463	shl	\$32, $acc2
1464	mulq	$poly3
1465	shr	\$32, $t1
1466	add	$acc2, $acc3
1467	adc	$t1, $acc4
1468	adc	%rax, $acc5
1469	 mov	8*3($b_ptr), %rax
1470	adc	%rdx, $acc0
1471	adc	\$0, $acc1
1472	xor	$acc2, $acc2
1473
1474	########################################################################
1475	# Multiply by b[3]
1476	mov	%rax, $t1
1477	mulq	8*0($a_ptr)
1478	add	%rax, $acc3
1479	mov	$t1, %rax
1480	adc	\$0, %rdx
1481	mov	%rdx, $t0
1482
1483	mulq	8*1($a_ptr)
1484	add	$t0, $acc4
1485	adc	\$0, %rdx
1486	add	%rax, $acc4
1487	mov	$t1, %rax
1488	adc	\$0, %rdx
1489	mov	%rdx, $t0
1490
1491	mulq	8*2($a_ptr)
1492	add	$t0, $acc5
1493	adc	\$0, %rdx
1494	add	%rax, $acc5
1495	mov	$t1, %rax
1496	adc	\$0, %rdx
1497	mov	%rdx, $t0
1498
1499	mulq	8*3($a_ptr)
1500	add	$t0, $acc0
1501	adc	\$0, %rdx
1502	add	%rax, $acc0
1503	 mov	$acc3, %rax
1504	adc	%rdx, $acc1
1505	adc	\$0, $acc2
1506
1507	########################################################################
1508	# Final reduction step
1509	mov	$acc3, $t1
1510	shl	\$32, $acc3
1511	mulq	$poly3
1512	shr	\$32, $t1
1513	add	$acc3, $acc4
1514	adc	$t1, $acc5
1515	 mov	$acc4, $t0
1516	adc	%rax, $acc0
1517	adc	%rdx, $acc1
1518	 mov	$acc5, $t1
1519	adc	\$0, $acc2
1520
1521	########################################################################
1522	# Branch-less conditional subtraction of P
1523	sub	\$-1, $acc4		# .Lpoly[0]
1524	 mov	$acc0, $t2
1525	sbb	$poly1, $acc5		# .Lpoly[1]
1526	sbb	\$0, $acc0		# .Lpoly[2]
1527	 mov	$acc1, $t3
1528	sbb	$poly3, $acc1		# .Lpoly[3]
1529	sbb	\$0, $acc2
1530
1531	cmovc	$t0, $acc4
1532	cmovc	$t1, $acc5
1533	mov	$acc4, 8*0($r_ptr)
1534	cmovc	$t2, $acc0
1535	mov	$acc5, 8*1($r_ptr)
1536	cmovc	$t3, $acc1
1537	mov	$acc0, 8*2($r_ptr)
1538	mov	$acc1, 8*3($r_ptr)
1539
1540	ret
1541.cfi_endproc
1542.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
1543
1544################################################################################
1545# void ecp_nistz256_sqr_mont(
1546#   uint64_t res[4],
1547#   uint64_t a[4]);
1548
1549# we optimize the square according to S.Gueron and V.Krasnov,
1550# "Speeding up Big-Number Squaring"
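# i.e. for a = a0 + a1*2^64 + a2*2^128 + a3*2^192 only the six cross
# products ai*aj with i < j are computed, doubled in a single carry chain,
# and the four squares ai*ai are then added on the diagonal; this needs
# about half the single-precision multiplications of a full mul_mont
# (10 instead of 16).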
1551.globl	ecp_nistz256_sqr_mont
1552.type	ecp_nistz256_sqr_mont,\@function,2
1553.align	32
1554ecp_nistz256_sqr_mont:
1555.cfi_startproc
1556	_CET_ENDBR
1557___
1558$code.=<<___	if ($addx);
1559	leaq	OPENSSL_ia32cap_P(%rip), %rcx
1560	mov	8(%rcx), %rcx
1561	and	\$0x80100, %ecx
1562___
1563$code.=<<___;
1564	push	%rbp
1565.cfi_push	%rbp
1566	push	%rbx
1567.cfi_push	%rbx
1568	push	%r12
1569.cfi_push	%r12
1570	push	%r13
1571.cfi_push	%r13
1572	push	%r14
1573.cfi_push	%r14
1574	push	%r15
1575.cfi_push	%r15
1576.Lsqr_body:
1577___
1578$code.=<<___	if ($addx);
1579	cmp	\$0x80100, %ecx
1580	je	.Lsqr_montx
1581___
1582$code.=<<___;
1583	mov	8*0($a_ptr), %rax
1584	mov	8*1($a_ptr), $acc6
1585	mov	8*2($a_ptr), $acc7
1586	mov	8*3($a_ptr), $acc0
1587
1588	call	__ecp_nistz256_sqr_montq
1589___
1590$code.=<<___	if ($addx);
1591	jmp	.Lsqr_mont_done
1592
1593.align	32
1594.Lsqr_montx:
1595	mov	8*0($a_ptr), %rdx
1596	mov	8*1($a_ptr), $acc6
1597	mov	8*2($a_ptr), $acc7
1598	mov	8*3($a_ptr), $acc0
1599	lea	-128($a_ptr), $a_ptr	# control u-op density
1600
1601	call	__ecp_nistz256_sqr_montx
1602___
1603$code.=<<___;
1604.Lsqr_mont_done:
1605	mov	0(%rsp),%r15
1606.cfi_restore	%r15
1607	mov	8(%rsp),%r14
1608.cfi_restore	%r14
1609	mov	16(%rsp),%r13
1610.cfi_restore	%r13
1611	mov	24(%rsp),%r12
1612.cfi_restore	%r12
1613	mov	32(%rsp),%rbx
1614.cfi_restore	%rbx
1615	mov	40(%rsp),%rbp
1616.cfi_restore	%rbp
1617	lea	48(%rsp),%rsp
1618.cfi_adjust_cfa_offset	-48
1619.Lsqr_epilogue:
1620	ret
1621.cfi_endproc
1622.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
1623
1624.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
1625.align	32
1626__ecp_nistz256_sqr_montq:
1627.cfi_startproc
1628	mov	%rax, $acc5
1629	mulq	$acc6			# a[1]*a[0]
1630	mov	%rax, $acc1
1631	mov	$acc7, %rax
1632	mov	%rdx, $acc2
1633
1634	mulq	$acc5			# a[0]*a[2]
1635	add	%rax, $acc2
1636	mov	$acc0, %rax
1637	adc	\$0, %rdx
1638	mov	%rdx, $acc3
1639
1640	mulq	$acc5			# a[0]*a[3]
1641	add	%rax, $acc3
1642	 mov	$acc7, %rax
1643	adc	\$0, %rdx
1644	mov	%rdx, $acc4
1645
1646	#################################
1647	mulq	$acc6			# a[1]*a[2]
1648	add	%rax, $acc3
1649	mov	$acc0, %rax
1650	adc	\$0, %rdx
1651	mov	%rdx, $t1
1652
1653	mulq	$acc6			# a[1]*a[3]
1654	add	%rax, $acc4
1655	 mov	$acc0, %rax
1656	adc	\$0, %rdx
1657	add	$t1, $acc4
1658	mov	%rdx, $acc5
1659	adc	\$0, $acc5
1660
1661	#################################
1662	mulq	$acc7			# a[2]*a[3]
1663	xor	$acc7, $acc7
1664	add	%rax, $acc5
1665	 mov	8*0($a_ptr), %rax
1666	mov	%rdx, $acc6
1667	adc	\$0, $acc6
1668
1669	add	$acc1, $acc1		# acc1:6<<1
1670	adc	$acc2, $acc2
1671	adc	$acc3, $acc3
1672	adc	$acc4, $acc4
1673	adc	$acc5, $acc5
1674	adc	$acc6, $acc6
1675	adc	\$0, $acc7
1676
1677	mulq	%rax
1678	mov	%rax, $acc0
1679	mov	8*1($a_ptr), %rax
1680	mov	%rdx, $t0
1681
1682	mulq	%rax
1683	add	$t0, $acc1
1684	adc	%rax, $acc2
1685	mov	8*2($a_ptr), %rax
1686	adc	\$0, %rdx
1687	mov	%rdx, $t0
1688
1689	mulq	%rax
1690	add	$t0, $acc3
1691	adc	%rax, $acc4
1692	mov	8*3($a_ptr), %rax
1693	adc	\$0, %rdx
1694	mov	%rdx, $t0
1695
1696	mulq	%rax
1697	add	$t0, $acc5
1698	adc	%rax, $acc6
1699	 mov	$acc0, %rax
1700	adc	%rdx, $acc7
1701
1702	mov	.Lpoly+8*1(%rip), $a_ptr
1703	mov	.Lpoly+8*3(%rip), $t1
1704
1705	##########################################
1706	# Now the reduction
1707	# First iteration
1708	mov	$acc0, $t0
1709	shl	\$32, $acc0
1710	mulq	$t1
1711	shr	\$32, $t0
1712	add	$acc0, $acc1		# +=acc[0]<<96
1713	adc	$t0, $acc2
1714	adc	%rax, $acc3
1715	 mov	$acc1, %rax
1716	adc	\$0, %rdx
1717
1718	##########################################
1719	# Second iteration
1720	mov	$acc1, $t0
1721	shl	\$32, $acc1
1722	mov	%rdx, $acc0
1723	mulq	$t1
1724	shr	\$32, $t0
1725	add	$acc1, $acc2
1726	adc	$t0, $acc3
1727	adc	%rax, $acc0
1728	 mov	$acc2, %rax
1729	adc	\$0, %rdx
1730
1731	##########################################
1732	# Third iteration
1733	mov	$acc2, $t0
1734	shl	\$32, $acc2
1735	mov	%rdx, $acc1
1736	mulq	$t1
1737	shr	\$32, $t0
1738	add	$acc2, $acc3
1739	adc	$t0, $acc0
1740	adc	%rax, $acc1
1741	 mov	$acc3, %rax
1742	adc	\$0, %rdx
1743
1744	###########################################
1745	# Last iteration
1746	mov	$acc3, $t0
1747	shl	\$32, $acc3
1748	mov	%rdx, $acc2
1749	mulq	$t1
1750	shr	\$32, $t0
1751	add	$acc3, $acc0
1752	adc	$t0, $acc1
1753	adc	%rax, $acc2
1754	adc	\$0, %rdx
1755	xor	$acc3, $acc3
1756
1757	############################################
1758	# Add the rest of the acc
1759	add	$acc0, $acc4
1760	adc	$acc1, $acc5
1761	 mov	$acc4, $acc0
1762	adc	$acc2, $acc6
1763	adc	%rdx, $acc7
1764	 mov	$acc5, $acc1
1765	adc	\$0, $acc3
1766
1767	sub	\$-1, $acc4		# .Lpoly[0]
1768	 mov	$acc6, $acc2
1769	sbb	$a_ptr, $acc5		# .Lpoly[1]
1770	sbb	\$0, $acc6		# .Lpoly[2]
1771	 mov	$acc7, $t0
1772	sbb	$t1, $acc7		# .Lpoly[3]
1773	sbb	\$0, $acc3
1774
1775	cmovc	$acc0, $acc4
1776	cmovc	$acc1, $acc5
1777	mov	$acc4, 8*0($r_ptr)
1778	cmovc	$acc2, $acc6
1779	mov	$acc5, 8*1($r_ptr)
1780	cmovc	$t0, $acc7
1781	mov	$acc6, 8*2($r_ptr)
1782	mov	$acc7, 8*3($r_ptr)
1783
1784	ret
1785.cfi_endproc
1786.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
1787___
1788
1789if ($addx) {
1790$code.=<<___;
1791.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
1792.align	32
1793__ecp_nistz256_mul_montx:
1794.cfi_startproc
1795	########################################################################
1796	# Multiply by b[0]
1797	mulx	$acc1, $acc0, $acc1
1798	mulx	$acc2, $t0, $acc2
1799	mov	\$32, $poly1
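	# poly1 temporarily holds the shift count 32 for the shlx/shrx pairs
	# below; it is reloaded with .Lpoly[1] just before the final
	# conditional subtraction.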
1800	xor	$acc5, $acc5		# cf=0
1801	mulx	$acc3, $t1, $acc3
1802	mov	.Lpoly+8*3(%rip), $poly3
1803	adc	$t0, $acc1
1804	mulx	$acc4, $t0, $acc4
1805	 mov	$acc0, %rdx
1806	adc	$t1, $acc2
1807	 shlx	$poly1,$acc0,$t1
1808	adc	$t0, $acc3
1809	 shrx	$poly1,$acc0,$t0
1810	adc	\$0, $acc4
1811
1812	########################################################################
1813	# First reduction step
1814	add	$t1, $acc1
1815	adc	$t0, $acc2
1816
1817	mulx	$poly3, $t0, $t1
1818	 mov	8*1($b_ptr), %rdx
1819	adc	$t0, $acc3
1820	adc	$t1, $acc4
1821	adc	\$0, $acc5
1822	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0
1823
1824	########################################################################
1825	# Multiply by b[1]
1826	mulx	8*0+128($a_ptr), $t0, $t1
1827	adcx	$t0, $acc1
1828	adox	$t1, $acc2
1829
1830	mulx	8*1+128($a_ptr), $t0, $t1
1831	adcx	$t0, $acc2
1832	adox	$t1, $acc3
1833
1834	mulx	8*2+128($a_ptr), $t0, $t1
1835	adcx	$t0, $acc3
1836	adox	$t1, $acc4
1837
1838	mulx	8*3+128($a_ptr), $t0, $t1
1839	 mov	$acc1, %rdx
1840	adcx	$t0, $acc4
1841	 shlx	$poly1, $acc1, $t0
1842	adox	$t1, $acc5
1843	 shrx	$poly1, $acc1, $t1
1844
1845	adcx	$acc0, $acc5
1846	adox	$acc0, $acc0
1847	adc	\$0, $acc0
1848
1849	########################################################################
1850	# Second reduction step
1851	add	$t0, $acc2
1852	adc	$t1, $acc3
1853
1854	mulx	$poly3, $t0, $t1
1855	 mov	8*2($b_ptr), %rdx
1856	adc	$t0, $acc4
1857	adc	$t1, $acc5
1858	adc	\$0, $acc0
1859	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0
1860
1861	########################################################################
1862	# Multiply by b[2]
1863	mulx	8*0+128($a_ptr), $t0, $t1
1864	adcx	$t0, $acc2
1865	adox	$t1, $acc3
1866
1867	mulx	8*1+128($a_ptr), $t0, $t1
1868	adcx	$t0, $acc3
1869	adox	$t1, $acc4
1870
1871	mulx	8*2+128($a_ptr), $t0, $t1
1872	adcx	$t0, $acc4
1873	adox	$t1, $acc5
1874
1875	mulx	8*3+128($a_ptr), $t0, $t1
1876	 mov	$acc2, %rdx
1877	adcx	$t0, $acc5
1878	 shlx	$poly1, $acc2, $t0
1879	adox	$t1, $acc0
1880	 shrx	$poly1, $acc2, $t1
1881
1882	adcx	$acc1, $acc0
1883	adox	$acc1, $acc1
1884	adc	\$0, $acc1
1885
1886	########################################################################
1887	# Third reduction step
1888	add	$t0, $acc3
1889	adc	$t1, $acc4
1890
1891	mulx	$poly3, $t0, $t1
1892	 mov	8*3($b_ptr), %rdx
1893	adc	$t0, $acc5
1894	adc	$t1, $acc0
1895	adc	\$0, $acc1
1896	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
1897
1898	########################################################################
1899	# Multiply by b[3]
1900	mulx	8*0+128($a_ptr), $t0, $t1
1901	adcx	$t0, $acc3
1902	adox	$t1, $acc4
1903
1904	mulx	8*1+128($a_ptr), $t0, $t1
1905	adcx	$t0, $acc4
1906	adox	$t1, $acc5
1907
1908	mulx	8*2+128($a_ptr), $t0, $t1
1909	adcx	$t0, $acc5
1910	adox	$t1, $acc0
1911
1912	mulx	8*3+128($a_ptr), $t0, $t1
1913	 mov	$acc3, %rdx
1914	adcx	$t0, $acc0
1915	 shlx	$poly1, $acc3, $t0
1916	adox	$t1, $acc1
1917	 shrx	$poly1, $acc3, $t1
1918
1919	adcx	$acc2, $acc1
1920	adox	$acc2, $acc2
1921	adc	\$0, $acc2
1922
1923	########################################################################
1924	# Fourth reduction step
1925	add	$t0, $acc4
1926	adc	$t1, $acc5
1927
1928	mulx	$poly3, $t0, $t1
1929	 mov	$acc4, $t2
1930	mov	.Lpoly+8*1(%rip), $poly1
1931	adc	$t0, $acc0
1932	 mov	$acc5, $t3
1933	adc	$t1, $acc1
1934	adc	\$0, $acc2
1935
1936	########################################################################
1937	# Branch-less conditional subtraction of P
1938	xor	%eax, %eax
1939	 mov	$acc0, $t0
1940	sbb	\$-1, $acc4		# .Lpoly[0]
1941	sbb	$poly1, $acc5		# .Lpoly[1]
1942	sbb	\$0, $acc0		# .Lpoly[2]
1943	 mov	$acc1, $t1
1944	sbb	$poly3, $acc1		# .Lpoly[3]
1945	sbb	\$0, $acc2
1946
1947	cmovc	$t2, $acc4
1948	cmovc	$t3, $acc5
1949	mov	$acc4, 8*0($r_ptr)
1950	cmovc	$t0, $acc0
1951	mov	$acc5, 8*1($r_ptr)
1952	cmovc	$t1, $acc1
1953	mov	$acc0, 8*2($r_ptr)
1954	mov	$acc1, 8*3($r_ptr)
1955
1956	ret
1957.cfi_endproc
1958.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
1959
1960.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
1961.align	32
1962__ecp_nistz256_sqr_montx:
1963.cfi_startproc
1964	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
1965	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
1966	xor	%eax, %eax
1967	adc	$t0, $acc2
1968	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
1969	 mov	$acc6, %rdx
1970	adc	$t1, $acc3
1971	adc	\$0, $acc4
1972	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
1973
1974	#################################
1975	mulx	$acc7, $t0, $t1		# a[1]*a[2]
1976	adcx	$t0, $acc3
1977	adox	$t1, $acc4
1978
1979	mulx	$acc0, $t0, $t1		# a[1]*a[3]
1980	 mov	$acc7, %rdx
1981	adcx	$t0, $acc4
1982	adox	$t1, $acc5
1983	adc	\$0, $acc5
1984
1985	#################################
1986	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
1987	 mov	8*0+128($a_ptr), %rdx
1988	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
1989	 adcx	$acc1, $acc1		# acc1:6<<1
1990	adox	$t0, $acc5
1991	 adcx	$acc2, $acc2
1992	adox	$acc7, $acc6		# of=0
1993
1994	mulx	%rdx, $acc0, $t1
1995	mov	8*1+128($a_ptr), %rdx
1996	 adcx	$acc3, $acc3
1997	adox	$t1, $acc1
1998	 adcx	$acc4, $acc4
1999	mulx	%rdx, $t0, $t4
2000	mov	8*2+128($a_ptr), %rdx
2001	 adcx	$acc5, $acc5
2002	adox	$t0, $acc2
2003	 adcx	$acc6, $acc6
2004	.byte	0x67
2005	mulx	%rdx, $t0, $t1
2006	mov	8*3+128($a_ptr), %rdx
2007	adox	$t4, $acc3
2008	 adcx	$acc7, $acc7
2009	adox	$t0, $acc4
2010	 mov	\$32, $a_ptr
2011	adox	$t1, $acc5
2012	.byte	0x67,0x67
2013	mulx	%rdx, $t0, $t4
2014	 mov	.Lpoly+8*3(%rip), %rdx
2015	adox	$t0, $acc6
2016	 shlx	$a_ptr, $acc0, $t0
2017	adox	$t4, $acc7
2018	 shrx	$a_ptr, $acc0, $t4
2019	mov	%rdx,$t1
2020
2021	# reduction step 1
2022	add	$t0, $acc1
2023	adc	$t4, $acc2
2024
2025	mulx	$acc0, $t0, $acc0
2026	adc	$t0, $acc3
2027	 shlx	$a_ptr, $acc1, $t0
2028	adc	\$0, $acc0
2029	 shrx	$a_ptr, $acc1, $t4
2030
2031	# reduction step 2
2032	add	$t0, $acc2
2033	adc	$t4, $acc3
2034
2035	mulx	$acc1, $t0, $acc1
2036	adc	$t0, $acc0
2037	 shlx	$a_ptr, $acc2, $t0
2038	adc	\$0, $acc1
2039	 shrx	$a_ptr, $acc2, $t4
2040
2041	# reduction step 3
2042	add	$t0, $acc3
2043	adc	$t4, $acc0
2044
2045	mulx	$acc2, $t0, $acc2
2046	adc	$t0, $acc1
2047	 shlx	$a_ptr, $acc3, $t0
2048	adc	\$0, $acc2
2049	 shrx	$a_ptr, $acc3, $t4
2050
2051	# reduction step 4
2052	add	$t0, $acc0
2053	adc	$t4, $acc1
2054
2055	mulx	$acc3, $t0, $acc3
2056	adc	$t0, $acc2
2057	adc	\$0, $acc3
2058
2059	xor	$t3, $t3
2060	add	$acc0, $acc4		# accumulate upper half
2061	 mov	.Lpoly+8*1(%rip), $a_ptr
2062	adc	$acc1, $acc5
2063	 mov	$acc4, $acc0
2064	adc	$acc2, $acc6
2065	adc	$acc3, $acc7
2066	 mov	$acc5, $acc1
2067	adc	\$0, $t3
2068
2069	sub	\$-1, $acc4		# .Lpoly[0]
2070	 mov	$acc6, $acc2
2071	sbb	$a_ptr, $acc5		# .Lpoly[1]
2072	sbb	\$0, $acc6		# .Lpoly[2]
2073	 mov	$acc7, $acc3
2074	sbb	$t1, $acc7		# .Lpoly[3]
2075	sbb	\$0, $t3
2076
2077	cmovc	$acc0, $acc4
2078	cmovc	$acc1, $acc5
2079	mov	$acc4, 8*0($r_ptr)
2080	cmovc	$acc2, $acc6
2081	mov	$acc5, 8*1($r_ptr)
2082	cmovc	$acc3, $acc7
2083	mov	$acc6, 8*2($r_ptr)
2084	mov	$acc7, 8*3($r_ptr)
2085
2086	ret
2087.cfi_endproc
2088.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
2089___
2090}
2091}
2092{
2093my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2094my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
2095my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
2096my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
2097
2098$code.=<<___;
2099################################################################################
2100# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
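#
# Constant-time lookup: every one of the 16 table entries (96 bytes each, a
# projective point) is read and masked, so the memory access pattern does
# not depend on index; index 0 matches nothing and returns all-zero limbs.
# A rough C equivalent (illustrative only, helper name made up):
#
#	void select_w5_ref(uint64_t val[12], const uint64_t in_t[16*12],
#			   int index) {
#		for (int j = 0; j < 12; j++) val[j] = 0;
#		for (int i = 0; i < 16; i++) {
#			uint64_t mask = 0 - (uint64_t)(i + 1 == index);
#			for (int j = 0; j < 12; j++)
#				val[j] |= in_t[i*12 + j] & mask;
#		}
#	}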
2101.globl	ecp_nistz256_select_w5
2102.type	ecp_nistz256_select_w5,\@abi-omnipotent
2103.align	32
2104ecp_nistz256_select_w5:
2105.cfi_startproc
2106	_CET_ENDBR
2107___
2108$code.=<<___	if ($avx>1);
2109	leaq	OPENSSL_ia32cap_P(%rip), %rax
2110	mov	8(%rax), %rax
2111	test	\$`1<<5`, %eax
2112	jnz	.Lavx2_select_w5
2113___
2114$code.=<<___	if ($win64);
2115	lea	-0x88(%rsp), %rax
2116.LSEH_begin_ecp_nistz256_select_w5:
2117	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
2118	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
2119	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
2120	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
2121	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
2122	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
2123	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
2124	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
2125	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
2126	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
2127	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
2128___
2129$code.=<<___;
2130	movdqa	.LOne(%rip), $ONE
2131	movd	$index, $INDEX
2132
2133	pxor	$Ra, $Ra
2134	pxor	$Rb, $Rb
2135	pxor	$Rc, $Rc
2136	pxor	$Rd, $Rd
2137	pxor	$Re, $Re
2138	pxor	$Rf, $Rf
2139
2140	movdqa	$ONE, $M0
2141	pshufd	\$0, $INDEX, $INDEX
2142
2143	mov	\$16, %rax
2144.Lselect_loop_sse_w5:
2145
2146	movdqa	$M0, $TMP0
2147	paddd	$ONE, $M0
2148	pcmpeqd $INDEX, $TMP0
2149
2150	movdqa	16*0($in_t), $T0a
2151	movdqa	16*1($in_t), $T0b
2152	movdqa	16*2($in_t), $T0c
2153	movdqa	16*3($in_t), $T0d
2154	movdqa	16*4($in_t), $T0e
2155	movdqa	16*5($in_t), $T0f
2156	lea 16*6($in_t), $in_t
2157
2158	pand	$TMP0, $T0a
2159	pand	$TMP0, $T0b
2160	por	$T0a, $Ra
2161	pand	$TMP0, $T0c
2162	por	$T0b, $Rb
2163	pand	$TMP0, $T0d
2164	por	$T0c, $Rc
2165	pand	$TMP0, $T0e
2166	por	$T0d, $Rd
2167	pand	$TMP0, $T0f
2168	por	$T0e, $Re
2169	por	$T0f, $Rf
2170
2171	dec	%rax
2172	jnz	.Lselect_loop_sse_w5
2173
2174	movdqu	$Ra, 16*0($val)
2175	movdqu	$Rb, 16*1($val)
2176	movdqu	$Rc, 16*2($val)
2177	movdqu	$Rd, 16*3($val)
2178	movdqu	$Re, 16*4($val)
2179	movdqu	$Rf, 16*5($val)
2180___
2181$code.=<<___	if ($win64);
2182	movaps	(%rsp), %xmm6
2183	movaps	0x10(%rsp), %xmm7
2184	movaps	0x20(%rsp), %xmm8
2185	movaps	0x30(%rsp), %xmm9
2186	movaps	0x40(%rsp), %xmm10
2187	movaps	0x50(%rsp), %xmm11
2188	movaps	0x60(%rsp), %xmm12
2189	movaps	0x70(%rsp), %xmm13
2190	movaps	0x80(%rsp), %xmm14
2191	movaps	0x90(%rsp), %xmm15
2192	lea	0xa8(%rsp), %rsp
2193___
2194$code.=<<___;
2195	ret
2196.cfi_endproc
2197.LSEH_end_ecp_nistz256_select_w5:
2198.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
2199
2200################################################################################
2201# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
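#
# Same constant-time pattern as ecp_nistz256_select_w5, but over 64 affine
# entries of 64 bytes each (x and y only); index 0 again yields an all-zero
# result.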
2202.globl	ecp_nistz256_select_w7
2203.type	ecp_nistz256_select_w7,\@abi-omnipotent
2204.align	32
2205ecp_nistz256_select_w7:
2206.cfi_startproc
2207	_CET_ENDBR
2208___
2209$code.=<<___	if ($avx>1);
2210	leaq	OPENSSL_ia32cap_P(%rip), %rax
2211	mov	8(%rax), %rax
2212	test	\$`1<<5`, %eax
2213	jnz	.Lavx2_select_w7
2214___
2215$code.=<<___	if ($win64);
2216	lea	-0x88(%rsp), %rax
2217.LSEH_begin_ecp_nistz256_select_w7:
2218	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
2219	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
2220	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
2221	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
2222	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
2223	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
2224	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
2225	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
2226	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
2227	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
2228	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
2229___
2230$code.=<<___;
2231	movdqa	.LOne(%rip), $M0
2232	movd	$index, $INDEX
2233
2234	pxor	$Ra, $Ra
2235	pxor	$Rb, $Rb
2236	pxor	$Rc, $Rc
2237	pxor	$Rd, $Rd
2238
2239	movdqa	$M0, $ONE
2240	pshufd	\$0, $INDEX, $INDEX
2241	mov	\$64, %rax
2242
2243.Lselect_loop_sse_w7:
2244	movdqa	$M0, $TMP0
2245	paddd	$ONE, $M0
2246	movdqa	16*0($in_t), $T0a
2247	movdqa	16*1($in_t), $T0b
2248	pcmpeqd	$INDEX, $TMP0
2249	movdqa	16*2($in_t), $T0c
2250	movdqa	16*3($in_t), $T0d
2251	lea	16*4($in_t), $in_t
2252
2253	pand	$TMP0, $T0a
2254	pand	$TMP0, $T0b
2255	por	$T0a, $Ra
2256	pand	$TMP0, $T0c
2257	por	$T0b, $Rb
2258	pand	$TMP0, $T0d
2259	por	$T0c, $Rc
2260	prefetcht0	255($in_t)
2261	por	$T0d, $Rd
2262
2263	dec	%rax
2264	jnz	.Lselect_loop_sse_w7
2265
2266	movdqu	$Ra, 16*0($val)
2267	movdqu	$Rb, 16*1($val)
2268	movdqu	$Rc, 16*2($val)
2269	movdqu	$Rd, 16*3($val)
2270___
2271$code.=<<___	if ($win64);
2272	movaps	(%rsp), %xmm6
2273	movaps	0x10(%rsp), %xmm7
2274	movaps	0x20(%rsp), %xmm8
2275	movaps	0x30(%rsp), %xmm9
2276	movaps	0x40(%rsp), %xmm10
2277	movaps	0x50(%rsp), %xmm11
2278	movaps	0x60(%rsp), %xmm12
2279	movaps	0x70(%rsp), %xmm13
2280	movaps	0x80(%rsp), %xmm14
2281	movaps	0x90(%rsp), %xmm15
2282	lea	0xa8(%rsp), %rsp
2283___
2284$code.=<<___;
2285	ret
2286.cfi_endproc
2287.LSEH_end_ecp_nistz256_select_w7:
2288.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
2289___
2290}
2291if ($avx>1) {
2292my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2293my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
2294my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
2295my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
2296
2297$code.=<<___;
2298################################################################################
2299# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
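# AVX2 variant of ecp_nistz256_select_w5: two table entries are masked and
# accumulated per iteration (the M0/M1 comparands advance by .LTwo), so the
# loop covers the 16 entries in 8 passes.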
2300.type	ecp_nistz256_avx2_select_w5,\@abi-omnipotent
2301.align	32
2302ecp_nistz256_avx2_select_w5:
2303.cfi_startproc
2304.Lavx2_select_w5:
2305	vzeroupper
2306___
2307$code.=<<___	if ($win64);
2308	lea	-0x88(%rsp), %rax
2309	mov	%rsp,%r11
2310.LSEH_begin_ecp_nistz256_avx2_select_w5:
2311	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
2312	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
2313	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
2314	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
2315	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
2316	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
2317	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
2318	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
2319	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
2320	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
2321	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
2322___
2323$code.=<<___;
2324	vmovdqa	.LTwo(%rip), $TWO
2325
2326	vpxor	$Ra, $Ra, $Ra
2327	vpxor	$Rb, $Rb, $Rb
2328	vpxor	$Rc, $Rc, $Rc
2329
2330	vmovdqa .LOne(%rip), $M0
2331	vmovdqa .LTwo(%rip), $M1
2332
2333	vmovd	$index, %xmm1
2334	vpermd	$INDEX, $Ra, $INDEX
2335
2336	mov	\$8, %rax
2337.Lselect_loop_avx2_w5:
2338
2339	vmovdqa	32*0($in_t), $T0a
2340	vmovdqa	32*1($in_t), $T0b
2341	vmovdqa	32*2($in_t), $T0c
2342
2343	vmovdqa	32*3($in_t), $T1a
2344	vmovdqa	32*4($in_t), $T1b
2345	vmovdqa	32*5($in_t), $T1c
2346
2347	vpcmpeqd	$INDEX, $M0, $TMP0
2348	vpcmpeqd	$INDEX, $M1, $TMP1
2349
2350	vpaddd	$TWO, $M0, $M0
2351	vpaddd	$TWO, $M1, $M1
2352	lea	32*6($in_t), $in_t
2353
2354	vpand	$TMP0, $T0a, $T0a
2355	vpand	$TMP0, $T0b, $T0b
2356	vpand	$TMP0, $T0c, $T0c
2357	vpand	$TMP1, $T1a, $T1a
2358	vpand	$TMP1, $T1b, $T1b
2359	vpand	$TMP1, $T1c, $T1c
2360
2361	vpxor	$T0a, $Ra, $Ra
2362	vpxor	$T0b, $Rb, $Rb
2363	vpxor	$T0c, $Rc, $Rc
2364	vpxor	$T1a, $Ra, $Ra
2365	vpxor	$T1b, $Rb, $Rb
2366	vpxor	$T1c, $Rc, $Rc
2367
2368	dec %rax
2369	jnz .Lselect_loop_avx2_w5
2370
2371	vmovdqu $Ra, 32*0($val)
2372	vmovdqu $Rb, 32*1($val)
2373	vmovdqu $Rc, 32*2($val)
2374	vzeroupper
2375___
2376$code.=<<___	if ($win64);
2377	movaps	(%rsp), %xmm6
2378	movaps	0x10(%rsp), %xmm7
2379	movaps	0x20(%rsp), %xmm8
2380	movaps	0x30(%rsp), %xmm9
2381	movaps	0x40(%rsp), %xmm10
2382	movaps	0x50(%rsp), %xmm11
2383	movaps	0x60(%rsp), %xmm12
2384	movaps	0x70(%rsp), %xmm13
2385	movaps	0x80(%rsp), %xmm14
2386	movaps	0x90(%rsp), %xmm15
2387	lea	(%r11), %rsp
2388___
2389$code.=<<___;
2390	ret
2391.cfi_endproc
2392.LSEH_end_ecp_nistz256_avx2_select_w5:
2393.size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
2394___
2395}
2396if ($avx>1) {
2397my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2398my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
2399my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
2400my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
2401my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
2402
2403$code.=<<___;
2404
2405################################################################################
2406# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
2407.globl	ecp_nistz256_avx2_select_w7
2408.type	ecp_nistz256_avx2_select_w7,\@abi-omnipotent
2409.align	32
2410ecp_nistz256_avx2_select_w7:
2411.cfi_startproc
2412.Lavx2_select_w7:
2413	_CET_ENDBR
2414	vzeroupper
2415___
2416$code.=<<___	if ($win64);
2417	mov	%rsp,%r11
2418	lea	-0x88(%rsp), %rax
2419.LSEH_begin_ecp_nistz256_avx2_select_w7:
2420	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
2421	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
2422	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
2424	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
2425	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
2426	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
2427	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
2428	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
2429	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
2430	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
2431___
2432$code.=<<___;
2433	vmovdqa	.LThree(%rip), $THREE
2434
2435	vpxor	$Ra, $Ra, $Ra
2436	vpxor	$Rb, $Rb, $Rb
2437
2438	vmovdqa .LOne(%rip), $M0
2439	vmovdqa .LTwo(%rip), $M1
2440	vmovdqa .LThree(%rip), $M2
2441
2442	vmovd	$index, %xmm1
2443	vpermd	$INDEX, $Ra, $INDEX
2444	# Skip index = 0, because it is implicitly the point at infinity
2445
2446	mov	\$21, %rax
2447.Lselect_loop_avx2_w7:
2448
2449	vmovdqa	32*0($in_t), $T0a
2450	vmovdqa	32*1($in_t), $T0b
2451
2452	vmovdqa	32*2($in_t), $T1a
2453	vmovdqa	32*3($in_t), $T1b
2454
2455	vmovdqa	32*4($in_t), $T2a
2456	vmovdqa	32*5($in_t), $T2b
2457
2458	vpcmpeqd	$INDEX, $M0, $TMP0
2459	vpcmpeqd	$INDEX, $M1, $TMP1
2460	vpcmpeqd	$INDEX, $M2, $TMP2
2461
2462	vpaddd	$THREE, $M0, $M0
2463	vpaddd	$THREE, $M1, $M1
2464	vpaddd	$THREE, $M2, $M2
2465	lea	32*6($in_t), $in_t
2466
2467	vpand	$TMP0, $T0a, $T0a
2468	vpand	$TMP0, $T0b, $T0b
2469	vpand	$TMP1, $T1a, $T1a
2470	vpand	$TMP1, $T1b, $T1b
2471	vpand	$TMP2, $T2a, $T2a
2472	vpand	$TMP2, $T2b, $T2b
2473
2474	vpxor	$T0a, $Ra, $Ra
2475	vpxor	$T0b, $Rb, $Rb
2476	vpxor	$T1a, $Ra, $Ra
2477	vpxor	$T1b, $Rb, $Rb
2478	vpxor	$T2a, $Ra, $Ra
2479	vpxor	$T2b, $Rb, $Rb
2480
2481	dec %rax
2482	jnz .Lselect_loop_avx2_w7
2483
2484
2485	vmovdqa	32*0($in_t), $T0a
2486	vmovdqa	32*1($in_t), $T0b
2487
2488	vpcmpeqd	$INDEX, $M0, $TMP0
2489
2490	vpand	$TMP0, $T0a, $T0a
2491	vpand	$TMP0, $T0b, $T0b
2492
2493	vpxor	$T0a, $Ra, $Ra
2494	vpxor	$T0b, $Rb, $Rb
2495
2496	vmovdqu $Ra, 32*0($val)
2497	vmovdqu $Rb, 32*1($val)
2498	vzeroupper
2499___
2500$code.=<<___	if ($win64);
2501	movaps	(%rsp), %xmm6
2502	movaps	0x10(%rsp), %xmm7
2503	movaps	0x20(%rsp), %xmm8
2504	movaps	0x30(%rsp), %xmm9
2505	movaps	0x40(%rsp), %xmm10
2506	movaps	0x50(%rsp), %xmm11
2507	movaps	0x60(%rsp), %xmm12
2508	movaps	0x70(%rsp), %xmm13
2509	movaps	0x80(%rsp), %xmm14
2510	movaps	0x90(%rsp), %xmm15
2511	lea	(%r11), %rsp
2512___
2513$code.=<<___;
2514	ret
2515.cfi_endproc
2516.LSEH_end_ecp_nistz256_avx2_select_w7:
2517.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
2518___
2519} else {
2520$code.=<<___;
2521.globl	ecp_nistz256_avx2_select_w7
2522.type	ecp_nistz256_avx2_select_w7,\@function,3
2523.align	32
2524ecp_nistz256_avx2_select_w7:
2525	_CET_ENDBR
2526	.byte	0x0f,0x0b	# ud2
2527	ret
2528.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
2529___
2530}
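########################################################################
# For reference, the ecp_nistz256 select routines above implement a
# constant-time table lookup: every entry is read and the one whose
# 1-based index matches is accumulated through a mask, so the memory
# access pattern does not depend on the secret index. A hedged C sketch
# of the w5 variant (the helper name and the 16 x 12-limb table layout
# inferred from the loop above are illustrative assumptions; a real
# constant-time version must also compute the mask branchlessly):
#
#	#include <stdint.h>
#
#	static void select_w5_ref(uint64_t val[12],
#	                          const uint64_t in_t[16][12], int index) {
#	    for (int j = 0; j < 12; j++)
#	        val[j] = 0;
#	    for (int i = 0; i < 16; i++) {
#	        /* all-ones iff i + 1 == index, all-zeros otherwise */
#	        uint64_t mask = 0 - (uint64_t)((i + 1) == index);
#	        for (int j = 0; j < 12; j++)
#	            val[j] |= in_t[i][j] & mask;
#	    }
#	}
#
# index = 0 selects nothing and leaves val[] all-zero, matching the
# "point at infinity" convention noted in the w7 code above.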
2531{{{
########################################################################
# This block implements the higher-level point_double, point_add and
# point_add_affine routines. The key to performance here is to let the
# out-of-order execution logic overlap computations from the next step
# with tail processing from the current step. By using a tailored
# calling sequence we minimize inter-step overhead and give the
# processor a better shot at overlapping operations (see load_for_mul/
# load_for_sqr below and the schematic example that follows).
#
# You will notice that input data is copied to the stack. The trouble
# is that there are no registers to spare for holding the original
# pointers, and reloading those pointers would create undesired
# dependencies on the effective-address calculation paths. In other
# words, this too is done to favour the out-of-order execution logic.
#						<appro@openssl.org>
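########################################################################
# Schematically, a typical step generated below looks like
#
#	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
#	lea	$S1(%rsp), $r_ptr
#	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);
#
# i.e. the operands are preloaded into fixed registers by load_for_mul/
# load_for_sqr (defined just below), and only the result pointer changes
# between steps, keeping the inter-step glue down to a handful of moves.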
2546
2547my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
2548my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
2549my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
2550my ($poly1,$poly3)=($acc6,$acc7);
2551
2552sub load_for_mul () {
2553my ($a,$b,$src0) = @_;
2554my $bias = $src0 eq "%rax" ? 0 : -128;
2555
2556"	mov	$b, $src0
2557	lea	$b, $b_ptr
2558	mov	8*0+$a, $acc1
2559	mov	8*1+$a, $acc2
2560	lea	$bias+$a, $a_ptr
2561	mov	8*2+$a, $acc3
2562	mov	8*3+$a, $acc4"
2563}
2564
2565sub load_for_sqr () {
2566my ($a,$src0) = @_;
2567my $bias = $src0 eq "%rax" ? 0 : -128;
2568
2569"	mov	8*0+$a, $src0
2570	mov	8*1+$a, $acc6
2571	lea	$bias+$a, $a_ptr
2572	mov	8*2+$a, $acc7
2573	mov	8*3+$a, $acc0"
2574}
2575
2576									{
2577########################################################################
2578# operate in 4-5-0-1 "name space" that matches multiplication output
2579#
2580my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2581
2582$code.=<<___;
2583.type	__ecp_nistz256_add_toq,\@abi-omnipotent
2584.align	32
2585__ecp_nistz256_add_toq:
2586.cfi_startproc
2587	xor	$t4,$t4
2588	add	8*0($b_ptr), $a0
2589	adc	8*1($b_ptr), $a1
2590	 mov	$a0, $t0
2591	adc	8*2($b_ptr), $a2
2592	adc	8*3($b_ptr), $a3
2593	 mov	$a1, $t1
2594	adc	\$0, $t4
2595
2596	sub	\$-1, $a0
2597	 mov	$a2, $t2
2598	sbb	$poly1, $a1
2599	sbb	\$0, $a2
2600	 mov	$a3, $t3
2601	sbb	$poly3, $a3
2602	sbb	\$0, $t4
2603
2604	cmovc	$t0, $a0
2605	cmovc	$t1, $a1
2606	mov	$a0, 8*0($r_ptr)
2607	cmovc	$t2, $a2
2608	mov	$a1, 8*1($r_ptr)
2609	cmovc	$t3, $a3
2610	mov	$a2, 8*2($r_ptr)
2611	mov	$a3, 8*3($r_ptr)
2612
2613	ret
2614.cfi_endproc
2615.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
2616
2617.type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
2618.align	32
2619__ecp_nistz256_sub_fromq:
2620.cfi_startproc
2621	sub	8*0($b_ptr), $a0
2622	sbb	8*1($b_ptr), $a1
2623	 mov	$a0, $t0
2624	sbb	8*2($b_ptr), $a2
2625	sbb	8*3($b_ptr), $a3
2626	 mov	$a1, $t1
2627	sbb	$t4, $t4
2628
2629	add	\$-1, $a0
2630	 mov	$a2, $t2
2631	adc	$poly1, $a1
2632	adc	\$0, $a2
2633	 mov	$a3, $t3
2634	adc	$poly3, $a3
2635	test	$t4, $t4
2636
2637	cmovz	$t0, $a0
2638	cmovz	$t1, $a1
2639	mov	$a0, 8*0($r_ptr)
2640	cmovz	$t2, $a2
2641	mov	$a1, 8*1($r_ptr)
2642	cmovz	$t3, $a3
2643	mov	$a2, 8*2($r_ptr)
2644	mov	$a3, 8*3($r_ptr)
2645
2646	ret
2647.cfi_endproc
2648.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
2649
2650.type	__ecp_nistz256_subq,\@abi-omnipotent
2651.align	32
2652__ecp_nistz256_subq:
2653.cfi_startproc
2654	sub	$a0, $t0
2655	sbb	$a1, $t1
2656	 mov	$t0, $a0
2657	sbb	$a2, $t2
2658	sbb	$a3, $t3
2659	 mov	$t1, $a1
2660	sbb	$t4, $t4
2661
2662	add	\$-1, $t0
2663	 mov	$t2, $a2
2664	adc	$poly1, $t1
2665	adc	\$0, $t2
2666	 mov	$t3, $a3
2667	adc	$poly3, $t3
2668	test	$t4, $t4
2669
2670	cmovnz	$t0, $a0
2671	cmovnz	$t1, $a1
2672	cmovnz	$t2, $a2
2673	cmovnz	$t3, $a3
2674
2675	ret
2676.cfi_endproc
2677.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
2678
2679.type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
2680.align	32
2681__ecp_nistz256_mul_by_2q:
2682.cfi_startproc
2683	xor	$t4, $t4
2684	add	$a0, $a0		# a0:a3+a0:a3
2685	adc	$a1, $a1
2686	 mov	$a0, $t0
2687	adc	$a2, $a2
2688	adc	$a3, $a3
2689	 mov	$a1, $t1
2690	adc	\$0, $t4
2691
2692	sub	\$-1, $a0
2693	 mov	$a2, $t2
2694	sbb	$poly1, $a1
2695	sbb	\$0, $a2
2696	 mov	$a3, $t3
2697	sbb	$poly3, $a3
2698	sbb	\$0, $t4
2699
2700	cmovc	$t0, $a0
2701	cmovc	$t1, $a1
2702	mov	$a0, 8*0($r_ptr)
2703	cmovc	$t2, $a2
2704	mov	$a1, 8*1($r_ptr)
2705	cmovc	$t3, $a3
2706	mov	$a2, 8*2($r_ptr)
2707	mov	$a3, 8*3($r_ptr)
2708
2709	ret
2710.cfi_endproc
2711.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
2712___
2713									}
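########################################################################
# All four helpers above follow the same shape: perform the plain
# 256-bit add/sub/double, then fold the result back into [0, p) with a
# branch-free conditional move instead of a compare-and-branch. A hedged
# C sketch of __ecp_nistz256_add_toq, assuming a compiler that provides
# unsigned __int128 (the helper name is illustrative):
#
#	#include <stdint.h>
#
#	static void p256_add_ref(uint64_t r[4], const uint64_t a[4],
#	                         const uint64_t b[4]) {
#	    static const uint64_t p[4] = {
#	        0xffffffffffffffffULL, 0x00000000ffffffffULL,
#	        0x0000000000000000ULL, 0xffffffff00000001ULL };
#	    uint64_t sum[4], red[4], carry = 0, borrow = 0;
#	    for (int i = 0; i < 4; i++) {	/* sum = a + b */
#	        unsigned __int128 s = (unsigned __int128)a[i] + b[i] + carry;
#	        sum[i] = (uint64_t)s;  carry = (uint64_t)(s >> 64);
#	    }
#	    for (int i = 0; i < 4; i++) {	/* red = sum - p */
#	        unsigned __int128 d = (unsigned __int128)sum[i] - p[i] - borrow;
#	        red[i] = (uint64_t)d;  borrow = (uint64_t)(d >> 64) & 1;
#	    }
#	    /* keep the unreduced sum iff a + b < p, i.e. subtracting p
#	     * borrowed past the carry limb -- this is the cmovc chain */
#	    uint64_t keep_sum = 0 - (uint64_t)(carry < borrow);
#	    for (int i = 0; i < 4; i++)
#	        r[i] = (sum[i] & keep_sum) | (red[i] & ~keep_sum);
#	}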
2714sub gen_double () {
2715    my $x = shift;
2716    my ($src0,$sfx,$bias);
2717    my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
2718
2719    if ($x ne "x") {
2720	$src0 = "%rax";
2721	$sfx  = "";
2722	$bias = 0;
2723
2724$code.=<<___;
2725.globl	ecp_nistz256_point_double
2726.type	ecp_nistz256_point_double,\@function,2
2727.align	32
2728ecp_nistz256_point_double:
2729.cfi_startproc
2730	_CET_ENDBR
2731___
2732$code.=<<___	if ($addx);
2733	leaq	OPENSSL_ia32cap_P(%rip), %rcx
2734	mov	8(%rcx), %rcx
2735	and	\$0x80100, %ecx
2736	cmp	\$0x80100, %ecx
2737	je	.Lpoint_doublex
2738___
2739    } else {
2740	$src0 = "%rdx";
2741	$sfx  = "x";
2742	$bias = 128;
2743
2744$code.=<<___;
2745.type	ecp_nistz256_point_doublex,\@function,2
2746.align	32
2747ecp_nistz256_point_doublex:
2748.cfi_startproc
2749.Lpoint_doublex:
2750___
2751    }
2752$code.=<<___;
2753	push	%rbp
2754.cfi_push	%rbp
2755	push	%rbx
2756.cfi_push	%rbx
2757	push	%r12
2758.cfi_push	%r12
2759	push	%r13
2760.cfi_push	%r13
2761	push	%r14
2762.cfi_push	%r14
2763	push	%r15
2764.cfi_push	%r15
2765	sub	\$32*5+8, %rsp
2766.cfi_adjust_cfa_offset	32*5+8
2767.Lpoint_double${x}_body:
2768
2769.Lpoint_double_shortcut$x:
2770	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
2771	mov	$a_ptr, $b_ptr			# backup copy
2772	movdqu	0x10($a_ptr), %xmm1
2773	 mov	0x20+8*0($a_ptr), $acc4		# load in_y in "5-4-0-1" order
2774	 mov	0x20+8*1($a_ptr), $acc5
2775	 mov	0x20+8*2($a_ptr), $acc0
2776	 mov	0x20+8*3($a_ptr), $acc1
2777	 mov	.Lpoly+8*1(%rip), $poly1
2778	 mov	.Lpoly+8*3(%rip), $poly3
2779	movdqa	%xmm0, $in_x(%rsp)
2780	movdqa	%xmm1, $in_x+0x10(%rsp)
2781	lea	0x20($r_ptr), $acc2
2782	lea	0x40($r_ptr), $acc3
2783	movq	$r_ptr, %xmm0
2784	movq	$acc2, %xmm1
2785	movq	$acc3, %xmm2
2786
2787	lea	$S(%rsp), $r_ptr
2788	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);
2789
2790	mov	0x40+8*0($a_ptr), $src0
2791	mov	0x40+8*1($a_ptr), $acc6
2792	mov	0x40+8*2($a_ptr), $acc7
2793	mov	0x40+8*3($a_ptr), $acc0
2794	lea	0x40-$bias($a_ptr), $a_ptr
2795	lea	$Zsqr(%rsp), $r_ptr
2796	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);
2797
2798	`&load_for_sqr("$S(%rsp)", "$src0")`
2799	lea	$S(%rsp), $r_ptr
2800	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);
2801
2802	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
2803	mov	0x40+8*0($b_ptr), $acc1
2804	mov	0x40+8*1($b_ptr), $acc2
2805	mov	0x40+8*2($b_ptr), $acc3
2806	mov	0x40+8*3($b_ptr), $acc4
2807	lea	0x40-$bias($b_ptr), $a_ptr
2808	lea	0x20($b_ptr), $b_ptr
2809	movq	%xmm2, $r_ptr
2810	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
2811	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);
2812
2813	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
2814	mov	$in_x+8*1(%rsp), $acc5
2815	lea	$Zsqr(%rsp), $b_ptr
2816	mov	$in_x+8*2(%rsp), $acc0
2817	mov	$in_x+8*3(%rsp), $acc1
2818	lea	$M(%rsp), $r_ptr
2819	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);
2820
2821	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
2822	mov	$in_x+8*1(%rsp), $acc5
2823	lea	$Zsqr(%rsp), $b_ptr
2824	mov	$in_x+8*2(%rsp), $acc0
2825	mov	$in_x+8*3(%rsp), $acc1
2826	lea	$Zsqr(%rsp), $r_ptr
2827	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);
2828
2829	`&load_for_sqr("$S(%rsp)", "$src0")`
2830	movq	%xmm1, $r_ptr
2831	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
2832___
2833{
2834######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
2835# operate in 4-5-6-7 "name space" that matches squaring output
2836#
2837my ($poly1,$poly3)=($a_ptr,$t1);
2838my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
2839
2840$code.=<<___;
2841	xor	$t4, $t4
2842	mov	$a0, $t0
2843	add	\$-1, $a0
2844	mov	$a1, $t1
2845	adc	$poly1, $a1
2846	mov	$a2, $t2
2847	adc	\$0, $a2
2848	mov	$a3, $t3
2849	adc	$poly3, $a3
2850	adc	\$0, $t4
2851	xor	$a_ptr, $a_ptr		# borrow $a_ptr
2852	test	\$1, $t0
2853
2854	cmovz	$t0, $a0
2855	cmovz	$t1, $a1
2856	cmovz	$t2, $a2
2857	cmovz	$t3, $a3
2858	cmovz	$a_ptr, $t4
2859
2860	mov	$a1, $t0		# a0:a3>>1
2861	shr	\$1, $a0
2862	shl	\$63, $t0
2863	mov	$a2, $t1
2864	shr	\$1, $a1
2865	or	$t0, $a0
2866	shl	\$63, $t1
2867	mov	$a3, $t2
2868	shr	\$1, $a2
2869	or	$t1, $a1
2870	shl	\$63, $t2
2871	mov	$a0, 8*0($r_ptr)
2872	shr	\$1, $a3
2873	mov	$a1, 8*1($r_ptr)
2874	shl	\$63, $t4
2875	or	$t2, $a2
2876	or	$t4, $a3
2877	mov	$a2, 8*2($r_ptr)
2878	mov	$a3, 8*3($r_ptr)
2879___
2880}
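########################################################################
# The inlined block above is ecp_nistz256_div_by_2(res_y, res_y): an
# even value is simply shifted right, while an odd value is first made
# even by adding p, keeping the carry as a 257th bit in $t4. A hedged C
# sketch of the same logic (illustrative helper name, unsigned __int128
# assumed):
#
#	#include <stdint.h>
#
#	static void p256_div_by_2_ref(uint64_t r[4], const uint64_t a[4]) {
#	    static const uint64_t p[4] = {
#	        0xffffffffffffffffULL, 0x00000000ffffffffULL,
#	        0x0000000000000000ULL, 0xffffffff00000001ULL };
#	    uint64_t t[4], carry = 0;
#	    for (int i = 0; i < 4; i++) {	/* t = a + p */
#	        unsigned __int128 s = (unsigned __int128)a[i] + p[i] + carry;
#	        t[i] = (uint64_t)s;  carry = (uint64_t)(s >> 64);
#	    }
#	    uint64_t odd = 0 - (a[0] & 1);	/* all-ones iff a is odd */
#	    uint64_t top = carry & odd;		/* bit 256 of the kept value */
#	    uint64_t v[4];
#	    for (int i = 0; i < 4; i++)		/* pick a (even) or a+p (odd) */
#	        v[i] = (t[i] & odd) | (a[i] & ~odd);
#	    for (int i = 0; i < 3; i++)		/* 257-bit right shift by one */
#	        r[i] = (v[i] >> 1) | (v[i + 1] << 63);
#	    r[3] = (v[3] >> 1) | (top << 63);
#	}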
2881$code.=<<___;
2882	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
2883	lea	$M(%rsp), $r_ptr
2884	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);
2885
2886	lea	$tmp0(%rsp), $r_ptr
2887	call	__ecp_nistz256_mul_by_2$x
2888
2889	lea	$M(%rsp), $b_ptr
2890	lea	$M(%rsp), $r_ptr
2891	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);
2892
2893	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
2894	lea	$S(%rsp), $r_ptr
2895	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);
2896
2897	lea	$tmp0(%rsp), $r_ptr
2898	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);
2899
2900	`&load_for_sqr("$M(%rsp)", "$src0")`
2901	movq	%xmm0, $r_ptr
2902	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);
2903
2904	lea	$tmp0(%rsp), $b_ptr
2905	mov	$acc6, $acc0			# harmonize sqr output and sub input
2906	mov	$acc7, $acc1
2907	mov	$a_ptr, $poly1
2908	mov	$t1, $poly3
2909	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);
2910
2911	mov	$S+8*0(%rsp), $t0
2912	mov	$S+8*1(%rsp), $t1
2913	mov	$S+8*2(%rsp), $t2
2914	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
2915	lea	$S(%rsp), $r_ptr
2916	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);
2917
2918	mov	$M(%rsp), $src0
2919	lea	$M(%rsp), $b_ptr
2920	mov	$acc4, $acc6			# harmonize sub output and mul input
2921	xor	%ecx, %ecx
2922	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
2923	mov	$acc5, $acc2
2924	mov	$acc5, $S+8*1(%rsp)
2925	cmovz	$acc0, $acc3
2926	mov	$acc0, $S+8*2(%rsp)
2927	lea	$S-$bias(%rsp), $a_ptr
2928	cmovz	$acc1, $acc4
2929	mov	$acc1, $S+8*3(%rsp)
2930	mov	$acc6, $acc1
2931	lea	$S(%rsp), $r_ptr
2932	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);
2933
2934	movq	%xmm1, $b_ptr
2935	movq	%xmm1, $r_ptr
2936	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);
2937
2938	lea	32*5+56(%rsp), %rsi
2939.cfi_def_cfa	%rsi,8
2940	mov	-48(%rsi),%r15
2941.cfi_restore	%r15
2942	mov	-40(%rsi),%r14
2943.cfi_restore	%r14
2944	mov	-32(%rsi),%r13
2945.cfi_restore	%r13
2946	mov	-24(%rsi),%r12
2947.cfi_restore	%r12
2948	mov	-16(%rsi),%rbx
2949.cfi_restore	%rbx
2950	mov	-8(%rsi),%rbp
2951.cfi_restore	%rbp
2952	lea	(%rsi),%rsp
2953.cfi_def_cfa_register	%rsp
2954.Lpoint_double${x}_epilogue:
2955	ret
2956.cfi_endproc
2957.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
2958___
2959}
2960&gen_double("q");
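########################################################################
# For the record, the call sequence in gen_double above evaluates the
# usual Jacobian doubling formulas for a = -3 (as the inline p256_*
# call comments indicate), everything in the Montgomery domain and
# reduced modulo p at each step:
#
#	S  = 4*X1*Y1^2
#	M  = 3*(X1 - Z1^2)*(X1 + Z1^2)
#	X3 = M^2 - 2*S
#	Y3 = M*(S - X3) - 8*Y1^4
#	Z3 = 2*Y1*Z1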
2961
2962sub gen_add () {
2963    my $x = shift;
2964    my ($src0,$sfx,$bias);
2965    my ($H,$Hsqr,$R,$Rsqr,$Hcub,
2966	$U1,$U2,$S1,$S2,
2967	$res_x,$res_y,$res_z,
2968	$in1_x,$in1_y,$in1_z,
2969	$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
2970    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
2971
2972    if ($x ne "x") {
2973	$src0 = "%rax";
2974	$sfx  = "";
2975	$bias = 0;
2976
2977$code.=<<___;
2978.globl	ecp_nistz256_point_add
2979.type	ecp_nistz256_point_add,\@function,3
2980.align	32
2981ecp_nistz256_point_add:
2982.cfi_startproc
2983	_CET_ENDBR
2984___
2985$code.=<<___	if ($addx);
2986	leaq	OPENSSL_ia32cap_P(%rip), %rcx
2987	mov	8(%rcx), %rcx
2988	and	\$0x80100, %ecx
2989	cmp	\$0x80100, %ecx
2990	je	.Lpoint_addx
2991___
2992    } else {
2993	$src0 = "%rdx";
2994	$sfx  = "x";
2995	$bias = 128;
2996
2997$code.=<<___;
2998.type	ecp_nistz256_point_addx,\@function,3
2999.align	32
3000ecp_nistz256_point_addx:
3001.cfi_startproc
3002.Lpoint_addx:
3003___
3004    }
3005$code.=<<___;
3006	push	%rbp
3007.cfi_push	%rbp
3008	push	%rbx
3009.cfi_push	%rbx
3010	push	%r12
3011.cfi_push	%r12
3012	push	%r13
3013.cfi_push	%r13
3014	push	%r14
3015.cfi_push	%r14
3016	push	%r15
3017.cfi_push	%r15
3018	sub	\$32*18+8, %rsp
3019.cfi_adjust_cfa_offset	32*18+8
3020.Lpoint_add${x}_body:
3021
3022	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
3023	movdqu	0x10($a_ptr), %xmm1
3024	movdqu	0x20($a_ptr), %xmm2
3025	movdqu	0x30($a_ptr), %xmm3
3026	movdqu	0x40($a_ptr), %xmm4
3027	movdqu	0x50($a_ptr), %xmm5
3028	mov	$a_ptr, $b_ptr			# reassign
3029	mov	$b_org, $a_ptr			# reassign
3030	movdqa	%xmm0, $in1_x(%rsp)
3031	movdqa	%xmm1, $in1_x+0x10(%rsp)
3032	movdqa	%xmm2, $in1_y(%rsp)
3033	movdqa	%xmm3, $in1_y+0x10(%rsp)
3034	movdqa	%xmm4, $in1_z(%rsp)
3035	movdqa	%xmm5, $in1_z+0x10(%rsp)
3036	por	%xmm4, %xmm5
3037
3038	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_ptr
3039	 pshufd	\$0xb1, %xmm5, %xmm3
3040	movdqu	0x10($a_ptr), %xmm1
3041	movdqu	0x20($a_ptr), %xmm2
3042	 por	%xmm3, %xmm5
3043	movdqu	0x30($a_ptr), %xmm3
3044	 mov	0x40+8*0($a_ptr), $src0		# load original in2_z
3045	 mov	0x40+8*1($a_ptr), $acc6
3046	 mov	0x40+8*2($a_ptr), $acc7
3047	 mov	0x40+8*3($a_ptr), $acc0
3048	movdqa	%xmm0, $in2_x(%rsp)
3049	 pshufd	\$0x1e, %xmm5, %xmm4
3050	movdqa	%xmm1, $in2_x+0x10(%rsp)
3051	movdqu	0x40($a_ptr),%xmm0		# in2_z again
3052	movdqu	0x50($a_ptr),%xmm1
3053	movdqa	%xmm2, $in2_y(%rsp)
3054	movdqa	%xmm3, $in2_y+0x10(%rsp)
3055	 por	%xmm4, %xmm5
3056	 pxor	%xmm4, %xmm4
3057	por	%xmm0, %xmm1
3058	 movq	$r_ptr, %xmm0			# save $r_ptr
3059
3060	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
3061	 mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
3062	 mov	$acc6, $in2_z+8*1(%rsp)
3063	 mov	$acc7, $in2_z+8*2(%rsp)
3064	 mov	$acc0, $in2_z+8*3(%rsp)
3065	lea	$Z2sqr(%rsp), $r_ptr		# Z2^2
3066	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);
3067
3068	pcmpeqd	%xmm4, %xmm5
3069	pshufd	\$0xb1, %xmm1, %xmm4
3070	por	%xmm1, %xmm4
3071	pshufd	\$0, %xmm5, %xmm5		# in1infty
3072	pshufd	\$0x1e, %xmm4, %xmm3
3073	por	%xmm3, %xmm4
3074	pxor	%xmm3, %xmm3
3075	pcmpeqd	%xmm3, %xmm4
3076	pshufd	\$0, %xmm4, %xmm4		# in2infty
3077	 mov	0x40+8*0($b_ptr), $src0		# load original in1_z
3078	 mov	0x40+8*1($b_ptr), $acc6
3079	 mov	0x40+8*2($b_ptr), $acc7
3080	 mov	0x40+8*3($b_ptr), $acc0
3081	movq	$b_ptr, %xmm1
3082
3083	lea	0x40-$bias($b_ptr), $a_ptr
3084	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
3085	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
3086
3087	`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
3088	lea	$S1(%rsp), $r_ptr		# S1 = Z2^3
3089	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);
3090
3091	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3092	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
3093	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
3094
3095	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
3096	lea	$S1(%rsp), $r_ptr		# S1 = Y1*Z2^3
3097	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);
3098
3099	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3100	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
3101	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
3102
3103	lea	$S1(%rsp), $b_ptr
3104	lea	$R(%rsp), $r_ptr		# R = S2 - S1
3105	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);
3106
3107	or	$acc5, $acc4			# see if result is zero
3108	movdqa	%xmm4, %xmm2
3109	or	$acc0, $acc4
3110	or	$acc1, $acc4
3111	por	%xmm5, %xmm2			# in1infty || in2infty
3112	movq	$acc4, %xmm3
3113
3114	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3115	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
3116	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);
3117
3118	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
3119	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
3120	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);
3121
3122	lea	$U1(%rsp), $b_ptr
3123	lea	$H(%rsp), $r_ptr		# H = U2 - U1
3124	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);
3125
3126	or	$acc5, $acc4			# see if result is zero
3127	or	$acc0, $acc4
3128	or	$acc1, $acc4			# !is_equal(U1, U2)
3129
3130	movq	%xmm2, $acc0
3131	movq	%xmm3, $acc1
3132	or	$acc0, $acc4
3133	.byte	0x3e				# predict taken
3134	jnz	.Ladd_proceed$x			# !is_equal(U1, U2) || in1infty || in2infty
3135
3136	# We now know A = B or A = -B and neither is infinity. Compare the
3137	# y-coordinates via S1 and S2.
3138	test	$acc1, $acc1
3139	jz	.Ladd_double$x			# is_equal(S1, S2)
3140
3141	# A = -B, so the result is infinity.
3142	#
3143	# TODO(davidben): Does .Ladd_proceed handle this case? It seems to, in
3144	# which case we should eliminate this special-case and simplify the
3145	# timing analysis.
3146	movq	%xmm0, $r_ptr			# restore $r_ptr
3147	pxor	%xmm0, %xmm0
3148	movdqu	%xmm0, 0x00($r_ptr)
3149	movdqu	%xmm0, 0x10($r_ptr)
3150	movdqu	%xmm0, 0x20($r_ptr)
3151	movdqu	%xmm0, 0x30($r_ptr)
3152	movdqu	%xmm0, 0x40($r_ptr)
3153	movdqu	%xmm0, 0x50($r_ptr)
3154	jmp	.Ladd_done$x
3155
3156.align	32
3157.Ladd_double$x:
3158	movq	%xmm1, $a_ptr			# restore $a_ptr
3159	movq	%xmm0, $r_ptr			# restore $r_ptr
3160	add	\$`32*(18-5)`, %rsp		# difference in frame sizes
3161.cfi_adjust_cfa_offset	`-32*(18-5)`
3162	jmp	.Lpoint_double_shortcut$x
3163.cfi_adjust_cfa_offset	`32*(18-5)`
3164
3165.align	32
3166.Ladd_proceed$x:
3167	`&load_for_sqr("$R(%rsp)", "$src0")`
3168	lea	$Rsqr(%rsp), $r_ptr		# R^2
3169	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
3170
3171	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3172	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3173	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
3174
3175	`&load_for_sqr("$H(%rsp)", "$src0")`
3176	lea	$Hsqr(%rsp), $r_ptr		# H^2
3177	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
3178
3179	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
3180	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3181	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);
3182
3183	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
3184	lea	$Hcub(%rsp), $r_ptr		# H^3
3185	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
3186
3187	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
3188	lea	$U2(%rsp), $r_ptr		# U1*H^2
3189	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
3190___
3191{
3192#######################################################################
3193# operate in 4-5-0-1 "name space" that matches multiplication output
3194#
3195my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3196my ($poly1, $poly3)=($acc6,$acc7);
3197
3198$code.=<<___;
3199	#lea	$U2(%rsp), $a_ptr
3200	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
3201	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
3202
3203	xor	$t4, $t4
3204	add	$acc0, $acc0		# a0:a3+a0:a3
3205	lea	$Rsqr(%rsp), $a_ptr
3206	adc	$acc1, $acc1
3207	 mov	$acc0, $t0
3208	adc	$acc2, $acc2
3209	adc	$acc3, $acc3
3210	 mov	$acc1, $t1
3211	adc	\$0, $t4
3212
3213	sub	\$-1, $acc0
3214	 mov	$acc2, $t2
3215	sbb	$poly1, $acc1
3216	sbb	\$0, $acc2
3217	 mov	$acc3, $t3
3218	sbb	$poly3, $acc3
3219	sbb	\$0, $t4
3220
3221	cmovc	$t0, $acc0
3222	mov	8*0($a_ptr), $t0
3223	cmovc	$t1, $acc1
3224	mov	8*1($a_ptr), $t1
3225	cmovc	$t2, $acc2
3226	mov	8*2($a_ptr), $t2
3227	cmovc	$t3, $acc3
3228	mov	8*3($a_ptr), $t3
3229
3230	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
3231
3232	lea	$Hcub(%rsp), $b_ptr
3233	lea	$res_x(%rsp), $r_ptr
3234	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
3235
3236	mov	$U2+8*0(%rsp), $t0
3237	mov	$U2+8*1(%rsp), $t1
3238	mov	$U2+8*2(%rsp), $t2
3239	mov	$U2+8*3(%rsp), $t3
3240	lea	$res_y(%rsp), $r_ptr
3241
3242	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);
3243
3244	mov	$acc0, 8*0($r_ptr)		# save the result, as
3245	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
3246	mov	$acc2, 8*2($r_ptr)
3247	mov	$acc3, 8*3($r_ptr)
3248___
3249}
3250$code.=<<___;
3251	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
3252	lea	$S2(%rsp), $r_ptr
3253	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);
3254
3255	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
3256	lea	$res_y(%rsp), $r_ptr
3257	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);
3258
3259	lea	$S2(%rsp), $b_ptr
3260	lea	$res_y(%rsp), $r_ptr
3261	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);
3262
3263	movq	%xmm0, $r_ptr		# restore $r_ptr
3264
3265	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, in2_z, in1infty);
3266	movdqa	%xmm5, %xmm1
3267	pandn	$res_z(%rsp), %xmm0
3268	movdqa	%xmm5, %xmm2
3269	pandn	$res_z+0x10(%rsp), %xmm1
3270	movdqa	%xmm5, %xmm3
3271	pand	$in2_z(%rsp), %xmm2
3272	pand	$in2_z+0x10(%rsp), %xmm3
3273	por	%xmm0, %xmm2
3274	por	%xmm1, %xmm3
3275
3276	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
3277	movdqa	%xmm4, %xmm1
3278	pandn	%xmm2, %xmm0
3279	movdqa	%xmm4, %xmm2
3280	pandn	%xmm3, %xmm1
3281	movdqa	%xmm4, %xmm3
3282	pand	$in1_z(%rsp), %xmm2
3283	pand	$in1_z+0x10(%rsp), %xmm3
3284	por	%xmm0, %xmm2
3285	por	%xmm1, %xmm3
3286	movdqu	%xmm2, 0x40($r_ptr)
3287	movdqu	%xmm3, 0x50($r_ptr)
3288
3289	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
3290	movdqa	%xmm5, %xmm1
3291	pandn	$res_x(%rsp), %xmm0
3292	movdqa	%xmm5, %xmm2
3293	pandn	$res_x+0x10(%rsp), %xmm1
3294	movdqa	%xmm5, %xmm3
3295	pand	$in2_x(%rsp), %xmm2
3296	pand	$in2_x+0x10(%rsp), %xmm3
3297	por	%xmm0, %xmm2
3298	por	%xmm1, %xmm3
3299
3300	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
3301	movdqa	%xmm4, %xmm1
3302	pandn	%xmm2, %xmm0
3303	movdqa	%xmm4, %xmm2
3304	pandn	%xmm3, %xmm1
3305	movdqa	%xmm4, %xmm3
3306	pand	$in1_x(%rsp), %xmm2
3307	pand	$in1_x+0x10(%rsp), %xmm3
3308	por	%xmm0, %xmm2
3309	por	%xmm1, %xmm3
3310	movdqu	%xmm2, 0x00($r_ptr)
3311	movdqu	%xmm3, 0x10($r_ptr)
3312
3313	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
3314	movdqa	%xmm5, %xmm1
3315	pandn	$res_y(%rsp), %xmm0
3316	movdqa	%xmm5, %xmm2
3317	pandn	$res_y+0x10(%rsp), %xmm1
3318	movdqa	%xmm5, %xmm3
3319	pand	$in2_y(%rsp), %xmm2
3320	pand	$in2_y+0x10(%rsp), %xmm3
3321	por	%xmm0, %xmm2
3322	por	%xmm1, %xmm3
3323
3324	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
3325	movdqa	%xmm4, %xmm1
3326	pandn	%xmm2, %xmm0
3327	movdqa	%xmm4, %xmm2
3328	pandn	%xmm3, %xmm1
3329	movdqa	%xmm4, %xmm3
3330	pand	$in1_y(%rsp), %xmm2
3331	pand	$in1_y+0x10(%rsp), %xmm3
3332	por	%xmm0, %xmm2
3333	por	%xmm1, %xmm3
3334	movdqu	%xmm2, 0x20($r_ptr)
3335	movdqu	%xmm3, 0x30($r_ptr)
3336
3337.Ladd_done$x:
3338	lea	32*18+56(%rsp), %rsi
3339.cfi_def_cfa	%rsi,8
3340	mov	-48(%rsi),%r15
3341.cfi_restore	%r15
3342	mov	-40(%rsi),%r14
3343.cfi_restore	%r14
3344	mov	-32(%rsi),%r13
3345.cfi_restore	%r13
3346	mov	-24(%rsi),%r12
3347.cfi_restore	%r12
3348	mov	-16(%rsi),%rbx
3349.cfi_restore	%rbx
3350	mov	-8(%rsi),%rbp
3351.cfi_restore	%rbp
3352	lea	(%rsi),%rsp
3353.cfi_def_cfa_register	%rsp
3354.Lpoint_add${x}_epilogue:
3355	ret
3356.cfi_endproc
3357.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
3358___
3359}
3360&gen_add("q");
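########################################################################
# Likewise, the call sequence in gen_add above evaluates the textbook
# Jacobian point addition (per the inline call comments):
#
#	U1 = X1*Z2^2	S1 = Y1*Z2^3
#	U2 = X2*Z1^2	S2 = Y2*Z1^3
#	H  = U2 - U1	R  = S2 - S1
#	X3 = R^2 - H^3 - 2*U1*H^2
#	Y3 = R*(U1*H^2 - X3) - S1*H^3
#	Z3 = H*Z1*Z2
#
# with the special cases (an input at infinity, or U1 == U2) handled by
# the masked copies and the .Ladd_double/.Ladd_proceed paths above.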
3361
3362sub gen_add_affine () {
3363    my $x = shift;
3364    my ($src0,$sfx,$bias);
3365    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
3366	$res_x,$res_y,$res_z,
3367	$in1_x,$in1_y,$in1_z,
3368	$in2_x,$in2_y)=map(32*$_,(0..14));
3369    my $Z1sqr = $S2;
3370
3371    if ($x ne "x") {
3372	$src0 = "%rax";
3373	$sfx  = "";
3374	$bias = 0;
3375
3376$code.=<<___;
3377.globl	ecp_nistz256_point_add_affine
3378.type	ecp_nistz256_point_add_affine,\@function,3
3379.align	32
3380ecp_nistz256_point_add_affine:
3381.cfi_startproc
3382	_CET_ENDBR
3383___
3384$code.=<<___	if ($addx);
3385	leaq	OPENSSL_ia32cap_P(%rip), %rcx
3386	mov	8(%rcx), %rcx
3387	and	\$0x80100, %ecx
3388	cmp	\$0x80100, %ecx
3389	je	.Lpoint_add_affinex
3390___
3391    } else {
3392	$src0 = "%rdx";
3393	$sfx  = "x";
3394	$bias = 128;
3395
3396$code.=<<___;
3397.type	ecp_nistz256_point_add_affinex,\@function,3
3398.align	32
3399ecp_nistz256_point_add_affinex:
3400.cfi_startproc
3401.Lpoint_add_affinex:
3402___
3403    }
3404$code.=<<___;
3405	push	%rbp
3406.cfi_push	%rbp
3407	push	%rbx
3408.cfi_push	%rbx
3409	push	%r12
3410.cfi_push	%r12
3411	push	%r13
3412.cfi_push	%r13
3413	push	%r14
3414.cfi_push	%r14
3415	push	%r15
3416.cfi_push	%r15
3417	sub	\$32*15+8, %rsp
3418.cfi_adjust_cfa_offset	32*15+8
3419.Ladd_affine${x}_body:
3420
3421	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
3422	mov	$b_org, $b_ptr		# reassign
3423	movdqu	0x10($a_ptr), %xmm1
3424	movdqu	0x20($a_ptr), %xmm2
3425	movdqu	0x30($a_ptr), %xmm3
3426	movdqu	0x40($a_ptr), %xmm4
3427	movdqu	0x50($a_ptr), %xmm5
3428	 mov	0x40+8*0($a_ptr), $src0	# load original in1_z
3429	 mov	0x40+8*1($a_ptr), $acc6
3430	 mov	0x40+8*2($a_ptr), $acc7
3431	 mov	0x40+8*3($a_ptr), $acc0
3432	movdqa	%xmm0, $in1_x(%rsp)
3433	movdqa	%xmm1, $in1_x+0x10(%rsp)
3434	movdqa	%xmm2, $in1_y(%rsp)
3435	movdqa	%xmm3, $in1_y+0x10(%rsp)
3436	movdqa	%xmm4, $in1_z(%rsp)
3437	movdqa	%xmm5, $in1_z+0x10(%rsp)
3438	por	%xmm4, %xmm5
3439
3440	movdqu	0x00($b_ptr), %xmm0	# copy	*(P256_POINT_AFFINE *)$b_ptr
3441	 pshufd	\$0xb1, %xmm5, %xmm3
3442	movdqu	0x10($b_ptr), %xmm1
3443	movdqu	0x20($b_ptr), %xmm2
3444	 por	%xmm3, %xmm5
3445	movdqu	0x30($b_ptr), %xmm3
3446	movdqa	%xmm0, $in2_x(%rsp)
3447	 pshufd	\$0x1e, %xmm5, %xmm4
3448	movdqa	%xmm1, $in2_x+0x10(%rsp)
3449	por	%xmm0, %xmm1
3450	 movq	$r_ptr, %xmm0		# save $r_ptr
3451	movdqa	%xmm2, $in2_y(%rsp)
3452	movdqa	%xmm3, $in2_y+0x10(%rsp)
3453	por	%xmm2, %xmm3
3454	 por	%xmm4, %xmm5
3455	 pxor	%xmm4, %xmm4
3456	por	%xmm1, %xmm3
3457
3458	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
3459	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
3460	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
3461
3462	pcmpeqd	%xmm4, %xmm5
3463	pshufd	\$0xb1, %xmm3, %xmm4
3464	 mov	0x00($b_ptr), $src0		# $b_ptr is still valid
3465	 #lea	0x00($b_ptr), $b_ptr
3466	 mov	$acc4, $acc1			# harmonize sqr output and mul input
3467	por	%xmm3, %xmm4
3468	pshufd	\$0, %xmm5, %xmm5		# in1infty
3469	pshufd	\$0x1e, %xmm4, %xmm3
3470	 mov	$acc5, $acc2
3471	por	%xmm3, %xmm4
3472	pxor	%xmm3, %xmm3
3473	 mov	$acc6, $acc3
3474	pcmpeqd	%xmm3, %xmm4
3475	pshufd	\$0, %xmm4, %xmm4		# in2infty
3476
3477	lea	$Z1sqr-$bias(%rsp), $a_ptr
3478	mov	$acc7, $acc4
3479	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
3480	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);
3481
3482	lea	$in1_x(%rsp), $b_ptr
3483	lea	$H(%rsp), $r_ptr		# H = U2 - U1
3484	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);
3485
3486	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3487	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
3488	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
3489
3490	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3491	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3492	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
3493
3494	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3495	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
3496	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
3497
3498	lea	$in1_y(%rsp), $b_ptr
3499	lea	$R(%rsp), $r_ptr		# R = S2 - S1
3500	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);
3501
3502	`&load_for_sqr("$H(%rsp)", "$src0")`
3503	lea	$Hsqr(%rsp), $r_ptr		# H^2
3504	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
3505
3506	`&load_for_sqr("$R(%rsp)", "$src0")`
3507	lea	$Rsqr(%rsp), $r_ptr		# R^2
3508	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
3509
3510	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
3511	lea	$Hcub(%rsp), $r_ptr		# H^3
3512	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
3513
3514	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3515	lea	$U2(%rsp), $r_ptr		# U1*H^2
3516	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
3517___
3518{
3519#######################################################################
3520# operate in 4-5-0-1 "name space" that matches multiplication output
3521#
3522my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3523my ($poly1, $poly3)=($acc6,$acc7);
3524
3525$code.=<<___;
3526	#lea	$U2(%rsp), $a_ptr
3527	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
3528	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
3529
3530	xor	$t4, $t4
3531	add	$acc0, $acc0		# a0:a3+a0:a3
3532	lea	$Rsqr(%rsp), $a_ptr
3533	adc	$acc1, $acc1
3534	 mov	$acc0, $t0
3535	adc	$acc2, $acc2
3536	adc	$acc3, $acc3
3537	 mov	$acc1, $t1
3538	adc	\$0, $t4
3539
3540	sub	\$-1, $acc0
3541	 mov	$acc2, $t2
3542	sbb	$poly1, $acc1
3543	sbb	\$0, $acc2
3544	 mov	$acc3, $t3
3545	sbb	$poly3, $acc3
3546	sbb	\$0, $t4
3547
3548	cmovc	$t0, $acc0
3549	mov	8*0($a_ptr), $t0
3550	cmovc	$t1, $acc1
3551	mov	8*1($a_ptr), $t1
3552	cmovc	$t2, $acc2
3553	mov	8*2($a_ptr), $t2
3554	cmovc	$t3, $acc3
3555	mov	8*3($a_ptr), $t3
3556
3557	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
3558
3559	lea	$Hcub(%rsp), $b_ptr
3560	lea	$res_x(%rsp), $r_ptr
3561	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
3562
3563	mov	$U2+8*0(%rsp), $t0
3564	mov	$U2+8*1(%rsp), $t1
3565	mov	$U2+8*2(%rsp), $t2
3566	mov	$U2+8*3(%rsp), $t3
3567	lea	$H(%rsp), $r_ptr
3568
3569	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);
3570
3571	mov	$acc0, 8*0($r_ptr)		# save the result, as
3572	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
3573	mov	$acc2, 8*2($r_ptr)
3574	mov	$acc3, 8*3($r_ptr)
3575___
3576}
3577$code.=<<___;
3578	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
3579	lea	$S2(%rsp), $r_ptr
3580	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);
3581
3582	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
3583	lea	$H(%rsp), $r_ptr
3584	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);
3585
3586	lea	$S2(%rsp), $b_ptr
3587	lea	$res_y(%rsp), $r_ptr
3588	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);
3589
3590	movq	%xmm0, $r_ptr		# restore $r_ptr
3591
3592	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE, in1infty);
3593	movdqa	%xmm5, %xmm1
3594	pandn	$res_z(%rsp), %xmm0
3595	movdqa	%xmm5, %xmm2
3596	pandn	$res_z+0x10(%rsp), %xmm1
3597	movdqa	%xmm5, %xmm3
3598	pand	.LONE_mont(%rip), %xmm2
3599	pand	.LONE_mont+0x10(%rip), %xmm3
3600	por	%xmm0, %xmm2
3601	por	%xmm1, %xmm3
3602
3603	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
3604	movdqa	%xmm4, %xmm1
3605	pandn	%xmm2, %xmm0
3606	movdqa	%xmm4, %xmm2
3607	pandn	%xmm3, %xmm1
3608	movdqa	%xmm4, %xmm3
3609	pand	$in1_z(%rsp), %xmm2
3610	pand	$in1_z+0x10(%rsp), %xmm3
3611	por	%xmm0, %xmm2
3612	por	%xmm1, %xmm3
3613	movdqu	%xmm2, 0x40($r_ptr)
3614	movdqu	%xmm3, 0x50($r_ptr)
3615
3616	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
3617	movdqa	%xmm5, %xmm1
3618	pandn	$res_x(%rsp), %xmm0
3619	movdqa	%xmm5, %xmm2
3620	pandn	$res_x+0x10(%rsp), %xmm1
3621	movdqa	%xmm5, %xmm3
3622	pand	$in2_x(%rsp), %xmm2
3623	pand	$in2_x+0x10(%rsp), %xmm3
3624	por	%xmm0, %xmm2
3625	por	%xmm1, %xmm3
3626
3627	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
3628	movdqa	%xmm4, %xmm1
3629	pandn	%xmm2, %xmm0
3630	movdqa	%xmm4, %xmm2
3631	pandn	%xmm3, %xmm1
3632	movdqa	%xmm4, %xmm3
3633	pand	$in1_x(%rsp), %xmm2
3634	pand	$in1_x+0x10(%rsp), %xmm3
3635	por	%xmm0, %xmm2
3636	por	%xmm1, %xmm3
3637	movdqu	%xmm2, 0x00($r_ptr)
3638	movdqu	%xmm3, 0x10($r_ptr)
3639
3640	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
3641	movdqa	%xmm5, %xmm1
3642	pandn	$res_y(%rsp), %xmm0
3643	movdqa	%xmm5, %xmm2
3644	pandn	$res_y+0x10(%rsp), %xmm1
3645	movdqa	%xmm5, %xmm3
3646	pand	$in2_y(%rsp), %xmm2
3647	pand	$in2_y+0x10(%rsp), %xmm3
3648	por	%xmm0, %xmm2
3649	por	%xmm1, %xmm3
3650
3651	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
3652	movdqa	%xmm4, %xmm1
3653	pandn	%xmm2, %xmm0
3654	movdqa	%xmm4, %xmm2
3655	pandn	%xmm3, %xmm1
3656	movdqa	%xmm4, %xmm3
3657	pand	$in1_y(%rsp), %xmm2
3658	pand	$in1_y+0x10(%rsp), %xmm3
3659	por	%xmm0, %xmm2
3660	por	%xmm1, %xmm3
3661	movdqu	%xmm2, 0x20($r_ptr)
3662	movdqu	%xmm3, 0x30($r_ptr)
3663
3664	lea	32*15+56(%rsp), %rsi
3665.cfi_def_cfa	%rsi,8
3666	mov	-48(%rsi),%r15
3667.cfi_restore	%r15
3668	mov	-40(%rsi),%r14
3669.cfi_restore	%r14
3670	mov	-32(%rsi),%r13
3671.cfi_restore	%r13
3672	mov	-24(%rsi),%r12
3673.cfi_restore	%r12
3674	mov	-16(%rsi),%rbx
3675.cfi_restore	%rbx
3676	mov	-8(%rsi),%rbp
3677.cfi_restore	%rbp
3678	lea	(%rsi),%rsp
3679.cfi_def_cfa_register	%rsp
3680.Ladd_affine${x}_epilogue:
3681	ret
3682.cfi_endproc
3683.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
3684___
3685}
3686&gen_add_affine("q");
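########################################################################
# gen_add_affine above is the same addition specialised to an affine
# second input, i.e. Z2 = 1 (per the inline call comments):
#
#	U2 = X2*Z1^2	S2 = Y2*Z1^3
#	H  = U2 - X1	R  = S2 - Y1
#	X3 = R^2 - H^3 - 2*X1*H^2
#	Y3 = R*(X1*H^2 - X3) - Y1*H^3
#	Z3 = H*Z1
#
# with .LONE_mont substituted for Z3 when the Jacobian input is the
# point at infinity.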
3687
3688########################################################################
3689# AD*X magic
3690#
3691if ($addx) {								{
3692########################################################################
3693# operate in 4-5-0-1 "name space" that matches multiplication output
3694#
3695my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3696
3697$code.=<<___;
3698.type	__ecp_nistz256_add_tox,\@abi-omnipotent
3699.align	32
3700__ecp_nistz256_add_tox:
3701.cfi_startproc
3702	xor	$t4, $t4
3703	adc	8*0($b_ptr), $a0
3704	adc	8*1($b_ptr), $a1
3705	 mov	$a0, $t0
3706	adc	8*2($b_ptr), $a2
3707	adc	8*3($b_ptr), $a3
3708	 mov	$a1, $t1
3709	adc	\$0, $t4
3710
3711	xor	$t3, $t3
3712	sbb	\$-1, $a0
3713	 mov	$a2, $t2
3714	sbb	$poly1, $a1
3715	sbb	\$0, $a2
3716	 mov	$a3, $t3
3717	sbb	$poly3, $a3
3718	sbb	\$0, $t4
3719
3720	cmovc	$t0, $a0
3721	cmovc	$t1, $a1
3722	mov	$a0, 8*0($r_ptr)
3723	cmovc	$t2, $a2
3724	mov	$a1, 8*1($r_ptr)
3725	cmovc	$t3, $a3
3726	mov	$a2, 8*2($r_ptr)
3727	mov	$a3, 8*3($r_ptr)
3728
3729	ret
3730.cfi_endproc
3731.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
3732
3733.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
3734.align	32
3735__ecp_nistz256_sub_fromx:
3736.cfi_startproc
3737	xor	$t4, $t4
3738	sbb	8*0($b_ptr), $a0
3739	sbb	8*1($b_ptr), $a1
3740	 mov	$a0, $t0
3741	sbb	8*2($b_ptr), $a2
3742	sbb	8*3($b_ptr), $a3
3743	 mov	$a1, $t1
3744	sbb	\$0, $t4
3745
3746	xor	$t3, $t3
3747	adc	\$-1, $a0
3748	 mov	$a2, $t2
3749	adc	$poly1, $a1
3750	adc	\$0, $a2
3751	 mov	$a3, $t3
3752	adc	$poly3, $a3
3753
3754	bt	\$0, $t4
3755	cmovnc	$t0, $a0
3756	cmovnc	$t1, $a1
3757	mov	$a0, 8*0($r_ptr)
3758	cmovnc	$t2, $a2
3759	mov	$a1, 8*1($r_ptr)
3760	cmovnc	$t3, $a3
3761	mov	$a2, 8*2($r_ptr)
3762	mov	$a3, 8*3($r_ptr)
3763
3764	ret
3765.cfi_endproc
3766.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
3767
3768.type	__ecp_nistz256_subx,\@abi-omnipotent
3769.align	32
3770__ecp_nistz256_subx:
3771.cfi_startproc
3772	xor	$t4, $t4
3773	sbb	$a0, $t0
3774	sbb	$a1, $t1
3775	 mov	$t0, $a0
3776	sbb	$a2, $t2
3777	sbb	$a3, $t3
3778	 mov	$t1, $a1
3779	sbb	\$0, $t4
3780
3781	xor	$a3 ,$a3
3782	adc	\$-1, $t0
3783	 mov	$t2, $a2
3784	adc	$poly1, $t1
3785	adc	\$0, $t2
3786	 mov	$t3, $a3
3787	adc	$poly3, $t3
3788
3789	bt	\$0, $t4
3790	cmovc	$t0, $a0
3791	cmovc	$t1, $a1
3792	cmovc	$t2, $a2
3793	cmovc	$t3, $a3
3794
3795	ret
3796.cfi_endproc
3797.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx
3798
3799.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
3800.align	32
3801__ecp_nistz256_mul_by_2x:
3802.cfi_startproc
3803	xor	$t4, $t4
3804	adc	$a0, $a0		# a0:a3+a0:a3
3805	adc	$a1, $a1
3806	 mov	$a0, $t0
3807	adc	$a2, $a2
3808	adc	$a3, $a3
3809	 mov	$a1, $t1
3810	adc	\$0, $t4
3811
3812	xor	$t3, $t3
3813	sbb	\$-1, $a0
3814	 mov	$a2, $t2
3815	sbb	$poly1, $a1
3816	sbb	\$0, $a2
3817	 mov	$a3, $t3
3818	sbb	$poly3, $a3
3819	sbb	\$0, $t4
3820
3821	cmovc	$t0, $a0
3822	cmovc	$t1, $a1
3823	mov	$a0, 8*0($r_ptr)
3824	cmovc	$t2, $a2
3825	mov	$a1, 8*1($r_ptr)
3826	cmovc	$t3, $a3
3827	mov	$a2, 8*2($r_ptr)
3828	mov	$a3, 8*3($r_ptr)
3829
3830	ret
3831.cfi_endproc
3832.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
3833___
3834									}
3835&gen_double("x");
3836&gen_add("x");
3837&gen_add_affine("x");
3838}
3839}}}
3840
3841# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3842#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
3843if ($win64) {
3844$rec="%rcx";
3845$frame="%rdx";
3846$context="%r8";
3847$disp="%r9";
3848
3849$code.=<<___;
3850.extern	__imp_RtlVirtualUnwind
3851
3852.type	short_handler,\@abi-omnipotent
3853.align	16
3854short_handler:
3855	push	%rsi
3856	push	%rdi
3857	push	%rbx
3858	push	%rbp
3859	push	%r12
3860	push	%r13
3861	push	%r14
3862	push	%r15
3863	pushfq
3864	sub	\$64,%rsp
3865
3866	mov	120($context),%rax	# pull context->Rax
3867	mov	248($context),%rbx	# pull context->Rip
3868
3869	mov	8($disp),%rsi		# disp->ImageBase
3870	mov	56($disp),%r11		# disp->HandlerData
3871
3872	mov	0(%r11),%r10d		# HandlerData[0]
3873	lea	(%rsi,%r10),%r10	# end of prologue label
3874	cmp	%r10,%rbx		# context->Rip<end of prologue label
3875	jb	.Lcommon_seh_tail
3876
3877	mov	152($context),%rax	# pull context->Rsp
3878
3879	mov	4(%r11),%r10d		# HandlerData[1]
3880	lea	(%rsi,%r10),%r10	# epilogue label
3881	cmp	%r10,%rbx		# context->Rip>=epilogue label
3882	jae	.Lcommon_seh_tail
3883
3884	lea	16(%rax),%rax
3885
3886	mov	-8(%rax),%r12
3887	mov	-16(%rax),%r13
3888	mov	%r12,216($context)	# restore context->R12
3889	mov	%r13,224($context)	# restore context->R13
3890
3891	jmp	.Lcommon_seh_tail
3892.size	short_handler,.-short_handler
3893
3894.type	full_handler,\@abi-omnipotent
3895.align	16
3896full_handler:
3897	push	%rsi
3898	push	%rdi
3899	push	%rbx
3900	push	%rbp
3901	push	%r12
3902	push	%r13
3903	push	%r14
3904	push	%r15
3905	pushfq
3906	sub	\$64,%rsp
3907
3908	mov	120($context),%rax	# pull context->Rax
3909	mov	248($context),%rbx	# pull context->Rip
3910
3911	mov	8($disp),%rsi		# disp->ImageBase
3912	mov	56($disp),%r11		# disp->HandlerData
3913
3914	mov	0(%r11),%r10d		# HandlerData[0]
3915	lea	(%rsi,%r10),%r10	# end of prologue label
3916	cmp	%r10,%rbx		# context->Rip<end of prologue label
3917	jb	.Lcommon_seh_tail
3918
3919	mov	152($context),%rax	# pull context->Rsp
3920
3921	mov	4(%r11),%r10d		# HandlerData[1]
3922	lea	(%rsi,%r10),%r10	# epilogue label
3923	cmp	%r10,%rbx		# context->Rip>=epilogue label
3924	jae	.Lcommon_seh_tail
3925
3926	mov	8(%r11),%r10d		# HandlerData[2]
3927	lea	(%rax,%r10),%rax
3928
3929	mov	-8(%rax),%rbp
3930	mov	-16(%rax),%rbx
3931	mov	-24(%rax),%r12
3932	mov	-32(%rax),%r13
3933	mov	-40(%rax),%r14
3934	mov	-48(%rax),%r15
3935	mov	%rbx,144($context)	# restore context->Rbx
3936	mov	%rbp,160($context)	# restore context->Rbp
3937	mov	%r12,216($context)	# restore context->R12
3938	mov	%r13,224($context)	# restore context->R13
3939	mov	%r14,232($context)	# restore context->R14
3940	mov	%r15,240($context)	# restore context->R15
3941
3942.Lcommon_seh_tail:
3943	mov	8(%rax),%rdi
3944	mov	16(%rax),%rsi
3945	mov	%rax,152($context)	# restore context->Rsp
3946	mov	%rsi,168($context)	# restore context->Rsi
3947	mov	%rdi,176($context)	# restore context->Rdi
3948
3949	mov	40($disp),%rdi		# disp->ContextRecord
3950	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
3952	.long	0xa548f3fc		# cld; rep movsq
3953
3954	mov	$disp,%rsi
3955	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3956	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3957	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3958	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3959	mov	40(%rsi),%r10		# disp->ContextRecord
3960	lea	56(%rsi),%r11		# &disp->HandlerData
3961	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3962	mov	%r10,32(%rsp)		# arg5
3963	mov	%r11,40(%rsp)		# arg6
3964	mov	%r12,48(%rsp)		# arg7
3965	mov	%rcx,56(%rsp)		# arg8, (NULL)
3966	call	*__imp_RtlVirtualUnwind(%rip)
3967
3968	mov	\$1,%eax		# ExceptionContinueSearch
3969	add	\$64,%rsp
3970	popfq
3971	pop	%r15
3972	pop	%r14
3973	pop	%r13
3974	pop	%r12
3975	pop	%rbp
3976	pop	%rbx
3977	pop	%rdi
3978	pop	%rsi
3979	ret
3980.size	full_handler,.-full_handler
3981
3982.section	.pdata
3983.align	4
3984	.rva	.LSEH_begin_ecp_nistz256_neg
3985	.rva	.LSEH_end_ecp_nistz256_neg
3986	.rva	.LSEH_info_ecp_nistz256_neg
3987
3988	.rva	.LSEH_begin_ecp_nistz256_ord_mul_mont
3989	.rva	.LSEH_end_ecp_nistz256_ord_mul_mont
3990	.rva	.LSEH_info_ecp_nistz256_ord_mul_mont
3991
3992	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_mont
3993	.rva	.LSEH_end_ecp_nistz256_ord_sqr_mont
3994	.rva	.LSEH_info_ecp_nistz256_ord_sqr_mont
3995___
3996$code.=<<___	if ($addx);
3997	.rva	.LSEH_begin_ecp_nistz256_ord_mul_montx
3998	.rva	.LSEH_end_ecp_nistz256_ord_mul_montx
3999	.rva	.LSEH_info_ecp_nistz256_ord_mul_montx
4000
4001	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_montx
4002	.rva	.LSEH_end_ecp_nistz256_ord_sqr_montx
4003	.rva	.LSEH_info_ecp_nistz256_ord_sqr_montx
4004___
4005$code.=<<___;
4006	.rva	.LSEH_begin_ecp_nistz256_mul_mont
4007	.rva	.LSEH_end_ecp_nistz256_mul_mont
4008	.rva	.LSEH_info_ecp_nistz256_mul_mont
4009
4010	.rva	.LSEH_begin_ecp_nistz256_sqr_mont
4011	.rva	.LSEH_end_ecp_nistz256_sqr_mont
4012	.rva	.LSEH_info_ecp_nistz256_sqr_mont
4013
4014	.rva	.LSEH_begin_ecp_nistz256_select_w5
4015	.rva	.LSEH_end_ecp_nistz256_select_w5
4016	.rva	.LSEH_info_ecp_nistz256_select_wX
4017
4018	.rva	.LSEH_begin_ecp_nistz256_select_w7
4019	.rva	.LSEH_end_ecp_nistz256_select_w7
4020	.rva	.LSEH_info_ecp_nistz256_select_wX
4021___
4022$code.=<<___	if ($avx>1);
4023	.rva	.LSEH_begin_ecp_nistz256_avx2_select_w5
4024	.rva	.LSEH_end_ecp_nistz256_avx2_select_w5
4025	.rva	.LSEH_info_ecp_nistz256_avx2_select_wX
4026
4027	.rva	.LSEH_begin_ecp_nistz256_avx2_select_w7
4028	.rva	.LSEH_end_ecp_nistz256_avx2_select_w7
4029	.rva	.LSEH_info_ecp_nistz256_avx2_select_wX
4030___
4031$code.=<<___;
4032	.rva	.LSEH_begin_ecp_nistz256_point_double
4033	.rva	.LSEH_end_ecp_nistz256_point_double
4034	.rva	.LSEH_info_ecp_nistz256_point_double
4035
4036	.rva	.LSEH_begin_ecp_nistz256_point_add
4037	.rva	.LSEH_end_ecp_nistz256_point_add
4038	.rva	.LSEH_info_ecp_nistz256_point_add
4039
4040	.rva	.LSEH_begin_ecp_nistz256_point_add_affine
4041	.rva	.LSEH_end_ecp_nistz256_point_add_affine
4042	.rva	.LSEH_info_ecp_nistz256_point_add_affine
4043___
4044$code.=<<___ if ($addx);
4045	.rva	.LSEH_begin_ecp_nistz256_point_doublex
4046	.rva	.LSEH_end_ecp_nistz256_point_doublex
4047	.rva	.LSEH_info_ecp_nistz256_point_doublex
4048
4049	.rva	.LSEH_begin_ecp_nistz256_point_addx
4050	.rva	.LSEH_end_ecp_nistz256_point_addx
4051	.rva	.LSEH_info_ecp_nistz256_point_addx
4052
4053	.rva	.LSEH_begin_ecp_nistz256_point_add_affinex
4054	.rva	.LSEH_end_ecp_nistz256_point_add_affinex
4055	.rva	.LSEH_info_ecp_nistz256_point_add_affinex
4056___
4057$code.=<<___;
4058
4059.section	.xdata
4060.align	8
4061.LSEH_info_ecp_nistz256_neg:
4062	.byte	9,0,0,0
4063	.rva	short_handler
4064	.rva	.Lneg_body,.Lneg_epilogue		# HandlerData[]
4065.LSEH_info_ecp_nistz256_ord_mul_mont:
4066	.byte	9,0,0,0
4067	.rva	full_handler
4068	.rva	.Lord_mul_body,.Lord_mul_epilogue	# HandlerData[]
4069	.long	48,0
4070.LSEH_info_ecp_nistz256_ord_sqr_mont:
4071	.byte	9,0,0,0
4072	.rva	full_handler
4073	.rva	.Lord_sqr_body,.Lord_sqr_epilogue	# HandlerData[]
4074	.long	48,0
4075___
4076$code.=<<___ if ($addx);
4077.LSEH_info_ecp_nistz256_ord_mul_montx:
4078	.byte	9,0,0,0
4079	.rva	full_handler
4080	.rva	.Lord_mulx_body,.Lord_mulx_epilogue	# HandlerData[]
4081	.long	48,0
4082.LSEH_info_ecp_nistz256_ord_sqr_montx:
4083	.byte	9,0,0,0
4084	.rva	full_handler
4085	.rva	.Lord_sqrx_body,.Lord_sqrx_epilogue	# HandlerData[]
4086	.long	48,0
4087___
4088$code.=<<___;
4089.LSEH_info_ecp_nistz256_mul_mont:
4090	.byte	9,0,0,0
4091	.rva	full_handler
4092	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
4093	.long	48,0
4094.LSEH_info_ecp_nistz256_sqr_mont:
4095	.byte	9,0,0,0
4096	.rva	full_handler
4097	.rva	.Lsqr_body,.Lsqr_epilogue		# HandlerData[]
4098	.long	48,0
4099.LSEH_info_ecp_nistz256_select_wX:
4100	.byte	0x01,0x33,0x16,0x00
4101	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
4102	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
4103	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
4104	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
4105	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
4106	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
4107	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
4108	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
4109	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
4110	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
4111	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
4112	.align	8
4113___
4114$code.=<<___	if ($avx>1);
4115.LSEH_info_ecp_nistz256_avx2_select_wX:
4116	.byte	0x01,0x36,0x17,0x0b
4117	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
4118	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
4119	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
4120	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
4121	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
4122	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
4123	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
4124	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
4125	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
4126	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
4127	.byte	0x04,0x01,0x15,0x00	# sub	  rsp,0xa8
4128	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
4129	.align	8
4130___
4131$code.=<<___;
4132.LSEH_info_ecp_nistz256_point_double:
4133	.byte	9,0,0,0
4134	.rva	full_handler
4135	.rva	.Lpoint_doubleq_body,.Lpoint_doubleq_epilogue	# HandlerData[]
4136	.long	32*5+56,0
4137.LSEH_info_ecp_nistz256_point_add:
4138	.byte	9,0,0,0
4139	.rva	full_handler
4140	.rva	.Lpoint_addq_body,.Lpoint_addq_epilogue		# HandlerData[]
4141	.long	32*18+56,0
4142.LSEH_info_ecp_nistz256_point_add_affine:
4143	.byte	9,0,0,0
4144	.rva	full_handler
4145	.rva	.Ladd_affineq_body,.Ladd_affineq_epilogue	# HandlerData[]
4146	.long	32*15+56,0
4147___
4148$code.=<<___ if ($addx);
4149.align	8
4150.LSEH_info_ecp_nistz256_point_doublex:
4151	.byte	9,0,0,0
4152	.rva	full_handler
4153	.rva	.Lpoint_doublex_body,.Lpoint_doublex_epilogue	# HandlerData[]
4154	.long	32*5+56,0
4155.LSEH_info_ecp_nistz256_point_addx:
4156	.byte	9,0,0,0
4157	.rva	full_handler
4158	.rva	.Lpoint_addx_body,.Lpoint_addx_epilogue		# HandlerData[]
4159	.long	32*18+56,0
4160.LSEH_info_ecp_nistz256_point_add_affinex:
4161	.byte	9,0,0,0
4162	.rva	full_handler
4163	.rva	.Ladd_affinex_body,.Ladd_affinex_epilogue	# HandlerData[]
4164	.long	32*15+56,0
4165___
4166}
4167
4168$code =~ s/\`([^\`]*)\`/eval $1/gem;
4169print $code;
4170close STDOUT or die "error closing STDOUT: $!";
4171