1#! /usr/bin/env perl
2# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3# Copyright (c) 2014, Intel Corporation. All Rights Reserved.
4# Copyright (c) 2015 CloudFlare, Inc.
5#
6# Licensed under the OpenSSL license (the "License").  You may not use
7# this file except in compliance with the License.  You can obtain a copy
8# in the file LICENSE in the source distribution or at
9# https://www.openssl.org/source/license.html
10#
11# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3)
12# (1) Intel Corporation, Israel Development Center, Haifa, Israel
13# (2) University of Haifa, Israel
14# (3) CloudFlare, Inc.
15#
16# Reference:
17# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
18#                          256 Bit Primes"
19
20# Further optimization by <appro@openssl.org>:
21#
22#		this/original	with/without -DECP_NISTZ256_ASM(*)
23# Opteron	+15-49%		+150-195%
24# Bulldozer	+18-45%		+175-240%
25# P4		+24-46%		+100-150%
26# Westmere	+18-34%		+87-160%
27# Sandy Bridge	+14-35%		+120-185%
28# Ivy Bridge	+11-35%		+125-180%
29# Haswell	+10-37%		+160-200%
30# Broadwell	+24-58%		+210-270%
31# Atom		+20-50%		+180-240%
32# VIA Nano	+50-160%	+480-480%
33#
34# (*)	"without -DECP_NISTZ256_ASM" refers to build with
35#	"enable-ec_nistp_64_gcc_128";
36#
37# Ranges denote minimum and maximum improvement coefficients depending
38# on benchmark. In the "this/original" column the lower coefficient is for
39# ECDSA sign, while in the "with/without" column the lower coefficient is
40# for ECDH key agreement and the higher one for ECDSA sign, relatively the
41# fastest server-side operation. Keep in mind that +100% means 2x improvement.
42
43$flavour = shift;
44$output  = shift;
45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52die "can't locate x86_64-xlate.pl";
53
54open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
55*STDOUT=*OUT;
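
# STDOUT is redirected into the x86_64-xlate.pl translator, so the perlasm
# accumulated in $code below is rendered for whichever assembler $flavour
# selects (ELF, macosx, mingw64 or nasm/masm) when it is finally printed.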
56
57$avx = 2;
58$addx = 1;
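
# $avx and $addx only gate which code paths are emitted (the AVX2 select
# routines and the BMI2/ADX multiply/square routines); whether those paths
# are actually taken is decided at run time from OPENSSL_ia32cap_P.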
59
60$code.=<<___;
61.text
62.extern	OPENSSL_ia32cap_P
63
64# The polynomial
65.align 64
66.Lpoly:
67.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
68
69.LOne:
70.long 1,1,1,1,1,1,1,1
71.LTwo:
72.long 2,2,2,2,2,2,2,2
73.LThree:
74.long 3,3,3,3,3,3,3,3
75.LONE_mont:
76.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
77
78# Constants for computations modulo ord(p256)
79.Lord:
80.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
81.LordK:
82.quad 0xccd1c8aaee00bc4f
83___
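
# .Lpoly is the NIST P-256 prime p = 2^256 - 2^224 + 2^192 + 2^96 - 1 and
# .Lord is the group order n, both stored as little-endian 64-bit limbs;
# .LONE_mont is 1 in Montgomery form (2^256 mod p), and .LordK is the
# word-wise Montgomery constant -n^-1 mod 2^64 used by the ord_* routines.
# A minimal self-check sketch, assuming Math::BigInt is available; the
# ECP_NISTZ256_SELFTEST guard is hypothetical and normally unset, so the
# generator's behaviour is unchanged:
if ($ENV{ECP_NISTZ256_SELFTEST}) {		# hypothetical, normally unset
	require Math::BigInt;
	my $two64 = Math::BigInt->bone()->blsft(64);			# 2^64
	my $ord0  = Math::BigInt->from_hex("0xf3b9cac2fc632551");	# .Lord[0]
	my $ordk  = Math::BigInt->from_hex("0xccd1c8aaee00bc4f");	# .LordK
	die ".LordK is not -n^-1 mod 2^64\n"
		unless $ord0->bmul($ordk)->badd(1)->bmod($two64)->is_zero();
}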
84
85{
86my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
87my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
88my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
89
90$code.=<<___;
91
92################################################################################
93# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
94.globl	ecp_nistz256_neg
95.type	ecp_nistz256_neg,\@function,2
96.align	32
97ecp_nistz256_neg:
98.cfi_startproc
99	push	%r12
100.cfi_push	%r12
101	push	%r13
102.cfi_push	%r13
103.Lneg_body:
104
105	xor	$a0, $a0
106	xor	$a1, $a1
107	xor	$a2, $a2
108	xor	$a3, $a3
109	xor	$t4, $t4
110
111	sub	8*0($a_ptr), $a0
112	sbb	8*1($a_ptr), $a1
113	sbb	8*2($a_ptr), $a2
114	 mov	$a0, $t0
115	sbb	8*3($a_ptr), $a3
116	lea	.Lpoly(%rip), $a_ptr
117	 mov	$a1, $t1
118	sbb	\$0, $t4
119
120	add	8*0($a_ptr), $a0
121	 mov	$a2, $t2
122	adc	8*1($a_ptr), $a1
123	adc	8*2($a_ptr), $a2
124	 mov	$a3, $t3
125	adc	8*3($a_ptr), $a3
126	test	$t4, $t4
127
128	cmovz	$t0, $a0
129	cmovz	$t1, $a1
130	mov	$a0, 8*0($r_ptr)
131	cmovz	$t2, $a2
132	mov	$a1, 8*1($r_ptr)
133	cmovz	$t3, $a3
134	mov	$a2, 8*2($r_ptr)
135	mov	$a3, 8*3($r_ptr)
136
137	mov	0(%rsp),%r13
138.cfi_restore	%r13
139	mov	8(%rsp),%r12
140.cfi_restore	%r12
141	lea	16(%rsp),%rsp
142.cfi_adjust_cfa_offset	-16
143.Lneg_epilogue:
144	ret
145.cfi_endproc
146.size	ecp_nistz256_neg,.-ecp_nistz256_neg
147___
148}
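# Reference model for the routine above (an illustrative sketch; the helper
# name _neg_mod_p256_ref is hypothetical and never called by this generator):
# ecp_nistz256_neg returns p - a for a != 0 and 0 for a == 0, which is what
# the borrow tracked in %r13 and the cmovz sequence implement without a branch.
sub _neg_mod_p256_ref {
	require Math::BigInt;
	my ($x) = @_;			# Math::BigInt in [0, p)
	my $p = Math::BigInt->from_hex("0xffffffff00000001000000000000000000000000" .
				       "ffffffffffffffffffffffff");	# .Lpoly
	return $x->is_zero() ? Math::BigInt->bzero() : $p->bsub($x);
}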
149{
150my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
151my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
152my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
153my ($poly1,$poly3)=($acc6,$acc7);
154
155$code.=<<___;
156################################################################################
157# void ecp_nistz256_ord_mul_mont(
158#   uint64_t res[4],
159#   uint64_t a[4],
160#   uint64_t b[4]);
161
162.globl	ecp_nistz256_ord_mul_mont
163.type	ecp_nistz256_ord_mul_mont,\@function,3
164.align	32
165ecp_nistz256_ord_mul_mont:
166.cfi_startproc
167___
168$code.=<<___	if ($addx);
169	leaq	OPENSSL_ia32cap_P(%rip), %rcx
170	mov	8(%rcx), %rcx
171	and	\$0x80100, %ecx
172	cmp	\$0x80100, %ecx
173	je	.Lecp_nistz256_ord_mul_montx
174___
175$code.=<<___;
176	push	%rbp
177.cfi_push	%rbp
178	push	%rbx
179.cfi_push	%rbx
180	push	%r12
181.cfi_push	%r12
182	push	%r13
183.cfi_push	%r13
184	push	%r14
185.cfi_push	%r14
186	push	%r15
187.cfi_push	%r15
188.Lord_mul_body:
189
190	mov	8*0($b_org), %rax
191	mov	$b_org, $b_ptr
192	lea	.Lord(%rip), %r14
193	mov	.LordK(%rip), %r15
194
195	################################# * b[0]
196	mov	%rax, $t0
197	mulq	8*0($a_ptr)
198	mov	%rax, $acc0
199	mov	$t0, %rax
200	mov	%rdx, $acc1
201
202	mulq	8*1($a_ptr)
203	add	%rax, $acc1
204	mov	$t0, %rax
205	adc	\$0, %rdx
206	mov	%rdx, $acc2
207
208	mulq	8*2($a_ptr)
209	add	%rax, $acc2
210	mov	$t0, %rax
211	adc	\$0, %rdx
212
213	 mov	$acc0, $acc5
214	 imulq	%r15,$acc0
215
216	mov	%rdx, $acc3
217	mulq	8*3($a_ptr)
218	add	%rax, $acc3
219	 mov	$acc0, %rax
220	adc	\$0, %rdx
221	mov	%rdx, $acc4
222
223	################################# First reduction step
224	mulq	8*0(%r14)
225	mov	$acc0, $t1
226	add	%rax, $acc5		# guaranteed to be zero
227	mov	$acc0, %rax
228	adc	\$0, %rdx
229	mov	%rdx, $t0
230
231	sub	$acc0, $acc2
232	sbb	\$0, $acc0		# can't borrow
233
234	mulq	8*1(%r14)
235	add	$t0, $acc1
236	adc	\$0, %rdx
237	add	%rax, $acc1
238	mov	$t1, %rax
239	adc	%rdx, $acc2
240	mov	$t1, %rdx
241	adc	\$0, $acc0		# can't overflow
242
243	shl	\$32, %rax
244	shr	\$32, %rdx
245	sub	%rax, $acc3
246	 mov	8*1($b_ptr), %rax
247	sbb	%rdx, $t1		# can't borrow
248
249	add	$acc0, $acc3
250	adc	$t1, $acc4
251	adc	\$0, $acc5
252
253	################################# * b[1]
254	mov	%rax, $t0
255	mulq	8*0($a_ptr)
256	add	%rax, $acc1
257	mov	$t0, %rax
258	adc	\$0, %rdx
259	mov	%rdx, $t1
260
261	mulq	8*1($a_ptr)
262	add	$t1, $acc2
263	adc	\$0, %rdx
264	add	%rax, $acc2
265	mov	$t0, %rax
266	adc	\$0, %rdx
267	mov	%rdx, $t1
268
269	mulq	8*2($a_ptr)
270	add	$t1, $acc3
271	adc	\$0, %rdx
272	add	%rax, $acc3
273	mov	$t0, %rax
274	adc	\$0, %rdx
275
276	 mov	$acc1, $t0
277	 imulq	%r15, $acc1
278
279	mov	%rdx, $t1
280	mulq	8*3($a_ptr)
281	add	$t1, $acc4
282	adc	\$0, %rdx
283	xor	$acc0, $acc0
284	add	%rax, $acc4
285	 mov	$acc1, %rax
286	adc	%rdx, $acc5
287	adc	\$0, $acc0
288
289	################################# Second reduction step
290	mulq	8*0(%r14)
291	mov	$acc1, $t1
292	add	%rax, $t0		# guaranteed to be zero
293	mov	$acc1, %rax
294	adc	%rdx, $t0
295
296	sub	$acc1, $acc3
297	sbb	\$0, $acc1		# can't borrow
298
299	mulq	8*1(%r14)
300	add	$t0, $acc2
301	adc	\$0, %rdx
302	add	%rax, $acc2
303	mov	$t1, %rax
304	adc	%rdx, $acc3
305	mov	$t1, %rdx
306	adc	\$0, $acc1		# can't overflow
307
308	shl	\$32, %rax
309	shr	\$32, %rdx
310	sub	%rax, $acc4
311	 mov	8*2($b_ptr), %rax
312	sbb	%rdx, $t1		# can't borrow
313
314	add	$acc1, $acc4
315	adc	$t1, $acc5
316	adc	\$0, $acc0
317
318	################################## * b[2]
319	mov	%rax, $t0
320	mulq	8*0($a_ptr)
321	add	%rax, $acc2
322	mov	$t0, %rax
323	adc	\$0, %rdx
324	mov	%rdx, $t1
325
326	mulq	8*1($a_ptr)
327	add	$t1, $acc3
328	adc	\$0, %rdx
329	add	%rax, $acc3
330	mov	$t0, %rax
331	adc	\$0, %rdx
332	mov	%rdx, $t1
333
334	mulq	8*2($a_ptr)
335	add	$t1, $acc4
336	adc	\$0, %rdx
337	add	%rax, $acc4
338	mov	$t0, %rax
339	adc	\$0, %rdx
340
341	 mov	$acc2, $t0
342	 imulq	%r15, $acc2
343
344	mov	%rdx, $t1
345	mulq	8*3($a_ptr)
346	add	$t1, $acc5
347	adc	\$0, %rdx
348	xor	$acc1, $acc1
349	add	%rax, $acc5
350	 mov	$acc2, %rax
351	adc	%rdx, $acc0
352	adc	\$0, $acc1
353
354	################################# Third reduction step
355	mulq	8*0(%r14)
356	mov	$acc2, $t1
357	add	%rax, $t0		# guaranteed to be zero
358	mov	$acc2, %rax
359	adc	%rdx, $t0
360
361	sub	$acc2, $acc4
362	sbb	\$0, $acc2		# can't borrow
363
364	mulq	8*1(%r14)
365	add	$t0, $acc3
366	adc	\$0, %rdx
367	add	%rax, $acc3
368	mov	$t1, %rax
369	adc	%rdx, $acc4
370	mov	$t1, %rdx
371	adc	\$0, $acc2		# can't overflow
372
373	shl	\$32, %rax
374	shr	\$32, %rdx
375	sub	%rax, $acc5
376	 mov	8*3($b_ptr), %rax
377	sbb	%rdx, $t1		# can't borrow
378
379	add	$acc2, $acc5
380	adc	$t1, $acc0
381	adc	\$0, $acc1
382
383	################################# * b[3]
384	mov	%rax, $t0
385	mulq	8*0($a_ptr)
386	add	%rax, $acc3
387	mov	$t0, %rax
388	adc	\$0, %rdx
389	mov	%rdx, $t1
390
391	mulq	8*1($a_ptr)
392	add	$t1, $acc4
393	adc	\$0, %rdx
394	add	%rax, $acc4
395	mov	$t0, %rax
396	adc	\$0, %rdx
397	mov	%rdx, $t1
398
399	mulq	8*2($a_ptr)
400	add	$t1, $acc5
401	adc	\$0, %rdx
402	add	%rax, $acc5
403	mov	$t0, %rax
404	adc	\$0, %rdx
405
406	 mov	$acc3, $t0
407	 imulq	%r15, $acc3
408
409	mov	%rdx, $t1
410	mulq	8*3($a_ptr)
411	add	$t1, $acc0
412	adc	\$0, %rdx
413	xor	$acc2, $acc2
414	add	%rax, $acc0
415	 mov	$acc3, %rax
416	adc	%rdx, $acc1
417	adc	\$0, $acc2
418
419	################################# Last reduction step
420	mulq	8*0(%r14)
421	mov	$acc3, $t1
422	add	%rax, $t0		# guaranteed to be zero
423	mov	$acc3, %rax
424	adc	%rdx, $t0
425
426	sub	$acc3, $acc5
427	sbb	\$0, $acc3		# can't borrow
428
429	mulq	8*1(%r14)
430	add	$t0, $acc4
431	adc	\$0, %rdx
432	add	%rax, $acc4
433	mov	$t1, %rax
434	adc	%rdx, $acc5
435	mov	$t1, %rdx
436	adc	\$0, $acc3		# can't overflow
437
438	shl	\$32, %rax
439	shr	\$32, %rdx
440	sub	%rax, $acc0
441	sbb	%rdx, $t1		# can't borrow
442
443	add	$acc3, $acc0
444	adc	$t1, $acc1
445	adc	\$0, $acc2
446
447	################################# Subtract ord
448	 mov	$acc4, $a_ptr
449	sub	8*0(%r14), $acc4
450	 mov	$acc5, $acc3
451	sbb	8*1(%r14), $acc5
452	 mov	$acc0, $t0
453	sbb	8*2(%r14), $acc0
454	 mov	$acc1, $t1
455	sbb	8*3(%r14), $acc1
456	sbb	\$0, $acc2
457
458	cmovc	$a_ptr, $acc4
459	cmovc	$acc3, $acc5
460	cmovc	$t0, $acc0
461	cmovc	$t1, $acc1
462
463	mov	$acc4, 8*0($r_ptr)
464	mov	$acc5, 8*1($r_ptr)
465	mov	$acc0, 8*2($r_ptr)
466	mov	$acc1, 8*3($r_ptr)
467
468	mov	0(%rsp),%r15
469.cfi_restore	%r15
470	mov	8(%rsp),%r14
471.cfi_restore	%r14
472	mov	16(%rsp),%r13
473.cfi_restore	%r13
474	mov	24(%rsp),%r12
475.cfi_restore	%r12
476	mov	32(%rsp),%rbx
477.cfi_restore	%rbx
478	mov	40(%rsp),%rbp
479.cfi_restore	%rbp
480	lea	48(%rsp),%rsp
481.cfi_adjust_cfa_offset	-48
482.Lord_mul_epilogue:
483	ret
484.cfi_endproc
485.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
486
487################################################################################
488# void ecp_nistz256_ord_sqr_mont(
489#   uint64_t res[4],
490#   uint64_t a[4],
491#   uint64_t rep);
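# rep is the number of consecutive Montgomery squarings to perform; each pass
# of .Loop_ord_sqr feeds its result into the next (see the dec/jnz at the end
# of the loop).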
492
493.globl	ecp_nistz256_ord_sqr_mont
494.type	ecp_nistz256_ord_sqr_mont,\@function,3
495.align	32
496ecp_nistz256_ord_sqr_mont:
497.cfi_startproc
498___
499$code.=<<___	if ($addx);
500	leaq	OPENSSL_ia32cap_P(%rip), %rcx
501	mov	8(%rcx), %rcx
502	and	\$0x80100, %ecx
503	cmp	\$0x80100, %ecx
504	je	.Lecp_nistz256_ord_sqr_montx
505___
506$code.=<<___;
507	push	%rbp
508.cfi_push	%rbp
509	push	%rbx
510.cfi_push	%rbx
511	push	%r12
512.cfi_push	%r12
513	push	%r13
514.cfi_push	%r13
515	push	%r14
516.cfi_push	%r14
517	push	%r15
518.cfi_push	%r15
519.Lord_sqr_body:
520
521	mov	8*0($a_ptr), $acc0
522	mov	8*1($a_ptr), %rax
523	mov	8*2($a_ptr), $acc6
524	mov	8*3($a_ptr), $acc7
525	lea	.Lord(%rip), $a_ptr	# pointer to modulus
526	mov	$b_org, $b_ptr
527	jmp	.Loop_ord_sqr
528
529.align	32
530.Loop_ord_sqr:
531	################################# a[1:] * a[0]
532	mov	%rax, $t1		# put aside a[1]
533	mul	$acc0			# a[1] * a[0]
534	mov	%rax, $acc1
535	movq	$t1, %xmm1		# offload a[1]
536	mov	$acc6, %rax
537	mov	%rdx, $acc2
538
539	mul	$acc0			# a[2] * a[0]
540	add	%rax, $acc2
541	mov	$acc7, %rax
542	movq	$acc6, %xmm2		# offload a[2]
543	adc	\$0, %rdx
544	mov	%rdx, $acc3
545
546	mul	$acc0			# a[3] * a[0]
547	add	%rax, $acc3
548	mov	$acc7, %rax
549	movq	$acc7, %xmm3		# offload a[3]
550	adc	\$0, %rdx
551	mov	%rdx, $acc4
552
553	################################# a[3] * a[2]
554	mul	$acc6			# a[3] * a[2]
555	mov	%rax, $acc5
556	mov	$acc6, %rax
557	mov	%rdx, $acc6
558
559	################################# a[2:] * a[1]
560	mul	$t1			# a[2] * a[1]
561	add	%rax, $acc3
562	mov	$acc7, %rax
563	adc	\$0, %rdx
564	mov	%rdx, $acc7
565
566	mul	$t1			# a[3] * a[1]
567	add	%rax, $acc4
568	adc	\$0, %rdx
569
570	add	$acc7, $acc4
571	adc	%rdx, $acc5
572	adc	\$0, $acc6		# can't overflow
573
574	################################# *2
575	xor	$acc7, $acc7
576	mov	$acc0, %rax
577	add	$acc1, $acc1
578	adc	$acc2, $acc2
579	adc	$acc3, $acc3
580	adc	$acc4, $acc4
581	adc	$acc5, $acc5
582	adc	$acc6, $acc6
583	adc	\$0, $acc7
584
585	################################# Missing products
586	mul	%rax			# a[0] * a[0]
587	mov	%rax, $acc0
588	movq	%xmm1, %rax
589	mov	%rdx, $t1
590
591	mul	%rax			# a[1] * a[1]
592	add	$t1, $acc1
593	adc	%rax, $acc2
594	movq	%xmm2, %rax
595	adc	\$0, %rdx
596	mov	%rdx, $t1
597
598	mul	%rax			# a[2] * a[2]
599	add	$t1, $acc3
600	adc	%rax, $acc4
601	movq	%xmm3, %rax
602	adc	\$0, %rdx
603	mov	%rdx, $t1
604
605	 mov	$acc0, $t0
606	 imulq	8*4($a_ptr), $acc0	# *= .LordK
607
608	mul	%rax			# a[3] * a[3]
609	add	$t1, $acc5
610	adc	%rax, $acc6
611	 mov	8*0($a_ptr), %rax	# modulus[0]
612	adc	%rdx, $acc7		# can't overflow
613
614	################################# First reduction step
615	mul	$acc0
616	mov	$acc0, $t1
617	add	%rax, $t0		# guaranteed to be zero
618	mov	8*1($a_ptr), %rax	# modulus[1]
619	adc	%rdx, $t0
620
621	sub	$acc0, $acc2
622	sbb	\$0, $t1		# can't borrow
623
624	mul	$acc0
625	add	$t0, $acc1
626	adc	\$0, %rdx
627	add	%rax, $acc1
628	mov	$acc0, %rax
629	adc	%rdx, $acc2
630	mov	$acc0, %rdx
631	adc	\$0, $t1		# can't overflow
632
633	 mov	$acc1, $t0
634	 imulq	8*4($a_ptr), $acc1	# *= .LordK
635
636	shl	\$32, %rax
637	shr	\$32, %rdx
638	sub	%rax, $acc3
639	 mov	8*0($a_ptr), %rax
640	sbb	%rdx, $acc0		# can't borrow
641
642	add	$t1, $acc3
643	adc	\$0, $acc0		# can't overflow
644
645	################################# Second reduction step
646	mul	$acc1
647	mov	$acc1, $t1
648	add	%rax, $t0		# guaranteed to be zero
649	mov	8*1($a_ptr), %rax
650	adc	%rdx, $t0
651
652	sub	$acc1, $acc3
653	sbb	\$0, $t1		# can't borrow
654
655	mul	$acc1
656	add	$t0, $acc2
657	adc	\$0, %rdx
658	add	%rax, $acc2
659	mov	$acc1, %rax
660	adc	%rdx, $acc3
661	mov	$acc1, %rdx
662	adc	\$0, $t1		# can't overflow
663
664	 mov	$acc2, $t0
665	 imulq	8*4($a_ptr), $acc2	# *= .LordK
666
667	shl	\$32, %rax
668	shr	\$32, %rdx
669	sub	%rax, $acc0
670	 mov	8*0($a_ptr), %rax
671	sbb	%rdx, $acc1		# can't borrow
672
673	add	$t1, $acc0
674	adc	\$0, $acc1		# can't overflow
675
676	################################# Third reduction step
677	mul	$acc2
678	mov	$acc2, $t1
679	add	%rax, $t0		# guaranteed to be zero
680	mov	8*1($a_ptr), %rax
681	adc	%rdx, $t0
682
683	sub	$acc2, $acc0
684	sbb	\$0, $t1		# can't borrow
685
686	mul	$acc2
687	add	$t0, $acc3
688	adc	\$0, %rdx
689	add	%rax, $acc3
690	mov	$acc2, %rax
691	adc	%rdx, $acc0
692	mov	$acc2, %rdx
693	adc	\$0, $t1		# can't overflow
694
695	 mov	$acc3, $t0
696	 imulq	8*4($a_ptr), $acc3	# *= .LordK
697
698	shl	\$32, %rax
699	shr	\$32, %rdx
700	sub	%rax, $acc1
701	 mov	8*0($a_ptr), %rax
702	sbb	%rdx, $acc2		# can't borrow
703
704	add	$t1, $acc1
705	adc	\$0, $acc2		# can't overflow
706
707	################################# Last reduction step
708	mul	$acc3
709	mov	$acc3, $t1
710	add	%rax, $t0		# guaranteed to be zero
711	mov	8*1($a_ptr), %rax
712	adc	%rdx, $t0
713
714	sub	$acc3, $acc1
715	sbb	\$0, $t1		# can't borrow
716
717	mul	$acc3
718	add	$t0, $acc0
719	adc	\$0, %rdx
720	add	%rax, $acc0
721	mov	$acc3, %rax
722	adc	%rdx, $acc1
723	mov	$acc3, %rdx
724	adc	\$0, $t1		# can't overflow
725
726	shl	\$32, %rax
727	shr	\$32, %rdx
728	sub	%rax, $acc2
729	sbb	%rdx, $acc3		# can't borrow
730
731	add	$t1, $acc2
732	adc	\$0, $acc3		# can't overflow
733
734	################################# Add bits [511:256] of the sqr result
735	xor	%rdx, %rdx
736	add	$acc4, $acc0
737	adc	$acc5, $acc1
738	 mov	$acc0, $acc4
739	adc	$acc6, $acc2
740	adc	$acc7, $acc3
741	 mov	$acc1, %rax
742	adc	\$0, %rdx
743
744	################################# Compare to modulus
745	sub	8*0($a_ptr), $acc0
746	 mov	$acc2, $acc6
747	sbb	8*1($a_ptr), $acc1
748	sbb	8*2($a_ptr), $acc2
749	 mov	$acc3, $acc7
750	sbb	8*3($a_ptr), $acc3
751	sbb	\$0, %rdx
752
753	cmovc	$acc4, $acc0
754	cmovnc	$acc1, %rax
755	cmovnc	$acc2, $acc6
756	cmovnc	$acc3, $acc7
757
758	dec	$b_ptr
759	jnz	.Loop_ord_sqr
760
761	mov	$acc0, 8*0($r_ptr)
762	mov	%rax,  8*1($r_ptr)
763	pxor	%xmm1, %xmm1
764	mov	$acc6, 8*2($r_ptr)
765	pxor	%xmm2, %xmm2
766	mov	$acc7, 8*3($r_ptr)
767	pxor	%xmm3, %xmm3
768
769	mov	0(%rsp),%r15
770.cfi_restore	%r15
771	mov	8(%rsp),%r14
772.cfi_restore	%r14
773	mov	16(%rsp),%r13
774.cfi_restore	%r13
775	mov	24(%rsp),%r12
776.cfi_restore	%r12
777	mov	32(%rsp),%rbx
778.cfi_restore	%rbx
779	mov	40(%rsp),%rbp
780.cfi_restore	%rbp
781	lea	48(%rsp),%rsp
782.cfi_adjust_cfa_offset	-48
783.Lord_sqr_epilogue:
784	ret
785.cfi_endproc
786.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
787___
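
# What ecp_nistz256_ord_mul_mont and ecp_nistz256_ord_sqr_mont above (and
# their BMI2/ADX twins below) compute: the Montgomery product
# res = a*b*2^-256 mod n, with ord_sqr_mont repeating the squaring "rep"
# times.  Each of the four reduction steps folds the lowest limb away by
# adding (acc[0]*.LordK mod 2^64)*n, which is why those additions are
# "guaranteed to be zero" in the low limb.  A limb-by-limb reference model
# (an illustrative sketch; _ord_mul_mont_ref is a hypothetical helper and is
# never called by this generator):
sub _ord_mul_mont_ref {
	require Math::BigInt;
	my ($x, $y) = @_;					# values in [0, n)
	my $n = Math::BigInt->from_hex("0xffffffff00000000ffffffffffffffff" .
				       "bce6faada7179e84f3b9cac2fc632551");
	my $k = Math::BigInt->from_hex("0xccd1c8aaee00bc4f");	# .LordK
	my $two64 = Math::BigInt->bone()->blsft(64);		# 2^64
	my $acc = $x->copy()->bmul($y);				# 512-bit product
	for (1 .. 4) {						# one fold per limb
		my $m = $acc->copy()->bmod($two64)->bmul($k)->bmod($two64);
		$acc->badd($m->bmul($n));			# low limb -> zero
		$acc->brsft(64);				# exact /2^64
	}
	$acc->bsub($n) if $acc->bcmp($n) >= 0;			# final subtraction
	return $acc;						# a*b*2^-256 mod n
}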
788
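# The ord_mul_montx/ord_sqr_montx variants below (and the other *_montx paths
# in this file) are taken only when the 0x80100 test on OPENSSL_ia32cap_P
# passes, i.e. when both BMI2 (mulx) and ADX (adcx/adox) are present.  mulx
# leaves the flags untouched, and adcx/adox update only CF and only OF
# respectively, so two independent carry chains can be interleaved; that is
# why the comments track "cf=0, of=0" at the chain boundaries.
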
789$code.=<<___	if ($addx);
790################################################################################
791.type	ecp_nistz256_ord_mul_montx,\@function,3
792.align	32
793ecp_nistz256_ord_mul_montx:
794.cfi_startproc
795.Lecp_nistz256_ord_mul_montx:
796	push	%rbp
797.cfi_push	%rbp
798	push	%rbx
799.cfi_push	%rbx
800	push	%r12
801.cfi_push	%r12
802	push	%r13
803.cfi_push	%r13
804	push	%r14
805.cfi_push	%r14
806	push	%r15
807.cfi_push	%r15
808.Lord_mulx_body:
809
810	mov	$b_org, $b_ptr
811	mov	8*0($b_org), %rdx
812	mov	8*0($a_ptr), $acc1
813	mov	8*1($a_ptr), $acc2
814	mov	8*2($a_ptr), $acc3
815	mov	8*3($a_ptr), $acc4
816	lea	-128($a_ptr), $a_ptr	# control u-op density
817	lea	.Lord-128(%rip), %r14
818	mov	.LordK(%rip), %r15
819
820	################################# Multiply by b[0]
821	mulx	$acc1, $acc0, $acc1
822	mulx	$acc2, $t0, $acc2
823	mulx	$acc3, $t1, $acc3
824	add	$t0, $acc1
825	mulx	$acc4, $t0, $acc4
826	 mov	$acc0, %rdx
827	 mulx	%r15, %rdx, %rax
828	adc	$t1, $acc2
829	adc	$t0, $acc3
830	adc	\$0, $acc4
831
832	################################# reduction
833	xor	$acc5, $acc5		# $acc5=0, cf=0, of=0
834	mulx	8*0+128(%r14), $t0, $t1
835	adcx	$t0, $acc0		# guaranteed to be zero
836	adox	$t1, $acc1
837
838	mulx	8*1+128(%r14), $t0, $t1
839	adcx	$t0, $acc1
840	adox	$t1, $acc2
841
842	mulx	8*2+128(%r14), $t0, $t1
843	adcx	$t0, $acc2
844	adox	$t1, $acc3
845
846	mulx	8*3+128(%r14), $t0, $t1
847	 mov	8*1($b_ptr), %rdx
848	adcx	$t0, $acc3
849	adox	$t1, $acc4
850	adcx	$acc0, $acc4
851	adox	$acc0, $acc5
852	adc	\$0, $acc5		# cf=0, of=0
853
854	################################# Multiply by b[1]
855	mulx	8*0+128($a_ptr), $t0, $t1
856	adcx	$t0, $acc1
857	adox	$t1, $acc2
858
859	mulx	8*1+128($a_ptr), $t0, $t1
860	adcx	$t0, $acc2
861	adox	$t1, $acc3
862
863	mulx	8*2+128($a_ptr), $t0, $t1
864	adcx	$t0, $acc3
865	adox	$t1, $acc4
866
867	mulx	8*3+128($a_ptr), $t0, $t1
868	 mov	$acc1, %rdx
869	 mulx	%r15, %rdx, %rax
870	adcx	$t0, $acc4
871	adox	$t1, $acc5
872
873	adcx	$acc0, $acc5
874	adox	$acc0, $acc0
875	adc	\$0, $acc0		# cf=0, of=0
876
877	################################# reduction
878	mulx	8*0+128(%r14), $t0, $t1
879	adcx	$t0, $acc1		# guaranteed to be zero
880	adox	$t1, $acc2
881
882	mulx	8*1+128(%r14), $t0, $t1
883	adcx	$t0, $acc2
884	adox	$t1, $acc3
885
886	mulx	8*2+128(%r14), $t0, $t1
887	adcx	$t0, $acc3
888	adox	$t1, $acc4
889
890	mulx	8*3+128(%r14), $t0, $t1
891	 mov	8*2($b_ptr), %rdx
892	adcx	$t0, $acc4
893	adox	$t1, $acc5
894	adcx	$acc1, $acc5
895	adox	$acc1, $acc0
896	adc	\$0, $acc0		# cf=0, of=0
897
898	################################# Multiply by b[2]
899	mulx	8*0+128($a_ptr), $t0, $t1
900	adcx	$t0, $acc2
901	adox	$t1, $acc3
902
903	mulx	8*1+128($a_ptr), $t0, $t1
904	adcx	$t0, $acc3
905	adox	$t1, $acc4
906
907	mulx	8*2+128($a_ptr), $t0, $t1
908	adcx	$t0, $acc4
909	adox	$t1, $acc5
910
911	mulx	8*3+128($a_ptr), $t0, $t1
912	 mov	$acc2, %rdx
913	 mulx	%r15, %rdx, %rax
914	adcx	$t0, $acc5
915	adox	$t1, $acc0
916
917	adcx	$acc1, $acc0
918	adox	$acc1, $acc1
919	adc	\$0, $acc1		# cf=0, of=0
920
921	################################# reduction
922	mulx	8*0+128(%r14), $t0, $t1
923	adcx	$t0, $acc2		# guaranteed to be zero
924	adox	$t1, $acc3
925
926	mulx	8*1+128(%r14), $t0, $t1
927	adcx	$t0, $acc3
928	adox	$t1, $acc4
929
930	mulx	8*2+128(%r14), $t0, $t1
931	adcx	$t0, $acc4
932	adox	$t1, $acc5
933
934	mulx	8*3+128(%r14), $t0, $t1
935	 mov	8*3($b_ptr), %rdx
936	adcx	$t0, $acc5
937	adox	$t1, $acc0
938	adcx	$acc2, $acc0
939	adox	$acc2, $acc1
940	adc	\$0, $acc1		# cf=0, of=0
941
942	################################# Multiply by b[3]
943	mulx	8*0+128($a_ptr), $t0, $t1
944	adcx	$t0, $acc3
945	adox	$t1, $acc4
946
947	mulx	8*1+128($a_ptr), $t0, $t1
948	adcx	$t0, $acc4
949	adox	$t1, $acc5
950
951	mulx	8*2+128($a_ptr), $t0, $t1
952	adcx	$t0, $acc5
953	adox	$t1, $acc0
954
955	mulx	8*3+128($a_ptr), $t0, $t1
956	 mov	$acc3, %rdx
957	 mulx	%r15, %rdx, %rax
958	adcx	$t0, $acc0
959	adox	$t1, $acc1
960
961	adcx	$acc2, $acc1
962	adox	$acc2, $acc2
963	adc	\$0, $acc2		# cf=0, of=0
964
965	################################# reduction
966	mulx	8*0+128(%r14), $t0, $t1
967	adcx	$t0, $acc3		# guaranteed to be zero
968	adox	$t1, $acc4
969
970	mulx	8*1+128(%r14), $t0, $t1
971	adcx	$t0, $acc4
972	adox	$t1, $acc5
973
974	mulx	8*2+128(%r14), $t0, $t1
975	adcx	$t0, $acc5
976	adox	$t1, $acc0
977
978	mulx	8*3+128(%r14), $t0, $t1
979	lea	128(%r14),%r14
980	 mov	$acc4, $t2
981	adcx	$t0, $acc0
982	adox	$t1, $acc1
983	 mov	$acc5, $t3
984	adcx	$acc3, $acc1
985	adox	$acc3, $acc2
986	adc	\$0, $acc2
987
988	#################################
989	# Branch-less conditional subtraction of P
990	 mov	$acc0, $t0
991	sub	8*0(%r14), $acc4
992	sbb	8*1(%r14), $acc5
993	sbb	8*2(%r14), $acc0
994	 mov	$acc1, $t1
995	sbb	8*3(%r14), $acc1
996	sbb	\$0, $acc2
997
998	cmovc	$t2, $acc4
999	cmovc	$t3, $acc5
1000	cmovc	$t0, $acc0
1001	cmovc	$t1, $acc1
1002
1003	mov	$acc4, 8*0($r_ptr)
1004	mov	$acc5, 8*1($r_ptr)
1005	mov	$acc0, 8*2($r_ptr)
1006	mov	$acc1, 8*3($r_ptr)
1007
1008	mov	0(%rsp),%r15
1009.cfi_restore	%r15
1010	mov	8(%rsp),%r14
1011.cfi_restore	%r14
1012	mov	16(%rsp),%r13
1013.cfi_restore	%r13
1014	mov	24(%rsp),%r12
1015.cfi_restore	%r12
1016	mov	32(%rsp),%rbx
1017.cfi_restore	%rbx
1018	mov	40(%rsp),%rbp
1019.cfi_restore	%rbp
1020	lea	48(%rsp),%rsp
1021.cfi_adjust_cfa_offset	-48
1022.Lord_mulx_epilogue:
1023	ret
1024.cfi_endproc
1025.size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
1026
1027.type	ecp_nistz256_ord_sqr_montx,\@function,3
1028.align	32
1029ecp_nistz256_ord_sqr_montx:
1030.cfi_startproc
1031.Lecp_nistz256_ord_sqr_montx:
1032	push	%rbp
1033.cfi_push	%rbp
1034	push	%rbx
1035.cfi_push	%rbx
1036	push	%r12
1037.cfi_push	%r12
1038	push	%r13
1039.cfi_push	%r13
1040	push	%r14
1041.cfi_push	%r14
1042	push	%r15
1043.cfi_push	%r15
1044.Lord_sqrx_body:
1045
1046	mov	$b_org, $b_ptr
1047	mov	8*0($a_ptr), %rdx
1048	mov	8*1($a_ptr), $acc6
1049	mov	8*2($a_ptr), $acc7
1050	mov	8*3($a_ptr), $acc0
1051	lea	.Lord(%rip), $a_ptr
1052	jmp	.Loop_ord_sqrx
1053
1054.align	32
1055.Loop_ord_sqrx:
1056	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
1057	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
1058	 mov	%rdx, %rax		# offload a[0]
1059	 movq	$acc6, %xmm1		# offload a[1]
1060	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
1061	 mov	$acc6, %rdx
1062	add	$t0, $acc2
1063	 movq	$acc7, %xmm2		# offload a[2]
1064	adc	$t1, $acc3
1065	adc	\$0, $acc4
1066	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
1067	#################################
1068	mulx	$acc7, $t0, $t1		# a[1]*a[2]
1069	adcx	$t0, $acc3
1070	adox	$t1, $acc4
1071
1072	mulx	$acc0, $t0, $t1		# a[1]*a[3]
1073	 mov	$acc7, %rdx
1074	adcx	$t0, $acc4
1075	adox	$t1, $acc5
1076	adc	\$0, $acc5
1077	#################################
1078	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
1079	mov	%rax, %rdx
1080	 movq	$acc0, %xmm3		# offload a[3]
1081	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
1082	 adcx	$acc1, $acc1		# acc1:6<<1
1083	adox	$t0, $acc5
1084	 adcx	$acc2, $acc2
1085	adox	$acc7, $acc6		# of=0
1086
1087	################################# a[i]*a[i]
1088	mulx	%rdx, $acc0, $t1
1089	movq	%xmm1, %rdx
1090	 adcx	$acc3, $acc3
1091	adox	$t1, $acc1
1092	 adcx	$acc4, $acc4
1093	mulx	%rdx, $t0, $t4
1094	movq	%xmm2, %rdx
1095	 adcx	$acc5, $acc5
1096	adox	$t0, $acc2
1097	 adcx	$acc6, $acc6
1098	mulx	%rdx, $t0, $t1
1099	.byte	0x67
1100	movq	%xmm3, %rdx
1101	adox	$t4, $acc3
1102	 adcx	$acc7, $acc7
1103	adox	$t0, $acc4
1104	adox	$t1, $acc5
1105	mulx	%rdx, $t0, $t4
1106	adox	$t0, $acc6
1107	adox	$t4, $acc7
1108
1109	################################# reduction
1110	mov	$acc0, %rdx
1111	mulx	8*4($a_ptr), %rdx, $t0
1112
1113	xor	%rax, %rax		# cf=0, of=0
1114	mulx	8*0($a_ptr), $t0, $t1
1115	adcx	$t0, $acc0		# guaranteed to be zero
1116	adox	$t1, $acc1
1117	mulx	8*1($a_ptr), $t0, $t1
1118	adcx	$t0, $acc1
1119	adox	$t1, $acc2
1120	mulx	8*2($a_ptr), $t0, $t1
1121	adcx	$t0, $acc2
1122	adox	$t1, $acc3
1123	mulx	8*3($a_ptr), $t0, $t1
1124	adcx	$t0, $acc3
1125	adox	$t1, $acc0		# of=0
1126	adcx	%rax, $acc0		# cf=0
1127
1128	#################################
1129	mov	$acc1, %rdx
1130	mulx	8*4($a_ptr), %rdx, $t0
1131
1132	mulx	8*0($a_ptr), $t0, $t1
1133	adox	$t0, $acc1		# guaranteed to be zero
1134	adcx	$t1, $acc2
1135	mulx	8*1($a_ptr), $t0, $t1
1136	adox	$t0, $acc2
1137	adcx	$t1, $acc3
1138	mulx	8*2($a_ptr), $t0, $t1
1139	adox	$t0, $acc3
1140	adcx	$t1, $acc0
1141	mulx	8*3($a_ptr), $t0, $t1
1142	adox	$t0, $acc0
1143	adcx	$t1, $acc1		# cf=0
1144	adox	%rax, $acc1		# of=0
1145
1146	#################################
1147	mov	$acc2, %rdx
1148	mulx	8*4($a_ptr), %rdx, $t0
1149
1150	mulx	8*0($a_ptr), $t0, $t1
1151	adcx	$t0, $acc2		# guaranteed to be zero
1152	adox	$t1, $acc3
1153	mulx	8*1($a_ptr), $t0, $t1
1154	adcx	$t0, $acc3
1155	adox	$t1, $acc0
1156	mulx	8*2($a_ptr), $t0, $t1
1157	adcx	$t0, $acc0
1158	adox	$t1, $acc1
1159	mulx	8*3($a_ptr), $t0, $t1
1160	adcx	$t0, $acc1
1161	adox	$t1, $acc2		# of=0
1162	adcx	%rax, $acc2		# cf=0
1163
1164	#################################
1165	mov	$acc3, %rdx
1166	mulx	8*4($a_ptr), %rdx, $t0
1167
1168	mulx	8*0($a_ptr), $t0, $t1
1169	adox	$t0, $acc3		# guaranteed to be zero
1170	adcx	$t1, $acc0
1171	mulx	8*1($a_ptr), $t0, $t1
1172	adox	$t0, $acc0
1173	adcx	$t1, $acc1
1174	mulx	8*2($a_ptr), $t0, $t1
1175	adox	$t0, $acc1
1176	adcx	$t1, $acc2
1177	mulx	8*3($a_ptr), $t0, $t1
1178	adox	$t0, $acc2
1179	adcx	$t1, $acc3
1180	adox	%rax, $acc3
1181
1182	################################# accumulate upper half
1183	add	$acc0, $acc4		# add	$acc4, $acc0
1184	adc	$acc5, $acc1
1185	 mov	$acc4, %rdx
1186	adc	$acc6, $acc2
1187	adc	$acc7, $acc3
1188	 mov	$acc1, $acc6
1189	adc	\$0, %rax
1190
1191	################################# compare to modulus
1192	sub	8*0($a_ptr), $acc4
1193	 mov	$acc2, $acc7
1194	sbb	8*1($a_ptr), $acc1
1195	sbb	8*2($a_ptr), $acc2
1196	 mov	$acc3, $acc0
1197	sbb	8*3($a_ptr), $acc3
1198	sbb	\$0, %rax
1199
1200	cmovnc	$acc4, %rdx
1201	cmovnc	$acc1, $acc6
1202	cmovnc	$acc2, $acc7
1203	cmovnc	$acc3, $acc0
1204
1205	dec	$b_ptr
1206	jnz	.Loop_ord_sqrx
1207
1208	mov	%rdx, 8*0($r_ptr)
1209	mov	$acc6, 8*1($r_ptr)
1210	pxor	%xmm1, %xmm1
1211	mov	$acc7, 8*2($r_ptr)
1212	pxor	%xmm2, %xmm2
1213	mov	$acc0, 8*3($r_ptr)
1214	pxor	%xmm3, %xmm3
1215
1216	mov	0(%rsp),%r15
1217.cfi_restore	%r15
1218	mov	8(%rsp),%r14
1219.cfi_restore	%r14
1220	mov	16(%rsp),%r13
1221.cfi_restore	%r13
1222	mov	24(%rsp),%r12
1223.cfi_restore	%r12
1224	mov	32(%rsp),%rbx
1225.cfi_restore	%rbx
1226	mov	40(%rsp),%rbp
1227.cfi_restore	%rbp
1228	lea	48(%rsp),%rsp
1229.cfi_adjust_cfa_offset	-48
1230.Lord_sqrx_epilogue:
1231	ret
1232.cfi_endproc
1233.size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
1234___
1235
1236$code.=<<___;
1237################################################################################
1238# void ecp_nistz256_mul_mont(
1239#   uint64_t res[4],
1240#   uint64_t a[4],
1241#   uint64_t b[4]);
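# Computes the Montgomery product res = a*b*2^-256 mod p (inputs and result
# in the Montgomery domain), with the same structure as ord_mul_mont but the
# reduction specialized to the sparse form of .Lpoly; when BMI2 and ADX are
# available (the 0x80100 capability test) it dispatches to .Lmul_montx.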
1242
1243.globl	ecp_nistz256_mul_mont
1244.type	ecp_nistz256_mul_mont,\@function,3
1245.align	32
1246ecp_nistz256_mul_mont:
1247.cfi_startproc
1248___
1249$code.=<<___	if ($addx);
1250	leaq	OPENSSL_ia32cap_P(%rip), %rcx
1251	mov	8(%rcx), %rcx
1252	and	\$0x80100, %ecx
1253___
1254$code.=<<___;
1255.Lmul_mont:
1256	push	%rbp
1257.cfi_push	%rbp
1258	push	%rbx
1259.cfi_push	%rbx
1260	push	%r12
1261.cfi_push	%r12
1262	push	%r13
1263.cfi_push	%r13
1264	push	%r14
1265.cfi_push	%r14
1266	push	%r15
1267.cfi_push	%r15
1268.Lmul_body:
1269___
1270$code.=<<___	if ($addx);
1271	cmp	\$0x80100, %ecx
1272	je	.Lmul_montx
1273___
1274$code.=<<___;
1275	mov	$b_org, $b_ptr
1276	mov	8*0($b_org), %rax
1277	mov	8*0($a_ptr), $acc1
1278	mov	8*1($a_ptr), $acc2
1279	mov	8*2($a_ptr), $acc3
1280	mov	8*3($a_ptr), $acc4
1281
1282	call	__ecp_nistz256_mul_montq
1283___
1284$code.=<<___	if ($addx);
1285	jmp	.Lmul_mont_done
1286
1287.align	32
1288.Lmul_montx:
1289	mov	$b_org, $b_ptr
1290	mov	8*0($b_org), %rdx
1291	mov	8*0($a_ptr), $acc1
1292	mov	8*1($a_ptr), $acc2
1293	mov	8*2($a_ptr), $acc3
1294	mov	8*3($a_ptr), $acc4
1295	lea	-128($a_ptr), $a_ptr	# control u-op density
1296
1297	call	__ecp_nistz256_mul_montx
1298___
1299$code.=<<___;
1300.Lmul_mont_done:
1301	mov	0(%rsp),%r15
1302.cfi_restore	%r15
1303	mov	8(%rsp),%r14
1304.cfi_restore	%r14
1305	mov	16(%rsp),%r13
1306.cfi_restore	%r13
1307	mov	24(%rsp),%r12
1308.cfi_restore	%r12
1309	mov	32(%rsp),%rbx
1310.cfi_restore	%rbx
1311	mov	40(%rsp),%rbp
1312.cfi_restore	%rbp
1313	lea	48(%rsp),%rsp
1314.cfi_adjust_cfa_offset	-48
1315.Lmul_epilogue:
1316	ret
1317.cfi_endproc
1318.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
1319
1320.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
1321.align	32
1322__ecp_nistz256_mul_montq:
1323.cfi_startproc
1324	########################################################################
1325	# Multiply a by b[0]
1326	mov	%rax, $t1
1327	mulq	$acc1
1328	mov	.Lpoly+8*1(%rip),$poly1
1329	mov	%rax, $acc0
1330	mov	$t1, %rax
1331	mov	%rdx, $acc1
1332
1333	mulq	$acc2
1334	mov	.Lpoly+8*3(%rip),$poly3
1335	add	%rax, $acc1
1336	mov	$t1, %rax
1337	adc	\$0, %rdx
1338	mov	%rdx, $acc2
1339
1340	mulq	$acc3
1341	add	%rax, $acc2
1342	mov	$t1, %rax
1343	adc	\$0, %rdx
1344	mov	%rdx, $acc3
1345
1346	mulq	$acc4
1347	add	%rax, $acc3
1348	 mov	$acc0, %rax
1349	adc	\$0, %rdx
1350	xor	$acc5, $acc5
1351	mov	%rdx, $acc4
1352
1353	########################################################################
1354	# First reduction step
1355	# Basically now we want to multiply acc[0] by p256,
1356	# and add the result to the acc.
1357	# Due to the special form of p256 we do some optimizations
1358	#
1359	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
1360	# then we add acc[0] and get acc[0] x 2^96
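	# Spelled out: the two low limbs of .Lpoly form the value 2^96 - 1, so
	# acc[0]*(.Lpoly mod 2^128) + acc[0] = acc[0]<<96; that is why the code
	# below only needs the shl/shr pair for the low half plus one mulq by
	# .Lpoly[3] = 2^64 - 2^32 + 1 for the top limb (.Lpoly[2] is zero).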
1361
1362	mov	$acc0, $t1
1363	shl	\$32, $acc0
1364	mulq	$poly3
1365	shr	\$32, $t1
1366	add	$acc0, $acc1		# +=acc[0]<<96
1367	adc	$t1, $acc2
1368	adc	%rax, $acc3
1369	 mov	8*1($b_ptr), %rax
1370	adc	%rdx, $acc4
1371	adc	\$0, $acc5
1372	xor	$acc0, $acc0
1373
1374	########################################################################
1375	# Multiply by b[1]
1376	mov	%rax, $t1
1377	mulq	8*0($a_ptr)
1378	add	%rax, $acc1
1379	mov	$t1, %rax
1380	adc	\$0, %rdx
1381	mov	%rdx, $t0
1382
1383	mulq	8*1($a_ptr)
1384	add	$t0, $acc2
1385	adc	\$0, %rdx
1386	add	%rax, $acc2
1387	mov	$t1, %rax
1388	adc	\$0, %rdx
1389	mov	%rdx, $t0
1390
1391	mulq	8*2($a_ptr)
1392	add	$t0, $acc3
1393	adc	\$0, %rdx
1394	add	%rax, $acc3
1395	mov	$t1, %rax
1396	adc	\$0, %rdx
1397	mov	%rdx, $t0
1398
1399	mulq	8*3($a_ptr)
1400	add	$t0, $acc4
1401	adc	\$0, %rdx
1402	add	%rax, $acc4
1403	 mov	$acc1, %rax
1404	adc	%rdx, $acc5
1405	adc	\$0, $acc0
1406
1407	########################################################################
1408	# Second reduction step
1409	mov	$acc1, $t1
1410	shl	\$32, $acc1
1411	mulq	$poly3
1412	shr	\$32, $t1
1413	add	$acc1, $acc2
1414	adc	$t1, $acc3
1415	adc	%rax, $acc4
1416	 mov	8*2($b_ptr), %rax
1417	adc	%rdx, $acc5
1418	adc	\$0, $acc0
1419	xor	$acc1, $acc1
1420
1421	########################################################################
1422	# Multiply by b[2]
1423	mov	%rax, $t1
1424	mulq	8*0($a_ptr)
1425	add	%rax, $acc2
1426	mov	$t1, %rax
1427	adc	\$0, %rdx
1428	mov	%rdx, $t0
1429
1430	mulq	8*1($a_ptr)
1431	add	$t0, $acc3
1432	adc	\$0, %rdx
1433	add	%rax, $acc3
1434	mov	$t1, %rax
1435	adc	\$0, %rdx
1436	mov	%rdx, $t0
1437
1438	mulq	8*2($a_ptr)
1439	add	$t0, $acc4
1440	adc	\$0, %rdx
1441	add	%rax, $acc4
1442	mov	$t1, %rax
1443	adc	\$0, %rdx
1444	mov	%rdx, $t0
1445
1446	mulq	8*3($a_ptr)
1447	add	$t0, $acc5
1448	adc	\$0, %rdx
1449	add	%rax, $acc5
1450	 mov	$acc2, %rax
1451	adc	%rdx, $acc0
1452	adc	\$0, $acc1
1453
1454	########################################################################
1455	# Third reduction step
1456	mov	$acc2, $t1
1457	shl	\$32, $acc2
1458	mulq	$poly3
1459	shr	\$32, $t1
1460	add	$acc2, $acc3
1461	adc	$t1, $acc4
1462	adc	%rax, $acc5
1463	 mov	8*3($b_ptr), %rax
1464	adc	%rdx, $acc0
1465	adc	\$0, $acc1
1466	xor	$acc2, $acc2
1467
1468	########################################################################
1469	# Multiply by b[3]
1470	mov	%rax, $t1
1471	mulq	8*0($a_ptr)
1472	add	%rax, $acc3
1473	mov	$t1, %rax
1474	adc	\$0, %rdx
1475	mov	%rdx, $t0
1476
1477	mulq	8*1($a_ptr)
1478	add	$t0, $acc4
1479	adc	\$0, %rdx
1480	add	%rax, $acc4
1481	mov	$t1, %rax
1482	adc	\$0, %rdx
1483	mov	%rdx, $t0
1484
1485	mulq	8*2($a_ptr)
1486	add	$t0, $acc5
1487	adc	\$0, %rdx
1488	add	%rax, $acc5
1489	mov	$t1, %rax
1490	adc	\$0, %rdx
1491	mov	%rdx, $t0
1492
1493	mulq	8*3($a_ptr)
1494	add	$t0, $acc0
1495	adc	\$0, %rdx
1496	add	%rax, $acc0
1497	 mov	$acc3, %rax
1498	adc	%rdx, $acc1
1499	adc	\$0, $acc2
1500
1501	########################################################################
1502	# Final reduction step
1503	mov	$acc3, $t1
1504	shl	\$32, $acc3
1505	mulq	$poly3
1506	shr	\$32, $t1
1507	add	$acc3, $acc4
1508	adc	$t1, $acc5
1509	 mov	$acc4, $t0
1510	adc	%rax, $acc0
1511	adc	%rdx, $acc1
1512	 mov	$acc5, $t1
1513	adc	\$0, $acc2
1514
1515	########################################################################
1516	# Branch-less conditional subtraction of P
1517	sub	\$-1, $acc4		# .Lpoly[0]
1518	 mov	$acc0, $t2
1519	sbb	$poly1, $acc5		# .Lpoly[1]
1520	sbb	\$0, $acc0		# .Lpoly[2]
1521	 mov	$acc1, $t3
1522	sbb	$poly3, $acc1		# .Lpoly[3]
1523	sbb	\$0, $acc2
1524
1525	cmovc	$t0, $acc4
1526	cmovc	$t1, $acc5
1527	mov	$acc4, 8*0($r_ptr)
1528	cmovc	$t2, $acc0
1529	mov	$acc5, 8*1($r_ptr)
1530	cmovc	$t3, $acc1
1531	mov	$acc0, 8*2($r_ptr)
1532	mov	$acc1, 8*3($r_ptr)
1533
1534	ret
1535.cfi_endproc
1536.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
1537
1538################################################################################
1539# void ecp_nistz256_sqr_mont(
1540#   uint64_t res[4],
1541#   uint64_t a[4]);
1542
1543# we optimize the square according to S.Gueron and V.Krasnov,
1544# "Speeding up Big-Number Squaring"
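# The trick: for a = a[3..0] in 64-bit limbs,
#   a^2 = sum_i a[i]^2 * 2^(128*i) + 2 * sum_{i<j} a[i]*a[j] * 2^(64*(i+j)),
# so only the six cross products a[i]*a[j], i<j, are computed, doubled with a
# single add/adc chain, and the four squares a[i]^2 are then added on the
# diagonal; the reduction afterwards is the same as in mul_mont.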
1545.globl	ecp_nistz256_sqr_mont
1546.type	ecp_nistz256_sqr_mont,\@function,2
1547.align	32
1548ecp_nistz256_sqr_mont:
1549.cfi_startproc
1550___
1551$code.=<<___	if ($addx);
1552	leaq	OPENSSL_ia32cap_P(%rip), %rcx
1553	mov	8(%rcx), %rcx
1554	and	\$0x80100, %ecx
1555___
1556$code.=<<___;
1557	push	%rbp
1558.cfi_push	%rbp
1559	push	%rbx
1560.cfi_push	%rbx
1561	push	%r12
1562.cfi_push	%r12
1563	push	%r13
1564.cfi_push	%r13
1565	push	%r14
1566.cfi_push	%r14
1567	push	%r15
1568.cfi_push	%r15
1569.Lsqr_body:
1570___
1571$code.=<<___	if ($addx);
1572	cmp	\$0x80100, %ecx
1573	je	.Lsqr_montx
1574___
1575$code.=<<___;
1576	mov	8*0($a_ptr), %rax
1577	mov	8*1($a_ptr), $acc6
1578	mov	8*2($a_ptr), $acc7
1579	mov	8*3($a_ptr), $acc0
1580
1581	call	__ecp_nistz256_sqr_montq
1582___
1583$code.=<<___	if ($addx);
1584	jmp	.Lsqr_mont_done
1585
1586.align	32
1587.Lsqr_montx:
1588	mov	8*0($a_ptr), %rdx
1589	mov	8*1($a_ptr), $acc6
1590	mov	8*2($a_ptr), $acc7
1591	mov	8*3($a_ptr), $acc0
1592	lea	-128($a_ptr), $a_ptr	# control u-op density
1593
1594	call	__ecp_nistz256_sqr_montx
1595___
1596$code.=<<___;
1597.Lsqr_mont_done:
1598	mov	0(%rsp),%r15
1599.cfi_restore	%r15
1600	mov	8(%rsp),%r14
1601.cfi_restore	%r14
1602	mov	16(%rsp),%r13
1603.cfi_restore	%r13
1604	mov	24(%rsp),%r12
1605.cfi_restore	%r12
1606	mov	32(%rsp),%rbx
1607.cfi_restore	%rbx
1608	mov	40(%rsp),%rbp
1609.cfi_restore	%rbp
1610	lea	48(%rsp),%rsp
1611.cfi_adjust_cfa_offset	-48
1612.Lsqr_epilogue:
1613	ret
1614.cfi_endproc
1615.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
1616
1617.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
1618.align	32
1619__ecp_nistz256_sqr_montq:
1620.cfi_startproc
1621	mov	%rax, $acc5
1622	mulq	$acc6			# a[1]*a[0]
1623	mov	%rax, $acc1
1624	mov	$acc7, %rax
1625	mov	%rdx, $acc2
1626
1627	mulq	$acc5			# a[0]*a[2]
1628	add	%rax, $acc2
1629	mov	$acc0, %rax
1630	adc	\$0, %rdx
1631	mov	%rdx, $acc3
1632
1633	mulq	$acc5			# a[0]*a[3]
1634	add	%rax, $acc3
1635	 mov	$acc7, %rax
1636	adc	\$0, %rdx
1637	mov	%rdx, $acc4
1638
1639	#################################
1640	mulq	$acc6			# a[1]*a[2]
1641	add	%rax, $acc3
1642	mov	$acc0, %rax
1643	adc	\$0, %rdx
1644	mov	%rdx, $t1
1645
1646	mulq	$acc6			# a[1]*a[3]
1647	add	%rax, $acc4
1648	 mov	$acc0, %rax
1649	adc	\$0, %rdx
1650	add	$t1, $acc4
1651	mov	%rdx, $acc5
1652	adc	\$0, $acc5
1653
1654	#################################
1655	mulq	$acc7			# a[2]*a[3]
1656	xor	$acc7, $acc7
1657	add	%rax, $acc5
1658	 mov	8*0($a_ptr), %rax
1659	mov	%rdx, $acc6
1660	adc	\$0, $acc6
1661
1662	add	$acc1, $acc1		# acc1:6<<1
1663	adc	$acc2, $acc2
1664	adc	$acc3, $acc3
1665	adc	$acc4, $acc4
1666	adc	$acc5, $acc5
1667	adc	$acc6, $acc6
1668	adc	\$0, $acc7
1669
1670	mulq	%rax
1671	mov	%rax, $acc0
1672	mov	8*1($a_ptr), %rax
1673	mov	%rdx, $t0
1674
1675	mulq	%rax
1676	add	$t0, $acc1
1677	adc	%rax, $acc2
1678	mov	8*2($a_ptr), %rax
1679	adc	\$0, %rdx
1680	mov	%rdx, $t0
1681
1682	mulq	%rax
1683	add	$t0, $acc3
1684	adc	%rax, $acc4
1685	mov	8*3($a_ptr), %rax
1686	adc	\$0, %rdx
1687	mov	%rdx, $t0
1688
1689	mulq	%rax
1690	add	$t0, $acc5
1691	adc	%rax, $acc6
1692	 mov	$acc0, %rax
1693	adc	%rdx, $acc7
1694
1695	mov	.Lpoly+8*1(%rip), $a_ptr
1696	mov	.Lpoly+8*3(%rip), $t1
1697
1698	##########################################
1699	# Now the reduction
1700	# First iteration
1701	mov	$acc0, $t0
1702	shl	\$32, $acc0
1703	mulq	$t1
1704	shr	\$32, $t0
1705	add	$acc0, $acc1		# +=acc[0]<<96
1706	adc	$t0, $acc2
1707	adc	%rax, $acc3
1708	 mov	$acc1, %rax
1709	adc	\$0, %rdx
1710
1711	##########################################
1712	# Second iteration
1713	mov	$acc1, $t0
1714	shl	\$32, $acc1
1715	mov	%rdx, $acc0
1716	mulq	$t1
1717	shr	\$32, $t0
1718	add	$acc1, $acc2
1719	adc	$t0, $acc3
1720	adc	%rax, $acc0
1721	 mov	$acc2, %rax
1722	adc	\$0, %rdx
1723
1724	##########################################
1725	# Third iteration
1726	mov	$acc2, $t0
1727	shl	\$32, $acc2
1728	mov	%rdx, $acc1
1729	mulq	$t1
1730	shr	\$32, $t0
1731	add	$acc2, $acc3
1732	adc	$t0, $acc0
1733	adc	%rax, $acc1
1734	 mov	$acc3, %rax
1735	adc	\$0, %rdx
1736
1737	###########################################
1738	# Last iteration
1739	mov	$acc3, $t0
1740	shl	\$32, $acc3
1741	mov	%rdx, $acc2
1742	mulq	$t1
1743	shr	\$32, $t0
1744	add	$acc3, $acc0
1745	adc	$t0, $acc1
1746	adc	%rax, $acc2
1747	adc	\$0, %rdx
1748	xor	$acc3, $acc3
1749
1750	############################################
1751	# Add the rest of the acc
1752	add	$acc0, $acc4
1753	adc	$acc1, $acc5
1754	 mov	$acc4, $acc0
1755	adc	$acc2, $acc6
1756	adc	%rdx, $acc7
1757	 mov	$acc5, $acc1
1758	adc	\$0, $acc3
1759
1760	sub	\$-1, $acc4		# .Lpoly[0]
1761	 mov	$acc6, $acc2
1762	sbb	$a_ptr, $acc5		# .Lpoly[1]
1763	sbb	\$0, $acc6		# .Lpoly[2]
1764	 mov	$acc7, $t0
1765	sbb	$t1, $acc7		# .Lpoly[3]
1766	sbb	\$0, $acc3
1767
1768	cmovc	$acc0, $acc4
1769	cmovc	$acc1, $acc5
1770	mov	$acc4, 8*0($r_ptr)
1771	cmovc	$acc2, $acc6
1772	mov	$acc5, 8*1($r_ptr)
1773	cmovc	$t0, $acc7
1774	mov	$acc6, 8*2($r_ptr)
1775	mov	$acc7, 8*3($r_ptr)
1776
1777	ret
1778.cfi_endproc
1779.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
1780___
1781
1782if ($addx) {
1783$code.=<<___;
1784.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
1785.align	32
1786__ecp_nistz256_mul_montx:
1787.cfi_startproc
1788	########################################################################
1789	# Multiply by b[0]
1790	mulx	$acc1, $acc0, $acc1
1791	mulx	$acc2, $t0, $acc2
1792	mov	\$32, $poly1
1793	xor	$acc5, $acc5		# cf=0
1794	mulx	$acc3, $t1, $acc3
1795	mov	.Lpoly+8*3(%rip), $poly3
1796	adc	$t0, $acc1
1797	mulx	$acc4, $t0, $acc4
1798	 mov	$acc0, %rdx
1799	adc	$t1, $acc2
1800	 shlx	$poly1,$acc0,$t1
1801	adc	$t0, $acc3
1802	 shrx	$poly1,$acc0,$t0
1803	adc	\$0, $acc4
1804
1805	########################################################################
1806	# First reduction step
1807	add	$t1, $acc1
1808	adc	$t0, $acc2
1809
1810	mulx	$poly3, $t0, $t1
1811	 mov	8*1($b_ptr), %rdx
1812	adc	$t0, $acc3
1813	adc	$t1, $acc4
1814	adc	\$0, $acc5
1815	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0
1816
1817	########################################################################
1818	# Multiply by b[1]
1819	mulx	8*0+128($a_ptr), $t0, $t1
1820	adcx	$t0, $acc1
1821	adox	$t1, $acc2
1822
1823	mulx	8*1+128($a_ptr), $t0, $t1
1824	adcx	$t0, $acc2
1825	adox	$t1, $acc3
1826
1827	mulx	8*2+128($a_ptr), $t0, $t1
1828	adcx	$t0, $acc3
1829	adox	$t1, $acc4
1830
1831	mulx	8*3+128($a_ptr), $t0, $t1
1832	 mov	$acc1, %rdx
1833	adcx	$t0, $acc4
1834	 shlx	$poly1, $acc1, $t0
1835	adox	$t1, $acc5
1836	 shrx	$poly1, $acc1, $t1
1837
1838	adcx	$acc0, $acc5
1839	adox	$acc0, $acc0
1840	adc	\$0, $acc0
1841
1842	########################################################################
1843	# Second reduction step
1844	add	$t0, $acc2
1845	adc	$t1, $acc3
1846
1847	mulx	$poly3, $t0, $t1
1848	 mov	8*2($b_ptr), %rdx
1849	adc	$t0, $acc4
1850	adc	$t1, $acc5
1851	adc	\$0, $acc0
1852	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0
1853
1854	########################################################################
1855	# Multiply by b[2]
1856	mulx	8*0+128($a_ptr), $t0, $t1
1857	adcx	$t0, $acc2
1858	adox	$t1, $acc3
1859
1860	mulx	8*1+128($a_ptr), $t0, $t1
1861	adcx	$t0, $acc3
1862	adox	$t1, $acc4
1863
1864	mulx	8*2+128($a_ptr), $t0, $t1
1865	adcx	$t0, $acc4
1866	adox	$t1, $acc5
1867
1868	mulx	8*3+128($a_ptr), $t0, $t1
1869	 mov	$acc2, %rdx
1870	adcx	$t0, $acc5
1871	 shlx	$poly1, $acc2, $t0
1872	adox	$t1, $acc0
1873	 shrx	$poly1, $acc2, $t1
1874
1875	adcx	$acc1, $acc0
1876	adox	$acc1, $acc1
1877	adc	\$0, $acc1
1878
1879	########################################################################
1880	# Third reduction step
1881	add	$t0, $acc3
1882	adc	$t1, $acc4
1883
1884	mulx	$poly3, $t0, $t1
1885	 mov	8*3($b_ptr), %rdx
1886	adc	$t0, $acc5
1887	adc	$t1, $acc0
1888	adc	\$0, $acc1
1889	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
1890
1891	########################################################################
1892	# Multiply by b[3]
1893	mulx	8*0+128($a_ptr), $t0, $t1
1894	adcx	$t0, $acc3
1895	adox	$t1, $acc4
1896
1897	mulx	8*1+128($a_ptr), $t0, $t1
1898	adcx	$t0, $acc4
1899	adox	$t1, $acc5
1900
1901	mulx	8*2+128($a_ptr), $t0, $t1
1902	adcx	$t0, $acc5
1903	adox	$t1, $acc0
1904
1905	mulx	8*3+128($a_ptr), $t0, $t1
1906	 mov	$acc3, %rdx
1907	adcx	$t0, $acc0
1908	 shlx	$poly1, $acc3, $t0
1909	adox	$t1, $acc1
1910	 shrx	$poly1, $acc3, $t1
1911
1912	adcx	$acc2, $acc1
1913	adox	$acc2, $acc2
1914	adc	\$0, $acc2
1915
1916	########################################################################
1917	# Fourth reduction step
1918	add	$t0, $acc4
1919	adc	$t1, $acc5
1920
1921	mulx	$poly3, $t0, $t1
1922	 mov	$acc4, $t2
1923	mov	.Lpoly+8*1(%rip), $poly1
1924	adc	$t0, $acc0
1925	 mov	$acc5, $t3
1926	adc	$t1, $acc1
1927	adc	\$0, $acc2
1928
1929	########################################################################
1930	# Branch-less conditional subtraction of P
1931	xor	%eax, %eax
1932	 mov	$acc0, $t0
1933	sbb	\$-1, $acc4		# .Lpoly[0]
1934	sbb	$poly1, $acc5		# .Lpoly[1]
1935	sbb	\$0, $acc0		# .Lpoly[2]
1936	 mov	$acc1, $t1
1937	sbb	$poly3, $acc1		# .Lpoly[3]
1938	sbb	\$0, $acc2
1939
1940	cmovc	$t2, $acc4
1941	cmovc	$t3, $acc5
1942	mov	$acc4, 8*0($r_ptr)
1943	cmovc	$t0, $acc0
1944	mov	$acc5, 8*1($r_ptr)
1945	cmovc	$t1, $acc1
1946	mov	$acc0, 8*2($r_ptr)
1947	mov	$acc1, 8*3($r_ptr)
1948
1949	ret
1950.cfi_endproc
1951.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
1952
1953.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
1954.align	32
1955__ecp_nistz256_sqr_montx:
1956.cfi_startproc
1957	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
1958	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
1959	xor	%eax, %eax
1960	adc	$t0, $acc2
1961	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
1962	 mov	$acc6, %rdx
1963	adc	$t1, $acc3
1964	adc	\$0, $acc4
1965	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
1966
1967	#################################
1968	mulx	$acc7, $t0, $t1		# a[1]*a[2]
1969	adcx	$t0, $acc3
1970	adox	$t1, $acc4
1971
1972	mulx	$acc0, $t0, $t1		# a[1]*a[3]
1973	 mov	$acc7, %rdx
1974	adcx	$t0, $acc4
1975	adox	$t1, $acc5
1976	adc	\$0, $acc5
1977
1978	#################################
1979	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
1980	 mov	8*0+128($a_ptr), %rdx
1981	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
1982	 adcx	$acc1, $acc1		# acc1:6<<1
1983	adox	$t0, $acc5
1984	 adcx	$acc2, $acc2
1985	adox	$acc7, $acc6		# of=0
1986
1987	mulx	%rdx, $acc0, $t1
1988	mov	8*1+128($a_ptr), %rdx
1989	 adcx	$acc3, $acc3
1990	adox	$t1, $acc1
1991	 adcx	$acc4, $acc4
1992	mulx	%rdx, $t0, $t4
1993	mov	8*2+128($a_ptr), %rdx
1994	 adcx	$acc5, $acc5
1995	adox	$t0, $acc2
1996	 adcx	$acc6, $acc6
1997	.byte	0x67
1998	mulx	%rdx, $t0, $t1
1999	mov	8*3+128($a_ptr), %rdx
2000	adox	$t4, $acc3
2001	 adcx	$acc7, $acc7
2002	adox	$t0, $acc4
2003	 mov	\$32, $a_ptr
2004	adox	$t1, $acc5
2005	.byte	0x67,0x67
2006	mulx	%rdx, $t0, $t4
2007	 mov	.Lpoly+8*3(%rip), %rdx
2008	adox	$t0, $acc6
2009	 shlx	$a_ptr, $acc0, $t0
2010	adox	$t4, $acc7
2011	 shrx	$a_ptr, $acc0, $t4
2012	mov	%rdx,$t1
2013
2014	# reduction step 1
2015	add	$t0, $acc1
2016	adc	$t4, $acc2
2017
2018	mulx	$acc0, $t0, $acc0
2019	adc	$t0, $acc3
2020	 shlx	$a_ptr, $acc1, $t0
2021	adc	\$0, $acc0
2022	 shrx	$a_ptr, $acc1, $t4
2023
2024	# reduction step 2
2025	add	$t0, $acc2
2026	adc	$t4, $acc3
2027
2028	mulx	$acc1, $t0, $acc1
2029	adc	$t0, $acc0
2030	 shlx	$a_ptr, $acc2, $t0
2031	adc	\$0, $acc1
2032	 shrx	$a_ptr, $acc2, $t4
2033
2034	# reduction step 3
2035	add	$t0, $acc3
2036	adc	$t4, $acc0
2037
2038	mulx	$acc2, $t0, $acc2
2039	adc	$t0, $acc1
2040	 shlx	$a_ptr, $acc3, $t0
2041	adc	\$0, $acc2
2042	 shrx	$a_ptr, $acc3, $t4
2043
2044	# reduction step 4
2045	add	$t0, $acc0
2046	adc	$t4, $acc1
2047
2048	mulx	$acc3, $t0, $acc3
2049	adc	$t0, $acc2
2050	adc	\$0, $acc3
2051
2052	xor	$t3, $t3
2053	add	$acc0, $acc4		# accumulate upper half
2054	 mov	.Lpoly+8*1(%rip), $a_ptr
2055	adc	$acc1, $acc5
2056	 mov	$acc4, $acc0
2057	adc	$acc2, $acc6
2058	adc	$acc3, $acc7
2059	 mov	$acc5, $acc1
2060	adc	\$0, $t3
2061
2062	sub	\$-1, $acc4		# .Lpoly[0]
2063	 mov	$acc6, $acc2
2064	sbb	$a_ptr, $acc5		# .Lpoly[1]
2065	sbb	\$0, $acc6		# .Lpoly[2]
2066	 mov	$acc7, $acc3
2067	sbb	$t1, $acc7		# .Lpoly[3]
2068	sbb	\$0, $t3
2069
2070	cmovc	$acc0, $acc4
2071	cmovc	$acc1, $acc5
2072	mov	$acc4, 8*0($r_ptr)
2073	cmovc	$acc2, $acc6
2074	mov	$acc5, 8*1($r_ptr)
2075	cmovc	$acc3, $acc7
2076	mov	$acc6, 8*2($r_ptr)
2077	mov	$acc7, 8*3($r_ptr)
2078
2079	ret
2080.cfi_endproc
2081.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
2082___
2083}
2084}
2085{
2086my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2087my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
2088my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
2089my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
2090
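# The select_w5/select_w7 routines below are constant-time table gathers used
# by the scalar multiplication: every table entry is read and ANDed with an
# all-ones/all-zeroes mask produced by comparing its position against the
# requested index (pcmpeqd/vpcmpeqd), and the masked entries are then ORed
# (or equivalently XORed) together, so neither the memory access pattern nor
# the timing depends on the secret index.  A scalar reference model (an
# illustrative sketch; _ct_select_ref is a hypothetical helper, never called
# by this generator):
sub _ct_select_ref {
	my ($idx, @table) = @_;			# 1-based index; 0 selects nothing
	my @acc = (0) x scalar(@{$table[0]});	# one slot per limb
	for my $i (1 .. scalar(@table)) {
		my $mask = ($i == $idx) ? ~0 : 0;	# pcmpeqd analogue
		$acc[$_] |= $table[$i - 1][$_] & $mask for (0 .. $#acc);
	}
	return \@acc;
}
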
2091$code.=<<___;
2092################################################################################
2093# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
2094.globl	ecp_nistz256_select_w5
2095.type	ecp_nistz256_select_w5,\@abi-omnipotent
2096.align	32
2097ecp_nistz256_select_w5:
2098.cfi_startproc
2099___
2100$code.=<<___	if ($avx>1);
2101	leaq	OPENSSL_ia32cap_P(%rip), %rax
2102	mov	8(%rax), %rax
2103	test	\$`1<<5`, %eax
2104	jnz	.Lavx2_select_w5
2105___
2106$code.=<<___	if ($win64);
2107	lea	-0x88(%rsp), %rax
2108.LSEH_begin_ecp_nistz256_select_w5:
2109	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
2110	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
2111	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
2112	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
2113	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
2114	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
2115	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
2116	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
2117	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
2118	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
2119	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
2120___
2121$code.=<<___;
2122	movdqa	.LOne(%rip), $ONE
2123	movd	$index, $INDEX
2124
2125	pxor	$Ra, $Ra
2126	pxor	$Rb, $Rb
2127	pxor	$Rc, $Rc
2128	pxor	$Rd, $Rd
2129	pxor	$Re, $Re
2130	pxor	$Rf, $Rf
2131
2132	movdqa	$ONE, $M0
2133	pshufd	\$0, $INDEX, $INDEX
2134
2135	mov	\$16, %rax
2136.Lselect_loop_sse_w5:
2137
2138	movdqa	$M0, $TMP0
2139	paddd	$ONE, $M0
2140	pcmpeqd $INDEX, $TMP0
2141
2142	movdqa	16*0($in_t), $T0a
2143	movdqa	16*1($in_t), $T0b
2144	movdqa	16*2($in_t), $T0c
2145	movdqa	16*3($in_t), $T0d
2146	movdqa	16*4($in_t), $T0e
2147	movdqa	16*5($in_t), $T0f
2148	lea 16*6($in_t), $in_t
2149
2150	pand	$TMP0, $T0a
2151	pand	$TMP0, $T0b
2152	por	$T0a, $Ra
2153	pand	$TMP0, $T0c
2154	por	$T0b, $Rb
2155	pand	$TMP0, $T0d
2156	por	$T0c, $Rc
2157	pand	$TMP0, $T0e
2158	por	$T0d, $Rd
2159	pand	$TMP0, $T0f
2160	por	$T0e, $Re
2161	por	$T0f, $Rf
2162
2163	dec	%rax
2164	jnz	.Lselect_loop_sse_w5
2165
2166	movdqu	$Ra, 16*0($val)
2167	movdqu	$Rb, 16*1($val)
2168	movdqu	$Rc, 16*2($val)
2169	movdqu	$Rd, 16*3($val)
2170	movdqu	$Re, 16*4($val)
2171	movdqu	$Rf, 16*5($val)
2172___
2173$code.=<<___	if ($win64);
2174	movaps	(%rsp), %xmm6
2175	movaps	0x10(%rsp), %xmm7
2176	movaps	0x20(%rsp), %xmm8
2177	movaps	0x30(%rsp), %xmm9
2178	movaps	0x40(%rsp), %xmm10
2179	movaps	0x50(%rsp), %xmm11
2180	movaps	0x60(%rsp), %xmm12
2181	movaps	0x70(%rsp), %xmm13
2182	movaps	0x80(%rsp), %xmm14
2183	movaps	0x90(%rsp), %xmm15
2184	lea	0xa8(%rsp), %rsp
2185___
2186$code.=<<___;
2187	ret
2188.cfi_endproc
2189.LSEH_end_ecp_nistz256_select_w5:
2190.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
2191
2192################################################################################
2193# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
2194.globl	ecp_nistz256_select_w7
2195.type	ecp_nistz256_select_w7,\@abi-omnipotent
2196.align	32
2197ecp_nistz256_select_w7:
2198.cfi_startproc
2199___
2200$code.=<<___	if ($avx>1);
2201	leaq	OPENSSL_ia32cap_P(%rip), %rax
2202	mov	8(%rax), %rax
2203	test	\$`1<<5`, %eax
2204	jnz	.Lavx2_select_w7
2205___
2206$code.=<<___	if ($win64);
2207	lea	-0x88(%rsp), %rax
2208.LSEH_begin_ecp_nistz256_select_w7:
2209	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
2210	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
2211	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
2212	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
2213	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
2214	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
2215	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
2216	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
2217	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
2218	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
2219	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
2220___
2221$code.=<<___;
2222	movdqa	.LOne(%rip), $M0
2223	movd	$index, $INDEX
2224
2225	pxor	$Ra, $Ra
2226	pxor	$Rb, $Rb
2227	pxor	$Rc, $Rc
2228	pxor	$Rd, $Rd
2229
2230	movdqa	$M0, $ONE
2231	pshufd	\$0, $INDEX, $INDEX
2232	mov	\$64, %rax
2233
2234.Lselect_loop_sse_w7:
2235	movdqa	$M0, $TMP0
2236	paddd	$ONE, $M0
2237	movdqa	16*0($in_t), $T0a
2238	movdqa	16*1($in_t), $T0b
2239	pcmpeqd	$INDEX, $TMP0
2240	movdqa	16*2($in_t), $T0c
2241	movdqa	16*3($in_t), $T0d
2242	lea	16*4($in_t), $in_t
2243
2244	pand	$TMP0, $T0a
2245	pand	$TMP0, $T0b
2246	por	$T0a, $Ra
2247	pand	$TMP0, $T0c
2248	por	$T0b, $Rb
2249	pand	$TMP0, $T0d
2250	por	$T0c, $Rc
2251	prefetcht0	255($in_t)
2252	por	$T0d, $Rd
2253
2254	dec	%rax
2255	jnz	.Lselect_loop_sse_w7
2256
2257	movdqu	$Ra, 16*0($val)
2258	movdqu	$Rb, 16*1($val)
2259	movdqu	$Rc, 16*2($val)
2260	movdqu	$Rd, 16*3($val)
2261___
2262$code.=<<___	if ($win64);
2263	movaps	(%rsp), %xmm6
2264	movaps	0x10(%rsp), %xmm7
2265	movaps	0x20(%rsp), %xmm8
2266	movaps	0x30(%rsp), %xmm9
2267	movaps	0x40(%rsp), %xmm10
2268	movaps	0x50(%rsp), %xmm11
2269	movaps	0x60(%rsp), %xmm12
2270	movaps	0x70(%rsp), %xmm13
2271	movaps	0x80(%rsp), %xmm14
2272	movaps	0x90(%rsp), %xmm15
2273	lea	0xa8(%rsp), %rsp
2274___
2275$code.=<<___;
2276	ret
2277.cfi_endproc
2278.LSEH_end_ecp_nistz256_select_w7:
2279.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
2280___
2281}
2282if ($avx>1) {
2283my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2284my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
2285my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
2286my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
2287
2288$code.=<<___;
2289################################################################################
2290# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
2291.type	ecp_nistz256_avx2_select_w5,\@abi-omnipotent
2292.align	32
2293ecp_nistz256_avx2_select_w5:
2294.cfi_startproc
2295.Lavx2_select_w5:
2296	vzeroupper
2297___
2298$code.=<<___	if ($win64);
2299	lea	-0x88(%rsp), %rax
2300	mov	%rsp,%r11
2301.LSEH_begin_ecp_nistz256_avx2_select_w5:
2302	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
2303	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
2304	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
2305	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
2306	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
2307	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
2308	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
2309	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
2310	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
2311	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
2312	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
2313___
2314$code.=<<___;
2315	vmovdqa	.LTwo(%rip), $TWO
2316
2317	vpxor	$Ra, $Ra, $Ra
2318	vpxor	$Rb, $Rb, $Rb
2319	vpxor	$Rc, $Rc, $Rc
2320
2321	vmovdqa .LOne(%rip), $M0
2322	vmovdqa .LTwo(%rip), $M1
2323
2324	vmovd	$index, %xmm1
2325	vpermd	$INDEX, $Ra, $INDEX
2326
2327	mov	\$8, %rax
2328.Lselect_loop_avx2_w5:
2329
2330	vmovdqa	32*0($in_t), $T0a
2331	vmovdqa	32*1($in_t), $T0b
2332	vmovdqa	32*2($in_t), $T0c
2333
2334	vmovdqa	32*3($in_t), $T1a
2335	vmovdqa	32*4($in_t), $T1b
2336	vmovdqa	32*5($in_t), $T1c
2337
2338	vpcmpeqd	$INDEX, $M0, $TMP0
2339	vpcmpeqd	$INDEX, $M1, $TMP1
2340
2341	vpaddd	$TWO, $M0, $M0
2342	vpaddd	$TWO, $M1, $M1
2343	lea	32*6($in_t), $in_t
2344
2345	vpand	$TMP0, $T0a, $T0a
2346	vpand	$TMP0, $T0b, $T0b
2347	vpand	$TMP0, $T0c, $T0c
2348	vpand	$TMP1, $T1a, $T1a
2349	vpand	$TMP1, $T1b, $T1b
2350	vpand	$TMP1, $T1c, $T1c
2351
2352	vpxor	$T0a, $Ra, $Ra
2353	vpxor	$T0b, $Rb, $Rb
2354	vpxor	$T0c, $Rc, $Rc
2355	vpxor	$T1a, $Ra, $Ra
2356	vpxor	$T1b, $Rb, $Rb
2357	vpxor	$T1c, $Rc, $Rc
2358
2359	dec %rax
2360	jnz .Lselect_loop_avx2_w5
2361
2362	vmovdqu $Ra, 32*0($val)
2363	vmovdqu $Rb, 32*1($val)
2364	vmovdqu $Rc, 32*2($val)
2365	vzeroupper
2366___
2367$code.=<<___	if ($win64);
2368	movaps	(%rsp), %xmm6
2369	movaps	0x10(%rsp), %xmm7
2370	movaps	0x20(%rsp), %xmm8
2371	movaps	0x30(%rsp), %xmm9
2372	movaps	0x40(%rsp), %xmm10
2373	movaps	0x50(%rsp), %xmm11
2374	movaps	0x60(%rsp), %xmm12
2375	movaps	0x70(%rsp), %xmm13
2376	movaps	0x80(%rsp), %xmm14
2377	movaps	0x90(%rsp), %xmm15
2378	lea	(%r11), %rsp
2379___
2380$code.=<<___;
2381	ret
2382.cfi_endproc
2383.LSEH_end_ecp_nistz256_avx2_select_w5:
2384.size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
2385___
2386}
2387if ($avx>1) {
2388my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
2389my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
2390my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
2391my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
2392my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
2393
2394$code.=<<___;
2395
2396################################################################################
2397# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
2398.globl	ecp_nistz256_avx2_select_w7
2399.type	ecp_nistz256_avx2_select_w7,\@abi-omnipotent
2400.align	32
2401ecp_nistz256_avx2_select_w7:
2402.cfi_startproc
2403.Lavx2_select_w7:
2404	vzeroupper
2405___
2406$code.=<<___	if ($win64);
2407	mov	%rsp,%r11
2408	lea	-0x88(%rsp), %rax
2409.LSEH_begin_ecp_nistz256_avx2_select_w7:
2410	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
2411	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
2412	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
2413	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 8(%rax)
2414	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
2415	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
2416	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
2417	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
2418	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
2419	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
2420	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
2421___
2422$code.=<<___;
2423	vmovdqa	.LThree(%rip), $THREE
2424
2425	vpxor	$Ra, $Ra, $Ra
2426	vpxor	$Rb, $Rb, $Rb
2427
2428	vmovdqa .LOne(%rip), $M0
2429	vmovdqa .LTwo(%rip), $M1
2430	vmovdqa .LThree(%rip), $M2
2431
2432	vmovd	$index, %xmm1
2433	vpermd	$INDEX, $Ra, $INDEX
2434	# Skip index = 0, because it is implicitly the point at infinity
2435
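	# Constant-time scan of the w7 table: 64 affine points of 64 bytes
	# each (x and y only). Each iteration compares the broadcast index
	# against three running counters and folds three entries into the
	# accumulators, so 21 iterations cover entries 1..63; entry 64 is
	# handled by the extra compare after the loop.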
2436	mov	\$21, %rax
2437.Lselect_loop_avx2_w7:
2438
2439	vmovdqa	32*0($in_t), $T0a
2440	vmovdqa	32*1($in_t), $T0b
2441
2442	vmovdqa	32*2($in_t), $T1a
2443	vmovdqa	32*3($in_t), $T1b
2444
2445	vmovdqa	32*4($in_t), $T2a
2446	vmovdqa	32*5($in_t), $T2b
2447
2448	vpcmpeqd	$INDEX, $M0, $TMP0
2449	vpcmpeqd	$INDEX, $M1, $TMP1
2450	vpcmpeqd	$INDEX, $M2, $TMP2
2451
2452	vpaddd	$THREE, $M0, $M0
2453	vpaddd	$THREE, $M1, $M1
2454	vpaddd	$THREE, $M2, $M2
2455	lea	32*6($in_t), $in_t
2456
2457	vpand	$TMP0, $T0a, $T0a
2458	vpand	$TMP0, $T0b, $T0b
2459	vpand	$TMP1, $T1a, $T1a
2460	vpand	$TMP1, $T1b, $T1b
2461	vpand	$TMP2, $T2a, $T2a
2462	vpand	$TMP2, $T2b, $T2b
2463
2464	vpxor	$T0a, $Ra, $Ra
2465	vpxor	$T0b, $Rb, $Rb
2466	vpxor	$T1a, $Ra, $Ra
2467	vpxor	$T1b, $Rb, $Rb
2468	vpxor	$T2a, $Ra, $Ra
2469	vpxor	$T2b, $Rb, $Rb
2470
2471	dec %rax
2472	jnz .Lselect_loop_avx2_w7
2473
2474
2475	vmovdqa	32*0($in_t), $T0a
2476	vmovdqa	32*1($in_t), $T0b
2477
2478	vpcmpeqd	$INDEX, $M0, $TMP0
2479
2480	vpand	$TMP0, $T0a, $T0a
2481	vpand	$TMP0, $T0b, $T0b
2482
2483	vpxor	$T0a, $Ra, $Ra
2484	vpxor	$T0b, $Rb, $Rb
2485
2486	vmovdqu $Ra, 32*0($val)
2487	vmovdqu $Rb, 32*1($val)
2488	vzeroupper
2489___
2490$code.=<<___	if ($win64);
2491	movaps	(%rsp), %xmm6
2492	movaps	0x10(%rsp), %xmm7
2493	movaps	0x20(%rsp), %xmm8
2494	movaps	0x30(%rsp), %xmm9
2495	movaps	0x40(%rsp), %xmm10
2496	movaps	0x50(%rsp), %xmm11
2497	movaps	0x60(%rsp), %xmm12
2498	movaps	0x70(%rsp), %xmm13
2499	movaps	0x80(%rsp), %xmm14
2500	movaps	0x90(%rsp), %xmm15
2501	lea	(%r11), %rsp
2502___
2503$code.=<<___;
2504	ret
2505.cfi_endproc
2506.LSEH_end_ecp_nistz256_avx2_select_w7:
2507.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
2508___
2509} else {
2510$code.=<<___;
2511.globl	ecp_nistz256_avx2_select_w7
2512.type	ecp_nistz256_avx2_select_w7,\@function,3
2513.align	32
2514ecp_nistz256_avx2_select_w7:
2515	.byte	0x0f,0x0b	# ud2
2516	ret
2517.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
2518___
2519}
2520{{{
2521########################################################################
2522	# This block implements the higher-level point_double, point_add and
2523	# point_add_affine routines. The key to performance here is to let the
2524	# out-of-order execution logic overlap computations from the next step
2525	# with tail processing from the current step. By using a tailored calling
2526	# sequence we minimize inter-step overhead and give the processor a better
2527	# shot at overlapping operations...
2528	#
2529	# You will notice that input data is copied to the stack. The trouble is
2530	# that there are no registers to spare for holding the original pointers,
2531	# and reloading them would create undesired dependencies on the
2532	# effective-address calculation paths. In other words, the copying is done
2533	# to favour out-of-order execution logic.
2534#						<appro@openssl.org>
2535
2536my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
2537my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
2538my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
2539my ($poly1,$poly3)=($acc6,$acc7);
2540
2541sub load_for_mul () {
2542my ($a,$b,$src0) = @_;
2543my $bias = $src0 eq "%rax" ? 0 : -128;
2544
2545"	mov	$b, $src0
2546	lea	$b, $b_ptr
2547	mov	8*0+$a, $acc1
2548	mov	8*1+$a, $acc2
2549	lea	$bias+$a, $a_ptr
2550	mov	8*2+$a, $acc3
2551	mov	8*3+$a, $acc4"
2552}
2553
2554sub load_for_sqr () {
2555my ($a,$src0) = @_;
2556my $bias = $src0 eq "%rax" ? 0 : -128;
2557
2558"	mov	8*0+$a, $src0
2559	mov	8*1+$a, $acc6
2560	lea	$bias+$a, $a_ptr
2561	mov	8*2+$a, $acc7
2562	mov	8*3+$a, $acc0"
2563}
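# Both helpers above pre-load the four limbs of the first operand into
# registers and point $a_ptr/$b_ptr at the operands expected by the
# __ecp_nistz256_mul_mont*/__ecp_nistz256_sqr_mont* subroutines. The -128
# bias applied on the mulx path is compensated by the +128 displacements
# used inside the x-suffixed multiplication code, so the same loading
# sequence serves both variants.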
2564
2565									{
2566########################################################################
2567# operate in 4-5-0-1 "name space" that matches multiplication output
2568#
2569my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2570
2571$code.=<<___;
2572.type	__ecp_nistz256_add_toq,\@abi-omnipotent
2573.align	32
2574__ecp_nistz256_add_toq:
2575.cfi_startproc
2576	xor	$t4,$t4
2577	add	8*0($b_ptr), $a0
2578	adc	8*1($b_ptr), $a1
2579	 mov	$a0, $t0
2580	adc	8*2($b_ptr), $a2
2581	adc	8*3($b_ptr), $a3
2582	 mov	$a1, $t1
2583	adc	\$0, $t4
2584
2585	sub	\$-1, $a0
2586	 mov	$a2, $t2
2587	sbb	$poly1, $a1
2588	sbb	\$0, $a2
2589	 mov	$a3, $t3
2590	sbb	$poly3, $a3
2591	sbb	\$0, $t4
2592
2593	cmovc	$t0, $a0
2594	cmovc	$t1, $a1
2595	mov	$a0, 8*0($r_ptr)
2596	cmovc	$t2, $a2
2597	mov	$a1, 8*1($r_ptr)
2598	cmovc	$t3, $a3
2599	mov	$a2, 8*2($r_ptr)
2600	mov	$a3, 8*3($r_ptr)
2601
2602	ret
2603.cfi_endproc
2604.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
2605
2606.type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
2607.align	32
2608__ecp_nistz256_sub_fromq:
2609.cfi_startproc
2610	sub	8*0($b_ptr), $a0
2611	sbb	8*1($b_ptr), $a1
2612	 mov	$a0, $t0
2613	sbb	8*2($b_ptr), $a2
2614	sbb	8*3($b_ptr), $a3
2615	 mov	$a1, $t1
2616	sbb	$t4, $t4
2617
2618	add	\$-1, $a0
2619	 mov	$a2, $t2
2620	adc	$poly1, $a1
2621	adc	\$0, $a2
2622	 mov	$a3, $t3
2623	adc	$poly3, $a3
2624	test	$t4, $t4
2625
2626	cmovz	$t0, $a0
2627	cmovz	$t1, $a1
2628	mov	$a0, 8*0($r_ptr)
2629	cmovz	$t2, $a2
2630	mov	$a1, 8*1($r_ptr)
2631	cmovz	$t3, $a3
2632	mov	$a2, 8*2($r_ptr)
2633	mov	$a3, 8*3($r_ptr)
2634
2635	ret
2636.cfi_endproc
2637.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
2638
2639.type	__ecp_nistz256_subq,\@abi-omnipotent
2640.align	32
2641__ecp_nistz256_subq:
2642.cfi_startproc
2643	sub	$a0, $t0
2644	sbb	$a1, $t1
2645	 mov	$t0, $a0
2646	sbb	$a2, $t2
2647	sbb	$a3, $t3
2648	 mov	$t1, $a1
2649	sbb	$t4, $t4
2650
2651	add	\$-1, $t0
2652	 mov	$t2, $a2
2653	adc	$poly1, $t1
2654	adc	\$0, $t2
2655	 mov	$t3, $a3
2656	adc	$poly3, $t3
2657	test	$t4, $t4
2658
2659	cmovnz	$t0, $a0
2660	cmovnz	$t1, $a1
2661	cmovnz	$t2, $a2
2662	cmovnz	$t3, $a3
2663
2664	ret
2665.cfi_endproc
2666.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
2667
2668.type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
2669.align	32
2670__ecp_nistz256_mul_by_2q:
2671.cfi_startproc
2672	xor	$t4, $t4
2673	add	$a0, $a0		# a0:a3+a0:a3
2674	adc	$a1, $a1
2675	 mov	$a0, $t0
2676	adc	$a2, $a2
2677	adc	$a3, $a3
2678	 mov	$a1, $t1
2679	adc	\$0, $t4
2680
2681	sub	\$-1, $a0
2682	 mov	$a2, $t2
2683	sbb	$poly1, $a1
2684	sbb	\$0, $a2
2685	 mov	$a3, $t3
2686	sbb	$poly3, $a3
2687	sbb	\$0, $t4
2688
2689	cmovc	$t0, $a0
2690	cmovc	$t1, $a1
2691	mov	$a0, 8*0($r_ptr)
2692	cmovc	$t2, $a2
2693	mov	$a1, 8*1($r_ptr)
2694	cmovc	$t3, $a3
2695	mov	$a2, 8*2($r_ptr)
2696	mov	$a3, 8*3($r_ptr)
2697
2698	ret
2699.cfi_endproc
2700.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
2701___
2702									}
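# The q-suffixed helpers above all share one reduction pattern: do the plain
# 256-bit add/sub, record the carry/borrow, speculatively subtract (or add
# back) the prime, and use cmov to keep whichever value is correct. A rough,
# hedged C sketch of the addition case only (illustrative names, not part of
# the generated code; assumes <stdint.h> and a compiler with unsigned
# __int128):
#
#   static void p256_add_sketch(uint64_t r[4], const uint64_t a[4],
#                               const uint64_t b[4]) {
#     static const uint64_t P[4] = {0xffffffffffffffffULL, 0x00000000ffffffffULL,
#                                   0x0000000000000000ULL, 0xffffffff00000001ULL};
#     uint64_t s[4], d[4], carry, borrow = 0;
#     unsigned __int128 t = 0;
#     for (int i = 0; i < 4; i++) {          /* s = a + b, carry out in 'carry' */
#       t += (unsigned __int128)a[i] + b[i];
#       s[i] = (uint64_t)t;
#       t >>= 64;
#     }
#     carry = (uint64_t)t;
#     for (int i = 0; i < 4; i++) {          /* d = s - P, borrow out in 'borrow' */
#       unsigned __int128 u = (unsigned __int128)s[i] - P[i] - borrow;
#       d[i] = (uint64_t)u;
#       borrow = (uint64_t)(u >> 64) & 1;
#     }
#     /* keep the unreduced sum when it was already below P (borrow beyond the
#        carry bit), otherwise keep the reduced value -- mirrors the cmovc above */
#     for (int i = 0; i < 4; i++)
#       r[i] = (borrow > carry) ? s[i] : d[i];
#   }
#
# The subtraction helpers are the mirror image: subtract, then conditionally
# add P back (cmovz/cmovnz) when the subtraction borrowed.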
2703sub gen_double () {
2704    my $x = shift;
2705    my ($src0,$sfx,$bias);
2706    my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
2707
2708    if ($x ne "x") {
2709	$src0 = "%rax";
2710	$sfx  = "";
2711	$bias = 0;
2712
2713$code.=<<___;
2714.globl	ecp_nistz256_point_double
2715.type	ecp_nistz256_point_double,\@function,2
2716.align	32
2717ecp_nistz256_point_double:
2718.cfi_startproc
2719___
2720$code.=<<___	if ($addx);
2721	leaq	OPENSSL_ia32cap_P(%rip), %rcx
2722	mov	8(%rcx), %rcx
2723	and	\$0x80100, %ecx
2724	cmp	\$0x80100, %ecx
2725	je	.Lpoint_doublex
2726___
2727    } else {
2728	$src0 = "%rdx";
2729	$sfx  = "x";
2730	$bias = 128;
2731
2732$code.=<<___;
2733.type	ecp_nistz256_point_doublex,\@function,2
2734.align	32
2735ecp_nistz256_point_doublex:
2736.cfi_startproc
2737.Lpoint_doublex:
2738___
2739    }
2740$code.=<<___;
2741	push	%rbp
2742.cfi_push	%rbp
2743	push	%rbx
2744.cfi_push	%rbx
2745	push	%r12
2746.cfi_push	%r12
2747	push	%r13
2748.cfi_push	%r13
2749	push	%r14
2750.cfi_push	%r14
2751	push	%r15
2752.cfi_push	%r15
2753	sub	\$32*5+8, %rsp
2754.cfi_adjust_cfa_offset	32*5+8
2755.Lpoint_double${x}_body:
2756
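	# Point doubling in Jacobian coordinates. The calls below compute
	#   S = 4*X*Y^2, M = 3*(X + Z^2)*(X - Z^2),
	#   X3 = M^2 - 2*S, Y3 = M*(S - X3) - 8*Y^4, Z3 = 2*Y*Z,
	# using the S, M, Zsqr and tmp0 stack slots for intermediates.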
2757.Lpoint_double_shortcut$x:
2758	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
2759	mov	$a_ptr, $b_ptr			# backup copy
2760	movdqu	0x10($a_ptr), %xmm1
2761	 mov	0x20+8*0($a_ptr), $acc4		# load in_y in "5-4-0-1" order
2762	 mov	0x20+8*1($a_ptr), $acc5
2763	 mov	0x20+8*2($a_ptr), $acc0
2764	 mov	0x20+8*3($a_ptr), $acc1
2765	 mov	.Lpoly+8*1(%rip), $poly1
2766	 mov	.Lpoly+8*3(%rip), $poly3
2767	movdqa	%xmm0, $in_x(%rsp)
2768	movdqa	%xmm1, $in_x+0x10(%rsp)
2769	lea	0x20($r_ptr), $acc2
2770	lea	0x40($r_ptr), $acc3
2771	movq	$r_ptr, %xmm0
2772	movq	$acc2, %xmm1
2773	movq	$acc3, %xmm2
2774
2775	lea	$S(%rsp), $r_ptr
2776	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);
2777
2778	mov	0x40+8*0($a_ptr), $src0
2779	mov	0x40+8*1($a_ptr), $acc6
2780	mov	0x40+8*2($a_ptr), $acc7
2781	mov	0x40+8*3($a_ptr), $acc0
2782	lea	0x40-$bias($a_ptr), $a_ptr
2783	lea	$Zsqr(%rsp), $r_ptr
2784	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);
2785
2786	`&load_for_sqr("$S(%rsp)", "$src0")`
2787	lea	$S(%rsp), $r_ptr
2788	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);
2789
2790	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
2791	mov	0x40+8*0($b_ptr), $acc1
2792	mov	0x40+8*1($b_ptr), $acc2
2793	mov	0x40+8*2($b_ptr), $acc3
2794	mov	0x40+8*3($b_ptr), $acc4
2795	lea	0x40-$bias($b_ptr), $a_ptr
2796	lea	0x20($b_ptr), $b_ptr
2797	movq	%xmm2, $r_ptr
2798	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
2799	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);
2800
2801	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
2802	mov	$in_x+8*1(%rsp), $acc5
2803	lea	$Zsqr(%rsp), $b_ptr
2804	mov	$in_x+8*2(%rsp), $acc0
2805	mov	$in_x+8*3(%rsp), $acc1
2806	lea	$M(%rsp), $r_ptr
2807	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);
2808
2809	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
2810	mov	$in_x+8*1(%rsp), $acc5
2811	lea	$Zsqr(%rsp), $b_ptr
2812	mov	$in_x+8*2(%rsp), $acc0
2813	mov	$in_x+8*3(%rsp), $acc1
2814	lea	$Zsqr(%rsp), $r_ptr
2815	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);
2816
2817	`&load_for_sqr("$S(%rsp)", "$src0")`
2818	movq	%xmm1, $r_ptr
2819	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
2820___
2821{
2822######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
2823# operate in 4-5-6-7 "name space" that matches squaring output
2824#
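# Division by 2 modulo the prime: if the value is odd, add the prime first
# (which keeps it congruent and makes the sum even), then shift the 257-bit
# result right by one bit, folding the carry back in as bit 255.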
2825my ($poly1,$poly3)=($a_ptr,$t1);
2826my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
2827
2828$code.=<<___;
2829	xor	$t4, $t4
2830	mov	$a0, $t0
2831	add	\$-1, $a0
2832	mov	$a1, $t1
2833	adc	$poly1, $a1
2834	mov	$a2, $t2
2835	adc	\$0, $a2
2836	mov	$a3, $t3
2837	adc	$poly3, $a3
2838	adc	\$0, $t4
2839	xor	$a_ptr, $a_ptr		# borrow $a_ptr
2840	test	\$1, $t0
2841
2842	cmovz	$t0, $a0
2843	cmovz	$t1, $a1
2844	cmovz	$t2, $a2
2845	cmovz	$t3, $a3
2846	cmovz	$a_ptr, $t4
2847
2848	mov	$a1, $t0		# a0:a3>>1
2849	shr	\$1, $a0
2850	shl	\$63, $t0
2851	mov	$a2, $t1
2852	shr	\$1, $a1
2853	or	$t0, $a0
2854	shl	\$63, $t1
2855	mov	$a3, $t2
2856	shr	\$1, $a2
2857	or	$t1, $a1
2858	shl	\$63, $t2
2859	mov	$a0, 8*0($r_ptr)
2860	shr	\$1, $a3
2861	mov	$a1, 8*1($r_ptr)
2862	shl	\$63, $t4
2863	or	$t2, $a2
2864	or	$t4, $a3
2865	mov	$a2, 8*2($r_ptr)
2866	mov	$a3, 8*3($r_ptr)
2867___
2868}
2869$code.=<<___;
2870	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
2871	lea	$M(%rsp), $r_ptr
2872	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);
2873
2874	lea	$tmp0(%rsp), $r_ptr
2875	call	__ecp_nistz256_mul_by_2$x
2876
2877	lea	$M(%rsp), $b_ptr
2878	lea	$M(%rsp), $r_ptr
2879	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);
2880
2881	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
2882	lea	$S(%rsp), $r_ptr
2883	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);
2884
2885	lea	$tmp0(%rsp), $r_ptr
2886	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);
2887
2888	`&load_for_sqr("$M(%rsp)", "$src0")`
2889	movq	%xmm0, $r_ptr
2890	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);
2891
2892	lea	$tmp0(%rsp), $b_ptr
2893	mov	$acc6, $acc0			# harmonize sqr output and sub input
2894	mov	$acc7, $acc1
2895	mov	$a_ptr, $poly1
2896	mov	$t1, $poly3
2897	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);
2898
2899	mov	$S+8*0(%rsp), $t0
2900	mov	$S+8*1(%rsp), $t1
2901	mov	$S+8*2(%rsp), $t2
2902	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
2903	lea	$S(%rsp), $r_ptr
2904	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);
2905
2906	mov	$M(%rsp), $src0
2907	lea	$M(%rsp), $b_ptr
2908	mov	$acc4, $acc6			# harmonize sub output and mul input
2909	xor	%ecx, %ecx
2910	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
2911	mov	$acc5, $acc2
2912	mov	$acc5, $S+8*1(%rsp)
2913	cmovz	$acc0, $acc3
2914	mov	$acc0, $S+8*2(%rsp)
2915	lea	$S-$bias(%rsp), $a_ptr
2916	cmovz	$acc1, $acc4
2917	mov	$acc1, $S+8*3(%rsp)
2918	mov	$acc6, $acc1
2919	lea	$S(%rsp), $r_ptr
2920	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);
2921
2922	movq	%xmm1, $b_ptr
2923	movq	%xmm1, $r_ptr
2924	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);
2925
2926	lea	32*5+56(%rsp), %rsi
2927.cfi_def_cfa	%rsi,8
2928	mov	-48(%rsi),%r15
2929.cfi_restore	%r15
2930	mov	-40(%rsi),%r14
2931.cfi_restore	%r14
2932	mov	-32(%rsi),%r13
2933.cfi_restore	%r13
2934	mov	-24(%rsi),%r12
2935.cfi_restore	%r12
2936	mov	-16(%rsi),%rbx
2937.cfi_restore	%rbx
2938	mov	-8(%rsi),%rbp
2939.cfi_restore	%rbp
2940	lea	(%rsi),%rsp
2941.cfi_def_cfa_register	%rsp
2942.Lpoint_double${x}_epilogue:
2943	ret
2944.cfi_endproc
2945.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
2946___
2947}
2948&gen_double("q");
2949
2950sub gen_add () {
2951    my $x = shift;
2952    my ($src0,$sfx,$bias);
2953    my ($H,$Hsqr,$R,$Rsqr,$Hcub,
2954	$U1,$U2,$S1,$S2,
2955	$res_x,$res_y,$res_z,
2956	$in1_x,$in1_y,$in1_z,
2957	$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
2958    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
2959
2960    if ($x ne "x") {
2961	$src0 = "%rax";
2962	$sfx  = "";
2963	$bias = 0;
2964
2965$code.=<<___;
2966.globl	ecp_nistz256_point_add
2967.type	ecp_nistz256_point_add,\@function,3
2968.align	32
2969ecp_nistz256_point_add:
2970.cfi_startproc
2971___
2972$code.=<<___	if ($addx);
2973	leaq	OPENSSL_ia32cap_P(%rip), %rcx
2974	mov	8(%rcx), %rcx
2975	and	\$0x80100, %ecx
2976	cmp	\$0x80100, %ecx
2977	je	.Lpoint_addx
2978___
2979    } else {
2980	$src0 = "%rdx";
2981	$sfx  = "x";
2982	$bias = 128;
2983
2984$code.=<<___;
2985.type	ecp_nistz256_point_addx,\@function,3
2986.align	32
2987ecp_nistz256_point_addx:
2988.cfi_startproc
2989.Lpoint_addx:
2990___
2991    }
2992$code.=<<___;
2993	push	%rbp
2994.cfi_push	%rbp
2995	push	%rbx
2996.cfi_push	%rbx
2997	push	%r12
2998.cfi_push	%r12
2999	push	%r13
3000.cfi_push	%r13
3001	push	%r14
3002.cfi_push	%r14
3003	push	%r15
3004.cfi_push	%r15
3005	sub	\$32*18+8, %rsp
3006.cfi_adjust_cfa_offset	32*18+8
3007.Lpoint_add${x}_body:
3008
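	# Full Jacobian point addition. The sequence below computes
	#   U1 = X1*Z2^2, U2 = X2*Z1^2, S1 = Y1*Z2^3, S2 = Y2*Z1^3,
	#   H  = U2 - U1,  R  = S2 - S1,
	#   X3 = R^2 - H^3 - 2*U1*H^2,
	#   Y3 = R*(U1*H^2 - X3) - S1*H^3,  Z3 = H*Z1*Z2,
	# while in1infty/in2infty masks are built from the Z coordinates so the
	# other input can be copied through when either point is at infinity.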
3009	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
3010	movdqu	0x10($a_ptr), %xmm1
3011	movdqu	0x20($a_ptr), %xmm2
3012	movdqu	0x30($a_ptr), %xmm3
3013	movdqu	0x40($a_ptr), %xmm4
3014	movdqu	0x50($a_ptr), %xmm5
3015	mov	$a_ptr, $b_ptr			# reassign
3016	mov	$b_org, $a_ptr			# reassign
3017	movdqa	%xmm0, $in1_x(%rsp)
3018	movdqa	%xmm1, $in1_x+0x10(%rsp)
3019	movdqa	%xmm2, $in1_y(%rsp)
3020	movdqa	%xmm3, $in1_y+0x10(%rsp)
3021	movdqa	%xmm4, $in1_z(%rsp)
3022	movdqa	%xmm5, $in1_z+0x10(%rsp)
3023	por	%xmm4, %xmm5
3024
3025	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_ptr
3026	 pshufd	\$0xb1, %xmm5, %xmm3
3027	movdqu	0x10($a_ptr), %xmm1
3028	movdqu	0x20($a_ptr), %xmm2
3029	 por	%xmm3, %xmm5
3030	movdqu	0x30($a_ptr), %xmm3
3031	 mov	0x40+8*0($a_ptr), $src0		# load original in2_z
3032	 mov	0x40+8*1($a_ptr), $acc6
3033	 mov	0x40+8*2($a_ptr), $acc7
3034	 mov	0x40+8*3($a_ptr), $acc0
3035	movdqa	%xmm0, $in2_x(%rsp)
3036	 pshufd	\$0x1e, %xmm5, %xmm4
3037	movdqa	%xmm1, $in2_x+0x10(%rsp)
3038	movdqu	0x40($a_ptr),%xmm0		# in2_z again
3039	movdqu	0x50($a_ptr),%xmm1
3040	movdqa	%xmm2, $in2_y(%rsp)
3041	movdqa	%xmm3, $in2_y+0x10(%rsp)
3042	 por	%xmm4, %xmm5
3043	 pxor	%xmm4, %xmm4
3044	por	%xmm0, %xmm1
3045	 movq	$r_ptr, %xmm0			# save $r_ptr
3046
3047	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
3048	 mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
3049	 mov	$acc6, $in2_z+8*1(%rsp)
3050	 mov	$acc7, $in2_z+8*2(%rsp)
3051	 mov	$acc0, $in2_z+8*3(%rsp)
3052	lea	$Z2sqr(%rsp), $r_ptr		# Z2^2
3053	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);
3054
3055	pcmpeqd	%xmm4, %xmm5
3056	pshufd	\$0xb1, %xmm1, %xmm4
3057	por	%xmm1, %xmm4
3058	pshufd	\$0, %xmm5, %xmm5		# in1infty
3059	pshufd	\$0x1e, %xmm4, %xmm3
3060	por	%xmm3, %xmm4
3061	pxor	%xmm3, %xmm3
3062	pcmpeqd	%xmm3, %xmm4
3063	pshufd	\$0, %xmm4, %xmm4		# in2infty
3064	 mov	0x40+8*0($b_ptr), $src0		# load original in1_z
3065	 mov	0x40+8*1($b_ptr), $acc6
3066	 mov	0x40+8*2($b_ptr), $acc7
3067	 mov	0x40+8*3($b_ptr), $acc0
3068	movq	$b_ptr, %xmm1
3069
3070	lea	0x40-$bias($b_ptr), $a_ptr
3071	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
3072	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
3073
3074	`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
3075	lea	$S1(%rsp), $r_ptr		# S1 = Z2^3
3076	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);
3077
3078	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3079	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
3080	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
3081
3082	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
3083	lea	$S1(%rsp), $r_ptr		# S1 = Y1*Z2^3
3084	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);
3085
3086	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3087	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
3088	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
3089
3090	lea	$S1(%rsp), $b_ptr
3091	lea	$R(%rsp), $r_ptr		# R = S2 - S1
3092	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);
3093
3094	or	$acc5, $acc4			# see if result is zero
3095	movdqa	%xmm4, %xmm2
3096	or	$acc0, $acc4
3097	or	$acc1, $acc4
3098	por	%xmm5, %xmm2			# in1infty || in2infty
3099	movq	$acc4, %xmm3
3100
3101	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3102	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
3103	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);
3104
3105	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
3106	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
3107	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);
3108
3109	lea	$U1(%rsp), $b_ptr
3110	lea	$H(%rsp), $r_ptr		# H = U2 - U1
3111	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);
3112
3113	or	$acc5, $acc4			# see if result is zero
3114	or	$acc0, $acc4
3115	or	$acc1, $acc4			# !is_equal(U1, U2)
3116
3117	movq	%xmm2, $acc0
3118	movq	%xmm3, $acc1
3119	or	$acc0, $acc4
3120	.byte	0x3e				# predict taken
3121	jnz	.Ladd_proceed$x			# !is_equal(U1, U2) || in1infty || in2infty
3122
3123	# We now know A = B or A = -B and neither is infinity. Compare the
3124	# y-coordinates via S1 and S2.
3125	test	$acc1, $acc1
3126	jz	.Ladd_double$x			# is_equal(S1, S2)
3127
3128	# A = -B, so the result is infinity.
3129	#
3130	# TODO(davidben): Does .Ladd_proceed handle this case? It seems to, in
3131	# which case we should eliminate this special-case and simplify the
3132	# timing analysis.
3133	movq	%xmm0, $r_ptr			# restore $r_ptr
3134	pxor	%xmm0, %xmm0
3135	movdqu	%xmm0, 0x00($r_ptr)
3136	movdqu	%xmm0, 0x10($r_ptr)
3137	movdqu	%xmm0, 0x20($r_ptr)
3138	movdqu	%xmm0, 0x30($r_ptr)
3139	movdqu	%xmm0, 0x40($r_ptr)
3140	movdqu	%xmm0, 0x50($r_ptr)
3141	jmp	.Ladd_done$x
3142
3143.align	32
3144.Ladd_double$x:
3145	movq	%xmm1, $a_ptr			# restore $a_ptr
3146	movq	%xmm0, $r_ptr			# restore $r_ptr
3147	add	\$`32*(18-5)`, %rsp		# difference in frame sizes
3148.cfi_adjust_cfa_offset	`-32*(18-5)`
3149	jmp	.Lpoint_double_shortcut$x
3150.cfi_adjust_cfa_offset	`32*(18-5)`
3151
3152.align	32
3153.Ladd_proceed$x:
3154	`&load_for_sqr("$R(%rsp)", "$src0")`
3155	lea	$Rsqr(%rsp), $r_ptr		# R^2
3156	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
3157
3158	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3159	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3160	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
3161
3162	`&load_for_sqr("$H(%rsp)", "$src0")`
3163	lea	$Hsqr(%rsp), $r_ptr		# H^2
3164	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
3165
3166	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
3167	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3168	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);
3169
3170	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
3171	lea	$Hcub(%rsp), $r_ptr		# H^3
3172	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
3173
3174	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
3175	lea	$U2(%rsp), $r_ptr		# U1*H^2
3176	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
3177___
3178{
3179#######################################################################
3180# operate in 4-5-0-1 "name space" that matches multiplication output
3181#
3182my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3183my ($poly1, $poly3)=($acc6,$acc7);
3184
3185$code.=<<___;
3186	#lea	$U2(%rsp), $a_ptr
3187	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
3188	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
3189
3190	xor	$t4, $t4
3191	add	$acc0, $acc0		# a0:a3+a0:a3
3192	lea	$Rsqr(%rsp), $a_ptr
3193	adc	$acc1, $acc1
3194	 mov	$acc0, $t0
3195	adc	$acc2, $acc2
3196	adc	$acc3, $acc3
3197	 mov	$acc1, $t1
3198	adc	\$0, $t4
3199
3200	sub	\$-1, $acc0
3201	 mov	$acc2, $t2
3202	sbb	$poly1, $acc1
3203	sbb	\$0, $acc2
3204	 mov	$acc3, $t3
3205	sbb	$poly3, $acc3
3206	sbb	\$0, $t4
3207
3208	cmovc	$t0, $acc0
3209	mov	8*0($a_ptr), $t0
3210	cmovc	$t1, $acc1
3211	mov	8*1($a_ptr), $t1
3212	cmovc	$t2, $acc2
3213	mov	8*2($a_ptr), $t2
3214	cmovc	$t3, $acc3
3215	mov	8*3($a_ptr), $t3
3216
3217	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
3218
3219	lea	$Hcub(%rsp), $b_ptr
3220	lea	$res_x(%rsp), $r_ptr
3221	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
3222
3223	mov	$U2+8*0(%rsp), $t0
3224	mov	$U2+8*1(%rsp), $t1
3225	mov	$U2+8*2(%rsp), $t2
3226	mov	$U2+8*3(%rsp), $t3
3227	lea	$res_y(%rsp), $r_ptr
3228
3229	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);
3230
3231	mov	$acc0, 8*0($r_ptr)		# save the result, as
3232	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
3233	mov	$acc2, 8*2($r_ptr)
3234	mov	$acc3, 8*3($r_ptr)
3235___
3236}
3237$code.=<<___;
3238	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
3239	lea	$S2(%rsp), $r_ptr
3240	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);
3241
3242	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
3243	lea	$res_y(%rsp), $r_ptr
3244	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);
3245
3246	lea	$S2(%rsp), $b_ptr
3247	lea	$res_y(%rsp), $r_ptr
3248	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);
3249
3250	movq	%xmm0, $r_ptr		# restore $r_ptr
3251
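	# xmm5 (in1infty) and xmm4 (in2infty) are all-ones or all-zero masks.
	# Each pand/pandn/por group below is a branchless select: take the
	# second input when the first point was infinity, take the first input
	# when the second was infinity, and the computed result otherwise.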
3252	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, in2_z, in1infty);
3253	movdqa	%xmm5, %xmm1
3254	pandn	$res_z(%rsp), %xmm0
3255	movdqa	%xmm5, %xmm2
3256	pandn	$res_z+0x10(%rsp), %xmm1
3257	movdqa	%xmm5, %xmm3
3258	pand	$in2_z(%rsp), %xmm2
3259	pand	$in2_z+0x10(%rsp), %xmm3
3260	por	%xmm0, %xmm2
3261	por	%xmm1, %xmm3
3262
3263	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
3264	movdqa	%xmm4, %xmm1
3265	pandn	%xmm2, %xmm0
3266	movdqa	%xmm4, %xmm2
3267	pandn	%xmm3, %xmm1
3268	movdqa	%xmm4, %xmm3
3269	pand	$in1_z(%rsp), %xmm2
3270	pand	$in1_z+0x10(%rsp), %xmm3
3271	por	%xmm0, %xmm2
3272	por	%xmm1, %xmm3
3273	movdqu	%xmm2, 0x40($r_ptr)
3274	movdqu	%xmm3, 0x50($r_ptr)
3275
3276	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
3277	movdqa	%xmm5, %xmm1
3278	pandn	$res_x(%rsp), %xmm0
3279	movdqa	%xmm5, %xmm2
3280	pandn	$res_x+0x10(%rsp), %xmm1
3281	movdqa	%xmm5, %xmm3
3282	pand	$in2_x(%rsp), %xmm2
3283	pand	$in2_x+0x10(%rsp), %xmm3
3284	por	%xmm0, %xmm2
3285	por	%xmm1, %xmm3
3286
3287	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
3288	movdqa	%xmm4, %xmm1
3289	pandn	%xmm2, %xmm0
3290	movdqa	%xmm4, %xmm2
3291	pandn	%xmm3, %xmm1
3292	movdqa	%xmm4, %xmm3
3293	pand	$in1_x(%rsp), %xmm2
3294	pand	$in1_x+0x10(%rsp), %xmm3
3295	por	%xmm0, %xmm2
3296	por	%xmm1, %xmm3
3297	movdqu	%xmm2, 0x00($r_ptr)
3298	movdqu	%xmm3, 0x10($r_ptr)
3299
3300	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
3301	movdqa	%xmm5, %xmm1
3302	pandn	$res_y(%rsp), %xmm0
3303	movdqa	%xmm5, %xmm2
3304	pandn	$res_y+0x10(%rsp), %xmm1
3305	movdqa	%xmm5, %xmm3
3306	pand	$in2_y(%rsp), %xmm2
3307	pand	$in2_y+0x10(%rsp), %xmm3
3308	por	%xmm0, %xmm2
3309	por	%xmm1, %xmm3
3310
3311	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
3312	movdqa	%xmm4, %xmm1
3313	pandn	%xmm2, %xmm0
3314	movdqa	%xmm4, %xmm2
3315	pandn	%xmm3, %xmm1
3316	movdqa	%xmm4, %xmm3
3317	pand	$in1_y(%rsp), %xmm2
3318	pand	$in1_y+0x10(%rsp), %xmm3
3319	por	%xmm0, %xmm2
3320	por	%xmm1, %xmm3
3321	movdqu	%xmm2, 0x20($r_ptr)
3322	movdqu	%xmm3, 0x30($r_ptr)
3323
3324.Ladd_done$x:
3325	lea	32*18+56(%rsp), %rsi
3326.cfi_def_cfa	%rsi,8
3327	mov	-48(%rsi),%r15
3328.cfi_restore	%r15
3329	mov	-40(%rsi),%r14
3330.cfi_restore	%r14
3331	mov	-32(%rsi),%r13
3332.cfi_restore	%r13
3333	mov	-24(%rsi),%r12
3334.cfi_restore	%r12
3335	mov	-16(%rsi),%rbx
3336.cfi_restore	%rbx
3337	mov	-8(%rsi),%rbp
3338.cfi_restore	%rbp
3339	lea	(%rsi),%rsp
3340.cfi_def_cfa_register	%rsp
3341.Lpoint_add${x}_epilogue:
3342	ret
3343.cfi_endproc
3344.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
3345___
3346}
3347&gen_add("q");
3348
3349sub gen_add_affine () {
3350    my $x = shift;
3351    my ($src0,$sfx,$bias);
3352    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
3353	$res_x,$res_y,$res_z,
3354	$in1_x,$in1_y,$in1_z,
3355	$in2_x,$in2_y)=map(32*$_,(0..14));
3356    my $Z1sqr = $S2;
3357
3358    if ($x ne "x") {
3359	$src0 = "%rax";
3360	$sfx  = "";
3361	$bias = 0;
3362
3363$code.=<<___;
3364.globl	ecp_nistz256_point_add_affine
3365.type	ecp_nistz256_point_add_affine,\@function,3
3366.align	32
3367ecp_nistz256_point_add_affine:
3368.cfi_startproc
3369___
3370$code.=<<___	if ($addx);
3371	leaq	OPENSSL_ia32cap_P(%rip), %rcx
3372	mov	8(%rcx), %rcx
3373	and	\$0x80100, %ecx
3374	cmp	\$0x80100, %ecx
3375	je	.Lpoint_add_affinex
3376___
3377    } else {
3378	$src0 = "%rdx";
3379	$sfx  = "x";
3380	$bias = 128;
3381
3382$code.=<<___;
3383.type	ecp_nistz256_point_add_affinex,\@function,3
3384.align	32
3385ecp_nistz256_point_add_affinex:
3386.cfi_startproc
3387.Lpoint_add_affinex:
3388___
3389    }
3390$code.=<<___;
3391	push	%rbp
3392.cfi_push	%rbp
3393	push	%rbx
3394.cfi_push	%rbx
3395	push	%r12
3396.cfi_push	%r12
3397	push	%r13
3398.cfi_push	%r13
3399	push	%r14
3400.cfi_push	%r14
3401	push	%r15
3402.cfi_push	%r15
3403	sub	\$32*15+8, %rsp
3404.cfi_adjust_cfa_offset	32*15+8
3405.Ladd_affine${x}_body:
3406
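	# Mixed addition: the second input is an affine point (Z2 == 1), so
	#   U1 = X1, S1 = Y1, U2 = X2*Z1^2, S2 = Y2*Z1^3,
	# and the rest follows the full addition above with H = U2 - X1 and
	# R = S2 - Y1. When the first input is infinity, the result Z is taken
	# from .LONE_mont instead of a stored in2_z.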
3407	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
3408	mov	$b_org, $b_ptr		# reassign
3409	movdqu	0x10($a_ptr), %xmm1
3410	movdqu	0x20($a_ptr), %xmm2
3411	movdqu	0x30($a_ptr), %xmm3
3412	movdqu	0x40($a_ptr), %xmm4
3413	movdqu	0x50($a_ptr), %xmm5
3414	 mov	0x40+8*0($a_ptr), $src0	# load original in1_z
3415	 mov	0x40+8*1($a_ptr), $acc6
3416	 mov	0x40+8*2($a_ptr), $acc7
3417	 mov	0x40+8*3($a_ptr), $acc0
3418	movdqa	%xmm0, $in1_x(%rsp)
3419	movdqa	%xmm1, $in1_x+0x10(%rsp)
3420	movdqa	%xmm2, $in1_y(%rsp)
3421	movdqa	%xmm3, $in1_y+0x10(%rsp)
3422	movdqa	%xmm4, $in1_z(%rsp)
3423	movdqa	%xmm5, $in1_z+0x10(%rsp)
3424	por	%xmm4, %xmm5
3425
3426	movdqu	0x00($b_ptr), %xmm0	# copy	*(P256_POINT_AFFINE *)$b_ptr
3427	 pshufd	\$0xb1, %xmm5, %xmm3
3428	movdqu	0x10($b_ptr), %xmm1
3429	movdqu	0x20($b_ptr), %xmm2
3430	 por	%xmm3, %xmm5
3431	movdqu	0x30($b_ptr), %xmm3
3432	movdqa	%xmm0, $in2_x(%rsp)
3433	 pshufd	\$0x1e, %xmm5, %xmm4
3434	movdqa	%xmm1, $in2_x+0x10(%rsp)
3435	por	%xmm0, %xmm1
3436	 movq	$r_ptr, %xmm0		# save $r_ptr
3437	movdqa	%xmm2, $in2_y(%rsp)
3438	movdqa	%xmm3, $in2_y+0x10(%rsp)
3439	por	%xmm2, %xmm3
3440	 por	%xmm4, %xmm5
3441	 pxor	%xmm4, %xmm4
3442	por	%xmm1, %xmm3
3443
3444	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
3445	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
3446	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
3447
3448	pcmpeqd	%xmm4, %xmm5
3449	pshufd	\$0xb1, %xmm3, %xmm4
3450	 mov	0x00($b_ptr), $src0		# $b_ptr is still valid
3451	 #lea	0x00($b_ptr), $b_ptr
3452	 mov	$acc4, $acc1			# harmonize sqr output and mul input
3453	por	%xmm3, %xmm4
3454	pshufd	\$0, %xmm5, %xmm5		# in1infty
3455	pshufd	\$0x1e, %xmm4, %xmm3
3456	 mov	$acc5, $acc2
3457	por	%xmm3, %xmm4
3458	pxor	%xmm3, %xmm3
3459	 mov	$acc6, $acc3
3460	pcmpeqd	%xmm3, %xmm4
3461	pshufd	\$0, %xmm4, %xmm4		# in2infty
3462
3463	lea	$Z1sqr-$bias(%rsp), $a_ptr
3464	mov	$acc7, $acc4
3465	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
3466	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);
3467
3468	lea	$in1_x(%rsp), $b_ptr
3469	lea	$H(%rsp), $r_ptr		# H = U2 - U1
3470	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);
3471
3472	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
3473	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
3474	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
3475
3476	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
3477	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
3478	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
3479
3480	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
3481	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
3482	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
3483
3484	lea	$in1_y(%rsp), $b_ptr
3485	lea	$R(%rsp), $r_ptr		# R = S2 - S1
3486	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);
3487
3488	`&load_for_sqr("$H(%rsp)", "$src0")`
3489	lea	$Hsqr(%rsp), $r_ptr		# H^2
3490	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
3491
3492	`&load_for_sqr("$R(%rsp)", "$src0")`
3493	lea	$Rsqr(%rsp), $r_ptr		# R^2
3494	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
3495
3496	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
3497	lea	$Hcub(%rsp), $r_ptr		# H^3
3498	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
3499
3500	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
3501	lea	$U2(%rsp), $r_ptr		# U1*H^2
3502	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
3503___
3504{
3505#######################################################################
3506# operate in 4-5-0-1 "name space" that matches multiplication output
3507#
3508my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3509my ($poly1, $poly3)=($acc6,$acc7);
3510
3511$code.=<<___;
3512	#lea	$U2(%rsp), $a_ptr
3513	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
3514	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
3515
3516	xor	$t4, $t4
3517	add	$acc0, $acc0		# a0:a3+a0:a3
3518	lea	$Rsqr(%rsp), $a_ptr
3519	adc	$acc1, $acc1
3520	 mov	$acc0, $t0
3521	adc	$acc2, $acc2
3522	adc	$acc3, $acc3
3523	 mov	$acc1, $t1
3524	adc	\$0, $t4
3525
3526	sub	\$-1, $acc0
3527	 mov	$acc2, $t2
3528	sbb	$poly1, $acc1
3529	sbb	\$0, $acc2
3530	 mov	$acc3, $t3
3531	sbb	$poly3, $acc3
3532	sbb	\$0, $t4
3533
3534	cmovc	$t0, $acc0
3535	mov	8*0($a_ptr), $t0
3536	cmovc	$t1, $acc1
3537	mov	8*1($a_ptr), $t1
3538	cmovc	$t2, $acc2
3539	mov	8*2($a_ptr), $t2
3540	cmovc	$t3, $acc3
3541	mov	8*3($a_ptr), $t3
3542
3543	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
3544
3545	lea	$Hcub(%rsp), $b_ptr
3546	lea	$res_x(%rsp), $r_ptr
3547	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
3548
3549	mov	$U2+8*0(%rsp), $t0
3550	mov	$U2+8*1(%rsp), $t1
3551	mov	$U2+8*2(%rsp), $t2
3552	mov	$U2+8*3(%rsp), $t3
3553	lea	$H(%rsp), $r_ptr
3554
3555	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);
3556
3557	mov	$acc0, 8*0($r_ptr)		# save the result, as
3558	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
3559	mov	$acc2, 8*2($r_ptr)
3560	mov	$acc3, 8*3($r_ptr)
3561___
3562}
3563$code.=<<___;
3564	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
3565	lea	$S2(%rsp), $r_ptr
3566	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);
3567
3568	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
3569	lea	$H(%rsp), $r_ptr
3570	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);
3571
3572	lea	$S2(%rsp), $b_ptr
3573	lea	$res_y(%rsp), $r_ptr
3574	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);
3575
3576	movq	%xmm0, $r_ptr		# restore $r_ptr
3577
3578	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE, in1infty);
3579	movdqa	%xmm5, %xmm1
3580	pandn	$res_z(%rsp), %xmm0
3581	movdqa	%xmm5, %xmm2
3582	pandn	$res_z+0x10(%rsp), %xmm1
3583	movdqa	%xmm5, %xmm3
3584	pand	.LONE_mont(%rip), %xmm2
3585	pand	.LONE_mont+0x10(%rip), %xmm3
3586	por	%xmm0, %xmm2
3587	por	%xmm1, %xmm3
3588
3589	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
3590	movdqa	%xmm4, %xmm1
3591	pandn	%xmm2, %xmm0
3592	movdqa	%xmm4, %xmm2
3593	pandn	%xmm3, %xmm1
3594	movdqa	%xmm4, %xmm3
3595	pand	$in1_z(%rsp), %xmm2
3596	pand	$in1_z+0x10(%rsp), %xmm3
3597	por	%xmm0, %xmm2
3598	por	%xmm1, %xmm3
3599	movdqu	%xmm2, 0x40($r_ptr)
3600	movdqu	%xmm3, 0x50($r_ptr)
3601
3602	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
3603	movdqa	%xmm5, %xmm1
3604	pandn	$res_x(%rsp), %xmm0
3605	movdqa	%xmm5, %xmm2
3606	pandn	$res_x+0x10(%rsp), %xmm1
3607	movdqa	%xmm5, %xmm3
3608	pand	$in2_x(%rsp), %xmm2
3609	pand	$in2_x+0x10(%rsp), %xmm3
3610	por	%xmm0, %xmm2
3611	por	%xmm1, %xmm3
3612
3613	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
3614	movdqa	%xmm4, %xmm1
3615	pandn	%xmm2, %xmm0
3616	movdqa	%xmm4, %xmm2
3617	pandn	%xmm3, %xmm1
3618	movdqa	%xmm4, %xmm3
3619	pand	$in1_x(%rsp), %xmm2
3620	pand	$in1_x+0x10(%rsp), %xmm3
3621	por	%xmm0, %xmm2
3622	por	%xmm1, %xmm3
3623	movdqu	%xmm2, 0x00($r_ptr)
3624	movdqu	%xmm3, 0x10($r_ptr)
3625
3626	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
3627	movdqa	%xmm5, %xmm1
3628	pandn	$res_y(%rsp), %xmm0
3629	movdqa	%xmm5, %xmm2
3630	pandn	$res_y+0x10(%rsp), %xmm1
3631	movdqa	%xmm5, %xmm3
3632	pand	$in2_y(%rsp), %xmm2
3633	pand	$in2_y+0x10(%rsp), %xmm3
3634	por	%xmm0, %xmm2
3635	por	%xmm1, %xmm3
3636
3637	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
3638	movdqa	%xmm4, %xmm1
3639	pandn	%xmm2, %xmm0
3640	movdqa	%xmm4, %xmm2
3641	pandn	%xmm3, %xmm1
3642	movdqa	%xmm4, %xmm3
3643	pand	$in1_y(%rsp), %xmm2
3644	pand	$in1_y+0x10(%rsp), %xmm3
3645	por	%xmm0, %xmm2
3646	por	%xmm1, %xmm3
3647	movdqu	%xmm2, 0x20($r_ptr)
3648	movdqu	%xmm3, 0x30($r_ptr)
3649
3650	lea	32*15+56(%rsp), %rsi
3651.cfi_def_cfa	%rsi,8
3652	mov	-48(%rsi),%r15
3653.cfi_restore	%r15
3654	mov	-40(%rsi),%r14
3655.cfi_restore	%r14
3656	mov	-32(%rsi),%r13
3657.cfi_restore	%r13
3658	mov	-24(%rsi),%r12
3659.cfi_restore	%r12
3660	mov	-16(%rsi),%rbx
3661.cfi_restore	%rbx
3662	mov	-8(%rsi),%rbp
3663.cfi_restore	%rbp
3664	lea	(%rsi),%rsp
3665.cfi_def_cfa_register	%rsp
3666.Ladd_affine${x}_epilogue:
3667	ret
3668.cfi_endproc
3669.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
3670___
3671}
3672&gen_add_affine("q");
3673
3674########################################################################
3675# AD*X magic
3676#
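# The x-suffixed variants below rely on the mulx/adcx/adox instructions and
# are selected at run time when OPENSSL_ia32cap_P reports both BMI2 and ADX,
# i.e. the 0x80100 mask tested in the dispatch code of the q-suffixed entry
# points above.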
3677if ($addx) {								{
3678########################################################################
3679# operate in 4-5-0-1 "name space" that matches multiplication output
3680#
3681my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
3682
3683$code.=<<___;
3684.type	__ecp_nistz256_add_tox,\@abi-omnipotent
3685.align	32
3686__ecp_nistz256_add_tox:
3687.cfi_startproc
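	# The xor below clears CF, so the leading adc/sbb chains in these
	# x-suffixed helpers start with a clean carry and behave exactly like
	# the add/sub chains in the q-suffixed versions.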
3688	xor	$t4, $t4
3689	adc	8*0($b_ptr), $a0
3690	adc	8*1($b_ptr), $a1
3691	 mov	$a0, $t0
3692	adc	8*2($b_ptr), $a2
3693	adc	8*3($b_ptr), $a3
3694	 mov	$a1, $t1
3695	adc	\$0, $t4
3696
3697	xor	$t3, $t3
3698	sbb	\$-1, $a0
3699	 mov	$a2, $t2
3700	sbb	$poly1, $a1
3701	sbb	\$0, $a2
3702	 mov	$a3, $t3
3703	sbb	$poly3, $a3
3704	sbb	\$0, $t4
3705
3706	cmovc	$t0, $a0
3707	cmovc	$t1, $a1
3708	mov	$a0, 8*0($r_ptr)
3709	cmovc	$t2, $a2
3710	mov	$a1, 8*1($r_ptr)
3711	cmovc	$t3, $a3
3712	mov	$a2, 8*2($r_ptr)
3713	mov	$a3, 8*3($r_ptr)
3714
3715	ret
3716.cfi_endproc
3717.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
3718
3719.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
3720.align	32
3721__ecp_nistz256_sub_fromx:
3722.cfi_startproc
3723	xor	$t4, $t4
3724	sbb	8*0($b_ptr), $a0
3725	sbb	8*1($b_ptr), $a1
3726	 mov	$a0, $t0
3727	sbb	8*2($b_ptr), $a2
3728	sbb	8*3($b_ptr), $a3
3729	 mov	$a1, $t1
3730	sbb	\$0, $t4
3731
3732	xor	$t3, $t3
3733	adc	\$-1, $a0
3734	 mov	$a2, $t2
3735	adc	$poly1, $a1
3736	adc	\$0, $a2
3737	 mov	$a3, $t3
3738	adc	$poly3, $a3
3739
3740	bt	\$0, $t4
3741	cmovnc	$t0, $a0
3742	cmovnc	$t1, $a1
3743	mov	$a0, 8*0($r_ptr)
3744	cmovnc	$t2, $a2
3745	mov	$a1, 8*1($r_ptr)
3746	cmovnc	$t3, $a3
3747	mov	$a2, 8*2($r_ptr)
3748	mov	$a3, 8*3($r_ptr)
3749
3750	ret
3751.cfi_endproc
3752.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
3753
3754.type	__ecp_nistz256_subx,\@abi-omnipotent
3755.align	32
3756__ecp_nistz256_subx:
3757.cfi_startproc
3758	xor	$t4, $t4
3759	sbb	$a0, $t0
3760	sbb	$a1, $t1
3761	 mov	$t0, $a0
3762	sbb	$a2, $t2
3763	sbb	$a3, $t3
3764	 mov	$t1, $a1
3765	sbb	\$0, $t4
3766
3767	xor	$a3, $a3
3768	adc	\$-1, $t0
3769	 mov	$t2, $a2
3770	adc	$poly1, $t1
3771	adc	\$0, $t2
3772	 mov	$t3, $a3
3773	adc	$poly3, $t3
3774
3775	bt	\$0, $t4
3776	cmovc	$t0, $a0
3777	cmovc	$t1, $a1
3778	cmovc	$t2, $a2
3779	cmovc	$t3, $a3
3780
3781	ret
3782.cfi_endproc
3783.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx
3784
3785.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
3786.align	32
3787__ecp_nistz256_mul_by_2x:
3788.cfi_startproc
3789	xor	$t4, $t4
3790	adc	$a0, $a0		# a0:a3+a0:a3
3791	adc	$a1, $a1
3792	 mov	$a0, $t0
3793	adc	$a2, $a2
3794	adc	$a3, $a3
3795	 mov	$a1, $t1
3796	adc	\$0, $t4
3797
3798	xor	$t3, $t3
3799	sbb	\$-1, $a0
3800	 mov	$a2, $t2
3801	sbb	$poly1, $a1
3802	sbb	\$0, $a2
3803	 mov	$a3, $t3
3804	sbb	$poly3, $a3
3805	sbb	\$0, $t4
3806
3807	cmovc	$t0, $a0
3808	cmovc	$t1, $a1
3809	mov	$a0, 8*0($r_ptr)
3810	cmovc	$t2, $a2
3811	mov	$a1, 8*1($r_ptr)
3812	cmovc	$t3, $a3
3813	mov	$a2, 8*2($r_ptr)
3814	mov	$a3, 8*3($r_ptr)
3815
3816	ret
3817.cfi_endproc
3818.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
3819___
3820									}
3821&gen_double("x");
3822&gen_add("x");
3823&gen_add_affine("x");
3824}
3825}}}
3826
3827# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3828#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
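# Two unwind handlers are emitted: short_handler restores only %r12/%r13
# (for small frames such as ecp_nistz256_neg), while full_handler also
# restores %rbx/%rbp/%r12-%r15, using the stack offset recorded in
# HandlerData[2] to locate the saved registers.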
3829if ($win64) {
3830$rec="%rcx";
3831$frame="%rdx";
3832$context="%r8";
3833$disp="%r9";
3834
3835$code.=<<___;
3836.extern	__imp_RtlVirtualUnwind
3837
3838.type	short_handler,\@abi-omnipotent
3839.align	16
3840short_handler:
3841	push	%rsi
3842	push	%rdi
3843	push	%rbx
3844	push	%rbp
3845	push	%r12
3846	push	%r13
3847	push	%r14
3848	push	%r15
3849	pushfq
3850	sub	\$64,%rsp
3851
3852	mov	120($context),%rax	# pull context->Rax
3853	mov	248($context),%rbx	# pull context->Rip
3854
3855	mov	8($disp),%rsi		# disp->ImageBase
3856	mov	56($disp),%r11		# disp->HandlerData
3857
3858	mov	0(%r11),%r10d		# HandlerData[0]
3859	lea	(%rsi,%r10),%r10	# end of prologue label
3860	cmp	%r10,%rbx		# context->Rip<end of prologue label
3861	jb	.Lcommon_seh_tail
3862
3863	mov	152($context),%rax	# pull context->Rsp
3864
3865	mov	4(%r11),%r10d		# HandlerData[1]
3866	lea	(%rsi,%r10),%r10	# epilogue label
3867	cmp	%r10,%rbx		# context->Rip>=epilogue label
3868	jae	.Lcommon_seh_tail
3869
3870	lea	16(%rax),%rax
3871
3872	mov	-8(%rax),%r12
3873	mov	-16(%rax),%r13
3874	mov	%r12,216($context)	# restore context->R12
3875	mov	%r13,224($context)	# restore context->R13
3876
3877	jmp	.Lcommon_seh_tail
3878.size	short_handler,.-short_handler
3879
3880.type	full_handler,\@abi-omnipotent
3881.align	16
3882full_handler:
3883	push	%rsi
3884	push	%rdi
3885	push	%rbx
3886	push	%rbp
3887	push	%r12
3888	push	%r13
3889	push	%r14
3890	push	%r15
3891	pushfq
3892	sub	\$64,%rsp
3893
3894	mov	120($context),%rax	# pull context->Rax
3895	mov	248($context),%rbx	# pull context->Rip
3896
3897	mov	8($disp),%rsi		# disp->ImageBase
3898	mov	56($disp),%r11		# disp->HandlerData
3899
3900	mov	0(%r11),%r10d		# HandlerData[0]
3901	lea	(%rsi,%r10),%r10	# end of prologue label
3902	cmp	%r10,%rbx		# context->Rip<end of prologue label
3903	jb	.Lcommon_seh_tail
3904
3905	mov	152($context),%rax	# pull context->Rsp
3906
3907	mov	4(%r11),%r10d		# HandlerData[1]
3908	lea	(%rsi,%r10),%r10	# epilogue label
3909	cmp	%r10,%rbx		# context->Rip>=epilogue label
3910	jae	.Lcommon_seh_tail
3911
3912	mov	8(%r11),%r10d		# HandlerData[2]
3913	lea	(%rax,%r10),%rax
3914
3915	mov	-8(%rax),%rbp
3916	mov	-16(%rax),%rbx
3917	mov	-24(%rax),%r12
3918	mov	-32(%rax),%r13
3919	mov	-40(%rax),%r14
3920	mov	-48(%rax),%r15
3921	mov	%rbx,144($context)	# restore context->Rbx
3922	mov	%rbp,160($context)	# restore context->Rbp
3923	mov	%r12,216($context)	# restore context->R12
3924	mov	%r13,224($context)	# restore context->R13
3925	mov	%r14,232($context)	# restore context->R14
3926	mov	%r15,240($context)	# restore context->R15
3927
3928.Lcommon_seh_tail:
3929	mov	8(%rax),%rdi
3930	mov	16(%rax),%rsi
3931	mov	%rax,152($context)	# restore context->Rsp
3932	mov	%rsi,168($context)	# restore context->Rsi
3933	mov	%rdi,176($context)	# restore context->Rdi
3934
3935	mov	40($disp),%rdi		# disp->ContextRecord
3936	mov	$context,%rsi		# context
3937	mov	\$154,%ecx		# sizeof(CONTEXT)/8, quadword count for rep movsq
3938	.long	0xa548f3fc		# cld; rep movsq
3939
3940	mov	$disp,%rsi
3941	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3942	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3943	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3944	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3945	mov	40(%rsi),%r10		# disp->ContextRecord
3946	lea	56(%rsi),%r11		# &disp->HandlerData
3947	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3948	mov	%r10,32(%rsp)		# arg5
3949	mov	%r11,40(%rsp)		# arg6
3950	mov	%r12,48(%rsp)		# arg7
3951	mov	%rcx,56(%rsp)		# arg8, (NULL)
3952	call	*__imp_RtlVirtualUnwind(%rip)
3953
3954	mov	\$1,%eax		# ExceptionContinueSearch
3955	add	\$64,%rsp
3956	popfq
3957	pop	%r15
3958	pop	%r14
3959	pop	%r13
3960	pop	%r12
3961	pop	%rbp
3962	pop	%rbx
3963	pop	%rdi
3964	pop	%rsi
3965	ret
3966.size	full_handler,.-full_handler
3967
3968.section	.pdata
3969.align	4
3970	.rva	.LSEH_begin_ecp_nistz256_neg
3971	.rva	.LSEH_end_ecp_nistz256_neg
3972	.rva	.LSEH_info_ecp_nistz256_neg
3973
3974	.rva	.LSEH_begin_ecp_nistz256_ord_mul_mont
3975	.rva	.LSEH_end_ecp_nistz256_ord_mul_mont
3976	.rva	.LSEH_info_ecp_nistz256_ord_mul_mont
3977
3978	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_mont
3979	.rva	.LSEH_end_ecp_nistz256_ord_sqr_mont
3980	.rva	.LSEH_info_ecp_nistz256_ord_sqr_mont
3981___
3982$code.=<<___	if ($addx);
3983	.rva	.LSEH_begin_ecp_nistz256_ord_mul_montx
3984	.rva	.LSEH_end_ecp_nistz256_ord_mul_montx
3985	.rva	.LSEH_info_ecp_nistz256_ord_mul_montx
3986
3987	.rva	.LSEH_begin_ecp_nistz256_ord_sqr_montx
3988	.rva	.LSEH_end_ecp_nistz256_ord_sqr_montx
3989	.rva	.LSEH_info_ecp_nistz256_ord_sqr_montx
3990___
3991$code.=<<___;
3992	.rva	.LSEH_begin_ecp_nistz256_mul_mont
3993	.rva	.LSEH_end_ecp_nistz256_mul_mont
3994	.rva	.LSEH_info_ecp_nistz256_mul_mont
3995
3996	.rva	.LSEH_begin_ecp_nistz256_sqr_mont
3997	.rva	.LSEH_end_ecp_nistz256_sqr_mont
3998	.rva	.LSEH_info_ecp_nistz256_sqr_mont
3999
4000	.rva	.LSEH_begin_ecp_nistz256_select_w5
4001	.rva	.LSEH_end_ecp_nistz256_select_w5
4002	.rva	.LSEH_info_ecp_nistz256_select_wX
4003
4004	.rva	.LSEH_begin_ecp_nistz256_select_w7
4005	.rva	.LSEH_end_ecp_nistz256_select_w7
4006	.rva	.LSEH_info_ecp_nistz256_select_wX
4007___
4008$code.=<<___	if ($avx>1);
4009	.rva	.LSEH_begin_ecp_nistz256_avx2_select_w5
4010	.rva	.LSEH_end_ecp_nistz256_avx2_select_w5
4011	.rva	.LSEH_info_ecp_nistz256_avx2_select_wX
4012
4013	.rva	.LSEH_begin_ecp_nistz256_avx2_select_w7
4014	.rva	.LSEH_end_ecp_nistz256_avx2_select_w7
4015	.rva	.LSEH_info_ecp_nistz256_avx2_select_wX
4016___
4017$code.=<<___;
4018	.rva	.LSEH_begin_ecp_nistz256_point_double
4019	.rva	.LSEH_end_ecp_nistz256_point_double
4020	.rva	.LSEH_info_ecp_nistz256_point_double
4021
4022	.rva	.LSEH_begin_ecp_nistz256_point_add
4023	.rva	.LSEH_end_ecp_nistz256_point_add
4024	.rva	.LSEH_info_ecp_nistz256_point_add
4025
4026	.rva	.LSEH_begin_ecp_nistz256_point_add_affine
4027	.rva	.LSEH_end_ecp_nistz256_point_add_affine
4028	.rva	.LSEH_info_ecp_nistz256_point_add_affine
4029___
4030$code.=<<___ if ($addx);
4031	.rva	.LSEH_begin_ecp_nistz256_point_doublex
4032	.rva	.LSEH_end_ecp_nistz256_point_doublex
4033	.rva	.LSEH_info_ecp_nistz256_point_doublex
4034
4035	.rva	.LSEH_begin_ecp_nistz256_point_addx
4036	.rva	.LSEH_end_ecp_nistz256_point_addx
4037	.rva	.LSEH_info_ecp_nistz256_point_addx
4038
4039	.rva	.LSEH_begin_ecp_nistz256_point_add_affinex
4040	.rva	.LSEH_end_ecp_nistz256_point_add_affinex
4041	.rva	.LSEH_info_ecp_nistz256_point_add_affinex
4042___
4043$code.=<<___;
4044
4045.section	.xdata
4046.align	8
4047.LSEH_info_ecp_nistz256_neg:
4048	.byte	9,0,0,0
4049	.rva	short_handler
4050	.rva	.Lneg_body,.Lneg_epilogue		# HandlerData[]
4051.LSEH_info_ecp_nistz256_ord_mul_mont:
4052	.byte	9,0,0,0
4053	.rva	full_handler
4054	.rva	.Lord_mul_body,.Lord_mul_epilogue	# HandlerData[]
4055	.long	48,0
4056.LSEH_info_ecp_nistz256_ord_sqr_mont:
4057	.byte	9,0,0,0
4058	.rva	full_handler
4059	.rva	.Lord_sqr_body,.Lord_sqr_epilogue	# HandlerData[]
4060	.long	48,0
4061___
4062$code.=<<___ if ($addx);
4063.LSEH_info_ecp_nistz256_ord_mul_montx:
4064	.byte	9,0,0,0
4065	.rva	full_handler
4066	.rva	.Lord_mulx_body,.Lord_mulx_epilogue	# HandlerData[]
4067	.long	48,0
4068.LSEH_info_ecp_nistz256_ord_sqr_montx:
4069	.byte	9,0,0,0
4070	.rva	full_handler
4071	.rva	.Lord_sqrx_body,.Lord_sqrx_epilogue	# HandlerData[]
4072	.long	48,0
4073___
4074$code.=<<___;
4075.LSEH_info_ecp_nistz256_mul_mont:
4076	.byte	9,0,0,0
4077	.rva	full_handler
4078	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
4079	.long	48,0
4080.LSEH_info_ecp_nistz256_sqr_mont:
4081	.byte	9,0,0,0
4082	.rva	full_handler
4083	.rva	.Lsqr_body,.Lsqr_epilogue		# HandlerData[]
4084	.long	48,0
4085.LSEH_info_ecp_nistz256_select_wX:
4086	.byte	0x01,0x33,0x16,0x00
4087	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
4088	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
4089	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
4090	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
4091	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
4092	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
4093	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
4094	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
4095	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
4096	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
4097	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
4098	.align	8
4099___
4100$code.=<<___	if ($avx>1);
4101.LSEH_info_ecp_nistz256_avx2_select_wX:
4102	.byte	0x01,0x36,0x17,0x0b
4103	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
4104	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
4105	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
4106	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
4107	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
4108	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
4109	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
4110	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
4111	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
4112	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
4113	.byte	0x04,0x01,0x15,0x00	# sub	  rsp,0xa8
4114	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
4115	.align	8
4116___
4117$code.=<<___;
4118.LSEH_info_ecp_nistz256_point_double:
4119	.byte	9,0,0,0
4120	.rva	full_handler
4121	.rva	.Lpoint_doubleq_body,.Lpoint_doubleq_epilogue	# HandlerData[]
4122	.long	32*5+56,0
4123.LSEH_info_ecp_nistz256_point_add:
4124	.byte	9,0,0,0
4125	.rva	full_handler
4126	.rva	.Lpoint_addq_body,.Lpoint_addq_epilogue		# HandlerData[]
4127	.long	32*18+56,0
4128.LSEH_info_ecp_nistz256_point_add_affine:
4129	.byte	9,0,0,0
4130	.rva	full_handler
4131	.rva	.Ladd_affineq_body,.Ladd_affineq_epilogue	# HandlerData[]
4132	.long	32*15+56,0
4133___
4134$code.=<<___ if ($addx);
4135.align	8
4136.LSEH_info_ecp_nistz256_point_doublex:
4137	.byte	9,0,0,0
4138	.rva	full_handler
4139	.rva	.Lpoint_doublex_body,.Lpoint_doublex_epilogue	# HandlerData[]
4140	.long	32*5+56,0
4141.LSEH_info_ecp_nistz256_point_addx:
4142	.byte	9,0,0,0
4143	.rva	full_handler
4144	.rva	.Lpoint_addx_body,.Lpoint_addx_epilogue		# HandlerData[]
4145	.long	32*18+56,0
4146.LSEH_info_ecp_nistz256_point_add_affinex:
4147	.byte	9,0,0,0
4148	.rva	full_handler
4149	.rva	.Ladd_affinex_body,.Ladd_affinex_epilogue	# HandlerData[]
4150	.long	32*15+56,0
4151___
4152}
4153
4154$code =~ s/\`([^\`]*)\`/eval $1/gem;
4155print $code;
4156close STDOUT or die "error closing STDOUT";
4157