1#!/usr/bin/env perl
2
3# Copyright (c) 2014, Intel Corporation.
4#
5# Permission to use, copy, modify, and/or distribute this software for any
6# purpose with or without fee is hereby granted, provided that the above
7# copyright notice and this permission notice appear in all copies.
8#
9# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
12# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
14# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
15# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
17# Developers and authors:
18# Shay Gueron (1, 2), and Vlad Krasnov (1)
19# (1) Intel Corporation, Israel Development Center
20# (2) University of Haifa
21
22#  Reference:
23#  S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
24#                           256 Bit Primes"
25
26# Further optimization by <appro@openssl.org>:
27#
28#		this/original
29# Opteron	+12-49%
30# Bulldozer	+14-45%
31# P4		+18-46%
32# Westmere	+12-34%
33# Sandy Bridge	+9-35%
34# Ivy Bridge	+9-35%
35# Haswell	+8-37%
36# Broadwell	+18-58%
37# Atom		+15-50%
38# VIA Nano	+43-160%
39#
40# Ranges denote minimum and maximum improvement coefficients depending
41# on benchmark.
42
43$flavour = shift;
44$output  = shift;
45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52die "can't locate x86_64-xlate.pl";
53
54open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
55*STDOUT=*OUT;
56
# TODO: enable these after testing; $avx should then go to 2 and $addx to 1.
58$avx=0;
59$addx=0;
60
61$code.=<<___;
62.text
63.extern	OPENSSL_ia32cap_P
64
# The polynomial: the P-256 prime, as four little-endian 64-bit limbs
66.align 64
67.Lpoly:
68.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
69
70.LOne:
71.long 1,1,1,1,1,1,1,1
72.LTwo:
73.long 2,2,2,2,2,2,2,2
74.LThree:
75.long 3,3,3,3,3,3,3,3
76.LONE_mont:
77.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
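# Note: .LOne/.LTwo/.LThree above are 32-bit broadcast counters used by the
# constant-time table-lookup routines (select_w5/w7) further down, and
# .LONE_mont is 2^256 mod .Lpoly, i.e. the value 1 in Montgomery form.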
78___
79
80{
81my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
82my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
83my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
84
85$code.=<<___;
86
87################################################################################
88# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
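#
# Computes res = (p - a) mod p without branches: 0 - a is formed first,
# p is then added back, and cmovz keeps the pre-addition value (zero)
# when the borrow from the subtraction shows that a was zero.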
89.globl	ecp_nistz256_neg
90.type	ecp_nistz256_neg,\@function,2
91.align	32
92ecp_nistz256_neg:
93	push	%r12
94	push	%r13
95
96	xor	$a0, $a0
97	xor	$a1, $a1
98	xor	$a2, $a2
99	xor	$a3, $a3
100	xor	$t4, $t4
101
102	sub	8*0($a_ptr), $a0
103	sbb	8*1($a_ptr), $a1
104	sbb	8*2($a_ptr), $a2
105	 mov	$a0, $t0
106	sbb	8*3($a_ptr), $a3
107	lea	.Lpoly(%rip), $a_ptr
108	 mov	$a1, $t1
109	sbb	\$0, $t4
110
111	add	8*0($a_ptr), $a0
112	 mov	$a2, $t2
113	adc	8*1($a_ptr), $a1
114	adc	8*2($a_ptr), $a2
115	 mov	$a3, $t3
116	adc	8*3($a_ptr), $a3
117	test	$t4, $t4
118
119	cmovz	$t0, $a0
120	cmovz	$t1, $a1
121	mov	$a0, 8*0($r_ptr)
122	cmovz	$t2, $a2
123	mov	$a1, 8*1($r_ptr)
124	cmovz	$t3, $a3
125	mov	$a2, 8*2($r_ptr)
126	mov	$a3, 8*3($r_ptr)
127
128	pop %r13
129	pop %r12
130	ret
131.size	ecp_nistz256_neg,.-ecp_nistz256_neg
132___
133}
134{
135my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
136my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
137my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
138my ($poly1,$poly3)=($acc6,$acc7);
139
140$code.=<<___;
141################################################################################
142# void ecp_nistz256_mul_mont(
143#   uint64_t res[4],
144#   uint64_t a[4],
145#   uint64_t b[4]);
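#
# Montgomery multiplication: res = a*b*2^-256 mod .Lpoly, with a, b and
# res all held in Montgomery representation (scaled by 2^256 mod p).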
146
147.globl	ecp_nistz256_mul_mont
148.type	ecp_nistz256_mul_mont,\@function,3
149.align	32
150ecp_nistz256_mul_mont:
151___
152$code.=<<___	if ($addx);
153	mov	\$0x80100, %ecx
154	and	OPENSSL_ia32cap_P+8(%rip), %ecx
155___
156$code.=<<___;
157.Lmul_mont:
158	push	%rbp
159	push	%rbx
160	push	%r12
161	push	%r13
162	push	%r14
163	push	%r15
164___
165$code.=<<___	if ($addx);
166	cmp	\$0x80100, %ecx
167	je	.Lmul_montx
168___
169$code.=<<___;
170	mov	$b_org, $b_ptr
171	mov	8*0($b_org), %rax
172	mov	8*0($a_ptr), $acc1
173	mov	8*1($a_ptr), $acc2
174	mov	8*2($a_ptr), $acc3
175	mov	8*3($a_ptr), $acc4
176
177	call	__ecp_nistz256_mul_montq
178___
179$code.=<<___	if ($addx);
180	jmp	.Lmul_mont_done
181
182.align	32
183.Lmul_montx:
184	mov	$b_org, $b_ptr
185	mov	8*0($b_org), %rdx
186	mov	8*0($a_ptr), $acc1
187	mov	8*1($a_ptr), $acc2
188	mov	8*2($a_ptr), $acc3
189	mov	8*3($a_ptr), $acc4
190	lea	-128($a_ptr), $a_ptr	# control u-op density
191
192	call	__ecp_nistz256_mul_montx
193___
194$code.=<<___;
195.Lmul_mont_done:
196	pop	%r15
197	pop	%r14
198	pop	%r13
199	pop	%r12
200	pop	%rbx
201	pop	%rbp
202	ret
203.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
204
205.type	__ecp_nistz256_mul_montq,\@abi-omnipotent
206.align	32
207__ecp_nistz256_mul_montq:
208	########################################################################
209	# Multiply a by b[0]
210	mov	%rax, $t1
211	mulq	$acc1
212	mov	.Lpoly+8*1(%rip),$poly1
213	mov	%rax, $acc0
214	mov	$t1, %rax
215	mov	%rdx, $acc1
216
217	mulq	$acc2
218	mov	.Lpoly+8*3(%rip),$poly3
219	add	%rax, $acc1
220	mov	$t1, %rax
221	adc	\$0, %rdx
222	mov	%rdx, $acc2
223
224	mulq	$acc3
225	add	%rax, $acc2
226	mov	$t1, %rax
227	adc	\$0, %rdx
228	mov	%rdx, $acc3
229
230	mulq	$acc4
231	add	%rax, $acc3
232	 mov	$acc0, %rax
233	adc	\$0, %rdx
234	xor	$acc5, $acc5
235	mov	%rdx, $acc4
236
237	########################################################################
238	# First reduction step
239	# Basically now we want to multiply acc[0] by p256,
240	# and add the result to the acc.
241	# Due to the special form of p256 we do some optimizations
242	#
243	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
244	# then we add acc[0] and get acc[0] x 2^96
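	#
	# Since p256[0] = 2^64-1, -p256^-1 mod 2^64 = 1, so the Montgomery
	# multiplier for this step is acc[0] itself: the code below adds
	# acc[0]<<96 into acc[1..2] (the shl/shr pair) and acc[0]*p256[3]
	# into acc[3..4] (the mulq $poly3); p256[2] is zero, and the limb
	# acc[0], which is known to become zero, is simply dropped.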
245
246	mov	$acc0, $t1
247	shl	\$32, $acc0
248	mulq	$poly3
249	shr	\$32, $t1
250	add	$acc0, $acc1		# +=acc[0]<<96
251	adc	$t1, $acc2
252	adc	%rax, $acc3
253	 mov	8*1($b_ptr), %rax
254	adc	%rdx, $acc4
255	adc	\$0, $acc5
256	xor	$acc0, $acc0
257
258	########################################################################
259	# Multiply by b[1]
260	mov	%rax, $t1
261	mulq	8*0($a_ptr)
262	add	%rax, $acc1
263	mov	$t1, %rax
264	adc	\$0, %rdx
265	mov	%rdx, $t0
266
267	mulq	8*1($a_ptr)
268	add	$t0, $acc2
269	adc	\$0, %rdx
270	add	%rax, $acc2
271	mov	$t1, %rax
272	adc	\$0, %rdx
273	mov	%rdx, $t0
274
275	mulq	8*2($a_ptr)
276	add	$t0, $acc3
277	adc	\$0, %rdx
278	add	%rax, $acc3
279	mov	$t1, %rax
280	adc	\$0, %rdx
281	mov	%rdx, $t0
282
283	mulq	8*3($a_ptr)
284	add	$t0, $acc4
285	adc	\$0, %rdx
286	add	%rax, $acc4
287	 mov	$acc1, %rax
288	adc	%rdx, $acc5
289	adc	\$0, $acc0
290
291	########################################################################
292	# Second reduction step
293	mov	$acc1, $t1
294	shl	\$32, $acc1
295	mulq	$poly3
296	shr	\$32, $t1
297	add	$acc1, $acc2
298	adc	$t1, $acc3
299	adc	%rax, $acc4
300	 mov	8*2($b_ptr), %rax
301	adc	%rdx, $acc5
302	adc	\$0, $acc0
303	xor	$acc1, $acc1
304
305	########################################################################
306	# Multiply by b[2]
307	mov	%rax, $t1
308	mulq	8*0($a_ptr)
309	add	%rax, $acc2
310	mov	$t1, %rax
311	adc	\$0, %rdx
312	mov	%rdx, $t0
313
314	mulq	8*1($a_ptr)
315	add	$t0, $acc3
316	adc	\$0, %rdx
317	add	%rax, $acc3
318	mov	$t1, %rax
319	adc	\$0, %rdx
320	mov	%rdx, $t0
321
322	mulq	8*2($a_ptr)
323	add	$t0, $acc4
324	adc	\$0, %rdx
325	add	%rax, $acc4
326	mov	$t1, %rax
327	adc	\$0, %rdx
328	mov	%rdx, $t0
329
330	mulq	8*3($a_ptr)
331	add	$t0, $acc5
332	adc	\$0, %rdx
333	add	%rax, $acc5
334	 mov	$acc2, %rax
335	adc	%rdx, $acc0
336	adc	\$0, $acc1
337
338	########################################################################
339	# Third reduction step
340	mov	$acc2, $t1
341	shl	\$32, $acc2
342	mulq	$poly3
343	shr	\$32, $t1
344	add	$acc2, $acc3
345	adc	$t1, $acc4
346	adc	%rax, $acc5
347	 mov	8*3($b_ptr), %rax
348	adc	%rdx, $acc0
349	adc	\$0, $acc1
350	xor	$acc2, $acc2
351
352	########################################################################
353	# Multiply by b[3]
354	mov	%rax, $t1
355	mulq	8*0($a_ptr)
356	add	%rax, $acc3
357	mov	$t1, %rax
358	adc	\$0, %rdx
359	mov	%rdx, $t0
360
361	mulq	8*1($a_ptr)
362	add	$t0, $acc4
363	adc	\$0, %rdx
364	add	%rax, $acc4
365	mov	$t1, %rax
366	adc	\$0, %rdx
367	mov	%rdx, $t0
368
369	mulq	8*2($a_ptr)
370	add	$t0, $acc5
371	adc	\$0, %rdx
372	add	%rax, $acc5
373	mov	$t1, %rax
374	adc	\$0, %rdx
375	mov	%rdx, $t0
376
377	mulq	8*3($a_ptr)
378	add	$t0, $acc0
379	adc	\$0, %rdx
380	add	%rax, $acc0
381	 mov	$acc3, %rax
382	adc	%rdx, $acc1
383	adc	\$0, $acc2
384
385	########################################################################
386	# Final reduction step
387	mov	$acc3, $t1
388	shl	\$32, $acc3
389	mulq	$poly3
390	shr	\$32, $t1
391	add	$acc3, $acc4
392	adc	$t1, $acc5
393	 mov	$acc4, $t0
394	adc	%rax, $acc0
395	adc	%rdx, $acc1
396	 mov	$acc5, $t1
397	adc	\$0, $acc2
398
399	########################################################################
400	# Branch-less conditional subtraction of P
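	# The value in $acc4,$acc5,$acc0,$acc1 (plus the carry in $acc2) is
	# less than 2*p, so a single trial subtraction of p suffices; the
	# final borrow selects via cmovc whether the saved copy is written
	# back, so no data-dependent branch is taken.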
401	sub	\$-1, $acc4		# .Lpoly[0]
402	 mov	$acc0, $t2
403	sbb	$poly1, $acc5		# .Lpoly[1]
404	sbb	\$0, $acc0		# .Lpoly[2]
405	 mov	$acc1, $t3
406	sbb	$poly3, $acc1		# .Lpoly[3]
407	sbb	\$0, $acc2
408
409	cmovc	$t0, $acc4
410	cmovc	$t1, $acc5
411	mov	$acc4, 8*0($r_ptr)
412	cmovc	$t2, $acc0
413	mov	$acc5, 8*1($r_ptr)
414	cmovc	$t3, $acc1
415	mov	$acc0, 8*2($r_ptr)
416	mov	$acc1, 8*3($r_ptr)
417
418	ret
419.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
420
421################################################################################
422# void ecp_nistz256_sqr_mont(
423#   uint64_t res[4],
424#   uint64_t a[4]);
425
426# we optimize the square according to S.Gueron and V.Krasnov,
427# "Speeding up Big-Number Squaring"
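# (i.e. res = a^2*2^-256 mod .Lpoly).  The off-diagonal products a[i]*a[j],
# i < j, are computed once, the whole partial sum is doubled with a single
# carry chain, and the diagonal squares a[i]^2 are then added in: 10
# multiplications instead of the 16 of a general 4x4 product.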
428.globl	ecp_nistz256_sqr_mont
429.type	ecp_nistz256_sqr_mont,\@function,2
430.align	32
431ecp_nistz256_sqr_mont:
432___
433$code.=<<___	if ($addx);
434	mov	\$0x80100, %ecx
435	and	OPENSSL_ia32cap_P+8(%rip), %ecx
436___
437$code.=<<___;
438	push	%rbp
439	push	%rbx
440	push	%r12
441	push	%r13
442	push	%r14
443	push	%r15
444___
445$code.=<<___	if ($addx);
446	cmp	\$0x80100, %ecx
447	je	.Lsqr_montx
448___
449$code.=<<___;
450	mov	8*0($a_ptr), %rax
451	mov	8*1($a_ptr), $acc6
452	mov	8*2($a_ptr), $acc7
453	mov	8*3($a_ptr), $acc0
454
455	call	__ecp_nistz256_sqr_montq
456___
457$code.=<<___	if ($addx);
458	jmp	.Lsqr_mont_done
459
460.align	32
461.Lsqr_montx:
462	mov	8*0($a_ptr), %rdx
463	mov	8*1($a_ptr), $acc6
464	mov	8*2($a_ptr), $acc7
465	mov	8*3($a_ptr), $acc0
466	lea	-128($a_ptr), $a_ptr	# control u-op density
467
468	call	__ecp_nistz256_sqr_montx
469___
470$code.=<<___;
471.Lsqr_mont_done:
472	pop	%r15
473	pop	%r14
474	pop	%r13
475	pop	%r12
476	pop	%rbx
477	pop	%rbp
478	ret
479.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
480
481.type	__ecp_nistz256_sqr_montq,\@abi-omnipotent
482.align	32
483__ecp_nistz256_sqr_montq:
484	mov	%rax, $acc5
485	mulq	$acc6			# a[1]*a[0]
486	mov	%rax, $acc1
487	mov	$acc7, %rax
488	mov	%rdx, $acc2
489
490	mulq	$acc5			# a[0]*a[2]
491	add	%rax, $acc2
492	mov	$acc0, %rax
493	adc	\$0, %rdx
494	mov	%rdx, $acc3
495
496	mulq	$acc5			# a[0]*a[3]
497	add	%rax, $acc3
498	 mov	$acc7, %rax
499	adc	\$0, %rdx
500	mov	%rdx, $acc4
501
502	#################################
503	mulq	$acc6			# a[1]*a[2]
504	add	%rax, $acc3
505	mov	$acc0, %rax
506	adc	\$0, %rdx
507	mov	%rdx, $t1
508
509	mulq	$acc6			# a[1]*a[3]
510	add	%rax, $acc4
511	 mov	$acc0, %rax
512	adc	\$0, %rdx
513	add	$t1, $acc4
514	mov	%rdx, $acc5
515	adc	\$0, $acc5
516
517	#################################
518	mulq	$acc7			# a[2]*a[3]
519	xor	$acc7, $acc7
520	add	%rax, $acc5
521	 mov	8*0($a_ptr), %rax
522	mov	%rdx, $acc6
523	adc	\$0, $acc6
524
525	add	$acc1, $acc1		# acc1:6<<1
526	adc	$acc2, $acc2
527	adc	$acc3, $acc3
528	adc	$acc4, $acc4
529	adc	$acc5, $acc5
530	adc	$acc6, $acc6
531	adc	\$0, $acc7
532
533	mulq	%rax
534	mov	%rax, $acc0
535	mov	8*1($a_ptr), %rax
536	mov	%rdx, $t0
537
538	mulq	%rax
539	add	$t0, $acc1
540	adc	%rax, $acc2
541	mov	8*2($a_ptr), %rax
542	adc	\$0, %rdx
543	mov	%rdx, $t0
544
545	mulq	%rax
546	add	$t0, $acc3
547	adc	%rax, $acc4
548	mov	8*3($a_ptr), %rax
549	adc	\$0, %rdx
550	mov	%rdx, $t0
551
552	mulq	%rax
553	add	$t0, $acc5
554	adc	%rax, $acc6
555	 mov	$acc0, %rax
556	adc	%rdx, $acc7
557
558	mov	.Lpoly+8*1(%rip), $a_ptr
559	mov	.Lpoly+8*3(%rip), $t1
560
561	##########################################
562	# Now the reduction
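	# (four word-wise folds, just as in __ecp_nistz256_mul_montq: the
	# multiplier is the limb itself because -p^-1 mod 2^64 = 1)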
563	# First iteration
564	mov	$acc0, $t0
565	shl	\$32, $acc0
566	mulq	$t1
567	shr	\$32, $t0
568	add	$acc0, $acc1		# +=acc[0]<<96
569	adc	$t0, $acc2
570	adc	%rax, $acc3
571	 mov	$acc1, %rax
572	adc	\$0, %rdx
573
574	##########################################
575	# Second iteration
576	mov	$acc1, $t0
577	shl	\$32, $acc1
578	mov	%rdx, $acc0
579	mulq	$t1
580	shr	\$32, $t0
581	add	$acc1, $acc2
582	adc	$t0, $acc3
583	adc	%rax, $acc0
584	 mov	$acc2, %rax
585	adc	\$0, %rdx
586
587	##########################################
588	# Third iteration
589	mov	$acc2, $t0
590	shl	\$32, $acc2
591	mov	%rdx, $acc1
592	mulq	$t1
593	shr	\$32, $t0
594	add	$acc2, $acc3
595	adc	$t0, $acc0
596	adc	%rax, $acc1
597	 mov	$acc3, %rax
598	adc	\$0, %rdx
599
600	###########################################
601	# Last iteration
602	mov	$acc3, $t0
603	shl	\$32, $acc3
604	mov	%rdx, $acc2
605	mulq	$t1
606	shr	\$32, $t0
607	add	$acc3, $acc0
608	adc	$t0, $acc1
609	adc	%rax, $acc2
610	adc	\$0, %rdx
611	xor	$acc3, $acc3
612
613	############################################
614	# Add the rest of the acc
615	add	$acc0, $acc4
616	adc	$acc1, $acc5
617	 mov	$acc4, $acc0
618	adc	$acc2, $acc6
619	adc	%rdx, $acc7
620	 mov	$acc5, $acc1
621	adc	\$0, $acc3
622
623	sub	\$-1, $acc4		# .Lpoly[0]
624	 mov	$acc6, $acc2
625	sbb	$a_ptr, $acc5		# .Lpoly[1]
626	sbb	\$0, $acc6		# .Lpoly[2]
627	 mov	$acc7, $t0
628	sbb	$t1, $acc7		# .Lpoly[3]
629	sbb	\$0, $acc3
630
631	cmovc	$acc0, $acc4
632	cmovc	$acc1, $acc5
633	mov	$acc4, 8*0($r_ptr)
634	cmovc	$acc2, $acc6
635	mov	$acc5, 8*1($r_ptr)
636	cmovc	$t0, $acc7
637	mov	$acc6, 8*2($r_ptr)
638	mov	$acc7, 8*3($r_ptr)
639
640	ret
641.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
642___
643
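# For reference, a plain-Perl model of what the two subroutines above
# compute; it is kept as a comment so that nothing extra runs at
# translation time (Math::BigInt is assumed if it is ever pasted into a
# standalone test script):
#
#	use Math::BigInt;
#	my $p = Math::BigInt->new(2)->bpow(256)
#	      - Math::BigInt->new(2)->bpow(224)
#	      + Math::BigInt->new(2)->bpow(192)
#	      + Math::BigInt->new(2)->bpow(96) - 1;
#	my $Rinv = Math::BigInt->new(2)->bpow(256)->bmodinv($p);
#	sub mont_mul { my ($a,$b)=@_; ($a * $b * $Rinv) % $p }	# mul_mont
#	sub mont_sqr { my ($a)=@_;    ($a * $a * $Rinv) % $p }	# sqr_mont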
644if ($addx) {
645$code.=<<___;
646.type	__ecp_nistz256_mul_montx,\@abi-omnipotent
647.align	32
648__ecp_nistz256_mul_montx:
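	########################################################################
	# Same algorithm as __ecp_nistz256_mul_montq, but using mulx together
	# with adcx/adox so that two independent carry chains (CF and OF) can
	# be interleaved; the caller has biased $a_ptr by -128, hence the
	# 128+offset displacements below.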
649	########################################################################
650	# Multiply by b[0]
651	mulx	$acc1, $acc0, $acc1
652	mulx	$acc2, $t0, $acc2
653	mov	\$32, $poly1
654	xor	$acc5, $acc5		# cf=0
655	mulx	$acc3, $t1, $acc3
656	mov	.Lpoly+8*3(%rip), $poly3
657	adc	$t0, $acc1
658	mulx	$acc4, $t0, $acc4
659	 mov	$acc0, %rdx
660	adc	$t1, $acc2
661	 shlx	$poly1,$acc0,$t1
662	adc	$t0, $acc3
663	 shrx	$poly1,$acc0,$t0
664	adc	\$0, $acc4
665
666	########################################################################
667	# First reduction step
668	add	$t1, $acc1
669	adc	$t0, $acc2
670
671	mulx	$poly3, $t0, $t1
672	 mov	8*1($b_ptr), %rdx
673	adc	$t0, $acc3
674	adc	$t1, $acc4
675	adc	\$0, $acc5
676	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0
677
678	########################################################################
679	# Multiply by b[1]
680	mulx	8*0+128($a_ptr), $t0, $t1
681	adcx	$t0, $acc1
682	adox	$t1, $acc2
683
684	mulx	8*1+128($a_ptr), $t0, $t1
685	adcx	$t0, $acc2
686	adox	$t1, $acc3
687
688	mulx	8*2+128($a_ptr), $t0, $t1
689	adcx	$t0, $acc3
690	adox	$t1, $acc4
691
692	mulx	8*3+128($a_ptr), $t0, $t1
693	 mov	$acc1, %rdx
694	adcx	$t0, $acc4
695	 shlx	$poly1, $acc1, $t0
696	adox	$t1, $acc5
697	 shrx	$poly1, $acc1, $t1
698
699	adcx	$acc0, $acc5
700	adox	$acc0, $acc0
701	adc	\$0, $acc0
702
703	########################################################################
704	# Second reduction step
705	add	$t0, $acc2
706	adc	$t1, $acc3
707
708	mulx	$poly3, $t0, $t1
709	 mov	8*2($b_ptr), %rdx
710	adc	$t0, $acc4
711	adc	$t1, $acc5
712	adc	\$0, $acc0
713	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0
714
715	########################################################################
716	# Multiply by b[2]
717	mulx	8*0+128($a_ptr), $t0, $t1
718	adcx	$t0, $acc2
719	adox	$t1, $acc3
720
721	mulx	8*1+128($a_ptr), $t0, $t1
722	adcx	$t0, $acc3
723	adox	$t1, $acc4
724
725	mulx	8*2+128($a_ptr), $t0, $t1
726	adcx	$t0, $acc4
727	adox	$t1, $acc5
728
729	mulx	8*3+128($a_ptr), $t0, $t1
730	 mov	$acc2, %rdx
731	adcx	$t0, $acc5
732	 shlx	$poly1, $acc2, $t0
733	adox	$t1, $acc0
734	 shrx	$poly1, $acc2, $t1
735
736	adcx	$acc1, $acc0
737	adox	$acc1, $acc1
738	adc	\$0, $acc1
739
740	########################################################################
741	# Third reduction step
742	add	$t0, $acc3
743	adc	$t1, $acc4
744
745	mulx	$poly3, $t0, $t1
746	 mov	8*3($b_ptr), %rdx
747	adc	$t0, $acc5
748	adc	$t1, $acc0
749	adc	\$0, $acc1
750	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
751
752	########################################################################
753	# Multiply by b[3]
754	mulx	8*0+128($a_ptr), $t0, $t1
755	adcx	$t0, $acc3
756	adox	$t1, $acc4
757
758	mulx	8*1+128($a_ptr), $t0, $t1
759	adcx	$t0, $acc4
760	adox	$t1, $acc5
761
762	mulx	8*2+128($a_ptr), $t0, $t1
763	adcx	$t0, $acc5
764	adox	$t1, $acc0
765
766	mulx	8*3+128($a_ptr), $t0, $t1
767	 mov	$acc3, %rdx
768	adcx	$t0, $acc0
769	 shlx	$poly1, $acc3, $t0
770	adox	$t1, $acc1
771	 shrx	$poly1, $acc3, $t1
772
773	adcx	$acc2, $acc1
774	adox	$acc2, $acc2
775	adc	\$0, $acc2
776
777	########################################################################
778	# Fourth reduction step
779	add	$t0, $acc4
780	adc	$t1, $acc5
781
782	mulx	$poly3, $t0, $t1
783	 mov	$acc4, $t2
784	mov	.Lpoly+8*1(%rip), $poly1
785	adc	$t0, $acc0
786	 mov	$acc5, $t3
787	adc	$t1, $acc1
788	adc	\$0, $acc2
789
790	########################################################################
791	# Branch-less conditional subtraction of P
792	xor	%eax, %eax
793	 mov	$acc0, $t0
794	sbb	\$-1, $acc4		# .Lpoly[0]
795	sbb	$poly1, $acc5		# .Lpoly[1]
796	sbb	\$0, $acc0		# .Lpoly[2]
797	 mov	$acc1, $t1
798	sbb	$poly3, $acc1		# .Lpoly[3]
799	sbb	\$0, $acc2
800
801	cmovc	$t2, $acc4
802	cmovc	$t3, $acc5
803	mov	$acc4, 8*0($r_ptr)
804	cmovc	$t0, $acc0
805	mov	$acc5, 8*1($r_ptr)
806	cmovc	$t1, $acc1
807	mov	$acc0, 8*2($r_ptr)
808	mov	$acc1, 8*3($r_ptr)
809
810	ret
811.size	__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
812
813.type	__ecp_nistz256_sqr_montx,\@abi-omnipotent
814.align	32
815__ecp_nistz256_sqr_montx:
816	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
817	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
818	xor	%eax, %eax
819	adc	$t0, $acc2
820	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
821	 mov	$acc6, %rdx
822	adc	$t1, $acc3
823	adc	\$0, $acc4
824	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
825
826	#################################
827	mulx	$acc7, $t0, $t1		# a[1]*a[2]
828	adcx	$t0, $acc3
829	adox	$t1, $acc4
830
831	mulx	$acc0, $t0, $t1		# a[1]*a[3]
832	 mov	$acc7, %rdx
833	adcx	$t0, $acc4
834	adox	$t1, $acc5
835	adc	\$0, $acc5
836
837	#################################
838	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
839	 mov	8*0+128($a_ptr), %rdx
840	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
841	 adcx	$acc1, $acc1		# acc1:6<<1
842	adox	$t0, $acc5
843	 adcx	$acc2, $acc2
844	adox	$acc7, $acc6		# of=0
845
846	mulx	%rdx, $acc0, $t1
847	mov	8*1+128($a_ptr), %rdx
848	 adcx	$acc3, $acc3
849	adox	$t1, $acc1
850	 adcx	$acc4, $acc4
851	mulx	%rdx, $t0, $t4
852	mov	8*2+128($a_ptr), %rdx
853	 adcx	$acc5, $acc5
854	adox	$t0, $acc2
855	 adcx	$acc6, $acc6
856	.byte	0x67
857	mulx	%rdx, $t0, $t1
858	mov	8*3+128($a_ptr), %rdx
859	adox	$t4, $acc3
860	 adcx	$acc7, $acc7
861	adox	$t0, $acc4
862	 mov	\$32, $a_ptr
863	adox	$t1, $acc5
864	.byte	0x67,0x67
865	mulx	%rdx, $t0, $t4
866	 mov	$acc0, %rdx
867	adox	$t0, $acc6
868	 shlx	$a_ptr, $acc0, $t0
869	adox	$t4, $acc7
870	 shrx	$a_ptr, $acc0, $t4
871	 mov	.Lpoly+8*3(%rip), $t1
872
873	# reduction step 1
874	add	$t0, $acc1
875	adc	$t4, $acc2
876
877	mulx	$t1, $t0, $acc0
878	 mov	$acc1, %rdx
879	adc	$t0, $acc3
880	 shlx	$a_ptr, $acc1, $t0
881	adc	\$0, $acc0
882	 shrx	$a_ptr, $acc1, $t4
883
884	# reduction step 2
885	add	$t0, $acc2
886	adc	$t4, $acc3
887
888	mulx	$t1, $t0, $acc1
889	 mov	$acc2, %rdx
890	adc	$t0, $acc0
891	 shlx	$a_ptr, $acc2, $t0
892	adc	\$0, $acc1
893	 shrx	$a_ptr, $acc2, $t4
894
895	# reduction step 3
896	add	$t0, $acc3
897	adc	$t4, $acc0
898
899	mulx	$t1, $t0, $acc2
900	 mov	$acc3, %rdx
901	adc	$t0, $acc1
902	 shlx	$a_ptr, $acc3, $t0
903	adc	\$0, $acc2
904	 shrx	$a_ptr, $acc3, $t4
905
906	# reduction step 4
907	add	$t0, $acc0
908	adc	$t4, $acc1
909
910	mulx	$t1, $t0, $acc3
911	adc	$t0, $acc2
912	adc	\$0, $acc3
913
914	xor	$t3, $t3		# cf=0
915	adc	$acc0, $acc4		# accumulate upper half
916	 mov	.Lpoly+8*1(%rip), $a_ptr
917	adc	$acc1, $acc5
918	 mov	$acc4, $acc0
919	adc	$acc2, $acc6
920	adc	$acc3, $acc7
921	 mov	$acc5, $acc1
922	adc	\$0, $t3
923
924	xor	%eax, %eax		# cf=0
925	sbb	\$-1, $acc4		# .Lpoly[0]
926	 mov	$acc6, $acc2
927	sbb	$a_ptr, $acc5		# .Lpoly[1]
928	sbb	\$0, $acc6		# .Lpoly[2]
929	 mov	$acc7, $acc3
930	sbb	$t1, $acc7		# .Lpoly[3]
931	sbb	\$0, $t3
932
933	cmovc	$acc0, $acc4
934	cmovc	$acc1, $acc5
935	mov	$acc4, 8*0($r_ptr)
936	cmovc	$acc2, $acc6
937	mov	$acc5, 8*1($r_ptr)
938	cmovc	$acc3, $acc7
939	mov	$acc6, 8*2($r_ptr)
940	mov	$acc7, 8*3($r_ptr)
941
942	ret
943.size	__ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
944___
945}
946}
947{
948my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
949my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
950my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
951my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
952
953$code.=<<___;
954################################################################################
955# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
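#
# Constant-time lookup: all 16 table entries (six xmm words each) are read;
# a running counter is compared against the index argument with pcmpeqd to
# build an all-ones/all-zeroes mask, and pand/por accumulate only the
# matching entry, so neither the memory access pattern nor the timing
# depends on the index.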
956.globl	ecp_nistz256_select_w5
957.type	ecp_nistz256_select_w5,\@abi-omnipotent
958.align	32
959ecp_nistz256_select_w5:
960___
961$code.=<<___	if ($avx>1);
962	mov	OPENSSL_ia32cap_P+8(%rip), %eax
963	test	\$`1<<5`, %eax
964	jnz	.Lavx2_select_w5
965___
966$code.=<<___	if ($win64);
967	lea	-0x88(%rsp), %rax
968.LSEH_begin_ecp_nistz256_select_w5:
969	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
970	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
971	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
972	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
973	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
974	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
975	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
976	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
977	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
978	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
979	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
980___
981$code.=<<___;
982	movdqa	.LOne(%rip), $ONE
983	movd	$index, $INDEX
984
985	pxor	$Ra, $Ra
986	pxor	$Rb, $Rb
987	pxor	$Rc, $Rc
988	pxor	$Rd, $Rd
989	pxor	$Re, $Re
990	pxor	$Rf, $Rf
991
992	movdqa	$ONE, $M0
993	pshufd	\$0, $INDEX, $INDEX
994
995	mov	\$16, %rax
996.Lselect_loop_sse_w5:
997
998	movdqa	$M0, $TMP0
999	paddd	$ONE, $M0
1000	pcmpeqd $INDEX, $TMP0
1001
1002	movdqa	16*0($in_t), $T0a
1003	movdqa	16*1($in_t), $T0b
1004	movdqa	16*2($in_t), $T0c
1005	movdqa	16*3($in_t), $T0d
1006	movdqa	16*4($in_t), $T0e
1007	movdqa	16*5($in_t), $T0f
1008	lea 16*6($in_t), $in_t
1009
1010	pand	$TMP0, $T0a
1011	pand	$TMP0, $T0b
1012	por	$T0a, $Ra
1013	pand	$TMP0, $T0c
1014	por	$T0b, $Rb
1015	pand	$TMP0, $T0d
1016	por	$T0c, $Rc
1017	pand	$TMP0, $T0e
1018	por	$T0d, $Rd
1019	pand	$TMP0, $T0f
1020	por	$T0e, $Re
1021	por	$T0f, $Rf
1022
1023	dec	%rax
1024	jnz	.Lselect_loop_sse_w5
1025
1026	movdqu	$Ra, 16*0($val)
1027	movdqu	$Rb, 16*1($val)
1028	movdqu	$Rc, 16*2($val)
1029	movdqu	$Rd, 16*3($val)
1030	movdqu	$Re, 16*4($val)
1031	movdqu	$Rf, 16*5($val)
1032___
1033$code.=<<___	if ($win64);
1034	movaps	(%rsp), %xmm6
1035	movaps	0x10(%rsp), %xmm7
1036	movaps	0x20(%rsp), %xmm8
1037	movaps	0x30(%rsp), %xmm9
1038	movaps	0x40(%rsp), %xmm10
1039	movaps	0x50(%rsp), %xmm11
1040	movaps	0x60(%rsp), %xmm12
1041	movaps	0x70(%rsp), %xmm13
1042	movaps	0x80(%rsp), %xmm14
1043	movaps	0x90(%rsp), %xmm15
1044	lea	0xa8(%rsp), %rsp
1045.LSEH_end_ecp_nistz256_select_w5:
1046___
1047$code.=<<___;
1048	ret
1049.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
1050
1051################################################################################
1052# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
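#
# Same masking technique as ecp_nistz256_select_w5, but over 64 smaller
# entries of four xmm words each (affine x and y only).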
1053.globl	ecp_nistz256_select_w7
1054.type	ecp_nistz256_select_w7,\@abi-omnipotent
1055.align	32
1056ecp_nistz256_select_w7:
1057___
1058$code.=<<___	if ($avx>1);
1059	mov	OPENSSL_ia32cap_P+8(%rip), %eax
1060	test	\$`1<<5`, %eax
1061	jnz	.Lavx2_select_w7
1062___
1063$code.=<<___	if ($win64);
1064	lea	-0x88(%rsp), %rax
1065.LSEH_begin_ecp_nistz256_select_w7:
1066	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
1067	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
1068	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
1069	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
1070	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
1071	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
1072	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11, 0x30(%rax)
1073	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12, 0x40(%rax)
1074	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13, 0x50(%rax)
1075	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14, 0x60(%rax)
1076	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15, 0x70(%rax)
1077___
1078$code.=<<___;
1079	movdqa	.LOne(%rip), $M0
1080	movd	$index, $INDEX
1081
1082	pxor	$Ra, $Ra
1083	pxor	$Rb, $Rb
1084	pxor	$Rc, $Rc
1085	pxor	$Rd, $Rd
1086
1087	movdqa	$M0, $ONE
1088	pshufd	\$0, $INDEX, $INDEX
1089	mov	\$64, %rax
1090
1091.Lselect_loop_sse_w7:
1092	movdqa	$M0, $TMP0
1093	paddd	$ONE, $M0
1094	movdqa	16*0($in_t), $T0a
1095	movdqa	16*1($in_t), $T0b
1096	pcmpeqd	$INDEX, $TMP0
1097	movdqa	16*2($in_t), $T0c
1098	movdqa	16*3($in_t), $T0d
1099	lea	16*4($in_t), $in_t
1100
1101	pand	$TMP0, $T0a
1102	pand	$TMP0, $T0b
1103	por	$T0a, $Ra
1104	pand	$TMP0, $T0c
1105	por	$T0b, $Rb
1106	pand	$TMP0, $T0d
1107	por	$T0c, $Rc
1108	prefetcht0	255($in_t)
1109	por	$T0d, $Rd
1110
1111	dec	%rax
1112	jnz	.Lselect_loop_sse_w7
1113
1114	movdqu	$Ra, 16*0($val)
1115	movdqu	$Rb, 16*1($val)
1116	movdqu	$Rc, 16*2($val)
1117	movdqu	$Rd, 16*3($val)
1118___
1119$code.=<<___	if ($win64);
1120	movaps	(%rsp), %xmm6
1121	movaps	0x10(%rsp), %xmm7
1122	movaps	0x20(%rsp), %xmm8
1123	movaps	0x30(%rsp), %xmm9
1124	movaps	0x40(%rsp), %xmm10
1125	movaps	0x50(%rsp), %xmm11
1126	movaps	0x60(%rsp), %xmm12
1127	movaps	0x70(%rsp), %xmm13
1128	movaps	0x80(%rsp), %xmm14
1129	movaps	0x90(%rsp), %xmm15
1130	lea	0xa8(%rsp), %rsp
1131.LSEH_end_ecp_nistz256_select_w7:
1132___
1133$code.=<<___;
1134	ret
1135.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
1136___
1137}
1138if ($avx>1) {
1139my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1140my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
1141my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
1142my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
1143
1144$code.=<<___;
1145################################################################################
1146# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
1147.type	ecp_nistz256_avx2_select_w5,\@abi-omnipotent
1148.align	32
1149ecp_nistz256_avx2_select_w5:
1150.Lavx2_select_w5:
1151	vzeroupper
1152___
1153$code.=<<___	if ($win64);
1154	lea	-0x88(%rsp), %rax
1155.LSEH_begin_ecp_nistz256_avx2_select_w5:
1156	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
1157	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
1158	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 0(%rax)
1160	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
1161	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
1162	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
1163	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
1164	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
1165	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
1166	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
1167___
1168$code.=<<___;
1169	vmovdqa	.LTwo(%rip), $TWO
1170
1171	vpxor	$Ra, $Ra, $Ra
1172	vpxor	$Rb, $Rb, $Rb
1173	vpxor	$Rc, $Rc, $Rc
1174
1175	vmovdqa .LOne(%rip), $M0
1176	vmovdqa .LTwo(%rip), $M1
1177
1178	vmovd	$index, %xmm1
1179	vpermd	$INDEX, $Ra, $INDEX
1180
1181	mov	\$8, %rax
1182.Lselect_loop_avx2_w5:
1183
1184	vmovdqa	32*0($in_t), $T0a
1185	vmovdqa	32*1($in_t), $T0b
1186	vmovdqa	32*2($in_t), $T0c
1187
1188	vmovdqa	32*3($in_t), $T1a
1189	vmovdqa	32*4($in_t), $T1b
1190	vmovdqa	32*5($in_t), $T1c
1191
1192	vpcmpeqd	$INDEX, $M0, $TMP0
1193	vpcmpeqd	$INDEX, $M1, $TMP1
1194
1195	vpaddd	$TWO, $M0, $M0
1196	vpaddd	$TWO, $M1, $M1
1197	lea	32*6($in_t), $in_t
1198
1199	vpand	$TMP0, $T0a, $T0a
1200	vpand	$TMP0, $T0b, $T0b
1201	vpand	$TMP0, $T0c, $T0c
1202	vpand	$TMP1, $T1a, $T1a
1203	vpand	$TMP1, $T1b, $T1b
1204	vpand	$TMP1, $T1c, $T1c
1205
1206	vpxor	$T0a, $Ra, $Ra
1207	vpxor	$T0b, $Rb, $Rb
1208	vpxor	$T0c, $Rc, $Rc
1209	vpxor	$T1a, $Ra, $Ra
1210	vpxor	$T1b, $Rb, $Rb
1211	vpxor	$T1c, $Rc, $Rc
1212
1213	dec %rax
1214	jnz .Lselect_loop_avx2_w5
1215
1216	vmovdqu $Ra, 32*0($val)
1217	vmovdqu $Rb, 32*1($val)
1218	vmovdqu $Rc, 32*2($val)
1219	vzeroupper
1220___
1221$code.=<<___	if ($win64);
1222	movaps	(%rsp), %xmm6
1223	movaps	0x10(%rsp), %xmm7
1224	movaps	0x20(%rsp), %xmm8
1225	movaps	0x30(%rsp), %xmm9
1226	movaps	0x40(%rsp), %xmm10
1227	movaps	0x50(%rsp), %xmm11
1228	movaps	0x60(%rsp), %xmm12
1229	movaps	0x70(%rsp), %xmm13
1230	movaps	0x80(%rsp), %xmm14
1231	movaps	0x90(%rsp), %xmm15
1232	lea	0xa8(%rsp), %rsp
1233.LSEH_end_ecp_nistz256_avx2_select_w5:
1234___
1235$code.=<<___;
1236	ret
1237.size	ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
1238___
1239}
1240if ($avx>1) {
1241my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1242my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
1243my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
1244my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
1245my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
1246
1247$code.=<<___;
1248
1249################################################################################
1250# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
1251.globl	ecp_nistz256_avx2_select_w7
1252.type	ecp_nistz256_avx2_select_w7,\@abi-omnipotent
1253.align	32
1254ecp_nistz256_avx2_select_w7:
1255.Lavx2_select_w7:
1256	vzeroupper
1257___
1258$code.=<<___	if ($win64);
1259	lea	-0x88(%rsp), %rax
1260.LSEH_begin_ecp_nistz256_avx2_select_w7:
1261	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
1262	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6, -0x20(%rax)
1263	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8, 0(%rax)
1265	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9, 0x10(%rax)
1266	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10, 0x20(%rax)
1267	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11, 0x30(%rax)
1268	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12, 0x40(%rax)
1269	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13, 0x50(%rax)
1270	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14, 0x60(%rax)
1271	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15, 0x70(%rax)
1272___
1273$code.=<<___;
1274	vmovdqa	.LThree(%rip), $THREE
1275
1276	vpxor	$Ra, $Ra, $Ra
1277	vpxor	$Rb, $Rb, $Rb
1278
1279	vmovdqa .LOne(%rip), $M0
1280	vmovdqa .LTwo(%rip), $M1
1281	vmovdqa .LThree(%rip), $M2
1282
1283	vmovd	$index, %xmm1
1284	vpermd	$INDEX, $Ra, $INDEX
1285	# Skip index = 0, because it is implicitly the point at infinity
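	# The 64 entries are therefore matched against the values 1..64: the
	# loop below covers 1..63 (three comparisons per iteration, 21
	# iterations), and the tail after the loop handles entry 64.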
1286
1287	mov	\$21, %rax
1288.Lselect_loop_avx2_w7:
1289
1290	vmovdqa	32*0($in_t), $T0a
1291	vmovdqa	32*1($in_t), $T0b
1292
1293	vmovdqa	32*2($in_t), $T1a
1294	vmovdqa	32*3($in_t), $T1b
1295
1296	vmovdqa	32*4($in_t), $T2a
1297	vmovdqa	32*5($in_t), $T2b
1298
1299	vpcmpeqd	$INDEX, $M0, $TMP0
1300	vpcmpeqd	$INDEX, $M1, $TMP1
1301	vpcmpeqd	$INDEX, $M2, $TMP2
1302
1303	vpaddd	$THREE, $M0, $M0
1304	vpaddd	$THREE, $M1, $M1
1305	vpaddd	$THREE, $M2, $M2
1306	lea	32*6($in_t), $in_t
1307
1308	vpand	$TMP0, $T0a, $T0a
1309	vpand	$TMP0, $T0b, $T0b
1310	vpand	$TMP1, $T1a, $T1a
1311	vpand	$TMP1, $T1b, $T1b
1312	vpand	$TMP2, $T2a, $T2a
1313	vpand	$TMP2, $T2b, $T2b
1314
1315	vpxor	$T0a, $Ra, $Ra
1316	vpxor	$T0b, $Rb, $Rb
1317	vpxor	$T1a, $Ra, $Ra
1318	vpxor	$T1b, $Rb, $Rb
1319	vpxor	$T2a, $Ra, $Ra
1320	vpxor	$T2b, $Rb, $Rb
1321
1322	dec %rax
1323	jnz .Lselect_loop_avx2_w7
1324
1325
1326	vmovdqa	32*0($in_t), $T0a
1327	vmovdqa	32*1($in_t), $T0b
1328
1329	vpcmpeqd	$INDEX, $M0, $TMP0
1330
1331	vpand	$TMP0, $T0a, $T0a
1332	vpand	$TMP0, $T0b, $T0b
1333
1334	vpxor	$T0a, $Ra, $Ra
1335	vpxor	$T0b, $Rb, $Rb
1336
1337	vmovdqu $Ra, 32*0($val)
1338	vmovdqu $Rb, 32*1($val)
1339	vzeroupper
1340___
1341$code.=<<___	if ($win64);
1342	movaps	(%rsp), %xmm6
1343	movaps	0x10(%rsp), %xmm7
1344	movaps	0x20(%rsp), %xmm8
1345	movaps	0x30(%rsp), %xmm9
1346	movaps	0x40(%rsp), %xmm10
1347	movaps	0x50(%rsp), %xmm11
1348	movaps	0x60(%rsp), %xmm12
1349	movaps	0x70(%rsp), %xmm13
1350	movaps	0x80(%rsp), %xmm14
1351	movaps	0x90(%rsp), %xmm15
1352	lea	0xa8(%rsp), %rsp
1353.LSEH_end_ecp_nistz256_avx2_select_w7:
1354___
1355$code.=<<___;
1356	ret
1357.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
1358___
1359} else {
1360$code.=<<___;
1361.globl	ecp_nistz256_avx2_select_w7
1362.type	ecp_nistz256_avx2_select_w7,\@function,3
1363.align	32
1364ecp_nistz256_avx2_select_w7:
1365	.byte	0x0f,0x0b	# ud2
1366	ret
1367.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
1368___
1369}
1370{{{
1371########################################################################
1372# This block implements higher level point_double, point_add and
1373# point_add_affine. The key to performance in this case is to allow
1374# out-of-order execution logic to overlap computations from next step
1375# with tail processing from current step. By using tailored calling
1376# sequence we minimize inter-step overhead to give processor better
1377# shot at overlapping operations...
1378#
# You will notice that input data is copied to the stack. The trouble is
# that there are no registers to spare for holding the original pointers,
# and reloading them would create undesired dependencies on the
# effective-address calculation paths. In other words, this is done
# deliberately to favour the out-of-order execution logic.
1384#						<appro@openssl.org>
1385
1386my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
1387my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
1388my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
1389my ($poly1,$poly3)=($acc6,$acc7);
1390
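# load_for_mul()/load_for_sqr() emit the operand-loading boilerplate shared
# by the __ecp_nistz256_{mul,sqr}_mont{,x} calls below: they pull the limbs
# of one (stack-resident) operand into the registers those helpers expect,
# set up $a_ptr/$b_ptr, and apply the -128 bias used by the mulx flavour
# whenever $src0 is %rdx.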
1391sub load_for_mul () {
1392my ($a,$b,$src0) = @_;
1393my $bias = $src0 eq "%rax" ? 0 : -128;
1394
1395"	mov	$b, $src0
1396	lea	$b, $b_ptr
1397	mov	8*0+$a, $acc1
1398	mov	8*1+$a, $acc2
1399	lea	$bias+$a, $a_ptr
1400	mov	8*2+$a, $acc3
1401	mov	8*3+$a, $acc4"
1402}
1403
1404sub load_for_sqr () {
1405my ($a,$src0) = @_;
1406my $bias = $src0 eq "%rax" ? 0 : -128;
1407
1408"	mov	8*0+$a, $src0
1409	mov	8*1+$a, $acc6
1410	lea	$bias+$a, $a_ptr
1411	mov	8*2+$a, $acc7
1412	mov	8*3+$a, $acc0"
1413}
1414
1415									{
1416########################################################################
1417# operate in 4-5-0-1 "name space" that matches multiplication output
1418#
1419my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
1420
1421$code.=<<___;
1422.type	__ecp_nistz256_add_toq,\@abi-omnipotent
1423.align	32
1424__ecp_nistz256_add_toq:
1425	xor	$t4,$t4
1426	add	8*0($b_ptr), $a0
1427	adc	8*1($b_ptr), $a1
1428	 mov	$a0, $t0
1429	adc	8*2($b_ptr), $a2
1430	adc	8*3($b_ptr), $a3
1431	 mov	$a1, $t1
1432	adc	\$0, $t4
1433
1434	sub	\$-1, $a0
1435	 mov	$a2, $t2
1436	sbb	$poly1, $a1
1437	sbb	\$0, $a2
1438	 mov	$a3, $t3
1439	sbb	$poly3, $a3
1440	sbb	\$0, $t4
1441
1442	cmovc	$t0, $a0
1443	cmovc	$t1, $a1
1444	mov	$a0, 8*0($r_ptr)
1445	cmovc	$t2, $a2
1446	mov	$a1, 8*1($r_ptr)
1447	cmovc	$t3, $a3
1448	mov	$a2, 8*2($r_ptr)
1449	mov	$a3, 8*3($r_ptr)
1450
1451	ret
1452.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
1453
1454.type	__ecp_nistz256_sub_fromq,\@abi-omnipotent
1455.align	32
1456__ecp_nistz256_sub_fromq:
1457	sub	8*0($b_ptr), $a0
1458	sbb	8*1($b_ptr), $a1
1459	 mov	$a0, $t0
1460	sbb	8*2($b_ptr), $a2
1461	sbb	8*3($b_ptr), $a3
1462	 mov	$a1, $t1
1463	sbb	$t4, $t4
1464
1465	add	\$-1, $a0
1466	 mov	$a2, $t2
1467	adc	$poly1, $a1
1468	adc	\$0, $a2
1469	 mov	$a3, $t3
1470	adc	$poly3, $a3
1471	test	$t4, $t4
1472
1473	cmovz	$t0, $a0
1474	cmovz	$t1, $a1
1475	mov	$a0, 8*0($r_ptr)
1476	cmovz	$t2, $a2
1477	mov	$a1, 8*1($r_ptr)
1478	cmovz	$t3, $a3
1479	mov	$a2, 8*2($r_ptr)
1480	mov	$a3, 8*3($r_ptr)
1481
1482	ret
1483.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
1484
1485.type	__ecp_nistz256_subq,\@abi-omnipotent
1486.align	32
1487__ecp_nistz256_subq:
1488	sub	$a0, $t0
1489	sbb	$a1, $t1
1490	 mov	$t0, $a0
1491	sbb	$a2, $t2
1492	sbb	$a3, $t3
1493	 mov	$t1, $a1
1494	sbb	$t4, $t4
1495
1496	add	\$-1, $t0
1497	 mov	$t2, $a2
1498	adc	$poly1, $t1
1499	adc	\$0, $t2
1500	 mov	$t3, $a3
1501	adc	$poly3, $t3
1502	test	$t4, $t4
1503
1504	cmovnz	$t0, $a0
1505	cmovnz	$t1, $a1
1506	cmovnz	$t2, $a2
1507	cmovnz	$t3, $a3
1508
1509	ret
1510.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq
1511
1512.type	__ecp_nistz256_mul_by_2q,\@abi-omnipotent
1513.align	32
1514__ecp_nistz256_mul_by_2q:
1515	xor	$t4, $t4
1516	add	$a0, $a0		# a0:a3+a0:a3
1517	adc	$a1, $a1
1518	 mov	$a0, $t0
1519	adc	$a2, $a2
1520	adc	$a3, $a3
1521	 mov	$a1, $t1
1522	adc	\$0, $t4
1523
1524	sub	\$-1, $a0
1525	 mov	$a2, $t2
1526	sbb	$poly1, $a1
1527	sbb	\$0, $a2
1528	 mov	$a3, $t3
1529	sbb	$poly3, $a3
1530	sbb	\$0, $t4
1531
1532	cmovc	$t0, $a0
1533	cmovc	$t1, $a1
1534	mov	$a0, 8*0($r_ptr)
1535	cmovc	$t2, $a2
1536	mov	$a1, 8*1($r_ptr)
1537	cmovc	$t3, $a3
1538	mov	$a2, 8*2($r_ptr)
1539	mov	$a3, 8*3($r_ptr)
1540
1541	ret
1542.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
1543___
1544									}
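# gen_double() emits ecp_nistz256_point_double (and, when $addx is enabled,
# the mulx-based ecp_nistz256_point_doublex) from a single template; $x
# selects the field-arithmetic flavour and $bias the -128 pointer bias that
# the mulx helpers expect.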
1545sub gen_double () {
1546    my $x = shift;
1547    my ($src0,$sfx,$bias);
1548    my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
1549
1550    if ($x ne "x") {
1551	$src0 = "%rax";
1552	$sfx  = "";
1553	$bias = 0;
1554
1555$code.=<<___;
1556.globl	ecp_nistz256_point_double
1557.type	ecp_nistz256_point_double,\@function,2
1558.align	32
1559ecp_nistz256_point_double:
1560___
1561$code.=<<___	if ($addx);
1562	mov	\$0x80100, %ecx
1563	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1564	cmp	\$0x80100, %ecx
1565	je	.Lpoint_doublex
1566___
1567    } else {
1568	$src0 = "%rdx";
1569	$sfx  = "x";
1570	$bias = 128;
1571
1572$code.=<<___;
1573.type	ecp_nistz256_point_doublex,\@function,2
1574.align	32
1575ecp_nistz256_point_doublex:
1576.Lpoint_doublex:
1577___
1578    }
1579$code.=<<___;
1580	push	%rbp
1581	push	%rbx
1582	push	%r12
1583	push	%r13
1584	push	%r14
1585	push	%r15
1586	sub	\$32*5+8, %rsp
1587
1588.Lpoint_double_shortcut$x:
1589	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr.x
1590	mov	$a_ptr, $b_ptr			# backup copy
1591	movdqu	0x10($a_ptr), %xmm1
1592	 mov	0x20+8*0($a_ptr), $acc4		# load in_y in "5-4-0-1" order
1593	 mov	0x20+8*1($a_ptr), $acc5
1594	 mov	0x20+8*2($a_ptr), $acc0
1595	 mov	0x20+8*3($a_ptr), $acc1
1596	 mov	.Lpoly+8*1(%rip), $poly1
1597	 mov	.Lpoly+8*3(%rip), $poly3
1598	movdqa	%xmm0, $in_x(%rsp)
1599	movdqa	%xmm1, $in_x+0x10(%rsp)
1600	lea	0x20($r_ptr), $acc2
1601	lea	0x40($r_ptr), $acc3
1602	movq	$r_ptr, %xmm0
1603	movq	$acc2, %xmm1
1604	movq	$acc3, %xmm2
1605
1606	lea	$S(%rsp), $r_ptr
1607	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(S, in_y);
1608
1609	mov	0x40+8*0($a_ptr), $src0
1610	mov	0x40+8*1($a_ptr), $acc6
1611	mov	0x40+8*2($a_ptr), $acc7
1612	mov	0x40+8*3($a_ptr), $acc0
1613	lea	0x40-$bias($a_ptr), $a_ptr
1614	lea	$Zsqr(%rsp), $r_ptr
1615	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Zsqr, in_z);
1616
1617	`&load_for_sqr("$S(%rsp)", "$src0")`
1618	lea	$S(%rsp), $r_ptr
1619	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(S, S);
1620
1621	mov	0x20($b_ptr), $src0		# $b_ptr is still valid
1622	mov	0x40+8*0($b_ptr), $acc1
1623	mov	0x40+8*1($b_ptr), $acc2
1624	mov	0x40+8*2($b_ptr), $acc3
1625	mov	0x40+8*3($b_ptr), $acc4
1626	lea	0x40-$bias($b_ptr), $a_ptr
1627	lea	0x20($b_ptr), $b_ptr
1628	movq	%xmm2, $r_ptr
1629	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, in_z, in_y);
1630	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(res_z, res_z);
1631
1632	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
1633	mov	$in_x+8*1(%rsp), $acc5
1634	lea	$Zsqr(%rsp), $b_ptr
1635	mov	$in_x+8*2(%rsp), $acc0
1636	mov	$in_x+8*3(%rsp), $acc1
1637	lea	$M(%rsp), $r_ptr
1638	call	__ecp_nistz256_add_to$x		# p256_add(M, in_x, Zsqr);
1639
1640	mov	$in_x+8*0(%rsp), $acc4		# "5-4-0-1" order
1641	mov	$in_x+8*1(%rsp), $acc5
1642	lea	$Zsqr(%rsp), $b_ptr
1643	mov	$in_x+8*2(%rsp), $acc0
1644	mov	$in_x+8*3(%rsp), $acc1
1645	lea	$Zsqr(%rsp), $r_ptr
1646	call	__ecp_nistz256_sub_from$x	# p256_sub(Zsqr, in_x, Zsqr);
1647
1648	`&load_for_sqr("$S(%rsp)", "$src0")`
1649	movq	%xmm1, $r_ptr
1650	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
1651___
1652{
1653######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
1654# operate in 4-5-6-7 "name space" that matches squaring output
1655#
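# Halving mod p: if the value is odd, p (odd itself) is added first to make
# it even, and the 5-limb result is then shifted right by one bit; the cmovz
# chain keeps the original value in the even case, so no branch depends on
# secret data.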
1656my ($poly1,$poly3)=($a_ptr,$t1);
1657my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
1658
1659$code.=<<___;
1660	xor	$t4, $t4
1661	mov	$a0, $t0
1662	add	\$-1, $a0
1663	mov	$a1, $t1
1664	adc	$poly1, $a1
1665	mov	$a2, $t2
1666	adc	\$0, $a2
1667	mov	$a3, $t3
1668	adc	$poly3, $a3
1669	adc	\$0, $t4
1670	xor	$a_ptr, $a_ptr		# borrow $a_ptr
1671	test	\$1, $t0
1672
1673	cmovz	$t0, $a0
1674	cmovz	$t1, $a1
1675	cmovz	$t2, $a2
1676	cmovz	$t3, $a3
1677	cmovz	$a_ptr, $t4
1678
1679	mov	$a1, $t0		# a0:a3>>1
1680	shr	\$1, $a0
1681	shl	\$63, $t0
1682	mov	$a2, $t1
1683	shr	\$1, $a1
1684	or	$t0, $a0
1685	shl	\$63, $t1
1686	mov	$a3, $t2
1687	shr	\$1, $a2
1688	or	$t1, $a1
1689	shl	\$63, $t2
1690	mov	$a0, 8*0($r_ptr)
1691	shr	\$1, $a3
1692	mov	$a1, 8*1($r_ptr)
1693	shl	\$63, $t4
1694	or	$t2, $a2
1695	or	$t4, $a3
1696	mov	$a2, 8*2($r_ptr)
1697	mov	$a3, 8*3($r_ptr)
1698___
1699}
1700$code.=<<___;
1701	`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
1702	lea	$M(%rsp), $r_ptr
1703	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(M, M, Zsqr);
1704
1705	lea	$tmp0(%rsp), $r_ptr
1706	call	__ecp_nistz256_mul_by_2$x
1707
1708	lea	$M(%rsp), $b_ptr
1709	lea	$M(%rsp), $r_ptr
1710	call	__ecp_nistz256_add_to$x		# p256_mul_by_3(M, M);
1711
1712	`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
1713	lea	$S(%rsp), $r_ptr
1714	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, in_x);
1715
1716	lea	$tmp0(%rsp), $r_ptr
1717	call	__ecp_nistz256_mul_by_2$x	# p256_mul_by_2(tmp0, S);
1718
1719	`&load_for_sqr("$M(%rsp)", "$src0")`
1720	movq	%xmm0, $r_ptr
1721	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_x, M);
1722
1723	lea	$tmp0(%rsp), $b_ptr
1724	mov	$acc6, $acc0			# harmonize sqr output and sub input
1725	mov	$acc7, $acc1
1726	mov	$a_ptr, $poly1
1727	mov	$t1, $poly3
1728	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, tmp0);
1729
1730	mov	$S+8*0(%rsp), $t0
1731	mov	$S+8*1(%rsp), $t1
1732	mov	$S+8*2(%rsp), $t2
1733	mov	$S+8*3(%rsp), $acc2		# "4-5-0-1" order
1734	lea	$S(%rsp), $r_ptr
1735	call	__ecp_nistz256_sub$x		# p256_sub(S, S, res_x);
1736
1737	mov	$M(%rsp), $src0
1738	lea	$M(%rsp), $b_ptr
1739	mov	$acc4, $acc6			# harmonize sub output and mul input
1740	xor	%ecx, %ecx
1741	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
1742	mov	$acc5, $acc2
1743	mov	$acc5, $S+8*1(%rsp)
1744	cmovz	$acc0, $acc3
1745	mov	$acc0, $S+8*2(%rsp)
1746	lea	$S-$bias(%rsp), $a_ptr
1747	cmovz	$acc1, $acc4
1748	mov	$acc1, $S+8*3(%rsp)
1749	mov	$acc6, $acc1
1750	lea	$S(%rsp), $r_ptr
1751	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S, S, M);
1752
1753	movq	%xmm1, $b_ptr
1754	movq	%xmm1, $r_ptr
1755	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, S, res_y);
1756
1757	add	\$32*5+8, %rsp
1758	pop	%r15
1759	pop	%r14
1760	pop	%r13
1761	pop	%r12
1762	pop	%rbx
1763	pop	%rbp
1764	ret
1765.size	ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
1766___
1767}
1768&gen_double("q");
1769
1770sub gen_add () {
1771    my $x = shift;
1772    my ($src0,$sfx,$bias);
1773    my ($H,$Hsqr,$R,$Rsqr,$Hcub,
1774	$U1,$U2,$S1,$S2,
1775	$res_x,$res_y,$res_z,
1776	$in1_x,$in1_y,$in1_z,
1777	$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
1778    my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1779
1780    if ($x ne "x") {
1781	$src0 = "%rax";
1782	$sfx  = "";
1783	$bias = 0;
1784
1785$code.=<<___;
1786.globl	ecp_nistz256_point_add
1787.type	ecp_nistz256_point_add,\@function,3
1788.align	32
1789ecp_nistz256_point_add:
1790___
1791$code.=<<___	if ($addx);
1792	mov	\$0x80100, %ecx
1793	and	OPENSSL_ia32cap_P+8(%rip), %ecx
1794	cmp	\$0x80100, %ecx
1795	je	.Lpoint_addx
1796___
1797    } else {
1798	$src0 = "%rdx";
1799	$sfx  = "x";
1800	$bias = 128;
1801
1802$code.=<<___;
1803.type	ecp_nistz256_point_addx,\@function,3
1804.align	32
1805ecp_nistz256_point_addx:
1806.Lpoint_addx:
1807___
1808    }
1809$code.=<<___;
1810	push	%rbp
1811	push	%rbx
1812	push	%r12
1813	push	%r13
1814	push	%r14
1815	push	%r15
1816	sub	\$32*18+8, %rsp
1817
1818	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$a_ptr
1819	movdqu	0x10($a_ptr), %xmm1
1820	movdqu	0x20($a_ptr), %xmm2
1821	movdqu	0x30($a_ptr), %xmm3
1822	movdqu	0x40($a_ptr), %xmm4
1823	movdqu	0x50($a_ptr), %xmm5
1824	mov	$a_ptr, $b_ptr			# reassign
1825	mov	$b_org, $a_ptr			# reassign
1826	movdqa	%xmm0, $in1_x(%rsp)
1827	movdqa	%xmm1, $in1_x+0x10(%rsp)
1828	movdqa	%xmm2, $in1_y(%rsp)
1829	movdqa	%xmm3, $in1_y+0x10(%rsp)
1830	movdqa	%xmm4, $in1_z(%rsp)
1831	movdqa	%xmm5, $in1_z+0x10(%rsp)
1832	por	%xmm4, %xmm5
1833
1834	movdqu	0x00($a_ptr), %xmm0		# copy	*(P256_POINT *)$b_ptr
1835	 pshufd	\$0xb1, %xmm5, %xmm3
1836	movdqu	0x10($a_ptr), %xmm1
1837	movdqu	0x20($a_ptr), %xmm2
1838	 por	%xmm3, %xmm5
1839	movdqu	0x30($a_ptr), %xmm3
1840	 mov	0x40+8*0($a_ptr), $src0		# load original in2_z
1841	 mov	0x40+8*1($a_ptr), $acc6
1842	 mov	0x40+8*2($a_ptr), $acc7
1843	 mov	0x40+8*3($a_ptr), $acc0
1844	movdqa	%xmm0, $in2_x(%rsp)
1845	 pshufd	\$0x1e, %xmm5, %xmm4
1846	movdqa	%xmm1, $in2_x+0x10(%rsp)
1847	movdqu	0x40($a_ptr),%xmm0		# in2_z again
1848	movdqu	0x50($a_ptr),%xmm1
1849	movdqa	%xmm2, $in2_y(%rsp)
1850	movdqa	%xmm3, $in2_y+0x10(%rsp)
1851	 por	%xmm4, %xmm5
1852	 pxor	%xmm4, %xmm4
1853	por	%xmm0, %xmm1
1854	 movq	$r_ptr, %xmm0			# save $r_ptr
1855
1856	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
1857	 mov	$src0, $in2_z+8*0(%rsp)		# make in2_z copy
1858	 mov	$acc6, $in2_z+8*1(%rsp)
1859	 mov	$acc7, $in2_z+8*2(%rsp)
1860	 mov	$acc0, $in2_z+8*3(%rsp)
1861	lea	$Z2sqr(%rsp), $r_ptr		# Z2^2
1862	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z2sqr, in2_z);
1863
1864	pcmpeqd	%xmm4, %xmm5
1865	pshufd	\$0xb1, %xmm1, %xmm4
1866	por	%xmm1, %xmm4
1867	pshufd	\$0, %xmm5, %xmm5		# in1infty
1868	pshufd	\$0x1e, %xmm4, %xmm3
1869	por	%xmm3, %xmm4
1870	pxor	%xmm3, %xmm3
1871	pcmpeqd	%xmm3, %xmm4
1872	pshufd	\$0, %xmm4, %xmm4		# in2infty
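	# %xmm5 (in1infty) and %xmm4 (in2infty) are now all-ones masks exactly
	# when the corresponding input's Z coordinate is zero, i.e. the point
	# at infinity; they drive the branch-free copy_conditional selection
	# of the final result at the end of this routine.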
1873	 mov	0x40+8*0($b_ptr), $src0		# load original in1_z
1874	 mov	0x40+8*1($b_ptr), $acc6
1875	 mov	0x40+8*2($b_ptr), $acc7
1876	 mov	0x40+8*3($b_ptr), $acc0
1877	movq	$b_ptr, %xmm1
1878
1879	lea	0x40-$bias($b_ptr), $a_ptr
1880	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
1881	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);
1882
1883	`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
1884	lea	$S1(%rsp), $r_ptr		# S1 = Z2^3
1885	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, Z2sqr, in2_z);
1886
1887	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
1888	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
1889	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);
1890
1891	`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
1892	lea	$S1(%rsp), $r_ptr		# S1 = Y1*Z2^3
1893	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S1, S1, in1_y);
1894
1895	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
1896	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
1897	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);
1898
1899	lea	$S1(%rsp), $b_ptr
1900	lea	$R(%rsp), $r_ptr		# R = S2 - S1
1901	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, S1);
1902
1903	or	$acc5, $acc4			# see if result is zero
1904	movdqa	%xmm4, %xmm2
1905	or	$acc0, $acc4
1906	or	$acc1, $acc4
1907	por	%xmm5, %xmm2			# in1infty || in2infty
1908	movq	$acc4, %xmm3
1909
1910	`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
1911	lea	$U1(%rsp), $r_ptr		# U1 = X1*Z2^2
1912	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U1, in1_x, Z2sqr);
1913
1914	`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
1915	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
1916	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in2_x, Z1sqr);
1917
1918	lea	$U1(%rsp), $b_ptr
1919	lea	$H(%rsp), $r_ptr		# H = U2 - U1
1920	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, U1);
1921
1922	or	$acc5, $acc4			# see if result is zero
1923	or	$acc0, $acc4
1924	or	$acc1, $acc4
1925
1926	.byte	0x3e				# predict taken
1927	jnz	.Ladd_proceed$x			# is_equal(U1,U2)?
1928	movq	%xmm2, $acc0
1929	movq	%xmm3, $acc1
1930	test	$acc0, $acc0
1931	jnz	.Ladd_proceed$x			# (in1infty || in2infty)?
1932	test	$acc1, $acc1
1933	jz	.Ladd_double$x			# is_equal(S1,S2)?
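	# Here H == 0 (equal x coordinates) while R != 0 and neither input is
	# at infinity, i.e. the inputs are P and -P: the sum is the point at
	# infinity, returned as an all-zero point.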
1934
1935	movq	%xmm0, $r_ptr			# restore $r_ptr
1936	pxor	%xmm0, %xmm0
1937	movdqu	%xmm0, 0x00($r_ptr)
1938	movdqu	%xmm0, 0x10($r_ptr)
1939	movdqu	%xmm0, 0x20($r_ptr)
1940	movdqu	%xmm0, 0x30($r_ptr)
1941	movdqu	%xmm0, 0x40($r_ptr)
1942	movdqu	%xmm0, 0x50($r_ptr)
1943	jmp	.Ladd_done$x
1944
1945.align	32
1946.Ladd_double$x:
1947	movq	%xmm1, $a_ptr			# restore $a_ptr
1948	movq	%xmm0, $r_ptr			# restore $r_ptr
1949	add	\$`32*(18-5)`, %rsp		# difference in frame sizes
1950	jmp	.Lpoint_double_shortcut$x
1951
1952.align	32
1953.Ladd_proceed$x:
1954	`&load_for_sqr("$R(%rsp)", "$src0")`
1955	lea	$Rsqr(%rsp), $r_ptr		# R^2
1956	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);
1957
1958	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
1959	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
1960	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);
1961
1962	`&load_for_sqr("$H(%rsp)", "$src0")`
1963	lea	$Hsqr(%rsp), $r_ptr		# H^2
1964	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);
1965
1966	`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
1967	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
1968	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, res_z, in2_z);
1969
1970	`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
1971	lea	$Hcub(%rsp), $r_ptr		# H^3
1972	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);
1973
1974	`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
1975	lea	$U2(%rsp), $r_ptr		# U1*H^2
1976	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, U1, Hsqr);
1977___
1978{
1979#######################################################################
1980# operate in 4-5-0-1 "name space" that matches multiplication output
1981#
1982my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
1983my ($poly1, $poly3)=($acc6,$acc7);
1984
1985$code.=<<___;
1986	#lea	$U2(%rsp), $a_ptr
1987	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
1988	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
1989
1990	xor	$t4, $t4
1991	add	$acc0, $acc0		# a0:a3+a0:a3
1992	lea	$Rsqr(%rsp), $a_ptr
1993	adc	$acc1, $acc1
1994	 mov	$acc0, $t0
1995	adc	$acc2, $acc2
1996	adc	$acc3, $acc3
1997	 mov	$acc1, $t1
1998	adc	\$0, $t4
1999
2000	sub	\$-1, $acc0
2001	 mov	$acc2, $t2
2002	sbb	$poly1, $acc1
2003	sbb	\$0, $acc2
2004	 mov	$acc3, $t3
2005	sbb	$poly3, $acc3
2006	sbb	\$0, $t4
2007
2008	cmovc	$t0, $acc0
2009	mov	8*0($a_ptr), $t0
2010	cmovc	$t1, $acc1
2011	mov	8*1($a_ptr), $t1
2012	cmovc	$t2, $acc2
2013	mov	8*2($a_ptr), $t2
2014	cmovc	$t3, $acc3
2015	mov	8*3($a_ptr), $t3
2016
2017	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);
2018
2019	lea	$Hcub(%rsp), $b_ptr
2020	lea	$res_x(%rsp), $r_ptr
2021	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);
2022
2023	mov	$U2+8*0(%rsp), $t0
2024	mov	$U2+8*1(%rsp), $t1
2025	mov	$U2+8*2(%rsp), $t2
2026	mov	$U2+8*3(%rsp), $t3
2027	lea	$res_y(%rsp), $r_ptr
2028
2029	call	__ecp_nistz256_sub$x		# p256_sub(res_y, U2, res_x);
2030
2031	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't store it
2033	mov	$acc2, 8*2($r_ptr)
2034	mov	$acc3, 8*3($r_ptr)
2035___
2036}
2037$code.=<<___;
2038	`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
2039	lea	$S2(%rsp), $r_ptr
2040	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S1, Hcub);
2041
2042	`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
2043	lea	$res_y(%rsp), $r_ptr
2044	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_y, R, res_y);
2045
2046	lea	$S2(%rsp), $b_ptr
2047	lea	$res_y(%rsp), $r_ptr
2048	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, res_y, S2);
2049
2050	movq	%xmm0, $r_ptr		# restore $r_ptr
2051
2052	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, in2_z, in1infty);
2053	movdqa	%xmm5, %xmm1
2054	pandn	$res_z(%rsp), %xmm0
2055	movdqa	%xmm5, %xmm2
2056	pandn	$res_z+0x10(%rsp), %xmm1
2057	movdqa	%xmm5, %xmm3
2058	pand	$in2_z(%rsp), %xmm2
2059	pand	$in2_z+0x10(%rsp), %xmm3
2060	por	%xmm0, %xmm2
2061	por	%xmm1, %xmm3
2062
2063	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
2064	movdqa	%xmm4, %xmm1
2065	pandn	%xmm2, %xmm0
2066	movdqa	%xmm4, %xmm2
2067	pandn	%xmm3, %xmm1
2068	movdqa	%xmm4, %xmm3
2069	pand	$in1_z(%rsp), %xmm2
2070	pand	$in1_z+0x10(%rsp), %xmm3
2071	por	%xmm0, %xmm2
2072	por	%xmm1, %xmm3
2073	movdqu	%xmm2, 0x40($r_ptr)
2074	movdqu	%xmm3, 0x50($r_ptr)
2075
2076	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
2077	movdqa	%xmm5, %xmm1
2078	pandn	$res_x(%rsp), %xmm0
2079	movdqa	%xmm5, %xmm2
2080	pandn	$res_x+0x10(%rsp), %xmm1
2081	movdqa	%xmm5, %xmm3
2082	pand	$in2_x(%rsp), %xmm2
2083	pand	$in2_x+0x10(%rsp), %xmm3
2084	por	%xmm0, %xmm2
2085	por	%xmm1, %xmm3
2086
2087	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
2088	movdqa	%xmm4, %xmm1
2089	pandn	%xmm2, %xmm0
2090	movdqa	%xmm4, %xmm2
2091	pandn	%xmm3, %xmm1
2092	movdqa	%xmm4, %xmm3
2093	pand	$in1_x(%rsp), %xmm2
2094	pand	$in1_x+0x10(%rsp), %xmm3
2095	por	%xmm0, %xmm2
2096	por	%xmm1, %xmm3
2097	movdqu	%xmm2, 0x00($r_ptr)
2098	movdqu	%xmm3, 0x10($r_ptr)
2099
2100	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
2101	movdqa	%xmm5, %xmm1
2102	pandn	$res_y(%rsp), %xmm0
2103	movdqa	%xmm5, %xmm2
2104	pandn	$res_y+0x10(%rsp), %xmm1
2105	movdqa	%xmm5, %xmm3
2106	pand	$in2_y(%rsp), %xmm2
2107	pand	$in2_y+0x10(%rsp), %xmm3
2108	por	%xmm0, %xmm2
2109	por	%xmm1, %xmm3
2110
2111	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
2112	movdqa	%xmm4, %xmm1
2113	pandn	%xmm2, %xmm0
2114	movdqa	%xmm4, %xmm2
2115	pandn	%xmm3, %xmm1
2116	movdqa	%xmm4, %xmm3
2117	pand	$in1_y(%rsp), %xmm2
2118	pand	$in1_y+0x10(%rsp), %xmm3
2119	por	%xmm0, %xmm2
2120	por	%xmm1, %xmm3
2121	movdqu	%xmm2, 0x20($r_ptr)
2122	movdqu	%xmm3, 0x30($r_ptr)
2123
2124.Ladd_done$x:
2125	add	\$32*18+8, %rsp
2126	pop	%r15
2127	pop	%r14
2128	pop	%r13
2129	pop	%r12
2130	pop	%rbx
2131	pop	%rbp
2132	ret
2133.size	ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
2134___
2135}
2136&gen_add("q");
2137
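# gen_add_affine() emits the mixed-addition flavour: the second input is a
# P256_POINT_AFFINE with Z2 implicitly equal to 1, so the Z2^2/Z2^3 terms of
# the full addition drop out (U1 = X1 and S1 = Y1).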
2138sub gen_add_affine () {
2139    my $x = shift;
2140    my ($src0,$sfx,$bias);
2141    my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
2142	$res_x,$res_y,$res_z,
2143	$in1_x,$in1_y,$in1_z,
2144	$in2_x,$in2_y)=map(32*$_,(0..14));
2145    my $Z1sqr = $S2;
2146
2147    if ($x ne "x") {
2148	$src0 = "%rax";
2149	$sfx  = "";
2150	$bias = 0;
2151
2152$code.=<<___;
2153.globl	ecp_nistz256_point_add_affine
2154.type	ecp_nistz256_point_add_affine,\@function,3
2155.align	32
2156ecp_nistz256_point_add_affine:
2157___
2158$code.=<<___	if ($addx);
	mov	\$0x80100, %ecx
	and	OPENSSL_ia32cap_P+8(%rip), %ecx
	cmp	\$0x80100, %ecx
	je	.Lpoint_add_affinex
___
    } else {
	$src0 = "%rdx";
	$sfx  = "x";
	$bias = 128;

$code.=<<___;
.type	ecp_nistz256_point_add_affinex,\@function,3
.align	32
ecp_nistz256_point_add_affinex:
.Lpoint_add_affinex:
___
    }
$code.=<<___;
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	sub	\$32*15+8, %rsp

	movdqu	0x00($a_ptr), %xmm0	# copy	*(P256_POINT *)$a_ptr
	mov	$b_org, $b_ptr		# reassign
	movdqu	0x10($a_ptr), %xmm1
	movdqu	0x20($a_ptr), %xmm2
	movdqu	0x30($a_ptr), %xmm3
	movdqu	0x40($a_ptr), %xmm4
	movdqu	0x50($a_ptr), %xmm5
	 mov	0x40+8*0($a_ptr), $src0	# load original in1_z
	 mov	0x40+8*1($a_ptr), $acc6
	 mov	0x40+8*2($a_ptr), $acc7
	 mov	0x40+8*3($a_ptr), $acc0
	movdqa	%xmm0, $in1_x(%rsp)
	movdqa	%xmm1, $in1_x+0x10(%rsp)
	movdqa	%xmm2, $in1_y(%rsp)
	movdqa	%xmm3, $in1_y+0x10(%rsp)
	movdqa	%xmm4, $in1_z(%rsp)
	movdqa	%xmm5, $in1_z+0x10(%rsp)
	por	%xmm4, %xmm5

	movdqu	0x00($b_ptr), %xmm0	# copy	*(P256_POINT_AFFINE *)$b_ptr
	 pshufd	\$0xb1, %xmm5, %xmm3
	movdqu	0x10($b_ptr), %xmm1
	movdqu	0x20($b_ptr), %xmm2
	 por	%xmm3, %xmm5
	movdqu	0x30($b_ptr), %xmm3
	movdqa	%xmm0, $in2_x(%rsp)
	 pshufd	\$0x1e, %xmm5, %xmm4
	movdqa	%xmm1, $in2_x+0x10(%rsp)
	por	%xmm0, %xmm1
	 movq	$r_ptr, %xmm0		# save $r_ptr
	movdqa	%xmm2, $in2_y(%rsp)
	movdqa	%xmm3, $in2_y+0x10(%rsp)
	por	%xmm2, %xmm3
	 por	%xmm4, %xmm5
	 pxor	%xmm4, %xmm4
	por	%xmm1, %xmm3

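	# The calls below evaluate the usual mixed Jacobian+affine addition,
	# with in1 = (X1,Y1,Z1) and in2 = (X2,Y2), Z2 implicitly 1, all
	# values in the Montgomery domain:
	#
	#	U2 = X2*Z1^2,  S2 = Y2*Z1^3,  H = U2 - X1,  R = S2 - Y1
	#	X3 = R^2 - H^3 - 2*X1*H^2
	#	Y3 = R*(X1*H^2 - X3) - Y1*H^3
	#	Z3 = H*Z1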
	lea	0x40-$bias($a_ptr), $a_ptr	# $a_ptr is still valid
	lea	$Z1sqr(%rsp), $r_ptr		# Z1^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Z1sqr, in1_z);

	pcmpeqd	%xmm4, %xmm5
	pshufd	\$0xb1, %xmm3, %xmm4
	 mov	0x00($b_ptr), $src0		# $b_ptr is still valid
	 #lea	0x00($b_ptr), $b_ptr
	 mov	$acc4, $acc1			# harmonize sqr output and mul input
	por	%xmm3, %xmm4
	pshufd	\$0, %xmm5, %xmm5		# in1infty
	pshufd	\$0x1e, %xmm4, %xmm3
	 mov	$acc5, $acc2
	por	%xmm3, %xmm4
	pxor	%xmm3, %xmm3
	 mov	$acc6, $acc3
	pcmpeqd	%xmm3, %xmm4
	pshufd	\$0, %xmm4, %xmm4		# in2infty
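	# At this point xmm5 is an all-ones mask iff in1_z == 0 (in1 is the
	# point at infinity), and xmm4 is an all-ones mask iff every word of
	# in2_x|in2_y is zero, the encoding used for an affine point at
	# infinity.  Both masks feed the copy_conditional stores at the end.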

	lea	$Z1sqr-$bias(%rsp), $a_ptr
	mov	$acc7, $acc4
	lea	$U2(%rsp), $r_ptr		# U2 = X2*Z1^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, Z1sqr, in2_x);

	lea	$in1_x(%rsp), $b_ptr
	lea	$H(%rsp), $r_ptr		# H = U2 - U1
	call	__ecp_nistz256_sub_from$x	# p256_sub(H, U2, in1_x);

	`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Z1sqr, in1_z);

	`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
	lea	$res_z(%rsp), $r_ptr		# Z3 = H*Z1*Z2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(res_z, H, in1_z);

	`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr		# S2 = Y2*Z1^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, S2, in2_y);

	lea	$in1_y(%rsp), $b_ptr
	lea	$R(%rsp), $r_ptr		# R = S2 - S1
	call	__ecp_nistz256_sub_from$x	# p256_sub(R, S2, in1_y);

	`&load_for_sqr("$H(%rsp)", "$src0")`
	lea	$Hsqr(%rsp), $r_ptr		# H^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Hsqr, H);

	`&load_for_sqr("$R(%rsp)", "$src0")`
	lea	$Rsqr(%rsp), $r_ptr		# R^2
	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(Rsqr, R);

	`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
	lea	$Hcub(%rsp), $r_ptr		# H^3
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(Hcub, Hsqr, H);

	`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
	lea	$U2(%rsp), $r_ptr		# U1*H^2
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(U2, in1_x, Hsqr);
___
{
#######################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
my ($poly1, $poly3)=($acc6,$acc7);
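# That is, acc0..acc3 now name the registers in which the preceding
# mul_mont left its result (acc4, acc5, acc0, acc1), so the inlined
# doubling below can work on U2 = X1*H^2 without reloading it from the
# stack.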

$code.=<<___;
	#lea	$U2(%rsp), $a_ptr
	#lea	$Hsqr(%rsp), $r_ptr	# 2*U1*H^2
	#call	__ecp_nistz256_mul_by_2	# ecp_nistz256_mul_by_2(Hsqr, U2);
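	#
	# The call above is kept only as documentation; it is inlined below:
	# double U2 (still live in registers), tentatively subtract the
	# modulus, and let cmovc keep the unreduced sum if that subtraction
	# borrowed.  The reduced 2*X1*H^2 is consumed directly by the
	# following __ecp_nistz256_sub call and never hits the Hsqr slot.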

	xor	$t4, $t4
	add	$acc0, $acc0		# a0:a3+a0:a3
	lea	$Rsqr(%rsp), $a_ptr
	adc	$acc1, $acc1
	 mov	$acc0, $t0
	adc	$acc2, $acc2
	adc	$acc3, $acc3
	 mov	$acc1, $t1
	adc	\$0, $t4

	sub	\$-1, $acc0
	 mov	$acc2, $t2
	sbb	$poly1, $acc1
	sbb	\$0, $acc2
	 mov	$acc3, $t3
	sbb	$poly3, $acc3
	sbb	\$0, $t4

	cmovc	$t0, $acc0
	mov	8*0($a_ptr), $t0
	cmovc	$t1, $acc1
	mov	8*1($a_ptr), $t1
	cmovc	$t2, $acc2
	mov	8*2($a_ptr), $t2
	cmovc	$t3, $acc3
	mov	8*3($a_ptr), $t3

	call	__ecp_nistz256_sub$x		# p256_sub(res_x, Rsqr, Hsqr);

	lea	$Hcub(%rsp), $b_ptr
	lea	$res_x(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_x, res_x, Hcub);

	mov	$U2+8*0(%rsp), $t0
	mov	$U2+8*1(%rsp), $t1
	mov	$U2+8*2(%rsp), $t2
	mov	$U2+8*3(%rsp), $t3
	lea	$H(%rsp), $r_ptr

	call	__ecp_nistz256_sub$x		# p256_sub(H, U2, res_x);

	mov	$acc0, 8*0($r_ptr)		# save the result, as
	mov	$acc1, 8*1($r_ptr)		# __ecp_nistz256_sub doesn't
	mov	$acc2, 8*2($r_ptr)		# write its result to memory,
	mov	$acc3, 8*3($r_ptr)		# but leaves it in registers
___
}
$code.=<<___;
	`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
	lea	$S2(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(S2, Hcub, in1_y);

	`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
	lea	$H(%rsp), $r_ptr
	call	__ecp_nistz256_mul_mont$x	# p256_mul_mont(H, H, R);

	lea	$S2(%rsp), $b_ptr
	lea	$res_y(%rsp), $r_ptr
	call	__ecp_nistz256_sub_from$x	# p256_sub(res_y, H, S2);

	movq	%xmm0, $r_ptr		# restore $r_ptr

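	# Branchless fix-up as in point_add, with one difference: if in1 is
	# the point at infinity the result is in2 promoted to Jacobian form,
	# so res_z is replaced by .LONE_mont (1 in the Montgomery domain),
	# since the affine in2 carries no Z coordinate of its own.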
	movdqa	%xmm5, %xmm0		# copy_conditional(res_z, ONE, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_z(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_z+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	.LONE_mont(%rip), %xmm2
	pand	.LONE_mont+0x10(%rip), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_z, in1_z, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_z(%rsp), %xmm2
	pand	$in1_z+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x40($r_ptr)
	movdqu	%xmm3, 0x50($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_x, in2_x, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_x(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_x+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_x(%rsp), %xmm2
	pand	$in2_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_x, in1_x, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_x(%rsp), %xmm2
	pand	$in1_x+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x00($r_ptr)
	movdqu	%xmm3, 0x10($r_ptr)

	movdqa	%xmm5, %xmm0		# copy_conditional(res_y, in2_y, in1infty);
	movdqa	%xmm5, %xmm1
	pandn	$res_y(%rsp), %xmm0
	movdqa	%xmm5, %xmm2
	pandn	$res_y+0x10(%rsp), %xmm1
	movdqa	%xmm5, %xmm3
	pand	$in2_y(%rsp), %xmm2
	pand	$in2_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3

	movdqa	%xmm4, %xmm0		# copy_conditional(res_y, in1_y, in2infty);
	movdqa	%xmm4, %xmm1
	pandn	%xmm2, %xmm0
	movdqa	%xmm4, %xmm2
	pandn	%xmm3, %xmm1
	movdqa	%xmm4, %xmm3
	pand	$in1_y(%rsp), %xmm2
	pand	$in1_y+0x10(%rsp), %xmm3
	por	%xmm0, %xmm2
	por	%xmm1, %xmm3
	movdqu	%xmm2, 0x20($r_ptr)
	movdqu	%xmm3, 0x30($r_ptr)

	add	\$32*15+8, %rsp
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
.size	ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
___
}
&gen_add_affine("q");

########################################################################
# AD*X magic
#
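# When $addx is set, "x"-suffixed flavours of the point routines are
# generated as well.  They rely on the helper subroutines emitted below;
# the add/sub/mul_by_2 helpers themselves still use plain adc/sbb, as only
# the multiplication and squaring paths benefit from ADCX/ADOX and MULX.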
if ($addx) {								{
########################################################################
# operate in 4-5-0-1 "name space" that matches multiplication output
#
my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);

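# The four helpers emitted below share one branchless reduction idiom:
# do the add/sub, speculatively apply the modulus in the opposite
# direction, and use the recorded carry/borrow with cmovc/cmovnc to keep
# whichever of the two candidates lies in [0, p).  Roughly (editorial
# sketch, not generated code):
#	sub:  r = a - b;  if the subtraction borrowed, r += p
#	add:  r = a + b;  if r - p does not borrow,    r -= p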
$code.=<<___;
.type	__ecp_nistz256_add_tox,\@abi-omnipotent
.align	32
__ecp_nistz256_add_tox:
	xor	$t4, $t4
	adc	8*0($b_ptr), $a0
	adc	8*1($b_ptr), $a1
	 mov	$a0, $t0
	adc	8*2($b_ptr), $a2
	adc	8*3($b_ptr), $a3
	 mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	 mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	 mov	$a3, $t3
	sbb	$poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox

.type	__ecp_nistz256_sub_fromx,\@abi-omnipotent
.align	32
__ecp_nistz256_sub_fromx:
	xor	$t4, $t4
	sbb	8*0($b_ptr), $a0
	sbb	8*1($b_ptr), $a1
	 mov	$a0, $t0
	sbb	8*2($b_ptr), $a2
	sbb	8*3($b_ptr), $a3
	 mov	$a1, $t1
	sbb	\$0, $t4

	xor	$t3, $t3
	adc	\$-1, $a0
	 mov	$a2, $t2
	adc	$poly1, $a1
	adc	\$0, $a2
	 mov	$a3, $t3
	adc	$poly3, $a3

	bt	\$0, $t4
	cmovnc	$t0, $a0
	cmovnc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovnc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovnc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx

.type	__ecp_nistz256_subx,\@abi-omnipotent
.align	32
__ecp_nistz256_subx:
	xor	$t4, $t4
	sbb	$a0, $t0
	sbb	$a1, $t1
	 mov	$t0, $a0
	sbb	$a2, $t2
	sbb	$a3, $t3
	 mov	$t1, $a1
	sbb	\$0, $t4

	xor	$a3, $a3
	adc	\$-1, $t0
	 mov	$t2, $a2
	adc	$poly1, $t1
	adc	\$0, $t2
	 mov	$t3, $a3
	adc	$poly3, $t3

	bt	\$0, $t4
	cmovc	$t0, $a0
	cmovc	$t1, $a1
	cmovc	$t2, $a2
	cmovc	$t3, $a3

	ret
.size	__ecp_nistz256_subx,.-__ecp_nistz256_subx

.type	__ecp_nistz256_mul_by_2x,\@abi-omnipotent
.align	32
__ecp_nistz256_mul_by_2x:
	xor	$t4, $t4
	adc	$a0, $a0		# a0:a3+a0:a3
	adc	$a1, $a1
	 mov	$a0, $t0
	adc	$a2, $a2
	adc	$a3, $a3
	 mov	$a1, $t1
	adc	\$0, $t4

	xor	$t3, $t3
	sbb	\$-1, $a0
	 mov	$a2, $t2
	sbb	$poly1, $a1
	sbb	\$0, $a2
	 mov	$a3, $t3
	sbb	$poly3, $a3
	sbb	\$0, $t4

	cmovc	$t0, $a0
	cmovc	$t1, $a1
	mov	$a0, 8*0($r_ptr)
	cmovc	$t2, $a2
	mov	$a1, 8*1($r_ptr)
	cmovc	$t3, $a3
	mov	$a2, 8*2($r_ptr)
	mov	$a3, 8*3($r_ptr)

	ret
.size	__ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
___
									}
&gen_double("x");
&gen_add("x");
&gen_add_affine("x");
}
}}}

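# The substitution below expands the backtick-quoted Perl snippets that are
# embedded in the assembly templates (the &load_for_mul/&load_for_sqr
# helpers) by evaluating them and splicing in their return values, after
# which the finished code is printed.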
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;
