1#!/usr/bin/env perl
2
3##############################################################################
4#                                                                            #
5#  Copyright (c) 2012, Intel Corporation                                     #
6#                                                                            #
7#  All rights reserved.                                                      #
8#                                                                            #
9#  Redistribution and use in source and binary forms, with or without        #
10#  modification, are permitted provided that the following conditions are    #
11#  met:                                                                      #
12#                                                                            #
13#  *  Redistributions of source code must retain the above copyright         #
14#     notice, this list of conditions and the following disclaimer.          #
15#                                                                            #
16#  *  Redistributions in binary form must reproduce the above copyright      #
17#     notice, this list of conditions and the following disclaimer in the    #
18#     documentation and/or other materials provided with the                 #
19#     distribution.                                                          #
20#                                                                            #
21#  *  Neither the name of the Intel Corporation nor the names of its         #
22#     contributors may be used to endorse or promote products derived from   #
23#     this software without specific prior written permission.               #
24#                                                                            #
25#                                                                            #
26#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY            #
27#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
28#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
29#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
30#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
31#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
32#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
33#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
34#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
35#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
36#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
37#                                                                            #
38##############################################################################
39# Developers and authors:                                                    #
40# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
41# (1) Intel Corporation, Israel Development Center, Haifa, Israel            #
42# (2) University of Haifa, Israel                                            #
43##############################################################################
44# Reference:                                                                 #
45# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular             #
46#     Exponentiation,  Using Advanced Vector Instructions Architectures",    #
47#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,   #
48#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012              #
49# [2] S. Gueron: "Efficient Software Implementations of Modular              #
50#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).  #
51# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE        #
52#     Proceedings of 9th International Conference on Information Technology: #
53#     New Generations (ITNG 2012), pp.821-823 (2012)                         #
54# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
55#     resistant 1024-bit modular exponentiation, for optimizing RSA2048      #
56#     on AVX2 capable x86_64 platforms",                                     #
57#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
58##############################################################################
59#
60# +13% improvement over original submission by <appro@openssl.org>
61#
62# rsa2048 sign/sec	OpenSSL 1.0.1	scalar(*)	this
63# 2.3GHz Haswell	621		765/+23%	1113/+79%
64# 2.3GHz Broadwell(**)	688		1200(***)/+74%	1120/+63%
65#
66# (*)	if system doesn't support AVX2, for reference purposes;
67# (**)	scaled to 2.3GHz to simplify comparison;
68# (***)	scalar AD*X code is faster than AVX2 and is preferred code
69#	path for Broadwell;
70
71$flavour = shift;
72$output  = shift;
73if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
74
75$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
76
77$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
78( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
79( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
80die "can't locate x86_64-xlate.pl";
81
82# In upstream, this is controlled by shelling out to the compiler to check
83# versions, but BoringSSL is intended to be used with pre-generated perlasm
84# output, so this isn't useful anyway.
85#
86# TODO(davidben): Enable these after testing. $avx goes up to 2 and $addx to 1.
87$avx = 2;
88$addx = 1;
89
90open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
91*STDOUT = *OUT;
92
93if ($avx>1) {{{
94{ # void AMS_WW(
95my $rp="%rdi";	# BN_ULONG *rp,
96my $ap="%rsi";	# const BN_ULONG *ap,
97my $np="%rdx";	# const BN_ULONG *np,
98my $n0="%ecx";	# const BN_ULONG n0,
99my $rep="%r8d";	# int repeat);
100
101# The registers that hold the accumulated redundant result
102# The AMM works on 1024 bit operands, and redundant word size is 29
103# Therefore: ceil(1024/29)/4 = 9
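# (That is: splitting 1024 bits into 29-bit limbs gives ceil(1024/29) = 36
#  limbs, and with four 64-bit lanes per ymm register those 36 limbs fill
#  36/4 = 9 registers.)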
104my $ACC0="%ymm0";
105my $ACC1="%ymm1";
106my $ACC2="%ymm2";
107my $ACC3="%ymm3";
108my $ACC4="%ymm4";
109my $ACC5="%ymm5";
110my $ACC6="%ymm6";
111my $ACC7="%ymm7";
112my $ACC8="%ymm8";
113my $ACC9="%ymm9";
114# Registers that hold the broadcasted words of bp, currently used
115my $B1="%ymm10";
116my $B2="%ymm11";
117# Registers that hold the broadcasted words of Y, currently used
118my $Y1="%ymm12";
119my $Y2="%ymm13";
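# (Y holds the per-digit Montgomery quotient: the scalar code below computes
#  y = (acc * n0) mod 2^29 for each digit and broadcasts it, so that y times
#  the modulus can be added in to clear the low 29 bits of the accumulator.)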
120# Helper registers
121my $TEMP1="%ymm14";
122my $AND_MASK="%ymm15";
123# alu registers that hold the first words of the ACC
124my $r0="%r9";
125my $r1="%r10";
126my $r2="%r11";
127my $r3="%r12";
128
129my $i="%r14d";			# loop counter
130my $tmp = "%r15";
131
132my $FrameSize=32*18+32*8;	# place for A^2 and 2*A
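# (32*18 bytes hold the up-to-72-limb square, i.e. 18 ymm-sized slots, and
#  32*8 bytes hold the doubled copy of limbs 4..35 of A that the Variant B
#  squaring uses; see the comment at .LOOP_GRANDE_SQR_1024 below.)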
133
134my $aap=$r0;
135my $tp0="%rbx";
136my $tp1=$r3;
137my $tpa=$tmp;
138
139$np="%r13";			# reassigned argument
140
141$code.=<<___;
142.text
143
144.globl	rsaz_1024_sqr_avx2
145.type	rsaz_1024_sqr_avx2,\@function,5
146.align	64
147rsaz_1024_sqr_avx2:		# 702 cycles, 14% faster than rsaz_1024_mul_avx2
148.cfi_startproc
149	lea	(%rsp), %rax
150.cfi_def_cfa_register	%rax
151	push	%rbx
152.cfi_push	%rbx
153	push	%rbp
154.cfi_push	%rbp
155	push	%r12
156.cfi_push	%r12
157	push	%r13
158.cfi_push	%r13
159	push	%r14
160.cfi_push	%r14
161	push	%r15
162.cfi_push	%r15
163	vzeroupper
164___
165$code.=<<___ if ($win64);
166	lea	-0xa8(%rsp),%rsp
167	vmovaps	%xmm6,-0xd8(%rax)
168	vmovaps	%xmm7,-0xc8(%rax)
169	vmovaps	%xmm8,-0xb8(%rax)
170	vmovaps	%xmm9,-0xa8(%rax)
171	vmovaps	%xmm10,-0x98(%rax)
172	vmovaps	%xmm11,-0x88(%rax)
173	vmovaps	%xmm12,-0x78(%rax)
174	vmovaps	%xmm13,-0x68(%rax)
175	vmovaps	%xmm14,-0x58(%rax)
176	vmovaps	%xmm15,-0x48(%rax)
177.Lsqr_1024_body:
178___
179$code.=<<___;
180	mov	%rax,%rbp
181.cfi_def_cfa_register	%rbp
182	mov	%rdx, $np			# reassigned argument
183	sub	\$$FrameSize, %rsp
184	mov	$np, $tmp
185	sub	\$-128, $rp			# size optimization
186	sub	\$-128, $ap
187	sub	\$-128, $np
188
189	and	\$4095, $tmp			# see if $np crosses page
190	add	\$32*10, $tmp
191	shr	\$12, $tmp
192	vpxor	$ACC9,$ACC9,$ACC9
193	jz	.Lsqr_1024_no_n_copy
194
195	# unaligned 256-bit load that crosses page boundary can
196	# cause >2x performance degradation here, so if $np does
197	# cross page boundary, copy it to stack and make sure stack
198	# frame doesn't...
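	# (the check above works because ($np & 4095) + 32*10 still
	# has a nonzero bit after the shift by 12 exactly when the
	# 32*10-byte window starting at $np spills into the next
	# 4K page)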
199	sub		\$32*10,%rsp
200	vmovdqu		32*0-128($np), $ACC0
201	and		\$-2048, %rsp
202	vmovdqu		32*1-128($np), $ACC1
203	vmovdqu		32*2-128($np), $ACC2
204	vmovdqu		32*3-128($np), $ACC3
205	vmovdqu		32*4-128($np), $ACC4
206	vmovdqu		32*5-128($np), $ACC5
207	vmovdqu		32*6-128($np), $ACC6
208	vmovdqu		32*7-128($np), $ACC7
209	vmovdqu		32*8-128($np), $ACC8
210	lea		$FrameSize+128(%rsp),$np
211	vmovdqu		$ACC0, 32*0-128($np)
212	vmovdqu		$ACC1, 32*1-128($np)
213	vmovdqu		$ACC2, 32*2-128($np)
214	vmovdqu		$ACC3, 32*3-128($np)
215	vmovdqu		$ACC4, 32*4-128($np)
216	vmovdqu		$ACC5, 32*5-128($np)
217	vmovdqu		$ACC6, 32*6-128($np)
218	vmovdqu		$ACC7, 32*7-128($np)
219	vmovdqu		$ACC8, 32*8-128($np)
220	vmovdqu		$ACC9, 32*9-128($np)	# $ACC9 is zero
221
222.Lsqr_1024_no_n_copy:
223	and		\$-1024, %rsp
224
225	vmovdqu		32*1-128($ap), $ACC1
226	vmovdqu		32*2-128($ap), $ACC2
227	vmovdqu		32*3-128($ap), $ACC3
228	vmovdqu		32*4-128($ap), $ACC4
229	vmovdqu		32*5-128($ap), $ACC5
230	vmovdqu		32*6-128($ap), $ACC6
231	vmovdqu		32*7-128($ap), $ACC7
232	vmovdqu		32*8-128($ap), $ACC8
233
234	lea	192(%rsp), $tp0			# 64+128=192
235	vpbroadcastq	.Land_mask(%rip), $AND_MASK
236	jmp	.LOOP_GRANDE_SQR_1024
237
238.align	32
239.LOOP_GRANDE_SQR_1024:
240	lea	32*18+128(%rsp), $aap		# size optimization
241	lea	448(%rsp), $tp1			# 64+128+256=448
242
243	# the squaring is performed as described in Variant B of
244	# "Speeding up Big-Number Squaring", so start by calculating
245	# the A*2=A+A vector
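	# (the idea: each cross product a[i]*a[j], i!=j, appears twice
	# in A^2, so it is computed once as a[i]*(2*a[j]) against the
	# doubled vector, roughly halving the number of multiplications)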
246	vpaddq		$ACC1, $ACC1, $ACC1
247	 vpbroadcastq	32*0-128($ap), $B1
248	vpaddq		$ACC2, $ACC2, $ACC2
249	vmovdqa		$ACC1, 32*0-128($aap)
250	vpaddq		$ACC3, $ACC3, $ACC3
251	vmovdqa		$ACC2, 32*1-128($aap)
252	vpaddq		$ACC4, $ACC4, $ACC4
253	vmovdqa		$ACC3, 32*2-128($aap)
254	vpaddq		$ACC5, $ACC5, $ACC5
255	vmovdqa		$ACC4, 32*3-128($aap)
256	vpaddq		$ACC6, $ACC6, $ACC6
257	vmovdqa		$ACC5, 32*4-128($aap)
258	vpaddq		$ACC7, $ACC7, $ACC7
259	vmovdqa		$ACC6, 32*5-128($aap)
260	vpaddq		$ACC8, $ACC8, $ACC8
261	vmovdqa		$ACC7, 32*6-128($aap)
262	vpxor		$ACC9, $ACC9, $ACC9
263	vmovdqa		$ACC8, 32*7-128($aap)
264
265	vpmuludq	32*0-128($ap), $B1, $ACC0
266	 vpbroadcastq	32*1-128($ap), $B2
267	 vmovdqu	$ACC9, 32*9-192($tp0)	# zero upper half
268	vpmuludq	$B1, $ACC1, $ACC1
269	 vmovdqu	$ACC9, 32*10-448($tp1)
270	vpmuludq	$B1, $ACC2, $ACC2
271	 vmovdqu	$ACC9, 32*11-448($tp1)
272	vpmuludq	$B1, $ACC3, $ACC3
273	 vmovdqu	$ACC9, 32*12-448($tp1)
274	vpmuludq	$B1, $ACC4, $ACC4
275	 vmovdqu	$ACC9, 32*13-448($tp1)
276	vpmuludq	$B1, $ACC5, $ACC5
277	 vmovdqu	$ACC9, 32*14-448($tp1)
278	vpmuludq	$B1, $ACC6, $ACC6
279	 vmovdqu	$ACC9, 32*15-448($tp1)
280	vpmuludq	$B1, $ACC7, $ACC7
281	 vmovdqu	$ACC9, 32*16-448($tp1)
282	vpmuludq	$B1, $ACC8, $ACC8
283	 vpbroadcastq	32*2-128($ap), $B1
284	 vmovdqu	$ACC9, 32*17-448($tp1)
285
286	mov	$ap, $tpa
287	mov 	\$4, $i
288	jmp	.Lsqr_entry_1024
289___
290$TEMP0=$Y1;
291$TEMP2=$Y2;
292$code.=<<___;
293.align	32
294.LOOP_SQR_1024:
295	 vpbroadcastq	32*1-128($tpa), $B2
296	vpmuludq	32*0-128($ap), $B1, $ACC0
297	vpaddq		32*0-192($tp0), $ACC0, $ACC0
298	vpmuludq	32*0-128($aap), $B1, $ACC1
299	vpaddq		32*1-192($tp0), $ACC1, $ACC1
300	vpmuludq	32*1-128($aap), $B1, $ACC2
301	vpaddq		32*2-192($tp0), $ACC2, $ACC2
302	vpmuludq	32*2-128($aap), $B1, $ACC3
303	vpaddq		32*3-192($tp0), $ACC3, $ACC3
304	vpmuludq	32*3-128($aap), $B1, $ACC4
305	vpaddq		32*4-192($tp0), $ACC4, $ACC4
306	vpmuludq	32*4-128($aap), $B1, $ACC5
307	vpaddq		32*5-192($tp0), $ACC5, $ACC5
308	vpmuludq	32*5-128($aap), $B1, $ACC6
309	vpaddq		32*6-192($tp0), $ACC6, $ACC6
310	vpmuludq	32*6-128($aap), $B1, $ACC7
311	vpaddq		32*7-192($tp0), $ACC7, $ACC7
312	vpmuludq	32*7-128($aap), $B1, $ACC8
313	 vpbroadcastq	32*2-128($tpa), $B1
314	vpaddq		32*8-192($tp0), $ACC8, $ACC8
315.Lsqr_entry_1024:
316	vmovdqu		$ACC0, 32*0-192($tp0)
317	vmovdqu		$ACC1, 32*1-192($tp0)
318
319	vpmuludq	32*1-128($ap), $B2, $TEMP0
320	vpaddq		$TEMP0, $ACC2, $ACC2
321	vpmuludq	32*1-128($aap), $B2, $TEMP1
322	vpaddq		$TEMP1, $ACC3, $ACC3
323	vpmuludq	32*2-128($aap), $B2, $TEMP2
324	vpaddq		$TEMP2, $ACC4, $ACC4
325	vpmuludq	32*3-128($aap), $B2, $TEMP0
326	vpaddq		$TEMP0, $ACC5, $ACC5
327	vpmuludq	32*4-128($aap), $B2, $TEMP1
328	vpaddq		$TEMP1, $ACC6, $ACC6
329	vpmuludq	32*5-128($aap), $B2, $TEMP2
330	vpaddq		$TEMP2, $ACC7, $ACC7
331	vpmuludq	32*6-128($aap), $B2, $TEMP0
332	vpaddq		$TEMP0, $ACC8, $ACC8
333	vpmuludq	32*7-128($aap), $B2, $ACC0
334	 vpbroadcastq	32*3-128($tpa), $B2
335	vpaddq		32*9-192($tp0), $ACC0, $ACC0
336
337	vmovdqu		$ACC2, 32*2-192($tp0)
338	vmovdqu		$ACC3, 32*3-192($tp0)
339
340	vpmuludq	32*2-128($ap), $B1, $TEMP2
341	vpaddq		$TEMP2, $ACC4, $ACC4
342	vpmuludq	32*2-128($aap), $B1, $TEMP0
343	vpaddq		$TEMP0, $ACC5, $ACC5
344	vpmuludq	32*3-128($aap), $B1, $TEMP1
345	vpaddq		$TEMP1, $ACC6, $ACC6
346	vpmuludq	32*4-128($aap), $B1, $TEMP2
347	vpaddq		$TEMP2, $ACC7, $ACC7
348	vpmuludq	32*5-128($aap), $B1, $TEMP0
349	vpaddq		$TEMP0, $ACC8, $ACC8
350	vpmuludq	32*6-128($aap), $B1, $TEMP1
351	vpaddq		$TEMP1, $ACC0, $ACC0
352	vpmuludq	32*7-128($aap), $B1, $ACC1
353	 vpbroadcastq	32*4-128($tpa), $B1
354	vpaddq		32*10-448($tp1), $ACC1, $ACC1
355
356	vmovdqu		$ACC4, 32*4-192($tp0)
357	vmovdqu		$ACC5, 32*5-192($tp0)
358
359	vpmuludq	32*3-128($ap), $B2, $TEMP0
360	vpaddq		$TEMP0, $ACC6, $ACC6
361	vpmuludq	32*3-128($aap), $B2, $TEMP1
362	vpaddq		$TEMP1, $ACC7, $ACC7
363	vpmuludq	32*4-128($aap), $B2, $TEMP2
364	vpaddq		$TEMP2, $ACC8, $ACC8
365	vpmuludq	32*5-128($aap), $B2, $TEMP0
366	vpaddq		$TEMP0, $ACC0, $ACC0
367	vpmuludq	32*6-128($aap), $B2, $TEMP1
368	vpaddq		$TEMP1, $ACC1, $ACC1
369	vpmuludq	32*7-128($aap), $B2, $ACC2
370	 vpbroadcastq	32*5-128($tpa), $B2
371	vpaddq		32*11-448($tp1), $ACC2, $ACC2
372
373	vmovdqu		$ACC6, 32*6-192($tp0)
374	vmovdqu		$ACC7, 32*7-192($tp0)
375
376	vpmuludq	32*4-128($ap), $B1, $TEMP0
377	vpaddq		$TEMP0, $ACC8, $ACC8
378	vpmuludq	32*4-128($aap), $B1, $TEMP1
379	vpaddq		$TEMP1, $ACC0, $ACC0
380	vpmuludq	32*5-128($aap), $B1, $TEMP2
381	vpaddq		$TEMP2, $ACC1, $ACC1
382	vpmuludq	32*6-128($aap), $B1, $TEMP0
383	vpaddq		$TEMP0, $ACC2, $ACC2
384	vpmuludq	32*7-128($aap), $B1, $ACC3
385	 vpbroadcastq	32*6-128($tpa), $B1
386	vpaddq		32*12-448($tp1), $ACC3, $ACC3
387
388	vmovdqu		$ACC8, 32*8-192($tp0)
389	vmovdqu		$ACC0, 32*9-192($tp0)
390	lea		8($tp0), $tp0
391
392	vpmuludq	32*5-128($ap), $B2, $TEMP2
393	vpaddq		$TEMP2, $ACC1, $ACC1
394	vpmuludq	32*5-128($aap), $B2, $TEMP0
395	vpaddq		$TEMP0, $ACC2, $ACC2
396	vpmuludq	32*6-128($aap), $B2, $TEMP1
397	vpaddq		$TEMP1, $ACC3, $ACC3
398	vpmuludq	32*7-128($aap), $B2, $ACC4
399	 vpbroadcastq	32*7-128($tpa), $B2
400	vpaddq		32*13-448($tp1), $ACC4, $ACC4
401
402	vmovdqu		$ACC1, 32*10-448($tp1)
403	vmovdqu		$ACC2, 32*11-448($tp1)
404
405	vpmuludq	32*6-128($ap), $B1, $TEMP0
406	vpaddq		$TEMP0, $ACC3, $ACC3
407	vpmuludq	32*6-128($aap), $B1, $TEMP1
408	 vpbroadcastq	32*8-128($tpa), $ACC0		# borrow $ACC0 for $B1
409	vpaddq		$TEMP1, $ACC4, $ACC4
410	vpmuludq	32*7-128($aap), $B1, $ACC5
411	 vpbroadcastq	32*0+8-128($tpa), $B1		# for next iteration
412	vpaddq		32*14-448($tp1), $ACC5, $ACC5
413
414	vmovdqu		$ACC3, 32*12-448($tp1)
415	vmovdqu		$ACC4, 32*13-448($tp1)
416	lea		8($tpa), $tpa
417
418	vpmuludq	32*7-128($ap), $B2, $TEMP0
419	vpaddq		$TEMP0, $ACC5, $ACC5
420	vpmuludq	32*7-128($aap), $B2, $ACC6
421	vpaddq		32*15-448($tp1), $ACC6, $ACC6
422
423	vpmuludq	32*8-128($ap), $ACC0, $ACC7
424	vmovdqu		$ACC5, 32*14-448($tp1)
425	vpaddq		32*16-448($tp1), $ACC7, $ACC7
426	vmovdqu		$ACC6, 32*15-448($tp1)
427	vmovdqu		$ACC7, 32*16-448($tp1)
428	lea		8($tp1), $tp1
429
430	dec	$i
431	jnz	.LOOP_SQR_1024
432___
433$ZERO = $ACC9;
434$TEMP0 = $B1;
435$TEMP2 = $B2;
436$TEMP3 = $Y1;
437$TEMP4 = $Y2;
438$code.=<<___;
439	# we need to fix indices 32-39 to avoid overflow
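	# (carry-propagation idiom used throughout: vpsrlq takes the
	# bits above 29 in every lane, vpermq \$0x93 rotates those
	# carries up by one 64-bit lane, and vpblendd \$3 hands the
	# lane that wrapped around over to the next vector, feeding
	# zero in at the bottom of the chain, before the carries are
	# added back in)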
440	vmovdqu		32*8(%rsp), $ACC8		# 32*8-192($tp0),
441	vmovdqu		32*9(%rsp), $ACC1		# 32*9-192($tp0)
442	vmovdqu		32*10(%rsp), $ACC2		# 32*10-192($tp0)
443	lea		192(%rsp), $tp0			# 64+128=192
444
445	vpsrlq		\$29, $ACC8, $TEMP1
446	vpand		$AND_MASK, $ACC8, $ACC8
447	vpsrlq		\$29, $ACC1, $TEMP2
448	vpand		$AND_MASK, $ACC1, $ACC1
449
450	vpermq		\$0x93, $TEMP1, $TEMP1
451	vpxor		$ZERO, $ZERO, $ZERO
452	vpermq		\$0x93, $TEMP2, $TEMP2
453
454	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
455	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
456	vpaddq		$TEMP0, $ACC8, $ACC8
457	vpblendd	\$3, $TEMP2, $ZERO, $TEMP2
458	vpaddq		$TEMP1, $ACC1, $ACC1
459	vpaddq		$TEMP2, $ACC2, $ACC2
460	vmovdqu		$ACC1, 32*9-192($tp0)
461	vmovdqu		$ACC2, 32*10-192($tp0)
462
463	mov	(%rsp), %rax
464	mov	8(%rsp), $r1
465	mov	16(%rsp), $r2
466	mov	24(%rsp), $r3
467	vmovdqu	32*1(%rsp), $ACC1
468	vmovdqu	32*2-192($tp0), $ACC2
469	vmovdqu	32*3-192($tp0), $ACC3
470	vmovdqu	32*4-192($tp0), $ACC4
471	vmovdqu	32*5-192($tp0), $ACC5
472	vmovdqu	32*6-192($tp0), $ACC6
473	vmovdqu	32*7-192($tp0), $ACC7
474
475	mov	%rax, $r0
476	imull	$n0, %eax
477	and	\$0x1fffffff, %eax
478	vmovd	%eax, $Y1
479
480	mov	%rax, %rdx
481	imulq	-128($np), %rax
482	 vpbroadcastq	$Y1, $Y1
483	add	%rax, $r0
484	mov	%rdx, %rax
485	imulq	8-128($np), %rax
486	shr	\$29, $r0
487	add	%rax, $r1
488	mov	%rdx, %rax
489	imulq	16-128($np), %rax
490	add	$r0, $r1
491	add	%rax, $r2
492	imulq	24-128($np), %rdx
493	add	%rdx, $r3
494
495	mov	$r1, %rax
496	imull	$n0, %eax
497	and	\$0x1fffffff, %eax
498
499	mov \$9, $i
500	jmp .LOOP_REDUCE_1024
501
502.align	32
503.LOOP_REDUCE_1024:
504	vmovd	%eax, $Y2
505	vpbroadcastq	$Y2, $Y2
506
507	vpmuludq	32*1-128($np), $Y1, $TEMP0
508	 mov	%rax, %rdx
509	 imulq	-128($np), %rax
510	vpaddq		$TEMP0, $ACC1, $ACC1
511	 add	%rax, $r1
512	vpmuludq	32*2-128($np), $Y1, $TEMP1
513	 mov	%rdx, %rax
514	 imulq	8-128($np), %rax
515	vpaddq		$TEMP1, $ACC2, $ACC2
516	vpmuludq	32*3-128($np), $Y1, $TEMP2
517	 .byte	0x67
518	 add	%rax, $r2
519	 .byte	0x67
520	 mov	%rdx, %rax
521	 imulq	16-128($np), %rax
522	 shr	\$29, $r1
523	vpaddq		$TEMP2, $ACC3, $ACC3
524	vpmuludq	32*4-128($np), $Y1, $TEMP0
525	 add	%rax, $r3
526	 add	$r1, $r2
527	vpaddq		$TEMP0, $ACC4, $ACC4
528	vpmuludq	32*5-128($np), $Y1, $TEMP1
529	 mov	$r2, %rax
530	 imull	$n0, %eax
531	vpaddq		$TEMP1, $ACC5, $ACC5
532	vpmuludq	32*6-128($np), $Y1, $TEMP2
533	 and	\$0x1fffffff, %eax
534	vpaddq		$TEMP2, $ACC6, $ACC6
535	vpmuludq	32*7-128($np), $Y1, $TEMP0
536	vpaddq		$TEMP0, $ACC7, $ACC7
537	vpmuludq	32*8-128($np), $Y1, $TEMP1
538	 vmovd	%eax, $Y1
539	 #vmovdqu	32*1-8-128($np), $TEMP2		# moved below
540	vpaddq		$TEMP1, $ACC8, $ACC8
541	 #vmovdqu	32*2-8-128($np), $TEMP0		# moved below
542	 vpbroadcastq	$Y1, $Y1
543
544	vpmuludq	32*1-8-128($np), $Y2, $TEMP2	# see above
545	vmovdqu		32*3-8-128($np), $TEMP1
546	 mov	%rax, %rdx
547	 imulq	-128($np), %rax
548	vpaddq		$TEMP2, $ACC1, $ACC1
549	vpmuludq	32*2-8-128($np), $Y2, $TEMP0	# see above
550	vmovdqu		32*4-8-128($np), $TEMP2
551	 add	%rax, $r2
552	 mov	%rdx, %rax
553	 imulq	8-128($np), %rax
554	vpaddq		$TEMP0, $ACC2, $ACC2
555	 add	$r3, %rax
556	 shr	\$29, $r2
557	vpmuludq	$Y2, $TEMP1, $TEMP1
558	vmovdqu		32*5-8-128($np), $TEMP0
559	 add	$r2, %rax
560	vpaddq		$TEMP1, $ACC3, $ACC3
561	vpmuludq	$Y2, $TEMP2, $TEMP2
562	vmovdqu		32*6-8-128($np), $TEMP1
563	 .byte	0x67
564	 mov	%rax, $r3
565	 imull	$n0, %eax
566	vpaddq		$TEMP2, $ACC4, $ACC4
567	vpmuludq	$Y2, $TEMP0, $TEMP0
568	.byte	0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00	# vmovdqu		32*7-8-128($np), $TEMP2
569	 and	\$0x1fffffff, %eax
570	vpaddq		$TEMP0, $ACC5, $ACC5
571	vpmuludq	$Y2, $TEMP1, $TEMP1
572	vmovdqu		32*8-8-128($np), $TEMP0
573	vpaddq		$TEMP1, $ACC6, $ACC6
574	vpmuludq	$Y2, $TEMP2, $TEMP2
575	vmovdqu		32*9-8-128($np), $ACC9
576	 vmovd	%eax, $ACC0			# borrow ACC0 for Y2
577	 imulq	-128($np), %rax
578	vpaddq		$TEMP2, $ACC7, $ACC7
579	vpmuludq	$Y2, $TEMP0, $TEMP0
580	 vmovdqu	32*1-16-128($np), $TEMP1
581	 vpbroadcastq	$ACC0, $ACC0
582	vpaddq		$TEMP0, $ACC8, $ACC8
583	vpmuludq	$Y2, $ACC9, $ACC9
584	 vmovdqu	32*2-16-128($np), $TEMP2
585	 add	%rax, $r3
586
587___
588($ACC0,$Y2)=($Y2,$ACC0);
589$code.=<<___;
590	 vmovdqu	32*1-24-128($np), $ACC0
591	vpmuludq	$Y1, $TEMP1, $TEMP1
592	vmovdqu		32*3-16-128($np), $TEMP0
593	vpaddq		$TEMP1, $ACC1, $ACC1
594	 vpmuludq	$Y2, $ACC0, $ACC0
595	vpmuludq	$Y1, $TEMP2, $TEMP2
596	.byte	0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff	# vmovdqu		32*4-16-128($np), $TEMP1
597	 vpaddq		$ACC1, $ACC0, $ACC0
598	vpaddq		$TEMP2, $ACC2, $ACC2
599	vpmuludq	$Y1, $TEMP0, $TEMP0
600	vmovdqu		32*5-16-128($np), $TEMP2
601	 .byte	0x67
602	 vmovq		$ACC0, %rax
603	 vmovdqu	$ACC0, (%rsp)		# transfer $r0-$r3
604	vpaddq		$TEMP0, $ACC3, $ACC3
605	vpmuludq	$Y1, $TEMP1, $TEMP1
606	vmovdqu		32*6-16-128($np), $TEMP0
607	vpaddq		$TEMP1, $ACC4, $ACC4
608	vpmuludq	$Y1, $TEMP2, $TEMP2
609	vmovdqu		32*7-16-128($np), $TEMP1
610	vpaddq		$TEMP2, $ACC5, $ACC5
611	vpmuludq	$Y1, $TEMP0, $TEMP0
612	vmovdqu		32*8-16-128($np), $TEMP2
613	vpaddq		$TEMP0, $ACC6, $ACC6
614	vpmuludq	$Y1, $TEMP1, $TEMP1
615	 shr	\$29, $r3
616	vmovdqu		32*9-16-128($np), $TEMP0
617	 add	$r3, %rax
618	vpaddq		$TEMP1, $ACC7, $ACC7
619	vpmuludq	$Y1, $TEMP2, $TEMP2
620	 #vmovdqu	32*2-24-128($np), $TEMP1	# moved below
621	 mov	%rax, $r0
622	 imull	$n0, %eax
623	vpaddq		$TEMP2, $ACC8, $ACC8
624	vpmuludq	$Y1, $TEMP0, $TEMP0
625	 and	\$0x1fffffff, %eax
626	 vmovd	%eax, $Y1
627	 vmovdqu	32*3-24-128($np), $TEMP2
628	.byte	0x67
629	vpaddq		$TEMP0, $ACC9, $ACC9
630	 vpbroadcastq	$Y1, $Y1
631
632	vpmuludq	32*2-24-128($np), $Y2, $TEMP1	# see above
633	vmovdqu		32*4-24-128($np), $TEMP0
634	 mov	%rax, %rdx
635	 imulq	-128($np), %rax
636	 mov	8(%rsp), $r1
637	vpaddq		$TEMP1, $ACC2, $ACC1
638	vpmuludq	$Y2, $TEMP2, $TEMP2
639	vmovdqu		32*5-24-128($np), $TEMP1
640	 add	%rax, $r0
641	 mov	%rdx, %rax
642	 imulq	8-128($np), %rax
643	 .byte	0x67
644	 shr	\$29, $r0
645	 mov	16(%rsp), $r2
646	vpaddq		$TEMP2, $ACC3, $ACC2
647	vpmuludq	$Y2, $TEMP0, $TEMP0
648	vmovdqu		32*6-24-128($np), $TEMP2
649	 add	%rax, $r1
650	 mov	%rdx, %rax
651	 imulq	16-128($np), %rax
652	vpaddq		$TEMP0, $ACC4, $ACC3
653	vpmuludq	$Y2, $TEMP1, $TEMP1
654	vmovdqu		32*7-24-128($np), $TEMP0
655	 imulq	24-128($np), %rdx		# future $r3
656	 add	%rax, $r2
657	 lea	($r0,$r1), %rax
658	vpaddq		$TEMP1, $ACC5, $ACC4
659	vpmuludq	$Y2, $TEMP2, $TEMP2
660	vmovdqu		32*8-24-128($np), $TEMP1
661	 mov	%rax, $r1
662	 imull	$n0, %eax
663	vpmuludq	$Y2, $TEMP0, $TEMP0
664	vpaddq		$TEMP2, $ACC6, $ACC5
665	vmovdqu		32*9-24-128($np), $TEMP2
666	 and	\$0x1fffffff, %eax
667	vpaddq		$TEMP0, $ACC7, $ACC6
668	vpmuludq	$Y2, $TEMP1, $TEMP1
669	 add	24(%rsp), %rdx
670	vpaddq		$TEMP1, $ACC8, $ACC7
671	vpmuludq	$Y2, $TEMP2, $TEMP2
672	vpaddq		$TEMP2, $ACC9, $ACC8
673	 vmovq	$r3, $ACC9
674	 mov	%rdx, $r3
675
676	dec	$i
677	jnz	.LOOP_REDUCE_1024
678___
679($ACC0,$Y2)=($Y2,$ACC0);
680$code.=<<___;
681	lea	448(%rsp), $tp1			# size optimization
682	vpaddq	$ACC9, $Y2, $ACC0
683	vpxor	$ZERO, $ZERO, $ZERO
684
685	vpaddq		32*9-192($tp0), $ACC0, $ACC0
686	vpaddq		32*10-448($tp1), $ACC1, $ACC1
687	vpaddq		32*11-448($tp1), $ACC2, $ACC2
688	vpaddq		32*12-448($tp1), $ACC3, $ACC3
689	vpaddq		32*13-448($tp1), $ACC4, $ACC4
690	vpaddq		32*14-448($tp1), $ACC5, $ACC5
691	vpaddq		32*15-448($tp1), $ACC6, $ACC6
692	vpaddq		32*16-448($tp1), $ACC7, $ACC7
693	vpaddq		32*17-448($tp1), $ACC8, $ACC8
694
695	vpsrlq		\$29, $ACC0, $TEMP1
696	vpand		$AND_MASK, $ACC0, $ACC0
697	vpsrlq		\$29, $ACC1, $TEMP2
698	vpand		$AND_MASK, $ACC1, $ACC1
699	vpsrlq		\$29, $ACC2, $TEMP3
700	vpermq		\$0x93, $TEMP1, $TEMP1
701	vpand		$AND_MASK, $ACC2, $ACC2
702	vpsrlq		\$29, $ACC3, $TEMP4
703	vpermq		\$0x93, $TEMP2, $TEMP2
704	vpand		$AND_MASK, $ACC3, $ACC3
705	vpermq		\$0x93, $TEMP3, $TEMP3
706
707	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
708	vpermq		\$0x93, $TEMP4, $TEMP4
709	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
710	vpaddq		$TEMP0, $ACC0, $ACC0
711	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
712	vpaddq		$TEMP1, $ACC1, $ACC1
713	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
714	vpaddq		$TEMP2, $ACC2, $ACC2
715	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
716	vpaddq		$TEMP3, $ACC3, $ACC3
717	vpaddq		$TEMP4, $ACC4, $ACC4
718
719	vpsrlq		\$29, $ACC0, $TEMP1
720	vpand		$AND_MASK, $ACC0, $ACC0
721	vpsrlq		\$29, $ACC1, $TEMP2
722	vpand		$AND_MASK, $ACC1, $ACC1
723	vpsrlq		\$29, $ACC2, $TEMP3
724	vpermq		\$0x93, $TEMP1, $TEMP1
725	vpand		$AND_MASK, $ACC2, $ACC2
726	vpsrlq		\$29, $ACC3, $TEMP4
727	vpermq		\$0x93, $TEMP2, $TEMP2
728	vpand		$AND_MASK, $ACC3, $ACC3
729	vpermq		\$0x93, $TEMP3, $TEMP3
730
731	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
732	vpermq		\$0x93, $TEMP4, $TEMP4
733	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
734	vpaddq		$TEMP0, $ACC0, $ACC0
735	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
736	vpaddq		$TEMP1, $ACC1, $ACC1
737	vmovdqu		$ACC0, 32*0-128($rp)
738	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
739	vpaddq		$TEMP2, $ACC2, $ACC2
740	vmovdqu		$ACC1, 32*1-128($rp)
741	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
742	vpaddq		$TEMP3, $ACC3, $ACC3
743	vmovdqu		$ACC2, 32*2-128($rp)
744	vpaddq		$TEMP4, $ACC4, $ACC4
745	vmovdqu		$ACC3, 32*3-128($rp)
746___
747$TEMP5=$ACC0;
748$code.=<<___;
749	vpsrlq		\$29, $ACC4, $TEMP1
750	vpand		$AND_MASK, $ACC4, $ACC4
751	vpsrlq		\$29, $ACC5, $TEMP2
752	vpand		$AND_MASK, $ACC5, $ACC5
753	vpsrlq		\$29, $ACC6, $TEMP3
754	vpermq		\$0x93, $TEMP1, $TEMP1
755	vpand		$AND_MASK, $ACC6, $ACC6
756	vpsrlq		\$29, $ACC7, $TEMP4
757	vpermq		\$0x93, $TEMP2, $TEMP2
758	vpand		$AND_MASK, $ACC7, $ACC7
759	vpsrlq		\$29, $ACC8, $TEMP5
760	vpermq		\$0x93, $TEMP3, $TEMP3
761	vpand		$AND_MASK, $ACC8, $ACC8
762	vpermq		\$0x93, $TEMP4, $TEMP4
763
764	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
765	vpermq		\$0x93, $TEMP5, $TEMP5
766	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
767	vpaddq		$TEMP0, $ACC4, $ACC4
768	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
769	vpaddq		$TEMP1, $ACC5, $ACC5
770	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
771	vpaddq		$TEMP2, $ACC6, $ACC6
772	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
773	vpaddq		$TEMP3, $ACC7, $ACC7
774	vpaddq		$TEMP4, $ACC8, $ACC8
775
776	vpsrlq		\$29, $ACC4, $TEMP1
777	vpand		$AND_MASK, $ACC4, $ACC4
778	vpsrlq		\$29, $ACC5, $TEMP2
779	vpand		$AND_MASK, $ACC5, $ACC5
780	vpsrlq		\$29, $ACC6, $TEMP3
781	vpermq		\$0x93, $TEMP1, $TEMP1
782	vpand		$AND_MASK, $ACC6, $ACC6
783	vpsrlq		\$29, $ACC7, $TEMP4
784	vpermq		\$0x93, $TEMP2, $TEMP2
785	vpand		$AND_MASK, $ACC7, $ACC7
786	vpsrlq		\$29, $ACC8, $TEMP5
787	vpermq		\$0x93, $TEMP3, $TEMP3
788	vpand		$AND_MASK, $ACC8, $ACC8
789	vpermq		\$0x93, $TEMP4, $TEMP4
790
791	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
792	vpermq		\$0x93, $TEMP5, $TEMP5
793	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
794	vpaddq		$TEMP0, $ACC4, $ACC4
795	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
796	vpaddq		$TEMP1, $ACC5, $ACC5
797	vmovdqu		$ACC4, 32*4-128($rp)
798	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
799	vpaddq		$TEMP2, $ACC6, $ACC6
800	vmovdqu		$ACC5, 32*5-128($rp)
801	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
802	vpaddq		$TEMP3, $ACC7, $ACC7
803	vmovdqu		$ACC6, 32*6-128($rp)
804	vpaddq		$TEMP4, $ACC8, $ACC8
805	vmovdqu		$ACC7, 32*7-128($rp)
806	vmovdqu		$ACC8, 32*8-128($rp)
807
808	mov	$rp, $ap
809	dec	$rep
810	jne	.LOOP_GRANDE_SQR_1024
811
812	vzeroall
813	mov	%rbp, %rax
814.cfi_def_cfa_register	%rax
815___
816$code.=<<___ if ($win64);
817.Lsqr_1024_in_tail:
818	movaps	-0xd8(%rax),%xmm6
819	movaps	-0xc8(%rax),%xmm7
820	movaps	-0xb8(%rax),%xmm8
821	movaps	-0xa8(%rax),%xmm9
822	movaps	-0x98(%rax),%xmm10
823	movaps	-0x88(%rax),%xmm11
824	movaps	-0x78(%rax),%xmm12
825	movaps	-0x68(%rax),%xmm13
826	movaps	-0x58(%rax),%xmm14
827	movaps	-0x48(%rax),%xmm15
828___
829$code.=<<___;
830	mov	-48(%rax),%r15
831.cfi_restore	%r15
832	mov	-40(%rax),%r14
833.cfi_restore	%r14
834	mov	-32(%rax),%r13
835.cfi_restore	%r13
836	mov	-24(%rax),%r12
837.cfi_restore	%r12
838	mov	-16(%rax),%rbp
839.cfi_restore	%rbp
840	mov	-8(%rax),%rbx
841.cfi_restore	%rbx
842	lea	(%rax),%rsp		# restore %rsp
843.cfi_def_cfa_register	%rsp
844.Lsqr_1024_epilogue:
845	ret
846.cfi_endproc
847.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
848___
849}
850
851{ # void AMM_WW(
852my $rp="%rdi";	# BN_ULONG *rp,
853my $ap="%rsi";	# const BN_ULONG *ap,
854my $bp="%rdx";	# const BN_ULONG *bp,
855my $np="%rcx";	# const BN_ULONG *np,
856my $n0="%r8d";	# unsigned int n0);
857
858# The registers that hold the accumulated redundant result
859# The AMM works on 1024 bit operands, and redundant word size is 29
860# Therefore: ceil(1024/29)/4 = 9
861my $ACC0="%ymm0";
862my $ACC1="%ymm1";
863my $ACC2="%ymm2";
864my $ACC3="%ymm3";
865my $ACC4="%ymm4";
866my $ACC5="%ymm5";
867my $ACC6="%ymm6";
868my $ACC7="%ymm7";
869my $ACC8="%ymm8";
870my $ACC9="%ymm9";
871
872# Registers that hold the broadcasted words of multiplier, currently used
873my $Bi="%ymm10";
874my $Yi="%ymm11";
875
876# Helper registers
877my $TEMP0=$ACC0;
878my $TEMP1="%ymm12";
879my $TEMP2="%ymm13";
880my $ZERO="%ymm14";
881my $AND_MASK="%ymm15";
882
883# alu registers that hold the first words of the ACC
884my $r0="%r9";
885my $r1="%r10";
886my $r2="%r11";
887my $r3="%r12";
888
889my $i="%r14d";
890my $tmp="%r15";
891
892$bp="%r13";	# reassigned argument
893
894$code.=<<___;
895.globl	rsaz_1024_mul_avx2
896.type	rsaz_1024_mul_avx2,\@function,5
897.align	64
898rsaz_1024_mul_avx2:
899.cfi_startproc
900	lea	(%rsp), %rax
901.cfi_def_cfa_register	%rax
902	push	%rbx
903.cfi_push	%rbx
904	push	%rbp
905.cfi_push	%rbp
906	push	%r12
907.cfi_push	%r12
908	push	%r13
909.cfi_push	%r13
910	push	%r14
911.cfi_push	%r14
912	push	%r15
913.cfi_push	%r15
914___
915$code.=<<___ if ($win64);
916	vzeroupper
917	lea	-0xa8(%rsp),%rsp
918	vmovaps	%xmm6,-0xd8(%rax)
919	vmovaps	%xmm7,-0xc8(%rax)
920	vmovaps	%xmm8,-0xb8(%rax)
921	vmovaps	%xmm9,-0xa8(%rax)
922	vmovaps	%xmm10,-0x98(%rax)
923	vmovaps	%xmm11,-0x88(%rax)
924	vmovaps	%xmm12,-0x78(%rax)
925	vmovaps	%xmm13,-0x68(%rax)
926	vmovaps	%xmm14,-0x58(%rax)
927	vmovaps	%xmm15,-0x48(%rax)
928.Lmul_1024_body:
929___
930$code.=<<___;
931	mov	%rax,%rbp
932.cfi_def_cfa_register	%rbp
933	vzeroall
934	mov	%rdx, $bp	# reassigned argument
935	sub	\$64,%rsp
936
937	# unaligned 256-bit load that crosses page boundary can
938	# cause severe performance degradation here, so if $ap does
939	# cross page boundary, swap it with $bp [meaning that caller
940	# is advised to lay down $ap and $bp next to each other, so
941	# that only one can cross page boundary].
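	# (the test below uses the same page-crossing trick as the
	# squaring routine: the shift by 12 leaves a nonzero value
	# exactly when the 32*10-byte window starting at $ap spills
	# into the next 4K page, and the two cmovnz then swap $ap
	# with $bp)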
942	.byte	0x67,0x67
943	mov	$ap, $tmp
944	and	\$4095, $tmp
945	add	\$32*10, $tmp
946	shr	\$12, $tmp
947	mov	$ap, $tmp
948	cmovnz	$bp, $ap
949	cmovnz	$tmp, $bp
950
951	mov	$np, $tmp
952	sub	\$-128,$ap	# size optimization
953	sub	\$-128,$np
954	sub	\$-128,$rp
955
956	and	\$4095, $tmp	# see if $np crosses page
957	add	\$32*10, $tmp
958	.byte	0x67,0x67
959	shr	\$12, $tmp
960	jz	.Lmul_1024_no_n_copy
961
962	# unaligned 256-bit load that crosses page boundary can
963	# cause severe performance degradation here, so if $np does
964	# cross page boundary, copy it to stack and make sure stack
965	# frame doesn't...
966	sub		\$32*10,%rsp
967	vmovdqu		32*0-128($np), $ACC0
968	and		\$-512, %rsp
969	vmovdqu		32*1-128($np), $ACC1
970	vmovdqu		32*2-128($np), $ACC2
971	vmovdqu		32*3-128($np), $ACC3
972	vmovdqu		32*4-128($np), $ACC4
973	vmovdqu		32*5-128($np), $ACC5
974	vmovdqu		32*6-128($np), $ACC6
975	vmovdqu		32*7-128($np), $ACC7
976	vmovdqu		32*8-128($np), $ACC8
977	lea		64+128(%rsp),$np
978	vmovdqu		$ACC0, 32*0-128($np)
979	vpxor		$ACC0, $ACC0, $ACC0
980	vmovdqu		$ACC1, 32*1-128($np)
981	vpxor		$ACC1, $ACC1, $ACC1
982	vmovdqu		$ACC2, 32*2-128($np)
983	vpxor		$ACC2, $ACC2, $ACC2
984	vmovdqu		$ACC3, 32*3-128($np)
985	vpxor		$ACC3, $ACC3, $ACC3
986	vmovdqu		$ACC4, 32*4-128($np)
987	vpxor		$ACC4, $ACC4, $ACC4
988	vmovdqu		$ACC5, 32*5-128($np)
989	vpxor		$ACC5, $ACC5, $ACC5
990	vmovdqu		$ACC6, 32*6-128($np)
991	vpxor		$ACC6, $ACC6, $ACC6
992	vmovdqu		$ACC7, 32*7-128($np)
993	vpxor		$ACC7, $ACC7, $ACC7
994	vmovdqu		$ACC8, 32*8-128($np)
995	vmovdqa		$ACC0, $ACC8
996	vmovdqu		$ACC9, 32*9-128($np)	# $ACC9 is zero after vzeroall
997.Lmul_1024_no_n_copy:
998	and	\$-64,%rsp
999
1000	mov	($bp), %rbx
1001	vpbroadcastq ($bp), $Bi
1002	vmovdqu	$ACC0, (%rsp)			# clear top of stack
1003	xor	$r0, $r0
1004	.byte	0x67
1005	xor	$r1, $r1
1006	xor	$r2, $r2
1007	xor	$r3, $r3
1008
1009	vmovdqu	.Land_mask(%rip), $AND_MASK
1010	mov	\$9, $i
1011	vmovdqu	$ACC9, 32*9-128($rp)		# $ACC9 is zero after vzeroall
1012	jmp	.Loop_mul_1024
1013
1014.align	32
1015.Loop_mul_1024:
1016	 vpsrlq		\$29, $ACC3, $ACC9		# correct $ACC3(*)
1017	mov	%rbx, %rax
1018	imulq	-128($ap), %rax
1019	add	$r0, %rax
1020	mov	%rbx, $r1
1021	imulq	8-128($ap), $r1
1022	add	8(%rsp), $r1
1023
1024	mov	%rax, $r0
1025	imull	$n0, %eax
1026	and	\$0x1fffffff, %eax
1027
1028	 mov	%rbx, $r2
1029	 imulq	16-128($ap), $r2
1030	 add	16(%rsp), $r2
1031
1032	 mov	%rbx, $r3
1033	 imulq	24-128($ap), $r3
1034	 add	24(%rsp), $r3
1035	vpmuludq	32*1-128($ap),$Bi,$TEMP0
1036	 vmovd		%eax, $Yi
1037	vpaddq		$TEMP0,$ACC1,$ACC1
1038	vpmuludq	32*2-128($ap),$Bi,$TEMP1
1039	 vpbroadcastq	$Yi, $Yi
1040	vpaddq		$TEMP1,$ACC2,$ACC2
1041	vpmuludq	32*3-128($ap),$Bi,$TEMP2
1042	 vpand		$AND_MASK, $ACC3, $ACC3		# correct $ACC3
1043	vpaddq		$TEMP2,$ACC3,$ACC3
1044	vpmuludq	32*4-128($ap),$Bi,$TEMP0
1045	vpaddq		$TEMP0,$ACC4,$ACC4
1046	vpmuludq	32*5-128($ap),$Bi,$TEMP1
1047	vpaddq		$TEMP1,$ACC5,$ACC5
1048	vpmuludq	32*6-128($ap),$Bi,$TEMP2
1049	vpaddq		$TEMP2,$ACC6,$ACC6
1050	vpmuludq	32*7-128($ap),$Bi,$TEMP0
1051	 vpermq		\$0x93, $ACC9, $ACC9		# correct $ACC3
1052	vpaddq		$TEMP0,$ACC7,$ACC7
1053	vpmuludq	32*8-128($ap),$Bi,$TEMP1
1054	 vpbroadcastq	8($bp), $Bi
1055	vpaddq		$TEMP1,$ACC8,$ACC8
1056
1057	mov	%rax,%rdx
1058	imulq	-128($np),%rax
1059	add	%rax,$r0
1060	mov	%rdx,%rax
1061	imulq	8-128($np),%rax
1062	add	%rax,$r1
1063	mov	%rdx,%rax
1064	imulq	16-128($np),%rax
1065	add	%rax,$r2
1066	shr	\$29, $r0
1067	imulq	24-128($np),%rdx
1068	add	%rdx,$r3
1069	add	$r0, $r1
1070
1071	vpmuludq	32*1-128($np),$Yi,$TEMP2
1072	 vmovq		$Bi, %rbx
1073	vpaddq		$TEMP2,$ACC1,$ACC1
1074	vpmuludq	32*2-128($np),$Yi,$TEMP0
1075	vpaddq		$TEMP0,$ACC2,$ACC2
1076	vpmuludq	32*3-128($np),$Yi,$TEMP1
1077	vpaddq		$TEMP1,$ACC3,$ACC3
1078	vpmuludq	32*4-128($np),$Yi,$TEMP2
1079	vpaddq		$TEMP2,$ACC4,$ACC4
1080	vpmuludq	32*5-128($np),$Yi,$TEMP0
1081	vpaddq		$TEMP0,$ACC5,$ACC5
1082	vpmuludq	32*6-128($np),$Yi,$TEMP1
1083	vpaddq		$TEMP1,$ACC6,$ACC6
1084	vpmuludq	32*7-128($np),$Yi,$TEMP2
1085	 vpblendd	\$3, $ZERO, $ACC9, $ACC9	# correct $ACC3
1086	vpaddq		$TEMP2,$ACC7,$ACC7
1087	vpmuludq	32*8-128($np),$Yi,$TEMP0
1088	 vpaddq		$ACC9, $ACC3, $ACC3		# correct $ACC3
1089	vpaddq		$TEMP0,$ACC8,$ACC8
1090
1091	mov	%rbx, %rax
1092	imulq	-128($ap),%rax
1093	add	%rax,$r1
1094	 vmovdqu	-8+32*1-128($ap),$TEMP1
1095	mov	%rbx, %rax
1096	imulq	8-128($ap),%rax
1097	add	%rax,$r2
1098	 vmovdqu	-8+32*2-128($ap),$TEMP2
1099
1100	mov	$r1, %rax
1101	imull	$n0, %eax
1102	and	\$0x1fffffff, %eax
1103
1104	 imulq	16-128($ap),%rbx
1105	 add	%rbx,$r3
1106	vpmuludq	$Bi,$TEMP1,$TEMP1
1107	 vmovd		%eax, $Yi
1108	vmovdqu		-8+32*3-128($ap),$TEMP0
1109	vpaddq		$TEMP1,$ACC1,$ACC1
1110	vpmuludq	$Bi,$TEMP2,$TEMP2
1111	 vpbroadcastq	$Yi, $Yi
1112	vmovdqu		-8+32*4-128($ap),$TEMP1
1113	vpaddq		$TEMP2,$ACC2,$ACC2
1114	vpmuludq	$Bi,$TEMP0,$TEMP0
1115	vmovdqu		-8+32*5-128($ap),$TEMP2
1116	vpaddq		$TEMP0,$ACC3,$ACC3
1117	vpmuludq	$Bi,$TEMP1,$TEMP1
1118	vmovdqu		-8+32*6-128($ap),$TEMP0
1119	vpaddq		$TEMP1,$ACC4,$ACC4
1120	vpmuludq	$Bi,$TEMP2,$TEMP2
1121	vmovdqu		-8+32*7-128($ap),$TEMP1
1122	vpaddq		$TEMP2,$ACC5,$ACC5
1123	vpmuludq	$Bi,$TEMP0,$TEMP0
1124	vmovdqu		-8+32*8-128($ap),$TEMP2
1125	vpaddq		$TEMP0,$ACC6,$ACC6
1126	vpmuludq	$Bi,$TEMP1,$TEMP1
1127	vmovdqu		-8+32*9-128($ap),$ACC9
1128	vpaddq		$TEMP1,$ACC7,$ACC7
1129	vpmuludq	$Bi,$TEMP2,$TEMP2
1130	vpaddq		$TEMP2,$ACC8,$ACC8
1131	vpmuludq	$Bi,$ACC9,$ACC9
1132	 vpbroadcastq	16($bp), $Bi
1133
1134	mov	%rax,%rdx
1135	imulq	-128($np),%rax
1136	add	%rax,$r1
1137	 vmovdqu	-8+32*1-128($np),$TEMP0
1138	mov	%rdx,%rax
1139	imulq	8-128($np),%rax
1140	add	%rax,$r2
1141	 vmovdqu	-8+32*2-128($np),$TEMP1
1142	shr	\$29, $r1
1143	imulq	16-128($np),%rdx
1144	add	%rdx,$r3
1145	add	$r1, $r2
1146
1147	vpmuludq	$Yi,$TEMP0,$TEMP0
1148	 vmovq		$Bi, %rbx
1149	vmovdqu		-8+32*3-128($np),$TEMP2
1150	vpaddq		$TEMP0,$ACC1,$ACC1
1151	vpmuludq	$Yi,$TEMP1,$TEMP1
1152	vmovdqu		-8+32*4-128($np),$TEMP0
1153	vpaddq		$TEMP1,$ACC2,$ACC2
1154	vpmuludq	$Yi,$TEMP2,$TEMP2
1155	vmovdqu		-8+32*5-128($np),$TEMP1
1156	vpaddq		$TEMP2,$ACC3,$ACC3
1157	vpmuludq	$Yi,$TEMP0,$TEMP0
1158	vmovdqu		-8+32*6-128($np),$TEMP2
1159	vpaddq		$TEMP0,$ACC4,$ACC4
1160	vpmuludq	$Yi,$TEMP1,$TEMP1
1161	vmovdqu		-8+32*7-128($np),$TEMP0
1162	vpaddq		$TEMP1,$ACC5,$ACC5
1163	vpmuludq	$Yi,$TEMP2,$TEMP2
1164	vmovdqu		-8+32*8-128($np),$TEMP1
1165	vpaddq		$TEMP2,$ACC6,$ACC6
1166	vpmuludq	$Yi,$TEMP0,$TEMP0
1167	vmovdqu		-8+32*9-128($np),$TEMP2
1168	vpaddq		$TEMP0,$ACC7,$ACC7
1169	vpmuludq	$Yi,$TEMP1,$TEMP1
1170	vpaddq		$TEMP1,$ACC8,$ACC8
1171	vpmuludq	$Yi,$TEMP2,$TEMP2
1172	vpaddq		$TEMP2,$ACC9,$ACC9
1173
1174	 vmovdqu	-16+32*1-128($ap),$TEMP0
1175	mov	%rbx,%rax
1176	imulq	-128($ap),%rax
1177	add	$r2,%rax
1178
1179	 vmovdqu	-16+32*2-128($ap),$TEMP1
1180	mov	%rax,$r2
1181	imull	$n0, %eax
1182	and	\$0x1fffffff, %eax
1183
1184	 imulq	8-128($ap),%rbx
1185	 add	%rbx,$r3
1186	vpmuludq	$Bi,$TEMP0,$TEMP0
1187	 vmovd		%eax, $Yi
1188	vmovdqu		-16+32*3-128($ap),$TEMP2
1189	vpaddq		$TEMP0,$ACC1,$ACC1
1190	vpmuludq	$Bi,$TEMP1,$TEMP1
1191	 vpbroadcastq	$Yi, $Yi
1192	vmovdqu		-16+32*4-128($ap),$TEMP0
1193	vpaddq		$TEMP1,$ACC2,$ACC2
1194	vpmuludq	$Bi,$TEMP2,$TEMP2
1195	vmovdqu		-16+32*5-128($ap),$TEMP1
1196	vpaddq		$TEMP2,$ACC3,$ACC3
1197	vpmuludq	$Bi,$TEMP0,$TEMP0
1198	vmovdqu		-16+32*6-128($ap),$TEMP2
1199	vpaddq		$TEMP0,$ACC4,$ACC4
1200	vpmuludq	$Bi,$TEMP1,$TEMP1
1201	vmovdqu		-16+32*7-128($ap),$TEMP0
1202	vpaddq		$TEMP1,$ACC5,$ACC5
1203	vpmuludq	$Bi,$TEMP2,$TEMP2
1204	vmovdqu		-16+32*8-128($ap),$TEMP1
1205	vpaddq		$TEMP2,$ACC6,$ACC6
1206	vpmuludq	$Bi,$TEMP0,$TEMP0
1207	vmovdqu		-16+32*9-128($ap),$TEMP2
1208	vpaddq		$TEMP0,$ACC7,$ACC7
1209	vpmuludq	$Bi,$TEMP1,$TEMP1
1210	vpaddq		$TEMP1,$ACC8,$ACC8
1211	vpmuludq	$Bi,$TEMP2,$TEMP2
1212	 vpbroadcastq	24($bp), $Bi
1213	vpaddq		$TEMP2,$ACC9,$ACC9
1214
1215	 vmovdqu	-16+32*1-128($np),$TEMP0
1216	mov	%rax,%rdx
1217	imulq	-128($np),%rax
1218	add	%rax,$r2
1219	 vmovdqu	-16+32*2-128($np),$TEMP1
1220	imulq	8-128($np),%rdx
1221	add	%rdx,$r3
1222	shr	\$29, $r2
1223
1224	vpmuludq	$Yi,$TEMP0,$TEMP0
1225	 vmovq		$Bi, %rbx
1226	vmovdqu		-16+32*3-128($np),$TEMP2
1227	vpaddq		$TEMP0,$ACC1,$ACC1
1228	vpmuludq	$Yi,$TEMP1,$TEMP1
1229	vmovdqu		-16+32*4-128($np),$TEMP0
1230	vpaddq		$TEMP1,$ACC2,$ACC2
1231	vpmuludq	$Yi,$TEMP2,$TEMP2
1232	vmovdqu		-16+32*5-128($np),$TEMP1
1233	vpaddq		$TEMP2,$ACC3,$ACC3
1234	vpmuludq	$Yi,$TEMP0,$TEMP0
1235	vmovdqu		-16+32*6-128($np),$TEMP2
1236	vpaddq		$TEMP0,$ACC4,$ACC4
1237	vpmuludq	$Yi,$TEMP1,$TEMP1
1238	vmovdqu		-16+32*7-128($np),$TEMP0
1239	vpaddq		$TEMP1,$ACC5,$ACC5
1240	vpmuludq	$Yi,$TEMP2,$TEMP2
1241	vmovdqu		-16+32*8-128($np),$TEMP1
1242	vpaddq		$TEMP2,$ACC6,$ACC6
1243	vpmuludq	$Yi,$TEMP0,$TEMP0
1244	vmovdqu		-16+32*9-128($np),$TEMP2
1245	vpaddq		$TEMP0,$ACC7,$ACC7
1246	vpmuludq	$Yi,$TEMP1,$TEMP1
1247	 vmovdqu	-24+32*1-128($ap),$TEMP0
1248	vpaddq		$TEMP1,$ACC8,$ACC8
1249	vpmuludq	$Yi,$TEMP2,$TEMP2
1250	 vmovdqu	-24+32*2-128($ap),$TEMP1
1251	vpaddq		$TEMP2,$ACC9,$ACC9
1252
1253	add	$r2, $r3
1254	imulq	-128($ap),%rbx
1255	add	%rbx,$r3
1256
1257	mov	$r3, %rax
1258	imull	$n0, %eax
1259	and	\$0x1fffffff, %eax
1260
1261	vpmuludq	$Bi,$TEMP0,$TEMP0
1262	 vmovd		%eax, $Yi
1263	vmovdqu		-24+32*3-128($ap),$TEMP2
1264	vpaddq		$TEMP0,$ACC1,$ACC1
1265	vpmuludq	$Bi,$TEMP1,$TEMP1
1266	 vpbroadcastq	$Yi, $Yi
1267	vmovdqu		-24+32*4-128($ap),$TEMP0
1268	vpaddq		$TEMP1,$ACC2,$ACC2
1269	vpmuludq	$Bi,$TEMP2,$TEMP2
1270	vmovdqu		-24+32*5-128($ap),$TEMP1
1271	vpaddq		$TEMP2,$ACC3,$ACC3
1272	vpmuludq	$Bi,$TEMP0,$TEMP0
1273	vmovdqu		-24+32*6-128($ap),$TEMP2
1274	vpaddq		$TEMP0,$ACC4,$ACC4
1275	vpmuludq	$Bi,$TEMP1,$TEMP1
1276	vmovdqu		-24+32*7-128($ap),$TEMP0
1277	vpaddq		$TEMP1,$ACC5,$ACC5
1278	vpmuludq	$Bi,$TEMP2,$TEMP2
1279	vmovdqu		-24+32*8-128($ap),$TEMP1
1280	vpaddq		$TEMP2,$ACC6,$ACC6
1281	vpmuludq	$Bi,$TEMP0,$TEMP0
1282	vmovdqu		-24+32*9-128($ap),$TEMP2
1283	vpaddq		$TEMP0,$ACC7,$ACC7
1284	vpmuludq	$Bi,$TEMP1,$TEMP1
1285	vpaddq		$TEMP1,$ACC8,$ACC8
1286	vpmuludq	$Bi,$TEMP2,$TEMP2
1287	 vpbroadcastq	32($bp), $Bi
1288	vpaddq		$TEMP2,$ACC9,$ACC9
1289	 add		\$32, $bp			# $bp++
1290
1291	vmovdqu		-24+32*1-128($np),$TEMP0
1292	imulq	-128($np),%rax
1293	add	%rax,$r3
1294	shr	\$29, $r3
1295
1296	vmovdqu		-24+32*2-128($np),$TEMP1
1297	vpmuludq	$Yi,$TEMP0,$TEMP0
1298	 vmovq		$Bi, %rbx
1299	vmovdqu		-24+32*3-128($np),$TEMP2
1300	vpaddq		$TEMP0,$ACC1,$ACC0		# $ACC0==$TEMP0
1301	vpmuludq	$Yi,$TEMP1,$TEMP1
1302	 vmovdqu	$ACC0, (%rsp)			# transfer $r0-$r3
1303	vpaddq		$TEMP1,$ACC2,$ACC1
1304	vmovdqu		-24+32*4-128($np),$TEMP0
1305	vpmuludq	$Yi,$TEMP2,$TEMP2
1306	vmovdqu		-24+32*5-128($np),$TEMP1
1307	vpaddq		$TEMP2,$ACC3,$ACC2
1308	vpmuludq	$Yi,$TEMP0,$TEMP0
1309	vmovdqu		-24+32*6-128($np),$TEMP2
1310	vpaddq		$TEMP0,$ACC4,$ACC3
1311	vpmuludq	$Yi,$TEMP1,$TEMP1
1312	vmovdqu		-24+32*7-128($np),$TEMP0
1313	vpaddq		$TEMP1,$ACC5,$ACC4
1314	vpmuludq	$Yi,$TEMP2,$TEMP2
1315	vmovdqu		-24+32*8-128($np),$TEMP1
1316	vpaddq		$TEMP2,$ACC6,$ACC5
1317	vpmuludq	$Yi,$TEMP0,$TEMP0
1318	vmovdqu		-24+32*9-128($np),$TEMP2
1319	 mov	$r3, $r0
1320	vpaddq		$TEMP0,$ACC7,$ACC6
1321	vpmuludq	$Yi,$TEMP1,$TEMP1
1322	 add	(%rsp), $r0
1323	vpaddq		$TEMP1,$ACC8,$ACC7
1324	vpmuludq	$Yi,$TEMP2,$TEMP2
1325	 vmovq	$r3, $TEMP1
1326	vpaddq		$TEMP2,$ACC9,$ACC8
1327
1328	dec	$i
1329	jnz	.Loop_mul_1024
1330___
1331
1332# (*)	Original implementation was correcting ACC1-ACC3 for overflow
1333#	after 7 loop runs, or after 28 iterations, or 56 additions.
1334#	But as we underutilize resources, it's possible to correct in
1335#	each iteration with marginal performance loss. And since the
1336#	correction is done every iteration, fewer digits need fixing,
1337#	which avoids the performance penalty completely. Also note
1338#	that only three digits out of four are corrected; this works
1339#	because the most significant digit is subject to fewer additions.
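#	(Headroom math: every vpmuludq product of two 29-bit digits is
#	below 2^58, so a 64-bit lane has roughly 2^6 = 64 additions of
#	headroom; 56 additions therefore still fit, and correcting a
#	little on every iteration keeps the count far below that bound.)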
1340
1341$TEMP0 = $ACC9;
1342$TEMP3 = $Bi;
1343$TEMP4 = $Yi;
1344$code.=<<___;
1345	vpermq		\$0, $AND_MASK, $AND_MASK
1346	vpaddq		(%rsp), $TEMP1, $ACC0
1347
1348	vpsrlq		\$29, $ACC0, $TEMP1
1349	vpand		$AND_MASK, $ACC0, $ACC0
1350	vpsrlq		\$29, $ACC1, $TEMP2
1351	vpand		$AND_MASK, $ACC1, $ACC1
1352	vpsrlq		\$29, $ACC2, $TEMP3
1353	vpermq		\$0x93, $TEMP1, $TEMP1
1354	vpand		$AND_MASK, $ACC2, $ACC2
1355	vpsrlq		\$29, $ACC3, $TEMP4
1356	vpermq		\$0x93, $TEMP2, $TEMP2
1357	vpand		$AND_MASK, $ACC3, $ACC3
1358
1359	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
1360	vpermq		\$0x93, $TEMP3, $TEMP3
1361	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
1362	vpermq		\$0x93, $TEMP4, $TEMP4
1363	vpaddq		$TEMP0, $ACC0, $ACC0
1364	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
1365	vpaddq		$TEMP1, $ACC1, $ACC1
1366	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
1367	vpaddq		$TEMP2, $ACC2, $ACC2
1368	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
1369	vpaddq		$TEMP3, $ACC3, $ACC3
1370	vpaddq		$TEMP4, $ACC4, $ACC4
1371
1372	vpsrlq		\$29, $ACC0, $TEMP1
1373	vpand		$AND_MASK, $ACC0, $ACC0
1374	vpsrlq		\$29, $ACC1, $TEMP2
1375	vpand		$AND_MASK, $ACC1, $ACC1
1376	vpsrlq		\$29, $ACC2, $TEMP3
1377	vpermq		\$0x93, $TEMP1, $TEMP1
1378	vpand		$AND_MASK, $ACC2, $ACC2
1379	vpsrlq		\$29, $ACC3, $TEMP4
1380	vpermq		\$0x93, $TEMP2, $TEMP2
1381	vpand		$AND_MASK, $ACC3, $ACC3
1382	vpermq		\$0x93, $TEMP3, $TEMP3
1383
1384	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
1385	vpermq		\$0x93, $TEMP4, $TEMP4
1386	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
1387	vpaddq		$TEMP0, $ACC0, $ACC0
1388	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
1389	vpaddq		$TEMP1, $ACC1, $ACC1
1390	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
1391	vpaddq		$TEMP2, $ACC2, $ACC2
1392	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
1393	vpaddq		$TEMP3, $ACC3, $ACC3
1394	vpaddq		$TEMP4, $ACC4, $ACC4
1395
1396	vmovdqu		$ACC0, 0-128($rp)
1397	vmovdqu		$ACC1, 32-128($rp)
1398	vmovdqu		$ACC2, 64-128($rp)
1399	vmovdqu		$ACC3, 96-128($rp)
1400___
1401
1402$TEMP5=$ACC0;
1403$code.=<<___;
1404	vpsrlq		\$29, $ACC4, $TEMP1
1405	vpand		$AND_MASK, $ACC4, $ACC4
1406	vpsrlq		\$29, $ACC5, $TEMP2
1407	vpand		$AND_MASK, $ACC5, $ACC5
1408	vpsrlq		\$29, $ACC6, $TEMP3
1409	vpermq		\$0x93, $TEMP1, $TEMP1
1410	vpand		$AND_MASK, $ACC6, $ACC6
1411	vpsrlq		\$29, $ACC7, $TEMP4
1412	vpermq		\$0x93, $TEMP2, $TEMP2
1413	vpand		$AND_MASK, $ACC7, $ACC7
1414	vpsrlq		\$29, $ACC8, $TEMP5
1415	vpermq		\$0x93, $TEMP3, $TEMP3
1416	vpand		$AND_MASK, $ACC8, $ACC8
1417	vpermq		\$0x93, $TEMP4, $TEMP4
1418
1419	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
1420	vpermq		\$0x93, $TEMP5, $TEMP5
1421	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
1422	vpaddq		$TEMP0, $ACC4, $ACC4
1423	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
1424	vpaddq		$TEMP1, $ACC5, $ACC5
1425	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
1426	vpaddq		$TEMP2, $ACC6, $ACC6
1427	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
1428	vpaddq		$TEMP3, $ACC7, $ACC7
1429	vpaddq		$TEMP4, $ACC8, $ACC8
1430
1431	vpsrlq		\$29, $ACC4, $TEMP1
1432	vpand		$AND_MASK, $ACC4, $ACC4
1433	vpsrlq		\$29, $ACC5, $TEMP2
1434	vpand		$AND_MASK, $ACC5, $ACC5
1435	vpsrlq		\$29, $ACC6, $TEMP3
1436	vpermq		\$0x93, $TEMP1, $TEMP1
1437	vpand		$AND_MASK, $ACC6, $ACC6
1438	vpsrlq		\$29, $ACC7, $TEMP4
1439	vpermq		\$0x93, $TEMP2, $TEMP2
1440	vpand		$AND_MASK, $ACC7, $ACC7
1441	vpsrlq		\$29, $ACC8, $TEMP5
1442	vpermq		\$0x93, $TEMP3, $TEMP3
1443	vpand		$AND_MASK, $ACC8, $ACC8
1444	vpermq		\$0x93, $TEMP4, $TEMP4
1445
1446	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
1447	vpermq		\$0x93, $TEMP5, $TEMP5
1448	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
1449	vpaddq		$TEMP0, $ACC4, $ACC4
1450	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
1451	vpaddq		$TEMP1, $ACC5, $ACC5
1452	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
1453	vpaddq		$TEMP2, $ACC6, $ACC6
1454	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
1455	vpaddq		$TEMP3, $ACC7, $ACC7
1456	vpaddq		$TEMP4, $ACC8, $ACC8
1457
1458	vmovdqu		$ACC4, 128-128($rp)
1459	vmovdqu		$ACC5, 160-128($rp)
1460	vmovdqu		$ACC6, 192-128($rp)
1461	vmovdqu		$ACC7, 224-128($rp)
1462	vmovdqu		$ACC8, 256-128($rp)
1463	vzeroupper
1464
1465	mov	%rbp, %rax
1466.cfi_def_cfa_register	%rax
1467___
1468$code.=<<___ if ($win64);
1469.Lmul_1024_in_tail:
1470	movaps	-0xd8(%rax),%xmm6
1471	movaps	-0xc8(%rax),%xmm7
1472	movaps	-0xb8(%rax),%xmm8
1473	movaps	-0xa8(%rax),%xmm9
1474	movaps	-0x98(%rax),%xmm10
1475	movaps	-0x88(%rax),%xmm11
1476	movaps	-0x78(%rax),%xmm12
1477	movaps	-0x68(%rax),%xmm13
1478	movaps	-0x58(%rax),%xmm14
1479	movaps	-0x48(%rax),%xmm15
1480___
1481$code.=<<___;
1482	mov	-48(%rax),%r15
1483.cfi_restore	%r15
1484	mov	-40(%rax),%r14
1485.cfi_restore	%r14
1486	mov	-32(%rax),%r13
1487.cfi_restore	%r13
1488	mov	-24(%rax),%r12
1489.cfi_restore	%r12
1490	mov	-16(%rax),%rbp
1491.cfi_restore	%rbp
1492	mov	-8(%rax),%rbx
1493.cfi_restore	%rbx
1494	lea	(%rax),%rsp		# restore %rsp
1495.cfi_def_cfa_register	%rsp
1496.Lmul_1024_epilogue:
1497	ret
1498.cfi_endproc
1499.size	rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
1500___
1501}
1502{
1503my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
1504my @T = map("%r$_",(8..11));
1505
1506$code.=<<___;
1507.globl	rsaz_1024_red2norm_avx2
1508.type	rsaz_1024_red2norm_avx2,\@abi-omnipotent
1509.align	32
1510rsaz_1024_red2norm_avx2:
1511	sub	\$-128,$inp	# size optimization
1512	xor	%rax,%rax
1513___
1514
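# The generated code converts the redundant base-2^29 representation (one
# 29-bit digit per 64-bit word) back into 16 plain 64-bit words: for each
# output word it loads every digit that overlaps the word, shifts each into
# place, sums them in %rax and carries the remainder of the boundary digit
# into the next word.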
1515for ($j=0,$i=0; $i<16; $i++) {
1516    my $k=0;
1517    while (29*$j<64*($i+1)) {	# load data till boundary
1518	$code.="	mov	`8*$j-128`($inp), @T[0]\n";
1519	$j++; $k++; push(@T,shift(@T));
1520    }
1521    $l=$k;
1522    while ($k>1) {		# shift loaded data but last value
1523	$code.="	shl	\$`29*($j-$k)`,@T[-$k]\n";
1524	$k--;
1525    }
1526    $code.=<<___;		# shift last value
1527	mov	@T[-1], @T[0]
1528	shl	\$`29*($j-1)`, @T[-1]
1529	shr	\$`-29*($j-1)`, @T[0]
1530___
1531    while ($l) {		# accumulate all values
1532	$code.="	add	@T[-$l], %rax\n";
1533	$l--;
1534    }
1535	$code.=<<___;
1536	adc	\$0, @T[0]	# consume eventual carry
1537	mov	%rax, 8*$i($out)
1538	mov	@T[0], %rax
1539___
1540    push(@T,shift(@T));
1541}
1542$code.=<<___;
1543	ret
1544.size	rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
1545
1546.globl	rsaz_1024_norm2red_avx2
1547.type	rsaz_1024_norm2red_avx2,\@abi-omnipotent
1548.align	32
1549rsaz_1024_norm2red_avx2:
1550	sub	\$-128,$out	# size optimization
1551	mov	($inp),@T[0]
1552	mov	\$0x1fffffff,%eax
1553___
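# The generated code does the opposite conversion: it walks the 16 64-bit
# input words and slices out consecutive 29-bit digits, using shrd to
# stitch together digits that straddle a word boundary and masking each
# digit with 0x1fffffff before storing it in its own 64-bit slot.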
1554for ($j=0,$i=0; $i<16; $i++) {
1555    $code.="	mov	`8*($i+1)`($inp),@T[1]\n"	if ($i<15);
1556    $code.="	xor	@T[1],@T[1]\n"			if ($i==15);
1557    my $k=1;
1558    while (29*($j+1)<64*($i+1)) {
1559    	$code.=<<___;
1560	mov	@T[0],@T[-$k]
1561	shr	\$`29*$j`,@T[-$k]
1562	and	%rax,@T[-$k]				# &0x1fffffff
1563	mov	@T[-$k],`8*$j-128`($out)
1564___
1565	$j++; $k++;
1566    }
1567    $code.=<<___;
1568	shrd	\$`29*$j`,@T[1],@T[0]
1569	and	%rax,@T[0]
1570	mov	@T[0],`8*$j-128`($out)
1571___
1572    $j++;
1573    push(@T,shift(@T));
1574}
1575$code.=<<___;
1576	mov	@T[0],`8*$j-128`($out)			# zero
1577	mov	@T[0],`8*($j+1)-128`($out)
1578	mov	@T[0],`8*($j+2)-128`($out)
1579	mov	@T[0],`8*($j+3)-128`($out)
1580	ret
1581.size	rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
1582___
1583}
1584{
1585my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1586
1587$code.=<<___;
1588.globl	rsaz_1024_scatter5_avx2
1589.type	rsaz_1024_scatter5_avx2,\@abi-omnipotent
1590.align	32
1591rsaz_1024_scatter5_avx2:
1592	vzeroupper
1593	vmovdqu	.Lscatter_permd(%rip),%ymm5
1594	shl	\$4,$power
1595	lea	($out,$power),$out
1596	mov	\$9,%eax
1597	jmp	.Loop_scatter_1024
1598
1599.align	32
1600.Loop_scatter_1024:
1601	vmovdqu		($inp),%ymm0
1602	lea		32($inp),$inp
1603	vpermd		%ymm0,%ymm5,%ymm0
1604	vmovdqu		%xmm0,($out)
1605	lea		16*32($out),$out
1606	dec	%eax
1607	jnz	.Loop_scatter_1024
1608
1609	vzeroupper
1610	ret
1611.size	rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
1612
1613.globl	rsaz_1024_gather5_avx2
1614.type	rsaz_1024_gather5_avx2,\@abi-omnipotent
1615.align	32
1616rsaz_1024_gather5_avx2:
1617.cfi_startproc
1618	vzeroupper
1619	mov	%rsp,%r11
1620.cfi_def_cfa_register	%r11
1621___
1622$code.=<<___ if ($win64);
1623	lea	-0x88(%rsp),%rax
1624.LSEH_begin_rsaz_1024_gather5:
1625	# I can't trust assembler to use specific encoding:-(
1626	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax),%rsp
1627	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6,-0x20(%rax)
1628	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7,-0x10(%rax)
1629	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8,0(%rax)
1630	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9,0x10(%rax)
1631	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10,0x20(%rax)
1632	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11,0x30(%rax)
1633	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12,0x40(%rax)
1634	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13,0x50(%rax)
1635	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14,0x60(%rax)
1636	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15,0x70(%rax)
1637___
1638$code.=<<___;
1639	lea	-0x100(%rsp),%rsp
1640	and	\$-32, %rsp
1641	lea	.Linc(%rip), %r10
1642	lea	-128(%rsp),%rax			# control u-op density
1643
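	# Build sixteen all-ones/all-zeros dword masks by comparing the
	# indices 0..15 (seeded from .Linc) against $power.  The gather
	# loop below then reads every table element and masks off all
	# but the selected one, so the memory access pattern does not
	# depend on the requested power.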
1644	vmovd		$power, %xmm4
1645	vmovdqa		(%r10),%ymm0
1646	vmovdqa		32(%r10),%ymm1
1647	vmovdqa		64(%r10),%ymm5
1648	vpbroadcastd	%xmm4,%ymm4
1649
1650	vpaddd		%ymm5, %ymm0, %ymm2
1651	vpcmpeqd	%ymm4, %ymm0, %ymm0
1652	vpaddd		%ymm5, %ymm1, %ymm3
1653	vpcmpeqd	%ymm4, %ymm1, %ymm1
1654	vmovdqa		%ymm0, 32*0+128(%rax)
1655	vpaddd		%ymm5, %ymm2, %ymm0
1656	vpcmpeqd	%ymm4, %ymm2, %ymm2
1657	vmovdqa		%ymm1, 32*1+128(%rax)
1658	vpaddd		%ymm5, %ymm3, %ymm1
1659	vpcmpeqd	%ymm4, %ymm3, %ymm3
1660	vmovdqa		%ymm2, 32*2+128(%rax)
1661	vpaddd		%ymm5, %ymm0, %ymm2
1662	vpcmpeqd	%ymm4, %ymm0, %ymm0
1663	vmovdqa		%ymm3, 32*3+128(%rax)
1664	vpaddd		%ymm5, %ymm1, %ymm3
1665	vpcmpeqd	%ymm4, %ymm1, %ymm1
1666	vmovdqa		%ymm0, 32*4+128(%rax)
1667	vpaddd		%ymm5, %ymm2, %ymm8
1668	vpcmpeqd	%ymm4, %ymm2, %ymm2
1669	vmovdqa		%ymm1, 32*5+128(%rax)
1670	vpaddd		%ymm5, %ymm3, %ymm9
1671	vpcmpeqd	%ymm4, %ymm3, %ymm3
1672	vmovdqa		%ymm2, 32*6+128(%rax)
1673	vpaddd		%ymm5, %ymm8, %ymm10
1674	vpcmpeqd	%ymm4, %ymm8, %ymm8
1675	vmovdqa		%ymm3, 32*7+128(%rax)
1676	vpaddd		%ymm5, %ymm9, %ymm11
1677	vpcmpeqd	%ymm4, %ymm9, %ymm9
1678	vpaddd		%ymm5, %ymm10, %ymm12
1679	vpcmpeqd	%ymm4, %ymm10, %ymm10
1680	vpaddd		%ymm5, %ymm11, %ymm13
1681	vpcmpeqd	%ymm4, %ymm11, %ymm11
1682	vpaddd		%ymm5, %ymm12, %ymm14
1683	vpcmpeqd	%ymm4, %ymm12, %ymm12
1684	vpaddd		%ymm5, %ymm13, %ymm15
1685	vpcmpeqd	%ymm4, %ymm13, %ymm13
1686	vpcmpeqd	%ymm4, %ymm14, %ymm14
1687	vpcmpeqd	%ymm4, %ymm15, %ymm15
1688
1689	vmovdqa	-32(%r10),%ymm7			# .Lgather_permd
1690	lea	128($inp), $inp
1691	mov	\$9,$power
1692
1693.Loop_gather_1024:
1694	vmovdqa		32*0-128($inp),	%ymm0
1695	vmovdqa		32*1-128($inp),	%ymm1
1696	vmovdqa		32*2-128($inp),	%ymm2
1697	vmovdqa		32*3-128($inp),	%ymm3
1698	vpand		32*0+128(%rax),	%ymm0,	%ymm0
1699	vpand		32*1+128(%rax),	%ymm1,	%ymm1
1700	vpand		32*2+128(%rax),	%ymm2,	%ymm2
1701	vpor		%ymm0, %ymm1, %ymm4
1702	vpand		32*3+128(%rax),	%ymm3,	%ymm3
1703	vmovdqa		32*4-128($inp),	%ymm0
1704	vmovdqa		32*5-128($inp),	%ymm1
1705	vpor		%ymm2, %ymm3, %ymm5
1706	vmovdqa		32*6-128($inp),	%ymm2
1707	vmovdqa		32*7-128($inp),	%ymm3
1708	vpand		32*4+128(%rax),	%ymm0,	%ymm0
1709	vpand		32*5+128(%rax),	%ymm1,	%ymm1
1710	vpand		32*6+128(%rax),	%ymm2,	%ymm2
1711	vpor		%ymm0, %ymm4, %ymm4
1712	vpand		32*7+128(%rax),	%ymm3,	%ymm3
1713	vpand		32*8-128($inp),	%ymm8,	%ymm0
1714	vpor		%ymm1, %ymm5, %ymm5
1715	vpand		32*9-128($inp),	%ymm9,	%ymm1
1716	vpor		%ymm2, %ymm4, %ymm4
1717	vpand		32*10-128($inp),%ymm10,	%ymm2
1718	vpor		%ymm3, %ymm5, %ymm5
1719	vpand		32*11-128($inp),%ymm11,	%ymm3
1720	vpor		%ymm0, %ymm4, %ymm4
1721	vpand		32*12-128($inp),%ymm12,	%ymm0
1722	vpor		%ymm1, %ymm5, %ymm5
1723	vpand		32*13-128($inp),%ymm13,	%ymm1
1724	vpor		%ymm2, %ymm4, %ymm4
1725	vpand		32*14-128($inp),%ymm14,	%ymm2
1726	vpor		%ymm3, %ymm5, %ymm5
1727	vpand		32*15-128($inp),%ymm15,	%ymm3
1728	lea		32*16($inp), $inp
1729	vpor		%ymm0, %ymm4, %ymm4
1730	vpor		%ymm1, %ymm5, %ymm5
1731	vpor		%ymm2, %ymm4, %ymm4
1732	vpor		%ymm3, %ymm5, %ymm5
1733
1734	vpor		%ymm5, %ymm4, %ymm4
1735	vextracti128	\$1, %ymm4, %xmm5	# upper half is cleared
1736	vpor		%xmm4, %xmm5, %xmm5
1737	vpermd		%ymm5,%ymm7,%ymm5
1738	vmovdqu		%ymm5,($out)
1739	lea		32($out),$out
1740	dec	$power
1741	jnz	.Loop_gather_1024
1742
1743	vpxor	%ymm0,%ymm0,%ymm0
1744	vmovdqu	%ymm0,($out)
1745	vzeroupper
1746___
1747$code.=<<___ if ($win64);
1748	movaps	-0xa8(%r11),%xmm6
1749	movaps	-0x98(%r11),%xmm7
1750	movaps	-0x88(%r11),%xmm8
1751	movaps	-0x78(%r11),%xmm9
1752	movaps	-0x68(%r11),%xmm10
1753	movaps	-0x58(%r11),%xmm11
1754	movaps	-0x48(%r11),%xmm12
1755	movaps	-0x38(%r11),%xmm13
1756	movaps	-0x28(%r11),%xmm14
1757	movaps	-0x18(%r11),%xmm15
1758___
1759$code.=<<___;
1760	lea	(%r11),%rsp
1761.cfi_def_cfa_register	%rsp
1762	ret
1763.cfi_endproc
1764.LSEH_end_rsaz_1024_gather5:
1765.size	rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
1766___
1767}
1768
1769$code.=<<___;
1770.extern	OPENSSL_ia32cap_P
1771.globl	rsaz_avx2_eligible
1772.type	rsaz_avx2_eligible,\@abi-omnipotent
1773.align	32
1774rsaz_avx2_eligible:
1775	leaq	OPENSSL_ia32cap_P(%rip),%rax
1776	mov	8(%rax),%eax
1777___
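# The capability word at offset 8 of OPENSSL_ia32cap_P carries the CPUID(7)
# feature bits: bit 5 is AVX2, bits 8 and 19 are BMI2 and ADX.  When BMI2
# and ADX are both present the code below reports "not eligible", so that
# the faster scalar AD*X path is taken instead (see the note in the header).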
1778$code.=<<___	if ($addx);
1779	mov	\$`1<<8|1<<19`,%ecx
1780	mov	\$0,%edx
1781	and	%eax,%ecx
1782	cmp	\$`1<<8|1<<19`,%ecx	# check for BMI2+AD*X
1783	cmove	%edx,%eax
1784___
1785$code.=<<___;
1786	and	\$`1<<5`,%eax
1787	shr	\$5,%eax
1788	ret
1789.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible
1790
1791.align	64
1792.Land_mask:
1793	.quad	0x1fffffff,0x1fffffff,0x1fffffff,-1
1794.Lscatter_permd:
1795	.long	0,2,4,6,7,7,7,7
1796.Lgather_permd:
1797	.long	0,7,1,7,2,7,3,7
1798.Linc:
1799	.long	0,0,0,0, 1,1,1,1
1800	.long	2,2,2,2, 3,3,3,3
1801	.long	4,4,4,4, 4,4,4,4
1802.align	64
1803___
1804
1805if ($win64) {
1806$rec="%rcx";
1807$frame="%rdx";
1808$context="%r8";
1809$disp="%r9";
1810
1811$code.=<<___
1812.extern	__imp_RtlVirtualUnwind
1813.type	rsaz_se_handler,\@abi-omnipotent
1814.align	16
1815rsaz_se_handler:
1816	push	%rsi
1817	push	%rdi
1818	push	%rbx
1819	push	%rbp
1820	push	%r12
1821	push	%r13
1822	push	%r14
1823	push	%r15
1824	pushfq
1825	sub	\$64,%rsp
1826
1827	mov	120($context),%rax	# pull context->Rax
1828	mov	248($context),%rbx	# pull context->Rip
1829
1830	mov	8($disp),%rsi		# disp->ImageBase
1831	mov	56($disp),%r11		# disp->HandlerData
1832
1833	mov	0(%r11),%r10d		# HandlerData[0]
1834	lea	(%rsi,%r10),%r10	# prologue label
1835	cmp	%r10,%rbx		# context->Rip<prologue label
1836	jb	.Lcommon_seh_tail
1837
1838	mov	4(%r11),%r10d		# HandlerData[1]
1839	lea	(%rsi,%r10),%r10	# epilogue label
1840	cmp	%r10,%rbx		# context->Rip>=epilogue label
1841	jae	.Lcommon_seh_tail
1842
1843	mov	160($context),%rbp	# pull context->Rbp
1844
1845	mov	8(%r11),%r10d		# HandlerData[2]
1846	lea	(%rsi,%r10),%r10	# "in tail" label
1847	cmp	%r10,%rbx		# context->Rip>="in tail" label
1848	cmovc	%rbp,%rax
1849
1850	mov	-48(%rax),%r15
1851	mov	-40(%rax),%r14
1852	mov	-32(%rax),%r13
1853	mov	-24(%rax),%r12
1854	mov	-16(%rax),%rbp
1855	mov	-8(%rax),%rbx
1856	mov	%r15,240($context)
1857	mov	%r14,232($context)
1858	mov	%r13,224($context)
1859	mov	%r12,216($context)
1860	mov	%rbp,160($context)
1861	mov	%rbx,144($context)
1862
1863	lea	-0xd8(%rax),%rsi	# %xmm save area
1864	lea	512($context),%rdi	# & context.Xmm6
1865	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
1866	.long	0xa548f3fc		# cld; rep movsq
1867
1868.Lcommon_seh_tail:
1869	mov	8(%rax),%rdi
1870	mov	16(%rax),%rsi
1871	mov	%rax,152($context)	# restore context->Rsp
1872	mov	%rsi,168($context)	# restore context->Rsi
1873	mov	%rdi,176($context)	# restore context->Rdi
1874
1875	mov	40($disp),%rdi		# disp->ContextRecord
1876	mov	$context,%rsi		# context
1877	mov	\$154,%ecx		# sizeof(CONTEXT)
1878	.long	0xa548f3fc		# cld; rep movsq
1879
1880	mov	$disp,%rsi
1881	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1882	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1883	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1884	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1885	mov	40(%rsi),%r10		# disp->ContextRecord
1886	lea	56(%rsi),%r11		# &disp->HandlerData
1887	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1888	mov	%r10,32(%rsp)		# arg5
1889	mov	%r11,40(%rsp)		# arg6
1890	mov	%r12,48(%rsp)		# arg7
1891	mov	%rcx,56(%rsp)		# arg8, (NULL)
1892	call	*__imp_RtlVirtualUnwind(%rip)
1893
1894	mov	\$1,%eax		# ExceptionContinueSearch
1895	add	\$64,%rsp
1896	popfq
1897	pop	%r15
1898	pop	%r14
1899	pop	%r13
1900	pop	%r12
1901	pop	%rbp
1902	pop	%rbx
1903	pop	%rdi
1904	pop	%rsi
1905	ret
1906.size	rsaz_se_handler,.-rsaz_se_handler
1907
1908.section	.pdata
1909.align	4
1910	.rva	.LSEH_begin_rsaz_1024_sqr_avx2
1911	.rva	.LSEH_end_rsaz_1024_sqr_avx2
1912	.rva	.LSEH_info_rsaz_1024_sqr_avx2
1913
1914	.rva	.LSEH_begin_rsaz_1024_mul_avx2
1915	.rva	.LSEH_end_rsaz_1024_mul_avx2
1916	.rva	.LSEH_info_rsaz_1024_mul_avx2
1917
1918	.rva	.LSEH_begin_rsaz_1024_gather5
1919	.rva	.LSEH_end_rsaz_1024_gather5
1920	.rva	.LSEH_info_rsaz_1024_gather5
1921.section	.xdata
1922.align	8
1923.LSEH_info_rsaz_1024_sqr_avx2:
1924	.byte	9,0,0,0
1925	.rva	rsaz_se_handler
1926	.rva	.Lsqr_1024_body,.Lsqr_1024_epilogue,.Lsqr_1024_in_tail
1927	.long	0
1928.LSEH_info_rsaz_1024_mul_avx2:
1929	.byte	9,0,0,0
1930	.rva	rsaz_se_handler
1931	.rva	.Lmul_1024_body,.Lmul_1024_epilogue,.Lmul_1024_in_tail
1932	.long	0
1933.LSEH_info_rsaz_1024_gather5:
1934	.byte	0x01,0x36,0x17,0x0b
1935	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
1936	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
1937	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
1938	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
1939	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
1940	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
1941	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
1942	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
1943	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
1944	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
1945	.byte	0x04,0x01,0x15,0x00	# sub	  rsp,0xa8
1946	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
1947___
1948}
1949
1950foreach (split("\n",$code)) {
1951	s/\`([^\`]*)\`/eval($1)/ge;
1952
1953	s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge		or
1954
1955	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
1956	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
1957	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
1958	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
1959	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
1960	print $_,"\n";
1961}
1962
1963}}} else {{{
1964print <<___;	# assembler is too old
1965.text
1966
1967.globl	rsaz_avx2_eligible
1968.type	rsaz_avx2_eligible,\@abi-omnipotent
1969rsaz_avx2_eligible:
1970	xor	%eax,%eax
1971	ret
1972.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible
1973
1974.globl	rsaz_1024_sqr_avx2
1975.globl	rsaz_1024_mul_avx2
1976.globl	rsaz_1024_norm2red_avx2
1977.globl	rsaz_1024_red2norm_avx2
1978.globl	rsaz_1024_scatter5_avx2
1979.globl	rsaz_1024_gather5_avx2
1980.type	rsaz_1024_sqr_avx2,\@abi-omnipotent
1981rsaz_1024_sqr_avx2:
1982rsaz_1024_mul_avx2:
1983rsaz_1024_norm2red_avx2:
1984rsaz_1024_red2norm_avx2:
1985rsaz_1024_scatter5_avx2:
1986rsaz_1024_gather5_avx2:
1987	.byte	0x0f,0x0b	# ud2
1988	ret
1989.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
1990___
1991}}}
1992
1993close STDOUT;
1994