1#! /usr/bin/env perl
2# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3# Copyright (c) 2012, Intel Corporation. All Rights Reserved.
4#
5# Licensed under the OpenSSL license (the "License").  You may not use
6# this file except in compliance with the License.  You can obtain a copy
7# in the file LICENSE in the source distribution or at
8# https://www.openssl.org/source/license.html
9#
10# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
11# (1) Intel Corporation, Israel Development Center, Haifa, Israel
12# (2) University of Haifa, Israel
13#
14# References:
15# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular
16#     Exponentiation,  Using Advanced Vector Instructions Architectures",
17#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,
18#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012
19# [2] S. Gueron: "Efficient Software Implementations of Modular
20#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
21# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE
22#     Proceedings of 9th International Conference on Information Technology:
23#     New Generations (ITNG 2012), pp.821-823 (2012)
24# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
25#     resistant 1024-bit modular exponentiation, for optimizing RSA2048
26#     on AVX2 capable x86_64 platforms",
27#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest
28#
29# +13% improvement over original submission by <appro@openssl.org>
30#
31# rsa2048 sign/sec	OpenSSL 1.0.1	scalar(*)	this
32# 2.3GHz Haswell	621		765/+23%	1113/+79%
33# 2.3GHz Broadwell(**)	688		1200(***)/+74%	1120/+63%
34#
35# (*)	if system doesn't support AVX2, for reference purposes;
36# (**)	scaled to 2.3GHz to simplify comparison;
37# (***)	scalar AD*X code is faster than AVX2 and is preferred code
38#	path for Broadwell;
39
40$flavour = shift;
41$output  = shift;
42if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
43
44$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
45
46$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
48( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
49die "can't locate x86_64-xlate.pl";
50
51# In upstream, this is controlled by shelling out to the compiler to check
52# versions, but BoringSSL is intended to be used with pre-generated perlasm
53# output, so this isn't useful anyway.
54$avx = 2;
55
56open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
57*STDOUT = *OUT;
58
59if ($avx>1) {{{
60{ # void AMS_WW(
61my $rp="%rdi";	# BN_ULONG *rp,
62my $ap="%rsi";	# const BN_ULONG *ap,
63my $np="%rdx";	# const BN_ULONG *np,
64my $n0="%ecx";	# const BN_ULONG n0,
65my $rep="%r8d";	# int repeat);
66
67# The registers that hold the accumulated redundant result
68# The AMM works on 1024-bit operands, and the redundant word size is 29 bits
69# Therefore: ceil(1024/29)/4 = 9
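# (Worked out: 1024/29 = 35.3..., so a 1024-bit operand needs
#  ceil(1024/29) = 36 digits of 29 bits; packed four to a 256-bit register,
#  that is 36/4 = 9 accumulators.  29-bit digits also leave 64-2*29 = 6 bits
#  of headroom per 64-bit lane for summing vpmuludq partial products.)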
70my $ACC0="%ymm0";
71my $ACC1="%ymm1";
72my $ACC2="%ymm2";
73my $ACC3="%ymm3";
74my $ACC4="%ymm4";
75my $ACC5="%ymm5";
76my $ACC6="%ymm6";
77my $ACC7="%ymm7";
78my $ACC8="%ymm8";
79my $ACC9="%ymm9";
80# Registers that hold the broadcasted words of bp, currently used
81my $B1="%ymm10";
82my $B2="%ymm11";
83# Registers that hold the broadcasted words of Y, currently used
84my $Y1="%ymm12";
85my $Y2="%ymm13";
86# Helper registers
87my $TEMP1="%ymm14";
88my $AND_MASK="%ymm15";
89# alu registers that hold the first words of the ACC
90my $r0="%r9";
91my $r1="%r10";
92my $r2="%r11";
93my $r3="%r12";
94
95my $i="%r14d";			# loop counter
96my $tmp = "%r15";
97
98my $FrameSize=32*18+32*8;	# place for A^2 and 2*A
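# (32*18 bytes of it hold the double-width redundant square, up to 72
#  29-bit digit slots in 18 ymm-sized rows; the remaining 32*8 bytes hold
#  the doubled copy of the operand used for the cross terms.)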
99
100my $aap=$r0;
101my $tp0="%rbx";
102my $tp1=$r3;
103my $tpa=$tmp;
104
105$np="%r13";			# reassigned argument
106
107$code.=<<___;
108.text
109
110.globl	rsaz_1024_sqr_avx2
111.type	rsaz_1024_sqr_avx2,\@function,5
112.align	64
113rsaz_1024_sqr_avx2:		# 702 cycles, 14% faster than rsaz_1024_mul_avx2
114.cfi_startproc
115	_CET_ENDBR
116	lea	(%rsp), %rax
117.cfi_def_cfa_register	%rax
118	push	%rbx
119.cfi_push	%rbx
120	push	%rbp
121.cfi_push	%rbp
122	push	%r12
123.cfi_push	%r12
124	push	%r13
125.cfi_push	%r13
126	push	%r14
127.cfi_push	%r14
128	push	%r15
129.cfi_push	%r15
130	vzeroupper
131___
132$code.=<<___ if ($win64);
133	lea	-0xa8(%rsp),%rsp
134	vmovaps	%xmm6,-0xd8(%rax)
135	vmovaps	%xmm7,-0xc8(%rax)
136	vmovaps	%xmm8,-0xb8(%rax)
137	vmovaps	%xmm9,-0xa8(%rax)
138	vmovaps	%xmm10,-0x98(%rax)
139	vmovaps	%xmm11,-0x88(%rax)
140	vmovaps	%xmm12,-0x78(%rax)
141	vmovaps	%xmm13,-0x68(%rax)
142	vmovaps	%xmm14,-0x58(%rax)
143	vmovaps	%xmm15,-0x48(%rax)
144.Lsqr_1024_body:
145___
146$code.=<<___;
147	mov	%rax,%rbp
148.cfi_def_cfa_register	%rbp
149	mov	%rdx, $np			# reassigned argument
150	sub	\$$FrameSize, %rsp
151	mov	$np, $tmp
152	sub	\$-128, $rp			# size optimization
153	sub	\$-128, $ap
154	sub	\$-128, $np
155
156	and	\$4095, $tmp			# see if $np crosses page
157	add	\$32*10, $tmp
158	shr	\$12, $tmp
159	vpxor	$ACC9,$ACC9,$ACC9
160	jz	.Lsqr_1024_no_n_copy
161
162	# an unaligned 256-bit load that crosses a page boundary can
163	# cause >2x performance degradation here, so if $np does
164	# cross a page boundary, copy it to the stack and make sure
165	# the stack frame doesn't...
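	# (the check above added 32*10 = 320, the size of the vector image
	#  of the modulus, to its offset within a 4KiB page and shifted
	#  right by 12; the result is non-zero exactly when that 320-byte
	#  window straddles a page boundary.)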
166	sub		\$32*10,%rsp
167	vmovdqu		32*0-128($np), $ACC0
168	and		\$-2048, %rsp
169	vmovdqu		32*1-128($np), $ACC1
170	vmovdqu		32*2-128($np), $ACC2
171	vmovdqu		32*3-128($np), $ACC3
172	vmovdqu		32*4-128($np), $ACC4
173	vmovdqu		32*5-128($np), $ACC5
174	vmovdqu		32*6-128($np), $ACC6
175	vmovdqu		32*7-128($np), $ACC7
176	vmovdqu		32*8-128($np), $ACC8
177	lea		$FrameSize+128(%rsp),$np
178	vmovdqu		$ACC0, 32*0-128($np)
179	vmovdqu		$ACC1, 32*1-128($np)
180	vmovdqu		$ACC2, 32*2-128($np)
181	vmovdqu		$ACC3, 32*3-128($np)
182	vmovdqu		$ACC4, 32*4-128($np)
183	vmovdqu		$ACC5, 32*5-128($np)
184	vmovdqu		$ACC6, 32*6-128($np)
185	vmovdqu		$ACC7, 32*7-128($np)
186	vmovdqu		$ACC8, 32*8-128($np)
187	vmovdqu		$ACC9, 32*9-128($np)	# $ACC9 is zero
188
189.Lsqr_1024_no_n_copy:
190	and		\$-1024, %rsp
191
192	vmovdqu		32*1-128($ap), $ACC1
193	vmovdqu		32*2-128($ap), $ACC2
194	vmovdqu		32*3-128($ap), $ACC3
195	vmovdqu		32*4-128($ap), $ACC4
196	vmovdqu		32*5-128($ap), $ACC5
197	vmovdqu		32*6-128($ap), $ACC6
198	vmovdqu		32*7-128($ap), $ACC7
199	vmovdqu		32*8-128($ap), $ACC8
200
201	lea	192(%rsp), $tp0			# 64+128=192
202	vmovdqu	.Land_mask(%rip), $AND_MASK
203	jmp	.LOOP_GRANDE_SQR_1024
204
205.align	32
206.LOOP_GRANDE_SQR_1024:
207	lea	32*18+128(%rsp), $aap		# size optimization
208	lea	448(%rsp), $tp1			# 64+128+256=448
209
210	# the squaring is performed as described in Variant B of
211	# "Speeding up Big-Number Squaring", so start by calculating
212	# the A*2=A+A vector
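	# (in redundant form A = sum_i a_i * 2^(29*i), so
	#  A^2 = sum_i a_i^2 * 2^(58*i) + sum_{i<j} a_i*(2*a_j) * 2^(29*(i+j));
	#  the doubled copy written out here supplies the 2*a_j cross-term
	#  factors, while the diagonal terms keep using the original A.)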
213	vpaddq		$ACC1, $ACC1, $ACC1
214	 vpbroadcastq	32*0-128($ap), $B1
215	vpaddq		$ACC2, $ACC2, $ACC2
216	vmovdqa		$ACC1, 32*0-128($aap)
217	vpaddq		$ACC3, $ACC3, $ACC3
218	vmovdqa		$ACC2, 32*1-128($aap)
219	vpaddq		$ACC4, $ACC4, $ACC4
220	vmovdqa		$ACC3, 32*2-128($aap)
221	vpaddq		$ACC5, $ACC5, $ACC5
222	vmovdqa		$ACC4, 32*3-128($aap)
223	vpaddq		$ACC6, $ACC6, $ACC6
224	vmovdqa		$ACC5, 32*4-128($aap)
225	vpaddq		$ACC7, $ACC7, $ACC7
226	vmovdqa		$ACC6, 32*5-128($aap)
227	vpaddq		$ACC8, $ACC8, $ACC8
228	vmovdqa		$ACC7, 32*6-128($aap)
229	vpxor		$ACC9, $ACC9, $ACC9
230	vmovdqa		$ACC8, 32*7-128($aap)
231
232	vpmuludq	32*0-128($ap), $B1, $ACC0
233	 vpbroadcastq	32*1-128($ap), $B2
234	 vmovdqu	$ACC9, 32*9-192($tp0)	# zero upper half
235	vpmuludq	$B1, $ACC1, $ACC1
236	 vmovdqu	$ACC9, 32*10-448($tp1)
237	vpmuludq	$B1, $ACC2, $ACC2
238	 vmovdqu	$ACC9, 32*11-448($tp1)
239	vpmuludq	$B1, $ACC3, $ACC3
240	 vmovdqu	$ACC9, 32*12-448($tp1)
241	vpmuludq	$B1, $ACC4, $ACC4
242	 vmovdqu	$ACC9, 32*13-448($tp1)
243	vpmuludq	$B1, $ACC5, $ACC5
244	 vmovdqu	$ACC9, 32*14-448($tp1)
245	vpmuludq	$B1, $ACC6, $ACC6
246	 vmovdqu	$ACC9, 32*15-448($tp1)
247	vpmuludq	$B1, $ACC7, $ACC7
248	 vmovdqu	$ACC9, 32*16-448($tp1)
249	vpmuludq	$B1, $ACC8, $ACC8
250	 vpbroadcastq	32*2-128($ap), $B1
251	 vmovdqu	$ACC9, 32*17-448($tp1)
252
253	mov	$ap, $tpa
254	mov 	\$4, $i
255	jmp	.Lsqr_entry_1024
256___
257$TEMP0=$Y1;
258$TEMP2=$Y2;
259$code.=<<___;
260.align	32
261.LOOP_SQR_1024:
262	 vpbroadcastq	32*1-128($tpa), $B2
263	vpmuludq	32*0-128($ap), $B1, $ACC0
264	vpaddq		32*0-192($tp0), $ACC0, $ACC0
265	vpmuludq	32*0-128($aap), $B1, $ACC1
266	vpaddq		32*1-192($tp0), $ACC1, $ACC1
267	vpmuludq	32*1-128($aap), $B1, $ACC2
268	vpaddq		32*2-192($tp0), $ACC2, $ACC2
269	vpmuludq	32*2-128($aap), $B1, $ACC3
270	vpaddq		32*3-192($tp0), $ACC3, $ACC3
271	vpmuludq	32*3-128($aap), $B1, $ACC4
272	vpaddq		32*4-192($tp0), $ACC4, $ACC4
273	vpmuludq	32*4-128($aap), $B1, $ACC5
274	vpaddq		32*5-192($tp0), $ACC5, $ACC5
275	vpmuludq	32*5-128($aap), $B1, $ACC6
276	vpaddq		32*6-192($tp0), $ACC6, $ACC6
277	vpmuludq	32*6-128($aap), $B1, $ACC7
278	vpaddq		32*7-192($tp0), $ACC7, $ACC7
279	vpmuludq	32*7-128($aap), $B1, $ACC8
280	 vpbroadcastq	32*2-128($tpa), $B1
281	vpaddq		32*8-192($tp0), $ACC8, $ACC8
282.Lsqr_entry_1024:
283	vmovdqu		$ACC0, 32*0-192($tp0)
284	vmovdqu		$ACC1, 32*1-192($tp0)
285
286	vpmuludq	32*1-128($ap), $B2, $TEMP0
287	vpaddq		$TEMP0, $ACC2, $ACC2
288	vpmuludq	32*1-128($aap), $B2, $TEMP1
289	vpaddq		$TEMP1, $ACC3, $ACC3
290	vpmuludq	32*2-128($aap), $B2, $TEMP2
291	vpaddq		$TEMP2, $ACC4, $ACC4
292	vpmuludq	32*3-128($aap), $B2, $TEMP0
293	vpaddq		$TEMP0, $ACC5, $ACC5
294	vpmuludq	32*4-128($aap), $B2, $TEMP1
295	vpaddq		$TEMP1, $ACC6, $ACC6
296	vpmuludq	32*5-128($aap), $B2, $TEMP2
297	vpaddq		$TEMP2, $ACC7, $ACC7
298	vpmuludq	32*6-128($aap), $B2, $TEMP0
299	vpaddq		$TEMP0, $ACC8, $ACC8
300	vpmuludq	32*7-128($aap), $B2, $ACC0
301	 vpbroadcastq	32*3-128($tpa), $B2
302	vpaddq		32*9-192($tp0), $ACC0, $ACC0
303
304	vmovdqu		$ACC2, 32*2-192($tp0)
305	vmovdqu		$ACC3, 32*3-192($tp0)
306
307	vpmuludq	32*2-128($ap), $B1, $TEMP2
308	vpaddq		$TEMP2, $ACC4, $ACC4
309	vpmuludq	32*2-128($aap), $B1, $TEMP0
310	vpaddq		$TEMP0, $ACC5, $ACC5
311	vpmuludq	32*3-128($aap), $B1, $TEMP1
312	vpaddq		$TEMP1, $ACC6, $ACC6
313	vpmuludq	32*4-128($aap), $B1, $TEMP2
314	vpaddq		$TEMP2, $ACC7, $ACC7
315	vpmuludq	32*5-128($aap), $B1, $TEMP0
316	vpaddq		$TEMP0, $ACC8, $ACC8
317	vpmuludq	32*6-128($aap), $B1, $TEMP1
318	vpaddq		$TEMP1, $ACC0, $ACC0
319	vpmuludq	32*7-128($aap), $B1, $ACC1
320	 vpbroadcastq	32*4-128($tpa), $B1
321	vpaddq		32*10-448($tp1), $ACC1, $ACC1
322
323	vmovdqu		$ACC4, 32*4-192($tp0)
324	vmovdqu		$ACC5, 32*5-192($tp0)
325
326	vpmuludq	32*3-128($ap), $B2, $TEMP0
327	vpaddq		$TEMP0, $ACC6, $ACC6
328	vpmuludq	32*3-128($aap), $B2, $TEMP1
329	vpaddq		$TEMP1, $ACC7, $ACC7
330	vpmuludq	32*4-128($aap), $B2, $TEMP2
331	vpaddq		$TEMP2, $ACC8, $ACC8
332	vpmuludq	32*5-128($aap), $B2, $TEMP0
333	vpaddq		$TEMP0, $ACC0, $ACC0
334	vpmuludq	32*6-128($aap), $B2, $TEMP1
335	vpaddq		$TEMP1, $ACC1, $ACC1
336	vpmuludq	32*7-128($aap), $B2, $ACC2
337	 vpbroadcastq	32*5-128($tpa), $B2
338	vpaddq		32*11-448($tp1), $ACC2, $ACC2
339
340	vmovdqu		$ACC6, 32*6-192($tp0)
341	vmovdqu		$ACC7, 32*7-192($tp0)
342
343	vpmuludq	32*4-128($ap), $B1, $TEMP0
344	vpaddq		$TEMP0, $ACC8, $ACC8
345	vpmuludq	32*4-128($aap), $B1, $TEMP1
346	vpaddq		$TEMP1, $ACC0, $ACC0
347	vpmuludq	32*5-128($aap), $B1, $TEMP2
348	vpaddq		$TEMP2, $ACC1, $ACC1
349	vpmuludq	32*6-128($aap), $B1, $TEMP0
350	vpaddq		$TEMP0, $ACC2, $ACC2
351	vpmuludq	32*7-128($aap), $B1, $ACC3
352	 vpbroadcastq	32*6-128($tpa), $B1
353	vpaddq		32*12-448($tp1), $ACC3, $ACC3
354
355	vmovdqu		$ACC8, 32*8-192($tp0)
356	vmovdqu		$ACC0, 32*9-192($tp0)
357	lea		8($tp0), $tp0
358
359	vpmuludq	32*5-128($ap), $B2, $TEMP2
360	vpaddq		$TEMP2, $ACC1, $ACC1
361	vpmuludq	32*5-128($aap), $B2, $TEMP0
362	vpaddq		$TEMP0, $ACC2, $ACC2
363	vpmuludq	32*6-128($aap), $B2, $TEMP1
364	vpaddq		$TEMP1, $ACC3, $ACC3
365	vpmuludq	32*7-128($aap), $B2, $ACC4
366	 vpbroadcastq	32*7-128($tpa), $B2
367	vpaddq		32*13-448($tp1), $ACC4, $ACC4
368
369	vmovdqu		$ACC1, 32*10-448($tp1)
370	vmovdqu		$ACC2, 32*11-448($tp1)
371
372	vpmuludq	32*6-128($ap), $B1, $TEMP0
373	vpaddq		$TEMP0, $ACC3, $ACC3
374	vpmuludq	32*6-128($aap), $B1, $TEMP1
375	 vpbroadcastq	32*8-128($tpa), $ACC0		# borrow $ACC0 for $B1
376	vpaddq		$TEMP1, $ACC4, $ACC4
377	vpmuludq	32*7-128($aap), $B1, $ACC5
378	 vpbroadcastq	32*0+8-128($tpa), $B1		# for next iteration
379	vpaddq		32*14-448($tp1), $ACC5, $ACC5
380
381	vmovdqu		$ACC3, 32*12-448($tp1)
382	vmovdqu		$ACC4, 32*13-448($tp1)
383	lea		8($tpa), $tpa
384
385	vpmuludq	32*7-128($ap), $B2, $TEMP0
386	vpaddq		$TEMP0, $ACC5, $ACC5
387	vpmuludq	32*7-128($aap), $B2, $ACC6
388	vpaddq		32*15-448($tp1), $ACC6, $ACC6
389
390	vpmuludq	32*8-128($ap), $ACC0, $ACC7
391	vmovdqu		$ACC5, 32*14-448($tp1)
392	vpaddq		32*16-448($tp1), $ACC7, $ACC7
393	vmovdqu		$ACC6, 32*15-448($tp1)
394	vmovdqu		$ACC7, 32*16-448($tp1)
395	lea		8($tp1), $tp1
396
397	dec	$i
398	jnz	.LOOP_SQR_1024
399___
400$ZERO = $ACC9;
401$TEMP0 = $B1;
402$TEMP2 = $B2;
403$TEMP3 = $Y1;
404$TEMP4 = $Y2;
405$code.=<<___;
406	# we need to fix indices 32-39 to avoid overflow
407	vmovdqu		32*8(%rsp), $ACC8		# 32*8-192($tp0),
408	vmovdqu		32*9(%rsp), $ACC1		# 32*9-192($tp0)
409	vmovdqu		32*10(%rsp), $ACC2		# 32*10-192($tp0)
410	lea		192(%rsp), $tp0			# 64+128=192
411
412	vpsrlq		\$29, $ACC8, $TEMP1
413	vpand		$AND_MASK, $ACC8, $ACC8
414	vpsrlq		\$29, $ACC1, $TEMP2
415	vpand		$AND_MASK, $ACC1, $ACC1
416
417	vpermq		\$0x93, $TEMP1, $TEMP1
418	vpxor		$ZERO, $ZERO, $ZERO
419	vpermq		\$0x93, $TEMP2, $TEMP2
420
421	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
422	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
423	vpaddq		$TEMP0, $ACC8, $ACC8
424	vpblendd	\$3, $TEMP2, $ZERO, $TEMP2
425	vpaddq		$TEMP1, $ACC1, $ACC1
426	vpaddq		$TEMP2, $ACC2, $ACC2
427	vmovdqu		$ACC1, 32*9-192($tp0)
428	vmovdqu		$ACC2, 32*10-192($tp0)
429
430	mov	(%rsp), %rax
431	mov	8(%rsp), $r1
432	mov	16(%rsp), $r2
433	mov	24(%rsp), $r3
434	vmovdqu	32*1(%rsp), $ACC1
435	vmovdqu	32*2-192($tp0), $ACC2
436	vmovdqu	32*3-192($tp0), $ACC3
437	vmovdqu	32*4-192($tp0), $ACC4
438	vmovdqu	32*5-192($tp0), $ACC5
439	vmovdqu	32*6-192($tp0), $ACC6
440	vmovdqu	32*7-192($tp0), $ACC7
441
442	mov	%rax, $r0
443	imull	$n0, %eax
444	and	\$0x1fffffff, %eax
445	vmovd	%eax, $Y1
446
447	mov	%rax, %rdx
448	imulq	-128($np), %rax
449	 vpbroadcastq	$Y1, $Y1
450	add	%rax, $r0
451	mov	%rdx, %rax
452	imulq	8-128($np), %rax
453	shr	\$29, $r0
454	add	%rax, $r1
455	mov	%rdx, %rax
456	imulq	16-128($np), %rax
457	add	$r0, $r1
458	add	%rax, $r2
459	imulq	24-128($np), %rdx
460	add	%rdx, $r3
461
462	mov	$r1, %rax
463	imull	$n0, %eax
464	and	\$0x1fffffff, %eax
465
466	mov \$9, $i
467	jmp .LOOP_REDUCE_1024
468
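	# Montgomery reduction of the redundant product, in outline: each of
	# the 9 iterations derives the next multiplier digits from the low
	# accumulator words and n0, folds those multiples of the modulus into
	# the vector accumulators, and advances the scalar shadow of the
	# lowest digits kept in r0-r3.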
469.align	32
470.LOOP_REDUCE_1024:
471	vmovd	%eax, $Y2
472	vpbroadcastq	$Y2, $Y2
473
474	vpmuludq	32*1-128($np), $Y1, $TEMP0
475	 mov	%rax, %rdx
476	 imulq	-128($np), %rax
477	vpaddq		$TEMP0, $ACC1, $ACC1
478	 add	%rax, $r1
479	vpmuludq	32*2-128($np), $Y1, $TEMP1
480	 mov	%rdx, %rax
481	 imulq	8-128($np), %rax
482	vpaddq		$TEMP1, $ACC2, $ACC2
483	vpmuludq	32*3-128($np), $Y1, $TEMP2
484	 .byte	0x67
485	 add	%rax, $r2
486	 .byte	0x67
487	 mov	%rdx, %rax
488	 imulq	16-128($np), %rax
489	 shr	\$29, $r1
490	vpaddq		$TEMP2, $ACC3, $ACC3
491	vpmuludq	32*4-128($np), $Y1, $TEMP0
492	 add	%rax, $r3
493	 add	$r1, $r2
494	vpaddq		$TEMP0, $ACC4, $ACC4
495	vpmuludq	32*5-128($np), $Y1, $TEMP1
496	 mov	$r2, %rax
497	 imull	$n0, %eax
498	vpaddq		$TEMP1, $ACC5, $ACC5
499	vpmuludq	32*6-128($np), $Y1, $TEMP2
500	 and	\$0x1fffffff, %eax
501	vpaddq		$TEMP2, $ACC6, $ACC6
502	vpmuludq	32*7-128($np), $Y1, $TEMP0
503	vpaddq		$TEMP0, $ACC7, $ACC7
504	vpmuludq	32*8-128($np), $Y1, $TEMP1
505	 vmovd	%eax, $Y1
506	 #vmovdqu	32*1-8-128($np), $TEMP2		# moved below
507	vpaddq		$TEMP1, $ACC8, $ACC8
508	 #vmovdqu	32*2-8-128($np), $TEMP0		# moved below
509	 vpbroadcastq	$Y1, $Y1
510
511	vpmuludq	32*1-8-128($np), $Y2, $TEMP2	# see above
512	vmovdqu		32*3-8-128($np), $TEMP1
513	 mov	%rax, %rdx
514	 imulq	-128($np), %rax
515	vpaddq		$TEMP2, $ACC1, $ACC1
516	vpmuludq	32*2-8-128($np), $Y2, $TEMP0	# see above
517	vmovdqu		32*4-8-128($np), $TEMP2
518	 add	%rax, $r2
519	 mov	%rdx, %rax
520	 imulq	8-128($np), %rax
521	vpaddq		$TEMP0, $ACC2, $ACC2
522	 add	$r3, %rax
523	 shr	\$29, $r2
524	vpmuludq	$Y2, $TEMP1, $TEMP1
525	vmovdqu		32*5-8-128($np), $TEMP0
526	 add	$r2, %rax
527	vpaddq		$TEMP1, $ACC3, $ACC3
528	vpmuludq	$Y2, $TEMP2, $TEMP2
529	vmovdqu		32*6-8-128($np), $TEMP1
530	 .byte	0x67
531	 mov	%rax, $r3
532	 imull	$n0, %eax
533	vpaddq		$TEMP2, $ACC4, $ACC4
534	vpmuludq	$Y2, $TEMP0, $TEMP0
535	.byte	0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00	# vmovdqu		32*7-8-128($np), $TEMP2
536	 and	\$0x1fffffff, %eax
537	vpaddq		$TEMP0, $ACC5, $ACC5
538	vpmuludq	$Y2, $TEMP1, $TEMP1
539	vmovdqu		32*8-8-128($np), $TEMP0
540	vpaddq		$TEMP1, $ACC6, $ACC6
541	vpmuludq	$Y2, $TEMP2, $TEMP2
542	vmovdqu		32*9-8-128($np), $ACC9
543	 vmovd	%eax, $ACC0			# borrow ACC0 for Y2
544	 imulq	-128($np), %rax
545	vpaddq		$TEMP2, $ACC7, $ACC7
546	vpmuludq	$Y2, $TEMP0, $TEMP0
547	 vmovdqu	32*1-16-128($np), $TEMP1
548	 vpbroadcastq	$ACC0, $ACC0
549	vpaddq		$TEMP0, $ACC8, $ACC8
550	vpmuludq	$Y2, $ACC9, $ACC9
551	 vmovdqu	32*2-16-128($np), $TEMP2
552	 add	%rax, $r3
553
554___
555($ACC0,$Y2)=($Y2,$ACC0);
556$code.=<<___;
557	 vmovdqu	32*1-24-128($np), $ACC0
558	vpmuludq	$Y1, $TEMP1, $TEMP1
559	vmovdqu		32*3-16-128($np), $TEMP0
560	vpaddq		$TEMP1, $ACC1, $ACC1
561	 vpmuludq	$Y2, $ACC0, $ACC0
562	vpmuludq	$Y1, $TEMP2, $TEMP2
563	.byte	0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff	# vmovdqu		32*4-16-128($np), $TEMP1
564	 vpaddq		$ACC1, $ACC0, $ACC0
565	vpaddq		$TEMP2, $ACC2, $ACC2
566	vpmuludq	$Y1, $TEMP0, $TEMP0
567	vmovdqu		32*5-16-128($np), $TEMP2
568	 .byte	0x67
569	 vmovq		$ACC0, %rax
570	 vmovdqu	$ACC0, (%rsp)		# transfer $r0-$r3
571	vpaddq		$TEMP0, $ACC3, $ACC3
572	vpmuludq	$Y1, $TEMP1, $TEMP1
573	vmovdqu		32*6-16-128($np), $TEMP0
574	vpaddq		$TEMP1, $ACC4, $ACC4
575	vpmuludq	$Y1, $TEMP2, $TEMP2
576	vmovdqu		32*7-16-128($np), $TEMP1
577	vpaddq		$TEMP2, $ACC5, $ACC5
578	vpmuludq	$Y1, $TEMP0, $TEMP0
579	vmovdqu		32*8-16-128($np), $TEMP2
580	vpaddq		$TEMP0, $ACC6, $ACC6
581	vpmuludq	$Y1, $TEMP1, $TEMP1
582	 shr	\$29, $r3
583	vmovdqu		32*9-16-128($np), $TEMP0
584	 add	$r3, %rax
585	vpaddq		$TEMP1, $ACC7, $ACC7
586	vpmuludq	$Y1, $TEMP2, $TEMP2
587	 #vmovdqu	32*2-24-128($np), $TEMP1	# moved below
588	 mov	%rax, $r0
589	 imull	$n0, %eax
590	vpaddq		$TEMP2, $ACC8, $ACC8
591	vpmuludq	$Y1, $TEMP0, $TEMP0
592	 and	\$0x1fffffff, %eax
593	 vmovd	%eax, $Y1
594	 vmovdqu	32*3-24-128($np), $TEMP2
595	.byte	0x67
596	vpaddq		$TEMP0, $ACC9, $ACC9
597	 vpbroadcastq	$Y1, $Y1
598
599	vpmuludq	32*2-24-128($np), $Y2, $TEMP1	# see above
600	vmovdqu		32*4-24-128($np), $TEMP0
601	 mov	%rax, %rdx
602	 imulq	-128($np), %rax
603	 mov	8(%rsp), $r1
604	vpaddq		$TEMP1, $ACC2, $ACC1
605	vpmuludq	$Y2, $TEMP2, $TEMP2
606	vmovdqu		32*5-24-128($np), $TEMP1
607	 add	%rax, $r0
608	 mov	%rdx, %rax
609	 imulq	8-128($np), %rax
610	 .byte	0x67
611	 shr	\$29, $r0
612	 mov	16(%rsp), $r2
613	vpaddq		$TEMP2, $ACC3, $ACC2
614	vpmuludq	$Y2, $TEMP0, $TEMP0
615	vmovdqu		32*6-24-128($np), $TEMP2
616	 add	%rax, $r1
617	 mov	%rdx, %rax
618	 imulq	16-128($np), %rax
619	vpaddq		$TEMP0, $ACC4, $ACC3
620	vpmuludq	$Y2, $TEMP1, $TEMP1
621	vmovdqu		32*7-24-128($np), $TEMP0
622	 imulq	24-128($np), %rdx		# future $r3
623	 add	%rax, $r2
624	 lea	($r0,$r1), %rax
625	vpaddq		$TEMP1, $ACC5, $ACC4
626	vpmuludq	$Y2, $TEMP2, $TEMP2
627	vmovdqu		32*8-24-128($np), $TEMP1
628	 mov	%rax, $r1
629	 imull	$n0, %eax
630	vpmuludq	$Y2, $TEMP0, $TEMP0
631	vpaddq		$TEMP2, $ACC6, $ACC5
632	vmovdqu		32*9-24-128($np), $TEMP2
633	 and	\$0x1fffffff, %eax
634	vpaddq		$TEMP0, $ACC7, $ACC6
635	vpmuludq	$Y2, $TEMP1, $TEMP1
636	 add	24(%rsp), %rdx
637	vpaddq		$TEMP1, $ACC8, $ACC7
638	vpmuludq	$Y2, $TEMP2, $TEMP2
639	vpaddq		$TEMP2, $ACC9, $ACC8
640	 vmovq	$r3, $ACC9
641	 mov	%rdx, $r3
642
643	dec	$i
644	jnz	.LOOP_REDUCE_1024
645___
646($ACC0,$Y2)=($Y2,$ACC0);
647$code.=<<___;
648	lea	448(%rsp), $tp1			# size optimization
649	vpaddq	$ACC9, $Y2, $ACC0
650	vpxor	$ZERO, $ZERO, $ZERO
651
652	vpaddq		32*9-192($tp0), $ACC0, $ACC0
653	vpaddq		32*10-448($tp1), $ACC1, $ACC1
654	vpaddq		32*11-448($tp1), $ACC2, $ACC2
655	vpaddq		32*12-448($tp1), $ACC3, $ACC3
656	vpaddq		32*13-448($tp1), $ACC4, $ACC4
657	vpaddq		32*14-448($tp1), $ACC5, $ACC5
658	vpaddq		32*15-448($tp1), $ACC6, $ACC6
659	vpaddq		32*16-448($tp1), $ACC7, $ACC7
660	vpaddq		32*17-448($tp1), $ACC8, $ACC8
661
662	vpsrlq		\$29, $ACC0, $TEMP1
663	vpand		$AND_MASK, $ACC0, $ACC0
664	vpsrlq		\$29, $ACC1, $TEMP2
665	vpand		$AND_MASK, $ACC1, $ACC1
666	vpsrlq		\$29, $ACC2, $TEMP3
667	vpermq		\$0x93, $TEMP1, $TEMP1
668	vpand		$AND_MASK, $ACC2, $ACC2
669	vpsrlq		\$29, $ACC3, $TEMP4
670	vpermq		\$0x93, $TEMP2, $TEMP2
671	vpand		$AND_MASK, $ACC3, $ACC3
672	vpermq		\$0x93, $TEMP3, $TEMP3
673
674	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
675	vpermq		\$0x93, $TEMP4, $TEMP4
676	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
677	vpaddq		$TEMP0, $ACC0, $ACC0
678	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
679	vpaddq		$TEMP1, $ACC1, $ACC1
680	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
681	vpaddq		$TEMP2, $ACC2, $ACC2
682	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
683	vpaddq		$TEMP3, $ACC3, $ACC3
684	vpaddq		$TEMP4, $ACC4, $ACC4
685
686	vpsrlq		\$29, $ACC0, $TEMP1
687	vpand		$AND_MASK, $ACC0, $ACC0
688	vpsrlq		\$29, $ACC1, $TEMP2
689	vpand		$AND_MASK, $ACC1, $ACC1
690	vpsrlq		\$29, $ACC2, $TEMP3
691	vpermq		\$0x93, $TEMP1, $TEMP1
692	vpand		$AND_MASK, $ACC2, $ACC2
693	vpsrlq		\$29, $ACC3, $TEMP4
694	vpermq		\$0x93, $TEMP2, $TEMP2
695	vpand		$AND_MASK, $ACC3, $ACC3
696	vpermq		\$0x93, $TEMP3, $TEMP3
697
698	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
699	vpermq		\$0x93, $TEMP4, $TEMP4
700	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
701	vpaddq		$TEMP0, $ACC0, $ACC0
702	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
703	vpaddq		$TEMP1, $ACC1, $ACC1
704	vmovdqu		$ACC0, 32*0-128($rp)
705	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
706	vpaddq		$TEMP2, $ACC2, $ACC2
707	vmovdqu		$ACC1, 32*1-128($rp)
708	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
709	vpaddq		$TEMP3, $ACC3, $ACC3
710	vmovdqu		$ACC2, 32*2-128($rp)
711	vpaddq		$TEMP4, $ACC4, $ACC4
712	vmovdqu		$ACC3, 32*3-128($rp)
713___
714$TEMP5=$ACC0;
715$code.=<<___;
716	vpsrlq		\$29, $ACC4, $TEMP1
717	vpand		$AND_MASK, $ACC4, $ACC4
718	vpsrlq		\$29, $ACC5, $TEMP2
719	vpand		$AND_MASK, $ACC5, $ACC5
720	vpsrlq		\$29, $ACC6, $TEMP3
721	vpermq		\$0x93, $TEMP1, $TEMP1
722	vpand		$AND_MASK, $ACC6, $ACC6
723	vpsrlq		\$29, $ACC7, $TEMP4
724	vpermq		\$0x93, $TEMP2, $TEMP2
725	vpand		$AND_MASK, $ACC7, $ACC7
726	vpsrlq		\$29, $ACC8, $TEMP5
727	vpermq		\$0x93, $TEMP3, $TEMP3
728	vpand		$AND_MASK, $ACC8, $ACC8
729	vpermq		\$0x93, $TEMP4, $TEMP4
730
731	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
732	vpermq		\$0x93, $TEMP5, $TEMP5
733	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
734	vpaddq		$TEMP0, $ACC4, $ACC4
735	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
736	vpaddq		$TEMP1, $ACC5, $ACC5
737	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
738	vpaddq		$TEMP2, $ACC6, $ACC6
739	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
740	vpaddq		$TEMP3, $ACC7, $ACC7
741	vpaddq		$TEMP4, $ACC8, $ACC8
742
743	vpsrlq		\$29, $ACC4, $TEMP1
744	vpand		$AND_MASK, $ACC4, $ACC4
745	vpsrlq		\$29, $ACC5, $TEMP2
746	vpand		$AND_MASK, $ACC5, $ACC5
747	vpsrlq		\$29, $ACC6, $TEMP3
748	vpermq		\$0x93, $TEMP1, $TEMP1
749	vpand		$AND_MASK, $ACC6, $ACC6
750	vpsrlq		\$29, $ACC7, $TEMP4
751	vpermq		\$0x93, $TEMP2, $TEMP2
752	vpand		$AND_MASK, $ACC7, $ACC7
753	vpsrlq		\$29, $ACC8, $TEMP5
754	vpermq		\$0x93, $TEMP3, $TEMP3
755	vpand		$AND_MASK, $ACC8, $ACC8
756	vpermq		\$0x93, $TEMP4, $TEMP4
757
758	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
759	vpermq		\$0x93, $TEMP5, $TEMP5
760	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
761	vpaddq		$TEMP0, $ACC4, $ACC4
762	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
763	vpaddq		$TEMP1, $ACC5, $ACC5
764	vmovdqu		$ACC4, 32*4-128($rp)
765	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
766	vpaddq		$TEMP2, $ACC6, $ACC6
767	vmovdqu		$ACC5, 32*5-128($rp)
768	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
769	vpaddq		$TEMP3, $ACC7, $ACC7
770	vmovdqu		$ACC6, 32*6-128($rp)
771	vpaddq		$TEMP4, $ACC8, $ACC8
772	vmovdqu		$ACC7, 32*7-128($rp)
773	vmovdqu		$ACC8, 32*8-128($rp)
774
775	mov	$rp, $ap
776	dec	$rep
777	jne	.LOOP_GRANDE_SQR_1024
778
779	vzeroall
780	mov	%rbp, %rax
781.cfi_def_cfa_register	%rax
782___
783$code.=<<___ if ($win64);
784.Lsqr_1024_in_tail:
785	movaps	-0xd8(%rax),%xmm6
786	movaps	-0xc8(%rax),%xmm7
787	movaps	-0xb8(%rax),%xmm8
788	movaps	-0xa8(%rax),%xmm9
789	movaps	-0x98(%rax),%xmm10
790	movaps	-0x88(%rax),%xmm11
791	movaps	-0x78(%rax),%xmm12
792	movaps	-0x68(%rax),%xmm13
793	movaps	-0x58(%rax),%xmm14
794	movaps	-0x48(%rax),%xmm15
795___
796$code.=<<___;
797	mov	-48(%rax),%r15
798.cfi_restore	%r15
799	mov	-40(%rax),%r14
800.cfi_restore	%r14
801	mov	-32(%rax),%r13
802.cfi_restore	%r13
803	mov	-24(%rax),%r12
804.cfi_restore	%r12
805	mov	-16(%rax),%rbp
806.cfi_restore	%rbp
807	mov	-8(%rax),%rbx
808.cfi_restore	%rbx
809	lea	(%rax),%rsp		# restore %rsp
810.cfi_def_cfa_register	%rsp
811.Lsqr_1024_epilogue:
812	ret
813.cfi_endproc
814.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
815___
816}
817
818{ # void AMM_WW(
819my $rp="%rdi";	# BN_ULONG *rp,
820my $ap="%rsi";	# const BN_ULONG *ap,
821my $bp="%rdx";	# const BN_ULONG *bp,
822my $np="%rcx";	# const BN_ULONG *np,
823my $n0="%r8d";	# unsigned int n0);
824
825# The registers that hold the accumulated redundant result
826# The AMM works on 1024-bit operands, and the redundant word size is 29 bits
827# Therefore: ceil(1024/29)/4 = 9
828my $ACC0="%ymm0";
829my $ACC1="%ymm1";
830my $ACC2="%ymm2";
831my $ACC3="%ymm3";
832my $ACC4="%ymm4";
833my $ACC5="%ymm5";
834my $ACC6="%ymm6";
835my $ACC7="%ymm7";
836my $ACC8="%ymm8";
837my $ACC9="%ymm9";
838
839# Registers that hold the broadcasted words of multiplier, currently used
840my $Bi="%ymm10";
841my $Yi="%ymm11";
842
843# Helper registers
844my $TEMP0=$ACC0;
845my $TEMP1="%ymm12";
846my $TEMP2="%ymm13";
847my $ZERO="%ymm14";
848my $AND_MASK="%ymm15";
849
850# alu registers that hold the first words of the ACC
851my $r0="%r9";
852my $r1="%r10";
853my $r2="%r11";
854my $r3="%r12";
855
856my $i="%r14d";
857my $tmp="%r15";
858
859$bp="%r13";	# reassigned argument
860
861$code.=<<___;
862.globl	rsaz_1024_mul_avx2
863.type	rsaz_1024_mul_avx2,\@function,5
864.align	64
865rsaz_1024_mul_avx2:
866.cfi_startproc
867	_CET_ENDBR
868	lea	(%rsp), %rax
869.cfi_def_cfa_register	%rax
870	push	%rbx
871.cfi_push	%rbx
872	push	%rbp
873.cfi_push	%rbp
874	push	%r12
875.cfi_push	%r12
876	push	%r13
877.cfi_push	%r13
878	push	%r14
879.cfi_push	%r14
880	push	%r15
881.cfi_push	%r15
882___
883$code.=<<___ if ($win64);
884	vzeroupper
885	lea	-0xa8(%rsp),%rsp
886	vmovaps	%xmm6,-0xd8(%rax)
887	vmovaps	%xmm7,-0xc8(%rax)
888	vmovaps	%xmm8,-0xb8(%rax)
889	vmovaps	%xmm9,-0xa8(%rax)
890	vmovaps	%xmm10,-0x98(%rax)
891	vmovaps	%xmm11,-0x88(%rax)
892	vmovaps	%xmm12,-0x78(%rax)
893	vmovaps	%xmm13,-0x68(%rax)
894	vmovaps	%xmm14,-0x58(%rax)
895	vmovaps	%xmm15,-0x48(%rax)
896.Lmul_1024_body:
897___
898$code.=<<___;
899	mov	%rax,%rbp
900.cfi_def_cfa_register	%rbp
901	vzeroall
902	mov	%rdx, $bp	# reassigned argument
903	sub	\$64,%rsp
904
905	# an unaligned 256-bit load that crosses a page boundary can
906	# cause severe performance degradation here, so if $ap does
907	# cross a page boundary, swap it with $bp [meaning that the
908	# caller is advised to lay $ap and $bp down next to each other,
909	# so that only one of them can cross a page boundary].
910	.byte	0x67,0x67
911	mov	$ap, $tmp
912	and	\$4095, $tmp
913	add	\$32*10, $tmp
914	shr	\$12, $tmp
915	mov	$ap, $tmp
916	cmovnz	$bp, $ap
917	cmovnz	$tmp, $bp
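	# (mov does not modify EFLAGS, so the cmovnz pair still acts on the
	#  zero flag produced by the shr above; the swap happens only when
	#  the 320-byte window at ap crosses a 4KiB page.)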
918
919	mov	$np, $tmp
920	sub	\$-128,$ap	# size optimization
921	sub	\$-128,$np
922	sub	\$-128,$rp
923
924	and	\$4095, $tmp	# see if $np crosses page
925	add	\$32*10, $tmp
926	.byte	0x67,0x67
927	shr	\$12, $tmp
928	jz	.Lmul_1024_no_n_copy
929
930	# an unaligned 256-bit load that crosses a page boundary can
931	# cause severe performance degradation here, so if $np does
932	# cross a page boundary, copy it to the stack and make sure
933	# the stack frame doesn't...
934	sub		\$32*10,%rsp
935	vmovdqu		32*0-128($np), $ACC0
936	and		\$-512, %rsp
937	vmovdqu		32*1-128($np), $ACC1
938	vmovdqu		32*2-128($np), $ACC2
939	vmovdqu		32*3-128($np), $ACC3
940	vmovdqu		32*4-128($np), $ACC4
941	vmovdqu		32*5-128($np), $ACC5
942	vmovdqu		32*6-128($np), $ACC6
943	vmovdqu		32*7-128($np), $ACC7
944	vmovdqu		32*8-128($np), $ACC8
945	lea		64+128(%rsp),$np
946	vmovdqu		$ACC0, 32*0-128($np)
947	vpxor		$ACC0, $ACC0, $ACC0
948	vmovdqu		$ACC1, 32*1-128($np)
949	vpxor		$ACC1, $ACC1, $ACC1
950	vmovdqu		$ACC2, 32*2-128($np)
951	vpxor		$ACC2, $ACC2, $ACC2
952	vmovdqu		$ACC3, 32*3-128($np)
953	vpxor		$ACC3, $ACC3, $ACC3
954	vmovdqu		$ACC4, 32*4-128($np)
955	vpxor		$ACC4, $ACC4, $ACC4
956	vmovdqu		$ACC5, 32*5-128($np)
957	vpxor		$ACC5, $ACC5, $ACC5
958	vmovdqu		$ACC6, 32*6-128($np)
959	vpxor		$ACC6, $ACC6, $ACC6
960	vmovdqu		$ACC7, 32*7-128($np)
961	vpxor		$ACC7, $ACC7, $ACC7
962	vmovdqu		$ACC8, 32*8-128($np)
963	vmovdqa		$ACC0, $ACC8
964	vmovdqu		$ACC9, 32*9-128($np)	# $ACC9 is zero after vzeroall
965.Lmul_1024_no_n_copy:
966	and	\$-64,%rsp
967
968	mov	($bp), %rbx
969	vpbroadcastq ($bp), $Bi
970	vmovdqu	$ACC0, (%rsp)			# clear top of stack
971	xor	$r0, $r0
972	.byte	0x67
973	xor	$r1, $r1
974	xor	$r2, $r2
975	xor	$r3, $r3
976
977	vmovdqu	.Land_mask(%rip), $AND_MASK
978	mov	\$9, $i
979	vmovdqu	$ACC9, 32*9-128($rp)		# $ACC9 is zero after vzeroall
980	jmp	.Loop_mul_1024
981
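	# Main multiply-and-reduce loop, in outline: each of the 9 iterations
	# consumes four 29-bit words of the multiplier (32 bytes of bp),
	# accumulating both the a*b partial products and the y*n Montgomery
	# correction into the ymm accumulators, while r0-r3 shadow the lowest
	# result digits in scalar registers.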
982.align	32
983.Loop_mul_1024:
984	 vpsrlq		\$29, $ACC3, $ACC9		# correct $ACC3(*)
985	mov	%rbx, %rax
986	imulq	-128($ap), %rax
987	add	$r0, %rax
988	mov	%rbx, $r1
989	imulq	8-128($ap), $r1
990	add	8(%rsp), $r1
991
992	mov	%rax, $r0
993	imull	$n0, %eax
994	and	\$0x1fffffff, %eax
995
996	 mov	%rbx, $r2
997	 imulq	16-128($ap), $r2
998	 add	16(%rsp), $r2
999
1000	 mov	%rbx, $r3
1001	 imulq	24-128($ap), $r3
1002	 add	24(%rsp), $r3
1003	vpmuludq	32*1-128($ap),$Bi,$TEMP0
1004	 vmovd		%eax, $Yi
1005	vpaddq		$TEMP0,$ACC1,$ACC1
1006	vpmuludq	32*2-128($ap),$Bi,$TEMP1
1007	 vpbroadcastq	$Yi, $Yi
1008	vpaddq		$TEMP1,$ACC2,$ACC2
1009	vpmuludq	32*3-128($ap),$Bi,$TEMP2
1010	 vpand		$AND_MASK, $ACC3, $ACC3		# correct $ACC3
1011	vpaddq		$TEMP2,$ACC3,$ACC3
1012	vpmuludq	32*4-128($ap),$Bi,$TEMP0
1013	vpaddq		$TEMP0,$ACC4,$ACC4
1014	vpmuludq	32*5-128($ap),$Bi,$TEMP1
1015	vpaddq		$TEMP1,$ACC5,$ACC5
1016	vpmuludq	32*6-128($ap),$Bi,$TEMP2
1017	vpaddq		$TEMP2,$ACC6,$ACC6
1018	vpmuludq	32*7-128($ap),$Bi,$TEMP0
1019	 vpermq		\$0x93, $ACC9, $ACC9		# correct $ACC3
1020	vpaddq		$TEMP0,$ACC7,$ACC7
1021	vpmuludq	32*8-128($ap),$Bi,$TEMP1
1022	 vpbroadcastq	8($bp), $Bi
1023	vpaddq		$TEMP1,$ACC8,$ACC8
1024
1025	mov	%rax,%rdx
1026	imulq	-128($np),%rax
1027	add	%rax,$r0
1028	mov	%rdx,%rax
1029	imulq	8-128($np),%rax
1030	add	%rax,$r1
1031	mov	%rdx,%rax
1032	imulq	16-128($np),%rax
1033	add	%rax,$r2
1034	shr	\$29, $r0
1035	imulq	24-128($np),%rdx
1036	add	%rdx,$r3
1037	add	$r0, $r1
1038
1039	vpmuludq	32*1-128($np),$Yi,$TEMP2
1040	 vmovq		$Bi, %rbx
1041	vpaddq		$TEMP2,$ACC1,$ACC1
1042	vpmuludq	32*2-128($np),$Yi,$TEMP0
1043	vpaddq		$TEMP0,$ACC2,$ACC2
1044	vpmuludq	32*3-128($np),$Yi,$TEMP1
1045	vpaddq		$TEMP1,$ACC3,$ACC3
1046	vpmuludq	32*4-128($np),$Yi,$TEMP2
1047	vpaddq		$TEMP2,$ACC4,$ACC4
1048	vpmuludq	32*5-128($np),$Yi,$TEMP0
1049	vpaddq		$TEMP0,$ACC5,$ACC5
1050	vpmuludq	32*6-128($np),$Yi,$TEMP1
1051	vpaddq		$TEMP1,$ACC6,$ACC6
1052	vpmuludq	32*7-128($np),$Yi,$TEMP2
1053	 vpblendd	\$3, $ZERO, $ACC9, $TEMP1	# correct $ACC3
1054	vpaddq		$TEMP2,$ACC7,$ACC7
1055	vpmuludq	32*8-128($np),$Yi,$TEMP0
1056	 vpaddq		$TEMP1, $ACC3, $ACC3		# correct $ACC3
1057	vpaddq		$TEMP0,$ACC8,$ACC8
1058
1059	mov	%rbx, %rax
1060	imulq	-128($ap),%rax
1061	add	%rax,$r1
1062	 vmovdqu	-8+32*1-128($ap),$TEMP1
1063	mov	%rbx, %rax
1064	imulq	8-128($ap),%rax
1065	add	%rax,$r2
1066	 vmovdqu	-8+32*2-128($ap),$TEMP2
1067
1068	mov	$r1, %rax
1069	 vpblendd	\$0xfc, $ZERO, $ACC9, $ACC9	# correct $ACC3
1070	imull	$n0, %eax
1071	 vpaddq		$ACC9,$ACC4,$ACC4		# correct $ACC3
1072	and	\$0x1fffffff, %eax
1073
1074	 imulq	16-128($ap),%rbx
1075	 add	%rbx,$r3
1076	vpmuludq	$Bi,$TEMP1,$TEMP1
1077	 vmovd		%eax, $Yi
1078	vmovdqu		-8+32*3-128($ap),$TEMP0
1079	vpaddq		$TEMP1,$ACC1,$ACC1
1080	vpmuludq	$Bi,$TEMP2,$TEMP2
1081	 vpbroadcastq	$Yi, $Yi
1082	vmovdqu		-8+32*4-128($ap),$TEMP1
1083	vpaddq		$TEMP2,$ACC2,$ACC2
1084	vpmuludq	$Bi,$TEMP0,$TEMP0
1085	vmovdqu		-8+32*5-128($ap),$TEMP2
1086	vpaddq		$TEMP0,$ACC3,$ACC3
1087	vpmuludq	$Bi,$TEMP1,$TEMP1
1088	vmovdqu		-8+32*6-128($ap),$TEMP0
1089	vpaddq		$TEMP1,$ACC4,$ACC4
1090	vpmuludq	$Bi,$TEMP2,$TEMP2
1091	vmovdqu		-8+32*7-128($ap),$TEMP1
1092	vpaddq		$TEMP2,$ACC5,$ACC5
1093	vpmuludq	$Bi,$TEMP0,$TEMP0
1094	vmovdqu		-8+32*8-128($ap),$TEMP2
1095	vpaddq		$TEMP0,$ACC6,$ACC6
1096	vpmuludq	$Bi,$TEMP1,$TEMP1
1097	vmovdqu		-8+32*9-128($ap),$ACC9
1098	vpaddq		$TEMP1,$ACC7,$ACC7
1099	vpmuludq	$Bi,$TEMP2,$TEMP2
1100	vpaddq		$TEMP2,$ACC8,$ACC8
1101	vpmuludq	$Bi,$ACC9,$ACC9
1102	 vpbroadcastq	16($bp), $Bi
1103
1104	mov	%rax,%rdx
1105	imulq	-128($np),%rax
1106	add	%rax,$r1
1107	 vmovdqu	-8+32*1-128($np),$TEMP0
1108	mov	%rdx,%rax
1109	imulq	8-128($np),%rax
1110	add	%rax,$r2
1111	 vmovdqu	-8+32*2-128($np),$TEMP1
1112	shr	\$29, $r1
1113	imulq	16-128($np),%rdx
1114	add	%rdx,$r3
1115	add	$r1, $r2
1116
1117	vpmuludq	$Yi,$TEMP0,$TEMP0
1118	 vmovq		$Bi, %rbx
1119	vmovdqu		-8+32*3-128($np),$TEMP2
1120	vpaddq		$TEMP0,$ACC1,$ACC1
1121	vpmuludq	$Yi,$TEMP1,$TEMP1
1122	vmovdqu		-8+32*4-128($np),$TEMP0
1123	vpaddq		$TEMP1,$ACC2,$ACC2
1124	vpmuludq	$Yi,$TEMP2,$TEMP2
1125	vmovdqu		-8+32*5-128($np),$TEMP1
1126	vpaddq		$TEMP2,$ACC3,$ACC3
1127	vpmuludq	$Yi,$TEMP0,$TEMP0
1128	vmovdqu		-8+32*6-128($np),$TEMP2
1129	vpaddq		$TEMP0,$ACC4,$ACC4
1130	vpmuludq	$Yi,$TEMP1,$TEMP1
1131	vmovdqu		-8+32*7-128($np),$TEMP0
1132	vpaddq		$TEMP1,$ACC5,$ACC5
1133	vpmuludq	$Yi,$TEMP2,$TEMP2
1134	vmovdqu		-8+32*8-128($np),$TEMP1
1135	vpaddq		$TEMP2,$ACC6,$ACC6
1136	vpmuludq	$Yi,$TEMP0,$TEMP0
1137	vmovdqu		-8+32*9-128($np),$TEMP2
1138	vpaddq		$TEMP0,$ACC7,$ACC7
1139	vpmuludq	$Yi,$TEMP1,$TEMP1
1140	vpaddq		$TEMP1,$ACC8,$ACC8
1141	vpmuludq	$Yi,$TEMP2,$TEMP2
1142	vpaddq		$TEMP2,$ACC9,$ACC9
1143
1144	 vmovdqu	-16+32*1-128($ap),$TEMP0
1145	mov	%rbx,%rax
1146	imulq	-128($ap),%rax
1147	add	$r2,%rax
1148
1149	 vmovdqu	-16+32*2-128($ap),$TEMP1
1150	mov	%rax,$r2
1151	imull	$n0, %eax
1152	and	\$0x1fffffff, %eax
1153
1154	 imulq	8-128($ap),%rbx
1155	 add	%rbx,$r3
1156	vpmuludq	$Bi,$TEMP0,$TEMP0
1157	 vmovd		%eax, $Yi
1158	vmovdqu		-16+32*3-128($ap),$TEMP2
1159	vpaddq		$TEMP0,$ACC1,$ACC1
1160	vpmuludq	$Bi,$TEMP1,$TEMP1
1161	 vpbroadcastq	$Yi, $Yi
1162	vmovdqu		-16+32*4-128($ap),$TEMP0
1163	vpaddq		$TEMP1,$ACC2,$ACC2
1164	vpmuludq	$Bi,$TEMP2,$TEMP2
1165	vmovdqu		-16+32*5-128($ap),$TEMP1
1166	vpaddq		$TEMP2,$ACC3,$ACC3
1167	vpmuludq	$Bi,$TEMP0,$TEMP0
1168	vmovdqu		-16+32*6-128($ap),$TEMP2
1169	vpaddq		$TEMP0,$ACC4,$ACC4
1170	vpmuludq	$Bi,$TEMP1,$TEMP1
1171	vmovdqu		-16+32*7-128($ap),$TEMP0
1172	vpaddq		$TEMP1,$ACC5,$ACC5
1173	vpmuludq	$Bi,$TEMP2,$TEMP2
1174	vmovdqu		-16+32*8-128($ap),$TEMP1
1175	vpaddq		$TEMP2,$ACC6,$ACC6
1176	vpmuludq	$Bi,$TEMP0,$TEMP0
1177	vmovdqu		-16+32*9-128($ap),$TEMP2
1178	vpaddq		$TEMP0,$ACC7,$ACC7
1179	vpmuludq	$Bi,$TEMP1,$TEMP1
1180	vpaddq		$TEMP1,$ACC8,$ACC8
1181	vpmuludq	$Bi,$TEMP2,$TEMP2
1182	 vpbroadcastq	24($bp), $Bi
1183	vpaddq		$TEMP2,$ACC9,$ACC9
1184
1185	 vmovdqu	-16+32*1-128($np),$TEMP0
1186	mov	%rax,%rdx
1187	imulq	-128($np),%rax
1188	add	%rax,$r2
1189	 vmovdqu	-16+32*2-128($np),$TEMP1
1190	imulq	8-128($np),%rdx
1191	add	%rdx,$r3
1192	shr	\$29, $r2
1193
1194	vpmuludq	$Yi,$TEMP0,$TEMP0
1195	 vmovq		$Bi, %rbx
1196	vmovdqu		-16+32*3-128($np),$TEMP2
1197	vpaddq		$TEMP0,$ACC1,$ACC1
1198	vpmuludq	$Yi,$TEMP1,$TEMP1
1199	vmovdqu		-16+32*4-128($np),$TEMP0
1200	vpaddq		$TEMP1,$ACC2,$ACC2
1201	vpmuludq	$Yi,$TEMP2,$TEMP2
1202	vmovdqu		-16+32*5-128($np),$TEMP1
1203	vpaddq		$TEMP2,$ACC3,$ACC3
1204	vpmuludq	$Yi,$TEMP0,$TEMP0
1205	vmovdqu		-16+32*6-128($np),$TEMP2
1206	vpaddq		$TEMP0,$ACC4,$ACC4
1207	vpmuludq	$Yi,$TEMP1,$TEMP1
1208	vmovdqu		-16+32*7-128($np),$TEMP0
1209	vpaddq		$TEMP1,$ACC5,$ACC5
1210	vpmuludq	$Yi,$TEMP2,$TEMP2
1211	vmovdqu		-16+32*8-128($np),$TEMP1
1212	vpaddq		$TEMP2,$ACC6,$ACC6
1213	vpmuludq	$Yi,$TEMP0,$TEMP0
1214	vmovdqu		-16+32*9-128($np),$TEMP2
1215	vpaddq		$TEMP0,$ACC7,$ACC7
1216	vpmuludq	$Yi,$TEMP1,$TEMP1
1217	 vmovdqu	-24+32*1-128($ap),$TEMP0
1218	vpaddq		$TEMP1,$ACC8,$ACC8
1219	vpmuludq	$Yi,$TEMP2,$TEMP2
1220	 vmovdqu	-24+32*2-128($ap),$TEMP1
1221	vpaddq		$TEMP2,$ACC9,$ACC9
1222
1223	add	$r2, $r3
1224	imulq	-128($ap),%rbx
1225	add	%rbx,$r3
1226
1227	mov	$r3, %rax
1228	imull	$n0, %eax
1229	and	\$0x1fffffff, %eax
1230
1231	vpmuludq	$Bi,$TEMP0,$TEMP0
1232	 vmovd		%eax, $Yi
1233	vmovdqu		-24+32*3-128($ap),$TEMP2
1234	vpaddq		$TEMP0,$ACC1,$ACC1
1235	vpmuludq	$Bi,$TEMP1,$TEMP1
1236	 vpbroadcastq	$Yi, $Yi
1237	vmovdqu		-24+32*4-128($ap),$TEMP0
1238	vpaddq		$TEMP1,$ACC2,$ACC2
1239	vpmuludq	$Bi,$TEMP2,$TEMP2
1240	vmovdqu		-24+32*5-128($ap),$TEMP1
1241	vpaddq		$TEMP2,$ACC3,$ACC3
1242	vpmuludq	$Bi,$TEMP0,$TEMP0
1243	vmovdqu		-24+32*6-128($ap),$TEMP2
1244	vpaddq		$TEMP0,$ACC4,$ACC4
1245	vpmuludq	$Bi,$TEMP1,$TEMP1
1246	vmovdqu		-24+32*7-128($ap),$TEMP0
1247	vpaddq		$TEMP1,$ACC5,$ACC5
1248	vpmuludq	$Bi,$TEMP2,$TEMP2
1249	vmovdqu		-24+32*8-128($ap),$TEMP1
1250	vpaddq		$TEMP2,$ACC6,$ACC6
1251	vpmuludq	$Bi,$TEMP0,$TEMP0
1252	vmovdqu		-24+32*9-128($ap),$TEMP2
1253	vpaddq		$TEMP0,$ACC7,$ACC7
1254	vpmuludq	$Bi,$TEMP1,$TEMP1
1255	vpaddq		$TEMP1,$ACC8,$ACC8
1256	vpmuludq	$Bi,$TEMP2,$TEMP2
1257	 vpbroadcastq	32($bp), $Bi
1258	vpaddq		$TEMP2,$ACC9,$ACC9
1259	 add		\$32, $bp			# $bp++
1260
1261	vmovdqu		-24+32*1-128($np),$TEMP0
1262	imulq	-128($np),%rax
1263	add	%rax,$r3
1264	shr	\$29, $r3
1265
1266	vmovdqu		-24+32*2-128($np),$TEMP1
1267	vpmuludq	$Yi,$TEMP0,$TEMP0
1268	 vmovq		$Bi, %rbx
1269	vmovdqu		-24+32*3-128($np),$TEMP2
1270	vpaddq		$TEMP0,$ACC1,$ACC0		# $ACC0==$TEMP0
1271	vpmuludq	$Yi,$TEMP1,$TEMP1
1272	 vmovdqu	$ACC0, (%rsp)			# transfer $r0-$r3
1273	vpaddq		$TEMP1,$ACC2,$ACC1
1274	vmovdqu		-24+32*4-128($np),$TEMP0
1275	vpmuludq	$Yi,$TEMP2,$TEMP2
1276	vmovdqu		-24+32*5-128($np),$TEMP1
1277	vpaddq		$TEMP2,$ACC3,$ACC2
1278	vpmuludq	$Yi,$TEMP0,$TEMP0
1279	vmovdqu		-24+32*6-128($np),$TEMP2
1280	vpaddq		$TEMP0,$ACC4,$ACC3
1281	vpmuludq	$Yi,$TEMP1,$TEMP1
1282	vmovdqu		-24+32*7-128($np),$TEMP0
1283	vpaddq		$TEMP1,$ACC5,$ACC4
1284	vpmuludq	$Yi,$TEMP2,$TEMP2
1285	vmovdqu		-24+32*8-128($np),$TEMP1
1286	vpaddq		$TEMP2,$ACC6,$ACC5
1287	vpmuludq	$Yi,$TEMP0,$TEMP0
1288	vmovdqu		-24+32*9-128($np),$TEMP2
1289	 mov	$r3, $r0
1290	vpaddq		$TEMP0,$ACC7,$ACC6
1291	vpmuludq	$Yi,$TEMP1,$TEMP1
1292	 add	(%rsp), $r0
1293	vpaddq		$TEMP1,$ACC8,$ACC7
1294	vpmuludq	$Yi,$TEMP2,$TEMP2
1295	 vmovq	$r3, $TEMP1
1296	vpaddq		$TEMP2,$ACC9,$ACC8
1297
1298	dec	$i
1299	jnz	.Loop_mul_1024
1300___
1301
1302# (*)	Original implementation was correcting ACC1-ACC3 for overflow
1303#	after 7 loop runs, or after 28 iterations, or 56 additions.
1304#	But as we underutilize resources, it's possible to correct in
1305#	each iteration with only marginal performance loss. And because
1306#	the correction then runs every iteration, fewer digits need
1307#	correcting each time, which avoids the performance penalty entirely.
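#	(Headroom arithmetic behind the original 56-addition bound: each
#	partial product of two 29-bit digits is below 2^58, so a 64-bit
#	lane can absorb roughly 2^(64-58) = 64 such additions before it
#	can wrap; 56 stays inside that margin.)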
1308
1309$TEMP0 = $ACC9;
1310$TEMP3 = $Bi;
1311$TEMP4 = $Yi;
1312$code.=<<___;
1313	vpaddq		(%rsp), $TEMP1, $ACC0
1314
1315	vpsrlq		\$29, $ACC0, $TEMP1
1316	vpand		$AND_MASK, $ACC0, $ACC0
1317	vpsrlq		\$29, $ACC1, $TEMP2
1318	vpand		$AND_MASK, $ACC1, $ACC1
1319	vpsrlq		\$29, $ACC2, $TEMP3
1320	vpermq		\$0x93, $TEMP1, $TEMP1
1321	vpand		$AND_MASK, $ACC2, $ACC2
1322	vpsrlq		\$29, $ACC3, $TEMP4
1323	vpermq		\$0x93, $TEMP2, $TEMP2
1324	vpand		$AND_MASK, $ACC3, $ACC3
1325
1326	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
1327	vpermq		\$0x93, $TEMP3, $TEMP3
1328	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
1329	vpermq		\$0x93, $TEMP4, $TEMP4
1330	vpaddq		$TEMP0, $ACC0, $ACC0
1331	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
1332	vpaddq		$TEMP1, $ACC1, $ACC1
1333	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
1334	vpaddq		$TEMP2, $ACC2, $ACC2
1335	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
1336	vpaddq		$TEMP3, $ACC3, $ACC3
1337	vpaddq		$TEMP4, $ACC4, $ACC4
1338
1339	vpsrlq		\$29, $ACC0, $TEMP1
1340	vpand		$AND_MASK, $ACC0, $ACC0
1341	vpsrlq		\$29, $ACC1, $TEMP2
1342	vpand		$AND_MASK, $ACC1, $ACC1
1343	vpsrlq		\$29, $ACC2, $TEMP3
1344	vpermq		\$0x93, $TEMP1, $TEMP1
1345	vpand		$AND_MASK, $ACC2, $ACC2
1346	vpsrlq		\$29, $ACC3, $TEMP4
1347	vpermq		\$0x93, $TEMP2, $TEMP2
1348	vpand		$AND_MASK, $ACC3, $ACC3
1349	vpermq		\$0x93, $TEMP3, $TEMP3
1350
1351	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
1352	vpermq		\$0x93, $TEMP4, $TEMP4
1353	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
1354	vpaddq		$TEMP0, $ACC0, $ACC0
1355	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
1356	vpaddq		$TEMP1, $ACC1, $ACC1
1357	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
1358	vpaddq		$TEMP2, $ACC2, $ACC2
1359	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
1360	vpaddq		$TEMP3, $ACC3, $ACC3
1361	vpaddq		$TEMP4, $ACC4, $ACC4
1362
1363	vmovdqu		$ACC0, 0-128($rp)
1364	vmovdqu		$ACC1, 32-128($rp)
1365	vmovdqu		$ACC2, 64-128($rp)
1366	vmovdqu		$ACC3, 96-128($rp)
1367___
1368
1369$TEMP5=$ACC0;
1370$code.=<<___;
1371	vpsrlq		\$29, $ACC4, $TEMP1
1372	vpand		$AND_MASK, $ACC4, $ACC4
1373	vpsrlq		\$29, $ACC5, $TEMP2
1374	vpand		$AND_MASK, $ACC5, $ACC5
1375	vpsrlq		\$29, $ACC6, $TEMP3
1376	vpermq		\$0x93, $TEMP1, $TEMP1
1377	vpand		$AND_MASK, $ACC6, $ACC6
1378	vpsrlq		\$29, $ACC7, $TEMP4
1379	vpermq		\$0x93, $TEMP2, $TEMP2
1380	vpand		$AND_MASK, $ACC7, $ACC7
1381	vpsrlq		\$29, $ACC8, $TEMP5
1382	vpermq		\$0x93, $TEMP3, $TEMP3
1383	vpand		$AND_MASK, $ACC8, $ACC8
1384	vpermq		\$0x93, $TEMP4, $TEMP4
1385
1386	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
1387	vpermq		\$0x93, $TEMP5, $TEMP5
1388	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
1389	vpaddq		$TEMP0, $ACC4, $ACC4
1390	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
1391	vpaddq		$TEMP1, $ACC5, $ACC5
1392	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
1393	vpaddq		$TEMP2, $ACC6, $ACC6
1394	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
1395	vpaddq		$TEMP3, $ACC7, $ACC7
1396	vpaddq		$TEMP4, $ACC8, $ACC8
1397
1398	vpsrlq		\$29, $ACC4, $TEMP1
1399	vpand		$AND_MASK, $ACC4, $ACC4
1400	vpsrlq		\$29, $ACC5, $TEMP2
1401	vpand		$AND_MASK, $ACC5, $ACC5
1402	vpsrlq		\$29, $ACC6, $TEMP3
1403	vpermq		\$0x93, $TEMP1, $TEMP1
1404	vpand		$AND_MASK, $ACC6, $ACC6
1405	vpsrlq		\$29, $ACC7, $TEMP4
1406	vpermq		\$0x93, $TEMP2, $TEMP2
1407	vpand		$AND_MASK, $ACC7, $ACC7
1408	vpsrlq		\$29, $ACC8, $TEMP5
1409	vpermq		\$0x93, $TEMP3, $TEMP3
1410	vpand		$AND_MASK, $ACC8, $ACC8
1411	vpermq		\$0x93, $TEMP4, $TEMP4
1412
1413	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
1414	vpermq		\$0x93, $TEMP5, $TEMP5
1415	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
1416	vpaddq		$TEMP0, $ACC4, $ACC4
1417	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
1418	vpaddq		$TEMP1, $ACC5, $ACC5
1419	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
1420	vpaddq		$TEMP2, $ACC6, $ACC6
1421	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
1422	vpaddq		$TEMP3, $ACC7, $ACC7
1423	vpaddq		$TEMP4, $ACC8, $ACC8
1424
1425	vmovdqu		$ACC4, 128-128($rp)
1426	vmovdqu		$ACC5, 160-128($rp)
1427	vmovdqu		$ACC6, 192-128($rp)
1428	vmovdqu		$ACC7, 224-128($rp)
1429	vmovdqu		$ACC8, 256-128($rp)
1430	vzeroupper
1431
1432	mov	%rbp, %rax
1433.cfi_def_cfa_register	%rax
1434___
1435$code.=<<___ if ($win64);
1436.Lmul_1024_in_tail:
1437	movaps	-0xd8(%rax),%xmm6
1438	movaps	-0xc8(%rax),%xmm7
1439	movaps	-0xb8(%rax),%xmm8
1440	movaps	-0xa8(%rax),%xmm9
1441	movaps	-0x98(%rax),%xmm10
1442	movaps	-0x88(%rax),%xmm11
1443	movaps	-0x78(%rax),%xmm12
1444	movaps	-0x68(%rax),%xmm13
1445	movaps	-0x58(%rax),%xmm14
1446	movaps	-0x48(%rax),%xmm15
1447___
1448$code.=<<___;
1449	mov	-48(%rax),%r15
1450.cfi_restore	%r15
1451	mov	-40(%rax),%r14
1452.cfi_restore	%r14
1453	mov	-32(%rax),%r13
1454.cfi_restore	%r13
1455	mov	-24(%rax),%r12
1456.cfi_restore	%r12
1457	mov	-16(%rax),%rbp
1458.cfi_restore	%rbp
1459	mov	-8(%rax),%rbx
1460.cfi_restore	%rbx
1461	lea	(%rax),%rsp		# restore %rsp
1462.cfi_def_cfa_register	%rsp
1463.Lmul_1024_epilogue:
1464	ret
1465.cfi_endproc
1466.size	rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
1467___
1468}
1469{
1470my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
1471my @T = map("%r$_",(8..11));
1472
1473$code.=<<___;
1474.globl	rsaz_1024_red2norm_avx2
1475.type	rsaz_1024_red2norm_avx2,\@abi-omnipotent
1476.align	32
1477rsaz_1024_red2norm_avx2:
1478.cfi_startproc
1479	_CET_ENDBR
1480	sub	\$-128,$inp	# size optimization
1481	xor	%rax,%rax
1482___
1483
1484for ($j=0,$i=0; $i<16; $i++) {
1485    my $k=0;
1486    while (29*$j<64*($i+1)) {	# load data till boundary
1487	$code.="	mov	`8*$j-128`($inp), @T[0]\n";
1488	$j++; $k++; push(@T,shift(@T));
1489    }
1490    $l=$k;
1491    while ($k>1) {		# shift loaded data but last value
1492	$code.="	shl	\$`29*($j-$k)`,@T[-$k]\n";
1493	$k--;
1494    }
1495    $code.=<<___;		# shift last value
1496	mov	@T[-1], @T[0]
1497	shl	\$`29*($j-1)`, @T[-1]
1498	shr	\$`-29*($j-1)`, @T[0]
1499___
1500    while ($l) {		# accumulate all values
1501	$code.="	add	@T[-$l], %rax\n";
1502	$l--;
1503    }
1504	$code.=<<___;
1505	adc	\$0, @T[0]	# consume eventual carry
1506	mov	%rax, 8*$i($out)
1507	mov	@T[0], %rax
1508___
1509    push(@T,shift(@T));
1510}
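# In outline, the unrolled code generated above converts the 29-bit redundant
# digits back to sixteen canonical 64-bit words: for each output word it
# shifts the digits overlapping that word into position, sums them together
# with the running carry kept in %rax, and splits the straddling digit with
# the shl/shr pair so that its high part feeds the next word.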
1511$code.=<<___;
1512	ret
1513.cfi_endproc
1514.size	rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
1515
1516.globl	rsaz_1024_norm2red_avx2
1517.type	rsaz_1024_norm2red_avx2,\@abi-omnipotent
1518.align	32
1519rsaz_1024_norm2red_avx2:
1520.cfi_startproc
1521	_CET_ENDBR
1522	sub	\$-128,$out	# size optimization
1523	mov	($inp),@T[0]
1524	mov	\$0x1fffffff,%eax
1525___
1526for ($j=0,$i=0; $i<16; $i++) {
1527    $code.="	mov	`8*($i+1)`($inp),@T[1]\n"	if ($i<15);
1528    $code.="	xor	@T[1],@T[1]\n"			if ($i==15);
1529    my $k=1;
1530    while (29*($j+1)<64*($i+1)) {
1531    	$code.=<<___;
1532	mov	@T[0],@T[-$k]
1533	shr	\$`29*$j`,@T[-$k]
1534	and	%rax,@T[-$k]				# &0x1fffffff
1535	mov	@T[-$k],`8*$j-128`($out)
1536___
1537	$j++; $k++;
1538    }
1539    $code.=<<___;
1540	shrd	\$`29*$j`,@T[1],@T[0]
1541	and	%rax,@T[0]
1542	mov	@T[0],`8*$j-128`($out)
1543___
1544    $j++;
1545    push(@T,shift(@T));
1546}
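# In outline, the code generated above performs the inverse conversion: it
# slices sixteen 64-bit input words into 29-bit digits, using shrd for digits
# that straddle two input words and the 0x1fffffff mask held in %rax to trim
# each digit to 29 bits; the stores emitted below pad the tail of the
# redundant vector with zero digits.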
1547$code.=<<___;
1548	mov	@T[0],`8*$j-128`($out)			# zero
1549	mov	@T[0],`8*($j+1)-128`($out)
1550	mov	@T[0],`8*($j+2)-128`($out)
1551	mov	@T[0],`8*($j+3)-128`($out)
1552	ret
1553.cfi_endproc
1554.size	rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
1555___
1556}
1557{
1558my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1559
1560$code.=<<___;
1561.globl	rsaz_1024_scatter5_avx2
1562.type	rsaz_1024_scatter5_avx2,\@abi-omnipotent
1563.align	32
1564rsaz_1024_scatter5_avx2:
1565.cfi_startproc
1566	_CET_ENDBR
1567	vzeroupper
1568	vmovdqu	.Lscatter_permd(%rip),%ymm5
1569	shl	\$4,$power
1570	lea	($out,$power),$out
1571	mov	\$9,%eax
1572	jmp	.Loop_scatter_1024
1573
1574.align	32
1575.Loop_scatter_1024:
1576	vmovdqu		($inp),%ymm0
1577	lea		32($inp),$inp
1578	vpermd		%ymm0,%ymm5,%ymm0
1579	vmovdqu		%xmm0,($out)
1580	lea		16*32($out),$out
1581	dec	%eax
1582	jnz	.Loop_scatter_1024
1583
1584	vzeroupper
1585	ret
1586.cfi_endproc
1587.size	rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
1588
1589.globl	rsaz_1024_gather5_avx2
1590.type	rsaz_1024_gather5_avx2,\@abi-omnipotent
1591.align	32
1592rsaz_1024_gather5_avx2:
1593.cfi_startproc
1594	_CET_ENDBR
1595	vzeroupper
1596	mov	%rsp,%r11
1597.cfi_def_cfa_register	%r11
1598___
1599$code.=<<___ if ($win64);
1600	lea	-0x88(%rsp),%rax
1601.LSEH_begin_rsaz_1024_gather5:
1602	# I can't trust assembler to use specific encoding:-(
1603	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax),%rsp
1604	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6,-0x20(%rax)
1605	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7,-0x10(%rax)
1606	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8,0(%rax)
1607	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9,0x10(%rax)
1608	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10,0x20(%rax)
1609	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11,0x30(%rax)
1610	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12,0x40(%rax)
1611	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13,0x50(%rax)
1612	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14,0x60(%rax)
1613	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15,0x70(%rax)
1614___
1615$code.=<<___;
1616	lea	-0x100(%rsp),%rsp
1617	and	\$-32, %rsp
1618	lea	.Linc(%rip), %r10
1619	lea	-128(%rsp),%rax			# control u-op density
1620
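	# Build sixteen 256-bit equality masks against the .Linc constants:
	# only the 128-bit half whose index matches the requested power ends
	# up all-ones.  The gather loop below then reads every table line and
	# selects with vpand/vpor, so the memory access pattern does not
	# depend on the power index.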
1621	vmovd		$power, %xmm4
1622	vmovdqa		(%r10),%ymm0
1623	vmovdqa		32(%r10),%ymm1
1624	vmovdqa		64(%r10),%ymm5
1625	vpbroadcastd	%xmm4,%ymm4
1626
1627	vpaddd		%ymm5, %ymm0, %ymm2
1628	vpcmpeqd	%ymm4, %ymm0, %ymm0
1629	vpaddd		%ymm5, %ymm1, %ymm3
1630	vpcmpeqd	%ymm4, %ymm1, %ymm1
1631	vmovdqa		%ymm0, 32*0+128(%rax)
1632	vpaddd		%ymm5, %ymm2, %ymm0
1633	vpcmpeqd	%ymm4, %ymm2, %ymm2
1634	vmovdqa		%ymm1, 32*1+128(%rax)
1635	vpaddd		%ymm5, %ymm3, %ymm1
1636	vpcmpeqd	%ymm4, %ymm3, %ymm3
1637	vmovdqa		%ymm2, 32*2+128(%rax)
1638	vpaddd		%ymm5, %ymm0, %ymm2
1639	vpcmpeqd	%ymm4, %ymm0, %ymm0
1640	vmovdqa		%ymm3, 32*3+128(%rax)
1641	vpaddd		%ymm5, %ymm1, %ymm3
1642	vpcmpeqd	%ymm4, %ymm1, %ymm1
1643	vmovdqa		%ymm0, 32*4+128(%rax)
1644	vpaddd		%ymm5, %ymm2, %ymm8
1645	vpcmpeqd	%ymm4, %ymm2, %ymm2
1646	vmovdqa		%ymm1, 32*5+128(%rax)
1647	vpaddd		%ymm5, %ymm3, %ymm9
1648	vpcmpeqd	%ymm4, %ymm3, %ymm3
1649	vmovdqa		%ymm2, 32*6+128(%rax)
1650	vpaddd		%ymm5, %ymm8, %ymm10
1651	vpcmpeqd	%ymm4, %ymm8, %ymm8
1652	vmovdqa		%ymm3, 32*7+128(%rax)
1653	vpaddd		%ymm5, %ymm9, %ymm11
1654	vpcmpeqd	%ymm4, %ymm9, %ymm9
1655	vpaddd		%ymm5, %ymm10, %ymm12
1656	vpcmpeqd	%ymm4, %ymm10, %ymm10
1657	vpaddd		%ymm5, %ymm11, %ymm13
1658	vpcmpeqd	%ymm4, %ymm11, %ymm11
1659	vpaddd		%ymm5, %ymm12, %ymm14
1660	vpcmpeqd	%ymm4, %ymm12, %ymm12
1661	vpaddd		%ymm5, %ymm13, %ymm15
1662	vpcmpeqd	%ymm4, %ymm13, %ymm13
1663	vpcmpeqd	%ymm4, %ymm14, %ymm14
1664	vpcmpeqd	%ymm4, %ymm15, %ymm15
1665
1666	vmovdqa	-32(%r10),%ymm7			# .Lgather_permd
1667	lea	128($inp), $inp
1668	mov	\$9,$power
1669
1670.Loop_gather_1024:
1671	vmovdqa		32*0-128($inp),	%ymm0
1672	vmovdqa		32*1-128($inp),	%ymm1
1673	vmovdqa		32*2-128($inp),	%ymm2
1674	vmovdqa		32*3-128($inp),	%ymm3
1675	vpand		32*0+128(%rax),	%ymm0,	%ymm0
1676	vpand		32*1+128(%rax),	%ymm1,	%ymm1
1677	vpand		32*2+128(%rax),	%ymm2,	%ymm2
1678	vpor		%ymm0, %ymm1, %ymm4
1679	vpand		32*3+128(%rax),	%ymm3,	%ymm3
1680	vmovdqa		32*4-128($inp),	%ymm0
1681	vmovdqa		32*5-128($inp),	%ymm1
1682	vpor		%ymm2, %ymm3, %ymm5
1683	vmovdqa		32*6-128($inp),	%ymm2
1684	vmovdqa		32*7-128($inp),	%ymm3
1685	vpand		32*4+128(%rax),	%ymm0,	%ymm0
1686	vpand		32*5+128(%rax),	%ymm1,	%ymm1
1687	vpand		32*6+128(%rax),	%ymm2,	%ymm2
1688	vpor		%ymm0, %ymm4, %ymm4
1689	vpand		32*7+128(%rax),	%ymm3,	%ymm3
1690	vpand		32*8-128($inp),	%ymm8,	%ymm0
1691	vpor		%ymm1, %ymm5, %ymm5
1692	vpand		32*9-128($inp),	%ymm9,	%ymm1
1693	vpor		%ymm2, %ymm4, %ymm4
1694	vpand		32*10-128($inp),%ymm10,	%ymm2
1695	vpor		%ymm3, %ymm5, %ymm5
1696	vpand		32*11-128($inp),%ymm11,	%ymm3
1697	vpor		%ymm0, %ymm4, %ymm4
1698	vpand		32*12-128($inp),%ymm12,	%ymm0
1699	vpor		%ymm1, %ymm5, %ymm5
1700	vpand		32*13-128($inp),%ymm13,	%ymm1
1701	vpor		%ymm2, %ymm4, %ymm4
1702	vpand		32*14-128($inp),%ymm14,	%ymm2
1703	vpor		%ymm3, %ymm5, %ymm5
1704	vpand		32*15-128($inp),%ymm15,	%ymm3
1705	lea		32*16($inp), $inp
1706	vpor		%ymm0, %ymm4, %ymm4
1707	vpor		%ymm1, %ymm5, %ymm5
1708	vpor		%ymm2, %ymm4, %ymm4
1709	vpor		%ymm3, %ymm5, %ymm5
1710
1711	vpor		%ymm5, %ymm4, %ymm4
1712	vextracti128	\$1, %ymm4, %xmm5	# upper half is cleared
1713	vpor		%xmm4, %xmm5, %xmm5
1714	vpermd		%ymm5,%ymm7,%ymm5
1715	vmovdqu		%ymm5,($out)
1716	lea		32($out),$out
1717	dec	$power
1718	jnz	.Loop_gather_1024
1719
1720	vpxor	%ymm0,%ymm0,%ymm0
1721	vmovdqu	%ymm0,($out)
1722	vzeroupper
1723___
1724$code.=<<___ if ($win64);
1725	movaps	-0xa8(%r11),%xmm6
1726	movaps	-0x98(%r11),%xmm7
1727	movaps	-0x88(%r11),%xmm8
1728	movaps	-0x78(%r11),%xmm9
1729	movaps	-0x68(%r11),%xmm10
1730	movaps	-0x58(%r11),%xmm11
1731	movaps	-0x48(%r11),%xmm12
1732	movaps	-0x38(%r11),%xmm13
1733	movaps	-0x28(%r11),%xmm14
1734	movaps	-0x18(%r11),%xmm15
1735___
1736$code.=<<___;
1737	lea	(%r11),%rsp
1738.cfi_def_cfa_register	%rsp
1739	ret
1740.cfi_endproc
1741.LSEH_end_rsaz_1024_gather5:
1742.size	rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
1743___
1744}
1745
1746$code.=<<___;
1747.section .rodata
1748.align	64
1749.Land_mask:
1750	.quad	0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
1751.Lscatter_permd:
1752	.long	0,2,4,6,7,7,7,7
1753.Lgather_permd:
1754	.long	0,7,1,7,2,7,3,7
1755.Linc:
1756	.long	0,0,0,0, 1,1,1,1
1757	.long	2,2,2,2, 3,3,3,3
1758	.long	4,4,4,4, 4,4,4,4
1759.align	64
1760.text
1761___
1762
1763if ($win64) {
1764$rec="%rcx";
1765$frame="%rdx";
1766$context="%r8";
1767$disp="%r9";
1768
1769$code.=<<___
1770.extern	__imp_RtlVirtualUnwind
1771.type	rsaz_se_handler,\@abi-omnipotent
1772.align	16
1773rsaz_se_handler:
1774	push	%rsi
1775	push	%rdi
1776	push	%rbx
1777	push	%rbp
1778	push	%r12
1779	push	%r13
1780	push	%r14
1781	push	%r15
1782	pushfq
1783	sub	\$64,%rsp
1784
1785	mov	120($context),%rax	# pull context->Rax
1786	mov	248($context),%rbx	# pull context->Rip
1787
1788	mov	8($disp),%rsi		# disp->ImageBase
1789	mov	56($disp),%r11		# disp->HandlerData
1790
1791	mov	0(%r11),%r10d		# HandlerData[0]
1792	lea	(%rsi,%r10),%r10	# prologue label
1793	cmp	%r10,%rbx		# context->Rip<prologue label
1794	jb	.Lcommon_seh_tail
1795
1796	mov	4(%r11),%r10d		# HandlerData[1]
1797	lea	(%rsi,%r10),%r10	# epilogue label
1798	cmp	%r10,%rbx		# context->Rip>=epilogue label
1799	jae	.Lcommon_seh_tail
1800
1801	mov	160($context),%rbp	# pull context->Rbp
1802
1803	mov	8(%r11),%r10d		# HandlerData[2]
1804	lea	(%rsi,%r10),%r10	# "in tail" label
1805	cmp	%r10,%rbx		# context->Rip>="in tail" label
1806	cmovc	%rbp,%rax
1807
1808	mov	-48(%rax),%r15
1809	mov	-40(%rax),%r14
1810	mov	-32(%rax),%r13
1811	mov	-24(%rax),%r12
1812	mov	-16(%rax),%rbp
1813	mov	-8(%rax),%rbx
1814	mov	%r15,240($context)
1815	mov	%r14,232($context)
1816	mov	%r13,224($context)
1817	mov	%r12,216($context)
1818	mov	%rbp,160($context)
1819	mov	%rbx,144($context)
1820
1821	lea	-0xd8(%rax),%rsi	# %xmm save area
1822	lea	512($context),%rdi	# & context.Xmm6
1823	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
1824	.long	0xa548f3fc		# cld; rep movsq
1825
1826.Lcommon_seh_tail:
1827	mov	8(%rax),%rdi
1828	mov	16(%rax),%rsi
1829	mov	%rax,152($context)	# restore context->Rsp
1830	mov	%rsi,168($context)	# restore context->Rsi
1831	mov	%rdi,176($context)	# restore context->Rdi
1832
1833	mov	40($disp),%rdi		# disp->ContextRecord
1834	mov	$context,%rsi		# context
1835	mov	\$154,%ecx		# sizeof(CONTEXT)
1836	.long	0xa548f3fc		# cld; rep movsq
1837
1838	mov	$disp,%rsi
1839	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1840	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1841	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1842	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1843	mov	40(%rsi),%r10		# disp->ContextRecord
1844	lea	56(%rsi),%r11		# &disp->HandlerData
1845	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1846	mov	%r10,32(%rsp)		# arg5
1847	mov	%r11,40(%rsp)		# arg6
1848	mov	%r12,48(%rsp)		# arg7
1849	mov	%rcx,56(%rsp)		# arg8, (NULL)
1850	call	*__imp_RtlVirtualUnwind(%rip)
1851
1852	mov	\$1,%eax		# ExceptionContinueSearch
1853	add	\$64,%rsp
1854	popfq
1855	pop	%r15
1856	pop	%r14
1857	pop	%r13
1858	pop	%r12
1859	pop	%rbp
1860	pop	%rbx
1861	pop	%rdi
1862	pop	%rsi
1863	ret
1864.size	rsaz_se_handler,.-rsaz_se_handler
1865
1866.section	.pdata
1867.align	4
1868	.rva	.LSEH_begin_rsaz_1024_sqr_avx2
1869	.rva	.LSEH_end_rsaz_1024_sqr_avx2
1870	.rva	.LSEH_info_rsaz_1024_sqr_avx2
1871
1872	.rva	.LSEH_begin_rsaz_1024_mul_avx2
1873	.rva	.LSEH_end_rsaz_1024_mul_avx2
1874	.rva	.LSEH_info_rsaz_1024_mul_avx2
1875
1876	.rva	.LSEH_begin_rsaz_1024_gather5
1877	.rva	.LSEH_end_rsaz_1024_gather5
1878	.rva	.LSEH_info_rsaz_1024_gather5
1879.section	.xdata
1880.align	8
1881.LSEH_info_rsaz_1024_sqr_avx2:
1882	.byte	9,0,0,0
1883	.rva	rsaz_se_handler
1884	.rva	.Lsqr_1024_body,.Lsqr_1024_epilogue,.Lsqr_1024_in_tail
1885	.long	0
1886.LSEH_info_rsaz_1024_mul_avx2:
1887	.byte	9,0,0,0
1888	.rva	rsaz_se_handler
1889	.rva	.Lmul_1024_body,.Lmul_1024_epilogue,.Lmul_1024_in_tail
1890	.long	0
1891.LSEH_info_rsaz_1024_gather5:
1892	.byte	0x01,0x36,0x17,0x0b
1893	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
1894	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
1895	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
1896	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
1897	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
1898	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
1899	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
1900	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
1901	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
1902	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
1903	.byte	0x04,0x01,0x15,0x00	# sub	  rsp,0xa8
1904	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
1905___
1906}
1907
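# Post-process the generated text: reduce shift immediates modulo 64 (the
# generators above can emit negative or oversized counts), and rewrite
# vmovd/vmovq, vpinsr/vpextr and vpbroadcast operands from the %ymm spelling
# to the %xmm form those instructions take.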
1908foreach (split("\n",$code)) {
1909	s/\`([^\`]*)\`/eval($1)/ge;
1910
1911	s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge		or
1912
1913	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
1914	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
1915	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
1916	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
1917	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
1918	print $_,"\n";
1919}
1920
1921}}} else {{{
1922print <<___;	# assembler is too old
1923.text
1924
1925.globl	rsaz_avx2_eligible
1926.type	rsaz_avx2_eligible,\@abi-omnipotent
1927rsaz_avx2_eligible:
1928	xor	%eax,%eax
1929	ret
1930.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible
1931
1932.globl	rsaz_1024_sqr_avx2
1933.globl	rsaz_1024_mul_avx2
1934.globl	rsaz_1024_norm2red_avx2
1935.globl	rsaz_1024_red2norm_avx2
1936.globl	rsaz_1024_scatter5_avx2
1937.globl	rsaz_1024_gather5_avx2
1938.type	rsaz_1024_sqr_avx2,\@abi-omnipotent
1939rsaz_1024_sqr_avx2:
1940rsaz_1024_mul_avx2:
1941rsaz_1024_norm2red_avx2:
1942rsaz_1024_red2norm_avx2:
1943rsaz_1024_scatter5_avx2:
1944rsaz_1024_gather5_avx2:
1945	.byte	0x0f,0x0b	# ud2
1946	ret
1947.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
1948___
1949}}}
1950
1951close STDOUT or die "error closing STDOUT: $!";
1952