1#!/usr/bin/env perl
2
3##############################################################################
4#                                                                            #
5#  Copyright (c) 2012, Intel Corporation                                     #
6#                                                                            #
7#  All rights reserved.                                                      #
8#                                                                            #
9#  Redistribution and use in source and binary forms, with or without        #
10#  modification, are permitted provided that the following conditions are    #
11#  met:                                                                      #
12#                                                                            #
13#  *  Redistributions of source code must retain the above copyright         #
14#     notice, this list of conditions and the following disclaimer.          #
15#                                                                            #
16#  *  Redistributions in binary form must reproduce the above copyright      #
17#     notice, this list of conditions and the following disclaimer in the    #
18#     documentation and/or other materials provided with the                 #
19#     distribution.                                                          #
20#                                                                            #
21#  *  Neither the name of the Intel Corporation nor the names of its         #
22#     contributors may be used to endorse or promote products derived from   #
23#     this software without specific prior written permission.               #
24#                                                                            #
25#                                                                            #
26#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          #
27#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
28#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
29#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
30#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
31#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
32#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
33#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
34#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
35#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
36#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
37#                                                                            #
38##############################################################################
39# Developers and authors:                                                    #
40# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
41# (1) Intel Corporation, Israel Development Center, Haifa, Israel            #
42# (2) University of Haifa, Israel                                            #
43##############################################################################
44# Reference:                                                                 #
45# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular             #
46#     Exponentiation,  Using Advanced Vector Instructions Architectures",    #
47#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,   #
48#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012              #
49# [2] S. Gueron: "Efficient Software Implementations of Modular              #
50#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).  #
51# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE        #
52#     Proceedings of 9th International Conference on Information Technology: #
53#     New Generations (ITNG 2012), pp.821-823 (2012)                         #
54# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
55#     resistant 1024-bit modular exponentiation, for optimizing RSA2048      #
56#     on AVX2 capable x86_64 platforms",                                     #
57#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
58##############################################################################
59#
60# +13% improvement over original submission by <appro@openssl.org>
61#
62# rsa2048 sign/sec	OpenSSL 1.0.1	scalar(*)	this
63# 2.3GHz Haswell	621		765/+23%	1113/+79%
64# 2.3GHz Broadwell(**)	688		1200(***)/+74%	1120/+63%
65#
66# (*)	if system doesn't support AVX2, for reference purposes;
67# (**)	scaled to 2.3GHz to simplify comparison;
68# (***)	scalar AD*X code is faster than AVX2 and is preferred code
69#	path for Broadwell;
70
71$flavour = shift;
72$output  = shift;
73if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
74
75$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
76
77$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
78( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
79( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
80die "can't locate x86_64-xlate.pl";
81
82# In upstream, this is controlled by shelling out to the compiler to check
83# versions, but BoringSSL is intended to be used with pre-generated perlasm
84# output, so this isn't useful anyway.
85#
86# TODO(davidben): Enable these after testing. $avx goes up to 2 and $addx to 1.
87$avx = 0;
88$addx = 0;
89
90open OUT,"| \"$^X\" $xlate $flavour $output";
91*STDOUT = *OUT;
92
93if ($avx>1) {{{
94{ # void AMS_WW(
95my $rp="%rdi";	# BN_ULONG *rp,
96my $ap="%rsi";	# const BN_ULONG *ap,
97my $np="%rdx";	# const BN_ULONG *np,
98my $n0="%ecx";	# const BN_ULONG n0,
99my $rep="%r8d";	# int repeat);
100
101# The registers that hold the accumulated redundant result
102# The AMM works on 1024-bit operands, and the redundant word size is 29 bits.
103# Therefore: ceil(1024/29)/4 = 36/4 = 9
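# The choice of a 29-bit digit (rather than 32) follows the redundant
# representation of refs [1]-[3]: a product of two 29-bit digits fits in
# 58 bits, which leaves headroom in each 64-bit lane to accumulate several
# dozen partial products before carries have to be propagated.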
104my $ACC0="%ymm0";
105my $ACC1="%ymm1";
106my $ACC2="%ymm2";
107my $ACC3="%ymm3";
108my $ACC4="%ymm4";
109my $ACC5="%ymm5";
110my $ACC6="%ymm6";
111my $ACC7="%ymm7";
112my $ACC8="%ymm8";
113my $ACC9="%ymm9";
114# Registers that hold the broadcasted words of bp, currently used
115my $B1="%ymm10";
116my $B2="%ymm11";
117# Registers that hold the broadcasted words of Y, currently used
118my $Y1="%ymm12";
119my $Y2="%ymm13";
120# Helper registers
121my $TEMP1="%ymm14";
122my $AND_MASK="%ymm15";
123# alu registers that hold the first words of the ACC
124my $r0="%r9";
125my $r1="%r10";
126my $r2="%r11";
127my $r3="%r12";
128
129my $i="%r14d";			# loop counter
130my $tmp = "%r15";
131
132my $FrameSize=32*18+32*8;	# place for A^2 and 2*A
133
134my $aap=$r0;
135my $tp0="%rbx";
136my $tp1=$r3;
137my $tpa=$tmp;
138
139$np="%r13";			# reassigned argument
140
141$code.=<<___;
142.text
143
144.globl	rsaz_1024_sqr_avx2
145.type	rsaz_1024_sqr_avx2,\@function,5
146.align	64
147rsaz_1024_sqr_avx2:		# 702 cycles, 14% faster than rsaz_1024_mul_avx2
148	lea	(%rsp), %rax
149	push	%rbx
150	push	%rbp
151	push	%r12
152	push	%r13
153	push	%r14
154	push	%r15
155	vzeroupper
156___
157$code.=<<___ if ($win64);
158	lea	-0xa8(%rsp),%rsp
159	vmovaps	%xmm6,-0xd8(%rax)
160	vmovaps	%xmm7,-0xc8(%rax)
161	vmovaps	%xmm8,-0xb8(%rax)
162	vmovaps	%xmm9,-0xa8(%rax)
163	vmovaps	%xmm10,-0x98(%rax)
164	vmovaps	%xmm11,-0x88(%rax)
165	vmovaps	%xmm12,-0x78(%rax)
166	vmovaps	%xmm13,-0x68(%rax)
167	vmovaps	%xmm14,-0x58(%rax)
168	vmovaps	%xmm15,-0x48(%rax)
169.Lsqr_1024_body:
170___
171$code.=<<___;
172	mov	%rax,%rbp
173	mov	%rdx, $np			# reassigned argument
174	sub	\$$FrameSize, %rsp
175	mov	$np, $tmp
176	sub	\$-128, $rp			# size optimization
177	sub	\$-128, $ap
178	sub	\$-128, $np
179
180	and	\$4095, $tmp			# see if $np crosses page
181	add	\$32*10, $tmp
182	shr	\$12, $tmp
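	# $tmp is now ((original $np & 4095) + 32*10) >> 12, i.e. non-zero
	# when the ten 256-bit words of the modulus may extend into the next
	# 4K page; in that case the copy-to-stack path below is taken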
183	vpxor	$ACC9,$ACC9,$ACC9
184	jz	.Lsqr_1024_no_n_copy
185
186	# unaligned 256-bit load that crosses page boundary can
187	# cause >2x performance degradation here, so if $np does
188	# cross page boundary, copy it to stack and make sure stack
189	# frame doesn't...
190	sub		\$32*10,%rsp
191	vmovdqu		32*0-128($np), $ACC0
192	and		\$-2048, %rsp
193	vmovdqu		32*1-128($np), $ACC1
194	vmovdqu		32*2-128($np), $ACC2
195	vmovdqu		32*3-128($np), $ACC3
196	vmovdqu		32*4-128($np), $ACC4
197	vmovdqu		32*5-128($np), $ACC5
198	vmovdqu		32*6-128($np), $ACC6
199	vmovdqu		32*7-128($np), $ACC7
200	vmovdqu		32*8-128($np), $ACC8
201	lea		$FrameSize+128(%rsp),$np
202	vmovdqu		$ACC0, 32*0-128($np)
203	vmovdqu		$ACC1, 32*1-128($np)
204	vmovdqu		$ACC2, 32*2-128($np)
205	vmovdqu		$ACC3, 32*3-128($np)
206	vmovdqu		$ACC4, 32*4-128($np)
207	vmovdqu		$ACC5, 32*5-128($np)
208	vmovdqu		$ACC6, 32*6-128($np)
209	vmovdqu		$ACC7, 32*7-128($np)
210	vmovdqu		$ACC8, 32*8-128($np)
211	vmovdqu		$ACC9, 32*9-128($np)	# $ACC9 is zero
212
213.Lsqr_1024_no_n_copy:
214	and		\$-1024, %rsp
215
216	vmovdqu		32*1-128($ap), $ACC1
217	vmovdqu		32*2-128($ap), $ACC2
218	vmovdqu		32*3-128($ap), $ACC3
219	vmovdqu		32*4-128($ap), $ACC4
220	vmovdqu		32*5-128($ap), $ACC5
221	vmovdqu		32*6-128($ap), $ACC6
222	vmovdqu		32*7-128($ap), $ACC7
223	vmovdqu		32*8-128($ap), $ACC8
224
225	lea	192(%rsp), $tp0			# 64+128=192
226	vpbroadcastq	.Land_mask(%rip), $AND_MASK
227	jmp	.LOOP_GRANDE_SQR_1024
228
229.align	32
230.LOOP_GRANDE_SQR_1024:
231	lea	32*18+128(%rsp), $aap		# size optimization
232	lea	448(%rsp), $tp1			# 64+128+256=448
233
234	# the squaring is performed as described in Variant B of
235	# "Speeding up Big-Number Squaring", so start by calculating
236	# the A*2=A+A vector
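	# (precomputing 2*A means each off-diagonal product a[i]*a[j], i<j,
	#  is formed only once, against the doubled operand, instead of
	#  twice; the diagonal terms still use the original A - see ref [3])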
237	vpaddq		$ACC1, $ACC1, $ACC1
238	 vpbroadcastq	32*0-128($ap), $B1
239	vpaddq		$ACC2, $ACC2, $ACC2
240	vmovdqa		$ACC1, 32*0-128($aap)
241	vpaddq		$ACC3, $ACC3, $ACC3
242	vmovdqa		$ACC2, 32*1-128($aap)
243	vpaddq		$ACC4, $ACC4, $ACC4
244	vmovdqa		$ACC3, 32*2-128($aap)
245	vpaddq		$ACC5, $ACC5, $ACC5
246	vmovdqa		$ACC4, 32*3-128($aap)
247	vpaddq		$ACC6, $ACC6, $ACC6
248	vmovdqa		$ACC5, 32*4-128($aap)
249	vpaddq		$ACC7, $ACC7, $ACC7
250	vmovdqa		$ACC6, 32*5-128($aap)
251	vpaddq		$ACC8, $ACC8, $ACC8
252	vmovdqa		$ACC7, 32*6-128($aap)
253	vpxor		$ACC9, $ACC9, $ACC9
254	vmovdqa		$ACC8, 32*7-128($aap)
255
256	vpmuludq	32*0-128($ap), $B1, $ACC0
257	 vpbroadcastq	32*1-128($ap), $B2
258	 vmovdqu	$ACC9, 32*9-192($tp0)	# zero upper half
259	vpmuludq	$B1, $ACC1, $ACC1
260	 vmovdqu	$ACC9, 32*10-448($tp1)
261	vpmuludq	$B1, $ACC2, $ACC2
262	 vmovdqu	$ACC9, 32*11-448($tp1)
263	vpmuludq	$B1, $ACC3, $ACC3
264	 vmovdqu	$ACC9, 32*12-448($tp1)
265	vpmuludq	$B1, $ACC4, $ACC4
266	 vmovdqu	$ACC9, 32*13-448($tp1)
267	vpmuludq	$B1, $ACC5, $ACC5
268	 vmovdqu	$ACC9, 32*14-448($tp1)
269	vpmuludq	$B1, $ACC6, $ACC6
270	 vmovdqu	$ACC9, 32*15-448($tp1)
271	vpmuludq	$B1, $ACC7, $ACC7
272	 vmovdqu	$ACC9, 32*16-448($tp1)
273	vpmuludq	$B1, $ACC8, $ACC8
274	 vpbroadcastq	32*2-128($ap), $B1
275	 vmovdqu	$ACC9, 32*17-448($tp1)
276
277	mov	$ap, $tpa
278	mov 	\$4, $i
279	jmp	.Lsqr_entry_1024
280___
281$TEMP0=$Y1;
282$TEMP2=$Y2;
283$code.=<<___;
284.align	32
285.LOOP_SQR_1024:
286	 vpbroadcastq	32*1-128($tpa), $B2
287	vpmuludq	32*0-128($ap), $B1, $ACC0
288	vpaddq		32*0-192($tp0), $ACC0, $ACC0
289	vpmuludq	32*0-128($aap), $B1, $ACC1
290	vpaddq		32*1-192($tp0), $ACC1, $ACC1
291	vpmuludq	32*1-128($aap), $B1, $ACC2
292	vpaddq		32*2-192($tp0), $ACC2, $ACC2
293	vpmuludq	32*2-128($aap), $B1, $ACC3
294	vpaddq		32*3-192($tp0), $ACC3, $ACC3
295	vpmuludq	32*3-128($aap), $B1, $ACC4
296	vpaddq		32*4-192($tp0), $ACC4, $ACC4
297	vpmuludq	32*4-128($aap), $B1, $ACC5
298	vpaddq		32*5-192($tp0), $ACC5, $ACC5
299	vpmuludq	32*5-128($aap), $B1, $ACC6
300	vpaddq		32*6-192($tp0), $ACC6, $ACC6
301	vpmuludq	32*6-128($aap), $B1, $ACC7
302	vpaddq		32*7-192($tp0), $ACC7, $ACC7
303	vpmuludq	32*7-128($aap), $B1, $ACC8
304	 vpbroadcastq	32*2-128($tpa), $B1
305	vpaddq		32*8-192($tp0), $ACC8, $ACC8
306.Lsqr_entry_1024:
307	vmovdqu		$ACC0, 32*0-192($tp0)
308	vmovdqu		$ACC1, 32*1-192($tp0)
309
310	vpmuludq	32*1-128($ap), $B2, $TEMP0
311	vpaddq		$TEMP0, $ACC2, $ACC2
312	vpmuludq	32*1-128($aap), $B2, $TEMP1
313	vpaddq		$TEMP1, $ACC3, $ACC3
314	vpmuludq	32*2-128($aap), $B2, $TEMP2
315	vpaddq		$TEMP2, $ACC4, $ACC4
316	vpmuludq	32*3-128($aap), $B2, $TEMP0
317	vpaddq		$TEMP0, $ACC5, $ACC5
318	vpmuludq	32*4-128($aap), $B2, $TEMP1
319	vpaddq		$TEMP1, $ACC6, $ACC6
320	vpmuludq	32*5-128($aap), $B2, $TEMP2
321	vpaddq		$TEMP2, $ACC7, $ACC7
322	vpmuludq	32*6-128($aap), $B2, $TEMP0
323	vpaddq		$TEMP0, $ACC8, $ACC8
324	vpmuludq	32*7-128($aap), $B2, $ACC0
325	 vpbroadcastq	32*3-128($tpa), $B2
326	vpaddq		32*9-192($tp0), $ACC0, $ACC0
327
328	vmovdqu		$ACC2, 32*2-192($tp0)
329	vmovdqu		$ACC3, 32*3-192($tp0)
330
331	vpmuludq	32*2-128($ap), $B1, $TEMP2
332	vpaddq		$TEMP2, $ACC4, $ACC4
333	vpmuludq	32*2-128($aap), $B1, $TEMP0
334	vpaddq		$TEMP0, $ACC5, $ACC5
335	vpmuludq	32*3-128($aap), $B1, $TEMP1
336	vpaddq		$TEMP1, $ACC6, $ACC6
337	vpmuludq	32*4-128($aap), $B1, $TEMP2
338	vpaddq		$TEMP2, $ACC7, $ACC7
339	vpmuludq	32*5-128($aap), $B1, $TEMP0
340	vpaddq		$TEMP0, $ACC8, $ACC8
341	vpmuludq	32*6-128($aap), $B1, $TEMP1
342	vpaddq		$TEMP1, $ACC0, $ACC0
343	vpmuludq	32*7-128($aap), $B1, $ACC1
344	 vpbroadcastq	32*4-128($tpa), $B1
345	vpaddq		32*10-448($tp1), $ACC1, $ACC1
346
347	vmovdqu		$ACC4, 32*4-192($tp0)
348	vmovdqu		$ACC5, 32*5-192($tp0)
349
350	vpmuludq	32*3-128($ap), $B2, $TEMP0
351	vpaddq		$TEMP0, $ACC6, $ACC6
352	vpmuludq	32*3-128($aap), $B2, $TEMP1
353	vpaddq		$TEMP1, $ACC7, $ACC7
354	vpmuludq	32*4-128($aap), $B2, $TEMP2
355	vpaddq		$TEMP2, $ACC8, $ACC8
356	vpmuludq	32*5-128($aap), $B2, $TEMP0
357	vpaddq		$TEMP0, $ACC0, $ACC0
358	vpmuludq	32*6-128($aap), $B2, $TEMP1
359	vpaddq		$TEMP1, $ACC1, $ACC1
360	vpmuludq	32*7-128($aap), $B2, $ACC2
361	 vpbroadcastq	32*5-128($tpa), $B2
362	vpaddq		32*11-448($tp1), $ACC2, $ACC2
363
364	vmovdqu		$ACC6, 32*6-192($tp0)
365	vmovdqu		$ACC7, 32*7-192($tp0)
366
367	vpmuludq	32*4-128($ap), $B1, $TEMP0
368	vpaddq		$TEMP0, $ACC8, $ACC8
369	vpmuludq	32*4-128($aap), $B1, $TEMP1
370	vpaddq		$TEMP1, $ACC0, $ACC0
371	vpmuludq	32*5-128($aap), $B1, $TEMP2
372	vpaddq		$TEMP2, $ACC1, $ACC1
373	vpmuludq	32*6-128($aap), $B1, $TEMP0
374	vpaddq		$TEMP0, $ACC2, $ACC2
375	vpmuludq	32*7-128($aap), $B1, $ACC3
376	 vpbroadcastq	32*6-128($tpa), $B1
377	vpaddq		32*12-448($tp1), $ACC3, $ACC3
378
379	vmovdqu		$ACC8, 32*8-192($tp0)
380	vmovdqu		$ACC0, 32*9-192($tp0)
381	lea		8($tp0), $tp0
382
383	vpmuludq	32*5-128($ap), $B2, $TEMP2
384	vpaddq		$TEMP2, $ACC1, $ACC1
385	vpmuludq	32*5-128($aap), $B2, $TEMP0
386	vpaddq		$TEMP0, $ACC2, $ACC2
387	vpmuludq	32*6-128($aap), $B2, $TEMP1
388	vpaddq		$TEMP1, $ACC3, $ACC3
389	vpmuludq	32*7-128($aap), $B2, $ACC4
390	 vpbroadcastq	32*7-128($tpa), $B2
391	vpaddq		32*13-448($tp1), $ACC4, $ACC4
392
393	vmovdqu		$ACC1, 32*10-448($tp1)
394	vmovdqu		$ACC2, 32*11-448($tp1)
395
396	vpmuludq	32*6-128($ap), $B1, $TEMP0
397	vpaddq		$TEMP0, $ACC3, $ACC3
398	vpmuludq	32*6-128($aap), $B1, $TEMP1
399	 vpbroadcastq	32*8-128($tpa), $ACC0		# borrow $ACC0 for $B1
400	vpaddq		$TEMP1, $ACC4, $ACC4
401	vpmuludq	32*7-128($aap), $B1, $ACC5
402	 vpbroadcastq	32*0+8-128($tpa), $B1		# for next iteration
403	vpaddq		32*14-448($tp1), $ACC5, $ACC5
404
405	vmovdqu		$ACC3, 32*12-448($tp1)
406	vmovdqu		$ACC4, 32*13-448($tp1)
407	lea		8($tpa), $tpa
408
409	vpmuludq	32*7-128($ap), $B2, $TEMP0
410	vpaddq		$TEMP0, $ACC5, $ACC5
411	vpmuludq	32*7-128($aap), $B2, $ACC6
412	vpaddq		32*15-448($tp1), $ACC6, $ACC6
413
414	vpmuludq	32*8-128($ap), $ACC0, $ACC7
415	vmovdqu		$ACC5, 32*14-448($tp1)
416	vpaddq		32*16-448($tp1), $ACC7, $ACC7
417	vmovdqu		$ACC6, 32*15-448($tp1)
418	vmovdqu		$ACC7, 32*16-448($tp1)
419	lea		8($tp1), $tp1
420
421	dec	$i
422	jnz	.LOOP_SQR_1024
423___
424$ZERO = $ACC9;
425$TEMP0 = $B1;
426$TEMP2 = $B2;
427$TEMP3 = $Y1;
428$TEMP4 = $Y2;
429$code.=<<___;
430	# we need to fix indexes 32-39 to avoid overflow
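	# carry-fold pattern used throughout: the shift by 29 extracts what
	# has accumulated above each 29-bit digit, the vpand keeps the digit
	# itself, and vpermq/vpblendd rotate the carry vector up by one
	# 64-bit lane so each carry lands on the next digit (the lane that
	# wraps around is routed into the following 256-bit word)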
431	vmovdqu		32*8(%rsp), $ACC8		# 32*8-192($tp0),
432	vmovdqu		32*9(%rsp), $ACC1		# 32*9-192($tp0)
433	vmovdqu		32*10(%rsp), $ACC2		# 32*10-192($tp0)
434	lea		192(%rsp), $tp0			# 64+128=192
435
436	vpsrlq		\$29, $ACC8, $TEMP1
437	vpand		$AND_MASK, $ACC8, $ACC8
438	vpsrlq		\$29, $ACC1, $TEMP2
439	vpand		$AND_MASK, $ACC1, $ACC1
440
441	vpermq		\$0x93, $TEMP1, $TEMP1
442	vpxor		$ZERO, $ZERO, $ZERO
443	vpermq		\$0x93, $TEMP2, $TEMP2
444
445	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
446	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
447	vpaddq		$TEMP0, $ACC8, $ACC8
448	vpblendd	\$3, $TEMP2, $ZERO, $TEMP2
449	vpaddq		$TEMP1, $ACC1, $ACC1
450	vpaddq		$TEMP2, $ACC2, $ACC2
451	vmovdqu		$ACC1, 32*9-192($tp0)
452	vmovdqu		$ACC2, 32*10-192($tp0)
453
454	mov	(%rsp), %rax
455	mov	8(%rsp), $r1
456	mov	16(%rsp), $r2
457	mov	24(%rsp), $r3
458	vmovdqu	32*1(%rsp), $ACC1
459	vmovdqu	32*2-192($tp0), $ACC2
460	vmovdqu	32*3-192($tp0), $ACC3
461	vmovdqu	32*4-192($tp0), $ACC4
462	vmovdqu	32*5-192($tp0), $ACC5
463	vmovdqu	32*6-192($tp0), $ACC6
464	vmovdqu	32*7-192($tp0), $ACC7
465
466	mov	%rax, $r0
467	imull	$n0, %eax
468	and	\$0x1fffffff, %eax
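	# %eax now holds the Montgomery quotient digit y = (acc[0]*n0) mod 2^29;
	# broadcast as $Y1 it lets the vector code add y*N so that the lowest
	# digit of the accumulator cancels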
469	vmovd	%eax, $Y1
470
471	mov	%rax, %rdx
472	imulq	-128($np), %rax
473	 vpbroadcastq	$Y1, $Y1
474	add	%rax, $r0
475	mov	%rdx, %rax
476	imulq	8-128($np), %rax
477	shr	\$29, $r0
478	add	%rax, $r1
479	mov	%rdx, %rax
480	imulq	16-128($np), %rax
481	add	$r0, $r1
482	add	%rax, $r2
483	imulq	24-128($np), %rdx
484	add	%rdx, $r3
485
486	mov	$r1, %rax
487	imull	$n0, %eax
488	and	\$0x1fffffff, %eax
489
490	mov \$9, $i
491	jmp .LOOP_REDUCE_1024
492
493.align	32
494.LOOP_REDUCE_1024:
495	vmovd	%eax, $Y2
496	vpbroadcastq	$Y2, $Y2
497
498	vpmuludq	32*1-128($np), $Y1, $TEMP0
499	 mov	%rax, %rdx
500	 imulq	-128($np), %rax
501	vpaddq		$TEMP0, $ACC1, $ACC1
502	 add	%rax, $r1
503	vpmuludq	32*2-128($np), $Y1, $TEMP1
504	 mov	%rdx, %rax
505	 imulq	8-128($np), %rax
506	vpaddq		$TEMP1, $ACC2, $ACC2
507	vpmuludq	32*3-128($np), $Y1, $TEMP2
508	 .byte	0x67
509	 add	%rax, $r2
510	 .byte	0x67
511	 mov	%rdx, %rax
512	 imulq	16-128($np), %rax
513	 shr	\$29, $r1
514	vpaddq		$TEMP2, $ACC3, $ACC3
515	vpmuludq	32*4-128($np), $Y1, $TEMP0
516	 add	%rax, $r3
517	 add	$r1, $r2
518	vpaddq		$TEMP0, $ACC4, $ACC4
519	vpmuludq	32*5-128($np), $Y1, $TEMP1
520	 mov	$r2, %rax
521	 imull	$n0, %eax
522	vpaddq		$TEMP1, $ACC5, $ACC5
523	vpmuludq	32*6-128($np), $Y1, $TEMP2
524	 and	\$0x1fffffff, %eax
525	vpaddq		$TEMP2, $ACC6, $ACC6
526	vpmuludq	32*7-128($np), $Y1, $TEMP0
527	vpaddq		$TEMP0, $ACC7, $ACC7
528	vpmuludq	32*8-128($np), $Y1, $TEMP1
529	 vmovd	%eax, $Y1
530	 #vmovdqu	32*1-8-128($np), $TEMP2		# moved below
531	vpaddq		$TEMP1, $ACC8, $ACC8
532	 #vmovdqu	32*2-8-128($np), $TEMP0		# moved below
533	 vpbroadcastq	$Y1, $Y1
534
535	vpmuludq	32*1-8-128($np), $Y2, $TEMP2	# see above
536	vmovdqu		32*3-8-128($np), $TEMP1
537	 mov	%rax, %rdx
538	 imulq	-128($np), %rax
539	vpaddq		$TEMP2, $ACC1, $ACC1
540	vpmuludq	32*2-8-128($np), $Y2, $TEMP0	# see above
541	vmovdqu		32*4-8-128($np), $TEMP2
542	 add	%rax, $r2
543	 mov	%rdx, %rax
544	 imulq	8-128($np), %rax
545	vpaddq		$TEMP0, $ACC2, $ACC2
546	 add	$r3, %rax
547	 shr	\$29, $r2
548	vpmuludq	$Y2, $TEMP1, $TEMP1
549	vmovdqu		32*5-8-128($np), $TEMP0
550	 add	$r2, %rax
551	vpaddq		$TEMP1, $ACC3, $ACC3
552	vpmuludq	$Y2, $TEMP2, $TEMP2
553	vmovdqu		32*6-8-128($np), $TEMP1
554	 .byte	0x67
555	 mov	%rax, $r3
556	 imull	$n0, %eax
557	vpaddq		$TEMP2, $ACC4, $ACC4
558	vpmuludq	$Y2, $TEMP0, $TEMP0
559	.byte	0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00	# vmovdqu		32*7-8-128($np), $TEMP2
560	 and	\$0x1fffffff, %eax
561	vpaddq		$TEMP0, $ACC5, $ACC5
562	vpmuludq	$Y2, $TEMP1, $TEMP1
563	vmovdqu		32*8-8-128($np), $TEMP0
564	vpaddq		$TEMP1, $ACC6, $ACC6
565	vpmuludq	$Y2, $TEMP2, $TEMP2
566	vmovdqu		32*9-8-128($np), $ACC9
567	 vmovd	%eax, $ACC0			# borrow ACC0 for Y2
568	 imulq	-128($np), %rax
569	vpaddq		$TEMP2, $ACC7, $ACC7
570	vpmuludq	$Y2, $TEMP0, $TEMP0
571	 vmovdqu	32*1-16-128($np), $TEMP1
572	 vpbroadcastq	$ACC0, $ACC0
573	vpaddq		$TEMP0, $ACC8, $ACC8
574	vpmuludq	$Y2, $ACC9, $ACC9
575	 vmovdqu	32*2-16-128($np), $TEMP2
576	 add	%rax, $r3
577
578___
579($ACC0,$Y2)=($Y2,$ACC0);
580$code.=<<___;
581	 vmovdqu	32*1-24-128($np), $ACC0
582	vpmuludq	$Y1, $TEMP1, $TEMP1
583	vmovdqu		32*3-16-128($np), $TEMP0
584	vpaddq		$TEMP1, $ACC1, $ACC1
585	 vpmuludq	$Y2, $ACC0, $ACC0
586	vpmuludq	$Y1, $TEMP2, $TEMP2
587	.byte	0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff	# vmovdqu		32*4-16-128($np), $TEMP1
588	 vpaddq		$ACC1, $ACC0, $ACC0
589	vpaddq		$TEMP2, $ACC2, $ACC2
590	vpmuludq	$Y1, $TEMP0, $TEMP0
591	vmovdqu		32*5-16-128($np), $TEMP2
592	 .byte	0x67
593	 vmovq		$ACC0, %rax
594	 vmovdqu	$ACC0, (%rsp)		# transfer $r0-$r3
595	vpaddq		$TEMP0, $ACC3, $ACC3
596	vpmuludq	$Y1, $TEMP1, $TEMP1
597	vmovdqu		32*6-16-128($np), $TEMP0
598	vpaddq		$TEMP1, $ACC4, $ACC4
599	vpmuludq	$Y1, $TEMP2, $TEMP2
600	vmovdqu		32*7-16-128($np), $TEMP1
601	vpaddq		$TEMP2, $ACC5, $ACC5
602	vpmuludq	$Y1, $TEMP0, $TEMP0
603	vmovdqu		32*8-16-128($np), $TEMP2
604	vpaddq		$TEMP0, $ACC6, $ACC6
605	vpmuludq	$Y1, $TEMP1, $TEMP1
606	 shr	\$29, $r3
607	vmovdqu		32*9-16-128($np), $TEMP0
608	 add	$r3, %rax
609	vpaddq		$TEMP1, $ACC7, $ACC7
610	vpmuludq	$Y1, $TEMP2, $TEMP2
611	 #vmovdqu	32*2-24-128($np), $TEMP1	# moved below
612	 mov	%rax, $r0
613	 imull	$n0, %eax
614	vpaddq		$TEMP2, $ACC8, $ACC8
615	vpmuludq	$Y1, $TEMP0, $TEMP0
616	 and	\$0x1fffffff, %eax
617	 vmovd	%eax, $Y1
618	 vmovdqu	32*3-24-128($np), $TEMP2
619	.byte	0x67
620	vpaddq		$TEMP0, $ACC9, $ACC9
621	 vpbroadcastq	$Y1, $Y1
622
623	vpmuludq	32*2-24-128($np), $Y2, $TEMP1	# see above
624	vmovdqu		32*4-24-128($np), $TEMP0
625	 mov	%rax, %rdx
626	 imulq	-128($np), %rax
627	 mov	8(%rsp), $r1
628	vpaddq		$TEMP1, $ACC2, $ACC1
629	vpmuludq	$Y2, $TEMP2, $TEMP2
630	vmovdqu		32*5-24-128($np), $TEMP1
631	 add	%rax, $r0
632	 mov	%rdx, %rax
633	 imulq	8-128($np), %rax
634	 .byte	0x67
635	 shr	\$29, $r0
636	 mov	16(%rsp), $r2
637	vpaddq		$TEMP2, $ACC3, $ACC2
638	vpmuludq	$Y2, $TEMP0, $TEMP0
639	vmovdqu		32*6-24-128($np), $TEMP2
640	 add	%rax, $r1
641	 mov	%rdx, %rax
642	 imulq	16-128($np), %rax
643	vpaddq		$TEMP0, $ACC4, $ACC3
644	vpmuludq	$Y2, $TEMP1, $TEMP1
645	vmovdqu		32*7-24-128($np), $TEMP0
646	 imulq	24-128($np), %rdx		# future $r3
647	 add	%rax, $r2
648	 lea	($r0,$r1), %rax
649	vpaddq		$TEMP1, $ACC5, $ACC4
650	vpmuludq	$Y2, $TEMP2, $TEMP2
651	vmovdqu		32*8-24-128($np), $TEMP1
652	 mov	%rax, $r1
653	 imull	$n0, %eax
654	vpmuludq	$Y2, $TEMP0, $TEMP0
655	vpaddq		$TEMP2, $ACC6, $ACC5
656	vmovdqu		32*9-24-128($np), $TEMP2
657	 and	\$0x1fffffff, %eax
658	vpaddq		$TEMP0, $ACC7, $ACC6
659	vpmuludq	$Y2, $TEMP1, $TEMP1
660	 add	24(%rsp), %rdx
661	vpaddq		$TEMP1, $ACC8, $ACC7
662	vpmuludq	$Y2, $TEMP2, $TEMP2
663	vpaddq		$TEMP2, $ACC9, $ACC8
664	 vmovq	$r3, $ACC9
665	 mov	%rdx, $r3
666
667	dec	$i
668	jnz	.LOOP_REDUCE_1024
669___
670($ACC0,$Y2)=($Y2,$ACC0);
671$code.=<<___;
672	lea	448(%rsp), $tp1			# size optimization
673	vpaddq	$ACC9, $Y2, $ACC0
674	vpxor	$ZERO, $ZERO, $ZERO
675
676	vpaddq		32*9-192($tp0), $ACC0, $ACC0
677	vpaddq		32*10-448($tp1), $ACC1, $ACC1
678	vpaddq		32*11-448($tp1), $ACC2, $ACC2
679	vpaddq		32*12-448($tp1), $ACC3, $ACC3
680	vpaddq		32*13-448($tp1), $ACC4, $ACC4
681	vpaddq		32*14-448($tp1), $ACC5, $ACC5
682	vpaddq		32*15-448($tp1), $ACC6, $ACC6
683	vpaddq		32*16-448($tp1), $ACC7, $ACC7
684	vpaddq		32*17-448($tp1), $ACC8, $ACC8
685
686	vpsrlq		\$29, $ACC0, $TEMP1
687	vpand		$AND_MASK, $ACC0, $ACC0
688	vpsrlq		\$29, $ACC1, $TEMP2
689	vpand		$AND_MASK, $ACC1, $ACC1
690	vpsrlq		\$29, $ACC2, $TEMP3
691	vpermq		\$0x93, $TEMP1, $TEMP1
692	vpand		$AND_MASK, $ACC2, $ACC2
693	vpsrlq		\$29, $ACC3, $TEMP4
694	vpermq		\$0x93, $TEMP2, $TEMP2
695	vpand		$AND_MASK, $ACC3, $ACC3
696	vpermq		\$0x93, $TEMP3, $TEMP3
697
698	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
699	vpermq		\$0x93, $TEMP4, $TEMP4
700	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
701	vpaddq		$TEMP0, $ACC0, $ACC0
702	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
703	vpaddq		$TEMP1, $ACC1, $ACC1
704	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
705	vpaddq		$TEMP2, $ACC2, $ACC2
706	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
707	vpaddq		$TEMP3, $ACC3, $ACC3
708	vpaddq		$TEMP4, $ACC4, $ACC4
709
710	vpsrlq		\$29, $ACC0, $TEMP1
711	vpand		$AND_MASK, $ACC0, $ACC0
712	vpsrlq		\$29, $ACC1, $TEMP2
713	vpand		$AND_MASK, $ACC1, $ACC1
714	vpsrlq		\$29, $ACC2, $TEMP3
715	vpermq		\$0x93, $TEMP1, $TEMP1
716	vpand		$AND_MASK, $ACC2, $ACC2
717	vpsrlq		\$29, $ACC3, $TEMP4
718	vpermq		\$0x93, $TEMP2, $TEMP2
719	vpand		$AND_MASK, $ACC3, $ACC3
720	vpermq		\$0x93, $TEMP3, $TEMP3
721
722	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
723	vpermq		\$0x93, $TEMP4, $TEMP4
724	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
725	vpaddq		$TEMP0, $ACC0, $ACC0
726	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
727	vpaddq		$TEMP1, $ACC1, $ACC1
728	vmovdqu		$ACC0, 32*0-128($rp)
729	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
730	vpaddq		$TEMP2, $ACC2, $ACC2
731	vmovdqu		$ACC1, 32*1-128($rp)
732	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
733	vpaddq		$TEMP3, $ACC3, $ACC3
734	vmovdqu		$ACC2, 32*2-128($rp)
735	vpaddq		$TEMP4, $ACC4, $ACC4
736	vmovdqu		$ACC3, 32*3-128($rp)
737___
738$TEMP5=$ACC0;
739$code.=<<___;
740	vpsrlq		\$29, $ACC4, $TEMP1
741	vpand		$AND_MASK, $ACC4, $ACC4
742	vpsrlq		\$29, $ACC5, $TEMP2
743	vpand		$AND_MASK, $ACC5, $ACC5
744	vpsrlq		\$29, $ACC6, $TEMP3
745	vpermq		\$0x93, $TEMP1, $TEMP1
746	vpand		$AND_MASK, $ACC6, $ACC6
747	vpsrlq		\$29, $ACC7, $TEMP4
748	vpermq		\$0x93, $TEMP2, $TEMP2
749	vpand		$AND_MASK, $ACC7, $ACC7
750	vpsrlq		\$29, $ACC8, $TEMP5
751	vpermq		\$0x93, $TEMP3, $TEMP3
752	vpand		$AND_MASK, $ACC8, $ACC8
753	vpermq		\$0x93, $TEMP4, $TEMP4
754
755	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
756	vpermq		\$0x93, $TEMP5, $TEMP5
757	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
758	vpaddq		$TEMP0, $ACC4, $ACC4
759	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
760	vpaddq		$TEMP1, $ACC5, $ACC5
761	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
762	vpaddq		$TEMP2, $ACC6, $ACC6
763	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
764	vpaddq		$TEMP3, $ACC7, $ACC7
765	vpaddq		$TEMP4, $ACC8, $ACC8
766
767	vpsrlq		\$29, $ACC4, $TEMP1
768	vpand		$AND_MASK, $ACC4, $ACC4
769	vpsrlq		\$29, $ACC5, $TEMP2
770	vpand		$AND_MASK, $ACC5, $ACC5
771	vpsrlq		\$29, $ACC6, $TEMP3
772	vpermq		\$0x93, $TEMP1, $TEMP1
773	vpand		$AND_MASK, $ACC6, $ACC6
774	vpsrlq		\$29, $ACC7, $TEMP4
775	vpermq		\$0x93, $TEMP2, $TEMP2
776	vpand		$AND_MASK, $ACC7, $ACC7
777	vpsrlq		\$29, $ACC8, $TEMP5
778	vpermq		\$0x93, $TEMP3, $TEMP3
779	vpand		$AND_MASK, $ACC8, $ACC8
780	vpermq		\$0x93, $TEMP4, $TEMP4
781
782	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
783	vpermq		\$0x93, $TEMP5, $TEMP5
784	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
785	vpaddq		$TEMP0, $ACC4, $ACC4
786	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
787	vpaddq		$TEMP1, $ACC5, $ACC5
788	vmovdqu		$ACC4, 32*4-128($rp)
789	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
790	vpaddq		$TEMP2, $ACC6, $ACC6
791	vmovdqu		$ACC5, 32*5-128($rp)
792	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
793	vpaddq		$TEMP3, $ACC7, $ACC7
794	vmovdqu		$ACC6, 32*6-128($rp)
795	vpaddq		$TEMP4, $ACC8, $ACC8
796	vmovdqu		$ACC7, 32*7-128($rp)
797	vmovdqu		$ACC8, 32*8-128($rp)
798
799	mov	$rp, $ap
800	dec	$rep
801	jne	.LOOP_GRANDE_SQR_1024
802
803	vzeroall
804	mov	%rbp, %rax
805___
806$code.=<<___ if ($win64);
807	movaps	-0xd8(%rax),%xmm6
808	movaps	-0xc8(%rax),%xmm7
809	movaps	-0xb8(%rax),%xmm8
810	movaps	-0xa8(%rax),%xmm9
811	movaps	-0x98(%rax),%xmm10
812	movaps	-0x88(%rax),%xmm11
813	movaps	-0x78(%rax),%xmm12
814	movaps	-0x68(%rax),%xmm13
815	movaps	-0x58(%rax),%xmm14
816	movaps	-0x48(%rax),%xmm15
817___
818$code.=<<___;
819	mov	-48(%rax),%r15
820	mov	-40(%rax),%r14
821	mov	-32(%rax),%r13
822	mov	-24(%rax),%r12
823	mov	-16(%rax),%rbp
824	mov	-8(%rax),%rbx
825	lea	(%rax),%rsp		# restore %rsp
826.Lsqr_1024_epilogue:
827	ret
828.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
829___
830}
831
832{ # void AMM_WW(
833my $rp="%rdi";	# BN_ULONG *rp,
834my $ap="%rsi";	# const BN_ULONG *ap,
835my $bp="%rdx";	# const BN_ULONG *bp,
836my $np="%rcx";	# const BN_ULONG *np,
837my $n0="%r8d";	# unsigned int n0);
838
839# The registers that hold the accumulated redundant result
840# The AMM works on 1024-bit operands, and the redundant word size is 29 bits.
841# Therefore: ceil(1024/29)/4 = 36/4 = 9
842my $ACC0="%ymm0";
843my $ACC1="%ymm1";
844my $ACC2="%ymm2";
845my $ACC3="%ymm3";
846my $ACC4="%ymm4";
847my $ACC5="%ymm5";
848my $ACC6="%ymm6";
849my $ACC7="%ymm7";
850my $ACC8="%ymm8";
851my $ACC9="%ymm9";
852
853# Registers that hold the broadcasted words of multiplier, currently used
854my $Bi="%ymm10";
855my $Yi="%ymm11";
856
857# Helper registers
858my $TEMP0=$ACC0;
859my $TEMP1="%ymm12";
860my $TEMP2="%ymm13";
861my $ZERO="%ymm14";
862my $AND_MASK="%ymm15";
863
864# alu registers that hold the first words of the ACC
865my $r0="%r9";
866my $r1="%r10";
867my $r2="%r11";
868my $r3="%r12";
869
870my $i="%r14d";
871my $tmp="%r15";
872
873$bp="%r13";	# reassigned argument
874
875$code.=<<___;
876.globl	rsaz_1024_mul_avx2
877.type	rsaz_1024_mul_avx2,\@function,5
878.align	64
879rsaz_1024_mul_avx2:
880	lea	(%rsp), %rax
881	push	%rbx
882	push	%rbp
883	push	%r12
884	push	%r13
885	push	%r14
886	push	%r15
887___
888$code.=<<___ if ($win64);
889	vzeroupper
890	lea	-0xa8(%rsp),%rsp
891	vmovaps	%xmm6,-0xd8(%rax)
892	vmovaps	%xmm7,-0xc8(%rax)
893	vmovaps	%xmm8,-0xb8(%rax)
894	vmovaps	%xmm9,-0xa8(%rax)
895	vmovaps	%xmm10,-0x98(%rax)
896	vmovaps	%xmm11,-0x88(%rax)
897	vmovaps	%xmm12,-0x78(%rax)
898	vmovaps	%xmm13,-0x68(%rax)
899	vmovaps	%xmm14,-0x58(%rax)
900	vmovaps	%xmm15,-0x48(%rax)
901.Lmul_1024_body:
902___
903$code.=<<___;
904	mov	%rax,%rbp
905	vzeroall
906	mov	%rdx, $bp	# reassigned argument
907	sub	\$64,%rsp
908
909	# unaligned 256-bit load that crosses page boundary can
910	# cause severe performance degradation here, so if $ap does
911	# cross page boundary, swap it with $bp [meaning that caller
912	# is advised to lay down $ap and $bp next to each other, so
913	# that only one can cross page boundary].
914	.byte	0x67,0x67
915	mov	$ap, $tmp
916	and	\$4095, $tmp
917	add	\$32*10, $tmp
918	shr	\$12, $tmp
919	mov	$ap, $tmp
920	cmovnz	$bp, $ap
921	cmovnz	$tmp, $bp
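	# the shr above set ZF; the two cmovnz's (and the flag-preserving mov
	# in between) swap $ap and $bp only when the ten 256-bit words at $ap
	# would reach into the next 4K page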
922
923	mov	$np, $tmp
924	sub	\$-128,$ap	# size optimization
925	sub	\$-128,$np
926	sub	\$-128,$rp
927
928	and	\$4095, $tmp	# see if $np crosses page
929	add	\$32*10, $tmp
930	.byte	0x67,0x67
931	shr	\$12, $tmp
932	jz	.Lmul_1024_no_n_copy
933
934	# unaligned 256-bit load that crosses page boundary can
935	# cause severe performance degradation here, so if $np does
936	# cross page boundary, copy it to stack and make sure stack
937	# frame doesn't...
938	sub		\$32*10,%rsp
939	vmovdqu		32*0-128($np), $ACC0
940	and		\$-512, %rsp
941	vmovdqu		32*1-128($np), $ACC1
942	vmovdqu		32*2-128($np), $ACC2
943	vmovdqu		32*3-128($np), $ACC3
944	vmovdqu		32*4-128($np), $ACC4
945	vmovdqu		32*5-128($np), $ACC5
946	vmovdqu		32*6-128($np), $ACC6
947	vmovdqu		32*7-128($np), $ACC7
948	vmovdqu		32*8-128($np), $ACC8
949	lea		64+128(%rsp),$np
950	vmovdqu		$ACC0, 32*0-128($np)
951	vpxor		$ACC0, $ACC0, $ACC0
952	vmovdqu		$ACC1, 32*1-128($np)
953	vpxor		$ACC1, $ACC1, $ACC1
954	vmovdqu		$ACC2, 32*2-128($np)
955	vpxor		$ACC2, $ACC2, $ACC2
956	vmovdqu		$ACC3, 32*3-128($np)
957	vpxor		$ACC3, $ACC3, $ACC3
958	vmovdqu		$ACC4, 32*4-128($np)
959	vpxor		$ACC4, $ACC4, $ACC4
960	vmovdqu		$ACC5, 32*5-128($np)
961	vpxor		$ACC5, $ACC5, $ACC5
962	vmovdqu		$ACC6, 32*6-128($np)
963	vpxor		$ACC6, $ACC6, $ACC6
964	vmovdqu		$ACC7, 32*7-128($np)
965	vpxor		$ACC7, $ACC7, $ACC7
966	vmovdqu		$ACC8, 32*8-128($np)
967	vmovdqa		$ACC0, $ACC8
968	vmovdqu		$ACC9, 32*9-128($np)	# $ACC9 is zero after vzeroall
969.Lmul_1024_no_n_copy:
970	and	\$-64,%rsp
971
972	mov	($bp), %rbx
973	vpbroadcastq ($bp), $Bi
974	vmovdqu	$ACC0, (%rsp)			# clear top of stack
975	xor	$r0, $r0
976	.byte	0x67
977	xor	$r1, $r1
978	xor	$r2, $r2
979	xor	$r3, $r3
980
981	vmovdqu	.Land_mask(%rip), $AND_MASK
982	mov	\$9, $i
983	vmovdqu	$ACC9, 32*9-128($rp)		# $ACC9 is zero after vzeroall
984	jmp	.Loop_mul_1024
985
986.align	32
987.Loop_mul_1024:
988	 vpsrlq		\$29, $ACC3, $ACC9		# correct $ACC3(*)
989	mov	%rbx, %rax
990	imulq	-128($ap), %rax
991	add	$r0, %rax
992	mov	%rbx, $r1
993	imulq	8-128($ap), $r1
994	add	8(%rsp), $r1
995
996	mov	%rax, $r0
997	imull	$n0, %eax
998	and	\$0x1fffffff, %eax
999
1000	 mov	%rbx, $r2
1001	 imulq	16-128($ap), $r2
1002	 add	16(%rsp), $r2
1003
1004	 mov	%rbx, $r3
1005	 imulq	24-128($ap), $r3
1006	 add	24(%rsp), $r3
1007	vpmuludq	32*1-128($ap),$Bi,$TEMP0
1008	 vmovd		%eax, $Yi
1009	vpaddq		$TEMP0,$ACC1,$ACC1
1010	vpmuludq	32*2-128($ap),$Bi,$TEMP1
1011	 vpbroadcastq	$Yi, $Yi
1012	vpaddq		$TEMP1,$ACC2,$ACC2
1013	vpmuludq	32*3-128($ap),$Bi,$TEMP2
1014	 vpand		$AND_MASK, $ACC3, $ACC3		# correct $ACC3
1015	vpaddq		$TEMP2,$ACC3,$ACC3
1016	vpmuludq	32*4-128($ap),$Bi,$TEMP0
1017	vpaddq		$TEMP0,$ACC4,$ACC4
1018	vpmuludq	32*5-128($ap),$Bi,$TEMP1
1019	vpaddq		$TEMP1,$ACC5,$ACC5
1020	vpmuludq	32*6-128($ap),$Bi,$TEMP2
1021	vpaddq		$TEMP2,$ACC6,$ACC6
1022	vpmuludq	32*7-128($ap),$Bi,$TEMP0
1023	 vpermq		\$0x93, $ACC9, $ACC9		# correct $ACC3
1024	vpaddq		$TEMP0,$ACC7,$ACC7
1025	vpmuludq	32*8-128($ap),$Bi,$TEMP1
1026	 vpbroadcastq	8($bp), $Bi
1027	vpaddq		$TEMP1,$ACC8,$ACC8
1028
1029	mov	%rax,%rdx
1030	imulq	-128($np),%rax
1031	add	%rax,$r0
1032	mov	%rdx,%rax
1033	imulq	8-128($np),%rax
1034	add	%rax,$r1
1035	mov	%rdx,%rax
1036	imulq	16-128($np),%rax
1037	add	%rax,$r2
1038	shr	\$29, $r0
1039	imulq	24-128($np),%rdx
1040	add	%rdx,$r3
1041	add	$r0, $r1
1042
1043	vpmuludq	32*1-128($np),$Yi,$TEMP2
1044	 vmovq		$Bi, %rbx
1045	vpaddq		$TEMP2,$ACC1,$ACC1
1046	vpmuludq	32*2-128($np),$Yi,$TEMP0
1047	vpaddq		$TEMP0,$ACC2,$ACC2
1048	vpmuludq	32*3-128($np),$Yi,$TEMP1
1049	vpaddq		$TEMP1,$ACC3,$ACC3
1050	vpmuludq	32*4-128($np),$Yi,$TEMP2
1051	vpaddq		$TEMP2,$ACC4,$ACC4
1052	vpmuludq	32*5-128($np),$Yi,$TEMP0
1053	vpaddq		$TEMP0,$ACC5,$ACC5
1054	vpmuludq	32*6-128($np),$Yi,$TEMP1
1055	vpaddq		$TEMP1,$ACC6,$ACC6
1056	vpmuludq	32*7-128($np),$Yi,$TEMP2
1057	 vpblendd	\$3, $ZERO, $ACC9, $ACC9	# correct $ACC3
1058	vpaddq		$TEMP2,$ACC7,$ACC7
1059	vpmuludq	32*8-128($np),$Yi,$TEMP0
1060	 vpaddq		$ACC9, $ACC3, $ACC3		# correct $ACC3
1061	vpaddq		$TEMP0,$ACC8,$ACC8
1062
1063	mov	%rbx, %rax
1064	imulq	-128($ap),%rax
1065	add	%rax,$r1
1066	 vmovdqu	-8+32*1-128($ap),$TEMP1
1067	mov	%rbx, %rax
1068	imulq	8-128($ap),%rax
1069	add	%rax,$r2
1070	 vmovdqu	-8+32*2-128($ap),$TEMP2
1071
1072	mov	$r1, %rax
1073	imull	$n0, %eax
1074	and	\$0x1fffffff, %eax
1075
1076	 imulq	16-128($ap),%rbx
1077	 add	%rbx,$r3
1078	vpmuludq	$Bi,$TEMP1,$TEMP1
1079	 vmovd		%eax, $Yi
1080	vmovdqu		-8+32*3-128($ap),$TEMP0
1081	vpaddq		$TEMP1,$ACC1,$ACC1
1082	vpmuludq	$Bi,$TEMP2,$TEMP2
1083	 vpbroadcastq	$Yi, $Yi
1084	vmovdqu		-8+32*4-128($ap),$TEMP1
1085	vpaddq		$TEMP2,$ACC2,$ACC2
1086	vpmuludq	$Bi,$TEMP0,$TEMP0
1087	vmovdqu		-8+32*5-128($ap),$TEMP2
1088	vpaddq		$TEMP0,$ACC3,$ACC3
1089	vpmuludq	$Bi,$TEMP1,$TEMP1
1090	vmovdqu		-8+32*6-128($ap),$TEMP0
1091	vpaddq		$TEMP1,$ACC4,$ACC4
1092	vpmuludq	$Bi,$TEMP2,$TEMP2
1093	vmovdqu		-8+32*7-128($ap),$TEMP1
1094	vpaddq		$TEMP2,$ACC5,$ACC5
1095	vpmuludq	$Bi,$TEMP0,$TEMP0
1096	vmovdqu		-8+32*8-128($ap),$TEMP2
1097	vpaddq		$TEMP0,$ACC6,$ACC6
1098	vpmuludq	$Bi,$TEMP1,$TEMP1
1099	vmovdqu		-8+32*9-128($ap),$ACC9
1100	vpaddq		$TEMP1,$ACC7,$ACC7
1101	vpmuludq	$Bi,$TEMP2,$TEMP2
1102	vpaddq		$TEMP2,$ACC8,$ACC8
1103	vpmuludq	$Bi,$ACC9,$ACC9
1104	 vpbroadcastq	16($bp), $Bi
1105
1106	mov	%rax,%rdx
1107	imulq	-128($np),%rax
1108	add	%rax,$r1
1109	 vmovdqu	-8+32*1-128($np),$TEMP0
1110	mov	%rdx,%rax
1111	imulq	8-128($np),%rax
1112	add	%rax,$r2
1113	 vmovdqu	-8+32*2-128($np),$TEMP1
1114	shr	\$29, $r1
1115	imulq	16-128($np),%rdx
1116	add	%rdx,$r3
1117	add	$r1, $r2
1118
1119	vpmuludq	$Yi,$TEMP0,$TEMP0
1120	 vmovq		$Bi, %rbx
1121	vmovdqu		-8+32*3-128($np),$TEMP2
1122	vpaddq		$TEMP0,$ACC1,$ACC1
1123	vpmuludq	$Yi,$TEMP1,$TEMP1
1124	vmovdqu		-8+32*4-128($np),$TEMP0
1125	vpaddq		$TEMP1,$ACC2,$ACC2
1126	vpmuludq	$Yi,$TEMP2,$TEMP2
1127	vmovdqu		-8+32*5-128($np),$TEMP1
1128	vpaddq		$TEMP2,$ACC3,$ACC3
1129	vpmuludq	$Yi,$TEMP0,$TEMP0
1130	vmovdqu		-8+32*6-128($np),$TEMP2
1131	vpaddq		$TEMP0,$ACC4,$ACC4
1132	vpmuludq	$Yi,$TEMP1,$TEMP1
1133	vmovdqu		-8+32*7-128($np),$TEMP0
1134	vpaddq		$TEMP1,$ACC5,$ACC5
1135	vpmuludq	$Yi,$TEMP2,$TEMP2
1136	vmovdqu		-8+32*8-128($np),$TEMP1
1137	vpaddq		$TEMP2,$ACC6,$ACC6
1138	vpmuludq	$Yi,$TEMP0,$TEMP0
1139	vmovdqu		-8+32*9-128($np),$TEMP2
1140	vpaddq		$TEMP0,$ACC7,$ACC7
1141	vpmuludq	$Yi,$TEMP1,$TEMP1
1142	vpaddq		$TEMP1,$ACC8,$ACC8
1143	vpmuludq	$Yi,$TEMP2,$TEMP2
1144	vpaddq		$TEMP2,$ACC9,$ACC9
1145
1146	 vmovdqu	-16+32*1-128($ap),$TEMP0
1147	mov	%rbx,%rax
1148	imulq	-128($ap),%rax
1149	add	$r2,%rax
1150
1151	 vmovdqu	-16+32*2-128($ap),$TEMP1
1152	mov	%rax,$r2
1153	imull	$n0, %eax
1154	and	\$0x1fffffff, %eax
1155
1156	 imulq	8-128($ap),%rbx
1157	 add	%rbx,$r3
1158	vpmuludq	$Bi,$TEMP0,$TEMP0
1159	 vmovd		%eax, $Yi
1160	vmovdqu		-16+32*3-128($ap),$TEMP2
1161	vpaddq		$TEMP0,$ACC1,$ACC1
1162	vpmuludq	$Bi,$TEMP1,$TEMP1
1163	 vpbroadcastq	$Yi, $Yi
1164	vmovdqu		-16+32*4-128($ap),$TEMP0
1165	vpaddq		$TEMP1,$ACC2,$ACC2
1166	vpmuludq	$Bi,$TEMP2,$TEMP2
1167	vmovdqu		-16+32*5-128($ap),$TEMP1
1168	vpaddq		$TEMP2,$ACC3,$ACC3
1169	vpmuludq	$Bi,$TEMP0,$TEMP0
1170	vmovdqu		-16+32*6-128($ap),$TEMP2
1171	vpaddq		$TEMP0,$ACC4,$ACC4
1172	vpmuludq	$Bi,$TEMP1,$TEMP1
1173	vmovdqu		-16+32*7-128($ap),$TEMP0
1174	vpaddq		$TEMP1,$ACC5,$ACC5
1175	vpmuludq	$Bi,$TEMP2,$TEMP2
1176	vmovdqu		-16+32*8-128($ap),$TEMP1
1177	vpaddq		$TEMP2,$ACC6,$ACC6
1178	vpmuludq	$Bi,$TEMP0,$TEMP0
1179	vmovdqu		-16+32*9-128($ap),$TEMP2
1180	vpaddq		$TEMP0,$ACC7,$ACC7
1181	vpmuludq	$Bi,$TEMP1,$TEMP1
1182	vpaddq		$TEMP1,$ACC8,$ACC8
1183	vpmuludq	$Bi,$TEMP2,$TEMP2
1184	 vpbroadcastq	24($bp), $Bi
1185	vpaddq		$TEMP2,$ACC9,$ACC9
1186
1187	 vmovdqu	-16+32*1-128($np),$TEMP0
1188	mov	%rax,%rdx
1189	imulq	-128($np),%rax
1190	add	%rax,$r2
1191	 vmovdqu	-16+32*2-128($np),$TEMP1
1192	imulq	8-128($np),%rdx
1193	add	%rdx,$r3
1194	shr	\$29, $r2
1195
1196	vpmuludq	$Yi,$TEMP0,$TEMP0
1197	 vmovq		$Bi, %rbx
1198	vmovdqu		-16+32*3-128($np),$TEMP2
1199	vpaddq		$TEMP0,$ACC1,$ACC1
1200	vpmuludq	$Yi,$TEMP1,$TEMP1
1201	vmovdqu		-16+32*4-128($np),$TEMP0
1202	vpaddq		$TEMP1,$ACC2,$ACC2
1203	vpmuludq	$Yi,$TEMP2,$TEMP2
1204	vmovdqu		-16+32*5-128($np),$TEMP1
1205	vpaddq		$TEMP2,$ACC3,$ACC3
1206	vpmuludq	$Yi,$TEMP0,$TEMP0
1207	vmovdqu		-16+32*6-128($np),$TEMP2
1208	vpaddq		$TEMP0,$ACC4,$ACC4
1209	vpmuludq	$Yi,$TEMP1,$TEMP1
1210	vmovdqu		-16+32*7-128($np),$TEMP0
1211	vpaddq		$TEMP1,$ACC5,$ACC5
1212	vpmuludq	$Yi,$TEMP2,$TEMP2
1213	vmovdqu		-16+32*8-128($np),$TEMP1
1214	vpaddq		$TEMP2,$ACC6,$ACC6
1215	vpmuludq	$Yi,$TEMP0,$TEMP0
1216	vmovdqu		-16+32*9-128($np),$TEMP2
1217	vpaddq		$TEMP0,$ACC7,$ACC7
1218	vpmuludq	$Yi,$TEMP1,$TEMP1
1219	 vmovdqu	-24+32*1-128($ap),$TEMP0
1220	vpaddq		$TEMP1,$ACC8,$ACC8
1221	vpmuludq	$Yi,$TEMP2,$TEMP2
1222	 vmovdqu	-24+32*2-128($ap),$TEMP1
1223	vpaddq		$TEMP2,$ACC9,$ACC9
1224
1225	add	$r2, $r3
1226	imulq	-128($ap),%rbx
1227	add	%rbx,$r3
1228
1229	mov	$r3, %rax
1230	imull	$n0, %eax
1231	and	\$0x1fffffff, %eax
1232
1233	vpmuludq	$Bi,$TEMP0,$TEMP0
1234	 vmovd		%eax, $Yi
1235	vmovdqu		-24+32*3-128($ap),$TEMP2
1236	vpaddq		$TEMP0,$ACC1,$ACC1
1237	vpmuludq	$Bi,$TEMP1,$TEMP1
1238	 vpbroadcastq	$Yi, $Yi
1239	vmovdqu		-24+32*4-128($ap),$TEMP0
1240	vpaddq		$TEMP1,$ACC2,$ACC2
1241	vpmuludq	$Bi,$TEMP2,$TEMP2
1242	vmovdqu		-24+32*5-128($ap),$TEMP1
1243	vpaddq		$TEMP2,$ACC3,$ACC3
1244	vpmuludq	$Bi,$TEMP0,$TEMP0
1245	vmovdqu		-24+32*6-128($ap),$TEMP2
1246	vpaddq		$TEMP0,$ACC4,$ACC4
1247	vpmuludq	$Bi,$TEMP1,$TEMP1
1248	vmovdqu		-24+32*7-128($ap),$TEMP0
1249	vpaddq		$TEMP1,$ACC5,$ACC5
1250	vpmuludq	$Bi,$TEMP2,$TEMP2
1251	vmovdqu		-24+32*8-128($ap),$TEMP1
1252	vpaddq		$TEMP2,$ACC6,$ACC6
1253	vpmuludq	$Bi,$TEMP0,$TEMP0
1254	vmovdqu		-24+32*9-128($ap),$TEMP2
1255	vpaddq		$TEMP0,$ACC7,$ACC7
1256	vpmuludq	$Bi,$TEMP1,$TEMP1
1257	vpaddq		$TEMP1,$ACC8,$ACC8
1258	vpmuludq	$Bi,$TEMP2,$TEMP2
1259	 vpbroadcastq	32($bp), $Bi
1260	vpaddq		$TEMP2,$ACC9,$ACC9
1261	 add		\$32, $bp			# $bp++
1262
1263	vmovdqu		-24+32*1-128($np),$TEMP0
1264	imulq	-128($np),%rax
1265	add	%rax,$r3
1266	shr	\$29, $r3
1267
1268	vmovdqu		-24+32*2-128($np),$TEMP1
1269	vpmuludq	$Yi,$TEMP0,$TEMP0
1270	 vmovq		$Bi, %rbx
1271	vmovdqu		-24+32*3-128($np),$TEMP2
1272	vpaddq		$TEMP0,$ACC1,$ACC0		# $ACC0==$TEMP0
1273	vpmuludq	$Yi,$TEMP1,$TEMP1
1274	 vmovdqu	$ACC0, (%rsp)			# transfer $r0-$r3
1275	vpaddq		$TEMP1,$ACC2,$ACC1
1276	vmovdqu		-24+32*4-128($np),$TEMP0
1277	vpmuludq	$Yi,$TEMP2,$TEMP2
1278	vmovdqu		-24+32*5-128($np),$TEMP1
1279	vpaddq		$TEMP2,$ACC3,$ACC2
1280	vpmuludq	$Yi,$TEMP0,$TEMP0
1281	vmovdqu		-24+32*6-128($np),$TEMP2
1282	vpaddq		$TEMP0,$ACC4,$ACC3
1283	vpmuludq	$Yi,$TEMP1,$TEMP1
1284	vmovdqu		-24+32*7-128($np),$TEMP0
1285	vpaddq		$TEMP1,$ACC5,$ACC4
1286	vpmuludq	$Yi,$TEMP2,$TEMP2
1287	vmovdqu		-24+32*8-128($np),$TEMP1
1288	vpaddq		$TEMP2,$ACC6,$ACC5
1289	vpmuludq	$Yi,$TEMP0,$TEMP0
1290	vmovdqu		-24+32*9-128($np),$TEMP2
1291	 mov	$r3, $r0
1292	vpaddq		$TEMP0,$ACC7,$ACC6
1293	vpmuludq	$Yi,$TEMP1,$TEMP1
1294	 add	(%rsp), $r0
1295	vpaddq		$TEMP1,$ACC8,$ACC7
1296	vpmuludq	$Yi,$TEMP2,$TEMP2
1297	 vmovq	$r3, $TEMP1
1298	vpaddq		$TEMP2,$ACC9,$ACC8
1299
1300	dec	$i
1301	jnz	.Loop_mul_1024
1302___
1303
1304# (*)	Original implementation was correcting ACC1-ACC3 for overflow
1305#	after 7 loop runs, or after 28 iterations, or 56 additions.
1306#	But as we underutilize resources, it's possible to correct in
1307#	each iteration with marginal performance loss. And since we do
1308#	it in every iteration, we can correct fewer digits and avoid
1309#	the performance penalty completely. Also note that we correct
1310#	only three digits out of four. This works because the most
1311#	significant digit is subjected to fewer additions.
1312
1313$TEMP0 = $ACC9;
1314$TEMP3 = $Bi;
1315$TEMP4 = $Yi;
1316$code.=<<___;
1317	vpermq		\$0, $AND_MASK, $AND_MASK
1318	vpaddq		(%rsp), $TEMP1, $ACC0
1319
1320	vpsrlq		\$29, $ACC0, $TEMP1
1321	vpand		$AND_MASK, $ACC0, $ACC0
1322	vpsrlq		\$29, $ACC1, $TEMP2
1323	vpand		$AND_MASK, $ACC1, $ACC1
1324	vpsrlq		\$29, $ACC2, $TEMP3
1325	vpermq		\$0x93, $TEMP1, $TEMP1
1326	vpand		$AND_MASK, $ACC2, $ACC2
1327	vpsrlq		\$29, $ACC3, $TEMP4
1328	vpermq		\$0x93, $TEMP2, $TEMP2
1329	vpand		$AND_MASK, $ACC3, $ACC3
1330
1331	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
1332	vpermq		\$0x93, $TEMP3, $TEMP3
1333	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
1334	vpermq		\$0x93, $TEMP4, $TEMP4
1335	vpaddq		$TEMP0, $ACC0, $ACC0
1336	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
1337	vpaddq		$TEMP1, $ACC1, $ACC1
1338	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
1339	vpaddq		$TEMP2, $ACC2, $ACC2
1340	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
1341	vpaddq		$TEMP3, $ACC3, $ACC3
1342	vpaddq		$TEMP4, $ACC4, $ACC4
1343
1344	vpsrlq		\$29, $ACC0, $TEMP1
1345	vpand		$AND_MASK, $ACC0, $ACC0
1346	vpsrlq		\$29, $ACC1, $TEMP2
1347	vpand		$AND_MASK, $ACC1, $ACC1
1348	vpsrlq		\$29, $ACC2, $TEMP3
1349	vpermq		\$0x93, $TEMP1, $TEMP1
1350	vpand		$AND_MASK, $ACC2, $ACC2
1351	vpsrlq		\$29, $ACC3, $TEMP4
1352	vpermq		\$0x93, $TEMP2, $TEMP2
1353	vpand		$AND_MASK, $ACC3, $ACC3
1354	vpermq		\$0x93, $TEMP3, $TEMP3
1355
1356	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
1357	vpermq		\$0x93, $TEMP4, $TEMP4
1358	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
1359	vpaddq		$TEMP0, $ACC0, $ACC0
1360	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
1361	vpaddq		$TEMP1, $ACC1, $ACC1
1362	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
1363	vpaddq		$TEMP2, $ACC2, $ACC2
1364	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
1365	vpaddq		$TEMP3, $ACC3, $ACC3
1366	vpaddq		$TEMP4, $ACC4, $ACC4
1367
1368	vmovdqu		$ACC0, 0-128($rp)
1369	vmovdqu		$ACC1, 32-128($rp)
1370	vmovdqu		$ACC2, 64-128($rp)
1371	vmovdqu		$ACC3, 96-128($rp)
1372___
1373
1374$TEMP5=$ACC0;
1375$code.=<<___;
1376	vpsrlq		\$29, $ACC4, $TEMP1
1377	vpand		$AND_MASK, $ACC4, $ACC4
1378	vpsrlq		\$29, $ACC5, $TEMP2
1379	vpand		$AND_MASK, $ACC5, $ACC5
1380	vpsrlq		\$29, $ACC6, $TEMP3
1381	vpermq		\$0x93, $TEMP1, $TEMP1
1382	vpand		$AND_MASK, $ACC6, $ACC6
1383	vpsrlq		\$29, $ACC7, $TEMP4
1384	vpermq		\$0x93, $TEMP2, $TEMP2
1385	vpand		$AND_MASK, $ACC7, $ACC7
1386	vpsrlq		\$29, $ACC8, $TEMP5
1387	vpermq		\$0x93, $TEMP3, $TEMP3
1388	vpand		$AND_MASK, $ACC8, $ACC8
1389	vpermq		\$0x93, $TEMP4, $TEMP4
1390
1391	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
1392	vpermq		\$0x93, $TEMP5, $TEMP5
1393	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
1394	vpaddq		$TEMP0, $ACC4, $ACC4
1395	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
1396	vpaddq		$TEMP1, $ACC5, $ACC5
1397	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
1398	vpaddq		$TEMP2, $ACC6, $ACC6
1399	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
1400	vpaddq		$TEMP3, $ACC7, $ACC7
1401	vpaddq		$TEMP4, $ACC8, $ACC8
1402
1403	vpsrlq		\$29, $ACC4, $TEMP1
1404	vpand		$AND_MASK, $ACC4, $ACC4
1405	vpsrlq		\$29, $ACC5, $TEMP2
1406	vpand		$AND_MASK, $ACC5, $ACC5
1407	vpsrlq		\$29, $ACC6, $TEMP3
1408	vpermq		\$0x93, $TEMP1, $TEMP1
1409	vpand		$AND_MASK, $ACC6, $ACC6
1410	vpsrlq		\$29, $ACC7, $TEMP4
1411	vpermq		\$0x93, $TEMP2, $TEMP2
1412	vpand		$AND_MASK, $ACC7, $ACC7
1413	vpsrlq		\$29, $ACC8, $TEMP5
1414	vpermq		\$0x93, $TEMP3, $TEMP3
1415	vpand		$AND_MASK, $ACC8, $ACC8
1416	vpermq		\$0x93, $TEMP4, $TEMP4
1417
1418	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
1419	vpermq		\$0x93, $TEMP5, $TEMP5
1420	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
1421	vpaddq		$TEMP0, $ACC4, $ACC4
1422	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
1423	vpaddq		$TEMP1, $ACC5, $ACC5
1424	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
1425	vpaddq		$TEMP2, $ACC6, $ACC6
1426	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
1427	vpaddq		$TEMP3, $ACC7, $ACC7
1428	vpaddq		$TEMP4, $ACC8, $ACC8
1429
1430	vmovdqu		$ACC4, 128-128($rp)
1431	vmovdqu		$ACC5, 160-128($rp)
1432	vmovdqu		$ACC6, 192-128($rp)
1433	vmovdqu		$ACC7, 224-128($rp)
1434	vmovdqu		$ACC8, 256-128($rp)
1435	vzeroupper
1436
1437	mov	%rbp, %rax
1438___
1439$code.=<<___ if ($win64);
1440	movaps	-0xd8(%rax),%xmm6
1441	movaps	-0xc8(%rax),%xmm7
1442	movaps	-0xb8(%rax),%xmm8
1443	movaps	-0xa8(%rax),%xmm9
1444	movaps	-0x98(%rax),%xmm10
1445	movaps	-0x88(%rax),%xmm11
1446	movaps	-0x78(%rax),%xmm12
1447	movaps	-0x68(%rax),%xmm13
1448	movaps	-0x58(%rax),%xmm14
1449	movaps	-0x48(%rax),%xmm15
1450___
1451$code.=<<___;
1452	mov	-48(%rax),%r15
1453	mov	-40(%rax),%r14
1454	mov	-32(%rax),%r13
1455	mov	-24(%rax),%r12
1456	mov	-16(%rax),%rbp
1457	mov	-8(%rax),%rbx
1458	lea	(%rax),%rsp		# restore %rsp
1459.Lmul_1024_epilogue:
1460	ret
1461.size	rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
1462___
1463}
1464{
1465my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
1466my @T = map("%r$_",(8..11));
1467
1468$code.=<<___;
1469.globl	rsaz_1024_red2norm_avx2
1470.type	rsaz_1024_red2norm_avx2,\@abi-omnipotent
1471.align	32
1472rsaz_1024_red2norm_avx2:
1473	sub	\$-128,$inp	# size optimization
1474	xor	%rax,%rax
1475___
1476
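# The generated rsaz_1024_red2norm_avx2 walks the 40 29-bit digits and, for
# each 64-bit output word, shifts every digit overlapping that word into
# place, sums the pieces in %rax and carries the excess into the next word.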
1477for ($j=0,$i=0; $i<16; $i++) {
1478    my $k=0;
1479    while (29*$j<64*($i+1)) {	# load data till boundary
1480	$code.="	mov	`8*$j-128`($inp), @T[0]\n";
1481	$j++; $k++; push(@T,shift(@T));
1482    }
1483    $l=$k;
1484    while ($k>1) {		# shift loaded data but last value
1485	$code.="	shl	\$`29*($j-$k)`,@T[-$k]\n";
1486	$k--;
1487    }
1488    $code.=<<___;		# shift last value
1489	mov	@T[-1], @T[0]
1490	shl	\$`29*($j-1)`, @T[-1]
1491	shr	\$`-29*($j-1)`, @T[0]
1492___
1493    while ($l) {		# accumulate all values
1494	$code.="	add	@T[-$l], %rax\n";
1495	$l--;
1496    }
1497	$code.=<<___;
1498	adc	\$0, @T[0]	# consume eventual carry
1499	mov	%rax, 8*$i($out)
1500	mov	@T[0], %rax
1501___
1502    push(@T,shift(@T));
1503}
1504$code.=<<___;
1505	ret
1506.size	rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
1507
1508.globl	rsaz_1024_norm2red_avx2
1509.type	rsaz_1024_norm2red_avx2,\@abi-omnipotent
1510.align	32
1511rsaz_1024_norm2red_avx2:
1512	sub	\$-128,$out	# size optimization
1513	mov	($inp),@T[0]
1514	mov	\$0x1fffffff,%eax
1515___
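# The generator below emits the inverse conversion: each 64-bit input word is
# split into 29-bit digits (shrd stitches digits that straddle a word
# boundary), giving 36 significant digits plus four zero digits of padding.
# Roughly, as an illustrative sketch only (never executed; the input name is
# hypothetical and Math::BigInt is merely assumed to be available):
#
#	my $x = Math::BigInt->from_hex($hex_1024_bit_input);
#	my @digits;
#	for (1..40) { push @digits, ($x & 0x1fffffff)->numify(); $x >>= 29; }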
1516for ($j=0,$i=0; $i<16; $i++) {
1517    $code.="	mov	`8*($i+1)`($inp),@T[1]\n"	if ($i<15);
1518    $code.="	xor	@T[1],@T[1]\n"			if ($i==15);
1519    my $k=1;
1520    while (29*($j+1)<64*($i+1)) {
1521    	$code.=<<___;
1522	mov	@T[0],@T[-$k]
1523	shr	\$`29*$j`,@T[-$k]
1524	and	%rax,@T[-$k]				# &0x1fffffff
1525	mov	@T[-$k],`8*$j-128`($out)
1526___
1527	$j++; $k++;
1528    }
1529    $code.=<<___;
1530	shrd	\$`29*$j`,@T[1],@T[0]
1531	and	%rax,@T[0]
1532	mov	@T[0],`8*$j-128`($out)
1533___
1534    $j++;
1535    push(@T,shift(@T));
1536}
1537$code.=<<___;
1538	mov	@T[0],`8*$j-128`($out)			# zero
1539	mov	@T[0],`8*($j+1)-128`($out)
1540	mov	@T[0],`8*($j+2)-128`($out)
1541	mov	@T[0],`8*($j+3)-128`($out)
1542	ret
1543.size	rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
1544___
1545}
1546{
1547my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1548
1549$code.=<<___;
1550.globl	rsaz_1024_scatter5_avx2
1551.type	rsaz_1024_scatter5_avx2,\@abi-omnipotent
1552.align	32
1553rsaz_1024_scatter5_avx2:
1554	vzeroupper
1555	vmovdqu	.Lscatter_permd(%rip),%ymm5
1556	shl	\$4,$power
1557	lea	($out,$power),$out
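	# table layout produced here: 9 rows of 16*32 bytes, each row holding
	# 32 entries of 16 bytes ($power selects the entry); the vpermd with
	# .Lscatter_permd packs the four 29-bit digits of a 256-bit word into
	# four 32-bit slots before the 16-byte store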
1558	mov	\$9,%eax
1559	jmp	.Loop_scatter_1024
1560
1561.align	32
1562.Loop_scatter_1024:
1563	vmovdqu		($inp),%ymm0
1564	lea		32($inp),$inp
1565	vpermd		%ymm0,%ymm5,%ymm0
1566	vmovdqu		%xmm0,($out)
1567	lea		16*32($out),$out
1568	dec	%eax
1569	jnz	.Loop_scatter_1024
1570
1571	vzeroupper
1572	ret
1573.size	rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
1574
1575.globl	rsaz_1024_gather5_avx2
1576.type	rsaz_1024_gather5_avx2,\@abi-omnipotent
1577.align	32
1578rsaz_1024_gather5_avx2:
1579___
1580$code.=<<___ if ($win64);
1581	lea	-0x88(%rsp),%rax
1582	vzeroupper
1583.LSEH_begin_rsaz_1024_gather5:
1584	# I can't trust assembler to use specific encoding:-(
1585	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
1586	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6,-0x20(%rax)
1587	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7,-0x10(%rax)
1588	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8,0(%rax)
1589	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9,0x10(%rax)
1590	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10,0x20(%rax)
1591	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11,0x30(%rax)
1592	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12,0x40(%rax)
1593	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13,0x50(%rax)
1594	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14,0x60(%rax)
1595	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15,0x70(%rax)
1596___
1597$code.=<<___;
1598	lea	.Lgather_table(%rip),%r11
1599	mov	$power,%eax
1600	and	\$3,$power
1601	shr	\$2,%eax			# cache line number
1602	shl	\$4,$power			# offset within cache line
1603
1604	vmovdqu		-32(%r11),%ymm7		# .Lgather_permd
1605	vpbroadcastb	8(%r11,%rax), %xmm8
1606	vpbroadcastb	7(%r11,%rax), %xmm9
1607	vpbroadcastb	6(%r11,%rax), %xmm10
1608	vpbroadcastb	5(%r11,%rax), %xmm11
1609	vpbroadcastb	4(%r11,%rax), %xmm12
1610	vpbroadcastb	3(%r11,%rax), %xmm13
1611	vpbroadcastb	2(%r11,%rax), %xmm14
1612	vpbroadcastb	1(%r11,%rax), %xmm15
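	# exactly one of %xmm8-%xmm15 is now all-ones (the one whose load from
	# .Lgather_table hit the 0xff byte for the selected cache line); the
	# loop below reads all eight 64-byte groups of every row and masks out
	# all but the wanted entry, so the set of cache lines touched does not
	# depend on $power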
1613
1614	lea	64($inp,$power),$inp
1615	mov	\$64,%r11			# size optimization
1616	mov	\$9,%eax
1617	jmp	.Loop_gather_1024
1618
1619.align	32
1620.Loop_gather_1024:
1621	vpand		-64($inp),		%xmm8,%xmm0
1622	vpand		($inp),			%xmm9,%xmm1
1623	vpand		64($inp),		%xmm10,%xmm2
1624	vpand		($inp,%r11,2),		%xmm11,%xmm3
1625	 vpor					%xmm0,%xmm1,%xmm1
1626	vpand		64($inp,%r11,2),	%xmm12,%xmm4
1627	 vpor					%xmm2,%xmm3,%xmm3
1628	vpand		($inp,%r11,4),		%xmm13,%xmm5
1629	 vpor					%xmm1,%xmm3,%xmm3
1630	vpand		64($inp,%r11,4),	%xmm14,%xmm6
1631	 vpor					%xmm4,%xmm5,%xmm5
1632	vpand		-128($inp,%r11,8),	%xmm15,%xmm2
1633	lea		($inp,%r11,8),$inp
1634	 vpor					%xmm3,%xmm5,%xmm5
1635	 vpor					%xmm2,%xmm6,%xmm6
1636	 vpor					%xmm5,%xmm6,%xmm6
1637	vpermd		%ymm6,%ymm7,%ymm6
1638	vmovdqu		%ymm6,($out)
1639	lea		32($out),$out
1640	dec	%eax
1641	jnz	.Loop_gather_1024
1642
1643	vpxor	%ymm0,%ymm0,%ymm0
1644	vmovdqu	%ymm0,($out)
1645	vzeroupper
1646___
1647$code.=<<___ if ($win64);
1648	movaps	(%rsp),%xmm6
1649	movaps	0x10(%rsp),%xmm7
1650	movaps	0x20(%rsp),%xmm8
1651	movaps	0x30(%rsp),%xmm9
1652	movaps	0x40(%rsp),%xmm10
1653	movaps	0x50(%rsp),%xmm11
1654	movaps	0x60(%rsp),%xmm12
1655	movaps	0x70(%rsp),%xmm13
1656	movaps	0x80(%rsp),%xmm14
1657	movaps	0x90(%rsp),%xmm15
1658	lea	0xa8(%rsp),%rsp
1659.LSEH_end_rsaz_1024_gather5:
1660___
1661$code.=<<___;
1662	ret
1663.size	rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
1664___
1665}
1666
1667$code.=<<___;
1668.extern	OPENSSL_ia32cap_P
1669.globl	rsaz_avx2_eligible
1670.type	rsaz_avx2_eligible,\@abi-omnipotent
1671.align	32
1672rsaz_avx2_eligible:
1673	mov	OPENSSL_ia32cap_P+8(%rip),%eax
1674___
1675$code.=<<___	if ($addx);
1676	mov	\$`1<<8|1<<19`,%ecx
1677	mov	\$0,%edx
1678	and	%eax,%ecx
1679	cmp	\$`1<<8|1<<19`,%ecx	# check for BMI2+AD*X
1680	cmove	%edx,%eax
1681___
1682$code.=<<___;
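	# bit 5 of OPENSSL_ia32cap_P[8..11] (CPUID.7.0:EBX bit 5) is the AVX2
	# capability flag, so this returns 1 iff AVX2 may be used (the
	# optional AD*X check above, when compiled in, vetoes it first)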
1683	and	\$`1<<5`,%eax
1684	shr	\$5,%eax
1685	ret
1686.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible
1687
1688.align	64
1689.Land_mask:
1690	.quad	0x1fffffff,0x1fffffff,0x1fffffff,-1
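	# the all-ones top quadword is deliberate: the multiply loop masks
	# only three of the four digits per correction step (see the (*)
	# footnote near .Loop_mul_1024), while the square path broadcasts
	# element 0 and never sees this lane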
1691.Lscatter_permd:
1692	.long	0,2,4,6,7,7,7,7
1693.Lgather_permd:
1694	.long	0,7,1,7,2,7,3,7
1695.Lgather_table:
1696	.byte	0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
1697.align	64
1698___
1699
1700if ($win64) {
1701$rec="%rcx";
1702$frame="%rdx";
1703$context="%r8";
1704$disp="%r9";
1705
1706$code.=<<___
1707.extern	__imp_RtlVirtualUnwind
1708.type	rsaz_se_handler,\@abi-omnipotent
1709.align	16
1710rsaz_se_handler:
1711	push	%rsi
1712	push	%rdi
1713	push	%rbx
1714	push	%rbp
1715	push	%r12
1716	push	%r13
1717	push	%r14
1718	push	%r15
1719	pushfq
1720	sub	\$64,%rsp
1721
1722	mov	120($context),%rax	# pull context->Rax
1723	mov	248($context),%rbx	# pull context->Rip
1724
1725	mov	8($disp),%rsi		# disp->ImageBase
1726	mov	56($disp),%r11		# disp->HandlerData
1727
1728	mov	0(%r11),%r10d		# HandlerData[0]
1729	lea	(%rsi,%r10),%r10	# prologue label
1730	cmp	%r10,%rbx		# context->Rip<prologue label
1731	jb	.Lcommon_seh_tail
1732
1733	mov	152($context),%rax	# pull context->Rsp
1734
1735	mov	4(%r11),%r10d		# HandlerData[1]
1736	lea	(%rsi,%r10),%r10	# epilogue label
1737	cmp	%r10,%rbx		# context->Rip>=epilogue label
1738	jae	.Lcommon_seh_tail
1739
1740	mov	160($context),%rax	# pull context->Rbp
1741
1742	mov	-48(%rax),%r15
1743	mov	-40(%rax),%r14
1744	mov	-32(%rax),%r13
1745	mov	-24(%rax),%r12
1746	mov	-16(%rax),%rbp
1747	mov	-8(%rax),%rbx
1748	mov	%r15,240($context)
1749	mov	%r14,232($context)
1750	mov	%r13,224($context)
1751	mov	%r12,216($context)
1752	mov	%rbp,160($context)
1753	mov	%rbx,144($context)
1754
1755	lea	-0xd8(%rax),%rsi	# %xmm save area
1756	lea	512($context),%rdi	# & context.Xmm6
1757	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
1758	.long	0xa548f3fc		# cld; rep movsq
1759
1760.Lcommon_seh_tail:
1761	mov	8(%rax),%rdi
1762	mov	16(%rax),%rsi
1763	mov	%rax,152($context)	# restore context->Rsp
1764	mov	%rsi,168($context)	# restore context->Rsi
1765	mov	%rdi,176($context)	# restore context->Rdi
1766
1767	mov	40($disp),%rdi		# disp->ContextRecord
1768	mov	$context,%rsi		# context
1769	mov	\$154,%ecx		# sizeof(CONTEXT)
1770	.long	0xa548f3fc		# cld; rep movsq
1771
1772	mov	$disp,%rsi
1773	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1774	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1775	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1776	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1777	mov	40(%rsi),%r10		# disp->ContextRecord
1778	lea	56(%rsi),%r11		# &disp->HandlerData
1779	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1780	mov	%r10,32(%rsp)		# arg5
1781	mov	%r11,40(%rsp)		# arg6
1782	mov	%r12,48(%rsp)		# arg7
1783	mov	%rcx,56(%rsp)		# arg8, (NULL)
1784	call	*__imp_RtlVirtualUnwind(%rip)
1785
1786	mov	\$1,%eax		# ExceptionContinueSearch
1787	add	\$64,%rsp
1788	popfq
1789	pop	%r15
1790	pop	%r14
1791	pop	%r13
1792	pop	%r12
1793	pop	%rbp
1794	pop	%rbx
1795	pop	%rdi
1796	pop	%rsi
1797	ret
1798.size	rsaz_se_handler,.-rsaz_se_handler
1799
1800.section	.pdata
1801.align	4
1802	.rva	.LSEH_begin_rsaz_1024_sqr_avx2
1803	.rva	.LSEH_end_rsaz_1024_sqr_avx2
1804	.rva	.LSEH_info_rsaz_1024_sqr_avx2
1805
1806	.rva	.LSEH_begin_rsaz_1024_mul_avx2
1807	.rva	.LSEH_end_rsaz_1024_mul_avx2
1808	.rva	.LSEH_info_rsaz_1024_mul_avx2
1809
1810	.rva	.LSEH_begin_rsaz_1024_gather5
1811	.rva	.LSEH_end_rsaz_1024_gather5
1812	.rva	.LSEH_info_rsaz_1024_gather5
1813.section	.xdata
1814.align	8
1815.LSEH_info_rsaz_1024_sqr_avx2:
1816	.byte	9,0,0,0
1817	.rva	rsaz_se_handler
1818	.rva	.Lsqr_1024_body,.Lsqr_1024_epilogue
1819.LSEH_info_rsaz_1024_mul_avx2:
1820	.byte	9,0,0,0
1821	.rva	rsaz_se_handler
1822	.rva	.Lmul_1024_body,.Lmul_1024_epilogue
1823.LSEH_info_rsaz_1024_gather5:
1824	.byte	0x01,0x33,0x16,0x00
1825	.byte	0x36,0xf8,0x09,0x00	#vmovaps 0x90(rsp),xmm15
1826	.byte	0x31,0xe8,0x08,0x00	#vmovaps 0x80(rsp),xmm14
1827	.byte	0x2c,0xd8,0x07,0x00	#vmovaps 0x70(rsp),xmm13
1828	.byte	0x27,0xc8,0x06,0x00	#vmovaps 0x60(rsp),xmm12
1829	.byte	0x22,0xb8,0x05,0x00	#vmovaps 0x50(rsp),xmm11
1830	.byte	0x1d,0xa8,0x04,0x00	#vmovaps 0x40(rsp),xmm10
1831	.byte	0x18,0x98,0x03,0x00	#vmovaps 0x30(rsp),xmm9
1832	.byte	0x13,0x88,0x02,0x00	#vmovaps 0x20(rsp),xmm8
1833	.byte	0x0e,0x78,0x01,0x00	#vmovaps 0x10(rsp),xmm7
1834	.byte	0x09,0x68,0x00,0x00	#vmovaps 0x00(rsp),xmm6
1835	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
1836___
1837}
1838
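# Post-process the generated code: evaluate the backtick-quoted arithmetic,
# reduce shift counts modulo 64 (this also fixes up the negative shr amounts
# emitted above), and rewrite instructions that only touch the low 128 bits
# (vmovd/vmovq, vpinsr, vpextr, register-source vpbroadcast) to their %xmm
# forms.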
1839foreach (split("\n",$code)) {
1840	s/\`([^\`]*)\`/eval($1)/ge;
1841
1842	s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge		or
1843
1844	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
1845	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
1846	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
1847	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
1848	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
1849	print $_,"\n";
1850}
1851
1852}}} else {{{
1853print <<___;	# assembler is too old
1854.text
1855
1856.globl	rsaz_avx2_eligible
1857.type	rsaz_avx2_eligible,\@abi-omnipotent
1858rsaz_avx2_eligible:
1859	xor	%eax,%eax
1860	ret
1861.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible
1862
1863.globl	rsaz_1024_sqr_avx2
1864.globl	rsaz_1024_mul_avx2
1865.globl	rsaz_1024_norm2red_avx2
1866.globl	rsaz_1024_red2norm_avx2
1867.globl	rsaz_1024_scatter5_avx2
1868.globl	rsaz_1024_gather5_avx2
1869.type	rsaz_1024_sqr_avx2,\@abi-omnipotent
1870rsaz_1024_sqr_avx2:
1871rsaz_1024_mul_avx2:
1872rsaz_1024_norm2red_avx2:
1873rsaz_1024_red2norm_avx2:
1874rsaz_1024_scatter5_avx2:
1875rsaz_1024_gather5_avx2:
1876	.byte	0x0f,0x0b	# ud2
1877	ret
1878.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
1879___
1880}}}
1881
1882close STDOUT;
1883