1#! /usr/bin/env perl
2# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for the Intel AES-NI extension. In
18# the OpenSSL context it's used with the Intel engine, but it can also
19# be used as a drop-in replacement for crypto/aes/asm/aes-x86_64.pl
20# [see below for details].
21#
22# Performance.
23#
24# Given the aes(enc|dec) instructions' latency, the asymptotic
25# performance for non-parallelizable modes such as CBC encrypt is 3.75
26# cycles per byte processed with a 128-bit key. Given their throughput,
27# the asymptotic performance for parallelizable modes is 1.25 cycles
28# per byte. Being asymptotic limits, these are not figures one commonly
29# achieves in reality, but how close does one get? Below are results
30# collected for different modes and block sizes. Pairs of numbers are
31# for en-/decryption.
32#
33#	16-byte     64-byte     256-byte    1-KB        8-KB
34# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
35# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
36# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
37# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
38# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
39# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
40#
41# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42# that the otherwise used 'openssl speed -evp aes-128-??? -engine aesni
43# [-decrypt]' will exhibit 10-15% worse results for smaller blocks. The
44# results were collected with a specially crafted speed.c benchmark in
45# order to compare them with the results reported in the "Intel Advanced
46# Encryption Standard (AES) New Instruction Set" White Paper Revision
47# 3.0 dated May 2010. All of the above results are consistently better.
48# This module also provides better performance for block sizes smaller
49# than 128 bytes at points *not* represented in the above table.
50#
51# Looking at the results for the 8-KB buffer.
52#
53# CFB and OFB results are far from the limit, because the implementation
54# uses the "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55# single-block aesni_encrypt, which is not the optimal way to go. The
56# CBC encrypt result is unexpectedly high and there is no documented
57# explanation for it. Seemingly there is a small penalty for feeding
58# the result back into the AES unit the way it's done in CBC mode;
59# there is nothing one can do about it and the result appears optimal.
60# The CCM result is identical to CBC, because CBC-MAC is essentially
61# CBC encrypt without saving the output. The CCM CTR "stays invisible,"
62# because it's neatly interleaved with CBC-MAC. This provides a ~30%
63# improvement over a "straightforward" CCM implementation with CTR and
64# CBC-MAC performed disjointly. Parallelizable modes practically
65# achieve the theoretical limit.
66#
67# Looking at how the results vary with buffer size.
68#
69# The curves are practically saturated at 1-KB buffer size. In most
70# cases "256-byte" performance is >95%, and "64-byte" ~90%, of the
71# "8-KB" figure. The CTR curve doesn't follow this pattern and is the
72# slowest-changing one, its "256-byte" result being 87% of "8-KB."
73# This is because the overhead in CTR mode is the most computationally
74# intensive. Small-block CCM decrypt is slower than encrypt, because
75# the first CTR and last CBC-MAC iterations can't be interleaved.
76#
77# Results for 192- and 256-bit keys.
78#
79# EVP-free results were observed to scale perfectly with the number of
80# rounds for larger block sizes, i.e. the 192-bit result is 10/12 of
81# the 128-bit one and the 256-bit one is 10/14 of it. Well, in the CBC
82# encrypt case the differences are a tad smaller, because the above-
83# mentioned penalty biases all results by the same constant value. In a
84# similar way function call overhead affects small-block performance,
85# as well as OFB and CFB results. The differences are not large; the most
86# common coefficients are 10/11.7 and 10/13.4 (as opposed to 10/12.0 and
87# 10/14.0), but one can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
88
89# January 2011
90#
91# While the Westmere processor features 6-cycle latency for aes[enc|dec]
92# instructions, which can be scheduled only every second cycle, Sandy
93# Bridge spends 8 cycles per instruction but can schedule them every
94# cycle. This means that code targeting Westmere would perform
95# suboptimally on Sandy Bridge; hence this update.
96#
97# In addition, non-parallelizable CBC encrypt (as well as CCM) is
98# optimized. Relative improvement might appear modest, 8% on Westmere,
99# but in absolute terms it's 3.77 cycles per byte encrypted with
100# 128-bit key on Westmere, and 5.07 on Sandy Bridge. These numbers
101# should be compared to asymptotic limits of 3.75 for Westmere and
102# 5.00 for Sandy Bridge. Actually, the fact that they get this close
103# to asymptotic limits is quite amazing. Indeed, the limit is
104# calculated as latency times the number of rounds, 10 for a 128-bit
105# key, divided by 16, the number of bytes in a block; in other words,
106# it accounts *solely* for aesenc instructions. But there are extra
107# instructions, and numbers so close to the asymptotic limits mean
108# that it's as if it takes as little as *one* additional cycle to
109# execute all of them. How is it possible? It is possible thanks to
110# out-of-order execution logic, which manages to overlap post-
111# processing of previous block, things like saving the output, with
112# actual encryption of current block, as well as pre-processing of
113# current block, things like fetching input and xor-ing it with
114# 0-round element of the key schedule, with actual encryption of
115# previous block. Keep this in mind...
116#
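# As a sanity check, the asymptotic limits quoted above can be reproduced
# with a couple of lines of Perl (an illustrative sketch, not part of this
# module; latency and round count are the figures given in this note):
#
#	my $rounds = 10;					# 128-bit key
#	printf "Westmere CBC limit:     %.2f\n", 6*$rounds/16;	# 3.75
#	printf "Sandy Bridge CBC limit: %.2f\n", 8*$rounds/16;	# 5.00
#	printf "extra cycles per block: %.1f and %.1f\n",
#		(3.77-3.75)*16, (5.07-5.00)*16;			# ~0.3 and ~1.1
#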
117# For parallelizable modes, such as ECB, CBC decrypt and CTR, higher
118# performance is achieved by interleaving instructions working on
119# independent blocks, in which case the asymptotic limit for such modes
120# can be obtained by dividing the above-mentioned numbers by the AES
121# instructions' interleave factor. Westmere can execute at most 3
122# instructions at a time, meaning that the optimal interleave factor is
123# 3, and that's where the "magic" number of 1.25 comes from. "Optimal
124# interleave factor" means that increasing the interleave factor further
125# does not improve performance. The formula has proven to reflect
126# reality pretty well on Westmere... Sandy Bridge, on the other hand,
127# can execute up to 8 AES instructions at a time, so how does varying
128# the interleave factor affect performance? Here is a table for ECB
129# (numbers are cycles per byte processed with 128-bit key):
130#
131# instruction interleave factor		3x	6x	8x
132# theoretical asymptotic limit		1.67	0.83	0.625
133# measured performance for 8KB block	1.05	0.86	0.84
134#
135# "as if" interleave factor		4.7x	5.8x	6.0x
136#
137# Further data for other parallelizable modes:
138#
139# CBC decrypt				1.16	0.93	0.74
140# CTR					1.14	0.91	0.74
141#
142# Well, given the 3x column it's probably inappropriate to call the
143# limit asymptotic if it can be surpassed, isn't it? What happens
144# there? Rewind to the CBC paragraph for the answer. Yes, out-of-order
145# execution magic is responsible for this. The processor overlaps not
146# only the additional instructions with the AES ones, but even AES
147# instructions processing adjacent triplets of independent blocks. In
148# the 6x case the additional instructions still claim a disproportionately
149# small number of additional cycles, but in the 8x case the instruction
150# count must be a tad too high for the out-of-order logic to cope with,
151# and the AES unit remains underutilized... As you can see, 8x interleave
152# is hardly justifiable, so there is no need to feel bad that the 32-bit
153# aesni-x86.pl utilizes 6x interleave because of limited register bank capacity.
154#
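# For reference, the "theoretical asymptotic limit" row above is simply
# the Sandy Bridge serial limit divided by the interleave factor; a quick
# check in Perl (an illustrative sketch, not part of this module):
#
#	my $limit = 8*10/16;			# 5.00 c/b, 128-bit key
#	printf "%dx: %.3f\n", $_, $limit/$_ for (3, 6, 8);
#						# 1.667, 0.833, 0.625
#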
155# Higher interleave factors do have a negative impact on Westmere
156# performance. While for ECB mode it's a negligible ~1.5%, other
157# parallelizable modes perform ~5% worse, which is outweighed by the
158# ~25% improvement on Sandy Bridge. To balance the regression on
159# Westmere, CTR mode was implemented with a 6x aesenc interleave factor.
160
161# April 2011
162#
163# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164# one byte out of 8KB with a 128-bit key, Sandy Bridge 0.90. Just like
165# in CTR mode, the AES instruction interleave factor was chosen to be 6x.
166
167# November 2015
168#
169# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
170# chosen to be 6x.
171
172######################################################################
173# Current large-block performance in cycles per byte processed with
174# 128-bit key (less is better).
175#
176#		CBC en-/decrypt	CTR	XTS	ECB	OCB
177# Westmere	3.77/1.25	1.25	1.25	1.26
178# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
179# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
180# Skylake	2.62/0.63	0.63	0.63	0.63
181# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
182# Knights L	2.54/0.77	0.78	0.85	-	1.50
183# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
184# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
185# Ryzen		2.71/0.35	0.35	0.44	0.38	0.49
186#
187# (*)	Atom Silvermont ECB result is suboptimal because of penalties
188#	incurred by operations on %xmm8-15. As ECB is not considered
189#	critical, nothing was done to mitigate the problem.
190
191$PREFIX="aes_hw";	# if $PREFIX is set to "AES", the script
192			# generates drop-in replacement for
193			# crypto/aes/asm/aes-x86_64.pl:-)
194
195$flavour = shift;
196$output  = shift;
197if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
198
199$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
200
201$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
202( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
203( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
204die "can't locate x86_64-xlate.pl";
205
206open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
207*STDOUT=*OUT;
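# Typical invocation (illustrative): the first argument is an assembler
# flavour understood by x86_64-xlate.pl ("elf", "macosx", "mingw64",
# "nasm", ...), the second is the output file, e.g.
#
#	perl aesni-x86_64.pl elf aesni-x86_64.S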
208
209$movkey = $PREFIX eq "aes_hw" ? "movups" : "movups";
210@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
211		("%rdi","%rsi","%rdx","%rcx");	# Unix order
212
213$code=".text\n";
214$code.=".extern	OPENSSL_ia32cap_P\n";
215
216$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
217# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
218$inp="%rdi";
219$out="%rsi";
220$len="%rdx";
221$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
222$ivp="%r8";	# cbc, ctr, ...
223
224$rnds_="%r10d";	# backup copy for $rounds
225$key_="%r11";	# backup copy for $key
226
227# %xmm register layout
228$rndkey0="%xmm0";	$rndkey1="%xmm1";
229$inout0="%xmm2";	$inout1="%xmm3";
230$inout2="%xmm4";	$inout3="%xmm5";
231$inout4="%xmm6";	$inout5="%xmm7";
232$inout6="%xmm8";	$inout7="%xmm9";
233
234$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
235$in0="%xmm8";		$iv="%xmm9";
236
237# Inline version of the internal aesni_[en|de]crypt1.
238#
239# Why a folded loop? Because aes[enc|dec] is slow enough to accommodate
240# the cycles that take care of the loop variables...
241{ my $sn;
242sub aesni_generate1 {
243my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
244++$sn;
245$code.=<<___;
246	$movkey	($key),$rndkey0
247	$movkey	16($key),$rndkey1
248___
249$code.=<<___ if (defined($ivec));
250	xorps	$rndkey0,$ivec
251	lea	32($key),$key
252	xorps	$ivec,$inout
253___
254$code.=<<___ if (!defined($ivec));
255	lea	32($key),$key
256	xorps	$rndkey0,$inout
257___
258$code.=<<___;
259.Loop_${p}1_$sn:
260	aes${p}	$rndkey1,$inout
261	dec	$rounds
262	$movkey	($key),$rndkey1
263	lea	16($key),$key
264	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
265	aes${p}last	$rndkey1,$inout
266___
267}}
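# For reference, the two invocation forms used later in this file are
# (illustrative; see the actual call sites below):
#
#	&aesni_generate1("enc",$key,$rounds);	# encrypt $inout0 in place
#	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
#						# encrypt $inout1, with $in0
#						# (and the 0-round key) xor-ed
#						# in first, as in CCM's CBC-MAC
#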
268# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
269#
270{ my ($inp,$out,$key) = @_4args;
271
272$code.=<<___;
273.globl	${PREFIX}_encrypt
274.type	${PREFIX}_encrypt,\@abi-omnipotent
275.align	16
276${PREFIX}_encrypt:
277.cfi_startproc
278	_CET_ENDBR
279#ifdef BORINGSSL_DISPATCH_TEST
280.extern	BORINGSSL_function_hit
281	movb \$1,BORINGSSL_function_hit+1(%rip)
282#endif
283	movups	($inp),$inout0		# load input
284	mov	240($key),$rounds	# key->rounds
285___
286	&aesni_generate1("enc",$key,$rounds);
287$code.=<<___;
288	 pxor	$rndkey0,$rndkey0	# clear register bank
289	 pxor	$rndkey1,$rndkey1
290	movups	$inout0,($out)		# output
291	 pxor	$inout0,$inout0
292	ret
293.cfi_endproc
294.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
295
296.globl	${PREFIX}_decrypt
297.type	${PREFIX}_decrypt,\@abi-omnipotent
298.align	16
299${PREFIX}_decrypt:
300.cfi_startproc
301	_CET_ENDBR
302	movups	($inp),$inout0		# load input
303	mov	240($key),$rounds	# key->rounds
304___
305	&aesni_generate1("dec",$key,$rounds);
306$code.=<<___;
307	 pxor	$rndkey0,$rndkey0	# clear register bank
308	 pxor	$rndkey1,$rndkey1
309	movups	$inout0,($out)		# output
310	 pxor	$inout0,$inout0
311	ret
312.cfi_endproc
313.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
314___
315}
316
317# _aesni_[en|de]cryptN are private interfaces, N denotes the interleave
318# factor. Why were 3x subroutines originally used in the loops? Even
319# though aes[enc|dec] latency was originally 6, it could be scheduled
320# only every *2nd* cycle. Thus 3x interleave was the one providing
321# optimal utilization, i.e. the subroutine's throughput was virtually
322# the same as that of the non-interleaved subroutine [for up to 3 input
323# blocks]. This is why it originally made no sense to implement a 2x
324# subroutine. But times change, and it became appropriate to spend an
325# extra 192 bytes on a 2x subroutine on Atom Silvermont's account. For
326# processors that can schedule aes[enc|dec] every cycle the optimal
327# interleave factor equals the corresponding instruction's latency. 8x
328# is optimal for * Bridge and "super-optimal" for other Intel CPUs...
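#
# In other words, the optimal interleave factor is just the number of
# aes[enc|dec] instructions that can be in flight at once, i.e. latency
# divided by the issue interval (an illustrative sketch, not part of
# this module, using the figures quoted above):
#
#	printf "Westmere:     %dx\n", 6/2;	# latency 6, issued every 2nd cycle
#	printf "Sandy Bridge: %dx\n", 8/1;	# latency 8, issued every cycle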
329
330sub aesni_generate2 {
331my $dir=shift;
332# As already mentioned it takes in $key and $rounds, which are *not*
333# preserved. $inout[0-1] is cipher/clear text...
334$code.=<<___;
335.type	_aesni_${dir}rypt2,\@abi-omnipotent
336.align	16
337_aesni_${dir}rypt2:
338.cfi_startproc
339	$movkey	($key),$rndkey0
340	shl	\$4,$rounds
341	$movkey	16($key),$rndkey1
342	xorps	$rndkey0,$inout0
343	xorps	$rndkey0,$inout1
344	$movkey	32($key),$rndkey0
345	lea	32($key,$rounds),$key
346	neg	%rax				# $rounds
347	add	\$16,%rax
348
349.L${dir}_loop2:
350	aes${dir}	$rndkey1,$inout0
351	aes${dir}	$rndkey1,$inout1
352	$movkey		($key,%rax),$rndkey1
353	add		\$32,%rax
354	aes${dir}	$rndkey0,$inout0
355	aes${dir}	$rndkey0,$inout1
356	$movkey		-16($key,%rax),$rndkey0
357	jnz		.L${dir}_loop2
358
359	aes${dir}	$rndkey1,$inout0
360	aes${dir}	$rndkey1,$inout1
361	aes${dir}last	$rndkey0,$inout0
362	aes${dir}last	$rndkey0,$inout1
363	ret
364.cfi_endproc
365.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
366___
367}
368sub aesni_generate3 {
369my $dir=shift;
370# As already mentioned it takes in $key and $rounds, which are *not*
371# preserved. $inout[0-2] is cipher/clear text...
372$code.=<<___;
373.type	_aesni_${dir}rypt3,\@abi-omnipotent
374.align	16
375_aesni_${dir}rypt3:
376.cfi_startproc
377	$movkey	($key),$rndkey0
378	shl	\$4,$rounds
379	$movkey	16($key),$rndkey1
380	xorps	$rndkey0,$inout0
381	xorps	$rndkey0,$inout1
382	xorps	$rndkey0,$inout2
383	$movkey	32($key),$rndkey0
384	lea	32($key,$rounds),$key
385	neg	%rax				# $rounds
386	add	\$16,%rax
387
388.L${dir}_loop3:
389	aes${dir}	$rndkey1,$inout0
390	aes${dir}	$rndkey1,$inout1
391	aes${dir}	$rndkey1,$inout2
392	$movkey		($key,%rax),$rndkey1
393	add		\$32,%rax
394	aes${dir}	$rndkey0,$inout0
395	aes${dir}	$rndkey0,$inout1
396	aes${dir}	$rndkey0,$inout2
397	$movkey		-16($key,%rax),$rndkey0
398	jnz		.L${dir}_loop3
399
400	aes${dir}	$rndkey1,$inout0
401	aes${dir}	$rndkey1,$inout1
402	aes${dir}	$rndkey1,$inout2
403	aes${dir}last	$rndkey0,$inout0
404	aes${dir}last	$rndkey0,$inout1
405	aes${dir}last	$rndkey0,$inout2
406	ret
407.cfi_endproc
408.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
409___
410}
411# 4x interleave is implemented to improve small-block performance,
412# most notably [and naturally] 4-block performance, by ~30%. One can
413# argue that 5x should have been implemented as well, but the
414# improvement would be <20%, so it's not worth it...
415sub aesni_generate4 {
416my $dir=shift;
417# As already mentioned it takes in $key and $rounds, which are *not*
418# preserved. $inout[0-3] is cipher/clear text...
419$code.=<<___;
420.type	_aesni_${dir}rypt4,\@abi-omnipotent
421.align	16
422_aesni_${dir}rypt4:
423.cfi_startproc
424	$movkey	($key),$rndkey0
425	shl	\$4,$rounds
426	$movkey	16($key),$rndkey1
427	xorps	$rndkey0,$inout0
428	xorps	$rndkey0,$inout1
429	xorps	$rndkey0,$inout2
430	xorps	$rndkey0,$inout3
431	$movkey	32($key),$rndkey0
432	lea	32($key,$rounds),$key
433	neg	%rax				# $rounds
434	.byte	0x0f,0x1f,0x00			# 3-byte nop (padding)
435	add	\$16,%rax
436
437.L${dir}_loop4:
438	aes${dir}	$rndkey1,$inout0
439	aes${dir}	$rndkey1,$inout1
440	aes${dir}	$rndkey1,$inout2
441	aes${dir}	$rndkey1,$inout3
442	$movkey		($key,%rax),$rndkey1
443	add		\$32,%rax
444	aes${dir}	$rndkey0,$inout0
445	aes${dir}	$rndkey0,$inout1
446	aes${dir}	$rndkey0,$inout2
447	aes${dir}	$rndkey0,$inout3
448	$movkey		-16($key,%rax),$rndkey0
449	jnz		.L${dir}_loop4
450
451	aes${dir}	$rndkey1,$inout0
452	aes${dir}	$rndkey1,$inout1
453	aes${dir}	$rndkey1,$inout2
454	aes${dir}	$rndkey1,$inout3
455	aes${dir}last	$rndkey0,$inout0
456	aes${dir}last	$rndkey0,$inout1
457	aes${dir}last	$rndkey0,$inout2
458	aes${dir}last	$rndkey0,$inout3
459	ret
460.cfi_endproc
461.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
462___
463}
464sub aesni_generate6 {
465my $dir=shift;
466# As already mentioned it takes in $key and $rounds, which are *not*
467# preserved. $inout[0-5] is cipher/clear text...
468$code.=<<___;
469.type	_aesni_${dir}rypt6,\@abi-omnipotent
470.align	16
471_aesni_${dir}rypt6:
472.cfi_startproc
473	$movkey		($key),$rndkey0
474	shl		\$4,$rounds
475	$movkey		16($key),$rndkey1
476	xorps		$rndkey0,$inout0
477	pxor		$rndkey0,$inout1
478	pxor		$rndkey0,$inout2
479	aes${dir}	$rndkey1,$inout0
480	lea		32($key,$rounds),$key
481	neg		%rax			# $rounds
482	aes${dir}	$rndkey1,$inout1
483	pxor		$rndkey0,$inout3
484	pxor		$rndkey0,$inout4
485	aes${dir}	$rndkey1,$inout2
486	pxor		$rndkey0,$inout5
487	$movkey		($key,%rax),$rndkey0
488	add		\$16,%rax
489	jmp		.L${dir}_loop6_enter
490.align	16
491.L${dir}_loop6:
492	aes${dir}	$rndkey1,$inout0
493	aes${dir}	$rndkey1,$inout1
494	aes${dir}	$rndkey1,$inout2
495.L${dir}_loop6_enter:
496	aes${dir}	$rndkey1,$inout3
497	aes${dir}	$rndkey1,$inout4
498	aes${dir}	$rndkey1,$inout5
499	$movkey		($key,%rax),$rndkey1
500	add		\$32,%rax
501	aes${dir}	$rndkey0,$inout0
502	aes${dir}	$rndkey0,$inout1
503	aes${dir}	$rndkey0,$inout2
504	aes${dir}	$rndkey0,$inout3
505	aes${dir}	$rndkey0,$inout4
506	aes${dir}	$rndkey0,$inout5
507	$movkey		-16($key,%rax),$rndkey0
508	jnz		.L${dir}_loop6
509
510	aes${dir}	$rndkey1,$inout0
511	aes${dir}	$rndkey1,$inout1
512	aes${dir}	$rndkey1,$inout2
513	aes${dir}	$rndkey1,$inout3
514	aes${dir}	$rndkey1,$inout4
515	aes${dir}	$rndkey1,$inout5
516	aes${dir}last	$rndkey0,$inout0
517	aes${dir}last	$rndkey0,$inout1
518	aes${dir}last	$rndkey0,$inout2
519	aes${dir}last	$rndkey0,$inout3
520	aes${dir}last	$rndkey0,$inout4
521	aes${dir}last	$rndkey0,$inout5
522	ret
523.cfi_endproc
524.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
525___
526}
527sub aesni_generate8 {
528my $dir=shift;
529# As already mentioned it takes in $key and $rounds, which are *not*
530# preserved. $inout[0-7] is cipher/clear text...
531$code.=<<___;
532.type	_aesni_${dir}rypt8,\@abi-omnipotent
533.align	16
534_aesni_${dir}rypt8:
535.cfi_startproc
536	$movkey		($key),$rndkey0
537	shl		\$4,$rounds
538	$movkey		16($key),$rndkey1
539	xorps		$rndkey0,$inout0
540	xorps		$rndkey0,$inout1
541	pxor		$rndkey0,$inout2
542	pxor		$rndkey0,$inout3
543	pxor		$rndkey0,$inout4
544	lea		32($key,$rounds),$key
545	neg		%rax			# $rounds
546	aes${dir}	$rndkey1,$inout0
547	pxor		$rndkey0,$inout5
548	pxor		$rndkey0,$inout6
549	aes${dir}	$rndkey1,$inout1
550	pxor		$rndkey0,$inout7
551	$movkey		($key,%rax),$rndkey0
552	add		\$16,%rax
553	jmp		.L${dir}_loop8_inner
554.align	16
555.L${dir}_loop8:
556	aes${dir}	$rndkey1,$inout0
557	aes${dir}	$rndkey1,$inout1
558.L${dir}_loop8_inner:
559	aes${dir}	$rndkey1,$inout2
560	aes${dir}	$rndkey1,$inout3
561	aes${dir}	$rndkey1,$inout4
562	aes${dir}	$rndkey1,$inout5
563	aes${dir}	$rndkey1,$inout6
564	aes${dir}	$rndkey1,$inout7
565.L${dir}_loop8_enter:
566	$movkey		($key,%rax),$rndkey1
567	add		\$32,%rax
568	aes${dir}	$rndkey0,$inout0
569	aes${dir}	$rndkey0,$inout1
570	aes${dir}	$rndkey0,$inout2
571	aes${dir}	$rndkey0,$inout3
572	aes${dir}	$rndkey0,$inout4
573	aes${dir}	$rndkey0,$inout5
574	aes${dir}	$rndkey0,$inout6
575	aes${dir}	$rndkey0,$inout7
576	$movkey		-16($key,%rax),$rndkey0
577	jnz		.L${dir}_loop8
578
579	aes${dir}	$rndkey1,$inout0
580	aes${dir}	$rndkey1,$inout1
581	aes${dir}	$rndkey1,$inout2
582	aes${dir}	$rndkey1,$inout3
583	aes${dir}	$rndkey1,$inout4
584	aes${dir}	$rndkey1,$inout5
585	aes${dir}	$rndkey1,$inout6
586	aes${dir}	$rndkey1,$inout7
587	aes${dir}last	$rndkey0,$inout0
588	aes${dir}last	$rndkey0,$inout1
589	aes${dir}last	$rndkey0,$inout2
590	aes${dir}last	$rndkey0,$inout3
591	aes${dir}last	$rndkey0,$inout4
592	aes${dir}last	$rndkey0,$inout5
593	aes${dir}last	$rndkey0,$inout6
594	aes${dir}last	$rndkey0,$inout7
595	ret
596.cfi_endproc
597.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
598___
599}
600&aesni_generate2("enc") if ($PREFIX eq "aes_hw");
601&aesni_generate2("dec");
602&aesni_generate3("enc") if ($PREFIX eq "aes_hw");
603&aesni_generate3("dec");
604&aesni_generate4("enc") if ($PREFIX eq "aes_hw");
605&aesni_generate4("dec");
606&aesni_generate6("enc") if ($PREFIX eq "aes_hw");
607&aesni_generate6("dec");
608&aesni_generate8("enc") if ($PREFIX eq "aes_hw");
609&aesni_generate8("dec");
610
611if ($PREFIX eq "aes_hw") {
612########################################################################
613# void aesni_ecb_encrypt (const void *in, void *out,
614#			  size_t length, const AES_KEY *key,
615#			  int enc);
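#
# length is rounded down to a whole number of 16-byte blocks, and
# enc==0 selects decryption (see the 'and \$-16,$len' and the 5th
# argument test below).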
616$code.=<<___;
617.globl	${PREFIX}_ecb_encrypt
618.type	${PREFIX}_ecb_encrypt,\@function,5
619.align	16
620${PREFIX}_ecb_encrypt:
621.cfi_startproc
622	_CET_ENDBR
623___
624$code.=<<___ if ($win64);
625	lea	-0x58(%rsp),%rsp
626	movaps	%xmm6,(%rsp)		# offload $inout4..7
627	movaps	%xmm7,0x10(%rsp)
628	movaps	%xmm8,0x20(%rsp)
629	movaps	%xmm9,0x30(%rsp)
630.Lecb_enc_body:
631___
632$code.=<<___;
633	and	\$-16,$len		# if ($len<16)
634	jz	.Lecb_ret		# return
635
636	mov	240($key),$rounds	# key->rounds
637	$movkey	($key),$rndkey0
638	mov	$key,$key_		# backup $key
639	mov	$rounds,$rnds_		# backup $rounds
640	test	%r8d,%r8d		# 5th argument
641	jz	.Lecb_decrypt
642#--------------------------- ECB ENCRYPT ------------------------------#
643	cmp	\$0x80,$len		# if ($len<8*16)
644	jb	.Lecb_enc_tail		# short input
645
646	movdqu	($inp),$inout0		# load 8 input blocks
647	movdqu	0x10($inp),$inout1
648	movdqu	0x20($inp),$inout2
649	movdqu	0x30($inp),$inout3
650	movdqu	0x40($inp),$inout4
651	movdqu	0x50($inp),$inout5
652	movdqu	0x60($inp),$inout6
653	movdqu	0x70($inp),$inout7
654	lea	0x80($inp),$inp		# $inp+=8*16
655	sub	\$0x80,$len		# $len-=8*16 (can be zero)
656	jmp	.Lecb_enc_loop8_enter
657.align 16
658.Lecb_enc_loop8:
659	movups	$inout0,($out)		# store 8 output blocks
660	mov	$key_,$key		# restore $key
661	movdqu	($inp),$inout0		# load 8 input blocks
662	mov	$rnds_,$rounds		# restore $rounds
663	movups	$inout1,0x10($out)
664	movdqu	0x10($inp),$inout1
665	movups	$inout2,0x20($out)
666	movdqu	0x20($inp),$inout2
667	movups	$inout3,0x30($out)
668	movdqu	0x30($inp),$inout3
669	movups	$inout4,0x40($out)
670	movdqu	0x40($inp),$inout4
671	movups	$inout5,0x50($out)
672	movdqu	0x50($inp),$inout5
673	movups	$inout6,0x60($out)
674	movdqu	0x60($inp),$inout6
675	movups	$inout7,0x70($out)
676	lea	0x80($out),$out		# $out+=8*16
677	movdqu	0x70($inp),$inout7
678	lea	0x80($inp),$inp		# $inp+=8*16
679.Lecb_enc_loop8_enter:
680
681	call	_aesni_encrypt8
682
683	sub	\$0x80,$len
684	jnc	.Lecb_enc_loop8		# loop if $len-=8*16 didn't borrow
685
686	movups	$inout0,($out)		# store 8 output blocks
687	mov	$key_,$key		# restore $key
688	movups	$inout1,0x10($out)
689	mov	$rnds_,$rounds		# restore $rounds
690	movups	$inout2,0x20($out)
691	movups	$inout3,0x30($out)
692	movups	$inout4,0x40($out)
693	movups	$inout5,0x50($out)
694	movups	$inout6,0x60($out)
695	movups	$inout7,0x70($out)
696	lea	0x80($out),$out		# $out+=8*16
697	add	\$0x80,$len		# restore real remaining $len
698	jz	.Lecb_ret		# done if ($len==0)
699
700.Lecb_enc_tail:				# $len is less than 8*16
701	movups	($inp),$inout0
702	cmp	\$0x20,$len
703	jb	.Lecb_enc_one
704	movups	0x10($inp),$inout1
705	je	.Lecb_enc_two
706	movups	0x20($inp),$inout2
707	cmp	\$0x40,$len
708	jb	.Lecb_enc_three
709	movups	0x30($inp),$inout3
710	je	.Lecb_enc_four
711	movups	0x40($inp),$inout4
712	cmp	\$0x60,$len
713	jb	.Lecb_enc_five
714	movups	0x50($inp),$inout5
715	je	.Lecb_enc_six
716	movdqu	0x60($inp),$inout6
717	xorps	$inout7,$inout7
718	call	_aesni_encrypt8
719	movups	$inout0,($out)		# store 7 output blocks
720	movups	$inout1,0x10($out)
721	movups	$inout2,0x20($out)
722	movups	$inout3,0x30($out)
723	movups	$inout4,0x40($out)
724	movups	$inout5,0x50($out)
725	movups	$inout6,0x60($out)
726	jmp	.Lecb_ret
727.align	16
728.Lecb_enc_one:
729___
730	&aesni_generate1("enc",$key,$rounds);
731$code.=<<___;
732	movups	$inout0,($out)		# store one output block
733	jmp	.Lecb_ret
734.align	16
735.Lecb_enc_two:
736	call	_aesni_encrypt2
737	movups	$inout0,($out)		# store 2 output blocks
738	movups	$inout1,0x10($out)
739	jmp	.Lecb_ret
740.align	16
741.Lecb_enc_three:
742	call	_aesni_encrypt3
743	movups	$inout0,($out)		# store 3 output blocks
744	movups	$inout1,0x10($out)
745	movups	$inout2,0x20($out)
746	jmp	.Lecb_ret
747.align	16
748.Lecb_enc_four:
749	call	_aesni_encrypt4
750	movups	$inout0,($out)		# store 4 output blocks
751	movups	$inout1,0x10($out)
752	movups	$inout2,0x20($out)
753	movups	$inout3,0x30($out)
754	jmp	.Lecb_ret
755.align	16
756.Lecb_enc_five:
757	xorps	$inout5,$inout5
758	call	_aesni_encrypt6
759	movups	$inout0,($out)		# store 5 output blocks
760	movups	$inout1,0x10($out)
761	movups	$inout2,0x20($out)
762	movups	$inout3,0x30($out)
763	movups	$inout4,0x40($out)
764	jmp	.Lecb_ret
765.align	16
766.Lecb_enc_six:
767	call	_aesni_encrypt6
768	movups	$inout0,($out)		# store 6 output blocks
769	movups	$inout1,0x10($out)
770	movups	$inout2,0x20($out)
771	movups	$inout3,0x30($out)
772	movups	$inout4,0x40($out)
773	movups	$inout5,0x50($out)
774	jmp	.Lecb_ret
775#--------------------------- ECB DECRYPT ------------------------------#
776.align	16
777.Lecb_decrypt:
778	cmp	\$0x80,$len		# if ($len<8*16)
779	jb	.Lecb_dec_tail		# short input
780
781	movdqu	($inp),$inout0		# load 8 input blocks
782	movdqu	0x10($inp),$inout1
783	movdqu	0x20($inp),$inout2
784	movdqu	0x30($inp),$inout3
785	movdqu	0x40($inp),$inout4
786	movdqu	0x50($inp),$inout5
787	movdqu	0x60($inp),$inout6
788	movdqu	0x70($inp),$inout7
789	lea	0x80($inp),$inp		# $inp+=8*16
790	sub	\$0x80,$len		# $len-=8*16 (can be zero)
791	jmp	.Lecb_dec_loop8_enter
792.align 16
793.Lecb_dec_loop8:
794	movups	$inout0,($out)		# store 8 output blocks
795	mov	$key_,$key		# restore $key
796	movdqu	($inp),$inout0		# load 8 input blocks
797	mov	$rnds_,$rounds		# restore $rounds
798	movups	$inout1,0x10($out)
799	movdqu	0x10($inp),$inout1
800	movups	$inout2,0x20($out)
801	movdqu	0x20($inp),$inout2
802	movups	$inout3,0x30($out)
803	movdqu	0x30($inp),$inout3
804	movups	$inout4,0x40($out)
805	movdqu	0x40($inp),$inout4
806	movups	$inout5,0x50($out)
807	movdqu	0x50($inp),$inout5
808	movups	$inout6,0x60($out)
809	movdqu	0x60($inp),$inout6
810	movups	$inout7,0x70($out)
811	lea	0x80($out),$out		# $out+=8*16
812	movdqu	0x70($inp),$inout7
813	lea	0x80($inp),$inp		# $inp+=8*16
814.Lecb_dec_loop8_enter:
815
816	call	_aesni_decrypt8
817
818	$movkey	($key_),$rndkey0
819	sub	\$0x80,$len
820	jnc	.Lecb_dec_loop8		# loop if $len-=8*16 didn't borrow
821
822	movups	$inout0,($out)		# store 8 output blocks
823	 pxor	$inout0,$inout0		# clear register bank
824	mov	$key_,$key		# restore $key
825	movups	$inout1,0x10($out)
826	 pxor	$inout1,$inout1
827	mov	$rnds_,$rounds		# restore $rounds
828	movups	$inout2,0x20($out)
829	 pxor	$inout2,$inout2
830	movups	$inout3,0x30($out)
831	 pxor	$inout3,$inout3
832	movups	$inout4,0x40($out)
833	 pxor	$inout4,$inout4
834	movups	$inout5,0x50($out)
835	 pxor	$inout5,$inout5
836	movups	$inout6,0x60($out)
837	 pxor	$inout6,$inout6
838	movups	$inout7,0x70($out)
839	 pxor	$inout7,$inout7
840	lea	0x80($out),$out		# $out+=8*16
841	add	\$0x80,$len		# restore real remaining $len
842	jz	.Lecb_ret		# done if ($len==0)
843
844.Lecb_dec_tail:
845	movups	($inp),$inout0
846	cmp	\$0x20,$len
847	jb	.Lecb_dec_one
848	movups	0x10($inp),$inout1
849	je	.Lecb_dec_two
850	movups	0x20($inp),$inout2
851	cmp	\$0x40,$len
852	jb	.Lecb_dec_three
853	movups	0x30($inp),$inout3
854	je	.Lecb_dec_four
855	movups	0x40($inp),$inout4
856	cmp	\$0x60,$len
857	jb	.Lecb_dec_five
858	movups	0x50($inp),$inout5
859	je	.Lecb_dec_six
860	movups	0x60($inp),$inout6
861	$movkey	($key),$rndkey0
862	xorps	$inout7,$inout7
863	call	_aesni_decrypt8
864	movups	$inout0,($out)		# store 7 output blocks
865	 pxor	$inout0,$inout0		# clear register bank
866	movups	$inout1,0x10($out)
867	 pxor	$inout1,$inout1
868	movups	$inout2,0x20($out)
869	 pxor	$inout2,$inout2
870	movups	$inout3,0x30($out)
871	 pxor	$inout3,$inout3
872	movups	$inout4,0x40($out)
873	 pxor	$inout4,$inout4
874	movups	$inout5,0x50($out)
875	 pxor	$inout5,$inout5
876	movups	$inout6,0x60($out)
877	 pxor	$inout6,$inout6
878	 pxor	$inout7,$inout7
879	jmp	.Lecb_ret
880.align	16
881.Lecb_dec_one:
882___
883	&aesni_generate1("dec",$key,$rounds);
884$code.=<<___;
885	movups	$inout0,($out)		# store one output block
886	 pxor	$inout0,$inout0		# clear register bank
887	jmp	.Lecb_ret
888.align	16
889.Lecb_dec_two:
890	call	_aesni_decrypt2
891	movups	$inout0,($out)		# store 2 output blocks
892	 pxor	$inout0,$inout0		# clear register bank
893	movups	$inout1,0x10($out)
894	 pxor	$inout1,$inout1
895	jmp	.Lecb_ret
896.align	16
897.Lecb_dec_three:
898	call	_aesni_decrypt3
899	movups	$inout0,($out)		# store 3 output blocks
900	 pxor	$inout0,$inout0		# clear register bank
901	movups	$inout1,0x10($out)
902	 pxor	$inout1,$inout1
903	movups	$inout2,0x20($out)
904	 pxor	$inout2,$inout2
905	jmp	.Lecb_ret
906.align	16
907.Lecb_dec_four:
908	call	_aesni_decrypt4
909	movups	$inout0,($out)		# store 4 output blocks
910	 pxor	$inout0,$inout0		# clear register bank
911	movups	$inout1,0x10($out)
912	 pxor	$inout1,$inout1
913	movups	$inout2,0x20($out)
914	 pxor	$inout2,$inout2
915	movups	$inout3,0x30($out)
916	 pxor	$inout3,$inout3
917	jmp	.Lecb_ret
918.align	16
919.Lecb_dec_five:
920	xorps	$inout5,$inout5
921	call	_aesni_decrypt6
922	movups	$inout0,($out)		# store 5 output blocks
923	 pxor	$inout0,$inout0		# clear register bank
924	movups	$inout1,0x10($out)
925	 pxor	$inout1,$inout1
926	movups	$inout2,0x20($out)
927	 pxor	$inout2,$inout2
928	movups	$inout3,0x30($out)
929	 pxor	$inout3,$inout3
930	movups	$inout4,0x40($out)
931	 pxor	$inout4,$inout4
932	 pxor	$inout5,$inout5
933	jmp	.Lecb_ret
934.align	16
935.Lecb_dec_six:
936	call	_aesni_decrypt6
937	movups	$inout0,($out)		# store 6 output blocks
938	 pxor	$inout0,$inout0		# clear register bank
939	movups	$inout1,0x10($out)
940	 pxor	$inout1,$inout1
941	movups	$inout2,0x20($out)
942	 pxor	$inout2,$inout2
943	movups	$inout3,0x30($out)
944	 pxor	$inout3,$inout3
945	movups	$inout4,0x40($out)
946	 pxor	$inout4,$inout4
947	movups	$inout5,0x50($out)
948	 pxor	$inout5,$inout5
949
950.Lecb_ret:
951	xorps	$rndkey0,$rndkey0	# %xmm0
952	pxor	$rndkey1,$rndkey1
953___
954$code.=<<___ if ($win64);
955	movaps	(%rsp),%xmm6
956	movaps	%xmm0,(%rsp)		# clear stack
957	movaps	0x10(%rsp),%xmm7
958	movaps	%xmm0,0x10(%rsp)
959	movaps	0x20(%rsp),%xmm8
960	movaps	%xmm0,0x20(%rsp)
961	movaps	0x30(%rsp),%xmm9
962	movaps	%xmm0,0x30(%rsp)
963	lea	0x58(%rsp),%rsp
964.Lecb_enc_ret:
965___
966$code.=<<___;
967	ret
968.cfi_endproc
969.size	${PREFIX}_ecb_encrypt,.-${PREFIX}_ecb_encrypt
970___
971
972{
973######################################################################
974# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
975#                         size_t blocks, const AES_KEY *key,
976#                         const char *ivec,char *cmac);
977#
978# Handles only complete blocks, operates on 64-bit counter and
979# does not update *ivec! Nor does it finalize CMAC value
980# (see engine/eng_aesni.c for details)
981#
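# The "64-bit counter" means the last eight bytes of *ivec are treated as
# a big-endian integer and incremented once per block (via the
# .Lbswap_mask/.Lincrement64 constants); an illustrative Perl rendition
# of one step, assuming that layout:
#
#	my $hi = substr($cblock, 0, 8);			# fixed part of the block
#	my $lo = unpack("Q>", substr($cblock, 8, 8));	# 64-bit BE counter
#	$cblock = $hi . pack("Q>", $lo + 1);		# next counter block
#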
982if (0) {  # Omit these functions in BoringSSL
983my $cmac="%r9";	# 6th argument
984
985my $increment="%xmm9";
986my $iv="%xmm6";
987my $bswap_mask="%xmm7";
988
989$code.=<<___;
990.globl	${PREFIX}_ccm64_encrypt_blocks
991.type	${PREFIX}_ccm64_encrypt_blocks,\@function,6
992.align	16
993${PREFIX}_ccm64_encrypt_blocks:
994___
995$code.=<<___ if ($win64);
996	lea	-0x58(%rsp),%rsp
997	movaps	%xmm6,(%rsp)		# $iv
998	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
999	movaps	%xmm8,0x20(%rsp)	# $in0
1000	movaps	%xmm9,0x30(%rsp)	# $increment
1001.Lccm64_enc_body:
1002___
1003$code.=<<___;
1004	mov	240($key),$rounds		# key->rounds
1005	movdqu	($ivp),$iv
1006	movdqa	.Lincrement64(%rip),$increment
1007	movdqa	.Lbswap_mask(%rip),$bswap_mask
1008
1009	shl	\$4,$rounds
1010	mov	\$16,$rnds_
1011	lea	0($key),$key_
1012	movdqu	($cmac),$inout1
1013	movdqa	$iv,$inout0
1014	lea	32($key,$rounds),$key		# end of key schedule
1015	pshufb	$bswap_mask,$iv
1016	sub	%rax,%r10			# twisted $rounds
1017	jmp	.Lccm64_enc_outer
1018.align	16
1019.Lccm64_enc_outer:
1020	$movkey	($key_),$rndkey0
1021	mov	%r10,%rax
1022	movups	($inp),$in0			# load inp
1023
1024	xorps	$rndkey0,$inout0		# counter
1025	$movkey	16($key_),$rndkey1
1026	xorps	$in0,$rndkey0
1027	xorps	$rndkey0,$inout1		# cmac^=inp
1028	$movkey	32($key_),$rndkey0
1029
1030.Lccm64_enc2_loop:
1031	aesenc	$rndkey1,$inout0
1032	aesenc	$rndkey1,$inout1
1033	$movkey	($key,%rax),$rndkey1
1034	add	\$32,%rax
1035	aesenc	$rndkey0,$inout0
1036	aesenc	$rndkey0,$inout1
1037	$movkey	-16($key,%rax),$rndkey0
1038	jnz	.Lccm64_enc2_loop
1039	aesenc	$rndkey1,$inout0
1040	aesenc	$rndkey1,$inout1
1041	paddq	$increment,$iv
1042	dec	$len				# $len-- ($len is in blocks)
1043	aesenclast	$rndkey0,$inout0
1044	aesenclast	$rndkey0,$inout1
1045
1046	lea	16($inp),$inp
1047	xorps	$inout0,$in0			# inp ^= E(iv)
1048	movdqa	$iv,$inout0
1049	movups	$in0,($out)			# save output
1050	pshufb	$bswap_mask,$inout0
1051	lea	16($out),$out			# $out+=16
1052	jnz	.Lccm64_enc_outer		# loop if ($len!=0)
1053
1054	 pxor	$rndkey0,$rndkey0		# clear register bank
1055	 pxor	$rndkey1,$rndkey1
1056	 pxor	$inout0,$inout0
1057	movups	$inout1,($cmac)			# store resulting mac
1058	 pxor	$inout1,$inout1
1059	 pxor	$in0,$in0
1060	 pxor	$iv,$iv
1061___
1062$code.=<<___ if ($win64);
1063	movaps	(%rsp),%xmm6
1064	movaps	%xmm0,(%rsp)			# clear stack
1065	movaps	0x10(%rsp),%xmm7
1066	movaps	%xmm0,0x10(%rsp)
1067	movaps	0x20(%rsp),%xmm8
1068	movaps	%xmm0,0x20(%rsp)
1069	movaps	0x30(%rsp),%xmm9
1070	movaps	%xmm0,0x30(%rsp)
1071	lea	0x58(%rsp),%rsp
1072.Lccm64_enc_ret:
1073___
1074$code.=<<___;
1075	ret
1076.size	${PREFIX}_ccm64_encrypt_blocks,.-${PREFIX}_ccm64_encrypt_blocks
1077___
1078######################################################################
1079$code.=<<___;
1080.globl	${PREFIX}_ccm64_decrypt_blocks
1081.type	${PREFIX}_ccm64_decrypt_blocks,\@function,6
1082.align	16
1083${PREFIX}_ccm64_decrypt_blocks:
1084___
1085$code.=<<___ if ($win64);
1086	lea	-0x58(%rsp),%rsp
1087	movaps	%xmm6,(%rsp)		# $iv
1088	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
1089	movaps	%xmm8,0x20(%rsp)	# $in8
1090	movaps	%xmm9,0x30(%rsp)	# $increment
1091.Lccm64_dec_body:
1092___
1093$code.=<<___;
1094	mov	240($key),$rounds		# key->rounds
1095	movups	($ivp),$iv
1096	movdqu	($cmac),$inout1
1097	movdqa	.Lincrement64(%rip),$increment
1098	movdqa	.Lbswap_mask(%rip),$bswap_mask
1099
1100	movaps	$iv,$inout0
1101	mov	$rounds,$rnds_
1102	mov	$key,$key_
1103	pshufb	$bswap_mask,$iv
1104___
1105	&aesni_generate1("enc",$key,$rounds);
1106$code.=<<___;
1107	shl	\$4,$rnds_
1108	mov	\$16,$rounds
1109	movups	($inp),$in0			# load inp
1110	paddq	$increment,$iv
1111	lea	16($inp),$inp			# $inp+=16
1112	sub	%r10,%rax			# twisted $rounds
1113	lea	32($key_,$rnds_),$key		# end of key schedule
1114	mov	%rax,%r10
1115	jmp	.Lccm64_dec_outer
1116.align	16
1117.Lccm64_dec_outer:
1118	xorps	$inout0,$in0			# inp ^= E(iv)
1119	movdqa	$iv,$inout0
1120	movups	$in0,($out)			# save output
1121	lea	16($out),$out			# $out+=16
1122	pshufb	$bswap_mask,$inout0
1123
1124	sub	\$1,$len			# $len-- ($len is in blocks)
1125	jz	.Lccm64_dec_break		# if ($len==0) break
1126
1127	$movkey	($key_),$rndkey0
1128	mov	%r10,%rax
1129	$movkey	16($key_),$rndkey1
1130	xorps	$rndkey0,$in0
1131	xorps	$rndkey0,$inout0
1132	xorps	$in0,$inout1			# cmac^=out
1133	$movkey	32($key_),$rndkey0
1134	jmp	.Lccm64_dec2_loop
1135.align	16
1136.Lccm64_dec2_loop:
1137	aesenc	$rndkey1,$inout0
1138	aesenc	$rndkey1,$inout1
1139	$movkey	($key,%rax),$rndkey1
1140	add	\$32,%rax
1141	aesenc	$rndkey0,$inout0
1142	aesenc	$rndkey0,$inout1
1143	$movkey	-16($key,%rax),$rndkey0
1144	jnz	.Lccm64_dec2_loop
1145	movups	($inp),$in0			# load input
1146	paddq	$increment,$iv
1147	aesenc	$rndkey1,$inout0
1148	aesenc	$rndkey1,$inout1
1149	aesenclast	$rndkey0,$inout0
1150	aesenclast	$rndkey0,$inout1
1151	lea	16($inp),$inp			# $inp+=16
1152	jmp	.Lccm64_dec_outer
1153
1154.align	16
1155.Lccm64_dec_break:
1156	#xorps	$in0,$inout1			# cmac^=out
1157	mov	240($key_),$rounds
1158___
1159	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
1160$code.=<<___;
1161	 pxor	$rndkey0,$rndkey0		# clear register bank
1162	 pxor	$rndkey1,$rndkey1
1163	 pxor	$inout0,$inout0
1164	movups	$inout1,($cmac)			# store resulting mac
1165	 pxor	$inout1,$inout1
1166	 pxor	$in0,$in0
1167	 pxor	$iv,$iv
1168___
1169$code.=<<___ if ($win64);
1170	movaps	(%rsp),%xmm6
1171	movaps	%xmm0,(%rsp)			# clear stack
1172	movaps	0x10(%rsp),%xmm7
1173	movaps	%xmm0,0x10(%rsp)
1174	movaps	0x20(%rsp),%xmm8
1175	movaps	%xmm0,0x20(%rsp)
1176	movaps	0x30(%rsp),%xmm9
1177	movaps	%xmm0,0x30(%rsp)
1178	lea	0x58(%rsp),%rsp
1179.Lccm64_dec_ret:
1180___
1181$code.=<<___;
1182	ret
1183.size	${PREFIX}_ccm64_decrypt_blocks,.-${PREFIX}_ccm64_decrypt_blocks
1184___
1185}
1186######################################################################
1187# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1188#                         size_t blocks, const AES_KEY *key,
1189#                         const char *ivec);
1190#
1191# Handles only complete blocks, operates on 32-bit counter and
1192# does not update *ivec! (see crypto/modes/ctr128.c for details)
1193#
1194# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1195# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1196# Keywords are full unroll and modulo-schedule counter calculations
1197# with zero-round key xor.
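#
# "Counter calculations with zero-round key xor" means each counter block
# is prepared with the 0-round key already folded in, so the AES rounds
# can start right away. An illustrative Perl rendition of what is kept on
# the stack for block $i (a sketch, not part of this module):
#
#	sub ctr32_block {
#	    my ($ivec, $rk0, $i) = @_;		# 16-byte strings, block index
#	    my $ctr = (unpack("N", substr($ivec, 12, 4)) + $i) & 0xffffffff;
#	    return (substr($ivec, 0, 12) . pack("N", $ctr)) ^ $rk0;
#	}
#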
1198{
1199my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
1200my ($key0,$ctr)=("%ebp","${ivp}d");
1201my $frame_size = 0x80 + ($win64?160:0);
1202
1203$code.=<<___;
1204.globl	${PREFIX}_ctr32_encrypt_blocks
1205.type	${PREFIX}_ctr32_encrypt_blocks,\@function,5
1206.align	16
1207${PREFIX}_ctr32_encrypt_blocks:
1208.cfi_startproc
1209	_CET_ENDBR
1210#ifdef BORINGSSL_DISPATCH_TEST
1211	movb \$1,BORINGSSL_function_hit(%rip)
1212#endif
1213	cmp	\$1,$len
1214	jne	.Lctr32_bulk
1215
1216	# handle single block without allocating stack frame,
1217	# useful when handling edges
1218	movups	($ivp),$inout0
1219	movups	($inp),$inout1
1220	mov	240($key),%edx			# key->rounds
1221___
1222	&aesni_generate1("enc",$key,"%edx");
1223$code.=<<___;
1224	 pxor	$rndkey0,$rndkey0		# clear register bank
1225	 pxor	$rndkey1,$rndkey1
1226	xorps	$inout1,$inout0
1227	 pxor	$inout1,$inout1
1228	movups	$inout0,($out)
1229	 xorps	$inout0,$inout0
1230	jmp	.Lctr32_epilogue
1231
1232.align	16
1233.Lctr32_bulk:
1234	lea	(%rsp),$key_			# use $key_ as frame pointer
1235.cfi_def_cfa_register	$key_
1236	push	%rbp
1237.cfi_push	%rbp
1238	sub	\$$frame_size,%rsp
1239	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1240___
1241$code.=<<___ if ($win64);
1242	movaps	%xmm6,-0xa8($key_)		# offload everything
1243	movaps	%xmm7,-0x98($key_)
1244	movaps	%xmm8,-0x88($key_)
1245	movaps	%xmm9,-0x78($key_)
1246	movaps	%xmm10,-0x68($key_)
1247	movaps	%xmm11,-0x58($key_)
1248	movaps	%xmm12,-0x48($key_)
1249	movaps	%xmm13,-0x38($key_)
1250	movaps	%xmm14,-0x28($key_)
1251	movaps	%xmm15,-0x18($key_)
1252.Lctr32_body:
1253___
1254$code.=<<___;
1255
1256	# 8 16-byte words on top of stack are counter values
1257	# xor-ed with zero-round key
1258
1259	movdqu	($ivp),$inout0
1260	movdqu	($key),$rndkey0
1261	mov	12($ivp),$ctr			# counter LSB
1262	pxor	$rndkey0,$inout0
1263	mov	12($key),$key0			# 0-round key LSB
1264	movdqa	$inout0,0x00(%rsp)		# populate counter block
1265	bswap	$ctr
1266	movdqa	$inout0,$inout1
1267	movdqa	$inout0,$inout2
1268	movdqa	$inout0,$inout3
1269	movdqa	$inout0,0x40(%rsp)
1270	movdqa	$inout0,0x50(%rsp)
1271	movdqa	$inout0,0x60(%rsp)
1272	mov	%rdx,%r10			# about to borrow %rdx
1273	movdqa	$inout0,0x70(%rsp)
1274
1275	lea	1($ctr),%rax
1276	 lea	2($ctr),%rdx
1277	bswap	%eax
1278	 bswap	%edx
1279	xor	$key0,%eax
1280	 xor	$key0,%edx
1281	pinsrd	\$3,%eax,$inout1
1282	lea	3($ctr),%rax
1283	movdqa	$inout1,0x10(%rsp)
1284	 pinsrd	\$3,%edx,$inout2
1285	bswap	%eax
1286	 mov	%r10,%rdx			# restore %rdx
1287	 lea	4($ctr),%r10
1288	 movdqa	$inout2,0x20(%rsp)
1289	xor	$key0,%eax
1290	 bswap	%r10d
1291	pinsrd	\$3,%eax,$inout3
1292	 xor	$key0,%r10d
1293	movdqa	$inout3,0x30(%rsp)
1294	lea	5($ctr),%r9
1295	 mov	%r10d,0x40+12(%rsp)
1296	bswap	%r9d
1297	 lea	6($ctr),%r10
1298	mov	240($key),$rounds		# key->rounds
1299	xor	$key0,%r9d
1300	 bswap	%r10d
1301	mov	%r9d,0x50+12(%rsp)
1302	 xor	$key0,%r10d
1303	lea	7($ctr),%r9
1304	 mov	%r10d,0x60+12(%rsp)
1305	bswap	%r9d
1306	xor	$key0,%r9d
1307	mov	%r9d,0x70+12(%rsp)
1308
1309	$movkey	0x10($key),$rndkey1
1310
1311	movdqa	0x40(%rsp),$inout4
1312	movdqa	0x50(%rsp),$inout5
1313
1314	cmp	\$8,$len		# $len is in blocks
1315	jb	.Lctr32_tail		# short input if ($len<8)
1316
1317	lea	0x80($key),$key		# size optimization
1318	sub	\$8,$len		# $len is biased by -8
1319	jmp	.Lctr32_loop8
1320
1321.align	32
1322.Lctr32_loop8:
1323	 add		\$8,$ctr		# next counter value
1324	movdqa		0x60(%rsp),$inout6
1325	aesenc		$rndkey1,$inout0
1326	 mov		$ctr,%r9d
1327	movdqa		0x70(%rsp),$inout7
1328	aesenc		$rndkey1,$inout1
1329	 bswap		%r9d
1330	$movkey		0x20-0x80($key),$rndkey0
1331	aesenc		$rndkey1,$inout2
1332	 xor		$key0,%r9d
1333	 nop
1334	aesenc		$rndkey1,$inout3
1335	 mov		%r9d,0x00+12(%rsp)	# store next counter value
1336	 lea		1($ctr),%r9
1337	aesenc		$rndkey1,$inout4
1338	aesenc		$rndkey1,$inout5
1339	aesenc		$rndkey1,$inout6
1340	aesenc		$rndkey1,$inout7
1341	$movkey		0x30-0x80($key),$rndkey1
1342___
1343for($i=2;$i<8;$i++) {
1344my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1345$code.=<<___;
1346	 bswap		%r9d
1347	aesenc		$rndkeyx,$inout0
1348	aesenc		$rndkeyx,$inout1
1349	 xor		$key0,%r9d
1350	 .byte		0x66,0x90		# 2-byte nop
1351	aesenc		$rndkeyx,$inout2
1352	aesenc		$rndkeyx,$inout3
1353	 mov		%r9d,`0x10*($i-1)`+12(%rsp)
1354	 lea		$i($ctr),%r9
1355	aesenc		$rndkeyx,$inout4
1356	aesenc		$rndkeyx,$inout5
1357	aesenc		$rndkeyx,$inout6
1358	aesenc		$rndkeyx,$inout7
1359	$movkey		`0x20+0x10*$i`-0x80($key),$rndkeyx
1360___
1361}
1362$code.=<<___;
1363	 bswap		%r9d
1364	aesenc		$rndkey0,$inout0
1365	aesenc		$rndkey0,$inout1
1366	aesenc		$rndkey0,$inout2
1367	 xor		$key0,%r9d
1368	 movdqu		0x00($inp),$in0		# start loading input
1369	aesenc		$rndkey0,$inout3
1370	 mov		%r9d,0x70+12(%rsp)
1371	 cmp		\$11,$rounds
1372	aesenc		$rndkey0,$inout4
1373	aesenc		$rndkey0,$inout5
1374	aesenc		$rndkey0,$inout6
1375	aesenc		$rndkey0,$inout7
1376	$movkey		0xa0-0x80($key),$rndkey0
1377
1378	jb		.Lctr32_enc_done
1379
1380	aesenc		$rndkey1,$inout0
1381	aesenc		$rndkey1,$inout1
1382	aesenc		$rndkey1,$inout2
1383	aesenc		$rndkey1,$inout3
1384	aesenc		$rndkey1,$inout4
1385	aesenc		$rndkey1,$inout5
1386	aesenc		$rndkey1,$inout6
1387	aesenc		$rndkey1,$inout7
1388	$movkey		0xb0-0x80($key),$rndkey1
1389
1390	aesenc		$rndkey0,$inout0
1391	aesenc		$rndkey0,$inout1
1392	aesenc		$rndkey0,$inout2
1393	aesenc		$rndkey0,$inout3
1394	aesenc		$rndkey0,$inout4
1395	aesenc		$rndkey0,$inout5
1396	aesenc		$rndkey0,$inout6
1397	aesenc		$rndkey0,$inout7
1398	$movkey		0xc0-0x80($key),$rndkey0
1399	je		.Lctr32_enc_done
1400
1401	aesenc		$rndkey1,$inout0
1402	aesenc		$rndkey1,$inout1
1403	aesenc		$rndkey1,$inout2
1404	aesenc		$rndkey1,$inout3
1405	aesenc		$rndkey1,$inout4
1406	aesenc		$rndkey1,$inout5
1407	aesenc		$rndkey1,$inout6
1408	aesenc		$rndkey1,$inout7
1409	$movkey		0xd0-0x80($key),$rndkey1
1410
1411	aesenc		$rndkey0,$inout0
1412	aesenc		$rndkey0,$inout1
1413	aesenc		$rndkey0,$inout2
1414	aesenc		$rndkey0,$inout3
1415	aesenc		$rndkey0,$inout4
1416	aesenc		$rndkey0,$inout5
1417	aesenc		$rndkey0,$inout6
1418	aesenc		$rndkey0,$inout7
1419	$movkey		0xe0-0x80($key),$rndkey0
1420	jmp		.Lctr32_enc_done
1421
1422.align	16
1423.Lctr32_enc_done:
1424	movdqu		0x10($inp),$in1
1425	pxor		$rndkey0,$in0		# input^=round[last]
1426	movdqu		0x20($inp),$in2
1427	pxor		$rndkey0,$in1
1428	movdqu		0x30($inp),$in3
1429	pxor		$rndkey0,$in2
1430	movdqu		0x40($inp),$in4
1431	pxor		$rndkey0,$in3
1432	movdqu		0x50($inp),$in5
1433	pxor		$rndkey0,$in4
1434	prefetcht0	0x1c0($inp)	# We process 128 bytes (8*16) per iteration, so
1435	prefetcht0	0x200($inp)	# prefetching one iteration ahead takes two 64-byte lines
1436	pxor		$rndkey0,$in5
1437	aesenc		$rndkey1,$inout0
1438	aesenc		$rndkey1,$inout1
1439	aesenc		$rndkey1,$inout2
1440	aesenc		$rndkey1,$inout3
1441	aesenc		$rndkey1,$inout4
1442	aesenc		$rndkey1,$inout5
1443	aesenc		$rndkey1,$inout6
1444	aesenc		$rndkey1,$inout7
1445	movdqu		0x60($inp),$rndkey1	# borrow $rndkey1 for inp[6]
1446	lea		0x80($inp),$inp		# $inp+=8*16
1447
1448	aesenclast	$in0,$inout0		# $inN is inp[N]^round[last]
1449	pxor		$rndkey0,$rndkey1	# borrowed $rndkey
1450	movdqu		0x70-0x80($inp),$in0
1451	aesenclast	$in1,$inout1
1452	pxor		$rndkey0,$in0
1453	movdqa		0x00(%rsp),$in1		# load next counter block
1454	aesenclast	$in2,$inout2
1455	aesenclast	$in3,$inout3
1456	movdqa		0x10(%rsp),$in2
1457	movdqa		0x20(%rsp),$in3
1458	aesenclast	$in4,$inout4
1459	aesenclast	$in5,$inout5
1460	movdqa		0x30(%rsp),$in4
1461	movdqa		0x40(%rsp),$in5
1462	aesenclast	$rndkey1,$inout6
1463	movdqa		0x50(%rsp),$rndkey0
1464	$movkey		0x10-0x80($key),$rndkey1#real 1st-round key
1465	aesenclast	$in0,$inout7
1466
1467	movups		$inout0,($out)		# store 8 output blocks
1468	movdqa		$in1,$inout0
1469	movups		$inout1,0x10($out)
1470	movdqa		$in2,$inout1
1471	movups		$inout2,0x20($out)
1472	movdqa		$in3,$inout2
1473	movups		$inout3,0x30($out)
1474	movdqa		$in4,$inout3
1475	movups		$inout4,0x40($out)
1476	movdqa		$in5,$inout4
1477	movups		$inout5,0x50($out)
1478	movdqa		$rndkey0,$inout5
1479	movups		$inout6,0x60($out)
1480	movups		$inout7,0x70($out)
1481	lea		0x80($out),$out		# $out+=8*16
1482
1483	sub	\$8,$len
1484	jnc	.Lctr32_loop8			# loop if $len-=8 didn't borrow
1485
1486	add	\$8,$len			# restore real remaining $len
1487	jz	.Lctr32_done			# done if ($len==0)
1488	lea	-0x80($key),$key
1489
1490.Lctr32_tail:
1491	# note that at this point $inout0..5 are populated with
1492	# counter values xor-ed with 0-round key
1493	lea	16($key),$key
1494	cmp	\$4,$len
1495	jb	.Lctr32_loop3
1496	je	.Lctr32_loop4
1497
1498	# if ($len>4) compute 7 E(counter)
1499	shl		\$4,$rounds
1500	movdqa		0x60(%rsp),$inout6
1501	pxor		$inout7,$inout7
1502
1503	$movkey		16($key),$rndkey0
1504	aesenc		$rndkey1,$inout0
1505	aesenc		$rndkey1,$inout1
1506	lea		32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1507	neg		%rax
1508	aesenc		$rndkey1,$inout2
1509	add		\$16,%rax		# prepare for .Lenc_loop8_enter
1510	 movups		($inp),$in0
1511	aesenc		$rndkey1,$inout3
1512	aesenc		$rndkey1,$inout4
1513	 movups		0x10($inp),$in1		# pre-load input
1514	 movups		0x20($inp),$in2
1515	aesenc		$rndkey1,$inout5
1516	aesenc		$rndkey1,$inout6
1517
1518	call            .Lenc_loop8_enter
1519
1520	movdqu	0x30($inp),$in3
1521	pxor	$in0,$inout0
1522	movdqu	0x40($inp),$in0
1523	pxor	$in1,$inout1
1524	movdqu	$inout0,($out)			# store output
1525	pxor	$in2,$inout2
1526	movdqu	$inout1,0x10($out)
1527	pxor	$in3,$inout3
1528	movdqu	$inout2,0x20($out)
1529	pxor	$in0,$inout4
1530	movdqu	$inout3,0x30($out)
1531	movdqu	$inout4,0x40($out)
1532	cmp	\$6,$len
1533	jb	.Lctr32_done			# $len was 5, stop store
1534
1535	movups	0x50($inp),$in1
1536	xorps	$in1,$inout5
1537	movups	$inout5,0x50($out)
1538	je	.Lctr32_done			# $len was 6, stop store
1539
1540	movups	0x60($inp),$in2
1541	xorps	$in2,$inout6
1542	movups	$inout6,0x60($out)
1543	jmp	.Lctr32_done			# $len was 7, stop store
1544
1545.align	32
1546.Lctr32_loop4:
1547	aesenc		$rndkey1,$inout0
1548	lea		16($key),$key
1549	dec		$rounds
1550	aesenc		$rndkey1,$inout1
1551	aesenc		$rndkey1,$inout2
1552	aesenc		$rndkey1,$inout3
1553	$movkey		($key),$rndkey1
1554	jnz		.Lctr32_loop4
1555	aesenclast	$rndkey1,$inout0
1556	aesenclast	$rndkey1,$inout1
1557	 movups		($inp),$in0		# load input
1558	 movups		0x10($inp),$in1
1559	aesenclast	$rndkey1,$inout2
1560	aesenclast	$rndkey1,$inout3
1561	 movups		0x20($inp),$in2
1562	 movups		0x30($inp),$in3
1563
1564	xorps	$in0,$inout0
1565	movups	$inout0,($out)			# store output
1566	xorps	$in1,$inout1
1567	movups	$inout1,0x10($out)
1568	pxor	$in2,$inout2
1569	movdqu	$inout2,0x20($out)
1570	pxor	$in3,$inout3
1571	movdqu	$inout3,0x30($out)
1572	jmp	.Lctr32_done			# $len was 4, stop store
1573
1574.align	32
1575.Lctr32_loop3:
1576	aesenc		$rndkey1,$inout0
1577	lea		16($key),$key
1578	dec		$rounds
1579	aesenc		$rndkey1,$inout1
1580	aesenc		$rndkey1,$inout2
1581	$movkey		($key),$rndkey1
1582	jnz		.Lctr32_loop3
1583	aesenclast	$rndkey1,$inout0
1584	aesenclast	$rndkey1,$inout1
1585	aesenclast	$rndkey1,$inout2
1586
1587	movups	($inp),$in0			# load input
1588	xorps	$in0,$inout0
1589	movups	$inout0,($out)			# store output
1590	cmp	\$2,$len
1591	jb	.Lctr32_done			# $len was 1, stop store
1592
1593	movups	0x10($inp),$in1
1594	xorps	$in1,$inout1
1595	movups	$inout1,0x10($out)
1596	je	.Lctr32_done			# $len was 2, stop store
1597
1598	movups	0x20($inp),$in2
1599	xorps	$in2,$inout2
1600	movups	$inout2,0x20($out)		# $len was 3, stop store
1601
1602.Lctr32_done:
1603	xorps	%xmm0,%xmm0			# clear register bank
1604	xor	$key0,$key0
1605	pxor	%xmm1,%xmm1
1606	pxor	%xmm2,%xmm2
1607	pxor	%xmm3,%xmm3
1608	pxor	%xmm4,%xmm4
1609	pxor	%xmm5,%xmm5
1610___
1611$code.=<<___ if (!$win64);
1612	pxor	%xmm6,%xmm6
1613	pxor	%xmm7,%xmm7
1614	movaps	%xmm0,0x00(%rsp)		# clear stack
1615	pxor	%xmm8,%xmm8
1616	movaps	%xmm0,0x10(%rsp)
1617	pxor	%xmm9,%xmm9
1618	movaps	%xmm0,0x20(%rsp)
1619	pxor	%xmm10,%xmm10
1620	movaps	%xmm0,0x30(%rsp)
1621	pxor	%xmm11,%xmm11
1622	movaps	%xmm0,0x40(%rsp)
1623	pxor	%xmm12,%xmm12
1624	movaps	%xmm0,0x50(%rsp)
1625	pxor	%xmm13,%xmm13
1626	movaps	%xmm0,0x60(%rsp)
1627	pxor	%xmm14,%xmm14
1628	movaps	%xmm0,0x70(%rsp)
1629	pxor	%xmm15,%xmm15
1630___
1631$code.=<<___ if ($win64);
1632	movaps	-0xa8($key_),%xmm6
1633	movaps	%xmm0,-0xa8($key_)		# clear stack
1634	movaps	-0x98($key_),%xmm7
1635	movaps	%xmm0,-0x98($key_)
1636	movaps	-0x88($key_),%xmm8
1637	movaps	%xmm0,-0x88($key_)
1638	movaps	-0x78($key_),%xmm9
1639	movaps	%xmm0,-0x78($key_)
1640	movaps	-0x68($key_),%xmm10
1641	movaps	%xmm0,-0x68($key_)
1642	movaps	-0x58($key_),%xmm11
1643	movaps	%xmm0,-0x58($key_)
1644	movaps	-0x48($key_),%xmm12
1645	movaps	%xmm0,-0x48($key_)
1646	movaps	-0x38($key_),%xmm13
1647	movaps	%xmm0,-0x38($key_)
1648	movaps	-0x28($key_),%xmm14
1649	movaps	%xmm0,-0x28($key_)
1650	movaps	-0x18($key_),%xmm15
1651	movaps	%xmm0,-0x18($key_)
1652	movaps	%xmm0,0x00(%rsp)
1653	movaps	%xmm0,0x10(%rsp)
1654	movaps	%xmm0,0x20(%rsp)
1655	movaps	%xmm0,0x30(%rsp)
1656	movaps	%xmm0,0x40(%rsp)
1657	movaps	%xmm0,0x50(%rsp)
1658	movaps	%xmm0,0x60(%rsp)
1659	movaps	%xmm0,0x70(%rsp)
1660___
1661$code.=<<___;
1662	mov	-8($key_),%rbp
1663.cfi_restore	%rbp
1664	lea	($key_),%rsp
1665.cfi_def_cfa_register	%rsp
1666.Lctr32_epilogue:
1667	ret
1668.cfi_endproc
1669.size	${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks
1670___
1671}
1672
1673######################################################################
1674# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1675#	const AES_KEY *key1, const AES_KEY *key2,
1676#	const unsigned char iv[16]);
1677#
1678if (0) {  # Omit these functions in BoringSSL
1679my @tweak=map("%xmm$_",(10..15));
1680my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1681my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1682my $frame_size = 0x70 + ($win64?160:0);
1683my $key_ = "%rbp";	# override so that we can use %r11 as FP
1684
1685$code.=<<___;
1686.globl	${PREFIX}_xts_encrypt
1687.type	${PREFIX}_xts_encrypt,\@function,6
1688.align	16
1689${PREFIX}_xts_encrypt:
1690.cfi_startproc
1691	_CET_ENDBR
1692	lea	(%rsp),%r11			# frame pointer
1693.cfi_def_cfa_register	%r11
1694	push	%rbp
1695.cfi_push	%rbp
1696	sub	\$$frame_size,%rsp
1697	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1698___
1699$code.=<<___ if ($win64);
1700	movaps	%xmm6,-0xa8(%r11)		# offload everything
1701	movaps	%xmm7,-0x98(%r11)
1702	movaps	%xmm8,-0x88(%r11)
1703	movaps	%xmm9,-0x78(%r11)
1704	movaps	%xmm10,-0x68(%r11)
1705	movaps	%xmm11,-0x58(%r11)
1706	movaps	%xmm12,-0x48(%r11)
1707	movaps	%xmm13,-0x38(%r11)
1708	movaps	%xmm14,-0x28(%r11)
1709	movaps	%xmm15,-0x18(%r11)
1710.Lxts_enc_body:
1711___
1712$code.=<<___;
1713	movups	($ivp),$inout0			# load clear-text tweak
1714	mov	240(%r8),$rounds		# key2->rounds
1715	mov	240($key),$rnds_		# key1->rounds
1716___
1717	# generate the tweak
1718	&aesni_generate1("enc",$key2,$rounds,$inout0);
1719$code.=<<___;
1720	$movkey	($key),$rndkey0			# zero round key
1721	mov	$key,$key_			# backup $key
1722	mov	$rnds_,$rounds			# backup $rounds
1723	shl	\$4,$rnds_
1724	mov	$len,$len_			# backup $len
1725	and	\$-16,$len
1726
1727	$movkey	16($key,$rnds_),$rndkey1	# last round key
1728
1729	movdqa	.Lxts_magic(%rip),$twmask
1730	movdqa	$inout0,@tweak[5]
1731	pshufd	\$0x5f,$inout0,$twres
1732	pxor	$rndkey0,$rndkey1
1733___
1734    # alternative tweak calculation algorithm is based on suggestions
1735    # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1736    # and should help in the future...
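    #
    # What these vector operations compute is the usual XTS tweak update,
    # i.e. multiplication by x in GF(2^128); a plain byte-wise Perl
    # rendition for reference (an illustrative sketch, not part of this
    # module, assuming the standard little-endian XTS convention):
    #
    #	sub xts_double {
    #	    my @b = unpack("C16", shift);		# 16-byte tweak
    #	    my $carry = ($b[15] & 0x80) ? 0x87 : 0;	# reduction constant
    #	    for (my $i = 15; $i > 0; $i--) {
    #		$b[$i] = (($b[$i] << 1) | ($b[$i-1] >> 7)) & 0xff;
    #	    }
    #	    $b[0] = (($b[0] << 1) & 0xff) ^ $carry;
    #	    return pack("C16", @b);
    #	}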
1737    for ($i=0;$i<4;$i++) {
1738    $code.=<<___;
1739	movdqa	$twres,$twtmp
1740	paddd	$twres,$twres
1741	movdqa	@tweak[5],@tweak[$i]
1742	psrad	\$31,$twtmp			# broadcast upper bits
1743	paddq	@tweak[5],@tweak[5]
1744	pand	$twmask,$twtmp
1745	pxor	$rndkey0,@tweak[$i]
1746	pxor	$twtmp,@tweak[5]
1747___
1748    }
1749$code.=<<___;
1750	movdqa	@tweak[5],@tweak[4]
1751	psrad	\$31,$twres
1752	paddq	@tweak[5],@tweak[5]
1753	pand	$twmask,$twres
1754	pxor	$rndkey0,@tweak[4]
1755	pxor	$twres,@tweak[5]
1756	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
1757
1758	sub	\$16*6,$len
1759	jc	.Lxts_enc_short			# if $len-=6*16 borrowed
1760
1761	mov	\$16+96,$rounds
1762	lea	32($key_,$rnds_),$key		# end of key schedule
1763	sub	%r10,%rax			# twisted $rounds
1764	$movkey	16($key_),$rndkey1
1765	mov	%rax,%r10			# backup twisted $rounds
1766	lea	.Lxts_magic(%rip),%r8
1767	jmp	.Lxts_enc_grandloop
1768
1769.align	32
1770.Lxts_enc_grandloop:
1771	movdqu	`16*0`($inp),$inout0		# load input
1772	movdqa	$rndkey0,$twmask
1773	movdqu	`16*1`($inp),$inout1
1774	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
1775	movdqu	`16*2`($inp),$inout2
1776	pxor	@tweak[1],$inout1
1777	 aesenc		$rndkey1,$inout0
1778	movdqu	`16*3`($inp),$inout3
1779	pxor	@tweak[2],$inout2
1780	 aesenc		$rndkey1,$inout1
1781	movdqu	`16*4`($inp),$inout4
1782	pxor	@tweak[3],$inout3
1783	 aesenc		$rndkey1,$inout2
1784	movdqu	`16*5`($inp),$inout5
1785	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
1786	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
1787	pxor	@tweak[4],$inout4
1788	 aesenc		$rndkey1,$inout3
1789	$movkey	32($key_),$rndkey0
1790	lea	`16*6`($inp),$inp
1791	pxor	$twmask,$inout5
1792
1793	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
1794	aesenc		$rndkey1,$inout4
1795	 pxor	$twres,@tweak[1]
1796	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^round[last]
1797	aesenc		$rndkey1,$inout5
1798	$movkey		48($key_),$rndkey1
1799	 pxor	$twres,@tweak[2]
1800
1801	aesenc		$rndkey0,$inout0
1802	 pxor	$twres,@tweak[3]
1803	 movdqa	@tweak[1],`16*1`(%rsp)
1804	aesenc		$rndkey0,$inout1
1805	 pxor	$twres,@tweak[4]
1806	 movdqa	@tweak[2],`16*2`(%rsp)
1807	aesenc		$rndkey0,$inout2
1808	aesenc		$rndkey0,$inout3
1809	 pxor	$twres,$twmask
1810	 movdqa	@tweak[4],`16*4`(%rsp)
1811	aesenc		$rndkey0,$inout4
1812	aesenc		$rndkey0,$inout5
1813	$movkey		64($key_),$rndkey0
1814	 movdqa	$twmask,`16*5`(%rsp)
1815	pshufd	\$0x5f,@tweak[5],$twres
1816	jmp	.Lxts_enc_loop6
1817.align	32
1818.Lxts_enc_loop6:
1819	aesenc		$rndkey1,$inout0
1820	aesenc		$rndkey1,$inout1
1821	aesenc		$rndkey1,$inout2
1822	aesenc		$rndkey1,$inout3
1823	aesenc		$rndkey1,$inout4
1824	aesenc		$rndkey1,$inout5
1825	$movkey		-64($key,%rax),$rndkey1
1826	add		\$32,%rax
1827
1828	aesenc		$rndkey0,$inout0
1829	aesenc		$rndkey0,$inout1
1830	aesenc		$rndkey0,$inout2
1831	aesenc		$rndkey0,$inout3
1832	aesenc		$rndkey0,$inout4
1833	aesenc		$rndkey0,$inout5
1834	$movkey		-80($key,%rax),$rndkey0
1835	jnz		.Lxts_enc_loop6
1836
1837	movdqa	(%r8),$twmask			# start calculating next tweak
1838	movdqa	$twres,$twtmp
1839	paddd	$twres,$twres
1840	 aesenc		$rndkey1,$inout0
1841	paddq	@tweak[5],@tweak[5]
1842	psrad	\$31,$twtmp
1843	 aesenc		$rndkey1,$inout1
1844	pand	$twmask,$twtmp
1845	$movkey	($key_),@tweak[0]		# load round[0]
1846	 aesenc		$rndkey1,$inout2
1847	 aesenc		$rndkey1,$inout3
1848	 aesenc		$rndkey1,$inout4
1849	pxor	$twtmp,@tweak[5]
1850	movaps	@tweak[0],@tweak[1]		# copy round[0]
1851	 aesenc		$rndkey1,$inout5
1852	 $movkey	-64($key),$rndkey1
1853
1854	movdqa	$twres,$twtmp
1855	 aesenc		$rndkey0,$inout0
1856	paddd	$twres,$twres
1857	pxor	@tweak[5],@tweak[0]
1858	 aesenc		$rndkey0,$inout1
1859	psrad	\$31,$twtmp
1860	paddq	@tweak[5],@tweak[5]
1861	 aesenc		$rndkey0,$inout2
1862	 aesenc		$rndkey0,$inout3
1863	pand	$twmask,$twtmp
1864	movaps	@tweak[1],@tweak[2]
1865	 aesenc		$rndkey0,$inout4
1866	pxor	$twtmp,@tweak[5]
1867	movdqa	$twres,$twtmp
1868	 aesenc		$rndkey0,$inout5
1869	 $movkey	-48($key),$rndkey0
1870
1871	paddd	$twres,$twres
1872	 aesenc		$rndkey1,$inout0
1873	pxor	@tweak[5],@tweak[1]
1874	psrad	\$31,$twtmp
1875	 aesenc		$rndkey1,$inout1
1876	paddq	@tweak[5],@tweak[5]
1877	pand	$twmask,$twtmp
1878	 aesenc		$rndkey1,$inout2
1879	 aesenc		$rndkey1,$inout3
1880	 movdqa	@tweak[3],`16*3`(%rsp)
1881	pxor	$twtmp,@tweak[5]
1882	 aesenc		$rndkey1,$inout4
1883	movaps	@tweak[2],@tweak[3]
1884	movdqa	$twres,$twtmp
1885	 aesenc		$rndkey1,$inout5
1886	 $movkey	-32($key),$rndkey1
1887
1888	paddd	$twres,$twres
1889	 aesenc		$rndkey0,$inout0
1890	pxor	@tweak[5],@tweak[2]
1891	psrad	\$31,$twtmp
1892	 aesenc		$rndkey0,$inout1
1893	paddq	@tweak[5],@tweak[5]
1894	pand	$twmask,$twtmp
1895	 aesenc		$rndkey0,$inout2
1896	 aesenc		$rndkey0,$inout3
1897	 aesenc		$rndkey0,$inout4
1898	pxor	$twtmp,@tweak[5]
1899	movaps	@tweak[3],@tweak[4]
1900	 aesenc		$rndkey0,$inout5
1901
1902	movdqa	$twres,$rndkey0
1903	paddd	$twres,$twres
1904	 aesenc		$rndkey1,$inout0
1905	pxor	@tweak[5],@tweak[3]
1906	psrad	\$31,$rndkey0
1907	 aesenc		$rndkey1,$inout1
1908	paddq	@tweak[5],@tweak[5]
1909	pand	$twmask,$rndkey0
1910	 aesenc		$rndkey1,$inout2
1911	 aesenc		$rndkey1,$inout3
1912	pxor	$rndkey0,@tweak[5]
1913	$movkey		($key_),$rndkey0
1914	 aesenc		$rndkey1,$inout4
1915	 aesenc		$rndkey1,$inout5
1916	$movkey		16($key_),$rndkey1
1917
1918	pxor	@tweak[5],@tweak[4]
1919	 aesenclast	`16*0`(%rsp),$inout0
1920	psrad	\$31,$twres
1921	paddq	@tweak[5],@tweak[5]
1922	 aesenclast	`16*1`(%rsp),$inout1
1923	 aesenclast	`16*2`(%rsp),$inout2
1924	pand	$twmask,$twres
1925	mov	%r10,%rax			# restore $rounds
1926	 aesenclast	`16*3`(%rsp),$inout3
1927	 aesenclast	`16*4`(%rsp),$inout4
1928	 aesenclast	`16*5`(%rsp),$inout5
1929	pxor	$twres,@tweak[5]
1930
1931	lea	`16*6`($out),$out		# $out+=6*16
1932	movups	$inout0,`-16*6`($out)		# store 6 output blocks
1933	movups	$inout1,`-16*5`($out)
1934	movups	$inout2,`-16*4`($out)
1935	movups	$inout3,`-16*3`($out)
1936	movups	$inout4,`-16*2`($out)
1937	movups	$inout5,`-16*1`($out)
1938	sub	\$16*6,$len
1939	jnc	.Lxts_enc_grandloop		# loop if $len-=6*16 didn't borrow
1940
1941	mov	\$16+96,$rounds
1942	sub	$rnds_,$rounds
1943	mov	$key_,$key			# restore $key
1944	shr	\$4,$rounds			# restore original value
1945
1946.Lxts_enc_short:
1947	# at this point @tweak[0..5] are populated with tweak values
1948	mov	$rounds,$rnds_			# backup $rounds
1949	pxor	$rndkey0,@tweak[0]
1950	add	\$16*6,$len			# restore real remaining $len
1951	jz	.Lxts_enc_done			# done if ($len==0)
1952
1953	pxor	$rndkey0,@tweak[1]
1954	cmp	\$0x20,$len
1955	jb	.Lxts_enc_one			# $len is 1*16
1956	pxor	$rndkey0,@tweak[2]
1957	je	.Lxts_enc_two			# $len is 2*16
1958
1959	pxor	$rndkey0,@tweak[3]
1960	cmp	\$0x40,$len
1961	jb	.Lxts_enc_three			# $len is 3*16
1962	pxor	$rndkey0,@tweak[4]
1963	je	.Lxts_enc_four			# $len is 4*16
1964
1965	movdqu	($inp),$inout0			# $len is 5*16
1966	movdqu	16*1($inp),$inout1
1967	movdqu	16*2($inp),$inout2
1968	pxor	@tweak[0],$inout0
1969	movdqu	16*3($inp),$inout3
1970	pxor	@tweak[1],$inout1
1971	movdqu	16*4($inp),$inout4
1972	lea	16*5($inp),$inp			# $inp+=5*16
1973	pxor	@tweak[2],$inout2
1974	pxor	@tweak[3],$inout3
1975	pxor	@tweak[4],$inout4
1976	pxor	$inout5,$inout5
1977
1978	call	_aesni_encrypt6
1979
1980	xorps	@tweak[0],$inout0
1981	movdqa	@tweak[5],@tweak[0]
1982	xorps	@tweak[1],$inout1
1983	xorps	@tweak[2],$inout2
1984	movdqu	$inout0,($out)			# store 5 output blocks
1985	xorps	@tweak[3],$inout3
1986	movdqu	$inout1,16*1($out)
1987	xorps	@tweak[4],$inout4
1988	movdqu	$inout2,16*2($out)
1989	movdqu	$inout3,16*3($out)
1990	movdqu	$inout4,16*4($out)
1991	lea	16*5($out),$out			# $out+=5*16
1992	jmp	.Lxts_enc_done
1993
1994.align	16
1995.Lxts_enc_one:
1996	movups	($inp),$inout0
1997	lea	16*1($inp),$inp			# inp+=1*16
1998	xorps	@tweak[0],$inout0
1999___
2000	&aesni_generate1("enc",$key,$rounds);
2001$code.=<<___;
2002	xorps	@tweak[0],$inout0
2003	movdqa	@tweak[1],@tweak[0]
2004	movups	$inout0,($out)			# store one output block
2005	lea	16*1($out),$out			# $out+=1*16
2006	jmp	.Lxts_enc_done
2007
2008.align	16
2009.Lxts_enc_two:
2010	movups	($inp),$inout0
2011	movups	16($inp),$inout1
2012	lea	32($inp),$inp			# $inp+=2*16
2013	xorps	@tweak[0],$inout0
2014	xorps	@tweak[1],$inout1
2015
2016	call	_aesni_encrypt2
2017
2018	xorps	@tweak[0],$inout0
2019	movdqa	@tweak[2],@tweak[0]
2020	xorps	@tweak[1],$inout1
2021	movups	$inout0,($out)			# store 2 output blocks
2022	movups	$inout1,16*1($out)
2023	lea	16*2($out),$out			# $out+=2*16
2024	jmp	.Lxts_enc_done
2025
2026.align	16
2027.Lxts_enc_three:
2028	movups	($inp),$inout0
2029	movups	16*1($inp),$inout1
2030	movups	16*2($inp),$inout2
2031	lea	16*3($inp),$inp			# $inp+=3*16
2032	xorps	@tweak[0],$inout0
2033	xorps	@tweak[1],$inout1
2034	xorps	@tweak[2],$inout2
2035
2036	call	_aesni_encrypt3
2037
2038	xorps	@tweak[0],$inout0
2039	movdqa	@tweak[3],@tweak[0]
2040	xorps	@tweak[1],$inout1
2041	xorps	@tweak[2],$inout2
2042	movups	$inout0,($out)			# store 3 output blocks
2043	movups	$inout1,16*1($out)
2044	movups	$inout2,16*2($out)
2045	lea	16*3($out),$out			# $out+=3*16
2046	jmp	.Lxts_enc_done
2047
2048.align	16
2049.Lxts_enc_four:
2050	movups	($inp),$inout0
2051	movups	16*1($inp),$inout1
2052	movups	16*2($inp),$inout2
2053	xorps	@tweak[0],$inout0
2054	movups	16*3($inp),$inout3
2055	lea	16*4($inp),$inp			# $inp+=4*16
2056	xorps	@tweak[1],$inout1
2057	xorps	@tweak[2],$inout2
2058	xorps	@tweak[3],$inout3
2059
2060	call	_aesni_encrypt4
2061
2062	pxor	@tweak[0],$inout0
2063	movdqa	@tweak[4],@tweak[0]
2064	pxor	@tweak[1],$inout1
2065	pxor	@tweak[2],$inout2
2066	movdqu	$inout0,($out)			# store 4 output blocks
2067	pxor	@tweak[3],$inout3
2068	movdqu	$inout1,16*1($out)
2069	movdqu	$inout2,16*2($out)
2070	movdqu	$inout3,16*3($out)
2071	lea	16*4($out),$out			# $out+=4*16
2072	jmp	.Lxts_enc_done
2073
2074.align	16
2075.Lxts_enc_done:
2076	and	\$15,$len_			# see if $len%16 is 0
2077	jz	.Lxts_enc_ret
2078	mov	$len_,$len
2079
2080.Lxts_enc_steal:
2081	movzb	($inp),%eax			# borrow $rounds ...
2082	movzb	-16($out),%ecx			# ... and $key
2083	lea	1($inp),$inp
2084	mov	%al,-16($out)
2085	mov	%cl,0($out)
2086	lea	1($out),$out
2087	sub	\$1,$len
2088	jnz	.Lxts_enc_steal
2089
2090	sub	$len_,$out			# rewind $out
2091	mov	$key_,$key			# restore $key
2092	mov	$rnds_,$rounds			# restore $rounds
2093
2094	movups	-16($out),$inout0
2095	xorps	@tweak[0],$inout0
2096___
2097	&aesni_generate1("enc",$key,$rounds);
2098$code.=<<___;
2099	xorps	@tweak[0],$inout0
2100	movups	$inout0,-16($out)
2101
2102.Lxts_enc_ret:
2103	xorps	%xmm0,%xmm0			# clear register bank
2104	pxor	%xmm1,%xmm1
2105	pxor	%xmm2,%xmm2
2106	pxor	%xmm3,%xmm3
2107	pxor	%xmm4,%xmm4
2108	pxor	%xmm5,%xmm5
2109___
2110$code.=<<___ if (!$win64);
2111	pxor	%xmm6,%xmm6
2112	pxor	%xmm7,%xmm7
2113	movaps	%xmm0,0x00(%rsp)		# clear stack
2114	pxor	%xmm8,%xmm8
2115	movaps	%xmm0,0x10(%rsp)
2116	pxor	%xmm9,%xmm9
2117	movaps	%xmm0,0x20(%rsp)
2118	pxor	%xmm10,%xmm10
2119	movaps	%xmm0,0x30(%rsp)
2120	pxor	%xmm11,%xmm11
2121	movaps	%xmm0,0x40(%rsp)
2122	pxor	%xmm12,%xmm12
2123	movaps	%xmm0,0x50(%rsp)
2124	pxor	%xmm13,%xmm13
2125	movaps	%xmm0,0x60(%rsp)
2126	pxor	%xmm14,%xmm14
2127	pxor	%xmm15,%xmm15
2128___
2129$code.=<<___ if ($win64);
2130	movaps	-0xa8(%r11),%xmm6
2131	movaps	%xmm0,-0xa8(%r11)		# clear stack
2132	movaps	-0x98(%r11),%xmm7
2133	movaps	%xmm0,-0x98(%r11)
2134	movaps	-0x88(%r11),%xmm8
2135	movaps	%xmm0,-0x88(%r11)
2136	movaps	-0x78(%r11),%xmm9
2137	movaps	%xmm0,-0x78(%r11)
2138	movaps	-0x68(%r11),%xmm10
2139	movaps	%xmm0,-0x68(%r11)
2140	movaps	-0x58(%r11),%xmm11
2141	movaps	%xmm0,-0x58(%r11)
2142	movaps	-0x48(%r11),%xmm12
2143	movaps	%xmm0,-0x48(%r11)
2144	movaps	-0x38(%r11),%xmm13
2145	movaps	%xmm0,-0x38(%r11)
2146	movaps	-0x28(%r11),%xmm14
2147	movaps	%xmm0,-0x28(%r11)
2148	movaps	-0x18(%r11),%xmm15
2149	movaps	%xmm0,-0x18(%r11)
2150	movaps	%xmm0,0x00(%rsp)
2151	movaps	%xmm0,0x10(%rsp)
2152	movaps	%xmm0,0x20(%rsp)
2153	movaps	%xmm0,0x30(%rsp)
2154	movaps	%xmm0,0x40(%rsp)
2155	movaps	%xmm0,0x50(%rsp)
2156	movaps	%xmm0,0x60(%rsp)
2157___
2158$code.=<<___;
2159	mov	-8(%r11),%rbp
2160.cfi_restore	%rbp
2161	lea	(%r11),%rsp
2162.cfi_def_cfa_register	%rsp
2163.Lxts_enc_epilogue:
2164	ret
2165.cfi_endproc
2166.size	${PREFIX}_xts_encrypt,.-${PREFIX}_xts_encrypt
2167___
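# .Lxts_enc_steal above implements XTS ciphertext stealing for a trailing
# partial block: the final output bytes are stolen from the front of the
# last full-block ciphertext, the vacated bytes are refilled with the
# remaining plaintext, and the re-assembled block at -16($out) is encrypted
# again with the next tweak.  A byte-level reference sketch (hypothetical
# helper, never called by this file):
sub _xts_enc_steal_ref {
	my ($c_prev,$p_tail) = @_;	# last full ciphertext block, trailing <16 plaintext bytes
	my $tail   = length $p_tail;
	my $c_tail = substr($c_prev,0,$tail);		# stolen bytes become the output tail
	my $steal  = $p_tail . substr($c_prev,$tail);	# block to re-encrypt with the next tweak
	return ($steal,$c_tail);
}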
2168
2169$code.=<<___;
2170.globl	${PREFIX}_xts_decrypt
2171.type	${PREFIX}_xts_decrypt,\@function,6
2172.align	16
2173${PREFIX}_xts_decrypt:
2174.cfi_startproc
2175	_CET_ENDBR
2176	lea	(%rsp),%r11			# frame pointer
2177.cfi_def_cfa_register	%r11
2178	push	%rbp
2179.cfi_push	%rbp
2180	sub	\$$frame_size,%rsp
2181	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
2182___
2183$code.=<<___ if ($win64);
2184	movaps	%xmm6,-0xa8(%r11)		# offload everything
2185	movaps	%xmm7,-0x98(%r11)
2186	movaps	%xmm8,-0x88(%r11)
2187	movaps	%xmm9,-0x78(%r11)
2188	movaps	%xmm10,-0x68(%r11)
2189	movaps	%xmm11,-0x58(%r11)
2190	movaps	%xmm12,-0x48(%r11)
2191	movaps	%xmm13,-0x38(%r11)
2192	movaps	%xmm14,-0x28(%r11)
2193	movaps	%xmm15,-0x18(%r11)
2194.Lxts_dec_body:
2195___
2196$code.=<<___;
2197	movups	($ivp),$inout0			# load clear-text tweak
2198	mov	240($key2),$rounds		# key2->rounds
2199	mov	240($key),$rnds_		# key1->rounds
2200___
2201	# generate the tweak
2202	&aesni_generate1("enc",$key2,$rounds,$inout0);
2203$code.=<<___;
2204	xor	%eax,%eax			# if ($len%16) len-=16;
2205	test	\$15,$len
2206	setnz	%al
2207	shl	\$4,%rax
2208	sub	%rax,$len
2209
2210	$movkey	($key),$rndkey0			# zero round key
2211	mov	$key,$key_			# backup $key
2212	mov	$rnds_,$rounds			# backup $rounds
2213	shl	\$4,$rnds_
2214	mov	$len,$len_			# backup $len
2215	and	\$-16,$len
2216
2217	$movkey	16($key,$rnds_),$rndkey1	# last round key
2218
2219	movdqa	.Lxts_magic(%rip),$twmask
2220	movdqa	$inout0,@tweak[5]
2221	pshufd	\$0x5f,$inout0,$twres
2222	pxor	$rndkey0,$rndkey1
2223___
2224    for ($i=0;$i<4;$i++) {
2225    $code.=<<___;
2226	movdqa	$twres,$twtmp
2227	paddd	$twres,$twres
2228	movdqa	@tweak[5],@tweak[$i]
2229	psrad	\$31,$twtmp			# broadcast upper bits
2230	paddq	@tweak[5],@tweak[5]
2231	pand	$twmask,$twtmp
2232	pxor	$rndkey0,@tweak[$i]
2233	pxor	$twtmp,@tweak[5]
2234___
2235    }
2236$code.=<<___;
2237	movdqa	@tweak[5],@tweak[4]
2238	psrad	\$31,$twres
2239	paddq	@tweak[5],@tweak[5]
2240	pand	$twmask,$twres
2241	pxor	$rndkey0,@tweak[4]
2242	pxor	$twres,@tweak[5]
2243	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
2244
2245	sub	\$16*6,$len
2246	jc	.Lxts_dec_short			# if $len-=6*16 borrowed
2247
2248	mov	\$16+96,$rounds
2249	lea	32($key_,$rnds_),$key		# end of key schedule
2250	sub	%r10,%rax			# twisted $rounds
2251	$movkey	16($key_),$rndkey1
2252	mov	%rax,%r10			# backup twisted $rounds
2253	lea	.Lxts_magic(%rip),%r8
2254	jmp	.Lxts_dec_grandloop
2255
2256.align	32
2257.Lxts_dec_grandloop:
2258	movdqu	`16*0`($inp),$inout0		# load input
2259	movdqa	$rndkey0,$twmask
2260	movdqu	`16*1`($inp),$inout1
2261	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
2262	movdqu	`16*2`($inp),$inout2
2263	pxor	@tweak[1],$inout1
2264	 aesdec		$rndkey1,$inout0
2265	movdqu	`16*3`($inp),$inout3
2266	pxor	@tweak[2],$inout2
2267	 aesdec		$rndkey1,$inout1
2268	movdqu	`16*4`($inp),$inout4
2269	pxor	@tweak[3],$inout3
2270	 aesdec		$rndkey1,$inout2
2271	movdqu	`16*5`($inp),$inout5
2272	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
2273	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
2274	pxor	@tweak[4],$inout4
2275	 aesdec		$rndkey1,$inout3
2276	$movkey	32($key_),$rndkey0
2277	lea	`16*6`($inp),$inp
2278	pxor	$twmask,$inout5
2279
2280	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
2281	aesdec		$rndkey1,$inout4
2282	 pxor	$twres,@tweak[1]
2283	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
2284	aesdec		$rndkey1,$inout5
2285	$movkey		48($key_),$rndkey1
2286	 pxor	$twres,@tweak[2]
2287
2288	aesdec		$rndkey0,$inout0
2289	 pxor	$twres,@tweak[3]
2290	 movdqa	@tweak[1],`16*1`(%rsp)
2291	aesdec		$rndkey0,$inout1
2292	 pxor	$twres,@tweak[4]
2293	 movdqa	@tweak[2],`16*2`(%rsp)
2294	aesdec		$rndkey0,$inout2
2295	aesdec		$rndkey0,$inout3
2296	 pxor	$twres,$twmask
2297	 movdqa	@tweak[4],`16*4`(%rsp)
2298	aesdec		$rndkey0,$inout4
2299	aesdec		$rndkey0,$inout5
2300	$movkey		64($key_),$rndkey0
2301	 movdqa	$twmask,`16*5`(%rsp)
2302	pshufd	\$0x5f,@tweak[5],$twres
2303	jmp	.Lxts_dec_loop6
2304.align	32
2305.Lxts_dec_loop6:
2306	aesdec		$rndkey1,$inout0
2307	aesdec		$rndkey1,$inout1
2308	aesdec		$rndkey1,$inout2
2309	aesdec		$rndkey1,$inout3
2310	aesdec		$rndkey1,$inout4
2311	aesdec		$rndkey1,$inout5
2312	$movkey		-64($key,%rax),$rndkey1
2313	add		\$32,%rax
2314
2315	aesdec		$rndkey0,$inout0
2316	aesdec		$rndkey0,$inout1
2317	aesdec		$rndkey0,$inout2
2318	aesdec		$rndkey0,$inout3
2319	aesdec		$rndkey0,$inout4
2320	aesdec		$rndkey0,$inout5
2321	$movkey		-80($key,%rax),$rndkey0
2322	jnz		.Lxts_dec_loop6
2323
2324	movdqa	(%r8),$twmask			# start calculating next tweak
2325	movdqa	$twres,$twtmp
2326	paddd	$twres,$twres
2327	 aesdec		$rndkey1,$inout0
2328	paddq	@tweak[5],@tweak[5]
2329	psrad	\$31,$twtmp
2330	 aesdec		$rndkey1,$inout1
2331	pand	$twmask,$twtmp
2332	$movkey	($key_),@tweak[0]		# load round[0]
2333	 aesdec		$rndkey1,$inout2
2334	 aesdec		$rndkey1,$inout3
2335	 aesdec		$rndkey1,$inout4
2336	pxor	$twtmp,@tweak[5]
2337	movaps	@tweak[0],@tweak[1]		# copy round[0]
2338	 aesdec		$rndkey1,$inout5
2339	 $movkey	-64($key),$rndkey1
2340
2341	movdqa	$twres,$twtmp
2342	 aesdec		$rndkey0,$inout0
2343	paddd	$twres,$twres
2344	pxor	@tweak[5],@tweak[0]
2345	 aesdec		$rndkey0,$inout1
2346	psrad	\$31,$twtmp
2347	paddq	@tweak[5],@tweak[5]
2348	 aesdec		$rndkey0,$inout2
2349	 aesdec		$rndkey0,$inout3
2350	pand	$twmask,$twtmp
2351	movaps	@tweak[1],@tweak[2]
2352	 aesdec		$rndkey0,$inout4
2353	pxor	$twtmp,@tweak[5]
2354	movdqa	$twres,$twtmp
2355	 aesdec		$rndkey0,$inout5
2356	 $movkey	-48($key),$rndkey0
2357
2358	paddd	$twres,$twres
2359	 aesdec		$rndkey1,$inout0
2360	pxor	@tweak[5],@tweak[1]
2361	psrad	\$31,$twtmp
2362	 aesdec		$rndkey1,$inout1
2363	paddq	@tweak[5],@tweak[5]
2364	pand	$twmask,$twtmp
2365	 aesdec		$rndkey1,$inout2
2366	 aesdec		$rndkey1,$inout3
2367	 movdqa	@tweak[3],`16*3`(%rsp)
2368	pxor	$twtmp,@tweak[5]
2369	 aesdec		$rndkey1,$inout4
2370	movaps	@tweak[2],@tweak[3]
2371	movdqa	$twres,$twtmp
2372	 aesdec		$rndkey1,$inout5
2373	 $movkey	-32($key),$rndkey1
2374
2375	paddd	$twres,$twres
2376	 aesdec		$rndkey0,$inout0
2377	pxor	@tweak[5],@tweak[2]
2378	psrad	\$31,$twtmp
2379	 aesdec		$rndkey0,$inout1
2380	paddq	@tweak[5],@tweak[5]
2381	pand	$twmask,$twtmp
2382	 aesdec		$rndkey0,$inout2
2383	 aesdec		$rndkey0,$inout3
2384	 aesdec		$rndkey0,$inout4
2385	pxor	$twtmp,@tweak[5]
2386	movaps	@tweak[3],@tweak[4]
2387	 aesdec		$rndkey0,$inout5
2388
2389	movdqa	$twres,$rndkey0
2390	paddd	$twres,$twres
2391	 aesdec		$rndkey1,$inout0
2392	pxor	@tweak[5],@tweak[3]
2393	psrad	\$31,$rndkey0
2394	 aesdec		$rndkey1,$inout1
2395	paddq	@tweak[5],@tweak[5]
2396	pand	$twmask,$rndkey0
2397	 aesdec		$rndkey1,$inout2
2398	 aesdec		$rndkey1,$inout3
2399	pxor	$rndkey0,@tweak[5]
2400	$movkey		($key_),$rndkey0
2401	 aesdec		$rndkey1,$inout4
2402	 aesdec		$rndkey1,$inout5
2403	$movkey		16($key_),$rndkey1
2404
2405	pxor	@tweak[5],@tweak[4]
2406	 aesdeclast	`16*0`(%rsp),$inout0
2407	psrad	\$31,$twres
2408	paddq	@tweak[5],@tweak[5]
2409	 aesdeclast	`16*1`(%rsp),$inout1
2410	 aesdeclast	`16*2`(%rsp),$inout2
2411	pand	$twmask,$twres
2412	mov	%r10,%rax			# restore $rounds
2413	 aesdeclast	`16*3`(%rsp),$inout3
2414	 aesdeclast	`16*4`(%rsp),$inout4
2415	 aesdeclast	`16*5`(%rsp),$inout5
2416	pxor	$twres,@tweak[5]
2417
2418	lea	`16*6`($out),$out		# $out+=6*16
2419	movups	$inout0,`-16*6`($out)		# store 6 output blocks
2420	movups	$inout1,`-16*5`($out)
2421	movups	$inout2,`-16*4`($out)
2422	movups	$inout3,`-16*3`($out)
2423	movups	$inout4,`-16*2`($out)
2424	movups	$inout5,`-16*1`($out)
2425	sub	\$16*6,$len
2426	jnc	.Lxts_dec_grandloop		# loop if $len-=6*16 didn't borrow
2427
2428	mov	\$16+96,$rounds
2429	sub	$rnds_,$rounds
2430	mov	$key_,$key			# restore $key
2431	shr	\$4,$rounds			# restore original value
2432
2433.Lxts_dec_short:
2434	# at this point @tweak[0..5] are populated with tweak values
2435	mov	$rounds,$rnds_			# backup $rounds
2436	pxor	$rndkey0,@tweak[0]
2437	pxor	$rndkey0,@tweak[1]
2438	add	\$16*6,$len			# restore real remaining $len
2439	jz	.Lxts_dec_done			# done if ($len==0)
2440
2441	pxor	$rndkey0,@tweak[2]
2442	cmp	\$0x20,$len
2443	jb	.Lxts_dec_one			# $len is 1*16
2444	pxor	$rndkey0,@tweak[3]
2445	je	.Lxts_dec_two			# $len is 2*16
2446
2447	pxor	$rndkey0,@tweak[4]
2448	cmp	\$0x40,$len
2449	jb	.Lxts_dec_three			# $len is 3*16
2450	je	.Lxts_dec_four			# $len is 4*16
2451
2452	movdqu	($inp),$inout0			# $len is 5*16
2453	movdqu	16*1($inp),$inout1
2454	movdqu	16*2($inp),$inout2
2455	pxor	@tweak[0],$inout0
2456	movdqu	16*3($inp),$inout3
2457	pxor	@tweak[1],$inout1
2458	movdqu	16*4($inp),$inout4
2459	lea	16*5($inp),$inp			# $inp+=5*16
2460	pxor	@tweak[2],$inout2
2461	pxor	@tweak[3],$inout3
2462	pxor	@tweak[4],$inout4
2463
2464	call	_aesni_decrypt6
2465
2466	xorps	@tweak[0],$inout0
2467	xorps	@tweak[1],$inout1
2468	xorps	@tweak[2],$inout2
2469	movdqu	$inout0,($out)			# store 5 output blocks
2470	xorps	@tweak[3],$inout3
2471	movdqu	$inout1,16*1($out)
2472	xorps	@tweak[4],$inout4
2473	movdqu	$inout2,16*2($out)
2474	 pxor		$twtmp,$twtmp
2475	movdqu	$inout3,16*3($out)
2476	 pcmpgtd	@tweak[5],$twtmp
2477	movdqu	$inout4,16*4($out)
2478	lea	16*5($out),$out			# $out+=5*16
2479	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
2480	and	\$15,$len_
2481	jz	.Lxts_dec_ret
2482
2483	movdqa	@tweak[5],@tweak[0]
2484	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
2485	pand	$twmask,@tweak[1]		# isolate carry and residue
2486	pxor	@tweak[5],@tweak[1]
2487	jmp	.Lxts_dec_done2
2488
2489.align	16
2490.Lxts_dec_one:
2491	movups	($inp),$inout0
2492	lea	16*1($inp),$inp			# $inp+=1*16
2493	xorps	@tweak[0],$inout0
2494___
2495	&aesni_generate1("dec",$key,$rounds);
2496$code.=<<___;
2497	xorps	@tweak[0],$inout0
2498	movdqa	@tweak[1],@tweak[0]
2499	movups	$inout0,($out)			# store one output block
2500	movdqa	@tweak[2],@tweak[1]
2501	lea	16*1($out),$out			# $out+=1*16
2502	jmp	.Lxts_dec_done
2503
2504.align	16
2505.Lxts_dec_two:
2506	movups	($inp),$inout0
2507	movups	16($inp),$inout1
2508	lea	32($inp),$inp			# $inp+=2*16
2509	xorps	@tweak[0],$inout0
2510	xorps	@tweak[1],$inout1
2511
2512	call	_aesni_decrypt2
2513
2514	xorps	@tweak[0],$inout0
2515	movdqa	@tweak[2],@tweak[0]
2516	xorps	@tweak[1],$inout1
2517	movdqa	@tweak[3],@tweak[1]
2518	movups	$inout0,($out)			# store 2 output blocks
2519	movups	$inout1,16*1($out)
2520	lea	16*2($out),$out			# $out+=2*16
2521	jmp	.Lxts_dec_done
2522
2523.align	16
2524.Lxts_dec_three:
2525	movups	($inp),$inout0
2526	movups	16*1($inp),$inout1
2527	movups	16*2($inp),$inout2
2528	lea	16*3($inp),$inp			# $inp+=3*16
2529	xorps	@tweak[0],$inout0
2530	xorps	@tweak[1],$inout1
2531	xorps	@tweak[2],$inout2
2532
2533	call	_aesni_decrypt3
2534
2535	xorps	@tweak[0],$inout0
2536	movdqa	@tweak[3],@tweak[0]
2537	xorps	@tweak[1],$inout1
2538	movdqa	@tweak[4],@tweak[1]
2539	xorps	@tweak[2],$inout2
2540	movups	$inout0,($out)			# store 3 output blocks
2541	movups	$inout1,16*1($out)
2542	movups	$inout2,16*2($out)
2543	lea	16*3($out),$out			# $out+=3*16
2544	jmp	.Lxts_dec_done
2545
2546.align	16
2547.Lxts_dec_four:
2548	movups	($inp),$inout0
2549	movups	16*1($inp),$inout1
2550	movups	16*2($inp),$inout2
2551	xorps	@tweak[0],$inout0
2552	movups	16*3($inp),$inout3
2553	lea	16*4($inp),$inp			# $inp+=4*16
2554	xorps	@tweak[1],$inout1
2555	xorps	@tweak[2],$inout2
2556	xorps	@tweak[3],$inout3
2557
2558	call	_aesni_decrypt4
2559
2560	pxor	@tweak[0],$inout0
2561	movdqa	@tweak[4],@tweak[0]
2562	pxor	@tweak[1],$inout1
2563	movdqa	@tweak[5],@tweak[1]
2564	pxor	@tweak[2],$inout2
2565	movdqu	$inout0,($out)			# store 4 output blocks
2566	pxor	@tweak[3],$inout3
2567	movdqu	$inout1,16*1($out)
2568	movdqu	$inout2,16*2($out)
2569	movdqu	$inout3,16*3($out)
2570	lea	16*4($out),$out			# $out+=4*16
2571	jmp	.Lxts_dec_done
2572
2573.align	16
2574.Lxts_dec_done:
2575	and	\$15,$len_			# see if $len%16 is 0
2576	jz	.Lxts_dec_ret
2577.Lxts_dec_done2:
2578	mov	$len_,$len
2579	mov	$key_,$key			# restore $key
2580	mov	$rnds_,$rounds			# restore $rounds
2581
2582	movups	($inp),$inout0
2583	xorps	@tweak[1],$inout0
2584___
2585	&aesni_generate1("dec",$key,$rounds);
2586$code.=<<___;
2587	xorps	@tweak[1],$inout0
2588	movups	$inout0,($out)
2589
2590.Lxts_dec_steal:
2591	movzb	16($inp),%eax			# borrow $rounds ...
2592	movzb	($out),%ecx			# ... and $key
2593	lea	1($inp),$inp
2594	mov	%al,($out)
2595	mov	%cl,16($out)
2596	lea	1($out),$out
2597	sub	\$1,$len
2598	jnz	.Lxts_dec_steal
2599
2600	sub	$len_,$out			# rewind $out
2601	mov	$key_,$key			# restore $key
2602	mov	$rnds_,$rounds			# restore $rounds
2603
2604	movups	($out),$inout0
2605	xorps	@tweak[0],$inout0
2606___
2607	&aesni_generate1("dec",$key,$rounds);
2608$code.=<<___;
2609	xorps	@tweak[0],$inout0
2610	movups	$inout0,($out)
2611
2612.Lxts_dec_ret:
2613	xorps	%xmm0,%xmm0			# clear register bank
2614	pxor	%xmm1,%xmm1
2615	pxor	%xmm2,%xmm2
2616	pxor	%xmm3,%xmm3
2617	pxor	%xmm4,%xmm4
2618	pxor	%xmm5,%xmm5
2619___
2620$code.=<<___ if (!$win64);
2621	pxor	%xmm6,%xmm6
2622	pxor	%xmm7,%xmm7
2623	movaps	%xmm0,0x00(%rsp)		# clear stack
2624	pxor	%xmm8,%xmm8
2625	movaps	%xmm0,0x10(%rsp)
2626	pxor	%xmm9,%xmm9
2627	movaps	%xmm0,0x20(%rsp)
2628	pxor	%xmm10,%xmm10
2629	movaps	%xmm0,0x30(%rsp)
2630	pxor	%xmm11,%xmm11
2631	movaps	%xmm0,0x40(%rsp)
2632	pxor	%xmm12,%xmm12
2633	movaps	%xmm0,0x50(%rsp)
2634	pxor	%xmm13,%xmm13
2635	movaps	%xmm0,0x60(%rsp)
2636	pxor	%xmm14,%xmm14
2637	pxor	%xmm15,%xmm15
2638___
2639$code.=<<___ if ($win64);
2640	movaps	-0xa8(%r11),%xmm6
2641	movaps	%xmm0,-0xa8(%r11)		# clear stack
2642	movaps	-0x98(%r11),%xmm7
2643	movaps	%xmm0,-0x98(%r11)
2644	movaps	-0x88(%r11),%xmm8
2645	movaps	%xmm0,-0x88(%r11)
2646	movaps	-0x78(%r11),%xmm9
2647	movaps	%xmm0,-0x78(%r11)
2648	movaps	-0x68(%r11),%xmm10
2649	movaps	%xmm0,-0x68(%r11)
2650	movaps	-0x58(%r11),%xmm11
2651	movaps	%xmm0,-0x58(%r11)
2652	movaps	-0x48(%r11),%xmm12
2653	movaps	%xmm0,-0x48(%r11)
2654	movaps	-0x38(%r11),%xmm13
2655	movaps	%xmm0,-0x38(%r11)
2656	movaps	-0x28(%r11),%xmm14
2657	movaps	%xmm0,-0x28(%r11)
2658	movaps	-0x18(%r11),%xmm15
2659	movaps	%xmm0,-0x18(%r11)
2660	movaps	%xmm0,0x00(%rsp)
2661	movaps	%xmm0,0x10(%rsp)
2662	movaps	%xmm0,0x20(%rsp)
2663	movaps	%xmm0,0x30(%rsp)
2664	movaps	%xmm0,0x40(%rsp)
2665	movaps	%xmm0,0x50(%rsp)
2666	movaps	%xmm0,0x60(%rsp)
2667___
2668$code.=<<___;
2669	mov	-8(%r11),%rbp
2670.cfi_restore	%rbp
2671	lea	(%r11),%rsp
2672.cfi_def_cfa_register	%rsp
2673.Lxts_dec_epilogue:
2674	ret
2675.cfi_endproc
2676.size	${PREFIX}_xts_decrypt,.-${PREFIX}_xts_decrypt
2677___
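# Decryption steals in the opposite direction and therefore swaps the order
# of the last two tweaks: the final full ciphertext block is decrypted with
# @tweak[1] before the byte shuffle in .Lxts_dec_steal, and the re-assembled
# block is decrypted with @tweak[0]; the early "if ($len%16) len-=16"
# adjustment right after the tweak is generated sets this pairing up.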
2678} }}
2679
2680########################################################################
2681# void $PREFIX_cbc_encrypt (const void *inp, void *out,
2682#			    size_t length, const AES_KEY *key,
2683#			    unsigned char *ivp,const int enc);
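# The encrypt path below keeps the running chaining value in $inout0, so
# each block is computed as C[i] = E_K(P[i] ^ C[i-1]) with C[-1] = *ivp,
# and the last ciphertext block is written back to *ivp.  A reference model
# of that chaining (hypothetical helper, never called by this file;
# $encrypt_block stands in for one-block AES and is not a real API):
sub _cbc_encrypt_ref {
	my ($encrypt_block,$iv,@blocks) = @_;	# 16-byte strings throughout
	my @out;
	for my $p (@blocks) {
		$iv = $encrypt_block->($p ^ $iv);	# C[i] = E_K(P[i] ^ C[i-1])
		push @out,$iv;
	}
	return ($iv,@out);			# new *ivp value, then the ciphertext blocks
}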
2684{
2685my $frame_size = 0x10 + ($win64?0xa0:0);	# used in decrypt
2686my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
2687
2688$code.=<<___;
2689.globl	${PREFIX}_cbc_encrypt
2690.type	${PREFIX}_cbc_encrypt,\@function,6
2691.align	16
2692${PREFIX}_cbc_encrypt:
2693.cfi_startproc
2694	_CET_ENDBR
2695	test	$len,$len		# check length
2696	jz	.Lcbc_ret
2697
2698	mov	240($key),$rnds_	# key->rounds
2699	mov	$key,$key_		# backup $key
2700	test	%r9d,%r9d		# 6th argument
2701	jz	.Lcbc_decrypt
2702#--------------------------- CBC ENCRYPT ------------------------------#
2703	movups	($ivp),$inout0		# load iv as initial state
2704	mov	$rnds_,$rounds
2705	cmp	\$16,$len
2706	jb	.Lcbc_enc_tail
2707	sub	\$16,$len
2708	jmp	.Lcbc_enc_loop
2709.align	16
2710.Lcbc_enc_loop:
2711	movups	($inp),$inout1		# load input
2712	lea	16($inp),$inp
2713	#xorps	$inout1,$inout0
2714___
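	# (The usual plaintext/IV xor is folded into aesni_generate1: passing
	# $inout1 as its $ivec argument makes the helper fold the freshly
	# loaded block and round key 0 into the chaining value before the
	# first aesenc, which is why the explicit xorps above is commented out.)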
2715	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
2716$code.=<<___;
2717	mov	$rnds_,$rounds		# restore $rounds
2718	mov	$key_,$key		# restore $key
2719	movups	$inout0,0($out)		# store output
2720	lea	16($out),$out
2721	sub	\$16,$len
2722	jnc	.Lcbc_enc_loop
2723	add	\$16,$len
2724	jnz	.Lcbc_enc_tail
2725	 pxor	$rndkey0,$rndkey0	# clear register bank
2726	 pxor	$rndkey1,$rndkey1
2727	movups	$inout0,($ivp)
2728	 pxor	$inout0,$inout0
2729	 pxor	$inout1,$inout1
2730	jmp	.Lcbc_ret
2731
2732.Lcbc_enc_tail:
2733	mov	$len,%rcx	# zaps $key
2734	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
2735	.long	0x9066A4F3	# rep movsb
2736	mov	\$16,%ecx	# zero tail
2737	sub	$len,%rcx
2738	xor	%eax,%eax
2739	.long	0x9066AAF3	# rep stosb
2740	lea	-16(%rdi),%rdi	# rewind $out by 1 block
2741	mov	$rnds_,$rounds	# restore $rounds
2742	mov	%rdi,%rsi	# $inp and $out are the same
2743	mov	$key_,$key	# restore $key
2744	xor	$len,$len	# len=16
2745	jmp	.Lcbc_enc_loop	# one more spin
2746#--------------------------- CBC DECRYPT ------------------------------#
2747.align	16
2748.Lcbc_decrypt:
2749	cmp	\$16,$len
2750	jne	.Lcbc_decrypt_bulk
2751
2752	# handle single block without allocating stack frame,
2753	# useful in ciphertext stealing mode
2754	movdqu	($inp),$inout0		# load input
2755	movdqu	($ivp),$inout1		# load iv
2756	movdqa	$inout0,$inout2		# future iv
2757___
2758	&aesni_generate1("dec",$key,$rnds_);
2759$code.=<<___;
2760	 pxor	$rndkey0,$rndkey0	# clear register bank
2761	 pxor	$rndkey1,$rndkey1
2762	movdqu	$inout2,($ivp)		# store iv
2763	xorps	$inout1,$inout0		# ^=iv
2764	 pxor	$inout1,$inout1
2765	movups	$inout0,($out)		# store output
2766	 pxor	$inout0,$inout0
2767	jmp	.Lcbc_ret
2768.align	16
2769.Lcbc_decrypt_bulk:
2770	lea	(%rsp),%r11		# frame pointer
2771.cfi_def_cfa_register	%r11
2772	push	%rbp
2773.cfi_push	%rbp
2774	sub	\$$frame_size,%rsp
2775	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
2776___
2777$code.=<<___ if ($win64);
2778	movaps	%xmm6,0x10(%rsp)
2779	movaps	%xmm7,0x20(%rsp)
2780	movaps	%xmm8,0x30(%rsp)
2781	movaps	%xmm9,0x40(%rsp)
2782	movaps	%xmm10,0x50(%rsp)
2783	movaps	%xmm11,0x60(%rsp)
2784	movaps	%xmm12,0x70(%rsp)
2785	movaps	%xmm13,0x80(%rsp)
2786	movaps	%xmm14,0x90(%rsp)
2787	movaps	%xmm15,0xa0(%rsp)
2788.Lcbc_decrypt_body:
2789___
2790
2791my $inp_=$key_="%rbp";			# reassign $key_
2792
2793$code.=<<___;
2794	mov	$key,$key_		# [re-]backup $key [after reassignment]
2795	movups	($ivp),$iv
2796	mov	$rnds_,$rounds
2797	cmp	\$0x50,$len
2798	jbe	.Lcbc_dec_tail
2799
2800	$movkey	($key),$rndkey0
2801	movdqu	0x00($inp),$inout0	# load input
2802	movdqu	0x10($inp),$inout1
2803	movdqa	$inout0,$in0
2804	movdqu	0x20($inp),$inout2
2805	movdqa	$inout1,$in1
2806	movdqu	0x30($inp),$inout3
2807	movdqa	$inout2,$in2
2808	movdqu	0x40($inp),$inout4
2809	movdqa	$inout3,$in3
2810	movdqu	0x50($inp),$inout5
2811	movdqa	$inout4,$in4
2812	cmp	\$0x70,$len
2813	jbe	.Lcbc_dec_six_or_seven
2814
2815	sub	\$0x70,$len		# $len is biased by -7*16
2816	lea	0x70($key),$key		# size optimization
2817	jmp	.Lcbc_dec_loop8_enter
2818.align	16
2819.Lcbc_dec_loop8:
2820	movups	$inout7,($out)
2821	lea	0x10($out),$out
2822.Lcbc_dec_loop8_enter:
2823	movdqu		0x60($inp),$inout6
2824	pxor		$rndkey0,$inout0
2825	movdqu		0x70($inp),$inout7
2826	pxor		$rndkey0,$inout1
2827	$movkey		0x10-0x70($key),$rndkey1
2828	pxor		$rndkey0,$inout2
2829	mov		\$-1,$inp_
2830	cmp		\$0x70,$len	# are there at least 0x60 bytes ahead?
2831	pxor		$rndkey0,$inout3
2832	pxor		$rndkey0,$inout4
2833	pxor		$rndkey0,$inout5
2834	pxor		$rndkey0,$inout6
2835
2836	aesdec		$rndkey1,$inout0
2837	pxor		$rndkey0,$inout7
2838	$movkey		0x20-0x70($key),$rndkey0
2839	aesdec		$rndkey1,$inout1
2840	aesdec		$rndkey1,$inout2
2841	aesdec		$rndkey1,$inout3
2842	aesdec		$rndkey1,$inout4
2843	aesdec		$rndkey1,$inout5
2844	aesdec		$rndkey1,$inout6
2845	adc		\$0,$inp_
2846	and		\$128,$inp_
2847	aesdec		$rndkey1,$inout7
2848	add		$inp,$inp_
2849	$movkey		0x30-0x70($key),$rndkey1
2850___
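# The loop below emits the middle aesdec rounds for all eight blocks,
# alternating $rndkey0/$rndkey1 and fetching the next round key each time.
# 240($key) holds rounds-1 (9/11/13, as stored by ${PREFIX}_set_encrypt_key),
# so the single cmp against 11 at $i==7 together with the jb/je/jmp exits at
# $i==7/9/11 selects the 128-, 192- and 256-bit key paths.  %rbp ($inp_) was
# steered branchlessly above (the mov/adc/and sequence) to point either at
# the next eight-block group or back at the current one, so the interleaved
# loads of the next inputs never run past the end of the buffer.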
2851for($i=1;$i<12;$i++) {
2852my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
2853$code.=<<___	if ($i==7);
2854	cmp		\$11,$rounds
2855___
2856$code.=<<___;
2857	aesdec		$rndkeyx,$inout0
2858	aesdec		$rndkeyx,$inout1
2859	aesdec		$rndkeyx,$inout2
2860	aesdec		$rndkeyx,$inout3
2861	aesdec		$rndkeyx,$inout4
2862	aesdec		$rndkeyx,$inout5
2863	aesdec		$rndkeyx,$inout6
2864	aesdec		$rndkeyx,$inout7
2865	$movkey		`0x30+0x10*$i`-0x70($key),$rndkeyx
2866___
2867$code.=<<___	if ($i<6 || (!($i&1) && $i>7));
2868	nop
2869___
2870$code.=<<___	if ($i==7);
2871	jb		.Lcbc_dec_done
2872___
2873$code.=<<___	if ($i==9);
2874	je		.Lcbc_dec_done
2875___
2876$code.=<<___	if ($i==11);
2877	jmp		.Lcbc_dec_done
2878___
2879}
2880$code.=<<___;
2881.align	16
2882.Lcbc_dec_done:
2883	aesdec		$rndkey1,$inout0
2884	aesdec		$rndkey1,$inout1
2885	pxor		$rndkey0,$iv
2886	pxor		$rndkey0,$in0
2887	aesdec		$rndkey1,$inout2
2888	aesdec		$rndkey1,$inout3
2889	pxor		$rndkey0,$in1
2890	pxor		$rndkey0,$in2
2891	aesdec		$rndkey1,$inout4
2892	aesdec		$rndkey1,$inout5
2893	pxor		$rndkey0,$in3
2894	pxor		$rndkey0,$in4
2895	aesdec		$rndkey1,$inout6
2896	aesdec		$rndkey1,$inout7
2897	movdqu		0x50($inp),$rndkey1
2898
2899	aesdeclast	$iv,$inout0
2900	movdqu		0x60($inp),$iv		# borrow $iv
2901	pxor		$rndkey0,$rndkey1
2902	aesdeclast	$in0,$inout1
2903	pxor		$rndkey0,$iv
2904	movdqu		0x70($inp),$rndkey0	# next IV
2905	aesdeclast	$in1,$inout2
2906	lea		0x80($inp),$inp
2907	movdqu		0x00($inp_),$in0
2908	aesdeclast	$in2,$inout3
2909	aesdeclast	$in3,$inout4
2910	movdqu		0x10($inp_),$in1
2911	movdqu		0x20($inp_),$in2
2912	aesdeclast	$in4,$inout5
2913	aesdeclast	$rndkey1,$inout6
2914	movdqu		0x30($inp_),$in3
2915	movdqu		0x40($inp_),$in4
2916	aesdeclast	$iv,$inout7
2917	movdqa		$rndkey0,$iv		# return $iv
2918	movdqu		0x50($inp_),$rndkey1
2919	$movkey		-0x70($key),$rndkey0
2920
2921	movups		$inout0,($out)		# store output
2922	movdqa		$in0,$inout0
2923	movups		$inout1,0x10($out)
2924	movdqa		$in1,$inout1
2925	movups		$inout2,0x20($out)
2926	movdqa		$in2,$inout2
2927	movups		$inout3,0x30($out)
2928	movdqa		$in3,$inout3
2929	movups		$inout4,0x40($out)
2930	movdqa		$in4,$inout4
2931	movups		$inout5,0x50($out)
2932	movdqa		$rndkey1,$inout5
2933	movups		$inout6,0x60($out)
2934	lea		0x70($out),$out
2935
2936	sub	\$0x80,$len
2937	ja	.Lcbc_dec_loop8
2938
2939	movaps	$inout7,$inout0
2940	lea	-0x70($key),$key
2941	add	\$0x70,$len
2942	jle	.Lcbc_dec_clear_tail_collected
2943	movups	$inout7,($out)
2944	lea	0x10($out),$out
2945	cmp	\$0x50,$len
2946	jbe	.Lcbc_dec_tail
2947
2948	movaps	$in0,$inout0
2949.Lcbc_dec_six_or_seven:
2950	cmp	\$0x60,$len
2951	ja	.Lcbc_dec_seven
2952
2953	movaps	$inout5,$inout6
2954	call	_aesni_decrypt6
2955	pxor	$iv,$inout0		# ^= IV
2956	movaps	$inout6,$iv
2957	pxor	$in0,$inout1
2958	movdqu	$inout0,($out)
2959	pxor	$in1,$inout2
2960	movdqu	$inout1,0x10($out)
2961	 pxor	$inout1,$inout1		# clear register bank
2962	pxor	$in2,$inout3
2963	movdqu	$inout2,0x20($out)
2964	 pxor	$inout2,$inout2
2965	pxor	$in3,$inout4
2966	movdqu	$inout3,0x30($out)
2967	 pxor	$inout3,$inout3
2968	pxor	$in4,$inout5
2969	movdqu	$inout4,0x40($out)
2970	 pxor	$inout4,$inout4
2971	lea	0x50($out),$out
2972	movdqa	$inout5,$inout0
2973	 pxor	$inout5,$inout5
2974	jmp	.Lcbc_dec_tail_collected
2975
2976.align	16
2977.Lcbc_dec_seven:
2978	movups	0x60($inp),$inout6
2979	xorps	$inout7,$inout7
2980	call	_aesni_decrypt8
2981	movups	0x50($inp),$inout7
2982	pxor	$iv,$inout0		# ^= IV
2983	movups	0x60($inp),$iv
2984	pxor	$in0,$inout1
2985	movdqu	$inout0,($out)
2986	pxor	$in1,$inout2
2987	movdqu	$inout1,0x10($out)
2988	 pxor	$inout1,$inout1		# clear register bank
2989	pxor	$in2,$inout3
2990	movdqu	$inout2,0x20($out)
2991	 pxor	$inout2,$inout2
2992	pxor	$in3,$inout4
2993	movdqu	$inout3,0x30($out)
2994	 pxor	$inout3,$inout3
2995	pxor	$in4,$inout5
2996	movdqu	$inout4,0x40($out)
2997	 pxor	$inout4,$inout4
2998	pxor	$inout7,$inout6
2999	movdqu	$inout5,0x50($out)
3000	 pxor	$inout5,$inout5
3001	lea	0x60($out),$out
3002	movdqa	$inout6,$inout0
3003	 pxor	$inout6,$inout6
3004	 pxor	$inout7,$inout7
3005	jmp	.Lcbc_dec_tail_collected
3006
3007.Lcbc_dec_tail:
3008	movups	($inp),$inout0
3009	sub	\$0x10,$len
3010	jbe	.Lcbc_dec_one		# $len is 1*16 or less
3011
3012	movups	0x10($inp),$inout1
3013	movaps	$inout0,$in0
3014	sub	\$0x10,$len
3015	jbe	.Lcbc_dec_two		# $len is 2*16 or less
3016
3017	movups	0x20($inp),$inout2
3018	movaps	$inout1,$in1
3019	sub	\$0x10,$len
3020	jbe	.Lcbc_dec_three		# $len is 3*16 or less
3021
3022	movups	0x30($inp),$inout3
3023	movaps	$inout2,$in2
3024	sub	\$0x10,$len
3025	jbe	.Lcbc_dec_four		# $len is 4*16 or less
3026
3027	movups	0x40($inp),$inout4	# $len is 5*16 or less
3028	movaps	$inout3,$in3
3029	movaps	$inout4,$in4
3030	xorps	$inout5,$inout5
3031	call	_aesni_decrypt6
3032	pxor	$iv,$inout0
3033	movaps	$in4,$iv
3034	pxor	$in0,$inout1
3035	movdqu	$inout0,($out)
3036	pxor	$in1,$inout2
3037	movdqu	$inout1,0x10($out)
3038	 pxor	$inout1,$inout1		# clear register bank
3039	pxor	$in2,$inout3
3040	movdqu	$inout2,0x20($out)
3041	 pxor	$inout2,$inout2
3042	pxor	$in3,$inout4
3043	movdqu	$inout3,0x30($out)
3044	 pxor	$inout3,$inout3
3045	lea	0x40($out),$out
3046	movdqa	$inout4,$inout0
3047	 pxor	$inout4,$inout4
3048	 pxor	$inout5,$inout5
3049	sub	\$0x10,$len
3050	jmp	.Lcbc_dec_tail_collected
3051
3052.align	16
3053.Lcbc_dec_one:
3054	movaps	$inout0,$in0
3055___
3056	&aesni_generate1("dec",$key,$rounds);
3057$code.=<<___;
3058	xorps	$iv,$inout0
3059	movaps	$in0,$iv
3060	jmp	.Lcbc_dec_tail_collected
3061.align	16
3062.Lcbc_dec_two:
3063	movaps	$inout1,$in1
3064	call	_aesni_decrypt2
3065	pxor	$iv,$inout0
3066	movaps	$in1,$iv
3067	pxor	$in0,$inout1
3068	movdqu	$inout0,($out)
3069	movdqa	$inout1,$inout0
3070	 pxor	$inout1,$inout1		# clear register bank
3071	lea	0x10($out),$out
3072	jmp	.Lcbc_dec_tail_collected
3073.align	16
3074.Lcbc_dec_three:
3075	movaps	$inout2,$in2
3076	call	_aesni_decrypt3
3077	pxor	$iv,$inout0
3078	movaps	$in2,$iv
3079	pxor	$in0,$inout1
3080	movdqu	$inout0,($out)
3081	pxor	$in1,$inout2
3082	movdqu	$inout1,0x10($out)
3083	 pxor	$inout1,$inout1		# clear register bank
3084	movdqa	$inout2,$inout0
3085	 pxor	$inout2,$inout2
3086	lea	0x20($out),$out
3087	jmp	.Lcbc_dec_tail_collected
3088.align	16
3089.Lcbc_dec_four:
3090	movaps	$inout3,$in3
3091	call	_aesni_decrypt4
3092	pxor	$iv,$inout0
3093	movaps	$in3,$iv
3094	pxor	$in0,$inout1
3095	movdqu	$inout0,($out)
3096	pxor	$in1,$inout2
3097	movdqu	$inout1,0x10($out)
3098	 pxor	$inout1,$inout1		# clear register bank
3099	pxor	$in2,$inout3
3100	movdqu	$inout2,0x20($out)
3101	 pxor	$inout2,$inout2
3102	movdqa	$inout3,$inout0
3103	 pxor	$inout3,$inout3
3104	lea	0x30($out),$out
3105	jmp	.Lcbc_dec_tail_collected
3106
3107.align	16
3108.Lcbc_dec_clear_tail_collected:
3109	pxor	$inout1,$inout1		# clear register bank
3110	pxor	$inout2,$inout2
3111	pxor	$inout3,$inout3
3112___
3113$code.=<<___ if (!$win64);
3114	pxor	$inout4,$inout4		# %xmm6..9
3115	pxor	$inout5,$inout5
3116	pxor	$inout6,$inout6
3117	pxor	$inout7,$inout7
3118___
3119$code.=<<___;
3120.Lcbc_dec_tail_collected:
3121	movups	$iv,($ivp)
3122	and	\$15,$len
3123	jnz	.Lcbc_dec_tail_partial
3124	movups	$inout0,($out)
3125	pxor	$inout0,$inout0
3126	jmp	.Lcbc_dec_ret
3127.align	16
3128.Lcbc_dec_tail_partial:
3129	movaps	$inout0,(%rsp)
3130	pxor	$inout0,$inout0
3131	mov	\$16,%rcx
3132	mov	$out,%rdi
3133	sub	$len,%rcx
3134	lea	(%rsp),%rsi
3135	.long	0x9066A4F3		# rep movsb
3136	movdqa	$inout0,(%rsp)
3137
3138.Lcbc_dec_ret:
3139	xorps	$rndkey0,$rndkey0	# %xmm0
3140	pxor	$rndkey1,$rndkey1
3141___
3142$code.=<<___ if ($win64);
3143	movaps	0x10(%rsp),%xmm6
3144	movaps	%xmm0,0x10(%rsp)	# clear stack
3145	movaps	0x20(%rsp),%xmm7
3146	movaps	%xmm0,0x20(%rsp)
3147	movaps	0x30(%rsp),%xmm8
3148	movaps	%xmm0,0x30(%rsp)
3149	movaps	0x40(%rsp),%xmm9
3150	movaps	%xmm0,0x40(%rsp)
3151	movaps	0x50(%rsp),%xmm10
3152	movaps	%xmm0,0x50(%rsp)
3153	movaps	0x60(%rsp),%xmm11
3154	movaps	%xmm0,0x60(%rsp)
3155	movaps	0x70(%rsp),%xmm12
3156	movaps	%xmm0,0x70(%rsp)
3157	movaps	0x80(%rsp),%xmm13
3158	movaps	%xmm0,0x80(%rsp)
3159	movaps	0x90(%rsp),%xmm14
3160	movaps	%xmm0,0x90(%rsp)
3161	movaps	0xa0(%rsp),%xmm15
3162	movaps	%xmm0,0xa0(%rsp)
3163___
3164$code.=<<___;
3165	mov	-8(%r11),%rbp
3166.cfi_restore	%rbp
3167	lea	(%r11),%rsp
3168.cfi_def_cfa_register	%rsp
3169.Lcbc_ret:
3170	ret
3171.cfi_endproc
3172.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
3173___
3174}
3175# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
3176#				int bits, AES_KEY *key)
3177#
3178# input:	$inp	user-supplied key
3179#		$bits	$inp length in bits
3180#		$key	pointer to key schedule
3181# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
3182#		*$key	key schedule
3183#
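# The schedule inversion below relies on the AES "equivalent inverse
# cipher": the decryption round keys are the encryption round keys taken in
# reverse order, with aesimc (InvMixColumns) applied to every key except
# the first and the last.  A reference sketch (hypothetical helper, never
# called by this file; $aesimc stands in for the AESIMC instruction):
sub _inverse_key_schedule_ref {
	my ($aesimc,@enc_rk) = @_;
	my @dec_rk = reverse @enc_rk;				# round keys in reverse order
	$dec_rk[$_] = $aesimc->($dec_rk[$_]) for 1..$#dec_rk-1;	# InvMixColumns on the middle keys
	return @dec_rk;
}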
3184{ my ($inp,$bits,$key) = @_4args;
3185  $bits =~ s/%r/%e/;		# the key-size arithmetic below only needs the 32-bit register
3186
3187$code.=<<___;
3188.globl	${PREFIX}_set_decrypt_key
3189.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
3190.align	16
3191${PREFIX}_set_decrypt_key:
3192.cfi_startproc
3193	_CET_ENDBR
3194	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
3195.cfi_adjust_cfa_offset	8
3196	call	__aesni_set_encrypt_key
3197	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
3198	test	%eax,%eax
3199	jnz	.Ldec_key_ret
3200	lea	16($key,$bits),$inp	# points at the end of key schedule
3201
3202	$movkey	($key),%xmm0		# just swap
3203	$movkey	($inp),%xmm1
3204	$movkey	%xmm0,($inp)
3205	$movkey	%xmm1,($key)
3206	lea	16($key),$key
3207	lea	-16($inp),$inp
3208
3209.Ldec_key_inverse:
3210	$movkey	($key),%xmm0		# swap and inverse
3211	$movkey	($inp),%xmm1
3212	aesimc	%xmm0,%xmm0
3213	aesimc	%xmm1,%xmm1
3214	lea	16($key),$key
3215	lea	-16($inp),$inp
3216	$movkey	%xmm0,16($inp)
3217	$movkey	%xmm1,-16($key)
3218	cmp	$key,$inp
3219	ja	.Ldec_key_inverse
3220
3221	$movkey	($key),%xmm0		# inverse middle
3222	aesimc	%xmm0,%xmm0
3223	pxor	%xmm1,%xmm1
3224	$movkey	%xmm0,($inp)
3225	pxor	%xmm0,%xmm0
3226.Ldec_key_ret:
3227	add	\$8,%rsp
3228.cfi_adjust_cfa_offset	-8
3229	ret
3230.cfi_endproc
3231.LSEH_end_set_decrypt_key:
3232.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
3233___
3234
3235# This is based on submission from Intel by
3236#	Huang Ying
3237#	Vinodh Gopal
3238#	Kahraman Akdemir
3239#
3240# Aggressively optimized with respect to aeskeygenassist's critical path;
3241# the working set is confined to %xmm0-5 to meet the Win64 ABI requirement.
3242#
3243# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
3244#				int bits, AES_KEY * const key);
3245#
3246# input:	$inp	user-supplied key
3247#		$bits	$inp length in bits
3248#		$key	pointer to key schedule
3249# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
3250#		$bits	rounds-1 (used in aesni_set_decrypt_key)
3251#		*$key	key schedule
3252#		$key	pointer to key schedule (used in
3253#			aesni_set_decrypt_key)
3254#
3255# The subroutine is frame-less, which means that only volatile registers
3256# are used. Note that it's declared "abi-omnipotent", which means that
3257# the set of volatile registers is smaller on Windows.
3258#
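# For the 128-bit schedule below, aeskeygenassist leaves
# SubWord(RotWord(w3))^rcon in the dword that .Lkey_expansion_128 broadcasts
# with shufps, and the shufps/xorps pairs then compute the running xor
# w[i] = w[i-4] ^ w[i-1] of the new round key.  A dword-level reference
# sketch of one such round (hypothetical helper, never called by this file):
sub _expand128_round_ref {
	my ($prev,$assist) = @_;	# $prev: [w0,w1,w2,w3]; $assist: SubWord(RotWord(w3))^rcon
	my @w;
	$w[0] = $prev->[0] ^ $assist;
	$w[$_] = $prev->[$_] ^ $w[$_-1] for 1..3;	# the running xor the shufps/xorps pairs compute
	return \@w;
}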
3259$code.=<<___;
3260.globl	${PREFIX}_set_encrypt_key
3261.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
3262.align	16
3263${PREFIX}_set_encrypt_key:
3264__aesni_set_encrypt_key:
3265.cfi_startproc
3266	_CET_ENDBR
3267#ifdef BORINGSSL_DISPATCH_TEST
3268	movb \$1,BORINGSSL_function_hit+3(%rip)
3269#endif
3270	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
3271.cfi_adjust_cfa_offset	8
3272	mov	\$-1,%rax
3273	test	$inp,$inp
3274	jz	.Lenc_key_ret
3275	test	$key,$key
3276	jz	.Lenc_key_ret
3277
3278	movups	($inp),%xmm0		# pull first 128 bits of *userKey
3279	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
3280	leaq	OPENSSL_ia32cap_P(%rip),%r10
3281	movl	4(%r10),%r10d
3282	and	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
3283	lea	16($key),%rax		# %rax is used as modifiable copy of $key
3284	cmp	\$256,$bits
3285	je	.L14rounds
3286	cmp	\$192,$bits
3287	je	.L12rounds
3288	cmp	\$128,$bits
3289	jne	.Lbad_keybits
3290
3291.L10rounds:
3292	mov	\$9,$bits			# 10 rounds for 128-bit key
3293	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
3294	je	.L10rounds_alt
3295
3296	$movkey	%xmm0,($key)			# round 0
3297	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
3298	call		.Lkey_expansion_128_cold
3299	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
3300	call		.Lkey_expansion_128
3301	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
3302	call		.Lkey_expansion_128
3303	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
3304	call		.Lkey_expansion_128
3305	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
3306	call		.Lkey_expansion_128
3307	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
3308	call		.Lkey_expansion_128
3309	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
3310	call		.Lkey_expansion_128
3311	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
3312	call		.Lkey_expansion_128
3313	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
3314	call		.Lkey_expansion_128
3315	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
3316	call		.Lkey_expansion_128
3317	$movkey	%xmm0,(%rax)
3318	mov	$bits,80(%rax)	# 240(%rdx)
3319	xor	%eax,%eax
3320	jmp	.Lenc_key_ret
3321
3322.align	16
3323.L10rounds_alt:
3324	movdqa	.Lkey_rotate(%rip),%xmm5
3325	mov	\$8,%r10d
3326	movdqa	.Lkey_rcon1(%rip),%xmm4
3327	movdqa	%xmm0,%xmm2
3328	movdqu	%xmm0,($key)
3329	jmp	.Loop_key128
3330
3331.align	16
3332.Loop_key128:
3333	pshufb		%xmm5,%xmm0
3334	aesenclast	%xmm4,%xmm0
3335	pslld		\$1,%xmm4
3336	lea		16(%rax),%rax
3337
3338	movdqa		%xmm2,%xmm3
3339	pslldq		\$4,%xmm2
3340	pxor		%xmm2,%xmm3
3341	pslldq		\$4,%xmm2
3342	pxor		%xmm2,%xmm3
3343	pslldq		\$4,%xmm2
3344	pxor		%xmm3,%xmm2
3345
3346	pxor		%xmm2,%xmm0
3347	movdqu		%xmm0,-16(%rax)
3348	movdqa		%xmm0,%xmm2
3349
3350	dec	%r10d
3351	jnz	.Loop_key128
3352
3353	movdqa		.Lkey_rcon1b(%rip),%xmm4
3354
3355	pshufb		%xmm5,%xmm0
3356	aesenclast	%xmm4,%xmm0
3357	pslld		\$1,%xmm4
3358
3359	movdqa		%xmm2,%xmm3
3360	pslldq		\$4,%xmm2
3361	pxor		%xmm2,%xmm3
3362	pslldq		\$4,%xmm2
3363	pxor		%xmm2,%xmm3
3364	pslldq		\$4,%xmm2
3365	pxor		%xmm3,%xmm2
3366
3367	pxor		%xmm2,%xmm0
3368	movdqu		%xmm0,(%rax)
3369
3370	movdqa		%xmm0,%xmm2
3371	pshufb		%xmm5,%xmm0
3372	aesenclast	%xmm4,%xmm0
3373
3374	movdqa		%xmm2,%xmm3
3375	pslldq		\$4,%xmm2
3376	pxor		%xmm2,%xmm3
3377	pslldq		\$4,%xmm2
3378	pxor		%xmm2,%xmm3
3379	pslldq		\$4,%xmm2
3380	pxor		%xmm3,%xmm2
3381
3382	pxor		%xmm2,%xmm0
3383	movdqu		%xmm0,16(%rax)
3384
3385	mov	$bits,96(%rax)	# 240($key)
3386	xor	%eax,%eax
3387	jmp	.Lenc_key_ret
3388
3389.align	16
3390.L12rounds:
3391	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
3392	mov	\$11,$bits			# 12 rounds for 192
3393	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
3394	je	.L12rounds_alt
3395
3396	$movkey	%xmm0,($key)			# round 0
3397	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
3398	call		.Lkey_expansion_192a_cold
3399	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
3400	call		.Lkey_expansion_192b
3401	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
3402	call		.Lkey_expansion_192a
3403	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
3404	call		.Lkey_expansion_192b
3405	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
3406	call		.Lkey_expansion_192a
3407	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
3408	call		.Lkey_expansion_192b
3409	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
3410	call		.Lkey_expansion_192a
3411	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
3412	call		.Lkey_expansion_192b
3413	$movkey	%xmm0,(%rax)
3414	mov	$bits,48(%rax)	# 240(%rdx)
3415	xor	%rax, %rax
3416	jmp	.Lenc_key_ret
3417
3418.align	16
3419.L12rounds_alt:
3420	movdqa	.Lkey_rotate192(%rip),%xmm5
3421	movdqa	.Lkey_rcon1(%rip),%xmm4
3422	mov	\$8,%r10d
3423	movdqu	%xmm0,($key)
3424	jmp	.Loop_key192
3425
3426.align	16
3427.Loop_key192:
3428	movq		%xmm2,0(%rax)
3429	movdqa		%xmm2,%xmm1
3430	pshufb		%xmm5,%xmm2
3431	aesenclast	%xmm4,%xmm2
3432	pslld		\$1, %xmm4
3433	lea		24(%rax),%rax
3434
3435	movdqa		%xmm0,%xmm3
3436	pslldq		\$4,%xmm0
3437	pxor		%xmm0,%xmm3
3438	pslldq		\$4,%xmm0
3439	pxor		%xmm0,%xmm3
3440	pslldq		\$4,%xmm0
3441	pxor		%xmm3,%xmm0
3442
3443	pshufd		\$0xff,%xmm0,%xmm3
3444	pxor		%xmm1,%xmm3
3445	pslldq		\$4,%xmm1
3446	pxor		%xmm1,%xmm3
3447
3448	pxor		%xmm2,%xmm0
3449	pxor		%xmm3,%xmm2
3450	movdqu		%xmm0,-16(%rax)
3451
3452	dec	%r10d
3453	jnz	.Loop_key192
3454
3455	mov	$bits,32(%rax)	# 240($key)
3456	xor	%eax,%eax
3457	jmp	.Lenc_key_ret
3458
3459.align	16
3460.L14rounds:
3461	movups	16($inp),%xmm2			# remaining half of *userKey
3462	mov	\$13,$bits			# 14 rounds for 256
3463	lea	16(%rax),%rax
3464	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
3465	je	.L14rounds_alt
3466
3467	$movkey	%xmm0,($key)			# round 0
3468	$movkey	%xmm2,16($key)			# round 1
3469	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
3470	call		.Lkey_expansion_256a_cold
3471	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
3472	call		.Lkey_expansion_256b
3473	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
3474	call		.Lkey_expansion_256a
3475	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
3476	call		.Lkey_expansion_256b
3477	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
3478	call		.Lkey_expansion_256a
3479	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
3480	call		.Lkey_expansion_256b
3481	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
3482	call		.Lkey_expansion_256a
3483	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
3484	call		.Lkey_expansion_256b
3485	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
3486	call		.Lkey_expansion_256a
3487	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
3488	call		.Lkey_expansion_256b
3489	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
3490	call		.Lkey_expansion_256a
3491	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
3492	call		.Lkey_expansion_256b
3493	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
3494	call		.Lkey_expansion_256a
3495	$movkey	%xmm0,(%rax)
3496	mov	$bits,16(%rax)	# 240(%rdx)
3497	xor	%rax,%rax
3498	jmp	.Lenc_key_ret
3499
3500.align	16
3501.L14rounds_alt:
3502	movdqa	.Lkey_rotate(%rip),%xmm5
3503	movdqa	.Lkey_rcon1(%rip),%xmm4
3504	mov	\$7,%r10d
3505	movdqu	%xmm0,0($key)
3506	movdqa	%xmm2,%xmm1
3507	movdqu	%xmm2,16($key)
3508	jmp	.Loop_key256
3509
3510.align	16
3511.Loop_key256:
3512	pshufb		%xmm5,%xmm2
3513	aesenclast	%xmm4,%xmm2
3514
3515	movdqa		%xmm0,%xmm3
3516	pslldq		\$4,%xmm0
3517	pxor		%xmm0,%xmm3
3518	pslldq		\$4,%xmm0
3519	pxor		%xmm0,%xmm3
3520	pslldq		\$4,%xmm0
3521	pxor		%xmm3,%xmm0
3522	pslld		\$1,%xmm4
3523
3524	pxor		%xmm2,%xmm0
3525	movdqu		%xmm0,(%rax)
3526
3527	dec	%r10d
3528	jz	.Ldone_key256
3529
3530	pshufd		\$0xff,%xmm0,%xmm2
3531	pxor		%xmm3,%xmm3
3532	aesenclast	%xmm3,%xmm2
3533
3534	movdqa		%xmm1,%xmm3
3535	pslldq		\$4,%xmm1
3536	pxor		%xmm1,%xmm3
3537	pslldq		\$4,%xmm1
3538	pxor		%xmm1,%xmm3
3539	pslldq		\$4,%xmm1
3540	pxor		%xmm3,%xmm1
3541
3542	pxor		%xmm1,%xmm2
3543	movdqu		%xmm2,16(%rax)
3544	lea		32(%rax),%rax
3545	movdqa		%xmm2,%xmm1
3546
3547	jmp	.Loop_key256
3548
3549.Ldone_key256:
3550	mov	$bits,16(%rax)	# 240($key)
3551	xor	%eax,%eax
3552	jmp	.Lenc_key_ret
3553
3554.align	16
3555.Lbad_keybits:
3556	mov	\$-2,%rax
3557.Lenc_key_ret:
3558	pxor	%xmm0,%xmm0
3559	pxor	%xmm1,%xmm1
3560	pxor	%xmm2,%xmm2
3561	pxor	%xmm3,%xmm3
3562	pxor	%xmm4,%xmm4
3563	pxor	%xmm5,%xmm5
3564	add	\$8,%rsp
3565.cfi_adjust_cfa_offset	-8
3566	ret
3567.cfi_endproc
3568.LSEH_end_set_encrypt_key:
3569
3570.align	16
3571.Lkey_expansion_128:
3572	$movkey	%xmm0,(%rax)
3573	lea	16(%rax),%rax
3574.Lkey_expansion_128_cold:
3575	shufps	\$0b00010000,%xmm0,%xmm4
3576	xorps	%xmm4, %xmm0
3577	shufps	\$0b10001100,%xmm0,%xmm4
3578	xorps	%xmm4, %xmm0
3579	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
3580	xorps	%xmm1,%xmm0
3581	ret
3582
3583.align 16
3584.Lkey_expansion_192a:
3585	$movkey	%xmm0,(%rax)
3586	lea	16(%rax),%rax
3587.Lkey_expansion_192a_cold:
3588	movaps	%xmm2, %xmm5
3589.Lkey_expansion_192b_warm:
3590	shufps	\$0b00010000,%xmm0,%xmm4
3591	movdqa	%xmm2,%xmm3
3592	xorps	%xmm4,%xmm0
3593	shufps	\$0b10001100,%xmm0,%xmm4
3594	pslldq	\$4,%xmm3
3595	xorps	%xmm4,%xmm0
3596	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
3597	pxor	%xmm3,%xmm2
3598	pxor	%xmm1,%xmm0
3599	pshufd	\$0b11111111,%xmm0,%xmm3
3600	pxor	%xmm3,%xmm2
3601	ret
3602
3603.align 16
3604.Lkey_expansion_192b:
3605	movaps	%xmm0,%xmm3
3606	shufps	\$0b01000100,%xmm0,%xmm5
3607	$movkey	%xmm5,(%rax)
3608	shufps	\$0b01001110,%xmm2,%xmm3
3609	$movkey	%xmm3,16(%rax)
3610	lea	32(%rax),%rax
3611	jmp	.Lkey_expansion_192b_warm
3612
3613.align	16
3614.Lkey_expansion_256a:
3615	$movkey	%xmm2,(%rax)
3616	lea	16(%rax),%rax
3617.Lkey_expansion_256a_cold:
3618	shufps	\$0b00010000,%xmm0,%xmm4
3619	xorps	%xmm4,%xmm0
3620	shufps	\$0b10001100,%xmm0,%xmm4
3621	xorps	%xmm4,%xmm0
3622	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
3623	xorps	%xmm1,%xmm0
3624	ret
3625
3626.align 16
3627.Lkey_expansion_256b:
3628	$movkey	%xmm0,(%rax)
3629	lea	16(%rax),%rax
3630
3631	shufps	\$0b00010000,%xmm2,%xmm4
3632	xorps	%xmm4,%xmm2
3633	shufps	\$0b10001100,%xmm2,%xmm4
3634	xorps	%xmm4,%xmm2
3635	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
3636	xorps	%xmm1,%xmm2
3637	ret
3638.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
3639.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
3640___
3641}
3642
3643$code.=<<___;
3644.section .rodata
3645.align	64
3646.Lbswap_mask:
3647	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
3648.Lincrement32:
3649	.long	6,6,6,0
3650.Lincrement64:
3651	.long	1,0,0,0
3652.Lxts_magic:
3653	.long	0x87,0,1,0
3654.Lincrement1:
3655	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3656.Lkey_rotate:
3657	.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
3658.Lkey_rotate192:
3659	.long	0x04070605,0x04070605,0x04070605,0x04070605
3660.Lkey_rcon1:
3661	.long	1,1,1,1
3662.Lkey_rcon1b:
3663	.long	0x1b,0x1b,0x1b,0x1b
3664
3665.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
3666.align	64
3667.text
3668___
3669
3670# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3671#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
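# The handlers below run when an exception unwinds through one of these
# routines on Win64: they locate the on-stack %xmm save area the prologue
# set up, copy it back into the CONTEXT record, restore the saved %rbp
# where the routine uses one, and then hand the frame to RtlVirtualUnwind
# via the common tail.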
3672if ($win64) {
3673$rec="%rcx";
3674$frame="%rdx";
3675$context="%r8";
3676$disp="%r9";
3677
3678$code.=<<___;
3679.extern	__imp_RtlVirtualUnwind
3680___
3681$code.=<<___ if ($PREFIX eq "aes_hw");
3682.type	ecb_ccm64_se_handler,\@abi-omnipotent
3683.align	16
3684ecb_ccm64_se_handler:
3685	push	%rsi
3686	push	%rdi
3687	push	%rbx
3688	push	%rbp
3689	push	%r12
3690	push	%r13
3691	push	%r14
3692	push	%r15
3693	pushfq
3694	sub	\$64,%rsp
3695
3696	mov	120($context),%rax	# pull context->Rax
3697	mov	248($context),%rbx	# pull context->Rip
3698
3699	mov	8($disp),%rsi		# disp->ImageBase
3700	mov	56($disp),%r11		# disp->HandlerData
3701
3702	mov	0(%r11),%r10d		# HandlerData[0]
3703	lea	(%rsi,%r10),%r10	# prologue label
3704	cmp	%r10,%rbx		# context->Rip<prologue label
3705	jb	.Lcommon_seh_tail
3706
3707	mov	152($context),%rax	# pull context->Rsp
3708
3709	mov	4(%r11),%r10d		# HandlerData[1]
3710	lea	(%rsi,%r10),%r10	# epilogue label
3711	cmp	%r10,%rbx		# context->Rip>=epilogue label
3712	jae	.Lcommon_seh_tail
3713
3714	lea	0(%rax),%rsi		# %xmm save area
3715	lea	512($context),%rdi	# &context.Xmm6
3716	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
3717	.long	0xa548f3fc		# cld; rep movsq
3718	lea	0x58(%rax),%rax		# adjust stack pointer
3719
3720	jmp	.Lcommon_seh_tail
3721.size	ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
3722
3723.type	ctr_xts_se_handler,\@abi-omnipotent
3724.align	16
3725ctr_xts_se_handler:
3726	push	%rsi
3727	push	%rdi
3728	push	%rbx
3729	push	%rbp
3730	push	%r12
3731	push	%r13
3732	push	%r14
3733	push	%r15
3734	pushfq
3735	sub	\$64,%rsp
3736
3737	mov	120($context),%rax	# pull context->Rax
3738	mov	248($context),%rbx	# pull context->Rip
3739
3740	mov	8($disp),%rsi		# disp->ImageBase
3741	mov	56($disp),%r11		# disp->HandlerData
3742
3743	mov	0(%r11),%r10d		# HandlerData[0]
3744	lea	(%rsi,%r10),%r10	# prologue label
3745	cmp	%r10,%rbx		# context->Rip<prologue label
3746	jb	.Lcommon_seh_tail
3747
3748	mov	152($context),%rax	# pull context->Rsp
3749
3750	mov	4(%r11),%r10d		# HandlerData[1]
3751	lea	(%rsi,%r10),%r10	# epilogue label
3752	cmp	%r10,%rbx		# context->Rip>=epilogue label
3753	jae	.Lcommon_seh_tail
3754
3755	mov	208($context),%rax	# pull context->R11
3756
3757	lea	-0xa8(%rax),%rsi	# %xmm save area
3758	lea	512($context),%rdi	# & context.Xmm6
3759	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
3760	.long	0xa548f3fc		# cld; rep movsq
3761
3762	mov	-8(%rax),%rbp		# restore saved %rbp
3763	mov	%rbp,160($context)	# restore context->Rbp
3764	jmp	.Lcommon_seh_tail
3765.size	ctr_xts_se_handler,.-ctr_xts_se_handler
3766
3767___
3768# BoringSSL omits the OCB functions.
3769$code.=<<___ if (0);
3770.type	ocb_se_handler,\@abi-omnipotent
3771.align	16
3772ocb_se_handler:
3773	push	%rsi
3774	push	%rdi
3775	push	%rbx
3776	push	%rbp
3777	push	%r12
3778	push	%r13
3779	push	%r14
3780	push	%r15
3781	pushfq
3782	sub	\$64,%rsp
3783
3784	mov	120($context),%rax	# pull context->Rax
3785	mov	248($context),%rbx	# pull context->Rip
3786
3787	mov	8($disp),%rsi		# disp->ImageBase
3788	mov	56($disp),%r11		# disp->HandlerData
3789
3790	mov	0(%r11),%r10d		# HandlerData[0]
3791	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10
	cmp	%r10,%rbx		# context->Rip>=pop label
	jae	.Locb_no_xmm

	mov	152($context),%rax	# pull context->Rsp

	lea	(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0+0x28(%rax),%rax

.Locb_no_xmm:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	ocb_se_handler,.-ocb_se_handler
___
$code.=<<___;
.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	152($context),%rax	# pull context->Rsp
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_decrypt_bulk(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<"prologue" label
	jb	.Lcommon_seh_tail

	mov	120($context),%rax	# pull context->Rax

	lea	.Lcbc_decrypt_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lcbc_ret(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>="epilogue" label
	jae	.Lcommon_seh_tail

	lea	16(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	mov	208($context),%rax	# pull context->R11

	mov	-8(%rax),%rbp		# restore saved %rbp
	mov	%rbp,160($context)	# restore context->Rbp

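	# Common tail for all handlers: write the recovered stack pointer
	# and %rsi/%rdi back into the CONTEXT record, then chain to
	# RtlVirtualUnwind and return ExceptionContinueSearch.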
.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	cbc_se_handler,.-cbc_se_handler

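# Win64 unwind tables: each .pdata triplet maps a function's code range
# to an .xdata entry naming its SEH handler and HandlerData labels.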
.section	.pdata
.align	4
___
$code.=<<___ if ($PREFIX eq "aes_hw");
	.rva	.LSEH_begin_${PREFIX}_ecb_encrypt
	.rva	.LSEH_end_${PREFIX}_ecb_encrypt
	.rva	.LSEH_info_ecb

	.rva	.LSEH_begin_${PREFIX}_ctr32_encrypt_blocks
	.rva	.LSEH_end_${PREFIX}_ctr32_encrypt_blocks
	.rva	.LSEH_info_ctr32
___
$code.=<<___;
	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
	.rva	.LSEH_info_cbc

	.rva	${PREFIX}_set_decrypt_key
	.rva	.LSEH_end_set_decrypt_key
	.rva	.LSEH_info_key

	.rva	${PREFIX}_set_encrypt_key
	.rva	.LSEH_end_set_encrypt_key
	.rva	.LSEH_info_key
.section	.xdata
.align	8
___
$code.=<<___ if ($PREFIX eq "aes_hw");
.LSEH_info_ecb:
	.byte	9,0,0,0
	.rva	ecb_ccm64_se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_ret		# HandlerData[]
.LSEH_info_ctr32:
	.byte	9,0,0,0
	.rva	ctr_xts_se_handler
	.rva	.Lctr32_body,.Lctr32_epilogue		# HandlerData[]
___
$code.=<<___;
.LSEH_info_cbc:
	.byte	9,0,0,0
	.rva	cbc_se_handler
.LSEH_info_key:
	.byte	0x01,0x04,0x01,0x00
	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
___
}

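# rex() appends a REX prefix to @opcode when either xmm operand is one of
# the extended registers xmm8-xmm15: bit R (0x04) for the destination in
# the ModR/M reg field, bit B (0x01) for the source in the r/m field.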
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    push @opcode,$rex|0x40	if($rex);
}

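# aesni() translates AES-NI mnemonics into raw ".byte" sequences so that
# the module still assembles with toolchains that do not recognize these
# instructions, e.g. "aesenc %xmm1,%xmm2" becomes
# ".byte 0x66,0x0f,0x38,0xdc,0xd1".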
sub aesni {
  my $line=shift;
  my @opcode=(0x66);

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	rex(\@opcode,$4,$3);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($3&7)|(($4&7)<<3);	# ModR/M
	my $c=$2;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	rex(\@opcode,$3,$2);
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	my $off = $2;
	push @opcode,0x44 if ($3>=8);
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0x44|(($3&7)<<3),0x24;	# ModR/M
	push @opcode,($off=~/^0/?oct($off):$off)&0xff;
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}

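# movbe() emits "movbe %eax,OFF(%rsp)" as raw bytes (0F 38 F1 /r with an
# %rsp-based SIB byte and an 8-bit displacement), again for assemblers
# that do not know the instruction.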
sub movbe {
	".byte	0x0f,0x38,0xf1,0x44,0x24,".shift;
}

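# Post-processing: evaluate `...` expressions embedded in the code, then
# rewrite aes* and movbe mnemonics through the helpers above before the
# final text is printed.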
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";