• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#! /usr/bin/env perl
2# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
20# details].
21#
22# Performance.
23#
24# Given aes(enc|dec) instructions' latency asymptotic performance for
25# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
26# processed with 128-bit key. And given their throughput asymptotic
27# performance for parallelizable modes is 1.25 cycles per byte. Being
28# asymptotic limit it's not something you commonly achieve in reality,
29# but how close does one get? Below are results collected for
30# different modes and block sizes. Pairs of numbers are for en-/
31# decryption.
32#
33#	16-byte     64-byte     256-byte    1-KB        8-KB
34# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
35# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
36# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
37# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
38# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
39# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
40#
41# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
43# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
44# The results were collected with specially crafted speed.c benchmark
45# in order to compare them with results reported in "Intel Advanced
46# Encryption Standard (AES) New Instruction Set" White Paper Revision
47# 3.0 dated May 2010. All above results are consistently better. This
48# module also provides better performance for block sizes smaller than
49# 128 bytes in points *not* represented in the above table.
50#
51# Looking at the results for 8-KB buffer.
52#
53# CFB and OFB results are far from the limit, because implementation
54# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55# single-block aesni_encrypt, which is not the most optimal way to go.
56# CBC encrypt result is unexpectedly high and there is no documented
57# explanation for it. Seemingly there is a small penalty for feeding
58# the result back to AES unit the way it's done in CBC mode. There is
59# nothing one can do and the result appears optimal. CCM result is
60# identical to CBC, because CBC-MAC is essentially CBC encrypt without
61# saving output. CCM CTR "stays invisible," because it's neatly
62# interleaved with CBC-MAC. This provides ~30% improvement over
63# "straightforward" CCM implementation with CTR and CBC-MAC performed
64# disjointly. Parallelizable modes practically achieve the theoretical
65# limit.
66#
67# Looking at how results vary with buffer size.
68#
69# Curves are practically saturated at 1-KB buffer size. In most cases
70# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
71# CTR curve doesn't follow this pattern and is "slowest" changing one
72# with "256-byte" result being 87% of "8-KB." This is because overhead
73# in CTR mode is most computationally intensive. Small-block CCM
74# decrypt is slower than encrypt, because first CTR and last CBC-MAC
75# iterations can't be interleaved.
76#
77# Results for 192- and 256-bit keys.
78#
79# EVP-free results were observed to scale perfectly with number of
80# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
81# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
82# are a tad smaller, because the above mentioned penalty biases all
83# results by same constant value. In similar way function call
84# overhead affects small-block performance, as well as OFB and CFB
85# results. Differences are not large, most common coefficients are
86# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one can
87# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
88
89# January 2011
90#
91# While Westmere processor features 6 cycles latency for aes[enc|dec]
92# instructions, which can be scheduled every second cycle, Sandy
93# Bridge spends 8 cycles per instruction, but it can schedule them
94# every cycle. This means that code targeting Westmere would perform
95# suboptimally on Sandy Bridge. Therefore this update.
96#
97# In addition, non-parallelizable CBC encrypt (as well as CCM) is
98# optimized. Relative improvement might appear modest, 8% on Westmere,
99# but in absolute terms it's 3.77 cycles per byte encrypted with
100# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101# should be compared to asymptotic limits of 3.75 for Westmere and
102# 5.00 for Sandy Bridge. Actually, the fact that they get this close
103# to asymptotic limits is quite amazing. Indeed, the limit is
104# calculated as latency times number of rounds, 10 for 128-bit key,
105# and divided by 16, the number of bytes in block, or in other words
106# it accounts *solely* for aesenc instructions. But there are extra
107# instructions, and numbers so close to the asymptotic limits mean
108# that it's as if it takes as little as *one* additional cycle to
109# execute all of them. How is it possible? It is possible thanks to
110# out-of-order execution logic, which manages to overlap post-
111# processing of previous block, things like saving the output, with
112# actual encryption of current block, as well as pre-processing of
113# current block, things like fetching input and xor-ing it with
114# 0-round element of the key schedule, with actual encryption of
115# previous block. Keep this in mind...
116#
117# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
118# performance is achieved by interleaving instructions working on
119# independent blocks. In which case asymptotic limit for such modes
120# can be obtained by dividing above mentioned numbers by AES
121# instructions' interleave factor. Westmere can execute at most 3
122# instructions at a time, meaning that optimal interleave factor is 3,
123# and that's where the "magic" number of 1.25 comes from. "Optimal
124# interleave factor" means that increase of interleave factor does
125# not improve performance. The formula has proven to reflect reality
126# pretty well on Westmere... Sandy Bridge on the other hand can
127# execute up to 8 AES instructions at a time, so how does varying
128# interleave factor affect the performance? Here is table for ECB
129# (numbers are cycles per byte processed with 128-bit key):
130#
131# instruction interleave factor		3x	6x	8x
132# theoretical asymptotic limit		1.67	0.83	0.625
133# measured performance for 8KB block	1.05	0.86	0.84
134#
135# "as if" interleave factor		4.7x	5.8x	6.0x
136#
137# Further data for other parallelizable modes:
138#
139# CBC decrypt				1.16	0.93	0.74
140# CTR					1.14	0.91	0.74
141#
142# Well, given 3x column it's probably inappropriate to call the limit
143# asymptotic, if it can be surpassed, isn't it? What happens there?
144# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
145# magic is responsible for this. Processor overlaps not only the
146# additional instructions with AES ones, but even AES instructions
147# processing adjacent triplets of independent blocks. In the 6x case
148# additional instructions  still claim disproportionally small amount
149# of additional cycles, but in 8x case number of instructions must be
150# a tad too high for out-of-order logic to cope with, and AES unit
151# remains underutilized... As you can see 8x interleave is hardly
152# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
153# utilizes 6x interleave because of limited register bank capacity.
154#
155# Higher interleave factors do have negative impact on Westmere
156# performance. While for ECB mode it's negligible ~1.5%, other
157# parallelizables perform ~5% worse, which is outweighed by ~25%
158# improvement on Sandy Bridge. To balance regression on Westmere
159# CTR mode was implemented with 6x aesenc interleave factor.
160
161# April 2011
162#
163# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
165# in CTR mode AES instruction interleave factor was chosen to be 6x.
166
167# November 2015
168#
169# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
170# chosen to be 6x.
171
172######################################################################
173# Current large-block performance in cycles per byte processed with
174# 128-bit key (less is better).
175#
176#		CBC en-/decrypt	CTR	XTS	ECB	OCB
177# Westmere	3.77/1.25	1.25	1.25	1.26
178# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
179# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
180# Skylake	2.62/0.63	0.63	0.63	0.63
181# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
182# Knights L	2.54/0.77	0.78	0.85	-	1.50
183# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
184# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
185# Ryzen		2.71/0.35	0.35	0.44	0.38	0.49
186#
187# (*)	Atom Silvermont ECB result is suboptimal because of penalties
188#	incurred by operations on %xmm8-15. As ECB is not considered
189#	critical, nothing was done to mitigate the problem.
190
# Symbol prefix applied to every public routine emitted by this script.
191$PREFIX="aes_hw";	# if $PREFIX is set to "AES", the script
192			# generates drop-in replacement for
193			# crypto/aes/asm/aes-x86_64.pl:-)
194
# Command line is "[flavour] output". A first argument that contains a dot
# is taken to be the output file name, leaving the flavour undefined.
195$flavour = shift;
196$output  = shift;
197if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
198
# Win64 conventions are selected by an nasm/masm/mingw64 flavour or by an
# output name ending in ".asm".
199$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
200
# Locate the perlasm translator: first in this script's own directory,
# then under ../../../perlasm/ relative to it.
201$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
202( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
203( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
204die "can't locate x86_64-xlate.pl";
205
# Everything printed to STDOUT from here on is piped through the translator.
# The open is checked so that a translator that cannot be started stops the
# script with a diagnostic instead of silently producing no output.
206open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
207*STDOUT=*OUT;
208
# Instruction used to load round keys. Both arms of the conditional are
# "movups", so the choice does not currently depend on $PREFIX.
209$movkey = $PREFIX eq "aes_hw" ? "movups" : "movups";
# First four integer argument registers of the target calling convention.
210@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
211		("%rdi","%rsi","%rdx","%rcx");	# Unix order
212
# $code accumulates the assembly text generated by the rest of the script.
213$code=".text\n";
214$code.=".extern	OPENSSL_ia32cap_P\n";
215
216$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
217# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
218$inp="%rdi";
219$out="%rsi";
220$len="%rdx";
221$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
222$ivp="%r8";	# cbc, ctr, ...
223
224$rnds_="%r10d";	# backup copy for $rounds
225$key_="%r11";	# backup copy for $key
226
227# %xmm register layout
# $rndkey0/$rndkey1 hold round keys; $inout0..$inout7 hold up to eight
# data blocks.
228$rndkey0="%xmm0";	$rndkey1="%xmm1";
229$inout0="%xmm2";	$inout1="%xmm3";
230$inout2="%xmm4";	$inout3="%xmm5";
231$inout4="%xmm6";	$inout5="%xmm7";
232$inout6="%xmm8";	$inout7="%xmm9";
233
# Note: $in2/$in1/$in0/$iv name the same registers as $inout4..$inout7
# (%xmm6..%xmm9), so the two sets cannot be live at the same time.
234$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
235$in0="%xmm8";		$iv="%xmm9";
236
237# Inline version of internal aesni_[en|de]crypt1.
238#
239# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
240# cycles which take care of loop variables...
#
# aesni_generate1($p,$key,$rounds,$inout,$ivec) appends a single-block
# cipher sequence to $code:
#   $p      - "enc" or "dec" at the visible call sites; spliced into the
#             aes${p}/aes${p}last mnemonics and into the loop label
#   $key    - register holding the key schedule pointer (advanced by the
#             generated code)
#   $rounds - register holding the round count (decremented to zero)
#   $inout  - data register; $inout0 is used when this is omitted
#   $ivec   - optional register; when given, round key 0 is xor-ed into it
#             and it is then xor-ed into $inout, otherwise round key 0 is
#             xor-ed straight into $inout
# $sn counts invocations so that every expansion gets its own loop label.
241{ my $sn;
242sub aesni_generate1 {
243my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
244++$sn;
# Load the first two round keys.
245$code.=<<___;
246	$movkey	($key),$rndkey0
247	$movkey	16($key),$rndkey1
248___
# Initial key addition, with or without the extra $ivec operand.
249$code.=<<___ if (defined($ivec));
250	xorps	$rndkey0,$ivec
251	lea	32($key),$key
252	xorps	$ivec,$inout
253___
254$code.=<<___ if (!defined($ivec));
255	lea	32($key),$key
256	xorps	$rndkey0,$inout
257___
# One round per loop iteration, followed by the final aes${p}last round.
258$code.=<<___;
259.Loop_${p}1_$sn:
260	aes${p}	$rndkey1,$inout
261	dec	$rounds
262	$movkey	($key),$rndkey1
263	lea	16($key),$key
264	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
265	aes${p}last	$rndkey1,$inout
266___
267}}
268# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
269#
# Emits the public single-block routines ${PREFIX}_encrypt and
# ${PREFIX}_decrypt. Inside this scope $inp, $out and $key are lexicals bound
# to the first three entries of @_4args, shadowing the file-level names.
# Both routines load one block from ($inp), read the round count from offset
# 240 of the key structure, run the inline single-block sequence, store the
# result to ($out) and zero $rndkey0, $rndkey1 and $inout0 before returning.
270{ my ($inp,$out,$key) = @_4args;
271
# Only the encrypt routine carries the BORINGSSL_function_hit marker, and
# only when neither NDEBUG nor BORINGSSL_FIPS is defined.
272$code.=<<___;
273.globl	${PREFIX}_encrypt
274.type	${PREFIX}_encrypt,\@abi-omnipotent
275.align	16
276${PREFIX}_encrypt:
277.cfi_startproc
278#ifndef NDEBUG
279#ifndef BORINGSSL_FIPS
280.extern	BORINGSSL_function_hit
281	movb \$1,BORINGSSL_function_hit+1(%rip)
282#endif
283#endif
284	movups	($inp),$inout0		# load input
285	mov	240($key),$rounds	# key->rounds
286___
# Inline the encryption rounds; the data register defaults to $inout0.
287	&aesni_generate1("enc",$key,$rounds);
288$code.=<<___;
289	 pxor	$rndkey0,$rndkey0	# clear register bank
290	 pxor	$rndkey1,$rndkey1
291	movups	$inout0,($out)		# output
292	 pxor	$inout0,$inout0
293	ret
294.cfi_endproc
295.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
296
297.globl	${PREFIX}_decrypt
298.type	${PREFIX}_decrypt,\@abi-omnipotent
299.align	16
300${PREFIX}_decrypt:
301.cfi_startproc
302	movups	($inp),$inout0		# load input
303	mov	240($key),$rounds	# key->rounds
304___
# Inline the decryption rounds; the data register defaults to $inout0.
305	&aesni_generate1("dec",$key,$rounds);
306$code.=<<___;
307	 pxor	$rndkey0,$rndkey0	# clear register bank
308	 pxor	$rndkey1,$rndkey1
309	movups	$inout0,($out)		# output
310	 pxor	$inout0,$inout0
311	ret
312.cfi_endproc
313.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
314___
315}
316
317# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
318# factor. Why were 3x subroutines originally used in loops? Even though
319# aes[enc|dec] latency was originally 6, it could be scheduled only
320# every *2nd* cycle. Thus 3x interleave was the one providing optimal
321# utilization, i.e. when subroutine's throughput is virtually same as
322# of non-interleaved subroutine [for number of input blocks up to 3].
323# This is why it originally made no sense to implement 2x subroutine.
324# But times change and it became appropriate to spend extra 192 bytes
325# on 2x subroutine on Atom Silvermont account. For processors that
326# can schedule aes[enc|dec] every cycle optimal interleave factor
327# equals to corresponding instructions latency. 8x is optimal for
328# * Bridge and "super-optimal" for other Intel CPUs...
329
# aesni_generate2($dir) appends the private routine _aesni_${dir}rypt2 to
# $code. $dir ("enc" or "dec" at the visible call sites) is spliced into the
# routine name, the loop label and the aes mnemonics.
330sub aesni_generate2 {
331my $dir=shift;
332# As already mentioned it takes in $key and $rounds, which are *not*
333# preserved. $inout[0-1] is cipher/clear text...
# The generated code multiplies $rounds by 16, advances $key by
# 32+16*rounds and then walks a negative offset in %rax up towards zero,
# consuming two round keys per loop iteration. %rax is written literally,
# so this relies on $rounds naming %eax.
334$code.=<<___;
335.type	_aesni_${dir}rypt2,\@abi-omnipotent
336.align	16
337_aesni_${dir}rypt2:
338.cfi_startproc
339	$movkey	($key),$rndkey0
340	shl	\$4,$rounds
341	$movkey	16($key),$rndkey1
342	xorps	$rndkey0,$inout0
343	xorps	$rndkey0,$inout1
344	$movkey	32($key),$rndkey0
345	lea	32($key,$rounds),$key
346	neg	%rax				# $rounds
347	add	\$16,%rax
348
349.L${dir}_loop2:
350	aes${dir}	$rndkey1,$inout0
351	aes${dir}	$rndkey1,$inout1
352	$movkey		($key,%rax),$rndkey1
353	add		\$32,%rax
354	aes${dir}	$rndkey0,$inout0
355	aes${dir}	$rndkey0,$inout1
356	$movkey		-16($key,%rax),$rndkey0
357	jnz		.L${dir}_loop2
358
359	aes${dir}	$rndkey1,$inout0
360	aes${dir}	$rndkey1,$inout1
361	aes${dir}last	$rndkey0,$inout0
362	aes${dir}last	$rndkey0,$inout1
363	ret
364.cfi_endproc
365.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
366___
367}
# aesni_generate3($dir) appends the private routine _aesni_${dir}rypt3 to
# $code. $dir ("enc" or "dec" at the visible call sites) is spliced into the
# routine name, the loop label and the aes mnemonics.
368sub aesni_generate3 {
369my $dir=shift;
370# As already mentioned it takes in $key and $rounds, which are *not*
371# preserved. $inout[0-2] is cipher/clear text...
# Same structure as the 2x routine, with every round applied to three
# data registers. The loop counter is a negative offset in %rax (written
# literally, so $rounds must name %eax) that steps by 32 up to zero.
372$code.=<<___;
373.type	_aesni_${dir}rypt3,\@abi-omnipotent
374.align	16
375_aesni_${dir}rypt3:
376.cfi_startproc
377	$movkey	($key),$rndkey0
378	shl	\$4,$rounds
379	$movkey	16($key),$rndkey1
380	xorps	$rndkey0,$inout0
381	xorps	$rndkey0,$inout1
382	xorps	$rndkey0,$inout2
383	$movkey	32($key),$rndkey0
384	lea	32($key,$rounds),$key
385	neg	%rax				# $rounds
386	add	\$16,%rax
387
388.L${dir}_loop3:
389	aes${dir}	$rndkey1,$inout0
390	aes${dir}	$rndkey1,$inout1
391	aes${dir}	$rndkey1,$inout2
392	$movkey		($key,%rax),$rndkey1
393	add		\$32,%rax
394	aes${dir}	$rndkey0,$inout0
395	aes${dir}	$rndkey0,$inout1
396	aes${dir}	$rndkey0,$inout2
397	$movkey		-16($key,%rax),$rndkey0
398	jnz		.L${dir}_loop3
399
400	aes${dir}	$rndkey1,$inout0
401	aes${dir}	$rndkey1,$inout1
402	aes${dir}	$rndkey1,$inout2
403	aes${dir}last	$rndkey0,$inout0
404	aes${dir}last	$rndkey0,$inout1
405	aes${dir}last	$rndkey0,$inout2
406	ret
407.cfi_endproc
408.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
409___
410}
411# 4x interleave is implemented to improve small block performance,
412# most notably [and naturally] 4 block by ~30%. One can argue that one
413# should have implemented 5x as well, but improvement would be <20%,
414# so it's not worth it...
# aesni_generate4($dir) appends the private routine _aesni_${dir}rypt4 to
# $code. $dir ("enc" or "dec" at the visible call sites) is spliced into the
# routine name, the loop label and the aes mnemonics.
415sub aesni_generate4 {
416my $dir=shift;
417# As already mentioned it takes in $key and $rounds, which are *not*
418# preserved. $inout[0-3] is cipher/clear text...
# Same structure as the 2x/3x routines, with every round applied to four
# data registers. %rax is written literally, so $rounds must name %eax.
# NOTE(review): the raw ".byte 0x0f,0x1f,0x00" before the loop is presumably
# a 3-byte NOP used as padding - confirm against the instruction encoding.
419$code.=<<___;
420.type	_aesni_${dir}rypt4,\@abi-omnipotent
421.align	16
422_aesni_${dir}rypt4:
423.cfi_startproc
424	$movkey	($key),$rndkey0
425	shl	\$4,$rounds
426	$movkey	16($key),$rndkey1
427	xorps	$rndkey0,$inout0
428	xorps	$rndkey0,$inout1
429	xorps	$rndkey0,$inout2
430	xorps	$rndkey0,$inout3
431	$movkey	32($key),$rndkey0
432	lea	32($key,$rounds),$key
433	neg	%rax				# $rounds
434	.byte	0x0f,0x1f,0x00
435	add	\$16,%rax
436
437.L${dir}_loop4:
438	aes${dir}	$rndkey1,$inout0
439	aes${dir}	$rndkey1,$inout1
440	aes${dir}	$rndkey1,$inout2
441	aes${dir}	$rndkey1,$inout3
442	$movkey		($key,%rax),$rndkey1
443	add		\$32,%rax
444	aes${dir}	$rndkey0,$inout0
445	aes${dir}	$rndkey0,$inout1
446	aes${dir}	$rndkey0,$inout2
447	aes${dir}	$rndkey0,$inout3
448	$movkey		-16($key,%rax),$rndkey0
449	jnz		.L${dir}_loop4
450
451	aes${dir}	$rndkey1,$inout0
452	aes${dir}	$rndkey1,$inout1
453	aes${dir}	$rndkey1,$inout2
454	aes${dir}	$rndkey1,$inout3
455	aes${dir}last	$rndkey0,$inout0
456	aes${dir}last	$rndkey0,$inout1
457	aes${dir}last	$rndkey0,$inout2
458	aes${dir}last	$rndkey0,$inout3
459	ret
460.cfi_endproc
461.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
462___
463}
# aesni_generate6($dir) appends the private routine _aesni_${dir}rypt6 to
# $code. $dir ("enc" or "dec" at the visible call sites) is spliced into the
# routine name, the labels and the aes mnemonics.
464sub aesni_generate6 {
465my $dir=shift;
466# As already mentioned it takes in $key and $rounds, which are *not*
467# preserved. $inout[0-5] is cipher/clear text...
# Unlike the 2x-4x routines, the preamble already issues the first round
# for $inout0-2 while it is still xor-ing round key 0 into $inout3-5. The
# loop is therefore entered at .L${dir}_loop6_enter, which completes that
# first round for $inout3-5 before the regular two-rounds-per-iteration
# pattern takes over. %rax is written literally, so $rounds must name %eax.
468$code.=<<___;
469.type	_aesni_${dir}rypt6,\@abi-omnipotent
470.align	16
471_aesni_${dir}rypt6:
472.cfi_startproc
473	$movkey		($key),$rndkey0
474	shl		\$4,$rounds
475	$movkey		16($key),$rndkey1
476	xorps		$rndkey0,$inout0
477	pxor		$rndkey0,$inout1
478	pxor		$rndkey0,$inout2
479	aes${dir}	$rndkey1,$inout0
480	lea		32($key,$rounds),$key
481	neg		%rax			# $rounds
482	aes${dir}	$rndkey1,$inout1
483	pxor		$rndkey0,$inout3
484	pxor		$rndkey0,$inout4
485	aes${dir}	$rndkey1,$inout2
486	pxor		$rndkey0,$inout5
487	$movkey		($key,%rax),$rndkey0
488	add		\$16,%rax
489	jmp		.L${dir}_loop6_enter
490.align	16
491.L${dir}_loop6:
492	aes${dir}	$rndkey1,$inout0
493	aes${dir}	$rndkey1,$inout1
494	aes${dir}	$rndkey1,$inout2
495.L${dir}_loop6_enter:
496	aes${dir}	$rndkey1,$inout3
497	aes${dir}	$rndkey1,$inout4
498	aes${dir}	$rndkey1,$inout5
499	$movkey		($key,%rax),$rndkey1
500	add		\$32,%rax
501	aes${dir}	$rndkey0,$inout0
502	aes${dir}	$rndkey0,$inout1
503	aes${dir}	$rndkey0,$inout2
504	aes${dir}	$rndkey0,$inout3
505	aes${dir}	$rndkey0,$inout4
506	aes${dir}	$rndkey0,$inout5
507	$movkey		-16($key,%rax),$rndkey0
508	jnz		.L${dir}_loop6
509
510	aes${dir}	$rndkey1,$inout0
511	aes${dir}	$rndkey1,$inout1
512	aes${dir}	$rndkey1,$inout2
513	aes${dir}	$rndkey1,$inout3
514	aes${dir}	$rndkey1,$inout4
515	aes${dir}	$rndkey1,$inout5
516	aes${dir}last	$rndkey0,$inout0
517	aes${dir}last	$rndkey0,$inout1
518	aes${dir}last	$rndkey0,$inout2
519	aes${dir}last	$rndkey0,$inout3
520	aes${dir}last	$rndkey0,$inout4
521	aes${dir}last	$rndkey0,$inout5
522	ret
523.cfi_endproc
524.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
525___
526}
# aesni_generate8($dir) appends the private routine _aesni_${dir}rypt8 to
# $code. $dir ("enc" or "dec" at the visible call sites) is spliced into the
# routine name, the labels and the aes mnemonics.
527sub aesni_generate8 {
528my $dir=shift;
529# As already mentioned it takes in $key and $rounds, which are *not*
530# preserved. $inout[0-7] is cipher/clear text...
# The preamble issues the first round for $inout0-1 while the round key 0
# xor of the remaining registers is still in progress, then jumps to
# .L${dir}_loop8_inner, which completes that round for $inout2-7.
# .L${dir}_loop8_enter is defined but not referenced inside this routine;
# presumably it is a jump target for code elsewhere in the file - confirm.
# %rax is written literally, so $rounds must name %eax.
531$code.=<<___;
532.type	_aesni_${dir}rypt8,\@abi-omnipotent
533.align	16
534_aesni_${dir}rypt8:
535.cfi_startproc
536	$movkey		($key),$rndkey0
537	shl		\$4,$rounds
538	$movkey		16($key),$rndkey1
539	xorps		$rndkey0,$inout0
540	xorps		$rndkey0,$inout1
541	pxor		$rndkey0,$inout2
542	pxor		$rndkey0,$inout3
543	pxor		$rndkey0,$inout4
544	lea		32($key,$rounds),$key
545	neg		%rax			# $rounds
546	aes${dir}	$rndkey1,$inout0
547	pxor		$rndkey0,$inout5
548	pxor		$rndkey0,$inout6
549	aes${dir}	$rndkey1,$inout1
550	pxor		$rndkey0,$inout7
551	$movkey		($key,%rax),$rndkey0
552	add		\$16,%rax
553	jmp		.L${dir}_loop8_inner
554.align	16
555.L${dir}_loop8:
556	aes${dir}	$rndkey1,$inout0
557	aes${dir}	$rndkey1,$inout1
558.L${dir}_loop8_inner:
559	aes${dir}	$rndkey1,$inout2
560	aes${dir}	$rndkey1,$inout3
561	aes${dir}	$rndkey1,$inout4
562	aes${dir}	$rndkey1,$inout5
563	aes${dir}	$rndkey1,$inout6
564	aes${dir}	$rndkey1,$inout7
565.L${dir}_loop8_enter:
566	$movkey		($key,%rax),$rndkey1
567	add		\$32,%rax
568	aes${dir}	$rndkey0,$inout0
569	aes${dir}	$rndkey0,$inout1
570	aes${dir}	$rndkey0,$inout2
571	aes${dir}	$rndkey0,$inout3
572	aes${dir}	$rndkey0,$inout4
573	aes${dir}	$rndkey0,$inout5
574	aes${dir}	$rndkey0,$inout6
575	aes${dir}	$rndkey0,$inout7
576	$movkey		-16($key,%rax),$rndkey0
577	jnz		.L${dir}_loop8
578
579	aes${dir}	$rndkey1,$inout0
580	aes${dir}	$rndkey1,$inout1
581	aes${dir}	$rndkey1,$inout2
582	aes${dir}	$rndkey1,$inout3
583	aes${dir}	$rndkey1,$inout4
584	aes${dir}	$rndkey1,$inout5
585	aes${dir}	$rndkey1,$inout6
586	aes${dir}	$rndkey1,$inout7
587	aes${dir}last	$rndkey0,$inout0
588	aes${dir}last	$rndkey0,$inout1
589	aes${dir}last	$rndkey0,$inout2
590	aes${dir}last	$rndkey0,$inout3
591	aes${dir}last	$rndkey0,$inout4
592	aes${dir}last	$rndkey0,$inout5
593	aes${dir}last	$rndkey0,$inout6
594	aes${dir}last	$rndkey0,$inout7
595	ret
596.cfi_endproc
597.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
598___
599}
# Emit the private 2x/3x/4x/6x/8x helpers. The decrypt variants are always
# generated; the encrypt variants only when $PREFIX is "aes_hw".
600&aesni_generate2("enc") if ($PREFIX eq "aes_hw");
601&aesni_generate2("dec");
602&aesni_generate3("enc") if ($PREFIX eq "aes_hw");
603&aesni_generate3("dec");
604&aesni_generate4("enc") if ($PREFIX eq "aes_hw");
605&aesni_generate4("dec");
606&aesni_generate6("enc") if ($PREFIX eq "aes_hw");
607&aesni_generate6("dec");
608&aesni_generate8("enc") if ($PREFIX eq "aes_hw");
609&aesni_generate8("dec");
610
611if ($PREFIX eq "aes_hw") {
612########################################################################
613# void aesni_ecb_encrypt (const void *in, void *out,
614#			  size_t length, const AES_KEY *key,
615#			  int enc);
# Emits ${PREFIX}_ecb_encrypt, declared as a five-argument function.
# Outline of the generated code:
#   - the length is rounded down to a multiple of 16; a zero result returns
#     immediately;
#   - $key and $rounds are copied to $key_ and $rnds_ so they can be
#     restored after each helper call;
#   - a zero fifth argument (tested in %r8d) selects the decrypt path;
#   - inputs of 8 blocks or more go through the 8x helper in a loop, and the
#     remaining 1..7 blocks are dispatched to the inline single-block code
#     or to the 2x/3x/4x/6x/8x helpers (5 blocks use the 6x helper and
#     7 blocks the 8x helper, with the spare data register zeroed first);
#   - the decrypt path additionally zeroes each data register after storing
#     it.
616$code.=<<___;
617.globl	${PREFIX}_ecb_encrypt
618.type	${PREFIX}_ecb_encrypt,\@function,5
619.align	16
620${PREFIX}_ecb_encrypt:
621.cfi_startproc
622___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9, which this
# routine uses as $inout4..$inout7.
623$code.=<<___ if ($win64);
624	lea	-0x58(%rsp),%rsp
625	movaps	%xmm6,(%rsp)		# offload $inout4..7
626	movaps	%xmm7,0x10(%rsp)
627	movaps	%xmm8,0x20(%rsp)
628	movaps	%xmm9,0x30(%rsp)
629.Lecb_enc_body:
630___
# Common entry code, the 8x encrypt loop and the start of the encrypt tail
# dispatch, up to the single-block case.
631$code.=<<___;
632	and	\$-16,$len		# if ($len<16)
633	jz	.Lecb_ret		# return
634
635	mov	240($key),$rounds	# key->rounds
636	$movkey	($key),$rndkey0
637	mov	$key,$key_		# backup $key
638	mov	$rounds,$rnds_		# backup $rounds
639	test	%r8d,%r8d		# 5th argument
640	jz	.Lecb_decrypt
641#--------------------------- ECB ENCRYPT ------------------------------#
642	cmp	\$0x80,$len		# if ($len<8*16)
643	jb	.Lecb_enc_tail		# short input
644
645	movdqu	($inp),$inout0		# load 8 input blocks
646	movdqu	0x10($inp),$inout1
647	movdqu	0x20($inp),$inout2
648	movdqu	0x30($inp),$inout3
649	movdqu	0x40($inp),$inout4
650	movdqu	0x50($inp),$inout5
651	movdqu	0x60($inp),$inout6
652	movdqu	0x70($inp),$inout7
653	lea	0x80($inp),$inp		# $inp+=8*16
654	sub	\$0x80,$len		# $len-=8*16 (can be zero)
655	jmp	.Lecb_enc_loop8_enter
656.align 16
657.Lecb_enc_loop8:
658	movups	$inout0,($out)		# store 8 output blocks
659	mov	$key_,$key		# restore $key
660	movdqu	($inp),$inout0		# load 8 input blocks
661	mov	$rnds_,$rounds		# restore $rounds
662	movups	$inout1,0x10($out)
663	movdqu	0x10($inp),$inout1
664	movups	$inout2,0x20($out)
665	movdqu	0x20($inp),$inout2
666	movups	$inout3,0x30($out)
667	movdqu	0x30($inp),$inout3
668	movups	$inout4,0x40($out)
669	movdqu	0x40($inp),$inout4
670	movups	$inout5,0x50($out)
671	movdqu	0x50($inp),$inout5
672	movups	$inout6,0x60($out)
673	movdqu	0x60($inp),$inout6
674	movups	$inout7,0x70($out)
675	lea	0x80($out),$out		# $out+=8*16
676	movdqu	0x70($inp),$inout7
677	lea	0x80($inp),$inp		# $inp+=8*16
678.Lecb_enc_loop8_enter:
679
680	call	_aesni_encrypt8
681
682	sub	\$0x80,$len
683	jnc	.Lecb_enc_loop8		# loop if $len-=8*16 didn't borrow
684
685	movups	$inout0,($out)		# store 8 output blocks
686	mov	$key_,$key		# restore $key
687	movups	$inout1,0x10($out)
688	mov	$rnds_,$rounds		# restore $rounds
689	movups	$inout2,0x20($out)
690	movups	$inout3,0x30($out)
691	movups	$inout4,0x40($out)
692	movups	$inout5,0x50($out)
693	movups	$inout6,0x60($out)
694	movups	$inout7,0x70($out)
695	lea	0x80($out),$out		# $out+=8*16
696	add	\$0x80,$len		# restore real remaining $len
697	jz	.Lecb_ret		# done if ($len==0)
698
699.Lecb_enc_tail:				# $len is less than 8*16
700	movups	($inp),$inout0
701	cmp	\$0x20,$len
702	jb	.Lecb_enc_one
703	movups	0x10($inp),$inout1
704	je	.Lecb_enc_two
705	movups	0x20($inp),$inout2
706	cmp	\$0x40,$len
707	jb	.Lecb_enc_three
708	movups	0x30($inp),$inout3
709	je	.Lecb_enc_four
710	movups	0x40($inp),$inout4
711	cmp	\$0x60,$len
712	jb	.Lecb_enc_five
713	movups	0x50($inp),$inout5
714	je	.Lecb_enc_six
715	movdqu	0x60($inp),$inout6
716	xorps	$inout7,$inout7
717	call	_aesni_encrypt8
718	movups	$inout0,($out)		# store 7 output blocks
719	movups	$inout1,0x10($out)
720	movups	$inout2,0x20($out)
721	movups	$inout3,0x30($out)
722	movups	$inout4,0x40($out)
723	movups	$inout5,0x50($out)
724	movups	$inout6,0x60($out)
725	jmp	.Lecb_ret
726.align	16
727.Lecb_enc_one:
728___
# Single remaining block: expand the one-block encrypt sequence inline.
729	&aesni_generate1("enc",$key,$rounds);
# Remaining encrypt tail cases, then the whole decrypt path up to its
# single-block case.
730$code.=<<___;
731	movups	$inout0,($out)		# store one output block
732	jmp	.Lecb_ret
733.align	16
734.Lecb_enc_two:
735	call	_aesni_encrypt2
736	movups	$inout0,($out)		# store 2 output blocks
737	movups	$inout1,0x10($out)
738	jmp	.Lecb_ret
739.align	16
740.Lecb_enc_three:
741	call	_aesni_encrypt3
742	movups	$inout0,($out)		# store 3 output blocks
743	movups	$inout1,0x10($out)
744	movups	$inout2,0x20($out)
745	jmp	.Lecb_ret
746.align	16
747.Lecb_enc_four:
748	call	_aesni_encrypt4
749	movups	$inout0,($out)		# store 4 output blocks
750	movups	$inout1,0x10($out)
751	movups	$inout2,0x20($out)
752	movups	$inout3,0x30($out)
753	jmp	.Lecb_ret
754.align	16
755.Lecb_enc_five:
756	xorps	$inout5,$inout5
757	call	_aesni_encrypt6
758	movups	$inout0,($out)		# store 5 output blocks
759	movups	$inout1,0x10($out)
760	movups	$inout2,0x20($out)
761	movups	$inout3,0x30($out)
762	movups	$inout4,0x40($out)
763	jmp	.Lecb_ret
764.align	16
765.Lecb_enc_six:
766	call	_aesni_encrypt6
767	movups	$inout0,($out)		# store 6 output blocks
768	movups	$inout1,0x10($out)
769	movups	$inout2,0x20($out)
770	movups	$inout3,0x30($out)
771	movups	$inout4,0x40($out)
772	movups	$inout5,0x50($out)
773	jmp	.Lecb_ret
774#--------------------------- ECB DECRYPT ------------------------------#
775.align	16
776.Lecb_decrypt:
777	cmp	\$0x80,$len		# if ($len<8*16)
778	jb	.Lecb_dec_tail		# short input
779
780	movdqu	($inp),$inout0		# load 8 input blocks
781	movdqu	0x10($inp),$inout1
782	movdqu	0x20($inp),$inout2
783	movdqu	0x30($inp),$inout3
784	movdqu	0x40($inp),$inout4
785	movdqu	0x50($inp),$inout5
786	movdqu	0x60($inp),$inout6
787	movdqu	0x70($inp),$inout7
788	lea	0x80($inp),$inp		# $inp+=8*16
789	sub	\$0x80,$len		# $len-=8*16 (can be zero)
790	jmp	.Lecb_dec_loop8_enter
791.align 16
792.Lecb_dec_loop8:
793	movups	$inout0,($out)		# store 8 output blocks
794	mov	$key_,$key		# restore $key
795	movdqu	($inp),$inout0		# load 8 input blocks
796	mov	$rnds_,$rounds		# restore $rounds
797	movups	$inout1,0x10($out)
798	movdqu	0x10($inp),$inout1
799	movups	$inout2,0x20($out)
800	movdqu	0x20($inp),$inout2
801	movups	$inout3,0x30($out)
802	movdqu	0x30($inp),$inout3
803	movups	$inout4,0x40($out)
804	movdqu	0x40($inp),$inout4
805	movups	$inout5,0x50($out)
806	movdqu	0x50($inp),$inout5
807	movups	$inout6,0x60($out)
808	movdqu	0x60($inp),$inout6
809	movups	$inout7,0x70($out)
810	lea	0x80($out),$out		# $out+=8*16
811	movdqu	0x70($inp),$inout7
812	lea	0x80($inp),$inp		# $inp+=8*16
813.Lecb_dec_loop8_enter:
814
815	call	_aesni_decrypt8
816
817	$movkey	($key_),$rndkey0
818	sub	\$0x80,$len
819	jnc	.Lecb_dec_loop8		# loop if $len-=8*16 didn't borrow
820
821	movups	$inout0,($out)		# store 8 output blocks
822	 pxor	$inout0,$inout0		# clear register bank
823	mov	$key_,$key		# restore $key
824	movups	$inout1,0x10($out)
825	 pxor	$inout1,$inout1
826	mov	$rnds_,$rounds		# restore $rounds
827	movups	$inout2,0x20($out)
828	 pxor	$inout2,$inout2
829	movups	$inout3,0x30($out)
830	 pxor	$inout3,$inout3
831	movups	$inout4,0x40($out)
832	 pxor	$inout4,$inout4
833	movups	$inout5,0x50($out)
834	 pxor	$inout5,$inout5
835	movups	$inout6,0x60($out)
836	 pxor	$inout6,$inout6
837	movups	$inout7,0x70($out)
838	 pxor	$inout7,$inout7
839	lea	0x80($out),$out		# $out+=8*16
840	add	\$0x80,$len		# restore real remaining $len
841	jz	.Lecb_ret		# done if ($len==0)
842
843.Lecb_dec_tail:
844	movups	($inp),$inout0
845	cmp	\$0x20,$len
846	jb	.Lecb_dec_one
847	movups	0x10($inp),$inout1
848	je	.Lecb_dec_two
849	movups	0x20($inp),$inout2
850	cmp	\$0x40,$len
851	jb	.Lecb_dec_three
852	movups	0x30($inp),$inout3
853	je	.Lecb_dec_four
854	movups	0x40($inp),$inout4
855	cmp	\$0x60,$len
856	jb	.Lecb_dec_five
857	movups	0x50($inp),$inout5
858	je	.Lecb_dec_six
859	movups	0x60($inp),$inout6
860	$movkey	($key),$rndkey0
861	xorps	$inout7,$inout7
862	call	_aesni_decrypt8
863	movups	$inout0,($out)		# store 7 output blocks
864	 pxor	$inout0,$inout0		# clear register bank
865	movups	$inout1,0x10($out)
866	 pxor	$inout1,$inout1
867	movups	$inout2,0x20($out)
868	 pxor	$inout2,$inout2
869	movups	$inout3,0x30($out)
870	 pxor	$inout3,$inout3
871	movups	$inout4,0x40($out)
872	 pxor	$inout4,$inout4
873	movups	$inout5,0x50($out)
874	 pxor	$inout5,$inout5
875	movups	$inout6,0x60($out)
876	 pxor	$inout6,$inout6
877	 pxor	$inout7,$inout7
878	jmp	.Lecb_ret
879.align	16
880.Lecb_dec_one:
881___
# Single remaining block: expand the one-block decrypt sequence inline.
882	&aesni_generate1("dec",$key,$rounds);
# Remaining decrypt tail cases and the shared exit label, where both round
# key registers are zeroed.
883$code.=<<___;
884	movups	$inout0,($out)		# store one output block
885	 pxor	$inout0,$inout0		# clear register bank
886	jmp	.Lecb_ret
887.align	16
888.Lecb_dec_two:
889	call	_aesni_decrypt2
890	movups	$inout0,($out)		# store 2 output blocks
891	 pxor	$inout0,$inout0		# clear register bank
892	movups	$inout1,0x10($out)
893	 pxor	$inout1,$inout1
894	jmp	.Lecb_ret
895.align	16
896.Lecb_dec_three:
897	call	_aesni_decrypt3
898	movups	$inout0,($out)		# store 3 output blocks
899	 pxor	$inout0,$inout0		# clear register bank
900	movups	$inout1,0x10($out)
901	 pxor	$inout1,$inout1
902	movups	$inout2,0x20($out)
903	 pxor	$inout2,$inout2
904	jmp	.Lecb_ret
905.align	16
906.Lecb_dec_four:
907	call	_aesni_decrypt4
908	movups	$inout0,($out)		# store 4 output blocks
909	 pxor	$inout0,$inout0		# clear register bank
910	movups	$inout1,0x10($out)
911	 pxor	$inout1,$inout1
912	movups	$inout2,0x20($out)
913	 pxor	$inout2,$inout2
914	movups	$inout3,0x30($out)
915	 pxor	$inout3,$inout3
916	jmp	.Lecb_ret
917.align	16
918.Lecb_dec_five:
919	xorps	$inout5,$inout5
920	call	_aesni_decrypt6
921	movups	$inout0,($out)		# store 5 output blocks
922	 pxor	$inout0,$inout0		# clear register bank
923	movups	$inout1,0x10($out)
924	 pxor	$inout1,$inout1
925	movups	$inout2,0x20($out)
926	 pxor	$inout2,$inout2
927	movups	$inout3,0x30($out)
928	 pxor	$inout3,$inout3
929	movups	$inout4,0x40($out)
930	 pxor	$inout4,$inout4
931	 pxor	$inout5,$inout5
932	jmp	.Lecb_ret
933.align	16
934.Lecb_dec_six:
935	call	_aesni_decrypt6
936	movups	$inout0,($out)		# store 6 output blocks
937	 pxor	$inout0,$inout0		# clear register bank
938	movups	$inout1,0x10($out)
939	 pxor	$inout1,$inout1
940	movups	$inout2,0x20($out)
941	 pxor	$inout2,$inout2
942	movups	$inout3,0x30($out)
943	 pxor	$inout3,$inout3
944	movups	$inout4,0x40($out)
945	 pxor	$inout4,$inout4
946	movups	$inout5,0x50($out)
947	 pxor	$inout5,$inout5
948
949.Lecb_ret:
950	xorps	$rndkey0,$rndkey0	# %xmm0
951	pxor	$rndkey1,$rndkey1
952___
# Win64 only: restore %xmm6-%xmm9, overwrite each save slot with the
# already-zeroed %xmm0, and release the 0x58 bytes of stack.
953$code.=<<___ if ($win64);
954	movaps	(%rsp),%xmm6
955	movaps	%xmm0,(%rsp)		# clear stack
956	movaps	0x10(%rsp),%xmm7
957	movaps	%xmm0,0x10(%rsp)
958	movaps	0x20(%rsp),%xmm8
959	movaps	%xmm0,0x20(%rsp)
960	movaps	0x30(%rsp),%xmm9
961	movaps	%xmm0,0x30(%rsp)
962	lea	0x58(%rsp),%rsp
963.Lecb_enc_ret:
964___
965$code.=<<___;
966	ret
967.cfi_endproc
968.size	${PREFIX}_ecb_encrypt,.-${PREFIX}_ecb_encrypt
969___
970
971{
972######################################################################
973# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
974#                         size_t blocks, const AES_KEY *key,
975#                         const char *ivec,char *cmac);
976#
977# Handles only complete blocks, operates on 64-bit counter and
978# does not update *ivec! Nor does it finalize CMAC value
979# (see engine/eng_aesni.c for details)
980#
981if (0) {  # Omit these functions in BoringSSL
# Register aliases shared by both CCM64 routines.  $cmac is the sixth
# argument: the routines load the running MAC block through it on entry and
# store the updated MAC back through it on exit.  $increment receives the
# .Lincrement64 constant, $bswap_mask the .Lbswap_mask constant, and $iv the
# counter block after it has been shuffled with $bswap_mask (presumably a
# byte swap -- the mask table is not visible in this part of the file).
982my $cmac="%r9";	# 6th argument
983
984my $increment="%xmm9";
985my $iv="%xmm6";
986my $bswap_mask="%xmm7";
987
# Symbol, type and alignment directives plus the entry label of the
# encrypt routine.
988$code.=<<___;
989.globl	${PREFIX}_ccm64_encrypt_blocks
990.type	${PREFIX}_ccm64_encrypt_blocks,\@function,6
991.align	16
992${PREFIX}_ccm64_encrypt_blocks:
993___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9, which the
# body overwrites; the matching epilogue below restores them.
994$code.=<<___ if ($win64);
995	lea	-0x58(%rsp),%rsp
996	movaps	%xmm6,(%rsp)		# $iv
997	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
998	movaps	%xmm8,0x20(%rsp)	# $in0
999	movaps	%xmm9,0x30(%rsp)	# $increment
1000.Lccm64_enc_body:
1001___
# Main body.  After loading the round count, counter block, constants and
# running MAC, each .Lccm64_enc_outer iteration handles one 16-byte block:
# it runs two AES encryptions in lock-step ($inout0 = counter block,
# $inout1 = MAC xor-ed with the input block), xors the encrypted counter
# into the input to form the output block, stores it, and advances the
# counter with paddq.  $len counts blocks and is decremented once per
# iteration.  After the last block the MAC is written back through $cmac
# and the working registers are zeroed.
1002$code.=<<___;
1003	mov	240($key),$rounds		# key->rounds
1004	movdqu	($ivp),$iv
1005	movdqa	.Lincrement64(%rip),$increment
1006	movdqa	.Lbswap_mask(%rip),$bswap_mask
1007
1008	shl	\$4,$rounds
1009	mov	\$16,$rnds_
1010	lea	0($key),$key_
1011	movdqu	($cmac),$inout1
1012	movdqa	$iv,$inout0
1013	lea	32($key,$rounds),$key		# end of key schedule
1014	pshufb	$bswap_mask,$iv
1015	sub	%rax,%r10			# twisted $rounds
1016	jmp	.Lccm64_enc_outer
1017.align	16
1018.Lccm64_enc_outer:
1019	$movkey	($key_),$rndkey0
1020	mov	%r10,%rax
1021	movups	($inp),$in0			# load inp
1022
1023	xorps	$rndkey0,$inout0		# counter
1024	$movkey	16($key_),$rndkey1
1025	xorps	$in0,$rndkey0
1026	xorps	$rndkey0,$inout1		# cmac^=inp
1027	$movkey	32($key_),$rndkey0
1028
1029.Lccm64_enc2_loop:
1030	aesenc	$rndkey1,$inout0
1031	aesenc	$rndkey1,$inout1
1032	$movkey	($key,%rax),$rndkey1
1033	add	\$32,%rax
1034	aesenc	$rndkey0,$inout0
1035	aesenc	$rndkey0,$inout1
1036	$movkey	-16($key,%rax),$rndkey0
1037	jnz	.Lccm64_enc2_loop
1038	aesenc	$rndkey1,$inout0
1039	aesenc	$rndkey1,$inout1
1040	paddq	$increment,$iv
1041	dec	$len				# $len-- ($len is in blocks)
1042	aesenclast	$rndkey0,$inout0
1043	aesenclast	$rndkey0,$inout1
1044
1045	lea	16($inp),$inp
1046	xorps	$inout0,$in0			# inp ^= E(iv)
1047	movdqa	$iv,$inout0
1048	movups	$in0,($out)			# save output
1049	pshufb	$bswap_mask,$inout0
1050	lea	16($out),$out			# $out+=16
1051	jnz	.Lccm64_enc_outer		# loop if ($len!=0)
1052
1053	 pxor	$rndkey0,$rndkey0		# clear register bank
1054	 pxor	$rndkey1,$rndkey1
1055	 pxor	$inout0,$inout0
1056	movups	$inout1,($cmac)			# store resulting mac
1057	 pxor	$inout1,$inout1
1058	 pxor	$in0,$in0
1059	 pxor	$iv,$iv
1060___
# Win64 only: restore %xmm6-%xmm9, overwrite their save slots with %xmm0
# and release the 0x58-byte frame.  NOTE(review): this relies on %xmm0
# being zero here, i.e. on $rndkey0 (cleared just above) being %xmm0; the
# alias is defined outside this part of the file -- confirm.
1061$code.=<<___ if ($win64);
1062	movaps	(%rsp),%xmm6
1063	movaps	%xmm0,(%rsp)			# clear stack
1064	movaps	0x10(%rsp),%xmm7
1065	movaps	%xmm0,0x10(%rsp)
1066	movaps	0x20(%rsp),%xmm8
1067	movaps	%xmm0,0x20(%rsp)
1068	movaps	0x30(%rsp),%xmm9
1069	movaps	%xmm0,0x30(%rsp)
1070	lea	0x58(%rsp),%rsp
1071.Lccm64_enc_ret:
1072___
# Return to the caller and record the symbol size.
1073$code.=<<___;
1074	ret
1075.size	${PREFIX}_ccm64_encrypt_blocks,.-${PREFIX}_ccm64_encrypt_blocks
1076___
1077######################################################################
# Symbol, type and alignment directives plus the entry label of the
# decrypt routine.
1078$code.=<<___;
1079.globl	${PREFIX}_ccm64_decrypt_blocks
1080.type	${PREFIX}_ccm64_decrypt_blocks,\@function,6
1081.align	16
1082${PREFIX}_ccm64_decrypt_blocks:
1083___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9, which the
# body overwrites; the matching epilogue below restores them.
1084$code.=<<___ if ($win64);
1085	lea	-0x58(%rsp),%rsp
1086	movaps	%xmm6,(%rsp)		# $iv
1087	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
1088	movaps	%xmm8,0x20(%rsp)	# $in0
1089	movaps	%xmm9,0x30(%rsp)	# $increment
1090.Lccm64_dec_body:
1091___
# Load the round count, counter block, running MAC and the two constants.
# The round count and key pointer are copied to $rnds_ and $key_ first; the
# code emitted after the helper call below rebuilds $key from those copies.
1092$code.=<<___;
1093	mov	240($key),$rounds		# key->rounds
1094	movups	($ivp),$iv
1095	movdqu	($cmac),$inout1
1096	movdqa	.Lincrement64(%rip),$increment
1097	movdqa	.Lbswap_mask(%rip),$bswap_mask
1098
1099	movaps	$iv,$inout0
1100	mov	$rounds,$rnds_
1101	mov	$key,$key_
1102	pshufb	$bswap_mask,$iv
1103___
	# Encrypt the first counter block.  aesni_generate1 is defined outside
	# this part of the file; presumably it emits a single-block AES pass
	# over $inout0 -- confirm against its definition.
1104	&aesni_generate1("enc",$key,$rounds);
# Decrypt loop.  .Lccm64_dec_outer xors the encrypted counter into the
# loaded input block, stores the result and prepares the next counter
# block.  Unless that was the last block ($len reaches zero), it then runs
# two AES encryptions in lock-step in .Lccm64_dec2_loop: the next counter
# block in $inout0 and the MAC, xor-ed with the block just produced, in
# $inout1, loading the next input block along the way.  .Lccm64_dec_break
# reloads the round count from the saved key pointer for the final MAC
# pass emitted after this heredoc.
1105$code.=<<___;
1106	shl	\$4,$rnds_
1107	mov	\$16,$rounds
1108	movups	($inp),$in0			# load inp
1109	paddq	$increment,$iv
1110	lea	16($inp),$inp			# $inp+=16
1111	sub	%r10,%rax			# twisted $rounds
1112	lea	32($key_,$rnds_),$key		# end of key schedule
1113	mov	%rax,%r10
1114	jmp	.Lccm64_dec_outer
1115.align	16
1116.Lccm64_dec_outer:
1117	xorps	$inout0,$in0			# inp ^= E(iv)
1118	movdqa	$iv,$inout0
1119	movups	$in0,($out)			# save output
1120	lea	16($out),$out			# $out+=16
1121	pshufb	$bswap_mask,$inout0
1122
1123	sub	\$1,$len			# $len-- ($len is in blocks)
1124	jz	.Lccm64_dec_break		# if ($len==0) break
1125
1126	$movkey	($key_),$rndkey0
1127	mov	%r10,%rax
1128	$movkey	16($key_),$rndkey1
1129	xorps	$rndkey0,$in0
1130	xorps	$rndkey0,$inout0
1131	xorps	$in0,$inout1			# cmac^=out
1132	$movkey	32($key_),$rndkey0
1133	jmp	.Lccm64_dec2_loop
1134.align	16
1135.Lccm64_dec2_loop:
1136	aesenc	$rndkey1,$inout0
1137	aesenc	$rndkey1,$inout1
1138	$movkey	($key,%rax),$rndkey1
1139	add	\$32,%rax
1140	aesenc	$rndkey0,$inout0
1141	aesenc	$rndkey0,$inout1
1142	$movkey	-16($key,%rax),$rndkey0
1143	jnz	.Lccm64_dec2_loop
1144	movups	($inp),$in0			# load input
1145	paddq	$increment,$iv
1146	aesenc	$rndkey1,$inout0
1147	aesenc	$rndkey1,$inout1
1148	aesenclast	$rndkey0,$inout0
1149	aesenclast	$rndkey0,$inout1
1150	lea	16($inp),$inp			# $inp+=16
1151	jmp	.Lccm64_dec_outer
1152
1153.align	16
1154.Lccm64_dec_break:
1155	#xorps	$in0,$inout1			# cmac^=out
1156	mov	240($key_),$rounds
1157___
	# Final MAC pass for the last block: the helper is invoked on $inout1
	# with $in0 as an extra operand.  Presumably it folds $in0 (the last
	# output block) into the MAC before encrypting, which would explain the
	# commented-out xorps above -- confirm against aesni_generate1.
1158	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
# Write the MAC back through $cmac and zero the working registers.
1159$code.=<<___;
1160	 pxor	$rndkey0,$rndkey0		# clear register bank
1161	 pxor	$rndkey1,$rndkey1
1162	 pxor	$inout0,$inout0
1163	movups	$inout1,($cmac)			# store resulting mac
1164	 pxor	$inout1,$inout1
1165	 pxor	$in0,$in0
1166	 pxor	$iv,$iv
1167___
# Win64 only: restore %xmm6-%xmm9, overwrite their save slots with %xmm0
# and release the 0x58-byte frame.  NOTE(review): this relies on %xmm0
# being zero here, i.e. on $rndkey0 (cleared just above) being %xmm0; the
# alias is defined outside this part of the file -- confirm.
1168$code.=<<___ if ($win64);
1169	movaps	(%rsp),%xmm6
1170	movaps	%xmm0,(%rsp)			# clear stack
1171	movaps	0x10(%rsp),%xmm7
1172	movaps	%xmm0,0x10(%rsp)
1173	movaps	0x20(%rsp),%xmm8
1174	movaps	%xmm0,0x20(%rsp)
1175	movaps	0x30(%rsp),%xmm9
1176	movaps	%xmm0,0x30(%rsp)
1177	lea	0x58(%rsp),%rsp
1178.Lccm64_dec_ret:
1179___
# Return to the caller and record the symbol size.
1180$code.=<<___;
1181	ret
1182.size	${PREFIX}_ccm64_decrypt_blocks,.-${PREFIX}_ccm64_decrypt_blocks
1183___
1184}
1185######################################################################
1186# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1187#                         size_t blocks, const AES_KEY *key,
1188#                         const char *ivec);
1189#
1190# Handles only complete blocks, operates on 32-bit counter and
1191# does not update *ivec! (see crypto/modes/ctr128.c for details)
1192#
1193# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1194# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1195# Keywords are full unroll and modulo-schedule counter calculations
1196# with zero-round key xor.
1197{
# Lexical aliases private to the CTR32 generator:
#   $in0..$in5  - %xmm10..%xmm15, used for input blocks;
#   $key0       - %ebp, loaded with the dword at offset 12 of round key 0;
#   $ctr        - 32-bit view of $ivp, loaded with the dword at offset 12 of
#                 the IV, byte-swapped, and used as the running counter;
#   $frame_size - 0x80 bytes for eight 16-byte counter blocks, plus 160
#                 bytes on Win64 for the ten-register %xmm6-%xmm15 save area.
1198my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
1199my ($key0,$ctr)=("%ebp","${ivp}d");
1200my $frame_size = 0x80 + ($win64?160:0);
1201
# Entry point.  Under the preprocessor guards below a flag byte
# (BORINGSSL_function_hit) is set.  A request for exactly one block is
# served without a stack frame: the counter block and the input block are
# loaded here and the single-block encryption is emitted right after this
# heredoc.  Any other length branches to .Lctr32_bulk.
1202$code.=<<___;
1203.globl	${PREFIX}_ctr32_encrypt_blocks
1204.type	${PREFIX}_ctr32_encrypt_blocks,\@function,5
1205.align	16
1206${PREFIX}_ctr32_encrypt_blocks:
1207.cfi_startproc
1208#ifndef NDEBUG
1209#ifndef BORINGSSL_FIPS
1210	movb \$1,BORINGSSL_function_hit(%rip)
1211#endif
1212#endif
1213	cmp	\$1,$len
1214	jne	.Lctr32_bulk
1215
1216	# handle single block without allocating stack frame,
1217	# useful when handling edges
1218	movups	($ivp),$inout0
1219	movups	($inp),$inout1
1220	mov	240($key),%edx			# key->rounds
1221___
	# aesni_generate1 is defined outside this part of the file; presumably
	# it emits a single-block AES pass over $inout0 -- confirm.
1222	&aesni_generate1("enc",$key,"%edx");
# Finish the one-block path (xor the encrypted counter into the input,
# store it, scrub the registers) and jump to the shared epilogue label.
# Then open the bulk path: $key_ keeps the incoming %rsp as frame pointer,
# %rbp is pushed, $frame_size bytes are reserved and %rsp is aligned down
# to 16 bytes.
1223$code.=<<___;
1224	 pxor	$rndkey0,$rndkey0		# clear register bank
1225	 pxor	$rndkey1,$rndkey1
1226	xorps	$inout1,$inout0
1227	 pxor	$inout1,$inout1
1228	movups	$inout0,($out)
1229	 xorps	$inout0,$inout0
1230	jmp	.Lctr32_epilogue
1231
1232.align	16
1233.Lctr32_bulk:
1234	lea	(%rsp),$key_			# use $key_ as frame pointer
1235.cfi_def_cfa_register	$key_
1236	push	%rbp
1237.cfi_push	%rbp
1238	sub	\$$frame_size,%rsp
1239	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1240___
# Win64 only: save %xmm6-%xmm15 at fixed negative offsets from the frame
# pointer; they are restored from the same slots before returning.
1241$code.=<<___ if ($win64);
1242	movaps	%xmm6,-0xa8($key_)		# offload everything
1243	movaps	%xmm7,-0x98($key_)
1244	movaps	%xmm8,-0x88($key_)
1245	movaps	%xmm9,-0x78($key_)
1246	movaps	%xmm10,-0x68($key_)
1247	movaps	%xmm11,-0x58($key_)
1248	movaps	%xmm12,-0x48($key_)
1249	movaps	%xmm13,-0x38($key_)
1250	movaps	%xmm14,-0x28($key_)
1251	movaps	%xmm15,-0x18($key_)
1252.Lctr32_body:
1253___
# Counter setup and loop selection.  Eight counter blocks, each already
# xor-ed with round key 0, are built at 0x00..0x70(%rsp); they differ only
# in their last dword, which is the byte-swapped counter+N xor-ed with
# $key0.  Inputs shorter than eight blocks go straight to .Lctr32_tail.
# Otherwise the capability word read through OPENSSL_ia32cap_P selects the
# six-blocks-per-iteration loop (.Lctr32_loop6, taken when MOVBE is
# reported without XSAVE) or the eight-blocks-per-iteration loop
# (.Lctr32_loop8).  This heredoc ends with the first AES round of the
# eight-block loop; the remaining rounds are generated below.
1254$code.=<<___;
1255
1256	# 8 16-byte words on top of stack are counter values
1257	# xor-ed with zero-round key
1258
1259	movdqu	($ivp),$inout0
1260	movdqu	($key),$rndkey0
1261	mov	12($ivp),$ctr			# counter LSB
1262	pxor	$rndkey0,$inout0
1263	mov	12($key),$key0			# 0-round key LSB
1264	movdqa	$inout0,0x00(%rsp)		# populate counter block
1265	bswap	$ctr
1266	movdqa	$inout0,$inout1
1267	movdqa	$inout0,$inout2
1268	movdqa	$inout0,$inout3
1269	movdqa	$inout0,0x40(%rsp)
1270	movdqa	$inout0,0x50(%rsp)
1271	movdqa	$inout0,0x60(%rsp)
1272	mov	%rdx,%r10			# about to borrow %rdx
1273	movdqa	$inout0,0x70(%rsp)
1274
1275	lea	1($ctr),%rax
1276	 lea	2($ctr),%rdx
1277	bswap	%eax
1278	 bswap	%edx
1279	xor	$key0,%eax
1280	 xor	$key0,%edx
1281	pinsrd	\$3,%eax,$inout1
1282	lea	3($ctr),%rax
1283	movdqa	$inout1,0x10(%rsp)
1284	 pinsrd	\$3,%edx,$inout2
1285	bswap	%eax
1286	 mov	%r10,%rdx			# restore %rdx
1287	 lea	4($ctr),%r10
1288	 movdqa	$inout2,0x20(%rsp)
1289	xor	$key0,%eax
1290	 bswap	%r10d
1291	pinsrd	\$3,%eax,$inout3
1292	 xor	$key0,%r10d
1293	movdqa	$inout3,0x30(%rsp)
1294	lea	5($ctr),%r9
1295	 mov	%r10d,0x40+12(%rsp)
1296	bswap	%r9d
1297	 lea	6($ctr),%r10
1298	mov	240($key),$rounds		# key->rounds
1299	xor	$key0,%r9d
1300	 bswap	%r10d
1301	mov	%r9d,0x50+12(%rsp)
1302	 xor	$key0,%r10d
1303	lea	7($ctr),%r9
1304	 mov	%r10d,0x60+12(%rsp)
1305	bswap	%r9d
1306	leaq	OPENSSL_ia32cap_P(%rip),%r10
1307	 mov	4(%r10),%r10d
1308	xor	$key0,%r9d
1309	 and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
1310	mov	%r9d,0x70+12(%rsp)
1311
1312	$movkey	0x10($key),$rndkey1
1313
1314	movdqa	0x40(%rsp),$inout4
1315	movdqa	0x50(%rsp),$inout5
1316
1317	cmp	\$8,$len		# $len is in blocks
1318	jb	.Lctr32_tail		# short input if ($len<8)
1319
1320	sub	\$6,$len		# $len is biased by -6
1321	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
1322	je	.Lctr32_6x		# [which denotes Atom Silvermont]
1323
1324	lea	0x80($key),$key		# size optimization
1325	sub	\$2,$len		# $len is biased by -8
1326	jmp	.Lctr32_loop8
1327
1328.align	16
1329.Lctr32_6x:
1330	shl	\$4,$rounds
1331	mov	\$48,$rnds_
1332	bswap	$key0
1333	lea	32($key,$rounds),$key	# end of key schedule
1334	sub	%rax,%r10		# twisted $rounds
1335	jmp	.Lctr32_loop6
1336
1337.align	16
1338.Lctr32_loop6:
1339	 add	\$6,$ctr		# next counter value
1340	$movkey	-48($key,$rnds_),$rndkey0
1341	aesenc	$rndkey1,$inout0
1342	 mov	$ctr,%eax
1343	 xor	$key0,%eax
1344	aesenc	$rndkey1,$inout1
1345	 movbe	%eax,`0x00+12`(%rsp)	# store next counter value
1346	 lea	1($ctr),%eax
1347	aesenc	$rndkey1,$inout2
1348	 xor	$key0,%eax
1349	 movbe	%eax,`0x10+12`(%rsp)
1350	aesenc	$rndkey1,$inout3
1351	 lea	2($ctr),%eax
1352	 xor	$key0,%eax
1353	aesenc	$rndkey1,$inout4
1354	 movbe	%eax,`0x20+12`(%rsp)
1355	 lea	3($ctr),%eax
1356	aesenc	$rndkey1,$inout5
1357	$movkey	-32($key,$rnds_),$rndkey1
1358	 xor	$key0,%eax
1359
1360	aesenc	$rndkey0,$inout0
1361	 movbe	%eax,`0x30+12`(%rsp)
1362	 lea	4($ctr),%eax
1363	aesenc	$rndkey0,$inout1
1364	 xor	$key0,%eax
1365	 movbe	%eax,`0x40+12`(%rsp)
1366	aesenc	$rndkey0,$inout2
1367	 lea	5($ctr),%eax
1368	 xor	$key0,%eax
1369	aesenc	$rndkey0,$inout3
1370	 movbe	%eax,`0x50+12`(%rsp)
1371	 mov	%r10,%rax		# mov	$rnds_,$rounds
1372	aesenc	$rndkey0,$inout4
1373	aesenc	$rndkey0,$inout5
1374	$movkey	-16($key,$rnds_),$rndkey0
1375
1376	call	.Lenc_loop6
1377
1378	movdqu	($inp),$inout6		# load 6 input blocks
1379	movdqu	0x10($inp),$inout7
1380	movdqu	0x20($inp),$in0
1381	movdqu	0x30($inp),$in1
1382	movdqu	0x40($inp),$in2
1383	movdqu	0x50($inp),$in3
1384	lea	0x60($inp),$inp		# $inp+=6*16
1385	$movkey	-64($key,$rnds_),$rndkey1
1386	pxor	$inout0,$inout6		# inp^=E(ctr)
1387	movaps	0x00(%rsp),$inout0	# load next counter [xor-ed with 0 round]
1388	pxor	$inout1,$inout7
1389	movaps	0x10(%rsp),$inout1
1390	pxor	$inout2,$in0
1391	movaps	0x20(%rsp),$inout2
1392	pxor	$inout3,$in1
1393	movaps	0x30(%rsp),$inout3
1394	pxor	$inout4,$in2
1395	movaps	0x40(%rsp),$inout4
1396	pxor	$inout5,$in3
1397	movaps	0x50(%rsp),$inout5
1398	movdqu	$inout6,($out)		# store 6 output blocks
1399	movdqu	$inout7,0x10($out)
1400	movdqu	$in0,0x20($out)
1401	movdqu	$in1,0x30($out)
1402	movdqu	$in2,0x40($out)
1403	movdqu	$in3,0x50($out)
1404	lea	0x60($out),$out		# $out+=6*16
1405
1406	sub	\$6,$len
1407	jnc	.Lctr32_loop6		# loop if $len-=6 didn't borrow
1408
1409	add	\$6,$len		# restore real remaining $len
1410	jz	.Lctr32_done		# done if ($len==0)
1411
1412	lea	-48($rnds_),$rounds
1413	lea	-80($key,$rnds_),$key	# restore $key
1414	neg	$rounds
1415	shr	\$4,$rounds		# restore $rounds
1416	jmp	.Lctr32_tail
1417
1418.align	32
1419.Lctr32_loop8:
1420	 add		\$8,$ctr		# next counter value
1421	movdqa		0x60(%rsp),$inout6
1422	aesenc		$rndkey1,$inout0
1423	 mov		$ctr,%r9d
1424	movdqa		0x70(%rsp),$inout7
1425	aesenc		$rndkey1,$inout1
1426	 bswap		%r9d
1427	$movkey		0x20-0x80($key),$rndkey0
1428	aesenc		$rndkey1,$inout2
1429	 xor		$key0,%r9d
1430	 nop
1431	aesenc		$rndkey1,$inout3
1432	 mov		%r9d,0x00+12(%rsp)	# store next counter value
1433	 lea		1($ctr),%r9
1434	aesenc		$rndkey1,$inout4
1435	aesenc		$rndkey1,$inout5
1436	aesenc		$rndkey1,$inout6
1437	aesenc		$rndkey1,$inout7
1438	$movkey		0x30-0x80($key),$rndkey1
1439___
# Rounds 2..7 of the eight-block loop.  Iteration $i applies one AES round
# to all eight blocks, using $rndkey1 when $i is odd and $rndkey0 when it
# is even, then reloads that same register with the key at offset
# 0x20+0x10*$i (relative to the biased $key).  Interleaved with the AES
# work, each iteration byte-swaps and xors (with $key0) the counter value
# computed by the previous step, stores it into counter slot $i-1 on the
# stack, and computes counter+$i for the next step.
1440for($i=2;$i<8;$i++) {
1441my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1442$code.=<<___;
1443	 bswap		%r9d
1444	aesenc		$rndkeyx,$inout0
1445	aesenc		$rndkeyx,$inout1
1446	 xor		$key0,%r9d
1447	 .byte		0x66,0x90
1448	aesenc		$rndkeyx,$inout2
1449	aesenc		$rndkeyx,$inout3
1450	 mov		%r9d,`0x10*($i-1)`+12(%rsp)
1451	 lea		$i($ctr),%r9
1452	aesenc		$rndkeyx,$inout4
1453	aesenc		$rndkeyx,$inout5
1454	aesenc		$rndkeyx,$inout6
1455	aesenc		$rndkeyx,$inout7
1456	$movkey		`0x20+0x10*$i`-0x80($key),$rndkeyx
1457___
1458}
# Rest of the eight-block loop and everything after it:
#  - the remaining rounds, with early exits to .Lctr32_enc_done decided by
#    comparing $rounds with 11 (below / equal / above; presumably the three
#    AES key sizes -- confirm);
#  - .Lctr32_enc_done, where each input block is first xor-ed with the last
#    round key so that aesenclast yields the output block directly, and the
#    next eight counter blocks are reloaded from the stack;
#  - .Lctr32_tail with .Lctr32_loop4 and .Lctr32_loop3, which handle the
#    one to seven blocks left over (more than four blocks go through
#    .Lenc_loop8_enter, defined outside this part of the file);
#  - .Lctr32_done, which starts scrubbing registers.
1459$code.=<<___;
1460	 bswap		%r9d
1461	aesenc		$rndkey0,$inout0
1462	aesenc		$rndkey0,$inout1
1463	aesenc		$rndkey0,$inout2
1464	 xor		$key0,%r9d
1465	 movdqu		0x00($inp),$in0		# start loading input
1466	aesenc		$rndkey0,$inout3
1467	 mov		%r9d,0x70+12(%rsp)
1468	 cmp		\$11,$rounds
1469	aesenc		$rndkey0,$inout4
1470	aesenc		$rndkey0,$inout5
1471	aesenc		$rndkey0,$inout6
1472	aesenc		$rndkey0,$inout7
1473	$movkey		0xa0-0x80($key),$rndkey0
1474
1475	jb		.Lctr32_enc_done
1476
1477	aesenc		$rndkey1,$inout0
1478	aesenc		$rndkey1,$inout1
1479	aesenc		$rndkey1,$inout2
1480	aesenc		$rndkey1,$inout3
1481	aesenc		$rndkey1,$inout4
1482	aesenc		$rndkey1,$inout5
1483	aesenc		$rndkey1,$inout6
1484	aesenc		$rndkey1,$inout7
1485	$movkey		0xb0-0x80($key),$rndkey1
1486
1487	aesenc		$rndkey0,$inout0
1488	aesenc		$rndkey0,$inout1
1489	aesenc		$rndkey0,$inout2
1490	aesenc		$rndkey0,$inout3
1491	aesenc		$rndkey0,$inout4
1492	aesenc		$rndkey0,$inout5
1493	aesenc		$rndkey0,$inout6
1494	aesenc		$rndkey0,$inout7
1495	$movkey		0xc0-0x80($key),$rndkey0
1496	je		.Lctr32_enc_done
1497
1498	aesenc		$rndkey1,$inout0
1499	aesenc		$rndkey1,$inout1
1500	aesenc		$rndkey1,$inout2
1501	aesenc		$rndkey1,$inout3
1502	aesenc		$rndkey1,$inout4
1503	aesenc		$rndkey1,$inout5
1504	aesenc		$rndkey1,$inout6
1505	aesenc		$rndkey1,$inout7
1506	$movkey		0xd0-0x80($key),$rndkey1
1507
1508	aesenc		$rndkey0,$inout0
1509	aesenc		$rndkey0,$inout1
1510	aesenc		$rndkey0,$inout2
1511	aesenc		$rndkey0,$inout3
1512	aesenc		$rndkey0,$inout4
1513	aesenc		$rndkey0,$inout5
1514	aesenc		$rndkey0,$inout6
1515	aesenc		$rndkey0,$inout7
1516	$movkey		0xe0-0x80($key),$rndkey0
1517	jmp		.Lctr32_enc_done
1518
1519.align	16
1520.Lctr32_enc_done:
1521	movdqu		0x10($inp),$in1
1522	pxor		$rndkey0,$in0		# input^=round[last]
1523	movdqu		0x20($inp),$in2
1524	pxor		$rndkey0,$in1
1525	movdqu		0x30($inp),$in3
1526	pxor		$rndkey0,$in2
1527	movdqu		0x40($inp),$in4
1528	pxor		$rndkey0,$in3
1529	movdqu		0x50($inp),$in5
1530	pxor		$rndkey0,$in4
1531	pxor		$rndkey0,$in5
1532	aesenc		$rndkey1,$inout0
1533	aesenc		$rndkey1,$inout1
1534	aesenc		$rndkey1,$inout2
1535	aesenc		$rndkey1,$inout3
1536	aesenc		$rndkey1,$inout4
1537	aesenc		$rndkey1,$inout5
1538	aesenc		$rndkey1,$inout6
1539	aesenc		$rndkey1,$inout7
1540	movdqu		0x60($inp),$rndkey1	# borrow $rndkey1 for inp[6]
1541	lea		0x80($inp),$inp		# $inp+=8*16
1542
1543	aesenclast	$in0,$inout0		# $inN is inp[N]^round[last]
1544	pxor		$rndkey0,$rndkey1	# borrowed $rndkey
1545	movdqu		0x70-0x80($inp),$in0
1546	aesenclast	$in1,$inout1
1547	pxor		$rndkey0,$in0
1548	movdqa		0x00(%rsp),$in1		# load next counter block
1549	aesenclast	$in2,$inout2
1550	aesenclast	$in3,$inout3
1551	movdqa		0x10(%rsp),$in2
1552	movdqa		0x20(%rsp),$in3
1553	aesenclast	$in4,$inout4
1554	aesenclast	$in5,$inout5
1555	movdqa		0x30(%rsp),$in4
1556	movdqa		0x40(%rsp),$in5
1557	aesenclast	$rndkey1,$inout6
1558	movdqa		0x50(%rsp),$rndkey0
1559	$movkey		0x10-0x80($key),$rndkey1#real 1st-round key
1560	aesenclast	$in0,$inout7
1561
1562	movups		$inout0,($out)		# store 8 output blocks
1563	movdqa		$in1,$inout0
1564	movups		$inout1,0x10($out)
1565	movdqa		$in2,$inout1
1566	movups		$inout2,0x20($out)
1567	movdqa		$in3,$inout2
1568	movups		$inout3,0x30($out)
1569	movdqa		$in4,$inout3
1570	movups		$inout4,0x40($out)
1571	movdqa		$in5,$inout4
1572	movups		$inout5,0x50($out)
1573	movdqa		$rndkey0,$inout5
1574	movups		$inout6,0x60($out)
1575	movups		$inout7,0x70($out)
1576	lea		0x80($out),$out		# $out+=8*16
1577
1578	sub	\$8,$len
1579	jnc	.Lctr32_loop8			# loop if $len-=8 didn't borrow
1580
1581	add	\$8,$len			# restore real remaining $len
1582	jz	.Lctr32_done			# done if ($len==0)
1583	lea	-0x80($key),$key
1584
1585.Lctr32_tail:
1586	# note that at this point $inout0..5 are populated with
1587	# counter values xor-ed with 0-round key
1588	lea	16($key),$key
1589	cmp	\$4,$len
1590	jb	.Lctr32_loop3
1591	je	.Lctr32_loop4
1592
1593	# if ($len>4) compute 7 E(counter)
1594	shl		\$4,$rounds
1595	movdqa		0x60(%rsp),$inout6
1596	pxor		$inout7,$inout7
1597
1598	$movkey		16($key),$rndkey0
1599	aesenc		$rndkey1,$inout0
1600	aesenc		$rndkey1,$inout1
1601	lea		32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1602	neg		%rax
1603	aesenc		$rndkey1,$inout2
1604	add		\$16,%rax		# prepare for .Lenc_loop8_enter
1605	 movups		($inp),$in0
1606	aesenc		$rndkey1,$inout3
1607	aesenc		$rndkey1,$inout4
1608	 movups		0x10($inp),$in1		# pre-load input
1609	 movups		0x20($inp),$in2
1610	aesenc		$rndkey1,$inout5
1611	aesenc		$rndkey1,$inout6
1612
1613	call            .Lenc_loop8_enter
1614
1615	movdqu	0x30($inp),$in3
1616	pxor	$in0,$inout0
1617	movdqu	0x40($inp),$in0
1618	pxor	$in1,$inout1
1619	movdqu	$inout0,($out)			# store output
1620	pxor	$in2,$inout2
1621	movdqu	$inout1,0x10($out)
1622	pxor	$in3,$inout3
1623	movdqu	$inout2,0x20($out)
1624	pxor	$in0,$inout4
1625	movdqu	$inout3,0x30($out)
1626	movdqu	$inout4,0x40($out)
1627	cmp	\$6,$len
1628	jb	.Lctr32_done			# $len was 5, stop store
1629
1630	movups	0x50($inp),$in1
1631	xorps	$in1,$inout5
1632	movups	$inout5,0x50($out)
1633	je	.Lctr32_done			# $len was 6, stop store
1634
1635	movups	0x60($inp),$in2
1636	xorps	$in2,$inout6
1637	movups	$inout6,0x60($out)
1638	jmp	.Lctr32_done			# $len was 7, stop store
1639
1640.align	32
1641.Lctr32_loop4:
1642	aesenc		$rndkey1,$inout0
1643	lea		16($key),$key
1644	dec		$rounds
1645	aesenc		$rndkey1,$inout1
1646	aesenc		$rndkey1,$inout2
1647	aesenc		$rndkey1,$inout3
1648	$movkey		($key),$rndkey1
1649	jnz		.Lctr32_loop4
1650	aesenclast	$rndkey1,$inout0
1651	aesenclast	$rndkey1,$inout1
1652	 movups		($inp),$in0		# load input
1653	 movups		0x10($inp),$in1
1654	aesenclast	$rndkey1,$inout2
1655	aesenclast	$rndkey1,$inout3
1656	 movups		0x20($inp),$in2
1657	 movups		0x30($inp),$in3
1658
1659	xorps	$in0,$inout0
1660	movups	$inout0,($out)			# store output
1661	xorps	$in1,$inout1
1662	movups	$inout1,0x10($out)
1663	pxor	$in2,$inout2
1664	movdqu	$inout2,0x20($out)
1665	pxor	$in3,$inout3
1666	movdqu	$inout3,0x30($out)
1667	jmp	.Lctr32_done			# $len was 4, stop store
1668
1669.align	32
1670.Lctr32_loop3:
1671	aesenc		$rndkey1,$inout0
1672	lea		16($key),$key
1673	dec		$rounds
1674	aesenc		$rndkey1,$inout1
1675	aesenc		$rndkey1,$inout2
1676	$movkey		($key),$rndkey1
1677	jnz		.Lctr32_loop3
1678	aesenclast	$rndkey1,$inout0
1679	aesenclast	$rndkey1,$inout1
1680	aesenclast	$rndkey1,$inout2
1681
1682	movups	($inp),$in0			# load input
1683	xorps	$in0,$inout0
1684	movups	$inout0,($out)			# store output
1685	cmp	\$2,$len
1686	jb	.Lctr32_done			# $len was 1, stop store
1687
1688	movups	0x10($inp),$in1
1689	xorps	$in1,$inout1
1690	movups	$inout1,0x10($out)
1691	je	.Lctr32_done			# $len was 2, stop store
1692
1693	movups	0x20($inp),$in2
1694	xorps	$in2,$inout2
1695	movups	$inout2,0x20($out)		# $len was 3, stop store
1696
1697.Lctr32_done:
1698	xorps	%xmm0,%xmm0			# clear register bank
1699	xor	$key0,$key0
1700	pxor	%xmm1,%xmm1
1701	pxor	%xmm2,%xmm2
1702	pxor	%xmm3,%xmm3
1703	pxor	%xmm4,%xmm4
1704	pxor	%xmm5,%xmm5
1705___
# Non-Win64 targets: also zero %xmm6-%xmm15 and overwrite the 0x80-byte
# counter area on the stack with the zeroed %xmm0.
1706$code.=<<___ if (!$win64);
1707	pxor	%xmm6,%xmm6
1708	pxor	%xmm7,%xmm7
1709	movaps	%xmm0,0x00(%rsp)		# clear stack
1710	pxor	%xmm8,%xmm8
1711	movaps	%xmm0,0x10(%rsp)
1712	pxor	%xmm9,%xmm9
1713	movaps	%xmm0,0x20(%rsp)
1714	pxor	%xmm10,%xmm10
1715	movaps	%xmm0,0x30(%rsp)
1716	pxor	%xmm11,%xmm11
1717	movaps	%xmm0,0x40(%rsp)
1718	pxor	%xmm12,%xmm12
1719	movaps	%xmm0,0x50(%rsp)
1720	pxor	%xmm13,%xmm13
1721	movaps	%xmm0,0x60(%rsp)
1722	pxor	%xmm14,%xmm14
1723	movaps	%xmm0,0x70(%rsp)
1724	pxor	%xmm15,%xmm15
1725___
# Win64 targets: restore %xmm6-%xmm15 from the save area, then overwrite
# both the save slots and the 0x80-byte counter area with the zeroed %xmm0.
1726$code.=<<___ if ($win64);
1727	movaps	-0xa8($key_),%xmm6
1728	movaps	%xmm0,-0xa8($key_)		# clear stack
1729	movaps	-0x98($key_),%xmm7
1730	movaps	%xmm0,-0x98($key_)
1731	movaps	-0x88($key_),%xmm8
1732	movaps	%xmm0,-0x88($key_)
1733	movaps	-0x78($key_),%xmm9
1734	movaps	%xmm0,-0x78($key_)
1735	movaps	-0x68($key_),%xmm10
1736	movaps	%xmm0,-0x68($key_)
1737	movaps	-0x58($key_),%xmm11
1738	movaps	%xmm0,-0x58($key_)
1739	movaps	-0x48($key_),%xmm12
1740	movaps	%xmm0,-0x48($key_)
1741	movaps	-0x38($key_),%xmm13
1742	movaps	%xmm0,-0x38($key_)
1743	movaps	-0x28($key_),%xmm14
1744	movaps	%xmm0,-0x28($key_)
1745	movaps	-0x18($key_),%xmm15
1746	movaps	%xmm0,-0x18($key_)
1747	movaps	%xmm0,0x00(%rsp)
1748	movaps	%xmm0,0x10(%rsp)
1749	movaps	%xmm0,0x20(%rsp)
1750	movaps	%xmm0,0x30(%rsp)
1751	movaps	%xmm0,0x40(%rsp)
1752	movaps	%xmm0,0x50(%rsp)
1753	movaps	%xmm0,0x60(%rsp)
1754	movaps	%xmm0,0x70(%rsp)
1755___
# Common exit: reload %rbp from just below the frame pointer, restore %rsp
# from the frame pointer and return.  .Lctr32_epilogue is also the target
# of the one-block fast path, which never set up a frame.
1756$code.=<<___;
1757	mov	-8($key_),%rbp
1758.cfi_restore	%rbp
1759	lea	($key_),%rsp
1760.cfi_def_cfa_register	%rsp
1761.Lctr32_epilogue:
1762	ret
1763.cfi_endproc
1764.size	${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks
1765___
1766}
1767
1768######################################################################
1769# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1770#	const AES_KEY *key1, const AES_KEY *key2,
1771#	const unsigned char iv[16]);
1772#
1773if (0) {  # Omit these functions in BoringSSL
1774my @tweak=map("%xmm$_",(10..15));
1775my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1776my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1777my $frame_size = 0x70 + ($win64?160:0);
1778my $key_ = "%rbp";	# override so that we can use %r11 as FP
1779
1780$code.=<<___;
1781.globl	${PREFIX}_xts_encrypt
1782.type	${PREFIX}_xts_encrypt,\@function,6
1783.align	16
1784${PREFIX}_xts_encrypt:
1785.cfi_startproc
1786	lea	(%rsp),%r11			# frame pointer
1787.cfi_def_cfa_register	%r11
1788	push	%rbp
1789.cfi_push	%rbp
1790	sub	\$$frame_size,%rsp
1791	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1792___
1793$code.=<<___ if ($win64);
1794	movaps	%xmm6,-0xa8(%r11)		# offload everything
1795	movaps	%xmm7,-0x98(%r11)
1796	movaps	%xmm8,-0x88(%r11)
1797	movaps	%xmm9,-0x78(%r11)
1798	movaps	%xmm10,-0x68(%r11)
1799	movaps	%xmm11,-0x58(%r11)
1800	movaps	%xmm12,-0x48(%r11)
1801	movaps	%xmm13,-0x38(%r11)
1802	movaps	%xmm14,-0x28(%r11)
1803	movaps	%xmm15,-0x18(%r11)
1804.Lxts_enc_body:
1805___
1806$code.=<<___;
1807	movups	($ivp),$inout0			# load clear-text tweak
1808	mov	240(%r8),$rounds		# key2->rounds
1809	mov	240($key),$rnds_		# key1->rounds
1810___
1811	# generate the tweak
1812	&aesni_generate1("enc",$key2,$rounds,$inout0);
1813$code.=<<___;
1814	$movkey	($key),$rndkey0			# zero round key
1815	mov	$key,$key_			# backup $key
1816	mov	$rnds_,$rounds			# backup $rounds
1817	shl	\$4,$rnds_
1818	mov	$len,$len_			# backup $len
1819	and	\$-16,$len
1820
1821	$movkey	16($key,$rnds_),$rndkey1	# last round key
1822
1823	movdqa	.Lxts_magic(%rip),$twmask
1824	movdqa	$inout0,@tweak[5]
1825	pshufd	\$0x5f,$inout0,$twres
1826	pxor	$rndkey0,$rndkey1
1827___
1828    # alternative tweak calculation algorithm is based on suggestions
1829    # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1830    # and should help in the future...
1831    for ($i=0;$i<4;$i++) {
1832    $code.=<<___;
1833	movdqa	$twres,$twtmp
1834	paddd	$twres,$twres
1835	movdqa	@tweak[5],@tweak[$i]
1836	psrad	\$31,$twtmp			# broadcast upper bits
1837	paddq	@tweak[5],@tweak[5]
1838	pand	$twmask,$twtmp
1839	pxor	$rndkey0,@tweak[$i]
1840	pxor	$twtmp,@tweak[5]
1841___
1842    }
1843$code.=<<___;
1844	movdqa	@tweak[5],@tweak[4]
1845	psrad	\$31,$twres
1846	paddq	@tweak[5],@tweak[5]
1847	pand	$twmask,$twres
1848	pxor	$rndkey0,@tweak[4]
1849	pxor	$twres,@tweak[5]
1850	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
1851
1852	sub	\$16*6,$len
1853	jc	.Lxts_enc_short			# if $len-=6*16 borrowed
1854
1855	mov	\$16+96,$rounds
1856	lea	32($key_,$rnds_),$key		# end of key schedule
1857	sub	%r10,%rax			# twisted $rounds
1858	$movkey	16($key_),$rndkey1
1859	mov	%rax,%r10			# backup twisted $rounds
1860	lea	.Lxts_magic(%rip),%r8
1861	jmp	.Lxts_enc_grandloop
1862
1863.align	32
1864.Lxts_enc_grandloop:
1865	movdqu	`16*0`($inp),$inout0		# load input
1866	movdqa	$rndkey0,$twmask
1867	movdqu	`16*1`($inp),$inout1
1868	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
1869	movdqu	`16*2`($inp),$inout2
1870	pxor	@tweak[1],$inout1
1871	 aesenc		$rndkey1,$inout0
1872	movdqu	`16*3`($inp),$inout3
1873	pxor	@tweak[2],$inout2
1874	 aesenc		$rndkey1,$inout1
1875	movdqu	`16*4`($inp),$inout4
1876	pxor	@tweak[3],$inout3
1877	 aesenc		$rndkey1,$inout2
1878	movdqu	`16*5`($inp),$inout5
1879	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
1880	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
1881	pxor	@tweak[4],$inout4
1882	 aesenc		$rndkey1,$inout3
1883	$movkey	32($key_),$rndkey0
1884	lea	`16*6`($inp),$inp
1885	pxor	$twmask,$inout5
1886
1887	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
1888	aesenc		$rndkey1,$inout4
1889	 pxor	$twres,@tweak[1]
1890	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^round[last]
1891	aesenc		$rndkey1,$inout5
1892	$movkey		48($key_),$rndkey1
1893	 pxor	$twres,@tweak[2]
1894
1895	aesenc		$rndkey0,$inout0
1896	 pxor	$twres,@tweak[3]
1897	 movdqa	@tweak[1],`16*1`(%rsp)
1898	aesenc		$rndkey0,$inout1
1899	 pxor	$twres,@tweak[4]
1900	 movdqa	@tweak[2],`16*2`(%rsp)
1901	aesenc		$rndkey0,$inout2
1902	aesenc		$rndkey0,$inout3
1903	 pxor	$twres,$twmask
1904	 movdqa	@tweak[4],`16*4`(%rsp)
1905	aesenc		$rndkey0,$inout4
1906	aesenc		$rndkey0,$inout5
1907	$movkey		64($key_),$rndkey0
1908	 movdqa	$twmask,`16*5`(%rsp)
1909	pshufd	\$0x5f,@tweak[5],$twres
1910	jmp	.Lxts_enc_loop6
1911.align	32
1912.Lxts_enc_loop6:
1913	aesenc		$rndkey1,$inout0
1914	aesenc		$rndkey1,$inout1
1915	aesenc		$rndkey1,$inout2
1916	aesenc		$rndkey1,$inout3
1917	aesenc		$rndkey1,$inout4
1918	aesenc		$rndkey1,$inout5
1919	$movkey		-64($key,%rax),$rndkey1
1920	add		\$32,%rax
1921
1922	aesenc		$rndkey0,$inout0
1923	aesenc		$rndkey0,$inout1
1924	aesenc		$rndkey0,$inout2
1925	aesenc		$rndkey0,$inout3
1926	aesenc		$rndkey0,$inout4
1927	aesenc		$rndkey0,$inout5
1928	$movkey		-80($key,%rax),$rndkey0
1929	jnz		.Lxts_enc_loop6
1930
1931	movdqa	(%r8),$twmask			# start calculating next tweak
1932	movdqa	$twres,$twtmp
1933	paddd	$twres,$twres
1934	 aesenc		$rndkey1,$inout0
1935	paddq	@tweak[5],@tweak[5]
1936	psrad	\$31,$twtmp
1937	 aesenc		$rndkey1,$inout1
1938	pand	$twmask,$twtmp
1939	$movkey	($key_),@tweak[0]		# load round[0]
1940	 aesenc		$rndkey1,$inout2
1941	 aesenc		$rndkey1,$inout3
1942	 aesenc		$rndkey1,$inout4
1943	pxor	$twtmp,@tweak[5]
1944	movaps	@tweak[0],@tweak[1]		# copy round[0]
1945	 aesenc		$rndkey1,$inout5
1946	 $movkey	-64($key),$rndkey1
1947
1948	movdqa	$twres,$twtmp
1949	 aesenc		$rndkey0,$inout0
1950	paddd	$twres,$twres
1951	pxor	@tweak[5],@tweak[0]
1952	 aesenc		$rndkey0,$inout1
1953	psrad	\$31,$twtmp
1954	paddq	@tweak[5],@tweak[5]
1955	 aesenc		$rndkey0,$inout2
1956	 aesenc		$rndkey0,$inout3
1957	pand	$twmask,$twtmp
1958	movaps	@tweak[1],@tweak[2]
1959	 aesenc		$rndkey0,$inout4
1960	pxor	$twtmp,@tweak[5]
1961	movdqa	$twres,$twtmp
1962	 aesenc		$rndkey0,$inout5
1963	 $movkey	-48($key),$rndkey0
1964
1965	paddd	$twres,$twres
1966	 aesenc		$rndkey1,$inout0
1967	pxor	@tweak[5],@tweak[1]
1968	psrad	\$31,$twtmp
1969	 aesenc		$rndkey1,$inout1
1970	paddq	@tweak[5],@tweak[5]
1971	pand	$twmask,$twtmp
1972	 aesenc		$rndkey1,$inout2
1973	 aesenc		$rndkey1,$inout3
1974	 movdqa	@tweak[3],`16*3`(%rsp)
1975	pxor	$twtmp,@tweak[5]
1976	 aesenc		$rndkey1,$inout4
1977	movaps	@tweak[2],@tweak[3]
1978	movdqa	$twres,$twtmp
1979	 aesenc		$rndkey1,$inout5
1980	 $movkey	-32($key),$rndkey1
1981
1982	paddd	$twres,$twres
1983	 aesenc		$rndkey0,$inout0
1984	pxor	@tweak[5],@tweak[2]
1985	psrad	\$31,$twtmp
1986	 aesenc		$rndkey0,$inout1
1987	paddq	@tweak[5],@tweak[5]
1988	pand	$twmask,$twtmp
1989	 aesenc		$rndkey0,$inout2
1990	 aesenc		$rndkey0,$inout3
1991	 aesenc		$rndkey0,$inout4
1992	pxor	$twtmp,@tweak[5]
1993	movaps	@tweak[3],@tweak[4]
1994	 aesenc		$rndkey0,$inout5
1995
1996	movdqa	$twres,$rndkey0
1997	paddd	$twres,$twres
1998	 aesenc		$rndkey1,$inout0
1999	pxor	@tweak[5],@tweak[3]
2000	psrad	\$31,$rndkey0
2001	 aesenc		$rndkey1,$inout1
2002	paddq	@tweak[5],@tweak[5]
2003	pand	$twmask,$rndkey0
2004	 aesenc		$rndkey1,$inout2
2005	 aesenc		$rndkey1,$inout3
2006	pxor	$rndkey0,@tweak[5]
2007	$movkey		($key_),$rndkey0
2008	 aesenc		$rndkey1,$inout4
2009	 aesenc		$rndkey1,$inout5
2010	$movkey		16($key_),$rndkey1
2011
2012	pxor	@tweak[5],@tweak[4]
2013	 aesenclast	`16*0`(%rsp),$inout0
2014	psrad	\$31,$twres
2015	paddq	@tweak[5],@tweak[5]
2016	 aesenclast	`16*1`(%rsp),$inout1
2017	 aesenclast	`16*2`(%rsp),$inout2
2018	pand	$twmask,$twres
2019	mov	%r10,%rax			# restore $rounds
2020	 aesenclast	`16*3`(%rsp),$inout3
2021	 aesenclast	`16*4`(%rsp),$inout4
2022	 aesenclast	`16*5`(%rsp),$inout5
2023	pxor	$twres,@tweak[5]
2024
2025	lea	`16*6`($out),$out		# $out+=6*16
2026	movups	$inout0,`-16*6`($out)		# store 6 output blocks
2027	movups	$inout1,`-16*5`($out)
2028	movups	$inout2,`-16*4`($out)
2029	movups	$inout3,`-16*3`($out)
2030	movups	$inout4,`-16*2`($out)
2031	movups	$inout5,`-16*1`($out)
2032	sub	\$16*6,$len
2033	jnc	.Lxts_enc_grandloop		# loop if $len-=6*16 didn't borrow
2034
2035	mov	\$16+96,$rounds
2036	sub	$rnds_,$rounds
2037	mov	$key_,$key			# restore $key
2038	shr	\$4,$rounds			# restore original value
2039
2040.Lxts_enc_short:
2041	# at the point @tweak[0..5] are populated with tweak values
2042	mov	$rounds,$rnds_			# backup $rounds
2043	pxor	$rndkey0,@tweak[0]
2044	add	\$16*6,$len			# restore real remaining $len
2045	jz	.Lxts_enc_done			# done if ($len==0)
2046
2047	pxor	$rndkey0,@tweak[1]
2048	cmp	\$0x20,$len
2049	jb	.Lxts_enc_one			# $len is 1*16
2050	pxor	$rndkey0,@tweak[2]
2051	je	.Lxts_enc_two			# $len is 2*16
2052
2053	pxor	$rndkey0,@tweak[3]
2054	cmp	\$0x40,$len
2055	jb	.Lxts_enc_three			# $len is 3*16
2056	pxor	$rndkey0,@tweak[4]
2057	je	.Lxts_enc_four			# $len is 4*16
2058
2059	movdqu	($inp),$inout0			# $len is 5*16
2060	movdqu	16*1($inp),$inout1
2061	movdqu	16*2($inp),$inout2
2062	pxor	@tweak[0],$inout0
2063	movdqu	16*3($inp),$inout3
2064	pxor	@tweak[1],$inout1
2065	movdqu	16*4($inp),$inout4
2066	lea	16*5($inp),$inp			# $inp+=5*16
2067	pxor	@tweak[2],$inout2
2068	pxor	@tweak[3],$inout3
2069	pxor	@tweak[4],$inout4
2070	pxor	$inout5,$inout5
2071
2072	call	_aesni_encrypt6
2073
2074	xorps	@tweak[0],$inout0
2075	movdqa	@tweak[5],@tweak[0]
2076	xorps	@tweak[1],$inout1
2077	xorps	@tweak[2],$inout2
2078	movdqu	$inout0,($out)			# store 5 output blocks
2079	xorps	@tweak[3],$inout3
2080	movdqu	$inout1,16*1($out)
2081	xorps	@tweak[4],$inout4
2082	movdqu	$inout2,16*2($out)
2083	movdqu	$inout3,16*3($out)
2084	movdqu	$inout4,16*4($out)
2085	lea	16*5($out),$out			# $out+=5*16
2086	jmp	.Lxts_enc_done
2087
2088.align	16
2089.Lxts_enc_one:
2090	movups	($inp),$inout0
2091	lea	16*1($inp),$inp			# inp+=1*16
2092	xorps	@tweak[0],$inout0
2093___
2094	&aesni_generate1("enc",$key,$rounds);
2095$code.=<<___;
2096	xorps	@tweak[0],$inout0
2097	movdqa	@tweak[1],@tweak[0]
2098	movups	$inout0,($out)			# store one output block
2099	lea	16*1($out),$out			# $out+=1*16
2100	jmp	.Lxts_enc_done
2101
2102.align	16
2103.Lxts_enc_two:
2104	movups	($inp),$inout0
2105	movups	16($inp),$inout1
2106	lea	32($inp),$inp			# $inp+=2*16
2107	xorps	@tweak[0],$inout0
2108	xorps	@tweak[1],$inout1
2109
2110	call	_aesni_encrypt2
2111
2112	xorps	@tweak[0],$inout0
2113	movdqa	@tweak[2],@tweak[0]
2114	xorps	@tweak[1],$inout1
2115	movups	$inout0,($out)			# store 2 output blocks
2116	movups	$inout1,16*1($out)
2117	lea	16*2($out),$out			# $out+=2*16
2118	jmp	.Lxts_enc_done
2119
2120.align	16
2121.Lxts_enc_three:
2122	movups	($inp),$inout0
2123	movups	16*1($inp),$inout1
2124	movups	16*2($inp),$inout2
2125	lea	16*3($inp),$inp			# $inp+=3*16
2126	xorps	@tweak[0],$inout0
2127	xorps	@tweak[1],$inout1
2128	xorps	@tweak[2],$inout2
2129
2130	call	_aesni_encrypt3
2131
2132	xorps	@tweak[0],$inout0
2133	movdqa	@tweak[3],@tweak[0]
2134	xorps	@tweak[1],$inout1
2135	xorps	@tweak[2],$inout2
2136	movups	$inout0,($out)			# store 3 output blocks
2137	movups	$inout1,16*1($out)
2138	movups	$inout2,16*2($out)
2139	lea	16*3($out),$out			# $out+=3*16
2140	jmp	.Lxts_enc_done
2141
2142.align	16
2143.Lxts_enc_four:
2144	movups	($inp),$inout0
2145	movups	16*1($inp),$inout1
2146	movups	16*2($inp),$inout2
2147	xorps	@tweak[0],$inout0
2148	movups	16*3($inp),$inout3
2149	lea	16*4($inp),$inp			# $inp+=4*16
2150	xorps	@tweak[1],$inout1
2151	xorps	@tweak[2],$inout2
2152	xorps	@tweak[3],$inout3
2153
2154	call	_aesni_encrypt4
2155
2156	pxor	@tweak[0],$inout0
2157	movdqa	@tweak[4],@tweak[0]
2158	pxor	@tweak[1],$inout1
2159	pxor	@tweak[2],$inout2
2160	movdqu	$inout0,($out)			# store 4 output blocks
2161	pxor	@tweak[3],$inout3
2162	movdqu	$inout1,16*1($out)
2163	movdqu	$inout2,16*2($out)
2164	movdqu	$inout3,16*3($out)
2165	lea	16*4($out),$out			# $out+=4*16
2166	jmp	.Lxts_enc_done
2167
2168.align	16
2169.Lxts_enc_done:
2170	and	\$15,$len_			# see if $len%16 is 0
2171	jz	.Lxts_enc_ret
2172	mov	$len_,$len
2173
2174.Lxts_enc_steal:
2175	movzb	($inp),%eax			# borrow $rounds ...
2176	movzb	-16($out),%ecx			# ... and $key
2177	lea	1($inp),$inp
2178	mov	%al,-16($out)
2179	mov	%cl,0($out)
2180	lea	1($out),$out
2181	sub	\$1,$len
2182	jnz	.Lxts_enc_steal
2183
2184	sub	$len_,$out			# rewind $out
2185	mov	$key_,$key			# restore $key
2186	mov	$rnds_,$rounds			# restore $rounds
2187
2188	movups	-16($out),$inout0
2189	xorps	@tweak[0],$inout0
2190___
2191	&aesni_generate1("enc",$key,$rounds);
2192$code.=<<___;
2193	xorps	@tweak[0],$inout0
2194	movups	$inout0,-16($out)
2195
2196.Lxts_enc_ret:
2197	xorps	%xmm0,%xmm0			# clear register bank
2198	pxor	%xmm1,%xmm1
2199	pxor	%xmm2,%xmm2
2200	pxor	%xmm3,%xmm3
2201	pxor	%xmm4,%xmm4
2202	pxor	%xmm5,%xmm5
2203___
2204$code.=<<___ if (!$win64);
2205	pxor	%xmm6,%xmm6
2206	pxor	%xmm7,%xmm7
2207	movaps	%xmm0,0x00(%rsp)		# clear stack
2208	pxor	%xmm8,%xmm8
2209	movaps	%xmm0,0x10(%rsp)
2210	pxor	%xmm9,%xmm9
2211	movaps	%xmm0,0x20(%rsp)
2212	pxor	%xmm10,%xmm10
2213	movaps	%xmm0,0x30(%rsp)
2214	pxor	%xmm11,%xmm11
2215	movaps	%xmm0,0x40(%rsp)
2216	pxor	%xmm12,%xmm12
2217	movaps	%xmm0,0x50(%rsp)
2218	pxor	%xmm13,%xmm13
2219	movaps	%xmm0,0x60(%rsp)
2220	pxor	%xmm14,%xmm14
2221	pxor	%xmm15,%xmm15
2222___
2223$code.=<<___ if ($win64);
2224	movaps	-0xa8(%r11),%xmm6
2225	movaps	%xmm0,-0xa8(%r11)		# clear stack
2226	movaps	-0x98(%r11),%xmm7
2227	movaps	%xmm0,-0x98(%r11)
2228	movaps	-0x88(%r11),%xmm8
2229	movaps	%xmm0,-0x88(%r11)
2230	movaps	-0x78(%r11),%xmm9
2231	movaps	%xmm0,-0x78(%r11)
2232	movaps	-0x68(%r11),%xmm10
2233	movaps	%xmm0,-0x68(%r11)
2234	movaps	-0x58(%r11),%xmm11
2235	movaps	%xmm0,-0x58(%r11)
2236	movaps	-0x48(%r11),%xmm12
2237	movaps	%xmm0,-0x48(%r11)
2238	movaps	-0x38(%r11),%xmm13
2239	movaps	%xmm0,-0x38(%r11)
2240	movaps	-0x28(%r11),%xmm14
2241	movaps	%xmm0,-0x28(%r11)
2242	movaps	-0x18(%r11),%xmm15
2243	movaps	%xmm0,-0x18(%r11)
2244	movaps	%xmm0,0x00(%rsp)
2245	movaps	%xmm0,0x10(%rsp)
2246	movaps	%xmm0,0x20(%rsp)
2247	movaps	%xmm0,0x30(%rsp)
2248	movaps	%xmm0,0x40(%rsp)
2249	movaps	%xmm0,0x50(%rsp)
2250	movaps	%xmm0,0x60(%rsp)
2251___
2252$code.=<<___;
2253	mov	-8(%r11),%rbp
2254.cfi_restore	%rbp
2255	lea	(%r11),%rsp
2256.cfi_def_cfa_register	%rsp
2257.Lxts_enc_epilogue:
2258	ret
2259.cfi_endproc
2260.size	${PREFIX}_xts_encrypt,.-${PREFIX}_xts_encrypt
2261___
2262
# Emit the entry of ${PREFIX}_xts_decrypt.
# Prologue: %r11 captures the entry %rsp and serves as frame pointer, %rbp
# is pushed, $frame_size bytes of scratch are reserved and %rsp is rounded
# down to a multiple of 16 (the body accesses (%rsp) slots with
# movaps/movdqa).
2263$code.=<<___;
2264.globl	${PREFIX}_xts_decrypt
2265.type	${PREFIX}_xts_decrypt,\@function,6
2266.align	16
2267${PREFIX}_xts_decrypt:
2268.cfi_startproc
2269	lea	(%rsp),%r11			# frame pointer
2270.cfi_def_cfa_register	%r11
2271	push	%rbp
2272.cfi_push	%rbp
2273	sub	\$$frame_size,%rsp
2274	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
2275___
# Win64 only: spill %xmm6-%xmm15 into the frame at fixed negative offsets
# from %r11.  The Win64 branch of the epilogue reloads them from the same
# offsets (-0xa8 .. -0x18).
2276$code.=<<___ if ($win64);
2277	movaps	%xmm6,-0xa8(%r11)		# offload everything
2278	movaps	%xmm7,-0x98(%r11)
2279	movaps	%xmm8,-0x88(%r11)
2280	movaps	%xmm9,-0x78(%r11)
2281	movaps	%xmm10,-0x68(%r11)
2282	movaps	%xmm11,-0x58(%r11)
2283	movaps	%xmm12,-0x48(%r11)
2284	movaps	%xmm13,-0x38(%r11)
2285	movaps	%xmm14,-0x28(%r11)
2286	movaps	%xmm15,-0x18(%r11)
2287.Lxts_dec_body:
2288___
# Load the 16-byte clear-text tweak from ($ivp) into $inout0 and read the
# round counts stored at offset 240 of both key schedules.
2289$code.=<<___;
2290	movups	($ivp),$inout0			# load clear-text tweak
2291	mov	240($key2),$rounds		# key2->rounds
2292	mov	240($key),$rnds_		# key1->rounds
2293___
2294	# generate the tweak
# NOTE(review): aesni_generate1 is defined outside this chunk; presumably it
# emits a one-block AES pass, here encrypting $inout0 with $key2 -- confirm.
2295	&aesni_generate1("enc",$key2,$rounds,$inout0);
# Length and key setup:
#  - when $len is not a multiple of 16, 16 is subtracted so one whole block
#    is held back for the .Lxts_dec_steal path; the adjusted length is kept
#    in $len_ and $len is then truncated to a multiple of 16;
#  - $key_ keeps the key pointer, $rnds_ becomes rounds*16 (a byte offset);
#  - $rndkey0 = round[0], $rndkey1 = round[0]^round[last];
#  - tweak state: $twmask from .Lxts_magic, @tweak[5] = current tweak,
#    $twres = pshufd 0x5f of the tweak (source of the carry bits used by the
#    doubling steps below).
2296$code.=<<___;
2297	xor	%eax,%eax			# if ($len%16) len-=16;
2298	test	\$15,$len
2299	setnz	%al
2300	shl	\$4,%rax
2301	sub	%rax,$len
2302
2303	$movkey	($key),$rndkey0			# zero round key
2304	mov	$key,$key_			# backup $key
2305	mov	$rnds_,$rounds			# backup $rounds
2306	shl	\$4,$rnds_
2307	mov	$len,$len_			# backup $len
2308	and	\$-16,$len
2309
2310	$movkey	16($key,$rnds_),$rndkey1	# last round key
2311
2312	movdqa	.Lxts_magic(%rip),$twmask
2313	movdqa	$inout0,@tweak[5]
2314	pshufd	\$0x5f,$inout0,$twres
2315	pxor	$rndkey0,$rndkey1
2316___
# Emit four unrolled tweak steps.  Each step copies the running tweak into
# @tweak[$i] (pre-xored with round[0]) and then advances @tweak[5]: it is
# doubled with paddq and the $twmask-selected bits derived from $twres are
# xored in.  Presumably this is the XTS multiply-by-alpha step; the
# .Lxts_magic constant itself is not visible in this chunk.
2317    for ($i=0;$i<4;$i++) {
2318    $code.=<<___;
2319	movdqa	$twres,$twtmp
2320	paddd	$twres,$twres
2321	movdqa	@tweak[5],@tweak[$i]
2322	psrad	\$31,$twtmp			# broadcast upper bits
2323	paddq	@tweak[5],@tweak[5]
2324	pand	$twmask,$twtmp
2325	pxor	$rndkey0,@tweak[$i]
2326	pxor	$twtmp,@tweak[5]
2327___
2328    }
# Main body of xts_decrypt (one heredoc):
#  - finishes @tweak[4] and the following @tweak[5], and parks
#    round[0]^round[last] at 0x60(%rsp);
#  - .Lxts_dec_grandloop handles 6 blocks per iteration.  Inputs are xored
#    with tweak^round[0]; the tweak^round[last] values are stored at
#    16*0..16*5(%rsp) and used as the aesdeclast operands, so the final
#    tweak xor is folded into the last AES round.  The next six tweaks are
#    computed interleaved with the trailing aesdec rounds.  %rax holds the
#    "twisted" round counter (backed up in %r10) that drives
#    .Lxts_dec_loop6;
#  - .Lxts_dec_short dispatches on the remaining $len (1..5 whole blocks).
#    The five-block case is handled inline through _aesni_decrypt6; if
#    $len_%16 is non-zero it then sets @tweak[0] to the current tweak and
#    @tweak[1] to the next one before jumping to .Lxts_dec_done2;
#  - the heredoc stops inside .Lxts_dec_one, just before the single-block
#    pass emitted by the aesni_generate1 call that follows.
# Instruction order inside the loops is deliberate scheduling; do not
# reorder.
2329$code.=<<___;
2330	movdqa	@tweak[5],@tweak[4]
2331	psrad	\$31,$twres
2332	paddq	@tweak[5],@tweak[5]
2333	pand	$twmask,$twres
2334	pxor	$rndkey0,@tweak[4]
2335	pxor	$twres,@tweak[5]
2336	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
2337
2338	sub	\$16*6,$len
2339	jc	.Lxts_dec_short			# if $len-=6*16 borrowed
2340
2341	mov	\$16+96,$rounds
2342	lea	32($key_,$rnds_),$key		# end of key schedule
2343	sub	%r10,%rax			# twisted $rounds
2344	$movkey	16($key_),$rndkey1
2345	mov	%rax,%r10			# backup twisted $rounds
2346	lea	.Lxts_magic(%rip),%r8
2347	jmp	.Lxts_dec_grandloop
2348
2349.align	32
2350.Lxts_dec_grandloop:
2351	movdqu	`16*0`($inp),$inout0		# load input
2352	movdqa	$rndkey0,$twmask
2353	movdqu	`16*1`($inp),$inout1
2354	pxor	@tweak[0],$inout0		# intput^=tweak^round[0]
2355	movdqu	`16*2`($inp),$inout2
2356	pxor	@tweak[1],$inout1
2357	 aesdec		$rndkey1,$inout0
2358	movdqu	`16*3`($inp),$inout3
2359	pxor	@tweak[2],$inout2
2360	 aesdec		$rndkey1,$inout1
2361	movdqu	`16*4`($inp),$inout4
2362	pxor	@tweak[3],$inout3
2363	 aesdec		$rndkey1,$inout2
2364	movdqu	`16*5`($inp),$inout5
2365	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
2366	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
2367	pxor	@tweak[4],$inout4
2368	 aesdec		$rndkey1,$inout3
2369	$movkey	32($key_),$rndkey0
2370	lea	`16*6`($inp),$inp
2371	pxor	$twmask,$inout5
2372
2373	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
2374	aesdec		$rndkey1,$inout4
2375	 pxor	$twres,@tweak[1]
2376	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
2377	aesdec		$rndkey1,$inout5
2378	$movkey		48($key_),$rndkey1
2379	 pxor	$twres,@tweak[2]
2380
2381	aesdec		$rndkey0,$inout0
2382	 pxor	$twres,@tweak[3]
2383	 movdqa	@tweak[1],`16*1`(%rsp)
2384	aesdec		$rndkey0,$inout1
2385	 pxor	$twres,@tweak[4]
2386	 movdqa	@tweak[2],`16*2`(%rsp)
2387	aesdec		$rndkey0,$inout2
2388	aesdec		$rndkey0,$inout3
2389	 pxor	$twres,$twmask
2390	 movdqa	@tweak[4],`16*4`(%rsp)
2391	aesdec		$rndkey0,$inout4
2392	aesdec		$rndkey0,$inout5
2393	$movkey		64($key_),$rndkey0
2394	 movdqa	$twmask,`16*5`(%rsp)
2395	pshufd	\$0x5f,@tweak[5],$twres
2396	jmp	.Lxts_dec_loop6
2397.align	32
2398.Lxts_dec_loop6:
2399	aesdec		$rndkey1,$inout0
2400	aesdec		$rndkey1,$inout1
2401	aesdec		$rndkey1,$inout2
2402	aesdec		$rndkey1,$inout3
2403	aesdec		$rndkey1,$inout4
2404	aesdec		$rndkey1,$inout5
2405	$movkey		-64($key,%rax),$rndkey1
2406	add		\$32,%rax
2407
2408	aesdec		$rndkey0,$inout0
2409	aesdec		$rndkey0,$inout1
2410	aesdec		$rndkey0,$inout2
2411	aesdec		$rndkey0,$inout3
2412	aesdec		$rndkey0,$inout4
2413	aesdec		$rndkey0,$inout5
2414	$movkey		-80($key,%rax),$rndkey0
2415	jnz		.Lxts_dec_loop6
2416
2417	movdqa	(%r8),$twmask			# start calculating next tweak
2418	movdqa	$twres,$twtmp
2419	paddd	$twres,$twres
2420	 aesdec		$rndkey1,$inout0
2421	paddq	@tweak[5],@tweak[5]
2422	psrad	\$31,$twtmp
2423	 aesdec		$rndkey1,$inout1
2424	pand	$twmask,$twtmp
2425	$movkey	($key_),@tweak[0]		# load round[0]
2426	 aesdec		$rndkey1,$inout2
2427	 aesdec		$rndkey1,$inout3
2428	 aesdec		$rndkey1,$inout4
2429	pxor	$twtmp,@tweak[5]
2430	movaps	@tweak[0],@tweak[1]		# copy round[0]
2431	 aesdec		$rndkey1,$inout5
2432	 $movkey	-64($key),$rndkey1
2433
2434	movdqa	$twres,$twtmp
2435	 aesdec		$rndkey0,$inout0
2436	paddd	$twres,$twres
2437	pxor	@tweak[5],@tweak[0]
2438	 aesdec		$rndkey0,$inout1
2439	psrad	\$31,$twtmp
2440	paddq	@tweak[5],@tweak[5]
2441	 aesdec		$rndkey0,$inout2
2442	 aesdec		$rndkey0,$inout3
2443	pand	$twmask,$twtmp
2444	movaps	@tweak[1],@tweak[2]
2445	 aesdec		$rndkey0,$inout4
2446	pxor	$twtmp,@tweak[5]
2447	movdqa	$twres,$twtmp
2448	 aesdec		$rndkey0,$inout5
2449	 $movkey	-48($key),$rndkey0
2450
2451	paddd	$twres,$twres
2452	 aesdec		$rndkey1,$inout0
2453	pxor	@tweak[5],@tweak[1]
2454	psrad	\$31,$twtmp
2455	 aesdec		$rndkey1,$inout1
2456	paddq	@tweak[5],@tweak[5]
2457	pand	$twmask,$twtmp
2458	 aesdec		$rndkey1,$inout2
2459	 aesdec		$rndkey1,$inout3
2460	 movdqa	@tweak[3],`16*3`(%rsp)
2461	pxor	$twtmp,@tweak[5]
2462	 aesdec		$rndkey1,$inout4
2463	movaps	@tweak[2],@tweak[3]
2464	movdqa	$twres,$twtmp
2465	 aesdec		$rndkey1,$inout5
2466	 $movkey	-32($key),$rndkey1
2467
2468	paddd	$twres,$twres
2469	 aesdec		$rndkey0,$inout0
2470	pxor	@tweak[5],@tweak[2]
2471	psrad	\$31,$twtmp
2472	 aesdec		$rndkey0,$inout1
2473	paddq	@tweak[5],@tweak[5]
2474	pand	$twmask,$twtmp
2475	 aesdec		$rndkey0,$inout2
2476	 aesdec		$rndkey0,$inout3
2477	 aesdec		$rndkey0,$inout4
2478	pxor	$twtmp,@tweak[5]
2479	movaps	@tweak[3],@tweak[4]
2480	 aesdec		$rndkey0,$inout5
2481
2482	movdqa	$twres,$rndkey0
2483	paddd	$twres,$twres
2484	 aesdec		$rndkey1,$inout0
2485	pxor	@tweak[5],@tweak[3]
2486	psrad	\$31,$rndkey0
2487	 aesdec		$rndkey1,$inout1
2488	paddq	@tweak[5],@tweak[5]
2489	pand	$twmask,$rndkey0
2490	 aesdec		$rndkey1,$inout2
2491	 aesdec		$rndkey1,$inout3
2492	pxor	$rndkey0,@tweak[5]
2493	$movkey		($key_),$rndkey0
2494	 aesdec		$rndkey1,$inout4
2495	 aesdec		$rndkey1,$inout5
2496	$movkey		16($key_),$rndkey1
2497
2498	pxor	@tweak[5],@tweak[4]
2499	 aesdeclast	`16*0`(%rsp),$inout0
2500	psrad	\$31,$twres
2501	paddq	@tweak[5],@tweak[5]
2502	 aesdeclast	`16*1`(%rsp),$inout1
2503	 aesdeclast	`16*2`(%rsp),$inout2
2504	pand	$twmask,$twres
2505	mov	%r10,%rax			# restore $rounds
2506	 aesdeclast	`16*3`(%rsp),$inout3
2507	 aesdeclast	`16*4`(%rsp),$inout4
2508	 aesdeclast	`16*5`(%rsp),$inout5
2509	pxor	$twres,@tweak[5]
2510
2511	lea	`16*6`($out),$out		# $out+=6*16
2512	movups	$inout0,`-16*6`($out)		# store 6 output blocks
2513	movups	$inout1,`-16*5`($out)
2514	movups	$inout2,`-16*4`($out)
2515	movups	$inout3,`-16*3`($out)
2516	movups	$inout4,`-16*2`($out)
2517	movups	$inout5,`-16*1`($out)
2518	sub	\$16*6,$len
2519	jnc	.Lxts_dec_grandloop		# loop if $len-=6*16 didn't borrow
2520
2521	mov	\$16+96,$rounds
2522	sub	$rnds_,$rounds
2523	mov	$key_,$key			# restore $key
2524	shr	\$4,$rounds			# restore original value
2525
2526.Lxts_dec_short:
2527	# at the point @tweak[0..5] are populated with tweak values
2528	mov	$rounds,$rnds_			# backup $rounds
2529	pxor	$rndkey0,@tweak[0]
2530	pxor	$rndkey0,@tweak[1]
2531	add	\$16*6,$len			# restore real remaining $len
2532	jz	.Lxts_dec_done			# done if ($len==0)
2533
2534	pxor	$rndkey0,@tweak[2]
2535	cmp	\$0x20,$len
2536	jb	.Lxts_dec_one			# $len is 1*16
2537	pxor	$rndkey0,@tweak[3]
2538	je	.Lxts_dec_two			# $len is 2*16
2539
2540	pxor	$rndkey0,@tweak[4]
2541	cmp	\$0x40,$len
2542	jb	.Lxts_dec_three			# $len is 3*16
2543	je	.Lxts_dec_four			# $len is 4*16
2544
2545	movdqu	($inp),$inout0			# $len is 5*16
2546	movdqu	16*1($inp),$inout1
2547	movdqu	16*2($inp),$inout2
2548	pxor	@tweak[0],$inout0
2549	movdqu	16*3($inp),$inout3
2550	pxor	@tweak[1],$inout1
2551	movdqu	16*4($inp),$inout4
2552	lea	16*5($inp),$inp			# $inp+=5*16
2553	pxor	@tweak[2],$inout2
2554	pxor	@tweak[3],$inout3
2555	pxor	@tweak[4],$inout4
2556
2557	call	_aesni_decrypt6
2558
2559	xorps	@tweak[0],$inout0
2560	xorps	@tweak[1],$inout1
2561	xorps	@tweak[2],$inout2
2562	movdqu	$inout0,($out)			# store 5 output blocks
2563	xorps	@tweak[3],$inout3
2564	movdqu	$inout1,16*1($out)
2565	xorps	@tweak[4],$inout4
2566	movdqu	$inout2,16*2($out)
2567	 pxor		$twtmp,$twtmp
2568	movdqu	$inout3,16*3($out)
2569	 pcmpgtd	@tweak[5],$twtmp
2570	movdqu	$inout4,16*4($out)
2571	lea	16*5($out),$out			# $out+=5*16
2572	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
2573	and	\$15,$len_
2574	jz	.Lxts_dec_ret
2575
2576	movdqa	@tweak[5],@tweak[0]
2577	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
2578	pand	$twmask,@tweak[1]		# isolate carry and residue
2579	pxor	@tweak[5],@tweak[1]
2580	jmp	.Lxts_dec_done2
2581
2582.align	16
2583.Lxts_dec_one:
2584	movups	($inp),$inout0
2585	lea	16*1($inp),$inp			# $inp+=1*16
2586	xorps	@tweak[0],$inout0
2587___
# Tail handlers for 1-4 remaining whole blocks.
# The aesni_generate1 call completes .Lxts_dec_one (single-block pass on
# $inout0; helper defined outside this chunk).  Each handler xors its
# outputs with the per-block tweaks, stores them, and moves the next two
# unused tweaks into @tweak[0] and @tweak[1], which .Lxts_dec_done2 and the
# stealing code consume when $len_%16 is non-zero.
# .Lxts_dec_done returns early (to .Lxts_dec_ret) when there is no partial
# block; otherwise .Lxts_dec_done2 loads the block at ($inp) and xors it
# with @tweak[1] ahead of the next single-block pass.
2588	&aesni_generate1("dec",$key,$rounds);
2589$code.=<<___;
2590	xorps	@tweak[0],$inout0
2591	movdqa	@tweak[1],@tweak[0]
2592	movups	$inout0,($out)			# store one output block
2593	movdqa	@tweak[2],@tweak[1]
2594	lea	16*1($out),$out			# $out+=1*16
2595	jmp	.Lxts_dec_done
2596
2597.align	16
2598.Lxts_dec_two:
2599	movups	($inp),$inout0
2600	movups	16($inp),$inout1
2601	lea	32($inp),$inp			# $inp+=2*16
2602	xorps	@tweak[0],$inout0
2603	xorps	@tweak[1],$inout1
2604
2605	call	_aesni_decrypt2
2606
2607	xorps	@tweak[0],$inout0
2608	movdqa	@tweak[2],@tweak[0]
2609	xorps	@tweak[1],$inout1
2610	movdqa	@tweak[3],@tweak[1]
2611	movups	$inout0,($out)			# store 2 output blocks
2612	movups	$inout1,16*1($out)
2613	lea	16*2($out),$out			# $out+=2*16
2614	jmp	.Lxts_dec_done
2615
2616.align	16
2617.Lxts_dec_three:
2618	movups	($inp),$inout0
2619	movups	16*1($inp),$inout1
2620	movups	16*2($inp),$inout2
2621	lea	16*3($inp),$inp			# $inp+=3*16
2622	xorps	@tweak[0],$inout0
2623	xorps	@tweak[1],$inout1
2624	xorps	@tweak[2],$inout2
2625
2626	call	_aesni_decrypt3
2627
2628	xorps	@tweak[0],$inout0
2629	movdqa	@tweak[3],@tweak[0]
2630	xorps	@tweak[1],$inout1
2631	movdqa	@tweak[4],@tweak[1]
2632	xorps	@tweak[2],$inout2
2633	movups	$inout0,($out)			# store 3 output blocks
2634	movups	$inout1,16*1($out)
2635	movups	$inout2,16*2($out)
2636	lea	16*3($out),$out			# $out+=3*16
2637	jmp	.Lxts_dec_done
2638
2639.align	16
2640.Lxts_dec_four:
2641	movups	($inp),$inout0
2642	movups	16*1($inp),$inout1
2643	movups	16*2($inp),$inout2
2644	xorps	@tweak[0],$inout0
2645	movups	16*3($inp),$inout3
2646	lea	16*4($inp),$inp			# $inp+=4*16
2647	xorps	@tweak[1],$inout1
2648	xorps	@tweak[2],$inout2
2649	xorps	@tweak[3],$inout3
2650
2651	call	_aesni_decrypt4
2652
2653	pxor	@tweak[0],$inout0
2654	movdqa	@tweak[4],@tweak[0]
2655	pxor	@tweak[1],$inout1
2656	movdqa	@tweak[5],@tweak[1]
2657	pxor	@tweak[2],$inout2
2658	movdqu	$inout0,($out)			# store 4 output blocks
2659	pxor	@tweak[3],$inout3
2660	movdqu	$inout1,16*1($out)
2661	movdqu	$inout2,16*2($out)
2662	movdqu	$inout3,16*3($out)
2663	lea	16*4($out),$out			# $out+=4*16
2664	jmp	.Lxts_dec_done
2665
2666.align	16
2667.Lxts_dec_done:
2668	and	\$15,$len_			# see if $len%16 is 0
2669	jz	.Lxts_dec_ret
2670.Lxts_dec_done2:
2671	mov	$len_,$len
2672	mov	$key_,$key			# restore $key
2673	mov	$rnds_,$rounds			# restore $rounds
2674
2675	movups	($inp),$inout0
2676	xorps	@tweak[1],$inout0
2677___
# Ciphertext stealing for a trailing partial block.
# The single-block pass emitted here processes the block loaded from ($inp)
# (already xored with @tweak[1] by the preceding code); the result is xored
# with @tweak[1] again and stored at ($out).
# .Lxts_dec_steal then runs $len times (the low 4 bits of $len_): input byte
# 16($inp) replaces the byte at ($out), and the byte previously at ($out) is
# written to 16($out).  %eax and %ecx are the byte temporaries, which is why
# $rounds and $key are reloaded from $rnds_/$key_ after the loop.
2678	&aesni_generate1("dec",$key,$rounds);
2679$code.=<<___;
2680	xorps	@tweak[1],$inout0
2681	movups	$inout0,($out)
2682
2683.Lxts_dec_steal:
2684	movzb	16($inp),%eax			# borrow $rounds ...
2685	movzb	($out),%ecx			# ... and $key
2686	lea	1($inp),$inp
2687	mov	%al,($out)
2688	mov	%cl,16($out)
2689	lea	1($out),$out
2690	sub	\$1,$len
2691	jnz	.Lxts_dec_steal
2692
2693	sub	$len_,$out			# rewind $out
2694	mov	$key_,$key			# restore $key
2695	mov	$rnds_,$rounds			# restore $rounds
2696
2697	movups	($out),$inout0
2698	xorps	@tweak[0],$inout0
2699___
# Second single-block pass: the reassembled block at ($out) is processed
# with @tweak[0] and written back in place.  .Lxts_dec_ret then begins the
# cleanup by zeroing %xmm0-%xmm5; %xmm0 is used as the zero source by the
# stack-clearing code that follows.
2700	&aesni_generate1("dec",$key,$rounds);
2701$code.=<<___;
2702	xorps	@tweak[0],$inout0
2703	movups	$inout0,($out)
2704
2705.Lxts_dec_ret:
2706	xorps	%xmm0,%xmm0			# clear register bank
2707	pxor	%xmm1,%xmm1
2708	pxor	%xmm2,%xmm2
2709	pxor	%xmm3,%xmm3
2710	pxor	%xmm4,%xmm4
2711	pxor	%xmm5,%xmm5
2712___
# Non-Win64 cleanup: zero %xmm6-%xmm15 and overwrite the seven 16-byte
# scratch slots at 0x00..0x60(%rsp) with the zeroed %xmm0.
2713$code.=<<___ if (!$win64);
2714	pxor	%xmm6,%xmm6
2715	pxor	%xmm7,%xmm7
2716	movaps	%xmm0,0x00(%rsp)		# clear stack
2717	pxor	%xmm8,%xmm8
2718	movaps	%xmm0,0x10(%rsp)
2719	pxor	%xmm9,%xmm9
2720	movaps	%xmm0,0x20(%rsp)
2721	pxor	%xmm10,%xmm10
2722	movaps	%xmm0,0x30(%rsp)
2723	pxor	%xmm11,%xmm11
2724	movaps	%xmm0,0x40(%rsp)
2725	pxor	%xmm12,%xmm12
2726	movaps	%xmm0,0x50(%rsp)
2727	pxor	%xmm13,%xmm13
2728	movaps	%xmm0,0x60(%rsp)
2729	pxor	%xmm14,%xmm14
2730	pxor	%xmm15,%xmm15
2731___
# Win64 cleanup: reload %xmm6-%xmm15 from the save area below %r11 (the
# offsets written by the prologue), overwrite each save slot with zero
# right after it is read, then zero the 0x00..0x60(%rsp) scratch slots.
2732$code.=<<___ if ($win64);
2733	movaps	-0xa8(%r11),%xmm6
2734	movaps	%xmm0,-0xa8(%r11)		# clear stack
2735	movaps	-0x98(%r11),%xmm7
2736	movaps	%xmm0,-0x98(%r11)
2737	movaps	-0x88(%r11),%xmm8
2738	movaps	%xmm0,-0x88(%r11)
2739	movaps	-0x78(%r11),%xmm9
2740	movaps	%xmm0,-0x78(%r11)
2741	movaps	-0x68(%r11),%xmm10
2742	movaps	%xmm0,-0x68(%r11)
2743	movaps	-0x58(%r11),%xmm11
2744	movaps	%xmm0,-0x58(%r11)
2745	movaps	-0x48(%r11),%xmm12
2746	movaps	%xmm0,-0x48(%r11)
2747	movaps	-0x38(%r11),%xmm13
2748	movaps	%xmm0,-0x38(%r11)
2749	movaps	-0x28(%r11),%xmm14
2750	movaps	%xmm0,-0x28(%r11)
2751	movaps	-0x18(%r11),%xmm15
2752	movaps	%xmm0,-0x18(%r11)
2753	movaps	%xmm0,0x00(%rsp)
2754	movaps	%xmm0,0x10(%rsp)
2755	movaps	%xmm0,0x20(%rsp)
2756	movaps	%xmm0,0x30(%rsp)
2757	movaps	%xmm0,0x40(%rsp)
2758	movaps	%xmm0,0x50(%rsp)
2759	movaps	%xmm0,0x60(%rsp)
2760___
# Epilogue: %rbp is reloaded from -8(%r11) (where the prologue pushed it)
# and %rsp is reset to the entry value held in %r11 before returning.
2761$code.=<<___;
2762	mov	-8(%r11),%rbp
2763.cfi_restore	%rbp
2764	lea	(%r11),%rsp
2765.cfi_def_cfa_register	%rsp
2766.Lxts_dec_epilogue:
2767	ret
2768.cfi_endproc
2769.size	${PREFIX}_xts_decrypt,.-${PREFIX}_xts_decrypt
2770___
2771}
2772
2773######################################################################
2774# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2775#	const AES_KEY *key, unsigned int start_block_num,
2776#	unsigned char offset_i[16], const unsigned char L_[][16],
2777#	unsigned char checksum[16]);
2778#
2779if (0) {  # Omit these functions in BoringSSL
# Register names used by the OCB code below.
# @offset[0..5] are %xmm10..%xmm15; the checksum accumulator lives in %xmm8
# and %xmm9 carries round-key material ($rndkey0l).
my @offset = map { '%xmm' . $_ } (10 .. 15);
my $checksum = "%xmm8";
my $rndkey0l = "%xmm9";
my $block_num = "%r8";		# 5th argument
my $offset_p = "%r9";		# 6th argument
my $L_p = "%rbx";
my $checksum_p = "%rbp";
my ($i1, $i3, $i5) = map { '%r' . $_ } (12 .. 14);
# Displacement of the 7th argument from the entry stack pointer.
my $seventh_arg = 8;
$seventh_arg = 56 if ($win64);
# The third argument counts blocks here, so alias $len under that name.
my $blocks = $len;
2787
# Emit the entry of ${PREFIX}_ocb_encrypt.  %rax captures the entry %rsp
# (the code right after the prologue uses it to reach the stack-passed 7th
# and 8th arguments); %rbx, %rbp, %r12, %r13 and %r14 are pushed, i.e.
# 5*8 = 0x28 bytes, which is the constant the cleanup code adds back.
2788$code.=<<___;
2789.globl	${PREFIX}_ocb_encrypt
2790.type	${PREFIX}_ocb_encrypt,\@function,6
2791.align	32
2792${PREFIX}_ocb_encrypt:
2793.cfi_startproc
2794	lea	(%rsp),%rax
2795	push	%rbx
2796.cfi_push	%rbx
2797	push	%rbp
2798.cfi_push	%rbp
2799	push	%r12
2800.cfi_push	%r12
2801	push	%r13
2802.cfi_push	%r13
2803	push	%r14
2804.cfi_push	%r14
2805___
# Win64 only: reserve 0xa0 bytes and spill %xmm6-%xmm15 at 0x00..0x90(%rsp).
2806$code.=<<___ if ($win64);
2807	lea	-0xa0(%rsp),%rsp
2808	movaps	%xmm6,0x00(%rsp)		# offload everything
2809	movaps	%xmm7,0x10(%rsp)
2810	movaps	%xmm8,0x20(%rsp)
2811	movaps	%xmm9,0x30(%rsp)
2812	movaps	%xmm10,0x40(%rsp)
2813	movaps	%xmm11,0x50(%rsp)
2814	movaps	%xmm12,0x60(%rsp)
2815	movaps	%xmm13,0x70(%rsp)
2816	movaps	%xmm14,0x80(%rsp)
2817	movaps	%xmm15,0x90(%rsp)
2818.Locb_enc_body:
2819___
# Body of ocb_encrypt (one heredoc):
#  - loads the 7th/8th arguments into $L_p and $checksum_p, puts
#    round[0]^round[last] in $rndkey0l, and loads the running offset
#    (xored with round[last]) into @offset[5];
#  - sets up the "twisted" round counter in %rax (backup in %r10) used by
#    the helper loops, and preloads L_0 into @offset[0] and the checksum;
#  - if the starting block number is even, one block is processed through
#    __ocb_encrypt1 first, so the wide paths start on an odd block number;
#  - .Locb_enc_odd precomputes the L_ table offsets (ntz(block)*16) for the
#    three even-numbered blocks of the next group in $i1/$i3/$i5;
#  - .Locb_enc_grandloop processes 6 blocks per iteration via
#    __ocb_encrypt6;
#  - .Locb_enc_short handles 1..5 leftover blocks.  Unused lanes are zeroed
#    before calling the 4- or 6-block helper, so xoring them into $checksum
#    is a no-op, and the offset belonging to the last real block is copied
#    into @offset[5];
#  - .Locb_enc_done writes the checksum and the last offset back through
#    $checksum_p/$offset_p and starts wiping the xmm registers.
2820$code.=<<___;
2821	mov	$seventh_arg(%rax),$L_p		# 7th argument
2822	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument
2823
2824	mov	240($key),$rnds_
2825	mov	$key,$key_
2826	shl	\$4,$rnds_
2827	$movkey	($key),$rndkey0l		# round[0]
2828	$movkey	16($key,$rnds_),$rndkey1	# round[last]
2829
2830	movdqu	($offset_p),@offset[5]		# load last offset_i
2831	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
2832	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
2833
2834	mov	\$16+32,$rounds
2835	lea	32($key_,$rnds_),$key
2836	$movkey	16($key_),$rndkey1		# round[1]
2837	sub	%r10,%rax			# twisted $rounds
2838	mov	%rax,%r10			# backup twisted $rounds
2839
2840	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
2841	movdqu	($checksum_p),$checksum		# load checksum
2842
2843	test	\$1,$block_num			# is first block number odd?
2844	jnz	.Locb_enc_odd
2845
2846	bsf	$block_num,$i1
2847	add	\$1,$block_num
2848	shl	\$4,$i1
2849	movdqu	($L_p,$i1),$inout5		# borrow
2850	movdqu	($inp),$inout0
2851	lea	16($inp),$inp
2852
2853	call	__ocb_encrypt1
2854
2855	movdqa	$inout5,@offset[5]
2856	movups	$inout0,($out)
2857	lea	16($out),$out
2858	sub	\$1,$blocks
2859	jz	.Locb_enc_done
2860
2861.Locb_enc_odd:
2862	lea	1($block_num),$i1		# even-numbered blocks
2863	lea	3($block_num),$i3
2864	lea	5($block_num),$i5
2865	lea	6($block_num),$block_num
2866	bsf	$i1,$i1				# ntz(block)
2867	bsf	$i3,$i3
2868	bsf	$i5,$i5
2869	shl	\$4,$i1				# ntz(block) -> table offset
2870	shl	\$4,$i3
2871	shl	\$4,$i5
2872
2873	sub	\$6,$blocks
2874	jc	.Locb_enc_short
2875	jmp	.Locb_enc_grandloop
2876
2877.align	32
2878.Locb_enc_grandloop:
2879	movdqu	`16*0`($inp),$inout0		# load input
2880	movdqu	`16*1`($inp),$inout1
2881	movdqu	`16*2`($inp),$inout2
2882	movdqu	`16*3`($inp),$inout3
2883	movdqu	`16*4`($inp),$inout4
2884	movdqu	`16*5`($inp),$inout5
2885	lea	`16*6`($inp),$inp
2886
2887	call	__ocb_encrypt6
2888
2889	movups	$inout0,`16*0`($out)		# store output
2890	movups	$inout1,`16*1`($out)
2891	movups	$inout2,`16*2`($out)
2892	movups	$inout3,`16*3`($out)
2893	movups	$inout4,`16*4`($out)
2894	movups	$inout5,`16*5`($out)
2895	lea	`16*6`($out),$out
2896	sub	\$6,$blocks
2897	jnc	.Locb_enc_grandloop
2898
2899.Locb_enc_short:
2900	add	\$6,$blocks
2901	jz	.Locb_enc_done
2902
2903	movdqu	`16*0`($inp),$inout0
2904	cmp	\$2,$blocks
2905	jb	.Locb_enc_one
2906	movdqu	`16*1`($inp),$inout1
2907	je	.Locb_enc_two
2908
2909	movdqu	`16*2`($inp),$inout2
2910	cmp	\$4,$blocks
2911	jb	.Locb_enc_three
2912	movdqu	`16*3`($inp),$inout3
2913	je	.Locb_enc_four
2914
2915	movdqu	`16*4`($inp),$inout4
2916	pxor	$inout5,$inout5
2917
2918	call	__ocb_encrypt6
2919
2920	movdqa	@offset[4],@offset[5]
2921	movups	$inout0,`16*0`($out)
2922	movups	$inout1,`16*1`($out)
2923	movups	$inout2,`16*2`($out)
2924	movups	$inout3,`16*3`($out)
2925	movups	$inout4,`16*4`($out)
2926
2927	jmp	.Locb_enc_done
2928
2929.align	16
2930.Locb_enc_one:
2931	movdqa	@offset[0],$inout5		# borrow
2932
2933	call	__ocb_encrypt1
2934
2935	movdqa	$inout5,@offset[5]
2936	movups	$inout0,`16*0`($out)
2937	jmp	.Locb_enc_done
2938
2939.align	16
2940.Locb_enc_two:
2941	pxor	$inout2,$inout2
2942	pxor	$inout3,$inout3
2943
2944	call	__ocb_encrypt4
2945
2946	movdqa	@offset[1],@offset[5]
2947	movups	$inout0,`16*0`($out)
2948	movups	$inout1,`16*1`($out)
2949
2950	jmp	.Locb_enc_done
2951
2952.align	16
2953.Locb_enc_three:
2954	pxor	$inout3,$inout3
2955
2956	call	__ocb_encrypt4
2957
2958	movdqa	@offset[2],@offset[5]
2959	movups	$inout0,`16*0`($out)
2960	movups	$inout1,`16*1`($out)
2961	movups	$inout2,`16*2`($out)
2962
2963	jmp	.Locb_enc_done
2964
2965.align	16
2966.Locb_enc_four:
2967	call	__ocb_encrypt4
2968
2969	movdqa	@offset[3],@offset[5]
2970	movups	$inout0,`16*0`($out)
2971	movups	$inout1,`16*1`($out)
2972	movups	$inout2,`16*2`($out)
2973	movups	$inout3,`16*3`($out)
2974
2975.Locb_enc_done:
2976	pxor	$rndkey0,@offset[5]		# "remove" round[last]
2977	movdqu	$checksum,($checksum_p)		# store checksum
2978	movdqu	@offset[5],($offset_p)		# store last offset_i
2979
2980	xorps	%xmm0,%xmm0			# clear register bank
2981	pxor	%xmm1,%xmm1
2982	pxor	%xmm2,%xmm2
2983	pxor	%xmm3,%xmm3
2984	pxor	%xmm4,%xmm4
2985	pxor	%xmm5,%xmm5
2986___
# Non-Win64 cleanup: zero %xmm6-%xmm15, then point %rax just above the five
# registers pushed by the prologue (5*8 = 0x28 bytes) so the epilogue can
# reload them relative to %rax.
2987$code.=<<___ if (!$win64);
2988	pxor	%xmm6,%xmm6
2989	pxor	%xmm7,%xmm7
2990	pxor	%xmm8,%xmm8
2991	pxor	%xmm9,%xmm9
2992	pxor	%xmm10,%xmm10
2993	pxor	%xmm11,%xmm11
2994	pxor	%xmm12,%xmm12
2995	pxor	%xmm13,%xmm13
2996	pxor	%xmm14,%xmm14
2997	pxor	%xmm15,%xmm15
2998	lea	0x28(%rsp),%rax
2999.cfi_def_cfa	%rax,8
3000___
# Win64 cleanup: reload %xmm6-%xmm15 from the 0xa0-byte save area, zeroing
# each slot right after it is read (%xmm0 is zero at this point), then
# point %rax past the save area and the five pushed registers.
3001$code.=<<___ if ($win64);
3002	movaps	0x00(%rsp),%xmm6
3003	movaps	%xmm0,0x00(%rsp)		# clear stack
3004	movaps	0x10(%rsp),%xmm7
3005	movaps	%xmm0,0x10(%rsp)
3006	movaps	0x20(%rsp),%xmm8
3007	movaps	%xmm0,0x20(%rsp)
3008	movaps	0x30(%rsp),%xmm9
3009	movaps	%xmm0,0x30(%rsp)
3010	movaps	0x40(%rsp),%xmm10
3011	movaps	%xmm0,0x40(%rsp)
3012	movaps	0x50(%rsp),%xmm11
3013	movaps	%xmm0,0x50(%rsp)
3014	movaps	0x60(%rsp),%xmm12
3015	movaps	%xmm0,0x60(%rsp)
3016	movaps	0x70(%rsp),%xmm13
3017	movaps	%xmm0,0x70(%rsp)
3018	movaps	0x80(%rsp),%xmm14
3019	movaps	%xmm0,0x80(%rsp)
3020	movaps	0x90(%rsp),%xmm15
3021	movaps	%xmm0,0x90(%rsp)
3022	lea	0xa0+0x28(%rsp),%rax
3023.Locb_enc_pop:
3024___
3025$code.=<<___;
3026	mov	-40(%rax),%r14
3027.cfi_restore	%r14
3028	mov	-32(%rax),%r13
3029.cfi_restore	%r13
3030	mov	-24(%rax),%r12
3031.cfi_restore	%r12
3032	mov	-16(%rax),%rbp
3033.cfi_restore	%rbp
3034	mov	-8(%rax),%rbx
3035.cfi_restore	%rbx
3036	lea	(%rax),%rsp
3037.cfi_def_cfa_register	%rsp
3038.Locb_enc_epilogue:
3039	ret
3040.cfi_endproc
3041.size	${PREFIX}_ocb_encrypt,.-${PREFIX}_ocb_encrypt
3042
3043.type	__ocb_encrypt6,\@abi-omnipotent
3044.align	32
3045__ocb_encrypt6:
3046	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3047	 movdqu		($L_p,$i1),@offset[1]
3048	 movdqa		@offset[0],@offset[2]
3049	 movdqu		($L_p,$i3),@offset[3]
3050	 movdqa		@offset[0],@offset[4]
3051	 pxor		@offset[5],@offset[0]
3052	 movdqu		($L_p,$i5),@offset[5]
3053	 pxor		@offset[0],@offset[1]
3054	pxor		$inout0,$checksum	# accumulate checksum
3055	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3056	 pxor		@offset[1],@offset[2]
3057	pxor		$inout1,$checksum
3058	pxor		@offset[1],$inout1
3059	 pxor		@offset[2],@offset[3]
3060	pxor		$inout2,$checksum
3061	pxor		@offset[2],$inout2
3062	 pxor		@offset[3],@offset[4]
3063	pxor		$inout3,$checksum
3064	pxor		@offset[3],$inout3
3065	 pxor		@offset[4],@offset[5]
3066	pxor		$inout4,$checksum
3067	pxor		@offset[4],$inout4
3068	pxor		$inout5,$checksum
3069	pxor		@offset[5],$inout5
3070	$movkey		32($key_),$rndkey0
3071
3072	lea		1($block_num),$i1	# even-numbered blocks
3073	lea		3($block_num),$i3
3074	lea		5($block_num),$i5
3075	add		\$6,$block_num
3076	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3077	bsf		$i1,$i1			# ntz(block)
3078	bsf		$i3,$i3
3079	bsf		$i5,$i5
3080
3081	aesenc		$rndkey1,$inout0
3082	aesenc		$rndkey1,$inout1
3083	aesenc		$rndkey1,$inout2
3084	aesenc		$rndkey1,$inout3
3085	 pxor		$rndkey0l,@offset[1]
3086	 pxor		$rndkey0l,@offset[2]
3087	aesenc		$rndkey1,$inout4
3088	 pxor		$rndkey0l,@offset[3]
3089	 pxor		$rndkey0l,@offset[4]
3090	aesenc		$rndkey1,$inout5
3091	$movkey		48($key_),$rndkey1
3092	 pxor		$rndkey0l,@offset[5]
3093
3094	aesenc		$rndkey0,$inout0
3095	aesenc		$rndkey0,$inout1
3096	aesenc		$rndkey0,$inout2
3097	aesenc		$rndkey0,$inout3
3098	aesenc		$rndkey0,$inout4
3099	aesenc		$rndkey0,$inout5
3100	$movkey		64($key_),$rndkey0
3101	shl		\$4,$i1			# ntz(block) -> table offset
3102	shl		\$4,$i3
3103	jmp		.Locb_enc_loop6
3104
3105.align	32
3106.Locb_enc_loop6:
3107	aesenc		$rndkey1,$inout0
3108	aesenc		$rndkey1,$inout1
3109	aesenc		$rndkey1,$inout2
3110	aesenc		$rndkey1,$inout3
3111	aesenc		$rndkey1,$inout4
3112	aesenc		$rndkey1,$inout5
3113	$movkey		($key,%rax),$rndkey1
3114	add		\$32,%rax
3115
3116	aesenc		$rndkey0,$inout0
3117	aesenc		$rndkey0,$inout1
3118	aesenc		$rndkey0,$inout2
3119	aesenc		$rndkey0,$inout3
3120	aesenc		$rndkey0,$inout4
3121	aesenc		$rndkey0,$inout5
3122	$movkey		-16($key,%rax),$rndkey0
3123	jnz		.Locb_enc_loop6
3124
3125	aesenc		$rndkey1,$inout0
3126	aesenc		$rndkey1,$inout1
3127	aesenc		$rndkey1,$inout2
3128	aesenc		$rndkey1,$inout3
3129	aesenc		$rndkey1,$inout4
3130	aesenc		$rndkey1,$inout5
3131	$movkey		16($key_),$rndkey1
3132	shl		\$4,$i5
3133
3134	aesenclast	@offset[0],$inout0
3135	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
3136	mov		%r10,%rax		# restore twisted rounds
3137	aesenclast	@offset[1],$inout1
3138	aesenclast	@offset[2],$inout2
3139	aesenclast	@offset[3],$inout3
3140	aesenclast	@offset[4],$inout4
3141	aesenclast	@offset[5],$inout5
3142	ret
3143.size	__ocb_encrypt6,.-__ocb_encrypt6
3144
# __ocb_encrypt4: local routine (no .globl) that OCB-encrypts the four
# blocks held in inout0..inout3.
#  - Four consecutive offsets are chained by XOR starting from offset[5],
#    using offset[0] and the table entries loaded from (L_p,i1) and (L_p,i3).
#  - Each input block is XORed into the checksum before it is masked with
#    its offset.
#  - offset[0]..offset[3] are additionally XORed with rndkey0l so that the
#    closing aesenclast applies the offset together with the last round key
#    (per the inline "round[last]" notes).
# On exit rndkey1 has been reloaded from 16(key_) and %rax from %r10.
3145.type	__ocb_encrypt4,\@abi-omnipotent
3146.align	32
3147__ocb_encrypt4:
3148	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3149	 movdqu		($L_p,$i1),@offset[1]
3150	 movdqa		@offset[0],@offset[2]
3151	 movdqu		($L_p,$i3),@offset[3]
3152	 pxor		@offset[5],@offset[0]
3153	 pxor		@offset[0],@offset[1]
3154	pxor		$inout0,$checksum	# accumulate checksum
3155	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3156	 pxor		@offset[1],@offset[2]
3157	pxor		$inout1,$checksum
3158	pxor		@offset[1],$inout1
3159	 pxor		@offset[2],@offset[3]
3160	pxor		$inout2,$checksum
3161	pxor		@offset[2],$inout2
3162	pxor		$inout3,$checksum
3163	pxor		@offset[3],$inout3
3164	$movkey		32($key_),$rndkey0
3165
3166	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3167	 pxor		$rndkey0l,@offset[1]
3168	 pxor		$rndkey0l,@offset[2]
3169	 pxor		$rndkey0l,@offset[3]
3170
# Two rounds are issued up front with the keys at 16/32(key_) while the
# keys at 48/64(key_) are fetched for the loop.
3171	aesenc		$rndkey1,$inout0
3172	aesenc		$rndkey1,$inout1
3173	aesenc		$rndkey1,$inout2
3174	aesenc		$rndkey1,$inout3
3175	$movkey		48($key_),$rndkey1
3176
3177	aesenc		$rndkey0,$inout0
3178	aesenc		$rndkey0,$inout1
3179	aesenc		$rndkey0,$inout2
3180	aesenc		$rndkey0,$inout3
3181	$movkey		64($key_),$rndkey0
3182	jmp		.Locb_enc_loop4
3183
# Two rounds per iteration; %rax is advanced by 32 each pass and the loop
# ends once that add leaves it at zero (the jnz consumes the add's flags).
3184.align	32
3185.Locb_enc_loop4:
3186	aesenc		$rndkey1,$inout0
3187	aesenc		$rndkey1,$inout1
3188	aesenc		$rndkey1,$inout2
3189	aesenc		$rndkey1,$inout3
3190	$movkey		($key,%rax),$rndkey1
3191	add		\$32,%rax
3192
3193	aesenc		$rndkey0,$inout0
3194	aesenc		$rndkey0,$inout1
3195	aesenc		$rndkey0,$inout2
3196	aesenc		$rndkey0,$inout3
3197	$movkey		-16($key,%rax),$rndkey0
3198	jnz		.Locb_enc_loop4
3199
3200	aesenc		$rndkey1,$inout0
3201	aesenc		$rndkey1,$inout1
3202	aesenc		$rndkey1,$inout2
3203	aesenc		$rndkey1,$inout3
3204	$movkey		16($key_),$rndkey1
3205	mov		%r10,%rax		# restore twisted rounds
3206
# The final round uses the prepared offsets as its round-key operand.
3207	aesenclast	@offset[0],$inout0
3208	aesenclast	@offset[1],$inout1
3209	aesenclast	@offset[2],$inout2
3210	aesenclast	@offset[3],$inout3
3211	ret
3212.size	__ocb_encrypt4,.-__ocb_encrypt4
3213
# __ocb_encrypt1: local routine (no .globl) that OCB-encrypts the single
# block in inout0.
#  - inout5 is used as scratch for the offset: on entry it holds the value
#    to be XORed with the previous offset in offset[5].
#  - The input block is XORed into the checksum before being masked.
#  - inout5 ends up as the operand of aesenclast; per the inline notes it
#    then holds offset_i ^ round[last].
# On exit rndkey1 has been reloaded from 16(key_) and %rax from %r10.
3214.type	__ocb_encrypt1,\@abi-omnipotent
3215.align	32
3216__ocb_encrypt1:
3217	 pxor		@offset[5],$inout5	# offset_i
3218	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
3219	pxor		$inout0,$checksum	# accumulate checksum
3220	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
3221	$movkey		32($key_),$rndkey0
3222
3223	aesenc		$rndkey1,$inout0
3224	$movkey		48($key_),$rndkey1
3225	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
3226
3227	aesenc		$rndkey0,$inout0
3228	$movkey		64($key_),$rndkey0
3229	jmp		.Locb_enc_loop1
3230
# Two rounds per iteration; the loop ends once the add leaves %rax at zero.
3231.align	32
3232.Locb_enc_loop1:
3233	aesenc		$rndkey1,$inout0
3234	$movkey		($key,%rax),$rndkey1
3235	add		\$32,%rax
3236
3237	aesenc		$rndkey0,$inout0
3238	$movkey		-16($key,%rax),$rndkey0
3239	jnz		.Locb_enc_loop1
3240
3241	aesenc		$rndkey1,$inout0
3242	$movkey		16($key_),$rndkey1	# redundant in tail
3243	mov		%r10,%rax		# restore twisted rounds
3244
3245	aesenclast	$inout5,$inout0
3246	ret
3247.size	__ocb_encrypt1,.-__ocb_encrypt1
3248
# OCB decrypt entry point, declared as a 6-argument function; the 7th and
# 8th arguments (L_p and checksum_p) are fetched relative to the entry stack
# pointer kept in %rax.
# Flow:
#  1. save rbx/rbp/r12/r13/r14 (plus xmm6-15 on Win64);
#  2. load round[0], round[last], the previous offset and the checksum;
#  3. when the first block number is even, decrypt one block on its own;
#  4. decrypt six blocks per pass of the grand loop;
#  5. handle the 1..5 leftover blocks;
#  6. store checksum and last offset, wipe xmm registers, restore and return.
# The checksum is accumulated here from the decrypted output after each
# helper call; the decrypt helpers themselves do not touch it.
3249.globl	${PREFIX}_ocb_decrypt
3250.type	${PREFIX}_ocb_decrypt,\@function,6
3251.align	32
3252${PREFIX}_ocb_decrypt:
3253.cfi_startproc
3254	lea	(%rsp),%rax
3255	push	%rbx
3256.cfi_push	%rbx
3257	push	%rbp
3258.cfi_push	%rbp
3259	push	%r12
3260.cfi_push	%r12
3261	push	%r13
3262.cfi_push	%r13
3263	push	%r14
3264.cfi_push	%r14
3265___
# Win64 builds only: reserve 0xa0 bytes of stack and spill xmm6-xmm15 there.
3266$code.=<<___ if ($win64);
3267	lea	-0xa0(%rsp),%rsp
3268	movaps	%xmm6,0x00(%rsp)		# offload everything
3269	movaps	%xmm7,0x10(%rsp)
3270	movaps	%xmm8,0x20(%rsp)
3271	movaps	%xmm9,0x30(%rsp)
3272	movaps	%xmm10,0x40(%rsp)
3273	movaps	%xmm11,0x50(%rsp)
3274	movaps	%xmm12,0x60(%rsp)
3275	movaps	%xmm13,0x70(%rsp)
3276	movaps	%xmm14,0x80(%rsp)
3277	movaps	%xmm15,0x90(%rsp)
3278.Locb_dec_body:
3279___
3280$code.=<<___;
3281	mov	$seventh_arg(%rax),$L_p		# 7th argument
3282	mov	$seventh_arg+8(%rax),$checksum_p# 8th argument
3283
3284	mov	240($key),$rnds_
3285	mov	$key,$key_
3286	shl	\$4,$rnds_
3287	$movkey	($key),$rndkey0l		# round[0]
3288	$movkey	16($key,$rnds_),$rndkey1	# round[last]
3289
3290	movdqu	($offset_p),@offset[5]		# load last offset_i
3291	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
3292	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
3293
3294	mov	\$16+32,$rounds
3295	lea	32($key_,$rnds_),$key
3296	$movkey	16($key_),$rndkey1		# round[1]
3297	sub	%r10,%rax			# twisted $rounds
3298	mov	%rax,%r10			# backup twisted $rounds
3299
3300	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
3301	movdqu	($checksum_p),$checksum		# load checksum
3302
3303	test	\$1,$block_num			# is first block number odd?
3304	jnz	.Locb_dec_odd
3305
# Even first block number: decrypt one block alone, with the table entry
# selected by bsf of the block number, so the code below starts on an odd one.
3306	bsf	$block_num,$i1
3307	add	\$1,$block_num
3308	shl	\$4,$i1
3309	movdqu	($L_p,$i1),$inout5		# borrow
3310	movdqu	($inp),$inout0
3311	lea	16($inp),$inp
3312
3313	call	__ocb_decrypt1
3314
3315	movdqa	$inout5,@offset[5]
3316	movups	$inout0,($out)
3317	xorps	$inout0,$checksum		# accumulate checksum
3318	lea	16($out),$out
3319	sub	\$1,$blocks
3320	jz	.Locb_dec_done
3321
3322.Locb_dec_odd:
3323	lea	1($block_num),$i1		# even-numbered blocks
3324	lea	3($block_num),$i3
3325	lea	5($block_num),$i5
3326	lea	6($block_num),$block_num
3327	bsf	$i1,$i1				# ntz(block)
3328	bsf	$i3,$i3
3329	bsf	$i5,$i5
3330	shl	\$4,$i1				# ntz(block) -> table offset
3331	shl	\$4,$i3
3332	shl	\$4,$i5
3333
# The block count is biased by -6 from here on; a borrow means fewer than
# six blocks are left.
3334	sub	\$6,$blocks
3335	jc	.Locb_dec_short
3336	jmp	.Locb_dec_grandloop
3337
# Main loop: six blocks per pass.
3338.align	32
3339.Locb_dec_grandloop:
3340	movdqu	`16*0`($inp),$inout0		# load input
3341	movdqu	`16*1`($inp),$inout1
3342	movdqu	`16*2`($inp),$inout2
3343	movdqu	`16*3`($inp),$inout3
3344	movdqu	`16*4`($inp),$inout4
3345	movdqu	`16*5`($inp),$inout5
3346	lea	`16*6`($inp),$inp
3347
3348	call	__ocb_decrypt6
3349
3350	movups	$inout0,`16*0`($out)		# store output
3351	pxor	$inout0,$checksum		# accumulate checksum
3352	movups	$inout1,`16*1`($out)
3353	pxor	$inout1,$checksum
3354	movups	$inout2,`16*2`($out)
3355	pxor	$inout2,$checksum
3356	movups	$inout3,`16*3`($out)
3357	pxor	$inout3,$checksum
3358	movups	$inout4,`16*4`($out)
3359	pxor	$inout4,$checksum
3360	movups	$inout5,`16*5`($out)
3361	pxor	$inout5,$checksum
3362	lea	`16*6`($out),$out
3363	sub	\$6,$blocks
3364	jnc	.Locb_dec_grandloop
3365
# Undo the -6 bias; 0..5 blocks remain. The compares against 2 and 4 pick
# the one/two/three/four handlers, and five falls through.
3366.Locb_dec_short:
3367	add	\$6,$blocks
3368	jz	.Locb_dec_done
3369
3370	movdqu	`16*0`($inp),$inout0
3371	cmp	\$2,$blocks
3372	jb	.Locb_dec_one
3373	movdqu	`16*1`($inp),$inout1
3374	je	.Locb_dec_two
3375
3376	movdqu	`16*2`($inp),$inout2
3377	cmp	\$4,$blocks
3378	jb	.Locb_dec_three
3379	movdqu	`16*3`($inp),$inout3
3380	je	.Locb_dec_four
3381
# Five blocks: run the six-lane helper with a zeroed sixth lane, store only
# five results and keep offset[4] as the last offset.
3382	movdqu	`16*4`($inp),$inout4
3383	pxor	$inout5,$inout5
3384
3385	call	__ocb_decrypt6
3386
3387	movdqa	@offset[4],@offset[5]
3388	movups	$inout0,`16*0`($out)		# store output
3389	pxor	$inout0,$checksum		# accumulate checksum
3390	movups	$inout1,`16*1`($out)
3391	pxor	$inout1,$checksum
3392	movups	$inout2,`16*2`($out)
3393	pxor	$inout2,$checksum
3394	movups	$inout3,`16*3`($out)
3395	pxor	$inout3,$checksum
3396	movups	$inout4,`16*4`($out)
3397	pxor	$inout4,$checksum
3398
3399	jmp	.Locb_dec_done
3400
3401.align	16
3402.Locb_dec_one:
3403	movdqa	@offset[0],$inout5		# borrow
3404
3405	call	__ocb_decrypt1
3406
3407	movdqa	$inout5,@offset[5]
3408	movups	$inout0,`16*0`($out)		# store output
3409	xorps	$inout0,$checksum		# accumulate checksum
3410	jmp	.Locb_dec_done
3411
# Two and three blocks reuse the four-lane helper with the unused lanes
# zeroed; the offset of the last real block becomes the last offset.
3412.align	16
3413.Locb_dec_two:
3414	pxor	$inout2,$inout2
3415	pxor	$inout3,$inout3
3416
3417	call	__ocb_decrypt4
3418
3419	movdqa	@offset[1],@offset[5]
3420	movups	$inout0,`16*0`($out)		# store output
3421	xorps	$inout0,$checksum		# accumulate checksum
3422	movups	$inout1,`16*1`($out)
3423	xorps	$inout1,$checksum
3424
3425	jmp	.Locb_dec_done
3426
3427.align	16
3428.Locb_dec_three:
3429	pxor	$inout3,$inout3
3430
3431	call	__ocb_decrypt4
3432
3433	movdqa	@offset[2],@offset[5]
3434	movups	$inout0,`16*0`($out)		# store output
3435	xorps	$inout0,$checksum		# accumulate checksum
3436	movups	$inout1,`16*1`($out)
3437	xorps	$inout1,$checksum
3438	movups	$inout2,`16*2`($out)
3439	xorps	$inout2,$checksum
3440
3441	jmp	.Locb_dec_done
3442
3443.align	16
3444.Locb_dec_four:
3445	call	__ocb_decrypt4
3446
3447	movdqa	@offset[3],@offset[5]
3448	movups	$inout0,`16*0`($out)		# store output
3449	pxor	$inout0,$checksum		# accumulate checksum
3450	movups	$inout1,`16*1`($out)
3451	pxor	$inout1,$checksum
3452	movups	$inout2,`16*2`($out)
3453	pxor	$inout2,$checksum
3454	movups	$inout3,`16*3`($out)
3455	pxor	$inout3,$checksum
3456
# Common exit: write the results back through the caller's pointers, then
# wipe the xmm registers that held key and data material.
3457.Locb_dec_done:
3458	pxor	$rndkey0,@offset[5]		# "remove" round[last]
3459	movdqu	$checksum,($checksum_p)		# store checksum
3460	movdqu	@offset[5],($offset_p)		# store last offset_i
3461
3462	xorps	%xmm0,%xmm0			# clear register bank
3463	pxor	%xmm1,%xmm1
3464	pxor	%xmm2,%xmm2
3465	pxor	%xmm3,%xmm3
3466	pxor	%xmm4,%xmm4
3467	pxor	%xmm5,%xmm5
3468___
# Non-Win64 builds wipe xmm6-15 as well; Win64 builds instead reload them
# from the stack (and overwrite the spill slots with zeroed xmm0) in the
# heredoc that follows. Either way %rax ends up 0x28 bytes (five pushes)
# above the saved general-purpose registers.
3469$code.=<<___ if (!$win64);
3470	pxor	%xmm6,%xmm6
3471	pxor	%xmm7,%xmm7
3472	pxor	%xmm8,%xmm8
3473	pxor	%xmm9,%xmm9
3474	pxor	%xmm10,%xmm10
3475	pxor	%xmm11,%xmm11
3476	pxor	%xmm12,%xmm12
3477	pxor	%xmm13,%xmm13
3478	pxor	%xmm14,%xmm14
3479	pxor	%xmm15,%xmm15
3480	lea	0x28(%rsp),%rax
3481.cfi_def_cfa	%rax,8
3482___
3483$code.=<<___ if ($win64);
3484	movaps	0x00(%rsp),%xmm6
3485	movaps	%xmm0,0x00(%rsp)		# clear stack
3486	movaps	0x10(%rsp),%xmm7
3487	movaps	%xmm0,0x10(%rsp)
3488	movaps	0x20(%rsp),%xmm8
3489	movaps	%xmm0,0x20(%rsp)
3490	movaps	0x30(%rsp),%xmm9
3491	movaps	%xmm0,0x30(%rsp)
3492	movaps	0x40(%rsp),%xmm10
3493	movaps	%xmm0,0x40(%rsp)
3494	movaps	0x50(%rsp),%xmm11
3495	movaps	%xmm0,0x50(%rsp)
3496	movaps	0x60(%rsp),%xmm12
3497	movaps	%xmm0,0x60(%rsp)
3498	movaps	0x70(%rsp),%xmm13
3499	movaps	%xmm0,0x70(%rsp)
3500	movaps	0x80(%rsp),%xmm14
3501	movaps	%xmm0,0x80(%rsp)
3502	movaps	0x90(%rsp),%xmm15
3503	movaps	%xmm0,0x90(%rsp)
3504	lea	0xa0+0x28(%rsp),%rax
3505.Locb_dec_pop:
3506___
3507$code.=<<___;
3508	mov	-40(%rax),%r14
3509.cfi_restore	%r14
3510	mov	-32(%rax),%r13
3511.cfi_restore	%r13
3512	mov	-24(%rax),%r12
3513.cfi_restore	%r12
3514	mov	-16(%rax),%rbp
3515.cfi_restore	%rbp
3516	mov	-8(%rax),%rbx
3517.cfi_restore	%rbx
3518	lea	(%rax),%rsp
3519.cfi_def_cfa_register	%rsp
3520.Locb_dec_epilogue:
3521	ret
3522.cfi_endproc
3523.size	${PREFIX}_ocb_decrypt,.-${PREFIX}_ocb_decrypt
3524
# __ocb_decrypt6: local routine (no .globl) that OCB-decrypts the six blocks
# held in inout0..inout5.
#  - Six consecutive offsets are chained by XOR starting from offset[5],
#    using offset[0] and the table entries at (L_p,i1), (L_p,i3), (L_p,i5).
#  - While the rounds run, i1/i3/i5 and block_num are advanced for the next
#    group of six (bsf of block_num+1/+3/+5, then shifted left by 4).
#  - The checksum is not touched here.
# On exit offset[0] is reloaded from (L_p), rndkey1 from 16(key_) and %rax
# from %r10; offset[1]..offset[5] hold the values fed to aesdeclast.
3525.type	__ocb_decrypt6,\@abi-omnipotent
3526.align	32
3527__ocb_decrypt6:
3528	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3529	 movdqu		($L_p,$i1),@offset[1]
3530	 movdqa		@offset[0],@offset[2]
3531	 movdqu		($L_p,$i3),@offset[3]
3532	 movdqa		@offset[0],@offset[4]
3533	 pxor		@offset[5],@offset[0]
3534	 movdqu		($L_p,$i5),@offset[5]
3535	 pxor		@offset[0],@offset[1]
3536	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3537	 pxor		@offset[1],@offset[2]
3538	pxor		@offset[1],$inout1
3539	 pxor		@offset[2],@offset[3]
3540	pxor		@offset[2],$inout2
3541	 pxor		@offset[3],@offset[4]
3542	pxor		@offset[3],$inout3
3543	 pxor		@offset[4],@offset[5]
3544	pxor		@offset[4],$inout4
3545	pxor		@offset[5],$inout5
3546	$movkey		32($key_),$rndkey0
3547
# Index bookkeeping for the next call is interleaved with the AES rounds.
3548	lea		1($block_num),$i1	# even-numbered blocks
3549	lea		3($block_num),$i3
3550	lea		5($block_num),$i5
3551	add		\$6,$block_num
3552	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3553	bsf		$i1,$i1			# ntz(block)
3554	bsf		$i3,$i3
3555	bsf		$i5,$i5
3556
3557	aesdec		$rndkey1,$inout0
3558	aesdec		$rndkey1,$inout1
3559	aesdec		$rndkey1,$inout2
3560	aesdec		$rndkey1,$inout3
3561	 pxor		$rndkey0l,@offset[1]
3562	 pxor		$rndkey0l,@offset[2]
3563	aesdec		$rndkey1,$inout4
3564	 pxor		$rndkey0l,@offset[3]
3565	 pxor		$rndkey0l,@offset[4]
3566	aesdec		$rndkey1,$inout5
3567	$movkey		48($key_),$rndkey1
3568	 pxor		$rndkey0l,@offset[5]
3569
3570	aesdec		$rndkey0,$inout0
3571	aesdec		$rndkey0,$inout1
3572	aesdec		$rndkey0,$inout2
3573	aesdec		$rndkey0,$inout3
3574	aesdec		$rndkey0,$inout4
3575	aesdec		$rndkey0,$inout5
3576	$movkey		64($key_),$rndkey0
3577	shl		\$4,$i1			# ntz(block) -> table offset
3578	shl		\$4,$i3
3579	jmp		.Locb_dec_loop6
3580
# Two rounds per iteration; the loop ends once the add leaves %rax at zero.
3581.align	32
3582.Locb_dec_loop6:
3583	aesdec		$rndkey1,$inout0
3584	aesdec		$rndkey1,$inout1
3585	aesdec		$rndkey1,$inout2
3586	aesdec		$rndkey1,$inout3
3587	aesdec		$rndkey1,$inout4
3588	aesdec		$rndkey1,$inout5
3589	$movkey		($key,%rax),$rndkey1
3590	add		\$32,%rax
3591
3592	aesdec		$rndkey0,$inout0
3593	aesdec		$rndkey0,$inout1
3594	aesdec		$rndkey0,$inout2
3595	aesdec		$rndkey0,$inout3
3596	aesdec		$rndkey0,$inout4
3597	aesdec		$rndkey0,$inout5
3598	$movkey		-16($key,%rax),$rndkey0
3599	jnz		.Locb_dec_loop6
3600
3601	aesdec		$rndkey1,$inout0
3602	aesdec		$rndkey1,$inout1
3603	aesdec		$rndkey1,$inout2
3604	aesdec		$rndkey1,$inout3
3605	aesdec		$rndkey1,$inout4
3606	aesdec		$rndkey1,$inout5
3607	$movkey		16($key_),$rndkey1
3608	shl		\$4,$i5
3609
# offset[0] is consumed by the first aesdeclast and then immediately
# reloaded for the next call.
3610	aesdeclast	@offset[0],$inout0
3611	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
3612	mov		%r10,%rax		# restore twisted rounds
3613	aesdeclast	@offset[1],$inout1
3614	aesdeclast	@offset[2],$inout2
3615	aesdeclast	@offset[3],$inout3
3616	aesdeclast	@offset[4],$inout4
3617	aesdeclast	@offset[5],$inout5
3618	ret
3619.size	__ocb_decrypt6,.-__ocb_decrypt6
3620
# __ocb_decrypt4: local routine (no .globl) that OCB-decrypts the four
# blocks held in inout0..inout3.
#  - Four consecutive offsets are chained by XOR starting from offset[5],
#    using offset[0] and the table entries at (L_p,i1) and (L_p,i3).
#  - The checksum is not touched here.
#  - i1/i3 and block_num are not advanced, and offset[0] is not reloaded.
# On exit rndkey1 has been reloaded from 16(key_) and %rax from %r10;
# offset[0]..offset[3] hold the values fed to aesdeclast.
3621.type	__ocb_decrypt4,\@abi-omnipotent
3622.align	32
3623__ocb_decrypt4:
3624	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3625	 movdqu		($L_p,$i1),@offset[1]
3626	 movdqa		@offset[0],@offset[2]
3627	 movdqu		($L_p,$i3),@offset[3]
3628	 pxor		@offset[5],@offset[0]
3629	 pxor		@offset[0],@offset[1]
3630	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3631	 pxor		@offset[1],@offset[2]
3632	pxor		@offset[1],$inout1
3633	 pxor		@offset[2],@offset[3]
3634	pxor		@offset[2],$inout2
3635	pxor		@offset[3],$inout3
3636	$movkey		32($key_),$rndkey0
3637
3638	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3639	 pxor		$rndkey0l,@offset[1]
3640	 pxor		$rndkey0l,@offset[2]
3641	 pxor		$rndkey0l,@offset[3]
3642
3643	aesdec		$rndkey1,$inout0
3644	aesdec		$rndkey1,$inout1
3645	aesdec		$rndkey1,$inout2
3646	aesdec		$rndkey1,$inout3
3647	$movkey		48($key_),$rndkey1
3648
3649	aesdec		$rndkey0,$inout0
3650	aesdec		$rndkey0,$inout1
3651	aesdec		$rndkey0,$inout2
3652	aesdec		$rndkey0,$inout3
3653	$movkey		64($key_),$rndkey0
3654	jmp		.Locb_dec_loop4
3655
# Two rounds per iteration; the loop ends once the add leaves %rax at zero.
3656.align	32
3657.Locb_dec_loop4:
3658	aesdec		$rndkey1,$inout0
3659	aesdec		$rndkey1,$inout1
3660	aesdec		$rndkey1,$inout2
3661	aesdec		$rndkey1,$inout3
3662	$movkey		($key,%rax),$rndkey1
3663	add		\$32,%rax
3664
3665	aesdec		$rndkey0,$inout0
3666	aesdec		$rndkey0,$inout1
3667	aesdec		$rndkey0,$inout2
3668	aesdec		$rndkey0,$inout3
3669	$movkey		-16($key,%rax),$rndkey0
3670	jnz		.Locb_dec_loop4
3671
3672	aesdec		$rndkey1,$inout0
3673	aesdec		$rndkey1,$inout1
3674	aesdec		$rndkey1,$inout2
3675	aesdec		$rndkey1,$inout3
3676	$movkey		16($key_),$rndkey1
3677	mov		%r10,%rax		# restore twisted rounds
3678
3679	aesdeclast	@offset[0],$inout0
3680	aesdeclast	@offset[1],$inout1
3681	aesdeclast	@offset[2],$inout2
3682	aesdeclast	@offset[3],$inout3
3683	ret
3684.size	__ocb_decrypt4,.-__ocb_decrypt4
3685
# __ocb_decrypt1: local routine (no .globl) that OCB-decrypts the single
# block in inout0.
#  - inout5 is used as scratch for the offset: on entry it holds the value
#    to be XORed with the previous offset in offset[5].
#  - inout5 ends up as the operand of aesdeclast; per the inline notes it
#    then holds offset_i ^ round[last].
#  - The checksum is not touched here.
# On exit rndkey1 has been reloaded from 16(key_) and %rax from %r10.
3686.type	__ocb_decrypt1,\@abi-omnipotent
3687.align	32
3688__ocb_decrypt1:
3689	 pxor		@offset[5],$inout5	# offset_i
3690	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
3691	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
3692	$movkey		32($key_),$rndkey0
3693
3694	aesdec		$rndkey1,$inout0
3695	$movkey		48($key_),$rndkey1
3696	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
3697
3698	aesdec		$rndkey0,$inout0
3699	$movkey		64($key_),$rndkey0
3700	jmp		.Locb_dec_loop1
3701
# Two rounds per iteration; the loop ends once the add leaves %rax at zero.
3702.align	32
3703.Locb_dec_loop1:
3704	aesdec		$rndkey1,$inout0
3705	$movkey		($key,%rax),$rndkey1
3706	add		\$32,%rax
3707
3708	aesdec		$rndkey0,$inout0
3709	$movkey		-16($key,%rax),$rndkey0
3710	jnz		.Locb_dec_loop1
3711
3712	aesdec		$rndkey1,$inout0
3713	$movkey		16($key_),$rndkey1	# redundant in tail
3714	mov		%r10,%rax		# restore twisted rounds
3715
3716	aesdeclast	$inout5,$inout0
3717	ret
3718.size	__ocb_decrypt1,.-__ocb_decrypt1
3719___
3720} }}
3721
3722########################################################################
3723# void $PREFIX_cbc_encrypt (const void *inp, void *out,
3724#			    size_t length, const AES_KEY *key,
3725#			    unsigned char *ivp,const int enc);
3726{
# Generator scope for ${PREFIX}_cbc_encrypt (CBC encrypt and decrypt).
#
# $frame_size: 0x10 bytes of scratch at (%rsp), used by the partial-tail copy
# in the decrypt path, plus 0xa0 bytes on Win64 where xmm6-xmm15 are saved at
# 0x10..0xa0(%rsp).
# $iv,$in0..$in4: %xmm10..%xmm15; in the decrypt path they hold the chaining
# value and copies of the ciphertext blocks needed for the CBC XOR.
3727my $frame_size = 0x10 + ($win64?0xa0:0);	# used in decrypt
3728my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
3729
# Entry and the encrypt path. Encryption handles one block per iteration;
# the XOR with the input block is commented out below, presumably because
# aesni_generate1 folds it in when handed $inout1 as its extra argument
# (NOTE(review): confirm against aesni_generate1).
3730$code.=<<___;
3731.globl	${PREFIX}_cbc_encrypt
3732.type	${PREFIX}_cbc_encrypt,\@function,6
3733.align	16
3734${PREFIX}_cbc_encrypt:
3735.cfi_startproc
3736	test	$len,$len		# check length
3737	jz	.Lcbc_ret
3738
3739	mov	240($key),$rnds_	# key->rounds
3740	mov	$key,$key_		# backup $key
3741	test	%r9d,%r9d		# 6th argument
3742	jz	.Lcbc_decrypt
3743#--------------------------- CBC ENCRYPT ------------------------------#
3744	movups	($ivp),$inout0		# load iv as initial state
3745	mov	$rnds_,$rounds
3746	cmp	\$16,$len
3747	jb	.Lcbc_enc_tail
3748	sub	\$16,$len
3749	jmp	.Lcbc_enc_loop
3750.align	16
3751.Lcbc_enc_loop:
3752	movups	($inp),$inout1		# load input
3753	lea	16($inp),$inp
3754	#xorps	$inout1,$inout0
3755___
3756	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
# After each block: restore the registers the round code changed, store the
# ciphertext and loop while whole blocks remain. A 1..15 byte remainder goes
# to .Lcbc_enc_tail, which copies it to the output buffer, zero-pads it to 16
# bytes and runs the loop once more over that padded block in place.
3757$code.=<<___;
3758	mov	$rnds_,$rounds		# restore $rounds
3759	mov	$key_,$key		# restore $key
3760	movups	$inout0,0($out)		# store output
3761	lea	16($out),$out
3762	sub	\$16,$len
3763	jnc	.Lcbc_enc_loop
3764	add	\$16,$len
3765	jnz	.Lcbc_enc_tail
3766	 pxor	$rndkey0,$rndkey0	# clear register bank
3767	 pxor	$rndkey1,$rndkey1
3768	movups	$inout0,($ivp)
3769	 pxor	$inout0,$inout0
3770	 pxor	$inout1,$inout1
3771	jmp	.Lcbc_ret
3772
3773.Lcbc_enc_tail:
3774	mov	$len,%rcx	# zaps $key
3775	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
3776	.long	0x9066A4F3	# rep movsb
3777	mov	\$16,%ecx	# zero tail
3778	sub	$len,%rcx
3779	xor	%eax,%eax
3780	.long	0x9066AAF3	# rep stosb
3781	lea	-16(%rdi),%rdi	# rewind $out by 1 block
3782	mov	$rnds_,$rounds	# restore $rounds
3783	mov	%rdi,%rsi	# $inp and $out are the same
3784	mov	$key_,$key	# restore $key
3785	xor	$len,$len	# len=16
3786	jmp	.Lcbc_enc_loop	# one more spin
3787#--------------------------- CBC DECRYPT ------------------------------#
3788.align	16
3789.Lcbc_decrypt:
3790	cmp	\$16,$len
3791	jne	.Lcbc_decrypt_bulk
3792
3793	# handle single block without allocating stack frame,
3794	# useful in ciphertext stealing mode
3795	movdqu	($inp),$inout0		# load input
3796	movdqu	($ivp),$inout1		# load iv
3797	movdqa	$inout0,$inout2		# future iv
3798___
3799	&aesni_generate1("dec",$key,$rnds_);
# Finish the single-block case, then open the bulk decrypt path: %r11 keeps
# the entry stack pointer as frame pointer, %rbp is saved, and the frame is
# carved out and aligned down to 16 bytes.
3800$code.=<<___;
3801	 pxor	$rndkey0,$rndkey0	# clear register bank
3802	 pxor	$rndkey1,$rndkey1
3803	movdqu	$inout2,($ivp)		# store iv
3804	xorps	$inout1,$inout0		# ^=iv
3805	 pxor	$inout1,$inout1
3806	movups	$inout0,($out)		# store output
3807	 pxor	$inout0,$inout0
3808	jmp	.Lcbc_ret
3809.align	16
3810.Lcbc_decrypt_bulk:
3811	lea	(%rsp),%r11		# frame pointer
3812.cfi_def_cfa_register	%r11
3813	push	%rbp
3814.cfi_push	%rbp
3815	sub	\$$frame_size,%rsp
3816	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
3817___
3818$code.=<<___ if ($win64);
3819	movaps	%xmm6,0x10(%rsp)
3820	movaps	%xmm7,0x20(%rsp)
3821	movaps	%xmm8,0x30(%rsp)
3822	movaps	%xmm9,0x40(%rsp)
3823	movaps	%xmm10,0x50(%rsp)
3824	movaps	%xmm11,0x60(%rsp)
3825	movaps	%xmm12,0x70(%rsp)
3826	movaps	%xmm13,0x80(%rsp)
3827	movaps	%xmm14,0x90(%rsp)
3828	movaps	%xmm15,0xa0(%rsp)
3829.Lcbc_decrypt_body:
3830___
3831
# From here on $key_ names %rbp, and $inp_ is a second name for the same
# register; the two uses do not overlap in the code below ($inp_ only inside
# the eight-block loop, $key_ only in the six-block loop).
3832my $inp_=$key_="%rbp";			# reassign $key_
3833
# Bulk decrypt dispatch. At most 0x50 bytes go straight to the tail code.
# Otherwise six blocks are preloaded (with copies in $in0..$in4) and the
# capability word at OPENSSL_ia32cap_P+4 selects the six-block loop or the
# eight-block loop; at most 0x70 bytes are handled by the six-or-seven code.
3834$code.=<<___;
3835	mov	$key,$key_		# [re-]backup $key [after reassignment]
3836	movups	($ivp),$iv
3837	mov	$rnds_,$rounds
3838	cmp	\$0x50,$len
3839	jbe	.Lcbc_dec_tail
3840
3841	$movkey	($key),$rndkey0
3842	movdqu	0x00($inp),$inout0	# load input
3843	movdqu	0x10($inp),$inout1
3844	movdqa	$inout0,$in0
3845	movdqu	0x20($inp),$inout2
3846	movdqa	$inout1,$in1
3847	movdqu	0x30($inp),$inout3
3848	movdqa	$inout2,$in2
3849	movdqu	0x40($inp),$inout4
3850	movdqa	$inout3,$in3
3851	movdqu	0x50($inp),$inout5
3852	movdqa	$inout4,$in4
3853	leaq	OPENSSL_ia32cap_P(%rip),%r9
3854	mov	4(%r9),%r9d
3855	cmp	\$0x70,$len
3856	jbe	.Lcbc_dec_six_or_seven
3857
3858	and	\$`1<<26|1<<22`,%r9d	# isolate XSAVE+MOVBE
3859	sub	\$0x50,$len		# $len is biased by -5*16
3860	cmp	\$`1<<22`,%r9d		# check for MOVBE without XSAVE
3861	je	.Lcbc_dec_loop6_enter	# [which denotes Atom Silvermont]
3862	sub	\$0x20,$len		# $len is biased by -7*16
3863	lea	0x70($key),$key		# size optimization
3864	jmp	.Lcbc_dec_loop8_enter
3865.align	16
3866.Lcbc_dec_loop8:
3867	movups	$inout7,($out)
3868	lea	0x10($out),$out
3869.Lcbc_dec_loop8_enter:
# The look-ahead pointer is computed without a branch: mov -1 / adc 0 /
# and 128 / add inp yields inp+128 when the cmp below leaves carry clear and
# plain inp when it sets carry. Presumably this keeps the look-ahead loads
# in .Lcbc_dec_done inside the input buffer on the last pass.
3870	movdqu		0x60($inp),$inout6
3871	pxor		$rndkey0,$inout0
3872	movdqu		0x70($inp),$inout7
3873	pxor		$rndkey0,$inout1
3874	$movkey		0x10-0x70($key),$rndkey1
3875	pxor		$rndkey0,$inout2
3876	mov		\$-1,$inp_
3877	cmp		\$0x70,$len	# is there at least 0x60 bytes ahead?
3878	pxor		$rndkey0,$inout3
3879	pxor		$rndkey0,$inout4
3880	pxor		$rndkey0,$inout5
3881	pxor		$rndkey0,$inout6
3882
3883	aesdec		$rndkey1,$inout0
3884	pxor		$rndkey0,$inout7
3885	$movkey		0x20-0x70($key),$rndkey0
3886	aesdec		$rndkey1,$inout1
3887	aesdec		$rndkey1,$inout2
3888	aesdec		$rndkey1,$inout3
3889	aesdec		$rndkey1,$inout4
3890	aesdec		$rndkey1,$inout5
3891	aesdec		$rndkey1,$inout6
3892	adc		\$0,$inp_
3893	and		\$128,$inp_
3894	aesdec		$rndkey1,$inout7
3895	add		$inp,$inp_
3896	$movkey		0x30-0x70($key),$rndkey1
3897___
# Unrolled middle rounds of the eight-lane loop. Iteration $i applies one
# round to all eight lanes with alternating key registers and then loads the
# key at offset 0x30+0x10*$i into the register just used. $rounds is compared
# with 11 at $i==7; the jb/je/jmp emitted at $i==7/9/11 leave the sequence
# for .Lcbc_dec_done after a different number of rounds (this file's key
# setup stores 9 for 128-bit keys; 192/256-bit presumably store 11/13).
# The nop lines are padding emitted for selected iterations.
3898for($i=1;$i<12;$i++) {
3899my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
3900$code.=<<___	if ($i==7);
3901	cmp		\$11,$rounds
3902___
3903$code.=<<___;
3904	aesdec		$rndkeyx,$inout0
3905	aesdec		$rndkeyx,$inout1
3906	aesdec		$rndkeyx,$inout2
3907	aesdec		$rndkeyx,$inout3
3908	aesdec		$rndkeyx,$inout4
3909	aesdec		$rndkeyx,$inout5
3910	aesdec		$rndkeyx,$inout6
3911	aesdec		$rndkeyx,$inout7
3912	$movkey		`0x30+0x10*$i`-0x70($key),$rndkeyx
3913___
3914$code.=<<___	if ($i<6 || (!($i&1) && $i>7));
3915	nop
3916___
3917$code.=<<___	if ($i==7);
3918	jb		.Lcbc_dec_done
3919___
3920$code.=<<___	if ($i==9);
3921	je		.Lcbc_dec_done
3922___
3923$code.=<<___	if ($i==11);
3924	jmp		.Lcbc_dec_done
3925___
3926}
# Everything from the end of the eight-block loop to the single-block tail
# case lives in the heredoc below.
3927$code.=<<___;
3928.align	16
3929.Lcbc_dec_done:
# One more aesdec with rndkey1, then aesdeclast. The key in rndkey0 is first
# XORed into the chaining values (iv and the saved ciphertext blocks), so
# each aesdeclast applies that key and the CBC XOR in one step.
3930	aesdec		$rndkey1,$inout0
3931	aesdec		$rndkey1,$inout1
3932	pxor		$rndkey0,$iv
3933	pxor		$rndkey0,$in0
3934	aesdec		$rndkey1,$inout2
3935	aesdec		$rndkey1,$inout3
3936	pxor		$rndkey0,$in1
3937	pxor		$rndkey0,$in2
3938	aesdec		$rndkey1,$inout4
3939	aesdec		$rndkey1,$inout5
3940	pxor		$rndkey0,$in3
3941	pxor		$rndkey0,$in4
3942	aesdec		$rndkey1,$inout6
3943	aesdec		$rndkey1,$inout7
3944	movdqu		0x50($inp),$rndkey1
3945
3946	aesdeclast	$iv,$inout0
3947	movdqu		0x60($inp),$iv		# borrow $iv
3948	pxor		$rndkey0,$rndkey1
3949	aesdeclast	$in0,$inout1
3950	pxor		$rndkey0,$iv
3951	movdqu		0x70($inp),$rndkey0	# next IV
3952	aesdeclast	$in1,$inout2
3953	lea		0x80($inp),$inp
3954	movdqu		0x00($inp_),$in0
3955	aesdeclast	$in2,$inout3
3956	aesdeclast	$in3,$inout4
3957	movdqu		0x10($inp_),$in1
3958	movdqu		0x20($inp_),$in2
3959	aesdeclast	$in4,$inout5
3960	aesdeclast	$rndkey1,$inout6
3961	movdqu		0x30($inp_),$in3
3962	movdqu		0x40($inp_),$in4
3963	aesdeclast	$iv,$inout7
3964	movdqa		$rndkey0,$iv		# return $iv
3965	movdqu		0x50($inp_),$rndkey1
3966	$movkey		-0x70($key),$rndkey0
3967
# Seven results are stored here while the lanes are refilled from the
# look-ahead copies; the eighth is stored at the top of the next pass or
# just after the loop.
3968	movups		$inout0,($out)		# store output
3969	movdqa		$in0,$inout0
3970	movups		$inout1,0x10($out)
3971	movdqa		$in1,$inout1
3972	movups		$inout2,0x20($out)
3973	movdqa		$in2,$inout2
3974	movups		$inout3,0x30($out)
3975	movdqa		$in3,$inout3
3976	movups		$inout4,0x40($out)
3977	movdqa		$in4,$inout4
3978	movups		$inout5,0x50($out)
3979	movdqa		$rndkey1,$inout5
3980	movups		$inout6,0x60($out)
3981	lea		0x70($out),$out
3982
3983	sub	\$0x80,$len
3984	ja	.Lcbc_dec_loop8
3985
# Loop exit: undo the pointer and length bias, then route the remainder to
# the collected-tail, short-tail or six-or-seven code.
3986	movaps	$inout7,$inout0
3987	lea	-0x70($key),$key
3988	add	\$0x70,$len
3989	jle	.Lcbc_dec_clear_tail_collected
3990	movups	$inout7,($out)
3991	lea	0x10($out),$out
3992	cmp	\$0x50,$len
3993	jbe	.Lcbc_dec_tail
3994
3995	movaps	$in0,$inout0
3996.Lcbc_dec_six_or_seven:
3997	cmp	\$0x60,$len
3998	ja	.Lcbc_dec_seven
3999
# Six blocks: the sixth ciphertext block is kept in inout6 as the next IV.
4000	movaps	$inout5,$inout6
4001	call	_aesni_decrypt6
4002	pxor	$iv,$inout0		# ^= IV
4003	movaps	$inout6,$iv
4004	pxor	$in0,$inout1
4005	movdqu	$inout0,($out)
4006	pxor	$in1,$inout2
4007	movdqu	$inout1,0x10($out)
4008	 pxor	$inout1,$inout1		# clear register bank
4009	pxor	$in2,$inout3
4010	movdqu	$inout2,0x20($out)
4011	 pxor	$inout2,$inout2
4012	pxor	$in3,$inout4
4013	movdqu	$inout3,0x30($out)
4014	 pxor	$inout3,$inout3
4015	pxor	$in4,$inout5
4016	movdqu	$inout4,0x40($out)
4017	 pxor	$inout4,$inout4
4018	lea	0x50($out),$out
4019	movdqa	$inout5,$inout0
4020	 pxor	$inout5,$inout5
4021	jmp	.Lcbc_dec_tail_collected
4022
# Seven blocks: run the eight-lane routine with a zeroed eighth lane; the
# sixth and seventh ciphertext blocks are re-read from the input afterwards.
4023.align	16
4024.Lcbc_dec_seven:
4025	movups	0x60($inp),$inout6
4026	xorps	$inout7,$inout7
4027	call	_aesni_decrypt8
4028	movups	0x50($inp),$inout7
4029	pxor	$iv,$inout0		# ^= IV
4030	movups	0x60($inp),$iv
4031	pxor	$in0,$inout1
4032	movdqu	$inout0,($out)
4033	pxor	$in1,$inout2
4034	movdqu	$inout1,0x10($out)
4035	 pxor	$inout1,$inout1		# clear register bank
4036	pxor	$in2,$inout3
4037	movdqu	$inout2,0x20($out)
4038	 pxor	$inout2,$inout2
4039	pxor	$in3,$inout4
4040	movdqu	$inout3,0x30($out)
4041	 pxor	$inout3,$inout3
4042	pxor	$in4,$inout5
4043	movdqu	$inout4,0x40($out)
4044	 pxor	$inout4,$inout4
4045	pxor	$inout7,$inout6
4046	movdqu	$inout5,0x50($out)
4047	 pxor	$inout5,$inout5
4048	lea	0x60($out),$out
4049	movdqa	$inout6,$inout0
4050	 pxor	$inout6,$inout6
4051	 pxor	$inout7,$inout7
4052	jmp	.Lcbc_dec_tail_collected
4053
# Six-block loop (alternative to the eight-block loop, chosen by the
# capability check above). The sixth result of each pass is stored at the
# top of the next pass or just after the loop.
4054.align	16
4055.Lcbc_dec_loop6:
4056	movups	$inout5,($out)
4057	lea	0x10($out),$out
4058	movdqu	0x00($inp),$inout0	# load input
4059	movdqu	0x10($inp),$inout1
4060	movdqa	$inout0,$in0
4061	movdqu	0x20($inp),$inout2
4062	movdqa	$inout1,$in1
4063	movdqu	0x30($inp),$inout3
4064	movdqa	$inout2,$in2
4065	movdqu	0x40($inp),$inout4
4066	movdqa	$inout3,$in3
4067	movdqu	0x50($inp),$inout5
4068	movdqa	$inout4,$in4
4069.Lcbc_dec_loop6_enter:
4070	lea	0x60($inp),$inp
4071	movdqa	$inout5,$inout6
4072
4073	call	_aesni_decrypt6
4074
4075	pxor	$iv,$inout0		# ^= IV
4076	movdqa	$inout6,$iv
4077	pxor	$in0,$inout1
4078	movdqu	$inout0,($out)
4079	pxor	$in1,$inout2
4080	movdqu	$inout1,0x10($out)
4081	pxor	$in2,$inout3
4082	movdqu	$inout2,0x20($out)
4083	pxor	$in3,$inout4
4084	mov	$key_,$key
4085	movdqu	$inout3,0x30($out)
4086	pxor	$in4,$inout5
4087	mov	$rnds_,$rounds
4088	movdqu	$inout4,0x40($out)
4089	lea	0x50($out),$out
4090	sub	\$0x60,$len
4091	ja	.Lcbc_dec_loop6
4092
4093	movdqa	$inout5,$inout0
4094	add	\$0x50,$len
4095	jle	.Lcbc_dec_clear_tail_collected
4096	movups	$inout5,($out)
4097	lea	0x10($out),$out
4098
# Short tail: successive subtractions of 0x10 select the one/two/three/four
# block handlers; anything longer is treated as five blocks and falls through
# to the six-lane routine with a zeroed sixth lane.
4099.Lcbc_dec_tail:
4100	movups	($inp),$inout0
4101	sub	\$0x10,$len
4102	jbe	.Lcbc_dec_one		# $len is 1*16 or less
4103
4104	movups	0x10($inp),$inout1
4105	movaps	$inout0,$in0
4106	sub	\$0x10,$len
4107	jbe	.Lcbc_dec_two		# $len is 2*16 or less
4108
4109	movups	0x20($inp),$inout2
4110	movaps	$inout1,$in1
4111	sub	\$0x10,$len
4112	jbe	.Lcbc_dec_three		# $len is 3*16 or less
4113
4114	movups	0x30($inp),$inout3
4115	movaps	$inout2,$in2
4116	sub	\$0x10,$len
4117	jbe	.Lcbc_dec_four		# $len is 4*16 or less
4118
4119	movups	0x40($inp),$inout4	# $len is 5*16 or less
4120	movaps	$inout3,$in3
4121	movaps	$inout4,$in4
4122	xorps	$inout5,$inout5
4123	call	_aesni_decrypt6
4124	pxor	$iv,$inout0
4125	movaps	$in4,$iv
4126	pxor	$in0,$inout1
4127	movdqu	$inout0,($out)
4128	pxor	$in1,$inout2
4129	movdqu	$inout1,0x10($out)
4130	 pxor	$inout1,$inout1		# clear register bank
4131	pxor	$in2,$inout3
4132	movdqu	$inout2,0x20($out)
4133	 pxor	$inout2,$inout2
4134	pxor	$in3,$inout4
4135	movdqu	$inout3,0x30($out)
4136	 pxor	$inout3,$inout3
4137	lea	0x40($out),$out
4138	movdqa	$inout4,$inout0
4139	 pxor	$inout4,$inout4
4140	 pxor	$inout5,$inout5
4141	sub	\$0x10,$len
4142	jmp	.Lcbc_dec_tail_collected
4143
4144.align	16
4145.Lcbc_dec_one:
4146	movaps	$inout0,$in0
4147___
4148	&aesni_generate1("dec",$key,$rounds);
# Each short-tail handler below stores all but its last plaintext block and
# leaves that last block in $inout0 and the next IV in $iv for
# .Lcbc_dec_tail_collected.
4149$code.=<<___;
4150	xorps	$iv,$inout0
4151	movaps	$in0,$iv
4152	jmp	.Lcbc_dec_tail_collected
4153.align	16
4154.Lcbc_dec_two:
4155	movaps	$inout1,$in1
4156	call	_aesni_decrypt2
4157	pxor	$iv,$inout0
4158	movaps	$in1,$iv
4159	pxor	$in0,$inout1
4160	movdqu	$inout0,($out)
4161	movdqa	$inout1,$inout0
4162	 pxor	$inout1,$inout1		# clear register bank
4163	lea	0x10($out),$out
4164	jmp	.Lcbc_dec_tail_collected
4165.align	16
4166.Lcbc_dec_three:
4167	movaps	$inout2,$in2
4168	call	_aesni_decrypt3
4169	pxor	$iv,$inout0
4170	movaps	$in2,$iv
4171	pxor	$in0,$inout1
4172	movdqu	$inout0,($out)
4173	pxor	$in1,$inout2
4174	movdqu	$inout1,0x10($out)
4175	 pxor	$inout1,$inout1		# clear register bank
4176	movdqa	$inout2,$inout0
4177	 pxor	$inout2,$inout2
4178	lea	0x20($out),$out
4179	jmp	.Lcbc_dec_tail_collected
4180.align	16
4181.Lcbc_dec_four:
4182	movaps	$inout3,$in3
4183	call	_aesni_decrypt4
4184	pxor	$iv,$inout0
4185	movaps	$in3,$iv
4186	pxor	$in0,$inout1
4187	movdqu	$inout0,($out)
4188	pxor	$in1,$inout2
4189	movdqu	$inout1,0x10($out)
4190	 pxor	$inout1,$inout1		# clear register bank
4191	pxor	$in2,$inout3
4192	movdqu	$inout2,0x20($out)
4193	 pxor	$inout2,$inout2
4194	movdqa	$inout3,$inout0
4195	 pxor	$inout3,$inout3
4196	lea	0x30($out),$out
4197	jmp	.Lcbc_dec_tail_collected
4198
4199.align	16
4200.Lcbc_dec_clear_tail_collected:
4201	pxor	$inout1,$inout1		# clear register bank
4202	pxor	$inout2,$inout2
4203	pxor	$inout3,$inout3
4204___
# xmm6..9 are only wiped on non-Win64 builds; Win64 builds reload them from
# the stack frame further down.
4205$code.=<<___ if (!$win64);
4206	pxor	$inout4,$inout4		# %xmm6..9
4207	pxor	$inout5,$inout5
4208	pxor	$inout6,$inout6
4209	pxor	$inout7,$inout7
4210___
# Common tail: write back the IV, then store the last plaintext block either
# whole or, when the low four bits of $len are non-zero, byte-wise via the
# 16-byte scratch slot at (%rsp), which is overwritten with zeros afterwards.
4211$code.=<<___;
4212.Lcbc_dec_tail_collected:
4213	movups	$iv,($ivp)
4214	and	\$15,$len
4215	jnz	.Lcbc_dec_tail_partial
4216	movups	$inout0,($out)
4217	pxor	$inout0,$inout0
4218	jmp	.Lcbc_dec_ret
4219.align	16
4220.Lcbc_dec_tail_partial:
4221	movaps	$inout0,(%rsp)
4222	pxor	$inout0,$inout0
4223	mov	\$16,%rcx
4224	mov	$out,%rdi
4225	sub	$len,%rcx
4226	lea	(%rsp),%rsi
4227	.long	0x9066A4F3		# rep movsb
4228	movdqa	$inout0,(%rsp)
4229
4230.Lcbc_dec_ret:
4231	xorps	$rndkey0,$rndkey0	# %xmm0
4232	pxor	$rndkey1,$rndkey1
4233___
4234$code.=<<___ if ($win64);
4235	movaps	0x10(%rsp),%xmm6
4236	movaps	%xmm0,0x10(%rsp)	# clear stack
4237	movaps	0x20(%rsp),%xmm7
4238	movaps	%xmm0,0x20(%rsp)
4239	movaps	0x30(%rsp),%xmm8
4240	movaps	%xmm0,0x30(%rsp)
4241	movaps	0x40(%rsp),%xmm9
4242	movaps	%xmm0,0x40(%rsp)
4243	movaps	0x50(%rsp),%xmm10
4244	movaps	%xmm0,0x50(%rsp)
4245	movaps	0x60(%rsp),%xmm11
4246	movaps	%xmm0,0x60(%rsp)
4247	movaps	0x70(%rsp),%xmm12
4248	movaps	%xmm0,0x70(%rsp)
4249	movaps	0x80(%rsp),%xmm13
4250	movaps	%xmm0,0x80(%rsp)
4251	movaps	0x90(%rsp),%xmm14
4252	movaps	%xmm0,0x90(%rsp)
4253	movaps	0xa0(%rsp),%xmm15
4254	movaps	%xmm0,0xa0(%rsp)
4255___
# Epilogue: %rbp is reloaded from just below the frame pointer in %r11 and
# the entry stack pointer is restored. .Lcbc_ret is also the target of the
# paths that never built a frame.
4256$code.=<<___;
4257	mov	-8(%r11),%rbp
4258.cfi_restore	%rbp
4259	lea	(%r11),%rsp
4260.cfi_def_cfa_register	%rsp
4261.Lcbc_ret:
4262	ret
4263.cfi_endproc
4264.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4265___
4266}
4267# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
4268#				int bits, AES_KEY *key)
4269#
4270# input:	$inp	user-supplied key
4271#		$bits	$inp length in bits
4272#		$key	pointer to key schedule
4273# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
4274#		*$key	key schedule
4275#
4276{ my ($inp,$bits,$key) = @_4args;
4277  $bits =~ s/%r/%e/;
4278
# Emits ${PREFIX}_set_decrypt_key. The routine reserves 8 bytes of stack,
# calls __aesni_set_encrypt_key and, only when that left 0 in %eax, converts
# the schedule in place:
#   - the first and last round keys are swapped unchanged;
#   - the remaining keys are swapped pairwise from both ends, each passed
#     through aesimc;
#   - the key left in the middle is passed through aesimc where it is.
# %xmm0 and %xmm1 are zeroed before returning on the success path; %eax is
# left as produced by __aesni_set_encrypt_key.
4279$code.=<<___;
4280.globl	${PREFIX}_set_decrypt_key
4281.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
4282.align	16
4283${PREFIX}_set_decrypt_key:
4284.cfi_startproc
4285	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
4286.cfi_adjust_cfa_offset	8
4287	call	__aesni_set_encrypt_key
4288	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
4289	test	%eax,%eax
4290	jnz	.Ldec_key_ret
4291	lea	16($key,$bits),$inp	# points at the end of key schedule
4292
4293	$movkey	($key),%xmm0		# just swap
4294	$movkey	($inp),%xmm1
4295	$movkey	%xmm0,($inp)
4296	$movkey	%xmm1,($key)
4297	lea	16($key),$key
4298	lea	-16($inp),$inp
4299
# The two pointers walk towards each other, 16 bytes per step, for as long
# as the upper one is still above the lower one after the step.
4300.Ldec_key_inverse:
4301	$movkey	($key),%xmm0		# swap and inverse
4302	$movkey	($inp),%xmm1
4303	aesimc	%xmm0,%xmm0
4304	aesimc	%xmm1,%xmm1
4305	lea	16($key),$key
4306	lea	-16($inp),$inp
4307	$movkey	%xmm0,16($inp)
4308	$movkey	%xmm1,-16($key)
4309	cmp	$key,$inp
4310	ja	.Ldec_key_inverse
4311
4312	$movkey	($key),%xmm0		# inverse middle
4313	aesimc	%xmm0,%xmm0
4314	pxor	%xmm1,%xmm1
4315	$movkey	%xmm0,($inp)
4316	pxor	%xmm0,%xmm0
4317.Ldec_key_ret:
4318	add	\$8,%rsp
4319.cfi_adjust_cfa_offset	-8
4320	ret
4321.cfi_endproc
4322.LSEH_end_set_decrypt_key:
4323.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4324___
4325
4326# This is based on submission from Intel by
4327#	Huang Ying
4328#	Vinodh Gopal
4329#	Kahraman Akdemir
4330#
4331# Aggressively optimized in respect to aeskeygenassist's critical path
4332# and is contained in %xmm0-5 to meet Win64 ABI requirement.
4333#
4334# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4335#				int bits, AES_KEY * const key);
4336#
4337# input:	$inp	user-supplied key
4338#		$bits	$inp length in bits
4339#		$key	pointer to key schedule
4340# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
4341#		$bits	rounds-1 (used in aesni_set_decrypt_key)
4342#		*$key	key schedule
4343#		$key	pointer to key schedule (used in
4344#			aesni_set_decrypt_key)
4345#
4346# Subroutine is frame-less, which means that only volatile registers
4347# are used. Note that it's declared "abi-omnipotent", which means that
4348# amount of volatile registers is smaller on Windows.
4349#
# Overview of the assembly emitted by the heredoc below (derived from the
# text itself):
#  * Entry: %rax is preset to -1 and returned as-is when $inp or $key is
#    NULL; a $bits value other than 128, 192 or 256 reaches .Lbad_keybits,
#    which returns -2.
#  * Each key size has two schedules.  The default one uses aeskeygenassist
#    together with the .Lkey_expansion_* helper routines placed after the
#    function's "ret".  The *_alt one is taken when the second dword of
#    OPENSSL_ia32cap_P, masked with bits 28 and 11 (commented as the AVX and
#    XOP bits), equals 1<<28; it uses pshufb/aesenclast with the
#    .Lkey_rotate*/.Lkey_rcon* constants instead.
#  * On success $bits holds 9, 11 or 13 and is stored at the location the
#    inline comments identify as offset 240 of the schedule; %eax is zeroed.
#  * Every exit goes through .Lenc_key_ret, which zeroes %xmm0-%xmm5 before
#    undoing the 8-byte stack adjustment and returning.
4350$code.=<<___;
4351.globl	${PREFIX}_set_encrypt_key
4352.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
4353.align	16
4354${PREFIX}_set_encrypt_key:
4355__aesni_set_encrypt_key:
4356.cfi_startproc
4357#ifndef NDEBUG
4358#ifndef BORINGSSL_FIPS
4359	movb \$1,BORINGSSL_function_hit+3(%rip)
4360#endif
4361#endif
4362	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
4363.cfi_adjust_cfa_offset	8
4364	mov	\$-1,%rax
4365	test	$inp,$inp
4366	jz	.Lenc_key_ret
4367	test	$key,$key
4368	jz	.Lenc_key_ret
4369
4370	movups	($inp),%xmm0		# pull first 128 bits of *userKey
4371	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
4372	leaq	OPENSSL_ia32cap_P(%rip),%r10
4373	movl	4(%r10),%r10d
4374	and	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
4375	lea	16($key),%rax		# %rax is used as modifiable copy of $key
4376	cmp	\$256,$bits
4377	je	.L14rounds
4378	cmp	\$192,$bits
4379	je	.L12rounds
4380	cmp	\$128,$bits
4381	jne	.Lbad_keybits
4382
4383.L10rounds:
4384	mov	\$9,$bits			# 10 rounds for 128-bit key
4385	cmp	\$`1<<28`,%r10d			# AVX, bit no XOP
4386	je	.L10rounds_alt
4387
4388	$movkey	%xmm0,($key)			# round 0
4389	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
4390	call		.Lkey_expansion_128_cold
4391	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
4392	call		.Lkey_expansion_128
4393	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
4394	call		.Lkey_expansion_128
4395	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
4396	call		.Lkey_expansion_128
4397	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
4398	call		.Lkey_expansion_128
4399	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
4400	call		.Lkey_expansion_128
4401	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
4402	call		.Lkey_expansion_128
4403	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
4404	call		.Lkey_expansion_128
4405	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
4406	call		.Lkey_expansion_128
4407	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
4408	call		.Lkey_expansion_128
4409	$movkey	%xmm0,(%rax)
4410	mov	$bits,80(%rax)	# 240(%rdx)
4411	xor	%eax,%eax
4412	jmp	.Lenc_key_ret
4413
4414.align	16
4415.L10rounds_alt:
4416	movdqa	.Lkey_rotate(%rip),%xmm5
4417	mov	\$8,%r10d
4418	movdqa	.Lkey_rcon1(%rip),%xmm4
4419	movdqa	%xmm0,%xmm2
4420	movdqu	%xmm0,($key)
4421	jmp	.Loop_key128
4422
4423.align	16
4424.Loop_key128:
4425	pshufb		%xmm5,%xmm0
4426	aesenclast	%xmm4,%xmm0
4427	pslld		\$1,%xmm4
4428	lea		16(%rax),%rax
4429
4430	movdqa		%xmm2,%xmm3
4431	pslldq		\$4,%xmm2
4432	pxor		%xmm2,%xmm3
4433	pslldq		\$4,%xmm2
4434	pxor		%xmm2,%xmm3
4435	pslldq		\$4,%xmm2
4436	pxor		%xmm3,%xmm2
4437
4438	pxor		%xmm2,%xmm0
4439	movdqu		%xmm0,-16(%rax)
4440	movdqa		%xmm0,%xmm2
4441
4442	dec	%r10d
4443	jnz	.Loop_key128
4444
4445	movdqa		.Lkey_rcon1b(%rip),%xmm4
4446
4447	pshufb		%xmm5,%xmm0
4448	aesenclast	%xmm4,%xmm0
4449	pslld		\$1,%xmm4
4450
4451	movdqa		%xmm2,%xmm3
4452	pslldq		\$4,%xmm2
4453	pxor		%xmm2,%xmm3
4454	pslldq		\$4,%xmm2
4455	pxor		%xmm2,%xmm3
4456	pslldq		\$4,%xmm2
4457	pxor		%xmm3,%xmm2
4458
4459	pxor		%xmm2,%xmm0
4460	movdqu		%xmm0,(%rax)
4461
4462	movdqa		%xmm0,%xmm2
4463	pshufb		%xmm5,%xmm0
4464	aesenclast	%xmm4,%xmm0
4465
4466	movdqa		%xmm2,%xmm3
4467	pslldq		\$4,%xmm2
4468	pxor		%xmm2,%xmm3
4469	pslldq		\$4,%xmm2
4470	pxor		%xmm2,%xmm3
4471	pslldq		\$4,%xmm2
4472	pxor		%xmm3,%xmm2
4473
4474	pxor		%xmm2,%xmm0
4475	movdqu		%xmm0,16(%rax)
4476
4477	mov	$bits,96(%rax)	# 240($key)
4478	xor	%eax,%eax
4479	jmp	.Lenc_key_ret
4480
4481.align	16
4482.L12rounds:
4483	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
4484	mov	\$11,$bits			# 12 rounds for 192
4485	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4486	je	.L12rounds_alt
4487
4488	$movkey	%xmm0,($key)			# round 0
4489	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
4490	call		.Lkey_expansion_192a_cold
4491	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
4492	call		.Lkey_expansion_192b
4493	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
4494	call		.Lkey_expansion_192a
4495	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
4496	call		.Lkey_expansion_192b
4497	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
4498	call		.Lkey_expansion_192a
4499	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
4500	call		.Lkey_expansion_192b
4501	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
4502	call		.Lkey_expansion_192a
4503	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
4504	call		.Lkey_expansion_192b
4505	$movkey	%xmm0,(%rax)
4506	mov	$bits,48(%rax)	# 240(%rdx)
4507	xor	%rax, %rax
4508	jmp	.Lenc_key_ret
4509
4510.align	16
4511.L12rounds_alt:
4512	movdqa	.Lkey_rotate192(%rip),%xmm5
4513	movdqa	.Lkey_rcon1(%rip),%xmm4
4514	mov	\$8,%r10d
4515	movdqu	%xmm0,($key)
4516	jmp	.Loop_key192
4517
4518.align	16
4519.Loop_key192:
4520	movq		%xmm2,0(%rax)
4521	movdqa		%xmm2,%xmm1
4522	pshufb		%xmm5,%xmm2
4523	aesenclast	%xmm4,%xmm2
4524	pslld		\$1, %xmm4
4525	lea		24(%rax),%rax
4526
4527	movdqa		%xmm0,%xmm3
4528	pslldq		\$4,%xmm0
4529	pxor		%xmm0,%xmm3
4530	pslldq		\$4,%xmm0
4531	pxor		%xmm0,%xmm3
4532	pslldq		\$4,%xmm0
4533	pxor		%xmm3,%xmm0
4534
4535	pshufd		\$0xff,%xmm0,%xmm3
4536	pxor		%xmm1,%xmm3
4537	pslldq		\$4,%xmm1
4538	pxor		%xmm1,%xmm3
4539
4540	pxor		%xmm2,%xmm0
4541	pxor		%xmm3,%xmm2
4542	movdqu		%xmm0,-16(%rax)
4543
4544	dec	%r10d
4545	jnz	.Loop_key192
4546
4547	mov	$bits,32(%rax)	# 240($key)
4548	xor	%eax,%eax
4549	jmp	.Lenc_key_ret
4550
4551.align	16
4552.L14rounds:
4553	movups	16($inp),%xmm2			# remaining half of *userKey
4554	mov	\$13,$bits			# 14 rounds for 256
4555	lea	16(%rax),%rax
4556	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4557	je	.L14rounds_alt
4558
4559	$movkey	%xmm0,($key)			# round 0
4560	$movkey	%xmm2,16($key)			# round 1
4561	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
4562	call		.Lkey_expansion_256a_cold
4563	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
4564	call		.Lkey_expansion_256b
4565	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
4566	call		.Lkey_expansion_256a
4567	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
4568	call		.Lkey_expansion_256b
4569	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
4570	call		.Lkey_expansion_256a
4571	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
4572	call		.Lkey_expansion_256b
4573	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
4574	call		.Lkey_expansion_256a
4575	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
4576	call		.Lkey_expansion_256b
4577	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
4578	call		.Lkey_expansion_256a
4579	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
4580	call		.Lkey_expansion_256b
4581	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
4582	call		.Lkey_expansion_256a
4583	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
4584	call		.Lkey_expansion_256b
4585	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
4586	call		.Lkey_expansion_256a
4587	$movkey	%xmm0,(%rax)
4588	mov	$bits,16(%rax)	# 240(%rdx)
4589	xor	%rax,%rax
4590	jmp	.Lenc_key_ret
4591
4592.align	16
4593.L14rounds_alt:
4594	movdqa	.Lkey_rotate(%rip),%xmm5
4595	movdqa	.Lkey_rcon1(%rip),%xmm4
4596	mov	\$7,%r10d
4597	movdqu	%xmm0,0($key)
4598	movdqa	%xmm2,%xmm1
4599	movdqu	%xmm2,16($key)
4600	jmp	.Loop_key256
4601
4602.align	16
4603.Loop_key256:
4604	pshufb		%xmm5,%xmm2
4605	aesenclast	%xmm4,%xmm2
4606
4607	movdqa		%xmm0,%xmm3
4608	pslldq		\$4,%xmm0
4609	pxor		%xmm0,%xmm3
4610	pslldq		\$4,%xmm0
4611	pxor		%xmm0,%xmm3
4612	pslldq		\$4,%xmm0
4613	pxor		%xmm3,%xmm0
4614	pslld		\$1,%xmm4
4615
4616	pxor		%xmm2,%xmm0
4617	movdqu		%xmm0,(%rax)
4618
4619	dec	%r10d
4620	jz	.Ldone_key256
4621
4622	pshufd		\$0xff,%xmm0,%xmm2
4623	pxor		%xmm3,%xmm3
4624	aesenclast	%xmm3,%xmm2
4625
4626	movdqa		%xmm1,%xmm3
4627	pslldq		\$4,%xmm1
4628	pxor		%xmm1,%xmm3
4629	pslldq		\$4,%xmm1
4630	pxor		%xmm1,%xmm3
4631	pslldq		\$4,%xmm1
4632	pxor		%xmm3,%xmm1
4633
4634	pxor		%xmm1,%xmm2
4635	movdqu		%xmm2,16(%rax)
4636	lea		32(%rax),%rax
4637	movdqa		%xmm2,%xmm1
4638
4639	jmp	.Loop_key256
4640
4641.Ldone_key256:
4642	mov	$bits,16(%rax)	# 240($key)
4643	xor	%eax,%eax
4644	jmp	.Lenc_key_ret
4645
4646.align	16
4647.Lbad_keybits:
4648	mov	\$-2,%rax
4649.Lenc_key_ret:
4650	pxor	%xmm0,%xmm0
4651	pxor	%xmm1,%xmm1
4652	pxor	%xmm2,%xmm2
4653	pxor	%xmm3,%xmm3
4654	pxor	%xmm4,%xmm4
4655	pxor	%xmm5,%xmm5
4656	add	\$8,%rsp
4657.cfi_adjust_cfa_offset	-8
4658	ret
4659.cfi_endproc
4660.LSEH_end_set_encrypt_key:
4661
4662.align	16
4663.Lkey_expansion_128:
4664	$movkey	%xmm0,(%rax)
4665	lea	16(%rax),%rax
4666.Lkey_expansion_128_cold:
4667	shufps	\$0b00010000,%xmm0,%xmm4
4668	xorps	%xmm4, %xmm0
4669	shufps	\$0b10001100,%xmm0,%xmm4
4670	xorps	%xmm4, %xmm0
4671	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
4672	xorps	%xmm1,%xmm0
4673	ret
4674
4675.align 16
4676.Lkey_expansion_192a:
4677	$movkey	%xmm0,(%rax)
4678	lea	16(%rax),%rax
4679.Lkey_expansion_192a_cold:
4680	movaps	%xmm2, %xmm5
4681.Lkey_expansion_192b_warm:
4682	shufps	\$0b00010000,%xmm0,%xmm4
4683	movdqa	%xmm2,%xmm3
4684	xorps	%xmm4,%xmm0
4685	shufps	\$0b10001100,%xmm0,%xmm4
4686	pslldq	\$4,%xmm3
4687	xorps	%xmm4,%xmm0
4688	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
4689	pxor	%xmm3,%xmm2
4690	pxor	%xmm1,%xmm0
4691	pshufd	\$0b11111111,%xmm0,%xmm3
4692	pxor	%xmm3,%xmm2
4693	ret
4694
4695.align 16
4696.Lkey_expansion_192b:
4697	movaps	%xmm0,%xmm3
4698	shufps	\$0b01000100,%xmm0,%xmm5
4699	$movkey	%xmm5,(%rax)
4700	shufps	\$0b01001110,%xmm2,%xmm3
4701	$movkey	%xmm3,16(%rax)
4702	lea	32(%rax),%rax
4703	jmp	.Lkey_expansion_192b_warm
4704
4705.align	16
4706.Lkey_expansion_256a:
4707	$movkey	%xmm2,(%rax)
4708	lea	16(%rax),%rax
4709.Lkey_expansion_256a_cold:
4710	shufps	\$0b00010000,%xmm0,%xmm4
4711	xorps	%xmm4,%xmm0
4712	shufps	\$0b10001100,%xmm0,%xmm4
4713	xorps	%xmm4,%xmm0
4714	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
4715	xorps	%xmm1,%xmm0
4716	ret
4717
4718.align 16
4719.Lkey_expansion_256b:
4720	$movkey	%xmm0,(%rax)
4721	lea	16(%rax),%rax
4722
4723	shufps	\$0b00010000,%xmm2,%xmm4
4724	xorps	%xmm4,%xmm2
4725	shufps	\$0b10001100,%xmm2,%xmm4
4726	xorps	%xmm4,%xmm2
4727	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
4728	xorps	%xmm1,%xmm2
4729	ret
4730.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
4731.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
4732___
4733}
4734
# Constant pool appended after the routines.  Within the visible part of this
# file, .Lkey_rotate/.Lkey_rotate192 are loaded into %xmm5 and used as pshufb
# masks, and .Lkey_rcon1/.Lkey_rcon1b are loaded into %xmm4 and used as the
# aesenclast operand, by the *_alt key-schedule paths.  The remaining labels
# are presumably consumed by mode routines emitted earlier in the file (not
# visible here -- confirm before changing any value or the label order).
4735$code.=<<___;
4736.align	64
4737.Lbswap_mask:
4738	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
4739.Lincrement32:
4740	.long	6,6,6,0
4741.Lincrement64:
4742	.long	1,0,0,0
4743.Lxts_magic:
4744	.long	0x87,0,1,0
4745.Lincrement1:
4746	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4747.Lkey_rotate:
4748	.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
4749.Lkey_rotate192:
4750	.long	0x04070605,0x04070605,0x04070605,0x04070605
4751.Lkey_rcon1:
4752	.long	1,1,1,1
4753.Lkey_rcon1b:
4754	.long	0x1b,0x1b,0x1b,0x1b
4755
4756.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4757.align	64
4758___
4759
4760# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4761#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
4762if ($win64) {
# Register names through which the handlers below reference their four
# arguments (rec, frame, context, disp), in that order.
4763$rec="%rcx";
4764$frame="%rdx";
4765$context="%r8";
4766$disp="%r9";
4767
# RtlVirtualUnwind is called indirectly through this import pointer at the
# end of .Lcommon_seh_tail.
4768$code.=<<___;
4769.extern	__imp_RtlVirtualUnwind
4770___
# Exception handlers for the ECB/CCM64 and CTR/XTS routines; only emitted
# when $PREFIX is "aes_hw".
#
# Both handlers follow the same shape:
#   1. Save non-volatile registers and reserve 64 bytes of stack.
#   2. Compare context->Rip against the two RVAs in HandlerData[] (prologue
#      and epilogue labels, relative to disp->ImageBase).  Outside that range
#      no frame fix-up is needed and control goes straight to
#      .Lcommon_seh_tail with %rax holding the value to restore as Rsp.
#   3. Inside the range, copy the saved %xmm registers from the frame into
#      the CONTEXT starting at context.Xmm6 and compute the caller's stack
#      pointer in %rax before joining .Lcommon_seh_tail.
#
# Each .size directive names the handler's own label.
$code.=<<___ if ($PREFIX eq "aes_hw");
.type	ecb_ccm64_se_handler,\@abi-omnipotent
.align	16
ecb_ccm64_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	0(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0x58(%rax),%rax		# adjust stack pointer

	jmp	.Lcommon_seh_tail
.size	ecb_ccm64_se_handler,.-ecb_ccm64_se_handler

.type	ctr_xts_se_handler,\@abi-omnipotent
.align	16
ctr_xts_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	lea	-0xa8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbp		# restore saved %rbp
	mov	%rbp,160($context)	# restore context->Rbp
	jmp	.Lcommon_seh_tail
.size	ctr_xts_se_handler,.-ctr_xts_se_handler

___
4858# BoringSSL omits the OCB functions.
# The condition on the next statement is the constant 0, so this heredoc is
# never appended to $code and ocb_se_handler is not emitted.
4859$code.=<<___ if (0);
4860.type	ocb_se_handler,\@abi-omnipotent
4861.align	16
4862ocb_se_handler:
4863	push	%rsi
4864	push	%rdi
4865	push	%rbx
4866	push	%rbp
4867	push	%r12
4868	push	%r13
4869	push	%r14
4870	push	%r15
4871	pushfq
4872	sub	\$64,%rsp
4873
4874	mov	120($context),%rax	# pull context->Rax
4875	mov	248($context),%rbx	# pull context->Rip
4876
4877	mov	8($disp),%rsi		# disp->ImageBase
4878	mov	56($disp),%r11		# disp->HandlerData
4879
4880	mov	0(%r11),%r10d		# HandlerData[0]
4881	lea	(%rsi,%r10),%r10	# prologue lable
4882	cmp	%r10,%rbx		# context->Rip<prologue label
4883	jb	.Lcommon_seh_tail
4884
4885	mov	4(%r11),%r10d		# HandlerData[1]
4886	lea	(%rsi,%r10),%r10	# epilogue label
4887	cmp	%r10,%rbx		# context->Rip>=epilogue label
4888	jae	.Lcommon_seh_tail
4889
4890	mov	8(%r11),%r10d		# HandlerData[2]
4891	lea	(%rsi,%r10),%r10
4892	cmp	%r10,%rbx		# context->Rip>=pop label
4893	jae	.Locb_no_xmm
4894
4895	mov	152($context),%rax	# pull context->Rsp
4896
4897	lea	(%rax),%rsi		# %xmm save area
4898	lea	512($context),%rdi	# & context.Xmm6
4899	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4900	.long	0xa548f3fc		# cld; rep movsq
4901	lea	0xa0+0x28(%rax),%rax
4902
4903.Locb_no_xmm:
4904	mov	-8(%rax),%rbx
4905	mov	-16(%rax),%rbp
4906	mov	-24(%rax),%r12
4907	mov	-32(%rax),%r13
4908	mov	-40(%rax),%r14
4909
4910	mov	%rbx,144($context)	# restore context->Rbx
4911	mov	%rbp,160($context)	# restore context->Rbp
4912	mov	%r12,216($context)	# restore context->R12
4913	mov	%r13,224($context)	# restore context->R13
4914	mov	%r14,232($context)	# restore context->R14
4915
4916	jmp	.Lcommon_seh_tail
4917.size	ocb_se_handler,.-ocb_se_handler
4918___
# cbc_se_handler plus the tail that the other handlers in this file jump to.
#
# cbc_se_handler compares context->Rip against three fixed labels
# (.Lcbc_decrypt_bulk, .Lcbc_decrypt_body, .Lcbc_ret) rather than against
# HandlerData[] to decide which value to place in %rax and whether the %xmm
# save area and %rbp need to be copied back into the CONTEXT.
#
# .Lcommon_seh_tail uses %rax as the stack-pointer value to restore: it loads
# %rdi/%rsi from 8(%rax)/16(%rax), writes %rax/%rsi/%rdi into the CONTEXT,
# copies the CONTEXT (154 quadwords) to disp->ContextRecord, calls
# RtlVirtualUnwind and returns 1 (ExceptionContinueSearch).
#
# The heredoc ends by opening the .pdata section that the statements after
# it populate.
4919$code.=<<___;
4920.type	cbc_se_handler,\@abi-omnipotent
4921.align	16
4922cbc_se_handler:
4923	push	%rsi
4924	push	%rdi
4925	push	%rbx
4926	push	%rbp
4927	push	%r12
4928	push	%r13
4929	push	%r14
4930	push	%r15
4931	pushfq
4932	sub	\$64,%rsp
4933
4934	mov	152($context),%rax	# pull context->Rsp
4935	mov	248($context),%rbx	# pull context->Rip
4936
4937	lea	.Lcbc_decrypt_bulk(%rip),%r10
4938	cmp	%r10,%rbx		# context->Rip<"prologue" label
4939	jb	.Lcommon_seh_tail
4940
4941	mov	120($context),%rax	# pull context->Rax
4942
4943	lea	.Lcbc_decrypt_body(%rip),%r10
4944	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
4945	jb	.Lcommon_seh_tail
4946
4947	mov	152($context),%rax	# pull context->Rsp
4948
4949	lea	.Lcbc_ret(%rip),%r10
4950	cmp	%r10,%rbx		# context->Rip>="epilogue" label
4951	jae	.Lcommon_seh_tail
4952
4953	lea	16(%rax),%rsi		# %xmm save area
4954	lea	512($context),%rdi	# &context.Xmm6
4955	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4956	.long	0xa548f3fc		# cld; rep movsq
4957
4958	mov	208($context),%rax	# pull context->R11
4959
4960	mov	-8(%rax),%rbp		# restore saved %rbp
4961	mov	%rbp,160($context)	# restore context->Rbp
4962
4963.Lcommon_seh_tail:
4964	mov	8(%rax),%rdi
4965	mov	16(%rax),%rsi
4966	mov	%rax,152($context)	# restore context->Rsp
4967	mov	%rsi,168($context)	# restore context->Rsi
4968	mov	%rdi,176($context)	# restore context->Rdi
4969
4970	mov	40($disp),%rdi		# disp->ContextRecord
4971	mov	$context,%rsi		# context
4972	mov	\$154,%ecx		# sizeof(CONTEXT)
4973	.long	0xa548f3fc		# cld; rep movsq
4974
4975	mov	$disp,%rsi
4976	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
4977	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
4978	mov	0(%rsi),%r8		# arg3, disp->ControlPc
4979	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
4980	mov	40(%rsi),%r10		# disp->ContextRecord
4981	lea	56(%rsi),%r11		# &disp->HandlerData
4982	lea	24(%rsi),%r12		# &disp->EstablisherFrame
4983	mov	%r10,32(%rsp)		# arg5
4984	mov	%r11,40(%rsp)		# arg6
4985	mov	%r12,48(%rsp)		# arg7
4986	mov	%rcx,56(%rsp)		# arg8, (NULL)
4987	call	*__imp_RtlVirtualUnwind(%rip)
4988
4989	mov	\$1,%eax		# ExceptionContinueSearch
4990	add	\$64,%rsp
4991	popfq
4992	pop	%r15
4993	pop	%r14
4994	pop	%r13
4995	pop	%r12
4996	pop	%rbp
4997	pop	%rbx
4998	pop	%rdi
4999	pop	%rsi
5000	ret
5001.size	cbc_se_handler,.-cbc_se_handler
5002
5003.section	.pdata
5004.align	4
5005___
# Unwind tables.  .pdata receives one (begin, end, unwind-info) RVA triple
# per function; .xdata holds the unwind-info records those triples point at.
# The ecb/ctr32 entries are emitted only when $PREFIX is "aes_hw".
# Records starting with ".byte 9,0,0,0" name one of the handlers above; for
# ecb/ctr32 they are followed by the two HandlerData[] RVAs that the handler
# reads as prologue and epilogue labels.  .LSEH_info_key is shared by both
# key-setup routines and, per its inline comment, describes "sub rsp,8".
5006$code.=<<___ if ($PREFIX eq "aes_hw");
5007	.rva	.LSEH_begin_${PREFIX}_ecb_encrypt
5008	.rva	.LSEH_end_${PREFIX}_ecb_encrypt
5009	.rva	.LSEH_info_ecb
5010
5011	.rva	.LSEH_begin_${PREFIX}_ctr32_encrypt_blocks
5012	.rva	.LSEH_end_${PREFIX}_ctr32_encrypt_blocks
5013	.rva	.LSEH_info_ctr32
5014___
5015$code.=<<___;
5016	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
5017	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
5018	.rva	.LSEH_info_cbc
5019
5020	.rva	${PREFIX}_set_decrypt_key
5021	.rva	.LSEH_end_set_decrypt_key
5022	.rva	.LSEH_info_key
5023
5024	.rva	${PREFIX}_set_encrypt_key
5025	.rva	.LSEH_end_set_encrypt_key
5026	.rva	.LSEH_info_key
5027.section	.xdata
5028.align	8
5029___
5030$code.=<<___ if ($PREFIX eq "aes_hw");
5031.LSEH_info_ecb:
5032	.byte	9,0,0,0
5033	.rva	ecb_ccm64_se_handler
5034	.rva	.Lecb_enc_body,.Lecb_enc_ret		# HandlerData[]
5035.LSEH_info_ctr32:
5036	.byte	9,0,0,0
5037	.rva	ctr_xts_se_handler
5038	.rva	.Lctr32_body,.Lctr32_epilogue		# HandlerData[]
5039___
5040$code.=<<___;
5041.LSEH_info_cbc:
5042	.byte	9,0,0,0
5043	.rva	cbc_se_handler
5044.LSEH_info_key:
5045	.byte	0x01,0x04,0x01,0x00
5046	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
5047___
5048}
5049
# rex(\@bytes, $dst, $src)
#
# Appends a REX prefix byte to the opcode array referenced by the first
# argument when either register number is 8 or above.  $dst is the register
# that aesni() places in the ModR/M reg field (REX.R, 0x04) and $src the one
# it places in the r/m field (REX.B, 0x01).  Nothing is appended when both
# numbers are below 8.
sub rex {
  my ($bytes,$dst,$src)=@_;
  my $prefix=0;

    $prefix|=0x04		if($dst>=8);	# REX.R
    $prefix|=0x01		if($src>=8);	# REX.B
    push @$bytes,$prefix|0x40	if($prefix);
}
5059
# aesni($line)
#
# Translates one AES-NI instruction, given as assembly text, into an
# equivalent ".byte" directive.  Three operand shapes are recognised:
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} disp(%rsp),%xmmD
# Returns the ".byte" string on success, undef when the operands match one of
# the last two shapes but the mnemonic is not in that shape's table, and the
# input line unchanged when no shape matches.
sub aesni {
  my ($insn)=@_;
  my @bytes=(0x66);	# every encoded form starts with the 0x66 prefix

    # Immediate form: aeskeygenassist $imm,%xmmS,%xmmD
    if (my (undef,$imm,$src,$dst) =
	$insn=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	rex(\@bytes,$dst,$src);
	push @bytes,0x0f,0x3a,0xdf;
	push @bytes,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# oct() handles both "0x.." and leading-zero octal spellings
	push @bytes,$imm=~/^0/?oct($imm):$imm;
	return ".byte\t".join(',',@bytes);
    }

    # Register-to-register form
    if (my ($op,$src,$dst) =
	$insn=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$op}));
	rex(\@bytes,$dst,$src);
	push @bytes,0x0f,0x38,$opcodelet{$op};
	push @bytes,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@bytes);
    }

    # Stack-operand form: disp(%rsp) source, encoded with an 8-bit displacement
    if (my ($op,$disp,$dst) =
	$insn=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$op}));
	push @bytes,0x44 if ($dst>=8);		# REX.R for xmm8 and above
	push @bytes,0x0f,0x38,$opcodelet{$op};
	push @bytes,0x44|(($dst&7)<<3),0x24;	# ModR/M
	push @bytes,($disp=~/^0/?oct($disp):$disp)&0xff;
	return ".byte\t".join(',',@bytes);
    }

    return $insn;
}
5099
# movbe($disp)
#
# Returns the ".byte" directive for "movbe %eax,$disp(%rsp)"; $disp is
# appended as the final displacement byte.
sub movbe {
	my ($disp)=@_;
	return ".byte	0x0f,0x38,0xf1,0x44,0x24,".$disp;
}
5103
# Post-process the accumulated assembly text, then emit it on STDOUT.

# Replace every `...` span with the result of evaluating it as a Perl
# expression (e.g. `1<<28`).
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Replace AES-NI instructions, up to their last %xmm operand, with the
# ".byte" encoding produced by aesni(); the rest of the line is dropped.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
# Replace "movbe %eax,disp(%rsp)" with the encoding produced by movbe().
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# Buffered write errors (full disk, closed pipe) only surface at close; die
# rather than silently leaving a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";
5112