1#! /usr/bin/env perl
2# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for the Intel AES-NI extension. In the
18# OpenSSL context it's used with the Intel engine, but can also be used as
19# a drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
20# details].
21#
22# Performance.
23#
24# Given the aes[enc|dec] instructions' latency, the asymptotic performance
25# for non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
26# processed with a 128-bit key. And given their throughput, the asymptotic
27# performance for parallelizable modes is 1.25 cycles per byte. Being an
28# asymptotic limit it's not something you commonly achieve in reality,
29# but how close does one get? Below are results collected for
30# different modes and block sizes. Pairs of numbers are for en-/
31# decryption.
32#
33#	16-byte     64-byte     256-byte    1-KB        8-KB
34# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
35# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
36# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
37# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
38# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
39# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
40#
41# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42# that the otherwise-used 'openssl speed -evp aes-128-??? -engine aesni
43# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
44# The results were collected with a specially crafted speed.c benchmark
45# in order to compare them with the results reported in "Intel Advanced
46# Encryption Standard (AES) New Instruction Set" White Paper Revision
47# 3.0 dated May 2010. All of the above results are consistently better. This
48# module also provides better performance for block sizes smaller than
49# 128 bytes at points *not* represented in the above table.
50#
51# Looking at the results for the 8-KB buffer.
52#
53# CFB and OFB results are far from the limit, because the implementation
54# uses the "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55# single-block aesni_encrypt, which is not the optimal way to go.
56# The CBC encrypt result is unexpectedly high and there is no documented
57# explanation for it. Seemingly there is a small penalty for feeding
58# the result back to the AES unit the way it's done in CBC mode. There is
59# nothing one can do about it and the result appears optimal. The CCM result is
60# identical to CBC, because CBC-MAC is essentially CBC encrypt without
61# saving output. CCM CTR "stays invisible," because it's neatly
62# interleaved with CBC-MAC. This provides ~30% improvement over
63# "straightforward" CCM implementation with CTR and CBC-MAC performed
64# disjointly. Parallelizable modes practically achieve the theoretical
65# limit.
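#
# (As a rough illustration of that interleaving - see .Lccm64_enc2_loop
# further below - the CTR block and the CBC-MAC block are advanced
# through the rounds in lock-step,
#
#	aesenc	round[i],ctr_block	# CTR leg
#	aesenc	round[i],cmac_block	# CBC-MAC leg
#
# so the second computation executes in the latency shadow of the first
# instead of costing a separate pass over the key schedule.)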
66#
67# Looking at how results vary with buffer size.
68#
69# Curves are practically saturated at 1-KB buffer size. In most cases
70# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
71# The CTR curve doesn't follow this pattern and is the "slowest"-changing
72# one, with the "256-byte" result being 87% of the "8-KB" one. This is because
73# the overhead in CTR mode is the most computationally intensive. Small-block CCM
74# decrypt is slower than encrypt, because first CTR and last CBC-MAC
75# iterations can't be interleaved.
76#
77# Results for 192- and 256-bit keys.
78#
79# EVP-free results were observed to scale perfectly with number of
80# rounds for larger block sizes, i.e. the 192-bit result being lower by a
81# factor of 10/12 and the 256-bit one - by 10/14. Well, in the CBC encrypt
82# case the differences are a tad smaller, because the above-mentioned penalty
83# biases all results by the same constant value. In a similar way function
84# call overhead affects small-block performance, as well as OFB and CFB
85# results. Differences are not large, the most common coefficients are
86# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one can
87# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
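#
# (A hedged worked example of that scaling: the 8-KB ECB figure of 1.26
# cycles per byte with a 128-bit key would be expected to become about
# 1.26*12/10 = 1.51 with a 192-bit key and 1.26*14/10 = 1.76 with a
# 256-bit key, i.e. the 10/12 and 10/14 coefficients mentioned above.)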
88
89# January 2011
90#
91# While the Westmere processor features 6-cycle latency for aes[enc|dec]
92# instructions, which can be scheduled every second cycle, Sandy
93# Bridge spends 8 cycles per instruction, but it can schedule them
94# every cycle. This means that code targeting Westmere would perform
95# suboptimally on Sandy Bridge; hence this update.
96#
97# In addition, non-parallelizable CBC encrypt (as well as CCM) is
98# optimized. Relative improvement might appear modest, 8% on Westmere,
99# but in absolute terms it's 3.77 cycles per byte encrypted with
100# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101# should be compared to asymptotic limits of 3.75 for Westmere and
102# 5.00 for Sandy Bridge. Actually, the fact that they get this close
103# to asymptotic limits is quite amazing. Indeed, the limit is
104# calculated as latency times number of rounds, 10 for 128-bit key,
105# and divided by 16, the number of bytes in block, or in other words
106# it accounts *solely* for aesenc instructions. But there are extra
107# instructions, and numbers so close to the asymptotic limits mean
108# that it's as if it takes as little as *one* additional cycle to
109# execute all of them. How is it possible? It is possible thanks to
110# out-of-order execution logic, which manages to overlap post-
111# processing of previous block, things like saving the output, with
112# actual encryption of current block, as well as pre-processing of
113# current block, things like fetching input and xor-ing it with
114# 0-round element of the key schedule, with actual encryption of
115# previous block. Keep this in mind...
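#
# (For reference, the arithmetic behind those limits: on Westmere,
# 6-cycle latency * 10 rounds / 16 bytes per block = 3.75 cycles per
# byte; on Sandy Bridge, 8 * 10 / 16 = 5.00. The measured 3.77 and
# 5.07 are within ~1.5% of these figures.)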
116#
117# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
118# performance is achieved by interleaving instructions working on
119# independent blocks. In which case the asymptotic limit for such modes
120# can be obtained by dividing the above-mentioned numbers by the AES
121# instructions' interleave factor. Westmere can execute at most 3
122# instructions at a time, meaning that the optimal interleave factor is 3,
123# and that's where the "magic" number of 1.25 comes from. "Optimal
124# interleave factor" means that increasing the interleave factor further
125# does not improve performance. The formula has proven to reflect reality
126# pretty well on Westmere... Sandy Bridge on the other hand can
127# execute up to 8 AES instructions at a time, so how does varying
128# the interleave factor affect the performance? Here is a table for ECB
129# (numbers are cycles per byte processed with 128-bit key):
130#
131# instruction interleave factor		3x	6x	8x
132# theoretical asymptotic limit		1.67	0.83	0.625
133# measured performance for 8KB block	1.05	0.86	0.84
134#
135# "as if" interleave factor		4.7x	5.8x	6.0x
136#
137# Further data for other parallelizable modes:
138#
139# CBC decrypt				1.16	0.93	0.74
140# CTR					1.14	0.91	0.74
141#
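#
# (The "theoretical asymptotic limit" row is simply the single-block
# limit divided by the interleave factor, e.g. 8*10/16/3 = 1.67 and
# 8*10/16/8 = 0.625 for Sandy Bridge, just as the Westmere "magic"
# 1.25 is 6*10/16/3. The "as if" row works backwards from the
# measurement, e.g. 5.00/0.84 = ~6.0 for the 8x column.)
#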
142# Well, given the 3x column, it's probably inappropriate to call the limit
143# asymptotic if it can be surpassed, isn't it? What happens there?
144# Rewind to the CBC paragraph for the answer. Yes, out-of-order execution
145# magic is responsible for this. The processor overlaps not only the
146# additional instructions with the AES ones, but even AES instructions
147# processing adjacent triplets of independent blocks. In the 6x case the
148# additional instructions still claim a disproportionately small amount
149# of additional cycles, but in the 8x case the number of instructions must
150# be a tad too high for the out-of-order logic to cope with, and the AES
151# unit remains underutilized... As you can see 8x interleave is hardly
152# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
153# utilizes 6x interleave because of its limited register bank capacity.
154#
155# Higher interleave factors do have a negative impact on Westmere
156# performance. While for ECB mode it's a negligible ~1.5%, other
157# parallelizable modes perform ~5% worse, which is outweighed by the ~25%
158# improvement on Sandy Bridge. To balance the regression on Westmere,
159# CTR mode was implemented with a 6x aesenc interleave factor.
160
161# April 2011
162#
163# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164# one byte out of 8KB with a 128-bit key, Sandy Bridge - 0.90. Just like
165# in CTR mode, the AES instruction interleave factor was chosen to be 6x.
166
167# November 2015
168#
169# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
170# chosen to be 6x.
171
172######################################################################
173# Current large-block performance in cycles per byte processed with
174# 128-bit key (less is better).
175#
176#		CBC en-/decrypt	CTR	XTS	ECB	OCB
177# Westmere	3.77/1.25	1.25	1.25	1.26
178# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
179# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
180# Skylake	2.62/0.63	0.63	0.63	0.63
181# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
182# Knights L	2.54/0.77	0.78	0.85	-	1.50
183# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
184# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
185# Ryzen		2.71/0.35	0.35	0.44	0.38	0.49
186#
187# (*)	Atom Silvermont ECB result is suboptimal because of penalties
188#	incurred by operations on %xmm8-15. As ECB is not considered
189#	critical, nothing was done to mitigate the problem.
190
191$PREFIX="aes_hw";	# if $PREFIX is set to "AES", the script
192			# generates drop-in replacement for
193			# crypto/aes/asm/aes-x86_64.pl:-)
194
195$flavour = shift;
196$output  = shift;
197if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
198
199$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
200
201$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
202( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
203( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
204die "can't locate x86_64-xlate.pl";
205
206open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
207*STDOUT=*OUT;
208
209$movkey = $PREFIX eq "aes_hw" ? "movups" : "movups";
210@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
211		("%rdi","%rsi","%rdx","%rcx");	# Unix order
212
213$code=".text\n";
214$code.=".extern	OPENSSL_ia32cap_P\n";
215
216$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
217# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
218$inp="%rdi";
219$out="%rsi";
220$len="%rdx";
221$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
222$ivp="%r8";	# cbc, ctr, ...
223
224$rnds_="%r10d";	# backup copy for $rounds
225$key_="%r11";	# backup copy for $key
226
227# %xmm register layout
228$rndkey0="%xmm0";	$rndkey1="%xmm1";
229$inout0="%xmm2";	$inout1="%xmm3";
230$inout2="%xmm4";	$inout3="%xmm5";
231$inout4="%xmm6";	$inout5="%xmm7";
232$inout6="%xmm8";	$inout7="%xmm9";
233
234$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
235$in0="%xmm8";		$iv="%xmm9";
236
237# Inline version of internal aesni_[en|de]crypt1.
238#
239# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
240# cycles which take care of loop variables...
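#
# (A back-of-the-envelope sketch of why folding is essentially free:
# with 10 rounds and a 128-bit key the dependent aes[enc|dec] chain
# alone costs roughly 10*6 = 60 cycles on Westmere, while the
# dec/$movkey/lea/jnz bookkeeping of each iteration is independent of
# that chain and fits in the 6-8 cycle latency shadow, so the folded
# loop costs about the same as a fully unrolled one.)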
241{ my $sn;
242sub aesni_generate1 {
243my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
244++$sn;
245$code.=<<___;
246	$movkey	($key),$rndkey0
247	$movkey	16($key),$rndkey1
248___
249$code.=<<___ if (defined($ivec));
250	xorps	$rndkey0,$ivec
251	lea	32($key),$key
252	xorps	$ivec,$inout
253___
254$code.=<<___ if (!defined($ivec));
255	lea	32($key),$key
256	xorps	$rndkey0,$inout
257___
258$code.=<<___;
259.Loop_${p}1_$sn:
260	aes${p}	$rndkey1,$inout
261	dec	$rounds
262	$movkey	($key),$rndkey1
263	lea	16($key),$key
264	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
265	aes${p}last	$rndkey1,$inout
266___
267}}
268# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
269#
270{ my ($inp,$out,$key) = @_4args;
271
272$code.=<<___;
273.globl	${PREFIX}_encrypt
274.type	${PREFIX}_encrypt,\@abi-omnipotent
275.align	16
276${PREFIX}_encrypt:
277.cfi_startproc
278#ifdef BORINGSSL_DISPATCH_TEST
279.extern	BORINGSSL_function_hit
280	movb \$1,BORINGSSL_function_hit+1(%rip)
281#endif
282	movups	($inp),$inout0		# load input
283	mov	240($key),$rounds	# key->rounds
284___
285	&aesni_generate1("enc",$key,$rounds);
286$code.=<<___;
287	 pxor	$rndkey0,$rndkey0	# clear register bank
288	 pxor	$rndkey1,$rndkey1
289	movups	$inout0,($out)		# output
290	 pxor	$inout0,$inout0
291	ret
292.cfi_endproc
293.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
294
295.globl	${PREFIX}_decrypt
296.type	${PREFIX}_decrypt,\@abi-omnipotent
297.align	16
298${PREFIX}_decrypt:
299.cfi_startproc
300	movups	($inp),$inout0		# load input
301	mov	240($key),$rounds	# key->rounds
302___
303	&aesni_generate1("dec",$key,$rounds);
304$code.=<<___;
305	 pxor	$rndkey0,$rndkey0	# clear register bank
306	 pxor	$rndkey1,$rndkey1
307	movups	$inout0,($out)		# output
308	 pxor	$inout0,$inout0
309	ret
310.cfi_endproc
311.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
312___
313}
314
315# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
316# factor. Why were 3x subroutines originally used in loops? Even though
317# aes[enc|dec] latency was originally 6, it could be scheduled only
318# every *2nd* cycle. Thus 3x interleave was the one providing optimal
319# utilization, i.e. when the subroutine's throughput is virtually the same
320# as that of the non-interleaved subroutine [for up to 3 input blocks].
321# This is why it originally made no sense to implement a 2x subroutine.
322# But times change and it became appropriate to spend an extra 192 bytes
323# on a 2x subroutine on Atom Silvermont's account. For processors that
324# can schedule aes[enc|dec] every cycle, the optimal interleave factor
325# equals the corresponding instruction's latency. 8x is optimal for
326# * Bridge and "super-optimal" for other Intel CPUs...
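#
# (In other words the optimal interleave factor is roughly
# latency/issue-interval: 6/2 = 3 independent blocks keep the Westmere
# AES unit busy, while 8/1 = 8 are needed on Sandy Bridge - hence the
# 2x/3x/4x/6x/8x family of subroutines below.)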
327
328sub aesni_generate2 {
329my $dir=shift;
330# As already mentioned it takes in $key and $rounds, which are *not*
331# preserved. $inout[0-1] is cipher/clear text...
332$code.=<<___;
333.type	_aesni_${dir}rypt2,\@abi-omnipotent
334.align	16
335_aesni_${dir}rypt2:
336.cfi_startproc
337	$movkey	($key),$rndkey0
338	shl	\$4,$rounds
339	$movkey	16($key),$rndkey1
340	xorps	$rndkey0,$inout0
341	xorps	$rndkey0,$inout1
342	$movkey	32($key),$rndkey0
343	lea	32($key,$rounds),$key
344	neg	%rax				# $rounds
345	add	\$16,%rax
346
347.L${dir}_loop2:
348	aes${dir}	$rndkey1,$inout0
349	aes${dir}	$rndkey1,$inout1
350	$movkey		($key,%rax),$rndkey1
351	add		\$32,%rax
352	aes${dir}	$rndkey0,$inout0
353	aes${dir}	$rndkey0,$inout1
354	$movkey		-16($key,%rax),$rndkey0
355	jnz		.L${dir}_loop2
356
357	aes${dir}	$rndkey1,$inout0
358	aes${dir}	$rndkey1,$inout1
359	aes${dir}last	$rndkey0,$inout0
360	aes${dir}last	$rndkey0,$inout1
361	ret
362.cfi_endproc
363.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
364___
365}
366sub aesni_generate3 {
367my $dir=shift;
368# As already mentioned it takes in $key and $rounds, which are *not*
369# preserved. $inout[0-2] is cipher/clear text...
370$code.=<<___;
371.type	_aesni_${dir}rypt3,\@abi-omnipotent
372.align	16
373_aesni_${dir}rypt3:
374.cfi_startproc
375	$movkey	($key),$rndkey0
376	shl	\$4,$rounds
377	$movkey	16($key),$rndkey1
378	xorps	$rndkey0,$inout0
379	xorps	$rndkey0,$inout1
380	xorps	$rndkey0,$inout2
381	$movkey	32($key),$rndkey0
382	lea	32($key,$rounds),$key
383	neg	%rax				# $rounds
384	add	\$16,%rax
385
386.L${dir}_loop3:
387	aes${dir}	$rndkey1,$inout0
388	aes${dir}	$rndkey1,$inout1
389	aes${dir}	$rndkey1,$inout2
390	$movkey		($key,%rax),$rndkey1
391	add		\$32,%rax
392	aes${dir}	$rndkey0,$inout0
393	aes${dir}	$rndkey0,$inout1
394	aes${dir}	$rndkey0,$inout2
395	$movkey		-16($key,%rax),$rndkey0
396	jnz		.L${dir}_loop3
397
398	aes${dir}	$rndkey1,$inout0
399	aes${dir}	$rndkey1,$inout1
400	aes${dir}	$rndkey1,$inout2
401	aes${dir}last	$rndkey0,$inout0
402	aes${dir}last	$rndkey0,$inout1
403	aes${dir}last	$rndkey0,$inout2
404	ret
405.cfi_endproc
406.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
407___
408}
409# 4x interleave is implemented to improve small-block performance,
410# most notably [and naturally] the 4-block case by ~30%. One can argue that
411# one should have implemented 5x as well, but the improvement would be <20%,
412# so it's not worth it...
413sub aesni_generate4 {
414my $dir=shift;
415# As already mentioned it takes in $key and $rounds, which are *not*
416# preserved. $inout[0-3] is cipher/clear text...
417$code.=<<___;
418.type	_aesni_${dir}rypt4,\@abi-omnipotent
419.align	16
420_aesni_${dir}rypt4:
421.cfi_startproc
422	$movkey	($key),$rndkey0
423	shl	\$4,$rounds
424	$movkey	16($key),$rndkey1
425	xorps	$rndkey0,$inout0
426	xorps	$rndkey0,$inout1
427	xorps	$rndkey0,$inout2
428	xorps	$rndkey0,$inout3
429	$movkey	32($key),$rndkey0
430	lea	32($key,$rounds),$key
431	neg	%rax				# $rounds
432	.byte	0x0f,0x1f,0x00
433	add	\$16,%rax
434
435.L${dir}_loop4:
436	aes${dir}	$rndkey1,$inout0
437	aes${dir}	$rndkey1,$inout1
438	aes${dir}	$rndkey1,$inout2
439	aes${dir}	$rndkey1,$inout3
440	$movkey		($key,%rax),$rndkey1
441	add		\$32,%rax
442	aes${dir}	$rndkey0,$inout0
443	aes${dir}	$rndkey0,$inout1
444	aes${dir}	$rndkey0,$inout2
445	aes${dir}	$rndkey0,$inout3
446	$movkey		-16($key,%rax),$rndkey0
447	jnz		.L${dir}_loop4
448
449	aes${dir}	$rndkey1,$inout0
450	aes${dir}	$rndkey1,$inout1
451	aes${dir}	$rndkey1,$inout2
452	aes${dir}	$rndkey1,$inout3
453	aes${dir}last	$rndkey0,$inout0
454	aes${dir}last	$rndkey0,$inout1
455	aes${dir}last	$rndkey0,$inout2
456	aes${dir}last	$rndkey0,$inout3
457	ret
458.cfi_endproc
459.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
460___
461}
462sub aesni_generate6 {
463my $dir=shift;
464# As already mentioned it takes in $key and $rounds, which are *not*
465# preserved. $inout[0-5] is cipher/clear text...
466$code.=<<___;
467.type	_aesni_${dir}rypt6,\@abi-omnipotent
468.align	16
469_aesni_${dir}rypt6:
470.cfi_startproc
471	$movkey		($key),$rndkey0
472	shl		\$4,$rounds
473	$movkey		16($key),$rndkey1
474	xorps		$rndkey0,$inout0
475	pxor		$rndkey0,$inout1
476	pxor		$rndkey0,$inout2
477	aes${dir}	$rndkey1,$inout0
478	lea		32($key,$rounds),$key
479	neg		%rax			# $rounds
480	aes${dir}	$rndkey1,$inout1
481	pxor		$rndkey0,$inout3
482	pxor		$rndkey0,$inout4
483	aes${dir}	$rndkey1,$inout2
484	pxor		$rndkey0,$inout5
485	$movkey		($key,%rax),$rndkey0
486	add		\$16,%rax
487	jmp		.L${dir}_loop6_enter
488.align	16
489.L${dir}_loop6:
490	aes${dir}	$rndkey1,$inout0
491	aes${dir}	$rndkey1,$inout1
492	aes${dir}	$rndkey1,$inout2
493.L${dir}_loop6_enter:
494	aes${dir}	$rndkey1,$inout3
495	aes${dir}	$rndkey1,$inout4
496	aes${dir}	$rndkey1,$inout5
497	$movkey		($key,%rax),$rndkey1
498	add		\$32,%rax
499	aes${dir}	$rndkey0,$inout0
500	aes${dir}	$rndkey0,$inout1
501	aes${dir}	$rndkey0,$inout2
502	aes${dir}	$rndkey0,$inout3
503	aes${dir}	$rndkey0,$inout4
504	aes${dir}	$rndkey0,$inout5
505	$movkey		-16($key,%rax),$rndkey0
506	jnz		.L${dir}_loop6
507
508	aes${dir}	$rndkey1,$inout0
509	aes${dir}	$rndkey1,$inout1
510	aes${dir}	$rndkey1,$inout2
511	aes${dir}	$rndkey1,$inout3
512	aes${dir}	$rndkey1,$inout4
513	aes${dir}	$rndkey1,$inout5
514	aes${dir}last	$rndkey0,$inout0
515	aes${dir}last	$rndkey0,$inout1
516	aes${dir}last	$rndkey0,$inout2
517	aes${dir}last	$rndkey0,$inout3
518	aes${dir}last	$rndkey0,$inout4
519	aes${dir}last	$rndkey0,$inout5
520	ret
521.cfi_endproc
522.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
523___
524}
525sub aesni_generate8 {
526my $dir=shift;
527# As already mentioned it takes in $key and $rounds, which are *not*
528# preserved. $inout[0-7] is cipher/clear text...
529$code.=<<___;
530.type	_aesni_${dir}rypt8,\@abi-omnipotent
531.align	16
532_aesni_${dir}rypt8:
533.cfi_startproc
534	$movkey		($key),$rndkey0
535	shl		\$4,$rounds
536	$movkey		16($key),$rndkey1
537	xorps		$rndkey0,$inout0
538	xorps		$rndkey0,$inout1
539	pxor		$rndkey0,$inout2
540	pxor		$rndkey0,$inout3
541	pxor		$rndkey0,$inout4
542	lea		32($key,$rounds),$key
543	neg		%rax			# $rounds
544	aes${dir}	$rndkey1,$inout0
545	pxor		$rndkey0,$inout5
546	pxor		$rndkey0,$inout6
547	aes${dir}	$rndkey1,$inout1
548	pxor		$rndkey0,$inout7
549	$movkey		($key,%rax),$rndkey0
550	add		\$16,%rax
551	jmp		.L${dir}_loop8_inner
552.align	16
553.L${dir}_loop8:
554	aes${dir}	$rndkey1,$inout0
555	aes${dir}	$rndkey1,$inout1
556.L${dir}_loop8_inner:
557	aes${dir}	$rndkey1,$inout2
558	aes${dir}	$rndkey1,$inout3
559	aes${dir}	$rndkey1,$inout4
560	aes${dir}	$rndkey1,$inout5
561	aes${dir}	$rndkey1,$inout6
562	aes${dir}	$rndkey1,$inout7
563.L${dir}_loop8_enter:
564	$movkey		($key,%rax),$rndkey1
565	add		\$32,%rax
566	aes${dir}	$rndkey0,$inout0
567	aes${dir}	$rndkey0,$inout1
568	aes${dir}	$rndkey0,$inout2
569	aes${dir}	$rndkey0,$inout3
570	aes${dir}	$rndkey0,$inout4
571	aes${dir}	$rndkey0,$inout5
572	aes${dir}	$rndkey0,$inout6
573	aes${dir}	$rndkey0,$inout7
574	$movkey		-16($key,%rax),$rndkey0
575	jnz		.L${dir}_loop8
576
577	aes${dir}	$rndkey1,$inout0
578	aes${dir}	$rndkey1,$inout1
579	aes${dir}	$rndkey1,$inout2
580	aes${dir}	$rndkey1,$inout3
581	aes${dir}	$rndkey1,$inout4
582	aes${dir}	$rndkey1,$inout5
583	aes${dir}	$rndkey1,$inout6
584	aes${dir}	$rndkey1,$inout7
585	aes${dir}last	$rndkey0,$inout0
586	aes${dir}last	$rndkey0,$inout1
587	aes${dir}last	$rndkey0,$inout2
588	aes${dir}last	$rndkey0,$inout3
589	aes${dir}last	$rndkey0,$inout4
590	aes${dir}last	$rndkey0,$inout5
591	aes${dir}last	$rndkey0,$inout6
592	aes${dir}last	$rndkey0,$inout7
593	ret
594.cfi_endproc
595.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
596___
597}
598&aesni_generate2("enc") if ($PREFIX eq "aes_hw");
599&aesni_generate2("dec");
600&aesni_generate3("enc") if ($PREFIX eq "aes_hw");
601&aesni_generate3("dec");
602&aesni_generate4("enc") if ($PREFIX eq "aes_hw");
603&aesni_generate4("dec");
604&aesni_generate6("enc") if ($PREFIX eq "aes_hw");
605&aesni_generate6("dec");
606&aesni_generate8("enc") if ($PREFIX eq "aes_hw");
607&aesni_generate8("dec");
608
609if ($PREFIX eq "aes_hw") {
610########################################################################
611# void aesni_ecb_encrypt (const void *in, void *out,
612#			  size_t length, const AES_KEY *key,
613#			  int enc);
614$code.=<<___;
615.globl	${PREFIX}_ecb_encrypt
616.type	${PREFIX}_ecb_encrypt,\@function,5
617.align	16
618${PREFIX}_ecb_encrypt:
619.cfi_startproc
620___
621$code.=<<___ if ($win64);
622	lea	-0x58(%rsp),%rsp
623	movaps	%xmm6,(%rsp)		# offload $inout4..7
624	movaps	%xmm7,0x10(%rsp)
625	movaps	%xmm8,0x20(%rsp)
626	movaps	%xmm9,0x30(%rsp)
627.Lecb_enc_body:
628___
629$code.=<<___;
630	and	\$-16,$len		# if ($len<16)
631	jz	.Lecb_ret		# return
632
633	mov	240($key),$rounds	# key->rounds
634	$movkey	($key),$rndkey0
635	mov	$key,$key_		# backup $key
636	mov	$rounds,$rnds_		# backup $rounds
637	test	%r8d,%r8d		# 5th argument
638	jz	.Lecb_decrypt
639#--------------------------- ECB ENCRYPT ------------------------------#
640	cmp	\$0x80,$len		# if ($len<8*16)
641	jb	.Lecb_enc_tail		# short input
642
643	movdqu	($inp),$inout0		# load 8 input blocks
644	movdqu	0x10($inp),$inout1
645	movdqu	0x20($inp),$inout2
646	movdqu	0x30($inp),$inout3
647	movdqu	0x40($inp),$inout4
648	movdqu	0x50($inp),$inout5
649	movdqu	0x60($inp),$inout6
650	movdqu	0x70($inp),$inout7
651	lea	0x80($inp),$inp		# $inp+=8*16
652	sub	\$0x80,$len		# $len-=8*16 (can be zero)
653	jmp	.Lecb_enc_loop8_enter
654.align 16
655.Lecb_enc_loop8:
656	movups	$inout0,($out)		# store 8 output blocks
657	mov	$key_,$key		# restore $key
658	movdqu	($inp),$inout0		# load 8 input blocks
659	mov	$rnds_,$rounds		# restore $rounds
660	movups	$inout1,0x10($out)
661	movdqu	0x10($inp),$inout1
662	movups	$inout2,0x20($out)
663	movdqu	0x20($inp),$inout2
664	movups	$inout3,0x30($out)
665	movdqu	0x30($inp),$inout3
666	movups	$inout4,0x40($out)
667	movdqu	0x40($inp),$inout4
668	movups	$inout5,0x50($out)
669	movdqu	0x50($inp),$inout5
670	movups	$inout6,0x60($out)
671	movdqu	0x60($inp),$inout6
672	movups	$inout7,0x70($out)
673	lea	0x80($out),$out		# $out+=8*16
674	movdqu	0x70($inp),$inout7
675	lea	0x80($inp),$inp		# $inp+=8*16
676.Lecb_enc_loop8_enter:
677
678	call	_aesni_encrypt8
679
680	sub	\$0x80,$len
681	jnc	.Lecb_enc_loop8		# loop if $len-=8*16 didn't borrow
682
683	movups	$inout0,($out)		# store 8 output blocks
684	mov	$key_,$key		# restore $key
685	movups	$inout1,0x10($out)
686	mov	$rnds_,$rounds		# restore $rounds
687	movups	$inout2,0x20($out)
688	movups	$inout3,0x30($out)
689	movups	$inout4,0x40($out)
690	movups	$inout5,0x50($out)
691	movups	$inout6,0x60($out)
692	movups	$inout7,0x70($out)
693	lea	0x80($out),$out		# $out+=8*16
694	add	\$0x80,$len		# restore real remaining $len
695	jz	.Lecb_ret		# done if ($len==0)
696
697.Lecb_enc_tail:				# $len is less than 8*16
698	movups	($inp),$inout0
699	cmp	\$0x20,$len
700	jb	.Lecb_enc_one
701	movups	0x10($inp),$inout1
702	je	.Lecb_enc_two
703	movups	0x20($inp),$inout2
704	cmp	\$0x40,$len
705	jb	.Lecb_enc_three
706	movups	0x30($inp),$inout3
707	je	.Lecb_enc_four
708	movups	0x40($inp),$inout4
709	cmp	\$0x60,$len
710	jb	.Lecb_enc_five
711	movups	0x50($inp),$inout5
712	je	.Lecb_enc_six
713	movdqu	0x60($inp),$inout6
714	xorps	$inout7,$inout7
715	call	_aesni_encrypt8
716	movups	$inout0,($out)		# store 7 output blocks
717	movups	$inout1,0x10($out)
718	movups	$inout2,0x20($out)
719	movups	$inout3,0x30($out)
720	movups	$inout4,0x40($out)
721	movups	$inout5,0x50($out)
722	movups	$inout6,0x60($out)
723	jmp	.Lecb_ret
724.align	16
725.Lecb_enc_one:
726___
727	&aesni_generate1("enc",$key,$rounds);
728$code.=<<___;
729	movups	$inout0,($out)		# store one output block
730	jmp	.Lecb_ret
731.align	16
732.Lecb_enc_two:
733	call	_aesni_encrypt2
734	movups	$inout0,($out)		# store 2 output blocks
735	movups	$inout1,0x10($out)
736	jmp	.Lecb_ret
737.align	16
738.Lecb_enc_three:
739	call	_aesni_encrypt3
740	movups	$inout0,($out)		# store 3 output blocks
741	movups	$inout1,0x10($out)
742	movups	$inout2,0x20($out)
743	jmp	.Lecb_ret
744.align	16
745.Lecb_enc_four:
746	call	_aesni_encrypt4
747	movups	$inout0,($out)		# store 4 output blocks
748	movups	$inout1,0x10($out)
749	movups	$inout2,0x20($out)
750	movups	$inout3,0x30($out)
751	jmp	.Lecb_ret
752.align	16
753.Lecb_enc_five:
754	xorps	$inout5,$inout5
755	call	_aesni_encrypt6
756	movups	$inout0,($out)		# store 5 output blocks
757	movups	$inout1,0x10($out)
758	movups	$inout2,0x20($out)
759	movups	$inout3,0x30($out)
760	movups	$inout4,0x40($out)
761	jmp	.Lecb_ret
762.align	16
763.Lecb_enc_six:
764	call	_aesni_encrypt6
765	movups	$inout0,($out)		# store 6 output blocks
766	movups	$inout1,0x10($out)
767	movups	$inout2,0x20($out)
768	movups	$inout3,0x30($out)
769	movups	$inout4,0x40($out)
770	movups	$inout5,0x50($out)
771	jmp	.Lecb_ret
772#--------------------------- ECB DECRYPT ------------------------------#
773.align	16
774.Lecb_decrypt:
775	cmp	\$0x80,$len		# if ($len<8*16)
776	jb	.Lecb_dec_tail		# short input
777
778	movdqu	($inp),$inout0		# load 8 input blocks
779	movdqu	0x10($inp),$inout1
780	movdqu	0x20($inp),$inout2
781	movdqu	0x30($inp),$inout3
782	movdqu	0x40($inp),$inout4
783	movdqu	0x50($inp),$inout5
784	movdqu	0x60($inp),$inout6
785	movdqu	0x70($inp),$inout7
786	lea	0x80($inp),$inp		# $inp+=8*16
787	sub	\$0x80,$len		# $len-=8*16 (can be zero)
788	jmp	.Lecb_dec_loop8_enter
789.align 16
790.Lecb_dec_loop8:
791	movups	$inout0,($out)		# store 8 output blocks
792	mov	$key_,$key		# restore $key
793	movdqu	($inp),$inout0		# load 8 input blocks
794	mov	$rnds_,$rounds		# restore $rounds
795	movups	$inout1,0x10($out)
796	movdqu	0x10($inp),$inout1
797	movups	$inout2,0x20($out)
798	movdqu	0x20($inp),$inout2
799	movups	$inout3,0x30($out)
800	movdqu	0x30($inp),$inout3
801	movups	$inout4,0x40($out)
802	movdqu	0x40($inp),$inout4
803	movups	$inout5,0x50($out)
804	movdqu	0x50($inp),$inout5
805	movups	$inout6,0x60($out)
806	movdqu	0x60($inp),$inout6
807	movups	$inout7,0x70($out)
808	lea	0x80($out),$out		# $out+=8*16
809	movdqu	0x70($inp),$inout7
810	lea	0x80($inp),$inp		# $inp+=8*16
811.Lecb_dec_loop8_enter:
812
813	call	_aesni_decrypt8
814
815	$movkey	($key_),$rndkey0
816	sub	\$0x80,$len
817	jnc	.Lecb_dec_loop8		# loop if $len-=8*16 didn't borrow
818
819	movups	$inout0,($out)		# store 8 output blocks
820	 pxor	$inout0,$inout0		# clear register bank
821	mov	$key_,$key		# restore $key
822	movups	$inout1,0x10($out)
823	 pxor	$inout1,$inout1
824	mov	$rnds_,$rounds		# restore $rounds
825	movups	$inout2,0x20($out)
826	 pxor	$inout2,$inout2
827	movups	$inout3,0x30($out)
828	 pxor	$inout3,$inout3
829	movups	$inout4,0x40($out)
830	 pxor	$inout4,$inout4
831	movups	$inout5,0x50($out)
832	 pxor	$inout5,$inout5
833	movups	$inout6,0x60($out)
834	 pxor	$inout6,$inout6
835	movups	$inout7,0x70($out)
836	 pxor	$inout7,$inout7
837	lea	0x80($out),$out		# $out+=8*16
838	add	\$0x80,$len		# restore real remaining $len
839	jz	.Lecb_ret		# done if ($len==0)
840
841.Lecb_dec_tail:
842	movups	($inp),$inout0
843	cmp	\$0x20,$len
844	jb	.Lecb_dec_one
845	movups	0x10($inp),$inout1
846	je	.Lecb_dec_two
847	movups	0x20($inp),$inout2
848	cmp	\$0x40,$len
849	jb	.Lecb_dec_three
850	movups	0x30($inp),$inout3
851	je	.Lecb_dec_four
852	movups	0x40($inp),$inout4
853	cmp	\$0x60,$len
854	jb	.Lecb_dec_five
855	movups	0x50($inp),$inout5
856	je	.Lecb_dec_six
857	movups	0x60($inp),$inout6
858	$movkey	($key),$rndkey0
859	xorps	$inout7,$inout7
860	call	_aesni_decrypt8
861	movups	$inout0,($out)		# store 7 output blocks
862	 pxor	$inout0,$inout0		# clear register bank
863	movups	$inout1,0x10($out)
864	 pxor	$inout1,$inout1
865	movups	$inout2,0x20($out)
866	 pxor	$inout2,$inout2
867	movups	$inout3,0x30($out)
868	 pxor	$inout3,$inout3
869	movups	$inout4,0x40($out)
870	 pxor	$inout4,$inout4
871	movups	$inout5,0x50($out)
872	 pxor	$inout5,$inout5
873	movups	$inout6,0x60($out)
874	 pxor	$inout6,$inout6
875	 pxor	$inout7,$inout7
876	jmp	.Lecb_ret
877.align	16
878.Lecb_dec_one:
879___
880	&aesni_generate1("dec",$key,$rounds);
881$code.=<<___;
882	movups	$inout0,($out)		# store one output block
883	 pxor	$inout0,$inout0		# clear register bank
884	jmp	.Lecb_ret
885.align	16
886.Lecb_dec_two:
887	call	_aesni_decrypt2
888	movups	$inout0,($out)		# store 2 output blocks
889	 pxor	$inout0,$inout0		# clear register bank
890	movups	$inout1,0x10($out)
891	 pxor	$inout1,$inout1
892	jmp	.Lecb_ret
893.align	16
894.Lecb_dec_three:
895	call	_aesni_decrypt3
896	movups	$inout0,($out)		# store 3 output blocks
897	 pxor	$inout0,$inout0		# clear register bank
898	movups	$inout1,0x10($out)
899	 pxor	$inout1,$inout1
900	movups	$inout2,0x20($out)
901	 pxor	$inout2,$inout2
902	jmp	.Lecb_ret
903.align	16
904.Lecb_dec_four:
905	call	_aesni_decrypt4
906	movups	$inout0,($out)		# store 4 output blocks
907	 pxor	$inout0,$inout0		# clear register bank
908	movups	$inout1,0x10($out)
909	 pxor	$inout1,$inout1
910	movups	$inout2,0x20($out)
911	 pxor	$inout2,$inout2
912	movups	$inout3,0x30($out)
913	 pxor	$inout3,$inout3
914	jmp	.Lecb_ret
915.align	16
916.Lecb_dec_five:
917	xorps	$inout5,$inout5
918	call	_aesni_decrypt6
919	movups	$inout0,($out)		# store 5 output blocks
920	 pxor	$inout0,$inout0		# clear register bank
921	movups	$inout1,0x10($out)
922	 pxor	$inout1,$inout1
923	movups	$inout2,0x20($out)
924	 pxor	$inout2,$inout2
925	movups	$inout3,0x30($out)
926	 pxor	$inout3,$inout3
927	movups	$inout4,0x40($out)
928	 pxor	$inout4,$inout4
929	 pxor	$inout5,$inout5
930	jmp	.Lecb_ret
931.align	16
932.Lecb_dec_six:
933	call	_aesni_decrypt6
934	movups	$inout0,($out)		# store 6 output blocks
935	 pxor	$inout0,$inout0		# clear register bank
936	movups	$inout1,0x10($out)
937	 pxor	$inout1,$inout1
938	movups	$inout2,0x20($out)
939	 pxor	$inout2,$inout2
940	movups	$inout3,0x30($out)
941	 pxor	$inout3,$inout3
942	movups	$inout4,0x40($out)
943	 pxor	$inout4,$inout4
944	movups	$inout5,0x50($out)
945	 pxor	$inout5,$inout5
946
947.Lecb_ret:
948	xorps	$rndkey0,$rndkey0	# %xmm0
949	pxor	$rndkey1,$rndkey1
950___
951$code.=<<___ if ($win64);
952	movaps	(%rsp),%xmm6
953	movaps	%xmm0,(%rsp)		# clear stack
954	movaps	0x10(%rsp),%xmm7
955	movaps	%xmm0,0x10(%rsp)
956	movaps	0x20(%rsp),%xmm8
957	movaps	%xmm0,0x20(%rsp)
958	movaps	0x30(%rsp),%xmm9
959	movaps	%xmm0,0x30(%rsp)
960	lea	0x58(%rsp),%rsp
961.Lecb_enc_ret:
962___
963$code.=<<___;
964	ret
965.cfi_endproc
966.size	${PREFIX}_ecb_encrypt,.-${PREFIX}_ecb_encrypt
967___
968
969{
970######################################################################
971# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
972#                         size_t blocks, const AES_KEY *key,
973#                         const char *ivec,char *cmac);
974#
975# Handles only complete blocks, operates on 64-bit counter and
976# does not update *ivec! Nor does it finalize CMAC value
977# (see engine/eng_aesni.c for details)
978#
979if (0) {  # Omit these functions in BoringSSL
980my $cmac="%r9";	# 6th argument
981
982my $increment="%xmm9";
983my $iv="%xmm6";
984my $bswap_mask="%xmm7";
985
986$code.=<<___;
987.globl	${PREFIX}_ccm64_encrypt_blocks
988.type	${PREFIX}_ccm64_encrypt_blocks,\@function,6
989.align	16
990${PREFIX}_ccm64_encrypt_blocks:
991___
992$code.=<<___ if ($win64);
993	lea	-0x58(%rsp),%rsp
994	movaps	%xmm6,(%rsp)		# $iv
995	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
996	movaps	%xmm8,0x20(%rsp)	# $in0
997	movaps	%xmm9,0x30(%rsp)	# $increment
998.Lccm64_enc_body:
999___
1000$code.=<<___;
1001	mov	240($key),$rounds		# key->rounds
1002	movdqu	($ivp),$iv
1003	movdqa	.Lincrement64(%rip),$increment
1004	movdqa	.Lbswap_mask(%rip),$bswap_mask
1005
1006	shl	\$4,$rounds
1007	mov	\$16,$rnds_
1008	lea	0($key),$key_
1009	movdqu	($cmac),$inout1
1010	movdqa	$iv,$inout0
1011	lea	32($key,$rounds),$key		# end of key schedule
1012	pshufb	$bswap_mask,$iv
1013	sub	%rax,%r10			# twisted $rounds
1014	jmp	.Lccm64_enc_outer
1015.align	16
1016.Lccm64_enc_outer:
1017	$movkey	($key_),$rndkey0
1018	mov	%r10,%rax
1019	movups	($inp),$in0			# load inp
1020
1021	xorps	$rndkey0,$inout0		# counter
1022	$movkey	16($key_),$rndkey1
1023	xorps	$in0,$rndkey0
1024	xorps	$rndkey0,$inout1		# cmac^=inp
1025	$movkey	32($key_),$rndkey0
1026
1027.Lccm64_enc2_loop:
1028	aesenc	$rndkey1,$inout0
1029	aesenc	$rndkey1,$inout1
1030	$movkey	($key,%rax),$rndkey1
1031	add	\$32,%rax
1032	aesenc	$rndkey0,$inout0
1033	aesenc	$rndkey0,$inout1
1034	$movkey	-16($key,%rax),$rndkey0
1035	jnz	.Lccm64_enc2_loop
1036	aesenc	$rndkey1,$inout0
1037	aesenc	$rndkey1,$inout1
1038	paddq	$increment,$iv
1039	dec	$len				# $len-- ($len is in blocks)
1040	aesenclast	$rndkey0,$inout0
1041	aesenclast	$rndkey0,$inout1
1042
1043	lea	16($inp),$inp
1044	xorps	$inout0,$in0			# inp ^= E(iv)
1045	movdqa	$iv,$inout0
1046	movups	$in0,($out)			# save output
1047	pshufb	$bswap_mask,$inout0
1048	lea	16($out),$out			# $out+=16
1049	jnz	.Lccm64_enc_outer		# loop if ($len!=0)
1050
1051	 pxor	$rndkey0,$rndkey0		# clear register bank
1052	 pxor	$rndkey1,$rndkey1
1053	 pxor	$inout0,$inout0
1054	movups	$inout1,($cmac)			# store resulting mac
1055	 pxor	$inout1,$inout1
1056	 pxor	$in0,$in0
1057	 pxor	$iv,$iv
1058___
1059$code.=<<___ if ($win64);
1060	movaps	(%rsp),%xmm6
1061	movaps	%xmm0,(%rsp)			# clear stack
1062	movaps	0x10(%rsp),%xmm7
1063	movaps	%xmm0,0x10(%rsp)
1064	movaps	0x20(%rsp),%xmm8
1065	movaps	%xmm0,0x20(%rsp)
1066	movaps	0x30(%rsp),%xmm9
1067	movaps	%xmm0,0x30(%rsp)
1068	lea	0x58(%rsp),%rsp
1069.Lccm64_enc_ret:
1070___
1071$code.=<<___;
1072	ret
1073.size	${PREFIX}_ccm64_encrypt_blocks,.-${PREFIX}_ccm64_encrypt_blocks
1074___
1075######################################################################
1076$code.=<<___;
1077.globl	${PREFIX}_ccm64_decrypt_blocks
1078.type	${PREFIX}_ccm64_decrypt_blocks,\@function,6
1079.align	16
1080${PREFIX}_ccm64_decrypt_blocks:
1081___
1082$code.=<<___ if ($win64);
1083	lea	-0x58(%rsp),%rsp
1084	movaps	%xmm6,(%rsp)		# $iv
1085	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
1086	movaps	%xmm8,0x20(%rsp)	# $in8
1087	movaps	%xmm9,0x30(%rsp)	# $increment
1088.Lccm64_dec_body:
1089___
1090$code.=<<___;
1091	mov	240($key),$rounds		# key->rounds
1092	movups	($ivp),$iv
1093	movdqu	($cmac),$inout1
1094	movdqa	.Lincrement64(%rip),$increment
1095	movdqa	.Lbswap_mask(%rip),$bswap_mask
1096
1097	movaps	$iv,$inout0
1098	mov	$rounds,$rnds_
1099	mov	$key,$key_
1100	pshufb	$bswap_mask,$iv
1101___
1102	&aesni_generate1("enc",$key,$rounds);
1103$code.=<<___;
1104	shl	\$4,$rnds_
1105	mov	\$16,$rounds
1106	movups	($inp),$in0			# load inp
1107	paddq	$increment,$iv
1108	lea	16($inp),$inp			# $inp+=16
1109	sub	%r10,%rax			# twisted $rounds
1110	lea	32($key_,$rnds_),$key		# end of key schedule
1111	mov	%rax,%r10
1112	jmp	.Lccm64_dec_outer
1113.align	16
1114.Lccm64_dec_outer:
1115	xorps	$inout0,$in0			# inp ^= E(iv)
1116	movdqa	$iv,$inout0
1117	movups	$in0,($out)			# save output
1118	lea	16($out),$out			# $out+=16
1119	pshufb	$bswap_mask,$inout0
1120
1121	sub	\$1,$len			# $len-- ($len is in blocks)
1122	jz	.Lccm64_dec_break		# if ($len==0) break
1123
1124	$movkey	($key_),$rndkey0
1125	mov	%r10,%rax
1126	$movkey	16($key_),$rndkey1
1127	xorps	$rndkey0,$in0
1128	xorps	$rndkey0,$inout0
1129	xorps	$in0,$inout1			# cmac^=out
1130	$movkey	32($key_),$rndkey0
1131	jmp	.Lccm64_dec2_loop
1132.align	16
1133.Lccm64_dec2_loop:
1134	aesenc	$rndkey1,$inout0
1135	aesenc	$rndkey1,$inout1
1136	$movkey	($key,%rax),$rndkey1
1137	add	\$32,%rax
1138	aesenc	$rndkey0,$inout0
1139	aesenc	$rndkey0,$inout1
1140	$movkey	-16($key,%rax),$rndkey0
1141	jnz	.Lccm64_dec2_loop
1142	movups	($inp),$in0			# load input
1143	paddq	$increment,$iv
1144	aesenc	$rndkey1,$inout0
1145	aesenc	$rndkey1,$inout1
1146	aesenclast	$rndkey0,$inout0
1147	aesenclast	$rndkey0,$inout1
1148	lea	16($inp),$inp			# $inp+=16
1149	jmp	.Lccm64_dec_outer
1150
1151.align	16
1152.Lccm64_dec_break:
1153	#xorps	$in0,$inout1			# cmac^=out
1154	mov	240($key_),$rounds
1155___
1156	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
1157$code.=<<___;
1158	 pxor	$rndkey0,$rndkey0		# clear register bank
1159	 pxor	$rndkey1,$rndkey1
1160	 pxor	$inout0,$inout0
1161	movups	$inout1,($cmac)			# store resulting mac
1162	 pxor	$inout1,$inout1
1163	 pxor	$in0,$in0
1164	 pxor	$iv,$iv
1165___
1166$code.=<<___ if ($win64);
1167	movaps	(%rsp),%xmm6
1168	movaps	%xmm0,(%rsp)			# clear stack
1169	movaps	0x10(%rsp),%xmm7
1170	movaps	%xmm0,0x10(%rsp)
1171	movaps	0x20(%rsp),%xmm8
1172	movaps	%xmm0,0x20(%rsp)
1173	movaps	0x30(%rsp),%xmm9
1174	movaps	%xmm0,0x30(%rsp)
1175	lea	0x58(%rsp),%rsp
1176.Lccm64_dec_ret:
1177___
1178$code.=<<___;
1179	ret
1180.size	${PREFIX}_ccm64_decrypt_blocks,.-${PREFIX}_ccm64_decrypt_blocks
1181___
1182}
1183######################################################################
1184# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1185#                         size_t blocks, const AES_KEY *key,
1186#                         const char *ivec);
1187#
1188# Handles only complete blocks, operates on 32-bit counter and
1189# does not update *ivec! (see crypto/modes/ctr128.c for details)
1190#
1191# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1192# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1193# Keywords are full unroll and modulo-schedule counter calculations
1194# with zero-round key xor.
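#
# (Roughly speaking, instead of doing per block
#	counter_block = bswap(ctr++); state = counter_block ^ round[0]; ...
# the code below keeps eight values of (counter+i)^round[0] staged on
# the stack - see the stores to 0x00(%rsp)..0x70(%rsp) - so that
# computing and byte-swapping the *next* batch of counters overlaps
# with the aesenc rounds of the *current* eight blocks.)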
1195{
1196my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
1197my ($key0,$ctr)=("%ebp","${ivp}d");
1198my $frame_size = 0x80 + ($win64?160:0);
1199
1200$code.=<<___;
1201.globl	${PREFIX}_ctr32_encrypt_blocks
1202.type	${PREFIX}_ctr32_encrypt_blocks,\@function,5
1203.align	16
1204${PREFIX}_ctr32_encrypt_blocks:
1205.cfi_startproc
1206#ifdef BORINGSSL_DISPATCH_TEST
1207	movb \$1,BORINGSSL_function_hit(%rip)
1208#endif
1209	cmp	\$1,$len
1210	jne	.Lctr32_bulk
1211
1212	# handle single block without allocating stack frame,
1213	# useful when handling edges
1214	movups	($ivp),$inout0
1215	movups	($inp),$inout1
1216	mov	240($key),%edx			# key->rounds
1217___
1218	&aesni_generate1("enc",$key,"%edx");
1219$code.=<<___;
1220	 pxor	$rndkey0,$rndkey0		# clear register bank
1221	 pxor	$rndkey1,$rndkey1
1222	xorps	$inout1,$inout0
1223	 pxor	$inout1,$inout1
1224	movups	$inout0,($out)
1225	 xorps	$inout0,$inout0
1226	jmp	.Lctr32_epilogue
1227
1228.align	16
1229.Lctr32_bulk:
1230	lea	(%rsp),$key_			# use $key_ as frame pointer
1231.cfi_def_cfa_register	$key_
1232	push	%rbp
1233.cfi_push	%rbp
1234	sub	\$$frame_size,%rsp
1235	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1236___
1237$code.=<<___ if ($win64);
1238	movaps	%xmm6,-0xa8($key_)		# offload everything
1239	movaps	%xmm7,-0x98($key_)
1240	movaps	%xmm8,-0x88($key_)
1241	movaps	%xmm9,-0x78($key_)
1242	movaps	%xmm10,-0x68($key_)
1243	movaps	%xmm11,-0x58($key_)
1244	movaps	%xmm12,-0x48($key_)
1245	movaps	%xmm13,-0x38($key_)
1246	movaps	%xmm14,-0x28($key_)
1247	movaps	%xmm15,-0x18($key_)
1248.Lctr32_body:
1249___
1250$code.=<<___;
1251
1252	# 8 16-byte words on top of stack are counter values
1253	# xor-ed with zero-round key
1254
1255	movdqu	($ivp),$inout0
1256	movdqu	($key),$rndkey0
1257	mov	12($ivp),$ctr			# counter LSB
1258	pxor	$rndkey0,$inout0
1259	mov	12($key),$key0			# 0-round key LSB
1260	movdqa	$inout0,0x00(%rsp)		# populate counter block
1261	bswap	$ctr
1262	movdqa	$inout0,$inout1
1263	movdqa	$inout0,$inout2
1264	movdqa	$inout0,$inout3
1265	movdqa	$inout0,0x40(%rsp)
1266	movdqa	$inout0,0x50(%rsp)
1267	movdqa	$inout0,0x60(%rsp)
1268	mov	%rdx,%r10			# about to borrow %rdx
1269	movdqa	$inout0,0x70(%rsp)
1270
1271	lea	1($ctr),%rax
1272	 lea	2($ctr),%rdx
1273	bswap	%eax
1274	 bswap	%edx
1275	xor	$key0,%eax
1276	 xor	$key0,%edx
1277	pinsrd	\$3,%eax,$inout1
1278	lea	3($ctr),%rax
1279	movdqa	$inout1,0x10(%rsp)
1280	 pinsrd	\$3,%edx,$inout2
1281	bswap	%eax
1282	 mov	%r10,%rdx			# restore %rdx
1283	 lea	4($ctr),%r10
1284	 movdqa	$inout2,0x20(%rsp)
1285	xor	$key0,%eax
1286	 bswap	%r10d
1287	pinsrd	\$3,%eax,$inout3
1288	 xor	$key0,%r10d
1289	movdqa	$inout3,0x30(%rsp)
1290	lea	5($ctr),%r9
1291	 mov	%r10d,0x40+12(%rsp)
1292	bswap	%r9d
1293	 lea	6($ctr),%r10
1294	mov	240($key),$rounds		# key->rounds
1295	xor	$key0,%r9d
1296	 bswap	%r10d
1297	mov	%r9d,0x50+12(%rsp)
1298	 xor	$key0,%r10d
1299	lea	7($ctr),%r9
1300	 mov	%r10d,0x60+12(%rsp)
1301	bswap	%r9d
1302	leaq	OPENSSL_ia32cap_P(%rip),%r10
1303	 mov	4(%r10),%r10d
1304	xor	$key0,%r9d
1305	 and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
1306	mov	%r9d,0x70+12(%rsp)
1307
1308	$movkey	0x10($key),$rndkey1
1309
1310	movdqa	0x40(%rsp),$inout4
1311	movdqa	0x50(%rsp),$inout5
1312
1313	cmp	\$8,$len		# $len is in blocks
1314	jb	.Lctr32_tail		# short input if ($len<8)
1315
1316	sub	\$6,$len		# $len is biased by -6
1317	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
1318	je	.Lctr32_6x		# [which denotes Atom Silvermont]
1319
1320	lea	0x80($key),$key		# size optimization
1321	sub	\$2,$len		# $len is biased by -8
1322	jmp	.Lctr32_loop8
1323
1324.align	16
1325.Lctr32_6x:
1326	shl	\$4,$rounds
1327	mov	\$48,$rnds_
1328	bswap	$key0
1329	lea	32($key,$rounds),$key	# end of key schedule
1330	sub	%rax,%r10		# twisted $rounds
1331	jmp	.Lctr32_loop6
1332
1333.align	16
1334.Lctr32_loop6:
1335	 add	\$6,$ctr		# next counter value
1336	$movkey	-48($key,$rnds_),$rndkey0
1337	aesenc	$rndkey1,$inout0
1338	 mov	$ctr,%eax
1339	 xor	$key0,%eax
1340	aesenc	$rndkey1,$inout1
1341	 movbe	%eax,`0x00+12`(%rsp)	# store next counter value
1342	 lea	1($ctr),%eax
1343	aesenc	$rndkey1,$inout2
1344	 xor	$key0,%eax
1345	 movbe	%eax,`0x10+12`(%rsp)
1346	aesenc	$rndkey1,$inout3
1347	 lea	2($ctr),%eax
1348	 xor	$key0,%eax
1349	aesenc	$rndkey1,$inout4
1350	 movbe	%eax,`0x20+12`(%rsp)
1351	 lea	3($ctr),%eax
1352	aesenc	$rndkey1,$inout5
1353	$movkey	-32($key,$rnds_),$rndkey1
1354	 xor	$key0,%eax
1355
1356	aesenc	$rndkey0,$inout0
1357	 movbe	%eax,`0x30+12`(%rsp)
1358	 lea	4($ctr),%eax
1359	aesenc	$rndkey0,$inout1
1360	 xor	$key0,%eax
1361	 movbe	%eax,`0x40+12`(%rsp)
1362	aesenc	$rndkey0,$inout2
1363	 lea	5($ctr),%eax
1364	 xor	$key0,%eax
1365	aesenc	$rndkey0,$inout3
1366	 movbe	%eax,`0x50+12`(%rsp)
1367	 mov	%r10,%rax		# mov	$rnds_,$rounds
1368	aesenc	$rndkey0,$inout4
1369	aesenc	$rndkey0,$inout5
1370	$movkey	-16($key,$rnds_),$rndkey0
1371
1372	call	.Lenc_loop6
1373
1374	movdqu	($inp),$inout6		# load 6 input blocks
1375	movdqu	0x10($inp),$inout7
1376	movdqu	0x20($inp),$in0
1377	movdqu	0x30($inp),$in1
1378	movdqu	0x40($inp),$in2
1379	movdqu	0x50($inp),$in3
1380	lea	0x60($inp),$inp		# $inp+=6*16
1381	$movkey	-64($key,$rnds_),$rndkey1
1382	pxor	$inout0,$inout6		# inp^=E(ctr)
1383	movaps	0x00(%rsp),$inout0	# load next counter [xor-ed with 0 round]
1384	pxor	$inout1,$inout7
1385	movaps	0x10(%rsp),$inout1
1386	pxor	$inout2,$in0
1387	movaps	0x20(%rsp),$inout2
1388	pxor	$inout3,$in1
1389	movaps	0x30(%rsp),$inout3
1390	pxor	$inout4,$in2
1391	movaps	0x40(%rsp),$inout4
1392	pxor	$inout5,$in3
1393	movaps	0x50(%rsp),$inout5
1394	movdqu	$inout6,($out)		# store 6 output blocks
1395	movdqu	$inout7,0x10($out)
1396	movdqu	$in0,0x20($out)
1397	movdqu	$in1,0x30($out)
1398	movdqu	$in2,0x40($out)
1399	movdqu	$in3,0x50($out)
1400	lea	0x60($out),$out		# $out+=6*16
1401
1402	sub	\$6,$len
1403	jnc	.Lctr32_loop6		# loop if $len-=6 didn't borrow
1404
1405	add	\$6,$len		# restore real remaining $len
1406	jz	.Lctr32_done		# done if ($len==0)
1407
1408	lea	-48($rnds_),$rounds
1409	lea	-80($key,$rnds_),$key	# restore $key
1410	neg	$rounds
1411	shr	\$4,$rounds		# restore $rounds
1412	jmp	.Lctr32_tail
1413
1414.align	32
1415.Lctr32_loop8:
1416	 add		\$8,$ctr		# next counter value
1417	movdqa		0x60(%rsp),$inout6
1418	aesenc		$rndkey1,$inout0
1419	 mov		$ctr,%r9d
1420	movdqa		0x70(%rsp),$inout7
1421	aesenc		$rndkey1,$inout1
1422	 bswap		%r9d
1423	$movkey		0x20-0x80($key),$rndkey0
1424	aesenc		$rndkey1,$inout2
1425	 xor		$key0,%r9d
1426	 nop
1427	aesenc		$rndkey1,$inout3
1428	 mov		%r9d,0x00+12(%rsp)	# store next counter value
1429	 lea		1($ctr),%r9
1430	aesenc		$rndkey1,$inout4
1431	aesenc		$rndkey1,$inout5
1432	aesenc		$rndkey1,$inout6
1433	aesenc		$rndkey1,$inout7
1434	$movkey		0x30-0x80($key),$rndkey1
1435___
1436for($i=2;$i<8;$i++) {
1437my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1438$code.=<<___;
1439	 bswap		%r9d
1440	aesenc		$rndkeyx,$inout0
1441	aesenc		$rndkeyx,$inout1
1442	 xor		$key0,%r9d
1443	 .byte		0x66,0x90
1444	aesenc		$rndkeyx,$inout2
1445	aesenc		$rndkeyx,$inout3
1446	 mov		%r9d,`0x10*($i-1)`+12(%rsp)
1447	 lea		$i($ctr),%r9
1448	aesenc		$rndkeyx,$inout4
1449	aesenc		$rndkeyx,$inout5
1450	aesenc		$rndkeyx,$inout6
1451	aesenc		$rndkeyx,$inout7
1452	$movkey		`0x20+0x10*$i`-0x80($key),$rndkeyx
1453___
1454}
1455$code.=<<___;
1456	 bswap		%r9d
1457	aesenc		$rndkey0,$inout0
1458	aesenc		$rndkey0,$inout1
1459	aesenc		$rndkey0,$inout2
1460	 xor		$key0,%r9d
1461	 movdqu		0x00($inp),$in0		# start loading input
1462	aesenc		$rndkey0,$inout3
1463	 mov		%r9d,0x70+12(%rsp)
1464	 cmp		\$11,$rounds
1465	aesenc		$rndkey0,$inout4
1466	aesenc		$rndkey0,$inout5
1467	aesenc		$rndkey0,$inout6
1468	aesenc		$rndkey0,$inout7
1469	$movkey		0xa0-0x80($key),$rndkey0
1470
1471	jb		.Lctr32_enc_done
1472
1473	aesenc		$rndkey1,$inout0
1474	aesenc		$rndkey1,$inout1
1475	aesenc		$rndkey1,$inout2
1476	aesenc		$rndkey1,$inout3
1477	aesenc		$rndkey1,$inout4
1478	aesenc		$rndkey1,$inout5
1479	aesenc		$rndkey1,$inout6
1480	aesenc		$rndkey1,$inout7
1481	$movkey		0xb0-0x80($key),$rndkey1
1482
1483	aesenc		$rndkey0,$inout0
1484	aesenc		$rndkey0,$inout1
1485	aesenc		$rndkey0,$inout2
1486	aesenc		$rndkey0,$inout3
1487	aesenc		$rndkey0,$inout4
1488	aesenc		$rndkey0,$inout5
1489	aesenc		$rndkey0,$inout6
1490	aesenc		$rndkey0,$inout7
1491	$movkey		0xc0-0x80($key),$rndkey0
1492	je		.Lctr32_enc_done
1493
1494	aesenc		$rndkey1,$inout0
1495	aesenc		$rndkey1,$inout1
1496	aesenc		$rndkey1,$inout2
1497	aesenc		$rndkey1,$inout3
1498	aesenc		$rndkey1,$inout4
1499	aesenc		$rndkey1,$inout5
1500	aesenc		$rndkey1,$inout6
1501	aesenc		$rndkey1,$inout7
1502	$movkey		0xd0-0x80($key),$rndkey1
1503
1504	aesenc		$rndkey0,$inout0
1505	aesenc		$rndkey0,$inout1
1506	aesenc		$rndkey0,$inout2
1507	aesenc		$rndkey0,$inout3
1508	aesenc		$rndkey0,$inout4
1509	aesenc		$rndkey0,$inout5
1510	aesenc		$rndkey0,$inout6
1511	aesenc		$rndkey0,$inout7
1512	$movkey		0xe0-0x80($key),$rndkey0
1513	jmp		.Lctr32_enc_done
1514
1515.align	16
1516.Lctr32_enc_done:
1517	movdqu		0x10($inp),$in1
1518	pxor		$rndkey0,$in0		# input^=round[last]
1519	movdqu		0x20($inp),$in2
1520	pxor		$rndkey0,$in1
1521	movdqu		0x30($inp),$in3
1522	pxor		$rndkey0,$in2
1523	movdqu		0x40($inp),$in4
1524	pxor		$rndkey0,$in3
1525	movdqu		0x50($inp),$in5
1526	pxor		$rndkey0,$in4
1527	pxor		$rndkey0,$in5
1528	aesenc		$rndkey1,$inout0
1529	aesenc		$rndkey1,$inout1
1530	aesenc		$rndkey1,$inout2
1531	aesenc		$rndkey1,$inout3
1532	aesenc		$rndkey1,$inout4
1533	aesenc		$rndkey1,$inout5
1534	aesenc		$rndkey1,$inout6
1535	aesenc		$rndkey1,$inout7
1536	movdqu		0x60($inp),$rndkey1	# borrow $rndkey1 for inp[6]
1537	lea		0x80($inp),$inp		# $inp+=8*16
1538
1539	aesenclast	$in0,$inout0		# $inN is inp[N]^round[last]
1540	pxor		$rndkey0,$rndkey1	# borrowed $rndkey
1541	movdqu		0x70-0x80($inp),$in0
1542	aesenclast	$in1,$inout1
1543	pxor		$rndkey0,$in0
1544	movdqa		0x00(%rsp),$in1		# load next counter block
1545	aesenclast	$in2,$inout2
1546	aesenclast	$in3,$inout3
1547	movdqa		0x10(%rsp),$in2
1548	movdqa		0x20(%rsp),$in3
1549	aesenclast	$in4,$inout4
1550	aesenclast	$in5,$inout5
1551	movdqa		0x30(%rsp),$in4
1552	movdqa		0x40(%rsp),$in5
1553	aesenclast	$rndkey1,$inout6
1554	movdqa		0x50(%rsp),$rndkey0
1555	$movkey		0x10-0x80($key),$rndkey1#real 1st-round key
1556	aesenclast	$in0,$inout7
1557
1558	movups		$inout0,($out)		# store 8 output blocks
1559	movdqa		$in1,$inout0
1560	movups		$inout1,0x10($out)
1561	movdqa		$in2,$inout1
1562	movups		$inout2,0x20($out)
1563	movdqa		$in3,$inout2
1564	movups		$inout3,0x30($out)
1565	movdqa		$in4,$inout3
1566	movups		$inout4,0x40($out)
1567	movdqa		$in5,$inout4
1568	movups		$inout5,0x50($out)
1569	movdqa		$rndkey0,$inout5
1570	movups		$inout6,0x60($out)
1571	movups		$inout7,0x70($out)
1572	lea		0x80($out),$out		# $out+=8*16
1573
1574	sub	\$8,$len
1575	jnc	.Lctr32_loop8			# loop if $len-=8 didn't borrow
1576
1577	add	\$8,$len			# restore real remaining $len
1578	jz	.Lctr32_done			# done if ($len==0)
1579	lea	-0x80($key),$key
1580
1581.Lctr32_tail:
1582	# note that at this point $inout0..5 are populated with
1583	# counter values xor-ed with 0-round key
1584	lea	16($key),$key
1585	cmp	\$4,$len
1586	jb	.Lctr32_loop3
1587	je	.Lctr32_loop4
1588
1589	# if ($len>4) compute 7 E(counter)
1590	shl		\$4,$rounds
1591	movdqa		0x60(%rsp),$inout6
1592	pxor		$inout7,$inout7
1593
1594	$movkey		16($key),$rndkey0
1595	aesenc		$rndkey1,$inout0
1596	aesenc		$rndkey1,$inout1
1597	lea		32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1598	neg		%rax
1599	aesenc		$rndkey1,$inout2
1600	add		\$16,%rax		# prepare for .Lenc_loop8_enter
1601	 movups		($inp),$in0
1602	aesenc		$rndkey1,$inout3
1603	aesenc		$rndkey1,$inout4
1604	 movups		0x10($inp),$in1		# pre-load input
1605	 movups		0x20($inp),$in2
1606	aesenc		$rndkey1,$inout5
1607	aesenc		$rndkey1,$inout6
1608
1609	call            .Lenc_loop8_enter
1610
1611	movdqu	0x30($inp),$in3
1612	pxor	$in0,$inout0
1613	movdqu	0x40($inp),$in0
1614	pxor	$in1,$inout1
1615	movdqu	$inout0,($out)			# store output
1616	pxor	$in2,$inout2
1617	movdqu	$inout1,0x10($out)
1618	pxor	$in3,$inout3
1619	movdqu	$inout2,0x20($out)
1620	pxor	$in0,$inout4
1621	movdqu	$inout3,0x30($out)
1622	movdqu	$inout4,0x40($out)
1623	cmp	\$6,$len
1624	jb	.Lctr32_done			# $len was 5, stop store
1625
1626	movups	0x50($inp),$in1
1627	xorps	$in1,$inout5
1628	movups	$inout5,0x50($out)
1629	je	.Lctr32_done			# $len was 6, stop store
1630
1631	movups	0x60($inp),$in2
1632	xorps	$in2,$inout6
1633	movups	$inout6,0x60($out)
1634	jmp	.Lctr32_done			# $len was 7, stop store
1635
1636.align	32
1637.Lctr32_loop4:
1638	aesenc		$rndkey1,$inout0
1639	lea		16($key),$key
1640	dec		$rounds
1641	aesenc		$rndkey1,$inout1
1642	aesenc		$rndkey1,$inout2
1643	aesenc		$rndkey1,$inout3
1644	$movkey		($key),$rndkey1
1645	jnz		.Lctr32_loop4
1646	aesenclast	$rndkey1,$inout0
1647	aesenclast	$rndkey1,$inout1
1648	 movups		($inp),$in0		# load input
1649	 movups		0x10($inp),$in1
1650	aesenclast	$rndkey1,$inout2
1651	aesenclast	$rndkey1,$inout3
1652	 movups		0x20($inp),$in2
1653	 movups		0x30($inp),$in3
1654
1655	xorps	$in0,$inout0
1656	movups	$inout0,($out)			# store output
1657	xorps	$in1,$inout1
1658	movups	$inout1,0x10($out)
1659	pxor	$in2,$inout2
1660	movdqu	$inout2,0x20($out)
1661	pxor	$in3,$inout3
1662	movdqu	$inout3,0x30($out)
1663	jmp	.Lctr32_done			# $len was 4, stop store
1664
1665.align	32
1666.Lctr32_loop3:
1667	aesenc		$rndkey1,$inout0
1668	lea		16($key),$key
1669	dec		$rounds
1670	aesenc		$rndkey1,$inout1
1671	aesenc		$rndkey1,$inout2
1672	$movkey		($key),$rndkey1
1673	jnz		.Lctr32_loop3
1674	aesenclast	$rndkey1,$inout0
1675	aesenclast	$rndkey1,$inout1
1676	aesenclast	$rndkey1,$inout2
1677
1678	movups	($inp),$in0			# load input
1679	xorps	$in0,$inout0
1680	movups	$inout0,($out)			# store output
1681	cmp	\$2,$len
1682	jb	.Lctr32_done			# $len was 1, stop store
1683
1684	movups	0x10($inp),$in1
1685	xorps	$in1,$inout1
1686	movups	$inout1,0x10($out)
1687	je	.Lctr32_done			# $len was 2, stop store
1688
1689	movups	0x20($inp),$in2
1690	xorps	$in2,$inout2
1691	movups	$inout2,0x20($out)		# $len was 3, stop store
1692
1693.Lctr32_done:
1694	xorps	%xmm0,%xmm0			# clear register bank
1695	xor	$key0,$key0
1696	pxor	%xmm1,%xmm1
1697	pxor	%xmm2,%xmm2
1698	pxor	%xmm3,%xmm3
1699	pxor	%xmm4,%xmm4
1700	pxor	%xmm5,%xmm5
1701___
1702$code.=<<___ if (!$win64);
1703	pxor	%xmm6,%xmm6
1704	pxor	%xmm7,%xmm7
1705	movaps	%xmm0,0x00(%rsp)		# clear stack
1706	pxor	%xmm8,%xmm8
1707	movaps	%xmm0,0x10(%rsp)
1708	pxor	%xmm9,%xmm9
1709	movaps	%xmm0,0x20(%rsp)
1710	pxor	%xmm10,%xmm10
1711	movaps	%xmm0,0x30(%rsp)
1712	pxor	%xmm11,%xmm11
1713	movaps	%xmm0,0x40(%rsp)
1714	pxor	%xmm12,%xmm12
1715	movaps	%xmm0,0x50(%rsp)
1716	pxor	%xmm13,%xmm13
1717	movaps	%xmm0,0x60(%rsp)
1718	pxor	%xmm14,%xmm14
1719	movaps	%xmm0,0x70(%rsp)
1720	pxor	%xmm15,%xmm15
1721___
1722$code.=<<___ if ($win64);
1723	movaps	-0xa8($key_),%xmm6
1724	movaps	%xmm0,-0xa8($key_)		# clear stack
1725	movaps	-0x98($key_),%xmm7
1726	movaps	%xmm0,-0x98($key_)
1727	movaps	-0x88($key_),%xmm8
1728	movaps	%xmm0,-0x88($key_)
1729	movaps	-0x78($key_),%xmm9
1730	movaps	%xmm0,-0x78($key_)
1731	movaps	-0x68($key_),%xmm10
1732	movaps	%xmm0,-0x68($key_)
1733	movaps	-0x58($key_),%xmm11
1734	movaps	%xmm0,-0x58($key_)
1735	movaps	-0x48($key_),%xmm12
1736	movaps	%xmm0,-0x48($key_)
1737	movaps	-0x38($key_),%xmm13
1738	movaps	%xmm0,-0x38($key_)
1739	movaps	-0x28($key_),%xmm14
1740	movaps	%xmm0,-0x28($key_)
1741	movaps	-0x18($key_),%xmm15
1742	movaps	%xmm0,-0x18($key_)
1743	movaps	%xmm0,0x00(%rsp)
1744	movaps	%xmm0,0x10(%rsp)
1745	movaps	%xmm0,0x20(%rsp)
1746	movaps	%xmm0,0x30(%rsp)
1747	movaps	%xmm0,0x40(%rsp)
1748	movaps	%xmm0,0x50(%rsp)
1749	movaps	%xmm0,0x60(%rsp)
1750	movaps	%xmm0,0x70(%rsp)
1751___
1752$code.=<<___;
1753	mov	-8($key_),%rbp
1754.cfi_restore	%rbp
1755	lea	($key_),%rsp
1756.cfi_def_cfa_register	%rsp
1757.Lctr32_epilogue:
1758	ret
1759.cfi_endproc
1760.size	${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks
1761___
1762}
1763
1764######################################################################
1765# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1766#	const AES_KEY *key1, const AES_KEY *key2,
1767#	const unsigned char iv[16]);
1768#
1769if (0) {  # Omit these functions in BoringSSL
1770my @tweak=map("%xmm$_",(10..15));
1771my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1772my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1773my $frame_size = 0x70 + ($win64?160:0);
1774my $key_ = "%rbp";	# override so that we can use %r11 as FP
1775
1776$code.=<<___;
1777.globl	${PREFIX}_xts_encrypt
1778.type	${PREFIX}_xts_encrypt,\@function,6
1779.align	16
1780${PREFIX}_xts_encrypt:
1781.cfi_startproc
1782	lea	(%rsp),%r11			# frame pointer
1783.cfi_def_cfa_register	%r11
1784	push	%rbp
1785.cfi_push	%rbp
1786	sub	\$$frame_size,%rsp
1787	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
1788___
1789$code.=<<___ if ($win64);
1790	movaps	%xmm6,-0xa8(%r11)		# offload everything
1791	movaps	%xmm7,-0x98(%r11)
1792	movaps	%xmm8,-0x88(%r11)
1793	movaps	%xmm9,-0x78(%r11)
1794	movaps	%xmm10,-0x68(%r11)
1795	movaps	%xmm11,-0x58(%r11)
1796	movaps	%xmm12,-0x48(%r11)
1797	movaps	%xmm13,-0x38(%r11)
1798	movaps	%xmm14,-0x28(%r11)
1799	movaps	%xmm15,-0x18(%r11)
1800.Lxts_enc_body:
1801___
1802$code.=<<___;
1803	movups	($ivp),$inout0			# load clear-text tweak
1804	mov	240(%r8),$rounds		# key2->rounds
1805	mov	240($key),$rnds_		# key1->rounds
1806___
1807	# generate the tweak
1808	&aesni_generate1("enc",$key2,$rounds,$inout0);
1809$code.=<<___;
1810	$movkey	($key),$rndkey0			# zero round key
1811	mov	$key,$key_			# backup $key
1812	mov	$rnds_,$rounds			# backup $rounds
1813	shl	\$4,$rnds_
1814	mov	$len,$len_			# backup $len
1815	and	\$-16,$len
1816
1817	$movkey	16($key,$rnds_),$rndkey1	# last round key
1818
1819	movdqa	.Lxts_magic(%rip),$twmask
1820	movdqa	$inout0,@tweak[5]
1821	pshufd	\$0x5f,$inout0,$twres
1822	pxor	$rndkey0,$rndkey1
1823___
1824    # alternative tweak calculation algorithm is based on suggestions
1825    # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1826    # and should help in the future...
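    # For reference, a scalar pseudo-C model of one tweak update, i.e.
    # multiplication by alpha modulo x^128+x^7+x^2+x+1 with the tweak held
    # as a 16-byte little-endian value (readers' sketch, never emitted):
    #
    #	carry = tweak[15] >> 7;
    #	for (i = 15; i > 0; i--)
    #		tweak[i] = (tweak[i] << 1) | (tweak[i-1] >> 7);
    #	tweak[0] = (tweak[0] << 1) ^ (carry ? 0x87 : 0);
    #
    # The SIMD variant below doubles both 64-bit halves with paddq and
    # re-injects the two lost carries via the psrad/pand/.Lxts_magic
    # sequence.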
1827    for ($i=0;$i<4;$i++) {
1828    $code.=<<___;
1829	movdqa	$twres,$twtmp
1830	paddd	$twres,$twres
1831	movdqa	@tweak[5],@tweak[$i]
1832	psrad	\$31,$twtmp			# broadcast upper bits
1833	paddq	@tweak[5],@tweak[5]
1834	pand	$twmask,$twtmp
1835	pxor	$rndkey0,@tweak[$i]
1836	pxor	$twtmp,@tweak[5]
1837___
1838    }
1839$code.=<<___;
1840	movdqa	@tweak[5],@tweak[4]
1841	psrad	\$31,$twres
1842	paddq	@tweak[5],@tweak[5]
1843	pand	$twmask,$twres
1844	pxor	$rndkey0,@tweak[4]
1845	pxor	$twres,@tweak[5]
1846	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
1847
1848	sub	\$16*6,$len
1849	jc	.Lxts_enc_short			# if $len-=6*16 borrowed
1850
1851	mov	\$16+96,$rounds
1852	lea	32($key_,$rnds_),$key		# end of key schedule
1853	sub	%r10,%rax			# twisted $rounds
1854	$movkey	16($key_),$rndkey1
1855	mov	%rax,%r10			# backup twisted $rounds
1856	lea	.Lxts_magic(%rip),%r8
1857	jmp	.Lxts_enc_grandloop
1858
1859.align	32
1860.Lxts_enc_grandloop:
1861	movdqu	`16*0`($inp),$inout0		# load input
1862	movdqa	$rndkey0,$twmask
1863	movdqu	`16*1`($inp),$inout1
1864	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
1865	movdqu	`16*2`($inp),$inout2
1866	pxor	@tweak[1],$inout1
1867	 aesenc		$rndkey1,$inout0
1868	movdqu	`16*3`($inp),$inout3
1869	pxor	@tweak[2],$inout2
1870	 aesenc		$rndkey1,$inout1
1871	movdqu	`16*4`($inp),$inout4
1872	pxor	@tweak[3],$inout3
1873	 aesenc		$rndkey1,$inout2
1874	movdqu	`16*5`($inp),$inout5
1875	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
1876	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
1877	pxor	@tweak[4],$inout4
1878	 aesenc		$rndkey1,$inout3
1879	$movkey	32($key_),$rndkey0
1880	lea	`16*6`($inp),$inp
1881	pxor	$twmask,$inout5
1882
1883	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
1884	aesenc		$rndkey1,$inout4
1885	 pxor	$twres,@tweak[1]
1886	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^round[last]
1887	aesenc		$rndkey1,$inout5
1888	$movkey		48($key_),$rndkey1
1889	 pxor	$twres,@tweak[2]
1890
1891	aesenc		$rndkey0,$inout0
1892	 pxor	$twres,@tweak[3]
1893	 movdqa	@tweak[1],`16*1`(%rsp)
1894	aesenc		$rndkey0,$inout1
1895	 pxor	$twres,@tweak[4]
1896	 movdqa	@tweak[2],`16*2`(%rsp)
1897	aesenc		$rndkey0,$inout2
1898	aesenc		$rndkey0,$inout3
1899	 pxor	$twres,$twmask
1900	 movdqa	@tweak[4],`16*4`(%rsp)
1901	aesenc		$rndkey0,$inout4
1902	aesenc		$rndkey0,$inout5
1903	$movkey		64($key_),$rndkey0
1904	 movdqa	$twmask,`16*5`(%rsp)
1905	pshufd	\$0x5f,@tweak[5],$twres
1906	jmp	.Lxts_enc_loop6
1907.align	32
1908.Lxts_enc_loop6:
1909	aesenc		$rndkey1,$inout0
1910	aesenc		$rndkey1,$inout1
1911	aesenc		$rndkey1,$inout2
1912	aesenc		$rndkey1,$inout3
1913	aesenc		$rndkey1,$inout4
1914	aesenc		$rndkey1,$inout5
1915	$movkey		-64($key,%rax),$rndkey1
1916	add		\$32,%rax
1917
1918	aesenc		$rndkey0,$inout0
1919	aesenc		$rndkey0,$inout1
1920	aesenc		$rndkey0,$inout2
1921	aesenc		$rndkey0,$inout3
1922	aesenc		$rndkey0,$inout4
1923	aesenc		$rndkey0,$inout5
1924	$movkey		-80($key,%rax),$rndkey0
1925	jnz		.Lxts_enc_loop6
1926
1927	movdqa	(%r8),$twmask			# start calculating next tweak
1928	movdqa	$twres,$twtmp
1929	paddd	$twres,$twres
1930	 aesenc		$rndkey1,$inout0
1931	paddq	@tweak[5],@tweak[5]
1932	psrad	\$31,$twtmp
1933	 aesenc		$rndkey1,$inout1
1934	pand	$twmask,$twtmp
1935	$movkey	($key_),@tweak[0]		# load round[0]
1936	 aesenc		$rndkey1,$inout2
1937	 aesenc		$rndkey1,$inout3
1938	 aesenc		$rndkey1,$inout4
1939	pxor	$twtmp,@tweak[5]
1940	movaps	@tweak[0],@tweak[1]		# copy round[0]
1941	 aesenc		$rndkey1,$inout5
1942	 $movkey	-64($key),$rndkey1
1943
1944	movdqa	$twres,$twtmp
1945	 aesenc		$rndkey0,$inout0
1946	paddd	$twres,$twres
1947	pxor	@tweak[5],@tweak[0]
1948	 aesenc		$rndkey0,$inout1
1949	psrad	\$31,$twtmp
1950	paddq	@tweak[5],@tweak[5]
1951	 aesenc		$rndkey0,$inout2
1952	 aesenc		$rndkey0,$inout3
1953	pand	$twmask,$twtmp
1954	movaps	@tweak[1],@tweak[2]
1955	 aesenc		$rndkey0,$inout4
1956	pxor	$twtmp,@tweak[5]
1957	movdqa	$twres,$twtmp
1958	 aesenc		$rndkey0,$inout5
1959	 $movkey	-48($key),$rndkey0
1960
1961	paddd	$twres,$twres
1962	 aesenc		$rndkey1,$inout0
1963	pxor	@tweak[5],@tweak[1]
1964	psrad	\$31,$twtmp
1965	 aesenc		$rndkey1,$inout1
1966	paddq	@tweak[5],@tweak[5]
1967	pand	$twmask,$twtmp
1968	 aesenc		$rndkey1,$inout2
1969	 aesenc		$rndkey1,$inout3
1970	 movdqa	@tweak[3],`16*3`(%rsp)
1971	pxor	$twtmp,@tweak[5]
1972	 aesenc		$rndkey1,$inout4
1973	movaps	@tweak[2],@tweak[3]
1974	movdqa	$twres,$twtmp
1975	 aesenc		$rndkey1,$inout5
1976	 $movkey	-32($key),$rndkey1
1977
1978	paddd	$twres,$twres
1979	 aesenc		$rndkey0,$inout0
1980	pxor	@tweak[5],@tweak[2]
1981	psrad	\$31,$twtmp
1982	 aesenc		$rndkey0,$inout1
1983	paddq	@tweak[5],@tweak[5]
1984	pand	$twmask,$twtmp
1985	 aesenc		$rndkey0,$inout2
1986	 aesenc		$rndkey0,$inout3
1987	 aesenc		$rndkey0,$inout4
1988	pxor	$twtmp,@tweak[5]
1989	movaps	@tweak[3],@tweak[4]
1990	 aesenc		$rndkey0,$inout5
1991
1992	movdqa	$twres,$rndkey0
1993	paddd	$twres,$twres
1994	 aesenc		$rndkey1,$inout0
1995	pxor	@tweak[5],@tweak[3]
1996	psrad	\$31,$rndkey0
1997	 aesenc		$rndkey1,$inout1
1998	paddq	@tweak[5],@tweak[5]
1999	pand	$twmask,$rndkey0
2000	 aesenc		$rndkey1,$inout2
2001	 aesenc		$rndkey1,$inout3
2002	pxor	$rndkey0,@tweak[5]
2003	$movkey		($key_),$rndkey0
2004	 aesenc		$rndkey1,$inout4
2005	 aesenc		$rndkey1,$inout5
2006	$movkey		16($key_),$rndkey1
2007
2008	pxor	@tweak[5],@tweak[4]
2009	 aesenclast	`16*0`(%rsp),$inout0
2010	psrad	\$31,$twres
2011	paddq	@tweak[5],@tweak[5]
2012	 aesenclast	`16*1`(%rsp),$inout1
2013	 aesenclast	`16*2`(%rsp),$inout2
2014	pand	$twmask,$twres
2015	mov	%r10,%rax			# restore $rounds
2016	 aesenclast	`16*3`(%rsp),$inout3
2017	 aesenclast	`16*4`(%rsp),$inout4
2018	 aesenclast	`16*5`(%rsp),$inout5
2019	pxor	$twres,@tweak[5]
2020
2021	lea	`16*6`($out),$out		# $out+=6*16
2022	movups	$inout0,`-16*6`($out)		# store 6 output blocks
2023	movups	$inout1,`-16*5`($out)
2024	movups	$inout2,`-16*4`($out)
2025	movups	$inout3,`-16*3`($out)
2026	movups	$inout4,`-16*2`($out)
2027	movups	$inout5,`-16*1`($out)
2028	sub	\$16*6,$len
2029	jnc	.Lxts_enc_grandloop		# loop if $len-=6*16 didn't borrow
2030
2031	mov	\$16+96,$rounds
2032	sub	$rnds_,$rounds
2033	mov	$key_,$key			# restore $key
2034	shr	\$4,$rounds			# restore original value
2035
2036.Lxts_enc_short:
2037	# at this point @tweak[0..5] are populated with tweak values
2038	mov	$rounds,$rnds_			# backup $rounds
2039	pxor	$rndkey0,@tweak[0]
2040	add	\$16*6,$len			# restore real remaining $len
2041	jz	.Lxts_enc_done			# done if ($len==0)
2042
2043	pxor	$rndkey0,@tweak[1]
2044	cmp	\$0x20,$len
2045	jb	.Lxts_enc_one			# $len is 1*16
2046	pxor	$rndkey0,@tweak[2]
2047	je	.Lxts_enc_two			# $len is 2*16
2048
2049	pxor	$rndkey0,@tweak[3]
2050	cmp	\$0x40,$len
2051	jb	.Lxts_enc_three			# $len is 3*16
2052	pxor	$rndkey0,@tweak[4]
2053	je	.Lxts_enc_four			# $len is 4*16
2054
2055	movdqu	($inp),$inout0			# $len is 5*16
2056	movdqu	16*1($inp),$inout1
2057	movdqu	16*2($inp),$inout2
2058	pxor	@tweak[0],$inout0
2059	movdqu	16*3($inp),$inout3
2060	pxor	@tweak[1],$inout1
2061	movdqu	16*4($inp),$inout4
2062	lea	16*5($inp),$inp			# $inp+=5*16
2063	pxor	@tweak[2],$inout2
2064	pxor	@tweak[3],$inout3
2065	pxor	@tweak[4],$inout4
2066	pxor	$inout5,$inout5
2067
2068	call	_aesni_encrypt6
2069
2070	xorps	@tweak[0],$inout0
2071	movdqa	@tweak[5],@tweak[0]
2072	xorps	@tweak[1],$inout1
2073	xorps	@tweak[2],$inout2
2074	movdqu	$inout0,($out)			# store 5 output blocks
2075	xorps	@tweak[3],$inout3
2076	movdqu	$inout1,16*1($out)
2077	xorps	@tweak[4],$inout4
2078	movdqu	$inout2,16*2($out)
2079	movdqu	$inout3,16*3($out)
2080	movdqu	$inout4,16*4($out)
2081	lea	16*5($out),$out			# $out+=5*16
2082	jmp	.Lxts_enc_done
2083
2084.align	16
2085.Lxts_enc_one:
2086	movups	($inp),$inout0
2087	lea	16*1($inp),$inp			# inp+=1*16
2088	xorps	@tweak[0],$inout0
2089___
2090	&aesni_generate1("enc",$key,$rounds);
2091$code.=<<___;
2092	xorps	@tweak[0],$inout0
2093	movdqa	@tweak[1],@tweak[0]
2094	movups	$inout0,($out)			# store one output block
2095	lea	16*1($out),$out			# $out+=1*16
2096	jmp	.Lxts_enc_done
2097
2098.align	16
2099.Lxts_enc_two:
2100	movups	($inp),$inout0
2101	movups	16($inp),$inout1
2102	lea	32($inp),$inp			# $inp+=2*16
2103	xorps	@tweak[0],$inout0
2104	xorps	@tweak[1],$inout1
2105
2106	call	_aesni_encrypt2
2107
2108	xorps	@tweak[0],$inout0
2109	movdqa	@tweak[2],@tweak[0]
2110	xorps	@tweak[1],$inout1
2111	movups	$inout0,($out)			# store 2 output blocks
2112	movups	$inout1,16*1($out)
2113	lea	16*2($out),$out			# $out+=2*16
2114	jmp	.Lxts_enc_done
2115
2116.align	16
2117.Lxts_enc_three:
2118	movups	($inp),$inout0
2119	movups	16*1($inp),$inout1
2120	movups	16*2($inp),$inout2
2121	lea	16*3($inp),$inp			# $inp+=3*16
2122	xorps	@tweak[0],$inout0
2123	xorps	@tweak[1],$inout1
2124	xorps	@tweak[2],$inout2
2125
2126	call	_aesni_encrypt3
2127
2128	xorps	@tweak[0],$inout0
2129	movdqa	@tweak[3],@tweak[0]
2130	xorps	@tweak[1],$inout1
2131	xorps	@tweak[2],$inout2
2132	movups	$inout0,($out)			# store 3 output blocks
2133	movups	$inout1,16*1($out)
2134	movups	$inout2,16*2($out)
2135	lea	16*3($out),$out			# $out+=3*16
2136	jmp	.Lxts_enc_done
2137
2138.align	16
2139.Lxts_enc_four:
2140	movups	($inp),$inout0
2141	movups	16*1($inp),$inout1
2142	movups	16*2($inp),$inout2
2143	xorps	@tweak[0],$inout0
2144	movups	16*3($inp),$inout3
2145	lea	16*4($inp),$inp			# $inp+=4*16
2146	xorps	@tweak[1],$inout1
2147	xorps	@tweak[2],$inout2
2148	xorps	@tweak[3],$inout3
2149
2150	call	_aesni_encrypt4
2151
2152	pxor	@tweak[0],$inout0
2153	movdqa	@tweak[4],@tweak[0]
2154	pxor	@tweak[1],$inout1
2155	pxor	@tweak[2],$inout2
2156	movdqu	$inout0,($out)			# store 4 output blocks
2157	pxor	@tweak[3],$inout3
2158	movdqu	$inout1,16*1($out)
2159	movdqu	$inout2,16*2($out)
2160	movdqu	$inout3,16*3($out)
2161	lea	16*4($out),$out			# $out+=4*16
2162	jmp	.Lxts_enc_done
2163
2164.align	16
2165.Lxts_enc_done:
2166	and	\$15,$len_			# see if $len%16 is 0
2167	jz	.Lxts_enc_ret
2168	mov	$len_,$len
2169
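	# Ciphertext stealing for the ragged tail: the byte loop moves the
	# remaining plaintext over the front of the last full ciphertext
	# block just written, parks the displaced ciphertext bytes in the
	# short final block, and the rebuilt block is then encrypted once
	# more with the next tweak.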
2170.Lxts_enc_steal:
2171	movzb	($inp),%eax			# borrow $rounds ...
2172	movzb	-16($out),%ecx			# ... and $key
2173	lea	1($inp),$inp
2174	mov	%al,-16($out)
2175	mov	%cl,0($out)
2176	lea	1($out),$out
2177	sub	\$1,$len
2178	jnz	.Lxts_enc_steal
2179
2180	sub	$len_,$out			# rewind $out
2181	mov	$key_,$key			# restore $key
2182	mov	$rnds_,$rounds			# restore $rounds
2183
2184	movups	-16($out),$inout0
2185	xorps	@tweak[0],$inout0
2186___
2187	&aesni_generate1("enc",$key,$rounds);
2188$code.=<<___;
2189	xorps	@tweak[0],$inout0
2190	movups	$inout0,-16($out)
2191
2192.Lxts_enc_ret:
2193	xorps	%xmm0,%xmm0			# clear register bank
2194	pxor	%xmm1,%xmm1
2195	pxor	%xmm2,%xmm2
2196	pxor	%xmm3,%xmm3
2197	pxor	%xmm4,%xmm4
2198	pxor	%xmm5,%xmm5
2199___
2200$code.=<<___ if (!$win64);
2201	pxor	%xmm6,%xmm6
2202	pxor	%xmm7,%xmm7
2203	movaps	%xmm0,0x00(%rsp)		# clear stack
2204	pxor	%xmm8,%xmm8
2205	movaps	%xmm0,0x10(%rsp)
2206	pxor	%xmm9,%xmm9
2207	movaps	%xmm0,0x20(%rsp)
2208	pxor	%xmm10,%xmm10
2209	movaps	%xmm0,0x30(%rsp)
2210	pxor	%xmm11,%xmm11
2211	movaps	%xmm0,0x40(%rsp)
2212	pxor	%xmm12,%xmm12
2213	movaps	%xmm0,0x50(%rsp)
2214	pxor	%xmm13,%xmm13
2215	movaps	%xmm0,0x60(%rsp)
2216	pxor	%xmm14,%xmm14
2217	pxor	%xmm15,%xmm15
2218___
2219$code.=<<___ if ($win64);
2220	movaps	-0xa8(%r11),%xmm6
2221	movaps	%xmm0,-0xa8(%r11)		# clear stack
2222	movaps	-0x98(%r11),%xmm7
2223	movaps	%xmm0,-0x98(%r11)
2224	movaps	-0x88(%r11),%xmm8
2225	movaps	%xmm0,-0x88(%r11)
2226	movaps	-0x78(%r11),%xmm9
2227	movaps	%xmm0,-0x78(%r11)
2228	movaps	-0x68(%r11),%xmm10
2229	movaps	%xmm0,-0x68(%r11)
2230	movaps	-0x58(%r11),%xmm11
2231	movaps	%xmm0,-0x58(%r11)
2232	movaps	-0x48(%r11),%xmm12
2233	movaps	%xmm0,-0x48(%r11)
2234	movaps	-0x38(%r11),%xmm13
2235	movaps	%xmm0,-0x38(%r11)
2236	movaps	-0x28(%r11),%xmm14
2237	movaps	%xmm0,-0x28(%r11)
2238	movaps	-0x18(%r11),%xmm15
2239	movaps	%xmm0,-0x18(%r11)
2240	movaps	%xmm0,0x00(%rsp)
2241	movaps	%xmm0,0x10(%rsp)
2242	movaps	%xmm0,0x20(%rsp)
2243	movaps	%xmm0,0x30(%rsp)
2244	movaps	%xmm0,0x40(%rsp)
2245	movaps	%xmm0,0x50(%rsp)
2246	movaps	%xmm0,0x60(%rsp)
2247___
2248$code.=<<___;
2249	mov	-8(%r11),%rbp
2250.cfi_restore	%rbp
2251	lea	(%r11),%rsp
2252.cfi_def_cfa_register	%rsp
2253.Lxts_enc_epilogue:
2254	ret
2255.cfi_endproc
2256.size	${PREFIX}_xts_encrypt,.-${PREFIX}_xts_encrypt
2257___
2258
2259$code.=<<___;
2260.globl	${PREFIX}_xts_decrypt
2261.type	${PREFIX}_xts_decrypt,\@function,6
2262.align	16
2263${PREFIX}_xts_decrypt:
2264.cfi_startproc
2265	lea	(%rsp),%r11			# frame pointer
2266.cfi_def_cfa_register	%r11
2267	push	%rbp
2268.cfi_push	%rbp
2269	sub	\$$frame_size,%rsp
2270	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
2271___
2272$code.=<<___ if ($win64);
2273	movaps	%xmm6,-0xa8(%r11)		# offload everything
2274	movaps	%xmm7,-0x98(%r11)
2275	movaps	%xmm8,-0x88(%r11)
2276	movaps	%xmm9,-0x78(%r11)
2277	movaps	%xmm10,-0x68(%r11)
2278	movaps	%xmm11,-0x58(%r11)
2279	movaps	%xmm12,-0x48(%r11)
2280	movaps	%xmm13,-0x38(%r11)
2281	movaps	%xmm14,-0x28(%r11)
2282	movaps	%xmm15,-0x18(%r11)
2283.Lxts_dec_body:
2284___
2285$code.=<<___;
2286	movups	($ivp),$inout0			# load clear-text tweak
2287	mov	240($key2),$rounds		# key2->rounds
2288	mov	240($key),$rnds_		# key1->rounds
2289___
2290	# generate the tweak
2291	&aesni_generate1("enc",$key2,$rounds,$inout0);
2292$code.=<<___;
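	# A ragged tail means the last complete ciphertext block has to be
	# decrypted with the tweak that follows it (ciphertext stealing),
	# so one full block is held back from the bulk loop here and
	# handled at .Lxts_dec_done2.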
2293	xor	%eax,%eax			# if ($len%16) len-=16;
2294	test	\$15,$len
2295	setnz	%al
2296	shl	\$4,%rax
2297	sub	%rax,$len
2298
2299	$movkey	($key),$rndkey0			# zero round key
2300	mov	$key,$key_			# backup $key
2301	mov	$rnds_,$rounds			# backup $rounds
2302	shl	\$4,$rnds_
2303	mov	$len,$len_			# backup $len
2304	and	\$-16,$len
2305
2306	$movkey	16($key,$rnds_),$rndkey1	# last round key
2307
2308	movdqa	.Lxts_magic(%rip),$twmask
2309	movdqa	$inout0,@tweak[5]
2310	pshufd	\$0x5f,$inout0,$twres
2311	pxor	$rndkey0,$rndkey1
2312___
2313    for ($i=0;$i<4;$i++) {
2314    $code.=<<___;
2315	movdqa	$twres,$twtmp
2316	paddd	$twres,$twres
2317	movdqa	@tweak[5],@tweak[$i]
2318	psrad	\$31,$twtmp			# broadcast upper bits
2319	paddq	@tweak[5],@tweak[5]
2320	pand	$twmask,$twtmp
2321	pxor	$rndkey0,@tweak[$i]
2322	pxor	$twtmp,@tweak[5]
2323___
2324    }
2325$code.=<<___;
2326	movdqa	@tweak[5],@tweak[4]
2327	psrad	\$31,$twres
2328	paddq	@tweak[5],@tweak[5]
2329	pand	$twmask,$twres
2330	pxor	$rndkey0,@tweak[4]
2331	pxor	$twres,@tweak[5]
2332	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]
2333
2334	sub	\$16*6,$len
2335	jc	.Lxts_dec_short			# if $len-=6*16 borrowed
2336
2337	mov	\$16+96,$rounds
2338	lea	32($key_,$rnds_),$key		# end of key schedule
2339	sub	%r10,%rax			# twisted $rounds
2340	$movkey	16($key_),$rndkey1
2341	mov	%rax,%r10			# backup twisted $rounds
2342	lea	.Lxts_magic(%rip),%r8
2343	jmp	.Lxts_dec_grandloop
2344
2345.align	32
2346.Lxts_dec_grandloop:
2347	movdqu	`16*0`($inp),$inout0		# load input
2348	movdqa	$rndkey0,$twmask
2349	movdqu	`16*1`($inp),$inout1
2350	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
2351	movdqu	`16*2`($inp),$inout2
2352	pxor	@tweak[1],$inout1
2353	 aesdec		$rndkey1,$inout0
2354	movdqu	`16*3`($inp),$inout3
2355	pxor	@tweak[2],$inout2
2356	 aesdec		$rndkey1,$inout1
2357	movdqu	`16*4`($inp),$inout4
2358	pxor	@tweak[3],$inout3
2359	 aesdec		$rndkey1,$inout2
2360	movdqu	`16*5`($inp),$inout5
2361	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
2362	 movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
2363	pxor	@tweak[4],$inout4
2364	 aesdec		$rndkey1,$inout3
2365	$movkey	32($key_),$rndkey0
2366	lea	`16*6`($inp),$inp
2367	pxor	$twmask,$inout5
2368
2369	 pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
2370	aesdec		$rndkey1,$inout4
2371	 pxor	$twres,@tweak[1]
2372	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
2373	aesdec		$rndkey1,$inout5
2374	$movkey		48($key_),$rndkey1
2375	 pxor	$twres,@tweak[2]
2376
2377	aesdec		$rndkey0,$inout0
2378	 pxor	$twres,@tweak[3]
2379	 movdqa	@tweak[1],`16*1`(%rsp)
2380	aesdec		$rndkey0,$inout1
2381	 pxor	$twres,@tweak[4]
2382	 movdqa	@tweak[2],`16*2`(%rsp)
2383	aesdec		$rndkey0,$inout2
2384	aesdec		$rndkey0,$inout3
2385	 pxor	$twres,$twmask
2386	 movdqa	@tweak[4],`16*4`(%rsp)
2387	aesdec		$rndkey0,$inout4
2388	aesdec		$rndkey0,$inout5
2389	$movkey		64($key_),$rndkey0
2390	 movdqa	$twmask,`16*5`(%rsp)
2391	pshufd	\$0x5f,@tweak[5],$twres
2392	jmp	.Lxts_dec_loop6
2393.align	32
2394.Lxts_dec_loop6:
2395	aesdec		$rndkey1,$inout0
2396	aesdec		$rndkey1,$inout1
2397	aesdec		$rndkey1,$inout2
2398	aesdec		$rndkey1,$inout3
2399	aesdec		$rndkey1,$inout4
2400	aesdec		$rndkey1,$inout5
2401	$movkey		-64($key,%rax),$rndkey1
2402	add		\$32,%rax
2403
2404	aesdec		$rndkey0,$inout0
2405	aesdec		$rndkey0,$inout1
2406	aesdec		$rndkey0,$inout2
2407	aesdec		$rndkey0,$inout3
2408	aesdec		$rndkey0,$inout4
2409	aesdec		$rndkey0,$inout5
2410	$movkey		-80($key,%rax),$rndkey0
2411	jnz		.Lxts_dec_loop6
2412
2413	movdqa	(%r8),$twmask			# start calculating next tweak
2414	movdqa	$twres,$twtmp
2415	paddd	$twres,$twres
2416	 aesdec		$rndkey1,$inout0
2417	paddq	@tweak[5],@tweak[5]
2418	psrad	\$31,$twtmp
2419	 aesdec		$rndkey1,$inout1
2420	pand	$twmask,$twtmp
2421	$movkey	($key_),@tweak[0]		# load round[0]
2422	 aesdec		$rndkey1,$inout2
2423	 aesdec		$rndkey1,$inout3
2424	 aesdec		$rndkey1,$inout4
2425	pxor	$twtmp,@tweak[5]
2426	movaps	@tweak[0],@tweak[1]		# copy round[0]
2427	 aesdec		$rndkey1,$inout5
2428	 $movkey	-64($key),$rndkey1
2429
2430	movdqa	$twres,$twtmp
2431	 aesdec		$rndkey0,$inout0
2432	paddd	$twres,$twres
2433	pxor	@tweak[5],@tweak[0]
2434	 aesdec		$rndkey0,$inout1
2435	psrad	\$31,$twtmp
2436	paddq	@tweak[5],@tweak[5]
2437	 aesdec		$rndkey0,$inout2
2438	 aesdec		$rndkey0,$inout3
2439	pand	$twmask,$twtmp
2440	movaps	@tweak[1],@tweak[2]
2441	 aesdec		$rndkey0,$inout4
2442	pxor	$twtmp,@tweak[5]
2443	movdqa	$twres,$twtmp
2444	 aesdec		$rndkey0,$inout5
2445	 $movkey	-48($key),$rndkey0
2446
2447	paddd	$twres,$twres
2448	 aesdec		$rndkey1,$inout0
2449	pxor	@tweak[5],@tweak[1]
2450	psrad	\$31,$twtmp
2451	 aesdec		$rndkey1,$inout1
2452	paddq	@tweak[5],@tweak[5]
2453	pand	$twmask,$twtmp
2454	 aesdec		$rndkey1,$inout2
2455	 aesdec		$rndkey1,$inout3
2456	 movdqa	@tweak[3],`16*3`(%rsp)
2457	pxor	$twtmp,@tweak[5]
2458	 aesdec		$rndkey1,$inout4
2459	movaps	@tweak[2],@tweak[3]
2460	movdqa	$twres,$twtmp
2461	 aesdec		$rndkey1,$inout5
2462	 $movkey	-32($key),$rndkey1
2463
2464	paddd	$twres,$twres
2465	 aesdec		$rndkey0,$inout0
2466	pxor	@tweak[5],@tweak[2]
2467	psrad	\$31,$twtmp
2468	 aesdec		$rndkey0,$inout1
2469	paddq	@tweak[5],@tweak[5]
2470	pand	$twmask,$twtmp
2471	 aesdec		$rndkey0,$inout2
2472	 aesdec		$rndkey0,$inout3
2473	 aesdec		$rndkey0,$inout4
2474	pxor	$twtmp,@tweak[5]
2475	movaps	@tweak[3],@tweak[4]
2476	 aesdec		$rndkey0,$inout5
2477
2478	movdqa	$twres,$rndkey0
2479	paddd	$twres,$twres
2480	 aesdec		$rndkey1,$inout0
2481	pxor	@tweak[5],@tweak[3]
2482	psrad	\$31,$rndkey0
2483	 aesdec		$rndkey1,$inout1
2484	paddq	@tweak[5],@tweak[5]
2485	pand	$twmask,$rndkey0
2486	 aesdec		$rndkey1,$inout2
2487	 aesdec		$rndkey1,$inout3
2488	pxor	$rndkey0,@tweak[5]
2489	$movkey		($key_),$rndkey0
2490	 aesdec		$rndkey1,$inout4
2491	 aesdec		$rndkey1,$inout5
2492	$movkey		16($key_),$rndkey1
2493
2494	pxor	@tweak[5],@tweak[4]
2495	 aesdeclast	`16*0`(%rsp),$inout0
2496	psrad	\$31,$twres
2497	paddq	@tweak[5],@tweak[5]
2498	 aesdeclast	`16*1`(%rsp),$inout1
2499	 aesdeclast	`16*2`(%rsp),$inout2
2500	pand	$twmask,$twres
2501	mov	%r10,%rax			# restore $rounds
2502	 aesdeclast	`16*3`(%rsp),$inout3
2503	 aesdeclast	`16*4`(%rsp),$inout4
2504	 aesdeclast	`16*5`(%rsp),$inout5
2505	pxor	$twres,@tweak[5]
2506
2507	lea	`16*6`($out),$out		# $out+=6*16
2508	movups	$inout0,`-16*6`($out)		# store 6 output blocks
2509	movups	$inout1,`-16*5`($out)
2510	movups	$inout2,`-16*4`($out)
2511	movups	$inout3,`-16*3`($out)
2512	movups	$inout4,`-16*2`($out)
2513	movups	$inout5,`-16*1`($out)
2514	sub	\$16*6,$len
2515	jnc	.Lxts_dec_grandloop		# loop if $len-=6*16 didn't borrow
2516
2517	mov	\$16+96,$rounds
2518	sub	$rnds_,$rounds
2519	mov	$key_,$key			# restore $key
2520	shr	\$4,$rounds			# restore original value
2521
2522.Lxts_dec_short:
2523	# at this point @tweak[0..5] are populated with tweak values
2524	mov	$rounds,$rnds_			# backup $rounds
2525	pxor	$rndkey0,@tweak[0]
2526	pxor	$rndkey0,@tweak[1]
2527	add	\$16*6,$len			# restore real remaining $len
2528	jz	.Lxts_dec_done			# done if ($len==0)
2529
2530	pxor	$rndkey0,@tweak[2]
2531	cmp	\$0x20,$len
2532	jb	.Lxts_dec_one			# $len is 1*16
2533	pxor	$rndkey0,@tweak[3]
2534	je	.Lxts_dec_two			# $len is 2*16
2535
2536	pxor	$rndkey0,@tweak[4]
2537	cmp	\$0x40,$len
2538	jb	.Lxts_dec_three			# $len is 3*16
2539	je	.Lxts_dec_four			# $len is 4*16
2540
2541	movdqu	($inp),$inout0			# $len is 5*16
2542	movdqu	16*1($inp),$inout1
2543	movdqu	16*2($inp),$inout2
2544	pxor	@tweak[0],$inout0
2545	movdqu	16*3($inp),$inout3
2546	pxor	@tweak[1],$inout1
2547	movdqu	16*4($inp),$inout4
2548	lea	16*5($inp),$inp			# $inp+=5*16
2549	pxor	@tweak[2],$inout2
2550	pxor	@tweak[3],$inout3
2551	pxor	@tweak[4],$inout4
2552
2553	call	_aesni_decrypt6
2554
2555	xorps	@tweak[0],$inout0
2556	xorps	@tweak[1],$inout1
2557	xorps	@tweak[2],$inout2
2558	movdqu	$inout0,($out)			# store 5 output blocks
2559	xorps	@tweak[3],$inout3
2560	movdqu	$inout1,16*1($out)
2561	xorps	@tweak[4],$inout4
2562	movdqu	$inout2,16*2($out)
2563	 pxor		$twtmp,$twtmp
2564	movdqu	$inout3,16*3($out)
2565	 pcmpgtd	@tweak[5],$twtmp
2566	movdqu	$inout4,16*4($out)
2567	lea	16*5($out),$out			# $out+=5*16
2568	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
2569	and	\$15,$len_
2570	jz	.Lxts_dec_ret
2571
2572	movdqa	@tweak[5],@tweak[0]
2573	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
2574	pand	$twmask,@tweak[1]		# isolate carry and residue
2575	pxor	@tweak[5],@tweak[1]
2576	jmp	.Lxts_dec_done2
2577
2578.align	16
2579.Lxts_dec_one:
2580	movups	($inp),$inout0
2581	lea	16*1($inp),$inp			# $inp+=1*16
2582	xorps	@tweak[0],$inout0
2583___
2584	&aesni_generate1("dec",$key,$rounds);
2585$code.=<<___;
2586	xorps	@tweak[0],$inout0
2587	movdqa	@tweak[1],@tweak[0]
2588	movups	$inout0,($out)			# store one output block
2589	movdqa	@tweak[2],@tweak[1]
2590	lea	16*1($out),$out			# $out+=1*16
2591	jmp	.Lxts_dec_done
2592
2593.align	16
2594.Lxts_dec_two:
2595	movups	($inp),$inout0
2596	movups	16($inp),$inout1
2597	lea	32($inp),$inp			# $inp+=2*16
2598	xorps	@tweak[0],$inout0
2599	xorps	@tweak[1],$inout1
2600
2601	call	_aesni_decrypt2
2602
2603	xorps	@tweak[0],$inout0
2604	movdqa	@tweak[2],@tweak[0]
2605	xorps	@tweak[1],$inout1
2606	movdqa	@tweak[3],@tweak[1]
2607	movups	$inout0,($out)			# store 2 output blocks
2608	movups	$inout1,16*1($out)
2609	lea	16*2($out),$out			# $out+=2*16
2610	jmp	.Lxts_dec_done
2611
2612.align	16
2613.Lxts_dec_three:
2614	movups	($inp),$inout0
2615	movups	16*1($inp),$inout1
2616	movups	16*2($inp),$inout2
2617	lea	16*3($inp),$inp			# $inp+=3*16
2618	xorps	@tweak[0],$inout0
2619	xorps	@tweak[1],$inout1
2620	xorps	@tweak[2],$inout2
2621
2622	call	_aesni_decrypt3
2623
2624	xorps	@tweak[0],$inout0
2625	movdqa	@tweak[3],@tweak[0]
2626	xorps	@tweak[1],$inout1
2627	movdqa	@tweak[4],@tweak[1]
2628	xorps	@tweak[2],$inout2
2629	movups	$inout0,($out)			# store 3 output blocks
2630	movups	$inout1,16*1($out)
2631	movups	$inout2,16*2($out)
2632	lea	16*3($out),$out			# $out+=3*16
2633	jmp	.Lxts_dec_done
2634
2635.align	16
2636.Lxts_dec_four:
2637	movups	($inp),$inout0
2638	movups	16*1($inp),$inout1
2639	movups	16*2($inp),$inout2
2640	xorps	@tweak[0],$inout0
2641	movups	16*3($inp),$inout3
2642	lea	16*4($inp),$inp			# $inp+=4*16
2643	xorps	@tweak[1],$inout1
2644	xorps	@tweak[2],$inout2
2645	xorps	@tweak[3],$inout3
2646
2647	call	_aesni_decrypt4
2648
2649	pxor	@tweak[0],$inout0
2650	movdqa	@tweak[4],@tweak[0]
2651	pxor	@tweak[1],$inout1
2652	movdqa	@tweak[5],@tweak[1]
2653	pxor	@tweak[2],$inout2
2654	movdqu	$inout0,($out)			# store 4 output blocks
2655	pxor	@tweak[3],$inout3
2656	movdqu	$inout1,16*1($out)
2657	movdqu	$inout2,16*2($out)
2658	movdqu	$inout3,16*3($out)
2659	lea	16*4($out),$out			# $out+=4*16
2660	jmp	.Lxts_dec_done
2661
2662.align	16
2663.Lxts_dec_done:
2664	and	\$15,$len_			# see if $len%16 is 0
2665	jz	.Lxts_dec_ret
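	# Decrypt-side ciphertext stealing: the block held back above is
	# decrypted first with the tweak of the short final block,
	# .Lxts_dec_steal then swaps its tail with the ragged ciphertext,
	# and the rebuilt block is decrypted with the preceding tweak.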
2666.Lxts_dec_done2:
2667	mov	$len_,$len
2668	mov	$key_,$key			# restore $key
2669	mov	$rnds_,$rounds			# restore $rounds
2670
2671	movups	($inp),$inout0
2672	xorps	@tweak[1],$inout0
2673___
2674	&aesni_generate1("dec",$key,$rounds);
2675$code.=<<___;
2676	xorps	@tweak[1],$inout0
2677	movups	$inout0,($out)
2678
2679.Lxts_dec_steal:
2680	movzb	16($inp),%eax			# borrow $rounds ...
2681	movzb	($out),%ecx			# ... and $key
2682	lea	1($inp),$inp
2683	mov	%al,($out)
2684	mov	%cl,16($out)
2685	lea	1($out),$out
2686	sub	\$1,$len
2687	jnz	.Lxts_dec_steal
2688
2689	sub	$len_,$out			# rewind $out
2690	mov	$key_,$key			# restore $key
2691	mov	$rnds_,$rounds			# restore $rounds
2692
2693	movups	($out),$inout0
2694	xorps	@tweak[0],$inout0
2695___
2696	&aesni_generate1("dec",$key,$rounds);
2697$code.=<<___;
2698	xorps	@tweak[0],$inout0
2699	movups	$inout0,($out)
2700
2701.Lxts_dec_ret:
2702	xorps	%xmm0,%xmm0			# clear register bank
2703	pxor	%xmm1,%xmm1
2704	pxor	%xmm2,%xmm2
2705	pxor	%xmm3,%xmm3
2706	pxor	%xmm4,%xmm4
2707	pxor	%xmm5,%xmm5
2708___
2709$code.=<<___ if (!$win64);
2710	pxor	%xmm6,%xmm6
2711	pxor	%xmm7,%xmm7
2712	movaps	%xmm0,0x00(%rsp)		# clear stack
2713	pxor	%xmm8,%xmm8
2714	movaps	%xmm0,0x10(%rsp)
2715	pxor	%xmm9,%xmm9
2716	movaps	%xmm0,0x20(%rsp)
2717	pxor	%xmm10,%xmm10
2718	movaps	%xmm0,0x30(%rsp)
2719	pxor	%xmm11,%xmm11
2720	movaps	%xmm0,0x40(%rsp)
2721	pxor	%xmm12,%xmm12
2722	movaps	%xmm0,0x50(%rsp)
2723	pxor	%xmm13,%xmm13
2724	movaps	%xmm0,0x60(%rsp)
2725	pxor	%xmm14,%xmm14
2726	pxor	%xmm15,%xmm15
2727___
2728$code.=<<___ if ($win64);
2729	movaps	-0xa8(%r11),%xmm6
2730	movaps	%xmm0,-0xa8(%r11)		# clear stack
2731	movaps	-0x98(%r11),%xmm7
2732	movaps	%xmm0,-0x98(%r11)
2733	movaps	-0x88(%r11),%xmm8
2734	movaps	%xmm0,-0x88(%r11)
2735	movaps	-0x78(%r11),%xmm9
2736	movaps	%xmm0,-0x78(%r11)
2737	movaps	-0x68(%r11),%xmm10
2738	movaps	%xmm0,-0x68(%r11)
2739	movaps	-0x58(%r11),%xmm11
2740	movaps	%xmm0,-0x58(%r11)
2741	movaps	-0x48(%r11),%xmm12
2742	movaps	%xmm0,-0x48(%r11)
2743	movaps	-0x38(%r11),%xmm13
2744	movaps	%xmm0,-0x38(%r11)
2745	movaps	-0x28(%r11),%xmm14
2746	movaps	%xmm0,-0x28(%r11)
2747	movaps	-0x18(%r11),%xmm15
2748	movaps	%xmm0,-0x18(%r11)
2749	movaps	%xmm0,0x00(%rsp)
2750	movaps	%xmm0,0x10(%rsp)
2751	movaps	%xmm0,0x20(%rsp)
2752	movaps	%xmm0,0x30(%rsp)
2753	movaps	%xmm0,0x40(%rsp)
2754	movaps	%xmm0,0x50(%rsp)
2755	movaps	%xmm0,0x60(%rsp)
2756___
2757$code.=<<___;
2758	mov	-8(%r11),%rbp
2759.cfi_restore	%rbp
2760	lea	(%r11),%rsp
2761.cfi_def_cfa_register	%rsp
2762.Lxts_dec_epilogue:
2763	ret
2764.cfi_endproc
2765.size	${PREFIX}_xts_decrypt,.-${PREFIX}_xts_decrypt
2766___
2767}
2768
2769######################################################################
2770# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2771#	const AES_KEY *key, unsigned int start_block_num,
2772#	unsigned char offset_i[16], const unsigned char L_[][16],
2773#	unsigned char checksum[16]);
2774#
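# As a reading aid: OCB advances a running offset as
# Offset_i = Offset_{i-1} ^ L_[ntz(i)] (the L_ table is precomputed by the
# caller), encrypts each block as C_i = E_K(P_i ^ Offset_i) ^ Offset_i and
# keeps a running Checksum ^= P_i for the caller's tag computation. The
# code below folds round[0] and round[last] into the offsets so both
# whitening XORs come for free in the first and last AES rounds.
#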
2775if (0) {  # Omit these functions in BoringSSL
2776my @offset=map("%xmm$_",(10..15));
2777my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
2778my ($block_num,$offset_p)=("%r8","%r9");		# 5th and 6th arguments
2779my ($L_p,$checksum_p) = ("%rbx","%rbp");
2780my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
2781my $seventh_arg = $win64 ? 56 : 8;
2782my $blocks = $len;
2783
2784$code.=<<___;
2785.globl	${PREFIX}_ocb_encrypt
2786.type	${PREFIX}_ocb_encrypt,\@function,6
2787.align	32
2788${PREFIX}_ocb_encrypt:
2789.cfi_startproc
2790	lea	(%rsp),%rax
2791	push	%rbx
2792.cfi_push	%rbx
2793	push	%rbp
2794.cfi_push	%rbp
2795	push	%r12
2796.cfi_push	%r12
2797	push	%r13
2798.cfi_push	%r13
2799	push	%r14
2800.cfi_push	%r14
2801___
2802$code.=<<___ if ($win64);
2803	lea	-0xa0(%rsp),%rsp
2804	movaps	%xmm6,0x00(%rsp)		# offload everything
2805	movaps	%xmm7,0x10(%rsp)
2806	movaps	%xmm8,0x20(%rsp)
2807	movaps	%xmm9,0x30(%rsp)
2808	movaps	%xmm10,0x40(%rsp)
2809	movaps	%xmm11,0x50(%rsp)
2810	movaps	%xmm12,0x60(%rsp)
2811	movaps	%xmm13,0x70(%rsp)
2812	movaps	%xmm14,0x80(%rsp)
2813	movaps	%xmm15,0x90(%rsp)
2814.Locb_enc_body:
2815___
2816$code.=<<___;
2817	mov	$seventh_arg(%rax),$L_p		# 7th argument
2818	mov	$seventh_arg+8(%rax),$checksum_p	# 8th argument
2819
2820	mov	240($key),$rnds_
2821	mov	$key,$key_
2822	shl	\$4,$rnds_
2823	$movkey	($key),$rndkey0l		# round[0]
2824	$movkey	16($key,$rnds_),$rndkey1	# round[last]
2825
2826	movdqu	($offset_p),@offset[5]		# load last offset_i
2827	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
2828	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
2829
2830	mov	\$16+32,$rounds
2831	lea	32($key_,$rnds_),$key
2832	$movkey	16($key_),$rndkey1		# round[1]
2833	sub	%r10,%rax			# twisted $rounds
2834	mov	%rax,%r10			# backup twisted $rounds
2835
2836	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
2837	movdqu	($checksum_p),$checksum		# load checksum
2838
2839	test	\$1,$block_num			# is first block number odd?
2840	jnz	.Locb_enc_odd
2841
2842	bsf	$block_num,$i1
2843	add	\$1,$block_num
2844	shl	\$4,$i1
2845	movdqu	($L_p,$i1),$inout5		# borrow
2846	movdqu	($inp),$inout0
2847	lea	16($inp),$inp
2848
2849	call	__ocb_encrypt1
2850
2851	movdqa	$inout5,@offset[5]
2852	movups	$inout0,($out)
2853	lea	16($out),$out
2854	sub	\$1,$blocks
2855	jz	.Locb_enc_done
2856
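	# Six blocks per iteration, starting at an odd block number: odd
	# positions always use L_0, so only the even-numbered positions
	# need a lookup; bsf yields ntz(block) and shifting it left by 4
	# turns it into a byte offset into the caller's L_ table.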
2857.Locb_enc_odd:
2858	lea	1($block_num),$i1		# even-numbered blocks
2859	lea	3($block_num),$i3
2860	lea	5($block_num),$i5
2861	lea	6($block_num),$block_num
2862	bsf	$i1,$i1				# ntz(block)
2863	bsf	$i3,$i3
2864	bsf	$i5,$i5
2865	shl	\$4,$i1				# ntz(block) -> table offset
2866	shl	\$4,$i3
2867	shl	\$4,$i5
2868
2869	sub	\$6,$blocks
2870	jc	.Locb_enc_short
2871	jmp	.Locb_enc_grandloop
2872
2873.align	32
2874.Locb_enc_grandloop:
2875	movdqu	`16*0`($inp),$inout0		# load input
2876	movdqu	`16*1`($inp),$inout1
2877	movdqu	`16*2`($inp),$inout2
2878	movdqu	`16*3`($inp),$inout3
2879	movdqu	`16*4`($inp),$inout4
2880	movdqu	`16*5`($inp),$inout5
2881	lea	`16*6`($inp),$inp
2882
2883	call	__ocb_encrypt6
2884
2885	movups	$inout0,`16*0`($out)		# store output
2886	movups	$inout1,`16*1`($out)
2887	movups	$inout2,`16*2`($out)
2888	movups	$inout3,`16*3`($out)
2889	movups	$inout4,`16*4`($out)
2890	movups	$inout5,`16*5`($out)
2891	lea	`16*6`($out),$out
2892	sub	\$6,$blocks
2893	jnc	.Locb_enc_grandloop
2894
2895.Locb_enc_short:
2896	add	\$6,$blocks
2897	jz	.Locb_enc_done
2898
2899	movdqu	`16*0`($inp),$inout0
2900	cmp	\$2,$blocks
2901	jb	.Locb_enc_one
2902	movdqu	`16*1`($inp),$inout1
2903	je	.Locb_enc_two
2904
2905	movdqu	`16*2`($inp),$inout2
2906	cmp	\$4,$blocks
2907	jb	.Locb_enc_three
2908	movdqu	`16*3`($inp),$inout3
2909	je	.Locb_enc_four
2910
2911	movdqu	`16*4`($inp),$inout4
2912	pxor	$inout5,$inout5
2913
2914	call	__ocb_encrypt6
2915
2916	movdqa	@offset[4],@offset[5]
2917	movups	$inout0,`16*0`($out)
2918	movups	$inout1,`16*1`($out)
2919	movups	$inout2,`16*2`($out)
2920	movups	$inout3,`16*3`($out)
2921	movups	$inout4,`16*4`($out)
2922
2923	jmp	.Locb_enc_done
2924
2925.align	16
2926.Locb_enc_one:
2927	movdqa	@offset[0],$inout5		# borrow
2928
2929	call	__ocb_encrypt1
2930
2931	movdqa	$inout5,@offset[5]
2932	movups	$inout0,`16*0`($out)
2933	jmp	.Locb_enc_done
2934
2935.align	16
2936.Locb_enc_two:
2937	pxor	$inout2,$inout2
2938	pxor	$inout3,$inout3
2939
2940	call	__ocb_encrypt4
2941
2942	movdqa	@offset[1],@offset[5]
2943	movups	$inout0,`16*0`($out)
2944	movups	$inout1,`16*1`($out)
2945
2946	jmp	.Locb_enc_done
2947
2948.align	16
2949.Locb_enc_three:
2950	pxor	$inout3,$inout3
2951
2952	call	__ocb_encrypt4
2953
2954	movdqa	@offset[2],@offset[5]
2955	movups	$inout0,`16*0`($out)
2956	movups	$inout1,`16*1`($out)
2957	movups	$inout2,`16*2`($out)
2958
2959	jmp	.Locb_enc_done
2960
2961.align	16
2962.Locb_enc_four:
2963	call	__ocb_encrypt4
2964
2965	movdqa	@offset[3],@offset[5]
2966	movups	$inout0,`16*0`($out)
2967	movups	$inout1,`16*1`($out)
2968	movups	$inout2,`16*2`($out)
2969	movups	$inout3,`16*3`($out)
2970
2971.Locb_enc_done:
2972	pxor	$rndkey0,@offset[5]		# "remove" round[last]
2973	movdqu	$checksum,($checksum_p)		# store checksum
2974	movdqu	@offset[5],($offset_p)		# store last offset_i
2975
2976	xorps	%xmm0,%xmm0			# clear register bank
2977	pxor	%xmm1,%xmm1
2978	pxor	%xmm2,%xmm2
2979	pxor	%xmm3,%xmm3
2980	pxor	%xmm4,%xmm4
2981	pxor	%xmm5,%xmm5
2982___
2983$code.=<<___ if (!$win64);
2984	pxor	%xmm6,%xmm6
2985	pxor	%xmm7,%xmm7
2986	pxor	%xmm8,%xmm8
2987	pxor	%xmm9,%xmm9
2988	pxor	%xmm10,%xmm10
2989	pxor	%xmm11,%xmm11
2990	pxor	%xmm12,%xmm12
2991	pxor	%xmm13,%xmm13
2992	pxor	%xmm14,%xmm14
2993	pxor	%xmm15,%xmm15
2994	lea	0x28(%rsp),%rax
2995.cfi_def_cfa	%rax,8
2996___
2997$code.=<<___ if ($win64);
2998	movaps	0x00(%rsp),%xmm6
2999	movaps	%xmm0,0x00(%rsp)		# clear stack
3000	movaps	0x10(%rsp),%xmm7
3001	movaps	%xmm0,0x10(%rsp)
3002	movaps	0x20(%rsp),%xmm8
3003	movaps	%xmm0,0x20(%rsp)
3004	movaps	0x30(%rsp),%xmm9
3005	movaps	%xmm0,0x30(%rsp)
3006	movaps	0x40(%rsp),%xmm10
3007	movaps	%xmm0,0x40(%rsp)
3008	movaps	0x50(%rsp),%xmm11
3009	movaps	%xmm0,0x50(%rsp)
3010	movaps	0x60(%rsp),%xmm12
3011	movaps	%xmm0,0x60(%rsp)
3012	movaps	0x70(%rsp),%xmm13
3013	movaps	%xmm0,0x70(%rsp)
3014	movaps	0x80(%rsp),%xmm14
3015	movaps	%xmm0,0x80(%rsp)
3016	movaps	0x90(%rsp),%xmm15
3017	movaps	%xmm0,0x90(%rsp)
3018	lea	0xa0+0x28(%rsp),%rax
3019.Locb_enc_pop:
3020___
3021$code.=<<___;
3022	mov	-40(%rax),%r14
3023.cfi_restore	%r14
3024	mov	-32(%rax),%r13
3025.cfi_restore	%r13
3026	mov	-24(%rax),%r12
3027.cfi_restore	%r12
3028	mov	-16(%rax),%rbp
3029.cfi_restore	%rbp
3030	mov	-8(%rax),%rbx
3031.cfi_restore	%rbx
3032	lea	(%rax),%rsp
3033.cfi_def_cfa_register	%rsp
3034.Locb_enc_epilogue:
3035	ret
3036.cfi_endproc
3037.size	${PREFIX}_ocb_encrypt,.-${PREFIX}_ocb_encrypt
3038
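	# __ocb_encrypt6 expects the running offset, already xored with the
	# last round key, together with the precomputed L_ table offsets for
	# the even-numbered blocks; it chains the six per-block offsets,
	# folds round[0]/round[last] into them and runs the interleaved
	# rounds.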
3039.type	__ocb_encrypt6,\@abi-omnipotent
3040.align	32
3041__ocb_encrypt6:
3042	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3043	 movdqu		($L_p,$i1),@offset[1]
3044	 movdqa		@offset[0],@offset[2]
3045	 movdqu		($L_p,$i3),@offset[3]
3046	 movdqa		@offset[0],@offset[4]
3047	 pxor		@offset[5],@offset[0]
3048	 movdqu		($L_p,$i5),@offset[5]
3049	 pxor		@offset[0],@offset[1]
3050	pxor		$inout0,$checksum	# accumulate checksum
3051	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3052	 pxor		@offset[1],@offset[2]
3053	pxor		$inout1,$checksum
3054	pxor		@offset[1],$inout1
3055	 pxor		@offset[2],@offset[3]
3056	pxor		$inout2,$checksum
3057	pxor		@offset[2],$inout2
3058	 pxor		@offset[3],@offset[4]
3059	pxor		$inout3,$checksum
3060	pxor		@offset[3],$inout3
3061	 pxor		@offset[4],@offset[5]
3062	pxor		$inout4,$checksum
3063	pxor		@offset[4],$inout4
3064	pxor		$inout5,$checksum
3065	pxor		@offset[5],$inout5
3066	$movkey		32($key_),$rndkey0
3067
3068	lea		1($block_num),$i1	# even-numbered blocks
3069	lea		3($block_num),$i3
3070	lea		5($block_num),$i5
3071	add		\$6,$block_num
3072	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3073	bsf		$i1,$i1			# ntz(block)
3074	bsf		$i3,$i3
3075	bsf		$i5,$i5
3076
3077	aesenc		$rndkey1,$inout0
3078	aesenc		$rndkey1,$inout1
3079	aesenc		$rndkey1,$inout2
3080	aesenc		$rndkey1,$inout3
3081	 pxor		$rndkey0l,@offset[1]
3082	 pxor		$rndkey0l,@offset[2]
3083	aesenc		$rndkey1,$inout4
3084	 pxor		$rndkey0l,@offset[3]
3085	 pxor		$rndkey0l,@offset[4]
3086	aesenc		$rndkey1,$inout5
3087	$movkey		48($key_),$rndkey1
3088	 pxor		$rndkey0l,@offset[5]
3089
3090	aesenc		$rndkey0,$inout0
3091	aesenc		$rndkey0,$inout1
3092	aesenc		$rndkey0,$inout2
3093	aesenc		$rndkey0,$inout3
3094	aesenc		$rndkey0,$inout4
3095	aesenc		$rndkey0,$inout5
3096	$movkey		64($key_),$rndkey0
3097	shl		\$4,$i1			# ntz(block) -> table offset
3098	shl		\$4,$i3
3099	jmp		.Locb_enc_loop6
3100
3101.align	32
3102.Locb_enc_loop6:
3103	aesenc		$rndkey1,$inout0
3104	aesenc		$rndkey1,$inout1
3105	aesenc		$rndkey1,$inout2
3106	aesenc		$rndkey1,$inout3
3107	aesenc		$rndkey1,$inout4
3108	aesenc		$rndkey1,$inout5
3109	$movkey		($key,%rax),$rndkey1
3110	add		\$32,%rax
3111
3112	aesenc		$rndkey0,$inout0
3113	aesenc		$rndkey0,$inout1
3114	aesenc		$rndkey0,$inout2
3115	aesenc		$rndkey0,$inout3
3116	aesenc		$rndkey0,$inout4
3117	aesenc		$rndkey0,$inout5
3118	$movkey		-16($key,%rax),$rndkey0
3119	jnz		.Locb_enc_loop6
3120
3121	aesenc		$rndkey1,$inout0
3122	aesenc		$rndkey1,$inout1
3123	aesenc		$rndkey1,$inout2
3124	aesenc		$rndkey1,$inout3
3125	aesenc		$rndkey1,$inout4
3126	aesenc		$rndkey1,$inout5
3127	$movkey		16($key_),$rndkey1
3128	shl		\$4,$i5
3129
3130	aesenclast	@offset[0],$inout0
3131	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
3132	mov		%r10,%rax		# restore twisted rounds
3133	aesenclast	@offset[1],$inout1
3134	aesenclast	@offset[2],$inout2
3135	aesenclast	@offset[3],$inout3
3136	aesenclast	@offset[4],$inout4
3137	aesenclast	@offset[5],$inout5
3138	ret
3139.size	__ocb_encrypt6,.-__ocb_encrypt6
3140
3141.type	__ocb_encrypt4,\@abi-omnipotent
3142.align	32
3143__ocb_encrypt4:
3144	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3145	 movdqu		($L_p,$i1),@offset[1]
3146	 movdqa		@offset[0],@offset[2]
3147	 movdqu		($L_p,$i3),@offset[3]
3148	 pxor		@offset[5],@offset[0]
3149	 pxor		@offset[0],@offset[1]
3150	pxor		$inout0,$checksum	# accumulate checksum
3151	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3152	 pxor		@offset[1],@offset[2]
3153	pxor		$inout1,$checksum
3154	pxor		@offset[1],$inout1
3155	 pxor		@offset[2],@offset[3]
3156	pxor		$inout2,$checksum
3157	pxor		@offset[2],$inout2
3158	pxor		$inout3,$checksum
3159	pxor		@offset[3],$inout3
3160	$movkey		32($key_),$rndkey0
3161
3162	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3163	 pxor		$rndkey0l,@offset[1]
3164	 pxor		$rndkey0l,@offset[2]
3165	 pxor		$rndkey0l,@offset[3]
3166
3167	aesenc		$rndkey1,$inout0
3168	aesenc		$rndkey1,$inout1
3169	aesenc		$rndkey1,$inout2
3170	aesenc		$rndkey1,$inout3
3171	$movkey		48($key_),$rndkey1
3172
3173	aesenc		$rndkey0,$inout0
3174	aesenc		$rndkey0,$inout1
3175	aesenc		$rndkey0,$inout2
3176	aesenc		$rndkey0,$inout3
3177	$movkey		64($key_),$rndkey0
3178	jmp		.Locb_enc_loop4
3179
3180.align	32
3181.Locb_enc_loop4:
3182	aesenc		$rndkey1,$inout0
3183	aesenc		$rndkey1,$inout1
3184	aesenc		$rndkey1,$inout2
3185	aesenc		$rndkey1,$inout3
3186	$movkey		($key,%rax),$rndkey1
3187	add		\$32,%rax
3188
3189	aesenc		$rndkey0,$inout0
3190	aesenc		$rndkey0,$inout1
3191	aesenc		$rndkey0,$inout2
3192	aesenc		$rndkey0,$inout3
3193	$movkey		-16($key,%rax),$rndkey0
3194	jnz		.Locb_enc_loop4
3195
3196	aesenc		$rndkey1,$inout0
3197	aesenc		$rndkey1,$inout1
3198	aesenc		$rndkey1,$inout2
3199	aesenc		$rndkey1,$inout3
3200	$movkey		16($key_),$rndkey1
3201	mov		%r10,%rax		# restore twisted rounds
3202
3203	aesenclast	@offset[0],$inout0
3204	aesenclast	@offset[1],$inout1
3205	aesenclast	@offset[2],$inout2
3206	aesenclast	@offset[3],$inout3
3207	ret
3208.size	__ocb_encrypt4,.-__ocb_encrypt4
3209
3210.type	__ocb_encrypt1,\@abi-omnipotent
3211.align	32
3212__ocb_encrypt1:
3213	 pxor		@offset[5],$inout5	# offset_i
3214	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
3215	pxor		$inout0,$checksum	# accumulate checksum
3216	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
3217	$movkey		32($key_),$rndkey0
3218
3219	aesenc		$rndkey1,$inout0
3220	$movkey		48($key_),$rndkey1
3221	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
3222
3223	aesenc		$rndkey0,$inout0
3224	$movkey		64($key_),$rndkey0
3225	jmp		.Locb_enc_loop1
3226
3227.align	32
3228.Locb_enc_loop1:
3229	aesenc		$rndkey1,$inout0
3230	$movkey		($key,%rax),$rndkey1
3231	add		\$32,%rax
3232
3233	aesenc		$rndkey0,$inout0
3234	$movkey		-16($key,%rax),$rndkey0
3235	jnz		.Locb_enc_loop1
3236
3237	aesenc		$rndkey1,$inout0
3238	$movkey		16($key_),$rndkey1	# redundant in tail
3239	mov		%r10,%rax		# restore twisted rounds
3240
3241	aesenclast	$inout5,$inout0
3242	ret
3243.size	__ocb_encrypt1,.-__ocb_encrypt1
3244
3245.globl	${PREFIX}_ocb_decrypt
3246.type	${PREFIX}_ocb_decrypt,\@function,6
3247.align	32
3248${PREFIX}_ocb_decrypt:
3249.cfi_startproc
3250	lea	(%rsp),%rax
3251	push	%rbx
3252.cfi_push	%rbx
3253	push	%rbp
3254.cfi_push	%rbp
3255	push	%r12
3256.cfi_push	%r12
3257	push	%r13
3258.cfi_push	%r13
3259	push	%r14
3260.cfi_push	%r14
3261___
3262$code.=<<___ if ($win64);
3263	lea	-0xa0(%rsp),%rsp
3264	movaps	%xmm6,0x00(%rsp)		# offload everything
3265	movaps	%xmm7,0x10(%rsp)
3266	movaps	%xmm8,0x20(%rsp)
3267	movaps	%xmm9,0x30(%rsp)
3268	movaps	%xmm10,0x40(%rsp)
3269	movaps	%xmm11,0x50(%rsp)
3270	movaps	%xmm12,0x60(%rsp)
3271	movaps	%xmm13,0x70(%rsp)
3272	movaps	%xmm14,0x80(%rsp)
3273	movaps	%xmm15,0x90(%rsp)
3274.Locb_dec_body:
3275___
3276$code.=<<___;
3277	mov	$seventh_arg(%rax),$L_p		# 7th argument
3278	mov	$seventh_arg+8(%rax),$checksum_p	# 8th argument
3279
3280	mov	240($key),$rnds_
3281	mov	$key,$key_
3282	shl	\$4,$rnds_
3283	$movkey	($key),$rndkey0l		# round[0]
3284	$movkey	16($key,$rnds_),$rndkey1	# round[last]
3285
3286	movdqu	($offset_p),@offset[5]		# load last offset_i
3287	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
3288	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
3289
3290	mov	\$16+32,$rounds
3291	lea	32($key_,$rnds_),$key
3292	$movkey	16($key_),$rndkey1		# round[1]
3293	sub	%r10,%rax			# twisted $rounds
3294	mov	%rax,%r10			# backup twisted $rounds
3295
3296	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
3297	movdqu	($checksum_p),$checksum		# load checksum
3298
3299	test	\$1,$block_num			# is first block number odd?
3300	jnz	.Locb_dec_odd
3301
3302	bsf	$block_num,$i1
3303	add	\$1,$block_num
3304	shl	\$4,$i1
3305	movdqu	($L_p,$i1),$inout5		# borrow
3306	movdqu	($inp),$inout0
3307	lea	16($inp),$inp
3308
3309	call	__ocb_decrypt1
3310
3311	movdqa	$inout5,@offset[5]
3312	movups	$inout0,($out)
3313	xorps	$inout0,$checksum		# accumulate checksum
3314	lea	16($out),$out
3315	sub	\$1,$blocks
3316	jz	.Locb_dec_done
3317
3318.Locb_dec_odd:
3319	lea	1($block_num),$i1		# even-numbered blocks
3320	lea	3($block_num),$i3
3321	lea	5($block_num),$i5
3322	lea	6($block_num),$block_num
3323	bsf	$i1,$i1				# ntz(block)
3324	bsf	$i3,$i3
3325	bsf	$i5,$i5
3326	shl	\$4,$i1				# ntz(block) -> table offset
3327	shl	\$4,$i3
3328	shl	\$4,$i5
3329
3330	sub	\$6,$blocks
3331	jc	.Locb_dec_short
3332	jmp	.Locb_dec_grandloop
3333
3334.align	32
3335.Locb_dec_grandloop:
3336	movdqu	`16*0`($inp),$inout0		# load input
3337	movdqu	`16*1`($inp),$inout1
3338	movdqu	`16*2`($inp),$inout2
3339	movdqu	`16*3`($inp),$inout3
3340	movdqu	`16*4`($inp),$inout4
3341	movdqu	`16*5`($inp),$inout5
3342	lea	`16*6`($inp),$inp
3343
3344	call	__ocb_decrypt6
3345
3346	movups	$inout0,`16*0`($out)		# store output
3347	pxor	$inout0,$checksum		# accumulate checksum
3348	movups	$inout1,`16*1`($out)
3349	pxor	$inout1,$checksum
3350	movups	$inout2,`16*2`($out)
3351	pxor	$inout2,$checksum
3352	movups	$inout3,`16*3`($out)
3353	pxor	$inout3,$checksum
3354	movups	$inout4,`16*4`($out)
3355	pxor	$inout4,$checksum
3356	movups	$inout5,`16*5`($out)
3357	pxor	$inout5,$checksum
3358	lea	`16*6`($out),$out
3359	sub	\$6,$blocks
3360	jnc	.Locb_dec_grandloop
3361
3362.Locb_dec_short:
3363	add	\$6,$blocks
3364	jz	.Locb_dec_done
3365
3366	movdqu	`16*0`($inp),$inout0
3367	cmp	\$2,$blocks
3368	jb	.Locb_dec_one
3369	movdqu	`16*1`($inp),$inout1
3370	je	.Locb_dec_two
3371
3372	movdqu	`16*2`($inp),$inout2
3373	cmp	\$4,$blocks
3374	jb	.Locb_dec_three
3375	movdqu	`16*3`($inp),$inout3
3376	je	.Locb_dec_four
3377
3378	movdqu	`16*4`($inp),$inout4
3379	pxor	$inout5,$inout5
3380
3381	call	__ocb_decrypt6
3382
3383	movdqa	@offset[4],@offset[5]
3384	movups	$inout0,`16*0`($out)		# store output
3385	pxor	$inout0,$checksum		# accumulate checksum
3386	movups	$inout1,`16*1`($out)
3387	pxor	$inout1,$checksum
3388	movups	$inout2,`16*2`($out)
3389	pxor	$inout2,$checksum
3390	movups	$inout3,`16*3`($out)
3391	pxor	$inout3,$checksum
3392	movups	$inout4,`16*4`($out)
3393	pxor	$inout4,$checksum
3394
3395	jmp	.Locb_dec_done
3396
3397.align	16
3398.Locb_dec_one:
3399	movdqa	@offset[0],$inout5		# borrow
3400
3401	call	__ocb_decrypt1
3402
3403	movdqa	$inout5,@offset[5]
3404	movups	$inout0,`16*0`($out)		# store output
3405	xorps	$inout0,$checksum		# accumulate checksum
3406	jmp	.Locb_dec_done
3407
3408.align	16
3409.Locb_dec_two:
3410	pxor	$inout2,$inout2
3411	pxor	$inout3,$inout3
3412
3413	call	__ocb_decrypt4
3414
3415	movdqa	@offset[1],@offset[5]
3416	movups	$inout0,`16*0`($out)		# store output
3417	xorps	$inout0,$checksum		# accumulate checksum
3418	movups	$inout1,`16*1`($out)
3419	xorps	$inout1,$checksum
3420
3421	jmp	.Locb_dec_done
3422
3423.align	16
3424.Locb_dec_three:
3425	pxor	$inout3,$inout3
3426
3427	call	__ocb_decrypt4
3428
3429	movdqa	@offset[2],@offset[5]
3430	movups	$inout0,`16*0`($out)		# store output
3431	xorps	$inout0,$checksum		# accumulate checksum
3432	movups	$inout1,`16*1`($out)
3433	xorps	$inout1,$checksum
3434	movups	$inout2,`16*2`($out)
3435	xorps	$inout2,$checksum
3436
3437	jmp	.Locb_dec_done
3438
3439.align	16
3440.Locb_dec_four:
3441	call	__ocb_decrypt4
3442
3443	movdqa	@offset[3],@offset[5]
3444	movups	$inout0,`16*0`($out)		# store output
3445	pxor	$inout0,$checksum		# accumulate checksum
3446	movups	$inout1,`16*1`($out)
3447	pxor	$inout1,$checksum
3448	movups	$inout2,`16*2`($out)
3449	pxor	$inout2,$checksum
3450	movups	$inout3,`16*3`($out)
3451	pxor	$inout3,$checksum
3452
3453.Locb_dec_done:
3454	pxor	$rndkey0,@offset[5]		# "remove" round[last]
3455	movdqu	$checksum,($checksum_p)		# store checksum
3456	movdqu	@offset[5],($offset_p)		# store last offset_i
3457
3458	xorps	%xmm0,%xmm0			# clear register bank
3459	pxor	%xmm1,%xmm1
3460	pxor	%xmm2,%xmm2
3461	pxor	%xmm3,%xmm3
3462	pxor	%xmm4,%xmm4
3463	pxor	%xmm5,%xmm5
3464___
3465$code.=<<___ if (!$win64);
3466	pxor	%xmm6,%xmm6
3467	pxor	%xmm7,%xmm7
3468	pxor	%xmm8,%xmm8
3469	pxor	%xmm9,%xmm9
3470	pxor	%xmm10,%xmm10
3471	pxor	%xmm11,%xmm11
3472	pxor	%xmm12,%xmm12
3473	pxor	%xmm13,%xmm13
3474	pxor	%xmm14,%xmm14
3475	pxor	%xmm15,%xmm15
3476	lea	0x28(%rsp),%rax
3477.cfi_def_cfa	%rax,8
3478___
3479$code.=<<___ if ($win64);
3480	movaps	0x00(%rsp),%xmm6
3481	movaps	%xmm0,0x00(%rsp)		# clear stack
3482	movaps	0x10(%rsp),%xmm7
3483	movaps	%xmm0,0x10(%rsp)
3484	movaps	0x20(%rsp),%xmm8
3485	movaps	%xmm0,0x20(%rsp)
3486	movaps	0x30(%rsp),%xmm9
3487	movaps	%xmm0,0x30(%rsp)
3488	movaps	0x40(%rsp),%xmm10
3489	movaps	%xmm0,0x40(%rsp)
3490	movaps	0x50(%rsp),%xmm11
3491	movaps	%xmm0,0x50(%rsp)
3492	movaps	0x60(%rsp),%xmm12
3493	movaps	%xmm0,0x60(%rsp)
3494	movaps	0x70(%rsp),%xmm13
3495	movaps	%xmm0,0x70(%rsp)
3496	movaps	0x80(%rsp),%xmm14
3497	movaps	%xmm0,0x80(%rsp)
3498	movaps	0x90(%rsp),%xmm15
3499	movaps	%xmm0,0x90(%rsp)
3500	lea	0xa0+0x28(%rsp),%rax
3501.Locb_dec_pop:
3502___
3503$code.=<<___;
3504	mov	-40(%rax),%r14
3505.cfi_restore	%r14
3506	mov	-32(%rax),%r13
3507.cfi_restore	%r13
3508	mov	-24(%rax),%r12
3509.cfi_restore	%r12
3510	mov	-16(%rax),%rbp
3511.cfi_restore	%rbp
3512	mov	-8(%rax),%rbx
3513.cfi_restore	%rbx
3514	lea	(%rax),%rsp
3515.cfi_def_cfa_register	%rsp
3516.Locb_dec_epilogue:
3517	ret
3518.cfi_endproc
3519.size	${PREFIX}_ocb_decrypt,.-${PREFIX}_ocb_decrypt
3520
3521.type	__ocb_decrypt6,\@abi-omnipotent
3522.align	32
3523__ocb_decrypt6:
3524	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3525	 movdqu		($L_p,$i1),@offset[1]
3526	 movdqa		@offset[0],@offset[2]
3527	 movdqu		($L_p,$i3),@offset[3]
3528	 movdqa		@offset[0],@offset[4]
3529	 pxor		@offset[5],@offset[0]
3530	 movdqu		($L_p,$i5),@offset[5]
3531	 pxor		@offset[0],@offset[1]
3532	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3533	 pxor		@offset[1],@offset[2]
3534	pxor		@offset[1],$inout1
3535	 pxor		@offset[2],@offset[3]
3536	pxor		@offset[2],$inout2
3537	 pxor		@offset[3],@offset[4]
3538	pxor		@offset[3],$inout3
3539	 pxor		@offset[4],@offset[5]
3540	pxor		@offset[4],$inout4
3541	pxor		@offset[5],$inout5
3542	$movkey		32($key_),$rndkey0
3543
3544	lea		1($block_num),$i1	# even-numbered blocks
3545	lea		3($block_num),$i3
3546	lea		5($block_num),$i5
3547	add		\$6,$block_num
3548	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3549	bsf		$i1,$i1			# ntz(block)
3550	bsf		$i3,$i3
3551	bsf		$i5,$i5
3552
3553	aesdec		$rndkey1,$inout0
3554	aesdec		$rndkey1,$inout1
3555	aesdec		$rndkey1,$inout2
3556	aesdec		$rndkey1,$inout3
3557	 pxor		$rndkey0l,@offset[1]
3558	 pxor		$rndkey0l,@offset[2]
3559	aesdec		$rndkey1,$inout4
3560	 pxor		$rndkey0l,@offset[3]
3561	 pxor		$rndkey0l,@offset[4]
3562	aesdec		$rndkey1,$inout5
3563	$movkey		48($key_),$rndkey1
3564	 pxor		$rndkey0l,@offset[5]
3565
3566	aesdec		$rndkey0,$inout0
3567	aesdec		$rndkey0,$inout1
3568	aesdec		$rndkey0,$inout2
3569	aesdec		$rndkey0,$inout3
3570	aesdec		$rndkey0,$inout4
3571	aesdec		$rndkey0,$inout5
3572	$movkey		64($key_),$rndkey0
3573	shl		\$4,$i1			# ntz(block) -> table offset
3574	shl		\$4,$i3
3575	jmp		.Locb_dec_loop6
3576
3577.align	32
3578.Locb_dec_loop6:
3579	aesdec		$rndkey1,$inout0
3580	aesdec		$rndkey1,$inout1
3581	aesdec		$rndkey1,$inout2
3582	aesdec		$rndkey1,$inout3
3583	aesdec		$rndkey1,$inout4
3584	aesdec		$rndkey1,$inout5
3585	$movkey		($key,%rax),$rndkey1
3586	add		\$32,%rax
3587
3588	aesdec		$rndkey0,$inout0
3589	aesdec		$rndkey0,$inout1
3590	aesdec		$rndkey0,$inout2
3591	aesdec		$rndkey0,$inout3
3592	aesdec		$rndkey0,$inout4
3593	aesdec		$rndkey0,$inout5
3594	$movkey		-16($key,%rax),$rndkey0
3595	jnz		.Locb_dec_loop6
3596
3597	aesdec		$rndkey1,$inout0
3598	aesdec		$rndkey1,$inout1
3599	aesdec		$rndkey1,$inout2
3600	aesdec		$rndkey1,$inout3
3601	aesdec		$rndkey1,$inout4
3602	aesdec		$rndkey1,$inout5
3603	$movkey		16($key_),$rndkey1
3604	shl		\$4,$i5
3605
3606	aesdeclast	@offset[0],$inout0
3607	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
3608	mov		%r10,%rax		# restore twisted rounds
3609	aesdeclast	@offset[1],$inout1
3610	aesdeclast	@offset[2],$inout2
3611	aesdeclast	@offset[3],$inout3
3612	aesdeclast	@offset[4],$inout4
3613	aesdeclast	@offset[5],$inout5
3614	ret
3615.size	__ocb_decrypt6,.-__ocb_decrypt6
3616
3617.type	__ocb_decrypt4,\@abi-omnipotent
3618.align	32
3619__ocb_decrypt4:
3620	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
3621	 movdqu		($L_p,$i1),@offset[1]
3622	 movdqa		@offset[0],@offset[2]
3623	 movdqu		($L_p,$i3),@offset[3]
3624	 pxor		@offset[5],@offset[0]
3625	 pxor		@offset[0],@offset[1]
3626	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
3627	 pxor		@offset[1],@offset[2]
3628	pxor		@offset[1],$inout1
3629	 pxor		@offset[2],@offset[3]
3630	pxor		@offset[2],$inout2
3631	pxor		@offset[3],$inout3
3632	$movkey		32($key_),$rndkey0
3633
3634	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
3635	 pxor		$rndkey0l,@offset[1]
3636	 pxor		$rndkey0l,@offset[2]
3637	 pxor		$rndkey0l,@offset[3]
3638
3639	aesdec		$rndkey1,$inout0
3640	aesdec		$rndkey1,$inout1
3641	aesdec		$rndkey1,$inout2
3642	aesdec		$rndkey1,$inout3
3643	$movkey		48($key_),$rndkey1
3644
3645	aesdec		$rndkey0,$inout0
3646	aesdec		$rndkey0,$inout1
3647	aesdec		$rndkey0,$inout2
3648	aesdec		$rndkey0,$inout3
3649	$movkey		64($key_),$rndkey0
3650	jmp		.Locb_dec_loop4
3651
3652.align	32
3653.Locb_dec_loop4:
3654	aesdec		$rndkey1,$inout0
3655	aesdec		$rndkey1,$inout1
3656	aesdec		$rndkey1,$inout2
3657	aesdec		$rndkey1,$inout3
3658	$movkey		($key,%rax),$rndkey1
3659	add		\$32,%rax
3660
3661	aesdec		$rndkey0,$inout0
3662	aesdec		$rndkey0,$inout1
3663	aesdec		$rndkey0,$inout2
3664	aesdec		$rndkey0,$inout3
3665	$movkey		-16($key,%rax),$rndkey0
3666	jnz		.Locb_dec_loop4
3667
3668	aesdec		$rndkey1,$inout0
3669	aesdec		$rndkey1,$inout1
3670	aesdec		$rndkey1,$inout2
3671	aesdec		$rndkey1,$inout3
3672	$movkey		16($key_),$rndkey1
3673	mov		%r10,%rax		# restore twisted rounds
3674
3675	aesdeclast	@offset[0],$inout0
3676	aesdeclast	@offset[1],$inout1
3677	aesdeclast	@offset[2],$inout2
3678	aesdeclast	@offset[3],$inout3
3679	ret
3680.size	__ocb_decrypt4,.-__ocb_decrypt4
3681
3682.type	__ocb_decrypt1,\@abi-omnipotent
3683.align	32
3684__ocb_decrypt1:
3685	 pxor		@offset[5],$inout5	# offset_i
3686	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
3687	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
3688	$movkey		32($key_),$rndkey0
3689
3690	aesdec		$rndkey1,$inout0
3691	$movkey		48($key_),$rndkey1
3692	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
3693
3694	aesdec		$rndkey0,$inout0
3695	$movkey		64($key_),$rndkey0
3696	jmp		.Locb_dec_loop1
3697
3698.align	32
3699.Locb_dec_loop1:
3700	aesdec		$rndkey1,$inout0
3701	$movkey		($key,%rax),$rndkey1
3702	add		\$32,%rax
3703
3704	aesdec		$rndkey0,$inout0
3705	$movkey		-16($key,%rax),$rndkey0
3706	jnz		.Locb_dec_loop1
3707
3708	aesdec		$rndkey1,$inout0
3709	$movkey		16($key_),$rndkey1	# redundant in tail
3710	mov		%r10,%rax		# restore twisted rounds
3711
3712	aesdeclast	$inout5,$inout0
3713	ret
3714.size	__ocb_decrypt1,.-__ocb_decrypt1
3715___
3716} }}
3717
3718########################################################################
3719# void $PREFIX_cbc_encrypt (const void *inp, void *out,
3720#			    size_t length, const AES_KEY *key,
3721#			    unsigned char *ivp,const int enc);
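#
# CBC encryption is inherently serial (each block is chained through the
# previous ciphertext), so the encrypt path below is a plain one-block
# loop; decryption has no such dependency and is interleaved up to eight
# blocks deep further down.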
3722{
3723my $frame_size = 0x10 + ($win64?0xa0:0);	# used in decrypt
3724my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
3725
3726$code.=<<___;
3727.globl	${PREFIX}_cbc_encrypt
3728.type	${PREFIX}_cbc_encrypt,\@function,6
3729.align	16
3730${PREFIX}_cbc_encrypt:
3731.cfi_startproc
3732	test	$len,$len		# check length
3733	jz	.Lcbc_ret
3734
3735	mov	240($key),$rnds_	# key->rounds
3736	mov	$key,$key_		# backup $key
3737	test	%r9d,%r9d		# 6th argument
3738	jz	.Lcbc_decrypt
3739#--------------------------- CBC ENCRYPT ------------------------------#
3740	movups	($ivp),$inout0		# load iv as initial state
3741	mov	$rnds_,$rounds
3742	cmp	\$16,$len
3743	jb	.Lcbc_enc_tail
3744	sub	\$16,$len
3745	jmp	.Lcbc_enc_loop
3746.align	16
3747.Lcbc_enc_loop:
3748	movups	($inp),$inout1		# load input
3749	lea	16($inp),$inp
3750	#xorps	$inout1,$inout0
3751___
3752	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
3753$code.=<<___;
3754	mov	$rnds_,$rounds		# restore $rounds
3755	mov	$key_,$key		# restore $key
3756	movups	$inout0,0($out)		# store output
3757	lea	16($out),$out
3758	sub	\$16,$len
3759	jnc	.Lcbc_enc_loop
3760	add	\$16,$len
3761	jnz	.Lcbc_enc_tail
3762	 pxor	$rndkey0,$rndkey0	# clear register bank
3763	 pxor	$rndkey1,$rndkey1
3764	movups	$inout0,($ivp)
3765	 pxor	$inout0,$inout0
3766	 pxor	$inout1,$inout1
3767	jmp	.Lcbc_ret
3768
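	# Partial last block: copy the remaining input bytes into the output
	# buffer, zero-pad them to a full 16 bytes, rewind one block and run
	# .Lcbc_enc_loop once more so the padded block is encrypted in place.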
3769.Lcbc_enc_tail:
3770	mov	$len,%rcx	# zaps $key
3771	xchg	$inp,$out	# $inp is %rsi and $out is %rdi now
3772	.long	0x9066A4F3	# rep movsb
3773	mov	\$16,%ecx	# zero tail
3774	sub	$len,%rcx
3775	xor	%eax,%eax
3776	.long	0x9066AAF3	# rep stosb
3777	lea	-16(%rdi),%rdi	# rewind $out by 1 block
3778	mov	$rnds_,$rounds	# restore $rounds
3779	mov	%rdi,%rsi	# $inp and $out are the same
3780	mov	$key_,$key	# restore $key
3781	xor	$len,$len	# len=0, loop body runs only once
3782	jmp	.Lcbc_enc_loop	# one more spin
3783#--------------------------- CBC DECRYPT ------------------------------#
3784.align	16
3785.Lcbc_decrypt:
3786	cmp	\$16,$len
3787	jne	.Lcbc_decrypt_bulk
3788
3789	# handle single block without allocating stack frame,
3790	# useful in ciphertext stealing mode
3791	movdqu	($inp),$inout0		# load input
3792	movdqu	($ivp),$inout1		# load iv
3793	movdqa	$inout0,$inout2		# future iv
3794___
3795	&aesni_generate1("dec",$key,$rnds_);
3796$code.=<<___;
3797	 pxor	$rndkey0,$rndkey0	# clear register bank
3798	 pxor	$rndkey1,$rndkey1
3799	movdqu	$inout2,($ivp)		# store iv
3800	xorps	$inout1,$inout0		# ^=iv
3801	 pxor	$inout1,$inout1
3802	movups	$inout0,($out)		# store output
3803	 pxor	$inout0,$inout0
3804	jmp	.Lcbc_ret
3805.align	16
3806.Lcbc_decrypt_bulk:
3807	lea	(%rsp),%r11		# frame pointer
3808.cfi_def_cfa_register	%r11
3809	push	%rbp
3810.cfi_push	%rbp
3811	sub	\$$frame_size,%rsp
3812	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
3813___
3814$code.=<<___ if ($win64);
3815	movaps	%xmm6,0x10(%rsp)
3816	movaps	%xmm7,0x20(%rsp)
3817	movaps	%xmm8,0x30(%rsp)
3818	movaps	%xmm9,0x40(%rsp)
3819	movaps	%xmm10,0x50(%rsp)
3820	movaps	%xmm11,0x60(%rsp)
3821	movaps	%xmm12,0x70(%rsp)
3822	movaps	%xmm13,0x80(%rsp)
3823	movaps	%xmm14,0x90(%rsp)
3824	movaps	%xmm15,0xa0(%rsp)
3825.Lcbc_decrypt_body:
3826___
3827
3828my $inp_=$key_="%rbp";			# reassign $key_
3829
3830$code.=<<___;
3831	mov	$key,$key_		# [re-]backup $key [after reassignment]
3832	movups	($ivp),$iv
3833	mov	$rnds_,$rounds
3834	cmp	\$0x50,$len
3835	jbe	.Lcbc_dec_tail
3836
3837	$movkey	($key),$rndkey0
3838	movdqu	0x00($inp),$inout0	# load input
3839	movdqu	0x10($inp),$inout1
3840	movdqa	$inout0,$in0
3841	movdqu	0x20($inp),$inout2
3842	movdqa	$inout1,$in1
3843	movdqu	0x30($inp),$inout3
3844	movdqa	$inout2,$in2
3845	movdqu	0x40($inp),$inout4
3846	movdqa	$inout3,$in3
3847	movdqu	0x50($inp),$inout5
3848	movdqa	$inout4,$in4
3849	leaq	OPENSSL_ia32cap_P(%rip),%r9
3850	mov	4(%r9),%r9d
3851	cmp	\$0x70,$len
3852	jbe	.Lcbc_dec_six_or_seven
3853
3854	and	\$`1<<26|1<<22`,%r9d	# isolate XSAVE+MOVBE
3855	sub	\$0x50,$len		# $len is biased by -5*16
3856	cmp	\$`1<<22`,%r9d		# check for MOVBE without XSAVE
3857	je	.Lcbc_dec_loop6_enter	# [which denotes Atom Silvermont]
3858	sub	\$0x20,$len		# $len is biased by -7*16
3859	lea	0x70($key),$key		# size optimization
3860	jmp	.Lcbc_dec_loop8_enter
3861.align	16
3862.Lcbc_dec_loop8:
3863	movups	$inout7,($out)
3864	lea	0x10($out),$out
3865.Lcbc_dec_loop8_enter:
3866	movdqu		0x60($inp),$inout6
3867	pxor		$rndkey0,$inout0
3868	movdqu		0x70($inp),$inout7
3869	pxor		$rndkey0,$inout1
3870	$movkey		0x10-0x70($key),$rndkey1
3871	pxor		$rndkey0,$inout2
3872	mov		\$-1,$inp_
3873	cmp		\$0x70,$len	# are there at least 0x60 bytes ahead?
3874	pxor		$rndkey0,$inout3
3875	pxor		$rndkey0,$inout4
3876	pxor		$rndkey0,$inout5
3877	pxor		$rndkey0,$inout6
3878
3879	aesdec		$rndkey1,$inout0
3880	pxor		$rndkey0,$inout7
3881	$movkey		0x20-0x70($key),$rndkey0
3882	aesdec		$rndkey1,$inout1
3883	aesdec		$rndkey1,$inout2
3884	aesdec		$rndkey1,$inout3
3885	aesdec		$rndkey1,$inout4
3886	aesdec		$rndkey1,$inout5
3887	aesdec		$rndkey1,$inout6
3888	adc		\$0,$inp_
3889	and		\$128,$inp_
3890	aesdec		$rndkey1,$inout7
3891	add		$inp,$inp_
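	# branchless select: $inp_ becomes $inp+0x80 when the cmp above left
	# CF clear, and plain $inp otherwise, so the preloads at
	# .Lcbc_dec_done never reach past the data that is actually there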
3892	$movkey		0x30-0x70($key),$rndkey1
3893___
3894for($i=1;$i<12;$i++) {
3895my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
3896$code.=<<___	if ($i==7);
3897	cmp		\$11,$rounds
3898___
3899$code.=<<___;
3900	aesdec		$rndkeyx,$inout0
3901	aesdec		$rndkeyx,$inout1
3902	aesdec		$rndkeyx,$inout2
3903	aesdec		$rndkeyx,$inout3
3904	aesdec		$rndkeyx,$inout4
3905	aesdec		$rndkeyx,$inout5
3906	aesdec		$rndkeyx,$inout6
3907	aesdec		$rndkeyx,$inout7
3908	$movkey		`0x30+0x10*$i`-0x70($key),$rndkeyx
3909___
3910$code.=<<___	if ($i<6 || (!($i&1) && $i>7));
3911	nop
3912___
3913$code.=<<___	if ($i==7);
3914	jb		.Lcbc_dec_done
3915___
3916$code.=<<___	if ($i==9);
3917	je		.Lcbc_dec_done
3918___
3919$code.=<<___	if ($i==11);
3920	jmp		.Lcbc_dec_done
3921___
3922}
3923$code.=<<___;
3924.align	16
3925.Lcbc_dec_done:
3926	aesdec		$rndkey1,$inout0
3927	aesdec		$rndkey1,$inout1
3928	pxor		$rndkey0,$iv
3929	pxor		$rndkey0,$in0
3930	aesdec		$rndkey1,$inout2
3931	aesdec		$rndkey1,$inout3
3932	pxor		$rndkey0,$in1
3933	pxor		$rndkey0,$in2
3934	aesdec		$rndkey1,$inout4
3935	aesdec		$rndkey1,$inout5
3936	pxor		$rndkey0,$in3
3937	pxor		$rndkey0,$in4
3938	aesdec		$rndkey1,$inout6
3939	aesdec		$rndkey1,$inout7
3940	movdqu		0x50($inp),$rndkey1
3941
3942	aesdeclast	$iv,$inout0
3943	movdqu		0x60($inp),$iv		# borrow $iv
3944	pxor		$rndkey0,$rndkey1
3945	aesdeclast	$in0,$inout1
3946	pxor		$rndkey0,$iv
3947	movdqu		0x70($inp),$rndkey0	# next IV
3948	aesdeclast	$in1,$inout2
3949	lea		0x80($inp),$inp
3950	movdqu		0x00($inp_),$in0
3951	aesdeclast	$in2,$inout3
3952	aesdeclast	$in3,$inout4
3953	movdqu		0x10($inp_),$in1
3954	movdqu		0x20($inp_),$in2
3955	aesdeclast	$in4,$inout5
3956	aesdeclast	$rndkey1,$inout6
3957	movdqu		0x30($inp_),$in3
3958	movdqu		0x40($inp_),$in4
3959	aesdeclast	$iv,$inout7
3960	movdqa		$rndkey0,$iv		# return $iv
3961	movdqu		0x50($inp_),$rndkey1
3962	$movkey		-0x70($key),$rndkey0
3963
3964	movups		$inout0,($out)		# store output
3965	movdqa		$in0,$inout0
3966	movups		$inout1,0x10($out)
3967	movdqa		$in1,$inout1
3968	movups		$inout2,0x20($out)
3969	movdqa		$in2,$inout2
3970	movups		$inout3,0x30($out)
3971	movdqa		$in3,$inout3
3972	movups		$inout4,0x40($out)
3973	movdqa		$in4,$inout4
3974	movups		$inout5,0x50($out)
3975	movdqa		$rndkey1,$inout5
3976	movups		$inout6,0x60($out)
3977	lea		0x70($out),$out
3978
3979	sub	\$0x80,$len
3980	ja	.Lcbc_dec_loop8
3981
3982	movaps	$inout7,$inout0
3983	lea	-0x70($key),$key
3984	add	\$0x70,$len
3985	jle	.Lcbc_dec_clear_tail_collected
3986	movups	$inout7,($out)
3987	lea	0x10($out),$out
3988	cmp	\$0x50,$len
3989	jbe	.Lcbc_dec_tail
3990
3991	movaps	$in0,$inout0
3992.Lcbc_dec_six_or_seven:
3993	cmp	\$0x60,$len
3994	ja	.Lcbc_dec_seven
3995
3996	movaps	$inout5,$inout6
3997	call	_aesni_decrypt6
3998	pxor	$iv,$inout0		# ^= IV
3999	movaps	$inout6,$iv
4000	pxor	$in0,$inout1
4001	movdqu	$inout0,($out)
4002	pxor	$in1,$inout2
4003	movdqu	$inout1,0x10($out)
4004	 pxor	$inout1,$inout1		# clear register bank
4005	pxor	$in2,$inout3
4006	movdqu	$inout2,0x20($out)
4007	 pxor	$inout2,$inout2
4008	pxor	$in3,$inout4
4009	movdqu	$inout3,0x30($out)
4010	 pxor	$inout3,$inout3
4011	pxor	$in4,$inout5
4012	movdqu	$inout4,0x40($out)
4013	 pxor	$inout4,$inout4
4014	lea	0x50($out),$out
4015	movdqa	$inout5,$inout0
4016	 pxor	$inout5,$inout5
4017	jmp	.Lcbc_dec_tail_collected
4018
4019.align	16
4020.Lcbc_dec_seven:
4021	movups	0x60($inp),$inout6
4022	xorps	$inout7,$inout7
4023	call	_aesni_decrypt8
4024	movups	0x50($inp),$inout7
4025	pxor	$iv,$inout0		# ^= IV
4026	movups	0x60($inp),$iv
4027	pxor	$in0,$inout1
4028	movdqu	$inout0,($out)
4029	pxor	$in1,$inout2
4030	movdqu	$inout1,0x10($out)
4031	 pxor	$inout1,$inout1		# clear register bank
4032	pxor	$in2,$inout3
4033	movdqu	$inout2,0x20($out)
4034	 pxor	$inout2,$inout2
4035	pxor	$in3,$inout4
4036	movdqu	$inout3,0x30($out)
4037	 pxor	$inout3,$inout3
4038	pxor	$in4,$inout5
4039	movdqu	$inout4,0x40($out)
4040	 pxor	$inout4,$inout4
4041	pxor	$inout7,$inout6
4042	movdqu	$inout5,0x50($out)
4043	 pxor	$inout5,$inout5
4044	lea	0x60($out),$out
4045	movdqa	$inout6,$inout0
4046	 pxor	$inout6,$inout6
4047	 pxor	$inout7,$inout7
4048	jmp	.Lcbc_dec_tail_collected
4049
4050.align	16
4051.Lcbc_dec_loop6:
4052	movups	$inout5,($out)
4053	lea	0x10($out),$out
4054	movdqu	0x00($inp),$inout0	# load input
4055	movdqu	0x10($inp),$inout1
4056	movdqa	$inout0,$in0
4057	movdqu	0x20($inp),$inout2
4058	movdqa	$inout1,$in1
4059	movdqu	0x30($inp),$inout3
4060	movdqa	$inout2,$in2
4061	movdqu	0x40($inp),$inout4
4062	movdqa	$inout3,$in3
4063	movdqu	0x50($inp),$inout5
4064	movdqa	$inout4,$in4
4065.Lcbc_dec_loop6_enter:
4066	lea	0x60($inp),$inp
4067	movdqa	$inout5,$inout6
4068
4069	call	_aesni_decrypt6
4070
4071	pxor	$iv,$inout0		# ^= IV
4072	movdqa	$inout6,$iv
4073	pxor	$in0,$inout1
4074	movdqu	$inout0,($out)
4075	pxor	$in1,$inout2
4076	movdqu	$inout1,0x10($out)
4077	pxor	$in2,$inout3
4078	movdqu	$inout2,0x20($out)
4079	pxor	$in3,$inout4
4080	mov	$key_,$key
4081	movdqu	$inout3,0x30($out)
4082	pxor	$in4,$inout5
4083	mov	$rnds_,$rounds
4084	movdqu	$inout4,0x40($out)
4085	lea	0x50($out),$out
4086	sub	\$0x60,$len
4087	ja	.Lcbc_dec_loop6
4088
4089	movdqa	$inout5,$inout0
4090	add	\$0x50,$len
4091	jle	.Lcbc_dec_clear_tail_collected
4092	movups	$inout5,($out)
4093	lea	0x10($out),$out
4094
4095.Lcbc_dec_tail:
4096	movups	($inp),$inout0
4097	sub	\$0x10,$len
4098	jbe	.Lcbc_dec_one		# $len is 1*16 or less
4099
4100	movups	0x10($inp),$inout1
4101	movaps	$inout0,$in0
4102	sub	\$0x10,$len
4103	jbe	.Lcbc_dec_two		# $len is 2*16 or less
4104
4105	movups	0x20($inp),$inout2
4106	movaps	$inout1,$in1
4107	sub	\$0x10,$len
4108	jbe	.Lcbc_dec_three		# $len is 3*16 or less
4109
4110	movups	0x30($inp),$inout3
4111	movaps	$inout2,$in2
4112	sub	\$0x10,$len
4113	jbe	.Lcbc_dec_four		# $len is 4*16 or less
4114
4115	movups	0x40($inp),$inout4	# $len is 5*16 or less
4116	movaps	$inout3,$in3
4117	movaps	$inout4,$in4
4118	xorps	$inout5,$inout5
4119	call	_aesni_decrypt6
4120	pxor	$iv,$inout0
4121	movaps	$in4,$iv
4122	pxor	$in0,$inout1
4123	movdqu	$inout0,($out)
4124	pxor	$in1,$inout2
4125	movdqu	$inout1,0x10($out)
4126	 pxor	$inout1,$inout1		# clear register bank
4127	pxor	$in2,$inout3
4128	movdqu	$inout2,0x20($out)
4129	 pxor	$inout2,$inout2
4130	pxor	$in3,$inout4
4131	movdqu	$inout3,0x30($out)
4132	 pxor	$inout3,$inout3
4133	lea	0x40($out),$out
4134	movdqa	$inout4,$inout0
4135	 pxor	$inout4,$inout4
4136	 pxor	$inout5,$inout5
4137	sub	\$0x10,$len
4138	jmp	.Lcbc_dec_tail_collected
4139
4140.align	16
4141.Lcbc_dec_one:
4142	movaps	$inout0,$in0
4143___
4144	&aesni_generate1("dec",$key,$rounds);
4145$code.=<<___;
4146	xorps	$iv,$inout0
4147	movaps	$in0,$iv
4148	jmp	.Lcbc_dec_tail_collected
4149.align	16
4150.Lcbc_dec_two:
4151	movaps	$inout1,$in1
4152	call	_aesni_decrypt2
4153	pxor	$iv,$inout0
4154	movaps	$in1,$iv
4155	pxor	$in0,$inout1
4156	movdqu	$inout0,($out)
4157	movdqa	$inout1,$inout0
4158	 pxor	$inout1,$inout1		# clear register bank
4159	lea	0x10($out),$out
4160	jmp	.Lcbc_dec_tail_collected
4161.align	16
4162.Lcbc_dec_three:
4163	movaps	$inout2,$in2
4164	call	_aesni_decrypt3
4165	pxor	$iv,$inout0
4166	movaps	$in2,$iv
4167	pxor	$in0,$inout1
4168	movdqu	$inout0,($out)
4169	pxor	$in1,$inout2
4170	movdqu	$inout1,0x10($out)
4171	 pxor	$inout1,$inout1		# clear register bank
4172	movdqa	$inout2,$inout0
4173	 pxor	$inout2,$inout2
4174	lea	0x20($out),$out
4175	jmp	.Lcbc_dec_tail_collected
4176.align	16
4177.Lcbc_dec_four:
4178	movaps	$inout3,$in3
4179	call	_aesni_decrypt4
4180	pxor	$iv,$inout0
4181	movaps	$in3,$iv
4182	pxor	$in0,$inout1
4183	movdqu	$inout0,($out)
4184	pxor	$in1,$inout2
4185	movdqu	$inout1,0x10($out)
4186	 pxor	$inout1,$inout1		# clear register bank
4187	pxor	$in2,$inout3
4188	movdqu	$inout2,0x20($out)
4189	 pxor	$inout2,$inout2
4190	movdqa	$inout3,$inout0
4191	 pxor	$inout3,$inout3
4192	lea	0x30($out),$out
4193	jmp	.Lcbc_dec_tail_collected
4194
4195.align	16
4196.Lcbc_dec_clear_tail_collected:
4197	pxor	$inout1,$inout1		# clear register bank
4198	pxor	$inout2,$inout2
4199	pxor	$inout3,$inout3
4200___
4201$code.=<<___ if (!$win64);
4202	pxor	$inout4,$inout4		# %xmm6..9
4203	pxor	$inout5,$inout5
4204	pxor	$inout6,$inout6
4205	pxor	$inout7,$inout7
4206___
4207$code.=<<___;
4208.Lcbc_dec_tail_collected:
4209	movups	$iv,($ivp)
4210	and	\$15,$len
4211	jnz	.Lcbc_dec_tail_partial
4212	movups	$inout0,($out)
4213	pxor	$inout0,$inout0
4214	jmp	.Lcbc_dec_ret
4215.align	16
4216.Lcbc_dec_tail_partial:
4217	movaps	$inout0,(%rsp)
4218	pxor	$inout0,$inout0
4219	mov	\$16,%rcx
4220	mov	$out,%rdi
4221	sub	$len,%rcx
4222	lea	(%rsp),%rsi
4223	.long	0x9066A4F3		# rep movsb
4224	movdqa	$inout0,(%rsp)
4225
4226.Lcbc_dec_ret:
4227	xorps	$rndkey0,$rndkey0	# %xmm0
4228	pxor	$rndkey1,$rndkey1
4229___
4230$code.=<<___ if ($win64);
4231	movaps	0x10(%rsp),%xmm6
4232	movaps	%xmm0,0x10(%rsp)	# clear stack
4233	movaps	0x20(%rsp),%xmm7
4234	movaps	%xmm0,0x20(%rsp)
4235	movaps	0x30(%rsp),%xmm8
4236	movaps	%xmm0,0x30(%rsp)
4237	movaps	0x40(%rsp),%xmm9
4238	movaps	%xmm0,0x40(%rsp)
4239	movaps	0x50(%rsp),%xmm10
4240	movaps	%xmm0,0x50(%rsp)
4241	movaps	0x60(%rsp),%xmm11
4242	movaps	%xmm0,0x60(%rsp)
4243	movaps	0x70(%rsp),%xmm12
4244	movaps	%xmm0,0x70(%rsp)
4245	movaps	0x80(%rsp),%xmm13
4246	movaps	%xmm0,0x80(%rsp)
4247	movaps	0x90(%rsp),%xmm14
4248	movaps	%xmm0,0x90(%rsp)
4249	movaps	0xa0(%rsp),%xmm15
4250	movaps	%xmm0,0xa0(%rsp)
4251___
4252$code.=<<___;
4253	mov	-8(%r11),%rbp
4254.cfi_restore	%rbp
4255	lea	(%r11),%rsp
4256.cfi_def_cfa_register	%rsp
4257.Lcbc_ret:
4258	ret
4259.cfi_endproc
4260.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4261___
4262}
4263# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
4264#				int bits, AES_KEY *key)
4265#
4266# input:	$inp	user-supplied key
4267#		$bits	$inp length in bits
4268#		$key	pointer to key schedule
4269# output:	%eax	0 denoting success, -1 or -2 denoting failure (see C)
4270#		*$key	key schedule
4271#
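# Illustrative C call sequence (a documentation sketch; the symbol name
# assumes $PREFIX is "aes_hw" and user_key is caller-provided):
#
#	AES_KEY dks;
#	int rc = aes_hw_set_decrypt_key(user_key, 128, &dks);
#	/* rc: 0 on success, -1 for a NULL argument, -2 for bad key bits;
#	   dks then holds the inverse-cipher schedule used by the aesdec paths */
#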
4272{ my ($inp,$bits,$key) = @_4args;
4273  $bits =~ s/%r/%e/;
4274
4275$code.=<<___;
4276.globl	${PREFIX}_set_decrypt_key
4277.type	${PREFIX}_set_decrypt_key,\@abi-omnipotent
4278.align	16
4279${PREFIX}_set_decrypt_key:
4280.cfi_startproc
4281	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
4282.cfi_adjust_cfa_offset	8
4283	call	__aesni_set_encrypt_key
4284	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
4285	test	%eax,%eax
4286	jnz	.Ldec_key_ret
4287	lea	16($key,$bits),$inp	# points at the end of key schedule
4288
4289	$movkey	($key),%xmm0		# just swap
4290	$movkey	($inp),%xmm1
4291	$movkey	%xmm0,($inp)
4292	$movkey	%xmm1,($key)
4293	lea	16($key),$key
4294	lea	-16($inp),$inp
4295
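	# walk $key forward and $inp backward toward the middle, swapping the
	# round keys end for end and passing each through aesimc
	# (InvMixColumns) to build the equivalent-inverse-cipher schedule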
4296.Ldec_key_inverse:
4297	$movkey	($key),%xmm0		# swap and inverse
4298	$movkey	($inp),%xmm1
4299	aesimc	%xmm0,%xmm0
4300	aesimc	%xmm1,%xmm1
4301	lea	16($key),$key
4302	lea	-16($inp),$inp
4303	$movkey	%xmm0,16($inp)
4304	$movkey	%xmm1,-16($key)
4305	cmp	$key,$inp
4306	ja	.Ldec_key_inverse
4307
4308	$movkey	($key),%xmm0		# inverse middle
4309	aesimc	%xmm0,%xmm0
4310	pxor	%xmm1,%xmm1
4311	$movkey	%xmm0,($inp)
4312	pxor	%xmm0,%xmm0
4313.Ldec_key_ret:
4314	add	\$8,%rsp
4315.cfi_adjust_cfa_offset	-8
4316	ret
4317.cfi_endproc
4318.LSEH_end_set_decrypt_key:
4319.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4320___
4321
4322# This is based on submission from Intel by
4323#	Huang Ying
4324#	Vinodh Gopal
4325#	Kahraman Akdemir
4326#
4327# Aggressively optimized with respect to aeskeygenassist's critical path;
4328# the working set is contained in %xmm0-5 to meet the Win64 ABI requirement.
4329#
4330# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4331#				int bits, AES_KEY * const key);
4332#
4333# input:	$inp	user-supplied key
4334#		$bits	$inp length in bits
4335#		$key	pointer to key schedule
4336# output:	%eax	0 denoting success, -1 or -2 denoting failure (see C)
4337#		$bits	rounds-1 (used in aesni_set_decrypt_key)
4338#		*$key	key schedule
4339#		$key	pointer to key schedule (used in
4340#			aesni_set_decrypt_key)
4341#
4342# The subroutine is frame-less, which means that only volatile registers
4343# are used. Note that it's declared "abi-omnipotent", which means that
4344# the set of volatile registers is smaller on Windows.
4345#
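# Illustrative C call sequence (a documentation sketch; the symbol name
# assumes $PREFIX is "aes_hw" and user_key is caller-provided):
#
#	AES_KEY ks;
#	int rc = aes_hw_set_encrypt_key(user_key, bits, &ks); /* bits: 128/192/256 */
#	/* rc: 0 on success, -1 for a NULL argument, -2 for any other bit length */
#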
4346$code.=<<___;
4347.globl	${PREFIX}_set_encrypt_key
4348.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
4349.align	16
4350${PREFIX}_set_encrypt_key:
4351__aesni_set_encrypt_key:
4352.cfi_startproc
4353#ifdef BORINGSSL_DISPATCH_TEST
4354	movb \$1,BORINGSSL_function_hit+3(%rip)
4355#endif
4356	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
4357.cfi_adjust_cfa_offset	8
4358	mov	\$-1,%rax
4359	test	$inp,$inp
4360	jz	.Lenc_key_ret
4361	test	$key,$key
4362	jz	.Lenc_key_ret
4363
4364	movups	($inp),%xmm0		# pull first 128 bits of *userKey
4365	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
4366	leaq	OPENSSL_ia32cap_P(%rip),%r10
4367	movl	4(%r10),%r10d
4368	and	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
4369	lea	16($key),%rax		# %rax is used as modifiable copy of $key
4370	cmp	\$256,$bits
4371	je	.L14rounds
4372	cmp	\$192,$bits
4373	je	.L12rounds
4374	cmp	\$128,$bits
4375	jne	.Lbad_keybits
4376
4377.L10rounds:
4378	mov	\$9,$bits			# 10 rounds for 128-bit key
4379	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4380	je	.L10rounds_alt
4381
4382	$movkey	%xmm0,($key)			# round 0
4383	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
4384	call		.Lkey_expansion_128_cold
4385	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
4386	call		.Lkey_expansion_128
4387	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
4388	call		.Lkey_expansion_128
4389	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
4390	call		.Lkey_expansion_128
4391	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
4392	call		.Lkey_expansion_128
4393	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
4394	call		.Lkey_expansion_128
4395	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
4396	call		.Lkey_expansion_128
4397	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
4398	call		.Lkey_expansion_128
4399	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
4400	call		.Lkey_expansion_128
4401	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
4402	call		.Lkey_expansion_128
4403	$movkey	%xmm0,(%rax)
4404	mov	$bits,80(%rax)	# 240(%rdx)
4405	xor	%eax,%eax
4406	jmp	.Lenc_key_ret
4407
4408.align	16
4409.L10rounds_alt:
4410	movdqa	.Lkey_rotate(%rip),%xmm5
4411	mov	\$8,%r10d
4412	movdqa	.Lkey_rcon1(%rip),%xmm4
4413	movdqa	%xmm0,%xmm2
4414	movdqu	%xmm0,($key)
4415	jmp	.Loop_key128
4416
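	# In the "_alt" schedule (entered when the CPUID check above saw AVX
	# but not XOP), SubWord(RotWord()) is built without aeskeygenassist:
	# the pshufb with .Lkey_rotate broadcasts RotWord() of the last word
	# of the previous round key into every dword, and with all columns
	# equal aesenclast reduces to SubBytes plus the XOR with the round
	# constant kept in %xmm4.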
4417.align	16
4418.Loop_key128:
4419	pshufb		%xmm5,%xmm0
4420	aesenclast	%xmm4,%xmm0
4421	pslld		\$1,%xmm4
4422	lea		16(%rax),%rax
4423
4424	movdqa		%xmm2,%xmm3
4425	pslldq		\$4,%xmm2
4426	pxor		%xmm2,%xmm3
4427	pslldq		\$4,%xmm2
4428	pxor		%xmm2,%xmm3
4429	pslldq		\$4,%xmm2
4430	pxor		%xmm3,%xmm2
4431
4432	pxor		%xmm2,%xmm0
4433	movdqu		%xmm0,-16(%rax)
4434	movdqa		%xmm0,%xmm2
4435
4436	dec	%r10d
4437	jnz	.Loop_key128
4438
4439	movdqa		.Lkey_rcon1b(%rip),%xmm4
4440
4441	pshufb		%xmm5,%xmm0
4442	aesenclast	%xmm4,%xmm0
4443	pslld		\$1,%xmm4
4444
4445	movdqa		%xmm2,%xmm3
4446	pslldq		\$4,%xmm2
4447	pxor		%xmm2,%xmm3
4448	pslldq		\$4,%xmm2
4449	pxor		%xmm2,%xmm3
4450	pslldq		\$4,%xmm2
4451	pxor		%xmm3,%xmm2
4452
4453	pxor		%xmm2,%xmm0
4454	movdqu		%xmm0,(%rax)
4455
4456	movdqa		%xmm0,%xmm2
4457	pshufb		%xmm5,%xmm0
4458	aesenclast	%xmm4,%xmm0
4459
4460	movdqa		%xmm2,%xmm3
4461	pslldq		\$4,%xmm2
4462	pxor		%xmm2,%xmm3
4463	pslldq		\$4,%xmm2
4464	pxor		%xmm2,%xmm3
4465	pslldq		\$4,%xmm2
4466	pxor		%xmm3,%xmm2
4467
4468	pxor		%xmm2,%xmm0
4469	movdqu		%xmm0,16(%rax)
4470
4471	mov	$bits,96(%rax)	# 240($key)
4472	xor	%eax,%eax
4473	jmp	.Lenc_key_ret
4474
4475.align	16
4476.L12rounds:
4477	movq	16($inp),%xmm2			# remaining 1/3 of *userKey
4478	mov	\$11,$bits			# 12 rounds for 192
4479	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4480	je	.L12rounds_alt
4481
4482	$movkey	%xmm0,($key)			# round 0
4483	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 1,2
4484	call		.Lkey_expansion_192a_cold
4485	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 2,3
4486	call		.Lkey_expansion_192b
4487	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 4,5
4488	call		.Lkey_expansion_192a
4489	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 5,6
4490	call		.Lkey_expansion_192b
4491	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 7,8
4492	call		.Lkey_expansion_192a
4493	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 8,9
4494	call		.Lkey_expansion_192b
4495	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 10,11
4496	call		.Lkey_expansion_192a
4497	aeskeygenassist	\$0x80,%xmm2,%xmm1	# round 11,12
4498	call		.Lkey_expansion_192b
4499	$movkey	%xmm0,(%rax)
4500	mov	$bits,48(%rax)	# 240(%rdx)
4501	xor	%rax, %rax
4502	jmp	.Lenc_key_ret
4503
4504.align	16
4505.L12rounds_alt:
4506	movdqa	.Lkey_rotate192(%rip),%xmm5
4507	movdqa	.Lkey_rcon1(%rip),%xmm4
4508	mov	\$8,%r10d
4509	movdqu	%xmm0,($key)
4510	jmp	.Loop_key192
4511
4512.align	16
4513.Loop_key192:
4514	movq		%xmm2,0(%rax)
4515	movdqa		%xmm2,%xmm1
4516	pshufb		%xmm5,%xmm2
4517	aesenclast	%xmm4,%xmm2
4518	pslld		\$1, %xmm4
4519	lea		24(%rax),%rax
4520
4521	movdqa		%xmm0,%xmm3
4522	pslldq		\$4,%xmm0
4523	pxor		%xmm0,%xmm3
4524	pslldq		\$4,%xmm0
4525	pxor		%xmm0,%xmm3
4526	pslldq		\$4,%xmm0
4527	pxor		%xmm3,%xmm0
4528
4529	pshufd		\$0xff,%xmm0,%xmm3
4530	pxor		%xmm1,%xmm3
4531	pslldq		\$4,%xmm1
4532	pxor		%xmm1,%xmm3
4533
4534	pxor		%xmm2,%xmm0
4535	pxor		%xmm3,%xmm2
4536	movdqu		%xmm0,-16(%rax)
4537
4538	dec	%r10d
4539	jnz	.Loop_key192
4540
4541	mov	$bits,32(%rax)	# 240($key)
4542	xor	%eax,%eax
4543	jmp	.Lenc_key_ret
4544
4545.align	16
4546.L14rounds:
4547	movups	16($inp),%xmm2			# remaining half of *userKey
4548	mov	\$13,$bits			# 14 rounds for 256
4549	lea	16(%rax),%rax
4550	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
4551	je	.L14rounds_alt
4552
4553	$movkey	%xmm0,($key)			# round 0
4554	$movkey	%xmm2,16($key)			# round 1
4555	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
4556	call		.Lkey_expansion_256a_cold
4557	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
4558	call		.Lkey_expansion_256b
4559	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
4560	call		.Lkey_expansion_256a
4561	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
4562	call		.Lkey_expansion_256b
4563	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
4564	call		.Lkey_expansion_256a
4565	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
4566	call		.Lkey_expansion_256b
4567	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
4568	call		.Lkey_expansion_256a
4569	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
4570	call		.Lkey_expansion_256b
4571	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
4572	call		.Lkey_expansion_256a
4573	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
4574	call		.Lkey_expansion_256b
4575	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
4576	call		.Lkey_expansion_256a
4577	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
4578	call		.Lkey_expansion_256b
4579	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
4580	call		.Lkey_expansion_256a
4581	$movkey	%xmm0,(%rax)
4582	mov	$bits,16(%rax)	# 240(%rdx)
4583	xor	%rax,%rax
4584	jmp	.Lenc_key_ret
4585
4586.align	16
4587.L14rounds_alt:
4588	movdqa	.Lkey_rotate(%rip),%xmm5
4589	movdqa	.Lkey_rcon1(%rip),%xmm4
4590	mov	\$7,%r10d
4591	movdqu	%xmm0,0($key)
4592	movdqa	%xmm2,%xmm1
4593	movdqu	%xmm2,16($key)
4594	jmp	.Loop_key256
4595
4596.align	16
4597.Loop_key256:
4598	pshufb		%xmm5,%xmm2
4599	aesenclast	%xmm4,%xmm2
4600
4601	movdqa		%xmm0,%xmm3
4602	pslldq		\$4,%xmm0
4603	pxor		%xmm0,%xmm3
4604	pslldq		\$4,%xmm0
4605	pxor		%xmm0,%xmm3
4606	pslldq		\$4,%xmm0
4607	pxor		%xmm3,%xmm0
4608	pslld		\$1,%xmm4
4609
4610	pxor		%xmm2,%xmm0
4611	movdqu		%xmm0,(%rax)
4612
4613	dec	%r10d
4614	jz	.Ldone_key256
4615
4616	pshufd		\$0xff,%xmm0,%xmm2
4617	pxor		%xmm3,%xmm3
4618	aesenclast	%xmm3,%xmm2
4619
4620	movdqa		%xmm1,%xmm3
4621	pslldq		\$4,%xmm1
4622	pxor		%xmm1,%xmm3
4623	pslldq		\$4,%xmm1
4624	pxor		%xmm1,%xmm3
4625	pslldq		\$4,%xmm1
4626	pxor		%xmm3,%xmm1
4627
4628	pxor		%xmm1,%xmm2
4629	movdqu		%xmm2,16(%rax)
4630	lea		32(%rax),%rax
4631	movdqa		%xmm2,%xmm1
4632
4633	jmp	.Loop_key256
4634
4635.Ldone_key256:
4636	mov	$bits,16(%rax)	# 240($key)
4637	xor	%eax,%eax
4638	jmp	.Lenc_key_ret
4639
4640.align	16
4641.Lbad_keybits:
4642	mov	\$-2,%rax
4643.Lenc_key_ret:
4644	pxor	%xmm0,%xmm0
4645	pxor	%xmm1,%xmm1
4646	pxor	%xmm2,%xmm2
4647	pxor	%xmm3,%xmm3
4648	pxor	%xmm4,%xmm4
4649	pxor	%xmm5,%xmm5
4650	add	\$8,%rsp
4651.cfi_adjust_cfa_offset	-8
4652	ret
4653.cfi_endproc
4654.LSEH_end_set_encrypt_key:
4655
4656.align	16
4657.Lkey_expansion_128:
4658	$movkey	%xmm0,(%rax)
4659	lea	16(%rax),%rax
4660.Lkey_expansion_128_cold:
4661	shufps	\$0b00010000,%xmm0,%xmm4
4662	xorps	%xmm4, %xmm0
4663	shufps	\$0b10001100,%xmm0,%xmm4
4664	xorps	%xmm4, %xmm0
4665	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
4666	xorps	%xmm1,%xmm0
4667	ret
4668
4669.align 16
4670.Lkey_expansion_192a:
4671	$movkey	%xmm0,(%rax)
4672	lea	16(%rax),%rax
4673.Lkey_expansion_192a_cold:
4674	movaps	%xmm2, %xmm5
4675.Lkey_expansion_192b_warm:
4676	shufps	\$0b00010000,%xmm0,%xmm4
4677	movdqa	%xmm2,%xmm3
4678	xorps	%xmm4,%xmm0
4679	shufps	\$0b10001100,%xmm0,%xmm4
4680	pslldq	\$4,%xmm3
4681	xorps	%xmm4,%xmm0
4682	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
4683	pxor	%xmm3,%xmm2
4684	pxor	%xmm1,%xmm0
4685	pshufd	\$0b11111111,%xmm0,%xmm3
4686	pxor	%xmm3,%xmm2
4687	ret
4688
4689.align 16
4690.Lkey_expansion_192b:
4691	movaps	%xmm0,%xmm3
4692	shufps	\$0b01000100,%xmm0,%xmm5
4693	$movkey	%xmm5,(%rax)
4694	shufps	\$0b01001110,%xmm2,%xmm3
4695	$movkey	%xmm3,16(%rax)
4696	lea	32(%rax),%rax
4697	jmp	.Lkey_expansion_192b_warm
4698
4699.align	16
4700.Lkey_expansion_256a:
4701	$movkey	%xmm2,(%rax)
4702	lea	16(%rax),%rax
4703.Lkey_expansion_256a_cold:
4704	shufps	\$0b00010000,%xmm0,%xmm4
4705	xorps	%xmm4,%xmm0
4706	shufps	\$0b10001100,%xmm0,%xmm4
4707	xorps	%xmm4,%xmm0
4708	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
4709	xorps	%xmm1,%xmm0
4710	ret
4711
4712.align 16
4713.Lkey_expansion_256b:
4714	$movkey	%xmm0,(%rax)
4715	lea	16(%rax),%rax
4716
4717	shufps	\$0b00010000,%xmm2,%xmm4
4718	xorps	%xmm4,%xmm2
4719	shufps	\$0b10001100,%xmm2,%xmm4
4720	xorps	%xmm4,%xmm2
4721	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
4722	xorps	%xmm1,%xmm2
4723	ret
4724.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
4725.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
4726___
4727}
4728
4729$code.=<<___;
4730.align	64
4731.Lbswap_mask:
4732	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
4733.Lincrement32:
4734	.long	6,6,6,0
4735.Lincrement64:
4736	.long	1,0,0,0
4737.Lxts_magic:
4738	.long	0x87,0,1,0
4739.Lincrement1:
4740	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4741.Lkey_rotate:
4742	.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
4743.Lkey_rotate192:
4744	.long	0x04070605,0x04070605,0x04070605,0x04070605
4745.Lkey_rcon1:
4746	.long	1,1,1,1
4747.Lkey_rcon1b:
4748	.long	0x1b,0x1b,0x1b,0x1b
4749
4750.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4751.align	64
4752___
4753
4754# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4755#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
4756if ($win64) {
4757$rec="%rcx";
4758$frame="%rdx";
4759$context="%r8";
4760$disp="%r9";
4761
4762$code.=<<___;
4763.extern	__imp_RtlVirtualUnwind
4764___
4765$code.=<<___ if ($PREFIX eq "aes_hw");
4766.type	ecb_ccm64_se_handler,\@abi-omnipotent
4767.align	16
4768ecb_ccm64_se_handler:
4769	push	%rsi
4770	push	%rdi
4771	push	%rbx
4772	push	%rbp
4773	push	%r12
4774	push	%r13
4775	push	%r14
4776	push	%r15
4777	pushfq
4778	sub	\$64,%rsp
4779
4780	mov	120($context),%rax	# pull context->Rax
4781	mov	248($context),%rbx	# pull context->Rip
4782
4783	mov	8($disp),%rsi		# disp->ImageBase
4784	mov	56($disp),%r11		# disp->HandlerData
4785
4786	mov	0(%r11),%r10d		# HandlerData[0]
4787	lea	(%rsi,%r10),%r10	# prologue label
4788	cmp	%r10,%rbx		# context->Rip<prologue label
4789	jb	.Lcommon_seh_tail
4790
4791	mov	152($context),%rax	# pull context->Rsp
4792
4793	mov	4(%r11),%r10d		# HandlerData[1]
4794	lea	(%rsi,%r10),%r10	# epilogue label
4795	cmp	%r10,%rbx		# context->Rip>=epilogue label
4796	jae	.Lcommon_seh_tail
4797
4798	lea	0(%rax),%rsi		# %xmm save area
4799	lea	512($context),%rdi	# &context.Xmm6
4800	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
4801	.long	0xa548f3fc		# cld; rep movsq
4802	lea	0x58(%rax),%rax		# adjust stack pointer
4803
4804	jmp	.Lcommon_seh_tail
4805.size	ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
4806
4807.type	ctr_xts_se_handler,\@abi-omnipotent
4808.align	16
4809ctr_xts_se_handler:
4810	push	%rsi
4811	push	%rdi
4812	push	%rbx
4813	push	%rbp
4814	push	%r12
4815	push	%r13
4816	push	%r14
4817	push	%r15
4818	pushfq
4819	sub	\$64,%rsp
4820
4821	mov	120($context),%rax	# pull context->Rax
4822	mov	248($context),%rbx	# pull context->Rip
4823
4824	mov	8($disp),%rsi		# disp->ImageBase
4825	mov	56($disp),%r11		# disp->HandlerData
4826
4827	mov	0(%r11),%r10d		# HandlerData[0]
4828	lea	(%rsi,%r10),%r10	# prologue label
4829	cmp	%r10,%rbx		# context->Rip<prologue label
4830	jb	.Lcommon_seh_tail
4831
4832	mov	152($context),%rax	# pull context->Rsp
4833
4834	mov	4(%r11),%r10d		# HandlerData[1]
4835	lea	(%rsi,%r10),%r10	# epilogue label
4836	cmp	%r10,%rbx		# context->Rip>=epilogue label
4837	jae	.Lcommon_seh_tail
4838
4839	mov	208($context),%rax	# pull context->R11
4840
4841	lea	-0xa8(%rax),%rsi	# %xmm save area
4842	lea	512($context),%rdi	# & context.Xmm6
4843	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4844	.long	0xa548f3fc		# cld; rep movsq
4845
4846	mov	-8(%rax),%rbp		# restore saved %rbp
4847	mov	%rbp,160($context)	# restore context->Rbp
4848	jmp	.Lcommon_seh_tail
4849.size	ctr_xts_se_handler,.-ctr_xts_se_handler
4850
4851___
4852# BoringSSL omits the OCB functions.
4853$code.=<<___ if (0);
4854.type	ocb_se_handler,\@abi-omnipotent
4855.align	16
4856ocb_se_handler:
4857	push	%rsi
4858	push	%rdi
4859	push	%rbx
4860	push	%rbp
4861	push	%r12
4862	push	%r13
4863	push	%r14
4864	push	%r15
4865	pushfq
4866	sub	\$64,%rsp
4867
4868	mov	120($context),%rax	# pull context->Rax
4869	mov	248($context),%rbx	# pull context->Rip
4870
4871	mov	8($disp),%rsi		# disp->ImageBase
4872	mov	56($disp),%r11		# disp->HandlerData
4873
4874	mov	0(%r11),%r10d		# HandlerData[0]
4875	lea	(%rsi,%r10),%r10	# prologue label
4876	cmp	%r10,%rbx		# context->Rip<prologue label
4877	jb	.Lcommon_seh_tail
4878
4879	mov	4(%r11),%r10d		# HandlerData[1]
4880	lea	(%rsi,%r10),%r10	# epilogue label
4881	cmp	%r10,%rbx		# context->Rip>=epilogue label
4882	jae	.Lcommon_seh_tail
4883
4884	mov	8(%r11),%r10d		# HandlerData[2]
4885	lea	(%rsi,%r10),%r10
4886	cmp	%r10,%rbx		# context->Rip>=pop label
4887	jae	.Locb_no_xmm
4888
4889	mov	152($context),%rax	# pull context->Rsp
4890
4891	lea	(%rax),%rsi		# %xmm save area
4892	lea	512($context),%rdi	# & context.Xmm6
4893	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4894	.long	0xa548f3fc		# cld; rep movsq
4895	lea	0xa0+0x28(%rax),%rax
4896
4897.Locb_no_xmm:
4898	mov	-8(%rax),%rbx
4899	mov	-16(%rax),%rbp
4900	mov	-24(%rax),%r12
4901	mov	-32(%rax),%r13
4902	mov	-40(%rax),%r14
4903
4904	mov	%rbx,144($context)	# restore context->Rbx
4905	mov	%rbp,160($context)	# restore context->Rbp
4906	mov	%r12,216($context)	# restore context->R12
4907	mov	%r13,224($context)	# restore context->R13
4908	mov	%r14,232($context)	# restore context->R14
4909
4910	jmp	.Lcommon_seh_tail
4911.size	ocb_se_handler,.-ocb_se_handler
4912___
4913$code.=<<___;
4914.type	cbc_se_handler,\@abi-omnipotent
4915.align	16
4916cbc_se_handler:
4917	push	%rsi
4918	push	%rdi
4919	push	%rbx
4920	push	%rbp
4921	push	%r12
4922	push	%r13
4923	push	%r14
4924	push	%r15
4925	pushfq
4926	sub	\$64,%rsp
4927
4928	mov	152($context),%rax	# pull context->Rsp
4929	mov	248($context),%rbx	# pull context->Rip
4930
4931	lea	.Lcbc_decrypt_bulk(%rip),%r10
4932	cmp	%r10,%rbx		# context->Rip<"prologue" label
4933	jb	.Lcommon_seh_tail
4934
4935	mov	120($context),%rax	# pull context->Rax
4936
4937	lea	.Lcbc_decrypt_body(%rip),%r10
4938	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
4939	jb	.Lcommon_seh_tail
4940
4941	mov	152($context),%rax	# pull context->Rsp
4942
4943	lea	.Lcbc_ret(%rip),%r10
4944	cmp	%r10,%rbx		# context->Rip>="epilogue" label
4945	jae	.Lcommon_seh_tail
4946
4947	lea	16(%rax),%rsi		# %xmm save area
4948	lea	512($context),%rdi	# &context.Xmm6
4949	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
4950	.long	0xa548f3fc		# cld; rep movsq
4951
4952	mov	208($context),%rax	# pull context->R11
4953
4954	mov	-8(%rax),%rbp		# restore saved %rbp
4955	mov	%rbp,160($context)	# restore context->Rbp
4956
4957.Lcommon_seh_tail:
4958	mov	8(%rax),%rdi
4959	mov	16(%rax),%rsi
4960	mov	%rax,152($context)	# restore context->Rsp
4961	mov	%rsi,168($context)	# restore context->Rsi
4962	mov	%rdi,176($context)	# restore context->Rdi
4963
4964	mov	40($disp),%rdi		# disp->ContextRecord
4965	mov	$context,%rsi		# context
4966	mov	\$154,%ecx		# sizeof(CONTEXT)
4967	.long	0xa548f3fc		# cld; rep movsq
4968
4969	mov	$disp,%rsi
4970	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
4971	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
4972	mov	0(%rsi),%r8		# arg3, disp->ControlPc
4973	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
4974	mov	40(%rsi),%r10		# disp->ContextRecord
4975	lea	56(%rsi),%r11		# &disp->HandlerData
4976	lea	24(%rsi),%r12		# &disp->EstablisherFrame
4977	mov	%r10,32(%rsp)		# arg5
4978	mov	%r11,40(%rsp)		# arg6
4979	mov	%r12,48(%rsp)		# arg7
4980	mov	%rcx,56(%rsp)		# arg8, (NULL)
4981	call	*__imp_RtlVirtualUnwind(%rip)
4982
4983	mov	\$1,%eax		# ExceptionContinueSearch
4984	add	\$64,%rsp
4985	popfq
4986	pop	%r15
4987	pop	%r14
4988	pop	%r13
4989	pop	%r12
4990	pop	%rbp
4991	pop	%rbx
4992	pop	%rdi
4993	pop	%rsi
4994	ret
4995.size	cbc_se_handler,.-cbc_se_handler
4996
4997.section	.pdata
4998.align	4
4999___
5000$code.=<<___ if ($PREFIX eq "aes_hw");
5001	.rva	.LSEH_begin_${PREFIX}_ecb_encrypt
5002	.rva	.LSEH_end_${PREFIX}_ecb_encrypt
5003	.rva	.LSEH_info_ecb
5004
5005	.rva	.LSEH_begin_${PREFIX}_ctr32_encrypt_blocks
5006	.rva	.LSEH_end_${PREFIX}_ctr32_encrypt_blocks
5007	.rva	.LSEH_info_ctr32
5008___
5009$code.=<<___;
5010	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
5011	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
5012	.rva	.LSEH_info_cbc
5013
5014	.rva	${PREFIX}_set_decrypt_key
5015	.rva	.LSEH_end_set_decrypt_key
5016	.rva	.LSEH_info_key
5017
5018	.rva	${PREFIX}_set_encrypt_key
5019	.rva	.LSEH_end_set_encrypt_key
5020	.rva	.LSEH_info_key
5021.section	.xdata
5022.align	8
5023___
5024$code.=<<___ if ($PREFIX eq "aes_hw");
5025.LSEH_info_ecb:
5026	.byte	9,0,0,0
5027	.rva	ecb_ccm64_se_handler
5028	.rva	.Lecb_enc_body,.Lecb_enc_ret		# HandlerData[]
5029.LSEH_info_ctr32:
5030	.byte	9,0,0,0
5031	.rva	ctr_xts_se_handler
5032	.rva	.Lctr32_body,.Lctr32_epilogue		# HandlerData[]
5033___
5034$code.=<<___;
5035.LSEH_info_cbc:
5036	.byte	9,0,0,0
5037	.rva	cbc_se_handler
5038.LSEH_info_key:
5039	.byte	0x01,0x04,0x01,0x00
5040	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
5041___
5042}
5043
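# The helpers below rewrite AES-NI and MOVBE mnemonics as raw .byte
# sequences so that the generated code can still be digested by assemblers
# which do not know these instructions. As an illustration of the encoding
# rules in aesni() below, a register-to-register
#
#	aesenc	%xmm1,%xmm0
#
# comes out as the 0x66 prefix, the 0x0f,0x38 escape, opcode 0xdc and
# ModR/M 0xc1:
#
#	.byte	0x66,0x0f,0x38,0xdc,0xc1
#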
5044sub rex {
5045  local *opcode=shift;
5046  my ($dst,$src)=@_;
5047  my $rex=0;
5048
5049    $rex|=0x04			if($dst>=8);
5050    $rex|=0x01			if($src>=8);
5051    push @opcode,$rex|0x40	if($rex);
5052}
5053
5054sub aesni {
5055  my $line=shift;
5056  my @opcode=(0x66);
5057
5058    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
5059	rex(\@opcode,$4,$3);
5060	push @opcode,0x0f,0x3a,0xdf;
5061	push @opcode,0xc0|($3&7)|(($4&7)<<3);	# ModR/M
5062	my $c=$2;
5063	push @opcode,$c=~/^0/?oct($c):$c;
5064	return ".byte\t".join(',',@opcode);
5065    }
5066    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
5067	my %opcodelet = (
5068		"aesimc" => 0xdb,
5069		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
5070		"aesdec" => 0xde,	"aesdeclast" => 0xdf
5071	);
5072	return undef if (!defined($opcodelet{$1}));
5073	rex(\@opcode,$3,$2);
5074	push @opcode,0x0f,0x38,$opcodelet{$1};
5075	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
5076	return ".byte\t".join(',',@opcode);
5077    }
5078    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
5079	my %opcodelet = (
5080		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
5081		"aesdec" => 0xde,	"aesdeclast" => 0xdf
5082	);
5083	return undef if (!defined($opcodelet{$1}));
5084	my $off = $2;
5085	push @opcode,0x44 if ($3>=8);
5086	push @opcode,0x0f,0x38,$opcodelet{$1};
5087	push @opcode,0x44|(($3&7)<<3),0x24;	# ModR/M
5088	push @opcode,($off=~/^0/?oct($off):$off)&0xff;
5089	return ".byte\t".join(',',@opcode);
5090    }
5091    return $line;
5092}
5093
5094sub movbe {
5095	".byte	0x0f,0x38,0xf1,0x44,0x24,".shift;
5096}
5097
5098$code =~ s/\`([^\`]*)\`/eval($1)/gem;
5099$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
5100#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
5101$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
5102
5103print $code;
5104
5105close STDOUT or die "error closing STDOUT";
5106