• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#! /usr/bin/env perl
2# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
20# details].
21#
22# Performance.
23#
24# Given aes(enc|dec) instructions' latency asymptotic performance for
25# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
26# processed with 128-bit key. And given their throughput asymptotic
27# performance for parallelizable modes is 1.25 cycles per byte. Being
28# asymptotic limit it's not something you commonly achieve in reality,
29# but how close does one get? Below are results collected for
30# different modes and block sizes. Pairs of numbers are for en-/
31# decryption.
32#
33#	16-byte     64-byte     256-byte    1-KB        8-KB
34# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
35# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
36# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
37# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
38# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
39# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
40#
41# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
43# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
44# The results were collected with specially crafted speed.c benchmark
45# in order to compare them with results reported in "Intel Advanced
46# Encryption Standard (AES) New Instruction Set" White Paper Revision
47# 3.0 dated May 2010. All above results are consistently better. This
48# module also provides better performance for block sizes smaller than
49# 128 bytes in points *not* represented in the above table.
50#
51# Looking at the results for 8-KB buffer.
52#
53# CFB and OFB results are far from the limit, because implementation
54# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55# single-block aesni_encrypt, which is not the most optimal way to go.
56# CBC encrypt result is unexpectedly high and there is no documented
57# explanation for it. Seemingly there is a small penalty for feeding
58# the result back to AES unit the way it's done in CBC mode. There is
59# nothing one can do and the result appears optimal. CCM result is
60# identical to CBC, because CBC-MAC is essentially CBC encrypt without
61# saving output. CCM CTR "stays invisible," because it's neatly
62# interleaved with CBC-MAC. This provides ~30% improvement over
63# "straightforward" CCM implementation with CTR and CBC-MAC performed
64# disjointly. Parallelizable modes practically achieve the theoretical
65# limit.
66#
67# Looking at how results vary with buffer size.
68#
69# Curves are practically saturated at 1-KB buffer size. In most cases
70# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
71# CTR curve doesn't follow this pattern and is "slowest" changing one
72# with "256-byte" result being 87% of "8-KB." This is because overhead
73# in CTR mode is most computationally intensive. Small-block CCM
74# decrypt is slower than encrypt, because first CTR and last CBC-MAC
75# iterations can't be interleaved.
76#
77# Results for 192- and 256-bit keys.
78#
79# EVP-free results were observed to scale perfectly with number of
80# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
81# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
82# are a tad smaller, because the above mentioned penalty biases all
83# results by same constant value. In similar way function call
84# overhead affects small-block performance, as well as OFB and CFB
85# results. Differences are not large, most common coefficients are
86# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
87# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
88
89# January 2011
90#
91# While Westmere processor features 6 cycles latency for aes[enc|dec]
92# instructions, which can be scheduled every second cycle, Sandy
93# Bridge spends 8 cycles per instruction, but it can schedule them
94# every cycle. This means that code targeting Westmere would perform
95# suboptimally on Sandy Bridge. Therefore this update.
96#
97# In addition, non-parallelizable CBC encrypt (as well as CCM) is
98# optimized. Relative improvement might appear modest, 8% on Westmere,
99# but in absolute terms it's 3.77 cycles per byte encrypted with
100# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101# should be compared to asymptotic limits of 3.75 for Westmere and
102# 5.00 for Sandy Bridge. Actually, the fact that they get this close
103# to asymptotic limits is quite amazing. Indeed, the limit is
104# calculated as latency times number of rounds, 10 for 128-bit key,
105# and divided by 16, the number of bytes in block, or in other words
106# it accounts *solely* for aesenc instructions. But there are extra
107# instructions, and numbers so close to the asymptotic limits mean
108# that it's as if it takes as little as *one* additional cycle to
109# execute all of them. How is it possible? It is possible thanks to
110# out-of-order execution logic, which manages to overlap post-
111# processing of previous block, things like saving the output, with
112# actual encryption of current block, as well as pre-processing of
113# current block, things like fetching input and xor-ing it with
114# 0-round element of the key schedule, with actual encryption of
115# previous block. Keep this in mind...
116#
117# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
118# performance is achieved by interleaving instructions working on
119# independent blocks. In which case asymptotic limit for such modes
120# can be obtained by dividing above mentioned numbers by AES
121# instructions' interleave factor. Westmere can execute at most 3
122# instructions at a time, meaning that optimal interleave factor is 3,
123# and that's where the "magic" number of 1.25 comes from. "Optimal
124# interleave factor" means that increase of interleave factor does
125# not improve performance. The formula has proven to reflect reality
126# pretty well on Westmere... Sandy Bridge on the other hand can
127# execute up to 8 AES instructions at a time, so how does varying
128# interleave factor affect the performance? Here is table for ECB
129# (numbers are cycles per byte processed with 128-bit key):
130#
131# instruction interleave factor		3x	6x	8x
132# theoretical asymptotic limit		1.67	0.83	0.625
133# measured performance for 8KB block	1.05	0.86	0.84
134#
135# "as if" interleave factor		4.7x	5.8x	6.0x
136#
137# Further data for other parallelizable modes:
138#
139# CBC decrypt				1.16	0.93	0.74
140# CTR					1.14	0.91	0.74
141#
142# Well, given 3x column it's probably inappropriate to call the limit
143# asymptotic, if it can be surpassed, isn't it? What happens there?
144# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
145# magic is responsible for this. Processor overlaps not only the
146# additional instructions with AES ones, but even AES instructions
147# processing adjacent triplets of independent blocks. In the 6x case
148# additional instructions  still claim disproportionally small amount
149# of additional cycles, but in 8x case number of instructions must be
150# a tad too high for out-of-order logic to cope with, and AES unit
151# remains underutilized... As you can see 8x interleave is hardly
152# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
153# utilizes 6x interleave because of limited register bank capacity.
154#
155# Higher interleave factors do have negative impact on Westmere
156# performance. While for ECB mode it's negligible ~1.5%, other
157# parallelizables perform ~5% worse, which is outweighed by ~25%
158# improvement on Sandy Bridge. To balance regression on Westmere
159# CTR mode was implemented with 6x aesenc interleave factor.
160
161# April 2011
162#
163# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
165# in CTR mode AES instruction interleave factor was chosen to be 6x.
166
167######################################################################
168# Current large-block performance in cycles per byte processed with
169# 128-bit key (less is better).
170#
171#		CBC en-/decrypt	CTR	XTS	ECB	OCB
172# Westmere	3.77/1.25	1.25	1.25	1.26
173# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
174# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
175# Skylake	2.62/0.63	0.63	0.63	0.63
176# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
177# Knights L	2.54/0.77	0.78	0.85	-	1.50
178# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
179# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
180# Ryzen		2.71/0.35	0.35	0.44	0.38	0.49
181#
182# (*)	Atom Silvermont ECB result is suboptimal because of penalties
183#	incurred by operations on %xmm8-15. As ECB is not considered
184#	critical, nothing was done to mitigate the problem.
185
$PREFIX="aes_hw";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

# Command line: [flavour] output-file. A single argument containing a
# dot is treated as the output file with no flavour.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Windows ABI is selected by flavour (nasm/masm/mingw64) or an .asm target.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script or in the shared
# perlasm directory, then pipe everything we print through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Fail loudly if the translator pipe cannot be started; the original
# left this open unchecked, silently producing no output on failure.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Mnemonic used for all key-schedule loads. The historical ternary on
# $PREFIX selected "movups" in both arms, so assign it directly
# (unaligned load; presumably the schedule need not be 16-byte
# aligned -- NOTE(review): confirm against key-setup code).
$movkey = "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";
$code.=".extern	OPENSSL_ia32cap_P\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
$in0="%xmm8";		$iv="%xmm9";
231
232# Inline version of internal aesni_[en|de]crypt1.
233#
234# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
235# cycles which take care of loop variables...
{ my $sn;	# static sequence number so labels stay unique per inlining
# Emit inline code that runs one block through the full round loop.
#
#   $p      - "enc" or "dec"; selects the aes${p}/aes${p}last mnemonics
#   $key    - register holding the key-schedule pointer (advanced, i.e.
#             clobbered)
#   $rounds - register holding key->rounds (counted down to zero; one
#             looped aes${p} per count, plus the final aes${p}last)
#   $inout  - xmm register holding the block; defaults to $inout0
#   $ivec   - optional xmm register xor-ed into the block together with
#             the 0-round key (e.g. CBC-style chaining value)
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
++$sn;
# Load round keys 0 and 1 up front.
$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
# 0-round whitening: with an IV the block absorbs ivec^key[0] ...
$code.=<<___ if (defined($ivec));
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
# ... without one, just key[0].
$code.=<<___ if (!defined($ivec));
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
# Folded round loop: each iteration applies the current round key and
# fetches the next one; the last round key feeds aes${p}last outside.
$code.=<<___;
.Loop_${p}1_$sn:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
{ my ($inp,$out,$key) = @_4args;	# ABI-correct argument registers

# Public single-block encrypt: load the input block, look up the round
# count, then inline the one-block round loop.
$code.=<<___;
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@abi-omnipotent
.align	16
${PREFIX}_encrypt:
.cfi_startproc
	_CET_ENDBR
#ifdef BORINGSSL_DISPATCH_TEST
.extern	BORINGSSL_function_hit
	movb \$1,BORINGSSL_function_hit+1(%rip)
#endif
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("enc",$key,$rounds);
# Store the result and scrub the registers that held key material and
# the block, so nothing sensitive is left in the register bank.
$code.=<<___;
	 pxor	$rndkey0,$rndkey0	# clear register bank
	 pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	 pxor	$inout0,$inout0
	ret
.cfi_endproc
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
___
}
292
293# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
294# factor. Why were 3x subroutines originally used in loops? Even though
295# aes[enc|dec] latency was originally 6, it could be scheduled only
296# every *2nd* cycle. Thus 3x interleave was the one providing optimal
297# utilization, i.e. when subroutine's throughput is virtually same as
298# of non-interleaved subroutine [for number of input blocks up to 3].
299# This is why it originally made no sense to implement 2x subroutine.
300# But times change and it became appropriate to spend extra 192 bytes
301# on 2x subroutine on Atom Silvermont account. For processors that
302# can schedule aes[enc|dec] every cycle optimal interleave factor
303# equals to corresponding instructions latency. 8x is optimal for
304# * Bridge and "super-optimal" for other Intel CPUs...
305
# Emit _aesni_[en|de]crypt2: two blocks interleaved.
sub aesni_generate2 {
my $dir=shift;	# "enc" or "dec"
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-1] is cipher/clear text...
#
# Addressing scheme used by all _aesni_cryptN subroutines: $rounds is
# scaled by 16 (bytes per round key) and $key is pointed past the end
# of the schedule; %rax then walks the schedule as a negative offset,
# so the loop's `add \$32` doubles as the termination test (movups
# does not touch flags, jnz consumes the add's). Each iteration
# applies two consecutive round keys; 0-round xor and aes${dir}last
# sit outside the loop.
$code.=<<___;
.type	_aesni_${dir}rypt2,\@abi-omnipotent
.align	16
_aesni_${dir}rypt2:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop2:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop2

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	ret
.cfi_endproc
.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
# Emit _aesni_[en|de]crypt3: three blocks interleaved. Same
# end-of-schedule / negative-%rax indexing trick as _aesni_crypt2.
sub aesni_generate3 {
my $dir=shift;	# "enc" or "dec"
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop3

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.cfi_endproc
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4-block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# Emit _aesni_[en|de]crypt4: four blocks interleaved, same indexing
# trick as _aesni_crypt2. The bare `.byte 0x0f,0x1f,0x00` is a 3-byte
# NOP (nopl) -- NOTE(review): presumably code-alignment padding for
# the loop; confirm before removing.
sub aesni_generate4 {
my $dir=shift;	# "enc" or "dec"
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt4,\@abi-omnipotent
.align	16
_aesni_${dir}rypt4:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	.byte	0x0f,0x1f,0x00
	add	\$16,%rax

.L${dir}_loop4:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop4

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	ret
.cfi_endproc
.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
# Emit _aesni_[en|de]crypt6: six blocks interleaved, same
# end-of-schedule / negative-%rax indexing as _aesni_crypt2. The
# first round for blocks 0-2 is folded into the preamble, interleaved
# with the 0-round xors of blocks 3-5, and the loop is entered in the
# middle at .L${dir}_loop6_enter to finish that first round.
sub aesni_generate6 {
my $dir=shift;	# "enc" or "dec"
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt6,\@abi-omnipotent
.align	16
_aesni_${dir}rypt6:
.cfi_startproc
	$movkey		($key),$rndkey0
	shl		\$4,$rounds
	$movkey		16($key),$rndkey1
	xorps		$rndkey0,$inout0
	pxor		$rndkey0,$inout1
	pxor		$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout0
	lea		32($key,$rounds),$key
	neg		%rax			# $rounds
	aes${dir}	$rndkey1,$inout1
	pxor		$rndkey0,$inout3
	pxor		$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout2
	pxor		$rndkey0,$inout5
	$movkey		($key,%rax),$rndkey0
	add		\$16,%rax
	jmp		.L${dir}_loop6_enter
.align	16
.L${dir}_loop6:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
.L${dir}_loop6_enter:
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop6

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	ret
.cfi_endproc
.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
# Emit _aesni_[en|de]crypt8: eight blocks interleaved, same
# end-of-schedule / negative-%rax indexing as _aesni_crypt2. Like the
# 6x variant, round one for blocks 0-1 is folded into the preamble
# and the loop is entered mid-body at .L${dir}_loop8_inner. The extra
# .L${dir}_loop8_enter label is not jumped to from within this
# subroutine; external code enters there (cf. the CTR tail's "prepare
# for .Lenc_loop8_enter" setup).
sub aesni_generate8 {
my $dir=shift;	# "enc" or "dec"
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt8,\@abi-omnipotent
.align	16
_aesni_${dir}rypt8:
.cfi_startproc
	$movkey		($key),$rndkey0
	shl		\$4,$rounds
	$movkey		16($key),$rndkey1
	xorps		$rndkey0,$inout0
	xorps		$rndkey0,$inout1
	pxor		$rndkey0,$inout2
	pxor		$rndkey0,$inout3
	pxor		$rndkey0,$inout4
	lea		32($key,$rounds),$key
	neg		%rax			# $rounds
	aes${dir}	$rndkey1,$inout0
	pxor		$rndkey0,$inout5
	pxor		$rndkey0,$inout6
	aes${dir}	$rndkey1,$inout1
	pxor		$rndkey0,$inout7
	$movkey		($key,%rax),$rndkey0
	add		\$16,%rax
	jmp		.L${dir}_loop8_inner
.align	16
.L${dir}_loop8:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
.L${dir}_loop8_inner:
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
.L${dir}_loop8_enter:
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	aes${dir}	$rndkey0,$inout6
	aes${dir}	$rndkey0,$inout7
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop8

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	aes${dir}last	$rndkey0,$inout6
	aes${dir}last	$rndkey0,$inout7
	ret
.cfi_endproc
.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
# Emit the encrypt-side interleaved subroutines (2x/3x/4x/6x/8x);
# skipped when generating the drop-in replacement build.
if ($PREFIX eq "aes_hw") {
	aesni_generate2("enc");
	aesni_generate3("enc");
	aesni_generate4("enc");
	aesni_generate6("enc");
	aesni_generate8("enc");
}
581
582if ($PREFIX eq "aes_hw") {
583{
584######################################################################
585# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
586#                         size_t blocks, const AES_KEY *key,
587#                         const char *ivec);
588#
589# Handles only complete blocks, operates on 32-bit counter and
590# does not update *ivec! (see crypto/modes/ctr128.c for details)
591#
592# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
593# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
594# Keywords are full unroll and modulo-schedule counter calculations
595# with zero-round key xor.
596{
597my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
598my ($key0,$ctr)=("%ebp","${ivp}d");
599my $frame_size = 0x80 + ($win64?160:0);
600
601$code.=<<___;
602.globl	${PREFIX}_ctr32_encrypt_blocks
603.type	${PREFIX}_ctr32_encrypt_blocks,\@function,5
604.align	16
605${PREFIX}_ctr32_encrypt_blocks:
606.cfi_startproc
607	_CET_ENDBR
608#ifdef BORINGSSL_DISPATCH_TEST
609	movb \$1,BORINGSSL_function_hit(%rip)
610#endif
611	cmp	\$1,$len
612	jne	.Lctr32_bulk
613
614	# handle single block without allocating stack frame,
615	# useful when handling edges
616	movups	($ivp),$inout0
617	movups	($inp),$inout1
618	mov	240($key),%edx			# key->rounds
619___
620	&aesni_generate1("enc",$key,"%edx");
621$code.=<<___;
622	 pxor	$rndkey0,$rndkey0		# clear register bank
623	 pxor	$rndkey1,$rndkey1
624	xorps	$inout1,$inout0
625	 pxor	$inout1,$inout1
626	movups	$inout0,($out)
627	 xorps	$inout0,$inout0
628	jmp	.Lctr32_epilogue
629
630.align	16
631.Lctr32_bulk:
632	lea	(%rsp),$key_			# use $key_ as frame pointer
633.cfi_def_cfa_register	$key_
634	push	%rbp
635.cfi_push	%rbp
636	sub	\$$frame_size,%rsp
637	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
638___
639$code.=<<___ if ($win64);
640	movaps	%xmm6,-0xa8($key_)		# offload everything
641	movaps	%xmm7,-0x98($key_)
642	movaps	%xmm8,-0x88($key_)
643	movaps	%xmm9,-0x78($key_)
644	movaps	%xmm10,-0x68($key_)
645	movaps	%xmm11,-0x58($key_)
646	movaps	%xmm12,-0x48($key_)
647	movaps	%xmm13,-0x38($key_)
648	movaps	%xmm14,-0x28($key_)
649	movaps	%xmm15,-0x18($key_)
650.Lctr32_body:
651___
652$code.=<<___;
653
654	# 8 16-byte words on top of stack are counter values
655	# xor-ed with zero-round key
656
657	movdqu	($ivp),$inout0
658	movdqu	($key),$rndkey0
659	mov	12($ivp),$ctr			# counter LSB
660	pxor	$rndkey0,$inout0
661	mov	12($key),$key0			# 0-round key LSB
662	movdqa	$inout0,0x00(%rsp)		# populate counter block
663	bswap	$ctr
664	movdqa	$inout0,$inout1
665	movdqa	$inout0,$inout2
666	movdqa	$inout0,$inout3
667	movdqa	$inout0,0x40(%rsp)
668	movdqa	$inout0,0x50(%rsp)
669	movdqa	$inout0,0x60(%rsp)
670	mov	%rdx,%r10			# about to borrow %rdx
671	movdqa	$inout0,0x70(%rsp)
672
673	lea	1($ctr),%rax
674	 lea	2($ctr),%rdx
675	bswap	%eax
676	 bswap	%edx
677	xor	$key0,%eax
678	 xor	$key0,%edx
679	pinsrd	\$3,%eax,$inout1
680	lea	3($ctr),%rax
681	movdqa	$inout1,0x10(%rsp)
682	 pinsrd	\$3,%edx,$inout2
683	bswap	%eax
684	 mov	%r10,%rdx			# restore %rdx
685	 lea	4($ctr),%r10
686	 movdqa	$inout2,0x20(%rsp)
687	xor	$key0,%eax
688	 bswap	%r10d
689	pinsrd	\$3,%eax,$inout3
690	 xor	$key0,%r10d
691	movdqa	$inout3,0x30(%rsp)
692	lea	5($ctr),%r9
693	 mov	%r10d,0x40+12(%rsp)
694	bswap	%r9d
695	 lea	6($ctr),%r10
696	mov	240($key),$rounds		# key->rounds
697	xor	$key0,%r9d
698	 bswap	%r10d
699	mov	%r9d,0x50+12(%rsp)
700	 xor	$key0,%r10d
701	lea	7($ctr),%r9
702	 mov	%r10d,0x60+12(%rsp)
703	bswap	%r9d
704	leaq	OPENSSL_ia32cap_P(%rip),%r10
705	 mov	4(%r10),%r10d
706	xor	$key0,%r9d
707	 and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
708	mov	%r9d,0x70+12(%rsp)
709
710	$movkey	0x10($key),$rndkey1
711
712	movdqa	0x40(%rsp),$inout4
713	movdqa	0x50(%rsp),$inout5
714
715	cmp	\$8,$len		# $len is in blocks
716	jb	.Lctr32_tail		# short input if ($len<8)
717
718	sub	\$6,$len		# $len is biased by -6
719	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
720	je	.Lctr32_6x		# [which denotes Atom Silvermont]
721
722	lea	0x80($key),$key		# size optimization
723	sub	\$2,$len		# $len is biased by -8
724	jmp	.Lctr32_loop8
725
726.align	16
727.Lctr32_6x:
728	shl	\$4,$rounds
729	mov	\$48,$rnds_
730	bswap	$key0
731	lea	32($key,$rounds),$key	# end of key schedule
732	sub	%rax,%r10		# twisted $rounds
733	jmp	.Lctr32_loop6
734
735.align	16
736.Lctr32_loop6:
737	 add	\$6,$ctr		# next counter value
738	$movkey	-48($key,$rnds_),$rndkey0
739	aesenc	$rndkey1,$inout0
740	 mov	$ctr,%eax
741	 xor	$key0,%eax
742	aesenc	$rndkey1,$inout1
743	 movbe	%eax,`0x00+12`(%rsp)	# store next counter value
744	 lea	1($ctr),%eax
745	aesenc	$rndkey1,$inout2
746	 xor	$key0,%eax
747	 movbe	%eax,`0x10+12`(%rsp)
748	aesenc	$rndkey1,$inout3
749	 lea	2($ctr),%eax
750	 xor	$key0,%eax
751	aesenc	$rndkey1,$inout4
752	 movbe	%eax,`0x20+12`(%rsp)
753	 lea	3($ctr),%eax
754	aesenc	$rndkey1,$inout5
755	$movkey	-32($key,$rnds_),$rndkey1
756	 xor	$key0,%eax
757
758	aesenc	$rndkey0,$inout0
759	 movbe	%eax,`0x30+12`(%rsp)
760	 lea	4($ctr),%eax
761	aesenc	$rndkey0,$inout1
762	 xor	$key0,%eax
763	 movbe	%eax,`0x40+12`(%rsp)
764	aesenc	$rndkey0,$inout2
765	 lea	5($ctr),%eax
766	 xor	$key0,%eax
767	aesenc	$rndkey0,$inout3
768	 movbe	%eax,`0x50+12`(%rsp)
769	 mov	%r10,%rax		# mov	$rnds_,$rounds
770	aesenc	$rndkey0,$inout4
771	aesenc	$rndkey0,$inout5
772	$movkey	-16($key,$rnds_),$rndkey0
773
774	call	.Lenc_loop6
775
776	movdqu	($inp),$inout6		# load 6 input blocks
777	movdqu	0x10($inp),$inout7
778	movdqu	0x20($inp),$in0
779	movdqu	0x30($inp),$in1
780	movdqu	0x40($inp),$in2
781	movdqu	0x50($inp),$in3
782	lea	0x60($inp),$inp		# $inp+=6*16
783	$movkey	-64($key,$rnds_),$rndkey1
784	pxor	$inout0,$inout6		# inp^=E(ctr)
785	movaps	0x00(%rsp),$inout0	# load next counter [xor-ed with 0 round]
786	pxor	$inout1,$inout7
787	movaps	0x10(%rsp),$inout1
788	pxor	$inout2,$in0
789	movaps	0x20(%rsp),$inout2
790	pxor	$inout3,$in1
791	movaps	0x30(%rsp),$inout3
792	pxor	$inout4,$in2
793	movaps	0x40(%rsp),$inout4
794	pxor	$inout5,$in3
795	movaps	0x50(%rsp),$inout5
796	movdqu	$inout6,($out)		# store 6 output blocks
797	movdqu	$inout7,0x10($out)
798	movdqu	$in0,0x20($out)
799	movdqu	$in1,0x30($out)
800	movdqu	$in2,0x40($out)
801	movdqu	$in3,0x50($out)
802	lea	0x60($out),$out		# $out+=6*16
803
804	sub	\$6,$len
805	jnc	.Lctr32_loop6		# loop if $len-=6 didn't borrow
806
807	add	\$6,$len		# restore real remaining $len
808	jz	.Lctr32_done		# done if ($len==0)
809
810	lea	-48($rnds_),$rounds
811	lea	-80($key,$rnds_),$key	# restore $key
812	neg	$rounds
813	shr	\$4,$rounds		# restore $rounds
814	jmp	.Lctr32_tail
815
816.align	32
817.Lctr32_loop8:
818	 add		\$8,$ctr		# next counter value
819	movdqa		0x60(%rsp),$inout6
820	aesenc		$rndkey1,$inout0
821	 mov		$ctr,%r9d
822	movdqa		0x70(%rsp),$inout7
823	aesenc		$rndkey1,$inout1
824	 bswap		%r9d
825	$movkey		0x20-0x80($key),$rndkey0
826	aesenc		$rndkey1,$inout2
827	 xor		$key0,%r9d
828	 nop
829	aesenc		$rndkey1,$inout3
830	 mov		%r9d,0x00+12(%rsp)	# store next counter value
831	 lea		1($ctr),%r9
832	aesenc		$rndkey1,$inout4
833	aesenc		$rndkey1,$inout5
834	aesenc		$rndkey1,$inout6
835	aesenc		$rndkey1,$inout7
836	$movkey		0x30-0x80($key),$rndkey1
837___
838for($i=2;$i<8;$i++) {
839my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
840$code.=<<___;
841	 bswap		%r9d
842	aesenc		$rndkeyx,$inout0
843	aesenc		$rndkeyx,$inout1
844	 xor		$key0,%r9d
845	 .byte		0x66,0x90
846	aesenc		$rndkeyx,$inout2
847	aesenc		$rndkeyx,$inout3
848	 mov		%r9d,`0x10*($i-1)`+12(%rsp)
849	 lea		$i($ctr),%r9
850	aesenc		$rndkeyx,$inout4
851	aesenc		$rndkeyx,$inout5
852	aesenc		$rndkeyx,$inout6
853	aesenc		$rndkeyx,$inout7
854	$movkey		`0x20+0x10*$i`-0x80($key),$rndkeyx
855___
856}
857$code.=<<___;
858	 bswap		%r9d
859	aesenc		$rndkey0,$inout0
860	aesenc		$rndkey0,$inout1
861	aesenc		$rndkey0,$inout2
862	 xor		$key0,%r9d
863	 movdqu		0x00($inp),$in0		# start loading input
864	aesenc		$rndkey0,$inout3
865	 mov		%r9d,0x70+12(%rsp)
866	 cmp		\$11,$rounds
867	aesenc		$rndkey0,$inout4
868	aesenc		$rndkey0,$inout5
869	aesenc		$rndkey0,$inout6
870	aesenc		$rndkey0,$inout7
871	$movkey		0xa0-0x80($key),$rndkey0
872
873	jb		.Lctr32_enc_done
874
875	aesenc		$rndkey1,$inout0
876	aesenc		$rndkey1,$inout1
877	aesenc		$rndkey1,$inout2
878	aesenc		$rndkey1,$inout3
879	aesenc		$rndkey1,$inout4
880	aesenc		$rndkey1,$inout5
881	aesenc		$rndkey1,$inout6
882	aesenc		$rndkey1,$inout7
883	$movkey		0xb0-0x80($key),$rndkey1
884
885	aesenc		$rndkey0,$inout0
886	aesenc		$rndkey0,$inout1
887	aesenc		$rndkey0,$inout2
888	aesenc		$rndkey0,$inout3
889	aesenc		$rndkey0,$inout4
890	aesenc		$rndkey0,$inout5
891	aesenc		$rndkey0,$inout6
892	aesenc		$rndkey0,$inout7
893	$movkey		0xc0-0x80($key),$rndkey0
894
895	# 192-bit key support was removed.
896
897	aesenc		$rndkey1,$inout0
898	aesenc		$rndkey1,$inout1
899	aesenc		$rndkey1,$inout2
900	aesenc		$rndkey1,$inout3
901	aesenc		$rndkey1,$inout4
902	aesenc		$rndkey1,$inout5
903	aesenc		$rndkey1,$inout6
904	aesenc		$rndkey1,$inout7
905	$movkey		0xd0-0x80($key),$rndkey1
906
907	aesenc		$rndkey0,$inout0
908	aesenc		$rndkey0,$inout1
909	aesenc		$rndkey0,$inout2
910	aesenc		$rndkey0,$inout3
911	aesenc		$rndkey0,$inout4
912	aesenc		$rndkey0,$inout5
913	aesenc		$rndkey0,$inout6
914	aesenc		$rndkey0,$inout7
915	$movkey		0xe0-0x80($key),$rndkey0
916	jmp		.Lctr32_enc_done
917
918.align	16
919.Lctr32_enc_done:
920	movdqu		0x10($inp),$in1
921	pxor		$rndkey0,$in0		# input^=round[last]
922	movdqu		0x20($inp),$in2
923	pxor		$rndkey0,$in1
924	movdqu		0x30($inp),$in3
925	pxor		$rndkey0,$in2
926	movdqu		0x40($inp),$in4
927	pxor		$rndkey0,$in3
928	movdqu		0x50($inp),$in5
929	pxor		$rndkey0,$in4
930	prefetcht0	0x1c0($inp)	# We process 128 bytes (8*16), so to prefetch 1 iteration
931	prefetcht0	0x200($inp)	# We need to prefetch 2 64 byte lines
932	pxor		$rndkey0,$in5
933	aesenc		$rndkey1,$inout0
934	aesenc		$rndkey1,$inout1
935	aesenc		$rndkey1,$inout2
936	aesenc		$rndkey1,$inout3
937	aesenc		$rndkey1,$inout4
938	aesenc		$rndkey1,$inout5
939	aesenc		$rndkey1,$inout6
940	aesenc		$rndkey1,$inout7
941	movdqu		0x60($inp),$rndkey1	# borrow $rndkey1 for inp[6]
942	lea		0x80($inp),$inp		# $inp+=8*16
943
944	aesenclast	$in0,$inout0		# $inN is inp[N]^round[last]
945	pxor		$rndkey0,$rndkey1	# borrowed $rndkey
946	movdqu		0x70-0x80($inp),$in0
947	aesenclast	$in1,$inout1
948	pxor		$rndkey0,$in0
949	movdqa		0x00(%rsp),$in1		# load next counter block
950	aesenclast	$in2,$inout2
951	aesenclast	$in3,$inout3
952	movdqa		0x10(%rsp),$in2
953	movdqa		0x20(%rsp),$in3
954	aesenclast	$in4,$inout4
955	aesenclast	$in5,$inout5
956	movdqa		0x30(%rsp),$in4
957	movdqa		0x40(%rsp),$in5
958	aesenclast	$rndkey1,$inout6
959	movdqa		0x50(%rsp),$rndkey0
960	$movkey		0x10-0x80($key),$rndkey1#real 1st-round key
961	aesenclast	$in0,$inout7
962
963	movups		$inout0,($out)		# store 8 output blocks
964	movdqa		$in1,$inout0
965	movups		$inout1,0x10($out)
966	movdqa		$in2,$inout1
967	movups		$inout2,0x20($out)
968	movdqa		$in3,$inout2
969	movups		$inout3,0x30($out)
970	movdqa		$in4,$inout3
971	movups		$inout4,0x40($out)
972	movdqa		$in5,$inout4
973	movups		$inout5,0x50($out)
974	movdqa		$rndkey0,$inout5
975	movups		$inout6,0x60($out)
976	movups		$inout7,0x70($out)
977	lea		0x80($out),$out		# $out+=8*16
978
979	sub	\$8,$len
980	jnc	.Lctr32_loop8			# loop if $len-=8 didn't borrow
981
982	add	\$8,$len			# restore real remaining $len
983	jz	.Lctr32_done			# done if ($len==0)
984	lea	-0x80($key),$key
985
986.Lctr32_tail:
987	# note that at this point $inout0..5 are populated with
988	# counter values xor-ed with 0-round key
989	lea	16($key),$key
990	cmp	\$4,$len
991	jb	.Lctr32_loop3
992	je	.Lctr32_loop4
993
994	# if ($len>4) compute 7 E(counter)
995	shl		\$4,$rounds
996	movdqa		0x60(%rsp),$inout6
997	pxor		$inout7,$inout7
998
999	$movkey		16($key),$rndkey0
1000	aesenc		$rndkey1,$inout0
1001	aesenc		$rndkey1,$inout1
1002	lea		32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1003	neg		%rax
1004	aesenc		$rndkey1,$inout2
1005	add		\$16,%rax		# prepare for .Lenc_loop8_enter
1006	 movups		($inp),$in0
1007	aesenc		$rndkey1,$inout3
1008	aesenc		$rndkey1,$inout4
1009	 movups		0x10($inp),$in1		# pre-load input
1010	 movups		0x20($inp),$in2
1011	aesenc		$rndkey1,$inout5
1012	aesenc		$rndkey1,$inout6
1013
1014	call            .Lenc_loop8_enter
1015
1016	movdqu	0x30($inp),$in3
1017	pxor	$in0,$inout0
1018	movdqu	0x40($inp),$in0
1019	pxor	$in1,$inout1
1020	movdqu	$inout0,($out)			# store output
1021	pxor	$in2,$inout2
1022	movdqu	$inout1,0x10($out)
1023	pxor	$in3,$inout3
1024	movdqu	$inout2,0x20($out)
1025	pxor	$in0,$inout4
1026	movdqu	$inout3,0x30($out)
1027	movdqu	$inout4,0x40($out)
1028	cmp	\$6,$len
1029	jb	.Lctr32_done			# $len was 5, stop store
1030
1031	movups	0x50($inp),$in1
1032	xorps	$in1,$inout5
1033	movups	$inout5,0x50($out)
1034	je	.Lctr32_done			# $len was 6, stop store
1035
1036	movups	0x60($inp),$in2
1037	xorps	$in2,$inout6
1038	movups	$inout6,0x60($out)
1039	jmp	.Lctr32_done			# $len was 7, stop store
1040
1041.align	32
1042.Lctr32_loop4:
1043	aesenc		$rndkey1,$inout0
1044	lea		16($key),$key
1045	dec		$rounds
1046	aesenc		$rndkey1,$inout1
1047	aesenc		$rndkey1,$inout2
1048	aesenc		$rndkey1,$inout3
1049	$movkey		($key),$rndkey1
1050	jnz		.Lctr32_loop4
1051	aesenclast	$rndkey1,$inout0
1052	aesenclast	$rndkey1,$inout1
1053	 movups		($inp),$in0		# load input
1054	 movups		0x10($inp),$in1
1055	aesenclast	$rndkey1,$inout2
1056	aesenclast	$rndkey1,$inout3
1057	 movups		0x20($inp),$in2
1058	 movups		0x30($inp),$in3
1059
1060	xorps	$in0,$inout0
1061	movups	$inout0,($out)			# store output
1062	xorps	$in1,$inout1
1063	movups	$inout1,0x10($out)
1064	pxor	$in2,$inout2
1065	movdqu	$inout2,0x20($out)
1066	pxor	$in3,$inout3
1067	movdqu	$inout3,0x30($out)
1068	jmp	.Lctr32_done			# $len was 4, stop store
1069
1070.align	32
1071.Lctr32_loop3:
1072	aesenc		$rndkey1,$inout0
1073	lea		16($key),$key
1074	dec		$rounds
1075	aesenc		$rndkey1,$inout1
1076	aesenc		$rndkey1,$inout2
1077	$movkey		($key),$rndkey1
1078	jnz		.Lctr32_loop3
1079	aesenclast	$rndkey1,$inout0
1080	aesenclast	$rndkey1,$inout1
1081	aesenclast	$rndkey1,$inout2
1082
1083	movups	($inp),$in0			# load input
1084	xorps	$in0,$inout0
1085	movups	$inout0,($out)			# store output
1086	cmp	\$2,$len
1087	jb	.Lctr32_done			# $len was 1, stop store
1088
1089	movups	0x10($inp),$in1
1090	xorps	$in1,$inout1
1091	movups	$inout1,0x10($out)
1092	je	.Lctr32_done			# $len was 2, stop store
1093
1094	movups	0x20($inp),$in2
1095	xorps	$in2,$inout2
1096	movups	$inout2,0x20($out)		# $len was 3, stop store
1097
1098.Lctr32_done:
1099	xorps	%xmm0,%xmm0			# clear register bank
1100	xor	$key0,$key0
1101	pxor	%xmm1,%xmm1
1102	pxor	%xmm2,%xmm2
1103	pxor	%xmm3,%xmm3
1104	pxor	%xmm4,%xmm4
1105	pxor	%xmm5,%xmm5
1106___
# ctr32 cleanup, non-Windows path: zero the remaining XMM registers and
# scrub the eight 16-byte counter/scratch slots spilled to the stack so no
# key-derived material survives the call (stores interleaved with the pxor
# chain to overlap latencies).
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)		# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	movaps	%xmm0,0x70(%rsp)
	pxor	%xmm15,%xmm15
___
# ctr32 cleanup, Win64 path: reload %xmm6-%xmm15 from the save area below
# $key_ (established by the prologue, outside this chunk), overwriting each
# save slot with zeroes as it is consumed, then scrub the stacked counter
# blocks at 0x00-0x70(%rsp).
$code.=<<___ if ($win64);
	movaps	-0xa8($key_),%xmm6
	movaps	%xmm0,-0xa8($key_)		# clear stack
	movaps	-0x98($key_),%xmm7
	movaps	%xmm0,-0x98($key_)
	movaps	-0x88($key_),%xmm8
	movaps	%xmm0,-0x88($key_)
	movaps	-0x78($key_),%xmm9
	movaps	%xmm0,-0x78($key_)
	movaps	-0x68($key_),%xmm10
	movaps	%xmm0,-0x68($key_)
	movaps	-0x58($key_),%xmm11
	movaps	%xmm0,-0x58($key_)
	movaps	-0x48($key_),%xmm12
	movaps	%xmm0,-0x48($key_)
	movaps	-0x38($key_),%xmm13
	movaps	%xmm0,-0x38($key_)
	movaps	-0x28($key_),%xmm14
	movaps	%xmm0,-0x28($key_)
	movaps	-0x18($key_),%xmm15
	movaps	%xmm0,-0x18($key_)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
	movaps	%xmm0,0x70(%rsp)
___
# Shared ctr32 epilogue: restore %rbp from just below the saved frame
# pointer copy in $key_, rewind %rsp, and return (with matching CFI).
$code.=<<___;
	mov	-8($key_),%rbp
.cfi_restore	%rbp
	lea	($key_),%rsp
.cfi_def_cfa_register	%rsp
.Lctr32_epilogue:
	ret
.cfi_endproc
.size	${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks
___
1167} }}
1168
{ my ($inp,$bits,$key) = @_4args;
  $bits =~ s/%r/%e/;			# use the 32-bit name of the register holding $bits

# This is based on submission by
#
#	Huang Ying <ying.huang@intel.com>
#	Vinodh Gopal <vinodh.gopal@intel.com>
#	Kahraman Akdemir
#
# Aggressively optimized in respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
#
# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
#				int bits, AES_KEY * const key);
#
# input:	$inp	user-supplied key
#		$bits	$inp length in bits
#		$key	pointer to key schedule
# output:	%eax	0 denoting success, -1 or -2 - failure (see C)
#		$bits	rounds-1 (used in aesni_set_decrypt_key)
#		*$key	key schedule
#		$key	pointer to key schedule (used in
#			aesni_set_decrypt_key)
#
# Subroutine is frame-less, which means that only volatile registers
# are used. Note that it's declared "abi-omnipotent", which means that
# amount of volatile registers is smaller on Windows.
#
# Only 128- and 256-bit keys are accepted here (the 192-bit path was
# removed); anything else falls through to .Lbad_keybits and returns -2.
# Each key size has two expansions: an aeskeygenassist-based one and an
# "_alt" constant-time one using aesenclast, selected by the CPUID bits
# tested below (AVX set, XOP clear).
$code.=<<___;
.globl	${PREFIX}_set_encrypt_key
.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_encrypt_key:
__aesni_set_encrypt_key:
.cfi_startproc
	_CET_ENDBR
#ifdef BORINGSSL_DISPATCH_TEST
	movb \$1,BORINGSSL_function_hit+3(%rip)
#endif
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
.cfi_adjust_cfa_offset	8
	mov	\$-1,%rax
	test	$inp,$inp
	jz	.Lenc_key_ret
	test	$key,$key
	jz	.Lenc_key_ret

	movups	($inp),%xmm0		# pull first 128 bits of *userKey
	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
	leaq	OPENSSL_ia32cap_P(%rip),%r10
	movl	4(%r10),%r10d
	and	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
	lea	16($key),%rax		# %rax is used as modifiable copy of $key
	cmp	\$256,$bits
	je	.L14rounds
	# 192-bit key support was removed.
	cmp	\$128,$bits
	jne	.Lbad_keybits

.L10rounds:
	mov	\$9,$bits			# 10 rounds for 128-bit key
	cmp	\$`1<<28`,%r10d			# AVX, bit no XOP
	je	.L10rounds_alt

	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
	call		.Lkey_expansion_128_cold
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
	call		.Lkey_expansion_128
	$movkey	%xmm0,(%rax)
	mov	$bits,80(%rax)	# 240(%rdx)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

.align	16
.L10rounds_alt:
	movdqa	.Lkey_rotate(%rip),%xmm5
	mov	\$8,%r10d
	movdqa	.Lkey_rcon1(%rip),%xmm4
	movdqa	%xmm0,%xmm2
	movdqu	%xmm0,($key)
	jmp	.Loop_key128

.align	16
.Loop_key128:
	pshufb		%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0
	pslld		\$1,%xmm4
	lea		16(%rax),%rax

	movdqa		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm3,%xmm2

	pxor		%xmm2,%xmm0
	movdqu		%xmm0,-16(%rax)
	movdqa		%xmm0,%xmm2

	dec	%r10d
	jnz	.Loop_key128

	movdqa		.Lkey_rcon1b(%rip),%xmm4

	pshufb		%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0
	pslld		\$1,%xmm4

	movdqa		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm3,%xmm2

	pxor		%xmm2,%xmm0
	movdqu		%xmm0,(%rax)

	movdqa		%xmm0,%xmm2
	pshufb		%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0

	movdqa		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm2,%xmm3
	pslldq		\$4,%xmm2
	pxor		%xmm3,%xmm2

	pxor		%xmm2,%xmm0
	movdqu		%xmm0,16(%rax)

	mov	$bits,96(%rax)	# 240($key)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

# 192-bit key support was removed.

.align	16
.L14rounds:
	movups	16($inp),%xmm2			# remaining half of *userKey
	mov	\$13,$bits			# 14 rounds for 256
	lea	16(%rax),%rax
	cmp	\$`1<<28`,%r10d			# AVX, but no XOP
	je	.L14rounds_alt

	$movkey	%xmm0,($key)			# round 0
	$movkey	%xmm2,16($key)			# round 1
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
	call		.Lkey_expansion_256a_cold
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
	call		.Lkey_expansion_256a
	$movkey	%xmm0,(%rax)
	mov	$bits,16(%rax)	# 240(%rdx)
	xor	%rax,%rax
	jmp	.Lenc_key_ret

.align	16
.L14rounds_alt:
	movdqa	.Lkey_rotate(%rip),%xmm5
	movdqa	.Lkey_rcon1(%rip),%xmm4
	mov	\$7,%r10d
	movdqu	%xmm0,0($key)
	movdqa	%xmm2,%xmm1
	movdqu	%xmm2,16($key)
	jmp	.Loop_key256

.align	16
.Loop_key256:
	pshufb		%xmm5,%xmm2
	aesenclast	%xmm4,%xmm2

	movdqa		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm0,%xmm3
	pslldq		\$4,%xmm0
	pxor		%xmm3,%xmm0
	pslld		\$1,%xmm4

	pxor		%xmm2,%xmm0
	movdqu		%xmm0,(%rax)

	dec	%r10d
	jz	.Ldone_key256

	pshufd		\$0xff,%xmm0,%xmm2
	pxor		%xmm3,%xmm3
	aesenclast	%xmm3,%xmm2

	movdqa		%xmm1,%xmm3
	pslldq		\$4,%xmm1
	pxor		%xmm1,%xmm3
	pslldq		\$4,%xmm1
	pxor		%xmm1,%xmm3
	pslldq		\$4,%xmm1
	pxor		%xmm3,%xmm1

	pxor		%xmm1,%xmm2
	movdqu		%xmm2,16(%rax)
	lea		32(%rax),%rax
	movdqa		%xmm2,%xmm1

	jmp	.Loop_key256

.Ldone_key256:
	mov	$bits,16(%rax)	# 240($key)
	xor	%eax,%eax
	jmp	.Lenc_key_ret

.align	16
.Lbad_keybits:
	mov	\$-2,%rax
.Lenc_key_ret:
	pxor	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	add	\$8,%rsp
.cfi_adjust_cfa_offset	-8
	ret
.cfi_endproc
.LSEH_end_set_encrypt_key:

.align	16
.Lkey_expansion_128:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_128_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	xorps	%xmm4, %xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	xorps	%xmm4, %xmm0
	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm0
	ret

.align 16
.Lkey_expansion_192a:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_192a_cold:
	movaps	%xmm2, %xmm5
.Lkey_expansion_192b_warm:
	shufps	\$0b00010000,%xmm0,%xmm4
	movdqa	%xmm2,%xmm3
	xorps	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	pslldq	\$4,%xmm3
	xorps	%xmm4,%xmm0
	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
	pxor	%xmm3,%xmm2
	pxor	%xmm1,%xmm0
	pshufd	\$0b11111111,%xmm0,%xmm3
	pxor	%xmm3,%xmm2
	ret

.align 16
.Lkey_expansion_192b:
	movaps	%xmm0,%xmm3
	shufps	\$0b01000100,%xmm0,%xmm5
	$movkey	%xmm5,(%rax)
	shufps	\$0b01001110,%xmm2,%xmm3
	$movkey	%xmm3,16(%rax)
	lea	32(%rax),%rax
	jmp	.Lkey_expansion_192b_warm

.align	16
.Lkey_expansion_256a:
	$movkey	%xmm2,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_256a_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm0
	ret

.align 16
.Lkey_expansion_256b:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax

	shufps	\$0b00010000,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	\$0b10001100,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm2
	ret
.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
___
}
1511
# Read-only tables used by the routines above: byte-swap shuffle mask,
# counter-increment vectors, and the pshufb/rcon constants consumed by the
# "_alt" key-schedule paths, then switch back to .text.
$code.=<<___;
.section .rodata
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lincrement32:
	.long	6,6,6,0
.Lincrement64:
	.long	1,0,0,0
.Lincrement1:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Lkey_rotate:
	.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
.Lkey_rotate192:
	.long	0x04070605,0x04070605,0x04070605,0x04070605
.Lkey_rcon1:
	.long	1,1,1,1
.Lkey_rcon1b:
	.long	0x1b,0x1b,0x1b,0x1b

.asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
.text
___
1536
1537# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1538#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
# Win64-only: emit the structured-exception handler plus the .pdata/.xdata
# unwind tables that register it for the functions defined above.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
___
# The ctr32 handler is only emitted for the aes_hw build of this module.
$code.=<<___ if ($PREFIX eq "aes_hw");
.type	ctr_xts_se_handler,\@abi-omnipotent
.align	16
ctr_xts_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue lable
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	lea	-0xa8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbp		# restore saved %rbp
	mov	%rbp,160($context)	# restore context->Rbp


.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ctr_xts_se_handler,.-ctr_xts_se_handler

.section	.pdata
.align	4
___
# .pdata entries: function begin/end RVAs plus the unwind-info RVA.
$code.=<<___ if ($PREFIX eq "aes_hw");
	.rva	.LSEH_begin_${PREFIX}_ctr32_encrypt_blocks
	.rva	.LSEH_end_${PREFIX}_ctr32_encrypt_blocks
	.rva	.LSEH_info_ctr32
___
$code.=<<___;
	.rva	${PREFIX}_set_encrypt_key
	.rva	.LSEH_end_set_encrypt_key
	.rva	.LSEH_info_key
.section	.xdata
.align	8
___
# .xdata: ctr32 uses the language-specific handler above; the frame-less
# key-setup routine is described by plain unwind codes (the 8-byte sub rsp).
$code.=<<___ if ($PREFIX eq "aes_hw");
.LSEH_info_ctr32:
	.byte	9,0,0,0
	.rva	ctr_xts_se_handler
	.rva	.Lctr32_body,.Lctr32_epilogue		# HandlerData[]
___
$code.=<<___;
.LSEH_info_key:
	.byte	0x01,0x04,0x01,0x00
	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
___
}
1659
# Append an x86-64 REX prefix byte to an opcode list when either operand
# lives in the extended register bank (%xmm8-%xmm15).
#
# $opcode - array ref accumulating instruction bytes (modified in place)
# $dst    - destination register number; >= 8 sets REX.R (bit 2)
# $src    - source register number;      >= 8 sets REX.B (bit 0)
#
# Nothing is pushed when both registers are below 8 — a bare 0x40 prefix
# would be redundant.  (Rewritten from `local *opcode = shift` typeglob
# aliasing to a plain lexical array ref; call sites already pass \@opcode.)
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    push @$opcode,$rex|0x40	if($rex);
}
1669
# Translate one generated line containing an AES-NI mnemonic into a raw
# ".byte" sequence, so the module still assembles with toolchains that
# predate these instructions.  Returns the encoded ".byte" string, undef
# for an aes* mnemonic it does not know, or the line unchanged when no
# pattern matches.  Every encoding starts with the mandatory 0x66 prefix.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);	# mandatory 66 prefix for all encodings below

    # aeskeygenassist $imm8,%xmmN,%xmmM -> 66 [REX] 0F 3A DF /r ib
    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	rex(\@opcode,$4,$3);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($3&7)|(($4&7)<<3);	# ModR/M
	my $c=$2;
	# an immediate written with a leading 0 (e.g. 0x36) goes through oct()
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    }
    # register-to-register aes* -> 66 [REX] 0F 38 <op> /r
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	rex(\@opcode,$3,$2);
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    # memory-source form "aes* disp8(%rsp),%xmmN" (disp assumed to fit 8 bits)
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	my $off = $2;
	push @opcode,0x44 if ($3>=8);	# REX.R for %xmm8-15
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0x44|(($3&7)<<3),0x24;	# ModR/M + SIB addressing (%rsp)
	push @opcode,($off=~/^0/?oct($off):$off)&0xff;	# disp8, truncated
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}
1709
# Encode "movbe %eax,disp8(%rsp)" as raw bytes (0F 38 F1 /r with an
# rsp-relative SIB and 8-bit displacement) for assemblers that lack the
# movbe mnemonic.  The caller passes the displacement text verbatim.
sub movbe {
	my ($disp) = @_;
	return ".byte\t0x0f,0x38,0xf1,0x44,0x24,$disp";
}
1713
# Post-process the accumulated text before emitting it: evaluate `...`
# arithmetic placeholders, rewrite AES-NI mnemonics as .byte sequences via
# aesni(), and encode movbe-to-stack stores via movbe() for old assemblers.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# Flush and detect write errors; a silently truncated .s file must not build.
close STDOUT or die "error closing STDOUT: $!";
1722