• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env perl
2
3###################################################################
4### AES-128 [originally in CTR mode]				###
5### bitsliced implementation for Intel Core 2 processors	###
6### requires support of SSE extensions up to SSSE3		###
7### Author: Emilia Käsper and Peter Schwabe			###
8### Date: 2009-03-19						###
9### Public domain						###
10###								###
11### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
12### further information.					###
13###################################################################
14#
15# September 2011.
16#
17# Started as transliteration to "perlasm" the original code has
18# undergone following changes:
19#
20# - code was made position-independent;
21# - rounds were folded into a loop resulting in >5x size reduction
22#   from 12.5KB to 2.2KB;
23# - above was possibile thanks to mixcolumns() modification that
24#   allowed to feed its output back to aesenc[last], this was
25#   achieved at cost of two additional inter-registers moves;
26# - some instruction reordering and interleaving;
27# - this module doesn't implement key setup subroutine, instead it
28#   relies on conversion of "conventional" key schedule as returned
29#   by AES_set_encrypt_key (see discussion below);
30# - first and last round keys are treated differently, which allowed
31#   to skip one shiftrows(), reduce bit-sliced key schedule and
32#   speed-up conversion by 22%;
33# - support for 192- and 256-bit keys was added;
34#
35# Resulting performance in CPU cycles spent to encrypt one byte out
36# of 4096-byte buffer with 128-bit key is:
37#
38#		Emilia's	this(*)		difference
39#
40# Core 2    	9.30		8.69		+7%
41# Nehalem(**) 	7.63		6.98		+9%
42# Atom	    	17.1		17.4		-2%(***)
43#
44# (*)	Comparison is not completely fair, because "this" is ECB,
45#	i.e. no extra processing such as counter values calculation
46#	and xor-ing input as in Emilia's CTR implementation is
47#	performed. However, the CTR calculations stand for not more
48#	than 1% of total time, so comparison is *rather* fair.
49#
50# (**)	Results were collected on Westmere, which is considered to
51#	be equivalent to Nehalem for this code.
52#
53# (***)	Slowdown on Atom is rather strange per se, because original
54#	implementation has a number of 9+-bytes instructions, which
55#	are bad for Atom front-end, and which I eliminated completely.
56#	In attempt to address deterioration sbox() was tested in FP
57#	SIMD "domain" (movaps instead of movdqa, xorps instead of
58#	pxor, etc.). While it resulted in nominal 4% improvement on
59#	Atom, it hurted Westmere by more than 2x factor.
60#
61# As for key schedule conversion subroutine. Interface to OpenSSL
62# relies on per-invocation on-the-fly conversion. This naturally
63# has impact on performance, especially for short inputs. Conversion
64# time in CPU cycles and its ratio to CPU cycles spent in 8x block
65# function is:
66#
67# 		conversion	conversion/8x block
68# Core 2	240		0.22
69# Nehalem	180		0.20
70# Atom		430		0.19
71#
72# The ratio values mean that 128-byte blocks will be processed
73# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
74# etc. Then keep in mind that input sizes not divisible by 128 are
75# *effectively* slower, especially shortest ones, e.g. consecutive
76# 144-byte blocks are processed 44% slower than one would expect,
77# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78# it's still faster than ["hyper-threading-safe" code path in]
79# aes-x86_64.pl on all lengths above 64 bytes...
80#
81# October 2011.
82#
83# Add decryption procedure. Performance in CPU cycles spent to decrypt
84# one byte out of 4096-byte buffer with 128-bit key is:
85#
86# Core 2	11.0
87# Nehalem	9.16
88# Atom		20.9
89#
90# November 2011.
91#
92# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
93# suboptimal, but XTS is meant to be used with larger blocks...
94#
95#						<appro@openssl.org>
96
97$flavour = shift;
98$output  = shift;
99if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
100
101$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
102
103$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
104( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
105( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
106die "can't locate x86_64-xlate.pl";
107
108open STDOUT,"| $^X $xlate $flavour $output";
109
110my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
111my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
112my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
113
114{
115my ($key,$rounds,$const)=("%rax","%r10d","%r11");
116
117sub Sbox {
118# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
119# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
120my @b=@_[0..7];
121my @t=@_[8..11];
122my @s=@_[12..15];
123	&InBasisChange	(@b);
124	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
125	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
126}
127
128sub InBasisChange {
129# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
130# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
131my @b=@_[0..7];
132$code.=<<___;
133	pxor	@b[6], @b[5]
134	pxor	@b[1], @b[2]
135	pxor	@b[0], @b[3]
136	pxor	@b[2], @b[6]
137	pxor 	@b[0], @b[5]
138
139	pxor	@b[3], @b[6]
140	pxor	@b[7], @b[3]
141	pxor	@b[5], @b[7]
142	pxor	@b[4], @b[3]
143	pxor	@b[5], @b[4]
144	pxor	@b[1], @b[3]
145
146	pxor	@b[7], @b[2]
147	pxor	@b[5], @b[1]
148___
149}
150
151sub OutBasisChange {
152# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
153# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
154my @b=@_[0..7];
155$code.=<<___;
156	pxor	@b[6], @b[0]
157	pxor	@b[4], @b[1]
158	pxor	@b[0], @b[2]
159	pxor	@b[6], @b[4]
160	pxor	@b[1], @b[6]
161
162	pxor	@b[5], @b[1]
163	pxor	@b[3], @b[5]
164	pxor	@b[7], @b[3]
165	pxor	@b[5], @b[7]
166	pxor	@b[5], @b[2]
167
168	pxor	@b[7], @b[4]
169___
170}
171
172sub InvSbox {
173# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
174# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
175my @b=@_[0..7];
176my @t=@_[8..11];
177my @s=@_[12..15];
178	&InvInBasisChange	(@b);
179	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
180	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
181}
182
183sub InvInBasisChange {		# OutBasisChange in reverse
184my @b=@_[5,1,2,6,3,7,0,4];
185$code.=<<___
186	pxor	@b[7], @b[4]
187
188	pxor	@b[5], @b[7]
189	pxor	@b[5], @b[2]
190	pxor	@b[7], @b[3]
191	pxor	@b[3], @b[5]
192	pxor	@b[5], @b[1]
193
194	pxor	@b[1], @b[6]
195	pxor	@b[0], @b[2]
196	pxor	@b[6], @b[4]
197	pxor	@b[6], @b[0]
198	pxor	@b[4], @b[1]
199___
200}
201
202sub InvOutBasisChange {		# InBasisChange in reverse
203my @b=@_[2,5,7,3,6,1,0,4];
204$code.=<<___;
205	pxor	@b[5], @b[1]
206	pxor	@b[7], @b[2]
207
208	pxor	@b[1], @b[3]
209	pxor	@b[5], @b[4]
210	pxor	@b[5], @b[7]
211	pxor	@b[4], @b[3]
212	 pxor 	@b[0], @b[5]
213	pxor	@b[7], @b[3]
214	 pxor	@b[2], @b[6]
215	 pxor	@b[1], @b[2]
216	pxor	@b[3], @b[6]
217
218	pxor	@b[0], @b[3]
219	pxor	@b[6], @b[5]
220___
221}
222
223sub Mul_GF4 {
224#;*************************************************************
225#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
226#;*************************************************************
227my ($x0,$x1,$y0,$y1,$t0)=@_;
228$code.=<<___;
229	movdqa	$y0, $t0
230	pxor 	$y1, $t0
231	pand	$x0, $t0
232	pxor	$x1, $x0
233	pand	$y0, $x1
234	pand	$y1, $x0
235	pxor	$x1, $x0
236	pxor	$t0, $x1
237___
238}
239
240sub Mul_GF4_N {				# not used, see next subroutine
241# multiply and scale by N
242my ($x0,$x1,$y0,$y1,$t0)=@_;
243$code.=<<___;
244	movdqa	$y0, $t0
245	pxor	$y1, $t0
246	pand	$x0, $t0
247	pxor	$x1, $x0
248	pand	$y0, $x1
249	pand	$y1, $x0
250	pxor	$x0, $x1
251	pxor	$t0, $x0
252___
253}
254
255sub Mul_GF4_N_GF4 {
256# interleaved Mul_GF4_N and Mul_GF4
257my ($x0,$x1,$y0,$y1,$t0,
258    $x2,$x3,$y2,$y3,$t1)=@_;
259$code.=<<___;
260	movdqa	$y0, $t0
261	 movdqa	$y2, $t1
262	pxor	$y1, $t0
263	 pxor 	$y3, $t1
264	pand	$x0, $t0
265	 pand	$x2, $t1
266	pxor	$x1, $x0
267	 pxor	$x3, $x2
268	pand	$y0, $x1
269	 pand	$y2, $x3
270	pand	$y1, $x0
271	 pand	$y3, $x2
272	pxor	$x0, $x1
273	 pxor	$x3, $x2
274	pxor	$t0, $x0
275	 pxor	$t1, $x3
276___
277}
278sub Mul_GF16_2 {
279my @x=@_[0..7];
280my @y=@_[8..11];
281my @t=@_[12..15];
282$code.=<<___;
283	movdqa	@x[0], @t[0]
284	movdqa	@x[1], @t[1]
285___
286	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
287$code.=<<___;
288	pxor	@x[2], @t[0]
289	pxor	@x[3], @t[1]
290	pxor	@y[2], @y[0]
291	pxor	@y[3], @y[1]
292___
293	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
294			 @x[2], @x[3], @y[2], @y[3], @t[2]);
295$code.=<<___;
296	pxor	@t[0], @x[0]
297	pxor	@t[0], @x[2]
298	pxor	@t[1], @x[1]
299	pxor	@t[1], @x[3]
300
301	movdqa	@x[4], @t[0]
302	movdqa	@x[5], @t[1]
303	pxor	@x[6], @t[0]
304	pxor	@x[7], @t[1]
305___
306	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
307			 @x[6], @x[7], @y[2], @y[3], @t[2]);
308$code.=<<___;
309	pxor	@y[2], @y[0]
310	pxor	@y[3], @y[1]
311___
312	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
313$code.=<<___;
314	pxor	@t[0], @x[4]
315	pxor	@t[0], @x[6]
316	pxor	@t[1], @x[5]
317	pxor	@t[1], @x[7]
318___
319}
320sub Inv_GF256 {
321#;********************************************************************
322#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
323#;********************************************************************
324my @x=@_[0..7];
325my @t=@_[8..11];
326my @s=@_[12..15];
327# direct optimizations from hardware
328$code.=<<___;
329	movdqa	@x[4], @t[3]
330	movdqa	@x[5], @t[2]
331	movdqa	@x[1], @t[1]
332	movdqa	@x[7], @s[1]
333	movdqa	@x[0], @s[0]
334
335	pxor	@x[6], @t[3]
336	pxor	@x[7], @t[2]
337	pxor	@x[3], @t[1]
338	 movdqa	@t[3], @s[2]
339	pxor	@x[6], @s[1]
340	 movdqa	@t[2], @t[0]
341	pxor	@x[2], @s[0]
342	 movdqa	@t[3], @s[3]
343
344	por	@t[1], @t[2]
345	por	@s[0], @t[3]
346	pxor	@t[0], @s[3]
347	pand	@s[0], @s[2]
348	pxor	@t[1], @s[0]
349	pand	@t[1], @t[0]
350	pand	@s[0], @s[3]
351	movdqa	@x[3], @s[0]
352	pxor	@x[2], @s[0]
353	pand	@s[0], @s[1]
354	pxor	@s[1], @t[3]
355	pxor	@s[1], @t[2]
356	movdqa	@x[4], @s[1]
357	movdqa	@x[1], @s[0]
358	pxor	@x[5], @s[1]
359	pxor	@x[0], @s[0]
360	movdqa	@s[1], @t[1]
361	pand	@s[0], @s[1]
362	por	@s[0], @t[1]
363	pxor	@s[1], @t[0]
364	pxor	@s[3], @t[3]
365	pxor	@s[2], @t[2]
366	pxor	@s[3], @t[1]
367	movdqa	@x[7], @s[0]
368	pxor	@s[2], @t[0]
369	movdqa	@x[6], @s[1]
370	pxor	@s[2], @t[1]
371	movdqa	@x[5], @s[2]
372	pand	@x[3], @s[0]
373	movdqa	@x[4], @s[3]
374	pand	@x[2], @s[1]
375	pand	@x[1], @s[2]
376	por	@x[0], @s[3]
377	pxor	@s[0], @t[3]
378	pxor	@s[1], @t[2]
379	pxor	@s[2], @t[1]
380	pxor	@s[3], @t[0]
381
382	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
383
384	# new smaller inversion
385
386	movdqa	@t[3], @s[0]
387	pand	@t[1], @t[3]
388	pxor	@t[2], @s[0]
389
390	movdqa	@t[0], @s[2]
391	movdqa	@s[0], @s[3]
392	pxor	@t[3], @s[2]
393	pand	@s[2], @s[3]
394
395	movdqa	@t[1], @s[1]
396	pxor	@t[2], @s[3]
397	pxor	@t[0], @s[1]
398
399	pxor	@t[2], @t[3]
400
401	pand	@t[3], @s[1]
402
403	movdqa	@s[2], @t[2]
404	pxor	@t[0], @s[1]
405
406	pxor	@s[1], @t[2]
407	pxor	@s[1], @t[1]
408
409	pand	@t[0], @t[2]
410
411	pxor	@t[2], @s[2]
412	pxor	@t[2], @t[1]
413
414	pand	@s[3], @s[2]
415
416	pxor	@s[0], @s[2]
417___
418# output in s3, s2, s1, t1
419
420# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
421
422# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
423	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
424
425### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
426}
427
428# AES linear components
429
430sub ShiftRows {
431my @x=@_[0..7];
432my $mask=pop;
433$code.=<<___;
434	pxor	0x00($key),@x[0]
435	pxor	0x10($key),@x[1]
436	pshufb	$mask,@x[0]
437	pxor	0x20($key),@x[2]
438	pshufb	$mask,@x[1]
439	pxor	0x30($key),@x[3]
440	pshufb	$mask,@x[2]
441	pxor	0x40($key),@x[4]
442	pshufb	$mask,@x[3]
443	pxor	0x50($key),@x[5]
444	pshufb	$mask,@x[4]
445	pxor	0x60($key),@x[6]
446	pshufb	$mask,@x[5]
447	pxor	0x70($key),@x[7]
448	pshufb	$mask,@x[6]
449	lea	0x80($key),$key
450	pshufb	$mask,@x[7]
451___
452}
453
454sub MixColumns {
455# modified to emit output in order suitable for feeding back to aesenc[last]
456my @x=@_[0..7];
457my @t=@_[8..15];
458$code.=<<___;
459	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
460	pshufd	\$0x93, @x[1], @t[1]
461	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
462	pshufd	\$0x93, @x[2], @t[2]
463	 pxor	@t[1], @x[1]
464	pshufd	\$0x93, @x[3], @t[3]
465	 pxor	@t[2], @x[2]
466	pshufd	\$0x93, @x[4], @t[4]
467	 pxor	@t[3], @x[3]
468	pshufd	\$0x93, @x[5], @t[5]
469	 pxor	@t[4], @x[4]
470	pshufd	\$0x93, @x[6], @t[6]
471	 pxor	@t[5], @x[5]
472	pshufd	\$0x93, @x[7], @t[7]
473	 pxor	@t[6], @x[6]
474	 pxor	@t[7], @x[7]
475
476	pxor	@x[0], @t[1]
477	pxor	@x[7], @t[0]
478	pxor	@x[7], @t[1]
479	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
480	pxor	@x[1], @t[2]
481	 pshufd	\$0x4E, @x[1], @x[1]
482	pxor	@x[4], @t[5]
483	 pxor	@t[0], @x[0]
484	pxor	@x[5], @t[6]
485	 pxor	@t[1], @x[1]
486	pxor	@x[3], @t[4]
487	 pshufd	\$0x4E, @x[4], @t[0]
488	pxor	@x[6], @t[7]
489	 pshufd	\$0x4E, @x[5], @t[1]
490	pxor	@x[2], @t[3]
491	 pshufd	\$0x4E, @x[3], @x[4]
492	pxor	@x[7], @t[3]
493	 pshufd	\$0x4E, @x[7], @x[5]
494	pxor	@x[7], @t[4]
495	 pshufd	\$0x4E, @x[6], @x[3]
496	pxor	@t[4], @t[0]
497	 pshufd	\$0x4E, @x[2], @x[6]
498	pxor	@t[5], @t[1]
499
500	pxor	@t[3], @x[4]
501	pxor	@t[7], @x[5]
502	pxor	@t[6], @x[3]
503	 movdqa	@t[0], @x[2]
504	pxor	@t[2], @x[6]
505	 movdqa	@t[1], @x[7]
506___
507}
508
509sub InvMixColumns {
510my @x=@_[0..7];
511my @t=@_[8..15];
512
513$code.=<<___;
514	# multiplication by 0x0e
515	pshufd	\$0x93, @x[7], @t[7]
516	movdqa	@x[2], @t[2]
517	pxor	@x[5], @x[7]		# 7 5
518	pxor	@x[5], @x[2]		# 2 5
519	pshufd	\$0x93, @x[0], @t[0]
520	movdqa	@x[5], @t[5]
521	pxor	@x[0], @x[5]		# 5 0		[1]
522	pxor	@x[1], @x[0]		# 0 1
523	pshufd	\$0x93, @x[1], @t[1]
524	pxor	@x[2], @x[1]		# 1 25
525	pxor	@x[6], @x[0]		# 01 6		[2]
526	pxor	@x[3], @x[1]		# 125 3		[4]
527	pshufd	\$0x93, @x[3], @t[3]
528	pxor	@x[0], @x[2]		# 25 016	[3]
529	pxor	@x[7], @x[3]		# 3 75
530	pxor	@x[6], @x[7]		# 75 6		[0]
531	pshufd	\$0x93, @x[6], @t[6]
532	movdqa	@x[4], @t[4]
533	pxor	@x[4], @x[6]		# 6 4
534	pxor	@x[3], @x[4]		# 4 375		[6]
535	pxor	@x[7], @x[3]		# 375 756=36
536	pxor	@t[5], @x[6]		# 64 5		[7]
537	pxor	@t[2], @x[3]		# 36 2
538	pxor	@t[4], @x[3]		# 362 4		[5]
539	pshufd	\$0x93, @t[5], @t[5]
540___
541					my @y = @x[7,5,0,2,1,3,4,6];
542$code.=<<___;
543	# multiplication by 0x0b
544	pxor	@y[0], @y[1]
545	pxor	@t[0], @y[0]
546	pxor	@t[1], @y[1]
547	pshufd	\$0x93, @t[2], @t[2]
548	pxor	@t[5], @y[0]
549	pxor	@t[6], @y[1]
550	pxor	@t[7], @y[0]
551	pshufd	\$0x93, @t[4], @t[4]
552	pxor	@t[6], @t[7]		# clobber t[7]
553	pxor	@y[0], @y[1]
554
555	pxor	@t[0], @y[3]
556	pshufd	\$0x93, @t[0], @t[0]
557	pxor	@t[1], @y[2]
558	pxor	@t[1], @y[4]
559	pxor	@t[2], @y[2]
560	pshufd	\$0x93, @t[1], @t[1]
561	pxor	@t[2], @y[3]
562	pxor	@t[2], @y[5]
563	pxor	@t[7], @y[2]
564	pshufd	\$0x93, @t[2], @t[2]
565	pxor	@t[3], @y[3]
566	pxor	@t[3], @y[6]
567	pxor	@t[3], @y[4]
568	pshufd	\$0x93, @t[3], @t[3]
569	pxor	@t[4], @y[7]
570	pxor	@t[4], @y[5]
571	pxor	@t[7], @y[7]
572	pxor	@t[5], @y[3]
573	pxor	@t[4], @y[4]
574	pxor	@t[5], @t[7]		# clobber t[7] even more
575
576	pxor	@t[7], @y[5]
577	pshufd	\$0x93, @t[4], @t[4]
578	pxor	@t[7], @y[6]
579	pxor	@t[7], @y[4]
580
581	pxor	@t[5], @t[7]
582	pshufd	\$0x93, @t[5], @t[5]
583	pxor	@t[6], @t[7]		# restore t[7]
584
585	# multiplication by 0x0d
586	pxor	@y[7], @y[4]
587	pxor	@t[4], @y[7]
588	pshufd	\$0x93, @t[6], @t[6]
589	pxor	@t[0], @y[2]
590	pxor	@t[5], @y[7]
591	pxor	@t[2], @y[2]
592	pshufd	\$0x93, @t[7], @t[7]
593
594	pxor	@y[1], @y[3]
595	pxor	@t[1], @y[1]
596	pxor	@t[0], @y[0]
597	pxor	@t[0], @y[3]
598	pxor	@t[5], @y[1]
599	pxor	@t[5], @y[0]
600	pxor	@t[7], @y[1]
601	pshufd	\$0x93, @t[0], @t[0]
602	pxor	@t[6], @y[0]
603	pxor	@y[1], @y[3]
604	pxor	@t[1], @y[4]
605	pshufd	\$0x93, @t[1], @t[1]
606
607	pxor	@t[7], @y[7]
608	pxor	@t[2], @y[4]
609	pxor	@t[2], @y[5]
610	pshufd	\$0x93, @t[2], @t[2]
611	pxor	@t[6], @y[2]
612	pxor	@t[3], @t[6]		# clobber t[6]
613	pxor	@y[7], @y[4]
614	pxor	@t[6], @y[3]
615
616	pxor	@t[6], @y[6]
617	pxor	@t[5], @y[5]
618	pxor	@t[4], @y[6]
619	pshufd	\$0x93, @t[4], @t[4]
620	pxor	@t[6], @y[5]
621	pxor	@t[7], @y[6]
622	pxor	@t[3], @t[6]		# restore t[6]
623
624	pshufd	\$0x93, @t[5], @t[5]
625	pshufd	\$0x93, @t[6], @t[6]
626	pshufd	\$0x93, @t[7], @t[7]
627	pshufd	\$0x93, @t[3], @t[3]
628
629	# multiplication by 0x09
630	pxor	@y[1], @y[4]
631	pxor	@y[1], @t[1]		# t[1]=y[1]
632	pxor	@t[5], @t[0]		# clobber t[0]
633	pxor	@t[5], @t[1]
634	pxor	@t[0], @y[3]
635	pxor	@y[0], @t[0]		# t[0]=y[0]
636	pxor	@t[6], @t[1]
637	pxor	@t[7], @t[6]		# clobber t[6]
638	pxor	@t[1], @y[4]
639	pxor	@t[4], @y[7]
640	pxor	@y[4], @t[4]		# t[4]=y[4]
641	pxor	@t[3], @y[6]
642	pxor	@y[3], @t[3]		# t[3]=y[3]
643	pxor	@t[2], @y[5]
644	pxor	@y[2], @t[2]		# t[2]=y[2]
645	pxor	@t[7], @t[3]
646	pxor	@y[5], @t[5]		# t[5]=y[5]
647	pxor	@t[6], @t[2]
648	pxor	@t[6], @t[5]
649	pxor	@y[6], @t[6]		# t[6]=y[6]
650	pxor	@y[7], @t[7]		# t[7]=y[7]
651
652	movdqa	@t[0],@XMM[0]
653	movdqa	@t[1],@XMM[1]
654	movdqa	@t[2],@XMM[2]
655	movdqa	@t[3],@XMM[3]
656	movdqa	@t[4],@XMM[4]
657	movdqa	@t[5],@XMM[5]
658	movdqa	@t[6],@XMM[6]
659	movdqa	@t[7],@XMM[7]
660___
661}
662
663sub aesenc {				# not used
664my @b=@_[0..7];
665my @t=@_[8..15];
666$code.=<<___;
667	movdqa	0x30($const),@t[0]	# .LSR
668___
669	&ShiftRows	(@b,@t[0]);
670	&Sbox		(@b,@t);
671	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
672}
673
674sub aesenclast {			# not used
675my @b=@_[0..7];
676my @t=@_[8..15];
677$code.=<<___;
678	movdqa	0x40($const),@t[0]	# .LSRM0
679___
680	&ShiftRows	(@b,@t[0]);
681	&Sbox		(@b,@t);
682$code.=<<___
683	pxor	0x00($key),@b[0]
684	pxor	0x10($key),@b[1]
685	pxor	0x20($key),@b[4]
686	pxor	0x30($key),@b[6]
687	pxor	0x40($key),@b[3]
688	pxor	0x50($key),@b[7]
689	pxor	0x60($key),@b[2]
690	pxor	0x70($key),@b[5]
691___
692}
693
694sub swapmove {
695my ($a,$b,$n,$mask,$t)=@_;
696$code.=<<___;
697	movdqa	$b,$t
698	psrlq	\$$n,$b
699	pxor  	$a,$b
700	pand	$mask,$b
701	pxor	$b,$a
702	psllq	\$$n,$b
703	pxor	$t,$b
704___
705}
706sub swapmove2x {
707my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
708$code.=<<___;
709	movdqa	$b0,$t0
710	psrlq	\$$n,$b0
711	 movdqa	$b1,$t1
712	 psrlq	\$$n,$b1
713	pxor  	$a0,$b0
714	 pxor  	$a1,$b1
715	pand	$mask,$b0
716	 pand	$mask,$b1
717	pxor	$b0,$a0
718	psllq	\$$n,$b0
719	 pxor	$b1,$a1
720	 psllq	\$$n,$b1
721	pxor	$t0,$b0
722	 pxor	$t1,$b1
723___
724}
725
726sub bitslice {
727my @x=reverse(@_[0..7]);
728my ($t0,$t1,$t2,$t3)=@_[8..11];
729$code.=<<___;
730	movdqa	0x00($const),$t0	# .LBS0
731	movdqa	0x10($const),$t1	# .LBS1
732___
733	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
734	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
735$code.=<<___;
736	movdqa	0x20($const),$t0	# .LBS2
737___
738	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
739	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
740
741	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
742	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
743}
744
745$code.=<<___;
746.text
747
748.extern	asm_AES_encrypt
749.extern	asm_AES_decrypt
750
751.type	_bsaes_encrypt8,\@abi-omnipotent
752.align	64
753_bsaes_encrypt8:
754	lea	.LBS0(%rip), $const	# constants table
755
756	movdqa	($key), @XMM[9]		# round 0 key
757	lea	0x10($key), $key
758	movdqa	0x50($const), @XMM[8]	# .LM0SR
759	pxor	@XMM[9], @XMM[0]	# xor with round0 key
760	pxor	@XMM[9], @XMM[1]
761	 pshufb	@XMM[8], @XMM[0]
762	pxor	@XMM[9], @XMM[2]
763	 pshufb	@XMM[8], @XMM[1]
764	pxor	@XMM[9], @XMM[3]
765	 pshufb	@XMM[8], @XMM[2]
766	pxor	@XMM[9], @XMM[4]
767	 pshufb	@XMM[8], @XMM[3]
768	pxor	@XMM[9], @XMM[5]
769	 pshufb	@XMM[8], @XMM[4]
770	pxor	@XMM[9], @XMM[6]
771	 pshufb	@XMM[8], @XMM[5]
772	pxor	@XMM[9], @XMM[7]
773	 pshufb	@XMM[8], @XMM[6]
774	 pshufb	@XMM[8], @XMM[7]
775_bsaes_encrypt8_bitslice:
776___
777	&bitslice	(@XMM[0..7, 8..11]);
778$code.=<<___;
779	dec	$rounds
780	jmp	.Lenc_sbox
781.align	16
782.Lenc_loop:
783___
784	&ShiftRows	(@XMM[0..7, 8]);
785$code.=".Lenc_sbox:\n";
786	&Sbox		(@XMM[0..7, 8..15]);
787$code.=<<___;
788	dec	$rounds
789	jl	.Lenc_done
790___
791	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
792$code.=<<___;
793	movdqa	0x30($const), @XMM[8]	# .LSR
794	jnz	.Lenc_loop
795	movdqa	0x40($const), @XMM[8]	# .LSRM0
796	jmp	.Lenc_loop
797.align	16
798.Lenc_done:
799___
800	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
801	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
802$code.=<<___;
803	movdqa	($key), @XMM[8]		# last round key
804	pxor	@XMM[8], @XMM[4]
805	pxor	@XMM[8], @XMM[6]
806	pxor	@XMM[8], @XMM[3]
807	pxor	@XMM[8], @XMM[7]
808	pxor	@XMM[8], @XMM[2]
809	pxor	@XMM[8], @XMM[5]
810	pxor	@XMM[8], @XMM[0]
811	pxor	@XMM[8], @XMM[1]
812	ret
813.size	_bsaes_encrypt8,.-_bsaes_encrypt8
814
815.type	_bsaes_decrypt8,\@abi-omnipotent
816.align	64
817_bsaes_decrypt8:
818	lea	.LBS0(%rip), $const	# constants table
819
820	movdqa	($key), @XMM[9]		# round 0 key
821	lea	0x10($key), $key
822	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
823	pxor	@XMM[9], @XMM[0]	# xor with round0 key
824	pxor	@XMM[9], @XMM[1]
825	 pshufb	@XMM[8], @XMM[0]
826	pxor	@XMM[9], @XMM[2]
827	 pshufb	@XMM[8], @XMM[1]
828	pxor	@XMM[9], @XMM[3]
829	 pshufb	@XMM[8], @XMM[2]
830	pxor	@XMM[9], @XMM[4]
831	 pshufb	@XMM[8], @XMM[3]
832	pxor	@XMM[9], @XMM[5]
833	 pshufb	@XMM[8], @XMM[4]
834	pxor	@XMM[9], @XMM[6]
835	 pshufb	@XMM[8], @XMM[5]
836	pxor	@XMM[9], @XMM[7]
837	 pshufb	@XMM[8], @XMM[6]
838	 pshufb	@XMM[8], @XMM[7]
839___
840	&bitslice	(@XMM[0..7, 8..11]);
841$code.=<<___;
842	dec	$rounds
843	jmp	.Ldec_sbox
844.align	16
845.Ldec_loop:
846___
847	&ShiftRows	(@XMM[0..7, 8]);
848$code.=".Ldec_sbox:\n";
849	&InvSbox	(@XMM[0..7, 8..15]);
850$code.=<<___;
851	dec	$rounds
852	jl	.Ldec_done
853___
854	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
855$code.=<<___;
856	movdqa	-0x10($const), @XMM[8]	# .LISR
857	jnz	.Ldec_loop
858	movdqa	-0x20($const), @XMM[8]	# .LISRM0
859	jmp	.Ldec_loop
860.align	16
861.Ldec_done:
862___
863	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
864$code.=<<___;
865	movdqa	($key), @XMM[8]		# last round key
866	pxor	@XMM[8], @XMM[6]
867	pxor	@XMM[8], @XMM[4]
868	pxor	@XMM[8], @XMM[2]
869	pxor	@XMM[8], @XMM[7]
870	pxor	@XMM[8], @XMM[3]
871	pxor	@XMM[8], @XMM[5]
872	pxor	@XMM[8], @XMM[0]
873	pxor	@XMM[8], @XMM[1]
874	ret
875.size	_bsaes_decrypt8,.-_bsaes_decrypt8
876___
877}
878{
879my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
880
881sub bitslice_key {
882my @x=reverse(@_[0..7]);
883my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
884
885	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
886$code.=<<___;
887	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
888	movdqa	@x[0], @x[2]
889	movdqa	@x[1], @x[3]
890___
891	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
892
893	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
894$code.=<<___;
895	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
896	movdqa	@x[0], @x[4]
897	movdqa	@x[2], @x[6]
898	movdqa	@x[1], @x[5]
899	movdqa	@x[3], @x[7]
900___
901	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
902	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
903}
904
905$code.=<<___;
906.type	_bsaes_key_convert,\@abi-omnipotent
907.align	16
908_bsaes_key_convert:
909	lea	.Lmasks(%rip), $const
910	movdqu	($inp), %xmm7		# load round 0 key
911	lea	0x10($inp), $inp
912	movdqa	0x00($const), %xmm0	# 0x01...
913	movdqa	0x10($const), %xmm1	# 0x02...
914	movdqa	0x20($const), %xmm2	# 0x04...
915	movdqa	0x30($const), %xmm3	# 0x08...
916	movdqa	0x40($const), %xmm4	# .LM0
917	pcmpeqd	%xmm5, %xmm5		# .LNOT
918
919	movdqu	($inp), %xmm6		# load round 1 key
920	movdqa	%xmm7, ($out)		# save round 0 key
921	lea	0x10($out), $out
922	dec	$rounds
923	jmp	.Lkey_loop
924.align	16
925.Lkey_loop:
926	pshufb	%xmm4, %xmm6		# .LM0
927
928	movdqa	%xmm0,	%xmm8
929	movdqa	%xmm1,	%xmm9
930
931	pand	%xmm6,	%xmm8
932	pand	%xmm6,	%xmm9
933	movdqa	%xmm2,	%xmm10
934	pcmpeqb	%xmm0,	%xmm8
935	psllq	\$4,	%xmm0		# 0x10...
936	movdqa	%xmm3,	%xmm11
937	pcmpeqb	%xmm1,	%xmm9
938	psllq	\$4,	%xmm1		# 0x20...
939
940	pand	%xmm6,	%xmm10
941	pand	%xmm6,	%xmm11
942	movdqa	%xmm0,	%xmm12
943	pcmpeqb	%xmm2,	%xmm10
944	psllq	\$4,	%xmm2		# 0x40...
945	movdqa	%xmm1,	%xmm13
946	pcmpeqb	%xmm3,	%xmm11
947	psllq	\$4,	%xmm3		# 0x80...
948
949	movdqa	%xmm2,	%xmm14
950	movdqa	%xmm3,	%xmm15
951	 pxor	%xmm5,	%xmm8		# "pnot"
952	 pxor	%xmm5,	%xmm9
953
954	pand	%xmm6,	%xmm12
955	pand	%xmm6,	%xmm13
956	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
957	pcmpeqb	%xmm0,	%xmm12
958	psrlq	\$4,	%xmm0		# 0x01...
959	 movdqa	%xmm9, 0x10($out)
960	pcmpeqb	%xmm1,	%xmm13
961	psrlq	\$4,	%xmm1		# 0x02...
962	 lea	0x10($inp), $inp
963
964	pand	%xmm6,	%xmm14
965	pand	%xmm6,	%xmm15
966	 movdqa	%xmm10, 0x20($out)
967	pcmpeqb	%xmm2,	%xmm14
968	psrlq	\$4,	%xmm2		# 0x04...
969	 movdqa	%xmm11, 0x30($out)
970	pcmpeqb	%xmm3,	%xmm15
971	psrlq	\$4,	%xmm3		# 0x08...
972	 movdqu	($inp), %xmm6		# load next round key
973
974	pxor	%xmm5, %xmm13		# "pnot"
975	pxor	%xmm5, %xmm14
976	movdqa	%xmm12, 0x40($out)
977	movdqa	%xmm13, 0x50($out)
978	movdqa	%xmm14, 0x60($out)
979	movdqa	%xmm15, 0x70($out)
980	lea	0x80($out),$out
981	dec	$rounds
982	jnz	.Lkey_loop
983
984	movdqa	0x50($const), %xmm7	# .L63
985	#movdqa	%xmm6, ($out)		# don't save last round key
986	ret
987.size	_bsaes_key_convert,.-_bsaes_key_convert
988___
989}
990
991if (0 && !$win64) {	# following four functions are unsupported interface
992			# used for benchmarking...
993$code.=<<___;
994.globl	bsaes_enc_key_convert
995.type	bsaes_enc_key_convert,\@function,2
996.align	16
997bsaes_enc_key_convert:
998	mov	240($inp),%r10d		# pass rounds
999	mov	$inp,%rcx		# pass key
1000	mov	$out,%rax		# pass key schedule
1001	call	_bsaes_key_convert
1002	pxor	%xmm6,%xmm7		# fix up last round key
1003	movdqa	%xmm7,(%rax)		# save last round key
1004	ret
1005.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
1006
1007.globl	bsaes_encrypt_128
1008.type	bsaes_encrypt_128,\@function,4
1009.align	16
1010bsaes_encrypt_128:
1011.Lenc128_loop:
1012	movdqu	0x00($inp), @XMM[0]	# load input
1013	movdqu	0x10($inp), @XMM[1]
1014	movdqu	0x20($inp), @XMM[2]
1015	movdqu	0x30($inp), @XMM[3]
1016	movdqu	0x40($inp), @XMM[4]
1017	movdqu	0x50($inp), @XMM[5]
1018	movdqu	0x60($inp), @XMM[6]
1019	movdqu	0x70($inp), @XMM[7]
1020	mov	$key, %rax		# pass the $key
1021	lea	0x80($inp), $inp
1022	mov	\$10,%r10d
1023
1024	call	_bsaes_encrypt8
1025
1026	movdqu	@XMM[0], 0x00($out)	# write output
1027	movdqu	@XMM[1], 0x10($out)
1028	movdqu	@XMM[4], 0x20($out)
1029	movdqu	@XMM[6], 0x30($out)
1030	movdqu	@XMM[3], 0x40($out)
1031	movdqu	@XMM[7], 0x50($out)
1032	movdqu	@XMM[2], 0x60($out)
1033	movdqu	@XMM[5], 0x70($out)
1034	lea	0x80($out), $out
1035	sub	\$0x80,$len
1036	ja	.Lenc128_loop
1037	ret
1038.size	bsaes_encrypt_128,.-bsaes_encrypt_128
1039
1040.globl	bsaes_dec_key_convert
1041.type	bsaes_dec_key_convert,\@function,2
1042.align	16
1043bsaes_dec_key_convert:
1044	mov	240($inp),%r10d		# pass rounds
1045	mov	$inp,%rcx		# pass key
1046	mov	$out,%rax		# pass key schedule
1047	call	_bsaes_key_convert
1048	pxor	($out),%xmm7		# fix up round 0 key
1049	movdqa	%xmm6,(%rax)		# save last round key
1050	movdqa	%xmm7,($out)
1051	ret
1052.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
1053
1054.globl	bsaes_decrypt_128
1055.type	bsaes_decrypt_128,\@function,4
1056.align	16
1057bsaes_decrypt_128:
1058.Ldec128_loop:
1059	movdqu	0x00($inp), @XMM[0]	# load input
1060	movdqu	0x10($inp), @XMM[1]
1061	movdqu	0x20($inp), @XMM[2]
1062	movdqu	0x30($inp), @XMM[3]
1063	movdqu	0x40($inp), @XMM[4]
1064	movdqu	0x50($inp), @XMM[5]
1065	movdqu	0x60($inp), @XMM[6]
1066	movdqu	0x70($inp), @XMM[7]
1067	mov	$key, %rax		# pass the $key
1068	lea	0x80($inp), $inp
1069	mov	\$10,%r10d
1070
1071	call	_bsaes_decrypt8
1072
1073	movdqu	@XMM[0], 0x00($out)	# write output
1074	movdqu	@XMM[1], 0x10($out)
1075	movdqu	@XMM[6], 0x20($out)
1076	movdqu	@XMM[4], 0x30($out)
1077	movdqu	@XMM[2], 0x40($out)
1078	movdqu	@XMM[7], 0x50($out)
1079	movdqu	@XMM[3], 0x60($out)
1080	movdqu	@XMM[5], 0x70($out)
1081	lea	0x80($out), $out
1082	sub	\$0x80,$len
1083	ja	.Ldec128_loop
1084	ret
1085.size	bsaes_decrypt_128,.-bsaes_decrypt_128
1086___
1087}
1088{
1089######################################################################
1090#
1091# OpenSSL interface
1092#
1093my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1094						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1095my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1096
1097if ($ecb) {
1098$code.=<<___;
1099.globl	bsaes_ecb_encrypt_blocks
1100.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1101.align	16
1102bsaes_ecb_encrypt_blocks:
1103	mov	%rsp, %rax
1104.Lecb_enc_prologue:
1105	push	%rbp
1106	push	%rbx
1107	push	%r12
1108	push	%r13
1109	push	%r14
1110	push	%r15
1111	lea	-0x48(%rsp),%rsp
1112___
1113$code.=<<___ if ($win64);
1114	lea	-0xa0(%rsp), %rsp
1115	movaps	%xmm6, 0x40(%rsp)
1116	movaps	%xmm7, 0x50(%rsp)
1117	movaps	%xmm8, 0x60(%rsp)
1118	movaps	%xmm9, 0x70(%rsp)
1119	movaps	%xmm10, 0x80(%rsp)
1120	movaps	%xmm11, 0x90(%rsp)
1121	movaps	%xmm12, 0xa0(%rsp)
1122	movaps	%xmm13, 0xb0(%rsp)
1123	movaps	%xmm14, 0xc0(%rsp)
1124	movaps	%xmm15, 0xd0(%rsp)
1125.Lecb_enc_body:
1126___
1127$code.=<<___;
1128	mov	%rsp,%rbp		# backup %rsp
1129	mov	240($arg4),%eax		# rounds
1130	mov	$arg1,$inp		# backup arguments
1131	mov	$arg2,$out
1132	mov	$arg3,$len
1133	mov	$arg4,$key
1134	cmp	\$8,$arg3
1135	jb	.Lecb_enc_short
1136
1137	mov	%eax,%ebx		# backup rounds
1138	shl	\$7,%rax		# 128 bytes per inner round key
1139	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1140	sub	%rax,%rsp
1141	mov	%rsp,%rax		# pass key schedule
1142	mov	$key,%rcx		# pass key
1143	mov	%ebx,%r10d		# pass rounds
1144	call	_bsaes_key_convert
1145	pxor	%xmm6,%xmm7		# fix up last round key
1146	movdqa	%xmm7,(%rax)		# save last round key
1147
1148	sub	\$8,$len
1149.Lecb_enc_loop:
1150	movdqu	0x00($inp), @XMM[0]	# load input
1151	movdqu	0x10($inp), @XMM[1]
1152	movdqu	0x20($inp), @XMM[2]
1153	movdqu	0x30($inp), @XMM[3]
1154	movdqu	0x40($inp), @XMM[4]
1155	movdqu	0x50($inp), @XMM[5]
1156	mov	%rsp, %rax		# pass key schedule
1157	movdqu	0x60($inp), @XMM[6]
1158	mov	%ebx,%r10d		# pass rounds
1159	movdqu	0x70($inp), @XMM[7]
1160	lea	0x80($inp), $inp
1161
1162	call	_bsaes_encrypt8
1163
1164	movdqu	@XMM[0], 0x00($out)	# write output
1165	movdqu	@XMM[1], 0x10($out)
1166	movdqu	@XMM[4], 0x20($out)
1167	movdqu	@XMM[6], 0x30($out)
1168	movdqu	@XMM[3], 0x40($out)
1169	movdqu	@XMM[7], 0x50($out)
1170	movdqu	@XMM[2], 0x60($out)
1171	movdqu	@XMM[5], 0x70($out)
1172	lea	0x80($out), $out
1173	sub	\$8,$len
1174	jnc	.Lecb_enc_loop
1175
1176	add	\$8,$len
1177	jz	.Lecb_enc_done
1178
1179	movdqu	0x00($inp), @XMM[0]	# load input
1180	mov	%rsp, %rax		# pass key schedule
1181	mov	%ebx,%r10d		# pass rounds
1182	cmp	\$2,$len
1183	jb	.Lecb_enc_one
1184	movdqu	0x10($inp), @XMM[1]
1185	je	.Lecb_enc_two
1186	movdqu	0x20($inp), @XMM[2]
1187	cmp	\$4,$len
1188	jb	.Lecb_enc_three
1189	movdqu	0x30($inp), @XMM[3]
1190	je	.Lecb_enc_four
1191	movdqu	0x40($inp), @XMM[4]
1192	cmp	\$6,$len
1193	jb	.Lecb_enc_five
1194	movdqu	0x50($inp), @XMM[5]
1195	je	.Lecb_enc_six
1196	movdqu	0x60($inp), @XMM[6]
1197	call	_bsaes_encrypt8
1198	movdqu	@XMM[0], 0x00($out)	# write output
1199	movdqu	@XMM[1], 0x10($out)
1200	movdqu	@XMM[4], 0x20($out)
1201	movdqu	@XMM[6], 0x30($out)
1202	movdqu	@XMM[3], 0x40($out)
1203	movdqu	@XMM[7], 0x50($out)
1204	movdqu	@XMM[2], 0x60($out)
1205	jmp	.Lecb_enc_done
1206.align	16
1207.Lecb_enc_six:
1208	call	_bsaes_encrypt8
1209	movdqu	@XMM[0], 0x00($out)	# write output
1210	movdqu	@XMM[1], 0x10($out)
1211	movdqu	@XMM[4], 0x20($out)
1212	movdqu	@XMM[6], 0x30($out)
1213	movdqu	@XMM[3], 0x40($out)
1214	movdqu	@XMM[7], 0x50($out)
1215	jmp	.Lecb_enc_done
1216.align	16
1217.Lecb_enc_five:
1218	call	_bsaes_encrypt8
1219	movdqu	@XMM[0], 0x00($out)	# write output
1220	movdqu	@XMM[1], 0x10($out)
1221	movdqu	@XMM[4], 0x20($out)
1222	movdqu	@XMM[6], 0x30($out)
1223	movdqu	@XMM[3], 0x40($out)
1224	jmp	.Lecb_enc_done
1225.align	16
1226.Lecb_enc_four:
1227	call	_bsaes_encrypt8
1228	movdqu	@XMM[0], 0x00($out)	# write output
1229	movdqu	@XMM[1], 0x10($out)
1230	movdqu	@XMM[4], 0x20($out)
1231	movdqu	@XMM[6], 0x30($out)
1232	jmp	.Lecb_enc_done
1233.align	16
1234.Lecb_enc_three:
1235	call	_bsaes_encrypt8
1236	movdqu	@XMM[0], 0x00($out)	# write output
1237	movdqu	@XMM[1], 0x10($out)
1238	movdqu	@XMM[4], 0x20($out)
1239	jmp	.Lecb_enc_done
1240.align	16
1241.Lecb_enc_two:
1242	call	_bsaes_encrypt8
1243	movdqu	@XMM[0], 0x00($out)	# write output
1244	movdqu	@XMM[1], 0x10($out)
1245	jmp	.Lecb_enc_done
1246.align	16
1247.Lecb_enc_one:
1248	call	_bsaes_encrypt8
1249	movdqu	@XMM[0], 0x00($out)	# write output
1250	jmp	.Lecb_enc_done
1251.align	16
1252.Lecb_enc_short:
1253	lea	($inp), $arg1
1254	lea	($out), $arg2
1255	lea	($key), $arg3
1256	call	asm_AES_encrypt
1257	lea	16($inp), $inp
1258	lea	16($out), $out
1259	dec	$len
1260	jnz	.Lecb_enc_short
1261
1262.Lecb_enc_done:
1263	lea	(%rsp),%rax
1264	pxor	%xmm0, %xmm0
1265.Lecb_enc_bzero:			# wipe key schedule [if any]
1266	movdqa	%xmm0, 0x00(%rax)
1267	movdqa	%xmm0, 0x10(%rax)
1268	lea	0x20(%rax), %rax
1269	cmp	%rax, %rbp
1270	jb	.Lecb_enc_bzero
1271
1272	lea	(%rbp),%rsp		# restore %rsp
1273___
1274$code.=<<___ if ($win64);
1275	movaps	0x40(%rbp), %xmm6
1276	movaps	0x50(%rbp), %xmm7
1277	movaps	0x60(%rbp), %xmm8
1278	movaps	0x70(%rbp), %xmm9
1279	movaps	0x80(%rbp), %xmm10
1280	movaps	0x90(%rbp), %xmm11
1281	movaps	0xa0(%rbp), %xmm12
1282	movaps	0xb0(%rbp), %xmm13
1283	movaps	0xc0(%rbp), %xmm14
1284	movaps	0xd0(%rbp), %xmm15
1285	lea	0xa0(%rbp), %rsp
1286___
1287$code.=<<___;
1288	mov	0x48(%rsp), %r15
1289	mov	0x50(%rsp), %r14
1290	mov	0x58(%rsp), %r13
1291	mov	0x60(%rsp), %r12
1292	mov	0x68(%rsp), %rbx
1293	mov	0x70(%rsp), %rax
1294	lea	0x78(%rsp), %rsp
1295	mov	%rax, %rbp
1296.Lecb_enc_epilogue:
1297	ret
1298.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1299
1300.globl	bsaes_ecb_decrypt_blocks
1301.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1302.align	16
1303bsaes_ecb_decrypt_blocks:
1304	mov	%rsp, %rax
1305.Lecb_dec_prologue:
1306	push	%rbp
1307	push	%rbx
1308	push	%r12
1309	push	%r13
1310	push	%r14
1311	push	%r15
1312	lea	-0x48(%rsp),%rsp
1313___
1314$code.=<<___ if ($win64);
1315	lea	-0xa0(%rsp), %rsp
1316	movaps	%xmm6, 0x40(%rsp)
1317	movaps	%xmm7, 0x50(%rsp)
1318	movaps	%xmm8, 0x60(%rsp)
1319	movaps	%xmm9, 0x70(%rsp)
1320	movaps	%xmm10, 0x80(%rsp)
1321	movaps	%xmm11, 0x90(%rsp)
1322	movaps	%xmm12, 0xa0(%rsp)
1323	movaps	%xmm13, 0xb0(%rsp)
1324	movaps	%xmm14, 0xc0(%rsp)
1325	movaps	%xmm15, 0xd0(%rsp)
1326.Lecb_dec_body:
1327___
1328$code.=<<___;
1329	mov	%rsp,%rbp		# backup %rsp
1330	mov	240($arg4),%eax		# rounds
1331	mov	$arg1,$inp		# backup arguments
1332	mov	$arg2,$out
1333	mov	$arg3,$len
1334	mov	$arg4,$key
1335	cmp	\$8,$arg3
1336	jb	.Lecb_dec_short
1337
1338	mov	%eax,%ebx		# backup rounds
1339	shl	\$7,%rax		# 128 bytes per inner round key
1340	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
1341	sub	%rax,%rsp
1342	mov	%rsp,%rax		# pass key schedule
1343	mov	$key,%rcx		# pass key
1344	mov	%ebx,%r10d		# pass rounds
1345	call	_bsaes_key_convert
1346	pxor	(%rsp),%xmm7		# fix up 0 round key
1347	movdqa	%xmm6,(%rax)		# save last round key
1348	movdqa	%xmm7,(%rsp)
1349
1350	sub	\$8,$len
1351.Lecb_dec_loop:
1352	movdqu	0x00($inp), @XMM[0]	# load input
1353	movdqu	0x10($inp), @XMM[1]
1354	movdqu	0x20($inp), @XMM[2]
1355	movdqu	0x30($inp), @XMM[3]
1356	movdqu	0x40($inp), @XMM[4]
1357	movdqu	0x50($inp), @XMM[5]
1358	mov	%rsp, %rax		# pass key schedule
1359	movdqu	0x60($inp), @XMM[6]
1360	mov	%ebx,%r10d		# pass rounds
1361	movdqu	0x70($inp), @XMM[7]
1362	lea	0x80($inp), $inp
1363
1364	call	_bsaes_decrypt8
1365
1366	movdqu	@XMM[0], 0x00($out)	# write output
1367	movdqu	@XMM[1], 0x10($out)
1368	movdqu	@XMM[6], 0x20($out)
1369	movdqu	@XMM[4], 0x30($out)
1370	movdqu	@XMM[2], 0x40($out)
1371	movdqu	@XMM[7], 0x50($out)
1372	movdqu	@XMM[3], 0x60($out)
1373	movdqu	@XMM[5], 0x70($out)
1374	lea	0x80($out), $out
1375	sub	\$8,$len
1376	jnc	.Lecb_dec_loop
1377
1378	add	\$8,$len
1379	jz	.Lecb_dec_done
1380
1381	movdqu	0x00($inp), @XMM[0]	# load input
1382	mov	%rsp, %rax		# pass key schedule
1383	mov	%ebx,%r10d		# pass rounds
1384	cmp	\$2,$len
1385	jb	.Lecb_dec_one
1386	movdqu	0x10($inp), @XMM[1]
1387	je	.Lecb_dec_two
1388	movdqu	0x20($inp), @XMM[2]
1389	cmp	\$4,$len
1390	jb	.Lecb_dec_three
1391	movdqu	0x30($inp), @XMM[3]
1392	je	.Lecb_dec_four
1393	movdqu	0x40($inp), @XMM[4]
1394	cmp	\$6,$len
1395	jb	.Lecb_dec_five
1396	movdqu	0x50($inp), @XMM[5]
1397	je	.Lecb_dec_six
1398	movdqu	0x60($inp), @XMM[6]
1399	call	_bsaes_decrypt8
1400	movdqu	@XMM[0], 0x00($out)	# write output
1401	movdqu	@XMM[1], 0x10($out)
1402	movdqu	@XMM[6], 0x20($out)
1403	movdqu	@XMM[4], 0x30($out)
1404	movdqu	@XMM[2], 0x40($out)
1405	movdqu	@XMM[7], 0x50($out)
1406	movdqu	@XMM[3], 0x60($out)
1407	jmp	.Lecb_dec_done
1408.align	16
1409.Lecb_dec_six:
1410	call	_bsaes_decrypt8
1411	movdqu	@XMM[0], 0x00($out)	# write output
1412	movdqu	@XMM[1], 0x10($out)
1413	movdqu	@XMM[6], 0x20($out)
1414	movdqu	@XMM[4], 0x30($out)
1415	movdqu	@XMM[2], 0x40($out)
1416	movdqu	@XMM[7], 0x50($out)
1417	jmp	.Lecb_dec_done
1418.align	16
1419.Lecb_dec_five:
1420	call	_bsaes_decrypt8
1421	movdqu	@XMM[0], 0x00($out)	# write output
1422	movdqu	@XMM[1], 0x10($out)
1423	movdqu	@XMM[6], 0x20($out)
1424	movdqu	@XMM[4], 0x30($out)
1425	movdqu	@XMM[2], 0x40($out)
1426	jmp	.Lecb_dec_done
1427.align	16
1428.Lecb_dec_four:
1429	call	_bsaes_decrypt8
1430	movdqu	@XMM[0], 0x00($out)	# write output
1431	movdqu	@XMM[1], 0x10($out)
1432	movdqu	@XMM[6], 0x20($out)
1433	movdqu	@XMM[4], 0x30($out)
1434	jmp	.Lecb_dec_done
1435.align	16
1436.Lecb_dec_three:
1437	call	_bsaes_decrypt8
1438	movdqu	@XMM[0], 0x00($out)	# write output
1439	movdqu	@XMM[1], 0x10($out)
1440	movdqu	@XMM[6], 0x20($out)
1441	jmp	.Lecb_dec_done
1442.align	16
1443.Lecb_dec_two:
1444	call	_bsaes_decrypt8
1445	movdqu	@XMM[0], 0x00($out)	# write output
1446	movdqu	@XMM[1], 0x10($out)
1447	jmp	.Lecb_dec_done
1448.align	16
1449.Lecb_dec_one:
1450	call	_bsaes_decrypt8
1451	movdqu	@XMM[0], 0x00($out)	# write output
1452	jmp	.Lecb_dec_done
1453.align	16
1454.Lecb_dec_short:
1455	lea	($inp), $arg1
1456	lea	($out), $arg2
1457	lea	($key), $arg3
1458	call	asm_AES_decrypt
1459	lea	16($inp), $inp
1460	lea	16($out), $out
1461	dec	$len
1462	jnz	.Lecb_dec_short
1463
1464.Lecb_dec_done:
1465	lea	(%rsp),%rax
1466	pxor	%xmm0, %xmm0
1467.Lecb_dec_bzero:			# wipe key schedule [if any]
1468	movdqa	%xmm0, 0x00(%rax)
1469	movdqa	%xmm0, 0x10(%rax)
1470	lea	0x20(%rax), %rax
1471	cmp	%rax, %rbp
1472	jb	.Lecb_dec_bzero
1473
1474	lea	(%rbp),%rsp		# restore %rsp
1475___
1476$code.=<<___ if ($win64);
1477	movaps	0x40(%rbp), %xmm6
1478	movaps	0x50(%rbp), %xmm7
1479	movaps	0x60(%rbp), %xmm8
1480	movaps	0x70(%rbp), %xmm9
1481	movaps	0x80(%rbp), %xmm10
1482	movaps	0x90(%rbp), %xmm11
1483	movaps	0xa0(%rbp), %xmm12
1484	movaps	0xb0(%rbp), %xmm13
1485	movaps	0xc0(%rbp), %xmm14
1486	movaps	0xd0(%rbp), %xmm15
1487	lea	0xa0(%rbp), %rsp
1488___
1489$code.=<<___;
1490	mov	0x48(%rsp), %r15
1491	mov	0x50(%rsp), %r14
1492	mov	0x58(%rsp), %r13
1493	mov	0x60(%rsp), %r12
1494	mov	0x68(%rsp), %rbx
1495	mov	0x70(%rsp), %rax
1496	lea	0x78(%rsp), %rsp
1497	mov	%rax, %rbp
1498.Lecb_dec_epilogue:
1499	ret
1500.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1501___
1502}
1503$code.=<<___;
1504.extern	asm_AES_cbc_encrypt
1505.globl	bsaes_cbc_encrypt
1506.type	bsaes_cbc_encrypt,\@abi-omnipotent
1507.align	16
1508bsaes_cbc_encrypt:
1509___
1510$code.=<<___ if ($win64);
1511	mov	48(%rsp),$arg6		# pull direction flag
1512___
1513$code.=<<___;
1514	cmp	\$0,$arg6
1515	jne	asm_AES_cbc_encrypt
1516	cmp	\$128,$arg3
1517	jb	asm_AES_cbc_encrypt
1518
1519	mov	%rsp, %rax
1520.Lcbc_dec_prologue:
1521	push	%rbp
1522	push	%rbx
1523	push	%r12
1524	push	%r13
1525	push	%r14
1526	push	%r15
1527	lea	-0x48(%rsp), %rsp
1528___
1529$code.=<<___ if ($win64);
1530	mov	0xa0(%rsp),$arg5	# pull ivp
1531	lea	-0xa0(%rsp), %rsp
1532	movaps	%xmm6, 0x40(%rsp)
1533	movaps	%xmm7, 0x50(%rsp)
1534	movaps	%xmm8, 0x60(%rsp)
1535	movaps	%xmm9, 0x70(%rsp)
1536	movaps	%xmm10, 0x80(%rsp)
1537	movaps	%xmm11, 0x90(%rsp)
1538	movaps	%xmm12, 0xa0(%rsp)
1539	movaps	%xmm13, 0xb0(%rsp)
1540	movaps	%xmm14, 0xc0(%rsp)
1541	movaps	%xmm15, 0xd0(%rsp)
1542.Lcbc_dec_body:
1543___
1544$code.=<<___;
1545	mov	%rsp, %rbp		# backup %rsp
1546	mov	240($arg4), %eax	# rounds
1547	mov	$arg1, $inp		# backup arguments
1548	mov	$arg2, $out
1549	mov	$arg3, $len
1550	mov	$arg4, $key
1551	mov	$arg5, %rbx
1552	shr	\$4, $len		# bytes to blocks
1553
1554	mov	%eax, %edx		# rounds
1555	shl	\$7, %rax		# 128 bytes per inner round key
1556	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1557	sub	%rax, %rsp
1558
1559	mov	%rsp, %rax		# pass key schedule
1560	mov	$key, %rcx		# pass key
1561	mov	%edx, %r10d		# pass rounds
1562	call	_bsaes_key_convert
1563	pxor	(%rsp),%xmm7		# fix up 0 round key
1564	movdqa	%xmm6,(%rax)		# save last round key
1565	movdqa	%xmm7,(%rsp)
1566
1567	movdqu	(%rbx), @XMM[15]	# load IV
1568	sub	\$8,$len
1569.Lcbc_dec_loop:
1570	movdqu	0x00($inp), @XMM[0]	# load input
1571	movdqu	0x10($inp), @XMM[1]
1572	movdqu	0x20($inp), @XMM[2]
1573	movdqu	0x30($inp), @XMM[3]
1574	movdqu	0x40($inp), @XMM[4]
1575	movdqu	0x50($inp), @XMM[5]
1576	mov	%rsp, %rax		# pass key schedule
1577	movdqu	0x60($inp), @XMM[6]
1578	mov	%edx,%r10d		# pass rounds
1579	movdqu	0x70($inp), @XMM[7]
1580	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1581
1582	call	_bsaes_decrypt8
1583
1584	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1585	movdqu	0x00($inp), @XMM[8]	# re-load input
1586	movdqu	0x10($inp), @XMM[9]
1587	pxor	@XMM[8], @XMM[1]
1588	movdqu	0x20($inp), @XMM[10]
1589	pxor	@XMM[9], @XMM[6]
1590	movdqu	0x30($inp), @XMM[11]
1591	pxor	@XMM[10], @XMM[4]
1592	movdqu	0x40($inp), @XMM[12]
1593	pxor	@XMM[11], @XMM[2]
1594	movdqu	0x50($inp), @XMM[13]
1595	pxor	@XMM[12], @XMM[7]
1596	movdqu	0x60($inp), @XMM[14]
1597	pxor	@XMM[13], @XMM[3]
1598	movdqu	0x70($inp), @XMM[15]	# IV
1599	pxor	@XMM[14], @XMM[5]
1600	movdqu	@XMM[0], 0x00($out)	# write output
1601	lea	0x80($inp), $inp
1602	movdqu	@XMM[1], 0x10($out)
1603	movdqu	@XMM[6], 0x20($out)
1604	movdqu	@XMM[4], 0x30($out)
1605	movdqu	@XMM[2], 0x40($out)
1606	movdqu	@XMM[7], 0x50($out)
1607	movdqu	@XMM[3], 0x60($out)
1608	movdqu	@XMM[5], 0x70($out)
1609	lea	0x80($out), $out
1610	sub	\$8,$len
1611	jnc	.Lcbc_dec_loop
1612
1613	add	\$8,$len
1614	jz	.Lcbc_dec_done
1615
1616	movdqu	0x00($inp), @XMM[0]	# load input
1617	mov	%rsp, %rax		# pass key schedule
1618	mov	%edx, %r10d		# pass rounds
1619	cmp	\$2,$len
1620	jb	.Lcbc_dec_one
1621	movdqu	0x10($inp), @XMM[1]
1622	je	.Lcbc_dec_two
1623	movdqu	0x20($inp), @XMM[2]
1624	cmp	\$4,$len
1625	jb	.Lcbc_dec_three
1626	movdqu	0x30($inp), @XMM[3]
1627	je	.Lcbc_dec_four
1628	movdqu	0x40($inp), @XMM[4]
1629	cmp	\$6,$len
1630	jb	.Lcbc_dec_five
1631	movdqu	0x50($inp), @XMM[5]
1632	je	.Lcbc_dec_six
1633	movdqu	0x60($inp), @XMM[6]
1634	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1635	call	_bsaes_decrypt8
1636	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1637	movdqu	0x00($inp), @XMM[8]	# re-load input
1638	movdqu	0x10($inp), @XMM[9]
1639	pxor	@XMM[8], @XMM[1]
1640	movdqu	0x20($inp), @XMM[10]
1641	pxor	@XMM[9], @XMM[6]
1642	movdqu	0x30($inp), @XMM[11]
1643	pxor	@XMM[10], @XMM[4]
1644	movdqu	0x40($inp), @XMM[12]
1645	pxor	@XMM[11], @XMM[2]
1646	movdqu	0x50($inp), @XMM[13]
1647	pxor	@XMM[12], @XMM[7]
1648	movdqu	0x60($inp), @XMM[15]	# IV
1649	pxor	@XMM[13], @XMM[3]
1650	movdqu	@XMM[0], 0x00($out)	# write output
1651	movdqu	@XMM[1], 0x10($out)
1652	movdqu	@XMM[6], 0x20($out)
1653	movdqu	@XMM[4], 0x30($out)
1654	movdqu	@XMM[2], 0x40($out)
1655	movdqu	@XMM[7], 0x50($out)
1656	movdqu	@XMM[3], 0x60($out)
1657	jmp	.Lcbc_dec_done
1658.align	16
1659.Lcbc_dec_six:
1660	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1661	call	_bsaes_decrypt8
1662	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1663	movdqu	0x00($inp), @XMM[8]	# re-load input
1664	movdqu	0x10($inp), @XMM[9]
1665	pxor	@XMM[8], @XMM[1]
1666	movdqu	0x20($inp), @XMM[10]
1667	pxor	@XMM[9], @XMM[6]
1668	movdqu	0x30($inp), @XMM[11]
1669	pxor	@XMM[10], @XMM[4]
1670	movdqu	0x40($inp), @XMM[12]
1671	pxor	@XMM[11], @XMM[2]
1672	movdqu	0x50($inp), @XMM[15]	# IV
1673	pxor	@XMM[12], @XMM[7]
1674	movdqu	@XMM[0], 0x00($out)	# write output
1675	movdqu	@XMM[1], 0x10($out)
1676	movdqu	@XMM[6], 0x20($out)
1677	movdqu	@XMM[4], 0x30($out)
1678	movdqu	@XMM[2], 0x40($out)
1679	movdqu	@XMM[7], 0x50($out)
1680	jmp	.Lcbc_dec_done
1681.align	16
1682.Lcbc_dec_five:
1683	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1684	call	_bsaes_decrypt8
1685	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1686	movdqu	0x00($inp), @XMM[8]	# re-load input
1687	movdqu	0x10($inp), @XMM[9]
1688	pxor	@XMM[8], @XMM[1]
1689	movdqu	0x20($inp), @XMM[10]
1690	pxor	@XMM[9], @XMM[6]
1691	movdqu	0x30($inp), @XMM[11]
1692	pxor	@XMM[10], @XMM[4]
1693	movdqu	0x40($inp), @XMM[15]	# IV
1694	pxor	@XMM[11], @XMM[2]
1695	movdqu	@XMM[0], 0x00($out)	# write output
1696	movdqu	@XMM[1], 0x10($out)
1697	movdqu	@XMM[6], 0x20($out)
1698	movdqu	@XMM[4], 0x30($out)
1699	movdqu	@XMM[2], 0x40($out)
1700	jmp	.Lcbc_dec_done
1701.align	16
1702.Lcbc_dec_four:
1703	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1704	call	_bsaes_decrypt8
1705	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1706	movdqu	0x00($inp), @XMM[8]	# re-load input
1707	movdqu	0x10($inp), @XMM[9]
1708	pxor	@XMM[8], @XMM[1]
1709	movdqu	0x20($inp), @XMM[10]
1710	pxor	@XMM[9], @XMM[6]
1711	movdqu	0x30($inp), @XMM[15]	# IV
1712	pxor	@XMM[10], @XMM[4]
1713	movdqu	@XMM[0], 0x00($out)	# write output
1714	movdqu	@XMM[1], 0x10($out)
1715	movdqu	@XMM[6], 0x20($out)
1716	movdqu	@XMM[4], 0x30($out)
1717	jmp	.Lcbc_dec_done
1718.align	16
1719.Lcbc_dec_three:
1720	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1721	call	_bsaes_decrypt8
1722	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1723	movdqu	0x00($inp), @XMM[8]	# re-load input
1724	movdqu	0x10($inp), @XMM[9]
1725	pxor	@XMM[8], @XMM[1]
1726	movdqu	0x20($inp), @XMM[15]	# IV
1727	pxor	@XMM[9], @XMM[6]
1728	movdqu	@XMM[0], 0x00($out)	# write output
1729	movdqu	@XMM[1], 0x10($out)
1730	movdqu	@XMM[6], 0x20($out)
1731	jmp	.Lcbc_dec_done
1732.align	16
1733.Lcbc_dec_two:
1734	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
1735	call	_bsaes_decrypt8
1736	pxor	0x20(%rbp), @XMM[0]	# ^= IV
1737	movdqu	0x00($inp), @XMM[8]	# re-load input
1738	movdqu	0x10($inp), @XMM[15]	# IV
1739	pxor	@XMM[8], @XMM[1]
1740	movdqu	@XMM[0], 0x00($out)	# write output
1741	movdqu	@XMM[1], 0x10($out)
1742	jmp	.Lcbc_dec_done
1743.align	16
1744.Lcbc_dec_one:
1745	lea	($inp), $arg1
1746	lea	0x20(%rbp), $arg2	# buffer output
1747	lea	($key), $arg3
1748	call	asm_AES_decrypt		# doesn't touch %xmm
1749	pxor	0x20(%rbp), @XMM[15]	# ^= IV
1750	movdqu	@XMM[15], ($out)	# write output
1751	movdqa	@XMM[0], @XMM[15]	# IV
1752
1753.Lcbc_dec_done:
1754	movdqu	@XMM[15], (%rbx)	# return IV
1755	lea	(%rsp), %rax
1756	pxor	%xmm0, %xmm0
1757.Lcbc_dec_bzero:			# wipe key schedule [if any]
1758	movdqa	%xmm0, 0x00(%rax)
1759	movdqa	%xmm0, 0x10(%rax)
1760	lea	0x20(%rax), %rax
1761	cmp	%rax, %rbp
1762	ja	.Lcbc_dec_bzero
1763
1764	lea	(%rbp),%rsp		# restore %rsp
1765___
1766$code.=<<___ if ($win64);
1767	movaps	0x40(%rbp), %xmm6
1768	movaps	0x50(%rbp), %xmm7
1769	movaps	0x60(%rbp), %xmm8
1770	movaps	0x70(%rbp), %xmm9
1771	movaps	0x80(%rbp), %xmm10
1772	movaps	0x90(%rbp), %xmm11
1773	movaps	0xa0(%rbp), %xmm12
1774	movaps	0xb0(%rbp), %xmm13
1775	movaps	0xc0(%rbp), %xmm14
1776	movaps	0xd0(%rbp), %xmm15
1777	lea	0xa0(%rbp), %rsp
1778___
1779$code.=<<___;
1780	mov	0x48(%rsp), %r15
1781	mov	0x50(%rsp), %r14
1782	mov	0x58(%rsp), %r13
1783	mov	0x60(%rsp), %r12
1784	mov	0x68(%rsp), %rbx
1785	mov	0x70(%rsp), %rax
1786	lea	0x78(%rsp), %rsp
1787	mov	%rax, %rbp
1788.Lcbc_dec_epilogue:
1789	ret
1790.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1791
1792.globl	bsaes_ctr32_encrypt_blocks
1793.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1794.align	16
1795bsaes_ctr32_encrypt_blocks:
1796	mov	%rsp, %rax
1797.Lctr_enc_prologue:
1798	push	%rbp
1799	push	%rbx
1800	push	%r12
1801	push	%r13
1802	push	%r14
1803	push	%r15
1804	lea	-0x48(%rsp), %rsp
1805___
1806$code.=<<___ if ($win64);
1807	mov	0xa0(%rsp),$arg5	# pull ivp
1808	lea	-0xa0(%rsp), %rsp
1809	movaps	%xmm6, 0x40(%rsp)
1810	movaps	%xmm7, 0x50(%rsp)
1811	movaps	%xmm8, 0x60(%rsp)
1812	movaps	%xmm9, 0x70(%rsp)
1813	movaps	%xmm10, 0x80(%rsp)
1814	movaps	%xmm11, 0x90(%rsp)
1815	movaps	%xmm12, 0xa0(%rsp)
1816	movaps	%xmm13, 0xb0(%rsp)
1817	movaps	%xmm14, 0xc0(%rsp)
1818	movaps	%xmm15, 0xd0(%rsp)
1819.Lctr_enc_body:
1820___
1821$code.=<<___;
1822	mov	%rsp, %rbp		# backup %rsp
1823	movdqu	($arg5), %xmm0		# load counter
1824	mov	240($arg4), %eax	# rounds
1825	mov	$arg1, $inp		# backup arguments
1826	mov	$arg2, $out
1827	mov	$arg3, $len
1828	mov	$arg4, $key
1829	movdqa	%xmm0, 0x20(%rbp)	# copy counter
1830	cmp	\$8, $arg3
1831	jb	.Lctr_enc_short
1832
1833	mov	%eax, %ebx		# rounds
1834	shl	\$7, %rax		# 128 bytes per inner round key
1835	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1836	sub	%rax, %rsp
1837
1838	mov	%rsp, %rax		# pass key schedule
1839	mov	$key, %rcx		# pass key
1840	mov	%ebx, %r10d		# pass rounds
1841	call	_bsaes_key_convert
1842	pxor	%xmm6,%xmm7		# fix up last round key
1843	movdqa	%xmm7,(%rax)		# save last round key
1844
1845	movdqa	(%rsp), @XMM[9]		# load round0 key
1846	lea	.LADD1(%rip), %r11
1847	movdqa	0x20(%rbp), @XMM[0]	# counter copy
1848	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
1849	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
1850	pshufb	@XMM[8], @XMM[0]
1851	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
1852	jmp	.Lctr_enc_loop
1853.align	16
1854.Lctr_enc_loop:
1855	movdqa	@XMM[0], 0x20(%rbp)	# save counter
1856	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
1857	movdqa	@XMM[0], @XMM[2]
1858	paddd	0x00(%r11), @XMM[1]	# .LADD1
1859	movdqa	@XMM[0], @XMM[3]
1860	paddd	0x10(%r11), @XMM[2]	# .LADD2
1861	movdqa	@XMM[0], @XMM[4]
1862	paddd	0x20(%r11), @XMM[3]	# .LADD3
1863	movdqa	@XMM[0], @XMM[5]
1864	paddd	0x30(%r11), @XMM[4]	# .LADD4
1865	movdqa	@XMM[0], @XMM[6]
1866	paddd	0x40(%r11), @XMM[5]	# .LADD5
1867	movdqa	@XMM[0], @XMM[7]
1868	paddd	0x50(%r11), @XMM[6]	# .LADD6
1869	paddd	0x60(%r11), @XMM[7]	# .LADD7
1870
1871	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
1872	# to flip byte order in 32-bit counter
1873	movdqa	(%rsp), @XMM[9]		# round 0 key
1874	lea	0x10(%rsp), %rax	# pass key schedule
1875	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
1876	pxor	@XMM[9], @XMM[0]	# xor with round0 key
1877	pxor	@XMM[9], @XMM[1]
1878	 pshufb	@XMM[8], @XMM[0]
1879	pxor	@XMM[9], @XMM[2]
1880	 pshufb	@XMM[8], @XMM[1]
1881	pxor	@XMM[9], @XMM[3]
1882	 pshufb	@XMM[8], @XMM[2]
1883	pxor	@XMM[9], @XMM[4]
1884	 pshufb	@XMM[8], @XMM[3]
1885	pxor	@XMM[9], @XMM[5]
1886	 pshufb	@XMM[8], @XMM[4]
1887	pxor	@XMM[9], @XMM[6]
1888	 pshufb	@XMM[8], @XMM[5]
1889	pxor	@XMM[9], @XMM[7]
1890	 pshufb	@XMM[8], @XMM[6]
1891	lea	.LBS0(%rip), %r11	# constants table
1892	 pshufb	@XMM[8], @XMM[7]
1893	mov	%ebx,%r10d		# pass rounds
1894
1895	call	_bsaes_encrypt8_bitslice
1896
1897	sub	\$8,$len
1898	jc	.Lctr_enc_loop_done
1899
1900	movdqu	0x00($inp), @XMM[8]	# load input
1901	movdqu	0x10($inp), @XMM[9]
1902	movdqu	0x20($inp), @XMM[10]
1903	movdqu	0x30($inp), @XMM[11]
1904	movdqu	0x40($inp), @XMM[12]
1905	movdqu	0x50($inp), @XMM[13]
1906	movdqu	0x60($inp), @XMM[14]
1907	movdqu	0x70($inp), @XMM[15]
1908	lea	0x80($inp),$inp
1909	pxor	@XMM[0], @XMM[8]
1910	movdqa	0x20(%rbp), @XMM[0]	# load counter
1911	pxor	@XMM[9], @XMM[1]
1912	movdqu	@XMM[8], 0x00($out)	# write output
1913	pxor	@XMM[10], @XMM[4]
1914	movdqu	@XMM[1], 0x10($out)
1915	pxor	@XMM[11], @XMM[6]
1916	movdqu	@XMM[4], 0x20($out)
1917	pxor	@XMM[12], @XMM[3]
1918	movdqu	@XMM[6], 0x30($out)
1919	pxor	@XMM[13], @XMM[7]
1920	movdqu	@XMM[3], 0x40($out)
1921	pxor	@XMM[14], @XMM[2]
1922	movdqu	@XMM[7], 0x50($out)
1923	pxor	@XMM[15], @XMM[5]
1924	movdqu	@XMM[2], 0x60($out)
1925	lea	.LADD1(%rip), %r11
1926	movdqu	@XMM[5], 0x70($out)
1927	lea	0x80($out), $out
1928	paddd	0x70(%r11), @XMM[0]	# .LADD8
1929	jnz	.Lctr_enc_loop
1930
1931	jmp	.Lctr_enc_done
1932.align	16
1933.Lctr_enc_loop_done:
1934	add	\$8, $len
1935	movdqu	0x00($inp), @XMM[8]	# load input
1936	pxor	@XMM[8], @XMM[0]
1937	movdqu	@XMM[0], 0x00($out)	# write output
1938	cmp	\$2,$len
1939	jb	.Lctr_enc_done
1940	movdqu	0x10($inp), @XMM[9]
1941	pxor	@XMM[9], @XMM[1]
1942	movdqu	@XMM[1], 0x10($out)
1943	je	.Lctr_enc_done
1944	movdqu	0x20($inp), @XMM[10]
1945	pxor	@XMM[10], @XMM[4]
1946	movdqu	@XMM[4], 0x20($out)
1947	cmp	\$4,$len
1948	jb	.Lctr_enc_done
1949	movdqu	0x30($inp), @XMM[11]
1950	pxor	@XMM[11], @XMM[6]
1951	movdqu	@XMM[6], 0x30($out)
1952	je	.Lctr_enc_done
1953	movdqu	0x40($inp), @XMM[12]
1954	pxor	@XMM[12], @XMM[3]
1955	movdqu	@XMM[3], 0x40($out)
1956	cmp	\$6,$len
1957	jb	.Lctr_enc_done
1958	movdqu	0x50($inp), @XMM[13]
1959	pxor	@XMM[13], @XMM[7]
1960	movdqu	@XMM[7], 0x50($out)
1961	je	.Lctr_enc_done
1962	movdqu	0x60($inp), @XMM[14]
1963	pxor	@XMM[14], @XMM[2]
1964	movdqu	@XMM[2], 0x60($out)
1965	jmp	.Lctr_enc_done
1966
1967.align	16
1968.Lctr_enc_short:
1969	lea	0x20(%rbp), $arg1
1970	lea	0x30(%rbp), $arg2
1971	lea	($key), $arg3
1972	call	asm_AES_encrypt
1973	movdqu	($inp), @XMM[1]
1974	lea	16($inp), $inp
1975	mov	0x2c(%rbp), %eax	# load 32-bit counter
1976	bswap	%eax
1977	pxor	0x30(%rbp), @XMM[1]
1978	inc	%eax			# increment
1979	movdqu	@XMM[1], ($out)
1980	bswap	%eax
1981	lea	16($out), $out
1982	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
1983	dec	$len
1984	jnz	.Lctr_enc_short
1985
1986.Lctr_enc_done:
1987	lea	(%rsp), %rax
1988	pxor	%xmm0, %xmm0
1989.Lctr_enc_bzero:			# wipe key schedule [if any]
1990	movdqa	%xmm0, 0x00(%rax)
1991	movdqa	%xmm0, 0x10(%rax)
1992	lea	0x20(%rax), %rax
1993	cmp	%rax, %rbp
1994	ja	.Lctr_enc_bzero
1995
1996	lea	(%rbp),%rsp		# restore %rsp
1997___
1998$code.=<<___ if ($win64);
1999	movaps	0x40(%rbp), %xmm6
2000	movaps	0x50(%rbp), %xmm7
2001	movaps	0x60(%rbp), %xmm8
2002	movaps	0x70(%rbp), %xmm9
2003	movaps	0x80(%rbp), %xmm10
2004	movaps	0x90(%rbp), %xmm11
2005	movaps	0xa0(%rbp), %xmm12
2006	movaps	0xb0(%rbp), %xmm13
2007	movaps	0xc0(%rbp), %xmm14
2008	movaps	0xd0(%rbp), %xmm15
2009	lea	0xa0(%rbp), %rsp
2010___
2011$code.=<<___;
2012	mov	0x48(%rsp), %r15
2013	mov	0x50(%rsp), %r14
2014	mov	0x58(%rsp), %r13
2015	mov	0x60(%rsp), %r12
2016	mov	0x68(%rsp), %rbx
2017	mov	0x70(%rsp), %rax
2018	lea	0x78(%rsp), %rsp
2019	mov	%rax, %rbp
2020.Lctr_enc_epilogue:
2021	ret
2022.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2023___
2024######################################################################
2025# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2026#	const AES_KEY *key1, const AES_KEY *key2,
2027#	const unsigned char iv[16]);
2028#
2029my ($twmask,$twres,$twtmp)=@XMM[13..15];
2030$code.=<<___;
2031.globl	bsaes_xts_encrypt
2032.type	bsaes_xts_encrypt,\@abi-omnipotent
2033.align	16
2034bsaes_xts_encrypt:
2035	mov	%rsp, %rax
2036.Lxts_enc_prologue:
2037	push	%rbp
2038	push	%rbx
2039	push	%r12
2040	push	%r13
2041	push	%r14
2042	push	%r15
2043	lea	-0x48(%rsp), %rsp
2044___
2045$code.=<<___ if ($win64);
2046	mov	0xa0(%rsp),$arg5	# pull key2
2047	mov	0xa8(%rsp),$arg6	# pull ivp
2048	lea	-0xa0(%rsp), %rsp
2049	movaps	%xmm6, 0x40(%rsp)
2050	movaps	%xmm7, 0x50(%rsp)
2051	movaps	%xmm8, 0x60(%rsp)
2052	movaps	%xmm9, 0x70(%rsp)
2053	movaps	%xmm10, 0x80(%rsp)
2054	movaps	%xmm11, 0x90(%rsp)
2055	movaps	%xmm12, 0xa0(%rsp)
2056	movaps	%xmm13, 0xb0(%rsp)
2057	movaps	%xmm14, 0xc0(%rsp)
2058	movaps	%xmm15, 0xd0(%rsp)
2059.Lxts_enc_body:
2060___
2061$code.=<<___;
2062	mov	%rsp, %rbp		# backup %rsp
2063	mov	$arg1, $inp		# backup arguments
2064	mov	$arg2, $out
2065	mov	$arg3, $len
2066	mov	$arg4, $key
2067
2068	lea	($arg6), $arg1
2069	lea	0x20(%rbp), $arg2
2070	lea	($arg5), $arg3
2071	call	asm_AES_encrypt		# generate initial tweak
2072
2073	mov	240($key), %eax		# rounds
2074	mov	$len, %rbx		# backup $len
2075
2076	mov	%eax, %edx		# rounds
2077	shl	\$7, %rax		# 128 bytes per inner round key
2078	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2079	sub	%rax, %rsp
2080
2081	mov	%rsp, %rax		# pass key schedule
2082	mov	$key, %rcx		# pass key
2083	mov	%edx, %r10d		# pass rounds
2084	call	_bsaes_key_convert
2085	pxor	%xmm6, %xmm7		# fix up last round key
2086	movdqa	%xmm7, (%rax)		# save last round key
2087
2088	and	\$-16, $len
2089	sub	\$0x80, %rsp		# place for tweak[8]
2090	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2091
2092	pxor	$twtmp, $twtmp
2093	movdqa	.Lxts_magic(%rip), $twmask
2094	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2095
2096	sub	\$0x80, $len
2097	jc	.Lxts_enc_short
2098	jmp	.Lxts_enc_loop
2099
2100.align	16
2101.Lxts_enc_loop:
2102___
2103    for ($i=0;$i<7;$i++) {
2104    $code.=<<___;
2105	pshufd	\$0x13, $twtmp, $twres
2106	pxor	$twtmp, $twtmp
2107	movdqa	@XMM[7], @XMM[$i]
2108	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2109	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2110	pand	$twmask, $twres		# isolate carry and residue
2111	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2112	pxor	$twres, @XMM[7]
2113___
2114    $code.=<<___ if ($i>=1);
2115	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2116___
2117    $code.=<<___ if ($i>=2);
2118	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2119___
2120    }
2121$code.=<<___;
2122	movdqu	0x60($inp), @XMM[8+6]
2123	pxor	@XMM[8+5], @XMM[5]
2124	movdqu	0x70($inp), @XMM[8+7]
2125	lea	0x80($inp), $inp
2126	movdqa	@XMM[7], 0x70(%rsp)
2127	pxor	@XMM[8+6], @XMM[6]
2128	lea	0x80(%rsp), %rax	# pass key schedule
2129	pxor	@XMM[8+7], @XMM[7]
2130	mov	%edx, %r10d		# pass rounds
2131
2132	call	_bsaes_encrypt8
2133
2134	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2135	pxor	0x10(%rsp), @XMM[1]
2136	movdqu	@XMM[0], 0x00($out)	# write output
2137	pxor	0x20(%rsp), @XMM[4]
2138	movdqu	@XMM[1], 0x10($out)
2139	pxor	0x30(%rsp), @XMM[6]
2140	movdqu	@XMM[4], 0x20($out)
2141	pxor	0x40(%rsp), @XMM[3]
2142	movdqu	@XMM[6], 0x30($out)
2143	pxor	0x50(%rsp), @XMM[7]
2144	movdqu	@XMM[3], 0x40($out)
2145	pxor	0x60(%rsp), @XMM[2]
2146	movdqu	@XMM[7], 0x50($out)
2147	pxor	0x70(%rsp), @XMM[5]
2148	movdqu	@XMM[2], 0x60($out)
2149	movdqu	@XMM[5], 0x70($out)
2150	lea	0x80($out), $out
2151
2152	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2153	pxor	$twtmp, $twtmp
2154	movdqa	.Lxts_magic(%rip), $twmask
2155	pcmpgtd	@XMM[7], $twtmp
2156	pshufd	\$0x13, $twtmp, $twres
2157	pxor	$twtmp, $twtmp
2158	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2159	pand	$twmask, $twres		# isolate carry and residue
2160	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2161	pxor	$twres, @XMM[7]
2162
2163	sub	\$0x80,$len
2164	jnc	.Lxts_enc_loop
2165
2166.Lxts_enc_short:
2167	add	\$0x80, $len
2168	jz	.Lxts_enc_done
2169___
2170    for ($i=0;$i<7;$i++) {
2171    $code.=<<___;
2172	pshufd	\$0x13, $twtmp, $twres
2173	pxor	$twtmp, $twtmp
2174	movdqa	@XMM[7], @XMM[$i]
2175	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2176	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2177	pand	$twmask, $twres		# isolate carry and residue
2178	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2179	pxor	$twres, @XMM[7]
2180___
2181    $code.=<<___ if ($i>=1);
2182	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2183	cmp	\$`0x10*$i`,$len
2184	je	.Lxts_enc_$i
2185___
2186    $code.=<<___ if ($i>=2);
2187	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2188___
2189    }
2190$code.=<<___;
2191	movdqu	0x60($inp), @XMM[8+6]
2192	pxor	@XMM[8+5], @XMM[5]
2193	movdqa	@XMM[7], 0x70(%rsp)
2194	lea	0x70($inp), $inp
2195	pxor	@XMM[8+6], @XMM[6]
2196	lea	0x80(%rsp), %rax	# pass key schedule
2197	mov	%edx, %r10d		# pass rounds
2198
2199	call	_bsaes_encrypt8
2200
2201	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2202	pxor	0x10(%rsp), @XMM[1]
2203	movdqu	@XMM[0], 0x00($out)	# write output
2204	pxor	0x20(%rsp), @XMM[4]
2205	movdqu	@XMM[1], 0x10($out)
2206	pxor	0x30(%rsp), @XMM[6]
2207	movdqu	@XMM[4], 0x20($out)
2208	pxor	0x40(%rsp), @XMM[3]
2209	movdqu	@XMM[6], 0x30($out)
2210	pxor	0x50(%rsp), @XMM[7]
2211	movdqu	@XMM[3], 0x40($out)
2212	pxor	0x60(%rsp), @XMM[2]
2213	movdqu	@XMM[7], 0x50($out)
2214	movdqu	@XMM[2], 0x60($out)
2215	lea	0x70($out), $out
2216
2217	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2218	jmp	.Lxts_enc_done
2219.align	16
2220.Lxts_enc_6:
2221	pxor	@XMM[8+4], @XMM[4]
2222	lea	0x60($inp), $inp
2223	pxor	@XMM[8+5], @XMM[5]
2224	lea	0x80(%rsp), %rax	# pass key schedule
2225	mov	%edx, %r10d		# pass rounds
2226
2227	call	_bsaes_encrypt8
2228
2229	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2230	pxor	0x10(%rsp), @XMM[1]
2231	movdqu	@XMM[0], 0x00($out)	# write output
2232	pxor	0x20(%rsp), @XMM[4]
2233	movdqu	@XMM[1], 0x10($out)
2234	pxor	0x30(%rsp), @XMM[6]
2235	movdqu	@XMM[4], 0x20($out)
2236	pxor	0x40(%rsp), @XMM[3]
2237	movdqu	@XMM[6], 0x30($out)
2238	pxor	0x50(%rsp), @XMM[7]
2239	movdqu	@XMM[3], 0x40($out)
2240	movdqu	@XMM[7], 0x50($out)
2241	lea	0x60($out), $out
2242
2243	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2244	jmp	.Lxts_enc_done
2245.align	16
2246.Lxts_enc_5:
2247	pxor	@XMM[8+3], @XMM[3]
2248	lea	0x50($inp), $inp
2249	pxor	@XMM[8+4], @XMM[4]
2250	lea	0x80(%rsp), %rax	# pass key schedule
2251	mov	%edx, %r10d		# pass rounds
2252
2253	call	_bsaes_encrypt8
2254
2255	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2256	pxor	0x10(%rsp), @XMM[1]
2257	movdqu	@XMM[0], 0x00($out)	# write output
2258	pxor	0x20(%rsp), @XMM[4]
2259	movdqu	@XMM[1], 0x10($out)
2260	pxor	0x30(%rsp), @XMM[6]
2261	movdqu	@XMM[4], 0x20($out)
2262	pxor	0x40(%rsp), @XMM[3]
2263	movdqu	@XMM[6], 0x30($out)
2264	movdqu	@XMM[3], 0x40($out)
2265	lea	0x50($out), $out
2266
2267	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2268	jmp	.Lxts_enc_done
2269.align	16
2270.Lxts_enc_4:
2271	pxor	@XMM[8+2], @XMM[2]
2272	lea	0x40($inp), $inp
2273	pxor	@XMM[8+3], @XMM[3]
2274	lea	0x80(%rsp), %rax	# pass key schedule
2275	mov	%edx, %r10d		# pass rounds
2276
2277	call	_bsaes_encrypt8
2278
2279	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2280	pxor	0x10(%rsp), @XMM[1]
2281	movdqu	@XMM[0], 0x00($out)	# write output
2282	pxor	0x20(%rsp), @XMM[4]
2283	movdqu	@XMM[1], 0x10($out)
2284	pxor	0x30(%rsp), @XMM[6]
2285	movdqu	@XMM[4], 0x20($out)
2286	movdqu	@XMM[6], 0x30($out)
2287	lea	0x40($out), $out
2288
2289	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2290	jmp	.Lxts_enc_done
2291.align	16
2292.Lxts_enc_3:
2293	pxor	@XMM[8+1], @XMM[1]
2294	lea	0x30($inp), $inp
2295	pxor	@XMM[8+2], @XMM[2]
2296	lea	0x80(%rsp), %rax	# pass key schedule
2297	mov	%edx, %r10d		# pass rounds
2298
2299	call	_bsaes_encrypt8
2300
2301	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2302	pxor	0x10(%rsp), @XMM[1]
2303	movdqu	@XMM[0], 0x00($out)	# write output
2304	pxor	0x20(%rsp), @XMM[4]
2305	movdqu	@XMM[1], 0x10($out)
2306	movdqu	@XMM[4], 0x20($out)
2307	lea	0x30($out), $out
2308
2309	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2310	jmp	.Lxts_enc_done
2311.align	16
2312.Lxts_enc_2:
2313	pxor	@XMM[8+0], @XMM[0]
2314	lea	0x20($inp), $inp
2315	pxor	@XMM[8+1], @XMM[1]
2316	lea	0x80(%rsp), %rax	# pass key schedule
2317	mov	%edx, %r10d		# pass rounds
2318
2319	call	_bsaes_encrypt8
2320
2321	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2322	pxor	0x10(%rsp), @XMM[1]
2323	movdqu	@XMM[0], 0x00($out)	# write output
2324	movdqu	@XMM[1], 0x10($out)
2325	lea	0x20($out), $out
2326
2327	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2328	jmp	.Lxts_enc_done
2329.align	16
2330.Lxts_enc_1:
2331	pxor	@XMM[0], @XMM[8]
2332	lea	0x10($inp), $inp
2333	movdqa	@XMM[8], 0x20(%rbp)
2334	lea	0x20(%rbp), $arg1
2335	lea	0x20(%rbp), $arg2
2336	lea	($key), $arg3
2337	call	asm_AES_encrypt		# doesn't touch %xmm
2338	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2339	#pxor	@XMM[8], @XMM[0]
2340	#lea	0x80(%rsp), %rax	# pass key schedule
2341	#mov	%edx, %r10d		# pass rounds
2342	#call	_bsaes_encrypt8
2343	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2344	movdqu	@XMM[0], 0x00($out)	# write output
2345	lea	0x10($out), $out
2346
2347	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2348
2349.Lxts_enc_done:
2350	and	\$15, %ebx
2351	jz	.Lxts_enc_ret
2352	mov	$out, %rdx
2353
2354.Lxts_enc_steal:
2355	movzb	($inp), %eax
2356	movzb	-16(%rdx), %ecx
2357	lea	1($inp), $inp
2358	mov	%al, -16(%rdx)
2359	mov	%cl, 0(%rdx)
2360	lea	1(%rdx), %rdx
2361	sub	\$1,%ebx
2362	jnz	.Lxts_enc_steal
2363
2364	movdqu	-16($out), @XMM[0]
2365	lea	0x20(%rbp), $arg1
2366	pxor	@XMM[7], @XMM[0]
2367	lea	0x20(%rbp), $arg2
2368	movdqa	@XMM[0], 0x20(%rbp)
2369	lea	($key), $arg3
2370	call	asm_AES_encrypt		# doesn't touch %xmm
2371	pxor	0x20(%rbp), @XMM[7]
2372	movdqu	@XMM[7], -16($out)
2373
2374.Lxts_enc_ret:
2375	lea	(%rsp), %rax
2376	pxor	%xmm0, %xmm0
2377.Lxts_enc_bzero:			# wipe key schedule [if any]
2378	movdqa	%xmm0, 0x00(%rax)
2379	movdqa	%xmm0, 0x10(%rax)
2380	lea	0x20(%rax), %rax
2381	cmp	%rax, %rbp
2382	ja	.Lxts_enc_bzero
2383
2384	lea	(%rbp),%rsp		# restore %rsp
2385___
2386$code.=<<___ if ($win64);
2387	movaps	0x40(%rbp), %xmm6
2388	movaps	0x50(%rbp), %xmm7
2389	movaps	0x60(%rbp), %xmm8
2390	movaps	0x70(%rbp), %xmm9
2391	movaps	0x80(%rbp), %xmm10
2392	movaps	0x90(%rbp), %xmm11
2393	movaps	0xa0(%rbp), %xmm12
2394	movaps	0xb0(%rbp), %xmm13
2395	movaps	0xc0(%rbp), %xmm14
2396	movaps	0xd0(%rbp), %xmm15
2397	lea	0xa0(%rbp), %rsp
2398___
2399$code.=<<___;
2400	mov	0x48(%rsp), %r15
2401	mov	0x50(%rsp), %r14
2402	mov	0x58(%rsp), %r13
2403	mov	0x60(%rsp), %r12
2404	mov	0x68(%rsp), %rbx
2405	mov	0x70(%rsp), %rax
2406	lea	0x78(%rsp), %rsp
2407	mov	%rax, %rbp
2408.Lxts_enc_epilogue:
2409	ret
2410.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
2411
2412.globl	bsaes_xts_decrypt
2413.type	bsaes_xts_decrypt,\@abi-omnipotent
2414.align	16
2415bsaes_xts_decrypt:
2416	mov	%rsp, %rax
2417.Lxts_dec_prologue:
2418	push	%rbp
2419	push	%rbx
2420	push	%r12
2421	push	%r13
2422	push	%r14
2423	push	%r15
2424	lea	-0x48(%rsp), %rsp
2425___
2426$code.=<<___ if ($win64);
2427	mov	0xa0(%rsp),$arg5	# pull key2
2428	mov	0xa8(%rsp),$arg6	# pull ivp
2429	lea	-0xa0(%rsp), %rsp
2430	movaps	%xmm6, 0x40(%rsp)
2431	movaps	%xmm7, 0x50(%rsp)
2432	movaps	%xmm8, 0x60(%rsp)
2433	movaps	%xmm9, 0x70(%rsp)
2434	movaps	%xmm10, 0x80(%rsp)
2435	movaps	%xmm11, 0x90(%rsp)
2436	movaps	%xmm12, 0xa0(%rsp)
2437	movaps	%xmm13, 0xb0(%rsp)
2438	movaps	%xmm14, 0xc0(%rsp)
2439	movaps	%xmm15, 0xd0(%rsp)
2440.Lxts_dec_body:
2441___
2442$code.=<<___;
2443	mov	%rsp, %rbp		# backup %rsp
2444	mov	$arg1, $inp		# backup arguments
2445	mov	$arg2, $out
2446	mov	$arg3, $len
2447	mov	$arg4, $key
2448
2449	lea	($arg6), $arg1
2450	lea	0x20(%rbp), $arg2
2451	lea	($arg5), $arg3
2452	call	asm_AES_encrypt		# generate initial tweak
2453
2454	mov	240($key), %eax		# rounds
2455	mov	$len, %rbx		# backup $len
2456
2457	mov	%eax, %edx		# rounds
2458	shl	\$7, %rax		# 128 bytes per inner round key
2459	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2460	sub	%rax, %rsp
2461
2462	mov	%rsp, %rax		# pass key schedule
2463	mov	$key, %rcx		# pass key
2464	mov	%edx, %r10d		# pass rounds
2465	call	_bsaes_key_convert
2466	pxor	(%rsp), %xmm7		# fix up round 0 key
2467	movdqa	%xmm6, (%rax)		# save last round key
2468	movdqa	%xmm7, (%rsp)
2469
2470	xor	%eax, %eax		# if ($len%16) len-=16;
2471	and	\$-16, $len
2472	test	\$15, %ebx
2473	setnz	%al
2474	shl	\$4, %rax
2475	sub	%rax, $len
2476
2477	sub	\$0x80, %rsp		# place for tweak[8]
2478	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2479
2480	pxor	$twtmp, $twtmp
2481	movdqa	.Lxts_magic(%rip), $twmask
2482	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2483
2484	sub	\$0x80, $len
2485	jc	.Lxts_dec_short
2486	jmp	.Lxts_dec_loop
2487
2488.align	16
2489.Lxts_dec_loop:
2490___
2491    for ($i=0;$i<7;$i++) {
2492    $code.=<<___;
2493	pshufd	\$0x13, $twtmp, $twres
2494	pxor	$twtmp, $twtmp
2495	movdqa	@XMM[7], @XMM[$i]
2496	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2497	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2498	pand	$twmask, $twres		# isolate carry and residue
2499	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2500	pxor	$twres, @XMM[7]
2501___
2502    $code.=<<___ if ($i>=1);
2503	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2504___
2505    $code.=<<___ if ($i>=2);
2506	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2507___
2508    }
2509$code.=<<___;
2510	movdqu	0x60($inp), @XMM[8+6]
2511	pxor	@XMM[8+5], @XMM[5]
2512	movdqu	0x70($inp), @XMM[8+7]
2513	lea	0x80($inp), $inp
2514	movdqa	@XMM[7], 0x70(%rsp)
2515	pxor	@XMM[8+6], @XMM[6]
2516	lea	0x80(%rsp), %rax	# pass key schedule
2517	pxor	@XMM[8+7], @XMM[7]
2518	mov	%edx, %r10d		# pass rounds
2519
2520	call	_bsaes_decrypt8
2521
2522	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2523	pxor	0x10(%rsp), @XMM[1]
2524	movdqu	@XMM[0], 0x00($out)	# write output
2525	pxor	0x20(%rsp), @XMM[6]
2526	movdqu	@XMM[1], 0x10($out)
2527	pxor	0x30(%rsp), @XMM[4]
2528	movdqu	@XMM[6], 0x20($out)
2529	pxor	0x40(%rsp), @XMM[2]
2530	movdqu	@XMM[4], 0x30($out)
2531	pxor	0x50(%rsp), @XMM[7]
2532	movdqu	@XMM[2], 0x40($out)
2533	pxor	0x60(%rsp), @XMM[3]
2534	movdqu	@XMM[7], 0x50($out)
2535	pxor	0x70(%rsp), @XMM[5]
2536	movdqu	@XMM[3], 0x60($out)
2537	movdqu	@XMM[5], 0x70($out)
2538	lea	0x80($out), $out
2539
2540	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2541	pxor	$twtmp, $twtmp
2542	movdqa	.Lxts_magic(%rip), $twmask
2543	pcmpgtd	@XMM[7], $twtmp
2544	pshufd	\$0x13, $twtmp, $twres
2545	pxor	$twtmp, $twtmp
2546	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2547	pand	$twmask, $twres		# isolate carry and residue
2548	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2549	pxor	$twres, @XMM[7]
2550
2551	sub	\$0x80,$len
2552	jnc	.Lxts_dec_loop
2553
2554.Lxts_dec_short:
2555	add	\$0x80, $len
2556	jz	.Lxts_dec_done
2557___
2558    for ($i=0;$i<7;$i++) {
2559    $code.=<<___;
2560	pshufd	\$0x13, $twtmp, $twres
2561	pxor	$twtmp, $twtmp
2562	movdqa	@XMM[7], @XMM[$i]
2563	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2564	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2565	pand	$twmask, $twres		# isolate carry and residue
2566	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2567	pxor	$twres, @XMM[7]
2568___
2569    $code.=<<___ if ($i>=1);
2570	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2571	cmp	\$`0x10*$i`,$len
2572	je	.Lxts_dec_$i
2573___
2574    $code.=<<___ if ($i>=2);
2575	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2576___
2577    }
2578$code.=<<___;
2579	movdqu	0x60($inp), @XMM[8+6]
2580	pxor	@XMM[8+5], @XMM[5]
2581	movdqa	@XMM[7], 0x70(%rsp)
2582	lea	0x70($inp), $inp
2583	pxor	@XMM[8+6], @XMM[6]
2584	lea	0x80(%rsp), %rax	# pass key schedule
2585	mov	%edx, %r10d		# pass rounds
2586
2587	call	_bsaes_decrypt8
2588
2589	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2590	pxor	0x10(%rsp), @XMM[1]
2591	movdqu	@XMM[0], 0x00($out)	# write output
2592	pxor	0x20(%rsp), @XMM[6]
2593	movdqu	@XMM[1], 0x10($out)
2594	pxor	0x30(%rsp), @XMM[4]
2595	movdqu	@XMM[6], 0x20($out)
2596	pxor	0x40(%rsp), @XMM[2]
2597	movdqu	@XMM[4], 0x30($out)
2598	pxor	0x50(%rsp), @XMM[7]
2599	movdqu	@XMM[2], 0x40($out)
2600	pxor	0x60(%rsp), @XMM[3]
2601	movdqu	@XMM[7], 0x50($out)
2602	movdqu	@XMM[3], 0x60($out)
2603	lea	0x70($out), $out
2604
2605	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
2606	jmp	.Lxts_dec_done
2607.align	16
2608.Lxts_dec_6:
2609	pxor	@XMM[8+4], @XMM[4]
2610	lea	0x60($inp), $inp
2611	pxor	@XMM[8+5], @XMM[5]
2612	lea	0x80(%rsp), %rax	# pass key schedule
2613	mov	%edx, %r10d		# pass rounds
2614
2615	call	_bsaes_decrypt8
2616
2617	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2618	pxor	0x10(%rsp), @XMM[1]
2619	movdqu	@XMM[0], 0x00($out)	# write output
2620	pxor	0x20(%rsp), @XMM[6]
2621	movdqu	@XMM[1], 0x10($out)
2622	pxor	0x30(%rsp), @XMM[4]
2623	movdqu	@XMM[6], 0x20($out)
2624	pxor	0x40(%rsp), @XMM[2]
2625	movdqu	@XMM[4], 0x30($out)
2626	pxor	0x50(%rsp), @XMM[7]
2627	movdqu	@XMM[2], 0x40($out)
2628	movdqu	@XMM[7], 0x50($out)
2629	lea	0x60($out), $out
2630
2631	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
2632	jmp	.Lxts_dec_done
2633.align	16
2634.Lxts_dec_5:
2635	pxor	@XMM[8+3], @XMM[3]
2636	lea	0x50($inp), $inp
2637	pxor	@XMM[8+4], @XMM[4]
2638	lea	0x80(%rsp), %rax	# pass key schedule
2639	mov	%edx, %r10d		# pass rounds
2640
2641	call	_bsaes_decrypt8
2642
2643	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2644	pxor	0x10(%rsp), @XMM[1]
2645	movdqu	@XMM[0], 0x00($out)	# write output
2646	pxor	0x20(%rsp), @XMM[6]
2647	movdqu	@XMM[1], 0x10($out)
2648	pxor	0x30(%rsp), @XMM[4]
2649	movdqu	@XMM[6], 0x20($out)
2650	pxor	0x40(%rsp), @XMM[2]
2651	movdqu	@XMM[4], 0x30($out)
2652	movdqu	@XMM[2], 0x40($out)
2653	lea	0x50($out), $out
2654
2655	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
2656	jmp	.Lxts_dec_done
2657.align	16
2658.Lxts_dec_4:
2659	pxor	@XMM[8+2], @XMM[2]
2660	lea	0x40($inp), $inp
2661	pxor	@XMM[8+3], @XMM[3]
2662	lea	0x80(%rsp), %rax	# pass key schedule
2663	mov	%edx, %r10d		# pass rounds
2664
2665	call	_bsaes_decrypt8
2666
2667	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2668	pxor	0x10(%rsp), @XMM[1]
2669	movdqu	@XMM[0], 0x00($out)	# write output
2670	pxor	0x20(%rsp), @XMM[6]
2671	movdqu	@XMM[1], 0x10($out)
2672	pxor	0x30(%rsp), @XMM[4]
2673	movdqu	@XMM[6], 0x20($out)
2674	movdqu	@XMM[4], 0x30($out)
2675	lea	0x40($out), $out
2676
2677	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
2678	jmp	.Lxts_dec_done
2679.align	16
2680.Lxts_dec_3:
2681	pxor	@XMM[8+1], @XMM[1]
2682	lea	0x30($inp), $inp
2683	pxor	@XMM[8+2], @XMM[2]
2684	lea	0x80(%rsp), %rax	# pass key schedule
2685	mov	%edx, %r10d		# pass rounds
2686
2687	call	_bsaes_decrypt8
2688
2689	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2690	pxor	0x10(%rsp), @XMM[1]
2691	movdqu	@XMM[0], 0x00($out)	# write output
2692	pxor	0x20(%rsp), @XMM[6]
2693	movdqu	@XMM[1], 0x10($out)
2694	movdqu	@XMM[6], 0x20($out)
2695	lea	0x30($out), $out
2696
2697	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
2698	jmp	.Lxts_dec_done
2699.align	16
2700.Lxts_dec_2:
2701	pxor	@XMM[8+0], @XMM[0]
2702	lea	0x20($inp), $inp
2703	pxor	@XMM[8+1], @XMM[1]
2704	lea	0x80(%rsp), %rax	# pass key schedule
2705	mov	%edx, %r10d		# pass rounds
2706
2707	call	_bsaes_decrypt8
2708
2709	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2710	pxor	0x10(%rsp), @XMM[1]
2711	movdqu	@XMM[0], 0x00($out)	# write output
2712	movdqu	@XMM[1], 0x10($out)
2713	lea	0x20($out), $out
2714
2715	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
2716	jmp	.Lxts_dec_done
2717.align	16
2718.Lxts_dec_1:
2719	pxor	@XMM[0], @XMM[8]
2720	lea	0x10($inp), $inp
2721	movdqa	@XMM[8], 0x20(%rbp)
2722	lea	0x20(%rbp), $arg1
2723	lea	0x20(%rbp), $arg2
2724	lea	($key), $arg3
2725	call	asm_AES_decrypt		# doesn't touch %xmm
2726	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2727	#pxor	@XMM[8], @XMM[0]
2728	#lea	0x80(%rsp), %rax	# pass key schedule
2729	#mov	%edx, %r10d		# pass rounds
2730	#call	_bsaes_decrypt8
2731	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2732	movdqu	@XMM[0], 0x00($out)	# write output
2733	lea	0x10($out), $out
2734
2735	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
2736
2737.Lxts_dec_done:
2738	and	\$15, %ebx
2739	jz	.Lxts_dec_ret
2740
2741	pxor	$twtmp, $twtmp
2742	movdqa	.Lxts_magic(%rip), $twmask
2743	pcmpgtd	@XMM[7], $twtmp
2744	pshufd	\$0x13, $twtmp, $twres
2745	movdqa	@XMM[7], @XMM[6]
2746	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
2747	pand	$twmask, $twres		# isolate carry and residue
2748	movdqu	($inp), @XMM[0]
2749	pxor	$twres, @XMM[7]
2750
2751	lea	0x20(%rbp), $arg1
2752	pxor	@XMM[7], @XMM[0]
2753	lea	0x20(%rbp), $arg2
2754	movdqa	@XMM[0], 0x20(%rbp)
2755	lea	($key), $arg3
2756	call	asm_AES_decrypt		# doesn't touch %xmm
2757	pxor	0x20(%rbp), @XMM[7]
2758	mov	$out, %rdx
2759	movdqu	@XMM[7], ($out)
2760
2761.Lxts_dec_steal:
2762	movzb	16($inp), %eax
2763	movzb	(%rdx), %ecx
2764	lea	1($inp), $inp
2765	mov	%al, (%rdx)
2766	mov	%cl, 16(%rdx)
2767	lea	1(%rdx), %rdx
2768	sub	\$1,%ebx
2769	jnz	.Lxts_dec_steal
2770
2771	movdqu	($out), @XMM[0]
2772	lea	0x20(%rbp), $arg1
2773	pxor	@XMM[6], @XMM[0]
2774	lea	0x20(%rbp), $arg2
2775	movdqa	@XMM[0], 0x20(%rbp)
2776	lea	($key), $arg3
2777	call	asm_AES_decrypt		# doesn't touch %xmm
2778	pxor	0x20(%rbp), @XMM[6]
2779	movdqu	@XMM[6], ($out)
2780
2781.Lxts_dec_ret:
2782	lea	(%rsp), %rax
2783	pxor	%xmm0, %xmm0
2784.Lxts_dec_bzero:			# wipe key schedule [if any]
2785	movdqa	%xmm0, 0x00(%rax)
2786	movdqa	%xmm0, 0x10(%rax)
2787	lea	0x20(%rax), %rax
2788	cmp	%rax, %rbp
2789	ja	.Lxts_dec_bzero
2790
2791	lea	(%rbp),%rsp		# restore %rsp
2792___
2793$code.=<<___ if ($win64);
2794	movaps	0x40(%rbp), %xmm6
2795	movaps	0x50(%rbp), %xmm7
2796	movaps	0x60(%rbp), %xmm8
2797	movaps	0x70(%rbp), %xmm9
2798	movaps	0x80(%rbp), %xmm10
2799	movaps	0x90(%rbp), %xmm11
2800	movaps	0xa0(%rbp), %xmm12
2801	movaps	0xb0(%rbp), %xmm13
2802	movaps	0xc0(%rbp), %xmm14
2803	movaps	0xd0(%rbp), %xmm15
2804	lea	0xa0(%rbp), %rsp
2805___
2806$code.=<<___;
2807	mov	0x48(%rsp), %r15
2808	mov	0x50(%rsp), %r14
2809	mov	0x58(%rsp), %r13
2810	mov	0x60(%rsp), %r12
2811	mov	0x68(%rsp), %rbx
2812	mov	0x70(%rsp), %rax
2813	lea	0x78(%rsp), %rsp
2814	mov	%rax, %rbp
2815.Lxts_dec_epilogue:
2816	ret
2817.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
2818___
2819}
2820$code.=<<___;
2821.type	_bsaes_const,\@object
2822.align	64
2823_bsaes_const:
2824.LM0ISR:	# InvShiftRows constants
2825	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
2826.LISRM0:
2827	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
2828.LISR:
2829	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
2830.LBS0:		# bit-slice constants
2831	.quad	0x5555555555555555, 0x5555555555555555
2832.LBS1:
2833	.quad	0x3333333333333333, 0x3333333333333333
2834.LBS2:
2835	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2836.LSR:		# shiftrows constants
2837	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
2838.LSRM0:
2839	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
2840.LM0SR:
2841	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
2842.LSWPUP:	# byte-swap upper dword
2843	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
2844.LSWPUPM0SR:
2845	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
2846.LADD1:		# counter increment constants
2847	.quad	0x0000000000000000, 0x0000000100000000
2848.LADD2:
2849	.quad	0x0000000000000000, 0x0000000200000000
2850.LADD3:
2851	.quad	0x0000000000000000, 0x0000000300000000
2852.LADD4:
2853	.quad	0x0000000000000000, 0x0000000400000000
2854.LADD5:
2855	.quad	0x0000000000000000, 0x0000000500000000
2856.LADD6:
2857	.quad	0x0000000000000000, 0x0000000600000000
2858.LADD7:
2859	.quad	0x0000000000000000, 0x0000000700000000
2860.LADD8:
2861	.quad	0x0000000000000000, 0x0000000800000000
2862.Lxts_magic:
2863	.long	0x87,0,1,0
2864.Lmasks:
2865	.quad	0x0101010101010101, 0x0101010101010101
2866	.quad	0x0202020202020202, 0x0202020202020202
2867	.quad	0x0404040404040404, 0x0404040404040404
2868	.quad	0x0808080808080808, 0x0808080808080808
2869.LM0:
2870	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
2871.L63:
2872	.quad	0x6363636363636363, 0x6363636363636363
2873.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2874.align	64
2875.size	_bsaes_const,.-_bsaes_const
2876___
2877
2878# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2879#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2880if ($win64) {
2881$rec="%rcx";
2882$frame="%rdx";
2883$context="%r8";
2884$disp="%r9";
2885
2886$code.=<<___;
2887.extern	__imp_RtlVirtualUnwind
2888.type	se_handler,\@abi-omnipotent
2889.align	16
2890se_handler:
2891	push	%rsi
2892	push	%rdi
2893	push	%rbx
2894	push	%rbp
2895	push	%r12
2896	push	%r13
2897	push	%r14
2898	push	%r15
2899	pushfq
2900	sub	\$64,%rsp
2901
2902	mov	120($context),%rax	# pull context->Rax
2903	mov	248($context),%rbx	# pull context->Rip
2904
2905	mov	8($disp),%rsi		# disp->ImageBase
2906	mov	56($disp),%r11		# disp->HandlerData
2907
2908	mov	0(%r11),%r10d		# HandlerData[0]
2909	lea	(%rsi,%r10),%r10	# prologue label
2910	cmp	%r10,%rbx		# context->Rip<prologue label
2911	jb	.Lin_prologue
2912
2913	mov	152($context),%rax	# pull context->Rsp
2914
2915	mov	4(%r11),%r10d		# HandlerData[1]
2916	lea	(%rsi,%r10),%r10	# epilogue label
2917	cmp	%r10,%rbx		# context->Rip>=epilogue label
2918	jae	.Lin_prologue
2919
2920	mov	160($context),%rax	# pull context->Rbp
2921
2922	lea	0x40(%rax),%rsi		# %xmm save area
2923	lea	512($context),%rdi	# &context.Xmm6
2924	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
2925	.long	0xa548f3fc		# cld; rep movsq
2926	lea	0xa0(%rax),%rax		# adjust stack pointer
2927
2928	mov	0x70(%rax),%rbp
2929	mov	0x68(%rax),%rbx
2930	mov	0x60(%rax),%r12
2931	mov	0x58(%rax),%r13
2932	mov	0x50(%rax),%r14
2933	mov	0x48(%rax),%r15
2934	lea	0x78(%rax),%rax		# adjust stack pointer
2935	mov	%rbx,144($context)	# restore context->Rbx
2936	mov	%rbp,160($context)	# restore context->Rbp
2937	mov	%r12,216($context)	# restore context->R12
2938	mov	%r13,224($context)	# restore context->R13
2939	mov	%r14,232($context)	# restore context->R14
2940	mov	%r15,240($context)	# restore context->R15
2941
2942.Lin_prologue:
2943	mov	%rax,152($context)	# restore context->Rsp
2944
2945	mov	40($disp),%rdi		# disp->ContextRecord
2946	mov	$context,%rsi		# context
2947	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
2948	.long	0xa548f3fc		# cld; rep movsq
2949
2950	mov	$disp,%rsi
2951	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2952	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2953	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2954	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2955	mov	40(%rsi),%r10		# disp->ContextRecord
2956	lea	56(%rsi),%r11		# &disp->HandlerData
2957	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2958	mov	%r10,32(%rsp)		# arg5
2959	mov	%r11,40(%rsp)		# arg6
2960	mov	%r12,48(%rsp)		# arg7
2961	mov	%rcx,56(%rsp)		# arg8, (NULL)
2962	call	*__imp_RtlVirtualUnwind(%rip)
2963
2964	mov	\$1,%eax		# ExceptionContinueSearch
2965	add	\$64,%rsp
2966	popfq
2967	pop	%r15
2968	pop	%r14
2969	pop	%r13
2970	pop	%r12
2971	pop	%rbp
2972	pop	%rbx
2973	pop	%rdi
2974	pop	%rsi
2975	ret
2976.size	se_handler,.-se_handler
2977
2978.section	.pdata
2979.align	4
2980___
2981$code.=<<___ if ($ecb);
2982	.rva	.Lecb_enc_prologue
2983	.rva	.Lecb_enc_epilogue
2984	.rva	.Lecb_enc_info
2985
2986	.rva	.Lecb_dec_prologue
2987	.rva	.Lecb_dec_epilogue
2988	.rva	.Lecb_dec_info
2989___
2990$code.=<<___;
2991	.rva	.Lcbc_dec_prologue
2992	.rva	.Lcbc_dec_epilogue
2993	.rva	.Lcbc_dec_info
2994
2995	.rva	.Lctr_enc_prologue
2996	.rva	.Lctr_enc_epilogue
2997	.rva	.Lctr_enc_info
2998
2999	.rva	.Lxts_enc_prologue
3000	.rva	.Lxts_enc_epilogue
3001	.rva	.Lxts_enc_info
3002
3003	.rva	.Lxts_dec_prologue
3004	.rva	.Lxts_dec_epilogue
3005	.rva	.Lxts_dec_info
3006
3007.section	.xdata
3008.align	8
3009___
3010$code.=<<___ if ($ecb);
3011.Lecb_enc_info:
3012	.byte	9,0,0,0
3013	.rva	se_handler
3014	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
3015.Lecb_dec_info:
3016	.byte	9,0,0,0
3017	.rva	se_handler
3018	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
3019___
3020$code.=<<___;
3021.Lcbc_dec_info:
3022	.byte	9,0,0,0
3023	.rva	se_handler
3024	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
3025.Lctr_enc_info:
3026	.byte	9,0,0,0
3027	.rva	se_handler
3028	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
3029.Lxts_enc_info:
3030	.byte	9,0,0,0
3031	.rva	se_handler
3032	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
3033.Lxts_dec_info:
3034	.byte	9,0,0,0
3035	.rva	se_handler
3036	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
3037___
3038}
3039
3040$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3041
3042print $code;
3043
3044close STDOUT;
3045