• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for Intel AES-NI extension. In
11# OpenSSL context it's used with Intel engine, but can also be used as
12# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
13# details].
14#
15# Performance.
16#
17# To start with see corresponding paragraph in aesni-x86_64.pl...
18# Instead of filling table similar to one found there I've chosen to
19# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
20# The simplified table below represents 32-bit performance relative
21# to 64-bit one in every given point. Ratios vary for different
22# encryption modes, therefore interval values.
23#
24#	16-byte     64-byte     256-byte    1-KB        8-KB
25#	53-67%      67-84%      91-94%      95-98%      97-99.5%
26#
27# Lower ratios for smaller block sizes are perfectly understandable,
28# because function call overhead is higher in 32-bit mode. Largest
29# 8-KB block performance is virtually same: 32-bit code is less than
30# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
31
32# January 2011
33#
34# See aesni-x86_64.pl for details. Unlike x86_64 version this module
35# interleaves at most 6 aes[enc|dec] instructions, because there are
36# not enough registers for 8x interleave [which should be optimal for
37# Sandy Bridge]. Actually, performance results for 6x interleave
38# factor presented in aesni-x86_64.pl (except for CTR) are for this
39# module.
40
41# April 2011
42#
43# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
44# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
45
46######################################################################
47# Current large-block performance in cycles per byte processed with
48# 128-bit key (less is better).
49#
50#		CBC en-/decrypt	CTR	XTS	ECB
51# Westmere	3.77/1.37	1.37	1.52	1.27
52# * Bridge	5.07/0.98	0.99	1.09	0.91
53# Haswell	4.44/0.80	0.97	1.03	0.72
54# Skylake	2.68/0.65	0.65	0.66	0.64
55# Silvermont	5.77/3.56	3.67	4.03	3.46
56# Goldmont	3.84/1.39	1.39	1.63	1.31
57# Bulldozer	5.80/0.98	1.05	1.24	0.93
58
$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$inline=1;		# inline _aesni_[en|de]crypt

# Make x86asm.pl loadable either from the directory this script lives
# in or from the perlasm directory three levels above it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file. STDOUT is
# aliased to it, so everything the perlasm layer prints ends up there.
# Fail loudly instead of silently generating nothing: a three-argument
# open with an undefined name would create an anonymous temporary file,
# hence the explicit check.
$output = pop;
die "$0: no output file specified\n" unless (defined($output));
open OUT,'>',$output or die "$0: can't open $output: $!\n";
*STDOUT=*OUT;

&asm_init($ARGV[0]);

&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

# Instruction used to fetch round keys. Both branches currently select
# movups; the conditional is kept so the two flavours can diverge.
if ($PREFIX eq "aesni")	{ $movekey=\&movups; }
else			{ $movekey=\&movups; }

# General-purpose register assignment shared by all routines.
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx";	# backup copy for $rounds
$key_="ebp";	# backup copy for $key

# XMM register assignment. $inout3..$inout5 share registers with
# $in1/$in0/$ivec, so a routine can use one name of each pair at a time.
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5";	$in1="xmm5";
$inout4="xmm6";	$in0="xmm6";
$inout5="xmm7";	$ivec="xmm7";
96
97# AESNI extension
# aeskeygenassist($dst,$src,$imm) - emit AESKEYGENASSIST as raw bytes
# (66 0F 3A DF, ModR/M, imm8) so that assemblers without AES-NI support
# can still build the module.
#
# $dst and $src must both be one of xmm0..xmm7: only the
# register-to-register form is encoded. Anything else is a generator
# bug and is reported with die() rather than silently emitting nothing.
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /\Axmm([0-7]):xmm([0-7])\z/)
    {	# ModR/M: mod=11b, reg=destination, r/m=source
	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);
    }
    else
    {	die "aeskeygenassist: unsupported operands '$dst','$src'\n";	}
}
# aescommon($opcodelet,$dst,$src) - emit one two-operand AES-NI
# instruction as raw bytes (66 0F 38 <opcodelet>, ModR/M).
#
# $opcodelet is the third opcode byte selecting the instruction; $dst
# and $src must both be one of xmm0..xmm7, as only the
# register-to-register form is encoded. Anything else is a generator
# bug and is reported with die() rather than silently dropping the
# instruction from the output.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /\Axmm([0-7]):xmm([0-7])\z/)
    {	# ModR/M: mod=11b, reg=destination, r/m=source
	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);
    }
    else
    {	die sprintf("aescommon(0x%02x): unsupported operands '%s','%s'\n",
		    $opcodelet,$dst,$src);
    }
}
# One wrapper per AES-NI mnemonic. Each forwards its operands to
# aescommon() together with the opcode byte that selects the
# instruction, and returns whatever aescommon() returns.
sub aesimc
{ my @operands=@_;
    return aescommon(0xdb,@operands);
}
sub aesenc
{ my @operands=@_;
    return aescommon(0xdc,@operands);
}
sub aesenclast
{ my @operands=@_;
    return aescommon(0xdd,@operands);
}
sub aesdec
{ my @operands=@_;
    return aescommon(0xde,@operands);
}
sub aesdeclast
{ my @operands=@_;
    return aescommon(0xdf,@operands);
}
113
# Inline version of internal aesni_[en|de]crypt1
#
# aesni_inline_generate1($p[,$inout[,$ivec]]) emits, at the current
# output position, a rolled loop that runs one block through the
# cipher. $p is "enc" or "dec" and selects aesenc/aesdec; $inout is the
# XMM register holding the block (defaults to $inout0). When $ivec is
# given, round key 0 is folded into $ivec first and $inout is xor-ed
# with the result instead of with the bare round key.
#
# The emitted code advances $key and counts $rounds down to zero, so
# callers must reload both afterwards; $rndkey0 and $rndkey1 are
# clobbered. The instruction subs are looked up by name and called
# directly, so a failure inside them aborts generation instead of
# vanishing inside a string eval.
{ my $sn;	# serial number, keeps the loop label unique per expansion
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  my $round=\&{"aes${p}"};		# aesenc or aesdec
  my $final=\&{"aes${p}last"};		# aesenclast or aesdeclast
  $sn++;

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    &xorps		($ivec,$rndkey0)	if (defined($ivec));
    &lea		($key,&DWP(32,$key));
    &xorps		($inout,$ivec)		if (defined($ivec));
    &xorps		($inout,$rndkey0)	if (!defined($ivec));
    &set_label("${p}1_loop_$sn");
	&$round		($inout,$rndkey1);
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));	# key for the next round
	&lea		($key,&DWP(16,$key));		# lea leaves dec's flags alone
    &jnz		(&label("${p}1_loop_$sn"));
    &$final		($inout,$rndkey1);
}}
134
# aesni_generate1($p[,$inout]) emits the private subroutine
# _aesni_${p}rypt1, a fully unrolled single-block routine. $p is "enc"
# or "dec"; $inout is the XMM register holding the block (defaults to
# $inout0).
#
# The emitted code compares $rounds with 11 and enters the unrolled
# sequence at one of three points: straight through, at label
# "${p}192" or at label "${p}128", skipping four or two leading rounds
# respectively. $key is advanced so that all three entry points address
# the remaining round keys with the same displacements. $key, $rndkey0
# and $rndkey1 are clobbered.
#
# The instruction subs are called through code references rather than
# string eval, so an error inside them aborts generation.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
  my $round=\&{"aes${p}"};		# aesenc or aesdec
  my $final=\&{"aes${p}last"};		# aesenclast or aesdeclast
  my @keyreg=($rndkey1,$rndkey0);	# round keys alternate, $rndkey1 first

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));	# lea keeps cmp's flags for je
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));

	# Twelve "apply round, then refill the register just used" pairs.
	# Slot $i refills from displacement 16*$i, i.e. -0x40 up to 0x70.
	# Perl's % yields 0 or 1 for negative $i as well, so the register
	# choice keeps alternating across the sign change.
	foreach my $i (-4..7)
	{   &set_label("${p}192")	if ($i==-2);
	    &set_label("${p}128")	if ($i==0);
	    my $cur=$keyreg[$i%2];
	    &$round	($inout,$cur);
	    &$movekey	($cur,&QWP(0x10*$i,$key));
	}
	&$round		($inout,$rndkey1);
    &$final		($inout,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}
180
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block encryption entry point. The block is loaded from inp,
# the value at offset 240 of the key structure goes to $rounds, and the
# result is stored to out. Both round-key registers and the data
# register are zeroed before returning. When $inline is off, the
# out-of-line _aesni_encrypt1 helper is generated first and called.
unless ($inline)
{   &aesni_generate1("enc");	}
&function_begin_B("${PREFIX}_encrypt");
	&mov	("eax",&wparam(0));		# eax = inp
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));		# eax = out
	$inline	? &aesni_inline_generate1("enc")
		: &call("_aesni_encrypt1");
	&pxor	($rndkey0,$rndkey0);		# clear register bank
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_encrypt");
199
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block decryption entry point, the mirror image of
# ${PREFIX}_encrypt: load the block from inp, fetch $rounds from offset
# 240 of the key structure, run the decrypt rounds, store to out, and
# zero the XMM registers that were used. When $inline is off, the
# out-of-line _aesni_decrypt1 helper is generated first and called.
unless ($inline)
{   &aesni_generate1("dec");	}
&function_begin_B("${PREFIX}_decrypt");
	&mov	("eax",&wparam(0));		# eax = inp
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));		# eax = out
	$inline	? &aesni_inline_generate1("dec")
		: &call("_aesni_decrypt1");
	&pxor	($rndkey0,$rndkey0);		# clear register bank
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_decrypt");
218
219# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
220# factor. Why 3x subroutine were originally used in loops? Even though
221# aes[enc|dec] latency was originally 6, it could be scheduled only
222# every *2nd* cycle. Thus 3x interleave was the one providing optimal
223# utilization, i.e. when subroutine's throughput is virtually same as
224# of non-interleaved subroutine [for number of input blocks up to 3].
225# This is why it originally made no sense to implement 2x subroutine.
226# But times change and it became appropriate to spend extra 192 bytes
227# on 2x subroutine on Atom Silvermont account. For processors that
228# can schedule aes[enc|dec] every cycle optimal interleave factor
229# equals to corresponding instructions latency. 8x is optimal for
230# * Bridge, but it's unfeasible to accommodate such implementation
231# in XMM registers addressable in 32-bit mode and therefore maximum
232# of 6x is used instead...
233
# aesni_generate2($p) emits the private subroutine _aesni_${p}rypt2,
# which runs $inout0 and $inout1 through the cipher in parallel. $p is
# "enc" or "dec".
#
# The emitted code scales $rounds by 16, points $key past the round
# keys and negates $rounds into a byte index that is stepped by 32
# until it reaches zero, so the "add" doubles as the loop test. $key,
# $rounds, $rndkey0 and $rndkey1 are clobbered.
#
# The instruction subs are called through code references rather than
# string eval, so an error inside them aborts generation.
sub aesni_generate2
{ my $p=shift;
  my $round=\&{"aes${p}"};		# aesenc or aesdec
  my $final=\&{"aes${p}last"};		# aesenclast or aesdeclast
  my @lanes=($inout0,$inout1);

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}2_loop");
	foreach my $lane (@lanes) { &$round($lane,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $lane (@lanes) { &$round($lane,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}2_loop"));
    foreach my $lane (@lanes) { &$round($lane,$rndkey1); }
    foreach my $lane (@lanes) { &$final($lane,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}
264
# aesni_generate3($p) emits the private subroutine _aesni_${p}rypt3,
# which runs $inout0..$inout2 through the cipher in parallel. $p is
# "enc" or "dec".
#
# Loop control works as in the 2x routine: $rounds is scaled by 16 and
# negated into a byte index relative to the adjusted $key, and stepping
# it by 32 towards zero terminates the loop. $key, $rounds, $rndkey0
# and $rndkey1 are clobbered.
#
# The instruction subs are called through code references rather than
# string eval, so an error inside them aborts generation.
sub aesni_generate3
{ my $p=shift;
  my $round=\&{"aes${p}"};		# aesenc or aesdec
  my $final=\&{"aes${p}last"};		# aesenclast or aesdeclast
  my @lanes=($inout0,$inout1,$inout2);

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}3_loop");
	foreach my $lane (@lanes) { &$round($lane,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $lane (@lanes) { &$round($lane,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}3_loop"));
    foreach my $lane (@lanes) { &$round($lane,$rndkey1); }
    foreach my $lane (@lanes) { &$final($lane,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}
300
301# 4x interleave is implemented to improve small block performance,
302# most notably [and naturally] 4 block by ~30%. One can argue that one
303# should have implemented 5x as well, but improvement  would be <20%,
304# so it's not worth it...
# aesni_generate4($p) emits the private subroutine _aesni_${p}rypt4,
# which runs $inout0..$inout3 through the cipher in parallel. $p is
# "enc" or "dec".
#
# Loop control works as in the 2x/3x routines: $rounds becomes a
# negative byte index relative to the adjusted $key and is stepped by
# 32 until it reaches zero. $key, $rounds, $rndkey0 and $rndkey1 are
# clobbered.
#
# The instruction subs are called through code references rather than
# string eval, so an error inside them aborts generation.
sub aesni_generate4
{ my $p=shift;
  my $round=\&{"aes${p}"};		# aesenc or aesdec
  my $final=\&{"aes${p}last"};		# aesenclast or aesdeclast
  my @lanes=($inout0,$inout1,$inout2,$inout3);

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	# Four raw bytes 0F 1F 40 00: a multi-byte NOP, presumably
	# padding ahead of the loop -- TODO confirm intent.
	&data_byte	(0x0f,0x1f,0x40,0x00);
	&add		($rounds,16);

    &set_label("${p}4_loop");
	foreach my $lane (@lanes) { &$round($lane,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $lane (@lanes) { &$round($lane,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}4_loop"));

    foreach my $lane (@lanes) { &$round($lane,$rndkey1); }
    foreach my $lane (@lanes) { &$final($lane,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}
347
# aesni_generate6($p) emits the private subroutine _aesni_${p}rypt6,
# which runs $inout0..$inout5 through the cipher in parallel. $p is
# "enc" or "dec".
#
# Unlike the smaller variants, the prologue interleaves the first round
# for lanes 0..2 with the round-key-0 xor of the remaining lanes and
# then jumps into the middle of the loop body (label
# "_aesni_${p}rypt6_inner"). A second mid-loop label,
# "_aesni_${p}rypt6_enter", is declared static and placed after the
# first-round instructions of the loop body, ahead of the $rndkey1
# reload. Loop control is the same negative byte index in $rounds as in
# the other variants. $key, $rounds, $rndkey0 and $rndkey1 are
# clobbered.
#
# The instruction subs are called through code references rather than
# string eval, so an error inside them aborts generation.
sub aesni_generate6
{ my $p=shift;
  my $round=\&{"aes${p}"};		# aesenc or aesdec
  my $final=\&{"aes${p}last"};		# aesenclast or aesdeclast
  my @low=($inout0,$inout1,$inout2);	# lanes started in the prologue
  my @high=($inout3,$inout4,$inout5);	# lanes started at ..._inner
  my @lanes=(@low,@high);

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	&$round		($inout0,$rndkey1);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	&$round		($inout1,$rndkey1);
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&$round		($inout2,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
	foreach my $lane (@low)  { &$round($lane,$rndkey1); }
    &set_label("_aesni_${p}rypt6_inner");
	foreach my $lane (@high) { &$round($lane,$rndkey1); }
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $lane (@lanes) { &$round($lane,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}6_loop"));

    foreach my $lane (@lanes) { &$round($lane,$rndkey1); }
    foreach my $lane (@lanes) { &$final($lane,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Emit the interleaved helpers in ascending order of interleave factor.
# The decrypt flavour is always generated; the encrypt flavour only
# when building under the "aesni" prefix.
foreach my $factor (2,3,4,6)
{ my $generate=\&{"aesni_generate${factor}"};
    &$generate("enc") if ($PREFIX eq "aesni");
    &$generate("dec");
}
414
415if ($PREFIX eq "aesni") {
######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# length is rounded down to a multiple of 16; nothing is processed if
# that leaves zero. A zero enc selects the decrypt path. Input is
# consumed six blocks (0x60 bytes) at a time through _aesni_*crypt6,
# with the store of one batch interleaved with the load of the next.
# A tail of one to five blocks is dispatched to the 1x/2x/3x/4x
# routine, or to the 6x routine with the sixth lane zeroed for five
# blocks. All eight XMM registers are cleared on exit.
#
# The encrypt and decrypt halves are the same instruction sequence
# apart from the routines they call, so both are produced by one loop
# over the direction. The only asymmetries are that the decrypt half
# starts at an aligned label and that its last tail case falls through
# to ecb_ret instead of jumping to it.
&function_begin("aesni_ecb_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&and	($len,-16);
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));
	&test	($rounds_,$rounds_);
	&jz	(&label("ecb_decrypt"));

{ my @blk=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);

  # Store the first $count block registers to consecutive slots at $out.
  my $store=sub
  { my $count=shift;
    foreach my $i (0..$count-1)
    {	&movups	(&QWP(0x10*$i,$out),$blk[$i]);	}
  };

  foreach my $p ("enc","dec")
  {
    &set_label("ecb_decrypt",16)	if ($p eq "dec");
	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_${p}_tail"));

	foreach my $i (0..5)
	{   &movdqu	($blk[$i],&QWP(0x10*$i,$inp));	}
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_${p}_loop6_enter"));

    &set_label("ecb_${p}_loop6",16);
	# write out the previous batch while fetching the next one
	foreach my $i (0..4)
	{   &movups	(&QWP(0x10*$i,$out),$blk[$i]);
	    &movdqu	($blk[$i],&QWP(0x10*$i,$inp));
	}
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
    &set_label("ecb_${p}_loop6_enter");

	&call	("_aesni_${p}rypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_${p}_loop6"));

	&$store	(6);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);		# undo the last subtraction
	&jz	(&label("ecb_ret"));

    &set_label("ecb_${p}_tail");
	# Each cmp decides two branches; the movups in between leaves
	# the flags untouched.
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_${p}_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_${p}_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_${p}_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_${p}_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);	# five blocks: zero the unused lane
	&call	("_aesni_${p}rypt6");
	&$store	(5);
	&jmp	(&label("ecb_ret"));

    &set_label("ecb_${p}_one",16);
	if ($inline)
	{   &aesni_inline_generate1($p);	}
	else
	{   &call	("_aesni_${p}rypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

    &set_label("ecb_${p}_two",16);
	&call	("_aesni_${p}rypt2");
	&$store	(2);
	&jmp	(&label("ecb_ret"));

    &set_label("ecb_${p}_three",16);
	&call	("_aesni_${p}rypt3");
	&$store	(3);
	&jmp	(&label("ecb_ret"));

    &set_label("ecb_${p}_four",16);
	&call	("_aesni_${p}rypt4");
	&$store	(4);
	# the decrypt half is emitted last and falls through to ecb_ret
	&jmp	(&label("ecb_ret"))	if ($p eq "enc");
  }
}

&set_label("ecb_ret");
	# clear register bank
	foreach my $i (0..7)
	{   &pxor	("xmm$i","xmm$i");	}
&function_end("aesni_ecb_encrypt");
640
641######################################################################
642# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
643#                         size_t blocks, const AES_KEY *key,
644#                         const char *ivec,char *cmac);
645#
646# Handles only complete blocks, operates on 64-bit counter and
647# does not update *ivec! Nor does it finalize CMAC value
648# (see engine/eng_aesni.c for details)
649#
# aesni_ccm64_encrypt_blocks / aesni_ccm64_decrypt_blocks.
#
# Both routines keep the running CMAC in $cmac, the counter block in
# $ivec and the byte-swap mask in $inout3. Each outer iteration
# processes one 16-byte block, pushing the counter block and the CMAC
# through the cipher side by side as two lanes. The entry sequence,
# the exit sequence and the 2-lane round loop are identical in both
# routines, so each is written once as a closure and expanded twice.
#
# Aligned stack frame set up by the entry sequence:
#	0	pshufb byte-swap mask
#	16	64-bit counter increment (1)
#	48	saved %esp
{ my $cmac=$inout1;

  # Load the six arguments, carve out the aligned frame, fetch ivec,
  # cmac and the value at key+240, and fill in the two constants.
  my $prologue=sub
  { my @bswap=(0x0c0d0e0f,0x08090a0b,0x04050607,0x00010203);

	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	foreach my $i (0..3)
	{   &mov	(&DWP(4*$i,"esp"),$bswap[$i]);	}

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	foreach my $off (20,24,28)
	{   &mov	(&DWP($off,"esp"),$key_);	}
  };

  # Restore %esp, write the CMAC back through the sixth argument and
  # wipe all XMM registers.
  my $epilogue=sub
  {	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);

	# clear register bank
	foreach my $i (0..7)
	{   &pxor	("xmm$i","xmm$i");	}
  };

  # 2-lane round loop over $inout0 and $cmac, placed at label $name.
  # $rounds is a negative byte index stepped by 32 until it hits zero.
  my $rounds2x=sub
  { my $name=shift;

    &set_label($name);
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label($name));
  };

&function_begin("aesni_ccm64_encrypt_blocks");
	&$prologue	();

	&shl	($rounds,4);
	&mov	($rounds_,16);
	&lea	($key_,&DWP(0,$key));		# $key_ keeps the schedule start
	&movdqa	($inout3,&QWP(0,"esp"));
	&movdqa	($inout0,$ivec);
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($rounds_,$rounds);		# $rounds_ = loop start index
	&pshufb	($ivec,$inout3);

&set_label("ccm64_enc_outer");
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&movups		($in0,&QWP(0,$inp));

	&xorps		($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($rndkey0,$in0);
	&xorps		($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(32,$key_));

	&$rounds2x	("ccm64_enc2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&paddq		($ivec,&QWP(16,"esp"));
	&dec		($len);			# flags consumed by jnz below
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);

	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&pshufb	($inout0,$inout3);
	&lea	($out,&DWP(16,$out));
	&jnz	(&label("ccm64_enc_outer"));

	&$epilogue	();
&function_end("aesni_ccm64_encrypt_blocks");

&function_begin("aesni_ccm64_decrypt_blocks");
	&$prologue	();

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);

	&mov	($key_,$key);
	&mov	($rounds_,$rounds);

	&pshufb	($ivec,$inout3);
	# The first counter block is encrypted on its own, ahead of the loop.
	$inline	? &aesni_inline_generate1("enc")
		: &call("_aesni_encrypt1");
	&shl	($rounds_,4);
	&mov	($rounds,16);
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&QWP(16,$inp));
	&sub	($rounds,$rounds_);
	&lea	($key,&DWP(32,$key_,$rounds_));
	&mov	($rounds_,$rounds);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));

	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($in0,$rndkey0);
	&xorps		($inout0,$rndkey0);
	&xorps		($cmac,$in0);		# cmac^=out
	&$movekey	($rndkey0,&QWP(32,$key_));

	&$rounds2x	("ccm64_dec2_loop");
	&movups		($in0,&QWP(0,$inp));	# load inp
	&paddq		($ivec,&QWP(16,"esp"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&lea		($inp,&QWP(16,$inp));
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	# The last output block still has to be folded into the CMAC.
	&mov	($rounds,&DWP(240,$key_));
	&mov	($key,$key_);
	# NOTE(review): the non-inline branch passes $cmac to &call, while
	# _aesni_encrypt1 is generated for $inout0 and does no xor with
	# $in0 -- looks inconsistent with the inline branch; it is only
	# reached when $inline is off. Confirm before relying on it.
	$inline	? &aesni_inline_generate1("enc",$cmac,$in0)
		: &call("_aesni_encrypt1",$cmac);

	&$epilogue	();
&function_end("aesni_ccm64_decrypt_blocks");
}
848
849######################################################################
850# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
851#                         size_t blocks, const AES_KEY *key,
852#                         const char *ivec);
853#
854# Handles only complete blocks, operates on 32-bit counter and
855# does not update *ivec! (see crypto/modes/ctr128.c for details)
856#
857# stack layout:
858#	0	pshufb mask
859#	16	vector addend: 0,6,6,6
860# 	32	counter-less ivec
861#	48	1st triplet of counter vector
862#	64	2nd triplet of counter vector
863#	80	saved %esp
864
865&function_begin("aesni_ctr32_encrypt_blocks");
866	&mov	($inp,&wparam(0));
867	&mov	($out,&wparam(1));
868	&mov	($len,&wparam(2));
869	&mov	($key,&wparam(3));
870	&mov	($rounds_,&wparam(4));
871	&mov	($key_,"esp");
872	&sub	("esp",88);
873	&and	("esp",-16);			# align stack
874	&mov	(&DWP(80,"esp"),$key_);
875
876	&cmp	($len,1);
877	&je	(&label("ctr32_one_shortcut"));
878
879	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec
880
881	# compose byte-swap control mask for pshufb on stack
882	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
883	&mov	(&DWP(4,"esp"),0x08090a0b);
884	&mov	(&DWP(8,"esp"),0x04050607);
885	&mov	(&DWP(12,"esp"),0x00010203);
886
887	# compose counter increment vector on stack
888	&mov	($rounds,6);
889	&xor	($key_,$key_);
890	&mov	(&DWP(16,"esp"),$rounds);
891	&mov	(&DWP(20,"esp"),$rounds);
892	&mov	(&DWP(24,"esp"),$rounds);
893	&mov	(&DWP(28,"esp"),$key_);
894
895	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
896	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter
897
898	&mov	($rounds,&DWP(240,$key));	# key->rounds
899
900	# compose 2 vectors of 3x32-bit counters
901	&bswap	($rounds_);
902	&pxor	($rndkey0,$rndkey0);
903	&pxor	($rndkey1,$rndkey1);
904	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
905	&pinsrd	($rndkey0,$rounds_,0);
906	&lea	($key_,&DWP(3,$rounds_));
907	&pinsrd	($rndkey1,$key_,0);
908	&inc	($rounds_);
909	&pinsrd	($rndkey0,$rounds_,1);
910	&inc	($key_);
911	&pinsrd	($rndkey1,$key_,1);
912	&inc	($rounds_);
913	&pinsrd	($rndkey0,$rounds_,2);
914	&inc	($key_);
915	&pinsrd	($rndkey1,$key_,2);
916	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
917	&pshufb	($rndkey0,$inout0);		# byte swap
918	&movdqu	($inout4,&QWP(0,$key));		# key[0]
919	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
920	&pshufb	($rndkey1,$inout0);		# byte swap
921
922	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
923	&pshufd	($inout1,$rndkey0,2<<6);
924	&cmp	($len,6);
925	&jb	(&label("ctr32_tail"));
926	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
927	&shl	($rounds,4);
928	&mov	($rounds_,16);
929	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
930	&mov	($key_,$key);			# backup $key
931	&sub	($rounds_,$rounds);		# backup twisted $rounds
932	&lea	($key,&DWP(32,$key,$rounds));
933	&sub	($len,6);
934	&jmp	(&label("ctr32_loop6"));
935
936&set_label("ctr32_loop6",16);
937	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
938	&pshufd	($inout2,$rndkey0,1<<6);
939	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
940	&pshufd	($inout3,$rndkey1,3<<6);
941	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
942	&pshufd	($inout4,$rndkey1,2<<6);
943	&pxor		($inout1,$rndkey0);
944	&pshufd	($inout5,$rndkey1,1<<6);
945	&$movekey	($rndkey1,&QWP(16,$key_));
946	&pxor		($inout2,$rndkey0);
947	&pxor		($inout3,$rndkey0);
948	&aesenc		($inout0,$rndkey1);
949	&pxor		($inout4,$rndkey0);
950	&pxor		($inout5,$rndkey0);
951	&aesenc		($inout1,$rndkey1);
952	&$movekey	($rndkey0,&QWP(32,$key_));
953	&mov		($rounds,$rounds_);
954	&aesenc		($inout2,$rndkey1);
955	&aesenc		($inout3,$rndkey1);
956	&aesenc		($inout4,$rndkey1);
957	&aesenc		($inout5,$rndkey1);
958
959	&call		(&label("_aesni_encrypt6_enter"));
960
961	&movups	($rndkey1,&QWP(0,$inp));
962	&movups	($rndkey0,&QWP(0x10,$inp));
963	&xorps	($inout0,$rndkey1);
964	&movups	($rndkey1,&QWP(0x20,$inp));
965	&xorps	($inout1,$rndkey0);
966	&movups	(&QWP(0,$out),$inout0);
967	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
968	&xorps	($inout2,$rndkey1);
969	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
970	&movups	(&QWP(0x10,$out),$inout1);
971	&movups	(&QWP(0x20,$out),$inout2);
972
973	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
974	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
975	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
976
977	&movups	($inout1,&QWP(0x30,$inp));
978	&movups	($inout2,&QWP(0x40,$inp));
979	&xorps	($inout3,$inout1);
980	&movups	($inout1,&QWP(0x50,$inp));
981	&lea	($inp,&DWP(0x60,$inp));
982	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
983	&pshufb	($rndkey0,$inout0);		# byte swap
984	&xorps	($inout4,$inout2);
985	&movups	(&QWP(0x30,$out),$inout3);
986	&xorps	($inout5,$inout1);
987	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
988	&pshufb	($rndkey1,$inout0);		# byte swap
989	&movups	(&QWP(0x40,$out),$inout4);
990	&pshufd	($inout0,$rndkey0,3<<6);
991	&movups	(&QWP(0x50,$out),$inout5);
992	&lea	($out,&DWP(0x60,$out));
993
994	&pshufd	($inout1,$rndkey0,2<<6);
995	&sub	($len,6);
996	&jnc	(&label("ctr32_loop6"));
997
998	&add	($len,6);
999	&jz	(&label("ctr32_ret"));
1000	&movdqu	($inout5,&QWP(0,$key_));
1001	&mov	($key,$key_);
1002	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
1003	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1004
1005&set_label("ctr32_tail");
1006	&por	($inout0,$inout5);
1007	&cmp	($len,2);
1008	&jb	(&label("ctr32_one"));
1009
1010	&pshufd	($inout2,$rndkey0,1<<6);
1011	&por	($inout1,$inout5);
1012	&je	(&label("ctr32_two"));
1013
1014	&pshufd	($inout3,$rndkey1,3<<6);
1015	&por	($inout2,$inout5);
1016	&cmp	($len,4);
1017	&jb	(&label("ctr32_three"));
1018
1019	&pshufd	($inout4,$rndkey1,2<<6);
1020	&por	($inout3,$inout5);
1021	&je	(&label("ctr32_four"));
1022
1023	&por	($inout4,$inout5);
1024	&call	("_aesni_encrypt6");
1025	&movups	($rndkey1,&QWP(0,$inp));
1026	&movups	($rndkey0,&QWP(0x10,$inp));
1027	&xorps	($inout0,$rndkey1);
1028	&movups	($rndkey1,&QWP(0x20,$inp));
1029	&xorps	($inout1,$rndkey0);
1030	&movups	($rndkey0,&QWP(0x30,$inp));
1031	&xorps	($inout2,$rndkey1);
1032	&movups	($rndkey1,&QWP(0x40,$inp));
1033	&xorps	($inout3,$rndkey0);
1034	&movups	(&QWP(0,$out),$inout0);
1035	&xorps	($inout4,$rndkey1);
1036	&movups	(&QWP(0x10,$out),$inout1);
1037	&movups	(&QWP(0x20,$out),$inout2);
1038	&movups	(&QWP(0x30,$out),$inout3);
1039	&movups	(&QWP(0x40,$out),$inout4);
1040	&jmp	(&label("ctr32_ret"));
1041
1042&set_label("ctr32_one_shortcut",16);
1043	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
1044	&mov	($rounds,&DWP(240,$key));
1045
1046&set_label("ctr32_one");
1047	if ($inline)
1048	{   &aesni_inline_generate1("enc");	}
1049	else
1050	{   &call	("_aesni_encrypt1");	}
1051	&movups	($in0,&QWP(0,$inp));
1052	&xorps	($in0,$inout0);
1053	&movups	(&QWP(0,$out),$in0);
1054	&jmp	(&label("ctr32_ret"));
1055
1056&set_label("ctr32_two",16);
1057	&call	("_aesni_encrypt2");
1058	&movups	($inout3,&QWP(0,$inp));
1059	&movups	($inout4,&QWP(0x10,$inp));
1060	&xorps	($inout0,$inout3);
1061	&xorps	($inout1,$inout4);
1062	&movups	(&QWP(0,$out),$inout0);
1063	&movups	(&QWP(0x10,$out),$inout1);
1064	&jmp	(&label("ctr32_ret"));
1065
1066&set_label("ctr32_three",16);
1067	&call	("_aesni_encrypt3");
1068	&movups	($inout3,&QWP(0,$inp));
1069	&movups	($inout4,&QWP(0x10,$inp));
1070	&xorps	($inout0,$inout3);
1071	&movups	($inout5,&QWP(0x20,$inp));
1072	&xorps	($inout1,$inout4);
1073	&movups	(&QWP(0,$out),$inout0);
1074	&xorps	($inout2,$inout5);
1075	&movups	(&QWP(0x10,$out),$inout1);
1076	&movups	(&QWP(0x20,$out),$inout2);
1077	&jmp	(&label("ctr32_ret"));
1078
1079&set_label("ctr32_four",16);
1080	&call	("_aesni_encrypt4");
1081	&movups	($inout4,&QWP(0,$inp));
1082	&movups	($inout5,&QWP(0x10,$inp));
1083	&movups	($rndkey1,&QWP(0x20,$inp));
1084	&xorps	($inout0,$inout4);
1085	&movups	($rndkey0,&QWP(0x30,$inp));
1086	&xorps	($inout1,$inout5);
1087	&movups	(&QWP(0,$out),$inout0);
1088	&xorps	($inout2,$rndkey1);
1089	&movups	(&QWP(0x10,$out),$inout1);
1090	&xorps	($inout3,$rndkey0);
1091	&movups	(&QWP(0x20,$out),$inout2);
1092	&movups	(&QWP(0x30,$out),$inout3);
1093
1094&set_label("ctr32_ret");
1095	&pxor	("xmm0","xmm0");		# clear register bank
1096	&pxor	("xmm1","xmm1");
1097	&pxor	("xmm2","xmm2");
1098	&pxor	("xmm3","xmm3");
1099	&pxor	("xmm4","xmm4");
1100	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
1101	&pxor	("xmm5","xmm5");
1102	&movdqa	(&QWP(48,"esp"),"xmm0");
1103	&pxor	("xmm6","xmm6");
1104	&movdqa	(&QWP(64,"esp"),"xmm0");
1105	&pxor	("xmm7","xmm7");
1106	&mov	("esp",&DWP(80,"esp"));
1107&function_end("aesni_ctr32_encrypt_blocks");
1108
1109######################################################################
1110# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1111#	const AES_KEY *key1, const AES_KEY *key2,
1112#	const unsigned char iv[16]);
1113#
1114{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1115
# aesni_xts_encrypt, entry and setup.  The initial tweak is encrypted with
# key2, then a 16-byte aligned frame is carved out of the stack:
#	16*0..16*5	per-block tweaks
#	16*6		dwords {0x87,0,1,0} used by the tweak update
#	16*7+0		byte length, 16*7+4 caller's %esp
# On exit from this section $tweak holds the first tweak, $twtmp its
# per-dword sign masks and $len the whole-block byte count minus 16*6.
1116&function_begin("aesni_xts_encrypt");
1117	&mov	($key,&wparam(4));		# key2
1118	&mov	($inp,&wparam(5));		# clear-text tweak
1119
1120	&mov	($rounds,&DWP(240,$key));	# key2->rounds
1121	&movups	($inout0,&QWP(0,$inp));		# load the tweak block
1122	if ($inline)
1123	{   &aesni_inline_generate1("enc");	}	# $inout0=E(key2,tweak)
1124	else
1125	{   &call	("_aesni_encrypt1");	}
1126
1127	&mov	($inp,&wparam(0));
1128	&mov	($out,&wparam(1));
1129	&mov	($len,&wparam(2));
1130	&mov	($key,&wparam(3));		# key1
1131
1132	&mov	($key_,"esp");			# remember caller's %esp
1133	&sub	("esp",16*7+8);			# 7 16-byte slots + 2 dwords
1134	&mov	($rounds,&DWP(240,$key));	# key1->rounds
1135	&and	("esp",-16);			# align stack
1136
1137	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
1138	&mov	(&DWP(16*6+4,"esp"),0);
1139	&mov	(&DWP(16*6+8,"esp"),1);
1140	&mov	(&DWP(16*6+12,"esp"),0);
1141	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
1142	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
1143
1144	&movdqa	($tweak,$inout0);		# encrypted tweak = first tweak
1145	&pxor	($twtmp,$twtmp);
1146	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
1147	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1148
1149	&and	($len,-16);			# whole blocks only
1150	&mov	($key_,$key);			# backup $key
1151	&mov	($rounds_,$rounds);		# backup $rounds
1152	&sub	($len,16*6);
1153	&jc	(&label("xts_enc_short"));	# fewer than 6 whole blocks
1154
	# 6x path: $rounds=16*rounds, $rounds_=16-16*rounds and $key points
	# 32+16*rounds bytes into the schedule; presumably the form expected
	# by _aesni_encrypt6_enter, which is defined outside this section.
1155	&shl	($rounds,4);
1156	&mov	($rounds_,16);
1157	&sub	($rounds_,$rounds);
1158	&lea	($key,&DWP(32,$key,$rounds));
1159	&jmp	(&label("xts_enc_loop6"));
1160
# Main encrypt loop, six blocks per iteration.  On entry $tweak holds the
# tweak for the first block and $twtmp its per-dword sign masks.  Each tweak
# update doubles both 64-bit halves (paddq) and xors in the bits selected by
# pshufd 0x13 and the {0x87,0,1,0} mask: the carry out of the low half goes
# into the high half and the carry out of the high half folds 0x87 into the
# low dword.  $twres/$twmask alias $inout0/$inout1 and $tweak/$twtmp alias
# $rndkey1/$rndkey0, hence the careful ordering of loads and stores.
1161&set_label("xts_enc_loop6",16);
	# tweaks for blocks 0..3 go to stack slots 0..3
1162	for ($i=0;$i<4;$i++) {
1163	    &pshufd	($twres,$twtmp,0x13);
1164	    &pxor	($twtmp,$twtmp);
1165	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
1166	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
1167	    &pand	($twres,$twmask);	# isolate carry and residue
1168	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
1169	    &pxor	($tweak,$twres);
1170	}
	# tweak 4 goes to slot 4 ($i is post-incremented to 5 here), tweak 5
	# is built in $inout5
1171	&pshufd	($inout5,$twtmp,0x13);
1172	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
1173	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1174	 &$movekey	($rndkey0,&QWP(0,$key_));
1175	&pand	($inout5,$twmask);		# isolate carry and residue
1176	 &movups	($inout0,&QWP(0,$inp));	# load input
1177	&pxor	($inout5,$tweak);
1178
1179	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1180	&mov	($rounds,$rounds_);		# restore $rounds
1181	&movdqu	($inout1,&QWP(16*1,$inp));
1182	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
1183	&movdqu	($inout2,&QWP(16*2,$inp));
1184	 &pxor		($inout1,$rndkey0);
1185	&movdqu	($inout3,&QWP(16*3,$inp));
1186	 &pxor		($inout2,$rndkey0);
1187	&movdqu	($inout4,&QWP(16*4,$inp));
1188	 &pxor		($inout3,$rndkey0);
1189	&movdqu	($rndkey1,&QWP(16*5,$inp));	# 6th input, overwrites $tweak
1190	 &pxor		($inout4,$rndkey0);
1191	&lea	($inp,&DWP(16*6,$inp));
1192	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1193	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
1194	&pxor	($inout5,$rndkey1);		# tweak 5 ^ 6th input
1195
1196	 &$movekey	($rndkey1,&QWP(16,$key_));
1197	&pxor	($inout1,&QWP(16*1,"esp"));
1198	&pxor	($inout2,&QWP(16*2,"esp"));
1199	 &aesenc	($inout0,$rndkey1);
1200	&pxor	($inout3,&QWP(16*3,"esp"));
1201	&pxor	($inout4,&QWP(16*4,"esp"));
1202	 &aesenc	($inout1,$rndkey1);
1203	&pxor		($inout5,$rndkey0);	# ^=rndkey[0] for the 6th block
1204	 &$movekey	($rndkey0,&QWP(32,$key_));
1205	 &aesenc	($inout2,$rndkey1);
1206	 &aesenc	($inout3,$rndkey1);
1207	 &aesenc	($inout4,$rndkey1);
1208	 &aesenc	($inout5,$rndkey1);
1209	&call		(&label("_aesni_encrypt6_enter"));
1210
1211	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
1212       &pxor	($twtmp,$twtmp);
1213	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1214       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
1215	&xorps	($inout1,&QWP(16*1,"esp"));
1216	&movups	(&QWP(16*0,$out),$inout0);	# write output
1217	&xorps	($inout2,&QWP(16*2,"esp"));
1218	&movups	(&QWP(16*1,$out),$inout1);
1219	&xorps	($inout3,&QWP(16*3,"esp"));
1220	&movups	(&QWP(16*2,$out),$inout2);
1221	&xorps	($inout4,&QWP(16*4,"esp"));
1222	&movups	(&QWP(16*3,$out),$inout3);
1223	&xorps	($inout5,$tweak);
1224	&movups	(&QWP(16*4,$out),$inout4);
1225       &pshufd	($twres,$twtmp,0x13);
1226	&movups	(&QWP(16*5,$out),$inout5);
1227	&lea	($out,&DWP(16*6,$out));
1228       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
1229
	# advance to the tweak for the next block and refresh $twtmp
1230	&pxor	($twtmp,$twtmp);
1231	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1232	&pand	($twres,$twmask);		# isolate carry and residue
1233	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1234	&pxor	($tweak,$twres);
1235
1236	&sub	($len,16*6);
1237	&jnc	(&label("xts_enc_loop6"));	# at least 6 more blocks
1238
1239	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1240	&mov	($key,$key_);			# restore $key
1241	&mov	($rounds_,$rounds);
1242
# Fewer than six whole blocks remain.  $len is restored to the remaining
# whole-block byte count and dispatched to the 1/2/3/4-block handlers; the
# five-block case is handled inline below.  Tweaks are generated one at a
# time between the compares; the je branches test the flags of the
# preceding cmp, relying on the SSE instructions in between leaving EFLAGS
# untouched.
1243&set_label("xts_enc_short");
1244	&add	($len,16*6);			# undo the 6x bias
1245	&jz	(&label("xts_enc_done6x"));	# no whole blocks left
1246
1247	&movdqa	($inout3,$tweak);		# put aside previous tweak
1248	&cmp	($len,0x20);
1249	&jb	(&label("xts_enc_one"));
1250
1251	&pshufd	($twres,$twtmp,0x13);
1252	&pxor	($twtmp,$twtmp);
1253	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1254	&pand	($twres,$twmask);		# isolate carry and residue
1255	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1256	&pxor	($tweak,$twres);
1257	&je	(&label("xts_enc_two"));	# $len==0x20
1258
1259	&pshufd	($twres,$twtmp,0x13);
1260	&pxor	($twtmp,$twtmp);
1261	&movdqa	($inout4,$tweak);		# put aside previous tweak
1262	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1263	&pand	($twres,$twmask);		# isolate carry and residue
1264	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1265	&pxor	($tweak,$twres);
1266	&cmp	($len,0x40);
1267	&jb	(&label("xts_enc_three"));
1268
1269	&pshufd	($twres,$twtmp,0x13);
1270	&pxor	($twtmp,$twtmp);
1271	&movdqa	($inout5,$tweak);		# put aside previous tweak
1272	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1273	&pand	($twres,$twmask);		# isolate carry and residue
1274	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1275	&pxor	($tweak,$twres);
1276	&movdqa	(&QWP(16*0,"esp"),$inout3);	# tweaks 0 and 1 to the stack
1277	&movdqa	(&QWP(16*1,"esp"),$inout4);
1278	&je	(&label("xts_enc_four"));	# $len==0x40
1279
	# five blocks: tweaks 2 and 3 go to the stack, tweak 4 is built in
	# $inout5
1280	&movdqa	(&QWP(16*2,"esp"),$inout5);
1281	&pshufd	($inout5,$twtmp,0x13);
1282	&movdqa	(&QWP(16*3,"esp"),$tweak);
1283	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1284	&pand	($inout5,$twmask);		# isolate carry and residue
1285	&pxor	($inout5,$tweak);
1286
1287	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
1288	&movdqu	($inout1,&QWP(16*1,$inp));
1289	&movdqu	($inout2,&QWP(16*2,$inp));
1290	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1291	&movdqu	($inout3,&QWP(16*3,$inp));
1292	&pxor	($inout1,&QWP(16*1,"esp"));
1293	&movdqu	($inout4,&QWP(16*4,$inp));
1294	&pxor	($inout2,&QWP(16*2,"esp"));
1295	&lea	($inp,&DWP(16*5,$inp));
1296	&pxor	($inout3,&QWP(16*3,"esp"));
1297	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
1298	&pxor	($inout4,$inout5);
1299
	# six-lane routine with five real inputs; the sixth lane ($inout5)
	# still holds the tweak and its result is never stored
1300	&call	("_aesni_encrypt6");
1301
1302	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
1303	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1304	&xorps	($inout1,&QWP(16*1,"esp"));
1305	&xorps	($inout2,&QWP(16*2,"esp"));
1306	&movups	(&QWP(16*0,$out),$inout0);	# write output
1307	&xorps	($inout3,&QWP(16*3,"esp"));
1308	&movups	(&QWP(16*1,$out),$inout1);
1309	&xorps	($inout4,$tweak);
1310	&movups	(&QWP(16*2,$out),$inout2);
1311	&movups	(&QWP(16*3,$out),$inout3);
1312	&movups	(&QWP(16*4,$out),$inout4);
1313	&lea	($out,&DWP(16*5,$out));
1314	&jmp	(&label("xts_enc_done"));
1315
# Exactly one whole block left; its tweak was put aside in $inout3 by the
# dispatch code at xts_enc_short.  Leaves that tweak in $tweak for
# xts_enc_done.
1316&set_label("xts_enc_one",16);
1317	&movups	($inout0,&QWP(16*0,$inp));	# load input
1318	&lea	($inp,&DWP(16*1,$inp));
1319	&xorps	($inout0,$inout3);		# input^=tweak
1320	if ($inline)
1321	{   &aesni_inline_generate1("enc");	}
1322	else
1323	{   &call	("_aesni_encrypt1");	}
1324	&xorps	($inout0,$inout3);		# output^=tweak
1325	&movups	(&QWP(16*0,$out),$inout0);	# write output
1326	&lea	($out,&DWP(16*1,$out));
1327
1328	&movdqa	($tweak,$inout3);		# last tweak
1329	&jmp	(&label("xts_enc_done"));
1330
# Exactly two whole blocks left.  Tweak 0 is in $inout3 (put aside at
# xts_enc_short) and tweak 1 is the current $tweak, copied to $inout4
# because $tweak aliases $rndkey1.  Leaves the last tweak used in $tweak
# for xts_enc_done.
1331&set_label("xts_enc_two",16);
1332	&movaps	($inout4,$tweak);		# put aside last tweak
1333
1334	&movups	($inout0,&QWP(16*0,$inp));	# load input
1335	&movups	($inout1,&QWP(16*1,$inp));
1336	&lea	($inp,&DWP(16*2,$inp));
1337	&xorps	($inout0,$inout3);		# input^=tweak
1338	&xorps	($inout1,$inout4);
1339
1340	&call	("_aesni_encrypt2");
1341
1342	&xorps	($inout0,$inout3);		# output^=tweak
1343	&xorps	($inout1,$inout4);
1344	&movups	(&QWP(16*0,$out),$inout0);	# write output
1345	&movups	(&QWP(16*1,$out),$inout1);
1346	&lea	($out,&DWP(16*2,$out));
1347
1348	&movdqa	($tweak,$inout4);		# last tweak
1349	&jmp	(&label("xts_enc_done"));
1350
# Exactly three whole blocks left.  Tweaks 0 and 1 were put aside in
# $inout3 and $inout4 at xts_enc_short; tweak 2 is the current $tweak and
# is copied to $inout5.  Leaves the last tweak used in $tweak for
# xts_enc_done.
1351&set_label("xts_enc_three",16);
1352	&movaps	($inout5,$tweak);		# put aside last tweak
1353	&movups	($inout0,&QWP(16*0,$inp));	# load input
1354	&movups	($inout1,&QWP(16*1,$inp));
1355	&movups	($inout2,&QWP(16*2,$inp));
1356	&lea	($inp,&DWP(16*3,$inp));
1357	&xorps	($inout0,$inout3);		# input^=tweak
1358	&xorps	($inout1,$inout4);
1359	&xorps	($inout2,$inout5);
1360
1361	&call	("_aesni_encrypt3");
1362
1363	&xorps	($inout0,$inout3);		# output^=tweak
1364	&xorps	($inout1,$inout4);
1365	&xorps	($inout2,$inout5);
1366	&movups	(&QWP(16*0,$out),$inout0);	# write output
1367	&movups	(&QWP(16*1,$out),$inout1);
1368	&movups	(&QWP(16*2,$out),$inout2);
1369	&lea	($out,&DWP(16*3,$out));
1370
1371	&movdqa	($tweak,$inout5);		# last tweak
1372	&jmp	(&label("xts_enc_done"));
1373
# Exactly four whole blocks left.  Tweaks 0 and 1 were stored to stack slots
# 0 and 1 at xts_enc_short (their registers are needed for data here),
# tweak 2 is in $inout5 and tweak 3 is the current $tweak, copied to
# $inout4.  Leaves the last tweak used in $tweak for xts_enc_done.
1374&set_label("xts_enc_four",16);
1375	&movaps	($inout4,$tweak);		# put aside last tweak
1376
1377	&movups	($inout0,&QWP(16*0,$inp));	# load input
1378	&movups	($inout1,&QWP(16*1,$inp));
1379	&movups	($inout2,&QWP(16*2,$inp));
1380	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1381	&movups	($inout3,&QWP(16*3,$inp));
1382	&lea	($inp,&DWP(16*4,$inp));
1383	&xorps	($inout1,&QWP(16*1,"esp"));
1384	&xorps	($inout2,$inout5);
1385	&xorps	($inout3,$inout4);
1386
1387	&call	("_aesni_encrypt4");
1388
1389	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1390	&xorps	($inout1,&QWP(16*1,"esp"));
1391	&xorps	($inout2,$inout5);
1392	&movups	(&QWP(16*0,$out),$inout0);	# write output
1393	&xorps	($inout3,$inout4);
1394	&movups	(&QWP(16*1,$out),$inout1);
1395	&movups	(&QWP(16*2,$out),$inout2);
1396	&movups	(&QWP(16*3,$out),$inout3);
1397	&lea	($out,&DWP(16*4,$out));
1398
1399	&movdqa	($tweak,$inout4);		# last tweak
1400	&jmp	(&label("xts_enc_done"));
1401
# All whole blocks are written.  Both entry points reload the saved byte
# length and return if it is a multiple of 16; otherwise they leave the
# tweak for the final (stolen) block in $inout3, store $len%16 back into
# the frame and continue at xts_enc_steal.
#   xts_enc_done6x: reached when no blocks were left for the short path;
#		    $tweak already is the next unused tweak.
#   xts_enc_done:   reached from the 1..5-block handlers, where $tweak is
#		    the last tweak used, so one more update is done here.
1402&set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
1403	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1404	&and	($len,15);
1405	&jz	(&label("xts_enc_ret"));
1406	&movdqa	($inout3,$tweak);
1407	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1408	&jmp	(&label("xts_enc_steal"));
1409
1410&set_label("xts_enc_done",16);
1411	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
1412	&pxor	($twtmp,$twtmp);
1413	&and	($len,15);
1414	&jz	(&label("xts_enc_ret"));
1415
1416	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1417	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1418	&pshufd	($inout3,$twtmp,0x13);
1419	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1420	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
1421	&pxor	($inout3,$tweak);
1422
# Ciphertext stealing for a trailing partial block of $len (1..15) bytes.
# $inp points at the partial input and $out just past the last full output
# block.  Each pass moves one byte of that last output block to the partial
# output position and replaces it with the corresponding input byte.  The
# patched block at $out-16 is then encrypted in place with the tweak held
# in $inout3.  $rounds and $key serve as byte scratch inside the loop and
# are restored from their backups afterwards.
1423&set_label("xts_enc_steal");
1424	&movz	($rounds,&BP(0,$inp));		# next trailing input byte
1425	&movz	($key,&BP(-16,$out));		# byte of last full output block
1426	&lea	($inp,&DWP(1,$inp));
1427	&mov	(&BP(-16,$out),&LB($rounds));
1428	&mov	(&BP(0,$out),&LB($key));
1429	&lea	($out,&DWP(1,$out));
1430	&sub	($len,1);
1431	&jnz	(&label("xts_enc_steal"));
1432
1433	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
1434	&mov	($key,$key_);			# restore $key
1435	&mov	($rounds,$rounds_);		# restore $rounds
1436
1437	&movups	($inout0,&QWP(-16,$out));	# load input
1438	&xorps	($inout0,$inout3);		# input^=tweak
1439	if ($inline)
1440	{   &aesni_inline_generate1("enc");	}
1441	else
1442	{   &call	("_aesni_encrypt1");	}
1443	&xorps	($inout0,$inout3);		# output^=tweak
1444	&movups	(&QWP(-16,$out),$inout0);	# write output
1445
# Common exit of aesni_xts_encrypt: zero xmm0..xmm7 and the six tweak slots
# of the frame (16*0..16*5), then reload the caller's %esp saved at 16*7+4.
# The constant at 16*6 and the two dwords at 16*7 are not cleared.
1446&set_label("xts_enc_ret");
1447	&pxor	("xmm0","xmm0");		# clear register bank
1448	&pxor	("xmm1","xmm1");
1449	&pxor	("xmm2","xmm2");
1450	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
1451	&pxor	("xmm3","xmm3");
1452	&movdqa	(&QWP(16*1,"esp"),"xmm0");
1453	&pxor	("xmm4","xmm4");
1454	&movdqa	(&QWP(16*2,"esp"),"xmm0");
1455	&pxor	("xmm5","xmm5");
1456	&movdqa	(&QWP(16*3,"esp"),"xmm0");
1457	&pxor	("xmm6","xmm6");
1458	&movdqa	(&QWP(16*4,"esp"),"xmm0");
1459	&pxor	("xmm7","xmm7");
1460	&movdqa	(&QWP(16*5,"esp"),"xmm0");
1461	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
1462&function_end("aesni_xts_encrypt");
1463
# aesni_xts_decrypt, entry and setup.  The initial tweak is *encrypted* with
# key2 (note "enc" below), exactly as in the encrypt routine.  If the length
# is not a multiple of 16, one whole block is held back (len-=16) so that
# it can be processed together with the trailing partial block.  Frame
# layout after alignment:
#	16*0..16*5	per-block tweaks
#	16*6		dwords {0x87,0,1,0} used by the tweak update
#	16*7+0		adjusted byte length, 16*7+4 caller's %esp
1464&function_begin("aesni_xts_decrypt");
1465	&mov	($key,&wparam(4));		# key2
1466	&mov	($inp,&wparam(5));		# clear-text tweak
1467
1468	&mov	($rounds,&DWP(240,$key));	# key2->rounds
1469	&movups	($inout0,&QWP(0,$inp));		# load the tweak block
1470	if ($inline)
1471	{   &aesni_inline_generate1("enc");	}	# $inout0=E(key2,tweak)
1472	else
1473	{   &call	("_aesni_encrypt1");	}
1474
1475	&mov	($inp,&wparam(0));
1476	&mov	($out,&wparam(1));
1477	&mov	($len,&wparam(2));
1478	&mov	($key,&wparam(3));		# key1
1479
1480	&mov	($key_,"esp");			# remember caller's %esp
1481	&sub	("esp",16*7+8);			# 7 16-byte slots + 2 dwords
1482	&and	("esp",-16);			# align stack
1483
1484	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
1485	&test	($len,15);
1486	&setnz	(&LB($rounds_));
1487	&shl	($rounds_,4);			# 16 if len%16, else 0
1488	&sub	($len,$rounds_);
1489
1490	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
1491	&mov	(&DWP(16*6+4,"esp"),0);
1492	&mov	(&DWP(16*6+8,"esp"),1);
1493	&mov	(&DWP(16*6+12,"esp"),0);
1494	&mov	(&DWP(16*7+0,"esp"),$len);	# save adjusted $len (len%16 is unchanged)
1495	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
1496
1497	&mov	($rounds,&DWP(240,$key));	# key1->rounds
1498	&mov	($key_,$key);			# backup $key
1499	&mov	($rounds_,$rounds);		# backup $rounds
1500
1501	&movdqa	($tweak,$inout0);		# encrypted tweak = first tweak
1502	&pxor	($twtmp,$twtmp);
1503	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
1504	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1505
1506	&and	($len,-16);			# whole blocks only
1507	&sub	($len,16*6);
1508	&jc	(&label("xts_dec_short"));	# fewer than 6 whole blocks
1509
	# 6x path: $rounds=16*rounds, $rounds_=16-16*rounds and $key points
	# 32+16*rounds bytes into the schedule; presumably the form expected
	# by _aesni_decrypt6_enter, which is defined outside this section.
1510	&shl	($rounds,4);
1511	&mov	($rounds_,16);
1512	&sub	($rounds_,$rounds);
1513	&lea	($key,&DWP(32,$key,$rounds));
1514	&jmp	(&label("xts_dec_loop6"));
1515
# Main decrypt loop, six blocks per iteration.  On entry $tweak holds the
# tweak for the first block and $twtmp its per-dword sign masks.  Each tweak
# update doubles both 64-bit halves (paddq) and xors in the bits selected by
# pshufd 0x13 and the {0x87,0,1,0} mask: the carry out of the low half goes
# into the high half and the carry out of the high half folds 0x87 into the
# low dword.  $twres/$twmask alias $inout0/$inout1 and $tweak/$twtmp alias
# $rndkey1/$rndkey0, hence the careful ordering of loads and stores.
1516&set_label("xts_dec_loop6",16);
	# tweaks for blocks 0..3 go to stack slots 0..3
1517	for ($i=0;$i<4;$i++) {
1518	    &pshufd	($twres,$twtmp,0x13);
1519	    &pxor	($twtmp,$twtmp);
1520	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
1521	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
1522	    &pand	($twres,$twmask);	# isolate carry and residue
1523	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
1524	    &pxor	($tweak,$twres);
1525	}
	# tweak 4 goes to slot 4 ($i is post-incremented to 5 here), tweak 5
	# is built in $inout5
1526	&pshufd	($inout5,$twtmp,0x13);
1527	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
1528	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1529	 &$movekey	($rndkey0,&QWP(0,$key_));
1530	&pand	($inout5,$twmask);		# isolate carry and residue
1531	 &movups	($inout0,&QWP(0,$inp));	# load input
1532	&pxor	($inout5,$tweak);
1533
1534	# inline _aesni_decrypt6 prologue and flip xor with tweak and key[0]
1535	&mov	($rounds,$rounds_);		# restore $rounds
1536	&movdqu	($inout1,&QWP(16*1,$inp));
1537	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
1538	&movdqu	($inout2,&QWP(16*2,$inp));
1539	 &pxor		($inout1,$rndkey0);
1540	&movdqu	($inout3,&QWP(16*3,$inp));
1541	 &pxor		($inout2,$rndkey0);
1542	&movdqu	($inout4,&QWP(16*4,$inp));
1543	 &pxor		($inout3,$rndkey0);
1544	&movdqu	($rndkey1,&QWP(16*5,$inp));	# 6th input, overwrites $tweak
1545	 &pxor		($inout4,$rndkey0);
1546	&lea	($inp,&DWP(16*6,$inp));
1547	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1548	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
1549	&pxor	($inout5,$rndkey1);		# tweak 5 ^ 6th input
1550
1551	 &$movekey	($rndkey1,&QWP(16,$key_));
1552	&pxor	($inout1,&QWP(16*1,"esp"));
1553	&pxor	($inout2,&QWP(16*2,"esp"));
1554	 &aesdec	($inout0,$rndkey1);
1555	&pxor	($inout3,&QWP(16*3,"esp"));
1556	&pxor	($inout4,&QWP(16*4,"esp"));
1557	 &aesdec	($inout1,$rndkey1);
1558	&pxor		($inout5,$rndkey0);	# ^=rndkey[0] for the 6th block
1559	 &$movekey	($rndkey0,&QWP(32,$key_));
1560	 &aesdec	($inout2,$rndkey1);
1561	 &aesdec	($inout3,$rndkey1);
1562	 &aesdec	($inout4,$rndkey1);
1563	 &aesdec	($inout5,$rndkey1);
1564	&call		(&label("_aesni_decrypt6_enter"));
1565
1566	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
1567       &pxor	($twtmp,$twtmp);
1568	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1569       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
1570	&xorps	($inout1,&QWP(16*1,"esp"));
1571	&movups	(&QWP(16*0,$out),$inout0);	# write output
1572	&xorps	($inout2,&QWP(16*2,"esp"));
1573	&movups	(&QWP(16*1,$out),$inout1);
1574	&xorps	($inout3,&QWP(16*3,"esp"));
1575	&movups	(&QWP(16*2,$out),$inout2);
1576	&xorps	($inout4,&QWP(16*4,"esp"));
1577	&movups	(&QWP(16*3,$out),$inout3);
1578	&xorps	($inout5,$tweak);
1579	&movups	(&QWP(16*4,$out),$inout4);
1580       &pshufd	($twres,$twtmp,0x13);
1581	&movups	(&QWP(16*5,$out),$inout5);
1582	&lea	($out,&DWP(16*6,$out));
1583       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
1584
	# advance to the tweak for the next block and refresh $twtmp
1585	&pxor	($twtmp,$twtmp);
1586	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1587	&pand	($twres,$twmask);		# isolate carry and residue
1588	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1589	&pxor	($tweak,$twres);
1590
1591	&sub	($len,16*6);
1592	&jnc	(&label("xts_dec_loop6"));	# at least 6 more blocks
1593
1594	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
1595	&mov	($key,$key_);			# restore $key
1596	&mov	($rounds_,$rounds);
1597
# Fewer than six whole blocks remain.  $len is restored to the remaining
# whole-block byte count and dispatched to the 1/2/3/4-block handlers; the
# five-block case is handled inline below.  Tweaks are generated one at a
# time between the compares; the je branches test the flags of the
# preceding cmp, relying on the SSE instructions in between leaving EFLAGS
# untouched.
1598&set_label("xts_dec_short");
1599	&add	($len,16*6);			# undo the 6x bias
1600	&jz	(&label("xts_dec_done6x"));	# no whole blocks left
1601
1602	&movdqa	($inout3,$tweak);		# put aside previous tweak
1603	&cmp	($len,0x20);
1604	&jb	(&label("xts_dec_one"));
1605
1606	&pshufd	($twres,$twtmp,0x13);
1607	&pxor	($twtmp,$twtmp);
1608	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1609	&pand	($twres,$twmask);		# isolate carry and residue
1610	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1611	&pxor	($tweak,$twres);
1612	&je	(&label("xts_dec_two"));	# $len==0x20
1613
1614	&pshufd	($twres,$twtmp,0x13);
1615	&pxor	($twtmp,$twtmp);
1616	&movdqa	($inout4,$tweak);		# put aside previous tweak
1617	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1618	&pand	($twres,$twmask);		# isolate carry and residue
1619	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1620	&pxor	($tweak,$twres);
1621	&cmp	($len,0x40);
1622	&jb	(&label("xts_dec_three"));
1623
1624	&pshufd	($twres,$twtmp,0x13);
1625	&pxor	($twtmp,$twtmp);
1626	&movdqa	($inout5,$tweak);		# put aside previous tweak
1627	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1628	&pand	($twres,$twmask);		# isolate carry and residue
1629	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1630	&pxor	($tweak,$twres);
1631	&movdqa	(&QWP(16*0,"esp"),$inout3);	# tweaks 0 and 1 to the stack
1632	&movdqa	(&QWP(16*1,"esp"),$inout4);
1633	&je	(&label("xts_dec_four"));	# $len==0x40
1634
	# five blocks: tweaks 2 and 3 go to the stack, tweak 4 is built in
	# $inout5
1635	&movdqa	(&QWP(16*2,"esp"),$inout5);
1636	&pshufd	($inout5,$twtmp,0x13);
1637	&movdqa	(&QWP(16*3,"esp"),$tweak);
1638	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1639	&pand	($inout5,$twmask);		# isolate carry and residue
1640	&pxor	($inout5,$tweak);
1641
1642	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
1643	&movdqu	($inout1,&QWP(16*1,$inp));
1644	&movdqu	($inout2,&QWP(16*2,$inp));
1645	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1646	&movdqu	($inout3,&QWP(16*3,$inp));
1647	&pxor	($inout1,&QWP(16*1,"esp"));
1648	&movdqu	($inout4,&QWP(16*4,$inp));
1649	&pxor	($inout2,&QWP(16*2,"esp"));
1650	&lea	($inp,&DWP(16*5,$inp));
1651	&pxor	($inout3,&QWP(16*3,"esp"));
1652	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
1653	&pxor	($inout4,$inout5);
1654
	# six-lane routine with five real inputs; the sixth lane ($inout5)
	# still holds the tweak and its result is never stored
1655	&call	("_aesni_decrypt6");
1656
1657	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
1658	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1659	&xorps	($inout1,&QWP(16*1,"esp"));
1660	&xorps	($inout2,&QWP(16*2,"esp"));
1661	&movups	(&QWP(16*0,$out),$inout0);	# write output
1662	&xorps	($inout3,&QWP(16*3,"esp"));
1663	&movups	(&QWP(16*1,$out),$inout1);
1664	&xorps	($inout4,$tweak);
1665	&movups	(&QWP(16*2,$out),$inout2);
1666	&movups	(&QWP(16*3,$out),$inout3);
1667	&movups	(&QWP(16*4,$out),$inout4);
1668	&lea	($out,&DWP(16*5,$out));
1669	&jmp	(&label("xts_dec_done"));
1670
# Exactly one whole block left; its tweak was put aside in $inout3 by the
# dispatch code at xts_dec_short.  Leaves that tweak in $tweak for
# xts_dec_done.
1671&set_label("xts_dec_one",16);
1672	&movups	($inout0,&QWP(16*0,$inp));	# load input
1673	&lea	($inp,&DWP(16*1,$inp));
1674	&xorps	($inout0,$inout3);		# input^=tweak
1675	if ($inline)
1676	{   &aesni_inline_generate1("dec");	}
1677	else
1678	{   &call	("_aesni_decrypt1");	}
1679	&xorps	($inout0,$inout3);		# output^=tweak
1680	&movups	(&QWP(16*0,$out),$inout0);	# write output
1681	&lea	($out,&DWP(16*1,$out));
1682
1683	&movdqa	($tweak,$inout3);		# last tweak
1684	&jmp	(&label("xts_dec_done"));
1685
# Exactly two whole blocks left.  Tweak 0 is in $inout3 (put aside at
# xts_dec_short) and tweak 1 is the current $tweak, copied to $inout4
# because $tweak aliases $rndkey1.  Leaves the last tweak used in $tweak
# for xts_dec_done.
1686&set_label("xts_dec_two",16);
1687	&movaps	($inout4,$tweak);		# put aside last tweak
1688
1689	&movups	($inout0,&QWP(16*0,$inp));	# load input
1690	&movups	($inout1,&QWP(16*1,$inp));
1691	&lea	($inp,&DWP(16*2,$inp));
1692	&xorps	($inout0,$inout3);		# input^=tweak
1693	&xorps	($inout1,$inout4);
1694
1695	&call	("_aesni_decrypt2");
1696
1697	&xorps	($inout0,$inout3);		# output^=tweak
1698	&xorps	($inout1,$inout4);
1699	&movups	(&QWP(16*0,$out),$inout0);	# write output
1700	&movups	(&QWP(16*1,$out),$inout1);
1701	&lea	($out,&DWP(16*2,$out));
1702
1703	&movdqa	($tweak,$inout4);		# last tweak
1704	&jmp	(&label("xts_dec_done"));
1705
# Exactly three whole blocks left.  Tweaks 0 and 1 were put aside in
# $inout3 and $inout4 at xts_dec_short; tweak 2 is the current $tweak and
# is copied to $inout5.  Leaves the last tweak used in $tweak for
# xts_dec_done.
1706&set_label("xts_dec_three",16);
1707	&movaps	($inout5,$tweak);		# put aside last tweak
1708	&movups	($inout0,&QWP(16*0,$inp));	# load input
1709	&movups	($inout1,&QWP(16*1,$inp));
1710	&movups	($inout2,&QWP(16*2,$inp));
1711	&lea	($inp,&DWP(16*3,$inp));
1712	&xorps	($inout0,$inout3);		# input^=tweak
1713	&xorps	($inout1,$inout4);
1714	&xorps	($inout2,$inout5);
1715
1716	&call	("_aesni_decrypt3");
1717
1718	&xorps	($inout0,$inout3);		# output^=tweak
1719	&xorps	($inout1,$inout4);
1720	&xorps	($inout2,$inout5);
1721	&movups	(&QWP(16*0,$out),$inout0);	# write output
1722	&movups	(&QWP(16*1,$out),$inout1);
1723	&movups	(&QWP(16*2,$out),$inout2);
1724	&lea	($out,&DWP(16*3,$out));
1725
1726	&movdqa	($tweak,$inout5);		# last tweak
1727	&jmp	(&label("xts_dec_done"));
1728
# Exactly four whole blocks left.  Tweaks 0 and 1 were stored to stack slots
# 0 and 1 at xts_dec_short (their registers are needed for data here),
# tweak 2 is in $inout5 and tweak 3 is the current $tweak, copied to
# $inout4.  Leaves the last tweak used in $tweak for xts_dec_done.
1729&set_label("xts_dec_four",16);
1730	&movaps	($inout4,$tweak);		# put aside last tweak
1731
1732	&movups	($inout0,&QWP(16*0,$inp));	# load input
1733	&movups	($inout1,&QWP(16*1,$inp));
1734	&movups	($inout2,&QWP(16*2,$inp));
1735	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
1736	&movups	($inout3,&QWP(16*3,$inp));
1737	&lea	($inp,&DWP(16*4,$inp));
1738	&xorps	($inout1,&QWP(16*1,"esp"));
1739	&xorps	($inout2,$inout5);
1740	&xorps	($inout3,$inout4);
1741
1742	&call	("_aesni_decrypt4");
1743
1744	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
1745	&xorps	($inout1,&QWP(16*1,"esp"));
1746	&xorps	($inout2,$inout5);
1747	&movups	(&QWP(16*0,$out),$inout0);	# write output
1748	&xorps	($inout3,$inout4);
1749	&movups	(&QWP(16*1,$out),$inout1);
1750	&movups	(&QWP(16*2,$out),$inout2);
1751	&movups	(&QWP(16*3,$out),$inout3);
1752	&lea	($out,&DWP(16*4,$out));
1753
1754	&movdqa	($tweak,$inout4);		# last tweak
1755	&jmp	(&label("xts_dec_done"));
1756
# All blocks except the held-back one are written.  Both entry points
# reload the saved byte length and return if it is a multiple of 16;
# otherwise they store $len%16 back into the frame and reach
# xts_dec_only_one_more with $tweak = next unused tweak and $twtmp = its
# per-dword sign masks.
#   xts_dec_done6x: reached when no blocks were left for the short path;
#		    $tweak and $twtmp are already in that state.
#   xts_dec_done:   reached from the 1..5-block handlers, where $tweak is
#		    the last tweak used, so one full update is done here.
1757&set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
1758	&mov	($len,&DWP(16*7+0,"esp"));	# restore saved $len
1759	&and	($len,15);
1760	&jz	(&label("xts_dec_ret"));
1761	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1762	&jmp	(&label("xts_dec_only_one_more"));
1763
1764&set_label("xts_dec_done",16);
1765	&mov	($len,&DWP(16*7+0,"esp"));	# restore saved $len
1766	&pxor	($twtmp,$twtmp);
1767	&and	($len,15);
1768	&jz	(&label("xts_dec_ret"));
1769
1770	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1771	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
1772	&pshufd	($twres,$twtmp,0x13);
1773	&pxor	($twtmp,$twtmp);
1774	&movdqa	($twmask,&QWP(16*6,"esp"));	# reload 0x0...010...87
1775	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1776	&pand	($twres,$twmask);		# isolate carry and residue
1777	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
1778	&pxor	($tweak,$twres);
1779
# Decrypt the held-back whole block when a partial block follows it.  The
# current tweak is kept in $inout4 for the stolen block, while this block
# is decrypted with the *following* tweak, computed into $inout3.  $inp and
# $out are not advanced here: xts_dec_steal addresses the partial block at
# offset 16 from them.
1780&set_label("xts_dec_only_one_more");
1781	&pshufd	($inout3,$twtmp,0x13);
1782	&movdqa	($inout4,$tweak);		# put aside previous tweak
1783	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
1784	&pand	($inout3,$twmask);		# isolate carry and residue
1785	&pxor	($inout3,$tweak);
1786
1787	&mov	($key,$key_);			# restore $key
1788	&mov	($rounds,$rounds_);		# restore $rounds
1789
1790	&movups	($inout0,&QWP(0,$inp));		# load input
1791	&xorps	($inout0,$inout3);		# input^=tweak
1792	if ($inline)
1793	{   &aesni_inline_generate1("dec");	}
1794	else
1795	{   &call	("_aesni_decrypt1");	}
1796	&xorps	($inout0,$inout3);		# output^=tweak
1797	&movups	(&QWP(0,$out),$inout0);		# write output
1798
# Ciphertext stealing for a trailing partial block of $len (1..15) bytes.
# $out points at the block just written by xts_dec_only_one_more and $inp
# at the corresponding input block, so the partial data lives at offset 16
# from both.  Each pass moves one byte of the block at $out to the partial
# output position and replaces it with the corresponding partial input
# byte.  The patched block is then decrypted in place with the tweak kept
# in $inout4.  $rounds and $key serve as byte scratch inside the loop and
# are restored from their backups afterwards.
1799&set_label("xts_dec_steal");
1800	&movz	($rounds,&BP(16,$inp));		# next trailing input byte
1801	&movz	($key,&BP(0,$out));		# byte of the block just written
1802	&lea	($inp,&DWP(1,$inp));
1803	&mov	(&BP(0,$out),&LB($rounds));
1804	&mov	(&BP(16,$out),&LB($key));
1805	&lea	($out,&DWP(1,$out));
1806	&sub	($len,1);
1807	&jnz	(&label("xts_dec_steal"));
1808
1809	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
1810	&mov	($key,$key_);			# restore $key
1811	&mov	($rounds,$rounds_);		# restore $rounds
1812
1813	&movups	($inout0,&QWP(0,$out));		# load input
1814	&xorps	($inout0,$inout4);		# input^=tweak
1815	if ($inline)
1816	{   &aesni_inline_generate1("dec");	}
1817	else
1818	{   &call	("_aesni_decrypt1");	}
1819	&xorps	($inout0,$inout4);		# output^=tweak
1820	&movups	(&QWP(0,$out),$inout0);		# write output
1821
# Common exit of aesni_xts_decrypt: zero xmm0..xmm7 and the six tweak slots
# of the frame (16*0..16*5), then reload the caller's %esp saved at 16*7+4.
# The constant at 16*6 and the two dwords at 16*7 are not cleared.
1822&set_label("xts_dec_ret");
1823	&pxor	("xmm0","xmm0");		# clear register bank
1824	&pxor	("xmm1","xmm1");
1825	&pxor	("xmm2","xmm2");
1826	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
1827	&pxor	("xmm3","xmm3");
1828	&movdqa	(&QWP(16*1,"esp"),"xmm0");
1829	&pxor	("xmm4","xmm4");
1830	&movdqa	(&QWP(16*2,"esp"),"xmm0");
1831	&pxor	("xmm5","xmm5");
1832	&movdqa	(&QWP(16*3,"esp"),"xmm0");
1833	&pxor	("xmm6","xmm6");
1834	&movdqa	(&QWP(16*4,"esp"),"xmm0");
1835	&pxor	("xmm7","xmm7");
1836	&movdqa	(&QWP(16*5,"esp"),"xmm0");
1837	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
1838&function_end("aesni_xts_decrypt");
1839}
1840}
1841
1842######################################################################
1843# void $PREFIX_cbc_encrypt (const void *inp, void *out,
1844#                           size_t length, const AES_KEY *key,
1845#                           unsigned char *ivp,const int enc);
#
# CBC encrypt (enc != 0) or decrypt (enc == 0) of `length` bytes.
# Returns at once when length is zero; otherwise the final chaining
# value is written back to *ivp at cbc_ret.  A 16-byte aligned scratch
# area is carved out of the stack: 0(%esp) holds one block (saved IV or
# the last, partially used, plaintext block) and 16(%esp) holds the
# caller's %esp.  Registers and scratch holding key or data material
# are wiped before returning.
1846&function_begin("${PREFIX}_cbc_encrypt");
1847	&mov	($inp,&wparam(0));
1848	&mov	($rounds_,"esp");		# $rounds_ is a temporary for the new %esp here
1849	&mov	($out,&wparam(1));
1850	&sub	($rounds_,24);			# room for one block plus the saved %esp
1851	&mov	($len,&wparam(2));
1852	&and	($rounds_,-16);			# align scratch area to 16 bytes
1853	&mov	($key,&wparam(3));
1854	&mov	($key_,&wparam(4));		# $key_ temporarily holds ivp
1855	&test	($len,$len);
1856	&jz	(&label("cbc_abort"));		# length==0: nothing to do
1857
1858	&cmp	(&wparam(5),0);			# enc==0?  flags are consumed by the je below
1859	&xchg	($rounds_,"esp");		# alloca
1860	&movups	($ivec,&QWP(0,$key_));		# load IV
1861	&mov	($rounds,&DWP(240,$key));
1862	&mov	($key_,$key);			# backup $key
1863	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
1864	&mov	($rounds_,$rounds);		# backup $rounds
1865	&je	(&label("cbc_decrypt"));	# none of the moves above touch the flags
1866
1867	&movaps	($inout0,$ivec);		# $inout0 carries the chaining value
1868	&cmp	($len,16);
1869	&jb	(&label("cbc_enc_tail"));	# less than one block: pad it first
1870	&sub	($len,16);			# pre-bias so sub/jnc below ends the loop
1871	&jmp	(&label("cbc_enc_loop"));
1872
1873&set_label("cbc_enc_loop",16);
1874	&movups	($ivec,&QWP(0,$inp));		# input actually
1875	&lea	($inp,&DWP(16,$inp));
1876	if ($inline)
1877	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
1878	else
1879	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
1880	&mov	($rounds,$rounds_);	# restore $rounds
1881	&mov	($key,$key_);		# restore $key
1882	&movups	(&QWP(0,$out),$inout0);	# store output
1883	&lea	($out,&DWP(16,$out));
1884	&sub	($len,16);
1885	&jnc	(&label("cbc_enc_loop"));	# no borrow: another full block remains
1886	&add	($len,16);			# undo bias, $len = leftover byte count
1887	&jnz	(&label("cbc_enc_tail"));
1888	&movaps	($ivec,$inout0);		# last ciphertext block is the IV to hand back
1889	&pxor	($inout0,$inout0);
1890	&jmp	(&label("cbc_ret"));
1891
# Copy the $len leftover bytes to $out, zero-pad them to a full block
# and run that block through the loop above in place.
1892&set_label("cbc_enc_tail");
1893	&mov	("ecx",$len);		# zaps $rounds
1894	&data_word(0xA4F3F689);		# rep movsb (preceded by mov %esi,%esi filler)
1895	&mov	("ecx",16);		# zero tail
1896	&sub	("ecx",$len);
1897	&xor	("eax","eax");		# zaps $len
1898	&data_word(0xAAF3F689);		# rep stosb (preceded by mov %esi,%esi filler)
1899	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
1900	&mov	($rounds,$rounds_);	# restore $rounds
1901	&mov	($inp,$out);		# $inp and $out are the same
1902	&mov	($key,$key_);		# restore $key
1903	&jmp	(&label("cbc_enc_loop"));	# $len is 0 now, so exactly one more pass
1904######################################################################
1905&set_label("cbc_decrypt",16);
1906	&cmp	($len,0x50);
1907	&jbe	(&label("cbc_dec_tail"));	# five blocks or fewer: skip the 6x loop
1908	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
1909	&sub	($len,0x50);			# bias: loop repeats only while >5 blocks remain
1910	&jmp	(&label("cbc_dec_loop6_enter"));
1911
1912&set_label("cbc_dec_loop6",16);
1913	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
1914	&movups	(&QWP(0,$out),$inout5);		# 6th block of the previous pass
1915	&lea	($out,&DWP(0x10,$out));
1916&set_label("cbc_dec_loop6_enter");
1917	&movdqu	($inout0,&QWP(0,$inp));
1918	&movdqu	($inout1,&QWP(0x10,$inp));
1919	&movdqu	($inout2,&QWP(0x20,$inp));
1920	&movdqu	($inout3,&QWP(0x30,$inp));
1921	&movdqu	($inout4,&QWP(0x40,$inp));
1922	&movdqu	($inout5,&QWP(0x50,$inp));
1923
1924	&call	("_aesni_decrypt6");
1925
1926	&movups	($rndkey1,&QWP(0,$inp));	# re-read ciphertext for the CBC XOR chain
1927	&movups	($rndkey0,&QWP(0x10,$inp));
1928	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
1929	&xorps	($inout1,$rndkey1);
1930	&movups	($rndkey1,&QWP(0x20,$inp));
1931	&xorps	($inout2,$rndkey0);
1932	&movups	($rndkey0,&QWP(0x30,$inp));
1933	&xorps	($inout3,$rndkey1);
1934	&movups	($rndkey1,&QWP(0x40,$inp));
1935	&xorps	($inout4,$rndkey0);
1936	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
1937	&xorps	($inout5,$rndkey1);
1938	&movups	(&QWP(0,$out),$inout0);
1939	&movups	(&QWP(0x10,$out),$inout1);
1940	&lea	($inp,&DWP(0x60,$inp));
1941	&movups	(&QWP(0x20,$out),$inout2);
1942	&mov	($rounds,$rounds_);		# restore $rounds
1943	&movups	(&QWP(0x30,$out),$inout3);
1944	&mov	($key,$key_);			# restore $key
1945	&movups	(&QWP(0x40,$out),$inout4);
1946	&lea	($out,&DWP(0x50,$out));		# only five blocks written so far
1947	&sub	($len,0x60);
1948	&ja	(&label("cbc_dec_loop6"));
1949
1950	&movaps	($inout0,$inout5);		# pending 6th block, not yet written
1951	&movaps	($ivec,$rndkey0);		# last ciphertext block read is the new IV
1952	&add	($len,0x50);			# undo bias
1953	&jle	(&label("cbc_dec_clear_tail_collected"));	# no further input
1954	&movups	(&QWP(0,$out),$inout0);
1955	&lea	($out,&DWP(0x10,$out));
# One to five blocks left; dispatch on $len.  $in0/$in1 keep copies of
# the first two ciphertext blocks for the XOR chain.
1956&set_label("cbc_dec_tail");
1957	&movups	($inout0,&QWP(0,$inp));
1958	&movaps	($in0,$inout0);
1959	&cmp	($len,0x10);
1960	&jbe	(&label("cbc_dec_one"));
1961
1962	&movups	($inout1,&QWP(0x10,$inp));
1963	&movaps	($in1,$inout1);
1964	&cmp	($len,0x20);
1965	&jbe	(&label("cbc_dec_two"));
1966
1967	&movups	($inout2,&QWP(0x20,$inp));
1968	&cmp	($len,0x30);
1969	&jbe	(&label("cbc_dec_three"));
1970
1971	&movups	($inout3,&QWP(0x30,$inp));
1972	&cmp	($len,0x40);
1973	&jbe	(&label("cbc_dec_four"));
1974
1975	&movups	($inout4,&QWP(0x40,$inp));	# five blocks: reuse the 6x routine
1976	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
1977	&movups	($inout0,&QWP(0,$inp));		# NOTE(review): already loaded at cbc_dec_tail; reload looks redundant - confirm
1978	&xorps	($inout5,$inout5);		# sixth lane is a dummy zero block
1979	&call	("_aesni_decrypt6");
1980	&movups	($rndkey1,&QWP(0,$inp));
1981	&movups	($rndkey0,&QWP(0x10,$inp));
1982	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
1983	&xorps	($inout1,$rndkey1);
1984	&movups	($rndkey1,&QWP(0x20,$inp));
1985	&xorps	($inout2,$rndkey0);
1986	&movups	($rndkey0,&QWP(0x30,$inp));
1987	&xorps	($inout3,$rndkey1);
1988	&movups	($ivec,&QWP(0x40,$inp));	# IV
1989	&xorps	($inout4,$rndkey0);
1990	&movups	(&QWP(0,$out),$inout0);
1991	&movups	(&QWP(0x10,$out),$inout1);
1992	&pxor	($inout1,$inout1);		# wipe plaintext from registers as we go
1993	&movups	(&QWP(0x20,$out),$inout2);
1994	&pxor	($inout2,$inout2);
1995	&movups	(&QWP(0x30,$out),$inout3);
1996	&pxor	($inout3,$inout3);
1997	&lea	($out,&DWP(0x40,$out));
1998	&movaps	($inout0,$inout4);		# last block is written by tail_collected
1999	&pxor	($inout4,$inout4);
2000	&sub	($len,0x50);
2001	&jmp	(&label("cbc_dec_tail_collected"));
2002
2003&set_label("cbc_dec_one",16);
2004	if ($inline)
2005	{   &aesni_inline_generate1("dec");	}
2006	else
2007	{   &call	("_aesni_decrypt1");	}
2008	&xorps	($inout0,$ivec);
2009	&movaps	($ivec,$in0);			# ciphertext block becomes the new IV
2010	&sub	($len,0x10);
2011	&jmp	(&label("cbc_dec_tail_collected"));
2012
2013&set_label("cbc_dec_two",16);
2014	&call	("_aesni_decrypt2");
2015	&xorps	($inout0,$ivec);
2016	&xorps	($inout1,$in0);
2017	&movups	(&QWP(0,$out),$inout0);
2018	&movaps	($inout0,$inout1);		# last block is written by tail_collected
2019	&pxor	($inout1,$inout1);
2020	&lea	($out,&DWP(0x10,$out));
2021	&movaps	($ivec,$in1);
2022	&sub	($len,0x20);
2023	&jmp	(&label("cbc_dec_tail_collected"));
2024
2025&set_label("cbc_dec_three",16);
2026	&call	("_aesni_decrypt3");
2027	&xorps	($inout0,$ivec);
2028	&xorps	($inout1,$in0);
2029	&xorps	($inout2,$in1);
2030	&movups	(&QWP(0,$out),$inout0);
2031	&movaps	($inout0,$inout2);		# last block is written by tail_collected
2032	&pxor	($inout2,$inout2);
2033	&movups	(&QWP(0x10,$out),$inout1);
2034	&pxor	($inout1,$inout1);
2035	&lea	($out,&DWP(0x20,$out));
2036	&movups	($ivec,&QWP(0x20,$inp));	# third ciphertext block is the new IV
2037	&sub	($len,0x30);
2038	&jmp	(&label("cbc_dec_tail_collected"));
2039
2040&set_label("cbc_dec_four",16);
2041	&call	("_aesni_decrypt4");
2042	&movups	($rndkey1,&QWP(0x10,$inp));
2043	&movups	($rndkey0,&QWP(0x20,$inp));
2044	&xorps	($inout0,$ivec);
2045	&movups	($ivec,&QWP(0x30,$inp));	# fourth ciphertext block is the new IV
2046	&xorps	($inout1,$in0);
2047	&movups	(&QWP(0,$out),$inout0);
2048	&xorps	($inout2,$rndkey1);
2049	&movups	(&QWP(0x10,$out),$inout1);
2050	&pxor	($inout1,$inout1);
2051	&xorps	($inout3,$rndkey0);
2052	&movups	(&QWP(0x20,$out),$inout2);
2053	&pxor	($inout2,$inout2);
2054	&lea	($out,&DWP(0x30,$out));
2055	&movaps	($inout0,$inout3);		# last block is written by tail_collected
2056	&pxor	($inout3,$inout3);
2057	&sub	($len,0x40);
2058	&jmp	(&label("cbc_dec_tail_collected"));
2059
2060&set_label("cbc_dec_clear_tail_collected",16);
2061	&pxor	($inout1,$inout1);		# wipe plaintext left over from the 6x loop
2062	&pxor	($inout2,$inout2);
2063	&pxor	($inout3,$inout3);
2064	&pxor	($inout4,$inout4);
# $inout0 holds the final plaintext block; write all of it, or only part
# of it when the length was not a multiple of 16.
2065&set_label("cbc_dec_tail_collected");
2066	&and	($len,15);
2067	&jnz	(&label("cbc_dec_tail_partial"));
2068	&movups	(&QWP(0,$out),$inout0);
2069	&pxor	($rndkey0,$rndkey0);
2070	&jmp	(&label("cbc_ret"));
2071
2072&set_label("cbc_dec_tail_partial",16);
2073	&movaps	(&QWP(0,"esp"),$inout0);	# stage the block in the stack scratch
2074	&pxor	($rndkey0,$rndkey0);
2075	&mov	("ecx",16);
2076	&mov	($inp,"esp");
2077	&sub	("ecx",$len);
2078	&data_word(0xA4F3F689);		# rep movsb
2079	&movdqa	(&QWP(0,"esp"),$rndkey0);	# wipe the staged plaintext (was a no-op re-store of $inout0)
2080
2081&set_label("cbc_ret");
2082	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
2083	&mov	($key_,&wparam(4));
2084	&pxor	($inout0,$inout0);
2085	&pxor	($rndkey1,$rndkey1);
2086	&movups	(&QWP(0,$key_),$ivec);	# output IV
2087	&pxor	($ivec,$ivec);
2088&set_label("cbc_abort");
2089&function_end("${PREFIX}_cbc_encrypt");
2090
2091######################################################################
2092# Mechanical port from aesni-x86_64.pl.
2093#
2094# _aesni_set_encrypt_key is private interface,
2095# input:
2096#	"eax"	const unsigned char *userKey
2097#	$rounds	int bits
2098#	$key	AES_KEY *key
2099# output:
2100#	"eax"	return code: 0 on success, -1 if userKey or key is NULL,
#		-2 if bits is not 128, 192 or 256
2101#	$rounds	number of rounds minus one (9, 11 or 13) on success;
#		the same value is stored at offset 240 of the key schedule
#
# $key is advanced while the schedule is written, so callers that need
# the schedule base afterwards must reload it.  Each key size has two
# code paths: one built on aeskeygenassist and an "_alt" one built on
# pshufb+aesenclast, selected when the masked capability word equals
# 1<<28 (AVX bit set, XOP bit clear).  xmm0-xmm5 are zeroed on success.
2102
2103&function_begin_B("_aesni_set_encrypt_key");
2104	&push	("ebp");
2105	&push	("ebx");
2106	&test	("eax","eax");
2107	&jz	(&label("bad_pointer"));
2108	&test	($key,$key);
2109	&jz	(&label("bad_pointer"));
2110
2111	&call	(&label("pic"));		# call/pop pair yields the address of "pic"
2112&set_label("pic");
2113	&blindpop("ebx");
2114	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));	# ebx = &key_const
2115
2116	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));	# presumably ebp = &OPENSSL_ia32cap_P
2117	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
2118	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
2119	&mov	("ebp",&DWP(4,"ebp"));	# second capability dword
2120	&lea	($key,&DWP(16,$key));
2121	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
2122	&cmp	($rounds,256);
2123	&je	(&label("14rounds"));
2124	&cmp	($rounds,192);
2125	&je	(&label("12rounds"));
2126	&cmp	($rounds,128);
2127	&jne	(&label("bad_keybits"));
2128
2129&set_label("10rounds",16);
2130	&cmp		("ebp",1<<28);
2131	&je		(&label("10rounds_alt"));
2132
2133	&mov		($rounds,9);
2134	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
2135	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
2136	&call		(&label("key_128_cold"));
2137	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
2138	&call		(&label("key_128"));
2139	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
2140	&call		(&label("key_128"));
2141	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
2142	&call		(&label("key_128"));
2143	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
2144	&call		(&label("key_128"));
2145	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
2146	&call		(&label("key_128"));
2147	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
2148	&call		(&label("key_128"));
2149	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
2150	&call		(&label("key_128"));
2151	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
2152	&call		(&label("key_128"));
2153	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
2154	&call		(&label("key_128"));
2155	&$movekey	(&QWP(0,$key),"xmm0");
2156	&mov		(&DWP(80,$key),$rounds);	# $key is base+160 here, i.e. offset 240
2157
2158	&jmp	(&label("good_key"));
2159
# key_128: store the previous round key, advance $key, then derive the
# next round key in xmm0 from xmm0 and the aeskeygenassist result in
# xmm1.  key_128_cold skips the store/advance for the first expansion.
2160&set_label("key_128",16);
2161	&$movekey	(&QWP(0,$key),"xmm0");
2162	&lea		($key,&DWP(16,$key));
2163&set_label("key_128_cold");
2164	&shufps		("xmm4","xmm0",0b00010000);
2165	&xorps		("xmm0","xmm4");
2166	&shufps		("xmm4","xmm0",0b10001100);
2167	&xorps		("xmm0","xmm4");
2168	&shufps		("xmm1","xmm1",0b11111111);	# critical path
2169	&xorps		("xmm0","xmm1");
2170	&ret();
2171
2172&set_label("10rounds_alt",16);
2173	&movdqa		("xmm5",&QWP(0x00,"ebx"));	# byte-shuffle mask from key_const
2174	&mov		($rounds,8);			# loop counter: rounds 1..8
2175	&movdqa		("xmm4",&QWP(0x20,"ebx"));	# round constant vector {1,1,1,1}
2176	&movdqa		("xmm2","xmm0");
2177	&movdqu		(&QWP(-16,$key),"xmm0");	# round 0
2178
2179&set_label("loop_key128");
2180	&pshufb		("xmm0","xmm5");
2181	&aesenclast	("xmm0","xmm4");
2182	&pslld		("xmm4",1);			# double the round constant
2183	&lea		($key,&DWP(16,$key));
2184
2185	&movdqa		("xmm3","xmm2");		# running XOR of the previous key's dwords...
2186	&pslldq		("xmm2",4);
2187	&pxor		("xmm3","xmm2");
2188	&pslldq		("xmm2",4);
2189	&pxor		("xmm3","xmm2");
2190	&pslldq		("xmm2",4);
2191	&pxor		("xmm2","xmm3");		# ...ends up in xmm2
2192
2193	&pxor		("xmm0","xmm2");
2194	&movdqu		(&QWP(-16,$key),"xmm0");
2195	&movdqa		("xmm2","xmm0");
2196
2197	&dec		($rounds);
2198	&jnz		(&label("loop_key128"));
2199
2200	&movdqa		("xmm4",&QWP(0x30,"ebx"));	# round constant 0x1b for round 9
2201
2202	&pshufb		("xmm0","xmm5");
2203	&aesenclast	("xmm0","xmm4");
2204	&pslld		("xmm4",1);			# 0x36 for round 10
2205
2206	&movdqa		("xmm3","xmm2");
2207	&pslldq		("xmm2",4);
2208	&pxor		("xmm3","xmm2");
2209	&pslldq		("xmm2",4);
2210	&pxor		("xmm3","xmm2");
2211	&pslldq		("xmm2",4);
2212	&pxor		("xmm2","xmm3");
2213
2214	&pxor		("xmm0","xmm2");
2215	&movdqu		(&QWP(0,$key),"xmm0");		# round 9
2216
2217	&movdqa		("xmm2","xmm0");
2218	&pshufb		("xmm0","xmm5");
2219	&aesenclast	("xmm0","xmm4");
2220
2221	&movdqa		("xmm3","xmm2");
2222	&pslldq		("xmm2",4);
2223	&pxor		("xmm3","xmm2");
2224	&pslldq		("xmm2",4);
2225	&pxor		("xmm3","xmm2");
2226	&pslldq		("xmm2",4);
2227	&pxor		("xmm2","xmm3");
2228
2229	&pxor		("xmm0","xmm2");
2230	&movdqu		(&QWP(16,$key),"xmm0");		# round 10
2231
2232	&mov		($rounds,9);
2233	&mov		(&DWP(96,$key),$rounds);	# $key is base+144 here, i.e. offset 240
2234
2235	&jmp	(&label("good_key"));
2236
2237&set_label("12rounds",16);
2238	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
2239	&cmp		("ebp",1<<28);
2240	&je		(&label("12rounds_alt"));
2241
2242	&mov		($rounds,11);
2243	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
2244	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
2245	&call		(&label("key_192a_cold"));
2246	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
2247	&call		(&label("key_192b"));
2248	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
2249	&call		(&label("key_192a"));
2250	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
2251	&call		(&label("key_192b"));
2252	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
2253	&call		(&label("key_192a"));
2254	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
2255	&call		(&label("key_192b"));
2256	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
2257	&call		(&label("key_192a"));
2258	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
2259	&call		(&label("key_192b"));
2260	&$movekey	(&QWP(0,$key),"xmm0");
2261	&mov		(&DWP(48,$key),$rounds);
2262
2263	&jmp	(&label("good_key"));
2264
# key_192a stores one 16-byte round key and advances $key by 16;
# key_192b assembles and stores two round keys (via xmm5/xmm3) and
# advances $key by 32.  Both continue at key_192b_warm, which updates
# xmm0/xmm2 from the aeskeygenassist result in xmm1.
2265&set_label("key_192a",16);
2266	&$movekey	(&QWP(0,$key),"xmm0");
2267	&lea		($key,&DWP(16,$key));
2268&set_label("key_192a_cold",16);
2269	&movaps		("xmm5","xmm2");
2270&set_label("key_192b_warm");
2271	&shufps		("xmm4","xmm0",0b00010000);
2272	&movdqa		("xmm3","xmm2");
2273	&xorps		("xmm0","xmm4");
2274	&shufps		("xmm4","xmm0",0b10001100);
2275	&pslldq		("xmm3",4);
2276	&xorps		("xmm0","xmm4");
2277	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
2278	&pxor		("xmm2","xmm3");
2279	&pxor		("xmm0","xmm1");
2280	&pshufd		("xmm3","xmm0",0b11111111);
2281	&pxor		("xmm2","xmm3");
2282	&ret();
2283
2284&set_label("key_192b",16);
2285	&movaps		("xmm3","xmm0");
2286	&shufps		("xmm5","xmm0",0b01000100);
2287	&$movekey	(&QWP(0,$key),"xmm5");
2288	&shufps		("xmm3","xmm2",0b01001110);
2289	&$movekey	(&QWP(16,$key),"xmm3");
2290	&lea		($key,&DWP(32,$key));
2291	&jmp		(&label("key_192b_warm"));
2292
2293&set_label("12rounds_alt",16);
2294	&movdqa		("xmm5",&QWP(0x10,"ebx"));	# byte-shuffle mask from key_const
2295	&movdqa		("xmm4",&QWP(0x20,"ebx"));	# round constant vector {1,1,1,1}
2296	&mov		($rounds,8);			# loop counter; 24 key bytes per pass
2297	&movdqu		(&QWP(-16,$key),"xmm0");
2298
2299&set_label("loop_key192");
2300	&movq		(&QWP(0,$key),"xmm2");
2301	&movdqa		("xmm1","xmm2");
2302	&pshufb		("xmm2","xmm5");
2303	&aesenclast	("xmm2","xmm4");
2304	&pslld		("xmm4",1);			# double the round constant
2305	&lea		($key,&DWP(24,$key));
2306
2307	&movdqa		("xmm3","xmm0");
2308	&pslldq		("xmm0",4);
2309	&pxor		("xmm3","xmm0");
2310	&pslldq		("xmm0",4);
2311	&pxor		("xmm3","xmm0");
2312	&pslldq		("xmm0",4);
2313	&pxor		("xmm0","xmm3");
2314
2315	&pshufd		("xmm3","xmm0",0xff);
2316	&pxor		("xmm3","xmm1");
2317	&pslldq		("xmm1",4);
2318	&pxor		("xmm3","xmm1");
2319
2320	&pxor		("xmm0","xmm2");
2321	&pxor		("xmm2","xmm3");
2322	&movdqu		(&QWP(-16,$key),"xmm0");
2323
2324	&dec		($rounds);
2325	&jnz		(&label("loop_key192"));
2326
2327	&mov	($rounds,11);
2328	&mov	(&DWP(32,$key),$rounds);		# $key is base+208 here, i.e. offset 240
2329
2330	&jmp	(&label("good_key"));
2331
2332&set_label("14rounds",16);
2333	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
2334	&lea		($key,&DWP(16,$key));
2335	&cmp		("ebp",1<<28);
2336	&je		(&label("14rounds_alt"));
2337
2338	&mov		($rounds,13);
2339	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
2340	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
2341	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
2342	&call		(&label("key_256a_cold"));
2343	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
2344	&call		(&label("key_256b"));
2345	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
2346	&call		(&label("key_256a"));
2347	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
2348	&call		(&label("key_256b"));
2349	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
2350	&call		(&label("key_256a"));
2351	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
2352	&call		(&label("key_256b"));
2353	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
2354	&call		(&label("key_256a"));
2355	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
2356	&call		(&label("key_256b"));
2357	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
2358	&call		(&label("key_256a"));
2359	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
2360	&call		(&label("key_256b"));
2361	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
2362	&call		(&label("key_256a"));
2363	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
2364	&call		(&label("key_256b"));
2365	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
2366	&call		(&label("key_256a"));
2367	&$movekey	(&QWP(0,$key),"xmm0");
2368	&mov		(&DWP(16,$key),$rounds);	# $key is base+224 here, i.e. offset 240
2369	&xor		("eax","eax");			# good_key zeroes eax again
2370
2371	&jmp	(&label("good_key"));
2372
# key_256a stores xmm2 and derives the next even round key in xmm0;
# key_256b stores xmm0 and derives the next odd round key in xmm2.
2373&set_label("key_256a",16);
2374	&$movekey	(&QWP(0,$key),"xmm2");
2375	&lea		($key,&DWP(16,$key));
2376&set_label("key_256a_cold");
2377	&shufps		("xmm4","xmm0",0b00010000);
2378	&xorps		("xmm0","xmm4");
2379	&shufps		("xmm4","xmm0",0b10001100);
2380	&xorps		("xmm0","xmm4");
2381	&shufps		("xmm1","xmm1",0b11111111);	# critical path
2382	&xorps		("xmm0","xmm1");
2383	&ret();
2384
2385&set_label("key_256b",16);
2386	&$movekey	(&QWP(0,$key),"xmm0");
2387	&lea		($key,&DWP(16,$key));
2388
2389	&shufps		("xmm4","xmm2",0b00010000);
2390	&xorps		("xmm2","xmm4");
2391	&shufps		("xmm4","xmm2",0b10001100);
2392	&xorps		("xmm2","xmm4");
2393	&shufps		("xmm1","xmm1",0b10101010);	# critical path
2394	&xorps		("xmm2","xmm1");
2395	&ret();
2396
2397&set_label("14rounds_alt",16);
2398	&movdqa		("xmm5",&QWP(0x00,"ebx"));	# byte-shuffle mask from key_const
2399	&movdqa		("xmm4",&QWP(0x20,"ebx"));	# round constant vector {1,1,1,1}
2400	&mov		($rounds,7);			# seven even round keys (2,4,...,14)
2401	&movdqu		(&QWP(-32,$key),"xmm0");	# round 0
2402	&movdqa		("xmm1","xmm2");
2403	&movdqu		(&QWP(-16,$key),"xmm2");	# round 1
2404
2405&set_label("loop_key256");
2406	&pshufb		("xmm2","xmm5");
2407	&aesenclast	("xmm2","xmm4");
2408
2409	&movdqa		("xmm3","xmm0");
2410	&pslldq		("xmm0",4);
2411	&pxor		("xmm3","xmm0");
2412	&pslldq		("xmm0",4);
2413	&pxor		("xmm3","xmm0");
2414	&pslldq		("xmm0",4);
2415	&pxor		("xmm0","xmm3");
2416	&pslld		("xmm4",1);			# double the round constant
2417
2418	&pxor		("xmm0","xmm2");
2419	&movdqu		(&QWP(0,$key),"xmm0");		# even round key
2420
2421	&dec		($rounds);
2422	&jz		(&label("done_key256"));	# round 14 was the last one needed
2423
2424	&pshufd		("xmm2","xmm0",0xff);		# broadcast last dword of the new key
2425	&pxor		("xmm3","xmm3");
2426	&aesenclast	("xmm2","xmm3");		# zero round key: no constant is mixed in
2427
2428	&movdqa		("xmm3","xmm1");		# ';' was missing: this line and the next parsed as one bitwise-AND expression
2429	&pslldq		("xmm1",4);
2430	&pxor		("xmm3","xmm1");
2431	&pslldq		("xmm1",4);
2432	&pxor		("xmm3","xmm1");
2433	&pslldq		("xmm1",4);
2434	&pxor		("xmm1","xmm3");
2435
2436	&pxor		("xmm2","xmm1");
2437	&movdqu		(&QWP(16,$key),"xmm2");		# odd round key
2438	&lea		($key,&DWP(32,$key));
2439	&movdqa		("xmm1","xmm2");
2440	&jmp		(&label("loop_key256"));
2441
2442&set_label("done_key256");
2443	&mov		($rounds,13);
2444	&mov		(&DWP(16,$key),$rounds);	# $key is base+224 here, i.e. offset 240
2445
2446&set_label("good_key");
2447	&pxor	("xmm0","xmm0");		# wipe key material from the registers
2448	&pxor	("xmm1","xmm1");
2449	&pxor	("xmm2","xmm2");
2450	&pxor	("xmm3","xmm3");
2451	&pxor	("xmm4","xmm4");
2452	&pxor	("xmm5","xmm5");
2453	&xor	("eax","eax");			# return 0
2454	&pop	("ebx");
2455	&pop	("ebp");
2456	&ret	();
2457
2458&set_label("bad_pointer",4);
2459	&mov	("eax",-1);
2460	&pop	("ebx");
2461	&pop	("ebp");
2462	&ret	();
2463&set_label("bad_keybits",4);
2464	&pxor	("xmm0","xmm0");		# xmm0 already holds the first 16 key bytes
2465	&mov	("eax",-2);
2466	&pop	("ebx");
2467	&pop	("ebp");
2468	&ret	();
2469&function_end_B("_aesni_set_encrypt_key");
2470
2471# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
2472#                              AES_KEY *key)
#
# Public wrapper: loads the three stack arguments into the registers
# that _aesni_set_encrypt_key takes its inputs in and returns that
# routine's %eax unchanged.
2473&function_begin_B("${PREFIX}_set_encrypt_key");
2474	&mov	("eax",&wparam(0));	# userKey
2475	&mov	($rounds,&wparam(1));	# bits
2476	&mov	($key,&wparam(2));	# key schedule to fill
2477	&call	("_aesni_set_encrypt_key");
2478	&ret	();			# %eax still holds the helper's return code
2479&function_end_B("${PREFIX}_set_encrypt_key");
2480
2481# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
2482#                              AES_KEY *key)
#
# Builds the encryption schedule with _aesni_set_encrypt_key and then
# converts it in place: the order of the round keys is reversed and
# aesimc is applied to every round key except the first and the last.
# A non-zero return code from the helper is passed through untouched.
2483&function_begin_B("${PREFIX}_set_decrypt_key");
2484	&mov	("eax",&wparam(0));
2485	&mov	($rounds,&wparam(1));
2486	&mov	($key,&wparam(2));
2487	&call	("_aesni_set_encrypt_key");
2488	&mov	($key,&wparam(2));	# reload: the helper advances $key
2489	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
2490	&test	("eax","eax");
2491	&jnz	(&label("dec_key_ret"));	# helper failed: return its code
2492	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule
2493
2494	&$movekey	("xmm0",&QWP(0,$key));	# just swap
2495	&$movekey	("xmm1",&QWP(0,"eax"));
2496	&$movekey	(&QWP(0,"eax"),"xmm0");
2497	&$movekey	(&QWP(0,$key),"xmm1");
2498	&lea		($key,&DWP(16,$key));		# move both ends one round key inward
2499	&lea		("eax",&DWP(-16,"eax"));
2500
2501&set_label("dec_key_inverse");
2502	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
2503	&$movekey	("xmm1",&QWP(0,"eax"));
2504	&aesimc		("xmm0","xmm0");
2505	&aesimc		("xmm1","xmm1");
2506	&lea		($key,&DWP(16,$key));
2507	&lea		("eax",&DWP(-16,"eax"));
2508	&$movekey	(&QWP(16,"eax"),"xmm0");	# offsets compensate for the pointer updates above
2509	&$movekey	(&QWP(-16,$key),"xmm1");
2510	&cmp		("eax",$key);
2511	&ja		(&label("dec_key_inverse"));	# loop until the two pointers meet
2512
2513	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
2514	&aesimc		("xmm0","xmm0");
2515	&$movekey	(&QWP(0,$key),"xmm0");
2516
2517	&pxor		("xmm0","xmm0");		# wipe key material from the registers
2518	&pxor		("xmm1","xmm1");
2519	&xor		("eax","eax");		# return success
2520&set_label("dec_key_ret");
2521	&ret	();
2522&function_end_B("${PREFIX}_set_decrypt_key");
2523
# Constant pool used by the "_alt" key-schedule paths, addressed
# relative to key_const: two pshufb byte-shuffle masks at +0x00 and
# +0x10, then round-constant vectors at +0x20 (1) and +0x30 (0x1b).
2524&set_label("key_const",64);
2525&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
2526&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
2527&data_word(1,1,1,1);
2528&data_word(0x1b,0x1b,0x1b,0x1b);
2529&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
2530
2531&asm_finish();
2532
# The generated assembly goes to STDOUT; a failed flush (full disk,
# closed pipe) must abort the build instead of leaving a truncated file.
2533close STDOUT or die "error closing STDOUT: $!";
2534