• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#! /usr/bin/env perl
2# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements support for Intel AES-NI extension. In
18# OpenSSL context it's used with Intel engine, but can also be used as
19# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
20# details].
21#
22# Performance.
23#
24# To start with see corresponding paragraph in aesni-x86_64.pl...
25# Instead of filling table similar to one found there I've chosen to
26# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
27# The simplified table below represents 32-bit performance relative
28# to 64-bit one in every given point. Ratios vary for different
29# encryption modes, therefore interval values.
30#
31#	16-byte     64-byte     256-byte    1-KB        8-KB
32#	53-67%      67-84%      91-94%      95-98%      97-99.5%
33#
34# Lower ratios for smaller block sizes are perfectly understandable,
35# because function call overhead is higher in 32-bit mode. Largest
36# 8-KB block performance is virtually same: 32-bit code is less than
37# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
38
39# January 2011
40#
41# See aesni-x86_64.pl for details. Unlike x86_64 version this module
42# interleaves at most 6 aes[enc|dec] instructions, because there are
43# not enough registers for 8x interleave [which should be optimal for
44# Sandy Bridge]. Actually, performance results for 6x interleave
45# factor presented in aesni-x86_64.pl (except for CTR) are for this
46# module.
47
48# April 2011
49#
50# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
51# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
52
53# November 2015
54#
55# Add aesni_ocb_[en|de]crypt. [Removed in BoringSSL]
56
57######################################################################
58# Current large-block performance in cycles per byte processed with
59# 128-bit key (less is better).
60#
61#		CBC en-/decrypt	CTR	XTS	ECB	OCB
62# Westmere	3.77/1.37	1.37	1.52	1.27
63# * Bridge	5.07/0.98	0.99	1.09	0.91	1.10
64# Haswell	4.44/0.80	0.97	1.03	0.72	0.76
65# Skylake	2.68/0.65	0.65	0.66	0.64	0.66
66# Silvermont	5.77/3.56	3.67	4.03	3.46	4.03
67# Goldmont	3.84/1.39	1.39	1.63	1.31	1.70
68# Bulldozer	5.80/0.98	1.05	1.24	0.93	1.23
69
# Symbol prefix of the generated public functions.  If $PREFIX is set to
# "AES", the script generates drop-in replacement for
# crypto/aes/asm/aes-586.pl:-)
$PREFIX="aes_hw";
$AESNI_PREFIX="aes_hw";
$inline=1;		# inline _aesni_[en|de]crypt

# Make x86asm.pl loadable from this script's own directory or from
# ../../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

# The output file name is the last command-line argument.  Everything the
# perlasm helpers print goes to STDOUT, so STDOUT is aliased to that file.
# Fail immediately if the file cannot be created; otherwise the problem
# would only show up when STDOUT is closed at the very end of the script.
$output = pop;
defined($output) or die "usage: $0 <flavour> [...] <output file>\n";
open OUT,">",$output or die "can't open $output: $!";
*STDOUT=*OUT;
83
# Initialise the perlasm back end with the flavour given as first argument.
&asm_init($ARGV[0]);

# Symbols defined outside this module.  BORINGSSL_function_hit is only
# declared when the generated file is built with BORINGSSL_DISPATCH_TEST.
&external_label("OPENSSL_ia32cap_P");
# This statement needs its own terminator: without the ';' Perl parses this
# line and the next one as a single "call & call" bitwise-AND expression.
&preprocessor_ifdef("BORINGSSL_DISPATCH_TEST");
&external_label("BORINGSSL_function_hit");
&preprocessor_endif();
&static_label("key_const");
91
# Instruction used to load and store round keys.  Both configurations use
# movups; the selection is kept so the two prefixes can diverge again.
$movekey = ($PREFIX eq $AESNI_PREFIX) ? \&movups : \&movups;
94
# General-purpose register roles.
($len,$rounds,$key,$inp,$out)=("eax","ecx","edx","esi","edi");
($rounds_,$key_)=("ebx","ebp");	# backup copies for $rounds and $key

# XMM register roles: two round-key registers followed by six data blocks.
($rndkey0,$rndkey1)=("xmm0","xmm1");
($inout0,$inout1,$inout2,$inout3,$inout4,$inout5)=
	("xmm2","xmm3","xmm4","xmm5","xmm6","xmm7");
# Alternative names for the last three data registers.
($in1,$in0,$ivec)=("xmm5","xmm6","xmm7");
111
112# AESNI extension
# Emit "aeskeygenassist $dst,$src,$imm" as raw bytes (66 0F 3A DF /r ib).
#   $dst,$src - XMM register names, "xmm0".."xmm7"
#   $imm      - immediate byte appended after the ModR/M byte
# Returns whatever data_byte returns.  Dies when an operand is not one of
# those registers: emitting nothing would silently drop the instruction
# from the generated key schedule code.
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /\Axmm([0-7]):xmm([0-7])\z/)
    {	# ModR/M: mod=11 (register direct), reg=$dst, rm=$src
	return &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);
    }
    die "aeskeygenassist: unsupported operands '$dst,$src'";
}
# Emit a register-to-register AES-NI instruction of the 66 0F 38 xx group
# as raw bytes.
#   $opcodelet - opcode byte that selects the instruction (0xdb..0xdd here)
#   $dst,$src  - XMM register names, "xmm0".."xmm7"
# Returns whatever data_byte returns.  Dies when an operand is not one of
# those registers: emitting nothing would silently drop the instruction
# from the generated code.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /\Axmm([0-7]):xmm([0-7])\z/)
    {	# ModR/M: mod=11 (register direct), reg=$dst, rm=$src
	return &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);
    }
    die "aescommon: unsupported operands '$dst,$src'";
}
# Thin wrappers that bind the instruction-selecting opcode byte and hand
# the (dst,src) operands to aescommon.
sub aesimc
{ my @operands=@_;
    return aescommon(0xdb,@operands);
}
sub aesenc
{ my @operands=@_;
    return aescommon(0xdc,@operands);
}
sub aesenclast
{ my @operands=@_;
    return aescommon(0xdd,@operands);
}
126
127# Inline version of internal aesni_[en|de]crypt1
# aesni_inline_generate1($p,$inout,$ivec) emits the single-block cipher
# sequence directly at the call site.
#   $p     - instruction suffix; selects &aes${p} and &aes${p}last ("enc"
#            at every call site in this file)
#   $inout - XMM register holding the block, defaults to $inout0
#   $ivec  - optional XMM register; when given, round key 0 is xored into
#            $ivec and $ivec into $inout instead of xoring round key 0
#            straight into $inout
# The emitted code advances $key past the round keys it consumes and counts
# $rounds down to zero.  $sn is bumped on every call so that each expansion
# gets its own loop label.
128{ my $sn;
129sub aesni_inline_generate1
130{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
131  $sn++;
132
133    &$movekey		($rndkey0,&QWP(0,$key));
134    &$movekey		($rndkey1,&QWP(16,$key));
135    &xorps		($ivec,$rndkey0)	if (defined($ivec));
136    &lea		($key,&DWP(32,$key));
137    &xorps		($inout,$ivec)		if (defined($ivec));
138    &xorps		($inout,$rndkey0)	if (!defined($ivec));
    # one cipher round per iteration, next round key loaded behind it
139    &set_label("${p}1_loop_$sn");
140	eval"&aes${p}	($inout,$rndkey1)";
141	&dec		($rounds);
142	&$movekey	($rndkey1,&QWP(0,$key));
143	&lea		($key,&DWP(16,$key));
144    &jnz		(&label("${p}1_loop_$sn"));
    # final round uses the key fetched by the last loop iteration
145    eval"&aes${p}last	($inout,$rndkey1)";
146}}
147
# aesni_generate1($p,$inout) emits the private subroutine _aesni_${p}rypt1,
# a loop-free single-block cipher routine.
#   $p     - instruction suffix; selects &aes${p} and &aes${p}last
#   $inout - XMM register holding the block, defaults to $inout0
# The emitted code compares $rounds with 11: below that it jumps straight
# to the "${p}128" tail, otherwise it first runs four extra rounds.
# It is only generated when $inline is false (see the call sites).
148sub aesni_generate1	# fully unrolled loop
149{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
150
151    &function_begin_B("_aesni_${p}rypt1");
152	&movups		($rndkey0,&QWP(0,$key));
153	&$movekey	($rndkey1,&QWP(0x10,$key));
154	&xorps		($inout,$rndkey0);
155	&$movekey	($rndkey0,&QWP(0x20,$key));
156	&lea		($key,&DWP(0x30,$key));
157	&cmp		($rounds,11);
158	&jb		(&label("${p}128"));
	# longer schedule: move $key forward so the four extra round keys
	# are addressed with negative offsets below
159	&lea		($key,&DWP(0x40,$key));
160	# 192-bit key support was removed.
161
162	eval"&aes${p}	($inout,$rndkey1)";
163	&$movekey	($rndkey1,&QWP(-0x40,$key));
164	eval"&aes${p}	($inout,$rndkey0)";
165	&$movekey	($rndkey0,&QWP(-0x30,$key));
166
167	# 192-bit key support was removed.
168	eval"&aes${p}	($inout,$rndkey1)";
169	&$movekey	($rndkey1,&QWP(-0x20,$key));
170	eval"&aes${p}	($inout,$rndkey0)";
171	&$movekey	($rndkey0,&QWP(-0x10,$key));
    # common tail: nine rounds plus the final one, alternating between
    # $rndkey1 and $rndkey0 so the next key loads behind the current round
172    &set_label("${p}128");
173	eval"&aes${p}	($inout,$rndkey1)";
174	&$movekey	($rndkey1,&QWP(0,$key));
175	eval"&aes${p}	($inout,$rndkey0)";
176	&$movekey	($rndkey0,&QWP(0x10,$key));
177	eval"&aes${p}	($inout,$rndkey1)";
178	&$movekey	($rndkey1,&QWP(0x20,$key));
179	eval"&aes${p}	($inout,$rndkey0)";
180	&$movekey	($rndkey0,&QWP(0x30,$key));
181	eval"&aes${p}	($inout,$rndkey1)";
182	&$movekey	($rndkey1,&QWP(0x40,$key));
183	eval"&aes${p}	($inout,$rndkey0)";
184	&$movekey	($rndkey0,&QWP(0x50,$key));
185	eval"&aes${p}	($inout,$rndkey1)";
186	&$movekey	($rndkey1,&QWP(0x60,$key));
187	eval"&aes${p}	($inout,$rndkey0)";
188	&$movekey	($rndkey0,&QWP(0x70,$key));
189	eval"&aes${p}	($inout,$rndkey1)";
190    eval"&aes${p}last	($inout,$rndkey0)";
191    &ret();
192    &function_end_B("_aesni_${p}rypt1");
193}
194
195# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# Emit ${PREFIX}_encrypt: encrypts the single 16-byte block at argument 0
# into the buffer at argument 1 using the key schedule at argument 2.
# The out-of-line helper is only generated when inlining is disabled.
196&aesni_generate1("enc") if (!$inline);
197&function_begin_B("${PREFIX}_encrypt");
198	&record_function_hit(1);
199
200	&mov	("eax",&wparam(0));
201	&mov	($key,&wparam(2));
202	&movups	($inout0,&QWP(0,"eax"));
	# round count is read from byte offset 240 of the key schedule
203	&mov	($rounds,&DWP(240,$key));
	# "eax" is reused for the output pointer once the input is loaded
204	&mov	("eax",&wparam(1));
205	if ($inline)
206	{   &aesni_inline_generate1("enc");	}
207	else
208	{   &call	("_aesni_encrypt1");	}
209	&pxor	($rndkey0,$rndkey0);		# clear register bank
210	&pxor	($rndkey1,$rndkey1);
211	&movups	(&QWP(0,"eax"),$inout0);
212	&pxor	($inout0,$inout0);
213	&ret	();
214&function_end_B("${PREFIX}_encrypt");
215
216# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
217# factor. Why were 3x subroutines originally used in loops? Even though
218# aes[enc|dec] latency was originally 6, it could be scheduled only
219# every *2nd* cycle. Thus 3x interleave was the one providing optimal
220# utilization, i.e. when subroutine's throughput is virtually same as
221# of non-interleaved subroutine [for number of input blocks up to 3].
222# This is why it originally made no sense to implement 2x subroutine.
223# But times change and it became appropriate to spend extra 192 bytes
224# on 2x subroutine on Atom Silvermont account. For processors that
225# can schedule aes[enc|dec] every cycle optimal interleave factor
226# equals to corresponding instructions latency. 8x is optimal for
227# * Bridge, but it's unfeasible to accommodate such implementation
228# in XMM registers addressable in 32-bit mode and therefore maximum
229# of 6x is used instead...
230
# aesni_generate2($p) emits the private subroutine _aesni_${p}rypt2, which
# runs the two blocks in $inout0 and $inout1 through the cipher interleaved.
#   $p - instruction suffix; selects &aes${p} and &aes${p}last
# In the emitted code $rounds is turned into a byte offset (shl 4), $key is
# moved to 32 bytes past that offset and $rounds is negated, so round keys
# are addressed as $key+$rounds while $rounds counts up towards zero.
231sub aesni_generate2
232{ my $p=shift;
233
234    &function_begin_B("_aesni_${p}rypt2");
235	&$movekey	($rndkey0,&QWP(0,$key));
236	&shl		($rounds,4);
237	&$movekey	($rndkey1,&QWP(16,$key));
238	&xorps		($inout0,$rndkey0);
239	&pxor		($inout1,$rndkey0);
240	&$movekey	($rndkey0,&QWP(32,$key));
241	&lea		($key,&DWP(32,$key,$rounds));
242	&neg		($rounds);
243	&add		($rounds,16);
244
    # two rounds per iteration, alternating $rndkey1 and $rndkey0
245    &set_label("${p}2_loop");
246	eval"&aes${p}	($inout0,$rndkey1)";
247	eval"&aes${p}	($inout1,$rndkey1)";
248	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
249	&add		($rounds,32);
250	eval"&aes${p}	($inout0,$rndkey0)";
251	eval"&aes${p}	($inout1,$rndkey0)";
252	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
253	&jnz		(&label("${p}2_loop"));
254    eval"&aes${p}	($inout0,$rndkey1)";
255    eval"&aes${p}	($inout1,$rndkey1)";
256    eval"&aes${p}last	($inout0,$rndkey0)";
257    eval"&aes${p}last	($inout1,$rndkey0)";
258    &ret();
259    &function_end_B("_aesni_${p}rypt2");
260}
261
# aesni_generate3($p) emits the private subroutine _aesni_${p}rypt3, which
# runs the three blocks in $inout0..$inout2 through the cipher interleaved.
#   $p - instruction suffix; selects &aes${p} and &aes${p}last
# Round-key addressing follows the same scheme as the 2x variant: $rounds
# becomes a negative byte offset from $key that counts up towards zero.
262sub aesni_generate3
263{ my $p=shift;
264
265    &function_begin_B("_aesni_${p}rypt3");
266	&$movekey	($rndkey0,&QWP(0,$key));
267	&shl		($rounds,4);
268	&$movekey	($rndkey1,&QWP(16,$key));
269	&xorps		($inout0,$rndkey0);
270	&pxor		($inout1,$rndkey0);
271	&pxor		($inout2,$rndkey0);
272	&$movekey	($rndkey0,&QWP(32,$key));
273	&lea		($key,&DWP(32,$key,$rounds));
274	&neg		($rounds);
275	&add		($rounds,16);
276
    # two rounds per iteration, alternating $rndkey1 and $rndkey0
277    &set_label("${p}3_loop");
278	eval"&aes${p}	($inout0,$rndkey1)";
279	eval"&aes${p}	($inout1,$rndkey1)";
280	eval"&aes${p}	($inout2,$rndkey1)";
281	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
282	&add		($rounds,32);
283	eval"&aes${p}	($inout0,$rndkey0)";
284	eval"&aes${p}	($inout1,$rndkey0)";
285	eval"&aes${p}	($inout2,$rndkey0)";
286	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
287	&jnz		(&label("${p}3_loop"));
288    eval"&aes${p}	($inout0,$rndkey1)";
289    eval"&aes${p}	($inout1,$rndkey1)";
290    eval"&aes${p}	($inout2,$rndkey1)";
291    eval"&aes${p}last	($inout0,$rndkey0)";
292    eval"&aes${p}last	($inout1,$rndkey0)";
293    eval"&aes${p}last	($inout2,$rndkey0)";
294    &ret();
295    &function_end_B("_aesni_${p}rypt3");
296}
297
298# 4x interleave is implemented to improve small block performance,
299# most notably [and naturally] 4 block by ~30%. One can argue that one
300# should have implemented 5x as well, but improvement  would be <20%,
301# so it's not worth it...
# aesni_generate4($p) emits the private subroutine _aesni_${p}rypt4, which
# runs the four blocks in $inout0..$inout3 through the cipher interleaved.
#   $p - instruction suffix; selects &aes${p} and &aes${p}last
# Round-key addressing follows the same scheme as the 2x and 3x variants.
302sub aesni_generate4
303{ my $p=shift;
304
305    &function_begin_B("_aesni_${p}rypt4");
306	&$movekey	($rndkey0,&QWP(0,$key));
307	&$movekey	($rndkey1,&QWP(16,$key));
308	&shl		($rounds,4);
309	&xorps		($inout0,$rndkey0);
310	&pxor		($inout1,$rndkey0);
311	&pxor		($inout2,$rndkey0);
312	&pxor		($inout3,$rndkey0);
313	&$movekey	($rndkey0,&QWP(32,$key));
314	&lea		($key,&DWP(32,$key,$rounds));
315	&neg		($rounds);
	# raw bytes 0F 1F 40 00 are a 4-byte nop; presumably padding that
	# places the loop label below - TODO confirm intent
316	&data_byte	(0x0f,0x1f,0x40,0x00);
317	&add		($rounds,16);
318
    # two rounds per iteration, alternating $rndkey1 and $rndkey0
319    &set_label("${p}4_loop");
320	eval"&aes${p}	($inout0,$rndkey1)";
321	eval"&aes${p}	($inout1,$rndkey1)";
322	eval"&aes${p}	($inout2,$rndkey1)";
323	eval"&aes${p}	($inout3,$rndkey1)";
324	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
325	&add		($rounds,32);
326	eval"&aes${p}	($inout0,$rndkey0)";
327	eval"&aes${p}	($inout1,$rndkey0)";
328	eval"&aes${p}	($inout2,$rndkey0)";
329	eval"&aes${p}	($inout3,$rndkey0)";
330	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
331    &jnz		(&label("${p}4_loop"));
332
333    eval"&aes${p}	($inout0,$rndkey1)";
334    eval"&aes${p}	($inout1,$rndkey1)";
335    eval"&aes${p}	($inout2,$rndkey1)";
336    eval"&aes${p}	($inout3,$rndkey1)";
337    eval"&aes${p}last	($inout0,$rndkey0)";
338    eval"&aes${p}last	($inout1,$rndkey0)";
339    eval"&aes${p}last	($inout2,$rndkey0)";
340    eval"&aes${p}last	($inout3,$rndkey0)";
341    &ret();
342    &function_end_B("_aesni_${p}rypt4");
343}
344
# aesni_generate6($p) emits the private subroutine _aesni_${p}rypt6, which
# runs the six blocks in $inout0..$inout5 through the cipher interleaved.
#   $p - instruction suffix; selects &aes${p} and &aes${p}last
# Besides the regular entry point the routine exposes the static label
# _aesni_${p}rypt6_enter in the middle of the loop body.  The CTR code in
# this file calls that label after issuing the first round itself.
345sub aesni_generate6
346{ my $p=shift;
347
348    &function_begin_B("_aesni_${p}rypt6");
349    &static_label("_aesni_${p}rypt6_enter");
350	&$movekey	($rndkey0,&QWP(0,$key));
351	&shl		($rounds,4);
352	&$movekey	($rndkey1,&QWP(16,$key));
353	&xorps		($inout0,$rndkey0);
354	&pxor		($inout1,$rndkey0);	# pxor does better here
355	&pxor		($inout2,$rndkey0);
356	eval"&aes${p}	($inout0,$rndkey1)";
357	&pxor		($inout3,$rndkey0);
358	&pxor		($inout4,$rndkey0);
359	eval"&aes${p}	($inout1,$rndkey1)";
360	&lea		($key,&DWP(32,$key,$rounds));
361	&neg		($rounds);
362	eval"&aes${p}	($inout2,$rndkey1)";
363	&pxor		($inout5,$rndkey0);
364	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
365	&add		($rounds,16);
	# the first round of blocks 0..2 was issued above, so join the
	# loop body at the point where blocks 3..5 get theirs
366	&jmp		(&label("_aesni_${p}rypt6_inner"));
367
368    &set_label("${p}6_loop",16);
369	eval"&aes${p}	($inout0,$rndkey1)";
370	eval"&aes${p}	($inout1,$rndkey1)";
371	eval"&aes${p}	($inout2,$rndkey1)";
372    &set_label("_aesni_${p}rypt6_inner");
373	eval"&aes${p}	($inout3,$rndkey1)";
374	eval"&aes${p}	($inout4,$rndkey1)";
375	eval"&aes${p}	($inout5,$rndkey1)";
    # external entry: the caller has already applied one round to all six
    # blocks and loaded the following round key into $rndkey0
376    &set_label("_aesni_${p}rypt6_enter");
377	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
378	&add		($rounds,32);
379	eval"&aes${p}	($inout0,$rndkey0)";
380	eval"&aes${p}	($inout1,$rndkey0)";
381	eval"&aes${p}	($inout2,$rndkey0)";
382	eval"&aes${p}	($inout3,$rndkey0)";
383	eval"&aes${p}	($inout4,$rndkey0)";
384	eval"&aes${p}	($inout5,$rndkey0)";
385	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
386    &jnz		(&label("${p}6_loop"));
387
388    eval"&aes${p}	($inout0,$rndkey1)";
389    eval"&aes${p}	($inout1,$rndkey1)";
390    eval"&aes${p}	($inout2,$rndkey1)";
391    eval"&aes${p}	($inout3,$rndkey1)";
392    eval"&aes${p}	($inout4,$rndkey1)";
393    eval"&aes${p}	($inout5,$rndkey1)";
394    eval"&aes${p}last	($inout0,$rndkey0)";
395    eval"&aes${p}last	($inout1,$rndkey0)";
396    eval"&aes${p}last	($inout2,$rndkey0)";
397    eval"&aes${p}last	($inout3,$rndkey0)";
398    eval"&aes${p}last	($inout4,$rndkey0)";
399    eval"&aes${p}last	($inout5,$rndkey0)";
400    &ret();
401    &function_end_B("_aesni_${p}rypt6");
402}
# The interleaved encryption helpers are only emitted for the AES-NI
# flavour of the module, in order of increasing interleave factor.
if ($PREFIX eq $AESNI_PREFIX) {
    &aesni_generate2("enc");
    &aesni_generate3("enc");
    &aesni_generate4("enc");
    &aesni_generate6("enc");
}
407
# Emit ${PREFIX}_ctr32_encrypt_blocks; only generated for the AES-NI flavour.
# Outline of the emitted code:
#   - a single block takes the ctr32_one_shortcut path and encrypts the
#     ivec as loaded, without building any counter vectors;
#   - otherwise counters are kept on the stack as two triplets, six blocks
#     are processed per ctr32_loop6 iteration, and ctr32_tail handles the
#     remaining one to five blocks;
#   - ctr32_ret wipes the XMM registers and the key-dependent stack slots
#     before restoring the caller's stack pointer.
408if ($PREFIX eq $AESNI_PREFIX) {
409
410######################################################################
411# void aes_hw_ctr32_encrypt_blocks (const void *in, void *out,
412#                         size_t blocks, const AES_KEY *key,
413#                         const char *ivec);
414#
415# Handles only complete blocks, operates on 32-bit counter and
416# does not update *ivec! (see crypto/modes/ctr128.c for details)
417#
418# stack layout:
419#	0	pshufb mask
420#	16	vector addend: 0,6,6,6
421# 	32	counter-less ivec
422#	48	1st triplet of counter vector
423#	64	2nd triplet of counter vector
424#	80	saved %esp
425
426&function_begin("${PREFIX}_ctr32_encrypt_blocks");
427	&record_function_hit(0);
428
429	&mov	($inp,&wparam(0));
430	&mov	($out,&wparam(1));
431	&mov	($len,&wparam(2));
432	&mov	($key,&wparam(3));
	# $rounds_ temporarily holds the ivec pointer
433	&mov	($rounds_,&wparam(4));
	# $key_ temporarily holds the caller's %esp until it is saved at 80
434	&mov	($key_,"esp");
435	&sub	("esp",88);
436	&and	("esp",-16);			# align stack
437	&mov	(&DWP(80,"esp"),$key_);
438
439	&cmp	($len,1);
440	&je	(&label("ctr32_one_shortcut"));
441
442	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec
443
444	# compose byte-swap control mask for pshufb on stack
445	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
446	&mov	(&DWP(4,"esp"),0x08090a0b);
447	&mov	(&DWP(8,"esp"),0x04050607);
448	&mov	(&DWP(12,"esp"),0x00010203);
449
450	# compose counter increment vector on stack
451	&mov	($rounds,6);
452	&xor	($key_,$key_);
453	&mov	(&DWP(16,"esp"),$rounds);
454	&mov	(&DWP(20,"esp"),$rounds);
455	&mov	(&DWP(24,"esp"),$rounds);
456	&mov	(&DWP(28,"esp"),$key_);
457
458	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
459	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter
460
461	&mov	($rounds,&DWP(240,$key));	# key->rounds
462
463	# compose 2 vectors of 3x32-bit counters
	# $rndkey0 receives counter+0..+2 and $rndkey1 counter+3..+5
464	&bswap	($rounds_);
465	&pxor	($rndkey0,$rndkey0);
466	&pxor	($rndkey1,$rndkey1);
467	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
468	&pinsrd	($rndkey0,$rounds_,0);
469	&lea	($key_,&DWP(3,$rounds_));
470	&pinsrd	($rndkey1,$key_,0);
471	&inc	($rounds_);
472	&pinsrd	($rndkey0,$rounds_,1);
473	&inc	($key_);
474	&pinsrd	($rndkey1,$key_,1);
475	&inc	($rounds_);
476	&pinsrd	($rndkey0,$rounds_,2);
477	&inc	($key_);
478	&pinsrd	($rndkey1,$key_,2);
479	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
480	&pshufb	($rndkey0,$inout0);		# byte swap
481	&movdqu	($inout4,&QWP(0,$key));		# key[0]
482	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
483	&pshufb	($rndkey1,$inout0);		# byte swap
484
485	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
486	&pshufd	($inout1,$rndkey0,2<<6);
487	&cmp	($len,6);
488	&jb	(&label("ctr32_tail"));
489	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
490	&shl	($rounds,4);
491	&mov	($rounds_,16);
492	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
493	&mov	($key_,$key);			# backup $key
494	&sub	($rounds_,$rounds);		# backup twisted $rounds
495	&lea	($key,&DWP(32,$key,$rounds));
496	&sub	($len,6);
497	&jmp	(&label("ctr32_loop6"));
498
499&set_label("ctr32_loop6",16);
500	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
501	&pshufd	($inout2,$rndkey0,1<<6);
502	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
503	&pshufd	($inout3,$rndkey1,3<<6);
504	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
505	&pshufd	($inout4,$rndkey1,2<<6);
506	&pxor		($inout1,$rndkey0);
507	&pshufd	($inout5,$rndkey1,1<<6);
508	&$movekey	($rndkey1,&QWP(16,$key_));
509	&pxor		($inout2,$rndkey0);
510	&pxor		($inout3,$rndkey0);
511	&aesenc		($inout0,$rndkey1);
512	&pxor		($inout4,$rndkey0);
513	&pxor		($inout5,$rndkey0);
514	&aesenc		($inout1,$rndkey1);
515	&$movekey	($rndkey0,&QWP(32,$key_));
516	&mov		($rounds,$rounds_);
517	&aesenc		($inout2,$rndkey1);
518	&aesenc		($inout3,$rndkey1);
519	&aesenc		($inout4,$rndkey1);
520	&aesenc		($inout5,$rndkey1);
521
	# first round is done for all six blocks; continue inside
	# _aesni_encrypt6 at its mid-loop entry label
522	&call		(&label("_aesni_encrypt6_enter"));
523
	# xor the six key-stream blocks with the input and store them,
	# interleaved with preparing the counters of the next iteration
524	&movups	($rndkey1,&QWP(0,$inp));
525	&movups	($rndkey0,&QWP(0x10,$inp));
526	&xorps	($inout0,$rndkey1);
527	&movups	($rndkey1,&QWP(0x20,$inp));
528	&xorps	($inout1,$rndkey0);
529	&movups	(&QWP(0,$out),$inout0);
530	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
531	&xorps	($inout2,$rndkey1);
532	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
533	&movups	(&QWP(0x10,$out),$inout1);
534	&movups	(&QWP(0x20,$out),$inout2);
535
536	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
537	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
538	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
539
540	&movups	($inout1,&QWP(0x30,$inp));
541	&movups	($inout2,&QWP(0x40,$inp));
542	&xorps	($inout3,$inout1);
543	&movups	($inout1,&QWP(0x50,$inp));
544	&lea	($inp,&DWP(0x60,$inp));
545	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
546	&pshufb	($rndkey0,$inout0);		# byte swap
547	&xorps	($inout4,$inout2);
548	&movups	(&QWP(0x30,$out),$inout3);
549	&xorps	($inout5,$inout1);
550	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
551	&pshufb	($rndkey1,$inout0);		# byte swap
552	&movups	(&QWP(0x40,$out),$inout4);
553	&pshufd	($inout0,$rndkey0,3<<6);
554	&movups	(&QWP(0x50,$out),$inout5);
555	&lea	($out,&DWP(0x60,$out));
556
557	&pshufd	($inout1,$rndkey0,2<<6);
558	&sub	($len,6);
559	&jnc	(&label("ctr32_loop6"));
560
	# undo the last subtraction; zero means no blocks are left over
561	&add	($len,6);
562	&jz	(&label("ctr32_ret"));
563	&movdqu	($inout5,&QWP(0,$key_));
564	&mov	($key,$key_);
565	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
566	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
567
	# one to five blocks left: merge the counter-less ivec into as many
	# counter blocks as needed and dispatch on the remaining count
568&set_label("ctr32_tail");
569	&por	($inout0,$inout5);
570	&cmp	($len,2);
571	&jb	(&label("ctr32_one"));
572
573	&pshufd	($inout2,$rndkey0,1<<6);
574	&por	($inout1,$inout5);
575	&je	(&label("ctr32_two"));
576
577	&pshufd	($inout3,$rndkey1,3<<6);
578	&por	($inout2,$inout5);
579	&cmp	($len,4);
580	&jb	(&label("ctr32_three"));
581
582	&pshufd	($inout4,$rndkey1,2<<6);
583	&por	($inout3,$inout5);
584	&je	(&label("ctr32_four"));
585
	# five blocks: the 6x routine is used and only the first five
	# results are stored
586	&por	($inout4,$inout5);
587	&call	("_aesni_encrypt6");
588	&movups	($rndkey1,&QWP(0,$inp));
589	&movups	($rndkey0,&QWP(0x10,$inp));
590	&xorps	($inout0,$rndkey1);
591	&movups	($rndkey1,&QWP(0x20,$inp));
592	&xorps	($inout1,$rndkey0);
593	&movups	($rndkey0,&QWP(0x30,$inp));
594	&xorps	($inout2,$rndkey1);
595	&movups	($rndkey1,&QWP(0x40,$inp));
596	&xorps	($inout3,$rndkey0);
597	&movups	(&QWP(0,$out),$inout0);
598	&xorps	($inout4,$rndkey1);
599	&movups	(&QWP(0x10,$out),$inout1);
600	&movups	(&QWP(0x20,$out),$inout2);
601	&movups	(&QWP(0x30,$out),$inout3);
602	&movups	(&QWP(0x40,$out),$inout4);
603	&jmp	(&label("ctr32_ret"));
604
	# single-block call: $rounds_ still holds the ivec pointer here
605&set_label("ctr32_one_shortcut",16);
606	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
607	&mov	($rounds,&DWP(240,$key));
608
609&set_label("ctr32_one");
610	if ($inline)
611	{   &aesni_inline_generate1("enc");	}
612	else
613	{   &call	("_aesni_encrypt1");	}
614	&movups	($in0,&QWP(0,$inp));
615	&xorps	($in0,$inout0);
616	&movups	(&QWP(0,$out),$in0);
617	&jmp	(&label("ctr32_ret"));
618
619&set_label("ctr32_two",16);
620	&call	("_aesni_encrypt2");
621	&movups	($inout3,&QWP(0,$inp));
622	&movups	($inout4,&QWP(0x10,$inp));
623	&xorps	($inout0,$inout3);
624	&xorps	($inout1,$inout4);
625	&movups	(&QWP(0,$out),$inout0);
626	&movups	(&QWP(0x10,$out),$inout1);
627	&jmp	(&label("ctr32_ret"));
628
629&set_label("ctr32_three",16);
630	&call	("_aesni_encrypt3");
631	&movups	($inout3,&QWP(0,$inp));
632	&movups	($inout4,&QWP(0x10,$inp));
633	&xorps	($inout0,$inout3);
634	&movups	($inout5,&QWP(0x20,$inp));
635	&xorps	($inout1,$inout4);
636	&movups	(&QWP(0,$out),$inout0);
637	&xorps	($inout2,$inout5);
638	&movups	(&QWP(0x10,$out),$inout1);
639	&movups	(&QWP(0x20,$out),$inout2);
640	&jmp	(&label("ctr32_ret"));
641
642&set_label("ctr32_four",16);
643	&call	("_aesni_encrypt4");
644	&movups	($inout4,&QWP(0,$inp));
645	&movups	($inout5,&QWP(0x10,$inp));
646	&movups	($rndkey1,&QWP(0x20,$inp));
647	&xorps	($inout0,$inout4);
648	&movups	($rndkey0,&QWP(0x30,$inp));
649	&xorps	($inout1,$inout5);
650	&movups	(&QWP(0,$out),$inout0);
651	&xorps	($inout2,$rndkey1);
652	&movups	(&QWP(0x10,$out),$inout1);
653	&xorps	($inout3,$rndkey0);
654	&movups	(&QWP(0x20,$out),$inout2);
655	&movups	(&QWP(0x30,$out),$inout3);
656
	# common exit: scrub registers and stack slots 32..79, then restore
	# the stack pointer saved at offset 80
657&set_label("ctr32_ret");
658	&pxor	("xmm0","xmm0");		# clear register bank
659	&pxor	("xmm1","xmm1");
660	&pxor	("xmm2","xmm2");
661	&pxor	("xmm3","xmm3");
662	&pxor	("xmm4","xmm4");
663	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
664	&pxor	("xmm5","xmm5");
665	&movdqa	(&QWP(48,"esp"),"xmm0");
666	&pxor	("xmm6","xmm6");
667	&movdqa	(&QWP(64,"esp"),"xmm0");
668	&pxor	("xmm7","xmm7");
669	&mov	("esp",&DWP(80,"esp"));
670&function_end("${PREFIX}_ctr32_encrypt_blocks");
671}
672
673######################################################################
674# Mechanical port from aesni-x86_64.pl.
675#
676# _aesni_set_encrypt_key is private interface,
677# input:
678#	"eax"	const unsigned char *userKey
679#	$rounds	int bits
680#	$key	AES_KEY *key
681# output:
682#	"eax"	return code
683#	$rounds	rounds
684
# Emit the private routine _aesni_set_encrypt_key.
# Emitted behaviour, as far as this file shows:
#   - returns -1 in "eax" when the user key pointer ("eax") or $key is null;
#   - returns -2 when the bit count in $rounds is neither 256 nor 128;
#   - otherwise expands the key into the schedule at $key, stores the round
#     count (9 for 128-bit keys, 13 for 256-bit keys) at byte offset 240 of
#     the schedule and returns 0.
# Each key size has two code paths: one built on aeskeygenassist and an
# "_alt" one built on pshufb/aesenclast and the key_const table.  The alt
# path is taken when the masked capability word equals 1<<28.
685&function_begin_B("_aesni_set_encrypt_key");
686	&push	("ebp");
687	&push	("ebx");
688	&test	("eax","eax");
689	&jz	(&label("bad_pointer"));
690	&test	($key,$key);
691	&jz	(&label("bad_pointer"));
692
	# call/pop sequence obtains the current address so that "ebx" can be
	# pointed at key_const position-independently
693	&call	(&label("pic"));
694&set_label("pic");
695	&blindpop("ebx");
696	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
697
698	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
699	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
700	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	# second dword of OPENSSL_ia32cap_P
701	&mov	("ebp",&DWP(4,"ebp"));
702	&lea	($key,&DWP(16,$key));
703	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
704	&cmp	($rounds,256);
705	&je	(&label("14rounds"));
706	# 192-bit key support was removed.
707	&cmp	($rounds,128);
708	&jne	(&label("bad_keybits"));
709
710&set_label("10rounds",16);
	# equal only when bit 28 is set and bit 11 is clear
711	&cmp		("ebp",1<<28);
712	&je		(&label("10rounds_alt"));
713
714	&mov		($rounds,9);
715	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
716	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
717	&call		(&label("key_128_cold"));
718	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
719	&call		(&label("key_128"));
720	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
721	&call		(&label("key_128"));
722	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
723	&call		(&label("key_128"));
724	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
725	&call		(&label("key_128"));
726	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
727	&call		(&label("key_128"));
728	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
729	&call		(&label("key_128"));
730	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
731	&call		(&label("key_128"));
732	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
733	&call		(&label("key_128"));
734	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
735	&call		(&label("key_128"));
736	&$movekey	(&QWP(0,$key),"xmm0");
	# $key now points at round key 10 (offset 160), so 80 bytes further
	# is offset 240 of the schedule
737	&mov		(&DWP(80,$key),$rounds);
738
739	&jmp	(&label("good_key"));
740
	# local helper: key_128 first stores the previous round key and
	# advances $key; key_128_cold skips that and only derives the next
	# round key in xmm0 from xmm0, xmm1 and the scratch register xmm4
741&set_label("key_128",16);
742	&$movekey	(&QWP(0,$key),"xmm0");
743	&lea		($key,&DWP(16,$key));
744&set_label("key_128_cold");
745	&shufps		("xmm4","xmm0",0b00010000);
746	&xorps		("xmm0","xmm4");
747	&shufps		("xmm4","xmm0",0b10001100);
748	&xorps		("xmm0","xmm4");
749	&shufps		("xmm1","xmm1",0b11111111);	# critical path
750	&xorps		("xmm0","xmm1");
751	&ret();
752
	# alternative 128-bit schedule: xmm5 = shuffle mask at key_const+0x00,
	# xmm4 = constant vector at key_const+0x20, shifted left by one bit
	# after every use
753&set_label("10rounds_alt",16);
754	&movdqa		("xmm5",&QWP(0x00,"ebx"));
755	&mov		($rounds,8);
756	&movdqa		("xmm4",&QWP(0x20,"ebx"));
757	&movdqa		("xmm2","xmm0");
758	&movdqu		(&QWP(-16,$key),"xmm0");
759
	# eight iterations, one round key stored per iteration
760&set_label("loop_key128");
761	&pshufb		("xmm0","xmm5");
762	&aesenclast	("xmm0","xmm4");
763	&pslld		("xmm4",1);
764	&lea		($key,&DWP(16,$key));
765
766	&movdqa		("xmm3","xmm2");
767	&pslldq		("xmm2",4);
768	&pxor		("xmm3","xmm2");
769	&pslldq		("xmm2",4);
770	&pxor		("xmm3","xmm2");
771	&pslldq		("xmm2",4);
772	&pxor		("xmm2","xmm3");
773
774	&pxor		("xmm0","xmm2");
775	&movdqu		(&QWP(-16,$key),"xmm0");
776	&movdqa		("xmm2","xmm0");
777
778	&dec		($rounds);
779	&jnz		(&label("loop_key128"));
780
	# the last two round keys use the constant vector at key_const+0x30
	# (0x1b) and its left-shifted value, and are unrolled
781	&movdqa		("xmm4",&QWP(0x30,"ebx"));
782
783	&pshufb		("xmm0","xmm5");
784	&aesenclast	("xmm0","xmm4");
785	&pslld		("xmm4",1);
786
787	&movdqa		("xmm3","xmm2");
788	&pslldq		("xmm2",4);
789	&pxor		("xmm3","xmm2");
790	&pslldq		("xmm2",4);
791	&pxor		("xmm3","xmm2");
792	&pslldq		("xmm2",4);
793	&pxor		("xmm2","xmm3");
794
795	&pxor		("xmm0","xmm2");
796	&movdqu		(&QWP(0,$key),"xmm0");
797
798	&movdqa		("xmm2","xmm0");
799	&pshufb		("xmm0","xmm5");
800	&aesenclast	("xmm0","xmm4");
801
802	&movdqa		("xmm3","xmm2");
803	&pslldq		("xmm2",4);
804	&pxor		("xmm3","xmm2");
805	&pslldq		("xmm2",4);
806	&pxor		("xmm3","xmm2");
807	&pslldq		("xmm2",4);
808	&pxor		("xmm2","xmm3");
809
810	&pxor		("xmm0","xmm2");
811	&movdqu		(&QWP(16,$key),"xmm0");
812
813	&mov		($rounds,9);
	# $key points at round key 9 (offset 144) here, so 96 bytes further
	# is again offset 240 of the schedule
814	&mov		(&DWP(96,$key),$rounds);
815
816	&jmp	(&label("good_key"));
817
818# 192-bit key support was removed.
819
820&set_label("14rounds",16);
821	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
822	&lea		($key,&DWP(16,$key));
823	&cmp		("ebp",1<<28);
824	&je		(&label("14rounds_alt"));
825
826	&mov		($rounds,13);
827	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
828	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
829	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
830	&call		(&label("key_256a_cold"));
831	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
832	&call		(&label("key_256b"));
833	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
834	&call		(&label("key_256a"));
835	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
836	&call		(&label("key_256b"));
837	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
838	&call		(&label("key_256a"));
839	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
840	&call		(&label("key_256b"));
841	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
842	&call		(&label("key_256a"));
843	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
844	&call		(&label("key_256b"));
845	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
846	&call		(&label("key_256a"));
847	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
848	&call		(&label("key_256b"));
849	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
850	&call		(&label("key_256a"));
851	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
852	&call		(&label("key_256b"));
853	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
854	&call		(&label("key_256a"));
855	&$movekey	(&QWP(0,$key),"xmm0");
	# $key points at round key 14 (offset 224) here
856	&mov		(&DWP(16,$key),$rounds);
	# "eax" is cleared again at good_key
857	&xor		("eax","eax");
858
859	&jmp	(&label("good_key"));
860
	# local helpers for the 256-bit schedule: key_256a stores xmm2 and
	# derives the next even round key in xmm0; key_256b stores xmm0 and
	# derives the next odd round key in xmm2
861&set_label("key_256a",16);
862	&$movekey	(&QWP(0,$key),"xmm2");
863	&lea		($key,&DWP(16,$key));
864&set_label("key_256a_cold");
865	&shufps		("xmm4","xmm0",0b00010000);
866	&xorps		("xmm0","xmm4");
867	&shufps		("xmm4","xmm0",0b10001100);
868	&xorps		("xmm0","xmm4");
869	&shufps		("xmm1","xmm1",0b11111111);	# critical path
870	&xorps		("xmm0","xmm1");
871	&ret();
872
873&set_label("key_256b",16);
874	&$movekey	(&QWP(0,$key),"xmm0");
875	&lea		($key,&DWP(16,$key));
876
877	&shufps		("xmm4","xmm2",0b00010000);
878	&xorps		("xmm2","xmm4");
879	&shufps		("xmm4","xmm2",0b10001100);
880	&xorps		("xmm2","xmm4");
881	&shufps		("xmm1","xmm1",0b10101010);	# critical path
882	&xorps		("xmm2","xmm1");
883	&ret();
884
	# alternative 256-bit schedule, same building blocks as 10rounds_alt;
	# xmm0/xmm1 carry the two most recent round keys
885&set_label("14rounds_alt",16);
886	&movdqa		("xmm5",&QWP(0x00,"ebx"));
887	&movdqa		("xmm4",&QWP(0x20,"ebx"));
888	&mov		($rounds,7);
889	&movdqu		(&QWP(-32,$key),"xmm0");
890	&movdqa		("xmm1","xmm2");
891	&movdqu		(&QWP(-16,$key),"xmm2");
892
	# each pass stores one round key derived from xmm0 and, unless the
	# counter has reached zero, a second one derived from xmm1
893&set_label("loop_key256");
894	&pshufb		("xmm2","xmm5");
895	&aesenclast	("xmm2","xmm4");
896
897	&movdqa		("xmm3","xmm0");
898	&pslldq		("xmm0",4);
899	&pxor		("xmm3","xmm0");
900	&pslldq		("xmm0",4);
901	&pxor		("xmm3","xmm0");
902	&pslldq		("xmm0",4);
903	&pxor		("xmm0","xmm3");
904	&pslld		("xmm4",1);
905
906	&pxor		("xmm0","xmm2");
907	&movdqu		(&QWP(0,$key),"xmm0");
908
909	&dec		($rounds);
910	&jz		(&label("done_key256"));
911
912	&pshufd		("xmm2","xmm0",0xff);
913	&pxor		("xmm3","xmm3");
914	&aesenclast	("xmm2","xmm3");
915
916	&movdqa		("xmm3","xmm1");
917	&pslldq		("xmm1",4);
918	&pxor		("xmm3","xmm1");
919	&pslldq		("xmm1",4);
920	&pxor		("xmm3","xmm1");
921	&pslldq		("xmm1",4);
922	&pxor		("xmm1","xmm3");
923
924	&pxor		("xmm2","xmm1");
925	&movdqu		(&QWP(16,$key),"xmm2");
926	&lea		($key,&DWP(32,$key));
927	&movdqa		("xmm1","xmm2");
928	&jmp		(&label("loop_key256"));
929
930&set_label("done_key256");
931	&mov		($rounds,13);
932	&mov		(&DWP(16,$key),$rounds);
933
	# success exit: scrub the XMM registers used above and return 0
934&set_label("good_key");
935	&pxor	("xmm0","xmm0");
936	&pxor	("xmm1","xmm1");
937	&pxor	("xmm2","xmm2");
938	&pxor	("xmm3","xmm3");
939	&pxor	("xmm4","xmm4");
940	&pxor	("xmm5","xmm5");
941	&xor	("eax","eax");
942	&pop	("ebx");
943	&pop	("ebp");
944	&ret	();
945
946&set_label("bad_pointer",4);
947	&mov	("eax",-1);
948	&pop	("ebx");
949	&pop	("ebp");
950	&ret	();
	# xmm0 already holds the first 16 bytes of the user key here, hence
	# the wipe before returning
951&set_label("bad_keybits",4);
952	&pxor	("xmm0","xmm0");
953	&mov	("eax",-2);
954	&pop	("ebx");
955	&pop	("ebp");
956	&ret	();
957&function_end_B("_aesni_set_encrypt_key");
958
959# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
960#                              AES_KEY *key)
# Emit the public ${PREFIX}_set_encrypt_key: it moves the three stack
# arguments into the registers expected by _aesni_set_encrypt_key ("eax" =
# userKey, $rounds = bits, $key = key), calls it and returns with the
# result that routine left in "eax".
961&function_begin_B("${PREFIX}_set_encrypt_key");
962	&record_function_hit(3);
963
964	&mov	("eax",&wparam(0));
965	&mov	($rounds,&wparam(1));
966	&mov	($key,&wparam(2));
967	&call	("_aesni_set_encrypt_key");
968	&ret	();
969&function_end_B("${PREFIX}_set_encrypt_key");
970
# Constant table used by the "_alt" key schedule paths, addressed relative
# to "ebx" in _aesni_set_encrypt_key:
#   +0x00  byte-shuffle mask loaded into xmm5 for pshufb
#   +0x10  not referenced by the code in this file (NOTE(review): looks
#          like a leftover from the removed 192-bit path - confirm)
#   +0x20  initial constant vector (1,1,1,1) loaded into xmm4
#   +0x30  constant vector 0x1b loaded into xmm4 for the last two round
#          keys of the 128-bit schedule
# The table is followed by an identification string.
971&set_label("key_const",64);
972&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
973&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
974&data_word(1,1,1,1);
975&data_word(0x1b,0x1b,0x1b,0x1b);
976&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
977
# Flush the accumulated assembly through the perlasm back end.
&asm_finish();

# A failed close means buffered output never reached the file, so treat it
# as a fatal error rather than leaving a truncated result behind.
unless (close STDOUT) {
    die "error closing STDOUT: $!";
}
981