#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
#
# Performance.
#
# To start with, see the corresponding paragraph in aesni-x86_64.pl...
# Instead of filling in a table similar to the one found there, I've
# chosen to summarize *comparison* results for raw ECB, CTR and CBC
# benchmarks. The simplified table below represents 32-bit performance
# relative to the 64-bit one at every given point. Ratios vary for
# different encryption modes, hence the interval values.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
#	53-67%      67-84%      91-94%      95-98%      97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually the same: 32-bit code is less
# than 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.

# January 2011
#
# See aesni-x86_64.pl for details. Unlike the x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, the performance results for the 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.

# November 2015
#
# Add aesni_ocb_[en|de]crypt. [Removed in BoringSSL]

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB	OCB
# Westmere	3.77/1.37	1.37	1.52	1.27
# * Bridge	5.07/0.98	0.99	1.09	0.91	1.10
# Haswell	4.44/0.80	0.97	1.03	0.72	0.76
# Skylake	2.68/0.65	0.65	0.66	0.64	0.66
# Silvermont	5.77/3.56	3.67	4.03	3.46	4.03
# Goldmont	3.84/1.39	1.39	1.63	1.31	1.70
# Bulldozer	5.80/0.98	1.05	1.24	0.93	1.23

$PREFIX="GFp_aes_hw";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$AESNI_PREFIX="GFp_aes_hw";
$inline=1;		# inline _aesni_[en|de]crypt

# Make x86asm.pl loadable from this script's own directory or from the
# perlasm directory three levels up.  When $0 carries no directory
# component the match fails, so fall back to an empty prefix explicitly
# instead of reading whatever $1 happens to hold.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file; everything the
# perlasm helpers print to STDOUT is redirected there.  Fail loudly when
# the argument is missing or the file cannot be created, rather than
# silently discarding the generated assembly.
$output = pop;
die "$0: no output file specified\n" unless (defined($output));
open(OUT,">",$output) or die "$0: can't open $output for writing: $!\n";
*STDOUT=*OUT;

# The first remaining argument is handed to x86asm.pl's asm_init.
&asm_init($ARGV[0]);

&external_label("GFp_ia32cap_P");
&static_label("key_const");

# $movekey is the instruction used for every round-key load/store.  Both
# branches currently select movups; the conditional is kept as the hook
# for a drop-in build with a different $PREFIX.
if ($PREFIX eq $AESNI_PREFIX)	{ $movekey=\&movups; }
else			{ $movekey=\&movups; }

# General-purpose register roles shared by every routine below.
($len,$rounds,$key,$inp,$out)=("eax","ecx","edx","esi","edi");
($rounds_,$key_)=("ebx","ebp");	# backup copies for $rounds and $key

# XMM register roles: two round-key slots followed by six data lanes.
($rndkey0,$rndkey1)=("xmm0","xmm1");
($inout0,$inout1,$inout2,$inout3,$inout4,$inout5)=map("xmm$_",2..7);

# Alternative names for the three upper data lanes (xmm5, xmm6, xmm7).
($in1,$in0,$ivec)=($inout3,$inout4,$inout5);

# AESNI extension
#
# aeskeygenassist(dst,src,imm): emit the instruction as raw bytes
# (0x66,0x0f,0x3a,0xdf, ModR/M, imm) through data_byte.
#   $dst, $src - register names "xmm0".."xmm7"; they are packed into a
#                register-direct ModR/M byte (0xc0 | dst<<3 | src).
#   $imm       - immediate byte appended after the ModR/M byte.
# Dies on any other operand form: silently emitting nothing would drop
# a key-schedule step from the generated code.  The pattern is anchored
# so that names such as "xmm10" are rejected rather than mis-encoded.
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /\Axmm([0-7]):xmm([0-7])\z/)
    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
    else
    {	die "aeskeygenassist: unsupported operands '$dst','$src'";	}
}
# aescommon(opcodelet,dst,src): shared encoder for the register-to-
# register AES-NI instructions.  Emits 0x66,0x0f,0x38,$opcodelet and a
# register-direct ModR/M byte (0xc0 | dst<<3 | src) through data_byte.
#   $opcodelet - final opcode byte selecting the instruction.
#   $dst, $src - register names "xmm0".."xmm7".
# Dies on any other operand form: silently emitting nothing would drop
# a cipher round from the generated code.  The pattern is anchored so
# that names such as "xmm10" are rejected rather than mis-encoded.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /\Axmm([0-7]):xmm([0-7])\z/)
    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
    else
    {	die "aescommon: unsupported operands '$dst','$src'";	}
}
# Thin wrappers: each forwards its (dst,src) operands to aescommon with
# the opcode byte that selects the instruction.
sub aesimc
{   return aescommon(0xdb,@_);	}
sub aesenc
{   return aescommon(0xdc,@_);	}
sub aesenclast
{   return aescommon(0xdd,@_);	}

# Inline version of internal aesni_[en|de]crypt1
#
# aesni_inline_generate1(p[,inout[,ivec]]) emits a single-block cipher
# loop in place.
#   $p     - operation suffix ("enc" is the only one used in this file);
#            selects the aes${p} / aes${p}last emitters.
#   $inout - XMM register holding the block; defaults to $inout0.
#   $ivec  - optional XMM register.  When given, it is xor-ed with round
#            key 0 and then into $inout, replacing the plain
#            $inout ^= round key 0 step.
# The emitted code advances $key past the schedule and counts $rounds
# down to zero.  $sn makes the loop label unique per expansion.
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  # Look the round emitters up by name once.  Unlike string eval, a
  # failure inside them propagates instead of being silently discarded.
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};
  $sn++;

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    &xorps		($ivec,$rndkey0)	if (defined($ivec));
    &lea		($key,&DWP(32,$key));
    &xorps		($inout,$ivec)		if (defined($ivec));
    &xorps		($inout,$rndkey0)	if (!defined($ivec));
    &set_label("${p}1_loop_$sn");
	&$aes		($inout,$rndkey1);
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));	# next round key
	&lea		($key,&DWP(16,$key));
    &jnz		(&label("${p}1_loop_$sn"));	# tests the dec above
    &$aeslast		($inout,$rndkey1);
}}

# aesni_generate1(p[,inout]) emits the private function
# _aesni_${p}rypt1: a fully unrolled single-block cipher.
#   $p     - operation suffix ("enc" is the only one used in this file);
#            selects the aes${p} / aes${p}last emitters.
#   $inout - XMM register holding the block; defaults to $inout0.
# The emitted code branches straight to the "${p}128" label when
# $rounds is below 11; otherwise it runs four additional rounds first
# (192-bit key support was removed).
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
  # Look the round emitters up by name once.  Unlike string eval, a
  # failure inside them propagates instead of being silently discarded.
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x40,$key));
	# 192-bit key support was removed.

	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(-0x40,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(-0x30,$key));

	# 192-bit key support was removed.
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(-0x20,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(-0x10,$key));
    &set_label("${p}128");
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x10,$key));
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(0x20,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x30,$key));
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(0x40,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x50,$key));
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(0x60,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x70,$key));
	&$aes		($inout,$rndkey1);
    &$aeslast		($inout,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}

# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block encryption entry point.  The out-of-line helper is only
# generated (and called) when inlining is switched off.
&aesni_generate1("enc") unless ($inline);
&function_begin_B("${PREFIX}_encrypt");
	&mov	("eax",&wparam(0));		# eax = inp
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));	# load the input block
	&mov	($rounds,&DWP(240,$key));	# round count at offset 240
	&mov	("eax",&wparam(1));		# eax = out
	if (!$inline)
	{   &call	("_aesni_encrypt1");	}
	else
	{   &aesni_inline_generate1("enc");	}
	# clear register bank
	foreach my $keyreg ($rndkey0,$rndkey1)
	{   &pxor	($keyreg,$keyreg);	}
	&movups	(&QWP(0,"eax"),$inout0);	# store the result
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_encrypt");

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, the instruction could be
# scheduled only every *2nd* cycle. Thus 3x interleave was the one
# providing optimal utilization, i.e. when the subroutine's throughput
# is virtually the same as that of a non-interleaved subroutine [for a
# number of input blocks up to 3]. This is why it originally made no
# sense to implement a 2x subroutine. But times change and it became
# appropriate to spend an extra 192 bytes on a 2x subroutine on account
# of Atom Silvermont. For processors that can schedule aes[enc|dec]
# every cycle the optimal interleave factor equals the corresponding
# instruction's latency. 8x is optimal for * Bridge, but it's
# unfeasible to accommodate such an implementation in the XMM registers
# addressable in 32-bit mode and therefore a maximum of 6x is used
# instead...

# aesni_generate2(p) emits the private function _aesni_${p}rypt2, which
# processes $inout0 and $inout1 in parallel.
#   $p - operation suffix ("enc" is the only one used in this file);
#        selects the aes${p} / aes${p}last emitters.
# The emitted code turns $rounds into a byte offset (shl by 4), points
# $key at 32+offset past its original value and then walks the schedule
# with a negative index that counts up towards zero, two rounds per
# iteration.  $rounds and $key are modified.
sub aesni_generate2
{ my $p=shift;
  # Look the round emitters up by name once.  Unlike string eval, a
  # failure inside them propagates instead of being silently discarded.
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};
  my @lanes=($inout0,$inout1);

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}2_loop");
	foreach my $lane (@lanes) { &$aes($lane,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $lane (@lanes) { &$aes($lane,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}2_loop"));	# tests the add above
    foreach my $lane (@lanes) { &$aes($lane,$rndkey1); }
    foreach my $lane (@lanes) { &$aeslast($lane,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}

# aesni_generate3(p) emits the private function _aesni_${p}rypt3, which
# processes $inout0..$inout2 in parallel.
#   $p - operation suffix ("enc" is the only one used in this file);
#        selects the aes${p} / aes${p}last emitters.
# The emitted code turns $rounds into a byte offset (shl by 4), points
# $key at 32+offset past its original value and then walks the schedule
# with a negative index that counts up towards zero, two rounds per
# iteration.  $rounds and $key are modified.
sub aesni_generate3
{ my $p=shift;
  # Look the round emitters up by name once.  Unlike string eval, a
  # failure inside them propagates instead of being silently discarded.
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};
  my @lanes=($inout0,$inout1,$inout2);

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}3_loop");
	foreach my $lane (@lanes) { &$aes($lane,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $lane (@lanes) { &$aes($lane,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}3_loop"));	# tests the add above
    foreach my $lane (@lanes) { &$aes($lane,$rndkey1); }
    foreach my $lane (@lanes) { &$aeslast($lane,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}

# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# aesni_generate4(p) emits the private function _aesni_${p}rypt4, which
# processes $inout0..$inout3 in parallel.
#   $p - operation suffix ("enc" is the only one used in this file);
#        selects the aes${p} / aes${p}last emitters.
# Round-key addressing follows the same negative-index scheme as the
# 2x/3x variants; $rounds and $key are modified.
sub aesni_generate4
{ my $p=shift;
  # Look the round emitters up by name once.  Unlike string eval, a
  # failure inside them propagates instead of being silently discarded.
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};
  my @lanes=($inout0,$inout1,$inout2,$inout3);

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&data_byte	(0x0f,0x1f,0x40,0x00);	# raw 4-byte filler ahead of the loop
	&add		($rounds,16);

    &set_label("${p}4_loop");
	foreach my $lane (@lanes) { &$aes($lane,$rndkey1); }
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $lane (@lanes) { &$aes($lane,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}4_loop"));	# tests the add above

    foreach my $lane (@lanes) { &$aes($lane,$rndkey1); }
    foreach my $lane (@lanes) { &$aeslast($lane,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}

# aesni_generate6(p) emits the private function _aesni_${p}rypt6, which
# processes $inout0..$inout5 in parallel.
#   $p - operation suffix ("enc" is the only one used in this file);
#        selects the aes${p} / aes${p}last emitters.
# Besides the function itself, the static label
# _aesni_${p}rypt6_enter is exported so that a caller which has already
# performed the first round itself can jump into the middle of the
# loop.  The prologue interleaves the initial key xor with the first
# round of lanes 0..2 and enters the loop at the "_inner" label to
# finish that round for lanes 3..5.  $rounds and $key are modified.
sub aesni_generate6
{ my $p=shift;
  # Look the round emitters up by name once.  Unlike string eval, a
  # failure inside them propagates instead of being silently discarded.
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};
  my @low=($inout0,$inout1,$inout2);
  my @high=($inout3,$inout4,$inout5);

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	&$aes		($inout0,$rndkey1);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	&$aes		($inout1,$rndkey1);
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&$aes		($inout2,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
	foreach my $lane (@low) { &$aes($lane,$rndkey1); }
    &set_label("_aesni_${p}rypt6_inner");
	foreach my $lane (@high) { &$aes($lane,$rndkey1); }
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	foreach my $lane (@low,@high) { &$aes($lane,$rndkey0); }
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}6_loop"));	# tests the add above

    foreach my $lane (@low,@high) { &$aes($lane,$rndkey1); }
    foreach my $lane (@low,@high) { &$aeslast($lane,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# The interleaved encrypt helpers are only emitted for the AES-NI
# flavoured build.
if ($PREFIX eq $AESNI_PREFIX) {
    &aesni_generate2("enc");
    &aesni_generate3("enc");
    &aesni_generate4("enc");
    &aesni_generate6("enc");
}

if ($PREFIX eq $AESNI_PREFIX) {

######################################################################
# void $PREFIX_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# stack layout (offsets from the aligned %esp):
#	0	pshufb mask
#	16	vector addend: 0,6,6,6
#	32	counter-less ivec
#	48	1st triplet of counter vector
#	64	2nd triplet of counter vector
#	80	saved %esp

&function_begin("${PREFIX}_ctr32_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));		# ivec pointer, for now
	&mov	($key_,"esp");
	&sub	("esp",88);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(80,"esp"),$key_);		# remember caller's %esp

	&cmp	($len,1);
	&je	(&label("ctr32_one_shortcut"));

	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec

	# compose byte-swap control mask for pshufb on stack
	my @bswap_mask=(0x0c0d0e0f,0x08090a0b,0x04050607,0x00010203);
	foreach my $i (0..$#bswap_mask)
	{   &mov	(&DWP(4*$i,"esp"),$bswap_mask[$i]);	}

	# compose counter increment vector on stack
	&mov	($rounds,6);
	&xor	($key_,$key_);
	foreach my $off (16,20,24)
	{   &mov	(&DWP($off,"esp"),$rounds);	}
	&mov	(&DWP(28,"esp"),$key_);

	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter

	&mov	($rounds,&DWP(240,$key));	# key->rounds

	# compose 2 vectors of 3x32-bit counters
	&bswap	($rounds_);
	&pxor	($rndkey0,$rndkey0);
	&pxor	($rndkey1,$rndkey1);
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
	&pinsrd	($rndkey0,$rounds_,0);
	&lea	($key_,&DWP(3,$rounds_));
	&pinsrd	($rndkey1,$key_,0);
	foreach my $slot (1,2)
	{   &inc	($rounds_);
	    &pinsrd	($rndkey0,$rounds_,$slot);
	    &inc	($key_);
	    &pinsrd	($rndkey1,$key_,$slot);
	}
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&movdqu	($inout4,&QWP(0,$key));		# key[0]
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap

	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
	&pshufd	($inout1,$rndkey0,2<<6);
	&cmp	($len,6);
	&jb	(&label("ctr32_tail"));
	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
	&mov	($key_,$key);			# backup $key
	&sub	($rounds_,$rounds);		# backup twisted $rounds
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($len,6);
	&jmp	(&label("ctr32_loop6"));

&set_label("ctr32_loop6",16);
	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
	&pshufd	($inout2,$rndkey0,1<<6);
	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
	&pshufd	($inout3,$rndkey1,3<<6);
	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
	&pshufd	($inout4,$rndkey1,2<<6);
	&pxor		($inout1,$rndkey0);
	&pshufd	($inout5,$rndkey1,1<<6);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&aesenc		($inout0,$rndkey1);
	&pxor		($inout4,$rndkey0);
	&pxor		($inout5,$rndkey0);
	&aesenc		($inout1,$rndkey1);
	&$movekey	($rndkey0,&QWP(32,$key_));
	&mov		($rounds,$rounds_);
	foreach my $lane ($inout2,$inout3,$inout4,$inout5)
	{   &aesenc	($lane,$rndkey1);	}

	# first round is done; join the shared 6x loop mid-way
	&call		(&label("_aesni_encrypt6_enter"));

	# xor the key stream with the input while preparing the next
	# batch of counters
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
	&xorps	($inout2,$rndkey1);
	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);

	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask

	&movups	($inout1,&QWP(0x30,$inp));
	&movups	($inout2,&QWP(0x40,$inp));
	&xorps	($inout3,$inout1);
	&movups	($inout1,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&xorps	($inout4,$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&xorps	($inout5,$inout1);
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap
	&movups	(&QWP(0x40,$out),$inout4);
	&pshufd	($inout0,$rndkey0,3<<6);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));

	&pshufd	($inout1,$rndkey0,2<<6);
	&sub	($len,6);
	&jnc	(&label("ctr32_loop6"));

	&add	($len,6);			# undo the last subtraction
	&jz	(&label("ctr32_ret"));
	&movdqu	($inout5,&QWP(0,$key_));
	&mov	($key,$key_);
	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
	&mov	($rounds,&DWP(240,$key_));	# restore $rounds

&set_label("ctr32_tail");
	# 1..5 blocks remain: build only as many counter blocks as
	# needed, then dispatch on $len
	&por	($inout0,$inout5);
	&cmp	($len,2);
	&jb	(&label("ctr32_one"));

	&pshufd	($inout2,$rndkey0,1<<6);
	&por	($inout1,$inout5);
	&je	(&label("ctr32_two"));

	&pshufd	($inout3,$rndkey1,3<<6);
	&por	($inout2,$inout5);
	&cmp	($len,4);
	&jb	(&label("ctr32_three"));

	&pshufd	($inout4,$rndkey1,2<<6);
	&por	($inout3,$inout5);
	&je	(&label("ctr32_four"));

	# five blocks
	&por	($inout4,$inout5);
	&call	("_aesni_encrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout2,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout4,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_one_shortcut",16);
	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
	&mov	($rounds,&DWP(240,$key));

&set_label("ctr32_one");
	if (!$inline)
	{   &call	("_aesni_encrypt1");	}
	else
	{   &aesni_inline_generate1("enc");	}
	&movups	($in0,&QWP(0,$inp));
	&xorps	($in0,$inout0);
	&movups	(&QWP(0,$out),$in0);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_two",16);
	&call	("_aesni_encrypt2");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_three",16);
	&call	("_aesni_encrypt3");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&movups	($inout5,&QWP(0x20,$inp));
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_four",16);
	&call	("_aesni_encrypt4");
	&movups	($inout4,&QWP(0,$inp));
	&movups	($inout5,&QWP(0x10,$inp));
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout0,$inout4);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout1,$inout5);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);

&set_label("ctr32_ret");
	# clear register bank, and the stack slots at 32/48/64 using the
	# already zeroed xmm0
	foreach my $n (0..4)
	{   &pxor	("xmm$n","xmm$n");	}
	foreach my $i (0..2)
	{   my $reg="xmm".(5+$i);
	    &movdqa	(&QWP(32+16*$i,"esp"),"xmm0");
	    &pxor	($reg,$reg);
	}
	&mov	("esp",&DWP(80,"esp"));		# back to caller's %esp
&function_end("${PREFIX}_ctr32_encrypt_blocks");
}

######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
#	"eax"	const unsigned char *userKey
#	$rounds	int bits
#	$key	AES_KEY *key
# output:
#	"eax"	return code
#	$rounds	rounds

# Emits the 7-instruction shift/xor cascade used by the "alt" key
# schedules on register $reg, with xmm3 as scratch:
#   xmm3 = reg; then three times reg <<= 32 bits (pslldq by 4 bytes),
#   folding reg into xmm3 after the first two shifts and xmm3 into reg
#   after the third.
my $emit_prefix_xor = sub {
    my $reg=shift;
	&movdqa		("xmm3",$reg);
	&pslldq		($reg,4);
	&pxor		("xmm3",$reg);
	&pslldq		($reg,4);
	&pxor		("xmm3",$reg);
	&pslldq		($reg,4);
	&pxor		($reg,"xmm3");
};

&function_begin_B("_aesni_set_encrypt_key");
	&push	("ebp");
	&push	("ebx");
	# reject NULL userKey / key pointers
	&test	("eax","eax");
	&jz	(&label("bad_pointer"));
	&test	($key,$key);
	&jz	(&label("bad_pointer"));

	# position-independent address of key_const in ebx
	&call	(&label("pic"));
&set_label("pic");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));

	&picmeup("ebp","GFp_ia32cap_P","ebx",&label("key_const"));
	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	&mov	("ebp",&DWP(4,"ebp"));
	&lea	($key,&DWP(16,$key));
	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
	&cmp	($rounds,256);
	&je	(&label("14rounds"));
	# 192-bit key support was removed.
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

&set_label("10rounds",16);
	&cmp		("ebp",1<<28);
	&je		(&label("10rounds_alt"));

	&mov		($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	# rounds 1..10: the first goes through the "cold" entry, which
	# skips storing the previous round key
	{   my @rcon=(0x01,0x2,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,0x36);
	    my $entry="key_128_cold";
	    foreach my $rc (@rcon)
	    {	&aeskeygenassist("xmm1","xmm0",$rc);
		&call		(&label($entry));
		$entry="key_128";
	    }
	}
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(80,$key),$rounds);

	&jmp	(&label("good_key"));

&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_128_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("10rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x00,"ebx"));	# pshufb mask
	&mov		($rounds,8);
	&movdqa		("xmm4",&QWP(0x20,"ebx"));
	&movdqa		("xmm2","xmm0");
	&movdqu		(&QWP(-16,$key),"xmm0");

&set_label("loop_key128");
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld		("xmm4",1);
	&lea		($key,&DWP(16,$key));

	&$emit_prefix_xor("xmm2");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(-16,$key),"xmm0");
	&movdqa		("xmm2","xmm0");

	&dec		($rounds);
	&jnz		(&label("loop_key128"));

	# the last two round keys use the constant at key_const+0x30
	&movdqa		("xmm4",&QWP(0x30,"ebx"));

	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld		("xmm4",1);

	&$emit_prefix_xor("xmm2");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(0,$key),"xmm0");

	&movdqa		("xmm2","xmm0");
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");

	&$emit_prefix_xor("xmm2");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(16,$key),"xmm0");

	&mov		($rounds,9);
	&mov		(&DWP(96,$key),$rounds);

	&jmp	(&label("good_key"));

# 192-bit key support was removed.

&set_label("14rounds",16);
	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&lea		($key,&DWP(16,$key));
	&cmp		("ebp",1<<28);
	&je		(&label("14rounds_alt"));

	&mov		($rounds,13);
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	# rounds 2..13 alternate between the "a" and "b" helpers with
	# the same constant; the very first "a" step uses the cold entry
	{   my $entry_a="key_256a_cold";
	    foreach my $rc (0x01,0x02,0x04,0x08,0x10,0x20)
	    {	&aeskeygenassist("xmm1","xmm2",$rc);
		&call		(&label($entry_a));
		$entry_a="key_256a";
		&aeskeygenassist("xmm1","xmm0",$rc);
		&call		(&label("key_256b"));
	    }
	}
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
	&call		(&label("key_256a"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(16,$key),$rounds);
	&xor		("eax","eax");

	&jmp	(&label("good_key"));

&set_label("key_256a",16);
	&$movekey	(&QWP(0,$key),"xmm2");
	&lea		($key,&DWP(16,$key));
&set_label("key_256a_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("key_256b",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));

	&shufps		("xmm4","xmm2",0b00010000);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm4","xmm2",0b10001100);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm1","xmm1",0b10101010);	# critical path
	&xorps		("xmm2","xmm1");
	&ret();

&set_label("14rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x00,"ebx"));	# pshufb mask
	&movdqa		("xmm4",&QWP(0x20,"ebx"));
	&mov		($rounds,7);
	&movdqu		(&QWP(-32,$key),"xmm0");
	&movdqa		("xmm1","xmm2");
	&movdqu		(&QWP(-16,$key),"xmm2");

&set_label("loop_key256");
	&pshufb		("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");

	&$emit_prefix_xor("xmm0");
	&pslld		("xmm4",1);

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(0,$key),"xmm0");

	&dec		($rounds);
	&jz		(&label("done_key256"));

	&pshufd		("xmm2","xmm0",0xff);
	&pxor		("xmm3","xmm3");
	&aesenclast	("xmm2","xmm3");

	&$emit_prefix_xor("xmm1");

	&pxor		("xmm2","xmm1");
	&movdqu		(&QWP(16,$key),"xmm2");
	&lea		($key,&DWP(32,$key));
	&movdqa		("xmm1","xmm2");
	&jmp		(&label("loop_key256"));

&set_label("done_key256");
	&mov		($rounds,13);
	&mov		(&DWP(16,$key),$rounds);

&set_label("good_key");
	# scrub the registers used above, then return 0
	foreach my $n (0..5)
	{   &pxor	("xmm$n","xmm$n");	}
	&xor	("eax","eax");
	&pop	("ebx");
	&pop	("ebp");
	&ret	();

&set_label("bad_pointer",4);
	&mov	("eax",-1);
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
&set_label("bad_keybits",4);
	&pxor	("xmm0","xmm0");
	&mov	("eax",-2);
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
&function_end_B("_aesni_set_encrypt_key");

# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public wrapper: loads the three stack arguments into the registers
# the private routine expects ("eax", $rounds, $key, in argument
# order) and returns whatever it leaves in "eax".
&function_begin_B("${PREFIX}_set_encrypt_key");
	{   my @arg_regs=("eax",$rounds,$key);
	    foreach my $i (0..$#arg_regs)
	    {	&mov	($arg_regs[$i],&wparam($i));	}
	}
	&call	("_aesni_set_encrypt_key");
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");

# Constant table addressed through "ebx" by _aesni_set_encrypt_key,
# aligned to 64 bytes.  Offsets 0x00, 0x20 and 0x30 are loaded by the
# "alt" key-schedule paths; the row at 0x10 is not referenced by the
# code in this file.
&set_label("key_const",64);
&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);	# 0x00: pshufb mask
&data_word(0x04070605,0x04070605,0x04070605,0x04070605);	# 0x10
&data_word(1,1,1,1);						# 0x20
&data_word(0x1b,0x1b,0x1b,0x1b);				# 0x30
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

# Buffered write errors only surface at close; report the OS error too.
close STDOUT or die "error closing STDOUT: $!";
