• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#! /usr/bin/env perl
2# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10######################################################################
11## Constant-time SSSE3 AES core implementation.
12## version 0.1
13##
14## By Mike Hamburg (Stanford University), 2009
15## Public domain.
16##
17## For details see http://shiftleft.org/papers/vector_aes/ and
18## http://crypto.stanford.edu/vpaes/.
19
20######################################################################
21# September 2011.
22#
23# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
24# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
25# doesn't handle partial vectors (doesn't have to if called from
26# EVP only). "Drop-in" implies that this module doesn't share key
27# schedule structure with the original nor does it make assumption
28# about its alignment...
29#
30# Performance summary. aes-586.pl column lists large-block CBC
31# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
32# byte processed with 128-bit key, and vpaes-x86.pl column - [also
33# large-block CBC] encrypt/decrypt.
34#
35#		aes-586.pl		vpaes-x86.pl
36#
37# Core 2(**)	28.1/41.4/18.3		21.9/25.2(***)
38# Nehalem	27.9/40.4/18.1		10.2/11.9
39# Atom		70.7/92.1/60.1		61.1/75.4(***)
40# Silvermont	45.4/62.9/24.1		49.2/61.1(***)
41#
42# (*)	"Hyper-threading" in the context refers rather to cache shared
43#	among multiple cores, than to specifically Intel HTT. As vast
44#	majority of contemporary cores share cache, slower code path
45#	is common place. In other words "with-hyper-threading-off"
46#	results are presented mostly for reference purposes.
47#
48# (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
49#
50# (***)	Less impressive improvement on Core 2 and Atom is due to slow
51#	pshufb,	yet it's respectable +28%/64%  improvement on Core 2
52#	and +15% on Atom (as implied, over "hyper-threading-safe"
53#	code path).
54#
55#						<appro@openssl.org>
56
# Locate the directory this script lives in so the shared perlasm helpers
# can be found, then load the 32-bit x86 assembler back end.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file; STDOUT is aliased
# to it so everything the generator prints lands there.  The three-argument
# form of open keeps the file name from being parsed as part of the mode,
# and a failure is reported instead of silently producing no output.
$output = pop;
open OUT,">",$output or die "can't open $output: $!";
*STDOUT=*OUT;

&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");

$PREFIX="vpaes";

# Fixed register roles used by every routine in this file.
my  ($round, $base, $magic, $key, $const, $inp, $out)=
    ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");

# Each directive is its own statement.  Without the ';' after
# preprocessor_ifdef() Perl parsed it and the following external_label()
# call as the two operands of a single bitwise '&' expression.
&preprocessor_ifdef("BORINGSSL_DISPATCH_TEST");
&external_label("BORINGSSL_function_hit");
&preprocessor_endif();
&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");
77
# Constant pool.  The public entry points load $const with the address of
# _vpaes_consts+0x30, so every $k_* value below is a byte offset relative
# to that point; the first two tables therefore have negative offsets.
&set_label("_vpaes_consts", 64);

$k_inv = -0x30;			# inv, inva
	&data_word(0x0D080180, 0x0E05060F, 0x0A0B0C02, 0x04070309);
	&data_word(0x0F0B0780, 0x01040A06, 0x02050809, 0x030D0E0C);

$k_s0F = -0x10;			# low-nibble mask, 0x0F in every byte
	&data_word((0x0F0F0F0F) x 4);

$k_ipt = 0x00;			# input transform (lo, hi)
	&data_word(0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090);
	&data_word(0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC);

$k_sb1 = 0x20;			# sb1u, sb1t
	&data_word(0xCB503E00, 0xB19BE18F, 0x142AF544, 0xA5DF7A6E);
	&data_word(0xFAE22300, 0x3618D415, 0x0D2ED9EF, 0x3BF7CCC1);

$k_sb2 = 0x40;			# sb2u, sb2t
	&data_word(0x0B712400, 0xE27A93C6, 0xBC982FCD, 0x5EB7E955);
	&data_word(0x0AE12900, 0x69EB8840, 0xAB82234A, 0xC2A163C8);

$k_sbo = 0x60;			# sbou, sbot
	&data_word(0x6FBDC700, 0xD0D26D17, 0xC502A878, 0x15AABF7A);
	&data_word(0x5FBB6A00, 0xCFE474A5, 0x412B35FA, 0x8E1E90D1);

$k_mc_forward = 0x80;		# mc_forward, four 16-byte rows
	&data_word(0x00030201, 0x04070605, 0x080B0A09, 0x0C0F0E0D);
	&data_word(0x04070605, 0x080B0A09, 0x0C0F0E0D, 0x00030201);
	&data_word(0x080B0A09, 0x0C0F0E0D, 0x00030201, 0x04070605);
	&data_word(0x0C0F0E0D, 0x00030201, 0x04070605, 0x080B0A09);

$k_mc_backward = 0xc0;		# mc_backward, four 16-byte rows
	&data_word(0x02010003, 0x06050407, 0x0A09080B, 0x0E0D0C0F);
	&data_word(0x0E0D0C0F, 0x02010003, 0x06050407, 0x0A09080B);
	&data_word(0x0A09080B, 0x0E0D0C0F, 0x02010003, 0x06050407);
	&data_word(0x06050407, 0x0A09080B, 0x0E0D0C0F, 0x02010003);

$k_sr = 0x100;			# sr, four 16-byte rows
	&data_word(0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C);
	&data_word(0x0F0A0500, 0x030E0904, 0x07020D08, 0x0B06010C);
	&data_word(0x0B020900, 0x0F060D04, 0x030A0108, 0x070E050C);
	&data_word(0x070A0D00, 0x0B0E0104, 0x0F020508, 0x0306090C);

$k_rcon = 0x140;		# rcon
	&data_word(0xAF9DEEB6, 0x1F8391B9, 0x4D7C7D81, 0x702A9808);

$k_s63 = 0x150;			# s63: all equal to 0x63 transformed
	&data_word((0x5B5B5B5B) x 4);

$k_opt = 0x160;			# output transform
	&data_word(0xD6B66000, 0xFF9F4929, 0xDEBE6808, 0xF7974121);
	&data_word(0x50BCEC00, 0x01EDBD51, 0xB05C0CE0, 0xE10D5DB1);

$k_deskew = 0x180;		# deskew tables: inverts the sbox's "skew"
	&data_word(0x47A4E300, 0x07E4A340, 0x5DBEF91A, 0x1DFEB95A);
	&data_word(0x83EA6900, 0x5F36B5DC, 0xF49D1E77, 0x2841C2AB);

&asciz("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align(64);
134
##
##  _vpaes_preheat
##
##  Turns the position-independent offset that the caller placed in $const
##  into an absolute address by adding the return address found at the top
##  of the stack, then loads the two tables the encrypt core keeps resident:
##     %xmm7 = table at $k_inv
##     %xmm6 = mask  at $k_s0F
##
&function_begin_B('_vpaes_preheat');
	&add	($const, &DWP(0, 'esp'));	# + return address
	&movdqa	('xmm7', &QWP($k_inv, $const));
	&movdqa	('xmm6', &QWP($k_s0F, $const));
	&ret	();
&function_end_B('_vpaes_preheat');
141
##
##  _vpaes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm6-%xmm7 as in _vpaes_preheat
##     %ebp  = base address for the $k_* constant offsets
##    (%edx) = scheduled keys, with the round count stored at offset 240
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##
&function_begin_B("_vpaes_encrypt_core");
	&mov	($magic,16);
	&mov	($round,&DWP(240,$key));
	# This statement used to lack its terminating ';'.  Perl then parsed
	# it and the next line as one bitwise-AND expression, which emitted
	# the right instructions only because operands are evaluated left to
	# right.  Terminating it makes the intent explicit.
	&movdqa	("xmm1","xmm6");
	&movdqa	("xmm2",&QWP($k_ipt,$const));
	&pandn	("xmm1","xmm0");
	&pand	("xmm0","xmm6");
	&movdqu	("xmm5",&QWP(0,$key));		# round key 0 (unaligned load)
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP($k_ipt+16,$const));
	&pxor	("xmm2","xmm5");
	&psrld	("xmm1",4);
	&add	($key,16);
	&pshufb	("xmm0","xmm1");
	&lea	($base,&DWP($k_mc_backward,$const));
	&pxor	("xmm0","xmm2");
	# First pass enters mid-loop.  The jnz at the bottom of enc_entry
	# then tests the flags left by the add above, as no instruction in
	# between is an integer ALU operation.
	&jmp	(&label("enc_entry"));


&set_label("enc_loop",16);
	# middle of middle round
	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
	&pshufb	("xmm4","xmm2");		# 4 = sb1u
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
	&movdqa	("xmm5",&QWP($k_sb2,$const));	# 5 : sb2u
	&pxor	("xmm0","xmm4");		# 0 = A
	&movdqa	("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
	&pshufb	("xmm5","xmm2");		# 5 = sb2u
	&movdqa	("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
	&movdqa	("xmm4",&QWP(0,$base,$magic));	# .Lk_mc_backward[]
	&pshufb	("xmm2","xmm3");		# 2 = sb2t
	&movdqa	("xmm3","xmm0");		# 3 = A
	&pxor	("xmm2","xmm5");		# 2 = 2A
	&pshufb	("xmm0","xmm1");		# 0 = B
	&add	($key,16);			# next key
	&pxor	("xmm0","xmm2");		# 0 = 2A+B
	&pshufb	("xmm3","xmm4");		# 3 = D
	&add	($magic,16);			# next mc
	&pxor	("xmm3","xmm0");		# 3 = 2A+B+D
	&pshufb	("xmm0","xmm1");		# 0 = 2B+C
	&and	($magic,0x30);			# ... mod 4
	&sub	($round,1);			# nr--, sets ZF for the jnz below
	&pxor	("xmm0","xmm3");		# 0 = 2A+3B+C+D

&set_label("enc_entry");
	# top of round
	&movdqa	("xmm1","xmm6");		# 1 : i
	&movdqa	("xmm5",&QWP($k_inv+16,$const));# 5 : a/k
	&pandn	("xmm1","xmm0");		# 1 = i<<4
	&psrld	("xmm1",4);			# 1 = i
	&pand	("xmm0","xmm6");		# 0 = k
	&pshufb	("xmm5","xmm0");		# 5 = a/k
	&movdqa	("xmm3","xmm7");		# 3 : 1/i
	&pxor	("xmm0","xmm1");		# 0 = j
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&movdqa	("xmm4","xmm7");		# 4 : 1/j
	&pxor	("xmm3","xmm5");		# 3 = iak = 1/i + a/k
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
	&pxor	("xmm4","xmm5");		# 4 = jak = 1/j + a/k
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
	&pxor	("xmm2","xmm0");		# 2 = io
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&movdqu	("xmm5",&QWP(0,$key));		# 5 = next round key
	&pxor	("xmm3","xmm1");		# 3 = jo
	&jnz	(&label("enc_loop"));

	# middle of last round
	&movdqa	("xmm4",&QWP($k_sbo,$const));	# 4 : sbou      .Lk_sbo
	&movdqa	("xmm0",&QWP($k_sbo+16,$const));# 0 : sbot      .Lk_sbo+16
	&pshufb	("xmm4","xmm2");		# 4 = sbou
	&pxor	("xmm4","xmm5");		# 4 = sbou + k
	&pshufb	("xmm0","xmm3");		# 0 = sbot
	&movdqa	("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
	&pxor	("xmm0","xmm4");		# 0 = A
	&pshufb	("xmm0","xmm1");
	&ret	();
&function_end_B("_vpaes_encrypt_core");
237
########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
##
##  _vpaes_schedule_core
##
##  Register roles as read by the code below:
##     %esi = user key (loaded unaligned)
##     %eax = key length in bits (only compared against 192)
##     %edx = where round keys are written
##     %ecx = offset into the $k_sr rows
##     %edi = zero when encrypting, non-zero when decrypting
##     %ebp = PIC offset of the constants, fixed up on entry
##
##  Stack use: the caller's 16-byte aligned %esp appears here as 4(%esp),
##  because this routine's return address sits on top of it.  That slot
##  holds rcon (the "xmm8" of the 64-bit code); the next 16 bytes,
##  20(%esp), are scratch for %xmm7 in the 256-bit loop.
##
##  All eight XMM registers are zeroed before returning.
##
&function_begin_B('_vpaes_schedule_core');
	&add	($const, &DWP(0, 'esp'));	# make $const absolute
	&movdqu	('xmm0', &QWP(0, $inp));	# first 16 key bytes
	&movdqa	('xmm2', &QWP($k_rcon, $const));

	# Run the key through the input transform; keep the raw copy in xmm3.
	&movdqa	('xmm3', 'xmm0');
	&lea	($base, &DWP($k_ipt, $const));
	&movdqa	(&QWP(4, 'esp'), 'xmm2');	# park rcon ("xmm8")
	&call	('_vpaes_schedule_transform');
	&movdqa	('xmm7', 'xmm0');

	&test	($out, $out);
	&jnz	(&label('schedule_am_decrypting'));

	# Encrypting: round key 0 is the transformed key itself.
	&movdqu	(&QWP(0, $key), 'xmm0');
	&jmp	(&label('schedule_go'));

&set_label('schedule_am_decrypting');
	# Decrypting: round key 0 is the raw key permuted through $k_sr.
	&movdqa	('xmm1', &QWP($k_sr, $const, $magic));
	&pshufb	('xmm3', 'xmm1');
	&movdqu	(&QWP(0, $key), 'xmm3');
	&xor	($magic, 0x30);

&set_label('schedule_go');
	&cmp	($round, 192);
	&ja	(&label('schedule_256'));
	# There is no 192-bit path; anything not above 192 falls through
	# into the 128-bit schedule.

##
##  .schedule_128
##
##  Ten schedule rounds; each one except the last is written out through
##  _vpaes_schedule_mangle, the last through schedule_mangle_last.
##
&set_label('schedule_128');
	&mov	($round, 10);

&set_label('loop_schedule_128');
	&call	('_vpaes_schedule_round');
	&dec	($round);
	&jz	(&label('schedule_mangle_last'));
	&call	('_vpaes_schedule_mangle');	# store this round key
	&jmp	(&label('loop_schedule_128'));

##
##  .schedule_256
##
##  Like the 128-bit schedule, but the second 16 key bytes form a "low
##  side" that is carried in %xmm6.  Low rounds enter the round function
##  at _vpaes_schedule_low_round, i.e. without rcon and rotation.
##
&set_label('schedule_256', 16);
	&movdqu	('xmm0', &QWP(16, $inp));	# second 16 key bytes
	&call	('_vpaes_schedule_transform');
	&mov	($round, 7);

&set_label('loop_schedule_256');
	&call	('_vpaes_schedule_mangle');	# store low result
	&movdqa	('xmm6', 'xmm0');		# remember cur_lo

	# high round
	&call	('_vpaes_schedule_round');
	&dec	($round);
	&jz	(&label('schedule_mangle_last'));
	&call	('_vpaes_schedule_mangle');

	# low round: run it on xmm6 by lending it the xmm7 slot
	&pshufd	('xmm0', 'xmm0', 0xFF);
	&movdqa	(&QWP(20, 'esp'), 'xmm7');
	&movdqa	('xmm7', 'xmm6');
	&call	('_vpaes_schedule_low_round');
	&movdqa	('xmm7', &QWP(20, 'esp'));

	&jmp	(&label('loop_schedule_256'));

##
##  .schedule_mangle_last
##
##  Writes the final round key from %xmm0.
##    encrypting: permute through $k_sr, then transform with $k_opt,
##                stored 16 bytes after the previous key
##    decrypting: transform with $k_deskew, stored 16 bytes before it
##  Both cases xor in $k_s63 first.  Falls into cleanup and returns.
##
&set_label('schedule_mangle_last', 16);
	&lea	($base, &DWP($k_deskew, $const));	# decrypt default
	&test	($out, $out);
	&jnz	(&label('schedule_mangle_last_dec'));

	# encrypting
	&movdqa	('xmm1', &QWP($k_sr, $const, $magic));
	&pshufb	('xmm0', 'xmm1');		# output permute
	&lea	($base, &DWP($k_opt, $const));	# switch to output transform
	&add	($key, 32);			# net +16 after the -16 below

&set_label('schedule_mangle_last_dec');
	&add	($key, -16);
	&pxor	('xmm0', &QWP($k_s63, $const));
	&call	('_vpaes_schedule_transform');
	&movdqu	(&QWP(0, $key), 'xmm0');	# last round key

	# Wipe every XMM register before returning.
	for my $n (0 .. 7) {
		&pxor	("xmm$n", "xmm$n");
	}
	&ret	();
&function_end_B('_vpaes_schedule_core');
366
##
##  _vpaes_schedule_round
##
##  One main round of the key schedule on %xmm0, %xmm7.
##
##  High-round prologue:
##    - takes the top byte of the rcon vector kept on the stack (the
##      "xmm8" slot, 8(%esp) from here), xors it into %xmm7 and stores the
##      vector back rotated by one byte for the next round;
##    - broadcasts the high dword of %xmm0 and rotates it by one byte.
##
##  Common part, also callable as _vpaes_schedule_low_round (which skips
##  the prologue above, so no rcon and no rotation):
##    - smears the dwords of %xmm7: low into second, result into third,
##      result into highest, then xors in $k_s63;
##    - runs the sbox lookup on %xmm0;
##    - xors the two together.
##
##  Returns the result in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm5.
##
&function_begin_B('_vpaes_schedule_round');
	# pull rcon out of its stack slot
	&movdqa	('xmm2', &QWP(8, 'esp'));
	&pxor	('xmm1', 'xmm1');
	&palignr('xmm1', 'xmm2', 15);
	&palignr('xmm2', 'xmm2', 15);
	&pxor	('xmm7', 'xmm1');

	# rotate
	&pshufd	('xmm0', 'xmm0', 0xFF);
	&palignr('xmm0', 'xmm0', 1);

	# write the rotated rcon back, then fall through
	&movdqa	(&QWP(8, 'esp'), 'xmm2');

&set_label('_vpaes_schedule_low_round');
	# smear xmm7
	&movdqa	('xmm1', 'xmm7');
	&pslldq	('xmm7', 4);
	&pxor	('xmm7', 'xmm1');
	&movdqa	('xmm1', 'xmm7');
	&pslldq	('xmm7', 8);
	&pxor	('xmm7', 'xmm1');
	&pxor	('xmm7', &QWP($k_s63, $const));

	# subbytes on xmm0
	&movdqa	('xmm4', &QWP($k_s0F, $const));		# 4 : nibble mask
	&movdqa	('xmm5', &QWP($k_inv, $const));		# 5 : inv table
	&movdqa	('xmm1', 'xmm4');
	&pandn	('xmm1', 'xmm0');
	&psrld	('xmm1', 4);				# 1 = i
	&pand	('xmm0', 'xmm4');			# 0 = k
	&movdqa	('xmm2', &QWP($k_inv+16, $const));	# 2 : a/k
	&pshufb	('xmm2', 'xmm0');			# 2 = a/k
	&pxor	('xmm0', 'xmm1');			# 0 = j
	&movdqa	('xmm3', 'xmm5');			# 3 : 1/i
	&pshufb	('xmm3', 'xmm1');			# 3 = 1/i
	&pxor	('xmm3', 'xmm2');			# 3 = iak = 1/i + a/k
	&movdqa	('xmm4', 'xmm5');			# 4 : 1/j
	&pshufb	('xmm4', 'xmm0');			# 4 = 1/j
	&pxor	('xmm4', 'xmm2');			# 4 = jak = 1/j + a/k
	&movdqa	('xmm2', 'xmm5');			# 2 : 1/iak
	&pshufb	('xmm2', 'xmm3');			# 2 = 1/iak
	&pxor	('xmm2', 'xmm0');			# 2 = io
	&movdqa	('xmm3', 'xmm5');			# 3 : 1/jak
	&pshufb	('xmm3', 'xmm4');			# 3 = 1/jak
	&pxor	('xmm3', 'xmm1');			# 3 = jo
	&movdqa	('xmm4', &QWP($k_sb1, $const));		# 4 : sb1u
	&pshufb	('xmm4', 'xmm2');			# 4 = sb1u
	&movdqa	('xmm0', &QWP($k_sb1+16, $const));	# 0 : sb1t
	&pshufb	('xmm0', 'xmm3');			# 0 = sb1t
	&pxor	('xmm0', 'xmm4');			# 0 = sbox output

	# combine with the smeared key
	&pxor	('xmm0', 'xmm7');
	&movdqa	('xmm7', 'xmm0');
	&ret	();
&function_end_B('_vpaes_schedule_round');
444
##
##  _vpaes_schedule_transform
##
##  Applies a nibble-wise table transform to %xmm0: the low nibbles index
##  the 16-byte table at 0(%ebx), the high nibbles the table at 16(%ebx),
##  and the two lookups are xored together.
##
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
&function_begin_B('_vpaes_schedule_transform');
	&movdqa	('xmm2', &QWP($k_s0F, $const));	# nibble mask
	&movdqa	('xmm1', 'xmm2');
	&pandn	('xmm1', 'xmm0');
	&psrld	('xmm1', 4);			# 1 = high nibbles
	&pand	('xmm0', 'xmm2');		# 0 = low nibbles
	&movdqa	('xmm2', &QWP(0, $base));
	&pshufb	('xmm2', 'xmm0');		# low-nibble lookup
	&movdqa	('xmm0', &QWP(16, $base));
	&pshufb	('xmm0', 'xmm1');		# high-nibble lookup
	&pxor	('xmm0', 'xmm2');
	&ret	();
&function_end_B('_vpaes_schedule_transform');
466
##
##  _vpaes_schedule_mangle
##
##  Converts %xmm0 from the (basis-transformed) standard form into the
##  form stored in the key schedule, and stores it.
##
##  Encrypting (%edi == 0):
##    xor with $k_s63
##    multiply by circulant 0,1,1,1 (three $k_mc_forward shuffles)
##    apply the $k_sr row selected by %ecx
##
##  Decrypting (%edi != 0):
##    four rounds of paired nibble lookups in the tables at (%esi),
##    interleaved with $k_mc_forward shuffles
##    apply the $k_sr row selected by %ecx
##
##  Advances %edx by +16 (encrypt) or -16 (decrypt) before the store.
##  Steps %ecx by -16 modulo 0x40.
##  Preserves %xmm0.
##  Clobbers %xmm1-%xmm5, and %esi on the decrypt path.
##
&function_begin_B('_vpaes_schedule_mangle');
	&movdqa	('xmm4', 'xmm0');		# work on a copy of xmm0
	&movdqa	('xmm5', &QWP($k_mc_forward, $const));
	&test	($out, $out);
	&jnz	(&label('schedule_mangle_dec'));

	# encrypting
	&add	($key, 16);
	&pxor	('xmm4', &QWP($k_s63, $const));
	&pshufb	('xmm4', 'xmm5');
	&movdqa	('xmm3', 'xmm4');
	&pshufb	('xmm4', 'xmm5');
	&pxor	('xmm3', 'xmm4');
	&pshufb	('xmm4', 'xmm5');
	&pxor	('xmm3', 'xmm4');

	&jmp	(&label('schedule_mangle_both'));

&set_label('schedule_mangle_dec', 16);
	# inverse mix columns
	# NOTE(review): $k_dksd is not assigned anywhere in the visible part
	# of this file, and the only visible caller of the schedule sets
	# $out to 0, so this path looks unreachable here.  Confirm whether
	# the decrypt tables were dropped on purpose.
	&movdqa	('xmm2', &QWP($k_s0F, $const));
	&lea	($inp, &DWP($k_dksd, $const));
	&movdqa	('xmm1', 'xmm2');
	&pandn	('xmm1', 'xmm4');
	&psrld	('xmm1', 4);			# 1 = hi
	&pand	('xmm4', 'xmm2');		# 4 = lo

	# Four table pairs, 0x20 apart.  The first pair has nothing to
	# accumulate yet, and the last result is not rotated.
	for my $tbl (0x00, 0x20, 0x40, 0x60) {
		&movdqa	('xmm2', &QWP($tbl, $inp));
		&pshufb	('xmm2', 'xmm4');
		&pxor	('xmm2', 'xmm3')	if ($tbl != 0x00);
		&movdqa	('xmm3', &QWP($tbl+0x10, $inp));
		&pshufb	('xmm3', 'xmm1');
		&pxor	('xmm3', 'xmm2');
		&pshufb	('xmm3', 'xmm5')	if ($tbl != 0x60);
	}

	&add	($key, -16);

&set_label('schedule_mangle_both');
	&movdqa	('xmm1', &QWP($k_sr, $const, $magic));
	&pshufb	('xmm3', 'xmm1');
	&add	($magic, -16);
	&and	($magic, 0x30);
	&movdqu	(&QWP(0, $key), 'xmm3');
	&ret	();
&function_end_B('_vpaes_schedule_mangle');
557
#
# Interface to OpenSSL
#
# ${PREFIX}_set_encrypt_key(inp, bits, key)
#
# Stores bits/32+5 at offset 240 of key, then runs _vpaes_schedule_core in
# the encrypting direction ($out = 0) with $magic = 0x30.  Always returns
# 0 in eax.
#
# A 16-byte aligned scratch area is carved out below the current stack
# pointer; the original esp is kept at 48(esp) and restored before leaving.
#
&function_begin("${PREFIX}_set_encrypt_key");
	record_function_hit(5);

	&mov	($inp, &wparam(0));		# inp
	&lea	($base, &DWP(-56, 'esp'));
	&mov	($round, &wparam(1));		# bits
	&and	($base, -16);			# align the scratch area
	&mov	($key, &wparam(2));		# key
	&xchg	($base, 'esp');			# switch stacks, old esp in $base
	&mov	(&DWP(48, 'esp'), $base);	# remember old esp

	&mov	($base, $round);
	&shr	($base, 5);
	&add	($base, 5);
	&mov	(&DWP(240, $key), $base);	# rounds = bits/32+5
	&mov	($magic, 0x30);
	&mov	($out, 0);			# 0 selects the encrypt schedule

	# PIC: pass the constants as an offset from the return address of the
	# call below; the callee adds that return address back in.
	&lea	($const, &DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	('_vpaes_schedule_core');
&set_label('pic_point');

	&mov	('esp', &DWP(48, 'esp'));	# back to the caller's stack
	&xor	('eax', 'eax');			# return 0
&function_end("${PREFIX}_set_encrypt_key");
586
#
# ${PREFIX}_encrypt(inp, out, key)
#
# Encrypts the 16 bytes at inp with the schedule in key and writes the
# result to out.  Both buffers are accessed with unaligned loads/stores.
#
# _vpaes_preheat resolves the constants' address from the PIC offset and
# loads xmm6/xmm7 for _vpaes_encrypt_core.  The stack is realigned to 16
# bytes for the duration of the call; the original esp is saved at 48(esp).
#
&function_begin("${PREFIX}_encrypt");
	record_function_hit(4);

	&lea	($const, &DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	('_vpaes_preheat');
&set_label('pic_point');
	&mov	($inp, &wparam(0));		# inp
	&lea	($base, &DWP(-56, 'esp'));
	&mov	($out, &wparam(1));		# out
	&and	($base, -16);			# align the scratch area
	&mov	($key, &wparam(2));		# key
	&xchg	($base, 'esp');			# switch stacks, old esp in $base
	&mov	(&DWP(48, 'esp'), $base);	# remember old esp

	&movdqu	('xmm0', &QWP(0, $inp));	# load the input block
	&call	('_vpaes_encrypt_core');
	&movdqu	(&QWP(0, $out), 'xmm0');	# store the result

	&mov	('esp', &DWP(48, 'esp'));	# back to the caller's stack
&function_end("${PREFIX}_encrypt");
607
# Finalize the assembler back end, then close the redirected STDOUT and
# report a failure to do so, since write errors can surface only at close.
&asm_finish();

close(STDOUT) || die "error closing STDOUT: $!";
611