#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original nor does it make assumption
# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
# large-block CBC] encrypt/decrypt.
#
#		aes-586.pl		vpaes-x86.pl
#
# Core 2(**)	28.1/41.4/18.3		21.9/25.2(***)
# Nehalem	27.9/40.4/18.1		10.2/11.9
# Atom		70.7/92.1/60.1		61.1/75.4(***)
# Silvermont	45.4/62.9/24.1		49.2/61.1(***)
#
# (*)	"Hyper-threading" in the context refers rather to cache shared
#	among multiple cores, than to specifically Intel HTT. As vast
#	majority of contemporary cores share cache, slower code path
#	is commonplace. In other words "with-hyper-threading-off"
#	results are presented mostly for reference purposes.
#
# (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***)	Less impressive improvement on Core 2 and Atom is due to slow
#	pshufb, yet it's respectable +28%/64% improvement on Core 2
#	and +15% on Atom (as implied, over "hyper-threading-safe"
#	code path).
#
#						<appro@openssl.org>

# Derive the directory this script was invoked from and make x86asm.pl
# loadable from either that directory or the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  Everything
# printed to STDOUT from here on is redirected into it.  Fail right away
# if the file is missing or cannot be created instead of discovering the
# problem only when STDOUT is closed at the very end.
$output = pop;
defined($output) or die "$0: output file name expected as last argument\n";
open OUT,">",$output or die "can't open $output: $!";
*STDOUT=*OUT;

# $x86only is true when the last remaining argument is the literal "386".
&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");

# Prefix of the externally visible function names emitted below.
$PREFIX="vpaes";

# Symbolic names for the general-purpose registers used by every routine.
my  ($round, $base, $magic, $key, $const, $inp, $out)=
    ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");

&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");

##
##  _vpaes_consts
##
##  Constant pool, aligned to 64 bytes.  Every $k_* variable is a byte
##  offset relative to _vpaes_consts+0x30, which is the address the
##  routines below keep in %ebp; that is why the first two tables have
##  negative offsets.
##
&set_label("_vpaes_consts",64);
$k_inv=-0x30;		# inv, inva
	&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
	&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);

$k_s0F=-0x10;		# s0F
	&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);

$k_ipt=0x00;		# input transform (lo, hi)
	&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
	&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);

$k_sb1=0x20;		# sb1u, sb1t
	&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
	&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40;		# sb2u, sb2t
	&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
	&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60;		# sbou, sbot
	&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
	&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);

$k_mc_forward=0x80;	# mc_forward (four 16-byte entries)
	&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
	&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
	&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
	&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);

$k_mc_backward=0xc0;	# mc_backward (four 16-byte entries)
	&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
	&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
	&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
	&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);

$k_sr=0x100;		# sr (four 16-byte entries)
	&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
	&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
	&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
	&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);

$k_rcon=0x140;		# rcon
	&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);

$k_s63=0x150;		# s63: all equal to 0x63 transformed
	&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);

$k_opt=0x160;		# output transform
	&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
	&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);

$k_deskew=0x180;	# deskew tables: inverts the sbox's "skew"
	&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
	&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);

&asciz	("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align	(64);

##
##  _vpaes_preheat
##
##  Converts the position-independent offset passed in %ebp into the
##  address of _vpaes_consts+0x30 and preloads the two constants that
##  _vpaes_encrypt_core keeps in registers.
##
##  Inputs:
##     %ebp   = (_vpaes_consts+0x30) minus the address of the caller's
##              "pic_point" label, which immediately follows the call
##     (%esp) = return address pushed by that call
##
##  Outputs:
##     %ebp   = address of _vpaes_consts+0x30
##     %xmm7  = table at $k_inv
##     %xmm6  = table at $k_s0F
##
&function_begin_B("_vpaes_preheat");
	&add	($const,&DWP(0,"esp"));		# offset + return address
	&movdqa	("xmm7",&QWP($k_inv,$const));
	&movdqa	("xmm6",&QWP($k_s0F,$const));
	&ret	();
&function_end_B("_vpaes_preheat");

##
##  _vpaes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm6-%xmm7 as in _vpaes_preheat
##    (%edx) = scheduled keys; the round counter is read from 240(%edx)
##     %ebp  = address of _vpaes_consts+0x30, as left by _vpaes_preheat
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##
&function_begin_B("_vpaes_encrypt_core");
	&mov	($magic,16);
	&mov	($round,&DWP(240,$key));
	# Each emitter call must be its own statement; without the
	# terminating ';' two calls are joined by a binary '&'.
	&movdqa	("xmm1","xmm6");
	&movdqa	("xmm2",&QWP($k_ipt,$const));
	&pandn	("xmm1","xmm0");		# 1 = high nibbles (unshifted)
	&pand	("xmm0","xmm6");		# 0 = low nibbles
	&movdqu	("xmm5",&QWP(0,$key));		# 5 = round key 0
	&pshufb	("xmm2","xmm0");		# 2 = ipt lo lookup
	&movdqa	("xmm0",&QWP($k_ipt+16,$const));
	&pxor	("xmm2","xmm5");
	&psrld	("xmm1",4);
	&add	($key,16);
	&pshufb	("xmm0","xmm1");		# 0 = ipt hi lookup
	&lea	($base,&DWP($k_mc_backward,$const));
	&pxor	("xmm0","xmm2");
	&jmp	(&label("enc_entry"));


&set_label("enc_loop",16);
	# middle of middle round
	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
	&pshufb	("xmm4","xmm2");		# 4 = sb1u
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
	&movdqa	("xmm5",&QWP($k_sb2,$const));	# 5 : sb2u
	&pxor	("xmm0","xmm4");		# 0 = A
	&movdqa	("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
	&pshufb	("xmm5","xmm2");		# 5 = sb2u
	&movdqa	("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
	&movdqa	("xmm4",&QWP(0,$base,$magic));	# .Lk_mc_backward[]
	&pshufb	("xmm2","xmm3");		# 2 = sb2t
	&movdqa	("xmm3","xmm0");		# 3 = A
	&pxor	("xmm2","xmm5");		# 2 = 2A
	&pshufb	("xmm0","xmm1");		# 0 = B
	&add	($key,16);			# next key
	&pxor	("xmm0","xmm2");		# 0 = 2A+B
	&pshufb	("xmm3","xmm4");		# 3 = D
	&add	($magic,16);			# next mc
	&pxor	("xmm3","xmm0");		# 3 = 2A+B+D
	&pshufb	("xmm0","xmm1");		# 0 = 2B+C
	&and	($magic,0x30);			# ... mod 4
	&sub	($round,1);			# nr--
	&pxor	("xmm0","xmm3");		# 0 = 2A+3B+C+D

&set_label("enc_entry");
	# top of round
	&movdqa	("xmm1","xmm6");		# 1 : i
	&movdqa	("xmm5",&QWP($k_inv+16,$const));# 5 : a/k
	&pandn	("xmm1","xmm0");		# 1 = i<<4
	&psrld	("xmm1",4);			# 1 = i
	&pand	("xmm0","xmm6");		# 0 = k
	&pshufb	("xmm5","xmm0");		# 5 = a/k
	&movdqa	("xmm3","xmm7");		# 3 : 1/i
	&pxor	("xmm0","xmm1");		# 0 = j
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&movdqa	("xmm4","xmm7");		# 4 : 1/j
	&pxor	("xmm3","xmm5");		# 3 = iak = 1/i + a/k
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
	&pxor	("xmm4","xmm5");		# 4 = jak = 1/j + a/k
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
	&pxor	("xmm2","xmm0");		# 2 = io
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&movdqu	("xmm5",&QWP(0,$key));		# 5 = current round key
	&pxor	("xmm3","xmm1");		# 3 = jo
	# The branch consumes the flags of the last integer instruction
	# executed before enc_entry: "sub round,1" when coming from
	# enc_loop, "add key,16" on the initial jump.
	&jnz	(&label("enc_loop"));

	# middle of last round
	&movdqa	("xmm4",&QWP($k_sbo,$const));	# 4 : sbou      .Lk_sbo
	&movdqa	("xmm0",&QWP($k_sbo+16,$const));# 0 : sbot      .Lk_sbo+16
	&pshufb	("xmm4","xmm2");		# 4 = sbou
	&pxor	("xmm4","xmm5");		# 4 = sbou + k
	&pshufb	("xmm0","xmm3");		# 0 = sbot
	&movdqa	("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
	&pxor	("xmm0","xmm4");		# 0 = A
	&pshufb	("xmm0","xmm1");
	&ret	();
&function_end_B("_vpaes_encrypt_core");

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
##
##  _vpaes_schedule_core
##
##  Inputs (as set up by vpaes_set_encrypt_key):
##     (%esi) = user key
##     %eax   = key length in bits
##     (%edx) = key schedule being written
##     %ecx   = running offset into the $k_sr table (0x00..0x30)
##     %edi   = zero when encrypting, non-zero when decrypting
##     %ebp   = (_vpaes_consts+0x30) minus the caller's return address
##
##  Stack slots, relative to the caller's 16-byte aligned %esp (this
##  routine sees them 4 bytes higher because of its return address):
##     0..15   rcon vector, the "xmm8" of the comments below
##     16..31  temporary copy of %xmm7 during the 256-bit low round
##
##  All eight %xmm registers are zeroed before returning.
##
&function_begin_B("_vpaes_schedule_core");
	&add	($const,&DWP(0,"esp"));		# offset + return address
	&movdqu	("xmm0",&QWP(0,$inp));		# load key (unaligned)
	&movdqa	("xmm2",&QWP($k_rcon,$const));	# load rcon

	# input transform
	&movdqa	("xmm3","xmm0");
	&lea	($base,&DWP($k_ipt,$const));
	&movdqa	(&QWP(4,"esp"),"xmm2");		# xmm8
	&call	("_vpaes_schedule_transform");
	&movdqa	("xmm7","xmm0");

	&test	($out,$out);
	&jnz	(&label("schedule_am_decrypting"));

	# encrypting, output zeroth round key after transform
	&movdqu	(&QWP(0,$key),"xmm0");
	&jmp	(&label("schedule_go"));

&set_label("schedule_am_decrypting");
	# decrypting, output zeroth round key after shiftrows
	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
	&pshufb	("xmm3","xmm1");
	&movdqu	(&QWP(0,$key),"xmm3");
	&xor	($magic,0x30);

&set_label("schedule_go");
	&cmp	($round,192);
	&ja	(&label("schedule_256"));
	# 192-bit key support was removed.  Any length that is not above
	# 192 therefore takes the 128-bit path below.
	# 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
&set_label("schedule_128");
	&mov	($round,10);

&set_label("loop_schedule_128");
	&call	("_vpaes_schedule_round");
	&dec	($round);
	&jz	(&label("schedule_mangle_last"));
	&call	("_vpaes_schedule_mangle");	# write output
	&jmp	(&label("loop_schedule_128"));

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
&set_label("schedule_256",16);
	&movdqu	("xmm0",&QWP(16,$inp));		# load key part 2 (unaligned)
	&call	("_vpaes_schedule_transform");	# input transform
	&mov	($round,7);

&set_label("loop_schedule_256");
	&call	("_vpaes_schedule_mangle");	# output low result
	&movdqa	("xmm6","xmm0");		# save cur_lo in xmm6

	# high round
	&call	("_vpaes_schedule_round");
	&dec	($round);
	&jz	(&label("schedule_mangle_last"));
	&call	("_vpaes_schedule_mangle");

	# low round. swap xmm7 and xmm6
	&pshufd	("xmm0","xmm0",0xFF);
	&movdqa	(&QWP(20,"esp"),"xmm7");	# park xmm7 in the second slot
	&movdqa	("xmm7","xmm6");
	&call	("_vpaes_schedule_low_round");
	&movdqa	("xmm7",&QWP(20,"esp"));

	&jmp	(&label("loop_schedule_256"));

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
	# schedule last round key from xmm0
	&lea	($base,&DWP($k_deskew,$const));
	&test	($out,$out);
	&jnz	(&label("schedule_mangle_last_dec"));

	# encrypting
	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
	&pshufb	("xmm0","xmm1");		# output permute
	&lea	($base,&DWP($k_opt,$const));	# prepare to output transform
	&add	($key,32);			# net +16 with the add below

&set_label("schedule_mangle_last_dec");
	&add	($key,-16);
	&pxor	("xmm0",&QWP($k_s63,$const));
	&call	("_vpaes_schedule_transform");	# output transform
	&movdqu	(&QWP(0,$key),"xmm0");		# save last key

	# cleanup
	&pxor	("xmm0","xmm0");
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
	&ret	();
&function_end_B("_vpaes_schedule_core");

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  next rcon.  "%xmm8" here is the 16-byte stack slot that
##  _vpaes_schedule_core fills with the $k_rcon vector; it is
##  addressed as 8(%esp) because two return addresses sit below it.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  The second entry point, _vpaes_schedule_low_round, skips the
##  rcon and rotation steps.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm5.
##
&function_begin_B("_vpaes_schedule_round");
	# extract rcon from xmm8
	&movdqa	("xmm2",&QWP(8,"esp"));		# xmm8
	&pxor	("xmm1","xmm1");
	&palignr("xmm1","xmm2",15);
	&palignr("xmm2","xmm2",15);
	&pxor	("xmm7","xmm1");

	# rotate
	&pshufd	("xmm0","xmm0",0xFF);
	&palignr("xmm0","xmm0",1);

	# fall through...
	&movdqa	(&QWP(8,"esp"),"xmm2");		# xmm8

	# low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
	# smear xmm7
	&movdqa	("xmm1","xmm7");
	&pslldq	("xmm7",4);
	&pxor	("xmm7","xmm1");
	&movdqa	("xmm1","xmm7");
	&pslldq	("xmm7",8);
	&pxor	("xmm7","xmm1");
	&pxor	("xmm7",&QWP($k_s63,$const));

	# subbyte
	&movdqa	("xmm4",&QWP($k_s0F,$const));
	&movdqa	("xmm5",&QWP($k_inv,$const));	# 5 : inv table
	&movdqa	("xmm1","xmm4");
	&pandn	("xmm1","xmm0");
	&psrld	("xmm1",4);			# 1 = i
	&pand	("xmm0","xmm4");		# 0 = k
	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
	&pshufb	("xmm2","xmm0");		# 2 = a/k
	&pxor	("xmm0","xmm1");		# 0 = j
	&movdqa	("xmm3","xmm5");		# 3 : 1/i
	&pshufb	("xmm3","xmm1");		# 3 = 1/i
	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
	&movdqa	("xmm4","xmm5");		# 4 : 1/j
	&pshufb	("xmm4","xmm0");		# 4 = 1/j
	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
	&movdqa	("xmm2","xmm5");		# 2 : 1/iak
	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
	&pxor	("xmm2","xmm0");		# 2 = io
	&movdqa	("xmm3","xmm5");		# 3 : 1/jak
	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
	&pxor	("xmm3","xmm1");		# 3 = jo
	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
	&pshufb	("xmm4","xmm2");		# 4 = sb1u
	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
	&pshufb	("xmm0","xmm3");		# 0 = sb1t
	&pxor	("xmm0","xmm4");		# 0 = sbox output

	# add in smeared stuff
	&pxor	("xmm0","xmm7");
	&movdqa	("xmm7","xmm0");
	&ret	();
&function_end_B("_vpaes_schedule_round");

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%ebx)
##
##  The low nibble of every byte indexes the 16-byte table at
##  0(%ebx), the high nibble indexes the table at 16(%ebx), and the
##  two lookups are xored together.
##
##  Requires %ebp = address of _vpaes_consts+0x30 (for the s0F mask).
##
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
&function_begin_B("_vpaes_schedule_transform");
	&movdqa	("xmm2",&QWP($k_s0F,$const));
	&movdqa	("xmm1","xmm2");
	&pandn	("xmm1","xmm0");
	&psrld	("xmm1",4);			# 1 = high nibbles
	&pand	("xmm0","xmm2");		# 0 = low nibbles
	&movdqa	("xmm2",&QWP(0,$base));
	&pshufb	("xmm2","xmm0");		# 2 = lo-table lookup
	&movdqa	("xmm0",&QWP(16,$base));
	&pshufb	("xmm0","xmm1");		# 0 = hi-table lookup
	&pxor	("xmm0","xmm2");
	&ret	();
&function_end_B("_vpaes_schedule_transform");

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%edx), and increments or decrements it
##  Keeps track of round number mod 4 in %ecx
##  Preserves xmm0
##  Clobbers xmm1-xmm5; the decrypt path also overwrites %esi
##
&function_begin_B("_vpaes_schedule_mangle");
	&movdqa	("xmm4","xmm0");	# save xmm0 for later
	&movdqa	("xmm5",&QWP($k_mc_forward,$const));
	&test	($out,$out);
	&jnz	(&label("schedule_mangle_dec"));

	# encrypting
	&add	($key,16);
	&pxor	("xmm4",&QWP($k_s63,$const));
	&pshufb	("xmm4","xmm5");
	&movdqa	("xmm3","xmm4");
	&pshufb	("xmm4","xmm5");
	&pxor	("xmm3","xmm4");
	&pshufb	("xmm4","xmm5");
	&pxor	("xmm3","xmm4");

	&jmp	(&label("schedule_mangle_both"));

&set_label("schedule_mangle_dec",16);
	# inverse mix columns
	#
	# NOTE(review): $k_dksd is not assigned anywhere in this file and
	# the constant pool holds no tables past $k_deskew, so the eight
	# 16-byte tables read through %esi below are not backed by data
	# here.  This path is taken only when %edi is non-zero, which
	# vpaes_set_encrypt_key never sets.  Restore the decryption
	# key-schedule tables before wiring up a decrypt key setup.
	&movdqa	("xmm2",&QWP($k_s0F,$const));
	&lea	($inp,&DWP($k_dksd,$const));
	&movdqa	("xmm1","xmm2");
	&pandn	("xmm1","xmm4");
	&psrld	("xmm1",4);			# 1 = hi
	&pand	("xmm4","xmm2");		# 4 = lo

	&movdqa	("xmm2",&QWP(0,$inp));
	&pshufb	("xmm2","xmm4");
	&movdqa	("xmm3",&QWP(0x10,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x20,$inp));
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x30,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x40,$inp));
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x50,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");
	&pshufb	("xmm3","xmm5");

	&movdqa	("xmm2",&QWP(0x60,$inp));
	&pshufb	("xmm2","xmm4");
	&pxor	("xmm2","xmm3");
	&movdqa	("xmm3",&QWP(0x70,$inp));
	&pshufb	("xmm3","xmm1");
	&pxor	("xmm3","xmm2");

	&add	($key,-16);

&set_label("schedule_mangle_both");
	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
	&pshufb	("xmm3","xmm1");
	&add	($magic,-16);			# step the $k_sr index back...
	&and	($magic,0x30);			# ... mod 4 entries
	&movdqu	(&QWP(0,$key),"xmm3");
	&ret	();
&function_end_B("_vpaes_schedule_mangle");

#
# Interface to OpenSSL
#

##
##  ${PREFIX}_set_encrypt_key(inp, bits, key)
##
##  Expands the user key at inp (bits long) into the schedule at key
##  and stores bits/32+5 as the round count at offset 240 of key.
##  Always returns 0 in %eax.
##
##  A scratch frame is carved out below the incoming stack pointer and
##  aligned to 16 bytes so that _vpaes_schedule_core can use movdqa on
##  it; the original %esp is kept at offset 48 of that frame.
##
&function_begin("${PREFIX}_set_encrypt_key");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($round,&wparam(1));		# bits
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);		# remember caller's esp

	&mov	($base,$round);
	&shr	($base,5);
	&add	($base,5);
	&mov	(&DWP(240,$key),$base);		# AES_KEY->rounds = nbits/32+5;
	&mov	($magic,0x30);			# initial offset into $k_sr
	&mov	($out,0);			# zero selects the encrypt paths

	# %ebp gets the distance from pic_point to _vpaes_consts+0x30;
	# the callee adds its return address (== pic_point) to it.
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_schedule_core");
&set_label("pic_point");

	&mov	("esp",&DWP(48,"esp"));		# drop the scratch frame
	&xor	("eax","eax");			# return 0
&function_end("${PREFIX}_set_encrypt_key");

##
##  ${PREFIX}_encrypt(inp, out, key)
##
##  Encrypts the single 16-byte block at inp with the schedule at key
##  and stores the result at out.  Both block pointers are accessed
##  with unaligned loads/stores.
##
&function_begin("${PREFIX}_encrypt");
	# %ebp gets the distance from pic_point to _vpaes_consts+0x30;
	# _vpaes_preheat adds its return address (== pic_point) to it and
	# loads %xmm6/%xmm7.
	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");
&set_label("pic_point");
	&mov	($inp,&wparam(0));		# inp
	&lea	($base,&DWP(-56,"esp"));
	&mov	($out,&wparam(1));		# out
	&and	($base,-16);
	&mov	($key,&wparam(2));		# key
	&xchg	($base,"esp");			# alloca
	&mov	(&DWP(48,"esp"),$base);		# remember caller's esp

	&movdqu	("xmm0",&QWP(0,$inp));
	&call	("_vpaes_encrypt_core");
	&movdqu	(&QWP(0,$out),"xmm0");

	&mov	("esp",&DWP(48,"esp"));		# drop the scratch frame
&function_end("${PREFIX}_encrypt");

# Let the perlasm back end emit whatever it has pending, then close the
# redirected STDOUT and fail if that close reports an error.
&asm_finish();

close STDOUT or die "error closing STDOUT";
