• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# January 2015
11#
12# ChaCha20 for x86.
13#
14# Performance in cycles per byte out of large buffer.
15#
16#		1xIALU/gcc	4xSSSE3
17# Pentium	17.5/+80%
18# PIII		14.2/+60%
19# P4		18.6/+84%
20# Core2		9.56/+89%	4.83
21# Westmere	9.50/+45%	3.35
22# Sandy Bridge	10.5/+47%	3.20
23# Haswell	8.15/+50%	2.83
24# Skylake	7.53/+22%	2.75
25# Silvermont	17.4/+36%	8.35
26# Goldmont	13.4/+40%	4.36
27# Sledgehammer	10.2/+54%
28# Bulldozer	13.4/+50%	4.38(*)
29#
30# (*)	In upstream OpenSSL, Bulldozer actually executes a 4xXOP code path that delivers 3.55; that path is not present in this copy.
31#
32# Modified from upstream OpenSSL to remove the XOP code.
33
# Script prologue: locate the perlasm helpers relative to this script, redirect
# STDOUT to the output file named by the last command-line argument, and
# initialise the x86 assembler back end.
34$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
35push(@INC,"${dir}","${dir}../../perlasm");
36require "x86asm.pl";
37
# The last argument is the output path.  Fail loudly if it cannot be opened:
# an unchecked failure here would leave STDOUT unusable and silently discard
# all generated assembly.
38$output=pop;
39open STDOUT,">$output" or die "can't open $output: $!";
40
41&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
42
43$xmm=$ymm=1;
44$gasver=999;  # enable everything
45
# Register assignment for the scalar code path.  b, c and d each have an
# alternate register (the "_" variant) that QUARTERROUND swaps with the
# primary one after every quarter-round.
46$a="eax";
47($b,$b_)=("ebx","ebp");
48($c,$c_)=("ecx","esi");
49($d,$d_)=("edx","edi");
50
# QUARTERROUND(ai, bi, ci, di, i)
#
# Emits the integer-only instructions for one ChaCha quarter-round on state
# words ai/bi/ci/di, which live in 4-byte slots at 4*index off esp.
#   ai..di  state word indices for a, b, c and d
#   i       position (0..7) of this quarter-round within one loop iteration;
#           0..3 use the "even round" indices and 4..7 the "odd round" indices
#           from the table below
#
# The working values are held in $a,$b,$c,$d.  The indented instructions are
# interleaved spills of the previous quarter-round's words and loads of the
# next quarter-round's words through the alternate registers $b_,$c_,$d_.
# The register pairs are swapped at the end so the next call picks up the
# preloaded values.  The leading a+=b of each quarter-round is emitted
# "elsewhere": by the previous call, or by the loop header for i==0.
51sub QUARTERROUND {
52my ($ai,$bi,$ci,$di,$i)=@_;
53my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
54my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous
55
56	#       a   b   c   d
57	#
58	#       0   4   8  12 < even round
59	#       1   5   9  13
60	#       2   6  10  14
61	#       3   7  11  15
62	#       0   5  10  15 < odd round
63	#       1   6  11  12
64	#       2   7   8  13
65	#       3   4   9  14
66
	# The simple +1/-1 neighbour rule above does not hold where the index
	# pattern changes between the even and odd rounds (first and last
	# quarter-round of each), so previous/next are recomputed there.
67	if ($i==0) {
68            my $j=4;
69	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
70	} elsif ($i==3) {
71            my $j=0;
72	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
73	} elsif ($i==4) {
74            my $j=4;
75	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
76	} elsif ($i==7) {
77            my $j=0;
78	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
79	}
80
81	#&add	($a,$b);			# see elsewhere
82	&xor	($d,$a);
83	 &mov	(&DWP(4*$cp,"esp"),$c_)		if ($ai>0 && $ai<3);
84	&rol	($d,16);
85	 &mov	(&DWP(4*$bp,"esp"),$b_)		if ($i!=0);
86	&add	($c,$d);
87	 &mov	($c_,&DWP(4*$cn,"esp"))		if ($ai>0 && $ai<3);
88	&xor	($b,$c);
89	 &mov	($d_,&DWP(4*$dn,"esp"))		if ($di!=$dn);
90	&rol	($b,12);
91	 &mov	($b_,&DWP(4*$bn,"esp"))		if ($i<7);
92	 &mov	($b_,&DWP(128,"esp"))		if ($i==7);	# loop counter
93	&add	($a,$b);
94	&xor	($d,$a);
95	&mov	(&DWP(4*$ai,"esp"),$a);
96	&rol	($d,8);
97	&mov	($a,&DWP(4*$an,"esp"));
98	&add	($c,$d);
99	&mov	(&DWP(4*$di,"esp"),$d)		if ($di!=$dn);
100	&mov	($d_,$d)			if ($di==$dn);
101	&xor	($b,$c);
102	 &add	($a,$b_)			if ($i<7);	# elsewhere
103	&rol	($b,7);
104
	# Swap primary and alternate register names so the next call works on
	# the values that were preloaded above.
105	($b,$b_)=($b_,$b);
106	($c,$c_)=($c_,$c);
107	($d,$d_)=($d_,$d);
108}
109
# ChaCha20_ctr32: integer-only code path plus run-time dispatch to SSSE3.
#
# Arguments as read through wparam(): 0 = output, 1 = input, 2 = len,
# 3 = key, 4 = counter and nonce.
#
# Stack frame used by the scalar path (offsets from esp):
#     0.. 63   working state, word i at 4*i
#    64..127   key material, word i at 64+4*i (only words 4..15 are stored;
#              the four constants are materialised as immediates)
#   128        loop counter
# stack_push(33) presumably reserves 33 dwords for this; it is defined in
# x86asm.pl, which is not visible here.
110&static_label("ssse3_shortcut");
111&static_label("ssse3_data");
112&static_label("pic_point");
113
114&function_begin("ChaCha20_ctr32");
115	&xor	("eax","eax");
116	&cmp	("eax",&wparam(2));		# len==0?
117	&je	(&label("no_data"));
118if ($xmm) {
	# Obtain the address of pic_point in eax and use it to locate
	# OPENSSL_ia32cap_P.  eax is left untouched up to the jump because
	# the code at ssse3_shortcut addresses ssse3_data relative to it.
119	&call	(&label("pic_point"));
120&set_label("pic_point");
121	&blindpop("eax");
122	&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
123	&test	(&DWP(0,"ebp"),1<<24);		# test FXSR bit
124	&jz	(&label("x86"));
125	&test	(&DWP(4,"ebp"),1<<9);		# test SSSE3 bit
126	&jz	(&label("x86"));
127	&jmp	(&label("ssse3_shortcut"));
128&set_label("x86");
129}
130	&mov	("esi",&wparam(3));		# key
131	&mov	("edi",&wparam(4));		# counter and nonce
132
133	&stack_push(33);
134
135	&mov	("eax",&DWP(4*0,"esi"));	# copy key
136	&mov	("ebx",&DWP(4*1,"esi"));
137	&mov	("ecx",&DWP(4*2,"esi"));
138	&mov	("edx",&DWP(4*3,"esi"));
139	&mov	(&DWP(64+4*4,"esp"),"eax");
140	&mov	(&DWP(64+4*5,"esp"),"ebx");
141	&mov	(&DWP(64+4*6,"esp"),"ecx");
142	&mov	(&DWP(64+4*7,"esp"),"edx");
143	&mov	("eax",&DWP(4*4,"esi"));
144	&mov	("ebx",&DWP(4*5,"esi"));
145	&mov	("ecx",&DWP(4*6,"esi"));
146	&mov	("edx",&DWP(4*7,"esi"));
147	&mov	(&DWP(64+4*8,"esp"),"eax");
148	&mov	(&DWP(64+4*9,"esp"),"ebx");
149	&mov	(&DWP(64+4*10,"esp"),"ecx");
150	&mov	(&DWP(64+4*11,"esp"),"edx");
151	&mov	("eax",&DWP(4*0,"edi"));	# copy counter and nonce
152	&mov	("ebx",&DWP(4*1,"edi"));
153	&mov	("ecx",&DWP(4*2,"edi"));
154	&mov	("edx",&DWP(4*3,"edi"));
155	&sub	("eax",1);			# pre-decrement: "entry" adds 1 back per block
156	&mov	(&DWP(64+4*12,"esp"),"eax");
157	&mov	(&DWP(64+4*13,"esp"),"ebx");
158	&mov	(&DWP(64+4*14,"esp"),"ecx");
159	&mov	(&DWP(64+4*15,"esp"),"edx");
160	&jmp	(&label("entry"));
161
	# One pass through outer_loop produces one 64-byte block.
162&set_label("outer_loop",16);
163	&mov	(&wparam(1),$b);		# save input
164	&mov	(&wparam(0),$a);		# save output
165	&mov	(&wparam(2),$c);		# save len
166&set_label("entry");
	# Words 0..3 are the four constants: word 0 stays in $a, 1..3 go to
	# the working state on the stack.
167	&mov	($a,0x61707865);
168	&mov	(&DWP(4*1,"esp"),0x3320646e);
169	&mov	(&DWP(4*2,"esp"),0x79622d32);
170	&mov	(&DWP(4*3,"esp"),0x6b206574);
171
172	&mov	($b, &DWP(64+4*5,"esp"));	# copy key material
173	&mov	($b_,&DWP(64+4*6,"esp"));
174	&mov	($c, &DWP(64+4*10,"esp"));
175	&mov	($c_,&DWP(64+4*11,"esp"));
176	&mov	($d, &DWP(64+4*13,"esp"));
177	&mov	($d_,&DWP(64+4*14,"esp"));
178	&mov	(&DWP(4*5,"esp"),$b);
179	&mov	(&DWP(4*6,"esp"),$b_);
180	&mov	(&DWP(4*10,"esp"),$c);
181	&mov	(&DWP(4*11,"esp"),$c_);
182	&mov	(&DWP(4*13,"esp"),$d);
183	&mov	(&DWP(4*14,"esp"),$d_);
184
	# Words 4, 8, 9 and 12 are left in registers for the first
	# quarter-round instead of being written to the working state.
185	&mov	($b, &DWP(64+4*7,"esp"));
186	&mov	($d_,&DWP(64+4*15,"esp"));
187	&mov	($d, &DWP(64+4*12,"esp"));
188	&mov	($b_,&DWP(64+4*4,"esp"));
189	&mov	($c, &DWP(64+4*8,"esp"));
190	&mov	($c_,&DWP(64+4*9,"esp"));
191	&add	($d,1);				# counter value
192	&mov	(&DWP(4*7,"esp"),$b);
193	&mov	(&DWP(4*15,"esp"),$d_);
194	&mov	(&DWP(64+4*12,"esp"),$d);	# save counter value
195
196	&mov	($b,10);			# loop counter
197	&jmp	(&label("loop"));
198
	# 10 iterations, each emitting 8 quarter-rounds.
199&set_label("loop",16);
200	&add	($a,$b_);			# elsewhere
201	&mov	(&DWP(128,"esp"),$b);		# save loop counter
202	&mov	($b,$b_);
203	&QUARTERROUND(0, 4, 8, 12, 0);
204	&QUARTERROUND(1, 5, 9, 13, 1);
205	&QUARTERROUND(2, 6,10, 14, 2);
206	&QUARTERROUND(3, 7,11, 15, 3);
207	&QUARTERROUND(0, 5,10, 15, 4);
208	&QUARTERROUND(1, 6,11, 12, 5);
209	&QUARTERROUND(2, 7, 8, 13, 6);
210	&QUARTERROUND(3, 4, 9, 14, 7);
211	&dec	($b);
212	&jnz	(&label("loop"));
213
214	&mov	($b,&wparam(2));		# load len
215
216	&add	($a,0x61707865);		# accumulate key material
217	&add	($b_,&DWP(64+4*4,"esp"));
218	&add	($c, &DWP(64+4*8,"esp"));
219	&add	($c_,&DWP(64+4*9,"esp"));
220
221	&cmp	($b,64);
222	&jb	(&label("tail"));		# fewer than 64 bytes left
223
	# Full block: the 16 words are finalised, xored with the input and
	# written out in three batches because only a few registers are free.
224	&mov	($b,&wparam(1));		# load input pointer
225	&add	($d, &DWP(64+4*12,"esp"));
226	&add	($d_,&DWP(64+4*14,"esp"));
227
228	&xor	($a, &DWP(4*0,$b));		# xor with input
229	&xor	($b_,&DWP(4*4,$b));
230	&mov	(&DWP(4*0,"esp"),$a);		# park output word 0; $a is needed for the pointer
231	&mov	($a,&wparam(0));		# load output pointer
232	&xor	($c, &DWP(4*8,$b));
233	&xor	($c_,&DWP(4*9,$b));
234	&xor	($d, &DWP(4*12,$b));
235	&xor	($d_,&DWP(4*14,$b));
236	&mov	(&DWP(4*4,$a),$b_);		# write output
237	&mov	(&DWP(4*8,$a),$c);
238	&mov	(&DWP(4*9,$a),$c_);
239	&mov	(&DWP(4*12,$a),$d);
240	&mov	(&DWP(4*14,$a),$d_);
241
242	&mov	($b_,&DWP(4*1,"esp"));
243	&mov	($c, &DWP(4*2,"esp"));
244	&mov	($c_,&DWP(4*3,"esp"));
245	&mov	($d, &DWP(4*5,"esp"));
246	&mov	($d_,&DWP(4*6,"esp"));
247	&add	($b_,0x3320646e);		# accumulate key material
248	&add	($c, 0x79622d32);
249	&add	($c_,0x6b206574);
250	&add	($d, &DWP(64+4*5,"esp"));
251	&add	($d_,&DWP(64+4*6,"esp"));
252	&xor	($b_,&DWP(4*1,$b));
253	&xor	($c, &DWP(4*2,$b));
254	&xor	($c_,&DWP(4*3,$b));
255	&xor	($d, &DWP(4*5,$b));
256	&xor	($d_,&DWP(4*6,$b));
257	&mov	(&DWP(4*1,$a),$b_);
258	&mov	(&DWP(4*2,$a),$c);
259	&mov	(&DWP(4*3,$a),$c_);
260	&mov	(&DWP(4*5,$a),$d);
261	&mov	(&DWP(4*6,$a),$d_);
262
263	&mov	($b_,&DWP(4*7,"esp"));
264	&mov	($c, &DWP(4*10,"esp"));
265	&mov	($c_,&DWP(4*11,"esp"));
266	&mov	($d, &DWP(4*13,"esp"));
267	&mov	($d_,&DWP(4*15,"esp"));
268	&add	($b_,&DWP(64+4*7,"esp"));
269	&add	($c, &DWP(64+4*10,"esp"));
270	&add	($c_,&DWP(64+4*11,"esp"));
271	&add	($d, &DWP(64+4*13,"esp"));
272	&add	($d_,&DWP(64+4*15,"esp"));
273	&xor	($b_,&DWP(4*7,$b));
274	&xor	($c, &DWP(4*10,$b));
275	&xor	($c_,&DWP(4*11,$b));
276	&xor	($d, &DWP(4*13,$b));
277	&xor	($d_,&DWP(4*15,$b));
278	&lea	($b,&DWP(4*16,$b));		# input pointer += 64
279	&mov	(&DWP(4*7,$a),$b_);
280	&mov	($b_,&DWP(4*0,"esp"));		# fetch parked output word 0
281	&mov	(&DWP(4*10,$a),$c);
282	&mov	($c,&wparam(2));		# len
283	&mov	(&DWP(4*11,$a),$c_);
284	&mov	(&DWP(4*13,$a),$d);
285	&mov	(&DWP(4*15,$a),$d_);
286	&mov	(&DWP(4*0,$a),$b_);
287	&lea	($a,&DWP(4*16,$a));		# output pointer += 64
288	&sub	($c,64);
289	&jnz	(&label("outer_loop"));
290
291	&jmp	(&label("done"));
292
	# Partial final block: finalise all 16 words into the working state on
	# the stack, then xor it with the input one byte at a time.
	# $b still holds the remaining length loaded above.
293&set_label("tail");
294	&add	($d, &DWP(64+4*12,"esp"));
295	&add	($d_,&DWP(64+4*14,"esp"));
296	&mov	(&DWP(4*0,"esp"),$a);
297	&mov	(&DWP(4*4,"esp"),$b_);
298	&mov	(&DWP(4*8,"esp"),$c);
299	&mov	(&DWP(4*9,"esp"),$c_);
300	&mov	(&DWP(4*12,"esp"),$d);
301	&mov	(&DWP(4*14,"esp"),$d_);
302
303	&mov	($b_,&DWP(4*1,"esp"));
304	&mov	($c, &DWP(4*2,"esp"));
305	&mov	($c_,&DWP(4*3,"esp"));
306	&mov	($d, &DWP(4*5,"esp"));
307	&mov	($d_,&DWP(4*6,"esp"));
308	&add	($b_,0x3320646e);		# accumulate key material
309	&add	($c, 0x79622d32);
310	&add	($c_,0x6b206574);
311	&add	($d, &DWP(64+4*5,"esp"));
312	&add	($d_,&DWP(64+4*6,"esp"));
313	&mov	(&DWP(4*1,"esp"),$b_);
314	&mov	(&DWP(4*2,"esp"),$c);
315	&mov	(&DWP(4*3,"esp"),$c_);
316	&mov	(&DWP(4*5,"esp"),$d);
317	&mov	(&DWP(4*6,"esp"),$d_);
318
319	&mov	($b_,&DWP(4*7,"esp"));
320	&mov	($c, &DWP(4*10,"esp"));
321	&mov	($c_,&DWP(4*11,"esp"));
322	&mov	($d, &DWP(4*13,"esp"));
323	&mov	($d_,&DWP(4*15,"esp"));
324	&add	($b_,&DWP(64+4*7,"esp"));
325	&add	($c, &DWP(64+4*10,"esp"));
326	&add	($c_,&DWP(64+4*11,"esp"));
327	&add	($d, &DWP(64+4*13,"esp"));
328	&add	($d_,&DWP(64+4*15,"esp"));
329	&mov	(&DWP(4*7,"esp"),$b_);
330	&mov	($b_,&wparam(1));		# load input
331	&mov	(&DWP(4*10,"esp"),$c);
332	&mov	($c,&wparam(0));		# load output
333	&mov	(&DWP(4*11,"esp"),$c_);
334	&xor	($c_,$c_);			# byte index = 0
335	&mov	(&DWP(4*13,"esp"),$d);
336	&mov	(&DWP(4*15,"esp"),$d_);
337
338	&xor	("eax","eax");
339	&xor	("edx","edx");
340&set_label("tail_loop");
341	&movb	("al",&BP(0,$c_,$b_));		# input byte
342	&movb	("dl",&BP(0,"esp",$c_));	# byte of the finalised block on the stack
343	&lea	($c_,&DWP(1,$c_));		# index++ (lea leaves flags alone)
344	&xor	("al","dl");
345	&mov	(&BP(-1,$c,$c_),"al");		# -1 compensates for the index++ above
346	&dec	($b);				# remaining length
347	&jnz	(&label("tail_loop"));
348
349&set_label("done");
350	&stack_pop(33);
351&set_label("no_data");
352&function_end("ChaCha20_ctr32");
353
354if ($xmm) {
355my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
356my ($out,$inp,$len)=("edi","esi","ecx");
357
# QUARTERROUND_SSSE3(ai, bi, ci, di, i)
#
# SIMD counterpart of QUARTERROUND: emits one quarter-round on xmm registers.
# State rows live in 16-byte slots at 16*index-128 off ebx.
#   ai..di  state indices for a, b, c and d
#   i       position (0..7) of this quarter-round within one loop iteration
#
# Rotations by 16 and 8 are done with pshufb using the masks at 0(eax) and
# 16(eax); rotations by 12 and 7 use a shift-left/shift-right/or sequence
# with the alternate "a" register borrowed as scratch.  Indented instructions
# are interleaved spills and preloads for the neighbouring quarter-rounds.
# The leading paddd/pxor of each quarter-round is emitted "elsewhere": by the
# previous call, or by the loop header for i==0.
358sub QUARTERROUND_SSSE3 {
359my ($ai,$bi,$ci,$di,$i)=@_;
360my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di));	# next
361my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di));	# previous
362
363	#       a   b   c   d
364	#
365	#       0   4   8  12 < even round
366	#       1   5   9  13
367	#       2   6  10  14
368	#       3   7  11  15
369	#       0   5  10  15 < odd round
370	#       1   6  11  12
371	#       2   7   8  13
372	#       3   4   9  14
373
	# Recompute previous/next indices where the index pattern changes
	# between the even and odd rounds.
374	if ($i==0) {
375            my $j=4;
376	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp));
377	} elsif ($i==3) {
378            my $j=0;
379	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn));
380	} elsif ($i==4) {
381            my $j=4;
382	    ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp));
383	} elsif ($i==7) {
384            my $j=0;
385	    ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn));
386	}
387
388	#&paddd	($xa,$xb);			# see elsewhere
389	#&pxor	($xd,$xa);			# see elsewhere
390	 &movdqa(&QWP(16*$cp-128,"ebx"),$xc_)	if ($ai>0 && $ai<3);
391	&pshufb	($xd,&QWP(0,"eax"));		# rot16
392	 &movdqa(&QWP(16*$bp-128,"ebx"),$xb_)	if ($i!=0);
393	&paddd	($xc,$xd);
394	 &movdqa($xc_,&QWP(16*$cn-128,"ebx"))	if ($ai>0 && $ai<3);
395	&pxor	($xb,$xc);
396	 &movdqa($xb_,&QWP(16*$bn-128,"ebx"))	if ($i<7);
397	&movdqa	($xa_,$xb);			# borrow as temporary
398	&pslld	($xb,12);
399	&psrld	($xa_,20);
400	&por	($xb,$xa_);			# b = rotate-left(b, 12)
401	 &movdqa($xa_,&QWP(16*$an-128,"ebx"));
402	&paddd	($xa,$xb);
403	 &movdqa($xd_,&QWP(16*$dn-128,"ebx"))	if ($di!=$dn);
404	&pxor	($xd,$xa);
405	&movdqa	(&QWP(16*$ai-128,"ebx"),$xa);
406	&pshufb	($xd,&QWP(16,"eax"));		# rot8
407	&paddd	($xc,$xd);
408	&movdqa	(&QWP(16*$di-128,"ebx"),$xd)	if ($di!=$dn);
409	&movdqa	($xd_,$xd)			if ($di==$dn);
410	&pxor	($xb,$xc);
411	 &paddd	($xa_,$xb_)			if ($i<7);	# elsewhere
412	&movdqa	($xa,$xb);			# borrow as temporary
413	&pslld	($xb,7);
414	&psrld	($xa,25);
415	 &pxor	($xd_,$xa_)			if ($i<7);	# elsewhere
416	&por	($xb,$xa);			# b = rotate-left(b, 7)
417
	# Swap primary and alternate register names for the next call.
418	($xa,$xa_)=($xa_,$xa);
419	($xb,$xb_)=($xb_,$xb);
420	($xc,$xc_)=($xc_,$xc);
421	($xd,$xd_)=($xd_,$xd);
422}
423
# ChaCha20_ssse3: SSSE3 code path, prologue and 4-blocks-at-a-time loop.
#
# Arguments as read through wparam(): 0 = out, 1 = inp, 2 = len, 3 = key,
# 4 = counter and nonce.
#
# ChaCha20_ctr32 jumps to ssse3_shortcut with eax holding the address of
# pic_point; the lea below depends on that to form the address of ssse3_data.
# NOTE(review): the shortcut label sits after function_begin, so it presumably
# relies on both functions having the same prologue (x86asm.pl, not visible
# here) - confirm.
#
# Frame after alignment (offsets from esp):
#     0..255   working state, 16 rows of 16 bytes, addressed 16*i-128 off ebx
#   256..511   key material in the same layout, addressed 16*i-128 off ebp
#   512        caller's esp;  512+4 key pointer;  512+8 counter pointer
# In the 4x loop each xmm register holds the same state word of four
# consecutive blocks, one block per 32-bit lane.
424&function_begin("ChaCha20_ssse3");
425&set_label("ssse3_shortcut");
426	&mov		($out,&wparam(0));
427	&mov		($inp,&wparam(1));
428	&mov		($len,&wparam(2));
429	&mov		("edx",&wparam(3));		# key
430	&mov		("ebx",&wparam(4));		# counter and nonce
431
432	&mov		("ebp","esp");
433	&stack_push	(131);
434	&and		("esp",-64);			# align frame to 64 bytes
435	&mov		(&DWP(512,"esp"),"ebp");	# remember caller's esp
436
437	&lea		("eax",&DWP(&label("ssse3_data")."-".
438				    &label("pic_point"),"eax"));
439	&movdqu		("xmm3",&QWP(0,"ebx"));		# counter and nonce
440
441if (defined($gasver) && $gasver>=2.17) {		# even though we encode
442							# pshufb manually, we
443							# handle only register
444							# operands, while this
445							# segment uses memory
446							# operand...
447	&cmp		($len,64*4);
448	&jb		(&label("1x"));			# less than 4 blocks: single-block path
449
450	&mov		(&DWP(512+4,"esp"),"edx");	# offload pointers
451	&mov		(&DWP(512+8,"esp"),"ebx");
452	&sub		($len,64*4);			# bias len
453	&lea		("ebp",&DWP(256+128,"esp"));	# size optimization
454
	# Broadcast every counter/nonce and key word into all four lanes of
	# its own register and store it as key material.
455	&movdqu		("xmm7",&QWP(0,"edx"));		# key
456	&pshufd		("xmm0","xmm3",0x00);
457	&pshufd		("xmm1","xmm3",0x55);
458	&pshufd		("xmm2","xmm3",0xaa);
459	&pshufd		("xmm3","xmm3",0xff);
460	 &paddd		("xmm0",&QWP(16*3,"eax"));	# fix counters
461	&pshufd		("xmm4","xmm7",0x00);
462	&pshufd		("xmm5","xmm7",0x55);
463	 &psubd		("xmm0",&QWP(16*4,"eax"));	# pre-subtract 4; outer_loop adds it back
464	&pshufd		("xmm6","xmm7",0xaa);
465	&pshufd		("xmm7","xmm7",0xff);
466	&movdqa		(&QWP(16*12-128,"ebp"),"xmm0");
467	&movdqa		(&QWP(16*13-128,"ebp"),"xmm1");
468	&movdqa		(&QWP(16*14-128,"ebp"),"xmm2");
469	&movdqa		(&QWP(16*15-128,"ebp"),"xmm3");
470	 &movdqu	("xmm3",&QWP(16,"edx"));	# key
471	&movdqa		(&QWP(16*4-128,"ebp"),"xmm4");
472	&movdqa		(&QWP(16*5-128,"ebp"),"xmm5");
473	&movdqa		(&QWP(16*6-128,"ebp"),"xmm6");
474	&movdqa		(&QWP(16*7-128,"ebp"),"xmm7");
475	 &movdqa	("xmm7",&QWP(16*2,"eax"));	# sigma
476	 &lea		("ebx",&DWP(128,"esp"));	# size optimization
477
478	&pshufd		("xmm0","xmm3",0x00);
479	&pshufd		("xmm1","xmm3",0x55);
480	&pshufd		("xmm2","xmm3",0xaa);
481	&pshufd		("xmm3","xmm3",0xff);
482	&pshufd		("xmm4","xmm7",0x00);
483	&pshufd		("xmm5","xmm7",0x55);
484	&pshufd		("xmm6","xmm7",0xaa);
485	&pshufd		("xmm7","xmm7",0xff);
486	&movdqa		(&QWP(16*8-128,"ebp"),"xmm0");
487	&movdqa		(&QWP(16*9-128,"ebp"),"xmm1");
488	&movdqa		(&QWP(16*10-128,"ebp"),"xmm2");
489	&movdqa		(&QWP(16*11-128,"ebp"),"xmm3");
490	&movdqa		(&QWP(16*0-128,"ebp"),"xmm4");
491	&movdqa		(&QWP(16*1-128,"ebp"),"xmm5");
492	&movdqa		(&QWP(16*2-128,"ebp"),"xmm6");
493	&movdqa		(&QWP(16*3-128,"ebp"),"xmm7");
494
495	&lea		($inp,&DWP(128,$inp));		# size optimization
496	&lea		($out,&DWP(128,$out));		# size optimization
497	&jmp		(&label("outer_loop"));
498
	# One pass through outer_loop produces four 64-byte blocks.  Rows 0, 4,
	# 8, 9 and 12 are not copied to the working state here; they are loaded
	# straight into registers for the first quarter-round further down.
499&set_label("outer_loop",16);
500	#&movdqa	("xmm0",&QWP(16*0-128,"ebp"));	# copy key material
501	&movdqa		("xmm1",&QWP(16*1-128,"ebp"));
502	&movdqa		("xmm2",&QWP(16*2-128,"ebp"));
503	&movdqa		("xmm3",&QWP(16*3-128,"ebp"));
504	#&movdqa	("xmm4",&QWP(16*4-128,"ebp"));
505	&movdqa		("xmm5",&QWP(16*5-128,"ebp"));
506	&movdqa		("xmm6",&QWP(16*6-128,"ebp"));
507	&movdqa		("xmm7",&QWP(16*7-128,"ebp"));
508	#&movdqa	(&QWP(16*0-128,"ebx"),"xmm0");
509	&movdqa		(&QWP(16*1-128,"ebx"),"xmm1");
510	&movdqa		(&QWP(16*2-128,"ebx"),"xmm2");
511	&movdqa		(&QWP(16*3-128,"ebx"),"xmm3");
512	#&movdqa	(&QWP(16*4-128,"ebx"),"xmm4");
513	&movdqa		(&QWP(16*5-128,"ebx"),"xmm5");
514	&movdqa		(&QWP(16*6-128,"ebx"),"xmm6");
515	&movdqa		(&QWP(16*7-128,"ebx"),"xmm7");
516	#&movdqa	("xmm0",&QWP(16*8-128,"ebp"));
517	#&movdqa	("xmm1",&QWP(16*9-128,"ebp"));
518	&movdqa		("xmm2",&QWP(16*10-128,"ebp"));
519	&movdqa		("xmm3",&QWP(16*11-128,"ebp"));
520	&movdqa		("xmm4",&QWP(16*12-128,"ebp"));
521	&movdqa		("xmm5",&QWP(16*13-128,"ebp"));
522	&movdqa		("xmm6",&QWP(16*14-128,"ebp"));
523	&movdqa		("xmm7",&QWP(16*15-128,"ebp"));
524	&paddd		("xmm4",&QWP(16*4,"eax"));	# counter value
525	#&movdqa	(&QWP(16*8-128,"ebx"),"xmm0");
526	#&movdqa	(&QWP(16*9-128,"ebx"),"xmm1");
527	&movdqa		(&QWP(16*10-128,"ebx"),"xmm2");
528	&movdqa		(&QWP(16*11-128,"ebx"),"xmm3");
529	&movdqa		(&QWP(16*12-128,"ebx"),"xmm4");
530	&movdqa		(&QWP(16*13-128,"ebx"),"xmm5");
531	&movdqa		(&QWP(16*14-128,"ebx"),"xmm6");
532	&movdqa		(&QWP(16*15-128,"ebx"),"xmm7");
533	&movdqa		(&QWP(16*12-128,"ebp"),"xmm4");	# save counter value
534
535	&movdqa		($xa, &QWP(16*0-128,"ebp"));
536	&movdqa		($xd, "xmm4");
537	&movdqa		($xb_,&QWP(16*4-128,"ebp"));
538	&movdqa		($xc, &QWP(16*8-128,"ebp"));
539	&movdqa		($xc_,&QWP(16*9-128,"ebp"));
540
541	&mov		("edx",10);			# loop counter
542	&nop		();
543
	# 10 iterations, each emitting 8 quarter-rounds.
544&set_label("loop",16);
545	&paddd		($xa,$xb_);			# elsewhere
546	&movdqa		($xb,$xb_);
547	&pxor		($xd,$xa);			# elsewhere
548	&QUARTERROUND_SSSE3(0, 4, 8, 12, 0);
549	&QUARTERROUND_SSSE3(1, 5, 9, 13, 1);
550	&QUARTERROUND_SSSE3(2, 6,10, 14, 2);
551	&QUARTERROUND_SSSE3(3, 7,11, 15, 3);
552	&QUARTERROUND_SSSE3(0, 5,10, 15, 4);
553	&QUARTERROUND_SSSE3(1, 6,11, 12, 5);
554	&QUARTERROUND_SSSE3(2, 7, 8, 13, 6);
555	&QUARTERROUND_SSSE3(3, 4, 9, 14, 7);
556	&dec		("edx");
557	&jnz		(&label("loop"));
558
	# Flush the rows that are still only in registers.
559	&movdqa		(&QWP(16*4-128,"ebx"),$xb_);
560	&movdqa		(&QWP(16*8-128,"ebx"),$xc);
561	&movdqa		(&QWP(16*9-128,"ebx"),$xc_);
562	&movdqa		(&QWP(16*12-128,"ebx"),$xd);
563	&movdqa		(&QWP(16*14-128,"ebx"),$xd_);
564
565    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));
566
567	#&movdqa	($xa0,&QWP(16*0-128,"ebx"));	# it's there
568	&movdqa		($xa1,&QWP(16*1-128,"ebx"));
569	&movdqa		($xa2,&QWP(16*2-128,"ebx"));
570	&movdqa		($xa3,&QWP(16*3-128,"ebx"));
571
	# Unrolled at generation time: each pass handles four state rows
	# ($i is the byte offset of the first one).  It adds the key material,
	# transposes the 4x4 dwords so that each register holds 16 consecutive
	# bytes of one block, and xors them with the input at the same 16-byte
	# position in each of the four 64-byte blocks.
572    for($i=0;$i<256;$i+=64) {
573	&paddd		($xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
574	&paddd		($xa1,&QWP($i+16*1-128,"ebp"));
575	&paddd		($xa2,&QWP($i+16*2-128,"ebp"));
576	&paddd		($xa3,&QWP($i+16*3-128,"ebp"));
577
578	&movdqa		($xt2,$xa0);		# "de-interlace" data
579	&punpckldq	($xa0,$xa1);
580	&movdqa		($xt3,$xa2);
581	&punpckldq	($xa2,$xa3);
582	&punpckhdq	($xt2,$xa1);
583	&punpckhdq	($xt3,$xa3);
584	&movdqa		($xa1,$xa0);
585	&punpcklqdq	($xa0,$xa2);		# "a0"
586	&movdqa		($xa3,$xt2);
587	&punpcklqdq	($xt2,$xt3);		# "a2"
588	&punpckhqdq	($xa1,$xa2);		# "a1"
589	&punpckhqdq	($xa3,$xt3);		# "a3"
590
591	#($xa2,$xt2)=($xt2,$xa2);
592
593	&movdqu		($xt0,&QWP(64*0-128,$inp));	# load input
594	&movdqu		($xt1,&QWP(64*1-128,$inp));
595	&movdqu		($xa2,&QWP(64*2-128,$inp));
596	&movdqu		($xt3,&QWP(64*3-128,$inp));
597	&lea		($inp,&QWP($i<192?16:(64*4-16*3),$inp));	# +16 per pass; last pass completes +256
598	&pxor		($xt0,$xa0);
599	&movdqa		($xa0,&QWP($i+16*4-128,"ebx"))	if ($i<192);
600	&pxor		($xt1,$xa1);
601	&movdqa		($xa1,&QWP($i+16*5-128,"ebx"))	if ($i<192);
602	&pxor		($xt2,$xa2);
603	&movdqa		($xa2,&QWP($i+16*6-128,"ebx"))	if ($i<192);
604	&pxor		($xt3,$xa3);
605	&movdqa		($xa3,&QWP($i+16*7-128,"ebx"))	if ($i<192);
606	&movdqu		(&QWP(64*0-128,$out),$xt0);	# store output
607	&movdqu		(&QWP(64*1-128,$out),$xt1);
608	&movdqu		(&QWP(64*2-128,$out),$xt2);
609	&movdqu		(&QWP(64*3-128,$out),$xt3);
610	&lea		($out,&QWP($i<192?16:(64*4-16*3),$out));	# same stepping as $inp
611    }
612	&sub		($len,64*4);
613	&jnc		(&label("outer_loop"));
614
615	&add		($len,64*4);			# undo the bias
616	&jz		(&label("done"));
617
	# Between 1 and 255 bytes remain: restore the unbiased pointers and
	# rebuild the counter/nonce vector in xmm3 for the single-block path.
618	&mov		("ebx",&DWP(512+8,"esp"));	# restore pointers
619	&lea		($inp,&DWP(-128,$inp));
620	&mov		("edx",&DWP(512+4,"esp"));
621	&lea		($out,&DWP(-128,$out));
622
623	&movd		("xmm2",&DWP(16*12-128,"ebp"));	# counter value
624	&movdqu		("xmm3",&QWP(0,"ebx"));
625	&paddd		("xmm2",&QWP(16*6,"eax"));	# +four
626	&pand		("xmm3",&QWP(16*7,"eax"));	# clear the original counter lane
627	&por		("xmm3","xmm2");		# counter value
628}
629{
630my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));
631
# SSSE3ROUND()
#
# Emits one full ChaCha round for the single-block path.  $a,$b,$c,$d each
# hold one whole row of the state, so the four quarter-rounds of the round
# run in parallel across the register lanes.  Takes no arguments; operates on
# the enclosing $a,$b,$c,$d with $t as scratch and $rot16/$rot24 as
# preloaded pshufb masks.
632sub SSSE3ROUND {	# critical path is 20 "SIMD ticks" per round
633	&paddd		($a,$b);
634	&pxor		($d,$a);
635	&pshufb		($d,$rot16);
636
637	&paddd		($c,$d);
638	&pxor		($b,$c);
639	&movdqa		($t,$b);
640	&psrld		($b,20);
641	&pslld		($t,12);
642	&por		($b,$t);		# b = rotate-left(b, 12)
643
644	&paddd		($a,$b);
645	&pxor		($d,$a);
646	&pshufb		($d,$rot24);
647
648	&paddd		($c,$d);
649	&pxor		($b,$c);
650	&movdqa		($t,$b);
651	&psrld		($b,25);
652	&pslld		($t,7);
653	&por		($b,$t);		# b = rotate-left(b, 7)
654}
655
# Single-block SSSE3 path: one 64-byte block per pass, used when fewer than
# four blocks are requested or left over.  On entry xmm3 ($d) already holds
# the counter/nonce row, edx points at the key and eax at ssse3_data.
# The initial state is kept at 0..63(esp) so it can be added back after the
# rounds.
656&set_label("1x");
657	&movdqa		($a,&QWP(16*2,"eax"));		# sigma
658	&movdqu		($b,&QWP(0,"edx"));
659	&movdqu		($c,&QWP(16,"edx"));
660	#&movdqu	($d,&QWP(0,"ebx"));		# already loaded
661	&movdqa		($rot16,&QWP(0,"eax"));
662	&movdqa		($rot24,&QWP(16,"eax"));
663	&mov		(&DWP(16*3,"esp"),"ebp");	# NOTE(review): appears to be overwritten by the movdqa of $d below - confirm
664
665	&movdqa		(&QWP(16*0,"esp"),$a);
666	&movdqa		(&QWP(16*1,"esp"),$b);
667	&movdqa		(&QWP(16*2,"esp"),$c);
668	&movdqa		(&QWP(16*3,"esp"),$d);
669	&mov		("edx",10);			# loop counter
670	&jmp		(&label("loop1x"));
671
	# Subsequent blocks: reload the saved state and bump the counter by one.
672&set_label("outer1x",16);
673	&movdqa		($d,&QWP(16*5,"eax"));		# one
674	&movdqa		($a,&QWP(16*0,"esp"));
675	&movdqa		($b,&QWP(16*1,"esp"));
676	&movdqa		($c,&QWP(16*2,"esp"));
677	&paddd		($d,&QWP(16*3,"esp"));
678	&mov		("edx",10);			# loop counter
679	&movdqa		(&QWP(16*3,"esp"),$d);
680	&jmp		(&label("loop1x"));
681
	# 10 iterations of two rounds.  The pshufd triples rotate the lanes of
	# rows b, c and d between the two rounds and rotate them back afterwards.
682&set_label("loop1x",16);
683	&SSSE3ROUND();
684	&pshufd	($c,$c,0b01001110);
685	&pshufd	($b,$b,0b00111001);
686	&pshufd	($d,$d,0b10010011);
687	&nop	();
688
689	&SSSE3ROUND();
690	&pshufd	($c,$c,0b01001110);
691	&pshufd	($b,$b,0b10010011);
692	&pshufd	($d,$d,0b00111001);
693
694	&dec		("edx");
695	&jnz		(&label("loop1x"));
696
697	&paddd		($a,&QWP(16*0,"esp"));		# add the saved input state
698	&paddd		($b,&QWP(16*1,"esp"));
699	&paddd		($c,&QWP(16*2,"esp"));
700	&paddd		($d,&QWP(16*3,"esp"));
701
702	&cmp		($len,64);
703	&jb		(&label("tail"));		# fewer than 64 bytes left
704
705	&movdqu		($t,&QWP(16*0,$inp));
706	&movdqu		($t1,&QWP(16*1,$inp));
707	&pxor		($a,$t);		# xor with input
708	&movdqu		($t,&QWP(16*2,$inp));
709	&pxor		($b,$t1);
710	&movdqu		($t1,&QWP(16*3,$inp));
711	&pxor		($c,$t);
712	&pxor		($d,$t1);
713	&lea		($inp,&DWP(16*4,$inp));	# inp+=64
714
715	&movdqu		(&QWP(16*0,$out),$a);	# write output
716	&movdqu		(&QWP(16*1,$out),$b);
717	&movdqu		(&QWP(16*2,$out),$c);
718	&movdqu		(&QWP(16*3,$out),$d);
719	&lea		($out,&DWP(16*4,$out));	# out+=64
720
721	&sub		($len,64);
722	&jnz		(&label("outer1x"));
723
724	&jmp		(&label("done"));
725
	# Partial final block: store the finished block over the saved state
	# at 0..63(esp) and xor it with the input one byte at a time.
726&set_label("tail");
727	&movdqa		(&QWP(16*0,"esp"),$a);
728	&movdqa		(&QWP(16*1,"esp"),$b);
729	&movdqa		(&QWP(16*2,"esp"),$c);
730	&movdqa		(&QWP(16*3,"esp"),$d);
731
732	&xor		("eax","eax");
733	&xor		("edx","edx");
734	&xor		("ebp","ebp");			# byte index = 0
735
736&set_label("tail_loop");
737	&movb		("al",&BP(0,"esp","ebp"));	# byte of the stored block
738	&movb		("dl",&BP(0,$inp,"ebp"));	# input byte
739	&lea		("ebp",&DWP(1,"ebp"));		# index++ (lea leaves flags alone)
740	&xor		("al","dl");
741	&movb		(&BP(-1,$out,"ebp"),"al");	# -1 compensates for the index++ above
742	&dec		($len);
743	&jnz		(&label("tail_loop"));
744}
# Common exit of ChaCha20_ssse3: restore the caller's esp that the prologue
# stored at 512(esp).
745&set_label("done");
746	&mov		("esp",&DWP(512,"esp"));
747&function_end("ChaCha20_ssse3");
748
# Constant table for the SSSE3 path, addressed through eax in 16-byte rows:
#   16*0  pshufb mask, used as "rot16"
#   16*1  pshufb mask, used as "rot8" (named $rot24 in the single-block path)
#   16*2  sigma, the four ChaCha constants
#   16*3  (0,1,2,3)     added to the broadcast counter ("fix counters")
#   16*4  (4,4,4,4)     counter step for each pass of the 4x loop
#   16*5  (1,0,0,0)     "one", counter step of the single-block path
#   16*6  (4,0,0,0)     "+four", applied when leaving the 4x loop
#   16*7  (0,-1,-1,-1)  mask that clears the counter lane
749&align	(64);
750&set_label("ssse3_data");
751&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);
752&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);
753&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);
754&data_word(0,1,2,3);
755&data_word(4,4,4,4);
756&data_word(1,0,0,0);
757&data_word(4,0,0,0);
758&data_word(0,-1,-1,-1);
759&align	(64);
760}
# Script epilogue: emit the identification string, let the assembler back end
# flush everything it has collected, and close the output file.
761&asciz	("ChaCha20 for x86, CRYPTOGAMS by <appro\@openssl.org>");
762
763&asm_finish();
764
# Buffered write errors (disk full, I/O error) only surface when the handle
# is closed, so the result must be checked; otherwise a truncated assembly
# file would be reported to the build as success.
765close STDOUT or die "error closing STDOUT: $!";
766