# ChaCha20 for x86 (32-bit), AT&T/GAS syntax.
# perlasm-generated OpenSSL code (chacha-x86.pl output); do not edit by hand —
# regenerate from the perl source instead.
# (This copy was recovered from a web source viewer; the viewer's navigation
# chrome and per-line numbering have been removed.)
.text
# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#                     size_t len, const unsigned int key[8],
#                     const unsigned int counter[4]);
# cdecl, 32-bit x86.  After the four register pushes the arguments sit at
#   20(%esp)=out  24(%esp)=inp  28(%esp)=len  32(%esp)=key  36(%esp)=counter
# Integer-only (pre-SSE) implementation; probes OPENSSL_ia32cap_P and jumps
# to the SSSE3/XOP code further down when the CPU supports it.
.globl	_ChaCha20_ctr32
.type	_ChaCha20_ctr32,@function
.align	4
_ChaCha20_ctr32:
L_ChaCha20_ctr32_begin:
#ifdef __CET__

.byte	243,15,30,251
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	xorl	%eax,%eax
	cmpl	28(%esp),%eax			# len == 0?
	je	L000no_data
	call	Lpic_point			# classic i386 PIC: pop EIP
Lpic_point:
	popl	%eax
	leal	__GLOBAL_OFFSET_TABLE_+[.-Lpic_point](%eax),%ebp
	movl	_OPENSSL_ia32cap_P@GOT(%ebp),%ebp
	testl	$16777216,(%ebp)		# bit 24: FXSR (OS XMM support)
	jz	L001x86
	testl	$512,4(%ebp)			# bit 9 of word[1]: SSSE3
	jz	L001x86
	jmp	Lssse3_shortcut			# %eax (=Lpic_point) and %ebp stay live
L001x86:
	movl	32(%esp),%esi			# %esi = key
	movl	36(%esp),%edi			# %edi = counter
	subl	$132,%esp
	# Copy key words 0..7 to 80..108(%esp).
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%ecx
	movl	12(%esi),%edx
	movl	%eax,80(%esp)
	movl	%ebx,84(%esp)
	movl	%ecx,88(%esp)
	movl	%edx,92(%esp)
	movl	16(%esi),%eax
	movl	20(%esi),%ebx
	movl	24(%esi),%ecx
	movl	28(%esi),%edx
	movl	%eax,96(%esp)
	movl	%ebx,100(%esp)
	movl	%ecx,104(%esp)
	movl	%edx,108(%esp)
	# Copy counter words to 112..124(%esp); word 0 is pre-decremented
	# because every block entry re-increments it.
	movl	(%edi),%eax
	movl	4(%edi),%ebx
	movl	8(%edi),%ecx
	movl	12(%edi),%edx
	subl	$1,%eax
	movl	%eax,112(%esp)
	movl	%ebx,116(%esp)
	movl	%ecx,120(%esp)
	movl	%edx,124(%esp)
	jmp	L002entry
.align	4,0x90
L003outer_loop:
	movl	%ebx,156(%esp)			# save inp
	movl	%eax,152(%esp)			# save out
	movl	%ecx,160(%esp)			# save remaining len
L002entry:
	# Build the 16-word state at 0..60(%esp): "expand 32-byte k"
	# constants, key, counter (word 0 incremented), nonce.
	movl	$1634760805,%eax
	movl	$857760878,4(%esp)
	movl	$2036477234,8(%esp)
	movl	$1797285236,12(%esp)
	movl	84(%esp),%ebx
	movl	88(%esp),%ebp
	movl	104(%esp),%ecx
	movl	108(%esp),%esi
	movl	116(%esp),%edx
	movl	120(%esp),%edi
	movl	%ebx,20(%esp)
	movl	%ebp,24(%esp)
	movl	%ecx,40(%esp)
	movl	%esi,44(%esp)
	movl	%edx,52(%esp)
	movl	%edi,56(%esp)
	movl	92(%esp),%ebx
	movl	124(%esp),%edi
	movl	112(%esp),%edx
	movl	80(%esp),%ebp
	movl	96(%esp),%ecx
	movl	100(%esp),%esi
	addl	$1,%edx				# ++counter[0] for this block
	movl	%ebx,28(%esp)
	movl	%edi,60(%esp)
	movl	%edx,112(%esp)
	movl	$10,%ebx			# 10 double rounds = 20 rounds
	jmp	L004loop
.align	4,0x90
L004loop:
	# One ChaCha double round; live words rotate through registers,
	# the rest of the state lives at 0..60(%esp).  Loop counter is
	# parked at 128(%esp) so %ebx can be used as scratch.
	addl	%ebp,%eax
	movl	%ebx,128(%esp)
	movl	%ebp,%ebx
	xorl	%eax,%edx
	roll	$16,%edx
	addl	%edx,%ecx
	xorl	%ecx,%ebx
	movl	52(%esp),%edi
	roll	$12,%ebx
	movl	20(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,(%esp)
	roll	$8,%edx
	movl	4(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,48(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	movl	%ecx,32(%esp)
	roll	$16,%edi
	movl	%ebx,16(%esp)
	addl	%edi,%esi
	movl	40(%esp),%ecx
	xorl	%esi,%ebp
	movl	56(%esp),%edx
	roll	$12,%ebp
	movl	24(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,4(%esp)
	roll	$8,%edi
	movl	8(%esp),%eax
	addl	%edi,%esi
	movl	%edi,52(%esp)
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	movl	%esi,36(%esp)
	roll	$16,%edx
	movl	%ebp,20(%esp)
	addl	%edx,%ecx
	movl	44(%esp),%esi
	xorl	%ecx,%ebx
	movl	60(%esp),%edi
	roll	$12,%ebx
	movl	28(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,8(%esp)
	roll	$8,%edx
	movl	12(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,56(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	roll	$16,%edi
	movl	%ebx,24(%esp)
	addl	%edi,%esi
	xorl	%esi,%ebp
	roll	$12,%ebp
	movl	20(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,12(%esp)
	roll	$8,%edi
	movl	(%esp),%eax
	addl	%edi,%esi
	movl	%edi,%edx
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	# Diagonal half of the double round.
	xorl	%eax,%edx
	roll	$16,%edx
	movl	%ebp,28(%esp)
	addl	%edx,%ecx
	xorl	%ecx,%ebx
	movl	48(%esp),%edi
	roll	$12,%ebx
	movl	24(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,(%esp)
	roll	$8,%edx
	movl	4(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,60(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	movl	%ecx,40(%esp)
	roll	$16,%edi
	movl	%ebx,20(%esp)
	addl	%edi,%esi
	movl	32(%esp),%ecx
	xorl	%esi,%ebp
	movl	52(%esp),%edx
	roll	$12,%ebp
	movl	28(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,4(%esp)
	roll	$8,%edi
	movl	8(%esp),%eax
	addl	%edi,%esi
	movl	%edi,48(%esp)
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	movl	%esi,44(%esp)
	roll	$16,%edx
	movl	%ebp,24(%esp)
	addl	%edx,%ecx
	movl	36(%esp),%esi
	xorl	%ecx,%ebx
	movl	56(%esp),%edi
	roll	$12,%ebx
	movl	16(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,8(%esp)
	roll	$8,%edx
	movl	12(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,52(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	roll	$16,%edi
	movl	%ebx,28(%esp)
	addl	%edi,%esi
	xorl	%esi,%ebp
	movl	48(%esp),%edx
	roll	$12,%ebp
	movl	128(%esp),%ebx			# reload round counter
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,12(%esp)
	roll	$8,%edi
	movl	(%esp),%eax
	addl	%edi,%esi
	movl	%edi,56(%esp)
	xorl	%esi,%ebp
	roll	$7,%ebp
	decl	%ebx
	jnz	L004loop
	# Add the input state back in; full 64-byte block vs. tail split.
	movl	160(%esp),%ebx			# remaining len
	addl	$1634760805,%eax
	addl	80(%esp),%ebp
	addl	96(%esp),%ecx
	addl	100(%esp),%esi
	cmpl	$64,%ebx
	jb	L005tail
	movl	156(%esp),%ebx			# %ebx = inp
	addl	112(%esp),%edx
	addl	120(%esp),%edi
	xorl	(%ebx),%eax			# keystream ^ input, word by word
	xorl	16(%ebx),%ebp
	movl	%eax,(%esp)
	movl	152(%esp),%eax			# %eax = out
	xorl	32(%ebx),%ecx
	xorl	36(%ebx),%esi
	xorl	48(%ebx),%edx
	xorl	56(%ebx),%edi
	movl	%ebp,16(%eax)
	movl	%ecx,32(%eax)
	movl	%esi,36(%eax)
	movl	%edx,48(%eax)
	movl	%edi,56(%eax)
	movl	4(%esp),%ebp
	movl	8(%esp),%ecx
	movl	12(%esp),%esi
	movl	20(%esp),%edx
	movl	24(%esp),%edi
	addl	$857760878,%ebp
	addl	$2036477234,%ecx
	addl	$1797285236,%esi
	addl	84(%esp),%edx
	addl	88(%esp),%edi
	xorl	4(%ebx),%ebp
	xorl	8(%ebx),%ecx
	xorl	12(%ebx),%esi
	xorl	20(%ebx),%edx
	xorl	24(%ebx),%edi
	movl	%ebp,4(%eax)
	movl	%ecx,8(%eax)
	movl	%esi,12(%eax)
	movl	%edx,20(%eax)
	movl	%edi,24(%eax)
	movl	28(%esp),%ebp
	movl	40(%esp),%ecx
	movl	44(%esp),%esi
	movl	52(%esp),%edx
	movl	60(%esp),%edi
	addl	92(%esp),%ebp
	addl	104(%esp),%ecx
	addl	108(%esp),%esi
	addl	116(%esp),%edx
	addl	124(%esp),%edi
	xorl	28(%ebx),%ebp
	xorl	40(%ebx),%ecx
	xorl	44(%ebx),%esi
	xorl	52(%ebx),%edx
	xorl	60(%ebx),%edi
	leal	64(%ebx),%ebx			# inp += 64
	movl	%ebp,28(%eax)
	movl	(%esp),%ebp
	movl	%ecx,40(%eax)
	movl	160(%esp),%ecx
	movl	%esi,44(%eax)
	movl	%edx,52(%eax)
	movl	%edi,60(%eax)
	movl	%ebp,(%eax)
	leal	64(%eax),%eax			# out += 64
	subl	$64,%ecx
	jnz	L003outer_loop
	jmp	L006done
L005tail:
	# Partial (<64 byte) final block: materialize the whole keystream
	# block at 0..60(%esp), then XOR byte by byte.
	addl	112(%esp),%edx
	addl	120(%esp),%edi
	movl	%eax,(%esp)
	movl	%ebp,16(%esp)
	movl	%ecx,32(%esp)
	movl	%esi,36(%esp)
	movl	%edx,48(%esp)
	movl	%edi,56(%esp)
	movl	4(%esp),%ebp
	movl	8(%esp),%ecx
	movl	12(%esp),%esi
	movl	20(%esp),%edx
	movl	24(%esp),%edi
	addl	$857760878,%ebp
	addl	$2036477234,%ecx
	addl	$1797285236,%esi
	addl	84(%esp),%edx
	addl	88(%esp),%edi
	movl	%ebp,4(%esp)
	movl	%ecx,8(%esp)
	movl	%esi,12(%esp)
	movl	%edx,20(%esp)
	movl	%edi,24(%esp)
	movl	28(%esp),%ebp
	movl	40(%esp),%ecx
	movl	44(%esp),%esi
	movl	52(%esp),%edx
	movl	60(%esp),%edi
	addl	92(%esp),%ebp
	addl	104(%esp),%ecx
	addl	108(%esp),%esi
	addl	116(%esp),%edx
	addl	124(%esp),%edi
	movl	%ebp,28(%esp)
	movl	156(%esp),%ebp			# %ebp = inp
	movl	%ecx,40(%esp)
	movl	152(%esp),%ecx			# %ecx = out
	movl	%esi,44(%esp)
	xorl	%esi,%esi			# byte index
	movl	%edx,52(%esp)
	movl	%edi,60(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
L007tail_loop:
	movb	(%esi,%ebp,1),%al		# input byte
	movb	(%esp,%esi,1),%dl		# keystream byte
	leal	1(%esi),%esi			# lea: advance without touching flags
	xorb	%dl,%al
	movb	%al,-1(%ecx,%esi,1)
	decl	%ebx
	jnz	L007tail_loop
L006done:
	addl	$132,%esp
L000no_data:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
# SSSE3 implementation: processes four 64-byte blocks in parallel
# (transposed state, one 4-lane xmm register per state word), with a
# single-block path for the remainder.
# NOTE(review): this entry is designed to be reached via Lssse3_shortcut
# from ChaCha20_ctr32, which leaves %eax = Lpic_point and
# %ebp = &OPENSSL_ia32cap_P; both are consumed below before being reset.
.globl	_ChaCha20_ssse3
.type	_ChaCha20_ssse3,@function
.align	4
_ChaCha20_ssse3:
L_ChaCha20_ssse3_begin:
#ifdef __CET__

.byte	243,15,30,251
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
Lssse3_shortcut:
	testl	$2048,4(%ebp)			# bit 11 of word[1]: XOP
	jnz	Lxop_shortcut
	movl	20(%esp),%edi			# out
	movl	24(%esp),%esi			# inp
	movl	28(%esp),%ecx			# len
	movl	32(%esp),%edx			# key
	movl	36(%esp),%ebx			# counter
	movl	%esp,%ebp
	subl	$524,%esp
	andl	$-64,%esp			# 64-byte align the frame
	movl	%ebp,512(%esp)			# save original %esp
	leal	Lssse3_data-Lpic_point(%eax),%eax	# %eax -> constant pool
	movdqu	(%ebx),%xmm3			# counter||nonce
	cmpl	$256,%ecx
	jb	L0081x				# <4 blocks: single-block path
	movl	%edx,516(%esp)			# save key ptr
	movl	%ebx,520(%esp)			# save counter ptr
	subl	$256,%ecx
	leal	384(%esp),%ebp			# %ebp -> splatted input state
	movdqu	(%edx),%xmm7
	# Splat each 32-bit state word across all four lanes.
	pshufd	$0,%xmm3,%xmm0
	pshufd	$85,%xmm3,%xmm1
	pshufd	$170,%xmm3,%xmm2
	pshufd	$255,%xmm3,%xmm3
	paddd	48(%eax),%xmm0			# counters += {0,1,2,3}
	pshufd	$0,%xmm7,%xmm4
	pshufd	$85,%xmm7,%xmm5
	psubd	64(%eax),%xmm0			# -= {4,4,4,4}: pre-bias, outer loop re-adds
	pshufd	$170,%xmm7,%xmm6
	pshufd	$255,%xmm7,%xmm7
	movdqa	%xmm0,64(%ebp)
	movdqa	%xmm1,80(%ebp)
	movdqa	%xmm2,96(%ebp)
	movdqa	%xmm3,112(%ebp)
	movdqu	16(%edx),%xmm3
	movdqa	%xmm4,-64(%ebp)
	movdqa	%xmm5,-48(%ebp)
	movdqa	%xmm6,-32(%ebp)
	movdqa	%xmm7,-16(%ebp)
	movdqa	32(%eax),%xmm7			# sigma constants
	leal	128(%esp),%ebx			# %ebx -> working state
	pshufd	$0,%xmm3,%xmm0
	pshufd	$85,%xmm3,%xmm1
	pshufd	$170,%xmm3,%xmm2
	pshufd	$255,%xmm3,%xmm3
	pshufd	$0,%xmm7,%xmm4
	pshufd	$85,%xmm7,%xmm5
	pshufd	$170,%xmm7,%xmm6
	pshufd	$255,%xmm7,%xmm7
	movdqa	%xmm0,(%ebp)
	movdqa	%xmm1,16(%ebp)
	movdqa	%xmm2,32(%ebp)
	movdqa	%xmm3,48(%ebp)
	movdqa	%xmm4,-128(%ebp)
	movdqa	%xmm5,-112(%ebp)
	movdqa	%xmm6,-96(%ebp)
	movdqa	%xmm7,-80(%ebp)
	leal	128(%esi),%esi			# bias pointers so disp8 covers 256B
	leal	128(%edi),%edi
	jmp	L009outer_loop
.align	4,0x90
L009outer_loop:
	# Refresh working copy of the state at %ebx from the master at %ebp.
	movdqa	-112(%ebp),%xmm1
	movdqa	-96(%ebp),%xmm2
	movdqa	-80(%ebp),%xmm3
	movdqa	-48(%ebp),%xmm5
	movdqa	-32(%ebp),%xmm6
	movdqa	-16(%ebp),%xmm7
	movdqa	%xmm1,-112(%ebx)
	movdqa	%xmm2,-96(%ebx)
	movdqa	%xmm3,-80(%ebx)
	movdqa	%xmm5,-48(%ebx)
	movdqa	%xmm6,-32(%ebx)
	movdqa	%xmm7,-16(%ebx)
	movdqa	32(%ebp),%xmm2
	movdqa	48(%ebp),%xmm3
	movdqa	64(%ebp),%xmm4
	movdqa	80(%ebp),%xmm5
	movdqa	96(%ebp),%xmm6
	movdqa	112(%ebp),%xmm7
	paddd	64(%eax),%xmm4			# counters += 4 each iteration
	movdqa	%xmm2,32(%ebx)
	movdqa	%xmm3,48(%ebx)
	movdqa	%xmm4,64(%ebx)
	movdqa	%xmm5,80(%ebx)
	movdqa	%xmm6,96(%ebx)
	movdqa	%xmm7,112(%ebx)
	movdqa	%xmm4,64(%ebp)
	movdqa	-128(%ebp),%xmm0
	movdqa	%xmm4,%xmm6
	movdqa	-64(%ebp),%xmm3
	movdqa	(%ebp),%xmm4
	movdqa	16(%ebp),%xmm5
	movl	$10,%edx			# 10 double rounds
	nop
.align	4,0x90
L010loop:
	# Quarter rounds on 4 lanes.  rot16/rot8 are done with pshufb and
	# the byte-shuffle masks at (%eax)/16(%eax); rot12/rot7 with
	# pslld/psrld/por pairs.
	paddd	%xmm3,%xmm0
	movdqa	%xmm3,%xmm2
	pxor	%xmm0,%xmm6
	pshufb	(%eax),%xmm6			# <<< 16
	paddd	%xmm6,%xmm4
	pxor	%xmm4,%xmm2
	movdqa	-48(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2			# <<< 12
	movdqa	-112(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	80(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-128(%ebx)
	pshufb	16(%eax),%xmm6			# <<< 8
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,64(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2			# <<< 7
	movdqa	%xmm4,(%ebx)
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-64(%ebx)
	paddd	%xmm7,%xmm5
	movdqa	32(%ebx),%xmm4
	pxor	%xmm5,%xmm3
	movdqa	-32(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-96(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	96(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-112(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,80(%ebx)
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	movdqa	%xmm5,16(%ebx)
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-48(%ebx)
	paddd	%xmm6,%xmm4
	movdqa	48(%ebx),%xmm5
	pxor	%xmm4,%xmm2
	movdqa	-16(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-80(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	112(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-96(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,96(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-32(%ebx)
	paddd	%xmm7,%xmm5
	pxor	%xmm5,%xmm3
	movdqa	-48(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-128(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-80(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,%xmm6
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-16(%ebx)
	paddd	%xmm6,%xmm4
	pxor	%xmm4,%xmm2
	movdqa	-32(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-112(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	64(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-128(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,112(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	movdqa	%xmm4,32(%ebx)
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-48(%ebx)
	paddd	%xmm7,%xmm5
	movdqa	(%ebx),%xmm4
	pxor	%xmm5,%xmm3
	movdqa	-16(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-96(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	80(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-112(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,64(%ebx)
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	movdqa	%xmm5,48(%ebx)
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-32(%ebx)
	paddd	%xmm6,%xmm4
	movdqa	16(%ebx),%xmm5
	pxor	%xmm4,%xmm2
	movdqa	-64(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-80(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	96(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-96(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,80(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-16(%ebx)
	paddd	%xmm7,%xmm5
	pxor	%xmm5,%xmm3
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-128(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	64(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-80(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,96(%ebx)
	pxor	%xmm5,%xmm3
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	por	%xmm1,%xmm3
	decl	%edx
	jnz	L010loop
	# Flush the registers still holding state, then for each group of
	# four words: add input state, 4x4 dword transpose, XOR with input
	# text, store.  Output blocks are interleaved at stride 64.
	movdqa	%xmm3,-64(%ebx)
	movdqa	%xmm4,(%ebx)
	movdqa	%xmm5,16(%ebx)
	movdqa	%xmm6,64(%ebx)
	movdqa	%xmm7,96(%ebx)
	movdqa	-112(%ebx),%xmm1
	movdqa	-96(%ebx),%xmm2
	movdqa	-80(%ebx),%xmm3
	paddd	-128(%ebp),%xmm0
	paddd	-112(%ebp),%xmm1
	paddd	-96(%ebp),%xmm2
	paddd	-80(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	-64(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	-48(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	-32(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	-16(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	-64(%ebp),%xmm0
	paddd	-48(%ebp),%xmm1
	paddd	-32(%ebp),%xmm2
	paddd	-16(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	16(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	32(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	48(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	(%ebp),%xmm0
	paddd	16(%ebp),%xmm1
	paddd	32(%ebp),%xmm2
	paddd	48(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	64(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	80(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	96(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	112(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	64(%ebp),%xmm0
	paddd	80(%ebp),%xmm1
	paddd	96(%ebp),%xmm2
	paddd	112(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	208(%esi),%esi			# 208+48 = advance 256 bytes total
	pxor	%xmm0,%xmm4
	pxor	%xmm1,%xmm5
	pxor	%xmm2,%xmm6
	pxor	%xmm3,%xmm7
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	208(%edi),%edi
	subl	$256,%ecx
	jnc	L009outer_loop
	addl	$256,%ecx
	jz	L011done
	# <256 bytes left: fall through to the single-block path with a
	# counter rebuilt from lane 0 of the 4-way counters + original nonce.
	movl	520(%esp),%ebx
	leal	-128(%esi),%esi			# undo pointer bias
	movl	516(%esp),%edx
	leal	-128(%edi),%edi
	movd	64(%ebp),%xmm2
	movdqu	(%ebx),%xmm3
	paddd	96(%eax),%xmm2			# counter += {1,0,0,0}
	pand	112(%eax),%xmm3			# keep nonce words only
	por	%xmm2,%xmm3
L0081x:
	# One block at a time; state rows in xmm0..xmm3 at 0..48(%esp).
	movdqa	32(%eax),%xmm0			# sigma
	movdqu	(%edx),%xmm1
	movdqu	16(%edx),%xmm2
	movdqa	(%eax),%xmm6			# rot16 pshufb mask
	movdqa	16(%eax),%xmm7			# rot8 pshufb mask
	movl	%ebp,48(%esp)
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movl	$10,%edx
	jmp	L012loop1x
.align	4,0x90
L013outer1x:
	movdqa	80(%eax),%xmm3			# {1,0,0,0}: bump counter
	movdqa	(%esp),%xmm0
	movdqa	16(%esp),%xmm1
	movdqa	32(%esp),%xmm2
	paddd	48(%esp),%xmm3
	movl	$10,%edx
	movdqa	%xmm3,48(%esp)
	jmp	L012loop1x
.align	4,0x90
L012loop1x:
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222			# pshufb %xmm6,%xmm3  (<<< 16)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1			# <<< 12
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223			# pshufb %xmm7,%xmm3  (<<< 8)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1			# <<< 7
	pshufd	$78,%xmm2,%xmm2			# diagonalize
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222			# pshufb %xmm6,%xmm3
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223			# pshufb %xmm7,%xmm3
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2			# un-diagonalize
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decl	%edx
	jnz	L012loop1x
	paddd	(%esp),%xmm0
	paddd	16(%esp),%xmm1
	paddd	32(%esp),%xmm2
	paddd	48(%esp),%xmm3
	cmpl	$64,%ecx
	jb	L014tail
	movdqu	(%esi),%xmm4
	movdqu	16(%esi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%esi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%esi),%xmm5
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3
	leal	64(%esi),%esi
	movdqu	%xmm0,(%edi)
	movdqu	%xmm1,16(%edi)
	movdqu	%xmm2,32(%edi)
	movdqu	%xmm3,48(%edi)
	leal	64(%edi),%edi
	subl	$64,%ecx
	jnz	L013outer1x
	jmp	L011done
L014tail:
	# <64 bytes: spill keystream block to the stack, XOR byte by byte.
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
	xorl	%ebp,%ebp
L015tail_loop:
	movb	(%esp,%ebp,1),%al
	movb	(%esi,%ebp,1),%dl
	leal	1(%ebp),%ebp
	xorb	%dl,%al
	movb	%al,-1(%edi,%ebp,1)
	decl	%ecx
	jnz	L015tail_loop
L011done:
	movl	512(%esp),%esp			# restore pre-alignment stack
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.align	6,0x90
Lssse3_data:
.byte	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13	# pshufb mask: dword <<< 16
.byte	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14	# pshufb mask: dword <<< 8
.long	1634760805,857760878,2036477234,1797285236	# "expand 32-byte k"
.long	0,1,2,3					# per-lane counter offsets
.long	4,4,4,4					# counter increment per outer loop
.long	1,0,0,0
.long	4,0,0,0
.long	0,-1,-1,-1				# nonce-keep mask
.align	6,0x90
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte	114,103,62,0				# "ChaCha20 for x86, CRYPTOGAMS by <appro@openssl.org>"
# XOP (AMD Bulldozer-era) implementation: same structure as the SSSE3
# code, but all four rotate amounts use the single-instruction vprotd
# (emitted as raw .byte sequences: 143,232,120,194 = XOP vprotd).
# NOTE(review): reached via Lxop_shortcut from the SSSE3 path with
# %eax = Lpic_point still live from ChaCha20_ctr32's PIC setup.
.globl	_ChaCha20_xop
.type	_ChaCha20_xop,@function
.align	4
_ChaCha20_xop:
L_ChaCha20_xop_begin:
#ifdef __CET__

.byte	243,15,30,251
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
Lxop_shortcut:
	movl	20(%esp),%edi			# out
	movl	24(%esp),%esi			# inp
	movl	28(%esp),%ecx			# len
	movl	32(%esp),%edx			# key
	movl	36(%esp),%ebx			# counter
	vzeroupper
	movl	%esp,%ebp
	subl	$524,%esp
	andl	$-64,%esp			# 64-byte align the frame
	movl	%ebp,512(%esp)			# save original %esp
	leal	Lssse3_data-Lpic_point(%eax),%eax	# shared constant pool
	vmovdqu	(%ebx),%xmm3
	cmpl	$256,%ecx
	jb	L0161x				# <4 blocks: single-block path
	movl	%edx,516(%esp)
	movl	%ebx,520(%esp)
	subl	$256,%ecx
	leal	384(%esp),%ebp			# %ebp -> splatted input state
	vmovdqu	(%edx),%xmm7
	# Splat each state word across the four lanes.
	vpshufd	$0,%xmm3,%xmm0
	vpshufd	$85,%xmm3,%xmm1
	vpshufd	$170,%xmm3,%xmm2
	vpshufd	$255,%xmm3,%xmm3
	vpaddd	48(%eax),%xmm0,%xmm0		# counters += {0,1,2,3}
	vpshufd	$0,%xmm7,%xmm4
	vpshufd	$85,%xmm7,%xmm5
	vpsubd	64(%eax),%xmm0,%xmm0		# pre-bias; outer loop re-adds 4
	vpshufd	$170,%xmm7,%xmm6
	vpshufd	$255,%xmm7,%xmm7
	vmovdqa	%xmm0,64(%ebp)
	vmovdqa	%xmm1,80(%ebp)
	vmovdqa	%xmm2,96(%ebp)
	vmovdqa	%xmm3,112(%ebp)
	vmovdqu	16(%edx),%xmm3
	vmovdqa	%xmm4,-64(%ebp)
	vmovdqa	%xmm5,-48(%ebp)
	vmovdqa	%xmm6,-32(%ebp)
	vmovdqa	%xmm7,-16(%ebp)
	vmovdqa	32(%eax),%xmm7			# sigma constants
	leal	128(%esp),%ebx			# %ebx -> working state
	vpshufd	$0,%xmm3,%xmm0
	vpshufd	$85,%xmm3,%xmm1
	vpshufd	$170,%xmm3,%xmm2
	vpshufd	$255,%xmm3,%xmm3
	vpshufd	$0,%xmm7,%xmm4
	vpshufd	$85,%xmm7,%xmm5
	vpshufd	$170,%xmm7,%xmm6
	vpshufd	$255,%xmm7,%xmm7
	vmovdqa	%xmm0,(%ebp)
	vmovdqa	%xmm1,16(%ebp)
	vmovdqa	%xmm2,32(%ebp)
	vmovdqa	%xmm3,48(%ebp)
	vmovdqa	%xmm4,-128(%ebp)
	vmovdqa	%xmm5,-112(%ebp)
	vmovdqa	%xmm6,-96(%ebp)
	vmovdqa	%xmm7,-80(%ebp)
	leal	128(%esi),%esi			# bias pointers so disp8 covers 256B
	leal	128(%edi),%edi
	jmp	L017outer_loop
.align	5,0x90
L017outer_loop:
	# Refresh working copy at %ebx from the master state at %ebp.
	vmovdqa	-112(%ebp),%xmm1
	vmovdqa	-96(%ebp),%xmm2
	vmovdqa	-80(%ebp),%xmm3
	vmovdqa	-48(%ebp),%xmm5
	vmovdqa	-32(%ebp),%xmm6
	vmovdqa	-16(%ebp),%xmm7
	vmovdqa	%xmm1,-112(%ebx)
	vmovdqa	%xmm2,-96(%ebx)
	vmovdqa	%xmm3,-80(%ebx)
	vmovdqa	%xmm5,-48(%ebx)
	vmovdqa	%xmm6,-32(%ebx)
	vmovdqa	%xmm7,-16(%ebx)
	vmovdqa	32(%ebp),%xmm2
	vmovdqa	48(%ebp),%xmm3
	vmovdqa	64(%ebp),%xmm4
	vmovdqa	80(%ebp),%xmm5
	vmovdqa	96(%ebp),%xmm6
	vmovdqa	112(%ebp),%xmm7
	vpaddd	64(%eax),%xmm4,%xmm4		# counters += 4 each iteration
	vmovdqa	%xmm2,32(%ebx)
	vmovdqa	%xmm3,48(%ebx)
	vmovdqa	%xmm4,64(%ebx)
	vmovdqa	%xmm5,80(%ebx)
	vmovdqa	%xmm6,96(%ebx)
	vmovdqa	%xmm7,112(%ebx)
	vmovdqa	%xmm4,64(%ebp)
	vmovdqa	-128(%ebp),%xmm0
	vmovdqa	%xmm4,%xmm6
	vmovdqa	-64(%ebp),%xmm3
	vmovdqa	(%ebp),%xmm4
	vmovdqa	16(%ebp),%xmm5
	movl	$10,%edx			# 10 double rounds
	nop
.align	5,0x90
L018loop:
	# Quarter rounds on 4 lanes; vprotd does each rotate in one insn.
	vpaddd	%xmm3,%xmm0,%xmm0
	vpxor	%xmm0,%xmm6,%xmm6
.byte	143,232,120,194,246,16		# vprotd $16,%xmm6,%xmm6
	vpaddd	%xmm6,%xmm4,%xmm4
	vpxor	%xmm4,%xmm3,%xmm2
	vmovdqa	-112(%ebx),%xmm1
.byte	143,232,120,194,210,12		# vprotd $12,%xmm2,%xmm2
	vmovdqa	-48(%ebx),%xmm3
	vpaddd	%xmm2,%xmm0,%xmm0
	vmovdqa	80(%ebx),%xmm7
	vpxor	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm3,%xmm1,%xmm1
.byte	143,232,120,194,246,8		# vprotd $8,%xmm6,%xmm6
	vmovdqa	%xmm0,-128(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	%xmm6,64(%ebx)
	vpxor	%xmm4,%xmm2,%xmm2
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,210,7		# vprotd $7,%xmm2,%xmm2
	vmovdqa	%xmm4,(%ebx)
.byte	143,232,120,194,255,16		# vprotd $16,%xmm7,%xmm7
	vmovdqa	%xmm2,-64(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	32(%ebx),%xmm4
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqa	-96(%ebx),%xmm0
.byte	143,232,120,194,219,12		# vprotd $12,%xmm3,%xmm3
	vmovdqa	-32(%ebx),%xmm2
	vpaddd	%xmm3,%xmm1,%xmm1
	vmovdqa	96(%ebx),%xmm6
	vpxor	%xmm1,%xmm7,%xmm7
	vpaddd	%xmm2,%xmm0,%xmm0
.byte	143,232,120,194,255,8		# vprotd $8,%xmm7,%xmm7
	vmovdqa	%xmm1,-112(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	%xmm7,80(%ebx)
	vpxor	%xmm5,%xmm3,%xmm3
	vpxor	%xmm0,%xmm6,%xmm6
.byte	143,232,120,194,219,7		# vprotd $7,%xmm3,%xmm3
	vmovdqa	%xmm5,16(%ebx)
.byte	143,232,120,194,246,16
	vmovdqa	%xmm3,-48(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	48(%ebx),%xmm5
	vpxor	%xmm4,%xmm2,%xmm2
	vmovdqa	-80(%ebx),%xmm1
.byte	143,232,120,194,210,12
	vmovdqa	-16(%ebx),%xmm3
	vpaddd	%xmm2,%xmm0,%xmm0
	vmovdqa	112(%ebx),%xmm7
	vpxor	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm3,%xmm1,%xmm1
.byte	143,232,120,194,246,8
	vmovdqa	%xmm0,-96(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	%xmm6,96(%ebx)
	vpxor	%xmm4,%xmm2,%xmm2
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,255,16
	vmovdqa	%xmm2,-32(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqa	-128(%ebx),%xmm0
.byte	143,232,120,194,219,12
	vmovdqa	-48(%ebx),%xmm2
	vpaddd	%xmm3,%xmm1,%xmm1
	vpxor	%xmm1,%xmm7,%xmm7
	vpaddd	%xmm2,%xmm0,%xmm0
.byte	143,232,120,194,255,8
	vmovdqa	%xmm1,-80(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vpxor	%xmm5,%xmm3,%xmm3
	vpxor	%xmm0,%xmm7,%xmm6
.byte	143,232,120,194,219,7
.byte	143,232,120,194,246,16
	vmovdqa	%xmm3,-16(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vpxor	%xmm4,%xmm2,%xmm2
	vmovdqa	-112(%ebx),%xmm1
.byte	143,232,120,194,210,12
	vmovdqa	-32(%ebx),%xmm3
	vpaddd	%xmm2,%xmm0,%xmm0
	vmovdqa	64(%ebx),%xmm7
	vpxor	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm3,%xmm1,%xmm1
.byte	143,232,120,194,246,8
	vmovdqa	%xmm0,-128(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	%xmm6,112(%ebx)
	vpxor	%xmm4,%xmm2,%xmm2
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,210,7
	vmovdqa	%xmm4,32(%ebx)
.byte	143,232,120,194,255,16
	vmovdqa	%xmm2,-48(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	(%ebx),%xmm4
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqa	-96(%ebx),%xmm0
.byte	143,232,120,194,219,12
	vmovdqa	-16(%ebx),%xmm2
	vpaddd	%xmm3,%xmm1,%xmm1
	vmovdqa	80(%ebx),%xmm6
	vpxor	%xmm1,%xmm7,%xmm7
	vpaddd	%xmm2,%xmm0,%xmm0
.byte	143,232,120,194,255,8
	vmovdqa	%xmm1,-112(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	%xmm7,64(%ebx)
	vpxor	%xmm5,%xmm3,%xmm3
	vpxor	%xmm0,%xmm6,%xmm6
.byte	143,232,120,194,219,7
	vmovdqa	%xmm5,48(%ebx)
.byte	143,232,120,194,246,16
	vmovdqa	%xmm3,-32(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	16(%ebx),%xmm5
	vpxor	%xmm4,%xmm2,%xmm2
	vmovdqa	-80(%ebx),%xmm1
.byte	143,232,120,194,210,12
	vmovdqa	-64(%ebx),%xmm3
	vpaddd	%xmm2,%xmm0,%xmm0
	vmovdqa	96(%ebx),%xmm7
	vpxor	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm3,%xmm1,%xmm1
.byte	143,232,120,194,246,8
	vmovdqa	%xmm0,-96(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	%xmm6,80(%ebx)
	vpxor	%xmm4,%xmm2,%xmm2
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,255,16
	vmovdqa	%xmm2,-16(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqa	-128(%ebx),%xmm0
.byte	143,232,120,194,219,12
	vpaddd	%xmm3,%xmm1,%xmm1
	vmovdqa	64(%ebx),%xmm6
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,255,8
	vmovdqa	%xmm1,-80(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	%xmm7,96(%ebx)
	vpxor	%xmm5,%xmm3,%xmm3
.byte	143,232,120,194,219,7
	decl	%edx
	jnz	L018loop
	# Flush remaining register state; per 4-word group: add input
	# state, transpose 4x4 dwords, XOR input text, store interleaved
	# at stride 64.
	vmovdqa	%xmm3,-64(%ebx)
	vmovdqa	%xmm4,(%ebx)
	vmovdqa	%xmm5,16(%ebx)
	vmovdqa	%xmm6,64(%ebx)
	vmovdqa	%xmm7,96(%ebx)
	vmovdqa	-112(%ebx),%xmm1
	vmovdqa	-96(%ebx),%xmm2
	vmovdqa	-80(%ebx),%xmm3
	vpaddd	-128(%ebp),%xmm0,%xmm0
	vpaddd	-112(%ebp),%xmm1,%xmm1
	vpaddd	-96(%ebp),%xmm2,%xmm2
	vpaddd	-80(%ebp),%xmm3,%xmm3
	vpunpckldq	%xmm1,%xmm0,%xmm6
	vpunpckldq	%xmm3,%xmm2,%xmm7
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm7,%xmm6,%xmm1
	vpunpckhqdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm2,%xmm0,%xmm7
	vpunpckhqdq	%xmm2,%xmm0,%xmm3
	vpxor	-128(%esi),%xmm1,%xmm4
	vpxor	-64(%esi),%xmm6,%xmm5
	vpxor	(%esi),%xmm7,%xmm6
	vpxor	64(%esi),%xmm3,%xmm7
	leal	16(%esi),%esi
	vmovdqa	-64(%ebx),%xmm0
	vmovdqa	-48(%ebx),%xmm1
	vmovdqa	-32(%ebx),%xmm2
	vmovdqa	-16(%ebx),%xmm3
	vmovdqu	%xmm4,-128(%edi)
	vmovdqu	%xmm5,-64(%edi)
	vmovdqu	%xmm6,(%edi)
	vmovdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	vpaddd	-64(%ebp),%xmm0,%xmm0
	vpaddd	-48(%ebp),%xmm1,%xmm1
	vpaddd	-32(%ebp),%xmm2,%xmm2
	vpaddd	-16(%ebp),%xmm3,%xmm3
	vpunpckldq	%xmm1,%xmm0,%xmm6
	vpunpckldq	%xmm3,%xmm2,%xmm7
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm7,%xmm6,%xmm1
	vpunpckhqdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm2,%xmm0,%xmm7
	vpunpckhqdq	%xmm2,%xmm0,%xmm3
	vpxor	-128(%esi),%xmm1,%xmm4
	vpxor	-64(%esi),%xmm6,%xmm5
	vpxor	(%esi),%xmm7,%xmm6
	vpxor	64(%esi),%xmm3,%xmm7
	leal	16(%esi),%esi
	vmovdqa	(%ebx),%xmm0
	vmovdqa	16(%ebx),%xmm1
	vmovdqa	32(%ebx),%xmm2
	vmovdqa	48(%ebx),%xmm3
	vmovdqu	%xmm4,-128(%edi)
	vmovdqu	%xmm5,-64(%edi)
	vmovdqu	%xmm6,(%edi)
	vmovdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	vpaddd	(%ebp),%xmm0,%xmm0
	vpaddd	16(%ebp),%xmm1,%xmm1
	vpaddd	32(%ebp),%xmm2,%xmm2
	vpaddd	48(%ebp),%xmm3,%xmm3
	vpunpckldq	%xmm1,%xmm0,%xmm6
	vpunpckldq	%xmm3,%xmm2,%xmm7
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm7,%xmm6,%xmm1
	vpunpckhqdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm2,%xmm0,%xmm7
	vpunpckhqdq	%xmm2,%xmm0,%xmm3
	vpxor	-128(%esi),%xmm1,%xmm4
	vpxor	-64(%esi),%xmm6,%xmm5
	vpxor	(%esi),%xmm7,%xmm6
	vpxor	64(%esi),%xmm3,%xmm7
	leal	16(%esi),%esi
	vmovdqa	64(%ebx),%xmm0
	vmovdqa	80(%ebx),%xmm1
	vmovdqa	96(%ebx),%xmm2
	vmovdqa	112(%ebx),%xmm3
	vmovdqu	%xmm4,-128(%edi)
	vmovdqu	%xmm5,-64(%edi)
	vmovdqu	%xmm6,(%edi)
	vmovdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	vpaddd	64(%ebp),%xmm0,%xmm0
	vpaddd	80(%ebp),%xmm1,%xmm1
	vpaddd	96(%ebp),%xmm2,%xmm2
	vpaddd	112(%ebp),%xmm3,%xmm3
	vpunpckldq	%xmm1,%xmm0,%xmm6
	vpunpckldq	%xmm3,%xmm2,%xmm7
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm7,%xmm6,%xmm1
	vpunpckhqdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm2,%xmm0,%xmm7
	vpunpckhqdq	%xmm2,%xmm0,%xmm3
	vpxor	-128(%esi),%xmm1,%xmm4
	vpxor	-64(%esi),%xmm6,%xmm5
	vpxor	(%esi),%xmm7,%xmm6
	vpxor	64(%esi),%xmm3,%xmm7
	leal	208(%esi),%esi			# 208+48 = advance 256 bytes total
	vmovdqu	%xmm4,-128(%edi)
	vmovdqu	%xmm5,-64(%edi)
	vmovdqu	%xmm6,(%edi)
	vmovdqu	%xmm7,64(%edi)
	leal	208(%edi),%edi
	subl	$256,%ecx
	jnc	L017outer_loop
	addl	$256,%ecx
	jz	L019done
	# <256 bytes left: rebuild a single-block counter from lane 0 of
	# the 4-way counters + original nonce, fall into the 1x path.
	movl	520(%esp),%ebx
	leal	-128(%esi),%esi			# undo pointer bias
	movl	516(%esp),%edx
	leal	-128(%edi),%edi
	vmovd	64(%ebp),%xmm2
	vmovdqu	(%ebx),%xmm3
	vpaddd	96(%eax),%xmm2,%xmm2		# counter += {1,0,0,0}
	vpand	112(%eax),%xmm3,%xmm3		# keep nonce words only
	vpor	%xmm2,%xmm3,%xmm3
L0161x:
	# One block at a time; state rows in xmm0..xmm3 at 0..48(%esp).
	vmovdqa	32(%eax),%xmm0			# sigma
	vmovdqu	(%edx),%xmm1
	vmovdqu	16(%edx),%xmm2
	vmovdqa	(%eax),%xmm6
	vmovdqa	16(%eax),%xmm7
	movl	%ebp,48(%esp)
	vmovdqa	%xmm0,(%esp)
	vmovdqa	%xmm1,16(%esp)
	vmovdqa	%xmm2,32(%esp)
	vmovdqa	%xmm3,48(%esp)
	movl	$10,%edx
	jmp	L020loop1x
.align	4,0x90
L021outer1x:
	vmovdqa	80(%eax),%xmm3			# {1,0,0,0}: bump counter
	vmovdqa	(%esp),%xmm0
	vmovdqa	16(%esp),%xmm1
	vmovdqa	32(%esp),%xmm2
	vpaddd	48(%esp),%xmm3,%xmm3
	movl	$10,%edx
	vmovdqa	%xmm3,48(%esp)
	jmp	L020loop1x
.align	4,0x90
L020loop1x:
	vpaddd	%xmm1,%xmm0,%xmm0
	vpxor	%xmm0,%xmm3,%xmm3
.byte	143,232,120,194,219,16		# vprotd $16,%xmm3,%xmm3
	vpaddd	%xmm3,%xmm2,%xmm2
	vpxor	%xmm2,%xmm1,%xmm1
.byte	143,232,120,194,201,12		# vprotd $12,%xmm1,%xmm1
	vpaddd	%xmm1,%xmm0,%xmm0
	vpxor	%xmm0,%xmm3,%xmm3
.byte	143,232,120,194,219,8		# vprotd $8,%xmm3,%xmm3
	vpaddd	%xmm3,%xmm2,%xmm2
	vpxor	%xmm2,%xmm1,%xmm1
.byte	143,232,120,194,201,7		# vprotd $7,%xmm1,%xmm1
	vpshufd	$78,%xmm2,%xmm2			# diagonalize
	vpshufd	$57,%xmm1,%xmm1
	vpshufd	$147,%xmm3,%xmm3
	vpaddd	%xmm1,%xmm0,%xmm0
	vpxor	%xmm0,%xmm3,%xmm3
.byte	143,232,120,194,219,16
	vpaddd	%xmm3,%xmm2,%xmm2
	vpxor	%xmm2,%xmm1,%xmm1
.byte	143,232,120,194,201,12
	vpaddd	%xmm1,%xmm0,%xmm0
	vpxor	%xmm0,%xmm3,%xmm3
.byte	143,232,120,194,219,8
	vpaddd	%xmm3,%xmm2,%xmm2
	vpxor	%xmm2,%xmm1,%xmm1
.byte	143,232,120,194,201,7
	vpshufd	$78,%xmm2,%xmm2			# un-diagonalize
	vpshufd	$147,%xmm1,%xmm1
	vpshufd	$57,%xmm3,%xmm3
	decl	%edx
	jnz	L020loop1x
	vpaddd	(%esp),%xmm0,%xmm0
	vpaddd	16(%esp),%xmm1,%xmm1
	vpaddd	32(%esp),%xmm2,%xmm2
	vpaddd	48(%esp),%xmm3,%xmm3
	cmpl	$64,%ecx
	jb	L022tail
	vpxor	(%esi),%xmm0,%xmm0
	vpxor	16(%esi),%xmm1,%xmm1
	vpxor	32(%esi),%xmm2,%xmm2
	vpxor	48(%esi),%xmm3,%xmm3
	leal	64(%esi),%esi
	vmovdqu	%xmm0,(%edi)
	vmovdqu	%xmm1,16(%edi)
	vmovdqu	%xmm2,32(%edi)
	vmovdqu	%xmm3,48(%edi)
	leal	64(%edi),%edi
	subl	$64,%ecx
	jnz	L021outer1x
	jmp	L019done
L022tail:
	# <64 bytes: spill keystream block to the stack, XOR byte by byte.
	vmovdqa	%xmm0,(%esp)
	vmovdqa	%xmm1,16(%esp)
	vmovdqa	%xmm2,32(%esp)
	vmovdqa	%xmm3,48(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
	xorl	%ebp,%ebp
L023tail_loop:
	movb	(%esp,%ebp,1),%al
	movb	(%esi,%ebp,1),%dl
	leal	1(%ebp),%ebp
	xorb	%dl,%al
	movb	%al,-1(%edi,%ebp,1)
	decl	%ecx
	jnz	L023tail_loop
L019done:
	vzeroupper				# clear upper YMM state before returning to SSE/C code
	movl	512(%esp),%esp			# restore pre-alignment stack
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.comm	_OPENSSL_ia32cap_P,16