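/*
 * Poly1305 primitives for 32-bit x86, AT&T (GAS) syntax.
 * Every exported routine reads its arguments from the stack and saves and
 * restores %ebp, %ebx, %esi and %edi.
 */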
.text
.align	64
/*
 * poly1305_init
 *
 * Stack arguments (offsets valid after the four pushes below):
 *   20(%esp)  ctx   - state block; bytes 0..23 are cleared, bytes 24..39
 *                     receive the masked key words
 *   24(%esp)  key   - pointer to 16 key bytes, or NULL
 *   28(%esp)  func  - two-entry table that receives the addresses of the
 *                     "blocks" and "emit" routines to use
 *
 * Returns %eax = 0 when key is NULL (only the clearing is done),
 * otherwise %eax = 1.
 * Saves and restores %ebp, %ebx, %esi, %edi; clobbers %ecx, %edx, flags.
 */
.globl	poly1305_init
.type	poly1305_init,@function
.align	16
poly1305_init:
.L_poly1305_init_begin:
#ifdef __CET__
	/* endbr32, emitted as raw bytes */
.byte	243,15,30,251
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ebp
	/* clear the five accumulator words and the word at offset 20 */
	xorl	%eax,%eax
	movl	%eax,(%edi)
	movl	%eax,4(%edi)
	movl	%eax,8(%edi)
	movl	%eax,12(%edi)
	movl	%eax,16(%edi)
	movl	%eax,20(%edi)
	/* no key: return with %eax still zero */
	cmpl	$0,%esi
	je	.L000nokey
	/* position-independent address computation: %ebx = .L001pic_point */
	call	.L001pic_point
.L001pic_point:
	popl	%ebx
	/* default to the integer-only routines */
	leal	poly1305_blocks-.L001pic_point(%ebx),%eax
	leal	poly1305_emit-.L001pic_point(%ebx),%edx
	leal	OPENSSL_ia32cap_P-.L001pic_point(%ebx),%edi
	/* 83886080 = (1<<24)|(1<<26): both capability bits must be set */
	movl	(%edi),%ecx
	andl	$83886080,%ecx
	cmpl	$83886080,%ecx
	jne	.L002no_sse2
	leal	_poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
	leal	_poly1305_emit_sse2-.L001pic_point(%ebx),%edx
	/* bit 5 of the third capability word selects the AVX2 blocks routine */
	movl	8(%edi),%ecx
	testl	$32,%ecx
	jz	.L002no_sse2
	leal	_poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
.L002no_sse2:
	/* %edi was reused above; reload ctx and publish the chosen routines */
	movl	20(%esp),%edi
	movl	%eax,(%ebp)
	movl	%edx,4(%ebp)
	/* load the 16 key bytes as four little-endian words */
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%ecx
	movl	12(%esi),%edx
	/* clamp: 0x0fffffff on word 0, 0x0ffffffc on words 1..3 */
	andl	$268435455,%eax
	andl	$268435452,%ebx
	andl	$268435452,%ecx
	andl	$268435452,%edx
	movl	%eax,24(%edi)
	movl	%ebx,28(%edi)
	movl	%ecx,32(%edi)
	movl	%edx,36(%edi)
	movl	$1,%eax
.L000nokey:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_init,.-.L_poly1305_init_begin
/*
 * poly1305_blocks
 *
 * Stack arguments (offsets valid after the four pushes below):
 *   20(%esp)  ctx     - state: accumulator words h0..h4 at 0..16,
 *                       clamped key words r0..r3 at 24..36
 *   24(%esp)  inp     - input bytes
 *   28(%esp)  len     - byte count; only whole 16-byte blocks are consumed
 *   32(%esp)  padbit  - value added into h4 with every block
 *
 * For each 16-byte block: h = (h + block + padbit*2^128) * r, followed by
 * a partial reduction that folds the bits of h4 above bit 1, times 5,
 * back into h0.
 *
 * Local frame (64 bytes, %esp-relative inside the loop):
 *    0,12,16   saved h0, h3, h4 of the current sum
 *   20,24,28   product words d0, d1, d2
 *   36..48     r0..r3
 *   52..60     s1..s3 where s = r + (r >> 2)
 *   84         ctx argument, 92 end-of-input pointer (overwrites len),
 *   96         padbit argument
 *
 * .Lenter_blocks is also entered from _poly1305_blocks_sse2 with
 * %edi/%esi/%ecx already loaded and the same four registers pushed.
 *
 * Saves and restores %ebp, %ebx, %esi, %edi; clobbers %eax, %ecx, %edx.
 */
.globl	poly1305_blocks
.type	poly1305_blocks,@function
.align	16
poly1305_blocks:
.L_poly1305_blocks_begin:
#ifdef __CET__
	/* endbr32, emitted as raw bytes */
.byte	243,15,30,251
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
.Lenter_blocks:
	/*
	 * Round len down to a multiple of 16.  The mask must clear all four
	 * low bits: the loop below advances by 16 and stops only on exact
	 * equality with the end pointer, so a stray low bit would make it
	 * run past the buffer.
	 */
	andl	$-16,%ecx
	jz	.L003nodata
	subl	$64,%esp
	movl	24(%edi),%eax
	movl	28(%edi),%ebx
	/* %ebp = end of input, parked in the len argument slot */
	leal	(%esi,%ecx,1),%ebp
	movl	32(%edi),%ecx
	movl	36(%edi),%edx
	movl	%ebp,92(%esp)
	movl	%esi,%ebp
	/* spill r0..r3 and compute s1..s3 = r + (r >> 2) */
	movl	%eax,36(%esp)
	movl	%ebx,%eax
	shrl	$2,%eax
	movl	%ebx,40(%esp)
	addl	%ebx,%eax
	movl	%ecx,%ebx
	shrl	$2,%ebx
	movl	%ecx,44(%esp)
	addl	%ecx,%ebx
	movl	%edx,%ecx
	shrl	$2,%ecx
	movl	%edx,48(%esp)
	addl	%edx,%ecx
	movl	%eax,52(%esp)
	movl	%ebx,56(%esp)
	movl	%ecx,60(%esp)
	/* h0..h4 live in %eax,%ebx,%ecx,%esi,%edi; %ebp walks the input */
	movl	(%edi),%eax
	movl	4(%edi),%ebx
	movl	8(%edi),%ecx
	movl	12(%edi),%esi
	movl	16(%edi),%edi
	jmp	.L004loop
.align	32
.L004loop:
	/* h += block; padbit (plus carry) goes into h4 */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	adcl	8(%ebp),%ecx
	adcl	12(%ebp),%esi
	leal	16(%ebp),%ebp
	adcl	96(%esp),%edi
	movl	%eax,(%esp)
	movl	%esi,12(%esp)
	/* d0 = h0*r0 + h1*s3 + h2*s2 + h3*s1 */
	mull	36(%esp)
	movl	%edi,16(%esp)
	movl	%eax,%edi
	movl	%ebx,%eax
	movl	%edx,%esi
	mull	60(%esp)
	addl	%eax,%edi
	movl	%ecx,%eax
	adcl	%edx,%esi
	mull	56(%esp)
	addl	%eax,%edi
	movl	12(%esp),%eax
	adcl	%edx,%esi
	mull	52(%esp)
	addl	%eax,%edi
	movl	(%esp),%eax
	adcl	%edx,%esi
	/* d1 = carry + h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1 */
	mull	40(%esp)
	movl	%edi,20(%esp)
	xorl	%edi,%edi
	addl	%eax,%esi
	movl	%ebx,%eax
	adcl	%edx,%edi
	mull	36(%esp)
	addl	%eax,%esi
	movl	%ecx,%eax
	adcl	%edx,%edi
	mull	60(%esp)
	addl	%eax,%esi
	movl	12(%esp),%eax
	adcl	%edx,%edi
	mull	56(%esp)
	addl	%eax,%esi
	movl	16(%esp),%eax
	adcl	%edx,%edi
	imull	52(%esp),%eax
	addl	%eax,%esi
	movl	(%esp),%eax
	adcl	$0,%edi
	/* d2 = carry + h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2 */
	mull	44(%esp)
	movl	%esi,24(%esp)
	xorl	%esi,%esi
	addl	%eax,%edi
	movl	%ebx,%eax
	adcl	%edx,%esi
	mull	40(%esp)
	addl	%eax,%edi
	movl	%ecx,%eax
	adcl	%edx,%esi
	mull	36(%esp)
	addl	%eax,%edi
	movl	12(%esp),%eax
	adcl	%edx,%esi
	mull	60(%esp)
	addl	%eax,%edi
	movl	16(%esp),%eax
	adcl	%edx,%esi
	imull	56(%esp),%eax
	addl	%eax,%edi
	movl	(%esp),%eax
	adcl	$0,%esi
	/* d3 = carry + h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3 */
	mull	48(%esp)
	movl	%edi,28(%esp)
	xorl	%edi,%edi
	addl	%eax,%esi
	movl	%ebx,%eax
	adcl	%edx,%edi
	mull	44(%esp)
	addl	%eax,%esi
	movl	%ecx,%eax
	adcl	%edx,%edi
	mull	40(%esp)
	addl	%eax,%esi
	movl	12(%esp),%eax
	adcl	%edx,%edi
	mull	36(%esp)
	addl	%eax,%esi
	movl	16(%esp),%ecx
	adcl	%edx,%edi
	movl	%ecx,%edx
	imull	60(%esp),%ecx
	addl	%ecx,%esi
	movl	20(%esp),%eax
	adcl	$0,%edi
	/* top word = carry + h4*r0 */
	imull	36(%esp),%edx
	addl	%edi,%edx
	movl	24(%esp),%ebx
	movl	28(%esp),%ecx
	/* keep the low two bits as h4; fold (top >> 2) * 5 into h0 */
	movl	%edx,%edi
	shrl	$2,%edx
	andl	$3,%edi
	leal	(%edx,%edx,4),%edx
	addl	%edx,%eax
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%esi
	adcl	$0,%edi
	cmpl	92(%esp),%ebp
	jne	.L004loop
	/* write h0..h4 back to ctx */
	movl	84(%esp),%edx
	addl	$64,%esp
	movl	%eax,(%edx)
	movl	%ebx,4(%edx)
	movl	%ecx,8(%edx)
	movl	%esi,12(%edx)
	movl	%edi,16(%edx)
.L003nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_blocks,.-.L_poly1305_blocks_begin
/*
 * poly1305_emit
 *
 * Stack arguments (offsets valid after the four pushes below):
 *   20(%esp)  ctx    - state; accumulator words h0..h4 at 0..16
 *   24(%esp)  mac    - 16-byte output buffer
 *   28(%esp)  nonce  - four 32-bit words added to the result
 *
 * Computes g = h + 5.  If that carries into bit 130 (word 4, bit 2) the
 * low 128 bits of g are selected, otherwise the low 128 bits of h; the
 * selection is done with masks, not branches.  The nonce is then added
 * modulo 2^128 and the result is stored to mac.
 *
 * .Lenter_emit is also entered from _poly1305_emit_sse2 with %ebp = ctx
 * and the same four registers pushed.
 *
 * Saves and restores %ebp, %ebx, %esi, %edi; clobbers %eax, %ecx, %edx.
 */
.globl	poly1305_emit
.type	poly1305_emit,@function
.align	16
poly1305_emit:
.L_poly1305_emit_begin:
#ifdef __CET__
	/* endbr32, emitted as raw bytes */
.byte	243,15,30,251
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
.Lenter_emit:
	movl	24(%esp),%edi
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	16(%ebp),%esi
	/* g = h + 5 */
	addl	$5,%eax
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%edx
	adcl	$0,%esi
	/* %esi = -(g4 >> 2): select mask for g */
	shrl	$2,%esi
	negl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	/* the output buffer doubles as scratch for the masked g */
	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	/* complementary mask selects h */
	notl	%esi
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	28(%esp),%ebp
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	orl	(%edi),%eax
	orl	4(%edi),%ebx
	orl	8(%edi),%ecx
	orl	12(%edi),%edx
	/* add the nonce; the carry out of the top word is dropped */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	adcl	8(%ebp),%ecx
	adcl	12(%ebp),%edx
	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_emit,.-.L_poly1305_emit_begin
/*
 * _poly1305_init_sse2 (file-local helper, register calling convention)
 *
 * In:   %edi = ctx (clamped key at ctx+24)
 *       %ebx = constant table; the limb mask is read from 64(%ebx)
 *              (presumably 2^26-1 in each qword - confirm against the
 *              table definition, which is outside this routine)
 * Out:  five limb vectors stored at ctx+48 .. ctx+112 and the 5x multiples
 *       of limbs 1..4 at ctx+128 .. ctx+176; %edi is restored to ctx
 * Clobbers: %ebp (used to save %esp), %ecx, %edx, %xmm0-%xmm7, flags.
 *
 * The key is split into five 26-bit limbs and run twice through the
 * multiply-and-reduce body at .L005square; results of both passes are
 * interleaved into the stored vectors.
 *
 * Frame: 224 bytes, 16-byte aligned.  0..64(%esp) limbs, 80..128(%esp)
 * 5x limbs 1..4, (%edx) = 144(%esp) limbs with the low qword duplicated.
 */
.align	32
.type	_poly1305_init_sse2,@function
.align	16
_poly1305_init_sse2:
#ifdef __CET__
	/* endbr32, emitted as raw bytes */
.byte	243,15,30,251
#endif

	movdqu	24(%edi),%xmm4
	leal	48(%edi),%edi
	movl	%esp,%ebp
	subl	$224,%esp
	andl	$-16,%esp
	movq	64(%ebx),%xmm7
	/* split at bit offsets 0, 26, 52 (48+4), 78 (48+30) and 104 */
	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	movdqa	%xmm4,%xmm2
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm1
	psrldq	$6,%xmm2
	pand	%xmm7,%xmm1
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	psrldq	$13,%xmm4
	leal	144(%esp),%edx
	movl	$2,%ecx
.L005square:
	/* save the current limbs and their 5x multiples (x*4 + x) */
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm1,%xmm6
	movdqa	%xmm2,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm1,%xmm6
	paddd	%xmm2,%xmm5
	movdqa	%xmm6,80(%esp)
	movdqa	%xmm5,96(%esp)
	movdqa	%xmm3,%xmm6
	movdqa	%xmm4,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm3,%xmm6
	paddd	%xmm4,%xmm5
	movdqa	%xmm6,112(%esp)
	movdqa	%xmm5,128(%esp)
	/* pshufd $68 copies the low qword into both halves */
	pshufd	$68,%xmm0,%xmm6
	movdqa	%xmm1,%xmm5
	pshufd	$68,%xmm1,%xmm1
	pshufd	$68,%xmm2,%xmm2
	pshufd	$68,%xmm3,%xmm3
	pshufd	$68,%xmm4,%xmm4
	movdqa	%xmm6,(%edx)
	movdqa	%xmm1,16(%edx)
	movdqa	%xmm2,32(%edx)
	movdqa	%xmm3,48(%edx)
	movdqa	%xmm4,64(%edx)
	/* 5x5 limb product accumulated into %xmm0..%xmm4 */
	pmuludq	%xmm0,%xmm4
	pmuludq	%xmm0,%xmm3
	pmuludq	%xmm0,%xmm2
	pmuludq	%xmm0,%xmm1
	pmuludq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	48(%edx),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	80(%esp),%xmm6
	pmuludq	(%edx),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%edx),%xmm6
	movdqa	32(%esp),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	96(%esp),%xmm7
	pmuludq	(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%edx),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%edx),%xmm5
	movdqa	48(%esp),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	112(%esp),%xmm5
	pmuludq	(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%edx),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%edx),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%edx),%xmm7
	movdqa	64(%esp),%xmm5
	paddq	%xmm6,%xmm1
	movdqa	128(%esp),%xmm6
	pmuludq	(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%edx),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%edx),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* carry propagation between limbs; carry out of limb 4 re-enters limb 0 times 5 */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	decl	%ecx
	jz	.L006square_break
	/* second pass: pair the new limbs with the saved ones and repeat */
	punpcklqdq	(%esp),%xmm0
	punpcklqdq	16(%esp),%xmm1
	punpcklqdq	32(%esp),%xmm2
	punpcklqdq	48(%esp),%xmm3
	punpcklqdq	64(%esp),%xmm4
	jmp	.L005square
.L006square_break:
	/* merge the last result with the saved limbs into four dword lanes */
	psllq	$32,%xmm0
	psllq	$32,%xmm1
	psllq	$32,%xmm2
	psllq	$32,%xmm3
	psllq	$32,%xmm4
	por	(%esp),%xmm0
	por	16(%esp),%xmm1
	por	32(%esp),%xmm2
	por	48(%esp),%xmm3
	por	64(%esp),%xmm4
	/* pshufd $141 orders the lanes as source dwords 1,3,0,2 */
	pshufd	$141,%xmm0,%xmm0
	pshufd	$141,%xmm1,%xmm1
	pshufd	$141,%xmm2,%xmm2
	pshufd	$141,%xmm3,%xmm3
	pshufd	$141,%xmm4,%xmm4
	movdqu	%xmm0,(%edi)
	movdqu	%xmm1,16(%edi)
	movdqu	%xmm2,32(%edi)
	movdqu	%xmm3,48(%edi)
	movdqu	%xmm4,64(%edi)
	/* store the 5x multiples of limbs 1..4 after the limbs themselves */
	movdqa	%xmm1,%xmm6
	movdqa	%xmm2,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm1,%xmm6
	paddd	%xmm2,%xmm5
	movdqu	%xmm6,80(%edi)
	movdqu	%xmm5,96(%edi)
	movdqa	%xmm3,%xmm6
	movdqa	%xmm4,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm3,%xmm6
	paddd	%xmm4,%xmm5
	movdqu	%xmm6,112(%edi)
	movdqu	%xmm5,128(%edi)
	/* drop the frame and hand ctx back in %edi */
	movl	%ebp,%esp
	leal	-48(%edi),%edi
	ret
.size	_poly1305_init_sse2,.-_poly1305_init_sse2
/*
 * _poly1305_blocks_sse2 (file-local; same stack arguments as poly1305_blocks)
 *
 *   20(%esp) ctx, 24(%esp) inp, 28(%esp) len, 32(%esp) padbit
 *   (offsets valid after the four pushes below)
 *
 * The word at ctx+20 records the accumulator format: zero means five
 * 32-bit words as written by poly1305_blocks, non-zero means five 26-bit
 * limbs (one per dword) as written by this routine.
 *
 *  - len is rounded down to a multiple of 16; zero returns immediately.
 *  - Fewer than 64 bytes while the flag is still zero: jump into the
 *    integer routine at .Lenter_blocks (registers and pushes match).
 *  - Otherwise %ebx is pointed at .Lconst_sse2; on first use
 *    _poly1305_init_sse2 builds the table at ctx+48 and the accumulator is
 *    converted to 26-bit limbs.
 *
 * Frame: 528 bytes, 16-byte aligned, %ebp holds the caller %esp.
 *   0..64(%esp)     scratch for products / input limbs
 *   80..144(%esp)   accumulator limbs between iterations
 *   160..224(%esp)  input limbs, addressed through %eax
 *   %edx = 384(%esp): -144..-16(%edx) table entries with the high qword
 *                     duplicated, 0..128(%edx) with the low qword duplicated
 *
 * EFLAGS produced by subl/addl on %ecx are consumed many instructions
 * later (cmovb, jbe, ja, jz); the SSE, mov and lea instructions in between
 * do not modify flags, so instruction order here must not be changed
 * casually.
 *
 * Saves and restores %ebp, %ebx, %esi, %edi; clobbers %eax, %ecx, %edx,
 * %xmm0-%xmm7.
 */
.align	32
.type	_poly1305_blocks_sse2,@function
.align	16
_poly1305_blocks_sse2:
#ifdef __CET__
	/* endbr32, emitted as raw bytes */
.byte	243,15,30,251
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	movl	20(%edi),%eax
	andl	$-16,%ecx
	jz	.L007nodata
	cmpl	$64,%ecx
	jae	.L008enter_sse2
	testl	%eax,%eax
	jz	.Lenter_blocks
.align	16
.L008enter_sse2:
	/* position-independent address of the constant table */
	call	.L009pic_point
.L009pic_point:
	popl	%ebx
	leal	.Lconst_sse2-.L009pic_point(%ebx),%ebx
	testl	%eax,%eax
	jnz	.L010base2_26
	/* first vector call: build the table, then convert h to 26-bit limbs */
	call	_poly1305_init_sse2
	/* unaligned loads at byte 3, 6, 9, 13 plus shifts give bit offsets 26, 52, 78, 104 */
	movl	(%edi),%eax
	movl	3(%edi),%ecx
	movl	6(%edi),%edx
	movl	9(%edi),%esi
	movl	13(%edi),%ebp
	movl	$1,20(%edi)
	shrl	$2,%ecx
	andl	$67108863,%eax
	shrl	$4,%edx
	andl	$67108863,%ecx
	shrl	$6,%esi
	andl	$67108863,%edx
	movd	%eax,%xmm0
	movd	%ecx,%xmm1
	movd	%edx,%xmm2
	movd	%esi,%xmm3
	movd	%ebp,%xmm4
	/* %esi and %ecx were used as temporaries; reload inp and len */
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	jmp	.L011base2_32
.align	16
.L010base2_26:
	/* accumulator already held as five 26-bit limbs */
	movd	(%edi),%xmm0
	movd	4(%edi),%xmm1
	movd	8(%edi),%xmm2
	movd	12(%edi),%xmm3
	movd	16(%edi),%xmm4
	movdqa	64(%ebx),%xmm7
.L011base2_32:
	/* %eax = padbit << 24, i.e. bit 128 expressed in limb 4 (104 + 24) */
	movl	32(%esp),%eax
	movl	%esp,%ebp
	subl	$528,%esp
	andl	$-16,%esp
	leal	48(%edi),%edi
	shll	$24,%eax
	testl	$31,%ecx
	jz	.L012even
	/* odd number of blocks: absorb one block using lane 3 of the table */
	movdqu	(%esi),%xmm6
	leal	16(%esi),%esi
	movdqa	%xmm6,%xmm5
	pand	%xmm7,%xmm6
	paddd	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	psrlq	$26,%xmm5
	psrldq	$6,%xmm6
	pand	%xmm7,%xmm5
	paddd	%xmm5,%xmm1
	movdqa	%xmm6,%xmm5
	psrlq	$4,%xmm6
	pand	%xmm7,%xmm6
	paddd	%xmm6,%xmm2
	movdqa	%xmm5,%xmm6
	psrlq	$30,%xmm5
	pand	%xmm7,%xmm5
	psrldq	$7,%xmm6
	paddd	%xmm5,%xmm3
	movd	%eax,%xmm5
	paddd	%xmm6,%xmm4
	movd	12(%edi),%xmm6
	paddd	%xmm5,%xmm4
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	pmuludq	%xmm6,%xmm0
	pmuludq	%xmm6,%xmm1
	pmuludq	%xmm6,%xmm2
	movd	28(%edi),%xmm5
	pmuludq	%xmm6,%xmm3
	pmuludq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	48(%esp),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movd	92(%edi),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%esp),%xmm6
	movd	44(%edi),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%esp),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%esp),%xmm5
	paddq	%xmm7,%xmm4
	movd	108(%edi),%xmm7
	pmuludq	(%esp),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%esp),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%esp),%xmm5
	movd	60(%edi),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%esp),%xmm6
	paddq	%xmm5,%xmm0
	movd	124(%edi),%xmm5
	pmuludq	(%esp),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%esp),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%esp),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%esp),%xmm7
	movd	76(%edi),%xmm5
	paddq	%xmm6,%xmm1
	movd	140(%edi),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%esp),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%esp),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* carry propagation between limbs */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	subl	$16,%ecx
	jz	.L013done
.L012even:
	/*
	 * Expand the table from ctx+48 onto the stack: pshufd $68 duplicates
	 * the low qword (stored at and above %edx), pshufd $238 duplicates the
	 * high qword (stored below %edx).  The subl sets CF when fewer than
	 * 64 bytes remain; cmovb then steps %esi back by 32 so the loads at
	 * 32(%esi)/48(%esi) pick up the current pair of blocks.
	 */
	leal	384(%esp),%edx
	leal	-32(%esi),%eax
	subl	$64,%ecx
	movdqu	(%edi),%xmm5
	pshufd	$68,%xmm5,%xmm6
	cmovbl	%eax,%esi
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,(%edx)
	leal	160(%esp),%eax
	movdqu	16(%edi),%xmm6
	movdqa	%xmm5,-144(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,16(%edx)
	movdqu	32(%edi),%xmm5
	movdqa	%xmm6,-128(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,32(%edx)
	movdqu	48(%edi),%xmm6
	movdqa	%xmm5,-112(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,48(%edx)
	movdqu	64(%edi),%xmm5
	movdqa	%xmm6,-96(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,64(%edx)
	movdqu	80(%edi),%xmm6
	movdqa	%xmm5,-80(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,80(%edx)
	movdqu	96(%edi),%xmm5
	movdqa	%xmm6,-64(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,96(%edx)
	movdqu	112(%edi),%xmm6
	movdqa	%xmm5,-48(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,112(%edx)
	movdqu	128(%edi),%xmm5
	movdqa	%xmm6,-32(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,128(%edx)
	movdqa	%xmm5,-16(%edx)
	/* load two blocks and split them into 26-bit limbs, one block per qword */
	movdqu	32(%esi),%xmm5
	movdqu	48(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,112(%esp)
	movdqa	%xmm3,128(%esp)
	movdqa	%xmm4,144(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	/* OR in the constant at (%ebx) - presumably the 2^128 pad bit; confirm against .Lconst_sse2 */
	por	(%ebx),%xmm4
	movdqa	%xmm0,80(%esp)
	movdqa	%xmm1,96(%esp)
	/* flags still come from "subl $64,%ecx" above */
	jbe	.L014skip_loop
	jmp	.L015loop
.align	32
.L015loop:
	/* multiply the freshly loaded pair by the entries stored below %edx */
	movdqa	-144(%edx),%xmm7
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	movdqa	%xmm5,%xmm1
	pmuludq	%xmm7,%xmm5
	movdqa	%xmm6,%xmm0
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	pmuludq	%xmm7,%xmm3
	pmuludq	%xmm7,%xmm4
	pmuludq	-16(%edx),%xmm0
	movdqa	%xmm1,%xmm7
	pmuludq	-128(%edx),%xmm1
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	-112(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	%xmm5,%xmm6
	pmuludq	-96(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	16(%eax),%xmm7
	pmuludq	-80(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	-128(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	-112(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	32(%eax),%xmm7
	pmuludq	-96(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	-32(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	-16(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	-128(%edx),%xmm6
	paddq	%xmm5,%xmm1
	movdqa	48(%eax),%xmm5
	pmuludq	-112(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	-48(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	%xmm6,%xmm7
	pmuludq	-32(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	-16(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	64(%eax),%xmm6
	pmuludq	-128(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	%xmm6,%xmm7
	pmuludq	-16(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	-64(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	-48(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	64(%ebx),%xmm7
	pmuludq	-32(%edx),%xmm6
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* load the pair of blocks that precedes them in the input */
	movdqu	-32(%esi),%xmm5
	movdqu	-16(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	/* loop counter update; CF/ZF are consumed by cmovb below and by ja at the loop end */
	leal	-32(%esi),%eax
	subl	$64,%ecx
	/* add the running accumulator to this pair */
	paddd	80(%esp),%xmm5
	paddd	96(%esp),%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
	cmovbl	%eax,%esi
	leal	160(%esp),%eax
	/* multiply (accumulator + pair) by the entries stored at and above %edx */
	movdqa	(%edx),%xmm7
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	movdqa	%xmm5,%xmm1
	pmuludq	%xmm7,%xmm5
	paddq	%xmm0,%xmm5
	movdqa	%xmm6,%xmm0
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	pmuludq	%xmm7,%xmm3
	pmuludq	%xmm7,%xmm4
	paddq	16(%esp),%xmm6
	paddq	32(%esp),%xmm2
	paddq	48(%esp),%xmm3
	paddq	64(%esp),%xmm4
	pmuludq	128(%edx),%xmm0
	movdqa	%xmm1,%xmm7
	pmuludq	16(%edx),%xmm1
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	%xmm5,%xmm6
	pmuludq	48(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	16(%eax),%xmm7
	pmuludq	64(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	32(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	32(%eax),%xmm7
	pmuludq	48(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	112(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	128(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	16(%edx),%xmm6
	paddq	%xmm5,%xmm1
	movdqa	48(%eax),%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	96(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	%xmm6,%xmm7
	pmuludq	112(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	128(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	64(%eax),%xmm6
	pmuludq	16(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	%xmm6,%xmm7
	pmuludq	128(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	80(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	96(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	64(%ebx),%xmm7
	pmuludq	112(%edx),%xmm6
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* carry propagation between limbs */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	/* preload and split the next pair for the following iteration */
	movdqu	32(%esi),%xmm5
	movdqu	48(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,112(%esp)
	movdqa	%xmm3,128(%esp)
	movdqa	%xmm4,144(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	movdqa	%xmm0,80(%esp)
	movdqa	%xmm1,96(%esp)
	/* flags from the "subl $64,%ecx" earlier in this iteration */
	ja	.L015loop
.L014skip_loop:
	/* pshufd $16 places source dwords 0 and 1 in the low half of each qword */
	pshufd	$16,-144(%edx),%xmm7
	/* ZF from this addl is tested twice: jnz here and jz .L017short_tail below */
	addl	$32,%ecx
	jnz	.L016long_tail
	/* exactly one pair left: fold the accumulator into it now */
	paddd	%xmm0,%xmm5
	paddd	%xmm1,%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
.L016long_tail:
	movdqa	%xmm5,(%eax)
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	pmuludq	%xmm7,%xmm5
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	movdqa	%xmm5,%xmm0
	pshufd	$16,-128(%edx),%xmm5
	pmuludq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm1
	pmuludq	%xmm7,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	48(%eax),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%eax),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%eax),%xmm7
	paddq	%xmm6,%xmm3
	pshufd	$16,-64(%edx),%xmm6
	pmuludq	(%eax),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%eax),%xmm6
	pshufd	$16,-112(%edx),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%eax),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%eax),%xmm5
	paddq	%xmm7,%xmm4
	pshufd	$16,-48(%edx),%xmm7
	pmuludq	(%eax),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%eax),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%eax),%xmm5
	pshufd	$16,-96(%edx),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%eax),%xmm6
	paddq	%xmm5,%xmm0
	pshufd	$16,-32(%edx),%xmm5
	pmuludq	(%eax),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%eax),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%eax),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%eax),%xmm7
	pshufd	$16,-80(%edx),%xmm5
	paddq	%xmm6,%xmm1
	pshufd	$16,-16(%edx),%xmm6
	pmuludq	(%eax),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%eax),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%eax),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%eax),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%eax),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* ZF still from "addl $32,%ecx": set means no further pair to absorb */
	jz	.L017short_tail
	/* two pairs left: load the earlier pair, add the accumulator, multiply */
	movdqu	-32(%esi),%xmm5
	movdqu	-16(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	pshufd	$16,(%edx),%xmm7
	paddd	80(%esp),%xmm5
	paddd	96(%esp),%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
	movdqa	%xmm5,(%esp)
	pmuludq	%xmm7,%xmm5
	movdqa	%xmm6,16(%esp)
	pmuludq	%xmm7,%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm2,%xmm5
	pmuludq	%xmm7,%xmm2
	paddq	%xmm6,%xmm1
	movdqa	%xmm3,%xmm6
	pmuludq	%xmm7,%xmm3
	paddq	32(%esp),%xmm2
	movdqa	%xmm5,32(%esp)
	pshufd	$16,16(%edx),%xmm5
	paddq	48(%esp),%xmm3
	movdqa	%xmm6,48(%esp)
	movdqa	%xmm4,%xmm6
	pmuludq	%xmm7,%xmm4
	paddq	64(%esp),%xmm4
	movdqa	%xmm6,64(%esp)
	movdqa	%xmm5,%xmm6
	pmuludq	48(%esp),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	pshufd	$16,80(%edx),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%esp),%xmm6
	pshufd	$16,32(%edx),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%esp),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%esp),%xmm5
	paddq	%xmm7,%xmm4
	pshufd	$16,96(%edx),%xmm7
	pmuludq	(%esp),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%esp),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%esp),%xmm5
	pshufd	$16,48(%edx),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%esp),%xmm6
	paddq	%xmm5,%xmm0
	pshufd	$16,112(%edx),%xmm5
	pmuludq	(%esp),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%esp),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%esp),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%esp),%xmm7
	pshufd	$16,64(%edx),%xmm5
	paddq	%xmm6,%xmm1
	pshufd	$16,128(%edx),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%esp),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%esp),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
.L017short_tail:
	/* horizontal add: pshufd $78 swaps the qwords, paddq sums both halves */
	pshufd	$78,%xmm4,%xmm6
	pshufd	$78,%xmm3,%xmm5
	paddq	%xmm6,%xmm4
	paddq	%xmm5,%xmm3
	pshufd	$78,%xmm0,%xmm6
	pshufd	$78,%xmm1,%xmm5
	paddq	%xmm6,%xmm0
	paddq	%xmm5,%xmm1
	pshufd	$78,%xmm2,%xmm6
	/* final carry propagation between limbs */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm6,%xmm2
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
.L013done:
	/* %edi = ctx+48 here, so -48..-32(%edi) are accumulator limbs 0..4 */
	movd	%xmm0,-48(%edi)
	movd	%xmm1,-44(%edi)
	movd	%xmm2,-40(%edi)
	movd	%xmm3,-36(%edi)
	movd	%xmm4,-32(%edi)
	movl	%ebp,%esp
.L007nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_blocks_sse2,.-_poly1305_blocks_sse2
/*
 * _poly1305_emit_sse2 (file-local; same stack arguments as poly1305_emit)
 *
 *   20(%esp) ctx, 24(%esp) mac, 28(%esp) nonce
 *   (offsets valid after the four pushes below)
 *
 * If the word at ctx+20 is zero the accumulator is still five 32-bit
 * words and control jumps into poly1305_emit at .Lenter_emit (%ebp = ctx,
 * same pushes).  Otherwise the five 26-bit limbs are packed into
 * 32-bit words, the bits above bit 129 are folded back times 5, and the
 * same h / h+5 selection and nonce addition as in poly1305_emit follow.
 * %xmm0-%xmm3 are used only as scratch to keep a copy of h.
 *
 * Saves and restores %ebp, %ebx, %esi, %edi; clobbers %eax, %ecx, %edx,
 * %xmm0-%xmm3.
 */
.align	32
.type	_poly1305_emit_sse2,@function
.align	16
_poly1305_emit_sse2:
#ifdef __CET__
	/* endbr32, emitted as raw bytes */
.byte	243,15,30,251
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
	cmpl	$0,20(%ebp)
	je	.Lenter_emit
	/* limbs 0..4 in %eax, %edi, %ecx, %edx, %esi */
	movl	(%ebp),%eax
	movl	4(%ebp),%edi
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	16(%ebp),%esi
	/* pack: limb k contributes at bit 26*k, split across two 32-bit words */
	movl	%edi,%ebx
	shll	$26,%edi
	shrl	$6,%ebx
	addl	%edi,%eax
	movl	%ecx,%edi
	adcl	$0,%ebx
	shll	$20,%edi
	shrl	$12,%ecx
	addl	%edi,%ebx
	movl	%edx,%edi
	adcl	$0,%ecx
	shll	$14,%edi
	shrl	$18,%edx
	addl	%edi,%ecx
	movl	%esi,%edi
	adcl	$0,%edx
	shll	$8,%edi
	shrl	$24,%esi
	addl	%edi,%edx
	adcl	$0,%esi
	/* keep two bits in the top word; fold (top >> 2) * 5 into word 0 */
	movl	%esi,%edi
	andl	$3,%esi
	shrl	$2,%edi
	leal	(%edi,%edi,4),%ebp
	movl	24(%esp),%edi
	addl	%ebp,%eax
	movl	28(%esp),%ebp
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%edx
	adcl	$0,%esi
	/* stash h in %xmm0..%xmm3 while computing g = h + 5 in place */
	movd	%eax,%xmm0
	addl	$5,%eax
	movd	%ebx,%xmm1
	adcl	$0,%ebx
	movd	%ecx,%xmm2
	adcl	$0,%ecx
	movd	%edx,%xmm3
	adcl	$0,%edx
	adcl	$0,%esi
	/* %esi = -(g4 >> 2): select mask for g */
	shrl	$2,%esi
	negl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	/* masked g goes to the output buffer; h is reloaded from the xmm copies */
	movl	%eax,(%edi)
	movd	%xmm0,%eax
	movl	%ebx,4(%edi)
	movd	%xmm1,%ebx
	movl	%ecx,8(%edi)
	movd	%xmm2,%ecx
	movl	%edx,12(%edi)
	movd	%xmm3,%edx
	/* complementary mask selects h */
	notl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	orl	(%edi),%eax
	andl	%esi,%ecx
	orl	4(%edi),%ebx
	andl	%esi,%edx
	orl	8(%edi),%ecx
	orl	12(%edi),%edx
	/* add the nonce (mov does not disturb the carry chain) and store */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	movl	%eax,(%edi)
	adcl	8(%ebp),%ecx
	movl	%ebx,4(%edi)
	adcl	12(%ebp),%edx
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_emit_sse2,.-_poly1305_emit_sse2
# _poly1305_init_avx2 -- internal helper, called only from
# _poly1305_blocks_avx2 in this part of the file. It splits the 16 key bytes
# stored at 24(%edi) into 26-bit limbs and fills a table at 48(%edi) with the
# limbs of four values produced by two passes of the multiply-and-reduce loop
# below (presumably the key powers r^4, r^3, r^2, r^1 -- TODO confirm).
#
# Register contract (this is not a C calling convention):
#   In:       %edi = context pointer
#             %ebx = address of .Lconst_sse2 (64(%ebx) is the 26-bit mask)
#   Out:      nine 16-byte rows at 48(%edi)..191(%edi): five limb rows
#             followed by four rows holding 5*limb1 .. 5*limb4
#   Kept:     %edi and %esp (both restored before ret); %eax, %ebx, %esi
#             are never written
#   Clobbers: %ebp (holds the entry %esp and is not restored), %ecx, %edx,
#             %xmm0-%xmm7, flags
#   Stack:    224 bytes below the entry %esp, aligned down to 16
1378.align	32
1379.type	_poly1305_init_avx2,@function
1380.align	16
1381_poly1305_init_avx2:
1382	%ifdef __CET__
1383
# The four bytes below encode endbr32 (f3 0f 1e fb).
1384.byte	243,15,30,251
1385	%endif
1386
# Load the key words and point %edi at the table area (context + 48).
1387	vmovdqu	24(%edi),%xmm4
1388	leal	48(%edi),%edi
# Keep the entry %esp in %ebp, then carve out a 16-byte aligned scratch area.
1389	movl	%esp,%ebp
1390	subl	$224,%esp
1391	andl	$-16,%esp
# Split the 128-bit value into five limbs: bits 0-25, 26-51, 52-77, 78-103
# and 104-127 land in the low 64-bit lane of %xmm0 .. %xmm4 respectively.
# The high lanes hold leftovers that the first pass discards.
1392	vmovdqa	64(%ebx),%xmm7
1393	vpand	%xmm7,%xmm4,%xmm0
1394	vpsrlq	$26,%xmm4,%xmm1
1395	vpsrldq	$6,%xmm4,%xmm3
1396	vpand	%xmm7,%xmm1,%xmm1
1397	vpsrlq	$4,%xmm3,%xmm2
1398	vpsrlq	$30,%xmm3,%xmm3
1399	vpand	%xmm7,%xmm2,%xmm2
1400	vpand	%xmm7,%xmm3,%xmm3
1401	vpsrldq	$13,%xmm4,%xmm4
# %edx addresses the multiplier rows; %ecx counts the two passes.
1402	leal	144(%esp),%edx
1403	movl	$2,%ecx
# Each pass multiplies the value in %xmm0..%xmm4 (multiplicand, one value per
# 64-bit lane) by the value held in its own low lane. Pass one therefore
# multiplies the key limbs by themselves; pass two multiplies the pair
# [pass-one result : key] by the pass-one result.
1404.L018square:
# Save the multiplicand limbs at 0..79(%esp).
1405	vmovdqa	%xmm0,(%esp)
1406	vmovdqa	%xmm1,16(%esp)
1407	vmovdqa	%xmm2,32(%esp)
1408	vmovdqa	%xmm3,48(%esp)
1409	vmovdqa	%xmm4,64(%esp)
# Store 5*limb1 .. 5*limb4 (x*4 + x) at 80..143(%esp). These feed the partial
# products whose limb index wraps past 4 (the factor 5 is presumably the
# reduction modulo 2^130-5 -- confirm against the Poly1305 definition).
1410	vpslld	$2,%xmm1,%xmm6
1411	vpslld	$2,%xmm2,%xmm5
1412	vpaddd	%xmm1,%xmm6,%xmm6
1413	vpaddd	%xmm2,%xmm5,%xmm5
1414	vmovdqa	%xmm6,80(%esp)
1415	vmovdqa	%xmm5,96(%esp)
1416	vpslld	$2,%xmm3,%xmm6
1417	vpslld	$2,%xmm4,%xmm5
1418	vpaddd	%xmm3,%xmm6,%xmm6
1419	vpaddd	%xmm4,%xmm5,%xmm5
1420	vmovdqa	%xmm6,112(%esp)
1421	vmovdqa	%xmm5,128(%esp)
# Shuffle 68 copies the low 64-bit lane into both lanes: these are the
# multiplier limbs, stored at 0..79(%edx). %xmm6 keeps the unshuffled limb 1.
1422	vpshufd	$68,%xmm0,%xmm5
1423	vmovdqa	%xmm1,%xmm6
1424	vpshufd	$68,%xmm1,%xmm1
1425	vpshufd	$68,%xmm2,%xmm2
1426	vpshufd	$68,%xmm3,%xmm3
1427	vpshufd	$68,%xmm4,%xmm4
1428	vmovdqa	%xmm5,(%edx)
1429	vmovdqa	%xmm1,16(%edx)
1430	vmovdqa	%xmm2,32(%edx)
1431	vmovdqa	%xmm3,48(%edx)
1432	vmovdqa	%xmm4,64(%edx)
# Schoolbook 5x5 limb product. %xmm0..%xmm4 accumulate output limbs 0..4 as
# 64-bit sums; %xmm5..%xmm7 are temporaries. First column: multiplicand
# limb 0 times every multiplier limb.
1433	vpmuludq	%xmm0,%xmm4,%xmm4
1434	vpmuludq	%xmm0,%xmm3,%xmm3
1435	vpmuludq	%xmm0,%xmm2,%xmm2
1436	vpmuludq	%xmm0,%xmm1,%xmm1
1437	vpmuludq	%xmm0,%xmm5,%xmm0
# Multiplicand limb 1 (%xmm6), then 5*limb1 for the wrapped term.
1438	vpmuludq	48(%edx),%xmm6,%xmm5
1439	vpaddq	%xmm5,%xmm4,%xmm4
1440	vpmuludq	32(%edx),%xmm6,%xmm7
1441	vpaddq	%xmm7,%xmm3,%xmm3
1442	vpmuludq	16(%edx),%xmm6,%xmm5
1443	vpaddq	%xmm5,%xmm2,%xmm2
1444	vmovdqa	80(%esp),%xmm7
1445	vpmuludq	(%edx),%xmm6,%xmm6
1446	vpaddq	%xmm6,%xmm1,%xmm1
1447	vmovdqa	32(%esp),%xmm5
1448	vpmuludq	64(%edx),%xmm7,%xmm7
1449	vpaddq	%xmm7,%xmm0,%xmm0
# Multiplicand limb 2 (%xmm5), then 5*limb2 for the wrapped terms.
1450	vpmuludq	32(%edx),%xmm5,%xmm6
1451	vpaddq	%xmm6,%xmm4,%xmm4
1452	vpmuludq	16(%edx),%xmm5,%xmm7
1453	vpaddq	%xmm7,%xmm3,%xmm3
1454	vmovdqa	96(%esp),%xmm6
1455	vpmuludq	(%edx),%xmm5,%xmm5
1456	vpaddq	%xmm5,%xmm2,%xmm2
1457	vpmuludq	64(%edx),%xmm6,%xmm7
1458	vpaddq	%xmm7,%xmm1,%xmm1
1459	vmovdqa	48(%esp),%xmm5
1460	vpmuludq	48(%edx),%xmm6,%xmm6
1461	vpaddq	%xmm6,%xmm0,%xmm0
# Multiplicand limb 3 (%xmm5), then 5*limb3 for the wrapped terms.
1462	vpmuludq	16(%edx),%xmm5,%xmm7
1463	vpaddq	%xmm7,%xmm4,%xmm4
1464	vmovdqa	112(%esp),%xmm6
1465	vpmuludq	(%edx),%xmm5,%xmm5
1466	vpaddq	%xmm5,%xmm3,%xmm3
1467	vpmuludq	64(%edx),%xmm6,%xmm7
1468	vpaddq	%xmm7,%xmm2,%xmm2
1469	vpmuludq	48(%edx),%xmm6,%xmm5
1470	vpaddq	%xmm5,%xmm1,%xmm1
1471	vmovdqa	64(%esp),%xmm7
1472	vpmuludq	32(%edx),%xmm6,%xmm6
1473	vpaddq	%xmm6,%xmm0,%xmm0
# Multiplicand limb 4 (%xmm7), then 5*limb4 for the wrapped terms.
1474	vmovdqa	128(%esp),%xmm5
1475	vpmuludq	(%edx),%xmm7,%xmm7
1476	vpaddq	%xmm7,%xmm4,%xmm4
1477	vpmuludq	64(%edx),%xmm5,%xmm6
1478	vpaddq	%xmm6,%xmm3,%xmm3
1479	vpmuludq	16(%edx),%xmm5,%xmm7
1480	vpaddq	%xmm7,%xmm0,%xmm0
1481	vpmuludq	32(%edx),%xmm5,%xmm6
1482	vpaddq	%xmm6,%xmm1,%xmm1
# Reload the 26-bit mask (%xmm7 was used as a temporary above).
1483	vmovdqa	64(%ebx),%xmm7
1484	vpmuludq	48(%edx),%xmm5,%xmm5
1485	vpaddq	%xmm5,%xmm2,%xmm2
# Carry propagation, two interleaved chains. Order of the carries:
# limb3 -> limb4, limb0 -> limb1, limb1 -> limb2, limb4 -> limb0 (added once
# plus once shifted left by 2, i.e. times 5), limb2 -> limb3, then a second
# limb0 -> limb1 and limb3 -> limb4. Every limb is masked to 26 bits after
# its carry is taken.
1486	vpsrlq	$26,%xmm3,%xmm5
1487	vpand	%xmm7,%xmm3,%xmm3
1488	vpsrlq	$26,%xmm0,%xmm6
1489	vpand	%xmm7,%xmm0,%xmm0
1490	vpaddq	%xmm5,%xmm4,%xmm4
1491	vpaddq	%xmm6,%xmm1,%xmm1
1492	vpsrlq	$26,%xmm4,%xmm5
1493	vpand	%xmm7,%xmm4,%xmm4
1494	vpsrlq	$26,%xmm1,%xmm6
1495	vpand	%xmm7,%xmm1,%xmm1
1496	vpaddq	%xmm6,%xmm2,%xmm2
1497	vpaddd	%xmm5,%xmm0,%xmm0
1498	vpsllq	$2,%xmm5,%xmm5
1499	vpsrlq	$26,%xmm2,%xmm6
1500	vpand	%xmm7,%xmm2,%xmm2
1501	vpaddd	%xmm5,%xmm0,%xmm0
1502	vpaddd	%xmm6,%xmm3,%xmm3
1503	vpsrlq	$26,%xmm3,%xmm6
1504	vpsrlq	$26,%xmm0,%xmm5
1505	vpand	%xmm7,%xmm0,%xmm0
1506	vpand	%xmm7,%xmm3,%xmm3
1507	vpaddd	%xmm5,%xmm1,%xmm1
1508	vpaddd	%xmm6,%xmm4,%xmm4
1509	decl	%ecx
1510	jz	.L019square_break
# After pass one: build the next multiplicand with the fresh result in the
# low lane and the low lane of the saved input (the key limb) in the high
# lane, then run the second pass.
1511	vpunpcklqdq	(%esp),%xmm0,%xmm0
1512	vpunpcklqdq	16(%esp),%xmm1,%xmm1
1513	vpunpcklqdq	32(%esp),%xmm2,%xmm2
1514	vpunpcklqdq	48(%esp),%xmm3,%xmm3
1515	vpunpcklqdq	64(%esp),%xmm4,%xmm4
1516	jmp	.L018square
1517.L019square_break:
# Pack four 32-bit values per limb into one register: the pass-two results
# move to the odd dwords, the pass-two inputs saved at 0..79(%esp) fill the
# even dwords.
1518	vpsllq	$32,%xmm0,%xmm0
1519	vpsllq	$32,%xmm1,%xmm1
1520	vpsllq	$32,%xmm2,%xmm2
1521	vpsllq	$32,%xmm3,%xmm3
1522	vpsllq	$32,%xmm4,%xmm4
1523	vpor	(%esp),%xmm0,%xmm0
1524	vpor	16(%esp),%xmm1,%xmm1
1525	vpor	32(%esp),%xmm2,%xmm2
1526	vpor	48(%esp),%xmm3,%xmm3
1527	vpor	64(%esp),%xmm4,%xmm4
# Shuffle 141 reorders the dwords to (source dword 1, 3, 0, 2): pass-two
# result low lane, pass-two result high lane, pass-one result, original key
# limb (presumably r^4, r^3, r^2, r^1 -- TODO confirm).
1528	vpshufd	$141,%xmm0,%xmm0
1529	vpshufd	$141,%xmm1,%xmm1
1530	vpshufd	$141,%xmm2,%xmm2
1531	vpshufd	$141,%xmm3,%xmm3
1532	vpshufd	$141,%xmm4,%xmm4
# Write the five limb rows to the table (%edi still points at context + 48).
1533	vmovdqu	%xmm0,(%edi)
1534	vmovdqu	%xmm1,16(%edi)
1535	vmovdqu	%xmm2,32(%edi)
1536	vmovdqu	%xmm3,48(%edi)
1537	vmovdqu	%xmm4,64(%edi)
# Followed by the four rows 5*limb1 .. 5*limb4 of the same four values.
1538	vpslld	$2,%xmm1,%xmm6
1539	vpslld	$2,%xmm2,%xmm5
1540	vpaddd	%xmm1,%xmm6,%xmm6
1541	vpaddd	%xmm2,%xmm5,%xmm5
1542	vmovdqu	%xmm6,80(%edi)
1543	vmovdqu	%xmm5,96(%edi)
1544	vpslld	$2,%xmm3,%xmm6
1545	vpslld	$2,%xmm4,%xmm5
1546	vpaddd	%xmm3,%xmm6,%xmm6
1547	vpaddd	%xmm4,%xmm5,%xmm5
1548	vmovdqu	%xmm6,112(%edi)
1549	vmovdqu	%xmm5,128(%edi)
# Release the scratch area and undo the +48 bias on %edi.
1550	movl	%ebp,%esp
1551	leal	-48(%edi),%edi
1552	ret
1553.size	_poly1305_init_avx2,.-_poly1305_init_avx2
# _poly1305_blocks_avx2 -- AVX2 block routine. poly1305_init (earlier in the
# file) selects its address as the block handler when its capability checks
# pass.
#
# Stack arguments, as seen after the four pushes below:
#   20(%esp)  context pointer
#   24(%esp)  input pointer
#   28(%esp)  length in bytes (masked with -16, so only whole 16-byte blocks
#             are consumed)
#   32(%esp)  fourth argument, presumably the pad-bit flag (0 or 1) -- TODO
#             confirm against the C prototype; it is consulted only on the
#             single-block path at .L025one
#
# Context fields touched here: 0..19(%edi) accumulator limbs, 20(%edi) flag
# that is nonzero once the accumulator is held as 26-bit limbs, 48..191(%edi)
# multiplier table written by _poly1305_init_avx2.
#
# Saves and restores %ebp, %ebx, %esi, %edi. Clobbers %eax, %ecx, %edx,
# %ymm0-%ymm7 and flags.
1554.align	32
1555.type	_poly1305_blocks_avx2,@function
1556.align	16
1557_poly1305_blocks_avx2:
1558	%ifdef __CET__
1559
# The four bytes below encode endbr32 (f3 0f 1e fb).
1560.byte	243,15,30,251
1561	%endif
1562
1563	pushl	%ebp
1564	pushl	%ebx
1565	pushl	%esi
1566	pushl	%edi
1567	movl	20(%esp),%edi
1568	movl	24(%esp),%esi
1569	movl	28(%esp),%ecx
# %eax = limb-format flag from the context.
1570	movl	20(%edi),%eax
1571	andl	$-16,%ecx
1572	jz	.L020nodata
# Inputs shorter than 64 bytes whose accumulator has not been converted yet
# are handed to the scalar code: .Lenter_blocks is a label inside
# poly1305_blocks, reached with the same four registers pushed and with
# %edi, %esi and %ecx already loaded.
1573	cmpl	$64,%ecx
1574	jae	.L021enter_avx2
1575	testl	%eax,%eax
1576	jz	.Lenter_blocks
1577.L021enter_avx2:
1578	vzeroupper
# Position-independent address of the constant table: call/pop yields the
# address of .L022pic_point, the lea adds the link-time distance.
1579	call	.L022pic_point
1580.L022pic_point:
1581	popl	%ebx
1582	leal	.Lconst_sse2-.L022pic_point(%ebx),%ebx
1583	testl	%eax,%eax
1584	jnz	.L023base2_26
# First use of the vector path for this context: build the multiplier table,
# then convert the accumulator in place.
1585	call	_poly1305_init_avx2
# Overlapping dword loads at byte offsets 0, 3, 6, 9, 13 combined with right
# shifts of 0, 2, 4, 6, 0 bits pick out the fields that start at bit 0, 26,
# 52, 78 and 104. The first four are masked to 26 bits (67108863 = 2^26-1);
# the last one is stored unmasked.
1586	movl	(%edi),%eax
1587	movl	3(%edi),%ecx
1588	movl	6(%edi),%edx
1589	movl	9(%edi),%esi
1590	movl	13(%edi),%ebp
1591	shrl	$2,%ecx
1592	andl	$67108863,%eax
1593	shrl	$4,%edx
1594	andl	$67108863,%ecx
1595	shrl	$6,%esi
1596	andl	$67108863,%edx
1597	movl	%eax,(%edi)
1598	movl	%ecx,4(%edi)
1599	movl	%edx,8(%edi)
1600	movl	%esi,12(%edi)
1601	movl	%ebp,16(%edi)
# Mark the accumulator as converted.
1602	movl	$1,20(%edi)
# Reload the input pointer and length, which were used as scratch above.
# NOTE(review): the reloaded length is the raw argument, not the value masked
# with -16 earlier; any low four bits flow into the tests below -- confirm
# that callers always pass a multiple of 16.
1603	movl	24(%esp),%esi
1604	movl	28(%esp),%ecx
1605.L023base2_26:
# %eax = fourth stack argument; %ebp keeps the pre-frame %esp.
1606	movl	32(%esp),%eax
1607	movl	%esp,%ebp
# 448-byte frame: 0..159(%esp) holds five 32-byte scratch slots and
# 160..447(%esp) holds nine 32-byte table rows. Aligning down to 512 keeps
# the frame, together with the offset reads made by the tail (which reach up
# to 475(%esp)), inside a single 512-byte aligned window.
1608	subl	$448,%esp
1609	andl	$-512,%esp
# Expand the table from the context onto the stack. %edx = %esp + 288, so row
# k (0..8) lives at 32*k - 128(%edx).
1610	vmovdqu	48(%edi),%xmm0
1611	leal	288(%esp),%edx
1612	vmovdqu	64(%edi),%xmm1
1613	vmovdqu	80(%edi),%xmm2
1614	vmovdqu	96(%edi),%xmm3
1615	vmovdqu	112(%edi),%xmm4
# From here on %edi = context + 48.
1616	leal	48(%edi),%edi
# The two shuffles turn the four source dwords (e0, e1, e2, e3) into the
# eight dwords e0,e0, e0,e1, e0,e2, e0,e3. A 32-byte aligned read therefore
# presents e0 in the low dword of every 64-bit lane, while a read at +4 bytes
# presents e0, e1, e2, e3 in lanes 0..3. vpmuludq only uses those low dwords.
1617	vpermq	$64,%ymm0,%ymm0
1618	vpermq	$64,%ymm1,%ymm1
1619	vpermq	$64,%ymm2,%ymm2
1620	vpermq	$64,%ymm3,%ymm3
1621	vpermq	$64,%ymm4,%ymm4
1622	vpshufd	$200,%ymm0,%ymm0
1623	vpshufd	$200,%ymm1,%ymm1
1624	vpshufd	$200,%ymm2,%ymm2
1625	vpshufd	$200,%ymm3,%ymm3
1626	vpshufd	$200,%ymm4,%ymm4
# Store limb rows 0..4 and start loading the four 5*limb rows (context
# offsets 128..191, addressed here relative to the biased %edi).
1627	vmovdqa	%ymm0,-128(%edx)
1628	vmovdqu	80(%edi),%xmm0
1629	vmovdqa	%ymm1,-96(%edx)
1630	vmovdqu	96(%edi),%xmm1
1631	vmovdqa	%ymm2,-64(%edx)
1632	vmovdqu	112(%edi),%xmm2
1633	vmovdqa	%ymm3,-32(%edx)
1634	vmovdqu	128(%edi),%xmm3
1635	vmovdqa	%ymm4,(%edx)
1636	vpermq	$64,%ymm0,%ymm0
1637	vpermq	$64,%ymm1,%ymm1
1638	vpermq	$64,%ymm2,%ymm2
1639	vpermq	$64,%ymm3,%ymm3
1640	vpshufd	$200,%ymm0,%ymm0
1641	vpshufd	$200,%ymm1,%ymm1
1642	vpshufd	$200,%ymm2,%ymm2
1643	vpshufd	$200,%ymm3,%ymm3
# Store the 5*limb rows and load the five accumulator limbs from the context
# into lane 0 of %ymm0..%ymm4 (vmovd clears the remaining lanes).
1644	vmovdqa	%ymm0,32(%edx)
1645	vmovd	-48(%edi),%xmm0
1646	vmovdqa	%ymm1,64(%edx)
1647	vmovd	-44(%edi),%xmm1
1648	vmovdqa	%ymm2,96(%edx)
1649	vmovd	-40(%edi),%xmm2
1650	vmovdqa	%ymm3,128(%edx)
1651	vmovd	-36(%edi),%xmm3
1652	vmovd	-32(%edi),%xmm4
# %ymm7 = 26-bit mask in every 64-bit lane.
1653	vmovdqa	64(%ebx),%ymm7
# %eax = minus the fourth argument; used as an index at .L025one.
1654	negl	%eax
# A length that is a multiple of 64 goes straight to the four-block path.
1655	testl	$63,%ecx
1656	jz	.L024even
# Otherwise process the 1..3 leading blocks first: %edx = leftover bytes,
# %ecx = remaining multiple of 64.
1657	movl	%ecx,%edx
1658	andl	$-64,%ecx
1659	andl	$63,%edx
1660	vmovdqu	(%esi),%xmm5
1661	cmpl	$32,%edx
1662	jb	.L025one
# The vmovdqu below leaves the flags of the compare untouched, so the je
# still tests for exactly 32 leftover bytes.
1663	vmovdqu	16(%esi),%xmm6
1664	je	.L026two
# Three leading blocks: they occupy lanes 0..2. %ebx + 8 makes the pad-bit
# read at the tail cover three 2^24 entries followed by a zero; %edx + 8
# shifts the table reads so lanes 0..2 see table elements 1, 2, 3.
1665	vinserti128	$1,32(%esi),%ymm5,%ymm5
1666	leal	48(%esi),%esi
1667	leal	8(%ebx),%ebx
1668	leal	296(%esp),%edx
1669	jmp	.L027tail
1670.L026two:
# Two leading blocks in lanes 0..1: two pad bits, table elements 2, 3.
1671	leal	32(%esi),%esi
1672	leal	16(%ebx),%ebx
1673	leal	304(%esp),%edx
1674	jmp	.L027tail
1675.L025one:
# One leading block in lane 0; %ymm6 (the odd-block register) is cleared.
# %ebx = table + 32 - 8 * fourth argument: with the argument equal to 1 the
# pad-bit read covers one 2^24 entry followed by zeros, with 0 only zeros.
# Lane 0 sees table element 3.
1676	leal	16(%esi),%esi
1677	vpxor	%ymm6,%ymm6,%ymm6
1678	leal	32(%ebx,%eax,8),%ebx
1679	leal	312(%esp),%edx
1680	jmp	.L027tail
1681.align	32
1682.L024even:
# Load four 16-byte blocks: %ymm5 = block 0 | block 2, %ymm6 = block 1 |
# block 3. After the unpack steps below, 64-bit lanes 0..3 carry blocks 0..3.
1683	vmovdqu	(%esi),%xmm5
1684	vmovdqu	16(%esi),%xmm6
1685	vinserti128	$1,32(%esi),%ymm5,%ymm5
1686	vinserti128	$1,48(%esi),%ymm6,%ymm6
1687	leal	64(%esi),%esi
1688	subl	$64,%ecx
1689	jz	.L027tail
# Main loop: one iteration per 64 input bytes. On entry %ymm0..%ymm4 hold the
# accumulator limbs 0..4 and %ymm5/%ymm6 the four blocks loaded in advance.
1690.L028loop:
# Park accumulator limbs 0..2 on the stack and split the blocks into 26-bit
# limbs: %ymm5 = bits 0-25, %ymm6 = bits 26-51, %ymm2 = bits 52-77,
# %ymm0 = bits 78-103, %ymm1 = bits 104-127.
1691	vmovdqa	%ymm2,64(%esp)
1692	vpsrldq	$6,%ymm5,%ymm2
1693	vmovdqa	%ymm0,(%esp)
1694	vpsrldq	$6,%ymm6,%ymm0
1695	vmovdqa	%ymm1,32(%esp)
1696	vpunpckhqdq	%ymm6,%ymm5,%ymm1
1697	vpunpcklqdq	%ymm6,%ymm5,%ymm5
1698	vpunpcklqdq	%ymm0,%ymm2,%ymm2
1699	vpsrlq	$30,%ymm2,%ymm0
1700	vpsrlq	$4,%ymm2,%ymm2
1701	vpsrlq	$26,%ymm5,%ymm6
1702	vpsrlq	$40,%ymm1,%ymm1
1703	vpand	%ymm7,%ymm2,%ymm2
1704	vpand	%ymm7,%ymm5,%ymm5
1705	vpand	%ymm7,%ymm6,%ymm6
1706	vpand	%ymm7,%ymm0,%ymm0
# OR 2^24 into the top limb of every lane (bit 128 of each block).
1707	vpor	(%ebx),%ymm1,%ymm1
# Add the accumulator: %ymm5, %ymm6, %ymm2, %ymm0, %ymm1 now hold limbs
# 0..4 of (accumulator + block).
1708	vpaddq	64(%esp),%ymm2,%ymm2
1709	vpaddq	(%esp),%ymm5,%ymm5
1710	vpaddq	32(%esp),%ymm6,%ymm6
1711	vpaddq	%ymm3,%ymm0,%ymm0
1712	vpaddq	%ymm4,%ymm1,%ymm1
# 5x5 limb product against the 32-byte aligned table rows, so every lane is
# multiplied by table element 0. Rows at -128..0(%edx) are limbs 0..4, rows
# at 32..128(%edx) are 5*limb1..5*limb4 for the wrapped terms. Results
# accumulate in %ymm0..%ymm4 as output limbs 0..4. Limb 2 first:
1713	vpmuludq	-96(%edx),%ymm2,%ymm3
1714	vmovdqa	%ymm6,32(%esp)
1715	vpmuludq	-64(%edx),%ymm2,%ymm4
1716	vmovdqa	%ymm0,96(%esp)
1717	vpmuludq	96(%edx),%ymm2,%ymm0
1718	vmovdqa	%ymm1,128(%esp)
1719	vpmuludq	128(%edx),%ymm2,%ymm1
1720	vpmuludq	-128(%edx),%ymm2,%ymm2
# Limb 0 (%ymm5).
1721	vpmuludq	-32(%edx),%ymm5,%ymm7
1722	vpaddq	%ymm7,%ymm3,%ymm3
1723	vpmuludq	(%edx),%ymm5,%ymm6
1724	vpaddq	%ymm6,%ymm4,%ymm4
1725	vpmuludq	-128(%edx),%ymm5,%ymm7
1726	vpaddq	%ymm7,%ymm0,%ymm0
1727	vmovdqa	32(%esp),%ymm7
1728	vpmuludq	-96(%edx),%ymm5,%ymm6
1729	vpaddq	%ymm6,%ymm1,%ymm1
1730	vpmuludq	-64(%edx),%ymm5,%ymm5
1731	vpaddq	%ymm5,%ymm2,%ymm2
# Limb 1 (%ymm7, reloaded from 32(%esp)).
1732	vpmuludq	-64(%edx),%ymm7,%ymm6
1733	vpaddq	%ymm6,%ymm3,%ymm3
1734	vpmuludq	-32(%edx),%ymm7,%ymm5
1735	vpaddq	%ymm5,%ymm4,%ymm4
1736	vpmuludq	128(%edx),%ymm7,%ymm6
1737	vpaddq	%ymm6,%ymm0,%ymm0
1738	vmovdqa	96(%esp),%ymm6
1739	vpmuludq	-128(%edx),%ymm7,%ymm5
1740	vpaddq	%ymm5,%ymm1,%ymm1
1741	vpmuludq	-96(%edx),%ymm7,%ymm7
1742	vpaddq	%ymm7,%ymm2,%ymm2
# Limb 3 (%ymm6, reloaded from 96(%esp)).
1743	vpmuludq	-128(%edx),%ymm6,%ymm5
1744	vpaddq	%ymm5,%ymm3,%ymm3
1745	vpmuludq	-96(%edx),%ymm6,%ymm7
1746	vpaddq	%ymm7,%ymm4,%ymm4
1747	vpmuludq	64(%edx),%ymm6,%ymm5
1748	vpaddq	%ymm5,%ymm0,%ymm0
1749	vmovdqa	128(%esp),%ymm5
1750	vpmuludq	96(%edx),%ymm6,%ymm7
1751	vpaddq	%ymm7,%ymm1,%ymm1
1752	vpmuludq	128(%edx),%ymm6,%ymm6
1753	vpaddq	%ymm6,%ymm2,%ymm2
# Limb 4 (%ymm5, reloaded from 128(%esp)).
1754	vpmuludq	128(%edx),%ymm5,%ymm7
1755	vpaddq	%ymm7,%ymm3,%ymm3
1756	vpmuludq	32(%edx),%ymm5,%ymm6
1757	vpaddq	%ymm6,%ymm0,%ymm0
1758	vpmuludq	-128(%edx),%ymm5,%ymm7
1759	vpaddq	%ymm7,%ymm4,%ymm4
# Reload the 26-bit mask (%ymm7 served as a temporary).
1760	vmovdqa	64(%ebx),%ymm7
1761	vpmuludq	64(%edx),%ymm5,%ymm6
1762	vpaddq	%ymm6,%ymm1,%ymm1
1763	vpmuludq	96(%edx),%ymm5,%ymm5
1764	vpaddq	%ymm5,%ymm2,%ymm2
# Carry propagation in every lane: limb3 -> limb4, limb0 -> limb1,
# limb1 -> limb2, limb4 -> limb0 (carry added once plus once shifted left by
# 2, i.e. times 5), limb2 -> limb3, then limb0 -> limb1 and limb3 -> limb4
# again.
1765	vpsrlq	$26,%ymm3,%ymm5
1766	vpand	%ymm7,%ymm3,%ymm3
1767	vpsrlq	$26,%ymm0,%ymm6
1768	vpand	%ymm7,%ymm0,%ymm0
1769	vpaddq	%ymm5,%ymm4,%ymm4
1770	vpaddq	%ymm6,%ymm1,%ymm1
1771	vpsrlq	$26,%ymm4,%ymm5
1772	vpand	%ymm7,%ymm4,%ymm4
1773	vpsrlq	$26,%ymm1,%ymm6
1774	vpand	%ymm7,%ymm1,%ymm1
1775	vpaddq	%ymm6,%ymm2,%ymm2
1776	vpaddq	%ymm5,%ymm0,%ymm0
1777	vpsllq	$2,%ymm5,%ymm5
1778	vpsrlq	$26,%ymm2,%ymm6
1779	vpand	%ymm7,%ymm2,%ymm2
1780	vpaddq	%ymm5,%ymm0,%ymm0
1781	vpaddq	%ymm6,%ymm3,%ymm3
1782	vpsrlq	$26,%ymm3,%ymm6
1783	vpsrlq	$26,%ymm0,%ymm5
1784	vpand	%ymm7,%ymm0,%ymm0
1785	vpand	%ymm7,%ymm3,%ymm3
1786	vpaddq	%ymm5,%ymm1,%ymm1
1787	vpaddq	%ymm6,%ymm4,%ymm4
# Load the next four blocks and loop while more 64-byte groups remain after
# them; the last group loaded is consumed by the tail.
1788	vmovdqu	(%esi),%xmm5
1789	vmovdqu	16(%esi),%xmm6
1790	vinserti128	$1,32(%esi),%ymm5,%ymm5
1791	vinserti128	$1,48(%esi),%ymm6,%ymm6
1792	leal	64(%esi),%esi
1793	subl	$64,%ecx
1794	jnz	.L028loop
# Tail: same add-and-multiply step as the loop body, but the table is read at
# +4 bytes (plus any bias placed in %edx by the partial-group paths), so each
# lane is multiplied by its own table element; the four lanes are then summed
# into one value.
1795.L027tail:
# Split the blocks into limbs exactly as in the loop.
1796	vmovdqa	%ymm2,64(%esp)
1797	vpsrldq	$6,%ymm5,%ymm2
1798	vmovdqa	%ymm0,(%esp)
1799	vpsrldq	$6,%ymm6,%ymm0
1800	vmovdqa	%ymm1,32(%esp)
1801	vpunpckhqdq	%ymm6,%ymm5,%ymm1
1802	vpunpcklqdq	%ymm6,%ymm5,%ymm5
1803	vpunpcklqdq	%ymm0,%ymm2,%ymm2
1804	vpsrlq	$30,%ymm2,%ymm0
1805	vpsrlq	$4,%ymm2,%ymm2
1806	vpsrlq	$26,%ymm5,%ymm6
1807	vpsrlq	$40,%ymm1,%ymm1
1808	vpand	%ymm7,%ymm2,%ymm2
1809	vpand	%ymm7,%ymm5,%ymm5
1810	vpand	%ymm7,%ymm6,%ymm6
1811	vpand	%ymm7,%ymm0,%ymm0
# Pad bits, possibly read through the biased %ebx set up by a partial-group
# path; afterwards %ebx is rounded back down to the 64-byte aligned start of
# the constant table.
1812	vpor	(%ebx),%ymm1,%ymm1
1813	andl	$-64,%ebx
1814	vpaddq	64(%esp),%ymm2,%ymm2
1815	vpaddq	(%esp),%ymm5,%ymm5
1816	vpaddq	32(%esp),%ymm6,%ymm6
1817	vpaddq	%ymm3,%ymm0,%ymm0
1818	vpaddq	%ymm4,%ymm1,%ymm1
# Product with table offsets shifted by +4 relative to the loop body.
1819	vpmuludq	-92(%edx),%ymm2,%ymm3
1820	vmovdqa	%ymm6,32(%esp)
1821	vpmuludq	-60(%edx),%ymm2,%ymm4
1822	vmovdqa	%ymm0,96(%esp)
1823	vpmuludq	100(%edx),%ymm2,%ymm0
1824	vmovdqa	%ymm1,128(%esp)
1825	vpmuludq	132(%edx),%ymm2,%ymm1
1826	vpmuludq	-124(%edx),%ymm2,%ymm2
1827	vpmuludq	-28(%edx),%ymm5,%ymm7
1828	vpaddq	%ymm7,%ymm3,%ymm3
1829	vpmuludq	4(%edx),%ymm5,%ymm6
1830	vpaddq	%ymm6,%ymm4,%ymm4
1831	vpmuludq	-124(%edx),%ymm5,%ymm7
1832	vpaddq	%ymm7,%ymm0,%ymm0
1833	vmovdqa	32(%esp),%ymm7
1834	vpmuludq	-92(%edx),%ymm5,%ymm6
1835	vpaddq	%ymm6,%ymm1,%ymm1
1836	vpmuludq	-60(%edx),%ymm5,%ymm5
1837	vpaddq	%ymm5,%ymm2,%ymm2
1838	vpmuludq	-60(%edx),%ymm7,%ymm6
1839	vpaddq	%ymm6,%ymm3,%ymm3
1840	vpmuludq	-28(%edx),%ymm7,%ymm5
1841	vpaddq	%ymm5,%ymm4,%ymm4
1842	vpmuludq	132(%edx),%ymm7,%ymm6
1843	vpaddq	%ymm6,%ymm0,%ymm0
1844	vmovdqa	96(%esp),%ymm6
1845	vpmuludq	-124(%edx),%ymm7,%ymm5
1846	vpaddq	%ymm5,%ymm1,%ymm1
1847	vpmuludq	-92(%edx),%ymm7,%ymm7
1848	vpaddq	%ymm7,%ymm2,%ymm2
1849	vpmuludq	-124(%edx),%ymm6,%ymm5
1850	vpaddq	%ymm5,%ymm3,%ymm3
1851	vpmuludq	-92(%edx),%ymm6,%ymm7
1852	vpaddq	%ymm7,%ymm4,%ymm4
1853	vpmuludq	68(%edx),%ymm6,%ymm5
1854	vpaddq	%ymm5,%ymm0,%ymm0
1855	vmovdqa	128(%esp),%ymm5
1856	vpmuludq	100(%edx),%ymm6,%ymm7
1857	vpaddq	%ymm7,%ymm1,%ymm1
1858	vpmuludq	132(%edx),%ymm6,%ymm6
1859	vpaddq	%ymm6,%ymm2,%ymm2
1860	vpmuludq	132(%edx),%ymm5,%ymm7
1861	vpaddq	%ymm7,%ymm3,%ymm3
1862	vpmuludq	36(%edx),%ymm5,%ymm6
1863	vpaddq	%ymm6,%ymm0,%ymm0
1864	vpmuludq	-124(%edx),%ymm5,%ymm7
1865	vpaddq	%ymm7,%ymm4,%ymm4
# Reload the mask; %ebx is back at the table start thanks to the and above.
1866	vmovdqa	64(%ebx),%ymm7
1867	vpmuludq	68(%edx),%ymm5,%ymm6
1868	vpaddq	%ymm6,%ymm1,%ymm1
1869	vpmuludq	100(%edx),%ymm5,%ymm5
1870	vpaddq	%ymm5,%ymm2,%ymm2
# Horizontal sum of the four 64-bit lanes of each limb: first fold the odd
# lane of each 128-bit half onto the even one, then fold the upper half onto
# the lower. The total ends up in lane 0.
1871	vpsrldq	$8,%ymm4,%ymm5
1872	vpsrldq	$8,%ymm3,%ymm6
1873	vpaddq	%ymm5,%ymm4,%ymm4
1874	vpsrldq	$8,%ymm0,%ymm5
1875	vpaddq	%ymm6,%ymm3,%ymm3
1876	vpsrldq	$8,%ymm1,%ymm6
1877	vpaddq	%ymm5,%ymm0,%ymm0
1878	vpsrldq	$8,%ymm2,%ymm5
1879	vpaddq	%ymm6,%ymm1,%ymm1
1880	vpermq	$2,%ymm4,%ymm6
1881	vpaddq	%ymm5,%ymm2,%ymm2
1882	vpermq	$2,%ymm3,%ymm5
1883	vpaddq	%ymm6,%ymm4,%ymm4
1884	vpermq	$2,%ymm0,%ymm6
1885	vpaddq	%ymm5,%ymm3,%ymm3
1886	vpermq	$2,%ymm1,%ymm5
1887	vpaddq	%ymm6,%ymm0,%ymm0
1888	vpermq	$2,%ymm2,%ymm6
1889	vpaddq	%ymm5,%ymm1,%ymm1
1890	vpaddq	%ymm6,%ymm2,%ymm2
# Carry propagation, same sequence as in the loop.
1891	vpsrlq	$26,%ymm3,%ymm5
1892	vpand	%ymm7,%ymm3,%ymm3
1893	vpsrlq	$26,%ymm0,%ymm6
1894	vpand	%ymm7,%ymm0,%ymm0
1895	vpaddq	%ymm5,%ymm4,%ymm4
1896	vpaddq	%ymm6,%ymm1,%ymm1
1897	vpsrlq	$26,%ymm4,%ymm5
1898	vpand	%ymm7,%ymm4,%ymm4
1899	vpsrlq	$26,%ymm1,%ymm6
1900	vpand	%ymm7,%ymm1,%ymm1
1901	vpaddq	%ymm6,%ymm2,%ymm2
1902	vpaddq	%ymm5,%ymm0,%ymm0
1903	vpsllq	$2,%ymm5,%ymm5
1904	vpsrlq	$26,%ymm2,%ymm6
1905	vpand	%ymm7,%ymm2,%ymm2
1906	vpaddq	%ymm5,%ymm0,%ymm0
1907	vpaddq	%ymm6,%ymm3,%ymm3
1908	vpsrlq	$26,%ymm3,%ymm6
1909	vpsrlq	$26,%ymm0,%ymm5
1910	vpand	%ymm7,%ymm0,%ymm0
1911	vpand	%ymm7,%ymm3,%ymm3
1912	vpaddq	%ymm5,%ymm1,%ymm1
1913	vpaddq	%ymm6,%ymm4,%ymm4
# %ecx is nonzero only when the tail was entered from a partial-group path
# with whole 64-byte groups still pending.
1914	cmpl	$0,%ecx
1915	je	.L029done
# Keep dword 0 of each limb and copy dword 3 into dwords 1..3 (presumably
# zero after the reduction above -- TODO confirm); the xmm-width VEX form
# also clears the upper 128 bits. Then restore the unbiased table pointer
# and continue with the four-block path.
1916	vpshufd	$252,%xmm0,%xmm0
1917	leal	288(%esp),%edx
1918	vpshufd	$252,%xmm1,%xmm1
1919	vpshufd	$252,%xmm2,%xmm2
1920	vpshufd	$252,%xmm3,%xmm3
1921	vpshufd	$252,%xmm4,%xmm4
1922	jmp	.L024even
1923.align	16
1924.L029done:
# Write the five accumulator limbs back to context offsets 0..16 (%edi is
# still biased by +48), clear the upper ymm state and drop the frame.
1925	vmovd	%xmm0,-48(%edi)
1926	vmovd	%xmm1,-44(%edi)
1927	vmovd	%xmm2,-40(%edi)
1928	vmovd	%xmm3,-36(%edi)
1929	vmovd	%xmm4,-32(%edi)
1930	vzeroupper
1931	movl	%ebp,%esp
# Common exit; the zero-length case arrives here with the frame never built.
1932.L020nodata:
1933	popl	%edi
1934	popl	%esi
1935	popl	%ebx
1936	popl	%ebp
1937	ret
1938.size	_poly1305_blocks_avx2,.-_poly1305_blocks_avx2
# Constant table shared by the vector routines; the AVX2 code addresses it
# through %ebx. The 64-byte alignment is relied on by the and-with-minus-64
# in the tail of _poly1305_blocks_avx2, which recovers the table start from
# a biased pointer.
1939.align	64
1940.Lconst_sse2:
# Offset 0: 2^24 in each of four 64-bit lanes; ORed into the top limb of
# every input block.
1941.long	16777216,0,16777216,0,16777216,0,16777216,0
# Offset 32: zeros; reads that start 8, 16, 24 or 32 bytes into the table
# run into this row, giving lanes without the 2^24 bit.
1942.long	0,0,0,0,0,0,0,0
# Offset 64: 2^26-1 in each 64-bit lane, the limb mask.
1943.long	67108863,0,67108863,0,67108863,0,67108863,0
# Offset 96: same four values as the immediates poly1305_init applies to the
# key words; not referenced by the code visible in this part of the file.
1944.long	268435455,268435452,268435452,268435452
# NUL-terminated ASCII tag: Poly1305 for x86, CRYPTOGAMS by
# <appro@openssl.org>
1945.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
1946.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
1947.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
1948.byte	114,103,62,0
1949.align	4
# Common symbol, 16 bytes with 4-byte alignment; poly1305_init reads words
# at offsets 0 and 8 of it to choose the code path.
1950.comm	OPENSSL_ia32cap_P,16,4
1951
# ELF note section carrying one GNU program property. Layout: name size,
# descriptor size, note type, name, then the descriptor (property type, data
# size, data, padding).
# NOTE(review): this note is emitted unconditionally, while the endbr32
# bytes at the function entries are guarded by __CET__ -- confirm that the
# two can never disagree in a real build.
1952	.section ".note.gnu.property", "a"
1953	.p2align 2
# Name size (distance between labels 0 and 1, i.e. the string GNU plus NUL).
1954	.long 1f - 0f
# Descriptor size (distance between labels 1 and 4).
1955	.long 4f - 1f
# Note type 5 (NT_GNU_PROPERTY_TYPE_0).
1956	.long 5
19570:
1958	.asciz "GNU"
19591:
1960	.p2align 2
# Property type 0xc0000002 (GNU_PROPERTY_X86_FEATURE_1_AND).
1961	.long 0xc0000002
# Property data size (distance between labels 2 and 3).
1962	.long 3f - 2f
19632:
# Feature bits: value 3 sets bit 0 (IBT) and bit 1 (SHSTK).
1964	.long 3
19653:
1966	.p2align 2
19674:
1968