• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1%ifidn __OUTPUT_FORMAT__,obj
2section	code	use32 class=code align=64
3%elifidn __OUTPUT_FORMAT__,win32
4$@feat.00 equ 1
5section	.text	code align=64
6%else
7section	.text	code
8%endif
9;extern	_OPENSSL_ia32cap_P
10align	64
;-----------------------------------------------------------------------
; _poly1305_init -- 32-bit entry point taking three stack arguments.
;   [esp+4]  context block: bytes 0..23 are cleared, bytes 24..39
;            receive the masked key words
;   [esp+8]  pointer to 16 key bytes, or NULL to only clear the state
;   [esp+12] two-dword table that receives the addresses of the block
;            and emit routines chosen for the running CPU
; Returns eax = 1 when a key was installed, eax = 0 for a NULL key.
; ebp/ebx/esi/edi are preserved; ecx, edx and flags are clobbered.
;-----------------------------------------------------------------------
global	_poly1305_init
align	16
_poly1305_init:
L$_poly1305_init_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	ebp,DWORD [28+esp]		; ebp = function table
	mov	esi,DWORD [24+esp]		; esi = key (may be NULL)
	mov	edi,DWORD [20+esp]		; edi = context

	; Clear the five accumulator words and the dword at offset 20.
	; eax stays zero and doubles as the return value on the NULL-key path.
	xor	eax,eax
	mov	DWORD [20+edi],eax
	mov	DWORD [16+edi],eax
	mov	DWORD [12+edi],eax
	mov	DWORD [8+edi],eax
	mov	DWORD [4+edi],eax
	mov	DWORD [edi],eax
	test	esi,esi
	jz	NEAR L$000nokey

	; Position-independent address computation: ebx = address of pic_point.
	call	L$001pic_point
L$001pic_point:
	pop	ebx

	; Default to the scalar routines.
	lea	eax,[(_poly1305_blocks-L$001pic_point)+ebx]
	lea	edx,[(_poly1305_emit-L$001pic_point)+ebx]

	; Both bit 24 and bit 26 of the first capability word must be set
	; before the SSE2 routines are selected.
	mov	ecx,DWORD [_OPENSSL_ia32cap_P]
	and	ecx,0x05000000
	cmp	ecx,0x05000000
	jne	NEAR L$002no_sse2
	lea	eax,[(__poly1305_blocks_sse2-L$001pic_point)+ebx]
	lea	edx,[(__poly1305_emit_sse2-L$001pic_point)+ebx]

	; Bit 5 of the third capability word upgrades the block routine only.
	test	DWORD [8+_OPENSSL_ia32cap_P],32
	jz	NEAR L$002no_sse2
	lea	eax,[(__poly1305_blocks_avx2-L$001pic_point)+ebx]
L$002no_sse2:
	mov	DWORD [4+ebp],edx		; table[1] = emit routine
	mov	DWORD [ebp],eax			; table[0] = block routine

	; Read all four key words first, then mask and store them.
	mov	edx,DWORD [12+esi]
	mov	ecx,DWORD [8+esi]
	mov	ebx,DWORD [4+esi]
	mov	eax,DWORD [esi]
	and	edx,0x0ffffffc
	and	ecx,0x0ffffffc
	and	ebx,0x0ffffffc
	and	eax,0x0fffffff
	mov	DWORD [36+edi],edx
	mov	DWORD [32+edi],ecx
	mov	DWORD [28+edi],ebx
	mov	DWORD [24+edi],eax
	mov	eax,1
L$000nokey:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
;-----------------------------------------------------------------------
; _poly1305_blocks -- scalar (integer-only) block routine.
; Stack arguments on entry:
;   [esp+4]  context: five accumulator dwords at +0..+16, four key
;            dwords at +24..+36
;   [esp+8]  input pointer
;   [esp+12] input length in bytes; only whole 16-byte blocks are consumed
;   [esp+16] value added to the fifth accumulator word for every block
; For each block: accumulator += block (plus the [esp+16] word on top),
; then accumulator *= key with the part above bit 130 folded back in as
; a multiple of 5.  The updated accumulator is written to the context.
; ebp/ebx/esi/edi are preserved; eax, ecx, edx and flags are clobbered.
;
; L$enter_blocks is also a jump target from outside this routine: the
; jumper must have pushed ebp/ebx/esi/edi and loaded edi = context,
; esi = input, ecx = length exactly as the prologue below does.
;
; Frame after "sub esp,64":
;   [esp+0], [esp+12], [esp+16]  accumulator words 0, 3, 4 for this block
;   [esp+20..28]                 low three words of the product
;   [esp+36..48]                 key words k0..k3
;   [esp+52..60]                 k1+(k1>>2), k2+(k2>>2), k3+(k3>>2)
;   [esp+84] context, [esp+92] end-of-input pointer (overwrites the
;   length argument), [esp+96] the per-block top-word addend
;-----------------------------------------------------------------------
global	_poly1305_blocks
align	16
_poly1305_blocks:
L$_poly1305_blocks_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	edi,DWORD [20+esp]
	mov	esi,DWORD [24+esp]
	mov	ecx,DWORD [28+esp]
L$enter_blocks:
	; Round the length down to whole 16-byte blocks.  The mask has to be
	; -16: with -15 bit 0 of the length survives, the end pointer below is
	; then not a multiple of 16 away from the start, and the loop's
	; "cmp ebp,end / jne" exit test can never succeed.
	and	ecx,-16
	jz	NEAR L$003nodata
	sub	esp,64
	mov	eax,DWORD [24+edi]
	mov	ebx,DWORD [28+edi]
	lea	ebp,[ecx*1+esi]			; ebp = input + length
	mov	ecx,DWORD [32+edi]
	mov	edx,DWORD [36+edi]
	mov	DWORD [92+esp],ebp		; remember the end pointer
	mov	ebp,esi				; ebp = input cursor
	; Spill k0..k3 and build k_i + (k_i >> 2) for i = 1..3.
	mov	DWORD [36+esp],eax
	mov	eax,ebx
	shr	eax,2
	mov	DWORD [40+esp],ebx
	add	eax,ebx
	mov	ebx,ecx
	shr	ebx,2
	mov	DWORD [44+esp],ecx
	add	ebx,ecx
	mov	ecx,edx
	shr	ecx,2
	mov	DWORD [48+esp],edx
	add	ecx,edx
	mov	DWORD [52+esp],eax
	mov	DWORD [56+esp],ebx
	mov	DWORD [60+esp],ecx
	; Load the accumulator: eax, ebx, ecx, esi, edi = words 0..4.
	mov	eax,DWORD [edi]
	mov	ebx,DWORD [4+edi]
	mov	ecx,DWORD [8+edi]
	mov	esi,DWORD [12+edi]
	mov	edi,DWORD [16+edi]
	jmp	NEAR L$004loop
align	32
L$004loop:
	; accumulator += next 16 input bytes; lea advances the cursor without
	; disturbing the carry chain.
	add	eax,DWORD [ebp]
	adc	ebx,DWORD [4+ebp]
	adc	ecx,DWORD [8+ebp]
	adc	esi,DWORD [12+ebp]
	lea	ebp,[16+ebp]
	adc	edi,DWORD [96+esp]
	mov	DWORD [esp],eax
	mov	DWORD [12+esp],esi
	; Product word 0, accumulated in esi:edi.
	mul	DWORD [36+esp]
	mov	DWORD [16+esp],edi
	mov	edi,eax
	mov	eax,ebx
	mov	esi,edx
	mul	DWORD [60+esp]
	add	edi,eax
	mov	eax,ecx
	adc	esi,edx
	mul	DWORD [56+esp]
	add	edi,eax
	mov	eax,DWORD [12+esp]
	adc	esi,edx
	mul	DWORD [52+esp]
	add	edi,eax
	mov	eax,DWORD [esp]
	adc	esi,edx
	; Product word 1, accumulated in edi:esi (esi carries in from word 0).
	mul	DWORD [40+esp]
	mov	DWORD [20+esp],edi
	xor	edi,edi
	add	esi,eax
	mov	eax,ebx
	adc	edi,edx
	mul	DWORD [36+esp]
	add	esi,eax
	mov	eax,ecx
	adc	edi,edx
	mul	DWORD [60+esp]
	add	esi,eax
	mov	eax,DWORD [12+esp]
	adc	edi,edx
	mul	DWORD [56+esp]
	add	esi,eax
	mov	eax,DWORD [16+esp]
	adc	edi,edx
	imul	eax,DWORD [52+esp]
	add	esi,eax
	mov	eax,DWORD [esp]
	adc	edi,0
	; Product word 2, accumulated in esi:edi.
	mul	DWORD [44+esp]
	mov	DWORD [24+esp],esi
	xor	esi,esi
	add	edi,eax
	mov	eax,ebx
	adc	esi,edx
	mul	DWORD [40+esp]
	add	edi,eax
	mov	eax,ecx
	adc	esi,edx
	mul	DWORD [36+esp]
	add	edi,eax
	mov	eax,DWORD [12+esp]
	adc	esi,edx
	mul	DWORD [60+esp]
	add	edi,eax
	mov	eax,DWORD [16+esp]
	adc	esi,edx
	imul	eax,DWORD [56+esp]
	add	edi,eax
	mov	eax,DWORD [esp]
	adc	esi,0
	; Product word 3, accumulated in edi:esi.
	mul	DWORD [48+esp]
	mov	DWORD [28+esp],edi
	xor	edi,edi
	add	esi,eax
	mov	eax,ebx
	adc	edi,edx
	mul	DWORD [44+esp]
	add	esi,eax
	mov	eax,ecx
	adc	edi,edx
	mul	DWORD [40+esp]
	add	esi,eax
	mov	eax,DWORD [12+esp]
	adc	edi,edx
	mul	DWORD [36+esp]
	add	esi,eax
	mov	ecx,DWORD [16+esp]
	adc	edi,edx
	mov	edx,ecx
	imul	ecx,DWORD [60+esp]
	add	esi,ecx
	mov	eax,DWORD [20+esp]
	adc	edi,0
	; Product word 4 = word4 * k0 + carry.
	imul	edx,DWORD [36+esp]
	add	edx,edi
	mov	ebx,DWORD [24+esp]
	mov	ecx,DWORD [28+esp]
	; Keep the low two bits of word 4 and fold the rest back into word 0
	; multiplied by 5 (lea computes x*4+x).
	mov	edi,edx
	shr	edx,2
	and	edi,3
	lea	edx,[edx*4+edx]
	add	eax,edx
	adc	ebx,0
	adc	ecx,0
	adc	esi,0
	adc	edi,0
	cmp	ebp,DWORD [92+esp]
	jne	NEAR L$004loop
	; Write the accumulator back to the context.
	mov	edx,DWORD [84+esp]
	add	esp,64
	mov	DWORD [edx],eax
	mov	DWORD [4+edx],ebx
	mov	DWORD [8+edx],ecx
	mov	DWORD [12+edx],esi
	mov	DWORD [16+edx],edi
L$003nodata:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
;-----------------------------------------------------------------------
; _poly1305_emit -- scalar finalisation.
; Stack arguments on entry:
;   [esp+4]  context (accumulator dwords at +0..+16)
;   [esp+8]  16-byte output buffer
;   [esp+12] four dwords that are added to the result
; Computes g = accumulator + 5.  If g carried into bit 2 of the fifth
; word the low 128 bits of g are used, otherwise the low 128 bits of the
; accumulator; the four extra dwords are then added and the 16-byte sum
; is stored to the output buffer.  The choice is made with a mask, not a
; branch.  The output buffer is also used as scratch for the masked g.
; ebp/ebx/esi/edi are preserved; eax, ecx, edx and flags are clobbered.
;
; L$enter_emit is also a jump target from outside this routine: the
; jumper must have pushed ebp/ebx/esi/edi and loaded ebp = context.
;-----------------------------------------------------------------------
global	_poly1305_emit
align	16
_poly1305_emit:
L$_poly1305_emit_begin:
	push	ebp
	push	ebx
	push	esi
	push	edi
	mov	ebp,DWORD [20+esp]		; ebp = context
L$enter_emit:
	mov	edi,DWORD [24+esp]		; edi = output buffer
	mov	esi,DWORD [16+ebp]
	mov	edx,DWORD [12+ebp]
	mov	ecx,DWORD [8+ebp]
	mov	ebx,DWORD [4+ebp]
	mov	eax,DWORD [ebp]

	; g = accumulator + 5, carried through all five words.
	add	eax,5
	adc	ebx,0
	adc	ecx,0
	adc	edx,0
	adc	esi,0

	; esi = 0xffffffff when bit 2 of the top word is set, else 0.
	shr	esi,2
	neg	esi

	; Park (g & mask) in the output buffer.
	and	edx,esi
	and	ecx,esi
	and	ebx,esi
	and	eax,esi
	mov	DWORD [12+edi],edx
	mov	DWORD [8+edi],ecx
	mov	DWORD [4+edi],ebx
	mov	DWORD [edi],eax

	; Reload the accumulator and merge in (accumulator & ~mask).
	not	esi
	mov	edx,DWORD [12+ebp]
	mov	ecx,DWORD [8+ebp]
	mov	ebx,DWORD [4+ebp]
	mov	eax,DWORD [ebp]
	mov	ebp,DWORD [28+esp]		; ebp = the four addend dwords
	and	eax,esi
	or	eax,DWORD [edi]
	and	ebx,esi
	or	ebx,DWORD [4+edi]
	and	ecx,esi
	or	ecx,DWORD [8+edi]
	and	edx,esi
	or	edx,DWORD [12+edi]

	; Add the four extra dwords; the carry out of the top word is dropped.
	add	eax,DWORD [ebp]
	adc	ebx,DWORD [4+ebp]
	adc	ecx,DWORD [8+ebp]
	adc	edx,DWORD [12+ebp]
	mov	DWORD [12+edi],edx
	mov	DWORD [8+edi],ecx
	mov	DWORD [4+edi],ebx
	mov	DWORD [edi],eax
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
;-----------------------------------------------------------------------
; __poly1305_init_sse2 -- internal helper, reached by "call" from
; __poly1305_blocks_sse2.  Register interface (no stack arguments):
;   in : edi = context, ebx = base of a constant table whose entry at
;        +64 is used as the AND mask after every 26-bit shift
;        (presumably 2^26-1; the table itself is outside this view)
;   out: a table of 9 x 16 bytes written at context+48 .. context+191;
;        edi restored to the context; xmm7 = the +64 constant
;   clobbers: ebp (used to save esp), ecx, edx, xmm0-xmm7, flags
; It reads the 16 bytes at context+24, splits them into five 26-bit
; limbs and runs the multiply/reduce loop below twice.
;-----------------------------------------------------------------------
294align	32
295align	16
296__poly1305_init_sse2:
297	movdqu	xmm4,[24+edi]
298	lea	edi,[48+edi]
; Carve out a 16-byte aligned scratch frame; ebp keeps the old esp.
299	mov	ebp,esp
300	sub	esp,224
301	and	esp,-16
; Only the low 8 bytes of the mask are loaded here (movq).
302	movq	xmm7,[64+ebx]
; Split into limbs at bit offsets 0, 26, 52 (6 bytes + 4 bits),
; 78 (6 bytes + 30 bits) and 104 (13 bytes).
303	movdqa	xmm0,xmm4
304	movdqa	xmm1,xmm4
305	movdqa	xmm2,xmm4
306	pand	xmm0,xmm7
307	psrlq	xmm1,26
308	psrldq	xmm2,6
309	pand	xmm1,xmm7
310	movdqa	xmm3,xmm2
311	psrlq	xmm2,4
312	psrlq	xmm3,30
313	pand	xmm2,xmm7
314	pand	xmm3,xmm7
315	psrldq	xmm4,13
316	lea	edx,[144+esp]
; Two passes through the loop.
317	mov	ecx,2
318L$005square:
; Save the current limbs at [esp+0..64] and 5x limbs 1..4 at [esp+80..128]
; (x*4 + x via pslld 2 / paddd).
319	movdqa	[esp],xmm0
320	movdqa	[16+esp],xmm1
321	movdqa	[32+esp],xmm2
322	movdqa	[48+esp],xmm3
323	movdqa	[64+esp],xmm4
324	movdqa	xmm6,xmm1
325	movdqa	xmm5,xmm2
326	pslld	xmm6,2
327	pslld	xmm5,2
328	paddd	xmm6,xmm1
329	paddd	xmm5,xmm2
330	movdqa	[80+esp],xmm6
331	movdqa	[96+esp],xmm5
332	movdqa	xmm6,xmm3
333	movdqa	xmm5,xmm4
334	pslld	xmm6,2
335	pslld	xmm5,2
336	paddd	xmm6,xmm3
337	paddd	xmm5,xmm4
338	movdqa	[112+esp],xmm6
339	movdqa	[128+esp],xmm5
; pshufd 68 copies the low quadword into both halves; the broadcast
; limbs go to [edx+0..64] and serve as the second multiplicand.
340	pshufd	xmm6,xmm0,68
341	movdqa	xmm5,xmm1
342	pshufd	xmm1,xmm1,68
343	pshufd	xmm2,xmm2,68
344	pshufd	xmm3,xmm3,68
345	pshufd	xmm4,xmm4,68
346	movdqa	[edx],xmm6
347	movdqa	[16+edx],xmm1
348	movdqa	[32+edx],xmm2
349	movdqa	[48+edx],xmm3
350	movdqa	[64+edx],xmm4
; 5x5 limb product of the saved value with the broadcast value.  Partial
; products that would land above limb 4 use the pre-multiplied-by-5
; copies and are added into the low limbs instead.
351	pmuludq	xmm4,xmm0
352	pmuludq	xmm3,xmm0
353	pmuludq	xmm2,xmm0
354	pmuludq	xmm1,xmm0
355	pmuludq	xmm0,xmm6
356	movdqa	xmm6,xmm5
357	pmuludq	xmm5,[48+edx]
358	movdqa	xmm7,xmm6
359	pmuludq	xmm6,[32+edx]
360	paddq	xmm4,xmm5
361	movdqa	xmm5,xmm7
362	pmuludq	xmm7,[16+edx]
363	paddq	xmm3,xmm6
364	movdqa	xmm6,[80+esp]
365	pmuludq	xmm5,[edx]
366	paddq	xmm2,xmm7
367	pmuludq	xmm6,[64+edx]
368	movdqa	xmm7,[32+esp]
369	paddq	xmm1,xmm5
370	movdqa	xmm5,xmm7
371	pmuludq	xmm7,[32+edx]
372	paddq	xmm0,xmm6
373	movdqa	xmm6,xmm5
374	pmuludq	xmm5,[16+edx]
375	paddq	xmm4,xmm7
376	movdqa	xmm7,[96+esp]
377	pmuludq	xmm6,[edx]
378	paddq	xmm3,xmm5
379	movdqa	xmm5,xmm7
380	pmuludq	xmm7,[64+edx]
381	paddq	xmm2,xmm6
382	pmuludq	xmm5,[48+edx]
383	movdqa	xmm6,[48+esp]
384	paddq	xmm1,xmm7
385	movdqa	xmm7,xmm6
386	pmuludq	xmm6,[16+edx]
387	paddq	xmm0,xmm5
388	movdqa	xmm5,[112+esp]
389	pmuludq	xmm7,[edx]
390	paddq	xmm4,xmm6
391	movdqa	xmm6,xmm5
392	pmuludq	xmm5,[64+edx]
393	paddq	xmm3,xmm7
394	movdqa	xmm7,xmm6
395	pmuludq	xmm6,[48+edx]
396	paddq	xmm2,xmm5
397	pmuludq	xmm7,[32+edx]
398	movdqa	xmm5,[64+esp]
399	paddq	xmm1,xmm6
400	movdqa	xmm6,[128+esp]
401	pmuludq	xmm5,[edx]
402	paddq	xmm0,xmm7
403	movdqa	xmm7,xmm6
404	pmuludq	xmm6,[64+edx]
405	paddq	xmm4,xmm5
406	movdqa	xmm5,xmm7
407	pmuludq	xmm7,[16+edx]
408	paddq	xmm3,xmm6
409	movdqa	xmm6,xmm5
410	pmuludq	xmm5,[32+edx]
411	paddq	xmm0,xmm7
412	pmuludq	xmm6,[48+edx]
; Reload the full 16-byte mask, then propagate carries between limbs:
; 3->4, 0->1, 4->0 (added once and once more shifted left by 2, i.e.
; times 5), 1->2, 2->3, 0->1, 3->4.
413	movdqa	xmm7,[64+ebx]
414	paddq	xmm1,xmm5
415	paddq	xmm2,xmm6
416	movdqa	xmm5,xmm3
417	pand	xmm3,xmm7
418	psrlq	xmm5,26
419	paddq	xmm5,xmm4
420	movdqa	xmm6,xmm0
421	pand	xmm0,xmm7
422	psrlq	xmm6,26
423	movdqa	xmm4,xmm5
424	paddq	xmm6,xmm1
425	psrlq	xmm5,26
426	pand	xmm4,xmm7
427	movdqa	xmm1,xmm6
428	psrlq	xmm6,26
429	paddd	xmm0,xmm5
430	psllq	xmm5,2
431	paddq	xmm6,xmm2
432	paddq	xmm5,xmm0
433	pand	xmm1,xmm7
434	movdqa	xmm2,xmm6
435	psrlq	xmm6,26
436	pand	xmm2,xmm7
437	paddd	xmm6,xmm3
438	movdqa	xmm0,xmm5
439	psrlq	xmm5,26
440	movdqa	xmm3,xmm6
441	psrlq	xmm6,26
442	pand	xmm0,xmm7
443	paddd	xmm1,xmm5
444	pand	xmm3,xmm7
445	paddd	xmm4,xmm6
446	dec	ecx
447	jz	NEAR L$006square_break
; End of pass 1: low quadword = the fresh product, high quadword = the
; previously saved value.  Pass 2 multiplies both by the fresh product.
448	punpcklqdq	xmm0,[esp]
449	punpcklqdq	xmm1,[16+esp]
450	punpcklqdq	xmm2,[32+esp]
451	punpcklqdq	xmm3,[48+esp]
452	punpcklqdq	xmm4,[64+esp]
453	jmp	NEAR L$005square
454L$006square_break:
; Move the pass-2 results into the odd dwords, merge the pass-1 values
; from [esp] into the even dwords, then reorder with pshufd 141
; (source dwords 1,3,0,2).  Writing k for the value loaded from
; context+24, the four dwords of each row work out to limbs of
; k^4, k^3, k^2, k in that order.
455	psllq	xmm0,32
456	psllq	xmm1,32
457	psllq	xmm2,32
458	psllq	xmm3,32
459	psllq	xmm4,32
460	por	xmm0,[esp]
461	por	xmm1,[16+esp]
462	por	xmm2,[32+esp]
463	por	xmm3,[48+esp]
464	por	xmm4,[64+esp]
465	pshufd	xmm0,xmm0,141
466	pshufd	xmm1,xmm1,141
467	pshufd	xmm2,xmm2,141
468	pshufd	xmm3,xmm3,141
469	pshufd	xmm4,xmm4,141
; Rows 0..4 of the table: limbs 0..4 (edi is context+48 here).
470	movdqu	[edi],xmm0
471	movdqu	[16+edi],xmm1
472	movdqu	[32+edi],xmm2
473	movdqu	[48+edi],xmm3
474	movdqu	[64+edi],xmm4
; Rows 5..8: limbs 1..4 multiplied by 5.
475	movdqa	xmm6,xmm1
476	movdqa	xmm5,xmm2
477	pslld	xmm6,2
478	pslld	xmm5,2
479	paddd	xmm6,xmm1
480	paddd	xmm5,xmm2
481	movdqu	[80+edi],xmm6
482	movdqu	[96+edi],xmm5
483	movdqa	xmm6,xmm3
484	movdqa	xmm5,xmm4
485	pslld	xmm6,2
486	pslld	xmm5,2
487	paddd	xmm6,xmm3
488	paddd	xmm5,xmm4
489	movdqu	[112+edi],xmm6
490	movdqu	[128+edi],xmm5
; Drop the scratch frame and hand edi back pointing at the context.
491	mov	esp,ebp
492	lea	edi,[edi-48]
493	ret
;-----------------------------------------------------------------------
; __poly1305_blocks_sse2 -- SSE2 block routine; _poly1305_init installs
; its address in the caller's function table when the SSE2 capability
; bits are set.  Same stack arguments as _poly1305_blocks:
;   [esp+4] context, [esp+8] input, [esp+12] length, [esp+16] pad word.
; Context fields used here: accumulator at +0..+16, a flag dword at +20
; (zero while the accumulator is five packed 32-bit words, set to 1 below
; once it has been converted to five 26-bit limbs), and the table built
; by __poly1305_init_sse2 at +48.
; ebx points at L$const_sse2 (defined outside this view): the entry at
; +64 is the AND mask used after 26-bit shifts, the entry at +0 is OR-ed
; into the top limb of every pair of blocks loaded in bulk.
; ebp/ebx/esi/edi are preserved.
;-----------------------------------------------------------------------
494align	32
495align	16
496__poly1305_blocks_sse2:
497	push	ebp
498	push	ebx
499	push	esi
500	push	edi
501	mov	edi,DWORD [20+esp]
502	mov	esi,DWORD [24+esp]
503	mov	ecx,DWORD [28+esp]
504	mov	eax,DWORD [20+edi]
; Whole 16-byte blocks only; nothing to do for less than one block.
505	and	ecx,-16
506	jz	NEAR L$007nodata
; Fewer than 64 bytes while the accumulator is still in 32-bit form:
; hand over to the scalar loop (registers and pushed state match what
; L$enter_blocks expects).
507	cmp	ecx,64
508	jae	NEAR L$008enter_sse2
509	test	eax,eax
510	jz	NEAR L$enter_blocks
511align	16
512L$008enter_sse2:
; Position-independent address of the constant table.
513	call	L$009pic_point
514L$009pic_point:
515	pop	ebx
516	lea	ebx,[(L$const_sse2-L$009pic_point)+ebx]
517	test	eax,eax
518	jnz	NEAR L$010base2_26
; First vector call for this context: build the table, then convert the
; accumulator to 26-bit limbs using overlapping unaligned loads at byte
; offsets 0, 3, 6, 9 and 13 (bit offsets 0, 24+2, 48+4, 72+6, 104).
519	call	__poly1305_init_sse2
520	mov	eax,DWORD [edi]
521	mov	ecx,DWORD [3+edi]
522	mov	edx,DWORD [6+edi]
523	mov	esi,DWORD [9+edi]
524	mov	ebp,DWORD [13+edi]
525	mov	DWORD [20+edi],1
526	shr	ecx,2
527	and	eax,67108863
528	shr	edx,4
529	and	ecx,67108863
530	shr	esi,6
531	and	edx,67108863
532	movd	xmm0,eax
533	movd	xmm1,ecx
534	movd	xmm2,edx
535	movd	xmm3,esi
536	movd	xmm4,ebp
; esi and ecx were used as scratch above; reload them from the arguments.
; Note that ecx is the raw length argument again on this path.
537	mov	esi,DWORD [24+esp]
538	mov	ecx,DWORD [28+esp]
539	jmp	NEAR L$011base2_32
540align	16
541L$010base2_26:
; Accumulator already stored as five limbs; load them and the mask.
542	movd	xmm0,DWORD [edi]
543	movd	xmm1,DWORD [4+edi]
544	movd	xmm2,DWORD [8+edi]
545	movd	xmm3,DWORD [12+edi]
546	movd	xmm4,DWORD [16+edi]
547	movdqa	xmm7,[64+ebx]
548L$011base2_32:
; eax = fourth argument, shifted so that its bit 0 lands on bit 24 of the
; top limb (bit 104+24 = 128 of the block).  ebp saves esp across the
; 16-byte aligned scratch frame; edi now points at the table.
549	mov	eax,DWORD [32+esp]
550	mov	ebp,esp
551	sub	esp,528
552	and	esp,-16
553	lea	edi,[48+edi]
554	shl	eax,24
; Length a multiple of 32 -> go straight to the two-blocks-at-a-time code.
555	test	ecx,31
556	jz	NEAR L$012even
; Otherwise consume one block on its own first: split it into limbs, add
; it (and the shifted pad word) to the accumulator ...
557	movdqu	xmm6,[esi]
558	lea	esi,[16+esi]
559	movdqa	xmm5,xmm6
560	pand	xmm6,xmm7
561	paddd	xmm0,xmm6
562	movdqa	xmm6,xmm5
563	psrlq	xmm5,26
564	psrldq	xmm6,6
565	pand	xmm5,xmm7
566	paddd	xmm1,xmm5
567	movdqa	xmm5,xmm6
568	psrlq	xmm6,4
569	pand	xmm6,xmm7
570	paddd	xmm2,xmm6
571	movdqa	xmm6,xmm5
572	psrlq	xmm5,30
573	pand	xmm5,xmm7
574	psrldq	xmm6,7
575	paddd	xmm3,xmm5
576	movd	xmm5,eax
577	paddd	xmm4,xmm6
578	movd	xmm6,DWORD [12+edi]
579	paddd	xmm4,xmm5
; ... then multiply by the fourth dword of each table row (offsets 12,
; 28, 44, 60, 76 and, for the 5x rows, 92, 108, 124, 140).
580	movdqa	[esp],xmm0
581	movdqa	[16+esp],xmm1
582	movdqa	[32+esp],xmm2
583	movdqa	[48+esp],xmm3
584	movdqa	[64+esp],xmm4
585	pmuludq	xmm0,xmm6
586	pmuludq	xmm1,xmm6
587	pmuludq	xmm2,xmm6
588	movd	xmm5,DWORD [28+edi]
589	pmuludq	xmm3,xmm6
590	pmuludq	xmm4,xmm6
591	movdqa	xmm6,xmm5
592	pmuludq	xmm5,[48+esp]
593	movdqa	xmm7,xmm6
594	pmuludq	xmm6,[32+esp]
595	paddq	xmm4,xmm5
596	movdqa	xmm5,xmm7
597	pmuludq	xmm7,[16+esp]
598	paddq	xmm3,xmm6
599	movd	xmm6,DWORD [92+edi]
600	pmuludq	xmm5,[esp]
601	paddq	xmm2,xmm7
602	pmuludq	xmm6,[64+esp]
603	movd	xmm7,DWORD [44+edi]
604	paddq	xmm1,xmm5
605	movdqa	xmm5,xmm7
606	pmuludq	xmm7,[32+esp]
607	paddq	xmm0,xmm6
608	movdqa	xmm6,xmm5
609	pmuludq	xmm5,[16+esp]
610	paddq	xmm4,xmm7
611	movd	xmm7,DWORD [108+edi]
612	pmuludq	xmm6,[esp]
613	paddq	xmm3,xmm5
614	movdqa	xmm5,xmm7
615	pmuludq	xmm7,[64+esp]
616	paddq	xmm2,xmm6
617	pmuludq	xmm5,[48+esp]
618	movd	xmm6,DWORD [60+edi]
619	paddq	xmm1,xmm7
620	movdqa	xmm7,xmm6
621	pmuludq	xmm6,[16+esp]
622	paddq	xmm0,xmm5
623	movd	xmm5,DWORD [124+edi]
624	pmuludq	xmm7,[esp]
625	paddq	xmm4,xmm6
626	movdqa	xmm6,xmm5
627	pmuludq	xmm5,[64+esp]
628	paddq	xmm3,xmm7
629	movdqa	xmm7,xmm6
630	pmuludq	xmm6,[48+esp]
631	paddq	xmm2,xmm5
632	pmuludq	xmm7,[32+esp]
633	movd	xmm5,DWORD [76+edi]
634	paddq	xmm1,xmm6
635	movd	xmm6,DWORD [140+edi]
636	pmuludq	xmm5,[esp]
637	paddq	xmm0,xmm7
638	movdqa	xmm7,xmm6
639	pmuludq	xmm6,[64+esp]
640	paddq	xmm4,xmm5
641	movdqa	xmm5,xmm7
642	pmuludq	xmm7,[16+esp]
643	paddq	xmm3,xmm6
644	movdqa	xmm6,xmm5
645	pmuludq	xmm5,[32+esp]
646	paddq	xmm0,xmm7
647	pmuludq	xmm6,[48+esp]
; Carry propagation between limbs (the carry out of limb 4 re-enters
; limb 0 multiplied by 5).
648	movdqa	xmm7,[64+ebx]
649	paddq	xmm1,xmm5
650	paddq	xmm2,xmm6
651	movdqa	xmm5,xmm3
652	pand	xmm3,xmm7
653	psrlq	xmm5,26
654	paddq	xmm5,xmm4
655	movdqa	xmm6,xmm0
656	pand	xmm0,xmm7
657	psrlq	xmm6,26
658	movdqa	xmm4,xmm5
659	paddq	xmm6,xmm1
660	psrlq	xmm5,26
661	pand	xmm4,xmm7
662	movdqa	xmm1,xmm6
663	psrlq	xmm6,26
664	paddd	xmm0,xmm5
665	psllq	xmm5,2
666	paddq	xmm6,xmm2
667	paddq	xmm5,xmm0
668	pand	xmm1,xmm7
669	movdqa	xmm2,xmm6
670	psrlq	xmm6,26
671	pand	xmm2,xmm7
672	paddd	xmm6,xmm3
673	movdqa	xmm0,xmm5
674	psrlq	xmm5,26
675	movdqa	xmm3,xmm6
676	psrlq	xmm6,26
677	pand	xmm0,xmm7
678	paddd	xmm1,xmm5
679	pand	xmm3,xmm7
680	paddd	xmm4,xmm6
; One block consumed; finished if that was all of it.
681	sub	ecx,16
682	jz	NEAR L$013done
683L$012even:
; Copy the nine table rows to the stack in two broadcast forms:
;   [edx + 16*i]       <- dwords 0,1 of row i duplicated (pshufd 68)
;   [edx - 144 + 16*i] <- dwords 2,3 of row i duplicated (pshufd 238)
; "sub ecx,64" sets the flags consumed by the cmovb just below and by the
; "jbe" at the end of this section; only lea, cmov and SSE instructions
; sit in between, so the flags survive.  When fewer than 64 bytes remain
; esi is pulled back by 32 so that the loads at [32+esi]/[48+esi] fetch
; the next two blocks rather than reading 32 bytes further on.
684	lea	edx,[384+esp]
685	lea	eax,[esi-32]
686	sub	ecx,64
687	movdqu	xmm5,[edi]
688	pshufd	xmm6,xmm5,68
689	cmovb	esi,eax
690	pshufd	xmm5,xmm5,238
691	movdqa	[edx],xmm6
692	lea	eax,[160+esp]
693	movdqu	xmm6,[16+edi]
694	movdqa	[edx-144],xmm5
695	pshufd	xmm5,xmm6,68
696	pshufd	xmm6,xmm6,238
697	movdqa	[16+edx],xmm5
698	movdqu	xmm5,[32+edi]
699	movdqa	[edx-128],xmm6
700	pshufd	xmm6,xmm5,68
701	pshufd	xmm5,xmm5,238
702	movdqa	[32+edx],xmm6
703	movdqu	xmm6,[48+edi]
704	movdqa	[edx-112],xmm5
705	pshufd	xmm5,xmm6,68
706	pshufd	xmm6,xmm6,238
707	movdqa	[48+edx],xmm5
708	movdqu	xmm5,[64+edi]
709	movdqa	[edx-96],xmm6
710	pshufd	xmm6,xmm5,68
711	pshufd	xmm5,xmm5,238
712	movdqa	[64+edx],xmm6
713	movdqu	xmm6,[80+edi]
714	movdqa	[edx-80],xmm5
715	pshufd	xmm5,xmm6,68
716	pshufd	xmm6,xmm6,238
717	movdqa	[80+edx],xmm5
718	movdqu	xmm5,[96+edi]
719	movdqa	[edx-64],xmm6
720	pshufd	xmm6,xmm5,68
721	pshufd	xmm5,xmm5,238
722	movdqa	[96+edx],xmm6
723	movdqu	xmm6,[112+edi]
724	movdqa	[edx-48],xmm5
725	pshufd	xmm5,xmm6,68
726	pshufd	xmm6,xmm6,238
727	movdqa	[112+edx],xmm5
728	movdqu	xmm5,[128+edi]
729	movdqa	[edx-32],xmm6
730	pshufd	xmm6,xmm5,68
731	pshufd	xmm5,xmm5,238
732	movdqa	[128+edx],xmm6
733	movdqa	[edx-16],xmm5
; Load two blocks, park accumulator limbs 2..4 at [esp+112..144], and
; split the pair into five limb vectors (one block per 64-bit lane):
; xmm5 = limb 0, xmm6 = limb 1, xmm2 = limb 2, xmm3 = limb 3, and
; xmm4 = limb 4 with the constant at [ebx] OR-ed in.
734	movdqu	xmm5,[32+esi]
735	movdqu	xmm6,[48+esi]
736	lea	esi,[32+esi]
737	movdqa	[112+esp],xmm2
738	movdqa	[128+esp],xmm3
739	movdqa	[144+esp],xmm4
740	movdqa	xmm2,xmm5
741	movdqa	xmm3,xmm6
742	psrldq	xmm2,6
743	psrldq	xmm3,6
744	movdqa	xmm4,xmm5
745	punpcklqdq	xmm2,xmm3
746	punpckhqdq	xmm4,xmm6
747	punpcklqdq	xmm5,xmm6
748	movdqa	xmm3,xmm2
749	psrlq	xmm2,4
750	psrlq	xmm3,30
751	movdqa	xmm6,xmm5
752	psrlq	xmm4,40
753	psrlq	xmm6,26
754	pand	xmm5,xmm7
755	pand	xmm6,xmm7
756	pand	xmm2,xmm7
757	pand	xmm3,xmm7
758	por	xmm4,[ebx]
; Accumulator limbs 0 and 1 go to [esp+80] and [esp+96].
759	movdqa	[80+esp],xmm0
760	movdqa	[96+esp],xmm1
; Flags still come from "sub ecx,64" above: 64 bytes or fewer -> tail.
761	jbe	NEAR L$014skip_loop
762	jmp	NEAR L$015loop
763align	32
764L$015loop:
; Main loop, 64 bytes (four blocks) per iteration.
; First half: the two blocks just split are multiplied by the table copy
; at [edx-144..edx-16] (dwords 2,3 of each row).
765	movdqa	xmm7,[edx-144]
766	movdqa	[16+eax],xmm6
767	movdqa	[32+eax],xmm2
768	movdqa	[48+eax],xmm3
769	movdqa	[64+eax],xmm4
770	movdqa	xmm1,xmm5
771	pmuludq	xmm5,xmm7
772	movdqa	xmm0,xmm6
773	pmuludq	xmm6,xmm7
774	pmuludq	xmm2,xmm7
775	pmuludq	xmm3,xmm7
776	pmuludq	xmm4,xmm7
777	pmuludq	xmm0,[edx-16]
778	movdqa	xmm7,xmm1
779	pmuludq	xmm1,[edx-128]
780	paddq	xmm0,xmm5
781	movdqa	xmm5,xmm7
782	pmuludq	xmm7,[edx-112]
783	paddq	xmm1,xmm6
784	movdqa	xmm6,xmm5
785	pmuludq	xmm5,[edx-96]
786	paddq	xmm2,xmm7
787	movdqa	xmm7,[16+eax]
788	pmuludq	xmm6,[edx-80]
789	paddq	xmm3,xmm5
790	movdqa	xmm5,xmm7
791	pmuludq	xmm7,[edx-128]
792	paddq	xmm4,xmm6
793	movdqa	xmm6,xmm5
794	pmuludq	xmm5,[edx-112]
795	paddq	xmm2,xmm7
796	movdqa	xmm7,[32+eax]
797	pmuludq	xmm6,[edx-96]
798	paddq	xmm3,xmm5
799	movdqa	xmm5,xmm7
800	pmuludq	xmm7,[edx-32]
801	paddq	xmm4,xmm6
802	movdqa	xmm6,xmm5
803	pmuludq	xmm5,[edx-16]
804	paddq	xmm0,xmm7
805	movdqa	xmm7,xmm6
806	pmuludq	xmm6,[edx-128]
807	paddq	xmm1,xmm5
808	movdqa	xmm5,[48+eax]
809	pmuludq	xmm7,[edx-112]
810	paddq	xmm3,xmm6
811	movdqa	xmm6,xmm5
812	pmuludq	xmm5,[edx-48]
813	paddq	xmm4,xmm7
814	movdqa	xmm7,xmm6
815	pmuludq	xmm6,[edx-32]
816	paddq	xmm0,xmm5
817	movdqa	xmm5,xmm7
818	pmuludq	xmm7,[edx-16]
819	paddq	xmm1,xmm6
820	movdqa	xmm6,[64+eax]
821	pmuludq	xmm5,[edx-128]
822	paddq	xmm2,xmm7
823	movdqa	xmm7,xmm6
824	pmuludq	xmm6,[edx-16]
825	paddq	xmm4,xmm5
826	movdqa	xmm5,xmm7
827	pmuludq	xmm7,[edx-64]
828	paddq	xmm3,xmm6
829	movdqa	xmm6,xmm5
830	pmuludq	xmm5,[edx-48]
831	paddq	xmm0,xmm7
832	movdqa	xmm7,[64+ebx]
833	pmuludq	xmm6,[edx-32]
834	paddq	xmm1,xmm5
835	paddq	xmm2,xmm6
; Second half: load the two earlier blocks (esi-32, esi-16), split them,
; add the saved accumulator limbs, and multiply by the table copy at
; [edx..edx+128] (dwords 0,1 of each row), adding onto the first half.
836	movdqu	xmm5,[esi-32]
837	movdqu	xmm6,[esi-16]
838	lea	esi,[32+esi]
839	movdqa	[32+esp],xmm2
840	movdqa	[48+esp],xmm3
841	movdqa	[64+esp],xmm4
842	movdqa	xmm2,xmm5
843	movdqa	xmm3,xmm6
844	psrldq	xmm2,6
845	psrldq	xmm3,6
846	movdqa	xmm4,xmm5
847	punpcklqdq	xmm2,xmm3
848	punpckhqdq	xmm4,xmm6
849	punpcklqdq	xmm5,xmm6
850	movdqa	xmm3,xmm2
851	psrlq	xmm2,4
852	psrlq	xmm3,30
853	movdqa	xmm6,xmm5
854	psrlq	xmm4,40
855	psrlq	xmm6,26
856	pand	xmm5,xmm7
857	pand	xmm6,xmm7
858	pand	xmm2,xmm7
859	pand	xmm3,xmm7
860	por	xmm4,[ebx]
; Loop counter update.  The flags from this "sub" feed the cmovb a few
; lines down and the "ja" at the bottom of the loop; everything in
; between is lea, cmov or SSE and leaves EFLAGS alone.
861	lea	eax,[esi-32]
862	sub	ecx,64
863	paddd	xmm5,[80+esp]
864	paddd	xmm6,[96+esp]
865	paddd	xmm2,[112+esp]
866	paddd	xmm3,[128+esp]
867	paddd	xmm4,[144+esp]
868	cmovb	esi,eax
869	lea	eax,[160+esp]
870	movdqa	xmm7,[edx]
871	movdqa	[16+esp],xmm1
872	movdqa	[16+eax],xmm6
873	movdqa	[32+eax],xmm2
874	movdqa	[48+eax],xmm3
875	movdqa	[64+eax],xmm4
876	movdqa	xmm1,xmm5
877	pmuludq	xmm5,xmm7
878	paddq	xmm5,xmm0
879	movdqa	xmm0,xmm6
880	pmuludq	xmm6,xmm7
881	pmuludq	xmm2,xmm7
882	pmuludq	xmm3,xmm7
883	pmuludq	xmm4,xmm7
884	paddq	xmm6,[16+esp]
885	paddq	xmm2,[32+esp]
886	paddq	xmm3,[48+esp]
887	paddq	xmm4,[64+esp]
888	pmuludq	xmm0,[128+edx]
889	movdqa	xmm7,xmm1
890	pmuludq	xmm1,[16+edx]
891	paddq	xmm0,xmm5
892	movdqa	xmm5,xmm7
893	pmuludq	xmm7,[32+edx]
894	paddq	xmm1,xmm6
895	movdqa	xmm6,xmm5
896	pmuludq	xmm5,[48+edx]
897	paddq	xmm2,xmm7
898	movdqa	xmm7,[16+eax]
899	pmuludq	xmm6,[64+edx]
900	paddq	xmm3,xmm5
901	movdqa	xmm5,xmm7
902	pmuludq	xmm7,[16+edx]
903	paddq	xmm4,xmm6
904	movdqa	xmm6,xmm5
905	pmuludq	xmm5,[32+edx]
906	paddq	xmm2,xmm7
907	movdqa	xmm7,[32+eax]
908	pmuludq	xmm6,[48+edx]
909	paddq	xmm3,xmm5
910	movdqa	xmm5,xmm7
911	pmuludq	xmm7,[112+edx]
912	paddq	xmm4,xmm6
913	movdqa	xmm6,xmm5
914	pmuludq	xmm5,[128+edx]
915	paddq	xmm0,xmm7
916	movdqa	xmm7,xmm6
917	pmuludq	xmm6,[16+edx]
918	paddq	xmm1,xmm5
919	movdqa	xmm5,[48+eax]
920	pmuludq	xmm7,[32+edx]
921	paddq	xmm3,xmm6
922	movdqa	xmm6,xmm5
923	pmuludq	xmm5,[96+edx]
924	paddq	xmm4,xmm7
925	movdqa	xmm7,xmm6
926	pmuludq	xmm6,[112+edx]
927	paddq	xmm0,xmm5
928	movdqa	xmm5,xmm7
929	pmuludq	xmm7,[128+edx]
930	paddq	xmm1,xmm6
931	movdqa	xmm6,[64+eax]
932	pmuludq	xmm5,[16+edx]
933	paddq	xmm2,xmm7
934	movdqa	xmm7,xmm6
935	pmuludq	xmm6,[128+edx]
936	paddq	xmm4,xmm5
937	movdqa	xmm5,xmm7
938	pmuludq	xmm7,[80+edx]
939	paddq	xmm3,xmm6
940	movdqa	xmm6,xmm5
941	pmuludq	xmm5,[96+edx]
942	paddq	xmm0,xmm7
943	movdqa	xmm7,[64+ebx]
944	pmuludq	xmm6,[112+edx]
945	paddq	xmm1,xmm5
946	paddq	xmm2,xmm6
; Carry propagation, same pattern as after the single-block multiply.
947	movdqa	xmm5,xmm3
948	pand	xmm3,xmm7
949	psrlq	xmm5,26
950	paddq	xmm5,xmm4
951	movdqa	xmm6,xmm0
952	pand	xmm0,xmm7
953	psrlq	xmm6,26
954	movdqa	xmm4,xmm5
955	paddq	xmm6,xmm1
956	psrlq	xmm5,26
957	pand	xmm4,xmm7
958	movdqa	xmm1,xmm6
959	psrlq	xmm6,26
960	paddd	xmm0,xmm5
961	psllq	xmm5,2
962	paddq	xmm6,xmm2
963	paddq	xmm5,xmm0
964	pand	xmm1,xmm7
965	movdqa	xmm2,xmm6
966	psrlq	xmm6,26
967	pand	xmm2,xmm7
968	paddd	xmm6,xmm3
969	movdqa	xmm0,xmm5
970	psrlq	xmm5,26
971	movdqa	xmm3,xmm6
972	psrlq	xmm6,26
973	pand	xmm0,xmm7
974	paddd	xmm1,xmm5
975	pand	xmm3,xmm7
976	paddd	xmm4,xmm6
; Pre-load and split the next pair of blocks for the following iteration
; (or for the tail), and park the accumulator limbs again.
977	movdqu	xmm5,[32+esi]
978	movdqu	xmm6,[48+esi]
979	lea	esi,[32+esi]
980	movdqa	[112+esp],xmm2
981	movdqa	[128+esp],xmm3
982	movdqa	[144+esp],xmm4
983	movdqa	xmm2,xmm5
984	movdqa	xmm3,xmm6
985	psrldq	xmm2,6
986	psrldq	xmm3,6
987	movdqa	xmm4,xmm5
988	punpcklqdq	xmm2,xmm3
989	punpckhqdq	xmm4,xmm6
990	punpcklqdq	xmm5,xmm6
991	movdqa	xmm3,xmm2
992	psrlq	xmm2,4
993	psrlq	xmm3,30
994	movdqa	xmm6,xmm5
995	psrlq	xmm4,40
996	psrlq	xmm6,26
997	pand	xmm5,xmm7
998	pand	xmm6,xmm7
999	pand	xmm2,xmm7
1000	pand	xmm3,xmm7
1001	por	xmm4,[ebx]
1002	movdqa	[80+esp],xmm0
1003	movdqa	[96+esp],xmm1
; Flags from the "sub ecx,64" in the middle of the loop body.
1004	ja	NEAR L$015loop
1005L$014skip_loop:
; Tail.  pshufd 16 places source dword 0 in the low lane and source
; dword 1 in the high lane, so the two lanes are now multiplied by two
; different table entries.  "add ecx,32" distinguishes the cases: zero
; means only the pair already split remains, so the accumulator is added
; to it here; non-zero means one more pair follows.  The resulting flags
; are tested again by the "jz" further down (only SSE code in between).
1006	pshufd	xmm7,[edx-144],16
1007	add	ecx,32
1008	jnz	NEAR L$016long_tail
1009	paddd	xmm5,xmm0
1010	paddd	xmm6,xmm1
1011	paddd	xmm2,[112+esp]
1012	paddd	xmm3,[128+esp]
1013	paddd	xmm4,[144+esp]
1014L$016long_tail:
1015	movdqa	[eax],xmm5
1016	movdqa	[16+eax],xmm6
1017	movdqa	[32+eax],xmm2
1018	movdqa	[48+eax],xmm3
1019	movdqa	[64+eax],xmm4
1020	pmuludq	xmm5,xmm7
1021	pmuludq	xmm6,xmm7
1022	pmuludq	xmm2,xmm7
1023	movdqa	xmm0,xmm5
1024	pshufd	xmm5,[edx-128],16
1025	pmuludq	xmm3,xmm7
1026	movdqa	xmm1,xmm6
1027	pmuludq	xmm4,xmm7
1028	movdqa	xmm6,xmm5
1029	pmuludq	xmm5,[48+eax]
1030	movdqa	xmm7,xmm6
1031	pmuludq	xmm6,[32+eax]
1032	paddq	xmm4,xmm5
1033	movdqa	xmm5,xmm7
1034	pmuludq	xmm7,[16+eax]
1035	paddq	xmm3,xmm6
1036	pshufd	xmm6,[edx-64],16
1037	pmuludq	xmm5,[eax]
1038	paddq	xmm2,xmm7
1039	pmuludq	xmm6,[64+eax]
1040	pshufd	xmm7,[edx-112],16
1041	paddq	xmm1,xmm5
1042	movdqa	xmm5,xmm7
1043	pmuludq	xmm7,[32+eax]
1044	paddq	xmm0,xmm6
1045	movdqa	xmm6,xmm5
1046	pmuludq	xmm5,[16+eax]
1047	paddq	xmm4,xmm7
1048	pshufd	xmm7,[edx-48],16
1049	pmuludq	xmm6,[eax]
1050	paddq	xmm3,xmm5
1051	movdqa	xmm5,xmm7
1052	pmuludq	xmm7,[64+eax]
1053	paddq	xmm2,xmm6
1054	pmuludq	xmm5,[48+eax]
1055	pshufd	xmm6,[edx-96],16
1056	paddq	xmm1,xmm7
1057	movdqa	xmm7,xmm6
1058	pmuludq	xmm6,[16+eax]
1059	paddq	xmm0,xmm5
1060	pshufd	xmm5,[edx-32],16
1061	pmuludq	xmm7,[eax]
1062	paddq	xmm4,xmm6
1063	movdqa	xmm6,xmm5
1064	pmuludq	xmm5,[64+eax]
1065	paddq	xmm3,xmm7
1066	movdqa	xmm7,xmm6
1067	pmuludq	xmm6,[48+eax]
1068	paddq	xmm2,xmm5
1069	pmuludq	xmm7,[32+eax]
1070	pshufd	xmm5,[edx-80],16
1071	paddq	xmm1,xmm6
1072	pshufd	xmm6,[edx-16],16
1073	pmuludq	xmm5,[eax]
1074	paddq	xmm0,xmm7
1075	movdqa	xmm7,xmm6
1076	pmuludq	xmm6,[64+eax]
1077	paddq	xmm4,xmm5
1078	movdqa	xmm5,xmm7
1079	pmuludq	xmm7,[16+eax]
1080	paddq	xmm3,xmm6
1081	movdqa	xmm6,xmm5
1082	pmuludq	xmm5,[32+eax]
1083	paddq	xmm0,xmm7
1084	pmuludq	xmm6,[48+eax]
1085	movdqa	xmm7,[64+ebx]
1086	paddq	xmm1,xmm5
1087	paddq	xmm2,xmm6
; Still the flags of "add ecx,32": zero -> nothing more to fold in.
1088	jz	NEAR L$017short_tail
; Long tail: bring in the earlier pair (esi-32, esi-16), add the parked
; accumulator, multiply by the [edx..edx+128] copy (again via pshufd 16)
; and accumulate onto the products above.
1089	movdqu	xmm5,[esi-32]
1090	movdqu	xmm6,[esi-16]
1091	lea	esi,[32+esi]
1092	movdqa	[32+esp],xmm2
1093	movdqa	[48+esp],xmm3
1094	movdqa	[64+esp],xmm4
1095	movdqa	xmm2,xmm5
1096	movdqa	xmm3,xmm6
1097	psrldq	xmm2,6
1098	psrldq	xmm3,6
1099	movdqa	xmm4,xmm5
1100	punpcklqdq	xmm2,xmm3
1101	punpckhqdq	xmm4,xmm6
1102	punpcklqdq	xmm5,xmm6
1103	movdqa	xmm3,xmm2
1104	psrlq	xmm2,4
1105	psrlq	xmm3,30
1106	movdqa	xmm6,xmm5
1107	psrlq	xmm4,40
1108	psrlq	xmm6,26
1109	pand	xmm5,xmm7
1110	pand	xmm6,xmm7
1111	pand	xmm2,xmm7
1112	pand	xmm3,xmm7
1113	por	xmm4,[ebx]
1114	pshufd	xmm7,[edx],16
1115	paddd	xmm5,[80+esp]
1116	paddd	xmm6,[96+esp]
1117	paddd	xmm2,[112+esp]
1118	paddd	xmm3,[128+esp]
1119	paddd	xmm4,[144+esp]
1120	movdqa	[esp],xmm5
1121	pmuludq	xmm5,xmm7
1122	movdqa	[16+esp],xmm6
1123	pmuludq	xmm6,xmm7
1124	paddq	xmm0,xmm5
1125	movdqa	xmm5,xmm2
1126	pmuludq	xmm2,xmm7
1127	paddq	xmm1,xmm6
1128	movdqa	xmm6,xmm3
1129	pmuludq	xmm3,xmm7
1130	paddq	xmm2,[32+esp]
1131	movdqa	[32+esp],xmm5
1132	pshufd	xmm5,[16+edx],16
1133	paddq	xmm3,[48+esp]
1134	movdqa	[48+esp],xmm6
1135	movdqa	xmm6,xmm4
1136	pmuludq	xmm4,xmm7
1137	paddq	xmm4,[64+esp]
1138	movdqa	[64+esp],xmm6
1139	movdqa	xmm6,xmm5
1140	pmuludq	xmm5,[48+esp]
1141	movdqa	xmm7,xmm6
1142	pmuludq	xmm6,[32+esp]
1143	paddq	xmm4,xmm5
1144	movdqa	xmm5,xmm7
1145	pmuludq	xmm7,[16+esp]
1146	paddq	xmm3,xmm6
1147	pshufd	xmm6,[80+edx],16
1148	pmuludq	xmm5,[esp]
1149	paddq	xmm2,xmm7
1150	pmuludq	xmm6,[64+esp]
1151	pshufd	xmm7,[32+edx],16
1152	paddq	xmm1,xmm5
1153	movdqa	xmm5,xmm7
1154	pmuludq	xmm7,[32+esp]
1155	paddq	xmm0,xmm6
1156	movdqa	xmm6,xmm5
1157	pmuludq	xmm5,[16+esp]
1158	paddq	xmm4,xmm7
1159	pshufd	xmm7,[96+edx],16
1160	pmuludq	xmm6,[esp]
1161	paddq	xmm3,xmm5
1162	movdqa	xmm5,xmm7
1163	pmuludq	xmm7,[64+esp]
1164	paddq	xmm2,xmm6
1165	pmuludq	xmm5,[48+esp]
1166	pshufd	xmm6,[48+edx],16
1167	paddq	xmm1,xmm7
1168	movdqa	xmm7,xmm6
1169	pmuludq	xmm6,[16+esp]
1170	paddq	xmm0,xmm5
1171	pshufd	xmm5,[112+edx],16
1172	pmuludq	xmm7,[esp]
1173	paddq	xmm4,xmm6
1174	movdqa	xmm6,xmm5
1175	pmuludq	xmm5,[64+esp]
1176	paddq	xmm3,xmm7
1177	movdqa	xmm7,xmm6
1178	pmuludq	xmm6,[48+esp]
1179	paddq	xmm2,xmm5
1180	pmuludq	xmm7,[32+esp]
1181	pshufd	xmm5,[64+edx],16
1182	paddq	xmm1,xmm6
1183	pshufd	xmm6,[128+edx],16
1184	pmuludq	xmm5,[esp]
1185	paddq	xmm0,xmm7
1186	movdqa	xmm7,xmm6
1187	pmuludq	xmm6,[64+esp]
1188	paddq	xmm4,xmm5
1189	movdqa	xmm5,xmm7
1190	pmuludq	xmm7,[16+esp]
1191	paddq	xmm3,xmm6
1192	movdqa	xmm6,xmm5
1193	pmuludq	xmm5,[32+esp]
1194	paddq	xmm0,xmm7
1195	pmuludq	xmm6,[48+esp]
1196	movdqa	xmm7,[64+ebx]
1197	paddq	xmm1,xmm5
1198	paddq	xmm2,xmm6
1199L$017short_tail:
; Horizontal add: pshufd 78 swaps the two 64-bit lanes, paddq sums them,
; collapsing the two parallel streams into one value per limb.  A final
; carry propagation follows.
1200	pshufd	xmm6,xmm4,78
1201	pshufd	xmm5,xmm3,78
1202	paddq	xmm4,xmm6
1203	paddq	xmm3,xmm5
1204	pshufd	xmm6,xmm0,78
1205	pshufd	xmm5,xmm1,78
1206	paddq	xmm0,xmm6
1207	paddq	xmm1,xmm5
1208	pshufd	xmm6,xmm2,78
1209	movdqa	xmm5,xmm3
1210	pand	xmm3,xmm7
1211	psrlq	xmm5,26
1212	paddq	xmm2,xmm6
1213	paddq	xmm5,xmm4
1214	movdqa	xmm6,xmm0
1215	pand	xmm0,xmm7
1216	psrlq	xmm6,26
1217	movdqa	xmm4,xmm5
1218	paddq	xmm6,xmm1
1219	psrlq	xmm5,26
1220	pand	xmm4,xmm7
1221	movdqa	xmm1,xmm6
1222	psrlq	xmm6,26
1223	paddd	xmm0,xmm5
1224	psllq	xmm5,2
1225	paddq	xmm6,xmm2
1226	paddq	xmm5,xmm0
1227	pand	xmm1,xmm7
1228	movdqa	xmm2,xmm6
1229	psrlq	xmm6,26
1230	pand	xmm2,xmm7
1231	paddd	xmm6,xmm3
1232	movdqa	xmm0,xmm5
1233	psrlq	xmm5,26
1234	movdqa	xmm3,xmm6
1235	psrlq	xmm6,26
1236	pand	xmm0,xmm7
1237	paddd	xmm1,xmm5
1238	pand	xmm3,xmm7
1239	paddd	xmm4,xmm6
1240L$013done:
; Store the five limbs back to the accumulator (edi is context+48 here),
; then release the scratch frame.
1241	movd	DWORD [edi-48],xmm0
1242	movd	DWORD [edi-44],xmm1
1243	movd	DWORD [edi-40],xmm2
1244	movd	DWORD [edi-36],xmm3
1245	movd	DWORD [edi-32],xmm4
1246	mov	esp,ebp
1247L$007nodata:
1248	pop	edi
1249	pop	esi
1250	pop	ebx
1251	pop	ebp
1252	ret
;-----------------------------------------------------------------------
; __poly1305_emit_sse2 -- finalisation used together with the SSE2 block
; routine; _poly1305_init installs its address in the caller's function
; table.  Same stack arguments as _poly1305_emit:
;   [esp+4] context, [esp+8] 16-byte output, [esp+12] four addend dwords.
; If the flag dword at context+20 is zero it jumps into the scalar
; routine at L$enter_emit (ebp = context and the four pushes match that
; entry).  Otherwise the accumulator is read as five 26-bit limbs,
; repacked into 32-bit words, and finalised the same way as in the
; scalar routine.
; ebp/ebx/esi/edi are preserved; xmm0-xmm3 are used as scratch.
;-----------------------------------------------------------------------
1253align	32
1254align	16
1255__poly1305_emit_sse2:
1256	push	ebp
1257	push	ebx
1258	push	esi
1259	push	edi
1260	mov	ebp,DWORD [20+esp]
1261	cmp	DWORD [20+ebp],0
1262	je	NEAR L$enter_emit
; eax, edi, ecx, edx, esi = limbs 0..4.
1263	mov	eax,DWORD [ebp]
1264	mov	edi,DWORD [4+ebp]
1265	mov	ecx,DWORD [8+ebp]
1266	mov	edx,DWORD [12+ebp]
1267	mov	esi,DWORD [16+ebp]
; Repack limb i at bit offset 26*i into 32-bit words.  Each limb is split
; into the part that falls into the current word (shl) and the part that
; spills into the next (shr).  The mov between each add and its adc does
; not touch the carry flag.
1268	mov	ebx,edi
1269	shl	edi,26
1270	shr	ebx,6
1271	add	eax,edi
1272	mov	edi,ecx
1273	adc	ebx,0
1274	shl	edi,20
1275	shr	ecx,12
1276	add	ebx,edi
1277	mov	edi,edx
1278	adc	ecx,0
1279	shl	edi,14
1280	shr	edx,18
1281	add	ecx,edi
1282	mov	edi,esi
1283	adc	edx,0
1284	shl	edi,8
1285	shr	esi,24
1286	add	edx,edi
1287	adc	esi,0
; Now eax:ebx:ecx:edx:esi is the value in 32-bit words.  Keep the low two
; bits of the top word and add 5 * (top >> 2) back into the bottom.
1288	mov	edi,esi
1289	and	esi,3
1290	shr	edi,2
1291	lea	ebp,[edi*4+edi]
1292	mov	edi,DWORD [24+esp]
1293	add	eax,ebp
1294	mov	ebp,DWORD [28+esp]
1295	adc	ebx,0
1296	adc	ecx,0
1297	adc	edx,0
1298	adc	esi,0
; Stash the value in xmm0..xmm3 while computing value + 5 in the integer
; registers; movd does not modify the flags, so the carry chain is intact.
1299	movd	xmm0,eax
1300	add	eax,5
1301	movd	xmm1,ebx
1302	adc	ebx,0
1303	movd	xmm2,ecx
1304	adc	ecx,0
1305	movd	xmm3,edx
1306	adc	edx,0
1307	adc	esi,0
; esi = all ones if (value + 5) reached bit 2 of the top word, else zero.
1308	shr	esi,2
1309	neg	esi
1310	and	eax,esi
1311	and	ebx,esi
1312	and	ecx,esi
1313	and	edx,esi
; Park the masked (value + 5) in the output buffer and get the original
; value back from the xmm registers.
1314	mov	DWORD [edi],eax
1315	movd	eax,xmm0
1316	mov	DWORD [4+edi],ebx
1317	movd	ebx,xmm1
1318	mov	DWORD [8+edi],ecx
1319	movd	ecx,xmm2
1320	mov	DWORD [12+edi],edx
1321	movd	edx,xmm3
; Select: (value & ~mask) | (value+5 & mask).
1322	not	esi
1323	and	eax,esi
1324	and	ebx,esi
1325	or	eax,DWORD [edi]
1326	and	ecx,esi
1327	or	ebx,DWORD [4+edi]
1328	and	edx,esi
1329	or	ecx,DWORD [8+edi]
1330	or	edx,DWORD [12+edi]
; Add the four addend dwords (ebp = third argument) and store; the
; interleaved mov stores do not disturb the carry.
1331	add	eax,DWORD [ebp]
1332	adc	ebx,DWORD [4+ebp]
1333	mov	DWORD [edi],eax
1334	adc	ecx,DWORD [8+ebp]
1335	mov	DWORD [4+edi],ebx
1336	adc	edx,DWORD [12+ebp]
1337	mov	DWORD [8+edi],ecx
1338	mov	DWORD [12+edi],edx
1339	pop	edi
1340	pop	esi
1341	pop	ebx
1342	pop	ebp
1343	ret
;-----------------------------------------------------------------------
; __poly1305_init_avx2 -- VEX-encoded counterpart of
; __poly1305_init_sse2; it performs the same steps with three-operand
; instructions.  Its caller is not visible in this part of the file
; (presumably __poly1305_blocks_avx2 -- confirm).
; Register interface (no stack arguments):
;   in : edi = context, ebx = base of a constant table whose entry at
;        +64 is used as the AND mask after every 26-bit shift
;   out: a table of 9 x 16 bytes written at context+48 .. context+191;
;        edi restored to the context; xmm7 = the +64 constant
;   clobbers: ebp (used to save esp), ecx, edx, xmm0-xmm7, flags
;-----------------------------------------------------------------------
1344align	32
1345align	16
1346__poly1305_init_avx2:
1347	vmovdqu	xmm4,[24+edi]
1348	lea	edi,[48+edi]
; 16-byte aligned scratch frame; ebp keeps the old esp.
1349	mov	ebp,esp
1350	sub	esp,224
1351	and	esp,-16
; Split the 16 bytes from context+24 into five limbs at bit offsets
; 0, 26, 52, 78 and 104.
1352	vmovdqa	xmm7,[64+ebx]
1353	vpand	xmm0,xmm4,xmm7
1354	vpsrlq	xmm1,xmm4,26
1355	vpsrldq	xmm3,xmm4,6
1356	vpand	xmm1,xmm1,xmm7
1357	vpsrlq	xmm2,xmm3,4
1358	vpsrlq	xmm3,xmm3,30
1359	vpand	xmm2,xmm2,xmm7
1360	vpand	xmm3,xmm3,xmm7
1361	vpsrldq	xmm4,xmm4,13
1362	lea	edx,[144+esp]
; Two passes through the loop.
1363	mov	ecx,2
1364L$018square:
; Save the limbs at [esp+0..64] and 5x limbs 1..4 at [esp+80..128].
1365	vmovdqa	[esp],xmm0
1366	vmovdqa	[16+esp],xmm1
1367	vmovdqa	[32+esp],xmm2
1368	vmovdqa	[48+esp],xmm3
1369	vmovdqa	[64+esp],xmm4
1370	vpslld	xmm6,xmm1,2
1371	vpslld	xmm5,xmm2,2
1372	vpaddd	xmm6,xmm6,xmm1
1373	vpaddd	xmm5,xmm5,xmm2
1374	vmovdqa	[80+esp],xmm6
1375	vmovdqa	[96+esp],xmm5
1376	vpslld	xmm6,xmm3,2
1377	vpslld	xmm5,xmm4,2
1378	vpaddd	xmm6,xmm6,xmm3
1379	vpaddd	xmm5,xmm5,xmm4
1380	vmovdqa	[112+esp],xmm6
1381	vmovdqa	[128+esp],xmm5
; Broadcast the low quadword of every limb (vpshufd 68) to [edx+0..64].
1382	vpshufd	xmm5,xmm0,68
1383	vmovdqa	xmm6,xmm1
1384	vpshufd	xmm1,xmm1,68
1385	vpshufd	xmm2,xmm2,68
1386	vpshufd	xmm3,xmm3,68
1387	vpshufd	xmm4,xmm4,68
1388	vmovdqa	[edx],xmm5
1389	vmovdqa	[16+edx],xmm1
1390	vmovdqa	[32+edx],xmm2
1391	vmovdqa	[48+edx],xmm3
1392	vmovdqa	[64+edx],xmm4
; 5x5 limb product of the saved value with the broadcast value; terms
; that would land above limb 4 use the 5x copies and go to the low limbs.
1393	vpmuludq	xmm4,xmm4,xmm0
1394	vpmuludq	xmm3,xmm3,xmm0
1395	vpmuludq	xmm2,xmm2,xmm0
1396	vpmuludq	xmm1,xmm1,xmm0
1397	vpmuludq	xmm0,xmm5,xmm0
1398	vpmuludq	xmm5,xmm6,[48+edx]
1399	vpaddq	xmm4,xmm4,xmm5
1400	vpmuludq	xmm7,xmm6,[32+edx]
1401	vpaddq	xmm3,xmm3,xmm7
1402	vpmuludq	xmm5,xmm6,[16+edx]
1403	vpaddq	xmm2,xmm2,xmm5
1404	vmovdqa	xmm7,[80+esp]
1405	vpmuludq	xmm6,xmm6,[edx]
1406	vpaddq	xmm1,xmm1,xmm6
1407	vmovdqa	xmm5,[32+esp]
1408	vpmuludq	xmm7,xmm7,[64+edx]
1409	vpaddq	xmm0,xmm0,xmm7
1410	vpmuludq	xmm6,xmm5,[32+edx]
1411	vpaddq	xmm4,xmm4,xmm6
1412	vpmuludq	xmm7,xmm5,[16+edx]
1413	vpaddq	xmm3,xmm3,xmm7
1414	vmovdqa	xmm6,[96+esp]
1415	vpmuludq	xmm5,xmm5,[edx]
1416	vpaddq	xmm2,xmm2,xmm5
1417	vpmuludq	xmm7,xmm6,[64+edx]
1418	vpaddq	xmm1,xmm1,xmm7
1419	vmovdqa	xmm5,[48+esp]
1420	vpmuludq	xmm6,xmm6,[48+edx]
1421	vpaddq	xmm0,xmm0,xmm6
1422	vpmuludq	xmm7,xmm5,[16+edx]
1423	vpaddq	xmm4,xmm4,xmm7
1424	vmovdqa	xmm6,[112+esp]
1425	vpmuludq	xmm5,xmm5,[edx]
1426	vpaddq	xmm3,xmm3,xmm5
1427	vpmuludq	xmm7,xmm6,[64+edx]
1428	vpaddq	xmm2,xmm2,xmm7
1429	vpmuludq	xmm5,xmm6,[48+edx]
1430	vpaddq	xmm1,xmm1,xmm5
1431	vmovdqa	xmm7,[64+esp]
1432	vpmuludq	xmm6,xmm6,[32+edx]
1433	vpaddq	xmm0,xmm0,xmm6
1434	vmovdqa	xmm5,[128+esp]
1435	vpmuludq	xmm7,xmm7,[edx]
1436	vpaddq	xmm4,xmm4,xmm7
1437	vpmuludq	xmm6,xmm5,[64+edx]
1438	vpaddq	xmm3,xmm3,xmm6
1439	vpmuludq	xmm7,xmm5,[16+edx]
1440	vpaddq	xmm0,xmm0,xmm7
1441	vpmuludq	xmm6,xmm5,[32+edx]
1442	vpaddq	xmm1,xmm1,xmm6
1443	vmovdqa	xmm7,[64+ebx]
1444	vpmuludq	xmm5,xmm5,[48+edx]
1445	vpaddq	xmm2,xmm2,xmm5
; Carry propagation between limbs; the carry out of limb 4 is added to
; limb 0 once as-is and once shifted left by 2 (five times in total).
1446	vpsrlq	xmm5,xmm3,26
1447	vpand	xmm3,xmm3,xmm7
1448	vpsrlq	xmm6,xmm0,26
1449	vpand	xmm0,xmm0,xmm7
1450	vpaddq	xmm4,xmm4,xmm5
1451	vpaddq	xmm1,xmm1,xmm6
1452	vpsrlq	xmm5,xmm4,26
1453	vpand	xmm4,xmm4,xmm7
1454	vpsrlq	xmm6,xmm1,26
1455	vpand	xmm1,xmm1,xmm7
1456	vpaddq	xmm2,xmm2,xmm6
1457	vpaddd	xmm0,xmm0,xmm5
1458	vpsllq	xmm5,xmm5,2
1459	vpsrlq	xmm6,xmm2,26
1460	vpand	xmm2,xmm2,xmm7
1461	vpaddd	xmm0,xmm0,xmm5
1462	vpaddd	xmm3,xmm3,xmm6
1463	vpsrlq	xmm6,xmm3,26
1464	vpsrlq	xmm5,xmm0,26
1465	vpand	xmm0,xmm0,xmm7
1466	vpand	xmm3,xmm3,xmm7
1467	vpaddd	xmm1,xmm1,xmm5
1468	vpaddd	xmm4,xmm4,xmm6
1469	dec	ecx
1470	jz	NEAR L$019square_break
; End of pass 1: low quadword = the fresh product, high quadword = the
; previously saved value.  Pass 2 multiplies both by the fresh product.
1471	vpunpcklqdq	xmm0,xmm0,[esp]
1472	vpunpcklqdq	xmm1,xmm1,[16+esp]
1473	vpunpcklqdq	xmm2,xmm2,[32+esp]
1474	vpunpcklqdq	xmm3,xmm3,[48+esp]
1475	vpunpcklqdq	xmm4,xmm4,[64+esp]
1476	jmp	NEAR L$018square
1477L$019square_break:
; Move the pass-2 results into the odd dwords, merge the pass-1 values
; from [esp] into the even dwords, then reorder with vpshufd 141
; (source dwords 1,3,0,2) before storing the table rows.
1478	vpsllq	xmm0,xmm0,32
1479	vpsllq	xmm1,xmm1,32
1480	vpsllq	xmm2,xmm2,32
1481	vpsllq	xmm3,xmm3,32
1482	vpsllq	xmm4,xmm4,32
1483	vpor	xmm0,xmm0,[esp]
1484	vpor	xmm1,xmm1,[16+esp]
1485	vpor	xmm2,xmm2,[32+esp]
1486	vpor	xmm3,xmm3,[48+esp]
1487	vpor	xmm4,xmm4,[64+esp]
1488	vpshufd	xmm0,xmm0,141
1489	vpshufd	xmm1,xmm1,141
1490	vpshufd	xmm2,xmm2,141
1491	vpshufd	xmm3,xmm3,141
1492	vpshufd	xmm4,xmm4,141
; Rows 0..4: limbs 0..4 (edi is context+48 here).
1493	vmovdqu	[edi],xmm0
1494	vmovdqu	[16+edi],xmm1
1495	vmovdqu	[32+edi],xmm2
1496	vmovdqu	[48+edi],xmm3
1497	vmovdqu	[64+edi],xmm4
; Rows 5..8: limbs 1..4 multiplied by 5.
1498	vpslld	xmm6,xmm1,2
1499	vpslld	xmm5,xmm2,2
1500	vpaddd	xmm6,xmm6,xmm1
1501	vpaddd	xmm5,xmm5,xmm2
1502	vmovdqu	[80+edi],xmm6
1503	vmovdqu	[96+edi],xmm5
1504	vpslld	xmm6,xmm3,2
1505	vpslld	xmm5,xmm4,2
1506	vpaddd	xmm6,xmm6,xmm3
1507	vpaddd	xmm5,xmm5,xmm4
1508	vmovdqu	[112+edi],xmm6
1509	vmovdqu	[128+edi],xmm5
; Drop the scratch frame and hand edi back pointing at the context.
1510	mov	esp,ebp
1511	lea	edi,[edi-48]
1512	ret
;-----------------------------------------------------------------------
; __poly1305_blocks_avx2 -- AVX2 replacement for _poly1305_blocks.
; _poly1305_init installs this routine's address in its function table
; when bit 5 of the dword at _OPENSSL_ia32cap_P+8 is set.
;
; Stack arguments (offsets valid after the four pushes below):
;   [20+esp] -> edi  context pointer
;   [24+esp] -> esi  input pointer
;   [28+esp] -> ecx  input length in bytes (rounded down to a multiple of 16)
;   [32+esp] -> eax  fourth argument; negated and used only on the
;                    single-block path to pick the constant that is ORed
;                    into the top limb (presumably the pad-bit flag --
;                    TODO confirm against the C caller)
;
; Context fields touched here:
;   ctx+0..16   five dwords of accumulator (26-bit limbs once ctx+20 != 0)
;   ctx+20      flag: zero until the accumulator has been converted below
;   ctx+48..    table written by __poly1305_init_avx2: five 16-byte limb
;               vectors followed by four vectors holding 5x limbs 1..4
;               (presumably powers of the key -- not provable from here)
;
; Register roles in the vector code:
;   ymm0..ymm4  accumulator limbs 0..4
;   ymm5,ymm6   input blocks / scratch
;   ymm7        26-bit limb mask (reloaded from [64+ebx] when clobbered)
;   ebx         address inside L$const_sse2
;   edx         address inside the on-stack copy of the table
;   ebp         caller's esp while the aligned frame is in use
; Saves and restores ebp, ebx, esi, edi; clobbers eax, ecx, edx, flags,
; ymm0-ymm7.
;-----------------------------------------------------------------------
1513align	32
1514align	16
1515__poly1305_blocks_avx2:
1516	push	ebp
1517	push	ebx
1518	push	esi
1519	push	edi
1520	mov	edi,DWORD [20+esp]
1521	mov	esi,DWORD [24+esp]
1522	mov	ecx,DWORD [28+esp]
; eax = conversion flag; ecx = length rounded down to whole 16-byte blocks
1523	mov	eax,DWORD [20+edi]
1524	and	ecx,-16
1525	jz	NEAR L$020nodata
; 64 bytes or more always take the AVX2 path. Shorter inputs take it only
; if the accumulator is already converted; otherwise continue inside
; _poly1305_blocks, whose prologue matches the one executed above.
1526	cmp	ecx,64
1527	jae	NEAR L$021enter_avx2
1528	test	eax,eax
1529	jz	NEAR L$enter_blocks
1530L$021enter_avx2:
1531	vzeroupper
; position-independent address of the constant table -> ebx
1532	call	L$022pic_point
1533L$022pic_point:
1534	pop	ebx
1535	lea	ebx,[(L$const_sse2-L$022pic_point)+ebx]
1536	test	eax,eax
1537	jnz	NEAR L$023base2_26
; First use: let __poly1305_init_avx2 build the table at ctx+48, then
; re-pack the accumulator at ctx+0 from a contiguous little-endian value
; into five 26-bit limbs.  Byte offsets 0/3/6/9/13 combined with shifts
; 0/2/4/6/0 select bit positions 0/26/52/78/104.
1538	call	__poly1305_init_avx2
1539	mov	eax,DWORD [edi]
1540	mov	ecx,DWORD [3+edi]
1541	mov	edx,DWORD [6+edi]
1542	mov	esi,DWORD [9+edi]
1543	mov	ebp,DWORD [13+edi]
1544	shr	ecx,2
1545	and	eax,67108863
1546	shr	edx,4
1547	and	ecx,67108863
1548	shr	esi,6
1549	and	edx,67108863
1550	mov	DWORD [edi],eax
1551	mov	DWORD [4+edi],ecx
1552	mov	DWORD [8+edi],edx
1553	mov	DWORD [12+edi],esi
1554	mov	DWORD [16+edi],ebp
; mark the accumulator as converted
1555	mov	DWORD [20+edi],1
; esi/ecx were used as scratch above: reload input pointer and length
1556	mov	esi,DWORD [24+esp]
1557	mov	ecx,DWORD [28+esp]
; The reload brings back the caller's raw length, so round it down to a
; multiple of 16 again.  Without this, a length with any of its low four
; bits set makes the tail dispatcher below consume one extra 16-byte block
; on this path only.
	and	ecx,-16
1558L$023base2_26:
1559	mov	eax,DWORD [32+esp]
; build a 512-byte-aligned scratch frame; ebp remembers the old esp
1560	mov	ebp,esp
1561	sub	esp,448
1562	and	esp,-512
; Copy the nine table vectors from the context into the frame around
; edx = esp+288, expanding each 128-bit entry to 256 bits
; (vpermq selector 64 = qwords 0,0,0,1; vpshufd selector 200 = dwords
; 0,2,0,3 within each 128-bit half).
1563	vmovdqu	xmm0,[48+edi]
1564	lea	edx,[288+esp]
1565	vmovdqu	xmm1,[64+edi]
1566	vmovdqu	xmm2,[80+edi]
1567	vmovdqu	xmm3,[96+edi]
1568	vmovdqu	xmm4,[112+edi]
; from here on edi = ctx+48
1569	lea	edi,[48+edi]
1570	vpermq	ymm0,ymm0,64
1571	vpermq	ymm1,ymm1,64
1572	vpermq	ymm2,ymm2,64
1573	vpermq	ymm3,ymm3,64
1574	vpermq	ymm4,ymm4,64
1575	vpshufd	ymm0,ymm0,200
1576	vpshufd	ymm1,ymm1,200
1577	vpshufd	ymm2,ymm2,200
1578	vpshufd	ymm3,ymm3,200
1579	vpshufd	ymm4,ymm4,200
1580	vmovdqa	[edx-128],ymm0
1581	vmovdqu	xmm0,[80+edi]
1582	vmovdqa	[edx-96],ymm1
1583	vmovdqu	xmm1,[96+edi]
1584	vmovdqa	[edx-64],ymm2
1585	vmovdqu	xmm2,[112+edi]
1586	vmovdqa	[edx-32],ymm3
1587	vmovdqu	xmm3,[128+edi]
1588	vmovdqa	[edx],ymm4
1589	vpermq	ymm0,ymm0,64
1590	vpermq	ymm1,ymm1,64
1591	vpermq	ymm2,ymm2,64
1592	vpermq	ymm3,ymm3,64
1593	vpshufd	ymm0,ymm0,200
1594	vpshufd	ymm1,ymm1,200
1595	vpshufd	ymm2,ymm2,200
1596	vpshufd	ymm3,ymm3,200
; finish the table copy while loading the five accumulator limbs
; (ctx+0..ctx+16) into the low dword of xmm0..xmm4
1597	vmovdqa	[32+edx],ymm0
1598	vmovd	xmm0,DWORD [edi-48]
1599	vmovdqa	[64+edx],ymm1
1600	vmovd	xmm1,DWORD [edi-44]
1601	vmovdqa	[96+edx],ymm2
1602	vmovd	xmm2,DWORD [edi-40]
1603	vmovdqa	[128+edx],ymm3
1604	vmovd	xmm3,DWORD [edi-36]
1605	vmovd	xmm4,DWORD [edi-32]
; ymm7 = 2^26-1 in every qword
1606	vmovdqa	ymm7,[64+ebx]
; eax = -(fourth argument), used as an index on the one-block path
1607	neg	eax
; A length that is not a multiple of 64 has 1..3 odd blocks; they are
; processed first, through the tail code.
1608	test	ecx,63
1609	jz	NEAR L$024even
; edx = odd byte count (16/32/48), ecx = bytes left for 64-byte rounds
1610	mov	edx,ecx
1611	and	ecx,-64
1612	and	edx,63
1613	vmovdqu	xmm5,[esi]
1614	cmp	edx,32
1615	jb	NEAR L$025one
; the vector load does not touch flags, so je still tests cmp edx,32
1616	vmovdqu	xmm6,[16+esi]
1617	je	NEAR L$026two
; three blocks: third block goes in the upper half of ymm5.  ebx+8 puts
; the 2^24 constant in three qword lanes; edx is offset by 8 into the
; table copy.
1618	vinserti128	ymm5,ymm5,[32+esi],1
1619	lea	esi,[48+esi]
1620	lea	ebx,[8+ebx]
1621	lea	edx,[296+esp]
1622	jmp	NEAR L$027tail
; two blocks: constant present in two lanes, table offset 16
1623L$026two:
1624	lea	esi,[32+esi]
1625	lea	ebx,[16+ebx]
1626	lea	edx,[304+esp]
1627	jmp	NEAR L$027tail
; one block: ymm6 cleared.  ebx = table+24 when the fourth argument is 1
; (constant in one lane) or table+32 when it is 0 (all-zero row).
1628L$025one:
1629	lea	esi,[16+esi]
1630	vpxor	ymm6,ymm6,ymm6
1631	lea	ebx,[32+eax*8+ebx]
1632	lea	edx,[312+esp]
1633	jmp	NEAR L$027tail
1634align	32
; load four blocks: lower halves of ymm5/ymm6 = blocks 0/1,
; upper halves = blocks 2/3
1635L$024even:
1636	vmovdqu	xmm5,[esi]
1637	vmovdqu	xmm6,[16+esi]
1638	vinserti128	ymm5,ymm5,[32+esi],1
1639	vinserti128	ymm6,ymm6,[48+esi],1
1640	lea	esi,[64+esi]
1641	sub	ecx,64
; the last 64 bytes are always handled by the tail code
1642	jz	NEAR L$027tail
; ---- main loop: one iteration per 64 input bytes ----
1643L$028loop:
; Spill accumulator limbs 2/0/1 and split the input into 26-bit limbs:
; ymm5 = bits 0.., ymm6 = bits 26.., ymm2 = bits 52.., ymm0 = bits 78..,
; ymm1 = bits 104.. of each block.
1644	vmovdqa	[64+esp],ymm2
1645	vpsrldq	ymm2,ymm5,6
1646	vmovdqa	[esp],ymm0
1647	vpsrldq	ymm0,ymm6,6
1648	vmovdqa	[32+esp],ymm1
1649	vpunpckhqdq	ymm1,ymm5,ymm6
1650	vpunpcklqdq	ymm5,ymm5,ymm6
1651	vpunpcklqdq	ymm2,ymm2,ymm0
1652	vpsrlq	ymm0,ymm2,30
1653	vpsrlq	ymm2,ymm2,4
1654	vpsrlq	ymm6,ymm5,26
1655	vpsrlq	ymm1,ymm1,40
1656	vpand	ymm2,ymm2,ymm7
1657	vpand	ymm5,ymm5,ymm7
1658	vpand	ymm6,ymm6,ymm7
1659	vpand	ymm0,ymm0,ymm7
; OR 2^24 into the top limb (bit 128 of the block)
1660	vpor	ymm1,ymm1,[ebx]
; add the accumulator: ymm5/ymm6/ymm2/ymm0/ymm1 = limbs 0..4 of acc+input
1661	vpaddq	ymm2,ymm2,[64+esp]
1662	vpaddq	ymm5,ymm5,[esp]
1663	vpaddq	ymm6,ymm6,[32+esp]
1664	vpaddq	ymm0,ymm0,ymm3
1665	vpaddq	ymm1,ymm1,ymm4
; multiply by the table entries and sum the partial products into
; ymm0..ymm4 (ymm7 is used as scratch and reloaded further down)
1666	vpmuludq	ymm3,ymm2,[edx-96]
1667	vmovdqa	[32+esp],ymm6
1668	vpmuludq	ymm4,ymm2,[edx-64]
1669	vmovdqa	[96+esp],ymm0
1670	vpmuludq	ymm0,ymm2,[96+edx]
1671	vmovdqa	[128+esp],ymm1
1672	vpmuludq	ymm1,ymm2,[128+edx]
1673	vpmuludq	ymm2,ymm2,[edx-128]
1674	vpmuludq	ymm7,ymm5,[edx-32]
1675	vpaddq	ymm3,ymm3,ymm7
1676	vpmuludq	ymm6,ymm5,[edx]
1677	vpaddq	ymm4,ymm4,ymm6
1678	vpmuludq	ymm7,ymm5,[edx-128]
1679	vpaddq	ymm0,ymm0,ymm7
1680	vmovdqa	ymm7,[32+esp]
1681	vpmuludq	ymm6,ymm5,[edx-96]
1682	vpaddq	ymm1,ymm1,ymm6
1683	vpmuludq	ymm5,ymm5,[edx-64]
1684	vpaddq	ymm2,ymm2,ymm5
1685	vpmuludq	ymm6,ymm7,[edx-64]
1686	vpaddq	ymm3,ymm3,ymm6
1687	vpmuludq	ymm5,ymm7,[edx-32]
1688	vpaddq	ymm4,ymm4,ymm5
1689	vpmuludq	ymm6,ymm7,[128+edx]
1690	vpaddq	ymm0,ymm0,ymm6
1691	vmovdqa	ymm6,[96+esp]
1692	vpmuludq	ymm5,ymm7,[edx-128]
1693	vpaddq	ymm1,ymm1,ymm5
1694	vpmuludq	ymm7,ymm7,[edx-96]
1695	vpaddq	ymm2,ymm2,ymm7
1696	vpmuludq	ymm5,ymm6,[edx-128]
1697	vpaddq	ymm3,ymm3,ymm5
1698	vpmuludq	ymm7,ymm6,[edx-96]
1699	vpaddq	ymm4,ymm4,ymm7
1700	vpmuludq	ymm5,ymm6,[64+edx]
1701	vpaddq	ymm0,ymm0,ymm5
1702	vmovdqa	ymm5,[128+esp]
1703	vpmuludq	ymm7,ymm6,[96+edx]
1704	vpaddq	ymm1,ymm1,ymm7
1705	vpmuludq	ymm6,ymm6,[128+edx]
1706	vpaddq	ymm2,ymm2,ymm6
1707	vpmuludq	ymm7,ymm5,[128+edx]
1708	vpaddq	ymm3,ymm3,ymm7
1709	vpmuludq	ymm6,ymm5,[32+edx]
1710	vpaddq	ymm0,ymm0,ymm6
1711	vpmuludq	ymm7,ymm5,[edx-128]
1712	vpaddq	ymm4,ymm4,ymm7
; restore the 26-bit mask
1713	vmovdqa	ymm7,[64+ebx]
1714	vpmuludq	ymm6,ymm5,[64+edx]
1715	vpaddq	ymm1,ymm1,ymm6
1716	vpmuludq	ymm5,ymm5,[96+edx]
1717	vpaddq	ymm2,ymm2,ymm5
; Carry propagation: bits above 26 move to the next limb.  The carry out
; of limb 4 is added to limb 0 five times (c + 4c), since 2^130 is
; congruent to 5 modulo the Poly1305 prime.
1718	vpsrlq	ymm5,ymm3,26
1719	vpand	ymm3,ymm3,ymm7
1720	vpsrlq	ymm6,ymm0,26
1721	vpand	ymm0,ymm0,ymm7
1722	vpaddq	ymm4,ymm4,ymm5
1723	vpaddq	ymm1,ymm1,ymm6
1724	vpsrlq	ymm5,ymm4,26
1725	vpand	ymm4,ymm4,ymm7
1726	vpsrlq	ymm6,ymm1,26
1727	vpand	ymm1,ymm1,ymm7
1728	vpaddq	ymm2,ymm2,ymm6
1729	vpaddq	ymm0,ymm0,ymm5
1730	vpsllq	ymm5,ymm5,2
1731	vpsrlq	ymm6,ymm2,26
1732	vpand	ymm2,ymm2,ymm7
1733	vpaddq	ymm0,ymm0,ymm5
1734	vpaddq	ymm3,ymm3,ymm6
1735	vpsrlq	ymm6,ymm3,26
1736	vpsrlq	ymm5,ymm0,26
1737	vpand	ymm0,ymm0,ymm7
1738	vpand	ymm3,ymm3,ymm7
1739	vpaddq	ymm1,ymm1,ymm5
1740	vpaddq	ymm4,ymm4,ymm6
; fetch the next four blocks
1741	vmovdqu	xmm5,[esi]
1742	vmovdqu	xmm6,[16+esi]
1743	vinserti128	ymm5,ymm5,[32+esi],1
1744	vinserti128	ymm6,ymm6,[48+esi],1
1745	lea	esi,[64+esi]
1746	sub	ecx,64
1747	jnz	NEAR L$028loop
; ---- tail: same multiply as the loop, but with table operands taken 4
; bytes further on (and edx possibly offset by 8/16/24 for odd block
; counts), followed by a horizontal sum of the four lanes ----
1748L$027tail:
1749	vmovdqa	[64+esp],ymm2
1750	vpsrldq	ymm2,ymm5,6
1751	vmovdqa	[esp],ymm0
1752	vpsrldq	ymm0,ymm6,6
1753	vmovdqa	[32+esp],ymm1
1754	vpunpckhqdq	ymm1,ymm5,ymm6
1755	vpunpcklqdq	ymm5,ymm5,ymm6
1756	vpunpcklqdq	ymm2,ymm2,ymm0
1757	vpsrlq	ymm0,ymm2,30
1758	vpsrlq	ymm2,ymm2,4
1759	vpsrlq	ymm6,ymm5,26
1760	vpsrlq	ymm1,ymm1,40
1761	vpand	ymm2,ymm2,ymm7
1762	vpand	ymm5,ymm5,ymm7
1763	vpand	ymm6,ymm6,ymm7
1764	vpand	ymm0,ymm0,ymm7
1765	vpor	ymm1,ymm1,[ebx]
; undo any 8/16/24/32-byte offset applied above: the table is 64-byte
; aligned, so this returns ebx to its start
1766	and	ebx,-64
1767	vpaddq	ymm2,ymm2,[64+esp]
1768	vpaddq	ymm5,ymm5,[esp]
1769	vpaddq	ymm6,ymm6,[32+esp]
1770	vpaddq	ymm0,ymm0,ymm3
1771	vpaddq	ymm1,ymm1,ymm4
1772	vpmuludq	ymm3,ymm2,[edx-92]
1773	vmovdqa	[32+esp],ymm6
1774	vpmuludq	ymm4,ymm2,[edx-60]
1775	vmovdqa	[96+esp],ymm0
1776	vpmuludq	ymm0,ymm2,[100+edx]
1777	vmovdqa	[128+esp],ymm1
1778	vpmuludq	ymm1,ymm2,[132+edx]
1779	vpmuludq	ymm2,ymm2,[edx-124]
1780	vpmuludq	ymm7,ymm5,[edx-28]
1781	vpaddq	ymm3,ymm3,ymm7
1782	vpmuludq	ymm6,ymm5,[4+edx]
1783	vpaddq	ymm4,ymm4,ymm6
1784	vpmuludq	ymm7,ymm5,[edx-124]
1785	vpaddq	ymm0,ymm0,ymm7
1786	vmovdqa	ymm7,[32+esp]
1787	vpmuludq	ymm6,ymm5,[edx-92]
1788	vpaddq	ymm1,ymm1,ymm6
1789	vpmuludq	ymm5,ymm5,[edx-60]
1790	vpaddq	ymm2,ymm2,ymm5
1791	vpmuludq	ymm6,ymm7,[edx-60]
1792	vpaddq	ymm3,ymm3,ymm6
1793	vpmuludq	ymm5,ymm7,[edx-28]
1794	vpaddq	ymm4,ymm4,ymm5
1795	vpmuludq	ymm6,ymm7,[132+edx]
1796	vpaddq	ymm0,ymm0,ymm6
1797	vmovdqa	ymm6,[96+esp]
1798	vpmuludq	ymm5,ymm7,[edx-124]
1799	vpaddq	ymm1,ymm1,ymm5
1800	vpmuludq	ymm7,ymm7,[edx-92]
1801	vpaddq	ymm2,ymm2,ymm7
1802	vpmuludq	ymm5,ymm6,[edx-124]
1803	vpaddq	ymm3,ymm3,ymm5
1804	vpmuludq	ymm7,ymm6,[edx-92]
1805	vpaddq	ymm4,ymm4,ymm7
1806	vpmuludq	ymm5,ymm6,[68+edx]
1807	vpaddq	ymm0,ymm0,ymm5
1808	vmovdqa	ymm5,[128+esp]
1809	vpmuludq	ymm7,ymm6,[100+edx]
1810	vpaddq	ymm1,ymm1,ymm7
1811	vpmuludq	ymm6,ymm6,[132+edx]
1812	vpaddq	ymm2,ymm2,ymm6
1813	vpmuludq	ymm7,ymm5,[132+edx]
1814	vpaddq	ymm3,ymm3,ymm7
1815	vpmuludq	ymm6,ymm5,[36+edx]
1816	vpaddq	ymm0,ymm0,ymm6
1817	vpmuludq	ymm7,ymm5,[edx-124]
1818	vpaddq	ymm4,ymm4,ymm7
1819	vmovdqa	ymm7,[64+ebx]
1820	vpmuludq	ymm6,ymm5,[68+edx]
1821	vpaddq	ymm1,ymm1,ymm6
1822	vpmuludq	ymm5,ymm5,[100+edx]
1823	vpaddq	ymm2,ymm2,ymm5
; Horizontal add: fold the high qword of each 128-bit half onto the low
; one (vpsrldq 8), then fold qword 2 onto qword 0 (vpermq selector 2),
; so qword 0 of each register holds the sum of all four lanes.
1824	vpsrldq	ymm5,ymm4,8
1825	vpsrldq	ymm6,ymm3,8
1826	vpaddq	ymm4,ymm4,ymm5
1827	vpsrldq	ymm5,ymm0,8
1828	vpaddq	ymm3,ymm3,ymm6
1829	vpsrldq	ymm6,ymm1,8
1830	vpaddq	ymm0,ymm0,ymm5
1831	vpsrldq	ymm5,ymm2,8
1832	vpaddq	ymm1,ymm1,ymm6
1833	vpermq	ymm6,ymm4,2
1834	vpaddq	ymm2,ymm2,ymm5
1835	vpermq	ymm5,ymm3,2
1836	vpaddq	ymm4,ymm4,ymm6
1837	vpermq	ymm6,ymm0,2
1838	vpaddq	ymm3,ymm3,ymm5
1839	vpermq	ymm5,ymm1,2
1840	vpaddq	ymm0,ymm0,ymm6
1841	vpermq	ymm6,ymm2,2
1842	vpaddq	ymm1,ymm1,ymm5
1843	vpaddq	ymm2,ymm2,ymm6
; same carry propagation as at the end of the loop body
1844	vpsrlq	ymm5,ymm3,26
1845	vpand	ymm3,ymm3,ymm7
1846	vpsrlq	ymm6,ymm0,26
1847	vpand	ymm0,ymm0,ymm7
1848	vpaddq	ymm4,ymm4,ymm5
1849	vpaddq	ymm1,ymm1,ymm6
1850	vpsrlq	ymm5,ymm4,26
1851	vpand	ymm4,ymm4,ymm7
1852	vpsrlq	ymm6,ymm1,26
1853	vpand	ymm1,ymm1,ymm7
1854	vpaddq	ymm2,ymm2,ymm6
1855	vpaddq	ymm0,ymm0,ymm5
1856	vpsllq	ymm5,ymm5,2
1857	vpsrlq	ymm6,ymm2,26
1858	vpand	ymm2,ymm2,ymm7
1859	vpaddq	ymm0,ymm0,ymm5
1860	vpaddq	ymm3,ymm3,ymm6
1861	vpsrlq	ymm6,ymm3,26
1862	vpsrlq	ymm5,ymm0,26
1863	vpand	ymm0,ymm0,ymm7
1864	vpand	ymm3,ymm3,ymm7
1865	vpaddq	ymm1,ymm1,ymm5
1866	vpaddq	ymm4,ymm4,ymm6
; ecx != 0 only when the tail was entered for the odd leading blocks and
; whole 64-byte groups remain
1867	cmp	ecx,0
1868	je	NEAR L$029done
; re-arrange each limb for another round (dword selector 252 = 0,3,3,3),
; point edx back at the un-offset table copy and continue with the
; 64-byte groups
1869	vpshufd	xmm0,xmm0,252
1870	lea	edx,[288+esp]
1871	vpshufd	xmm1,xmm1,252
1872	vpshufd	xmm2,xmm2,252
1873	vpshufd	xmm3,xmm3,252
1874	vpshufd	xmm4,xmm4,252
1875	jmp	NEAR L$024even
1876align	16
; store the low dword of each accumulator limb back to ctx+0..ctx+16
; (edi still equals ctx+48), clear upper ymm state and drop the frame
1877L$029done:
1878	vmovd	DWORD [edi-48],xmm0
1879	vmovd	DWORD [edi-44],xmm1
1880	vmovd	DWORD [edi-40],xmm2
1881	vmovd	DWORD [edi-36],xmm3
1882	vmovd	DWORD [edi-32],xmm4
1883	vzeroupper
1884	mov	esp,ebp
1885L$020nodata:
1886	pop	edi
1887	pop	esi
1888	pop	ebx
1889	pop	ebp
1890	ret
; Constant table for the vector code paths, 64-byte aligned.  The AVX2
; block routine relies on that alignment when it restores its table
; pointer with "and ebx,-64".
1891align	64
1892L$const_sse2:
; +0:  2^24 in the low dword of each of four qwords.  ORed into the top
;      limb (input bits 104 and up), i.e. it sets bit 128 of each block.
1893dd	16777216,0,16777216,0,16777216,0,16777216,0
; +32: all zero.  Reading 32 bytes at +8/+16/+24/+32 yields the 2^24
;      constant in only 3/2/1/0 of the four qword lanes.
1894dd	0,0,0,0,0,0,0,0
; +64: 2^26-1 in each qword: the 26-bit limb mask
1895dd	67108863,0,67108863,0,67108863,0,67108863,0
; +96: 0x0FFFFFFF,0x0FFFFFFC,0x0FFFFFFC,0x0FFFFFFC -- the same values
;      _poly1305_init uses as immediates to mask the key words
1896dd	268435455,268435452,268435452,268435452
; ASCII: "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated
1897db	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
1898db	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
1899db	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
1900db	114,103,62,0
1901align	4
; 16-byte common symbol holding the CPU capability words.  _poly1305_init
; reads the dwords at offsets 0 and 8 to choose between the scalar, SSE2
; and AVX2 routines.
1902segment	.bss
1903common	_OPENSSL_ia32cap_P 16
1904