.text
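# Poly1305 MAC for x86_64 (AT&T/GAS syntax); this appears to be the perlasm
# output of the CRYPTOGAMS/OpenSSL poly1305-x86_64 module (see the .byte
# attribution string near the end of the file).  Arguments follow the
# System V AMD64 ABI (%rdi, %rsi, %rdx, %rcx); the argument names used in
# the comments below are descriptive only, not taken from the source.
# Throughout, ".byte 0xf3,0xc3" encodes "repz ret" and ".byte 0x66,0x90"
# a two-byte nop.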



.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

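# poly1305_init(ctx=%rdi, key=%rsi, func=%rdx)
# Zeroes the three-word accumulator at 0/8/16(%rdi).  If key is non-NULL, the
# clamped key halves r0/r1 are stored at 24/32(%rdi) and pointers to the
# selected blocks/emit implementations (scalar, AVX or AVX2, chosen from
# OPENSSL_ia32cap_P capability bits) are written to 0(%rdx) and 8(%rdx).
# Returns 1 if a key was installed, 0 otherwise.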
.type	poly1305_init,@function
.align	32
poly1305_init:
.cfi_startproc
	xorq	%rax,%rax
	movq	%rax,0(%rdi)
	movq	%rax,8(%rdi)
	movq	%rax,16(%rdi)

	cmpq	$0,%rsi
	je	.Lno_key

	leaq	poly1305_blocks(%rip),%r10
	leaq	poly1305_emit(%rip),%r11
	movq	OPENSSL_ia32cap_P+4(%rip),%r9
	leaq	poly1305_blocks_avx(%rip),%rax
	leaq	poly1305_emit_avx(%rip),%rcx
	btq	$28,%r9
	cmovcq	%rax,%r10
	cmovcq	%rcx,%r11
	leaq	poly1305_blocks_avx2(%rip),%rax
	btq	$37,%r9
	cmovcq	%rax,%r10
	movq	$0x0ffffffc0fffffff,%rax
	movq	$0x0ffffffc0ffffffc,%rcx
	andq	0(%rsi),%rax
	andq	8(%rsi),%rcx
	movq	%rax,24(%rdi)
	movq	%rcx,32(%rdi)
	movq	%r10,0(%rdx)
	movq	%r11,8(%rdx)
	movl	$1,%eax
.Lno_key:
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_init,.-poly1305_init

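# poly1305_blocks(ctx=%rdi, inp=%rsi, len=%rdx, padbit=%rcx)
# Scalar path: consumes the input in 16-byte blocks, accumulating into the
# base 2^64 state h0/h1/h2 at 0/8/16(%rdi) and multiplying by r from
# 24/32(%rdi) with the usual partial reduction mod 2^130-5.  padbit (0 or 1)
# is added as the 2^128 bit of each block.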
.type	poly1305_blocks,@function
.align	32
poly1305_blocks:
.cfi_startproc
.Lblocks:
	shrq	$4,%rdx
	jz	.Lno_data

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lblocks_body:

	movq	%rdx,%r15

	movq	24(%rdi),%r11
	movq	32(%rdi),%r13

	movq	0(%rdi),%r14
	movq	8(%rdi),%rbx
	movq	16(%rdi),%rbp

	movq	%r13,%r12
	shrq	$2,%r13
	movq	%r12,%rax
	addq	%r12,%r13
	jmp	.Loop

.align	32
.Loop:
	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp
	mulq	%r14
	movq	%rax,%r9
	movq	%r11,%rax
	movq	%rdx,%r10

	mulq	%r14
	movq	%rax,%r14
	movq	%r11,%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	%r13,%rax
	adcq	%rdx,%r10

	mulq	%rbx
	movq	%rbp,%rbx
	addq	%rax,%r14
	adcq	%rdx,%r8

	imulq	%r13,%rbx
	addq	%rbx,%r9
	movq	%r8,%rbx
	adcq	$0,%r10

	imulq	%r11,%rbp
	addq	%r9,%rbx
	movq	$-4,%rax
	adcq	%rbp,%r10

	andq	%r10,%rax
	movq	%r10,%rbp
	shrq	$2,%r10
	andq	$3,%rbp
	addq	%r10,%rax
	addq	%rax,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp
	movq	%r12,%rax
	decq	%r15
	jnz	.Loop

	movq	%r14,0(%rdi)
	movq	%rbx,8(%rdi)
	movq	%rbp,16(%rdi)

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data:
.Lblocks_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_blocks,.-poly1305_blocks

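# poly1305_emit(ctx=%rdi, mac=%rsi, nonce=%rdx)
# Final reduction of the base 2^64 state modulo 2^130-5, addition of the
# 128-bit nonce at 0/8(%rdx), and little-endian store of the 16-byte tag to
# 0/8(%rsi).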
.type	poly1305_emit,@function
.align	32
poly1305_emit:
.cfi_startproc
.Lemit:
	movq	0(%rdi),%r8
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10

	movq	%r8,%rax
	addq	$5,%r8
	movq	%r9,%rcx
	adcq	$0,%r9
	adcq	$0,%r10
	shrq	$2,%r10
	cmovnzq	%r8,%rax
	cmovnzq	%r9,%rcx

	addq	0(%rdx),%rax
	adcq	8(%rdx),%rcx
	movq	%rax,0(%rsi)
	movq	%rcx,8(%rsi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_emit,.-poly1305_emit
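# __poly1305_block
# One multiply-and-reduce step shared by the scalar loop and the AVX setup
# code: h = (h * r) mod 2^130-5, with h in %r14/%rbx/%rbp, r0 in %r11,
# s1 = r1 + (r1>>2) in %r13, and %rax preloaded with r1 by the caller.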
.type	__poly1305_block,@function
.align	32
__poly1305_block:
.cfi_startproc
	mulq	%r14
	movq	%rax,%r9
	movq	%r11,%rax
	movq	%rdx,%r10

	mulq	%r14
	movq	%rax,%r14
	movq	%r11,%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	%r13,%rax
	adcq	%rdx,%r10

	mulq	%rbx
	movq	%rbp,%rbx
	addq	%rax,%r14
	adcq	%rdx,%r8

	imulq	%r13,%rbx
	addq	%rbx,%r9
	movq	%r8,%rbx
	adcq	$0,%r10

	imulq	%r11,%rbp
	addq	%r9,%rbx
	movq	$-4,%rax
	adcq	%rbp,%r10

	andq	%r10,%rax
	movq	%r10,%rbp
	shrq	$2,%r10
	andq	$3,%rbp
	addq	%r10,%rax
	addq	%rax,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp
	.byte	0xf3,0xc3
.cfi_endproc
.size	__poly1305_block,.-__poly1305_block

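# __poly1305_init_avx
# Expects r0 in %r11, r1 in %r12 and s1 = r1 + (r1>>2) in %r13.  Computes
# r^2, r^3 and r^4 via __poly1305_block and stores the powers of r, split
# into 26-bit limbs together with their 5x multiples, into the table that
# starts at 48(%rdi) and is consumed by the vector code below.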
.type	__poly1305_init_avx,@function
.align	32
__poly1305_init_avx:
.cfi_startproc
	movq	%r11,%r14
	movq	%r12,%rbx
	xorq	%rbp,%rbp

	leaq	48+64(%rdi),%rdi

	movq	%r12,%rax
	call	__poly1305_block

	movl	$0x3ffffff,%eax
	movl	$0x3ffffff,%edx
	movq	%r14,%r8
	andl	%r14d,%eax
	movq	%r11,%r9
	andl	%r11d,%edx
	movl	%eax,-64(%rdi)
	shrq	$26,%r8
	movl	%edx,-60(%rdi)
	shrq	$26,%r9

	movl	$0x3ffffff,%eax
	movl	$0x3ffffff,%edx
	andl	%r8d,%eax
	andl	%r9d,%edx
	movl	%eax,-48(%rdi)
	leal	(%rax,%rax,4),%eax
	movl	%edx,-44(%rdi)
	leal	(%rdx,%rdx,4),%edx
	movl	%eax,-32(%rdi)
	shrq	$26,%r8
	movl	%edx,-28(%rdi)
	shrq	$26,%r9

	movq	%rbx,%rax
	movq	%r12,%rdx
	shlq	$12,%rax
	shlq	$12,%rdx
	orq	%r8,%rax
	orq	%r9,%rdx
	andl	$0x3ffffff,%eax
	andl	$0x3ffffff,%edx
	movl	%eax,-16(%rdi)
	leal	(%rax,%rax,4),%eax
	movl	%edx,-12(%rdi)
	leal	(%rdx,%rdx,4),%edx
	movl	%eax,0(%rdi)
	movq	%rbx,%r8
	movl	%edx,4(%rdi)
	movq	%r12,%r9

	movl	$0x3ffffff,%eax
	movl	$0x3ffffff,%edx
	shrq	$14,%r8
	shrq	$14,%r9
	andl	%r8d,%eax
	andl	%r9d,%edx
	movl	%eax,16(%rdi)
	leal	(%rax,%rax,4),%eax
	movl	%edx,20(%rdi)
	leal	(%rdx,%rdx,4),%edx
	movl	%eax,32(%rdi)
	shrq	$26,%r8
	movl	%edx,36(%rdi)
	shrq	$26,%r9

	movq	%rbp,%rax
	shlq	$24,%rax
	orq	%rax,%r8
	movl	%r8d,48(%rdi)
	leaq	(%r8,%r8,4),%r8
	movl	%r9d,52(%rdi)
	leaq	(%r9,%r9,4),%r9
	movl	%r8d,64(%rdi)
	movl	%r9d,68(%rdi)

	movq	%r12,%rax
	call	__poly1305_block

	movl	$0x3ffffff,%eax
	movq	%r14,%r8
	andl	%r14d,%eax
	shrq	$26,%r8
	movl	%eax,-52(%rdi)

	movl	$0x3ffffff,%edx
	andl	%r8d,%edx
	movl	%edx,-36(%rdi)
	leal	(%rdx,%rdx,4),%edx
	shrq	$26,%r8
	movl	%edx,-20(%rdi)

	movq	%rbx,%rax
	shlq	$12,%rax
	orq	%r8,%rax
	andl	$0x3ffffff,%eax
	movl	%eax,-4(%rdi)
	leal	(%rax,%rax,4),%eax
	movq	%rbx,%r8
	movl	%eax,12(%rdi)

	movl	$0x3ffffff,%edx
	shrq	$14,%r8
	andl	%r8d,%edx
	movl	%edx,28(%rdi)
	leal	(%rdx,%rdx,4),%edx
	shrq	$26,%r8
	movl	%edx,44(%rdi)

	movq	%rbp,%rax
	shlq	$24,%rax
	orq	%rax,%r8
	movl	%r8d,60(%rdi)
	leaq	(%r8,%r8,4),%r8
	movl	%r8d,76(%rdi)

	movq	%r12,%rax
	call	__poly1305_block

	movl	$0x3ffffff,%eax
	movq	%r14,%r8
	andl	%r14d,%eax
	shrq	$26,%r8
	movl	%eax,-56(%rdi)

	movl	$0x3ffffff,%edx
	andl	%r8d,%edx
	movl	%edx,-40(%rdi)
	leal	(%rdx,%rdx,4),%edx
	shrq	$26,%r8
	movl	%edx,-24(%rdi)

	movq	%rbx,%rax
	shlq	$12,%rax
	orq	%r8,%rax
	andl	$0x3ffffff,%eax
	movl	%eax,-8(%rdi)
	leal	(%rax,%rax,4),%eax
	movq	%rbx,%r8
	movl	%eax,8(%rdi)

	movl	$0x3ffffff,%edx
	shrq	$14,%r8
	andl	%r8d,%edx
	movl	%edx,24(%rdi)
	leal	(%rdx,%rdx,4),%edx
	shrq	$26,%r8
	movl	%edx,40(%rdi)

	movq	%rbp,%rax
	shlq	$24,%rax
	orq	%rax,%r8
	movl	%r8d,56(%rdi)
	leaq	(%r8,%r8,4),%r8
	movl	%r8d,72(%rdi)

	leaq	-48-64(%rdi),%rdi
	.byte	0xf3,0xc3
.cfi_endproc
.size	__poly1305_init_avx,.-__poly1305_init_avx

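# poly1305_blocks_avx(ctx=%rdi, inp=%rsi, len=%rdx, padbit=%rcx)
# AVX path: keeps the accumulator as five 26-bit limbs (the flag at 20(%rdi)
# records which representation is live), converts between base 2^64 and
# base 2^26 as needed, and processes two blocks per iteration in 128-bit
# xmm registers; short or not-yet-converted inputs fall back to .Lblocks.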
.type	poly1305_blocks_avx,@function
.align	32
poly1305_blocks_avx:
.cfi_startproc
	movl	20(%rdi),%r8d
	cmpq	$128,%rdx
	jae	.Lblocks_avx
	testl	%r8d,%r8d
	jz	.Lblocks

.Lblocks_avx:
	andq	$-16,%rdx
	jz	.Lno_data_avx

	vzeroupper

	testl	%r8d,%r8d
	jz	.Lbase2_64_avx

	testq	$31,%rdx
	jz	.Leven_avx

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lblocks_avx_body:

	movq	%rdx,%r15

	movq	0(%rdi),%r8
	movq	8(%rdi),%r9
	movl	16(%rdi),%ebp

	movq	24(%rdi),%r11
	movq	32(%rdi),%r13


	movl	%r8d,%r14d
	andq	$-2147483648,%r8
	movq	%r9,%r12
	movl	%r9d,%ebx
	andq	$-2147483648,%r9

	shrq	$6,%r8
	shlq	$52,%r12
	addq	%r8,%r14
	shrq	$12,%rbx
	shrq	$18,%r9
	addq	%r12,%r14
	adcq	%r9,%rbx

	movq	%rbp,%r8
	shlq	$40,%r8
	shrq	$24,%rbp
	addq	%r8,%rbx
	adcq	$0,%rbp

	movq	$-4,%r9
	movq	%rbp,%r8
	andq	%rbp,%r9
	shrq	$2,%r8
	andq	$3,%rbp
	addq	%r9,%r8
	addq	%r8,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp

	movq	%r13,%r12
	movq	%r13,%rax
	shrq	$2,%r13
	addq	%r12,%r13

	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp

	call	__poly1305_block

	testq	%rcx,%rcx
	jz	.Lstore_base2_64_avx


	movq	%r14,%rax
	movq	%r14,%rdx
	shrq	$52,%r14
	movq	%rbx,%r11
	movq	%rbx,%r12
	shrq	$26,%rdx
	andq	$0x3ffffff,%rax
	shlq	$12,%r11
	andq	$0x3ffffff,%rdx
	shrq	$14,%rbx
	orq	%r11,%r14
	shlq	$24,%rbp
	andq	$0x3ffffff,%r14
	shrq	$40,%r12
	andq	$0x3ffffff,%rbx
	orq	%r12,%rbp

	subq	$16,%r15
	jz	.Lstore_base2_26_avx

	vmovd	%eax,%xmm0
	vmovd	%edx,%xmm1
	vmovd	%r14d,%xmm2
	vmovd	%ebx,%xmm3
	vmovd	%ebp,%xmm4
	jmp	.Lproceed_avx

.align	32
.Lstore_base2_64_avx:
	movq	%r14,0(%rdi)
	movq	%rbx,8(%rdi)
	movq	%rbp,16(%rdi)
	jmp	.Ldone_avx

.align	16
.Lstore_base2_26_avx:
	movl	%eax,0(%rdi)
	movl	%edx,4(%rdi)
	movl	%r14d,8(%rdi)
	movl	%ebx,12(%rdi)
	movl	%ebp,16(%rdi)
.align	16
.Ldone_avx:
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data_avx:
.Lblocks_avx_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc

.align	32
.Lbase2_64_avx:
.cfi_startproc
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lbase2_64_avx_body:

	movq	%rdx,%r15

	movq	24(%rdi),%r11
	movq	32(%rdi),%r13

	movq	0(%rdi),%r14
	movq	8(%rdi),%rbx
	movl	16(%rdi),%ebp

	movq	%r13,%r12
	movq	%r13,%rax
	shrq	$2,%r13
	addq	%r12,%r13

	testq	$31,%rdx
	jz	.Linit_avx

	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp
	subq	$16,%r15

	call	__poly1305_block

.Linit_avx:

	movq	%r14,%rax
	movq	%r14,%rdx
	shrq	$52,%r14
	movq	%rbx,%r8
	movq	%rbx,%r9
	shrq	$26,%rdx
	andq	$0x3ffffff,%rax
	shlq	$12,%r8
	andq	$0x3ffffff,%rdx
	shrq	$14,%rbx
	orq	%r8,%r14
	shlq	$24,%rbp
	andq	$0x3ffffff,%r14
	shrq	$40,%r9
	andq	$0x3ffffff,%rbx
	orq	%r9,%rbp

	vmovd	%eax,%xmm0
	vmovd	%edx,%xmm1
	vmovd	%r14d,%xmm2
	vmovd	%ebx,%xmm3
	vmovd	%ebp,%xmm4
	movl	$1,20(%rdi)

	call	__poly1305_init_avx

.Lproceed_avx:
	movq	%r15,%rdx

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rax
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lbase2_64_avx_epilogue:
	jmp	.Ldo_avx
.cfi_endproc

.align	32
.Leven_avx:
.cfi_startproc
	vmovd	0(%rdi),%xmm0
	vmovd	4(%rdi),%xmm1
	vmovd	8(%rdi),%xmm2
	vmovd	12(%rdi),%xmm3
	vmovd	16(%rdi),%xmm4

.Ldo_avx:
	leaq	-88(%rsp),%r11
.cfi_def_cfa	%r11,0x60
	subq	$0x178,%rsp
	subq	$64,%rdx
	leaq	-32(%rsi),%rax
	cmovcq	%rax,%rsi

	vmovdqu	48(%rdi),%xmm14
	leaq	112(%rdi),%rdi
	leaq	.Lconst(%rip),%rcx



	vmovdqu	32(%rsi),%xmm5
	vmovdqu	48(%rsi),%xmm6
	vmovdqa	64(%rcx),%xmm15

	vpsrldq	$6,%xmm5,%xmm7
	vpsrldq	$6,%xmm6,%xmm8
	vpunpckhqdq	%xmm6,%xmm5,%xmm9
	vpunpcklqdq	%xmm6,%xmm5,%xmm5
	vpunpcklqdq	%xmm8,%xmm7,%xmm8

	vpsrlq	$40,%xmm9,%xmm9
	vpsrlq	$26,%xmm5,%xmm6
	vpand	%xmm15,%xmm5,%xmm5
	vpsrlq	$4,%xmm8,%xmm7
	vpand	%xmm15,%xmm6,%xmm6
	vpsrlq	$30,%xmm8,%xmm8
	vpand	%xmm15,%xmm7,%xmm7
	vpand	%xmm15,%xmm8,%xmm8
	vpor	32(%rcx),%xmm9,%xmm9

	jbe	.Lskip_loop_avx


	vmovdqu	-48(%rdi),%xmm11
	vmovdqu	-32(%rdi),%xmm12
	vpshufd	$0xEE,%xmm14,%xmm13
	vpshufd	$0x44,%xmm14,%xmm10
	vmovdqa	%xmm13,-144(%r11)
	vmovdqa	%xmm10,0(%rsp)
	vpshufd	$0xEE,%xmm11,%xmm14
	vmovdqu	-16(%rdi),%xmm10
	vpshufd	$0x44,%xmm11,%xmm11
	vmovdqa	%xmm14,-128(%r11)
	vmovdqa	%xmm11,16(%rsp)
	vpshufd	$0xEE,%xmm12,%xmm13
	vmovdqu	0(%rdi),%xmm11
	vpshufd	$0x44,%xmm12,%xmm12
	vmovdqa	%xmm13,-112(%r11)
	vmovdqa	%xmm12,32(%rsp)
	vpshufd	$0xEE,%xmm10,%xmm14
	vmovdqu	16(%rdi),%xmm12
	vpshufd	$0x44,%xmm10,%xmm10
	vmovdqa	%xmm14,-96(%r11)
	vmovdqa	%xmm10,48(%rsp)
	vpshufd	$0xEE,%xmm11,%xmm13
	vmovdqu	32(%rdi),%xmm10
	vpshufd	$0x44,%xmm11,%xmm11
	vmovdqa	%xmm13,-80(%r11)
	vmovdqa	%xmm11,64(%rsp)
	vpshufd	$0xEE,%xmm12,%xmm14
	vmovdqu	48(%rdi),%xmm11
	vpshufd	$0x44,%xmm12,%xmm12
	vmovdqa	%xmm14,-64(%r11)
	vmovdqa	%xmm12,80(%rsp)
	vpshufd	$0xEE,%xmm10,%xmm13
	vmovdqu	64(%rdi),%xmm12
	vpshufd	$0x44,%xmm10,%xmm10
	vmovdqa	%xmm13,-48(%r11)
	vmovdqa	%xmm10,96(%rsp)
	vpshufd	$0xEE,%xmm11,%xmm14
	vpshufd	$0x44,%xmm11,%xmm11
	vmovdqa	%xmm14,-32(%r11)
	vmovdqa	%xmm11,112(%rsp)
	vpshufd	$0xEE,%xmm12,%xmm13
	vmovdqa	0(%rsp),%xmm14
	vpshufd	$0x44,%xmm12,%xmm12
	vmovdqa	%xmm13,-16(%r11)
	vmovdqa	%xmm12,128(%rsp)

	jmp	.Loop_avx

.align	32
.Loop_avx:




















	vpmuludq	%xmm5,%xmm14,%xmm10
	vpmuludq	%xmm6,%xmm14,%xmm11
	vmovdqa	%xmm2,32(%r11)
	vpmuludq	%xmm7,%xmm14,%xmm12
	vmovdqa	16(%rsp),%xmm2
	vpmuludq	%xmm8,%xmm14,%xmm13
	vpmuludq	%xmm9,%xmm14,%xmm14

	vmovdqa	%xmm0,0(%r11)
	vpmuludq	32(%rsp),%xmm9,%xmm0
	vmovdqa	%xmm1,16(%r11)
	vpmuludq	%xmm8,%xmm2,%xmm1
	vpaddq	%xmm0,%xmm10,%xmm10
	vpaddq	%xmm1,%xmm14,%xmm14
	vmovdqa	%xmm3,48(%r11)
	vpmuludq	%xmm7,%xmm2,%xmm0
	vpmuludq	%xmm6,%xmm2,%xmm1
	vpaddq	%xmm0,%xmm13,%xmm13
	vmovdqa	48(%rsp),%xmm3
	vpaddq	%xmm1,%xmm12,%xmm12
	vmovdqa	%xmm4,64(%r11)
	vpmuludq	%xmm5,%xmm2,%xmm2
	vpmuludq	%xmm7,%xmm3,%xmm0
	vpaddq	%xmm2,%xmm11,%xmm11

	vmovdqa	64(%rsp),%xmm4
	vpaddq	%xmm0,%xmm14,%xmm14
	vpmuludq	%xmm6,%xmm3,%xmm1
	vpmuludq	%xmm5,%xmm3,%xmm3
	vpaddq	%xmm1,%xmm13,%xmm13
	vmovdqa	80(%rsp),%xmm2
	vpaddq	%xmm3,%xmm12,%xmm12
	vpmuludq	%xmm9,%xmm4,%xmm0
	vpmuludq	%xmm8,%xmm4,%xmm4
	vpaddq	%xmm0,%xmm11,%xmm11
	vmovdqa	96(%rsp),%xmm3
	vpaddq	%xmm4,%xmm10,%xmm10

	vmovdqa	128(%rsp),%xmm4
	vpmuludq	%xmm6,%xmm2,%xmm1
	vpmuludq	%xmm5,%xmm2,%xmm2
	vpaddq	%xmm1,%xmm14,%xmm14
	vpaddq	%xmm2,%xmm13,%xmm13
	vpmuludq	%xmm9,%xmm3,%xmm0
	vpmuludq	%xmm8,%xmm3,%xmm1
	vpaddq	%xmm0,%xmm12,%xmm12
	vmovdqu	0(%rsi),%xmm0
	vpaddq	%xmm1,%xmm11,%xmm11
	vpmuludq	%xmm7,%xmm3,%xmm3
	vpmuludq	%xmm7,%xmm4,%xmm7
	vpaddq	%xmm3,%xmm10,%xmm10

	vmovdqu	16(%rsi),%xmm1
	vpaddq	%xmm7,%xmm11,%xmm11
	vpmuludq	%xmm8,%xmm4,%xmm8
	vpmuludq	%xmm9,%xmm4,%xmm9
	vpsrldq	$6,%xmm0,%xmm2
	vpaddq	%xmm8,%xmm12,%xmm12
	vpaddq	%xmm9,%xmm13,%xmm13
	vpsrldq	$6,%xmm1,%xmm3
	vpmuludq	112(%rsp),%xmm5,%xmm9
	vpmuludq	%xmm6,%xmm4,%xmm5
	vpunpckhqdq	%xmm1,%xmm0,%xmm4
	vpaddq	%xmm9,%xmm14,%xmm14
	vmovdqa	-144(%r11),%xmm9
	vpaddq	%xmm5,%xmm10,%xmm10

	vpunpcklqdq	%xmm1,%xmm0,%xmm0
	vpunpcklqdq	%xmm3,%xmm2,%xmm3


	vpsrldq	$5,%xmm4,%xmm4
	vpsrlq	$26,%xmm0,%xmm1
	vpand	%xmm15,%xmm0,%xmm0
	vpsrlq	$4,%xmm3,%xmm2
	vpand	%xmm15,%xmm1,%xmm1
	vpand	0(%rcx),%xmm4,%xmm4
	vpsrlq	$30,%xmm3,%xmm3
	vpand	%xmm15,%xmm2,%xmm2
	vpand	%xmm15,%xmm3,%xmm3
	vpor	32(%rcx),%xmm4,%xmm4

	vpaddq	0(%r11),%xmm0,%xmm0
	vpaddq	16(%r11),%xmm1,%xmm1
	vpaddq	32(%r11),%xmm2,%xmm2
	vpaddq	48(%r11),%xmm3,%xmm3
	vpaddq	64(%r11),%xmm4,%xmm4

	leaq	32(%rsi),%rax
	leaq	64(%rsi),%rsi
	subq	$64,%rdx
	cmovcq	%rax,%rsi










	vpmuludq	%xmm0,%xmm9,%xmm5
	vpmuludq	%xmm1,%xmm9,%xmm6
	vpaddq	%xmm5,%xmm10,%xmm10
	vpaddq	%xmm6,%xmm11,%xmm11
	vmovdqa	-128(%r11),%xmm7
	vpmuludq	%xmm2,%xmm9,%xmm5
	vpmuludq	%xmm3,%xmm9,%xmm6
	vpaddq	%xmm5,%xmm12,%xmm12
	vpaddq	%xmm6,%xmm13,%xmm13
	vpmuludq	%xmm4,%xmm9,%xmm9
	vpmuludq	-112(%r11),%xmm4,%xmm5
	vpaddq	%xmm9,%xmm14,%xmm14

	vpaddq	%xmm5,%xmm10,%xmm10
	vpmuludq	%xmm2,%xmm7,%xmm6
	vpmuludq	%xmm3,%xmm7,%xmm5
	vpaddq	%xmm6,%xmm13,%xmm13
	vmovdqa	-96(%r11),%xmm8
	vpaddq	%xmm5,%xmm14,%xmm14
	vpmuludq	%xmm1,%xmm7,%xmm6
	vpmuludq	%xmm0,%xmm7,%xmm7
	vpaddq	%xmm6,%xmm12,%xmm12
	vpaddq	%xmm7,%xmm11,%xmm11

	vmovdqa	-80(%r11),%xmm9
	vpmuludq	%xmm2,%xmm8,%xmm5
	vpmuludq	%xmm1,%xmm8,%xmm6
	vpaddq	%xmm5,%xmm14,%xmm14
	vpaddq	%xmm6,%xmm13,%xmm13
	vmovdqa	-64(%r11),%xmm7
	vpmuludq	%xmm0,%xmm8,%xmm8
	vpmuludq	%xmm4,%xmm9,%xmm5
	vpaddq	%xmm8,%xmm12,%xmm12
	vpaddq	%xmm5,%xmm11,%xmm11
	vmovdqa	-48(%r11),%xmm8
	vpmuludq	%xmm3,%xmm9,%xmm9
	vpmuludq	%xmm1,%xmm7,%xmm6
	vpaddq	%xmm9,%xmm10,%xmm10

	vmovdqa	-16(%r11),%xmm9
	vpaddq	%xmm6,%xmm14,%xmm14
	vpmuludq	%xmm0,%xmm7,%xmm7
	vpmuludq	%xmm4,%xmm8,%xmm5
	vpaddq	%xmm7,%xmm13,%xmm13
	vpaddq	%xmm5,%xmm12,%xmm12
	vmovdqu	32(%rsi),%xmm5
	vpmuludq	%xmm3,%xmm8,%xmm7
	vpmuludq	%xmm2,%xmm8,%xmm8
	vpaddq	%xmm7,%xmm11,%xmm11
	vmovdqu	48(%rsi),%xmm6
	vpaddq	%xmm8,%xmm10,%xmm10

	vpmuludq	%xmm2,%xmm9,%xmm2
	vpmuludq	%xmm3,%xmm9,%xmm3
	vpsrldq	$6,%xmm5,%xmm7
	vpaddq	%xmm2,%xmm11,%xmm11
	vpmuludq	%xmm4,%xmm9,%xmm4
	vpsrldq	$6,%xmm6,%xmm8
	vpaddq	%xmm3,%xmm12,%xmm2
	vpaddq	%xmm4,%xmm13,%xmm3
	vpmuludq	-32(%r11),%xmm0,%xmm4
	vpmuludq	%xmm1,%xmm9,%xmm0
	vpunpckhqdq	%xmm6,%xmm5,%xmm9
	vpaddq	%xmm4,%xmm14,%xmm4
	vpaddq	%xmm0,%xmm10,%xmm0

	vpunpcklqdq	%xmm6,%xmm5,%xmm5
	vpunpcklqdq	%xmm8,%xmm7,%xmm8


	vpsrldq	$5,%xmm9,%xmm9
	vpsrlq	$26,%xmm5,%xmm6
	vmovdqa	0(%rsp),%xmm14
	vpand	%xmm15,%xmm5,%xmm5
	vpsrlq	$4,%xmm8,%xmm7
	vpand	%xmm15,%xmm6,%xmm6
	vpand	0(%rcx),%xmm9,%xmm9
	vpsrlq	$30,%xmm8,%xmm8
	vpand	%xmm15,%xmm7,%xmm7
	vpand	%xmm15,%xmm8,%xmm8
	vpor	32(%rcx),%xmm9,%xmm9




	vpsrlq	$26,%xmm3,%xmm13
	vpand	%xmm15,%xmm3,%xmm3
	vpaddq	%xmm13,%xmm4,%xmm4

	vpsrlq	$26,%xmm0,%xmm10
	vpand	%xmm15,%xmm0,%xmm0
	vpaddq	%xmm10,%xmm11,%xmm1

	vpsrlq	$26,%xmm4,%xmm10
	vpand	%xmm15,%xmm4,%xmm4

	vpsrlq	$26,%xmm1,%xmm11
	vpand	%xmm15,%xmm1,%xmm1
	vpaddq	%xmm11,%xmm2,%xmm2

	vpaddq	%xmm10,%xmm0,%xmm0
	vpsllq	$2,%xmm10,%xmm10
	vpaddq	%xmm10,%xmm0,%xmm0

	vpsrlq	$26,%xmm2,%xmm12
	vpand	%xmm15,%xmm2,%xmm2
	vpaddq	%xmm12,%xmm3,%xmm3

	vpsrlq	$26,%xmm0,%xmm10
	vpand	%xmm15,%xmm0,%xmm0
	vpaddq	%xmm10,%xmm1,%xmm1

	vpsrlq	$26,%xmm3,%xmm13
	vpand	%xmm15,%xmm3,%xmm3
	vpaddq	%xmm13,%xmm4,%xmm4

	ja	.Loop_avx

.Lskip_loop_avx:



	vpshufd	$0x10,%xmm14,%xmm14
	addq	$32,%rdx
	jnz	.Long_tail_avx

	vpaddq	%xmm2,%xmm7,%xmm7
	vpaddq	%xmm0,%xmm5,%xmm5
	vpaddq	%xmm1,%xmm6,%xmm6
	vpaddq	%xmm3,%xmm8,%xmm8
	vpaddq	%xmm4,%xmm9,%xmm9

.Long_tail_avx:
	vmovdqa	%xmm2,32(%r11)
	vmovdqa	%xmm0,0(%r11)
	vmovdqa	%xmm1,16(%r11)
	vmovdqa	%xmm3,48(%r11)
	vmovdqa	%xmm4,64(%r11)







	vpmuludq	%xmm7,%xmm14,%xmm12
	vpmuludq	%xmm5,%xmm14,%xmm10
	vpshufd	$0x10,-48(%rdi),%xmm2
	vpmuludq	%xmm6,%xmm14,%xmm11
	vpmuludq	%xmm8,%xmm14,%xmm13
	vpmuludq	%xmm9,%xmm14,%xmm14

	vpmuludq	%xmm8,%xmm2,%xmm0
	vpaddq	%xmm0,%xmm14,%xmm14
	vpshufd	$0x10,-32(%rdi),%xmm3
	vpmuludq	%xmm7,%xmm2,%xmm1
	vpaddq	%xmm1,%xmm13,%xmm13
	vpshufd	$0x10,-16(%rdi),%xmm4
	vpmuludq	%xmm6,%xmm2,%xmm0
	vpaddq	%xmm0,%xmm12,%xmm12
	vpmuludq	%xmm5,%xmm2,%xmm2
	vpaddq	%xmm2,%xmm11,%xmm11
	vpmuludq	%xmm9,%xmm3,%xmm3
	vpaddq	%xmm3,%xmm10,%xmm10

	vpshufd	$0x10,0(%rdi),%xmm2
	vpmuludq	%xmm7,%xmm4,%xmm1
	vpaddq	%xmm1,%xmm14,%xmm14
	vpmuludq	%xmm6,%xmm4,%xmm0
	vpaddq	%xmm0,%xmm13,%xmm13
	vpshufd	$0x10,16(%rdi),%xmm3
	vpmuludq	%xmm5,%xmm4,%xmm4
	vpaddq	%xmm4,%xmm12,%xmm12
	vpmuludq	%xmm9,%xmm2,%xmm1
	vpaddq	%xmm1,%xmm11,%xmm11
	vpshufd	$0x10,32(%rdi),%xmm4
	vpmuludq	%xmm8,%xmm2,%xmm2
	vpaddq	%xmm2,%xmm10,%xmm10

	vpmuludq	%xmm6,%xmm3,%xmm0
	vpaddq	%xmm0,%xmm14,%xmm14
	vpmuludq	%xmm5,%xmm3,%xmm3
	vpaddq	%xmm3,%xmm13,%xmm13
	vpshufd	$0x10,48(%rdi),%xmm2
	vpmuludq	%xmm9,%xmm4,%xmm1
	vpaddq	%xmm1,%xmm12,%xmm12
	vpshufd	$0x10,64(%rdi),%xmm3
	vpmuludq	%xmm8,%xmm4,%xmm0
	vpaddq	%xmm0,%xmm11,%xmm11
	vpmuludq	%xmm7,%xmm4,%xmm4
	vpaddq	%xmm4,%xmm10,%xmm10

	vpmuludq	%xmm5,%xmm2,%xmm2
	vpaddq	%xmm2,%xmm14,%xmm14
	vpmuludq	%xmm9,%xmm3,%xmm1
	vpaddq	%xmm1,%xmm13,%xmm13
	vpmuludq	%xmm8,%xmm3,%xmm0
	vpaddq	%xmm0,%xmm12,%xmm12
	vpmuludq	%xmm7,%xmm3,%xmm1
	vpaddq	%xmm1,%xmm11,%xmm11
	vpmuludq	%xmm6,%xmm3,%xmm3
	vpaddq	%xmm3,%xmm10,%xmm10

	jz	.Lshort_tail_avx

	vmovdqu	0(%rsi),%xmm0
	vmovdqu	16(%rsi),%xmm1

	vpsrldq	$6,%xmm0,%xmm2
	vpsrldq	$6,%xmm1,%xmm3
	vpunpckhqdq	%xmm1,%xmm0,%xmm4
	vpunpcklqdq	%xmm1,%xmm0,%xmm0
	vpunpcklqdq	%xmm3,%xmm2,%xmm3

	vpsrlq	$40,%xmm4,%xmm4
	vpsrlq	$26,%xmm0,%xmm1
	vpand	%xmm15,%xmm0,%xmm0
	vpsrlq	$4,%xmm3,%xmm2
	vpand	%xmm15,%xmm1,%xmm1
	vpsrlq	$30,%xmm3,%xmm3
	vpand	%xmm15,%xmm2,%xmm2
	vpand	%xmm15,%xmm3,%xmm3
	vpor	32(%rcx),%xmm4,%xmm4

	vpshufd	$0x32,-64(%rdi),%xmm9
	vpaddq	0(%r11),%xmm0,%xmm0
	vpaddq	16(%r11),%xmm1,%xmm1
	vpaddq	32(%r11),%xmm2,%xmm2
	vpaddq	48(%r11),%xmm3,%xmm3
	vpaddq	64(%r11),%xmm4,%xmm4




	vpmuludq	%xmm0,%xmm9,%xmm5
	vpaddq	%xmm5,%xmm10,%xmm10
	vpmuludq	%xmm1,%xmm9,%xmm6
	vpaddq	%xmm6,%xmm11,%xmm11
	vpmuludq	%xmm2,%xmm9,%xmm5
	vpaddq	%xmm5,%xmm12,%xmm12
	vpshufd	$0x32,-48(%rdi),%xmm7
	vpmuludq	%xmm3,%xmm9,%xmm6
	vpaddq	%xmm6,%xmm13,%xmm13
	vpmuludq	%xmm4,%xmm9,%xmm9
	vpaddq	%xmm9,%xmm14,%xmm14

	vpmuludq	%xmm3,%xmm7,%xmm5
	vpaddq	%xmm5,%xmm14,%xmm14
	vpshufd	$0x32,-32(%rdi),%xmm8
	vpmuludq	%xmm2,%xmm7,%xmm6
	vpaddq	%xmm6,%xmm13,%xmm13
	vpshufd	$0x32,-16(%rdi),%xmm9
	vpmuludq	%xmm1,%xmm7,%xmm5
	vpaddq	%xmm5,%xmm12,%xmm12
	vpmuludq	%xmm0,%xmm7,%xmm7
	vpaddq	%xmm7,%xmm11,%xmm11
	vpmuludq	%xmm4,%xmm8,%xmm8
	vpaddq	%xmm8,%xmm10,%xmm10

	vpshufd	$0x32,0(%rdi),%xmm7
	vpmuludq	%xmm2,%xmm9,%xmm6
	vpaddq	%xmm6,%xmm14,%xmm14
	vpmuludq	%xmm1,%xmm9,%xmm5
	vpaddq	%xmm5,%xmm13,%xmm13
	vpshufd	$0x32,16(%rdi),%xmm8
	vpmuludq	%xmm0,%xmm9,%xmm9
	vpaddq	%xmm9,%xmm12,%xmm12
	vpmuludq	%xmm4,%xmm7,%xmm6
	vpaddq	%xmm6,%xmm11,%xmm11
	vpshufd	$0x32,32(%rdi),%xmm9
	vpmuludq	%xmm3,%xmm7,%xmm7
	vpaddq	%xmm7,%xmm10,%xmm10

	vpmuludq	%xmm1,%xmm8,%xmm5
	vpaddq	%xmm5,%xmm14,%xmm14
	vpmuludq	%xmm0,%xmm8,%xmm8
	vpaddq	%xmm8,%xmm13,%xmm13
	vpshufd	$0x32,48(%rdi),%xmm7
	vpmuludq	%xmm4,%xmm9,%xmm6
	vpaddq	%xmm6,%xmm12,%xmm12
	vpshufd	$0x32,64(%rdi),%xmm8
	vpmuludq	%xmm3,%xmm9,%xmm5
	vpaddq	%xmm5,%xmm11,%xmm11
	vpmuludq	%xmm2,%xmm9,%xmm9
	vpaddq	%xmm9,%xmm10,%xmm10

	vpmuludq	%xmm0,%xmm7,%xmm7
	vpaddq	%xmm7,%xmm14,%xmm14
	vpmuludq	%xmm4,%xmm8,%xmm6
	vpaddq	%xmm6,%xmm13,%xmm13
	vpmuludq	%xmm3,%xmm8,%xmm5
	vpaddq	%xmm5,%xmm12,%xmm12
	vpmuludq	%xmm2,%xmm8,%xmm6
	vpaddq	%xmm6,%xmm11,%xmm11
	vpmuludq	%xmm1,%xmm8,%xmm8
	vpaddq	%xmm8,%xmm10,%xmm10

.Lshort_tail_avx:



	vpsrldq	$8,%xmm14,%xmm9
	vpsrldq	$8,%xmm13,%xmm8
	vpsrldq	$8,%xmm11,%xmm6
	vpsrldq	$8,%xmm10,%xmm5
	vpsrldq	$8,%xmm12,%xmm7
	vpaddq	%xmm8,%xmm13,%xmm13
	vpaddq	%xmm9,%xmm14,%xmm14
	vpaddq	%xmm5,%xmm10,%xmm10
	vpaddq	%xmm6,%xmm11,%xmm11
	vpaddq	%xmm7,%xmm12,%xmm12




	vpsrlq	$26,%xmm13,%xmm3
	vpand	%xmm15,%xmm13,%xmm13
	vpaddq	%xmm3,%xmm14,%xmm14

	vpsrlq	$26,%xmm10,%xmm0
	vpand	%xmm15,%xmm10,%xmm10
	vpaddq	%xmm0,%xmm11,%xmm11

	vpsrlq	$26,%xmm14,%xmm4
	vpand	%xmm15,%xmm14,%xmm14

	vpsrlq	$26,%xmm11,%xmm1
	vpand	%xmm15,%xmm11,%xmm11
	vpaddq	%xmm1,%xmm12,%xmm12

	vpaddq	%xmm4,%xmm10,%xmm10
	vpsllq	$2,%xmm4,%xmm4
	vpaddq	%xmm4,%xmm10,%xmm10

	vpsrlq	$26,%xmm12,%xmm2
	vpand	%xmm15,%xmm12,%xmm12
	vpaddq	%xmm2,%xmm13,%xmm13

	vpsrlq	$26,%xmm10,%xmm0
	vpand	%xmm15,%xmm10,%xmm10
	vpaddq	%xmm0,%xmm11,%xmm11

	vpsrlq	$26,%xmm13,%xmm3
	vpand	%xmm15,%xmm13,%xmm13
	vpaddq	%xmm3,%xmm14,%xmm14

	vmovd	%xmm10,-112(%rdi)
	vmovd	%xmm11,-108(%rdi)
	vmovd	%xmm12,-104(%rdi)
	vmovd	%xmm13,-100(%rdi)
	vmovd	%xmm14,-96(%rdi)
	leaq	88(%r11),%rsp
.cfi_def_cfa	%rsp,8
	vzeroupper
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_blocks_avx,.-poly1305_blocks_avx

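# poly1305_emit_avx(ctx=%rdi, mac=%rsi, nonce=%rdx)
# If the state is still in base 2^64 (flag at 20(%rdi) is zero), this is just
# .Lemit; otherwise the five 26-bit limbs are recombined into base 2^64 before
# the same final reduction, nonce addition and tag store.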
.type	poly1305_emit_avx,@function
.align	32
poly1305_emit_avx:
.cfi_startproc
	cmpl	$0,20(%rdi)
	je	.Lemit

	movl	0(%rdi),%eax
	movl	4(%rdi),%ecx
	movl	8(%rdi),%r8d
	movl	12(%rdi),%r11d
	movl	16(%rdi),%r10d

	shlq	$26,%rcx
	movq	%r8,%r9
	shlq	$52,%r8
	addq	%rcx,%rax
	shrq	$12,%r9
	addq	%rax,%r8
	adcq	$0,%r9

	shlq	$14,%r11
	movq	%r10,%rax
	shrq	$24,%r10
	addq	%r11,%r9
	shlq	$40,%rax
	addq	%rax,%r9
	adcq	$0,%r10

	movq	%r10,%rax
	movq	%r10,%rcx
	andq	$3,%r10
	shrq	$2,%rax
	andq	$-4,%rcx
	addq	%rcx,%rax
	addq	%rax,%r8
	adcq	$0,%r9
	adcq	$0,%r10

	movq	%r8,%rax
	addq	$5,%r8
	movq	%r9,%rcx
	adcq	$0,%r9
	adcq	$0,%r10
	shrq	$2,%r10
	cmovnzq	%r8,%rax
	cmovnzq	%r9,%rcx

	addq	0(%rdx),%rax
	adcq	8(%rdx),%rcx
	movq	%rax,0(%rsi)
	movq	%rcx,8(%rsi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_emit_avx,.-poly1305_emit_avx
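# poly1305_blocks_avx2(ctx=%rdi, inp=%rsi, len=%rdx, padbit=%rcx)
# AVX2 path: same base 2^26 scheme as poly1305_blocks_avx, but four blocks
# per iteration in 256-bit ymm registers.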
.type	poly1305_blocks_avx2,@function
.align	32
poly1305_blocks_avx2:
.cfi_startproc
	movl	20(%rdi),%r8d
	cmpq	$128,%rdx
	jae	.Lblocks_avx2
	testl	%r8d,%r8d
	jz	.Lblocks

.Lblocks_avx2:
	andq	$-16,%rdx
	jz	.Lno_data_avx2

	vzeroupper

	testl	%r8d,%r8d
	jz	.Lbase2_64_avx2

	testq	$63,%rdx
	jz	.Leven_avx2

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lblocks_avx2_body:

	movq	%rdx,%r15

	movq	0(%rdi),%r8
	movq	8(%rdi),%r9
	movl	16(%rdi),%ebp

	movq	24(%rdi),%r11
	movq	32(%rdi),%r13


	movl	%r8d,%r14d
	andq	$-2147483648,%r8
	movq	%r9,%r12
	movl	%r9d,%ebx
	andq	$-2147483648,%r9

	shrq	$6,%r8
	shlq	$52,%r12
	addq	%r8,%r14
	shrq	$12,%rbx
	shrq	$18,%r9
	addq	%r12,%r14
	adcq	%r9,%rbx

	movq	%rbp,%r8
	shlq	$40,%r8
	shrq	$24,%rbp
	addq	%r8,%rbx
	adcq	$0,%rbp

	movq	$-4,%r9
	movq	%rbp,%r8
	andq	%rbp,%r9
	shrq	$2,%r8
	andq	$3,%rbp
	addq	%r9,%r8
	addq	%r8,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp

	movq	%r13,%r12
	movq	%r13,%rax
	shrq	$2,%r13
	addq	%r12,%r13

.Lbase2_26_pre_avx2:
	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp
	subq	$16,%r15

	call	__poly1305_block
	movq	%r12,%rax

	testq	$63,%r15
	jnz	.Lbase2_26_pre_avx2

	testq	%rcx,%rcx
	jz	.Lstore_base2_64_avx2


	movq	%r14,%rax
	movq	%r14,%rdx
	shrq	$52,%r14
	movq	%rbx,%r11
	movq	%rbx,%r12
	shrq	$26,%rdx
	andq	$0x3ffffff,%rax
	shlq	$12,%r11
	andq	$0x3ffffff,%rdx
	shrq	$14,%rbx
	orq	%r11,%r14
	shlq	$24,%rbp
	andq	$0x3ffffff,%r14
	shrq	$40,%r12
	andq	$0x3ffffff,%rbx
	orq	%r12,%rbp

	testq	%r15,%r15
	jz	.Lstore_base2_26_avx2

	vmovd	%eax,%xmm0
	vmovd	%edx,%xmm1
	vmovd	%r14d,%xmm2
	vmovd	%ebx,%xmm3
	vmovd	%ebp,%xmm4
	jmp	.Lproceed_avx2

.align	32
.Lstore_base2_64_avx2:
	movq	%r14,0(%rdi)
	movq	%rbx,8(%rdi)
	movq	%rbp,16(%rdi)
	jmp	.Ldone_avx2

.align	16
.Lstore_base2_26_avx2:
	movl	%eax,0(%rdi)
	movl	%edx,4(%rdi)
	movl	%r14d,8(%rdi)
	movl	%ebx,12(%rdi)
	movl	%ebp,16(%rdi)
.align	16
.Ldone_avx2:
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data_avx2:
.Lblocks_avx2_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc

.align	32
.Lbase2_64_avx2:
.cfi_startproc
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lbase2_64_avx2_body:

	movq	%rdx,%r15

	movq	24(%rdi),%r11
	movq	32(%rdi),%r13

	movq	0(%rdi),%r14
	movq	8(%rdi),%rbx
	movl	16(%rdi),%ebp

	movq	%r13,%r12
	movq	%r13,%rax
	shrq	$2,%r13
	addq	%r12,%r13

	testq	$63,%rdx
	jz	.Linit_avx2

.Lbase2_64_pre_avx2:
	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp
	subq	$16,%r15

	call	__poly1305_block
	movq	%r12,%rax

	testq	$63,%r15
	jnz	.Lbase2_64_pre_avx2

.Linit_avx2:

	movq	%r14,%rax
	movq	%r14,%rdx
	shrq	$52,%r14
	movq	%rbx,%r8
	movq	%rbx,%r9
	shrq	$26,%rdx
	andq	$0x3ffffff,%rax
	shlq	$12,%r8
	andq	$0x3ffffff,%rdx
	shrq	$14,%rbx
	orq	%r8,%r14
	shlq	$24,%rbp
	andq	$0x3ffffff,%r14
	shrq	$40,%r9
	andq	$0x3ffffff,%rbx
	orq	%r9,%rbp

	vmovd	%eax,%xmm0
	vmovd	%edx,%xmm1
	vmovd	%r14d,%xmm2
	vmovd	%ebx,%xmm3
	vmovd	%ebp,%xmm4
	movl	$1,20(%rdi)

	call	__poly1305_init_avx

.Lproceed_avx2:
	movq	%r15,%rdx
	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
	movl	$3221291008,%r11d

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rax
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lbase2_64_avx2_epilogue:
	jmp	.Ldo_avx2
.cfi_endproc

.align	32
.Leven_avx2:
.cfi_startproc
	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
	vmovd	0(%rdi),%xmm0
	vmovd	4(%rdi),%xmm1
	vmovd	8(%rdi),%xmm2
	vmovd	12(%rdi),%xmm3
	vmovd	16(%rdi),%xmm4

.Ldo_avx2:
	leaq	-8(%rsp),%r11
.cfi_def_cfa	%r11,16
	subq	$0x128,%rsp
	leaq	.Lconst(%rip),%rcx
	leaq	48+64(%rdi),%rdi
	vmovdqa	96(%rcx),%ymm7


	vmovdqu	-64(%rdi),%xmm9
	andq	$-512,%rsp
	vmovdqu	-48(%rdi),%xmm10
	vmovdqu	-32(%rdi),%xmm6
	vmovdqu	-16(%rdi),%xmm11
	vmovdqu	0(%rdi),%xmm12
	vmovdqu	16(%rdi),%xmm13
	leaq	144(%rsp),%rax
	vmovdqu	32(%rdi),%xmm14
	vpermd	%ymm9,%ymm7,%ymm9
	vmovdqu	48(%rdi),%xmm15
	vpermd	%ymm10,%ymm7,%ymm10
	vmovdqu	64(%rdi),%xmm5
	vpermd	%ymm6,%ymm7,%ymm6
	vmovdqa	%ymm9,0(%rsp)
	vpermd	%ymm11,%ymm7,%ymm11
	vmovdqa	%ymm10,32-144(%rax)
	vpermd	%ymm12,%ymm7,%ymm12
	vmovdqa	%ymm6,64-144(%rax)
	vpermd	%ymm13,%ymm7,%ymm13
	vmovdqa	%ymm11,96-144(%rax)
	vpermd	%ymm14,%ymm7,%ymm14
	vmovdqa	%ymm12,128-144(%rax)
	vpermd	%ymm15,%ymm7,%ymm15
	vmovdqa	%ymm13,160-144(%rax)
	vpermd	%ymm5,%ymm7,%ymm5
	vmovdqa	%ymm14,192-144(%rax)
	vmovdqa	%ymm15,224-144(%rax)
	vmovdqa	%ymm5,256-144(%rax)
	vmovdqa	64(%rcx),%ymm5



	vmovdqu	0(%rsi),%xmm7
	vmovdqu	16(%rsi),%xmm8
	vinserti128	$1,32(%rsi),%ymm7,%ymm7
	vinserti128	$1,48(%rsi),%ymm8,%ymm8
	leaq	64(%rsi),%rsi

	vpsrldq	$6,%ymm7,%ymm9
	vpsrldq	$6,%ymm8,%ymm10
	vpunpckhqdq	%ymm8,%ymm7,%ymm6
	vpunpcklqdq	%ymm10,%ymm9,%ymm9
	vpunpcklqdq	%ymm8,%ymm7,%ymm7

	vpsrlq	$30,%ymm9,%ymm10
	vpsrlq	$4,%ymm9,%ymm9
	vpsrlq	$26,%ymm7,%ymm8
	vpsrlq	$40,%ymm6,%ymm6
	vpand	%ymm5,%ymm9,%ymm9
	vpand	%ymm5,%ymm7,%ymm7
	vpand	%ymm5,%ymm8,%ymm8
	vpand	%ymm5,%ymm10,%ymm10
	vpor	32(%rcx),%ymm6,%ymm6

	vpaddq	%ymm2,%ymm9,%ymm2
	subq	$64,%rdx
	jz	.Ltail_avx2
	jmp	.Loop_avx2

.align	32
.Loop_avx2:








	vpaddq	%ymm0,%ymm7,%ymm0
	vmovdqa	0(%rsp),%ymm7
	vpaddq	%ymm1,%ymm8,%ymm1
	vmovdqa	32(%rsp),%ymm8
	vpaddq	%ymm3,%ymm10,%ymm3
	vmovdqa	96(%rsp),%ymm9
	vpaddq	%ymm4,%ymm6,%ymm4
	vmovdqa	48(%rax),%ymm10
	vmovdqa	112(%rax),%ymm5
















	vpmuludq	%ymm2,%ymm7,%ymm13
	vpmuludq	%ymm2,%ymm8,%ymm14
	vpmuludq	%ymm2,%ymm9,%ymm15
	vpmuludq	%ymm2,%ymm10,%ymm11
	vpmuludq	%ymm2,%ymm5,%ymm12

	vpmuludq	%ymm0,%ymm8,%ymm6
	vpmuludq	%ymm1,%ymm8,%ymm2
	vpaddq	%ymm6,%ymm12,%ymm12
	vpaddq	%ymm2,%ymm13,%ymm13
	vpmuludq	%ymm3,%ymm8,%ymm6
	vpmuludq	64(%rsp),%ymm4,%ymm2
	vpaddq	%ymm6,%ymm15,%ymm15
	vpaddq	%ymm2,%ymm11,%ymm11
	vmovdqa	-16(%rax),%ymm8

	vpmuludq	%ymm0,%ymm7,%ymm6
	vpmuludq	%ymm1,%ymm7,%ymm2
	vpaddq	%ymm6,%ymm11,%ymm11
	vpaddq	%ymm2,%ymm12,%ymm12
	vpmuludq	%ymm3,%ymm7,%ymm6
	vpmuludq	%ymm4,%ymm7,%ymm2
	vmovdqu	0(%rsi),%xmm7
	vpaddq	%ymm6,%ymm14,%ymm14
	vpaddq	%ymm2,%ymm15,%ymm15
	vinserti128	$1,32(%rsi),%ymm7,%ymm7

	vpmuludq	%ymm3,%ymm8,%ymm6
	vpmuludq	%ymm4,%ymm8,%ymm2
	vmovdqu	16(%rsi),%xmm8
	vpaddq	%ymm6,%ymm11,%ymm11
	vpaddq	%ymm2,%ymm12,%ymm12
	vmovdqa	16(%rax),%ymm2
	vpmuludq	%ymm1,%ymm9,%ymm6
	vpmuludq	%ymm0,%ymm9,%ymm9
	vpaddq	%ymm6,%ymm14,%ymm14
	vpaddq	%ymm9,%ymm13,%ymm13
	vinserti128	$1,48(%rsi),%ymm8,%ymm8
	leaq	64(%rsi),%rsi

	vpmuludq	%ymm1,%ymm2,%ymm6
	vpmuludq	%ymm0,%ymm2,%ymm2
	vpsrldq	$6,%ymm7,%ymm9
	vpaddq	%ymm6,%ymm15,%ymm15
	vpaddq	%ymm2,%ymm14,%ymm14
	vpmuludq	%ymm3,%ymm10,%ymm6
	vpmuludq	%ymm4,%ymm10,%ymm2
	vpsrldq	$6,%ymm8,%ymm10
	vpaddq	%ymm6,%ymm12,%ymm12
	vpaddq	%ymm2,%ymm13,%ymm13
	vpunpckhqdq	%ymm8,%ymm7,%ymm6

	vpmuludq	%ymm3,%ymm5,%ymm3
	vpmuludq	%ymm4,%ymm5,%ymm4
	vpunpcklqdq	%ymm8,%ymm7,%ymm7
	vpaddq	%ymm3,%ymm13,%ymm2
	vpaddq	%ymm4,%ymm14,%ymm3
	vpunpcklqdq	%ymm10,%ymm9,%ymm10
	vpmuludq	80(%rax),%ymm0,%ymm4
	vpmuludq	%ymm1,%ymm5,%ymm0
	vmovdqa	64(%rcx),%ymm5
	vpaddq	%ymm4,%ymm15,%ymm4
	vpaddq	%ymm0,%ymm11,%ymm0




	vpsrlq	$26,%ymm3,%ymm14
	vpand	%ymm5,%ymm3,%ymm3
	vpaddq	%ymm14,%ymm4,%ymm4

	vpsrlq	$26,%ymm0,%ymm11
	vpand	%ymm5,%ymm0,%ymm0
	vpaddq	%ymm11,%ymm12,%ymm1

	vpsrlq	$26,%ymm4,%ymm15
	vpand	%ymm5,%ymm4,%ymm4

	vpsrlq	$4,%ymm10,%ymm9

	vpsrlq	$26,%ymm1,%ymm12
	vpand	%ymm5,%ymm1,%ymm1
	vpaddq	%ymm12,%ymm2,%ymm2

	vpaddq	%ymm15,%ymm0,%ymm0
	vpsllq	$2,%ymm15,%ymm15
	vpaddq	%ymm15,%ymm0,%ymm0

	vpand	%ymm5,%ymm9,%ymm9
	vpsrlq	$26,%ymm7,%ymm8

	vpsrlq	$26,%ymm2,%ymm13
	vpand	%ymm5,%ymm2,%ymm2
	vpaddq	%ymm13,%ymm3,%ymm3

	vpaddq	%ymm9,%ymm2,%ymm2
	vpsrlq	$30,%ymm10,%ymm10

	vpsrlq	$26,%ymm0,%ymm11
	vpand	%ymm5,%ymm0,%ymm0
	vpaddq	%ymm11,%ymm1,%ymm1

	vpsrlq	$40,%ymm6,%ymm6

	vpsrlq	$26,%ymm3,%ymm14
	vpand	%ymm5,%ymm3,%ymm3
	vpaddq	%ymm14,%ymm4,%ymm4

	vpand	%ymm5,%ymm7,%ymm7
	vpand	%ymm5,%ymm8,%ymm8
	vpand	%ymm5,%ymm10,%ymm10
	vpor	32(%rcx),%ymm6,%ymm6

	subq	$64,%rdx
	jnz	.Loop_avx2

.byte	0x66,0x90
.Ltail_avx2:







	vpaddq	%ymm0,%ymm7,%ymm0
	vmovdqu	4(%rsp),%ymm7
	vpaddq	%ymm1,%ymm8,%ymm1
	vmovdqu	36(%rsp),%ymm8
	vpaddq	%ymm3,%ymm10,%ymm3
	vmovdqu	100(%rsp),%ymm9
	vpaddq	%ymm4,%ymm6,%ymm4
	vmovdqu	52(%rax),%ymm10
	vmovdqu	116(%rax),%ymm5

	vpmuludq	%ymm2,%ymm7,%ymm13
	vpmuludq	%ymm2,%ymm8,%ymm14
	vpmuludq	%ymm2,%ymm9,%ymm15
	vpmuludq	%ymm2,%ymm10,%ymm11
	vpmuludq	%ymm2,%ymm5,%ymm12

	vpmuludq	%ymm0,%ymm8,%ymm6
	vpmuludq	%ymm1,%ymm8,%ymm2
	vpaddq	%ymm6,%ymm12,%ymm12
	vpaddq	%ymm2,%ymm13,%ymm13
	vpmuludq	%ymm3,%ymm8,%ymm6
	vpmuludq	68(%rsp),%ymm4,%ymm2
	vpaddq	%ymm6,%ymm15,%ymm15
	vpaddq	%ymm2,%ymm11,%ymm11

	vpmuludq	%ymm0,%ymm7,%ymm6
	vpmuludq	%ymm1,%ymm7,%ymm2
	vpaddq	%ymm6,%ymm11,%ymm11
	vmovdqu	-12(%rax),%ymm8
	vpaddq	%ymm2,%ymm12,%ymm12
	vpmuludq	%ymm3,%ymm7,%ymm6
	vpmuludq	%ymm4,%ymm7,%ymm2
	vpaddq	%ymm6,%ymm14,%ymm14
	vpaddq	%ymm2,%ymm15,%ymm15

	vpmuludq	%ymm3,%ymm8,%ymm6
	vpmuludq	%ymm4,%ymm8,%ymm2
	vpaddq	%ymm6,%ymm11,%ymm11
	vpaddq	%ymm2,%ymm12,%ymm12
	vmovdqu	20(%rax),%ymm2
	vpmuludq	%ymm1,%ymm9,%ymm6
	vpmuludq	%ymm0,%ymm9,%ymm9
	vpaddq	%ymm6,%ymm14,%ymm14
	vpaddq	%ymm9,%ymm13,%ymm13

	vpmuludq	%ymm1,%ymm2,%ymm6
	vpmuludq	%ymm0,%ymm2,%ymm2
	vpaddq	%ymm6,%ymm15,%ymm15
	vpaddq	%ymm2,%ymm14,%ymm14
	vpmuludq	%ymm3,%ymm10,%ymm6
	vpmuludq	%ymm4,%ymm10,%ymm2
	vpaddq	%ymm6,%ymm12,%ymm12
	vpaddq	%ymm2,%ymm13,%ymm13

	vpmuludq	%ymm3,%ymm5,%ymm3
	vpmuludq	%ymm4,%ymm5,%ymm4
	vpaddq	%ymm3,%ymm13,%ymm2
	vpaddq	%ymm4,%ymm14,%ymm3
	vpmuludq	84(%rax),%ymm0,%ymm4
	vpmuludq	%ymm1,%ymm5,%ymm0
	vmovdqa	64(%rcx),%ymm5
	vpaddq	%ymm4,%ymm15,%ymm4
	vpaddq	%ymm0,%ymm11,%ymm0




	vpsrldq	$8,%ymm12,%ymm8
	vpsrldq	$8,%ymm2,%ymm9
	vpsrldq	$8,%ymm3,%ymm10
	vpsrldq	$8,%ymm4,%ymm6
	vpsrldq	$8,%ymm0,%ymm7
	vpaddq	%ymm8,%ymm12,%ymm12
	vpaddq	%ymm9,%ymm2,%ymm2
	vpaddq	%ymm10,%ymm3,%ymm3
	vpaddq	%ymm6,%ymm4,%ymm4
	vpaddq	%ymm7,%ymm0,%ymm0

	vpermq	$0x2,%ymm3,%ymm10
	vpermq	$0x2,%ymm4,%ymm6
	vpermq	$0x2,%ymm0,%ymm7
	vpermq	$0x2,%ymm12,%ymm8
	vpermq	$0x2,%ymm2,%ymm9
	vpaddq	%ymm10,%ymm3,%ymm3
	vpaddq	%ymm6,%ymm4,%ymm4
	vpaddq	%ymm7,%ymm0,%ymm0
	vpaddq	%ymm8,%ymm12,%ymm12
	vpaddq	%ymm9,%ymm2,%ymm2




	vpsrlq	$26,%ymm3,%ymm14
	vpand	%ymm5,%ymm3,%ymm3
	vpaddq	%ymm14,%ymm4,%ymm4

	vpsrlq	$26,%ymm0,%ymm11
	vpand	%ymm5,%ymm0,%ymm0
	vpaddq	%ymm11,%ymm12,%ymm1

	vpsrlq	$26,%ymm4,%ymm15
	vpand	%ymm5,%ymm4,%ymm4

	vpsrlq	$26,%ymm1,%ymm12
	vpand	%ymm5,%ymm1,%ymm1
	vpaddq	%ymm12,%ymm2,%ymm2

	vpaddq	%ymm15,%ymm0,%ymm0
	vpsllq	$2,%ymm15,%ymm15
	vpaddq	%ymm15,%ymm0,%ymm0

	vpsrlq	$26,%ymm2,%ymm13
	vpand	%ymm5,%ymm2,%ymm2
	vpaddq	%ymm13,%ymm3,%ymm3

	vpsrlq	$26,%ymm0,%ymm11
	vpand	%ymm5,%ymm0,%ymm0
	vpaddq	%ymm11,%ymm1,%ymm1

	vpsrlq	$26,%ymm3,%ymm14
	vpand	%ymm5,%ymm3,%ymm3
	vpaddq	%ymm14,%ymm4,%ymm4

	vmovd	%xmm0,-112(%rdi)
	vmovd	%xmm1,-108(%rdi)
	vmovd	%xmm2,-104(%rdi)
	vmovd	%xmm3,-100(%rdi)
	vmovd	%xmm4,-96(%rdi)
	leaq	8(%r11),%rsp
.cfi_def_cfa	%rsp,8
	vzeroupper
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
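# Constants for the vector code: the 24- and 26-bit masks, the 2^24 "1" pad
# word, and lane-permutation indices; the .Lpermd_avx512, .L2_44_* and
# .Lx_mask44/.Lx_mask42 tables below look like support for an AVX-512/base
# 2^44 code path that is not emitted in this build.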
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	16777216,0,16777216,0,16777216,0,16777216,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
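# The .byte string above is the ASCII attribution
# "Poly1305 for x86_64, CRYPTOGAMS by <appro@openssl.org>".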
.align	16
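# xor128_encrypt_n_pad(out=%rdi, inp=%rsi, otp=%rdx, len=%rcx)
# XORs len bytes of inp with the buffer at otp (presumably a keystream/one-
# time-pad buffer), writes the result to out, leaves that same result in the
# otp buffer zero-padded to the next 16-byte boundary when len is not a
# multiple of 16, and returns the advanced otp pointer in %rax.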
.globl	xor128_encrypt_n_pad
.type	xor128_encrypt_n_pad,@function
.align	16
xor128_encrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi
	subq	%rdx,%rdi
	movq	%rcx,%r10
	shrq	$4,%rcx
	jz	.Ltail_enc
	nop
.Loop_enc_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0
	pxor	(%rdx),%xmm0
	movdqu	%xmm0,(%rdi,%rdx,1)
	movdqa	%xmm0,(%rdx)
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_xmm

	andq	$15,%r10
	jz	.Ldone_enc

.Ltail_enc:
	movq	$16,%rcx
	subq	%r10,%rcx
	xorl	%eax,%eax
.Loop_enc_byte:
	movb	(%rsi,%rdx,1),%al
	xorb	(%rdx),%al
	movb	%al,(%rdi,%rdx,1)
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_enc_byte

	xorl	%eax,%eax
.Loop_enc_pad:
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_pad

.Ldone_enc:
	movq	%rdx,%rax
	.byte	0xf3,0xc3
.cfi_endproc
.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad

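# xor128_decrypt_n_pad(out=%rdi, inp=%rsi, otp=%rdx, len=%rcx)
# Decrypt counterpart: out = inp XOR otp, while the otp buffer is overwritten
# with the untouched inp bytes (the ciphertext), again zero-padded to a
# 16-byte boundary, and the advanced otp pointer is returned in %rax.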
.globl	xor128_decrypt_n_pad
.type	xor128_decrypt_n_pad,@function
.align	16
xor128_decrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi
	subq	%rdx,%rdi
	movq	%rcx,%r10
	shrq	$4,%rcx
	jz	.Ltail_dec
	nop
.Loop_dec_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0
	movdqa	(%rdx),%xmm1
	pxor	%xmm0,%xmm1
	movdqu	%xmm1,(%rdi,%rdx,1)
	movdqa	%xmm0,(%rdx)
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_xmm

	pxor	%xmm1,%xmm1
	andq	$15,%r10
	jz	.Ldone_dec

.Ltail_dec:
	movq	$16,%rcx
	subq	%r10,%rcx
	xorl	%eax,%eax
	xorq	%r11,%r11
.Loop_dec_byte:
	movb	(%rsi,%rdx,1),%r11b
	movb	(%rdx),%al
	xorb	%r11b,%al
	movb	%al,(%rdi,%rdx,1)
	movb	%r11b,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_dec_byte

	xorl	%eax,%eax
.Loop_dec_pad:
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_pad

.Ldone_dec:
	movq	%rdx,%rax
	.byte	0xf3,0xc3
.cfi_endproc
.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
