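# ChaCha20 for x86_64 — assembler output generated by the CRYPTOGAMS/OpenSSL
# perlasm framework (the banner string at .Lsigma below credits
# <appro@openssl.org>). Scalar, SSSE3, XOP, and AVX2 code paths are selected
# at run time from OPENSSL_ia32cap_P.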
.text



.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
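# .Lsigma above is the ChaCha20 constant "expand 32-byte k"; the second
# .byte string decodes to "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>".
#
# The entry point below matches OpenSSL's prototype:
#   void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#                       size_t len, const unsigned int key[8],
#                       const unsigned int counter[4]);
# i.e. rdi = out, rsi = inp, rdx = len, rcx = key, r8 = counter/nonce.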
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,@function
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmpq	$0,%rdx
	je	.Lno_data
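# Dispatch: bit 9 ($512) of the second OPENSSL_ia32cap_P word is the SSSE3
# flag in OpenSSL's capability vector; without it, fall through to the
# scalar integer implementation.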
	movq	OPENSSL_ia32cap_P+4(%rip),%r10
	testl	$512,%r10d
	jnz	.LChaCha20_ssse3

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	subq	$64+24,%rsp
.cfi_adjust_cfa_offset	64+24
.Lctr32_body:


	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm4


	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp
	jmp	.Loop_outer

.align	32
.Loop_outer:
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	movq	%rbp,64+0(%rsp)
	movl	$10,%ebp
	movq	%rsi,64+8(%rsp)
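# The next .byte sequence (102,72,15,126,214) encodes movq %xmm2,%rsi:
# state words x[8] and x[9] land in the low/high halves of %rsi, which the
# shrq below splits into %esi/%edi.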
.byte	102,72,15,126,214
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi
	jmp	.Loop

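# Each pass through .Loop performs one column round followed by one diagonal
# round; %ebp counts 10 double rounds (20 rounds total). State words x8-x11
# share %esi/%edi, spilling through 32(%rsp)..44(%rsp).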
.align	32
.Loop:
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	.Loop
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi
	paddd	%xmm4,%xmm3
	movq	64+16(%rsp),%rdi

	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1

	cmpq	$64,%rbp
	jb	.Ltail

	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align	16
.Ltail:
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

.Loop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	.Loop_tail

.Ldone:
	leaq	64+24+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lno_data:
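# .byte 0xf3,0xc3 encodes "rep ret", the return form historically preferred
# for AMD branch predictors; the same encoding closes every routine below.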
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
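# SSSE3 path: one 64-byte block per iteration, with byte-encoded pshufb
# against .Lrot16/.Lrot24 implementing the 16- and 24-bit rotates. On entry
# it re-dispatches: bit 11 of the capability word (XOP, as OpenSSL maps it)
# goes to ChaCha20_4xop, exactly 128 bytes to ChaCha20_128, and anything
# larger to ChaCha20_4x.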
.type	ChaCha20_ssse3,@function
.align	32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	testl	$2048,%r10d
	jnz	.LChaCha20_4xop
	cmpq	$128,%rdx
	je	.LChaCha20_128
	ja	.LChaCha20_4x

.Ldo_sse3_after_all:
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
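# .byte 102,15,56,0,222 encodes pshufb %xmm6,%xmm3 (rotate left 16 via the
# .Lrot16 mask); the 102,15,56,0,223 form below is pshufb %xmm7,%xmm3
# (rotate 24 via .Lrot24). The same byte-encoded pshufb idiom, with varying
# register fields, recurs throughout the SSE paths.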
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	.Loop_ssse3
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	.Ltail_ssse3

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8

.Loop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lssse3_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
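# ChaCha20_128: the fixed 128-byte case — two 64-byte block states advanced
# in parallel (counter and counter+1), sharing the rot16/rot24 masks.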
.type	ChaCha20_128,@function
.align	32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm8
	movdqu	(%rcx),%xmm9
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm1
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	movdqa	%xmm8,%xmm10
	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,%xmm11
	movdqa	%xmm9,16(%rsp)
	movdqa	%xmm2,%xmm0
	movdqa	%xmm2,32(%rsp)
	paddd	%xmm3,%xmm1
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_128

.align	32
.Loop_128:
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222
.byte	102,15,56,0,206
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223
.byte	102,15,56,0,207
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm9,%xmm9
	pshufd	$147,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$57,%xmm11,%xmm11
	pshufd	$147,%xmm1,%xmm1
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222
.byte	102,15,56,0,206
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223
.byte	102,15,56,0,207
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm9,%xmm9
	pshufd	$57,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$147,%xmm11,%xmm11
	pshufd	$57,%xmm1,%xmm1
	decq	%r8
	jnz	.Loop_128
	paddd	0(%rsp),%xmm8
	paddd	16(%rsp),%xmm9
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	paddd	.Lone(%rip),%xmm1
	paddd	0(%rsp),%xmm10
	paddd	16(%rsp),%xmm11
	paddd	32(%rsp),%xmm0
	paddd	48(%rsp),%xmm1

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm8
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm9
	movdqu	48(%rsi),%xmm5
	pxor	%xmm4,%xmm2
	movdqu	64(%rsi),%xmm4
	pxor	%xmm5,%xmm3
	movdqu	80(%rsi),%xmm5
	pxor	%xmm4,%xmm10
	movdqu	96(%rsi),%xmm4
	pxor	%xmm5,%xmm11
	movdqu	112(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	pxor	%xmm5,%xmm1

	movdqu	%xmm8,0(%rdi)
	movdqu	%xmm9,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	movdqu	%xmm10,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm0,96(%rdi)
	movdqu	%xmm1,112(%rdi)
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L128_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
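# ChaCha20_4x: four blocks per iteration (256 bytes), with the state held
# transposed — one block per SSE lane. On entry, bit 5 of the high
# OPENSSL_ia32cap_P word (AVX2, in OpenSSL's layout) diverts to ChaCha20_8x,
# and short inputs on CPUs matching the masked capability check fall back to
# the one-block SSSE3 loop.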
.type	ChaCha20_4x,@function
.align	32
ChaCha20_4x:
.cfi_startproc
.LChaCha20_4x:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	movq	%r10,%r11
	shrq	$32,%r10
	testq	$32,%r10
	jnz	.LChaCha20_8x
	cmpq	$192,%rdx
	ja	.Lproceed4x

	andq	$71303168,%r11
	cmpq	$4194304,%r11
	je	.Ldo_sse3_after_all

.Lproceed4x:
	subq	$0x140+8,%rsp
	movdqa	.Lsigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	.Linc(%rip),%xmm0
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	.Loop_enter4x

.align	32
.Loop_outer4x:
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	.Lfour(%rip),%xmm0

.Loop_enter4x:
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7
	movl	$10,%eax
	movdqa	%xmm0,256-256(%rcx)
	jmp	.Loop4x

.align	32
.Loop4x:
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199
.byte	102,15,56,0,207
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198
.byte	102,15,56,0,206
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm13
	movdqa	%xmm4,0(%rsp)
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215
.byte	102,15,56,0,223
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214
.byte	102,15,56,0,222
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223
.byte	102,15,56,0,199
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222
.byte	102,15,56,0,198
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207
.byte	102,15,56,0,215
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206
.byte	102,15,56,0,214
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	.Loop4x

	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	cmpq	$256,%rdx
	jb	.Ltail4x

	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4x

	jmp	.Ldone4x

.Ltail4x:
	cmpq	$192,%rdx
	jae	.L192_or_more4x
	cmpq	$128,%rdx
	jae	.L128_or_more4x
	cmpq	$64,%rdx
	jae	.L64_or_more4x


	xorq	%r10,%r10

	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	je	.Ldone4x

	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

.Loop_tail4x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4x

.Ldone4x:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_4x,.-ChaCha20_4x
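# ChaCha20_4xop: AMD XOP variant of the 4x loop. The .byte 143,232,120,194,...
# sequences are byte-encoded XOP vprotd (vector rotate) instructions — e.g.
# 143,232,120,194,228,16 decodes to vprotd $16,%xmm4,%xmm4 — so each
# quarter-round rotate is one instruction instead of a shift/shift/or triple.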
.type	ChaCha20_4xop,@function
.align	32
ChaCha20_4xop:
.cfi_startproc
.LChaCha20_4xop:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$0x140+8,%rsp
	vzeroupper

	vmovdqa	.Lsigma(%rip),%xmm11
	vmovdqu	(%rcx),%xmm3
	vmovdqu	16(%rcx),%xmm15
	vmovdqu	(%r8),%xmm7
	leaq	256(%rsp),%rcx

	vpshufd	$0x00,%xmm11,%xmm8
	vpshufd	$0x55,%xmm11,%xmm9
	vmovdqa	%xmm8,64(%rsp)
	vpshufd	$0xaa,%xmm11,%xmm10
	vmovdqa	%xmm9,80(%rsp)
	vpshufd	$0xff,%xmm11,%xmm11
	vmovdqa	%xmm10,96(%rsp)
	vmovdqa	%xmm11,112(%rsp)

	vpshufd	$0x00,%xmm3,%xmm0
	vpshufd	$0x55,%xmm3,%xmm1
	vmovdqa	%xmm0,128-256(%rcx)
	vpshufd	$0xaa,%xmm3,%xmm2
	vmovdqa	%xmm1,144-256(%rcx)
	vpshufd	$0xff,%xmm3,%xmm3
	vmovdqa	%xmm2,160-256(%rcx)
	vmovdqa	%xmm3,176-256(%rcx)

	vpshufd	$0x00,%xmm15,%xmm12
	vpshufd	$0x55,%xmm15,%xmm13
	vmovdqa	%xmm12,192-256(%rcx)
	vpshufd	$0xaa,%xmm15,%xmm14
	vmovdqa	%xmm13,208-256(%rcx)
	vpshufd	$0xff,%xmm15,%xmm15
	vmovdqa	%xmm14,224-256(%rcx)
	vmovdqa	%xmm15,240-256(%rcx)

	vpshufd	$0x00,%xmm7,%xmm4
	vpshufd	$0x55,%xmm7,%xmm5
	vpaddd	.Linc(%rip),%xmm4,%xmm4
	vpshufd	$0xaa,%xmm7,%xmm6
	vmovdqa	%xmm5,272-256(%rcx)
	vpshufd	$0xff,%xmm7,%xmm7
	vmovdqa	%xmm6,288-256(%rcx)
	vmovdqa	%xmm7,304-256(%rcx)

	jmp	.Loop_enter4xop

.align	32
.Loop_outer4xop:
	vmovdqa	64(%rsp),%xmm8
	vmovdqa	80(%rsp),%xmm9
	vmovdqa	96(%rsp),%xmm10
	vmovdqa	112(%rsp),%xmm11
	vmovdqa	128-256(%rcx),%xmm0
	vmovdqa	144-256(%rcx),%xmm1
	vmovdqa	160-256(%rcx),%xmm2
	vmovdqa	176-256(%rcx),%xmm3
	vmovdqa	192-256(%rcx),%xmm12
	vmovdqa	208-256(%rcx),%xmm13
	vmovdqa	224-256(%rcx),%xmm14
	vmovdqa	240-256(%rcx),%xmm15
	vmovdqa	256-256(%rcx),%xmm4
	vmovdqa	272-256(%rcx),%xmm5
	vmovdqa	288-256(%rcx),%xmm6
	vmovdqa	304-256(%rcx),%xmm7
	vpaddd	.Lfour(%rip),%xmm4,%xmm4

.Loop_enter4xop:
	movl	$10,%eax
	vmovdqa	%xmm4,256-256(%rcx)
	jmp	.Loop4xop

.align	32
.Loop4xop:
	vpaddd	%xmm0,%xmm8,%xmm8
	vpaddd	%xmm1,%xmm9,%xmm9
	vpaddd	%xmm2,%xmm10,%xmm10
	vpaddd	%xmm3,%xmm11,%xmm11
	vpxor	%xmm4,%xmm8,%xmm4
	vpxor	%xmm5,%xmm9,%xmm5
	vpxor	%xmm6,%xmm10,%xmm6
	vpxor	%xmm7,%xmm11,%xmm7
.byte	143,232,120,194,228,16
.byte	143,232,120,194,237,16
.byte	143,232,120,194,246,16
.byte	143,232,120,194,255,16
	vpaddd	%xmm4,%xmm12,%xmm12
	vpaddd	%xmm5,%xmm13,%xmm13
	vpaddd	%xmm6,%xmm14,%xmm14
	vpaddd	%xmm7,%xmm15,%xmm15
	vpxor	%xmm0,%xmm12,%xmm0
	vpxor	%xmm1,%xmm13,%xmm1
	vpxor	%xmm14,%xmm2,%xmm2
	vpxor	%xmm15,%xmm3,%xmm3
.byte	143,232,120,194,192,12
.byte	143,232,120,194,201,12
.byte	143,232,120,194,210,12
.byte	143,232,120,194,219,12
	vpaddd	%xmm8,%xmm0,%xmm8
	vpaddd	%xmm9,%xmm1,%xmm9
	vpaddd	%xmm2,%xmm10,%xmm10
	vpaddd	%xmm3,%xmm11,%xmm11
	vpxor	%xmm4,%xmm8,%xmm4
	vpxor	%xmm5,%xmm9,%xmm5
	vpxor	%xmm6,%xmm10,%xmm6
	vpxor	%xmm7,%xmm11,%xmm7
.byte	143,232,120,194,228,8
.byte	143,232,120,194,237,8
.byte	143,232,120,194,246,8
.byte	143,232,120,194,255,8
	vpaddd	%xmm4,%xmm12,%xmm12
	vpaddd	%xmm5,%xmm13,%xmm13
	vpaddd	%xmm6,%xmm14,%xmm14
	vpaddd	%xmm7,%xmm15,%xmm15
	vpxor	%xmm0,%xmm12,%xmm0
	vpxor	%xmm1,%xmm13,%xmm1
	vpxor	%xmm14,%xmm2,%xmm2
	vpxor	%xmm15,%xmm3,%xmm3
.byte	143,232,120,194,192,7
.byte	143,232,120,194,201,7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,219,7
	vpaddd	%xmm1,%xmm8,%xmm8
	vpaddd	%xmm2,%xmm9,%xmm9
	vpaddd	%xmm3,%xmm10,%xmm10
	vpaddd	%xmm0,%xmm11,%xmm11
	vpxor	%xmm7,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm4
	vpxor	%xmm5,%xmm10,%xmm5
	vpxor	%xmm6,%xmm11,%xmm6
.byte	143,232,120,194,255,16
.byte	143,232,120,194,228,16
.byte	143,232,120,194,237,16
.byte	143,232,120,194,246,16
	vpaddd	%xmm7,%xmm14,%xmm14
	vpaddd	%xmm4,%xmm15,%xmm15
	vpaddd	%xmm5,%xmm12,%xmm12
	vpaddd	%xmm6,%xmm13,%xmm13
	vpxor	%xmm1,%xmm14,%xmm1
	vpxor	%xmm2,%xmm15,%xmm2
	vpxor	%xmm12,%xmm3,%xmm3
	vpxor	%xmm13,%xmm0,%xmm0
.byte	143,232,120,194,201,12
.byte	143,232,120,194,210,12
.byte	143,232,120,194,219,12
.byte	143,232,120,194,192,12
	vpaddd	%xmm8,%xmm1,%xmm8
	vpaddd	%xmm9,%xmm2,%xmm9
	vpaddd	%xmm3,%xmm10,%xmm10
	vpaddd	%xmm0,%xmm11,%xmm11
	vpxor	%xmm7,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm4
	vpxor	%xmm5,%xmm10,%xmm5
	vpxor	%xmm6,%xmm11,%xmm6
.byte	143,232,120,194,255,8
.byte	143,232,120,194,228,8
.byte	143,232,120,194,237,8
.byte	143,232,120,194,246,8
	vpaddd	%xmm7,%xmm14,%xmm14
	vpaddd	%xmm4,%xmm15,%xmm15
	vpaddd	%xmm5,%xmm12,%xmm12
	vpaddd	%xmm6,%xmm13,%xmm13
	vpxor	%xmm1,%xmm14,%xmm1
	vpxor	%xmm2,%xmm15,%xmm2
	vpxor	%xmm12,%xmm3,%xmm3
	vpxor	%xmm13,%xmm0,%xmm0
.byte	143,232,120,194,201,7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,219,7
.byte	143,232,120,194,192,7
	decl	%eax
	jnz	.Loop4xop

	vpaddd	64(%rsp),%xmm8,%xmm8
	vpaddd	80(%rsp),%xmm9,%xmm9
	vpaddd	96(%rsp),%xmm10,%xmm10
	vpaddd	112(%rsp),%xmm11,%xmm11

	vmovdqa	%xmm14,32(%rsp)
	vmovdqa	%xmm15,48(%rsp)

	vpunpckldq	%xmm9,%xmm8,%xmm14
	vpunpckldq	%xmm11,%xmm10,%xmm15
	vpunpckhdq	%xmm9,%xmm8,%xmm8
	vpunpckhdq	%xmm11,%xmm10,%xmm10
	vpunpcklqdq	%xmm15,%xmm14,%xmm9
	vpunpckhqdq	%xmm15,%xmm14,%xmm14
	vpunpcklqdq	%xmm10,%xmm8,%xmm11
	vpunpckhqdq	%xmm10,%xmm8,%xmm8
	vpaddd	128-256(%rcx),%xmm0,%xmm0
	vpaddd	144-256(%rcx),%xmm1,%xmm1
	vpaddd	160-256(%rcx),%xmm2,%xmm2
	vpaddd	176-256(%rcx),%xmm3,%xmm3

	vmovdqa	%xmm9,0(%rsp)
	vmovdqa	%xmm14,16(%rsp)
	vmovdqa	32(%rsp),%xmm9
	vmovdqa	48(%rsp),%xmm14

	vpunpckldq	%xmm1,%xmm0,%xmm10
	vpunpckldq	%xmm3,%xmm2,%xmm15
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm15,%xmm10,%xmm1
	vpunpckhqdq	%xmm15,%xmm10,%xmm10
	vpunpcklqdq	%xmm2,%xmm0,%xmm3
	vpunpckhqdq	%xmm2,%xmm0,%xmm0
	vpaddd	192-256(%rcx),%xmm12,%xmm12
	vpaddd	208-256(%rcx),%xmm13,%xmm13
	vpaddd	224-256(%rcx),%xmm9,%xmm9
	vpaddd	240-256(%rcx),%xmm14,%xmm14

	vpunpckldq	%xmm13,%xmm12,%xmm2
	vpunpckldq	%xmm14,%xmm9,%xmm15
	vpunpckhdq	%xmm13,%xmm12,%xmm12
	vpunpckhdq	%xmm14,%xmm9,%xmm9
	vpunpcklqdq	%xmm15,%xmm2,%xmm13
	vpunpckhqdq	%xmm15,%xmm2,%xmm2
	vpunpcklqdq	%xmm9,%xmm12,%xmm14
	vpunpckhqdq	%xmm9,%xmm12,%xmm12
	vpaddd	256-256(%rcx),%xmm4,%xmm4
	vpaddd	272-256(%rcx),%xmm5,%xmm5
	vpaddd	288-256(%rcx),%xmm6,%xmm6
	vpaddd	304-256(%rcx),%xmm7,%xmm7

	vpunpckldq	%xmm5,%xmm4,%xmm9
	vpunpckldq	%xmm7,%xmm6,%xmm15
	vpunpckhdq	%xmm5,%xmm4,%xmm4
	vpunpckhdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm15,%xmm9,%xmm5
	vpunpckhqdq	%xmm15,%xmm9,%xmm9
	vpunpcklqdq	%xmm6,%xmm4,%xmm7
	vpunpckhqdq	%xmm6,%xmm4,%xmm4
	vmovdqa	0(%rsp),%xmm6
	vmovdqa	16(%rsp),%xmm15

	cmpq	$256,%rdx
	jb	.Ltail4xop

	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9
	leaq	128(%rsi),%rsi
	vpxor	0(%rsi),%xmm11,%xmm11
	vpxor	16(%rsi),%xmm3,%xmm3
	vpxor	32(%rsi),%xmm14,%xmm14
	vpxor	48(%rsi),%xmm7,%xmm7
	vpxor	64(%rsi),%xmm8,%xmm8
	vpxor	80(%rsi),%xmm0,%xmm0
	vpxor	96(%rsi),%xmm12,%xmm12
	vpxor	112(%rsi),%xmm4,%xmm4
	leaq	128(%rsi),%rsi

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	leaq	128(%rdi),%rdi
	vmovdqu	%xmm11,0(%rdi)
	vmovdqu	%xmm3,16(%rdi)
	vmovdqu	%xmm14,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	vmovdqu	%xmm8,64(%rdi)
	vmovdqu	%xmm0,80(%rdi)
	vmovdqu	%xmm12,96(%rdi)
	vmovdqu	%xmm4,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4xop

	jmp	.Ldone4xop

.align	32
.Ltail4xop:
	cmpq	$192,%rdx
	jae	.L192_or_more4xop
	cmpq	$128,%rdx
	jae	.L128_or_more4xop
	cmpq	$64,%rdx
	jae	.L64_or_more4xop

	xorq	%r10,%r10
	vmovdqa	%xmm6,0(%rsp)
	vmovdqa	%xmm1,16(%rsp)
	vmovdqa	%xmm13,32(%rsp)
	vmovdqa	%xmm5,48(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L64_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	je	.Ldone4xop

	leaq	64(%rsi),%rsi
	vmovdqa	%xmm15,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm10,16(%rsp)
	leaq	64(%rdi),%rdi
	vmovdqa	%xmm2,32(%rsp)
	subq	$64,%rdx
	vmovdqa	%xmm9,48(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L128_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	je	.Ldone4xop

	leaq	128(%rsi),%rsi
	vmovdqa	%xmm11,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm3,16(%rsp)
	leaq	128(%rdi),%rdi
	vmovdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	vmovdqa	%xmm7,48(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L192_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9
	leaq	128(%rsi),%rsi
	vpxor	0(%rsi),%xmm11,%xmm11
	vpxor	16(%rsi),%xmm3,%xmm3
	vpxor	32(%rsi),%xmm14,%xmm14
	vpxor	48(%rsi),%xmm7,%xmm7

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	leaq	128(%rdi),%rdi
	vmovdqu	%xmm11,0(%rdi)
	vmovdqu	%xmm3,16(%rdi)
	vmovdqu	%xmm14,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	je	.Ldone4xop

	leaq	64(%rsi),%rsi
	vmovdqa	%xmm8,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm0,16(%rsp)
	leaq	64(%rdi),%rdi
	vmovdqa	%xmm12,32(%rsp)
	subq	$192,%rdx
	vmovdqa	%xmm4,48(%rsp)

.Loop_tail4xop:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4xop

.Ldone4xop:
	vzeroupper
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4xop_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_4xop,.-ChaCha20_4xop
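# ChaCha20_8x: AVX2 path, eight blocks per iteration (512 bytes) with each
# state word broadcast across a ymm register; the stack is realigned to 32
# bytes for the aligned ymm spills.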
.type	ChaCha20_8x,@function
.align	32
ChaCha20_8x:
.cfi_startproc
.LChaCha20_8x:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$0x280+8,%rsp
	andq	$-32,%rsp
	vzeroupper

	vbroadcasti128	.Lsigma(%rip),%ymm11
	vbroadcasti128	(%rcx),%ymm3
	vbroadcasti128	16(%rcx),%ymm15
	vbroadcasti128	(%r8),%ymm7
	leaq	256(%rsp),%rcx
	leaq	512(%rsp),%rax
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	vpshufd	$0x00,%ymm11,%ymm8
	vpshufd	$0x55,%ymm11,%ymm9
	vmovdqa	%ymm8,128-256(%rcx)
	vpshufd	$0xaa,%ymm11,%ymm10
	vmovdqa	%ymm9,160-256(%rcx)
	vpshufd	$0xff,%ymm11,%ymm11
	vmovdqa	%ymm10,192-256(%rcx)
	vmovdqa	%ymm11,224-256(%rcx)

	vpshufd	$0x00,%ymm3,%ymm0
	vpshufd	$0x55,%ymm3,%ymm1
	vmovdqa	%ymm0,256-256(%rcx)
	vpshufd	$0xaa,%ymm3,%ymm2
	vmovdqa	%ymm1,288-256(%rcx)
	vpshufd	$0xff,%ymm3,%ymm3
	vmovdqa	%ymm2,320-256(%rcx)
	vmovdqa	%ymm3,352-256(%rcx)

	vpshufd	$0x00,%ymm15,%ymm12
	vpshufd	$0x55,%ymm15,%ymm13
	vmovdqa	%ymm12,384-512(%rax)
	vpshufd	$0xaa,%ymm15,%ymm14
	vmovdqa	%ymm13,416-512(%rax)
	vpshufd	$0xff,%ymm15,%ymm15
	vmovdqa	%ymm14,448-512(%rax)
	vmovdqa	%ymm15,480-512(%rax)

	vpshufd	$0x00,%ymm7,%ymm4
	vpshufd	$0x55,%ymm7,%ymm5
	vpaddd	.Lincy(%rip),%ymm4,%ymm4
	vpshufd	$0xaa,%ymm7,%ymm6
	vmovdqa	%ymm5,544-512(%rax)
	vpshufd	$0xff,%ymm7,%ymm7
	vmovdqa	%ymm6,576-512(%rax)
	vmovdqa	%ymm7,608-512(%rax)

	jmp	.Loop_enter8x

.align	32
.Loop_outer8x:
	vmovdqa	128-256(%rcx),%ymm8
	vmovdqa	160-256(%rcx),%ymm9
	vmovdqa	192-256(%rcx),%ymm10
	vmovdqa	224-256(%rcx),%ymm11
	vmovdqa	256-256(%rcx),%ymm0
	vmovdqa	288-256(%rcx),%ymm1
	vmovdqa	320-256(%rcx),%ymm2
	vmovdqa	352-256(%rcx),%ymm3
	vmovdqa	384-512(%rax),%ymm12
	vmovdqa	416-512(%rax),%ymm13
	vmovdqa	448-512(%rax),%ymm14
	vmovdqa	480-512(%rax),%ymm15
	vmovdqa	512-512(%rax),%ymm4
	vmovdqa	544-512(%rax),%ymm5
	vmovdqa	576-512(%rax),%ymm6
	vmovdqa	608-512(%rax),%ymm7
	vpaddd	.Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
	vmovdqa	%ymm14,64(%rsp)
	vmovdqa	%ymm15,96(%rsp)
	vbroadcasti128	(%r10),%ymm15
	vmovdqa	%ymm4,512-512(%rax)
	movl	$10,%eax
	jmp	.Loop8x

.align	32
.Loop8x:
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$12,%ymm0,%ymm14
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$12,%ymm1,%ymm15
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$7,%ymm0,%ymm15
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$7,%ymm1,%ymm14
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vmovdqa	%ymm12,0(%rsp)
	vmovdqa	%ymm13,32(%rsp)
	vmovdqa	64(%rsp),%ymm12
	vmovdqa	96(%rsp),%ymm13
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$12,%ymm2,%ymm14
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$12,%ymm3,%ymm15
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$7,%ymm2,%ymm15
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$7,%ymm3,%ymm14
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$12,%ymm1,%ymm14
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$12,%ymm2,%ymm15
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$7,%ymm1,%ymm15
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$7,%ymm2,%ymm14
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vmovdqa	%ymm12,64(%rsp)
	vmovdqa	%ymm13,96(%rsp)
	vmovdqa	0(%rsp),%ymm12
	vmovdqa	32(%rsp),%ymm13
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$12,%ymm3,%ymm14
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$12,%ymm0,%ymm15
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$7,%ymm3,%ymm15
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$7,%ymm0,%ymm14
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	decl	%eax
	jnz	.Loop8x

	leaq	512(%rsp),%rax
	vpaddd	128-256(%rcx),%ymm8,%ymm8
	vpaddd	160-256(%rcx),%ymm9,%ymm9
	vpaddd	192-256(%rcx),%ymm10,%ymm10
	vpaddd	224-256(%rcx),%ymm11,%ymm11

	vpunpckldq	%ymm9,%ymm8,%ymm14
	vpunpckldq	%ymm11,%ymm10,%ymm15
	vpunpckhdq	%ymm9,%ymm8,%ymm8
	vpunpckhdq	%ymm11,%ymm10,%ymm10
	vpunpcklqdq	%ymm15,%ymm14,%ymm9
	vpunpckhqdq	%ymm15,%ymm14,%ymm14
	vpunpcklqdq	%ymm10,%ymm8,%ymm11
	vpunpckhqdq	%ymm10,%ymm8,%ymm8
	vpaddd	256-256(%rcx),%ymm0,%ymm0
	vpaddd	288-256(%rcx),%ymm1,%ymm1
	vpaddd	320-256(%rcx),%ymm2,%ymm2
	vpaddd	352-256(%rcx),%ymm3,%ymm3

	vpunpckldq	%ymm1,%ymm0,%ymm10
	vpunpckldq	%ymm3,%ymm2,%ymm15
	vpunpckhdq	%ymm1,%ymm0,%ymm0
	vpunpckhdq	%ymm3,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm10,%ymm1
	vpunpckhqdq	%ymm15,%ymm10,%ymm10
	vpunpcklqdq	%ymm2,%ymm0,%ymm3
	vpunpckhqdq	%ymm2,%ymm0,%ymm0
	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
	vmovdqa	%ymm15,0(%rsp)
	vmovdqa	%ymm9,32(%rsp)
	vmovdqa	64(%rsp),%ymm15
	vmovdqa	96(%rsp),%ymm9

	vpaddd	384-512(%rax),%ymm12,%ymm12
	vpaddd	416-512(%rax),%ymm13,%ymm13
	vpaddd	448-512(%rax),%ymm15,%ymm15
	vpaddd	480-512(%rax),%ymm9,%ymm9

	vpunpckldq	%ymm13,%ymm12,%ymm2
	vpunpckldq	%ymm9,%ymm15,%ymm8
	vpunpckhdq	%ymm13,%ymm12,%ymm12
	vpunpckhdq	%ymm9,%ymm15,%ymm15
	vpunpcklqdq	%ymm8,%ymm2,%ymm13
	vpunpckhqdq	%ymm8,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm12,%ymm9
	vpunpckhqdq	%ymm15,%ymm12,%ymm12
	vpaddd	512-512(%rax),%ymm4,%ymm4
	vpaddd	544-512(%rax),%ymm5,%ymm5
	vpaddd	576-512(%rax),%ymm6,%ymm6
	vpaddd	608-512(%rax),%ymm7,%ymm7

	vpunpckldq	%ymm5,%ymm4,%ymm15
	vpunpckldq	%ymm7,%ymm6,%ymm8
	vpunpckhdq	%ymm5,%ymm4,%ymm4
	vpunpckhdq	%ymm7,%ymm6,%ymm6
	vpunpcklqdq	%ymm8,%ymm15,%ymm5
	vpunpckhqdq	%ymm8,%ymm15,%ymm15
	vpunpcklqdq	%ymm6,%ymm4,%ymm7
	vpunpckhqdq	%ymm6,%ymm4,%ymm4
	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
	vmovdqa	0(%rsp),%ymm6
	vmovdqa	32(%rsp),%ymm12

	cmpq	$512,%rdx
	jb	.Ltail8x

	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm12,%ymm12
	vpxor	32(%rsi),%ymm13,%ymm13
	vpxor	64(%rsi),%ymm10,%ymm10
	vpxor	96(%rsi),%ymm15,%ymm15
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm12,0(%rdi)
	vmovdqu	%ymm13,32(%rdi)
	vmovdqu	%ymm10,64(%rdi)
	vmovdqu	%ymm15,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm14,%ymm14
	vpxor	32(%rsi),%ymm2,%ymm2
	vpxor	64(%rsi),%ymm3,%ymm3
	vpxor	96(%rsi),%ymm7,%ymm7
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm14,0(%rdi)
	vmovdqu	%ymm2,32(%rdi)
	vmovdqu	%ymm3,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm11,%ymm11
	vpxor	32(%rsi),%ymm9,%ymm9
	vpxor	64(%rsi),%ymm0,%ymm0
	vpxor	96(%rsi),%ymm4,%ymm4
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm11,0(%rdi)
	vmovdqu	%ymm9,32(%rdi)
	vmovdqu	%ymm0,64(%rdi)
	vmovdqu	%ymm4,96(%rdi)
	leaq	128(%rdi),%rdi

	subq	$512,%rdx
	jnz	.Loop_outer8x

	jmp	.Ldone8x

.Ltail8x:
	cmpq	$448,%rdx
	jae	.L448_or_more8x
	cmpq	$384,%rdx
	jae	.L384_or_more8x
	cmpq	$320,%rdx
	jae	.L320_or_more8x
	cmpq	$256,%rdx
	jae	.L256_or_more8x
	cmpq	$192,%rdx
	jae	.L192_or_more8x
	cmpq	$128,%rdx
	jae	.L128_or_more8x
	cmpq	$64,%rdx
	jae	.L64_or_more8x

	xorq	%r10,%r10
	vmovdqa	%ymm6,0(%rsp)
	vmovdqa	%ymm8,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L64_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	je	.Ldone8x

	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm1,0(%rsp)
	leaq	64(%rdi),%rdi
	subq	$64,%rdx
	vmovdqa	%ymm5,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L128_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	je	.Ldone8x

	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm12,0(%rsp)
	leaq	128(%rdi),%rdi
	subq	$128,%rdx
	vmovdqa	%ymm13,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L192_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	je	.Ldone8x

	leaq	192(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm10,0(%rsp)
	leaq	192(%rdi),%rdi
	subq	$192,%rdx
	vmovdqa	%ymm15,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L256_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	je	.Ldone8x

	leaq	256(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm14,0(%rsp)
	leaq	256(%rdi),%rdi
	subq	$256,%rdx
	vmovdqa	%ymm2,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L320_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	je	.Ldone8x

	leaq	320(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm3,0(%rsp)
	leaq	320(%rdi),%rdi
	subq	$320,%rdx
	vmovdqa	%ymm7,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L384_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	je	.Ldone8x

	leaq	384(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm11,0(%rsp)
	leaq	384(%rdi),%rdi
	subq	$384,%rdx
	vmovdqa	%ymm9,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L448_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vpxor	384(%rsi),%ymm11,%ymm11
	vpxor	416(%rsi),%ymm9,%ymm9
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	vmovdqu	%ymm11,384(%rdi)
	vmovdqu	%ymm9,416(%rdi)
	je	.Ldone8x

	leaq	448(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm0,0(%rsp)
	leaq	448(%rdi),%rdi
	subq	$448,%rdx
	vmovdqa	%ymm4,32(%rsp)

.Loop_tail8x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail8x

.Ldone8x:
	vzeroall
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L8x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_8x,.-ChaCha20_8x