# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.extern	OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

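# Constant pool. .Lrot16 and .Lrot24 are pshufb masks that rotate each
# 32-bit lane left by 16 and 24 bits; .Lsigma is the ChaCha20 constant
# "expand 32-byte k"; the .Lone/.Linc/.Lfour/.Lincy/.Leight vectors step
# the per-block counters in the SIMD paths below.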
.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
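# The .byte string above spells the attribution "ChaCha20 for x86_64,
# CRYPTOGAMS by <appro@openssl.org>".
#
# void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
#                     const uint32_t key[8], const uint32_t counter[4]);
# so per the SysV ABI: out=%rdi, in=%rsi, in_len=%rdx, key=%rcx, counter=%r8.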
.globl	ChaCha20_ctr32
.hidden ChaCha20_ctr32
.type	ChaCha20_ctr32,@function
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmpq	$0,%rdx
	je	.Lno_data
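# Dispatch: bit 9 of the second dword of OPENSSL_ia32cap_P is the SSSE3
# feature flag; take the vector path when it is set, otherwise fall
# through to the scalar-register implementation below.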
	movq	OPENSSL_ia32cap_P+4(%rip),%r10
	testl	$512,%r10d
	jnz	.LChaCha20_ssse3

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	r15,-56
	subq	$64+24,%rsp
.cfi_adjust_cfa_offset	88
.Lctr32_body:


	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm4


	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp
	jmp	.Loop_outer

.align	32
.Loop_outer:
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	movq	%rbp,64+0(%rsp)
	movl	$10,%ebp
	movq	%rsi,64+8(%rsp)
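# Raw encoding of movq %xmm2,%rsi (the perlasm generator emits this
# xmm-to-GPR move as bytes for the benefit of older assemblers).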
.byte	102,72,15,126,214
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi
	jmp	.Loop

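# One iteration of .Loop is a double round (column round, then diagonal
# round) of the ChaCha20 quarter-round pattern:
#   a += b; d ^= a; d <<<= 16;  c += d; b ^= c; b <<<= 12;
#   a += b; d ^= a; d <<<= 8;   c += d; b ^= c; b <<<= 7;
# %ebp counts the 10 double rounds; state words x[8..11] are juggled
# through %esi/%edi and the stack because registers run out.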
.align	32
.Loop:
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	.Loop
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi
	paddd	%xmm4,%xmm3
	movq	64+16(%rsp),%rdi

	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1

	cmpq	$64,%rbp
	jb	.Ltail

	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

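# Partial final block: spill the full 64-byte keystream block to the
# stack, then XOR it into the output one byte at a time in .Loop_tail.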
.align	16
.Ltail:
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

.Loop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	.Loop_tail

.Ldone:
	leaq	64+24+48(%rsp),%rsi
	movq	-48(%rsi),%r15
.cfi_restore	r15
	movq	-40(%rsi),%r14
.cfi_restore	r14
	movq	-32(%rsi),%r13
.cfi_restore	r13
	movq	-24(%rsi),%r12
.cfi_restore	r12
	movq	-16(%rsi),%rbp
.cfi_restore	rbp
	movq	-8(%rsi),%rbx
.cfi_restore	rbx
	leaq	(%rsi),%rsp
.cfi_adjust_cfa_offset	-136
.Lno_data:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
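# ChaCha20_ssse3: one-block SSSE3 path. The state lives in %xmm0-%xmm3,
# one 4-dword row each; 16- and 24-bit rotates use pshufb with the
# .Lrot16/.Lrot24 masks, while 12- and 7-bit rotates use shift+or.
# (The .byte 0xf3,0xc3 returns throughout the file encode "rep ret".)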
.type	ChaCha20_ssse3,@function
.align	32
ChaCha20_ssse3:
.LChaCha20_ssse3:
.cfi_startproc
	movq	%rsp,%r9
.cfi_def_cfa_register	r9
	cmpq	$128,%rdx
	ja	.LChaCha20_4x

.Ldo_sse3_after_all:
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	.Loop_ssse3

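# In this loop the .byte sequences 102,15,56,0,222 and 102,15,56,0,223
# are raw encodings of pshufb %xmm6,%xmm3 and pshufb %xmm7,%xmm3 (rotate
# each dword of the "d" row left by 16 and 24 bits); the pshufd
# instructions rotate the b/c/d rows between column and diagonal rounds.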
.align	32
.Loop_ssse3:
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	.Loop_ssse3
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	.Ltail_ssse3

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8

.Loop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	rsp
.Lssse3_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
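# ChaCha20_4x: four blocks in parallel. The state is word-sliced: each
# xmm register holds one state word across all four blocks, so the four
# counters differ by .Linc = {0,1,2,3}. %r10 still carries ia32cap words
# 1-2 from the entry dispatch; after shifting, bit 5 of word 2 (AVX2)
# selects the 8x path.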
.type	ChaCha20_4x,@function
.align	32
ChaCha20_4x:
.LChaCha20_4x:
.cfi_startproc
	movq	%rsp,%r9
.cfi_def_cfa_register	r9
	movq	%r10,%r11
	shrq	$32,%r10
	testq	$32,%r10
	jnz	.LChaCha20_8x
	cmpq	$192,%rdx
	ja	.Lproceed4x

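# Heuristic carried over from the upstream perlasm: isolate the XSAVE
# and MOVBE bits (0x4400000) and check for MOVBE without XSAVE
# (0x400000), which identifies Atom-class cores; those are steered back
# to the one-block SSSE3 loop for mid-sized inputs.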
488	andq	$71303168,%r11
489	cmpq	$4194304,%r11
490	je	.Ldo_sse3_after_all
491
492.Lproceed4x:
493	subq	$0x140+8,%rsp
494	movdqa	.Lsigma(%rip),%xmm11
495	movdqu	(%rcx),%xmm15
496	movdqu	16(%rcx),%xmm7
497	movdqu	(%r8),%xmm3
498	leaq	256(%rsp),%rcx
499	leaq	.Lrot16(%rip),%r10
500	leaq	.Lrot24(%rip),%r11
501
502	pshufd	$0x00,%xmm11,%xmm8
503	pshufd	$0x55,%xmm11,%xmm9
504	movdqa	%xmm8,64(%rsp)
505	pshufd	$0xaa,%xmm11,%xmm10
506	movdqa	%xmm9,80(%rsp)
507	pshufd	$0xff,%xmm11,%xmm11
508	movdqa	%xmm10,96(%rsp)
509	movdqa	%xmm11,112(%rsp)
510
511	pshufd	$0x00,%xmm15,%xmm12
512	pshufd	$0x55,%xmm15,%xmm13
513	movdqa	%xmm12,128-256(%rcx)
514	pshufd	$0xaa,%xmm15,%xmm14
515	movdqa	%xmm13,144-256(%rcx)
516	pshufd	$0xff,%xmm15,%xmm15
517	movdqa	%xmm14,160-256(%rcx)
518	movdqa	%xmm15,176-256(%rcx)
519
520	pshufd	$0x00,%xmm7,%xmm4
521	pshufd	$0x55,%xmm7,%xmm5
522	movdqa	%xmm4,192-256(%rcx)
523	pshufd	$0xaa,%xmm7,%xmm6
524	movdqa	%xmm5,208-256(%rcx)
525	pshufd	$0xff,%xmm7,%xmm7
526	movdqa	%xmm6,224-256(%rcx)
527	movdqa	%xmm7,240-256(%rcx)
528
529	pshufd	$0x00,%xmm3,%xmm0
530	pshufd	$0x55,%xmm3,%xmm1
531	paddd	.Linc(%rip),%xmm0
532	pshufd	$0xaa,%xmm3,%xmm2
533	movdqa	%xmm1,272-256(%rcx)
534	pshufd	$0xff,%xmm3,%xmm3
535	movdqa	%xmm2,288-256(%rcx)
536	movdqa	%xmm3,304-256(%rcx)
537
538	jmp	.Loop_enter4x
539
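# Each outer iteration reloads the broadcast state from the stack and
# steps the four block counters by .Lfour = {4,4,4,4}.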
.align	32
.Loop_outer4x:
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	.Lfour(%rip),%xmm0

.Loop_enter4x:
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7
	movl	$10,%eax
	movdqa	%xmm0,256-256(%rcx)
	jmp	.Loop4x

.align	32
.Loop4x:
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199
.byte	102,15,56,0,207
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198
.byte	102,15,56,0,206
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm13
	movdqa	%xmm4,0(%rsp)
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215
.byte	102,15,56,0,223
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214
.byte	102,15,56,0,222
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223
.byte	102,15,56,0,199
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222
.byte	102,15,56,0,198
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207
.byte	102,15,56,0,215
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206
.byte	102,15,56,0,214
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	.Loop4x

	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

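# Transpose the word-sliced results back into four contiguous 64-byte
# blocks: punpck{l,h}dq interleaves dwords and punpck{l,h}qdq qwords,
# i.e. a 4x4 dword matrix transpose, one register quartet at a time.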
	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	cmpq	$256,%rdx
	jb	.Ltail4x

	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4x

	jmp	.Ldone4x

.Ltail4x:
	cmpq	$192,%rdx
	jae	.L192_or_more4x
	cmpq	$128,%rdx
	jae	.L128_or_more4x
	cmpq	$64,%rdx
	jae	.L64_or_more4x


	xorq	%r10,%r10

	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	je	.Ldone4x

	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

.Loop_tail4x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4x

.Ldone4x:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	rsp
.L4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_4x,.-ChaCha20_4x
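# ChaCha20_8x: AVX2 path, eight blocks in parallel in the ymm registers.
# The stack is realigned to 32 bytes; the eight counters start from
# .Lincy = {0,2,4,6,1,3,5,7} (block order interleaved across the two
# 128-bit lanes) and advance by .Leight per outer iteration.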
.type	ChaCha20_8x,@function
.align	32
ChaCha20_8x:
.LChaCha20_8x:
.cfi_startproc
	movq	%rsp,%r9
.cfi_def_cfa_register	r9
	subq	$0x280+8,%rsp
	andq	$-32,%rsp
	vzeroupper

	vbroadcasti128	.Lsigma(%rip),%ymm11
	vbroadcasti128	(%rcx),%ymm3
	vbroadcasti128	16(%rcx),%ymm15
	vbroadcasti128	(%r8),%ymm7
	leaq	256(%rsp),%rcx
	leaq	512(%rsp),%rax
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	vpshufd	$0x00,%ymm11,%ymm8
	vpshufd	$0x55,%ymm11,%ymm9
	vmovdqa	%ymm8,128-256(%rcx)
	vpshufd	$0xaa,%ymm11,%ymm10
	vmovdqa	%ymm9,160-256(%rcx)
	vpshufd	$0xff,%ymm11,%ymm11
	vmovdqa	%ymm10,192-256(%rcx)
	vmovdqa	%ymm11,224-256(%rcx)

	vpshufd	$0x00,%ymm3,%ymm0
	vpshufd	$0x55,%ymm3,%ymm1
	vmovdqa	%ymm0,256-256(%rcx)
	vpshufd	$0xaa,%ymm3,%ymm2
	vmovdqa	%ymm1,288-256(%rcx)
	vpshufd	$0xff,%ymm3,%ymm3
	vmovdqa	%ymm2,320-256(%rcx)
	vmovdqa	%ymm3,352-256(%rcx)

	vpshufd	$0x00,%ymm15,%ymm12
	vpshufd	$0x55,%ymm15,%ymm13
	vmovdqa	%ymm12,384-512(%rax)
	vpshufd	$0xaa,%ymm15,%ymm14
	vmovdqa	%ymm13,416-512(%rax)
	vpshufd	$0xff,%ymm15,%ymm15
	vmovdqa	%ymm14,448-512(%rax)
	vmovdqa	%ymm15,480-512(%rax)

	vpshufd	$0x00,%ymm7,%ymm4
	vpshufd	$0x55,%ymm7,%ymm5
	vpaddd	.Lincy(%rip),%ymm4,%ymm4
	vpshufd	$0xaa,%ymm7,%ymm6
	vmovdqa	%ymm5,544-512(%rax)
	vpshufd	$0xff,%ymm7,%ymm7
	vmovdqa	%ymm6,576-512(%rax)
	vmovdqa	%ymm7,608-512(%rax)

	jmp	.Loop_enter8x

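# Outer loop: reload the broadcast state words from the two
# stack-resident copies (%rcx and %rax regions) and advance all eight
# counters by .Leight.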
.align	32
.Loop_outer8x:
	vmovdqa	128-256(%rcx),%ymm8
	vmovdqa	160-256(%rcx),%ymm9
	vmovdqa	192-256(%rcx),%ymm10
	vmovdqa	224-256(%rcx),%ymm11
	vmovdqa	256-256(%rcx),%ymm0
	vmovdqa	288-256(%rcx),%ymm1
	vmovdqa	320-256(%rcx),%ymm2
	vmovdqa	352-256(%rcx),%ymm3
	vmovdqa	384-512(%rax),%ymm12
	vmovdqa	416-512(%rax),%ymm13
	vmovdqa	448-512(%rax),%ymm14
	vmovdqa	480-512(%rax),%ymm15
	vmovdqa	512-512(%rax),%ymm4
	vmovdqa	544-512(%rax),%ymm5
	vmovdqa	576-512(%rax),%ymm6
	vmovdqa	608-512(%rax),%ymm7
	vpaddd	.Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
	vmovdqa	%ymm14,64(%rsp)
	vmovdqa	%ymm15,96(%rsp)
	vbroadcasti128	(%r10),%ymm15
	vmovdqa	%ymm4,512-512(%rax)
	movl	$10,%eax
	jmp	.Loop8x

.align	32
.Loop8x:
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$12,%ymm0,%ymm14
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$12,%ymm1,%ymm15
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$7,%ymm0,%ymm15
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$7,%ymm1,%ymm14
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vmovdqa	%ymm12,0(%rsp)
	vmovdqa	%ymm13,32(%rsp)
	vmovdqa	64(%rsp),%ymm12
	vmovdqa	96(%rsp),%ymm13
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$12,%ymm2,%ymm14
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$12,%ymm3,%ymm15
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$7,%ymm2,%ymm15
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$7,%ymm3,%ymm14
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$12,%ymm1,%ymm14
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$12,%ymm2,%ymm15
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$7,%ymm1,%ymm15
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$7,%ymm2,%ymm14
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vmovdqa	%ymm12,64(%rsp)
	vmovdqa	%ymm13,96(%rsp)
	vmovdqa	0(%rsp),%ymm12
	vmovdqa	32(%rsp),%ymm13
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$12,%ymm3,%ymm14
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$12,%ymm0,%ymm15
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$7,%ymm3,%ymm15
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$7,%ymm0,%ymm14
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	decl	%eax
	jnz	.Loop8x

	leaq	512(%rsp),%rax
	vpaddd	128-256(%rcx),%ymm8,%ymm8
	vpaddd	160-256(%rcx),%ymm9,%ymm9
	vpaddd	192-256(%rcx),%ymm10,%ymm10
	vpaddd	224-256(%rcx),%ymm11,%ymm11

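# Merge back to byte order: vpunpck transposes dwords/qwords within each
# 128-bit lane, then vperm2i128 recombines the low/high lanes so each
# ymm register holds 32 contiguous keystream bytes.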
	vpunpckldq	%ymm9,%ymm8,%ymm14
	vpunpckldq	%ymm11,%ymm10,%ymm15
	vpunpckhdq	%ymm9,%ymm8,%ymm8
	vpunpckhdq	%ymm11,%ymm10,%ymm10
	vpunpcklqdq	%ymm15,%ymm14,%ymm9
	vpunpckhqdq	%ymm15,%ymm14,%ymm14
	vpunpcklqdq	%ymm10,%ymm8,%ymm11
	vpunpckhqdq	%ymm10,%ymm8,%ymm8
	vpaddd	256-256(%rcx),%ymm0,%ymm0
	vpaddd	288-256(%rcx),%ymm1,%ymm1
	vpaddd	320-256(%rcx),%ymm2,%ymm2
	vpaddd	352-256(%rcx),%ymm3,%ymm3

	vpunpckldq	%ymm1,%ymm0,%ymm10
	vpunpckldq	%ymm3,%ymm2,%ymm15
	vpunpckhdq	%ymm1,%ymm0,%ymm0
	vpunpckhdq	%ymm3,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm10,%ymm1
	vpunpckhqdq	%ymm15,%ymm10,%ymm10
	vpunpcklqdq	%ymm2,%ymm0,%ymm3
	vpunpckhqdq	%ymm2,%ymm0,%ymm0
	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
	vmovdqa	%ymm15,0(%rsp)
	vmovdqa	%ymm9,32(%rsp)
	vmovdqa	64(%rsp),%ymm15
	vmovdqa	96(%rsp),%ymm9

	vpaddd	384-512(%rax),%ymm12,%ymm12
	vpaddd	416-512(%rax),%ymm13,%ymm13
	vpaddd	448-512(%rax),%ymm15,%ymm15
	vpaddd	480-512(%rax),%ymm9,%ymm9

	vpunpckldq	%ymm13,%ymm12,%ymm2
	vpunpckldq	%ymm9,%ymm15,%ymm8
	vpunpckhdq	%ymm13,%ymm12,%ymm12
	vpunpckhdq	%ymm9,%ymm15,%ymm15
	vpunpcklqdq	%ymm8,%ymm2,%ymm13
	vpunpckhqdq	%ymm8,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm12,%ymm9
	vpunpckhqdq	%ymm15,%ymm12,%ymm12
	vpaddd	512-512(%rax),%ymm4,%ymm4
	vpaddd	544-512(%rax),%ymm5,%ymm5
	vpaddd	576-512(%rax),%ymm6,%ymm6
	vpaddd	608-512(%rax),%ymm7,%ymm7

	vpunpckldq	%ymm5,%ymm4,%ymm15
	vpunpckldq	%ymm7,%ymm6,%ymm8
	vpunpckhdq	%ymm5,%ymm4,%ymm4
	vpunpckhdq	%ymm7,%ymm6,%ymm6
	vpunpcklqdq	%ymm8,%ymm15,%ymm5
	vpunpckhqdq	%ymm8,%ymm15,%ymm15
	vpunpcklqdq	%ymm6,%ymm4,%ymm7
	vpunpckhqdq	%ymm6,%ymm4,%ymm4
	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
	vmovdqa	0(%rsp),%ymm6
	vmovdqa	32(%rsp),%ymm12

	cmpq	$512,%rdx
	jb	.Ltail8x

	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm12,%ymm12
	vpxor	32(%rsi),%ymm13,%ymm13
	vpxor	64(%rsi),%ymm10,%ymm10
	vpxor	96(%rsi),%ymm15,%ymm15
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm12,0(%rdi)
	vmovdqu	%ymm13,32(%rdi)
	vmovdqu	%ymm10,64(%rdi)
	vmovdqu	%ymm15,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm14,%ymm14
	vpxor	32(%rsi),%ymm2,%ymm2
	vpxor	64(%rsi),%ymm3,%ymm3
	vpxor	96(%rsi),%ymm7,%ymm7
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm14,0(%rdi)
	vmovdqu	%ymm2,32(%rdi)
	vmovdqu	%ymm3,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm11,%ymm11
	vpxor	32(%rsi),%ymm9,%ymm9
	vpxor	64(%rsi),%ymm0,%ymm0
	vpxor	96(%rsi),%ymm4,%ymm4
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm11,0(%rdi)
	vmovdqu	%ymm9,32(%rdi)
	vmovdqu	%ymm0,64(%rdi)
	vmovdqu	%ymm4,96(%rdi)
	leaq	128(%rdi),%rdi

	subq	$512,%rdx
	jnz	.Loop_outer8x

	jmp	.Ldone8x

.Ltail8x:
	cmpq	$448,%rdx
	jae	.L448_or_more8x
	cmpq	$384,%rdx
	jae	.L384_or_more8x
	cmpq	$320,%rdx
	jae	.L320_or_more8x
	cmpq	$256,%rdx
	jae	.L256_or_more8x
	cmpq	$192,%rdx
	jae	.L192_or_more8x
	cmpq	$128,%rdx
	jae	.L128_or_more8x
	cmpq	$64,%rdx
	jae	.L64_or_more8x

	xorq	%r10,%r10
	vmovdqa	%ymm6,0(%rsp)
	vmovdqa	%ymm8,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L64_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	je	.Ldone8x

	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm1,0(%rsp)
	leaq	64(%rdi),%rdi
	subq	$64,%rdx
	vmovdqa	%ymm5,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L128_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	je	.Ldone8x

	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm12,0(%rsp)
	leaq	128(%rdi),%rdi
	subq	$128,%rdx
	vmovdqa	%ymm13,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L192_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	je	.Ldone8x

	leaq	192(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm10,0(%rsp)
	leaq	192(%rdi),%rdi
	subq	$192,%rdx
	vmovdqa	%ymm15,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L256_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	je	.Ldone8x

	leaq	256(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm14,0(%rsp)
	leaq	256(%rdi),%rdi
	subq	$256,%rdx
	vmovdqa	%ymm2,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L320_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	je	.Ldone8x

	leaq	320(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm3,0(%rsp)
	leaq	320(%rdi),%rdi
	subq	$320,%rdx
	vmovdqa	%ymm7,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L384_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	je	.Ldone8x

	leaq	384(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm11,0(%rsp)
	leaq	384(%rdi),%rdi
	subq	$384,%rdx
	vmovdqa	%ymm9,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L448_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vpxor	384(%rsi),%ymm11,%ymm11
	vpxor	416(%rsi),%ymm9,%ymm9
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	vmovdqu	%ymm11,384(%rdi)
	vmovdqu	%ymm9,416(%rdi)
	je	.Ldone8x

	leaq	448(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm0,0(%rsp)
	leaq	448(%rdi),%rdi
	subq	$448,%rdx
	vmovdqa	%ymm4,32(%rsp)

.Loop_tail8x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail8x

.Ldone8x:
	vzeroall
	leaq	(%r9),%rsp
.cfi_def_cfa_register	rsp
.L8x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_8x,.-ChaCha20_8x
#endif