• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# This file is generated from a similarly-named Perl script in the BoringSSL
2# source tree. Do not edit by hand.
3
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
11.text
12
13.extern	GFp_ia32cap_P
14.hidden GFp_ia32cap_P
15
# Read-only constants for all ChaCha20 code paths below.
16.align	64
# 128-bit zero vector.
17.Lzero:
18.long	0,0,0,0
# Counter increment of 1 (low lane only) for the 1x paths.
19.Lone:
20.long	1,0,0,0
# Per-lane counter offsets 0..3 for the 4x path.
21.Linc:
22.long	0,1,2,3
# Counter step of 4 per lane (4 blocks per 4x iteration).
23.Lfour:
24.long	4,4,4,4
# Interleaved counter offsets for the 8-way (AVX2) path.
25.Lincy:
26.long	0,2,4,6,1,3,5,7
# Counter step of 8 per lane (8 blocks per 8x iteration).
27.Leight:
28.long	8,8,8,8,8,8,8,8
# pshufb mask: rotate each 32-bit lane left by 16 bits.
29.Lrot16:
30.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
# pshufb mask: rotate each 32-bit lane left by 8 bits (used for the
# 8-bit step of the 16/12/8/7 rotation schedule; named rot24 upstream).
31.Lrot24:
32.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
# ASCII "expand 32-byte k" — the standard ChaCha20 sigma constant row.
33.Lsigma:
34.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
35.align	64
# Expanded (per-128-bit-lane) counter offsets/steps; not referenced by
# the routines visible in this chunk — presumably used by an AVX-512
# path elsewhere in the file. TODO confirm.
36.Lzeroz:
37.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
38.Lfourz:
39.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
40.Lincz:
41.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
42.Lsixteen:
43.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
# ASCII identification string:
# "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
44.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
#-----------------------------------------------------------------------
# GFp_ChaCha20_ctr32 — scalar (integer-register) ChaCha20 XOR stream.
# Register use visible below (SysV AMD64):
#   rdi = output pointer, rsi = input pointer, rdx = length in bytes,
#   rcx = 32-byte key, r8 = 16-byte counter/nonce block.
# (Matches BoringSSL's ChaCha20_ctr32(out,in,in_len,key,counter) —
#  TODO confirm against the C declaration.)
# Dispatches to the SSSE3 path when bit 9 ($512) of
# GFp_ia32cap_P+4 is set; returns immediately for zero length.
#-----------------------------------------------------------------------
45.globl	GFp_ChaCha20_ctr32
46.hidden GFp_ChaCha20_ctr32
47.type	GFp_ChaCha20_ctr32,@function
48.align	64
49GFp_ChaCha20_ctr32:
50.cfi_startproc
# Zero-length request: nothing to do.
51	cmpq	$0,%rdx
52	je	.Lno_data
# CPU feature dispatch (SSSE3 capability bit in GFp_ia32cap_P).
53	movq	GFp_ia32cap_P+4(%rip),%r10
54	testl	$512,%r10d
55	jnz	.LChaCha20_ssse3
56
# Save all six callee-saved GPRs (CFI kept in sync), then reserve
# 64 bytes of state scratch + 24 bytes of spill space.
57	pushq	%rbx
58.cfi_adjust_cfa_offset	8
59.cfi_offset	rbx,-16
60	pushq	%rbp
61.cfi_adjust_cfa_offset	8
62.cfi_offset	rbp,-24
63	pushq	%r12
64.cfi_adjust_cfa_offset	8
65.cfi_offset	r12,-32
66	pushq	%r13
67.cfi_adjust_cfa_offset	8
68.cfi_offset	r13,-40
69	pushq	%r14
70.cfi_adjust_cfa_offset	8
71.cfi_offset	r14,-48
72	pushq	%r15
73.cfi_adjust_cfa_offset	8
74.cfi_offset	r15,-56
75	subq	$64+24,%rsp
76.cfi_adjust_cfa_offset	88
77.Lctr32_body:
78
79
# Load key (xmm1/xmm2) and counter block (xmm3); xmm4 = counter step 1.
80	movdqu	(%rcx),%xmm1
81	movdqu	16(%rcx),%xmm2
82	movdqu	(%r8),%xmm3
83	movdqa	.Lone(%rip),%xmm4
84
85
# Stash state rows 1..3 at rsp+16/32/48; rbp = remaining byte count.
86	movdqa	%xmm1,16(%rsp)
87	movdqa	%xmm2,32(%rsp)
88	movdqa	%xmm3,48(%rsp)
89	movq	%rdx,%rbp
90	jmp	.Loop_outer
91
92.align	32
# One 64-byte block per iteration. Twelve of the sixteen state words
# live in GPRs (eax..edx = sigma row, r8d..r11d = key row 0,
# r12d..r15d = counter row); the remaining key-row words rotate
# through esi/edi and 32..44(%rsp) during the rounds.
93.Loop_outer:
94	movl	$0x61707865,%eax
95	movl	$0x3320646e,%ebx
96	movl	$0x79622d32,%ecx
97	movl	$0x6b206574,%edx
98	movl	16(%rsp),%r8d
99	movl	20(%rsp),%r9d
100	movl	24(%rsp),%r10d
101	movl	28(%rsp),%r11d
102	movd	%xmm3,%r12d
103	movl	52(%rsp),%r13d
104	movl	56(%rsp),%r14d
105	movl	60(%rsp),%r15d
106
# Spill remaining length, in and out pointers; ebp becomes the
# double-round loop counter (10 iterations = 20 rounds).
107	movq	%rbp,64+0(%rsp)
108	movl	$10,%ebp
109	movq	%rsi,64+8(%rsp)
# .byte-encoded movq %xmm2,%rsi (66 48 0F 7E D6): second key row
# into rsi/rdi as two halves.
110.byte	102,72,15,126,214
111	movq	%rdi,64+16(%rsp)
112	movq	%rsi,%rdi
113	shrq	$32,%rdi
114	jmp	.Loop
115
116.align	32
# Double round: four column quarter-rounds then four diagonal
# quarter-rounds, with the classic 16/12/8/7 rotations.
117.Loop:
118	addl	%r8d,%eax
119	xorl	%eax,%r12d
120	roll	$16,%r12d
121	addl	%r9d,%ebx
122	xorl	%ebx,%r13d
123	roll	$16,%r13d
124	addl	%r12d,%esi
125	xorl	%esi,%r8d
126	roll	$12,%r8d
127	addl	%r13d,%edi
128	xorl	%edi,%r9d
129	roll	$12,%r9d
130	addl	%r8d,%eax
131	xorl	%eax,%r12d
132	roll	$8,%r12d
133	addl	%r9d,%ebx
134	xorl	%ebx,%r13d
135	roll	$8,%r13d
136	addl	%r12d,%esi
137	xorl	%esi,%r8d
138	roll	$7,%r8d
139	addl	%r13d,%edi
140	xorl	%edi,%r9d
141	roll	$7,%r9d
# Swap which pair of middle-row words lives in esi/edi.
142	movl	%esi,32(%rsp)
143	movl	%edi,36(%rsp)
144	movl	40(%rsp),%esi
145	movl	44(%rsp),%edi
146	addl	%r10d,%ecx
147	xorl	%ecx,%r14d
148	roll	$16,%r14d
149	addl	%r11d,%edx
150	xorl	%edx,%r15d
151	roll	$16,%r15d
152	addl	%r14d,%esi
153	xorl	%esi,%r10d
154	roll	$12,%r10d
155	addl	%r15d,%edi
156	xorl	%edi,%r11d
157	roll	$12,%r11d
158	addl	%r10d,%ecx
159	xorl	%ecx,%r14d
160	roll	$8,%r14d
161	addl	%r11d,%edx
162	xorl	%edx,%r15d
163	roll	$8,%r15d
164	addl	%r14d,%esi
165	xorl	%esi,%r10d
166	roll	$7,%r10d
167	addl	%r15d,%edi
168	xorl	%edi,%r11d
169	roll	$7,%r11d
# Diagonal quarter-rounds.
170	addl	%r9d,%eax
171	xorl	%eax,%r15d
172	roll	$16,%r15d
173	addl	%r10d,%ebx
174	xorl	%ebx,%r12d
175	roll	$16,%r12d
176	addl	%r15d,%esi
177	xorl	%esi,%r9d
178	roll	$12,%r9d
179	addl	%r12d,%edi
180	xorl	%edi,%r10d
181	roll	$12,%r10d
182	addl	%r9d,%eax
183	xorl	%eax,%r15d
184	roll	$8,%r15d
185	addl	%r10d,%ebx
186	xorl	%ebx,%r12d
187	roll	$8,%r12d
188	addl	%r15d,%esi
189	xorl	%esi,%r9d
190	roll	$7,%r9d
191	addl	%r12d,%edi
192	xorl	%edi,%r10d
193	roll	$7,%r10d
194	movl	%esi,40(%rsp)
195	movl	%edi,44(%rsp)
196	movl	32(%rsp),%esi
197	movl	36(%rsp),%edi
198	addl	%r11d,%ecx
199	xorl	%ecx,%r13d
200	roll	$16,%r13d
201	addl	%r8d,%edx
202	xorl	%edx,%r14d
203	roll	$16,%r14d
204	addl	%r13d,%esi
205	xorl	%esi,%r11d
206	roll	$12,%r11d
207	addl	%r14d,%edi
208	xorl	%edi,%r8d
209	roll	$12,%r8d
210	addl	%r11d,%ecx
211	xorl	%ecx,%r13d
212	roll	$8,%r13d
213	addl	%r8d,%edx
214	xorl	%edx,%r14d
215	roll	$8,%r14d
216	addl	%r13d,%esi
217	xorl	%esi,%r11d
218	roll	$7,%r11d
219	addl	%r14d,%edi
220	xorl	%edi,%r8d
221	roll	$7,%r8d
222	decl	%ebp
223	jnz	.Loop
# Rounds done: reload spilled length/pointers, bump the counter in
# xmm3 by one, and add the original state back in (feed-forward).
224	movl	%edi,36(%rsp)
225	movl	%esi,32(%rsp)
226	movq	64(%rsp),%rbp
227	movdqa	%xmm2,%xmm1
228	movq	64+8(%rsp),%rsi
229	paddd	%xmm4,%xmm3
230	movq	64+16(%rsp),%rdi
231
232	addl	$0x61707865,%eax
233	addl	$0x3320646e,%ebx
234	addl	$0x79622d32,%ecx
235	addl	$0x6b206574,%edx
236	addl	16(%rsp),%r8d
237	addl	20(%rsp),%r9d
238	addl	24(%rsp),%r10d
239	addl	28(%rsp),%r11d
240	addl	48(%rsp),%r12d
241	addl	52(%rsp),%r13d
242	addl	56(%rsp),%r14d
243	addl	60(%rsp),%r15d
244	paddd	32(%rsp),%xmm1
245
# Partial final block goes through the byte-wise tail.
246	cmpq	$64,%rbp
247	jb	.Ltail
248
# Full block: XOR 64 bytes of keystream into out (words 8..11 via xmm0).
249	xorl	0(%rsi),%eax
250	xorl	4(%rsi),%ebx
251	xorl	8(%rsi),%ecx
252	xorl	12(%rsi),%edx
253	xorl	16(%rsi),%r8d
254	xorl	20(%rsi),%r9d
255	xorl	24(%rsi),%r10d
256	xorl	28(%rsi),%r11d
257	movdqu	32(%rsi),%xmm0
258	xorl	48(%rsi),%r12d
259	xorl	52(%rsi),%r13d
260	xorl	56(%rsi),%r14d
261	xorl	60(%rsi),%r15d
262	leaq	64(%rsi),%rsi
263	pxor	%xmm1,%xmm0
264
# Restore rows 2/3 scratch for the next outer iteration.
265	movdqa	%xmm2,32(%rsp)
266	movd	%xmm3,48(%rsp)
267
268	movl	%eax,0(%rdi)
269	movl	%ebx,4(%rdi)
270	movl	%ecx,8(%rdi)
271	movl	%edx,12(%rdi)
272	movl	%r8d,16(%rdi)
273	movl	%r9d,20(%rdi)
274	movl	%r10d,24(%rdi)
275	movl	%r11d,28(%rdi)
276	movdqu	%xmm0,32(%rdi)
277	movl	%r12d,48(%rdi)
278	movl	%r13d,52(%rdi)
279	movl	%r14d,56(%rdi)
280	movl	%r15d,60(%rdi)
281	leaq	64(%rdi),%rdi
282
283	subq	$64,%rbp
284	jnz	.Loop_outer
285
286	jmp	.Ldone
287
288.align	16
# Tail: materialize the full 64-byte keystream block on the stack,
# then XOR it into the output one byte at a time (rbp < 64 bytes left).
289.Ltail:
290	movl	%eax,0(%rsp)
291	movl	%ebx,4(%rsp)
292	xorq	%rbx,%rbx
293	movl	%ecx,8(%rsp)
294	movl	%edx,12(%rsp)
295	movl	%r8d,16(%rsp)
296	movl	%r9d,20(%rsp)
297	movl	%r10d,24(%rsp)
298	movl	%r11d,28(%rsp)
299	movdqa	%xmm1,32(%rsp)
300	movl	%r12d,48(%rsp)
301	movl	%r13d,52(%rsp)
302	movl	%r14d,56(%rsp)
303	movl	%r15d,60(%rsp)
304
305.Loop_tail:
306	movzbl	(%rsi,%rbx,1),%eax
307	movzbl	(%rsp,%rbx,1),%edx
308	leaq	1(%rbx),%rbx
309	xorl	%edx,%eax
310	movb	%al,-1(%rdi,%rbx,1)
311	decq	%rbp
312	jnz	.Loop_tail
313
# Epilogue: restore callee-saved registers from above the scratch area
# and rewind rsp in one step.
314.Ldone:
315	leaq	64+24+48(%rsp),%rsi
316	movq	-48(%rsi),%r15
317.cfi_restore	r15
318	movq	-40(%rsi),%r14
319.cfi_restore	r14
320	movq	-32(%rsi),%r13
321.cfi_restore	r13
322	movq	-24(%rsi),%r12
323.cfi_restore	r12
324	movq	-16(%rsi),%rbp
325.cfi_restore	rbp
326	movq	-8(%rsi),%rbx
327.cfi_restore	rbx
328	leaq	(%rsi),%rsp
329.cfi_adjust_cfa_offset	-136
330.Lno_data:
# 0xf3,0xc3 = "rep ret" (two-byte return, classic AMD branch-predictor idiom).
331	.byte	0xf3,0xc3
332.cfi_endproc
333.size	GFp_ChaCha20_ctr32,.-GFp_ChaCha20_ctr32
#-----------------------------------------------------------------------
# ChaCha20_ssse3 — one-block-at-a-time SSSE3 path (reached from
# GFp_ChaCha20_ctr32's dispatch; same register contract:
# rdi=out, rsi=in, rdx=len, rcx=key, r8=counter block).
# Keeps the 4x16-byte state in xmm0..xmm3 and uses pshufb with the
# .Lrot16/.Lrot24 masks (xmm6/xmm7) for the 16- and 8-bit rotates.
# Lengths > 128 bytes are handed to the 4-way path.
# r9 preserves the caller's rsp (CFA retargeted onto r9).
#-----------------------------------------------------------------------
334.type	ChaCha20_ssse3,@function
335.align	32
336ChaCha20_ssse3:
337.LChaCha20_ssse3:
338.cfi_startproc
339	movq	%rsp,%r9
340.cfi_def_cfa_register	r9
341	cmpq	$128,%rdx
342	ja	.LChaCha20_4x
343
# Entry point also used by ChaCha20_4x for short inputs on some CPUs.
344.Ldo_sse3_after_all:
345	subq	$64+8,%rsp
346	movdqa	.Lsigma(%rip),%xmm0
347	movdqu	(%rcx),%xmm1
348	movdqu	16(%rcx),%xmm2
349	movdqu	(%r8),%xmm3
350	movdqa	.Lrot16(%rip),%xmm6
351	movdqa	.Lrot24(%rip),%xmm7
352
# Save the pristine input state at 0..48(%rsp) for the feed-forward;
# r8 becomes the double-round counter (10 => 20 rounds).
353	movdqa	%xmm0,0(%rsp)
354	movdqa	%xmm1,16(%rsp)
355	movdqa	%xmm2,32(%rsp)
356	movdqa	%xmm3,48(%rsp)
357	movq	$10,%r8
358	jmp	.Loop_ssse3
359
360.align	32
# Next 64-byte block: reload saved state and advance the counter by 1.
361.Loop_outer_ssse3:
362	movdqa	.Lone(%rip),%xmm3
363	movdqa	0(%rsp),%xmm0
364	movdqa	16(%rsp),%xmm1
365	movdqa	32(%rsp),%xmm2
366	paddd	48(%rsp),%xmm3
367	movq	$10,%r8
368	movdqa	%xmm3,48(%rsp)
369	jmp	.Loop_ssse3
370
371.align	32
# Double round on whole rows. The .byte sequences are pshufb:
# 102,15,56,0,222 = pshufb %xmm6,%xmm3 (rot 16);
# 102,15,56,0,223 = pshufb %xmm7,%xmm3 (rot 8).
# pshufd re-diagonalizes rows between the two half-rounds.
372.Loop_ssse3:
373	paddd	%xmm1,%xmm0
374	pxor	%xmm0,%xmm3
375.byte	102,15,56,0,222
376	paddd	%xmm3,%xmm2
377	pxor	%xmm2,%xmm1
378	movdqa	%xmm1,%xmm4
379	psrld	$20,%xmm1
380	pslld	$12,%xmm4
381	por	%xmm4,%xmm1
382	paddd	%xmm1,%xmm0
383	pxor	%xmm0,%xmm3
384.byte	102,15,56,0,223
385	paddd	%xmm3,%xmm2
386	pxor	%xmm2,%xmm1
387	movdqa	%xmm1,%xmm4
388	psrld	$25,%xmm1
389	pslld	$7,%xmm4
390	por	%xmm4,%xmm1
391	pshufd	$78,%xmm2,%xmm2
392	pshufd	$57,%xmm1,%xmm1
393	pshufd	$147,%xmm3,%xmm3
394	nop
395	paddd	%xmm1,%xmm0
396	pxor	%xmm0,%xmm3
397.byte	102,15,56,0,222
398	paddd	%xmm3,%xmm2
399	pxor	%xmm2,%xmm1
400	movdqa	%xmm1,%xmm4
401	psrld	$20,%xmm1
402	pslld	$12,%xmm4
403	por	%xmm4,%xmm1
404	paddd	%xmm1,%xmm0
405	pxor	%xmm0,%xmm3
406.byte	102,15,56,0,223
407	paddd	%xmm3,%xmm2
408	pxor	%xmm2,%xmm1
409	movdqa	%xmm1,%xmm4
410	psrld	$25,%xmm1
411	pslld	$7,%xmm4
412	por	%xmm4,%xmm1
413	pshufd	$78,%xmm2,%xmm2
414	pshufd	$147,%xmm1,%xmm1
415	pshufd	$57,%xmm3,%xmm3
416	decq	%r8
417	jnz	.Loop_ssse3
# Feed-forward: add the saved input state back in.
418	paddd	0(%rsp),%xmm0
419	paddd	16(%rsp),%xmm1
420	paddd	32(%rsp),%xmm2
421	paddd	48(%rsp),%xmm3
422
423	cmpq	$64,%rdx
424	jb	.Ltail_ssse3
425
# Full 64-byte block: XOR keystream with input, store, advance.
426	movdqu	0(%rsi),%xmm4
427	movdqu	16(%rsi),%xmm5
428	pxor	%xmm4,%xmm0
429	movdqu	32(%rsi),%xmm4
430	pxor	%xmm5,%xmm1
431	movdqu	48(%rsi),%xmm5
432	leaq	64(%rsi),%rsi
433	pxor	%xmm4,%xmm2
434	pxor	%xmm5,%xmm3
435
436	movdqu	%xmm0,0(%rdi)
437	movdqu	%xmm1,16(%rdi)
438	movdqu	%xmm2,32(%rdi)
439	movdqu	%xmm3,48(%rdi)
440	leaq	64(%rdi),%rdi
441
442	subq	$64,%rdx
443	jnz	.Loop_outer_ssse3
444
445	jmp	.Ldone_ssse3
446
447.align	16
# Partial block: dump the keystream to the stack and XOR byte-wise.
448.Ltail_ssse3:
449	movdqa	%xmm0,0(%rsp)
450	movdqa	%xmm1,16(%rsp)
451	movdqa	%xmm2,32(%rsp)
452	movdqa	%xmm3,48(%rsp)
453	xorq	%r8,%r8
454
455.Loop_tail_ssse3:
456	movzbl	(%rsi,%r8,1),%eax
457	movzbl	(%rsp,%r8,1),%ecx
458	leaq	1(%r8),%r8
459	xorl	%ecx,%eax
460	movb	%al,-1(%rdi,%r8,1)
461	decq	%rdx
462	jnz	.Loop_tail_ssse3
463
464.Ldone_ssse3:
# Restore the caller's stack pointer saved in r9.
465	leaq	(%r9),%rsp
466.cfi_def_cfa_register	rsp
467.Lssse3_epilogue:
468	.byte	0xf3,0xc3
469.cfi_endproc
470.size	ChaCha20_ssse3,.-ChaCha20_ssse3
#-----------------------------------------------------------------------
# ChaCha20_4x — four-blocks-in-parallel SSE path (256 bytes per outer
# iteration). Same register contract as the scalar entry:
# rdi=out, rsi=in, rdx=len, rcx=key, r8=counter block; r10 holds the
# capability word loaded by the dispatcher, r9 saves the caller's rsp.
# State is kept "transposed": each xmm register holds one state word
# across the four parallel blocks (built with pshufd broadcasts), and
# is transposed back with punpck*/punpck*qdq before output.
#-----------------------------------------------------------------------
471.type	ChaCha20_4x,@function
472.align	32
473ChaCha20_4x:
474.LChaCha20_4x:
475.cfi_startproc
476	movq	%rsp,%r9
477.cfi_def_cfa_register	r9
# Upper half of the capability word: bit 5 set => take the 8x AVX2
# path (presumably the AVX2 feature bit — TODO confirm against
# GFp_ia32cap_P layout).
478	movq	%r10,%r11
479	shrq	$32,%r10
480	testq	$32,%r10
481	jnz	.LChaCha20_8x
482	cmpq	$192,%rdx
483	ja	.Lproceed4x
484
# For <=192 bytes on one specific CPU family (mask 0x4400000 against
# 0x400000 of the low capability word — looks like a model check,
# TODO confirm), the 1x SSSE3 loop is faster; reuse it.
485	andq	$71303168,%r11
486	cmpq	$4194304,%r11
487	je	.Ldo_sse3_after_all
488
.Lproceed4x-note-removed:
489.Lproceed4x:
490	subq	$0x140+8,%rsp
# Broadcast each of the 16 state words into its own 4-lane register;
# sigma words go to 64..112(%rsp), key/counter words to rcx-relative
# slots (rcx = rsp+256 so displacements stay small).
491	movdqa	.Lsigma(%rip),%xmm11
492	movdqu	(%rcx),%xmm15
493	movdqu	16(%rcx),%xmm7
494	movdqu	(%r8),%xmm3
495	leaq	256(%rsp),%rcx
496	leaq	.Lrot16(%rip),%r10
497	leaq	.Lrot24(%rip),%r11
498
499	pshufd	$0x00,%xmm11,%xmm8
500	pshufd	$0x55,%xmm11,%xmm9
501	movdqa	%xmm8,64(%rsp)
502	pshufd	$0xaa,%xmm11,%xmm10
503	movdqa	%xmm9,80(%rsp)
504	pshufd	$0xff,%xmm11,%xmm11
505	movdqa	%xmm10,96(%rsp)
506	movdqa	%xmm11,112(%rsp)
507
508	pshufd	$0x00,%xmm15,%xmm12
509	pshufd	$0x55,%xmm15,%xmm13
510	movdqa	%xmm12,128-256(%rcx)
511	pshufd	$0xaa,%xmm15,%xmm14
512	movdqa	%xmm13,144-256(%rcx)
513	pshufd	$0xff,%xmm15,%xmm15
514	movdqa	%xmm14,160-256(%rcx)
515	movdqa	%xmm15,176-256(%rcx)
516
517	pshufd	$0x00,%xmm7,%xmm4
518	pshufd	$0x55,%xmm7,%xmm5
519	movdqa	%xmm4,192-256(%rcx)
520	pshufd	$0xaa,%xmm7,%xmm6
521	movdqa	%xmm5,208-256(%rcx)
522	pshufd	$0xff,%xmm7,%xmm7
523	movdqa	%xmm6,224-256(%rcx)
524	movdqa	%xmm7,240-256(%rcx)
525
# Counter word gets per-lane offsets 0..3 from .Linc.
526	pshufd	$0x00,%xmm3,%xmm0
527	pshufd	$0x55,%xmm3,%xmm1
528	paddd	.Linc(%rip),%xmm0
529	pshufd	$0xaa,%xmm3,%xmm2
530	movdqa	%xmm1,272-256(%rcx)
531	pshufd	$0xff,%xmm3,%xmm3
532	movdqa	%xmm2,288-256(%rcx)
533	movdqa	%xmm3,304-256(%rcx)
534
535	jmp	.Loop_enter4x
536
537.align	32
# Next 256-byte chunk: reload all broadcast state, counter += 4.
538.Loop_outer4x:
539	movdqa	64(%rsp),%xmm8
540	movdqa	80(%rsp),%xmm9
541	movdqa	96(%rsp),%xmm10
542	movdqa	112(%rsp),%xmm11
543	movdqa	128-256(%rcx),%xmm12
544	movdqa	144-256(%rcx),%xmm13
545	movdqa	160-256(%rcx),%xmm14
546	movdqa	176-256(%rcx),%xmm15
547	movdqa	192-256(%rcx),%xmm4
548	movdqa	208-256(%rcx),%xmm5
549	movdqa	224-256(%rcx),%xmm6
550	movdqa	240-256(%rcx),%xmm7
551	movdqa	256-256(%rcx),%xmm0
552	movdqa	272-256(%rcx),%xmm1
553	movdqa	288-256(%rcx),%xmm2
554	movdqa	304-256(%rcx),%xmm3
555	paddd	.Lfour(%rip),%xmm0
556
557.Loop_enter4x:
# Two state words park at 32/48(%rsp); xmm7 holds the rot16 pshufb
# mask (from r10); eax counts 10 double rounds.
558	movdqa	%xmm6,32(%rsp)
559	movdqa	%xmm7,48(%rsp)
560	movdqa	(%r10),%xmm7
561	movl	$10,%eax
562	movdqa	%xmm0,256-256(%rcx)
563	jmp	.Loop4x
564
565.align	32
# Double round over four interleaved blocks. The .byte sequences are
# pshufb with the rot16 ((%r10)) / rot8 ((%r11)) masks; the mask
# register alternates between xmm6 and xmm7 as they are recycled.
566.Loop4x:
567	paddd	%xmm12,%xmm8
568	paddd	%xmm13,%xmm9
569	pxor	%xmm8,%xmm0
570	pxor	%xmm9,%xmm1
571.byte	102,15,56,0,199
572.byte	102,15,56,0,207
573	paddd	%xmm0,%xmm4
574	paddd	%xmm1,%xmm5
575	pxor	%xmm4,%xmm12
576	pxor	%xmm5,%xmm13
577	movdqa	%xmm12,%xmm6
578	pslld	$12,%xmm12
579	psrld	$20,%xmm6
580	movdqa	%xmm13,%xmm7
581	pslld	$12,%xmm13
582	por	%xmm6,%xmm12
583	psrld	$20,%xmm7
584	movdqa	(%r11),%xmm6
585	por	%xmm7,%xmm13
586	paddd	%xmm12,%xmm8
587	paddd	%xmm13,%xmm9
588	pxor	%xmm8,%xmm0
589	pxor	%xmm9,%xmm1
590.byte	102,15,56,0,198
591.byte	102,15,56,0,206
592	paddd	%xmm0,%xmm4
593	paddd	%xmm1,%xmm5
594	pxor	%xmm4,%xmm12
595	pxor	%xmm5,%xmm13
596	movdqa	%xmm12,%xmm7
597	pslld	$7,%xmm12
598	psrld	$25,%xmm7
599	movdqa	%xmm13,%xmm6
600	pslld	$7,%xmm13
601	por	%xmm7,%xmm12
602	psrld	$25,%xmm6
603	movdqa	(%r10),%xmm7
604	por	%xmm6,%xmm13
605	movdqa	%xmm4,0(%rsp)
606	movdqa	%xmm5,16(%rsp)
607	movdqa	32(%rsp),%xmm4
608	movdqa	48(%rsp),%xmm5
609	paddd	%xmm14,%xmm10
610	paddd	%xmm15,%xmm11
611	pxor	%xmm10,%xmm2
612	pxor	%xmm11,%xmm3
613.byte	102,15,56,0,215
614.byte	102,15,56,0,223
615	paddd	%xmm2,%xmm4
616	paddd	%xmm3,%xmm5
617	pxor	%xmm4,%xmm14
618	pxor	%xmm5,%xmm15
619	movdqa	%xmm14,%xmm6
620	pslld	$12,%xmm14
621	psrld	$20,%xmm6
622	movdqa	%xmm15,%xmm7
623	pslld	$12,%xmm15
624	por	%xmm6,%xmm14
625	psrld	$20,%xmm7
626	movdqa	(%r11),%xmm6
627	por	%xmm7,%xmm15
628	paddd	%xmm14,%xmm10
629	paddd	%xmm15,%xmm11
630	pxor	%xmm10,%xmm2
631	pxor	%xmm11,%xmm3
632.byte	102,15,56,0,214
633.byte	102,15,56,0,222
634	paddd	%xmm2,%xmm4
635	paddd	%xmm3,%xmm5
636	pxor	%xmm4,%xmm14
637	pxor	%xmm5,%xmm15
638	movdqa	%xmm14,%xmm7
639	pslld	$7,%xmm14
640	psrld	$25,%xmm7
641	movdqa	%xmm15,%xmm6
642	pslld	$7,%xmm15
643	por	%xmm7,%xmm14
644	psrld	$25,%xmm6
645	movdqa	(%r10),%xmm7
646	por	%xmm6,%xmm15
# Diagonal half of the double round.
647	paddd	%xmm13,%xmm8
648	paddd	%xmm14,%xmm9
649	pxor	%xmm8,%xmm3
650	pxor	%xmm9,%xmm0
651.byte	102,15,56,0,223
652.byte	102,15,56,0,199
653	paddd	%xmm3,%xmm4
654	paddd	%xmm0,%xmm5
655	pxor	%xmm4,%xmm13
656	pxor	%xmm5,%xmm14
657	movdqa	%xmm13,%xmm6
658	pslld	$12,%xmm13
659	psrld	$20,%xmm6
660	movdqa	%xmm14,%xmm7
661	pslld	$12,%xmm14
662	por	%xmm6,%xmm13
663	psrld	$20,%xmm7
664	movdqa	(%r11),%xmm6
665	por	%xmm7,%xmm14
666	paddd	%xmm13,%xmm8
667	paddd	%xmm14,%xmm9
668	pxor	%xmm8,%xmm3
669	pxor	%xmm9,%xmm0
670.byte	102,15,56,0,222
671.byte	102,15,56,0,198
672	paddd	%xmm3,%xmm4
673	paddd	%xmm0,%xmm5
674	pxor	%xmm4,%xmm13
675	pxor	%xmm5,%xmm14
676	movdqa	%xmm13,%xmm7
677	pslld	$7,%xmm13
678	psrld	$25,%xmm7
679	movdqa	%xmm14,%xmm6
680	pslld	$7,%xmm14
681	por	%xmm7,%xmm13
682	psrld	$25,%xmm6
683	movdqa	(%r10),%xmm7
684	por	%xmm6,%xmm14
685	movdqa	%xmm4,32(%rsp)
686	movdqa	%xmm5,48(%rsp)
687	movdqa	0(%rsp),%xmm4
688	movdqa	16(%rsp),%xmm5
689	paddd	%xmm15,%xmm10
690	paddd	%xmm12,%xmm11
691	pxor	%xmm10,%xmm1
692	pxor	%xmm11,%xmm2
693.byte	102,15,56,0,207
694.byte	102,15,56,0,215
695	paddd	%xmm1,%xmm4
696	paddd	%xmm2,%xmm5
697	pxor	%xmm4,%xmm15
698	pxor	%xmm5,%xmm12
699	movdqa	%xmm15,%xmm6
700	pslld	$12,%xmm15
701	psrld	$20,%xmm6
702	movdqa	%xmm12,%xmm7
703	pslld	$12,%xmm12
704	por	%xmm6,%xmm15
705	psrld	$20,%xmm7
706	movdqa	(%r11),%xmm6
707	por	%xmm7,%xmm12
708	paddd	%xmm15,%xmm10
709	paddd	%xmm12,%xmm11
710	pxor	%xmm10,%xmm1
711	pxor	%xmm11,%xmm2
712.byte	102,15,56,0,206
713.byte	102,15,56,0,214
714	paddd	%xmm1,%xmm4
715	paddd	%xmm2,%xmm5
716	pxor	%xmm4,%xmm15
717	pxor	%xmm5,%xmm12
718	movdqa	%xmm15,%xmm7
719	pslld	$7,%xmm15
720	psrld	$25,%xmm7
721	movdqa	%xmm12,%xmm6
722	pslld	$7,%xmm12
723	por	%xmm7,%xmm15
724	psrld	$25,%xmm6
725	movdqa	(%r10),%xmm7
726	por	%xmm6,%xmm12
727	decl	%eax
728	jnz	.Loop4x
729
# Feed-forward and 4x4 transpose of each word group back into
# block-contiguous order (punpckldq/hdq + punpcklqdq/hqdq pairs).
730	paddd	64(%rsp),%xmm8
731	paddd	80(%rsp),%xmm9
732	paddd	96(%rsp),%xmm10
733	paddd	112(%rsp),%xmm11
734
735	movdqa	%xmm8,%xmm6
736	punpckldq	%xmm9,%xmm8
737	movdqa	%xmm10,%xmm7
738	punpckldq	%xmm11,%xmm10
739	punpckhdq	%xmm9,%xmm6
740	punpckhdq	%xmm11,%xmm7
741	movdqa	%xmm8,%xmm9
742	punpcklqdq	%xmm10,%xmm8
743	movdqa	%xmm6,%xmm11
744	punpcklqdq	%xmm7,%xmm6
745	punpckhqdq	%xmm10,%xmm9
746	punpckhqdq	%xmm7,%xmm11
747	paddd	128-256(%rcx),%xmm12
748	paddd	144-256(%rcx),%xmm13
749	paddd	160-256(%rcx),%xmm14
750	paddd	176-256(%rcx),%xmm15
751
752	movdqa	%xmm8,0(%rsp)
753	movdqa	%xmm9,16(%rsp)
754	movdqa	32(%rsp),%xmm8
755	movdqa	48(%rsp),%xmm9
756
757	movdqa	%xmm12,%xmm10
758	punpckldq	%xmm13,%xmm12
759	movdqa	%xmm14,%xmm7
760	punpckldq	%xmm15,%xmm14
761	punpckhdq	%xmm13,%xmm10
762	punpckhdq	%xmm15,%xmm7
763	movdqa	%xmm12,%xmm13
764	punpcklqdq	%xmm14,%xmm12
765	movdqa	%xmm10,%xmm15
766	punpcklqdq	%xmm7,%xmm10
767	punpckhqdq	%xmm14,%xmm13
768	punpckhqdq	%xmm7,%xmm15
769	paddd	192-256(%rcx),%xmm4
770	paddd	208-256(%rcx),%xmm5
771	paddd	224-256(%rcx),%xmm8
772	paddd	240-256(%rcx),%xmm9
773
774	movdqa	%xmm6,32(%rsp)
775	movdqa	%xmm11,48(%rsp)
776
777	movdqa	%xmm4,%xmm14
778	punpckldq	%xmm5,%xmm4
779	movdqa	%xmm8,%xmm7
780	punpckldq	%xmm9,%xmm8
781	punpckhdq	%xmm5,%xmm14
782	punpckhdq	%xmm9,%xmm7
783	movdqa	%xmm4,%xmm5
784	punpcklqdq	%xmm8,%xmm4
785	movdqa	%xmm14,%xmm9
786	punpcklqdq	%xmm7,%xmm14
787	punpckhqdq	%xmm8,%xmm5
788	punpckhqdq	%xmm7,%xmm9
789	paddd	256-256(%rcx),%xmm0
790	paddd	272-256(%rcx),%xmm1
791	paddd	288-256(%rcx),%xmm2
792	paddd	304-256(%rcx),%xmm3
793
794	movdqa	%xmm0,%xmm8
795	punpckldq	%xmm1,%xmm0
796	movdqa	%xmm2,%xmm7
797	punpckldq	%xmm3,%xmm2
798	punpckhdq	%xmm1,%xmm8
799	punpckhdq	%xmm3,%xmm7
800	movdqa	%xmm0,%xmm1
801	punpcklqdq	%xmm2,%xmm0
802	movdqa	%xmm8,%xmm3
803	punpcklqdq	%xmm7,%xmm8
804	punpckhqdq	%xmm2,%xmm1
805	punpckhqdq	%xmm7,%xmm3
806	cmpq	$256,%rdx
807	jb	.Ltail4x
808
# Full 256 bytes: XOR/store four 64-byte blocks, 64 bytes at a time.
809	movdqu	0(%rsi),%xmm6
810	movdqu	16(%rsi),%xmm11
811	movdqu	32(%rsi),%xmm2
812	movdqu	48(%rsi),%xmm7
813	pxor	0(%rsp),%xmm6
814	pxor	%xmm12,%xmm11
815	pxor	%xmm4,%xmm2
816	pxor	%xmm0,%xmm7
817
818	movdqu	%xmm6,0(%rdi)
819	movdqu	64(%rsi),%xmm6
820	movdqu	%xmm11,16(%rdi)
821	movdqu	80(%rsi),%xmm11
822	movdqu	%xmm2,32(%rdi)
823	movdqu	96(%rsi),%xmm2
824	movdqu	%xmm7,48(%rdi)
825	movdqu	112(%rsi),%xmm7
826	leaq	128(%rsi),%rsi
827	pxor	16(%rsp),%xmm6
828	pxor	%xmm13,%xmm11
829	pxor	%xmm5,%xmm2
830	pxor	%xmm1,%xmm7
831
832	movdqu	%xmm6,64(%rdi)
833	movdqu	0(%rsi),%xmm6
834	movdqu	%xmm11,80(%rdi)
835	movdqu	16(%rsi),%xmm11
836	movdqu	%xmm2,96(%rdi)
837	movdqu	32(%rsi),%xmm2
838	movdqu	%xmm7,112(%rdi)
839	leaq	128(%rdi),%rdi
840	movdqu	48(%rsi),%xmm7
841	pxor	32(%rsp),%xmm6
842	pxor	%xmm10,%xmm11
843	pxor	%xmm14,%xmm2
844	pxor	%xmm8,%xmm7
845
846	movdqu	%xmm6,0(%rdi)
847	movdqu	64(%rsi),%xmm6
848	movdqu	%xmm11,16(%rdi)
849	movdqu	80(%rsi),%xmm11
850	movdqu	%xmm2,32(%rdi)
851	movdqu	96(%rsi),%xmm2
852	movdqu	%xmm7,48(%rdi)
853	movdqu	112(%rsi),%xmm7
854	leaq	128(%rsi),%rsi
855	pxor	48(%rsp),%xmm6
856	pxor	%xmm15,%xmm11
857	pxor	%xmm9,%xmm2
858	pxor	%xmm3,%xmm7
859	movdqu	%xmm6,64(%rdi)
860	movdqu	%xmm11,80(%rdi)
861	movdqu	%xmm2,96(%rdi)
862	movdqu	%xmm7,112(%rdi)
863	leaq	128(%rdi),%rdi
864
865	subq	$256,%rdx
866	jnz	.Loop_outer4x
867
868	jmp	.Ldone4x
869

# Tail: emit as many whole 64-byte blocks as remain, then park the
# next block's keystream at 0..48(%rsp) and finish byte-wise.
870.Ltail4x:
871	cmpq	$192,%rdx
872	jae	.L192_or_more4x
873	cmpq	$128,%rdx
874	jae	.L128_or_more4x
875	cmpq	$64,%rdx
876	jae	.L64_or_more4x
877
878
879	xorq	%r10,%r10
880
881	movdqa	%xmm12,16(%rsp)
882	movdqa	%xmm4,32(%rsp)
883	movdqa	%xmm0,48(%rsp)
884	jmp	.Loop_tail4x
885
886.align	32
887.L64_or_more4x:
888	movdqu	0(%rsi),%xmm6
889	movdqu	16(%rsi),%xmm11
890	movdqu	32(%rsi),%xmm2
891	movdqu	48(%rsi),%xmm7
892	pxor	0(%rsp),%xmm6
893	pxor	%xmm12,%xmm11
894	pxor	%xmm4,%xmm2
895	pxor	%xmm0,%xmm7
896	movdqu	%xmm6,0(%rdi)
897	movdqu	%xmm11,16(%rdi)
898	movdqu	%xmm2,32(%rdi)
899	movdqu	%xmm7,48(%rdi)
900	je	.Ldone4x
901
902	movdqa	16(%rsp),%xmm6
903	leaq	64(%rsi),%rsi
904	xorq	%r10,%r10
905	movdqa	%xmm6,0(%rsp)
906	movdqa	%xmm13,16(%rsp)
907	leaq	64(%rdi),%rdi
908	movdqa	%xmm5,32(%rsp)
909	subq	$64,%rdx
910	movdqa	%xmm1,48(%rsp)
911	jmp	.Loop_tail4x
912
913.align	32
914.L128_or_more4x:
915	movdqu	0(%rsi),%xmm6
916	movdqu	16(%rsi),%xmm11
917	movdqu	32(%rsi),%xmm2
918	movdqu	48(%rsi),%xmm7
919	pxor	0(%rsp),%xmm6
920	pxor	%xmm12,%xmm11
921	pxor	%xmm4,%xmm2
922	pxor	%xmm0,%xmm7
923
924	movdqu	%xmm6,0(%rdi)
925	movdqu	64(%rsi),%xmm6
926	movdqu	%xmm11,16(%rdi)
927	movdqu	80(%rsi),%xmm11
928	movdqu	%xmm2,32(%rdi)
929	movdqu	96(%rsi),%xmm2
930	movdqu	%xmm7,48(%rdi)
931	movdqu	112(%rsi),%xmm7
932	pxor	16(%rsp),%xmm6
933	pxor	%xmm13,%xmm11
934	pxor	%xmm5,%xmm2
935	pxor	%xmm1,%xmm7
936	movdqu	%xmm6,64(%rdi)
937	movdqu	%xmm11,80(%rdi)
938	movdqu	%xmm2,96(%rdi)
939	movdqu	%xmm7,112(%rdi)
940	je	.Ldone4x
941
942	movdqa	32(%rsp),%xmm6
943	leaq	128(%rsi),%rsi
944	xorq	%r10,%r10
945	movdqa	%xmm6,0(%rsp)
946	movdqa	%xmm10,16(%rsp)
947	leaq	128(%rdi),%rdi
948	movdqa	%xmm14,32(%rsp)
949	subq	$128,%rdx
950	movdqa	%xmm8,48(%rsp)
951	jmp	.Loop_tail4x
952
953.align	32
954.L192_or_more4x:
955	movdqu	0(%rsi),%xmm6
956	movdqu	16(%rsi),%xmm11
957	movdqu	32(%rsi),%xmm2
958	movdqu	48(%rsi),%xmm7
959	pxor	0(%rsp),%xmm6
960	pxor	%xmm12,%xmm11
961	pxor	%xmm4,%xmm2
962	pxor	%xmm0,%xmm7
963
964	movdqu	%xmm6,0(%rdi)
965	movdqu	64(%rsi),%xmm6
966	movdqu	%xmm11,16(%rdi)
967	movdqu	80(%rsi),%xmm11
968	movdqu	%xmm2,32(%rdi)
969	movdqu	96(%rsi),%xmm2
970	movdqu	%xmm7,48(%rdi)
971	movdqu	112(%rsi),%xmm7
972	leaq	128(%rsi),%rsi
973	pxor	16(%rsp),%xmm6
974	pxor	%xmm13,%xmm11
975	pxor	%xmm5,%xmm2
976	pxor	%xmm1,%xmm7
977
978	movdqu	%xmm6,64(%rdi)
979	movdqu	0(%rsi),%xmm6
980	movdqu	%xmm11,80(%rdi)
981	movdqu	16(%rsi),%xmm11
982	movdqu	%xmm2,96(%rdi)
983	movdqu	32(%rsi),%xmm2
984	movdqu	%xmm7,112(%rdi)
985	leaq	128(%rdi),%rdi
986	movdqu	48(%rsi),%xmm7
987	pxor	32(%rsp),%xmm6
988	pxor	%xmm10,%xmm11
989	pxor	%xmm14,%xmm2
990	pxor	%xmm8,%xmm7
991	movdqu	%xmm6,0(%rdi)
992	movdqu	%xmm11,16(%rdi)
993	movdqu	%xmm2,32(%rdi)
994	movdqu	%xmm7,48(%rdi)
995	je	.Ldone4x
996
997	movdqa	48(%rsp),%xmm6
998	leaq	64(%rsi),%rsi
999	xorq	%r10,%r10
1000	movdqa	%xmm6,0(%rsp)
1001	movdqa	%xmm15,16(%rsp)
1002	leaq	64(%rdi),%rdi
1003	movdqa	%xmm9,32(%rsp)
1004	subq	$192,%rdx
1005	movdqa	%xmm3,48(%rsp)
1006
# Byte-wise tail over the keystream staged at (%rsp).
1007.Loop_tail4x:
1008	movzbl	(%rsi,%r10,1),%eax
1009	movzbl	(%rsp,%r10,1),%ecx
1010	leaq	1(%r10),%r10
1011	xorl	%ecx,%eax
1012	movb	%al,-1(%rdi,%r10,1)
1013	decq	%rdx
1014	jnz	.Loop_tail4x
1015
1016.Ldone4x:
# Restore the caller's stack pointer saved in r9.
1017	leaq	(%r9),%rsp
1018.cfi_def_cfa_register	rsp
1019.L4x_epilogue:
1020	.byte	0xf3,0xc3
1021.cfi_endproc
1022.size	ChaCha20_4x,.-ChaCha20_4x
1023.type	ChaCha20_8x,@function
1024.align	32
1025ChaCha20_8x:
1026.LChaCha20_8x:
1027.cfi_startproc
1028	movq	%rsp,%r9
1029.cfi_def_cfa_register	r9
1030	subq	$0x280+8,%rsp
1031	andq	$-32,%rsp
1032	vzeroupper
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043	vbroadcasti128	.Lsigma(%rip),%ymm11
1044	vbroadcasti128	(%rcx),%ymm3
1045	vbroadcasti128	16(%rcx),%ymm15
1046	vbroadcasti128	(%r8),%ymm7
1047	leaq	256(%rsp),%rcx
1048	leaq	512(%rsp),%rax
1049	leaq	.Lrot16(%rip),%r10
1050	leaq	.Lrot24(%rip),%r11
1051
1052	vpshufd	$0x00,%ymm11,%ymm8
1053	vpshufd	$0x55,%ymm11,%ymm9
1054	vmovdqa	%ymm8,128-256(%rcx)
1055	vpshufd	$0xaa,%ymm11,%ymm10
1056	vmovdqa	%ymm9,160-256(%rcx)
1057	vpshufd	$0xff,%ymm11,%ymm11
1058	vmovdqa	%ymm10,192-256(%rcx)
1059	vmovdqa	%ymm11,224-256(%rcx)
1060
1061	vpshufd	$0x00,%ymm3,%ymm0
1062	vpshufd	$0x55,%ymm3,%ymm1
1063	vmovdqa	%ymm0,256-256(%rcx)
1064	vpshufd	$0xaa,%ymm3,%ymm2
1065	vmovdqa	%ymm1,288-256(%rcx)
1066	vpshufd	$0xff,%ymm3,%ymm3
1067	vmovdqa	%ymm2,320-256(%rcx)
1068	vmovdqa	%ymm3,352-256(%rcx)
1069
1070	vpshufd	$0x00,%ymm15,%ymm12
1071	vpshufd	$0x55,%ymm15,%ymm13
1072	vmovdqa	%ymm12,384-512(%rax)
1073	vpshufd	$0xaa,%ymm15,%ymm14
1074	vmovdqa	%ymm13,416-512(%rax)
1075	vpshufd	$0xff,%ymm15,%ymm15
1076	vmovdqa	%ymm14,448-512(%rax)
1077	vmovdqa	%ymm15,480-512(%rax)
1078
1079	vpshufd	$0x00,%ymm7,%ymm4
1080	vpshufd	$0x55,%ymm7,%ymm5
1081	vpaddd	.Lincy(%rip),%ymm4,%ymm4
1082	vpshufd	$0xaa,%ymm7,%ymm6
1083	vmovdqa	%ymm5,544-512(%rax)
1084	vpshufd	$0xff,%ymm7,%ymm7
1085	vmovdqa	%ymm6,576-512(%rax)
1086	vmovdqa	%ymm7,608-512(%rax)
1087
1088	jmp	.Loop_enter8x
1089
1090.align	32
1091.Loop_outer8x:
1092	vmovdqa	128-256(%rcx),%ymm8
1093	vmovdqa	160-256(%rcx),%ymm9
1094	vmovdqa	192-256(%rcx),%ymm10
1095	vmovdqa	224-256(%rcx),%ymm11
1096	vmovdqa	256-256(%rcx),%ymm0
1097	vmovdqa	288-256(%rcx),%ymm1
1098	vmovdqa	320-256(%rcx),%ymm2
1099	vmovdqa	352-256(%rcx),%ymm3
1100	vmovdqa	384-512(%rax),%ymm12
1101	vmovdqa	416-512(%rax),%ymm13
1102	vmovdqa	448-512(%rax),%ymm14
1103	vmovdqa	480-512(%rax),%ymm15
1104	vmovdqa	512-512(%rax),%ymm4
1105	vmovdqa	544-512(%rax),%ymm5
1106	vmovdqa	576-512(%rax),%ymm6
1107	vmovdqa	608-512(%rax),%ymm7
1108	vpaddd	.Leight(%rip),%ymm4,%ymm4
1109
1110.Loop_enter8x:
1111	vmovdqa	%ymm14,64(%rsp)
1112	vmovdqa	%ymm15,96(%rsp)
1113	vbroadcasti128	(%r10),%ymm15
1114	vmovdqa	%ymm4,512-512(%rax)
1115	movl	$10,%eax
1116	jmp	.Loop8x
1117
1118.align	32
1119.Loop8x:
1120	vpaddd	%ymm0,%ymm8,%ymm8
1121	vpxor	%ymm4,%ymm8,%ymm4
1122	vpshufb	%ymm15,%ymm4,%ymm4
1123	vpaddd	%ymm1,%ymm9,%ymm9
1124	vpxor	%ymm5,%ymm9,%ymm5
1125	vpshufb	%ymm15,%ymm5,%ymm5
1126	vpaddd	%ymm4,%ymm12,%ymm12
1127	vpxor	%ymm0,%ymm12,%ymm0
1128	vpslld	$12,%ymm0,%ymm14
1129	vpsrld	$20,%ymm0,%ymm0
1130	vpor	%ymm0,%ymm14,%ymm0
1131	vbroadcasti128	(%r11),%ymm14
1132	vpaddd	%ymm5,%ymm13,%ymm13
1133	vpxor	%ymm1,%ymm13,%ymm1
1134	vpslld	$12,%ymm1,%ymm15
1135	vpsrld	$20,%ymm1,%ymm1
1136	vpor	%ymm1,%ymm15,%ymm1
1137	vpaddd	%ymm0,%ymm8,%ymm8
1138	vpxor	%ymm4,%ymm8,%ymm4
1139	vpshufb	%ymm14,%ymm4,%ymm4
1140	vpaddd	%ymm1,%ymm9,%ymm9
1141	vpxor	%ymm5,%ymm9,%ymm5
1142	vpshufb	%ymm14,%ymm5,%ymm5
1143	vpaddd	%ymm4,%ymm12,%ymm12
1144	vpxor	%ymm0,%ymm12,%ymm0
1145	vpslld	$7,%ymm0,%ymm15
1146	vpsrld	$25,%ymm0,%ymm0
1147	vpor	%ymm0,%ymm15,%ymm0
1148	vbroadcasti128	(%r10),%ymm15
1149	vpaddd	%ymm5,%ymm13,%ymm13
1150	vpxor	%ymm1,%ymm13,%ymm1
1151	vpslld	$7,%ymm1,%ymm14
1152	vpsrld	$25,%ymm1,%ymm1
1153	vpor	%ymm1,%ymm14,%ymm1
1154	vmovdqa	%ymm12,0(%rsp)
1155	vmovdqa	%ymm13,32(%rsp)
1156	vmovdqa	64(%rsp),%ymm12
1157	vmovdqa	96(%rsp),%ymm13
1158	vpaddd	%ymm2,%ymm10,%ymm10
1159	vpxor	%ymm6,%ymm10,%ymm6
1160	vpshufb	%ymm15,%ymm6,%ymm6
1161	vpaddd	%ymm3,%ymm11,%ymm11
1162	vpxor	%ymm7,%ymm11,%ymm7
1163	vpshufb	%ymm15,%ymm7,%ymm7
1164	vpaddd	%ymm6,%ymm12,%ymm12
1165	vpxor	%ymm2,%ymm12,%ymm2
1166	vpslld	$12,%ymm2,%ymm14
1167	vpsrld	$20,%ymm2,%ymm2
1168	vpor	%ymm2,%ymm14,%ymm2
1169	vbroadcasti128	(%r11),%ymm14
1170	vpaddd	%ymm7,%ymm13,%ymm13
1171	vpxor	%ymm3,%ymm13,%ymm3
1172	vpslld	$12,%ymm3,%ymm15
1173	vpsrld	$20,%ymm3,%ymm3
1174	vpor	%ymm3,%ymm15,%ymm3
1175	vpaddd	%ymm2,%ymm10,%ymm10
1176	vpxor	%ymm6,%ymm10,%ymm6
1177	vpshufb	%ymm14,%ymm6,%ymm6
1178	vpaddd	%ymm3,%ymm11,%ymm11
1179	vpxor	%ymm7,%ymm11,%ymm7
1180	vpshufb	%ymm14,%ymm7,%ymm7
1181	vpaddd	%ymm6,%ymm12,%ymm12
1182	vpxor	%ymm2,%ymm12,%ymm2
1183	vpslld	$7,%ymm2,%ymm15
1184	vpsrld	$25,%ymm2,%ymm2
1185	vpor	%ymm2,%ymm15,%ymm2
1186	vbroadcasti128	(%r10),%ymm15
1187	vpaddd	%ymm7,%ymm13,%ymm13
1188	vpxor	%ymm3,%ymm13,%ymm3
1189	vpslld	$7,%ymm3,%ymm14
1190	vpsrld	$25,%ymm3,%ymm3
1191	vpor	%ymm3,%ymm14,%ymm3
1192	vpaddd	%ymm1,%ymm8,%ymm8
1193	vpxor	%ymm7,%ymm8,%ymm7
1194	vpshufb	%ymm15,%ymm7,%ymm7
1195	vpaddd	%ymm2,%ymm9,%ymm9
1196	vpxor	%ymm4,%ymm9,%ymm4
1197	vpshufb	%ymm15,%ymm4,%ymm4
1198	vpaddd	%ymm7,%ymm12,%ymm12
1199	vpxor	%ymm1,%ymm12,%ymm1
1200	vpslld	$12,%ymm1,%ymm14
1201	vpsrld	$20,%ymm1,%ymm1
1202	vpor	%ymm1,%ymm14,%ymm1
1203	vbroadcasti128	(%r11),%ymm14
1204	vpaddd	%ymm4,%ymm13,%ymm13
1205	vpxor	%ymm2,%ymm13,%ymm2
1206	vpslld	$12,%ymm2,%ymm15
1207	vpsrld	$20,%ymm2,%ymm2
1208	vpor	%ymm2,%ymm15,%ymm2
1209	vpaddd	%ymm1,%ymm8,%ymm8
1210	vpxor	%ymm7,%ymm8,%ymm7
1211	vpshufb	%ymm14,%ymm7,%ymm7
1212	vpaddd	%ymm2,%ymm9,%ymm9
1213	vpxor	%ymm4,%ymm9,%ymm4
1214	vpshufb	%ymm14,%ymm4,%ymm4
1215	vpaddd	%ymm7,%ymm12,%ymm12
1216	vpxor	%ymm1,%ymm12,%ymm1
1217	vpslld	$7,%ymm1,%ymm15
1218	vpsrld	$25,%ymm1,%ymm1
1219	vpor	%ymm1,%ymm15,%ymm1
1220	vbroadcasti128	(%r10),%ymm15
1221	vpaddd	%ymm4,%ymm13,%ymm13
1222	vpxor	%ymm2,%ymm13,%ymm2
1223	vpslld	$7,%ymm2,%ymm14
1224	vpsrld	$25,%ymm2,%ymm2
1225	vpor	%ymm2,%ymm14,%ymm2
1226	vmovdqa	%ymm12,64(%rsp)
1227	vmovdqa	%ymm13,96(%rsp)
1228	vmovdqa	0(%rsp),%ymm12
1229	vmovdqa	32(%rsp),%ymm13
1230	vpaddd	%ymm3,%ymm10,%ymm10
1231	vpxor	%ymm5,%ymm10,%ymm5
1232	vpshufb	%ymm15,%ymm5,%ymm5
1233	vpaddd	%ymm0,%ymm11,%ymm11
1234	vpxor	%ymm6,%ymm11,%ymm6
1235	vpshufb	%ymm15,%ymm6,%ymm6
1236	vpaddd	%ymm5,%ymm12,%ymm12
1237	vpxor	%ymm3,%ymm12,%ymm3
1238	vpslld	$12,%ymm3,%ymm14
1239	vpsrld	$20,%ymm3,%ymm3
1240	vpor	%ymm3,%ymm14,%ymm3
1241	vbroadcasti128	(%r11),%ymm14
1242	vpaddd	%ymm6,%ymm13,%ymm13
1243	vpxor	%ymm0,%ymm13,%ymm0
1244	vpslld	$12,%ymm0,%ymm15
1245	vpsrld	$20,%ymm0,%ymm0
1246	vpor	%ymm0,%ymm15,%ymm0
1247	vpaddd	%ymm3,%ymm10,%ymm10
1248	vpxor	%ymm5,%ymm10,%ymm5
1249	vpshufb	%ymm14,%ymm5,%ymm5
1250	vpaddd	%ymm0,%ymm11,%ymm11
1251	vpxor	%ymm6,%ymm11,%ymm6
1252	vpshufb	%ymm14,%ymm6,%ymm6
1253	vpaddd	%ymm5,%ymm12,%ymm12
1254	vpxor	%ymm3,%ymm12,%ymm3
1255	vpslld	$7,%ymm3,%ymm15
1256	vpsrld	$25,%ymm3,%ymm3
1257	vpor	%ymm3,%ymm15,%ymm3
1258	vbroadcasti128	(%r10),%ymm15
1259	vpaddd	%ymm6,%ymm13,%ymm13
1260	vpxor	%ymm0,%ymm13,%ymm0
1261	vpslld	$7,%ymm0,%ymm14
1262	vpsrld	$25,%ymm0,%ymm0
1263	vpor	%ymm0,%ymm14,%ymm0
1264	decl	%eax
1265	jnz	.Loop8x
1266
1267	leaq	512(%rsp),%rax
1268	vpaddd	128-256(%rcx),%ymm8,%ymm8
1269	vpaddd	160-256(%rcx),%ymm9,%ymm9
1270	vpaddd	192-256(%rcx),%ymm10,%ymm10
1271	vpaddd	224-256(%rcx),%ymm11,%ymm11
1272
1273	vpunpckldq	%ymm9,%ymm8,%ymm14
1274	vpunpckldq	%ymm11,%ymm10,%ymm15
1275	vpunpckhdq	%ymm9,%ymm8,%ymm8
1276	vpunpckhdq	%ymm11,%ymm10,%ymm10
1277	vpunpcklqdq	%ymm15,%ymm14,%ymm9
1278	vpunpckhqdq	%ymm15,%ymm14,%ymm14
1279	vpunpcklqdq	%ymm10,%ymm8,%ymm11
1280	vpunpckhqdq	%ymm10,%ymm8,%ymm8
1281	vpaddd	256-256(%rcx),%ymm0,%ymm0
1282	vpaddd	288-256(%rcx),%ymm1,%ymm1
1283	vpaddd	320-256(%rcx),%ymm2,%ymm2
1284	vpaddd	352-256(%rcx),%ymm3,%ymm3
1285
	# NOTE(review): this is the tail of ChaCha20_8x (AVX2, 8 ChaCha20
	# blocks in parallel).  The prologue, the round loop, and the
	# keystream-add for the first state rows sit above this view, so the
	# register/stack layout is established there.  Below, 32-bit lanes are
	# transposed (vpunpck{l,h}dq -> vpunpck{l,h}qdq -> vperm2i128 with
	# $0x20/$0x31 selecting low/high 128-bit halves) so that each ymm
	# register ends up holding contiguous keystream bytes.
	vpunpckldq	%ymm1,%ymm0,%ymm10
	vpunpckldq	%ymm3,%ymm2,%ymm15
	vpunpckhdq	%ymm1,%ymm0,%ymm0
	vpunpckhdq	%ymm3,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm10,%ymm1
	vpunpckhqdq	%ymm15,%ymm10,%ymm10
	vpunpcklqdq	%ymm2,%ymm0,%ymm3
	vpunpckhqdq	%ymm2,%ymm0,%ymm0
	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
	# Spill two finished rows to the stack and reload two pending ones:
	# 16 ymm registers are not enough to hold the whole transposed state.
	vmovdqa	%ymm15,0(%rsp)
	vmovdqa	%ymm9,32(%rsp)
	vmovdqa	64(%rsp),%ymm15
	vmovdqa	96(%rsp),%ymm9

	# Add the saved input state (stashed at %rax-relative slots,
	# presumably by the setup code above this view) to the remaining
	# state rows: keystream = rounds_output + input_state per ChaCha20.
	vpaddd	384-512(%rax),%ymm12,%ymm12
	vpaddd	416-512(%rax),%ymm13,%ymm13
	vpaddd	448-512(%rax),%ymm15,%ymm15
	vpaddd	480-512(%rax),%ymm9,%ymm9

	# Same 4x4 dword transpose as above, for the second half of the state.
	vpunpckldq	%ymm13,%ymm12,%ymm2
	vpunpckldq	%ymm9,%ymm15,%ymm8
	vpunpckhdq	%ymm13,%ymm12,%ymm12
	vpunpckhdq	%ymm9,%ymm15,%ymm15
	vpunpcklqdq	%ymm8,%ymm2,%ymm13
	vpunpckhqdq	%ymm8,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm12,%ymm9
	vpunpckhqdq	%ymm15,%ymm12,%ymm12
	vpaddd	512-512(%rax),%ymm4,%ymm4
	vpaddd	544-512(%rax),%ymm5,%ymm5
	vpaddd	576-512(%rax),%ymm6,%ymm6
	vpaddd	608-512(%rax),%ymm7,%ymm7

	vpunpckldq	%ymm5,%ymm4,%ymm15
	vpunpckldq	%ymm7,%ymm6,%ymm8
	vpunpckhdq	%ymm5,%ymm4,%ymm4
	vpunpckhdq	%ymm7,%ymm6,%ymm6
	vpunpcklqdq	%ymm8,%ymm15,%ymm5
	vpunpckhqdq	%ymm8,%ymm15,%ymm15
	vpunpcklqdq	%ymm6,%ymm4,%ymm7
	vpunpckhqdq	%ymm6,%ymm4,%ymm4
	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
	# Reload the two rows spilled earlier.
	vmovdqa	0(%rsp),%ymm6
	vmovdqa	32(%rsp),%ymm12

	# Fast path: at least one full 512-byte (8-block) chunk remains.
	# %rdx = bytes remaining, %rsi = input, %rdi = output.
	cmpq	$512,%rdx
	jb	.Ltail8x

	# XOR 512 bytes of input with the keystream and store, 128 bytes
	# (four ymm registers) per group, in keystream order.
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm12,%ymm12
	vpxor	32(%rsi),%ymm13,%ymm13
	vpxor	64(%rsi),%ymm10,%ymm10
	vpxor	96(%rsi),%ymm15,%ymm15
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm12,0(%rdi)
	vmovdqu	%ymm13,32(%rdi)
	vmovdqu	%ymm10,64(%rdi)
	vmovdqu	%ymm15,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm14,%ymm14
	vpxor	32(%rsi),%ymm2,%ymm2
	vpxor	64(%rsi),%ymm3,%ymm3
	vpxor	96(%rsi),%ymm7,%ymm7
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm14,0(%rdi)
	vmovdqu	%ymm2,32(%rdi)
	vmovdqu	%ymm3,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm11,%ymm11
	vpxor	32(%rsi),%ymm9,%ymm9
	vpxor	64(%rsi),%ymm0,%ymm0
	vpxor	96(%rsi),%ymm4,%ymm4
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm11,0(%rdi)
	vmovdqu	%ymm9,32(%rdi)
	vmovdqu	%ymm0,64(%rdi)
	vmovdqu	%ymm4,96(%rdi)
	leaq	128(%rdi),%rdi

	subq	$512,%rdx
	jnz	.Loop_outer8x		# more input: generate the next 8 blocks

	jmp	.Ldone8x

.Ltail8x:
	# Fewer than 512 bytes left.  The ladder below jumps to the handler
	# for the largest 64-byte multiple <= %rdx; each handler XORs/stores
	# that many bytes with full-width ymm ops, then spills the next 64
	# bytes of keystream (two ymm registers) to 0..63(%rsp) so the final
	# partial piece can be processed one byte at a time in .Loop_tail8x.
	cmpq	$448,%rdx
	jae	.L448_or_more8x
	cmpq	$384,%rdx
	jae	.L384_or_more8x
	cmpq	$320,%rdx
	jae	.L320_or_more8x
	cmpq	$256,%rdx
	jae	.L256_or_more8x
	cmpq	$192,%rdx
	jae	.L192_or_more8x
	cmpq	$128,%rdx
	jae	.L128_or_more8x
	cmpq	$64,%rdx
	jae	.L64_or_more8x

	# Less than 64 bytes total: spill the first 64 keystream bytes and
	# go straight to the byte loop.  %r10 is the byte index.
	xorq	%r10,%r10
	vmovdqa	%ymm6,0(%rsp)
	vmovdqa	%ymm8,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L64_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	je	.Ldone8x		# exactly 64 bytes (vpxor/vmovdqu/leaq leave RFLAGS from the cmpq)

	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm1,0(%rsp)		# spill next 64 keystream bytes for .Loop_tail8x
	leaq	64(%rdi),%rdi
	subq	$64,%rdx
	vmovdqa	%ymm5,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L128_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	je	.Ldone8x		# exactly 128 bytes

	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm12,0(%rsp)		# spill next 64 keystream bytes
	leaq	128(%rdi),%rdi
	subq	$128,%rdx
	vmovdqa	%ymm13,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L192_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	je	.Ldone8x		# exactly 192 bytes

	leaq	192(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm10,0(%rsp)		# spill next 64 keystream bytes
	leaq	192(%rdi),%rdi
	subq	$192,%rdx
	vmovdqa	%ymm15,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L256_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	je	.Ldone8x		# exactly 256 bytes

	leaq	256(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm14,0(%rsp)		# spill next 64 keystream bytes
	leaq	256(%rdi),%rdi
	subq	$256,%rdx
	vmovdqa	%ymm2,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L320_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	je	.Ldone8x		# exactly 320 bytes

	leaq	320(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm3,0(%rsp)		# spill next 64 keystream bytes
	leaq	320(%rdi),%rdi
	subq	$320,%rdx
	vmovdqa	%ymm7,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L384_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	je	.Ldone8x		# exactly 384 bytes

	leaq	384(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm11,0(%rsp)		# spill next 64 keystream bytes
	leaq	384(%rdi),%rdi
	subq	$384,%rdx
	vmovdqa	%ymm9,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L448_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vpxor	384(%rsi),%ymm11,%ymm11
	vpxor	416(%rsi),%ymm9,%ymm9
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	vmovdqu	%ymm11,384(%rdi)
	vmovdqu	%ymm9,416(%rdi)
	je	.Ldone8x		# exactly 448 bytes

	leaq	448(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm0,0(%rsp)		# spill next 64 keystream bytes
	leaq	448(%rdi),%rdi
	subq	$448,%rdx
	vmovdqa	%ymm4,32(%rsp)
	# falls through into .Loop_tail8x

.Loop_tail8x:
	# Byte-at-a-time finish: out[i] = in[i] ^ keystream[i], with the
	# keystream spilled at (%rsp) above.  %r10 = index, %rdx = count.
	movzbl	(%rsi,%r10,1),%eax	# input byte
	movzbl	(%rsp,%r10,1),%ecx	# keystream byte
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)	# %r10 already advanced, hence -1
	decq	%rdx
	jnz	.Loop_tail8x

.Ldone8x:
	vzeroall			# zero every ymm register (also wipes keystream material)
	leaq	(%r9),%rsp		# restore %rsp from %r9 — presumably saved by the prologue above this view
.cfi_def_cfa_register	rsp
.L8x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	ChaCha20_8x,.-ChaCha20_8x
1629#endif
1630.section	.note.GNU-stack,"",@progbits
1631