# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.

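# MemorySanitizer cannot see stores done by hand-written assembly, so the
# assembly implementation is disabled when building under MSan.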
#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#include "ring_core_generated/prefix_symbols_asm.h"
.text

.extern	OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

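# Constants. .Lsigma is the ChaCha20 input constant "expand 32-byte k".
# .Lrot16 and .Lrot24 are pshufb masks rotating each 32-bit lane left by
# 16 and by 8 bits (right by 24) respectively. The .long tables hold
# counter-lane increments for the 1x, 4x and 8x code paths.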
.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0	# "expand 32-byte k"
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	# "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
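# ChaCha20_ctr32(out = %rdi, in = %rsi, in_len = %rdx, key = %rcx,
#                counter = %r8)
# Integer-register implementation producing 64 bytes of keystream per
# outer iteration. Dispatches to the SSSE3 path when OPENSSL_ia32cap_P
# reports SSSE3 ($512 = bit 9 of the second capability word).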
.globl	ChaCha20_ctr32
.hidden ChaCha20_ctr32
.type	ChaCha20_ctr32,@function
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmpq	$0,%rdx
	je	.Lno_data
	movq	OPENSSL_ia32cap_P+4(%rip),%r10
	testl	$512,%r10d
	jnz	.LChaCha20_ssse3

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	r15,-56
	subq	$64+24,%rsp
.cfi_adjust_cfa_offset	88
.Lctr32_body:


	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm4


	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp
	jmp	.Loop_outer

.align	32
.Loop_outer:
	# Reload the state: sigma words into %eax..%edx, key and counter
	# words from the stack copy.
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	movq	%rbp,64+0(%rsp)
	movl	$10,%ebp
	movq	%rsi,64+8(%rsp)
.byte	102,72,15,126,214	# movq %xmm2,%rsi (hand-encoded)
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi
	jmp	.Loop

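# Each .Loop iteration is one ChaCha20 double round: four column
# quarter-rounds followed by four diagonal quarter-rounds, with the
# rotation constants 16, 12, 8, 7; %ebp counts ten double rounds
# (twenty rounds). x0-x3 live in %eax/%ebx/%ecx/%edx, x4-x7 in
# %r8d-%r11d, x12-x15 in %r12d-%r15d, while x8-x11 rotate through
# %esi/%edi and 32(%rsp)..44(%rsp), since not all sixteen words fit in
# the available general-purpose registers.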
.align	32
.Loop:
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	.Loop
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi
	paddd	%xmm4,%xmm3
	movq	64+16(%rsp),%rdi

	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1

	cmpq	$64,%rbp
	jb	.Ltail

	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

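# Partial final block: stage the 64-byte keystream block at
# 0(%rsp)..63(%rsp), then XOR it into the input one byte at a time.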
.align	16
.Ltail:
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

.Loop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	.Loop_tail

.Ldone:
	leaq	64+24+48(%rsp),%rsi
	movq	-48(%rsi),%r15
.cfi_restore	r15
	movq	-40(%rsi),%r14
.cfi_restore	r14
	movq	-32(%rsi),%r13
.cfi_restore	r13
	movq	-24(%rsi),%r12
.cfi_restore	r12
	movq	-16(%rsi),%rbp
.cfi_restore	rbp
	movq	-8(%rsi),%rbx
.cfi_restore	rbx
	leaq	(%rsi),%rsp
.cfi_adjust_cfa_offset	-136
.Lno_data:
	.byte	0xf3,0xc3	# rep ret
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
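# One-block SSSE3 implementation, used for inputs of at most 128 bytes
# (and, via .Ldo_sse3_after_all, as the fallback chosen for Atom-class
# cores). The four 16-byte state rows live in %xmm0-%xmm3; rotations by
# 16 and 8 use pshufb with the .Lrot16/.Lrot24 masks, rotations by 12
# and 7 use shift/shift/or.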
.type	ChaCha20_ssse3,@function
.align	32
ChaCha20_ssse3:
.LChaCha20_ssse3:
.cfi_startproc
	movq	%rsp,%r9
.cfi_def_cfa_register	r9
	cmpq	$128,%rdx
	ja	.LChaCha20_4x

.Ldo_sse3_after_all:
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	.Loop_ssse3

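# One double round per iteration; the column-to-diagonal (and back)
# rearrangement is done with pshufd row rotations.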
.align	32
.Loop_ssse3:
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222	# pshufb %xmm6,%xmm3 (rotate left 16)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223	# pshufb %xmm7,%xmm3 (rotate left 8)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222	# pshufb %xmm6,%xmm3 (rotate left 16)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223	# pshufb %xmm7,%xmm3 (rotate left 8)
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	.Loop_ssse3
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	.Ltail_ssse3

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8

.Loop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	rsp
.Lssse3_epilogue:
	.byte	0xf3,0xc3	# rep ret
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
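# Four-block SSSE3 implementation: 256 bytes of keystream per iteration,
# state held lane-sliced (one XMM register per state word, one lane per
# block). Jumps to the AVX2 path when the third OPENSSL_ia32cap_P word
# reports AVX2 ($32 = bit 5). The XSAVE/MOVBE test below is the
# CRYPTOGAMS heuristic for Atom cores (MOVBE without XSAVE), which
# prefer the one-block path for inputs up to 192 bytes.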
.type	ChaCha20_4x,@function
.align	32
ChaCha20_4x:
.LChaCha20_4x:
.cfi_startproc
	movq	%rsp,%r9
.cfi_def_cfa_register	r9
	movq	%r10,%r11
	shrq	$32,%r10
	testq	$32,%r10
	jnz	.LChaCha20_8x
	cmpq	$192,%rdx
	ja	.Lproceed4x

	andq	$71303168,%r11
	cmpq	$4194304,%r11
	je	.Ldo_sse3_after_all

.Lproceed4x:
	subq	$0x140+8,%rsp
	movdqa	.Lsigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	.Linc(%rip),%xmm0
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	.Loop_enter4x

.align	32
.Loop_outer4x:
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	.Lfour(%rip),%xmm0

.Loop_enter4x:
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7
	movl	$10,%eax
	movdqa	%xmm0,256-256(%rcx)
	jmp	.Loop4x

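# One double round over four interleaved blocks. The .byte
# 102,15,56,0,NN sequences are hand-assembled pshufb instructions
# applying the .Lrot16/.Lrot24 masks kept at (%r10)/(%r11); rotations by
# 12 and 7 use shift/shift/or. Sixteen state vectors plus temporaries
# exceed the sixteen XMM registers, so pairs are parked at
# 0(%rsp)..63(%rsp) while %xmm6/%xmm7 double as masks and scratch.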
.align	32
.Loop4x:
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199
.byte	102,15,56,0,207
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198
.byte	102,15,56,0,206
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm13
	movdqa	%xmm4,0(%rsp)
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215
.byte	102,15,56,0,223
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214
.byte	102,15,56,0,222
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223
.byte	102,15,56,0,199
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222
.byte	102,15,56,0,198
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207
.byte	102,15,56,0,215
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206
.byte	102,15,56,0,214
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	.Loop4x

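# Add the saved input state back, then transpose the four lane-sliced
# blocks into byte order with punpck{l,h}{dq,qdq} so they can be XORed
# against the input stream.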
	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	cmpq	$256,%rdx
	jb	.Ltail4x

	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4x

	jmp	.Ldone4x

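# 1-3 remaining blocks: XOR whole 64-byte blocks while they last, then
# stage the final partial block's keystream at 0(%rsp)..63(%rsp) for the
# byte-by-byte loop.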
.Ltail4x:
	cmpq	$192,%rdx
	jae	.L192_or_more4x
	cmpq	$128,%rdx
	jae	.L128_or_more4x
	cmpq	$64,%rdx
	jae	.L64_or_more4x


	xorq	%r10,%r10

	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	je	.Ldone4x

	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

.Loop_tail4x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4x

.Ldone4x:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	rsp
.L4x_epilogue:
	.byte	0xf3,0xc3	# rep ret
.cfi_endproc
.size	ChaCha20_4x,.-ChaCha20_4x
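# Eight-block AVX2 implementation: 512 bytes of keystream per iteration,
# again with the state lane-sliced across YMM registers. It needs a
# 32-byte-aligned 0x280-byte scratch area, so the stack is realigned;
# %r9 preserves the original stack pointer.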
.type	ChaCha20_8x,@function
.align	32
ChaCha20_8x:
.LChaCha20_8x:
.cfi_startproc
	movq	%rsp,%r9
.cfi_def_cfa_register	r9
	subq	$0x280+8,%rsp
	andq	$-32,%rsp
	vzeroupper

	vbroadcasti128	.Lsigma(%rip),%ymm11
	vbroadcasti128	(%rcx),%ymm3
	vbroadcasti128	16(%rcx),%ymm15
	vbroadcasti128	(%r8),%ymm7
	leaq	256(%rsp),%rcx
	leaq	512(%rsp),%rax
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	vpshufd	$0x00,%ymm11,%ymm8
	vpshufd	$0x55,%ymm11,%ymm9
	vmovdqa	%ymm8,128-256(%rcx)
	vpshufd	$0xaa,%ymm11,%ymm10
	vmovdqa	%ymm9,160-256(%rcx)
	vpshufd	$0xff,%ymm11,%ymm11
	vmovdqa	%ymm10,192-256(%rcx)
	vmovdqa	%ymm11,224-256(%rcx)

	vpshufd	$0x00,%ymm3,%ymm0
	vpshufd	$0x55,%ymm3,%ymm1
	vmovdqa	%ymm0,256-256(%rcx)
	vpshufd	$0xaa,%ymm3,%ymm2
	vmovdqa	%ymm1,288-256(%rcx)
	vpshufd	$0xff,%ymm3,%ymm3
	vmovdqa	%ymm2,320-256(%rcx)
	vmovdqa	%ymm3,352-256(%rcx)

	vpshufd	$0x00,%ymm15,%ymm12
	vpshufd	$0x55,%ymm15,%ymm13
	vmovdqa	%ymm12,384-512(%rax)
	vpshufd	$0xaa,%ymm15,%ymm14
	vmovdqa	%ymm13,416-512(%rax)
	vpshufd	$0xff,%ymm15,%ymm15
	vmovdqa	%ymm14,448-512(%rax)
	vmovdqa	%ymm15,480-512(%rax)

	vpshufd	$0x00,%ymm7,%ymm4
	vpshufd	$0x55,%ymm7,%ymm5
	vpaddd	.Lincy(%rip),%ymm4,%ymm4
	vpshufd	$0xaa,%ymm7,%ymm6
	vmovdqa	%ymm5,544-512(%rax)
	vpshufd	$0xff,%ymm7,%ymm7
	vmovdqa	%ymm6,576-512(%rax)
	vmovdqa	%ymm7,608-512(%rax)

	jmp	.Loop_enter8x

.align	32
.Loop_outer8x:
	vmovdqa	128-256(%rcx),%ymm8
	vmovdqa	160-256(%rcx),%ymm9
	vmovdqa	192-256(%rcx),%ymm10
	vmovdqa	224-256(%rcx),%ymm11
	vmovdqa	256-256(%rcx),%ymm0
	vmovdqa	288-256(%rcx),%ymm1
	vmovdqa	320-256(%rcx),%ymm2
	vmovdqa	352-256(%rcx),%ymm3
	vmovdqa	384-512(%rax),%ymm12
	vmovdqa	416-512(%rax),%ymm13
	vmovdqa	448-512(%rax),%ymm14
	vmovdqa	480-512(%rax),%ymm15
	vmovdqa	512-512(%rax),%ymm4
	vmovdqa	544-512(%rax),%ymm5
	vmovdqa	576-512(%rax),%ymm6
	vmovdqa	608-512(%rax),%ymm7
	vpaddd	.Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
	vmovdqa	%ymm14,64(%rsp)
	vmovdqa	%ymm15,96(%rsp)
	vbroadcasti128	(%r10),%ymm15
	vmovdqa	%ymm4,512-512(%rax)
	movl	$10,%eax
	jmp	.Loop8x

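# One double round over eight interleaved blocks, the same schedule as
# the 4x loop but on 256-bit registers; the rot16/rot24 pshufb masks are
# re-broadcast from (%r10)/(%r11) each time because %ymm14/%ymm15 are
# recycled as rotate temporaries.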
.align	32
.Loop8x:
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$12,%ymm0,%ymm14
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$12,%ymm1,%ymm15
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$7,%ymm0,%ymm15
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$7,%ymm1,%ymm14
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vmovdqa	%ymm12,0(%rsp)
	vmovdqa	%ymm13,32(%rsp)
	vmovdqa	64(%rsp),%ymm12
	vmovdqa	96(%rsp),%ymm13
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$12,%ymm2,%ymm14
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$12,%ymm3,%ymm15
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$7,%ymm2,%ymm15
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$7,%ymm3,%ymm14
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$12,%ymm1,%ymm14
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$12,%ymm2,%ymm15
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$7,%ymm1,%ymm15
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$7,%ymm2,%ymm14
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vmovdqa	%ymm12,64(%rsp)
	vmovdqa	%ymm13,96(%rsp)
	vmovdqa	0(%rsp),%ymm12
	vmovdqa	32(%rsp),%ymm13
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$12,%ymm3,%ymm14
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$12,%ymm0,%ymm15
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$7,%ymm3,%ymm15
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$7,%ymm0,%ymm14
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	decl	%eax
	jnz	.Loop8x

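# Add the saved input state back and transpose: vpunpck* merges 32- and
# 64-bit lanes within each 128-bit half, then vperm2i128 stitches the
# halves of the eight blocks into linear byte order.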
	leaq	512(%rsp),%rax
	vpaddd	128-256(%rcx),%ymm8,%ymm8
	vpaddd	160-256(%rcx),%ymm9,%ymm9
	vpaddd	192-256(%rcx),%ymm10,%ymm10
	vpaddd	224-256(%rcx),%ymm11,%ymm11

	vpunpckldq	%ymm9,%ymm8,%ymm14
	vpunpckldq	%ymm11,%ymm10,%ymm15
	vpunpckhdq	%ymm9,%ymm8,%ymm8
	vpunpckhdq	%ymm11,%ymm10,%ymm10
	vpunpcklqdq	%ymm15,%ymm14,%ymm9
	vpunpckhqdq	%ymm15,%ymm14,%ymm14
	vpunpcklqdq	%ymm10,%ymm8,%ymm11
	vpunpckhqdq	%ymm10,%ymm8,%ymm8
	vpaddd	256-256(%rcx),%ymm0,%ymm0
	vpaddd	288-256(%rcx),%ymm1,%ymm1
	vpaddd	320-256(%rcx),%ymm2,%ymm2
	vpaddd	352-256(%rcx),%ymm3,%ymm3

	vpunpckldq	%ymm1,%ymm0,%ymm10
	vpunpckldq	%ymm3,%ymm2,%ymm15
	vpunpckhdq	%ymm1,%ymm0,%ymm0
	vpunpckhdq	%ymm3,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm10,%ymm1
	vpunpckhqdq	%ymm15,%ymm10,%ymm10
	vpunpcklqdq	%ymm2,%ymm0,%ymm3
	vpunpckhqdq	%ymm2,%ymm0,%ymm0
	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
	vmovdqa	%ymm15,0(%rsp)
	vmovdqa	%ymm9,32(%rsp)
	vmovdqa	64(%rsp),%ymm15
	vmovdqa	96(%rsp),%ymm9

	vpaddd	384-512(%rax),%ymm12,%ymm12
	vpaddd	416-512(%rax),%ymm13,%ymm13
	vpaddd	448-512(%rax),%ymm15,%ymm15
	vpaddd	480-512(%rax),%ymm9,%ymm9

	vpunpckldq	%ymm13,%ymm12,%ymm2
	vpunpckldq	%ymm9,%ymm15,%ymm8
	vpunpckhdq	%ymm13,%ymm12,%ymm12
	vpunpckhdq	%ymm9,%ymm15,%ymm15
	vpunpcklqdq	%ymm8,%ymm2,%ymm13
	vpunpckhqdq	%ymm8,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm12,%ymm9
	vpunpckhqdq	%ymm15,%ymm12,%ymm12
	vpaddd	512-512(%rax),%ymm4,%ymm4
	vpaddd	544-512(%rax),%ymm5,%ymm5
	vpaddd	576-512(%rax),%ymm6,%ymm6
	vpaddd	608-512(%rax),%ymm7,%ymm7

	vpunpckldq	%ymm5,%ymm4,%ymm15
	vpunpckldq	%ymm7,%ymm6,%ymm8
	vpunpckhdq	%ymm5,%ymm4,%ymm4
	vpunpckhdq	%ymm7,%ymm6,%ymm6
	vpunpcklqdq	%ymm8,%ymm15,%ymm5
	vpunpckhqdq	%ymm8,%ymm15,%ymm15
	vpunpcklqdq	%ymm6,%ymm4,%ymm7
	vpunpckhqdq	%ymm6,%ymm4,%ymm4
	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
	vmovdqa	0(%rsp),%ymm6
	vmovdqa	32(%rsp),%ymm12

	cmpq	$512,%rdx
	jb	.Ltail8x

	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm12,%ymm12
	vpxor	32(%rsi),%ymm13,%ymm13
	vpxor	64(%rsi),%ymm10,%ymm10
	vpxor	96(%rsi),%ymm15,%ymm15
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm12,0(%rdi)
	vmovdqu	%ymm13,32(%rdi)
	vmovdqu	%ymm10,64(%rdi)
	vmovdqu	%ymm15,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm14,%ymm14
	vpxor	32(%rsi),%ymm2,%ymm2
	vpxor	64(%rsi),%ymm3,%ymm3
	vpxor	96(%rsi),%ymm7,%ymm7
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm14,0(%rdi)
	vmovdqu	%ymm2,32(%rdi)
	vmovdqu	%ymm3,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm11,%ymm11
	vpxor	32(%rsi),%ymm9,%ymm9
	vpxor	64(%rsi),%ymm0,%ymm0
	vpxor	96(%rsi),%ymm4,%ymm4
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm11,0(%rdi)
	vmovdqu	%ymm9,32(%rdi)
	vmovdqu	%ymm0,64(%rdi)
	vmovdqu	%ymm4,96(%rdi)
	leaq	128(%rdi),%rdi

	subq	$512,%rdx
	jnz	.Loop_outer8x

	jmp	.Ldone8x

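# 1-7 remaining blocks: XOR whole 64-byte blocks while they last, then
# stage the final partial block's keystream at 0(%rsp)..63(%rsp) for the
# byte-by-byte loop.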
.Ltail8x:
	cmpq	$448,%rdx
	jae	.L448_or_more8x
	cmpq	$384,%rdx
	jae	.L384_or_more8x
	cmpq	$320,%rdx
	jae	.L320_or_more8x
	cmpq	$256,%rdx
	jae	.L256_or_more8x
	cmpq	$192,%rdx
	jae	.L192_or_more8x
	cmpq	$128,%rdx
	jae	.L128_or_more8x
	cmpq	$64,%rdx
	jae	.L64_or_more8x

	xorq	%r10,%r10
	vmovdqa	%ymm6,0(%rsp)
	vmovdqa	%ymm8,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L64_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	je	.Ldone8x

	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm1,0(%rsp)
	leaq	64(%rdi),%rdi
	subq	$64,%rdx
	vmovdqa	%ymm5,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L128_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	je	.Ldone8x

	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm12,0(%rsp)
	leaq	128(%rdi),%rdi
	subq	$128,%rdx
	vmovdqa	%ymm13,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L192_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	je	.Ldone8x

	leaq	192(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm10,0(%rsp)
	leaq	192(%rdi),%rdi
	subq	$192,%rdx
	vmovdqa	%ymm15,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L256_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	je	.Ldone8x

	leaq	256(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm14,0(%rsp)
	leaq	256(%rdi),%rdi
	subq	$256,%rdx
	vmovdqa	%ymm2,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L320_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	je	.Ldone8x

	leaq	320(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm3,0(%rsp)
	leaq	320(%rdi),%rdi
	subq	$320,%rdx
	vmovdqa	%ymm7,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L384_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	je	.Ldone8x

	leaq	384(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm11,0(%rsp)
	leaq	384(%rdi),%rdi
	subq	$384,%rdx
	vmovdqa	%ymm9,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L448_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vpxor	384(%rsi),%ymm11,%ymm11
	vpxor	416(%rsi),%ymm9,%ymm9
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	vmovdqu	%ymm11,384(%rdi)
	vmovdqu	%ymm9,416(%rdi)
	je	.Ldone8x

	leaq	448(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm0,0(%rsp)
	leaq	448(%rdi),%rdi
	subq	$448,%rdx
	vmovdqa	%ymm4,32(%rsp)

.Loop_tail8x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail8x

.Ldone8x:
	vzeroall
	leaq	(%r9),%rsp
.cfi_def_cfa_register	rsp
.L8x_epilogue:
	.byte	0xf3,0xc3	# rep ret
.cfi_endproc
.size	ChaCha20_8x,.-ChaCha20_8x
#endif
.section	.note.GNU-stack,"",@progbits