• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
2.text
3
4
5
// Read-only constant tables shared by the ChaCha20 code paths in this file.
// The block is aligned to 64 bytes (.p2align 6).
6.p2align	6
// Four zero dwords. Not referenced in the portion of the file reviewed here.
7L$zero:
8.long	0,0,0,0
// {1,0,0,0}: added to the 16-byte counter block to advance its first dword
// by one per 64-byte block (scalar and SSSE3 paths).
9L$one:
10.long	1,0,0,0
// {0,1,2,3}: per-lane offsets added to the broadcast counter word by the 4x path.
11L$inc:
12.long	0,1,2,3
// {4,4,4,4}: per-iteration counter step of the 4x path.
13L$four:
14.long	4,4,4,4
// Per-lane counter offsets added to the broadcast counter word by the 8x path.
15L$incy:
16.long	0,2,4,6,1,3,5,7
// {8 x 8}: per-iteration counter step of the 8x path.
17L$eight:
18.long	8,8,8,8,8,8,8,8
// pshufb mask: within every dword, byte i is taken from byte (i+2) mod 4,
// i.e. each 32-bit lane is rotated by 16 bits.
19L$rot16:
20.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
// pshufb mask: within every dword, byte i is taken from byte (i+3) mod 4,
// i.e. each 32-bit lane is rotated left by 8 bits (right by 24).
21L$rot24:
22.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
// ASCII "expand 32-byte k" followed by a NUL: the first 16 bytes are loaded
// as row 0 of the ChaCha20 state by the SIMD paths.
23L$sigma:
24.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
25.p2align	6
// The four 64-byte tables below are not referenced in the portion of the
// file reviewed here; presumably they serve a wider-vector path - confirm.
26L$zeroz:
27.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
28L$fourz:
29.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
30L$incz:
31.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
32L$sixteen:
33.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
// NUL-terminated ASCII identification string:
// "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
34.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// -----------------------------------------------------------------------
// _ChaCha20_ctr32: public entry point, CPU dispatch and scalar code path.
//
// XORs the input with the ChaCha20 keystream and writes the result.
// Register use on entry, as established by the code below:
//   rdi = output pointer (written)
//   rsi = input pointer (read)
//   rdx = length in bytes; zero returns immediately
//   rcx = pointer to 32 bytes of key (two unaligned 16-byte loads)
//   r8  = pointer to the 16-byte counter block; its first dword is the
//         block counter that is incremented once per 64-byte block
// NOTE(review): presumably the C prototype is
//   void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t len,
//                       const uint32_t key[8], const uint32_t counter[4]);
// confirm against the declaring header.
//
// The caller's key and counter buffers are only read, never written back.
// Clobbers: rax, rcx, rdx, rsi, rdi, r8-r11, xmm0-xmm4, flags.
// rbx, rbp and r12-r15 are saved on the stack and restored before return.
//
// Scalar stack frame after the prologue:
//    0..15(%rsp)  scratch, used only by the tail to hold keystream bytes
//   16..47(%rsp)  key (state words 4..11; 32..47 double as a spill area
//                 for state words 8..11 during the rounds)
//   48..63(%rsp)  counter block (state words 12..15)
//   64..87(%rsp)  saved remaining length, input pointer, output pointer
// -----------------------------------------------------------------------
35.globl	_ChaCha20_ctr32
36.private_extern _ChaCha20_ctr32
37
38.p2align	6
39_ChaCha20_ctr32:
// Nothing to do for a zero length.
40	cmpq	$0,%rdx
41	je	L$no_data
// Load 8 bytes of the capability vector starting at offset 4; bit 9
// (mask 512) of the low dword selects the SSSE3 path. r10 stays live
// because the 4x path inspects the upper dword as well.
42	movq	_OPENSSL_ia32cap_P+4(%rip),%r10
43	testl	$512,%r10d
44	jnz	L$ChaCha20_ssse3
45
// Scalar path prologue: save all callee-saved GPRs, then reserve 64+24
// bytes. With the usual SysV entry alignment (rsp = 8 mod 16) this makes
// rsp 16-byte aligned, which the movdqa stores below rely on.
46	pushq	%rbx
47	pushq	%rbp
48	pushq	%r12
49	pushq	%r13
50	pushq	%r14
51	pushq	%r15
52	subq	$64+24,%rsp
53L$ctr32_body:
54
55
// xmm1:xmm2 = key, xmm3 = counter block, xmm4 = {1,0,0,0} counter step.
56	movdqu	(%rcx),%xmm1
57	movdqu	16(%rcx),%xmm2
58	movdqu	(%r8),%xmm3
59	movdqa	L$one(%rip),%xmm4
60
61
// Keep a copy of key and counter block in the frame; rbp = bytes remaining.
62	movdqa	%xmm1,16(%rsp)
63	movdqa	%xmm2,32(%rsp)
64	movdqa	%xmm3,48(%rsp)
65	movq	%rdx,%rbp
66	jmp	L$oop_outer
67
68.p2align	5
// One iteration per 64-byte block. Working state is held in GPRs:
//   eax,ebx,ecx,edx = words 0..3 (the "expand 32-byte k" constants)
//   r8d..r11d       = words 4..7
//   esi,edi         = words 8,9 or 10,11 (the other pair lives at 32..47(%rsp))
//   r12d..r15d      = words 12..15
69L$oop_outer:
70	movl	$0x61707865,%eax
71	movl	$0x3320646e,%ebx
72	movl	$0x79622d32,%ecx
73	movl	$0x6b206574,%edx
74	movl	16(%rsp),%r8d
75	movl	20(%rsp),%r9d
76	movl	24(%rsp),%r10d
77	movl	28(%rsp),%r11d
// Word 12 comes from xmm3, which carries the current block counter.
78	movd	%xmm3,%r12d
79	movl	52(%rsp),%r13d
80	movl	56(%rsp),%r14d
81	movl	60(%rsp),%r15d
82
// rbp, rsi and rdi are needed as round registers, so park length, input
// and output pointers in the frame. ebp becomes the double-round counter.
83	movq	%rbp,64+0(%rsp)
84	movl	$10,%ebp
85	movq	%rsi,64+8(%rsp)
// The bytes below encode "movq %xmm2,%rsi": rsi = key words 4 and 5,
// i.e. state words 8 (low half) and 9 (high half).
86.byte	102,72,15,126,214
87	movq	%rdi,64+16(%rsp)
// Split the pair: esi = word 8, edi = word 9.
88	movq	%rsi,%rdi
89	shrq	$32,%rdi
90	jmp	L$oop
91
92.p2align	5
// Ten iterations of a double round (20 rounds in total). Each quarter
// round is the add / xor / rotate sequence with rotations 16, 12, 8, 7.
93L$oop:
// Column round, columns 0 and 1: (eax,r8d,esi,r12d) and (ebx,r9d,edi,r13d).
94	addl	%r8d,%eax
95	xorl	%eax,%r12d
96	roll	$16,%r12d
97	addl	%r9d,%ebx
98	xorl	%ebx,%r13d
99	roll	$16,%r13d
100	addl	%r12d,%esi
101	xorl	%esi,%r8d
102	roll	$12,%r8d
103	addl	%r13d,%edi
104	xorl	%edi,%r9d
105	roll	$12,%r9d
106	addl	%r8d,%eax
107	xorl	%eax,%r12d
108	roll	$8,%r12d
109	addl	%r9d,%ebx
110	xorl	%ebx,%r13d
111	roll	$8,%r13d
112	addl	%r12d,%esi
113	xorl	%esi,%r8d
114	roll	$7,%r8d
115	addl	%r13d,%edi
116	xorl	%edi,%r9d
117	roll	$7,%r9d
// Swap the spilled pair: store words 8,9 and load words 10,11.
118	movl	%esi,32(%rsp)
119	movl	%edi,36(%rsp)
120	movl	40(%rsp),%esi
121	movl	44(%rsp),%edi
// Column round, columns 2 and 3: (ecx,r10d,esi,r14d) and (edx,r11d,edi,r15d).
122	addl	%r10d,%ecx
123	xorl	%ecx,%r14d
124	roll	$16,%r14d
125	addl	%r11d,%edx
126	xorl	%edx,%r15d
127	roll	$16,%r15d
128	addl	%r14d,%esi
129	xorl	%esi,%r10d
130	roll	$12,%r10d
131	addl	%r15d,%edi
132	xorl	%edi,%r11d
133	roll	$12,%r11d
134	addl	%r10d,%ecx
135	xorl	%ecx,%r14d
136	roll	$8,%r14d
137	addl	%r11d,%edx
138	xorl	%edx,%r15d
139	roll	$8,%r15d
140	addl	%r14d,%esi
141	xorl	%esi,%r10d
142	roll	$7,%r10d
143	addl	%r15d,%edi
144	xorl	%edi,%r11d
145	roll	$7,%r11d
// Diagonal round, first two diagonals: (eax,r9d,word 10,r15d) and
// (ebx,r10d,word 11,r12d); esi,edi still hold words 10,11.
146	addl	%r9d,%eax
147	xorl	%eax,%r15d
148	roll	$16,%r15d
149	addl	%r10d,%ebx
150	xorl	%ebx,%r12d
151	roll	$16,%r12d
152	addl	%r15d,%esi
153	xorl	%esi,%r9d
154	roll	$12,%r9d
155	addl	%r12d,%edi
156	xorl	%edi,%r10d
157	roll	$12,%r10d
158	addl	%r9d,%eax
159	xorl	%eax,%r15d
160	roll	$8,%r15d
161	addl	%r10d,%ebx
162	xorl	%ebx,%r12d
163	roll	$8,%r12d
164	addl	%r15d,%esi
165	xorl	%esi,%r9d
166	roll	$7,%r9d
167	addl	%r12d,%edi
168	xorl	%edi,%r10d
169	roll	$7,%r10d
// Swap the spilled pair back: store words 10,11 and load words 8,9.
170	movl	%esi,40(%rsp)
171	movl	%edi,44(%rsp)
172	movl	32(%rsp),%esi
173	movl	36(%rsp),%edi
// Diagonal round, remaining diagonals: (ecx,r11d,word 8,r13d) and
// (edx,r8d,word 9,r14d).
174	addl	%r11d,%ecx
175	xorl	%ecx,%r13d
176	roll	$16,%r13d
177	addl	%r8d,%edx
178	xorl	%edx,%r14d
179	roll	$16,%r14d
180	addl	%r13d,%esi
181	xorl	%esi,%r11d
182	roll	$12,%r11d
183	addl	%r14d,%edi
184	xorl	%edi,%r8d
185	roll	$12,%r8d
186	addl	%r11d,%ecx
187	xorl	%ecx,%r13d
188	roll	$8,%r13d
189	addl	%r8d,%edx
190	xorl	%edx,%r14d
191	roll	$8,%r14d
192	addl	%r13d,%esi
193	xorl	%esi,%r11d
194	roll	$7,%r11d
195	addl	%r14d,%edi
196	xorl	%edi,%r8d
197	roll	$7,%r8d
198	decl	%ebp
199	jnz	L$oop
// Rounds done. Flush words 8,9 so that 32..47(%rsp) holds the complete
// post-round words 8..11, then recover length and pointers.
200	movl	%edi,36(%rsp)
201	movl	%esi,32(%rsp)
202	movq	64(%rsp),%rbp
// xmm1 = original key words 4..7 (state words 8..11), used for the
// feed-forward addition of that row below.
203	movdqa	%xmm2,%xmm1
204	movq	64+8(%rsp),%rsi
// Advance the block counter kept in xmm3 for the next block.
205	paddd	%xmm4,%xmm3
206	movq	64+16(%rsp),%rdi
207
// Feed-forward: add the input state to the post-round state. 48(%rsp)
// still holds the counter value this block was generated with.
208	addl	$0x61707865,%eax
209	addl	$0x3320646e,%ebx
210	addl	$0x79622d32,%ecx
211	addl	$0x6b206574,%edx
212	addl	16(%rsp),%r8d
213	addl	20(%rsp),%r9d
214	addl	24(%rsp),%r10d
215	addl	28(%rsp),%r11d
216	addl	48(%rsp),%r12d
217	addl	52(%rsp),%r13d
218	addl	56(%rsp),%r14d
219	addl	60(%rsp),%r15d
220	paddd	32(%rsp),%xmm1
221
// Fewer than 64 bytes left: finish byte by byte.
222	cmpq	$64,%rbp
223	jb	L$tail
224
// Full block: XOR 64 bytes of input with the keystream. Words 8..11 go
// through xmm0/xmm1, the rest through the GPRs.
225	xorl	0(%rsi),%eax
226	xorl	4(%rsi),%ebx
227	xorl	8(%rsi),%ecx
228	xorl	12(%rsi),%edx
229	xorl	16(%rsi),%r8d
230	xorl	20(%rsi),%r9d
231	xorl	24(%rsi),%r10d
232	xorl	28(%rsi),%r11d
233	movdqu	32(%rsi),%xmm0
234	xorl	48(%rsi),%r12d
235	xorl	52(%rsi),%r13d
236	xorl	56(%rsi),%r14d
237	xorl	60(%rsi),%r15d
238	leaq	64(%rsi),%rsi
239	pxor	%xmm1,%xmm0
240
// Restore the key words that the rounds overwrote at 32..47(%rsp) and
// publish the incremented counter for the next iteration.
241	movdqa	%xmm2,32(%rsp)
242	movd	%xmm3,48(%rsp)
243
244	movl	%eax,0(%rdi)
245	movl	%ebx,4(%rdi)
246	movl	%ecx,8(%rdi)
247	movl	%edx,12(%rdi)
248	movl	%r8d,16(%rdi)
249	movl	%r9d,20(%rdi)
250	movl	%r10d,24(%rdi)
251	movl	%r11d,28(%rdi)
252	movdqu	%xmm0,32(%rdi)
253	movl	%r12d,48(%rdi)
254	movl	%r13d,52(%rdi)
255	movl	%r14d,56(%rdi)
256	movl	%r15d,60(%rdi)
257	leaq	64(%rdi),%rdi
258
259	subq	$64,%rbp
260	jnz	L$oop_outer
261
262	jmp	L$done
263
264.p2align	4
// Tail: spill the whole 64-byte keystream block to 0..63(%rsp) and XOR
// the remaining rbp (1..63) bytes one at a time. rbx is the byte index.
265L$tail:
266	movl	%eax,0(%rsp)
267	movl	%ebx,4(%rsp)
268	xorq	%rbx,%rbx
269	movl	%ecx,8(%rsp)
270	movl	%edx,12(%rsp)
271	movl	%r8d,16(%rsp)
272	movl	%r9d,20(%rsp)
273	movl	%r10d,24(%rsp)
274	movl	%r11d,28(%rsp)
275	movdqa	%xmm1,32(%rsp)
276	movl	%r12d,48(%rsp)
277	movl	%r13d,52(%rsp)
278	movl	%r14d,56(%rsp)
279	movl	%r15d,60(%rsp)
280
281L$oop_tail:
282	movzbl	(%rsi,%rbx,1),%eax
283	movzbl	(%rsp,%rbx,1),%edx
284	leaq	1(%rbx),%rbx
285	xorl	%edx,%eax
// rbx was already advanced, hence the -1 displacement.
286	movb	%al,-1(%rdi,%rbx,1)
287	decq	%rbp
288	jnz	L$oop_tail
289
// Epilogue: rsi = address just above the six saved registers; reload
// them through negative offsets, then drop the frame.
290L$done:
291	leaq	64+24+48(%rsp),%rsi
292	movq	-48(%rsi),%r15
293	movq	-40(%rsi),%r14
294	movq	-32(%rsi),%r13
295	movq	-24(%rsi),%r12
296	movq	-16(%rsi),%rbp
297	movq	-8(%rsi),%rbx
298	leaq	(%rsi),%rsp
299L$no_data:
// The bytes 0xf3,0xc3 encode "rep ret".
300	.byte	0xf3,0xc3
301
302
// -----------------------------------------------------------------------
// ChaCha20_ssse3: SSSE3 path, one 64-byte block per outer iteration.
//
// Not exported; reached by a jump from _ChaCha20_ctr32 with the same
// register arguments (rdi = out, rsi = in, rdx = length > 0, rcx = key,
// r8 = counter block). Lengths above 128 bytes are forwarded to the 4x
// path, which may jump back to L$do_sse3_after_all.
//
// r9 holds the entry value of rsp for the epilogue. No callee-saved GPR
// is touched. Clobbers: rax, rcx, rdx, rsi, rdi, r8, r9, xmm0-xmm7, flags.
//
// Frame: 0..63(%rsp) holds the four input state rows (sigma, key low,
// key high, counter block); the tail reuses it for keystream bytes.
// -----------------------------------------------------------------------
303.p2align	5
304ChaCha20_ssse3:
305L$ChaCha20_ssse3:
306	movq	%rsp,%r9
307	cmpq	$128,%rdx
308	ja	L$ChaCha20_4x
309
// Also the fallback entry used by the 4x path, which has already set r9.
310L$do_sse3_after_all:
// 64+8 bytes: with the usual SysV entry alignment (rsp = 8 mod 16) this
// leaves rsp 16-byte aligned for the movdqa accesses below.
311	subq	$64+8,%rsp
// xmm0..xmm3 = state rows 0..3; xmm6/xmm7 = pshufb masks for the 16-bit
// and 8-bit rotations.
312	movdqa	L$sigma(%rip),%xmm0
313	movdqu	(%rcx),%xmm1
314	movdqu	16(%rcx),%xmm2
315	movdqu	(%r8),%xmm3
316	movdqa	L$rot16(%rip),%xmm6
317	movdqa	L$rot24(%rip),%xmm7
318
// Save the input state for the feed-forward addition; r8 = double-round
// counter (the counter pointer is no longer needed).
319	movdqa	%xmm0,0(%rsp)
320	movdqa	%xmm1,16(%rsp)
321	movdqa	%xmm2,32(%rsp)
322	movdqa	%xmm3,48(%rsp)
323	movq	$10,%r8
324	jmp	L$oop_ssse3
325
326.p2align	5
// Subsequent blocks: reload the input state and bump the counter row at
// 48(%rsp) by {1,0,0,0}.
327L$oop_outer_ssse3:
328	movdqa	L$one(%rip),%xmm3
329	movdqa	0(%rsp),%xmm0
330	movdqa	16(%rsp),%xmm1
331	movdqa	32(%rsp),%xmm2
332	paddd	48(%rsp),%xmm3
333	movq	$10,%r8
334	movdqa	%xmm3,48(%rsp)
335	jmp	L$oop_ssse3
336
337.p2align	5
// Ten double rounds. All four columns are processed at once, one state
// row per xmm register; xmm4 is the temporary for the shift-based
// rotations by 12 and 7.
338L$oop_ssse3:
339	paddd	%xmm1,%xmm0
340	pxor	%xmm0,%xmm3
// Encodes "pshufb %xmm6,%xmm3": rotate each lane of row 3 by 16 bits.
341.byte	102,15,56,0,222
342	paddd	%xmm3,%xmm2
343	pxor	%xmm2,%xmm1
344	movdqa	%xmm1,%xmm4
345	psrld	$20,%xmm1
346	pslld	$12,%xmm4
347	por	%xmm4,%xmm1
348	paddd	%xmm1,%xmm0
349	pxor	%xmm0,%xmm3
// Encodes "pshufb %xmm7,%xmm3": rotate each lane of row 3 left by 8 bits.
350.byte	102,15,56,0,223
351	paddd	%xmm3,%xmm2
352	pxor	%xmm2,%xmm1
353	movdqa	%xmm1,%xmm4
354	psrld	$25,%xmm1
355	pslld	$7,%xmm4
356	por	%xmm4,%xmm1
// Rotate rows 1..3 by one, two and three lanes so that the diagonals
// line up as columns for the second half of the double round.
357	pshufd	$78,%xmm2,%xmm2
358	pshufd	$57,%xmm1,%xmm1
359	pshufd	$147,%xmm3,%xmm3
360	nop
361	paddd	%xmm1,%xmm0
362	pxor	%xmm0,%xmm3
363.byte	102,15,56,0,222
364	paddd	%xmm3,%xmm2
365	pxor	%xmm2,%xmm1
366	movdqa	%xmm1,%xmm4
367	psrld	$20,%xmm1
368	pslld	$12,%xmm4
369	por	%xmm4,%xmm1
370	paddd	%xmm1,%xmm0
371	pxor	%xmm0,%xmm3
372.byte	102,15,56,0,223
373	paddd	%xmm3,%xmm2
374	pxor	%xmm2,%xmm1
375	movdqa	%xmm1,%xmm4
376	psrld	$25,%xmm1
377	pslld	$7,%xmm4
378	por	%xmm4,%xmm1
// Undo the lane rotation (inverse shuffles for rows 1 and 3).
379	pshufd	$78,%xmm2,%xmm2
380	pshufd	$147,%xmm1,%xmm1
381	pshufd	$57,%xmm3,%xmm3
382	decq	%r8
383	jnz	L$oop_ssse3
// Feed-forward: add the saved input state.
384	paddd	0(%rsp),%xmm0
385	paddd	16(%rsp),%xmm1
386	paddd	32(%rsp),%xmm2
387	paddd	48(%rsp),%xmm3
388
389	cmpq	$64,%rdx
390	jb	L$tail_ssse3
391
// Full block: XOR 64 input bytes with the keystream and store.
392	movdqu	0(%rsi),%xmm4
393	movdqu	16(%rsi),%xmm5
394	pxor	%xmm4,%xmm0
395	movdqu	32(%rsi),%xmm4
396	pxor	%xmm5,%xmm1
397	movdqu	48(%rsi),%xmm5
398	leaq	64(%rsi),%rsi
399	pxor	%xmm4,%xmm2
400	pxor	%xmm5,%xmm3
401
402	movdqu	%xmm0,0(%rdi)
403	movdqu	%xmm1,16(%rdi)
404	movdqu	%xmm2,32(%rdi)
405	movdqu	%xmm3,48(%rdi)
406	leaq	64(%rdi),%rdi
407
408	subq	$64,%rdx
409	jnz	L$oop_outer_ssse3
410
411	jmp	L$done_ssse3
412
413.p2align	4
// Tail: fewer than 64 bytes remain. The keystream block overwrites the
// saved input state (no further block follows) and is consumed byte by
// byte; r8 is the byte index.
414L$tail_ssse3:
415	movdqa	%xmm0,0(%rsp)
416	movdqa	%xmm1,16(%rsp)
417	movdqa	%xmm2,32(%rsp)
418	movdqa	%xmm3,48(%rsp)
419	xorq	%r8,%r8
420
421L$oop_tail_ssse3:
422	movzbl	(%rsi,%r8,1),%eax
423	movzbl	(%rsp,%r8,1),%ecx
424	leaq	1(%r8),%r8
425	xorl	%ecx,%eax
// r8 was already advanced, hence the -1 displacement.
426	movb	%al,-1(%rdi,%r8,1)
427	decq	%rdx
428	jnz	L$oop_tail_ssse3
429
// Restore the entry stack pointer saved in r9 and return ("rep ret").
430L$done_ssse3:
431	leaq	(%r9),%rsp
432L$ssse3_epilogue:
433	.byte	0xf3,0xc3
434
435
// -----------------------------------------------------------------------
// ChaCha20_4x: SSSE3 path computing four 64-byte blocks (256 bytes) per
// outer iteration, one block per 32-bit lane.
//
// Not exported; reached by a jump from ChaCha20_ssse3 with rdi = out,
// rsi = in, rdx = length, rcx = key, r8 = counter block, and r10 = the
// 8 bytes read from _OPENSSL_ia32cap_P+4 by the entry point.
//
// r9 holds the entry value of rsp for the epilogue. No callee-saved GPR
// is touched. Clobbers: rax, rcx, rdx, rsi, rdi, r9-r11, xmm0-xmm15, flags.
//
// Frame (rcx is repointed to rsp+256 so displacements fit in a byte):
//     0..63(%rsp)   scratch: spilled row-c registers during the rounds,
//                   transposed output rows afterwards, tail keystream
//    64..127(%rsp)  sigma words 0..3, each broadcast to four lanes
//   128..191(%rsp)  key words 0..3, broadcast
//   192..255(%rsp)  key words 4..7, broadcast
//   256..271(%rsp)  per-lane block counters (counter word 0 + {0,1,2,3})
//   272..319(%rsp)  counter-block words 1..3, broadcast
// -----------------------------------------------------------------------
436.p2align	5
437ChaCha20_4x:
438L$ChaCha20_4x:
439	movq	%rsp,%r9
// Bit 5 (mask 32) of the upper capability dword selects the 8x path.
440	movq	%r10,%r11
441	shrq	$32,%r10
442	testq	$32,%r10
443	jnz	L$ChaCha20_8x
444	cmpq	$192,%rdx
445	ja	L$proceed4x
446
// For lengths up to 192 bytes: if, of capability bits 22 and 26
// (mask 71303168 = 0x4400000), only bit 22 (4194304 = 0x400000) is set,
// use the one-block SSSE3 loop instead.
// NOTE(review): presumably this identifies a CPU family where the 1x loop
// is faster for short inputs - confirm against the ia32cap documentation.
447	andq	$71303168,%r11
448	cmpq	$4194304,%r11
449	je	L$do_sse3_after_all
450
451L$proceed4x:
// 0x140+8 bytes: with the usual SysV entry alignment (rsp = 8 mod 16)
// this leaves rsp 16-byte aligned for the movdqa accesses below.
452	subq	$0x140+8,%rsp
453	movdqa	L$sigma(%rip),%xmm11
454	movdqu	(%rcx),%xmm15
455	movdqu	16(%rcx),%xmm7
456	movdqu	(%r8),%xmm3
// From here on rcx = frame base + 256, r10/r11 = addresses of the
// rotation masks (the capability bits in r10/r11 are no longer needed).
457	leaq	256(%rsp),%rcx
458	leaq	L$rot16(%rip),%r10
459	leaq	L$rot24(%rip),%r11
460
// Broadcast each sigma word to all four lanes: xmm8..xmm11 = row a.
461	pshufd	$0x00,%xmm11,%xmm8
462	pshufd	$0x55,%xmm11,%xmm9
463	movdqa	%xmm8,64(%rsp)
464	pshufd	$0xaa,%xmm11,%xmm10
465	movdqa	%xmm9,80(%rsp)
466	pshufd	$0xff,%xmm11,%xmm11
467	movdqa	%xmm10,96(%rsp)
468	movdqa	%xmm11,112(%rsp)
469
// Key words 0..3: xmm12..xmm15 = row b.
470	pshufd	$0x00,%xmm15,%xmm12
471	pshufd	$0x55,%xmm15,%xmm13
472	movdqa	%xmm12,128-256(%rcx)
473	pshufd	$0xaa,%xmm15,%xmm14
474	movdqa	%xmm13,144-256(%rcx)
475	pshufd	$0xff,%xmm15,%xmm15
476	movdqa	%xmm14,160-256(%rcx)
477	movdqa	%xmm15,176-256(%rcx)
478
// Key words 4..7: xmm4..xmm7 = row c.
479	pshufd	$0x00,%xmm7,%xmm4
480	pshufd	$0x55,%xmm7,%xmm5
481	movdqa	%xmm4,192-256(%rcx)
482	pshufd	$0xaa,%xmm7,%xmm6
483	movdqa	%xmm5,208-256(%rcx)
484	pshufd	$0xff,%xmm7,%xmm7
485	movdqa	%xmm6,224-256(%rcx)
486	movdqa	%xmm7,240-256(%rcx)
487
// Counter block: xmm0..xmm3 = row d. Lane i of xmm0 gets counter + i; it
// is stored to 256-256(%rcx) at L$oop_enter4x.
488	pshufd	$0x00,%xmm3,%xmm0
489	pshufd	$0x55,%xmm3,%xmm1
490	paddd	L$inc(%rip),%xmm0
491	pshufd	$0xaa,%xmm3,%xmm2
492	movdqa	%xmm1,272-256(%rcx)
493	pshufd	$0xff,%xmm3,%xmm3
494	movdqa	%xmm2,288-256(%rcx)
495	movdqa	%xmm3,304-256(%rcx)
496
497	jmp	L$oop_enter4x
498
499.p2align	5
// Subsequent 256-byte groups: reload the broadcast input state and step
// every lane counter by 4.
500L$oop_outer4x:
501	movdqa	64(%rsp),%xmm8
502	movdqa	80(%rsp),%xmm9
503	movdqa	96(%rsp),%xmm10
504	movdqa	112(%rsp),%xmm11
505	movdqa	128-256(%rcx),%xmm12
506	movdqa	144-256(%rcx),%xmm13
507	movdqa	160-256(%rcx),%xmm14
508	movdqa	176-256(%rcx),%xmm15
509	movdqa	192-256(%rcx),%xmm4
510	movdqa	208-256(%rcx),%xmm5
511	movdqa	224-256(%rcx),%xmm6
512	movdqa	240-256(%rcx),%xmm7
513	movdqa	256-256(%rcx),%xmm0
514	movdqa	272-256(%rcx),%xmm1
515	movdqa	288-256(%rcx),%xmm2
516	movdqa	304-256(%rcx),%xmm3
517	paddd	L$four(%rip),%xmm0
518
// Sixteen state vectors plus temporaries do not fit in xmm0-xmm15, so
// row-c words 2,3 (xmm6,xmm7) are spilled to 32,48(%rsp); xmm6/xmm7 then
// serve as temporaries and rotation-mask holders. eax = double-round
// counter; the updated lane counters are saved for the feed-forward.
519L$oop_enter4x:
520	movdqa	%xmm6,32(%rsp)
521	movdqa	%xmm7,48(%rsp)
522	movdqa	(%r10),%xmm7
523	movl	$10,%eax
524	movdqa	%xmm0,256-256(%rcx)
525	jmp	L$oop4x
526
527.p2align	5
// Ten double rounds, two quarter rounds interleaved at a time. On loop
// entry xmm7 holds the rot16 mask. The .byte sequences encode pshufb
// with xmm7 (rot16) or xmm6 (rot24) as the mask operand.
528L$oop4x:
// Column round, columns 0 and 1: (xmm8,xmm12,xmm4,xmm0), (xmm9,xmm13,xmm5,xmm1).
529	paddd	%xmm12,%xmm8
530	paddd	%xmm13,%xmm9
531	pxor	%xmm8,%xmm0
532	pxor	%xmm9,%xmm1
// pshufb %xmm7,%xmm0 ; pshufb %xmm7,%xmm1
533.byte	102,15,56,0,199
534.byte	102,15,56,0,207
535	paddd	%xmm0,%xmm4
536	paddd	%xmm1,%xmm5
537	pxor	%xmm4,%xmm12
538	pxor	%xmm5,%xmm13
539	movdqa	%xmm12,%xmm6
540	pslld	$12,%xmm12
541	psrld	$20,%xmm6
542	movdqa	%xmm13,%xmm7
543	pslld	$12,%xmm13
544	por	%xmm6,%xmm12
545	psrld	$20,%xmm7
546	movdqa	(%r11),%xmm6
547	por	%xmm7,%xmm13
548	paddd	%xmm12,%xmm8
549	paddd	%xmm13,%xmm9
550	pxor	%xmm8,%xmm0
551	pxor	%xmm9,%xmm1
// pshufb %xmm6,%xmm0 ; pshufb %xmm6,%xmm1
552.byte	102,15,56,0,198
553.byte	102,15,56,0,206
554	paddd	%xmm0,%xmm4
555	paddd	%xmm1,%xmm5
556	pxor	%xmm4,%xmm12
557	pxor	%xmm5,%xmm13
558	movdqa	%xmm12,%xmm7
559	pslld	$7,%xmm12
560	psrld	$25,%xmm7
561	movdqa	%xmm13,%xmm6
562	pslld	$7,%xmm13
563	por	%xmm7,%xmm12
564	psrld	$25,%xmm6
565	movdqa	(%r10),%xmm7
566	por	%xmm6,%xmm13
// Swap row-c halves: park words 0,1 at 0,16(%rsp), fetch words 2,3.
567	movdqa	%xmm4,0(%rsp)
568	movdqa	%xmm5,16(%rsp)
569	movdqa	32(%rsp),%xmm4
570	movdqa	48(%rsp),%xmm5
// Column round, columns 2 and 3: (xmm10,xmm14,xmm4,xmm2), (xmm11,xmm15,xmm5,xmm3).
571	paddd	%xmm14,%xmm10
572	paddd	%xmm15,%xmm11
573	pxor	%xmm10,%xmm2
574	pxor	%xmm11,%xmm3
// pshufb %xmm7,%xmm2 ; pshufb %xmm7,%xmm3
575.byte	102,15,56,0,215
576.byte	102,15,56,0,223
577	paddd	%xmm2,%xmm4
578	paddd	%xmm3,%xmm5
579	pxor	%xmm4,%xmm14
580	pxor	%xmm5,%xmm15
581	movdqa	%xmm14,%xmm6
582	pslld	$12,%xmm14
583	psrld	$20,%xmm6
584	movdqa	%xmm15,%xmm7
585	pslld	$12,%xmm15
586	por	%xmm6,%xmm14
587	psrld	$20,%xmm7
588	movdqa	(%r11),%xmm6
589	por	%xmm7,%xmm15
590	paddd	%xmm14,%xmm10
591	paddd	%xmm15,%xmm11
592	pxor	%xmm10,%xmm2
593	pxor	%xmm11,%xmm3
// pshufb %xmm6,%xmm2 ; pshufb %xmm6,%xmm3
594.byte	102,15,56,0,214
595.byte	102,15,56,0,222
596	paddd	%xmm2,%xmm4
597	paddd	%xmm3,%xmm5
598	pxor	%xmm4,%xmm14
599	pxor	%xmm5,%xmm15
600	movdqa	%xmm14,%xmm7
601	pslld	$7,%xmm14
602	psrld	$25,%xmm7
603	movdqa	%xmm15,%xmm6
604	pslld	$7,%xmm15
605	por	%xmm7,%xmm14
606	psrld	$25,%xmm6
607	movdqa	(%r10),%xmm7
608	por	%xmm6,%xmm15
// Diagonal round, first pair: (xmm8,xmm13,xmm4,xmm3), (xmm9,xmm14,xmm5,xmm0);
// xmm4,xmm5 still hold row-c words 2,3.
609	paddd	%xmm13,%xmm8
610	paddd	%xmm14,%xmm9
611	pxor	%xmm8,%xmm3
612	pxor	%xmm9,%xmm0
// pshufb %xmm7,%xmm3 ; pshufb %xmm7,%xmm0
613.byte	102,15,56,0,223
614.byte	102,15,56,0,199
615	paddd	%xmm3,%xmm4
616	paddd	%xmm0,%xmm5
617	pxor	%xmm4,%xmm13
618	pxor	%xmm5,%xmm14
619	movdqa	%xmm13,%xmm6
620	pslld	$12,%xmm13
621	psrld	$20,%xmm6
622	movdqa	%xmm14,%xmm7
623	pslld	$12,%xmm14
624	por	%xmm6,%xmm13
625	psrld	$20,%xmm7
626	movdqa	(%r11),%xmm6
627	por	%xmm7,%xmm14
628	paddd	%xmm13,%xmm8
629	paddd	%xmm14,%xmm9
630	pxor	%xmm8,%xmm3
631	pxor	%xmm9,%xmm0
// pshufb %xmm6,%xmm3 ; pshufb %xmm6,%xmm0
632.byte	102,15,56,0,222
633.byte	102,15,56,0,198
634	paddd	%xmm3,%xmm4
635	paddd	%xmm0,%xmm5
636	pxor	%xmm4,%xmm13
637	pxor	%xmm5,%xmm14
638	movdqa	%xmm13,%xmm7
639	pslld	$7,%xmm13
640	psrld	$25,%xmm7
641	movdqa	%xmm14,%xmm6
642	pslld	$7,%xmm14
643	por	%xmm7,%xmm13
644	psrld	$25,%xmm6
645	movdqa	(%r10),%xmm7
646	por	%xmm6,%xmm14
// Swap row-c halves back: park words 2,3 at 32,48(%rsp), fetch words 0,1.
647	movdqa	%xmm4,32(%rsp)
648	movdqa	%xmm5,48(%rsp)
649	movdqa	0(%rsp),%xmm4
650	movdqa	16(%rsp),%xmm5
// Diagonal round, second pair: (xmm10,xmm15,xmm4,xmm1), (xmm11,xmm12,xmm5,xmm2).
651	paddd	%xmm15,%xmm10
652	paddd	%xmm12,%xmm11
653	pxor	%xmm10,%xmm1
654	pxor	%xmm11,%xmm2
// pshufb %xmm7,%xmm1 ; pshufb %xmm7,%xmm2
655.byte	102,15,56,0,207
656.byte	102,15,56,0,215
657	paddd	%xmm1,%xmm4
658	paddd	%xmm2,%xmm5
659	pxor	%xmm4,%xmm15
660	pxor	%xmm5,%xmm12
661	movdqa	%xmm15,%xmm6
662	pslld	$12,%xmm15
663	psrld	$20,%xmm6
664	movdqa	%xmm12,%xmm7
665	pslld	$12,%xmm12
666	por	%xmm6,%xmm15
667	psrld	$20,%xmm7
668	movdqa	(%r11),%xmm6
669	por	%xmm7,%xmm12
670	paddd	%xmm15,%xmm10
671	paddd	%xmm12,%xmm11
672	pxor	%xmm10,%xmm1
673	pxor	%xmm11,%xmm2
// pshufb %xmm6,%xmm1 ; pshufb %xmm6,%xmm2
674.byte	102,15,56,0,206
675.byte	102,15,56,0,214
676	paddd	%xmm1,%xmm4
677	paddd	%xmm2,%xmm5
678	pxor	%xmm4,%xmm15
679	pxor	%xmm5,%xmm12
680	movdqa	%xmm15,%xmm7
681	pslld	$7,%xmm15
682	psrld	$25,%xmm7
683	movdqa	%xmm12,%xmm6
684	pslld	$7,%xmm12
685	por	%xmm7,%xmm15
686	psrld	$25,%xmm6
687	movdqa	(%r10),%xmm7
688	por	%xmm6,%xmm12
689	decl	%eax
690	jnz	L$oop4x
691
// Feed-forward for row a, then a 4x4 dword transpose (punpck*dq followed
// by punpck*qdq) so that each register holds 16 contiguous keystream
// bytes of one block instead of one word of four blocks.
692	paddd	64(%rsp),%xmm8
693	paddd	80(%rsp),%xmm9
694	paddd	96(%rsp),%xmm10
695	paddd	112(%rsp),%xmm11
696
697	movdqa	%xmm8,%xmm6
698	punpckldq	%xmm9,%xmm8
699	movdqa	%xmm10,%xmm7
700	punpckldq	%xmm11,%xmm10
701	punpckhdq	%xmm9,%xmm6
702	punpckhdq	%xmm11,%xmm7
703	movdqa	%xmm8,%xmm9
704	punpcklqdq	%xmm10,%xmm8
705	movdqa	%xmm6,%xmm11
706	punpcklqdq	%xmm7,%xmm6
707	punpckhqdq	%xmm10,%xmm9
708	punpckhqdq	%xmm7,%xmm11
// Feed-forward for row b.
709	paddd	128-256(%rcx),%xmm12
710	paddd	144-256(%rcx),%xmm13
711	paddd	160-256(%rcx),%xmm14
712	paddd	176-256(%rcx),%xmm15
713
// Park transposed row a of blocks 0,1 and fetch the spilled row-c words
// 2,3 into xmm8,xmm9.
714	movdqa	%xmm8,0(%rsp)
715	movdqa	%xmm9,16(%rsp)
716	movdqa	32(%rsp),%xmm8
717	movdqa	48(%rsp),%xmm9
718
// Transpose row b.
719	movdqa	%xmm12,%xmm10
720	punpckldq	%xmm13,%xmm12
721	movdqa	%xmm14,%xmm7
722	punpckldq	%xmm15,%xmm14
723	punpckhdq	%xmm13,%xmm10
724	punpckhdq	%xmm15,%xmm7
725	movdqa	%xmm12,%xmm13
726	punpcklqdq	%xmm14,%xmm12
727	movdqa	%xmm10,%xmm15
728	punpcklqdq	%xmm7,%xmm10
729	punpckhqdq	%xmm14,%xmm13
730	punpckhqdq	%xmm7,%xmm15
// Feed-forward for row c.
731	paddd	192-256(%rcx),%xmm4
732	paddd	208-256(%rcx),%xmm5
733	paddd	224-256(%rcx),%xmm8
734	paddd	240-256(%rcx),%xmm9
735
// Park transposed row a of blocks 2,3.
736	movdqa	%xmm6,32(%rsp)
737	movdqa	%xmm11,48(%rsp)
738
// Transpose row c.
739	movdqa	%xmm4,%xmm14
740	punpckldq	%xmm5,%xmm4
741	movdqa	%xmm8,%xmm7
742	punpckldq	%xmm9,%xmm8
743	punpckhdq	%xmm5,%xmm14
744	punpckhdq	%xmm9,%xmm7
745	movdqa	%xmm4,%xmm5
746	punpcklqdq	%xmm8,%xmm4
747	movdqa	%xmm14,%xmm9
748	punpcklqdq	%xmm7,%xmm14
749	punpckhqdq	%xmm8,%xmm5
750	punpckhqdq	%xmm7,%xmm9
// Feed-forward for row d (256-256(%rcx) holds this group's lane counters).
751	paddd	256-256(%rcx),%xmm0
752	paddd	272-256(%rcx),%xmm1
753	paddd	288-256(%rcx),%xmm2
754	paddd	304-256(%rcx),%xmm3
755
// Transpose row d.
756	movdqa	%xmm0,%xmm8
757	punpckldq	%xmm1,%xmm0
758	movdqa	%xmm2,%xmm7
759	punpckldq	%xmm3,%xmm2
760	punpckhdq	%xmm1,%xmm8
761	punpckhdq	%xmm3,%xmm7
762	movdqa	%xmm0,%xmm1
763	punpcklqdq	%xmm2,%xmm0
764	movdqa	%xmm8,%xmm3
765	punpcklqdq	%xmm7,%xmm8
766	punpckhqdq	%xmm2,%xmm1
767	punpckhqdq	%xmm7,%xmm3
// Keystream layout at this point (16 bytes each, in output order):
//   block 0:  0(%rsp), xmm12, xmm4,  xmm0
//   block 1: 16(%rsp), xmm13, xmm5,  xmm1
//   block 2: 32(%rsp), xmm10, xmm14, xmm8
//   block 3: 48(%rsp), xmm15, xmm9,  xmm3
768	cmpq	$256,%rdx
769	jb	L$tail4x
770
// Full 256-byte group; loads of the next block are interleaved with
// stores of the previous one. xmm6, xmm11, xmm2, xmm7 are I/O temporaries.
771	movdqu	0(%rsi),%xmm6
772	movdqu	16(%rsi),%xmm11
773	movdqu	32(%rsi),%xmm2
774	movdqu	48(%rsi),%xmm7
775	pxor	0(%rsp),%xmm6
776	pxor	%xmm12,%xmm11
777	pxor	%xmm4,%xmm2
778	pxor	%xmm0,%xmm7
779
780	movdqu	%xmm6,0(%rdi)
781	movdqu	64(%rsi),%xmm6
782	movdqu	%xmm11,16(%rdi)
783	movdqu	80(%rsi),%xmm11
784	movdqu	%xmm2,32(%rdi)
785	movdqu	96(%rsi),%xmm2
786	movdqu	%xmm7,48(%rdi)
787	movdqu	112(%rsi),%xmm7
788	leaq	128(%rsi),%rsi
789	pxor	16(%rsp),%xmm6
790	pxor	%xmm13,%xmm11
791	pxor	%xmm5,%xmm2
792	pxor	%xmm1,%xmm7
793
794	movdqu	%xmm6,64(%rdi)
795	movdqu	0(%rsi),%xmm6
796	movdqu	%xmm11,80(%rdi)
797	movdqu	16(%rsi),%xmm11
798	movdqu	%xmm2,96(%rdi)
799	movdqu	32(%rsi),%xmm2
800	movdqu	%xmm7,112(%rdi)
801	leaq	128(%rdi),%rdi
802	movdqu	48(%rsi),%xmm7
803	pxor	32(%rsp),%xmm6
804	pxor	%xmm10,%xmm11
805	pxor	%xmm14,%xmm2
806	pxor	%xmm8,%xmm7
807
808	movdqu	%xmm6,0(%rdi)
809	movdqu	64(%rsi),%xmm6
810	movdqu	%xmm11,16(%rdi)
811	movdqu	80(%rsi),%xmm11
812	movdqu	%xmm2,32(%rdi)
813	movdqu	96(%rsi),%xmm2
814	movdqu	%xmm7,48(%rdi)
815	movdqu	112(%rsi),%xmm7
816	leaq	128(%rsi),%rsi
817	pxor	48(%rsp),%xmm6
818	pxor	%xmm15,%xmm11
819	pxor	%xmm9,%xmm2
820	pxor	%xmm3,%xmm7
821	movdqu	%xmm6,64(%rdi)
822	movdqu	%xmm11,80(%rdi)
823	movdqu	%xmm2,96(%rdi)
824	movdqu	%xmm7,112(%rdi)
825	leaq	128(%rdi),%rdi
826
827	subq	$256,%rdx
828	jnz	L$oop_outer4x
829
830	jmp	L$done4x
831
// Fewer than 256 bytes remain: emit as many whole 64-byte blocks as fit,
// then stage the next keystream block at 0..63(%rsp) for the byte loop.
832L$tail4x:
833	cmpq	$192,%rdx
834	jae	L$192_or_more4x
835	cmpq	$128,%rdx
836	jae	L$128_or_more4x
837	cmpq	$64,%rdx
838	jae	L$64_or_more4x
839
840
// Under 64 bytes: block 0 is the partial block; 0(%rsp) already holds
// its first 16 bytes. r10 becomes the byte index.
841	xorq	%r10,%r10
842
843	movdqa	%xmm12,16(%rsp)
844	movdqa	%xmm4,32(%rsp)
845	movdqa	%xmm0,48(%rsp)
846	jmp	L$oop_tail4x
847
848.p2align	5
849L$64_or_more4x:
850	movdqu	0(%rsi),%xmm6
851	movdqu	16(%rsi),%xmm11
852	movdqu	32(%rsi),%xmm2
853	movdqu	48(%rsi),%xmm7
854	pxor	0(%rsp),%xmm6
855	pxor	%xmm12,%xmm11
856	pxor	%xmm4,%xmm2
857	pxor	%xmm0,%xmm7
858	movdqu	%xmm6,0(%rdi)
859	movdqu	%xmm11,16(%rdi)
860	movdqu	%xmm2,32(%rdi)
861	movdqu	%xmm7,48(%rdi)
// ZF is still the result of "cmpq $64,%rdx" in L$tail4x: the SSE moves
// and pxor above do not modify flags. Exactly 64 bytes means done.
862	je	L$done4x
863
// Stage block 1 as the partial block.
864	movdqa	16(%rsp),%xmm6
865	leaq	64(%rsi),%rsi
866	xorq	%r10,%r10
867	movdqa	%xmm6,0(%rsp)
868	movdqa	%xmm13,16(%rsp)
869	leaq	64(%rdi),%rdi
870	movdqa	%xmm5,32(%rsp)
871	subq	$64,%rdx
872	movdqa	%xmm1,48(%rsp)
873	jmp	L$oop_tail4x
874
875.p2align	5
876L$128_or_more4x:
877	movdqu	0(%rsi),%xmm6
878	movdqu	16(%rsi),%xmm11
879	movdqu	32(%rsi),%xmm2
880	movdqu	48(%rsi),%xmm7
881	pxor	0(%rsp),%xmm6
882	pxor	%xmm12,%xmm11
883	pxor	%xmm4,%xmm2
884	pxor	%xmm0,%xmm7
885
886	movdqu	%xmm6,0(%rdi)
887	movdqu	64(%rsi),%xmm6
888	movdqu	%xmm11,16(%rdi)
889	movdqu	80(%rsi),%xmm11
890	movdqu	%xmm2,32(%rdi)
891	movdqu	96(%rsi),%xmm2
892	movdqu	%xmm7,48(%rdi)
893	movdqu	112(%rsi),%xmm7
894	pxor	16(%rsp),%xmm6
895	pxor	%xmm13,%xmm11
896	pxor	%xmm5,%xmm2
897	pxor	%xmm1,%xmm7
898	movdqu	%xmm6,64(%rdi)
899	movdqu	%xmm11,80(%rdi)
900	movdqu	%xmm2,96(%rdi)
901	movdqu	%xmm7,112(%rdi)
// ZF is still the result of "cmpq $128,%rdx" in L$tail4x.
902	je	L$done4x
903
// Stage block 2 as the partial block.
904	movdqa	32(%rsp),%xmm6
905	leaq	128(%rsi),%rsi
906	xorq	%r10,%r10
907	movdqa	%xmm6,0(%rsp)
908	movdqa	%xmm10,16(%rsp)
909	leaq	128(%rdi),%rdi
910	movdqa	%xmm14,32(%rsp)
911	subq	$128,%rdx
912	movdqa	%xmm8,48(%rsp)
913	jmp	L$oop_tail4x
914
915.p2align	5
916L$192_or_more4x:
917	movdqu	0(%rsi),%xmm6
918	movdqu	16(%rsi),%xmm11
919	movdqu	32(%rsi),%xmm2
920	movdqu	48(%rsi),%xmm7
921	pxor	0(%rsp),%xmm6
922	pxor	%xmm12,%xmm11
923	pxor	%xmm4,%xmm2
924	pxor	%xmm0,%xmm7
925
926	movdqu	%xmm6,0(%rdi)
927	movdqu	64(%rsi),%xmm6
928	movdqu	%xmm11,16(%rdi)
929	movdqu	80(%rsi),%xmm11
930	movdqu	%xmm2,32(%rdi)
931	movdqu	96(%rsi),%xmm2
932	movdqu	%xmm7,48(%rdi)
933	movdqu	112(%rsi),%xmm7
934	leaq	128(%rsi),%rsi
935	pxor	16(%rsp),%xmm6
936	pxor	%xmm13,%xmm11
937	pxor	%xmm5,%xmm2
938	pxor	%xmm1,%xmm7
939
940	movdqu	%xmm6,64(%rdi)
941	movdqu	0(%rsi),%xmm6
942	movdqu	%xmm11,80(%rdi)
943	movdqu	16(%rsi),%xmm11
944	movdqu	%xmm2,96(%rdi)
945	movdqu	32(%rsi),%xmm2
946	movdqu	%xmm7,112(%rdi)
947	leaq	128(%rdi),%rdi
948	movdqu	48(%rsi),%xmm7
949	pxor	32(%rsp),%xmm6
950	pxor	%xmm10,%xmm11
951	pxor	%xmm14,%xmm2
952	pxor	%xmm8,%xmm7
953	movdqu	%xmm6,0(%rdi)
954	movdqu	%xmm11,16(%rdi)
955	movdqu	%xmm2,32(%rdi)
956	movdqu	%xmm7,48(%rdi)
// ZF is still the result of "cmpq $192,%rdx" in L$tail4x; lea does not
// modify flags either.
957	je	L$done4x
958
// Stage block 3 as the partial block. rsi/rdi were already advanced by
// 128 above, so 64 more skips the third whole block.
959	movdqa	48(%rsp),%xmm6
960	leaq	64(%rsi),%rsi
961	xorq	%r10,%r10
962	movdqa	%xmm6,0(%rsp)
963	movdqa	%xmm15,16(%rsp)
964	leaq	64(%rdi),%rdi
965	movdqa	%xmm9,32(%rsp)
966	subq	$192,%rdx
967	movdqa	%xmm3,48(%rsp)
968
// Byte loop over the remaining rdx (1..63) bytes, keystream at 0..63(%rsp).
969L$oop_tail4x:
970	movzbl	(%rsi,%r10,1),%eax
971	movzbl	(%rsp,%r10,1),%ecx
972	leaq	1(%r10),%r10
973	xorl	%ecx,%eax
// r10 was already advanced, hence the -1 displacement.
974	movb	%al,-1(%rdi,%r10,1)
975	decq	%rdx
976	jnz	L$oop_tail4x
977
// Restore the entry stack pointer saved in r9 and return ("rep ret").
978L$done4x:
979	leaq	(%r9),%rsp
980L$4x_epilogue:
981	.byte	0xf3,0xc3
982
983
984.p2align	5
985ChaCha20_8x:
986L$ChaCha20_8x:
987	movq	%rsp,%r9
988	subq	$0x280+8,%rsp
989	andq	$-32,%rsp
990	vzeroupper
991
992
993
994
995
996
997
998
999
1000
1001	vbroadcasti128	L$sigma(%rip),%ymm11
1002	vbroadcasti128	(%rcx),%ymm3
1003	vbroadcasti128	16(%rcx),%ymm15
1004	vbroadcasti128	(%r8),%ymm7
1005	leaq	256(%rsp),%rcx
1006	leaq	512(%rsp),%rax
1007	leaq	L$rot16(%rip),%r10
1008	leaq	L$rot24(%rip),%r11
1009
1010	vpshufd	$0x00,%ymm11,%ymm8
1011	vpshufd	$0x55,%ymm11,%ymm9
1012	vmovdqa	%ymm8,128-256(%rcx)
1013	vpshufd	$0xaa,%ymm11,%ymm10
1014	vmovdqa	%ymm9,160-256(%rcx)
1015	vpshufd	$0xff,%ymm11,%ymm11
1016	vmovdqa	%ymm10,192-256(%rcx)
1017	vmovdqa	%ymm11,224-256(%rcx)
1018
1019	vpshufd	$0x00,%ymm3,%ymm0
1020	vpshufd	$0x55,%ymm3,%ymm1
1021	vmovdqa	%ymm0,256-256(%rcx)
1022	vpshufd	$0xaa,%ymm3,%ymm2
1023	vmovdqa	%ymm1,288-256(%rcx)
1024	vpshufd	$0xff,%ymm3,%ymm3
1025	vmovdqa	%ymm2,320-256(%rcx)
1026	vmovdqa	%ymm3,352-256(%rcx)
1027
1028	vpshufd	$0x00,%ymm15,%ymm12
1029	vpshufd	$0x55,%ymm15,%ymm13
1030	vmovdqa	%ymm12,384-512(%rax)
1031	vpshufd	$0xaa,%ymm15,%ymm14
1032	vmovdqa	%ymm13,416-512(%rax)
1033	vpshufd	$0xff,%ymm15,%ymm15
1034	vmovdqa	%ymm14,448-512(%rax)
1035	vmovdqa	%ymm15,480-512(%rax)
1036
1037	vpshufd	$0x00,%ymm7,%ymm4
1038	vpshufd	$0x55,%ymm7,%ymm5
1039	vpaddd	L$incy(%rip),%ymm4,%ymm4
1040	vpshufd	$0xaa,%ymm7,%ymm6
1041	vmovdqa	%ymm5,544-512(%rax)
1042	vpshufd	$0xff,%ymm7,%ymm7
1043	vmovdqa	%ymm6,576-512(%rax)
1044	vmovdqa	%ymm7,608-512(%rax)
1045
1046	jmp	L$oop_enter8x
1047
1048.p2align	5
1049L$oop_outer8x:
1050	vmovdqa	128-256(%rcx),%ymm8
1051	vmovdqa	160-256(%rcx),%ymm9
1052	vmovdqa	192-256(%rcx),%ymm10
1053	vmovdqa	224-256(%rcx),%ymm11
1054	vmovdqa	256-256(%rcx),%ymm0
1055	vmovdqa	288-256(%rcx),%ymm1
1056	vmovdqa	320-256(%rcx),%ymm2
1057	vmovdqa	352-256(%rcx),%ymm3
1058	vmovdqa	384-512(%rax),%ymm12
1059	vmovdqa	416-512(%rax),%ymm13
1060	vmovdqa	448-512(%rax),%ymm14
1061	vmovdqa	480-512(%rax),%ymm15
1062	vmovdqa	512-512(%rax),%ymm4
1063	vmovdqa	544-512(%rax),%ymm5
1064	vmovdqa	576-512(%rax),%ymm6
1065	vmovdqa	608-512(%rax),%ymm7
1066	vpaddd	L$eight(%rip),%ymm4,%ymm4
1067
1068L$oop_enter8x:
1069	vmovdqa	%ymm14,64(%rsp)
1070	vmovdqa	%ymm15,96(%rsp)
1071	vbroadcasti128	(%r10),%ymm15
1072	vmovdqa	%ymm4,512-512(%rax)
1073	movl	$10,%eax
1074	jmp	L$oop8x
1075
1076.p2align	5
1077L$oop8x:
1078	vpaddd	%ymm0,%ymm8,%ymm8
1079	vpxor	%ymm4,%ymm8,%ymm4
1080	vpshufb	%ymm15,%ymm4,%ymm4
1081	vpaddd	%ymm1,%ymm9,%ymm9
1082	vpxor	%ymm5,%ymm9,%ymm5
1083	vpshufb	%ymm15,%ymm5,%ymm5
1084	vpaddd	%ymm4,%ymm12,%ymm12
1085	vpxor	%ymm0,%ymm12,%ymm0
1086	vpslld	$12,%ymm0,%ymm14
1087	vpsrld	$20,%ymm0,%ymm0
1088	vpor	%ymm0,%ymm14,%ymm0
1089	vbroadcasti128	(%r11),%ymm14
1090	vpaddd	%ymm5,%ymm13,%ymm13
1091	vpxor	%ymm1,%ymm13,%ymm1
1092	vpslld	$12,%ymm1,%ymm15
1093	vpsrld	$20,%ymm1,%ymm1
1094	vpor	%ymm1,%ymm15,%ymm1
1095	vpaddd	%ymm0,%ymm8,%ymm8
1096	vpxor	%ymm4,%ymm8,%ymm4
1097	vpshufb	%ymm14,%ymm4,%ymm4
1098	vpaddd	%ymm1,%ymm9,%ymm9
1099	vpxor	%ymm5,%ymm9,%ymm5
1100	vpshufb	%ymm14,%ymm5,%ymm5
1101	vpaddd	%ymm4,%ymm12,%ymm12
1102	vpxor	%ymm0,%ymm12,%ymm0
1103	vpslld	$7,%ymm0,%ymm15
1104	vpsrld	$25,%ymm0,%ymm0
1105	vpor	%ymm0,%ymm15,%ymm0
1106	vbroadcasti128	(%r10),%ymm15
1107	vpaddd	%ymm5,%ymm13,%ymm13
1108	vpxor	%ymm1,%ymm13,%ymm1
1109	vpslld	$7,%ymm1,%ymm14
1110	vpsrld	$25,%ymm1,%ymm1
1111	vpor	%ymm1,%ymm14,%ymm1
1112	vmovdqa	%ymm12,0(%rsp)
1113	vmovdqa	%ymm13,32(%rsp)
1114	vmovdqa	64(%rsp),%ymm12
1115	vmovdqa	96(%rsp),%ymm13
1116	vpaddd	%ymm2,%ymm10,%ymm10
1117	vpxor	%ymm6,%ymm10,%ymm6
1118	vpshufb	%ymm15,%ymm6,%ymm6
1119	vpaddd	%ymm3,%ymm11,%ymm11
1120	vpxor	%ymm7,%ymm11,%ymm7
1121	vpshufb	%ymm15,%ymm7,%ymm7
1122	vpaddd	%ymm6,%ymm12,%ymm12
1123	vpxor	%ymm2,%ymm12,%ymm2
1124	vpslld	$12,%ymm2,%ymm14
1125	vpsrld	$20,%ymm2,%ymm2
1126	vpor	%ymm2,%ymm14,%ymm2
1127	vbroadcasti128	(%r11),%ymm14
1128	vpaddd	%ymm7,%ymm13,%ymm13
1129	vpxor	%ymm3,%ymm13,%ymm3
1130	vpslld	$12,%ymm3,%ymm15
1131	vpsrld	$20,%ymm3,%ymm3
1132	vpor	%ymm3,%ymm15,%ymm3
1133	vpaddd	%ymm2,%ymm10,%ymm10
1134	vpxor	%ymm6,%ymm10,%ymm6
1135	vpshufb	%ymm14,%ymm6,%ymm6
1136	vpaddd	%ymm3,%ymm11,%ymm11
1137	vpxor	%ymm7,%ymm11,%ymm7
1138	vpshufb	%ymm14,%ymm7,%ymm7
1139	vpaddd	%ymm6,%ymm12,%ymm12
1140	vpxor	%ymm2,%ymm12,%ymm2
1141	vpslld	$7,%ymm2,%ymm15
1142	vpsrld	$25,%ymm2,%ymm2
1143	vpor	%ymm2,%ymm15,%ymm2
1144	vbroadcasti128	(%r10),%ymm15
1145	vpaddd	%ymm7,%ymm13,%ymm13
1146	vpxor	%ymm3,%ymm13,%ymm3
1147	vpslld	$7,%ymm3,%ymm14
1148	vpsrld	$25,%ymm3,%ymm3
1149	vpor	%ymm3,%ymm14,%ymm3
1150	vpaddd	%ymm1,%ymm8,%ymm8
1151	vpxor	%ymm7,%ymm8,%ymm7
1152	vpshufb	%ymm15,%ymm7,%ymm7
1153	vpaddd	%ymm2,%ymm9,%ymm9
1154	vpxor	%ymm4,%ymm9,%ymm4
1155	vpshufb	%ymm15,%ymm4,%ymm4
1156	vpaddd	%ymm7,%ymm12,%ymm12
1157	vpxor	%ymm1,%ymm12,%ymm1
1158	vpslld	$12,%ymm1,%ymm14
1159	vpsrld	$20,%ymm1,%ymm1
1160	vpor	%ymm1,%ymm14,%ymm1
1161	vbroadcasti128	(%r11),%ymm14
1162	vpaddd	%ymm4,%ymm13,%ymm13
1163	vpxor	%ymm2,%ymm13,%ymm2
1164	vpslld	$12,%ymm2,%ymm15
1165	vpsrld	$20,%ymm2,%ymm2
1166	vpor	%ymm2,%ymm15,%ymm2
1167	vpaddd	%ymm1,%ymm8,%ymm8
1168	vpxor	%ymm7,%ymm8,%ymm7
1169	vpshufb	%ymm14,%ymm7,%ymm7
1170	vpaddd	%ymm2,%ymm9,%ymm9
1171	vpxor	%ymm4,%ymm9,%ymm4
1172	vpshufb	%ymm14,%ymm4,%ymm4
1173	vpaddd	%ymm7,%ymm12,%ymm12
1174	vpxor	%ymm1,%ymm12,%ymm1
1175	vpslld	$7,%ymm1,%ymm15
1176	vpsrld	$25,%ymm1,%ymm1
1177	vpor	%ymm1,%ymm15,%ymm1
1178	vbroadcasti128	(%r10),%ymm15
1179	vpaddd	%ymm4,%ymm13,%ymm13
1180	vpxor	%ymm2,%ymm13,%ymm2
1181	vpslld	$7,%ymm2,%ymm14
1182	vpsrld	$25,%ymm2,%ymm2
1183	vpor	%ymm2,%ymm14,%ymm2
1184	vmovdqa	%ymm12,64(%rsp)
1185	vmovdqa	%ymm13,96(%rsp)
1186	vmovdqa	0(%rsp),%ymm12
1187	vmovdqa	32(%rsp),%ymm13
1188	vpaddd	%ymm3,%ymm10,%ymm10
1189	vpxor	%ymm5,%ymm10,%ymm5
1190	vpshufb	%ymm15,%ymm5,%ymm5
1191	vpaddd	%ymm0,%ymm11,%ymm11
1192	vpxor	%ymm6,%ymm11,%ymm6
1193	vpshufb	%ymm15,%ymm6,%ymm6
1194	vpaddd	%ymm5,%ymm12,%ymm12
1195	vpxor	%ymm3,%ymm12,%ymm3
1196	vpslld	$12,%ymm3,%ymm14
1197	vpsrld	$20,%ymm3,%ymm3
1198	vpor	%ymm3,%ymm14,%ymm3
1199	vbroadcasti128	(%r11),%ymm14
1200	vpaddd	%ymm6,%ymm13,%ymm13
1201	vpxor	%ymm0,%ymm13,%ymm0
1202	vpslld	$12,%ymm0,%ymm15
1203	vpsrld	$20,%ymm0,%ymm0
1204	vpor	%ymm0,%ymm15,%ymm0
1205	vpaddd	%ymm3,%ymm10,%ymm10
1206	vpxor	%ymm5,%ymm10,%ymm5
1207	vpshufb	%ymm14,%ymm5,%ymm5
1208	vpaddd	%ymm0,%ymm11,%ymm11
1209	vpxor	%ymm6,%ymm11,%ymm6
1210	vpshufb	%ymm14,%ymm6,%ymm6
1211	vpaddd	%ymm5,%ymm12,%ymm12
1212	vpxor	%ymm3,%ymm12,%ymm3
1213	vpslld	$7,%ymm3,%ymm15
1214	vpsrld	$25,%ymm3,%ymm3
1215	vpor	%ymm3,%ymm15,%ymm3
1216	vbroadcasti128	(%r10),%ymm15
1217	vpaddd	%ymm6,%ymm13,%ymm13
1218	vpxor	%ymm0,%ymm13,%ymm0
1219	vpslld	$7,%ymm0,%ymm14
1220	vpsrld	$25,%ymm0,%ymm0
1221	vpor	%ymm0,%ymm14,%ymm0
1222	decl	%eax
1223	jnz	L$oop8x
1224
1225	leaq	512(%rsp),%rax
1226	vpaddd	128-256(%rcx),%ymm8,%ymm8
1227	vpaddd	160-256(%rcx),%ymm9,%ymm9
1228	vpaddd	192-256(%rcx),%ymm10,%ymm10
1229	vpaddd	224-256(%rcx),%ymm11,%ymm11
1230
1231	vpunpckldq	%ymm9,%ymm8,%ymm14
1232	vpunpckldq	%ymm11,%ymm10,%ymm15
1233	vpunpckhdq	%ymm9,%ymm8,%ymm8
1234	vpunpckhdq	%ymm11,%ymm10,%ymm10
1235	vpunpcklqdq	%ymm15,%ymm14,%ymm9
1236	vpunpckhqdq	%ymm15,%ymm14,%ymm14
1237	vpunpcklqdq	%ymm10,%ymm8,%ymm11
1238	vpunpckhqdq	%ymm10,%ymm8,%ymm8
1239	vpaddd	256-256(%rcx),%ymm0,%ymm0
1240	vpaddd	288-256(%rcx),%ymm1,%ymm1
1241	vpaddd	320-256(%rcx),%ymm2,%ymm2
1242	vpaddd	352-256(%rcx),%ymm3,%ymm3
1243
1244	vpunpckldq	%ymm1,%ymm0,%ymm10
1245	vpunpckldq	%ymm3,%ymm2,%ymm15
1246	vpunpckhdq	%ymm1,%ymm0,%ymm0
1247	vpunpckhdq	%ymm3,%ymm2,%ymm2
1248	vpunpcklqdq	%ymm15,%ymm10,%ymm1
1249	vpunpckhqdq	%ymm15,%ymm10,%ymm10
1250	vpunpcklqdq	%ymm2,%ymm0,%ymm3
1251	vpunpckhqdq	%ymm2,%ymm0,%ymm0
1252	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
1253	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
1254	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
1255	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
1256	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
1257	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
1258	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
1259	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
1260	vmovdqa	%ymm15,0(%rsp)
1261	vmovdqa	%ymm9,32(%rsp)
1262	vmovdqa	64(%rsp),%ymm15
1263	vmovdqa	96(%rsp),%ymm9
1264
1265	vpaddd	384-512(%rax),%ymm12,%ymm12
1266	vpaddd	416-512(%rax),%ymm13,%ymm13
1267	vpaddd	448-512(%rax),%ymm15,%ymm15
1268	vpaddd	480-512(%rax),%ymm9,%ymm9
1269
1270	vpunpckldq	%ymm13,%ymm12,%ymm2
1271	vpunpckldq	%ymm9,%ymm15,%ymm8
1272	vpunpckhdq	%ymm13,%ymm12,%ymm12
1273	vpunpckhdq	%ymm9,%ymm15,%ymm15
1274	vpunpcklqdq	%ymm8,%ymm2,%ymm13
1275	vpunpckhqdq	%ymm8,%ymm2,%ymm2
1276	vpunpcklqdq	%ymm15,%ymm12,%ymm9
1277	vpunpckhqdq	%ymm15,%ymm12,%ymm12
1278	vpaddd	512-512(%rax),%ymm4,%ymm4
1279	vpaddd	544-512(%rax),%ymm5,%ymm5
1280	vpaddd	576-512(%rax),%ymm6,%ymm6
1281	vpaddd	608-512(%rax),%ymm7,%ymm7
1282
1283	vpunpckldq	%ymm5,%ymm4,%ymm15
1284	vpunpckldq	%ymm7,%ymm6,%ymm8
1285	vpunpckhdq	%ymm5,%ymm4,%ymm4
1286	vpunpckhdq	%ymm7,%ymm6,%ymm6
1287	vpunpcklqdq	%ymm8,%ymm15,%ymm5
1288	vpunpckhqdq	%ymm8,%ymm15,%ymm15
1289	vpunpcklqdq	%ymm6,%ymm4,%ymm7
1290	vpunpckhqdq	%ymm6,%ymm4,%ymm4
1291	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
1292	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
1293	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
1294	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
1295	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
1296	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
1297	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
1298	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
1299	vmovdqa	0(%rsp),%ymm6
1300	vmovdqa	32(%rsp),%ymm12
1301
1302	cmpq	$512,%rdx
1303	jb	L$tail8x
1304
1305	vpxor	0(%rsi),%ymm6,%ymm6
1306	vpxor	32(%rsi),%ymm8,%ymm8
1307	vpxor	64(%rsi),%ymm1,%ymm1
1308	vpxor	96(%rsi),%ymm5,%ymm5
1309	leaq	128(%rsi),%rsi
1310	vmovdqu	%ymm6,0(%rdi)
1311	vmovdqu	%ymm8,32(%rdi)
1312	vmovdqu	%ymm1,64(%rdi)
1313	vmovdqu	%ymm5,96(%rdi)
1314	leaq	128(%rdi),%rdi
1315
1316	vpxor	0(%rsi),%ymm12,%ymm12
1317	vpxor	32(%rsi),%ymm13,%ymm13
1318	vpxor	64(%rsi),%ymm10,%ymm10
1319	vpxor	96(%rsi),%ymm15,%ymm15
1320	leaq	128(%rsi),%rsi
1321	vmovdqu	%ymm12,0(%rdi)
1322	vmovdqu	%ymm13,32(%rdi)
1323	vmovdqu	%ymm10,64(%rdi)
1324	vmovdqu	%ymm15,96(%rdi)
1325	leaq	128(%rdi),%rdi
1326
1327	vpxor	0(%rsi),%ymm14,%ymm14
1328	vpxor	32(%rsi),%ymm2,%ymm2
1329	vpxor	64(%rsi),%ymm3,%ymm3
1330	vpxor	96(%rsi),%ymm7,%ymm7
1331	leaq	128(%rsi),%rsi
1332	vmovdqu	%ymm14,0(%rdi)
1333	vmovdqu	%ymm2,32(%rdi)
1334	vmovdqu	%ymm3,64(%rdi)
1335	vmovdqu	%ymm7,96(%rdi)
1336	leaq	128(%rdi),%rdi
1337
1338	vpxor	0(%rsi),%ymm11,%ymm11
1339	vpxor	32(%rsi),%ymm9,%ymm9
1340	vpxor	64(%rsi),%ymm0,%ymm0
1341	vpxor	96(%rsi),%ymm4,%ymm4
1342	leaq	128(%rsi),%rsi
1343	vmovdqu	%ymm11,0(%rdi)
1344	vmovdqu	%ymm9,32(%rdi)
1345	vmovdqu	%ymm0,64(%rdi)
1346	vmovdqu	%ymm4,96(%rdi)
1347	leaq	128(%rdi),%rdi
1348
1349	subq	$512,%rdx
1350	jnz	L$oop_outer8x
1351
1352	jmp	L$done8x
1353
# L$tail8x: partial-chunk dispatcher, entered with rdx < 512 bytes remaining.
# Thresholds are tested from largest to smallest so control reaches the handler
# for the largest multiple of 64 that fits in rdx. jae is the unsigned
# comparison. A taken jae leaves the flags of its cmpq in place; every
# handler later uses ZF from that compare to detect rdx == threshold.
1354L$tail8x:
1355	cmpq	$448,%rdx
1356	jae	L$448_or_more8x
1357	cmpq	$384,%rdx
1358	jae	L$384_or_more8x
1359	cmpq	$320,%rdx
1360	jae	L$320_or_more8x
1361	cmpq	$256,%rdx
1362	jae	L$256_or_more8x
1363	cmpq	$192,%rdx
1364	jae	L$192_or_more8x
1365	cmpq	$128,%rdx
1366	jae	L$128_or_more8x
1367	cmpq	$64,%rdx
1368	jae	L$64_or_more8x
1369
# Fewer than 64 bytes remain, so nothing can be handled 32 bytes at a time.
# Clear the byte index r10 and spill ymm6/ymm8 to 0(%rsp)..63(%rsp), the
# buffer that L$oop_tail8x XORs against the input one byte at a time.
# vmovdqa is the aligned store form. NOTE(review): rsp is presumably 32-byte
# aligned by the prologue, which is outside this view -- confirm.
1370	xorq	%r10,%r10
1371	vmovdqa	%ymm6,0(%rsp)
1372	vmovdqa	%ymm8,32(%rsp)
1373	jmp	L$oop_tail8x
1374
# L$64_or_more8x: reached from L$tail8x with 64 <= rdx < 128.
# XOR the first 64 input bytes at rsi with ymm6, ymm8 and store them to rdi.
# The label is aligned to 32 bytes (2^5).
1375.p2align	5
1376L$64_or_more8x:
1377	vpxor	0(%rsi),%ymm6,%ymm6
1378	vpxor	32(%rsi),%ymm8,%ymm8
1379	vmovdqu	%ymm6,0(%rdi)
1380	vmovdqu	%ymm8,32(%rdi)
# ZF still comes from cmpq $64,%rdx in L$tail8x: vpxor and vmovdqu do not
# write EFLAGS, so je is taken exactly when rdx == 64 and nothing is left.
1381	je	L$done8x
1382
# Between 1 and 63 bytes remain. Step both pointers past the 64 bytes already
# written, clear the byte index r10, reduce rdx to the leftover count, and
# spill the next 64 bytes of ymm data (ymm1, ymm5) to 0(%rsp)/32(%rsp), where
# L$oop_tail8x reads it. vmovdqa is the aligned store form.
# NOTE(review): rsp is presumably 32-byte aligned by the prologue, which is
# outside this view -- confirm.
1383	leaq	64(%rsi),%rsi
1384	xorq	%r10,%r10
1385	vmovdqa	%ymm1,0(%rsp)
1386	leaq	64(%rdi),%rdi
1387	subq	$64,%rdx
1388	vmovdqa	%ymm5,32(%rsp)
1389	jmp	L$oop_tail8x
1390
# L$128_or_more8x: reached from L$tail8x with 128 <= rdx < 192.
# XOR the first 128 input bytes at rsi with ymm6, ymm8, ymm1, ymm5 and store
# them to rdi. The label is aligned to 32 bytes (2^5).
1391.p2align	5
1392L$128_or_more8x:
1393	vpxor	0(%rsi),%ymm6,%ymm6
1394	vpxor	32(%rsi),%ymm8,%ymm8
1395	vpxor	64(%rsi),%ymm1,%ymm1
1396	vpxor	96(%rsi),%ymm5,%ymm5
1397	vmovdqu	%ymm6,0(%rdi)
1398	vmovdqu	%ymm8,32(%rdi)
1399	vmovdqu	%ymm1,64(%rdi)
1400	vmovdqu	%ymm5,96(%rdi)
# ZF still comes from cmpq $128,%rdx in L$tail8x: vpxor and vmovdqu do not
# write EFLAGS, so je is taken exactly when rdx == 128 and nothing is left.
1401	je	L$done8x
1402
# Between 1 and 63 bytes remain. Step both pointers past the 128 bytes already
# written, clear the byte index r10, reduce rdx to the leftover count, and
# spill the next 64 bytes of ymm data (ymm12, ymm13) to 0(%rsp)/32(%rsp),
# where L$oop_tail8x reads it. vmovdqa is the aligned store form.
# NOTE(review): rsp is presumably 32-byte aligned by the prologue, which is
# outside this view -- confirm.
1403	leaq	128(%rsi),%rsi
1404	xorq	%r10,%r10
1405	vmovdqa	%ymm12,0(%rsp)
1406	leaq	128(%rdi),%rdi
1407	subq	$128,%rdx
1408	vmovdqa	%ymm13,32(%rsp)
1409	jmp	L$oop_tail8x
1410
# L$192_or_more8x: reached from L$tail8x with 192 <= rdx < 256.
# XOR the first 192 input bytes at rsi with ymm6, ymm8, ymm1, ymm5, ymm12,
# ymm13 and store them to rdi. The label is aligned to 32 bytes (2^5).
1411.p2align	5
1412L$192_or_more8x:
1413	vpxor	0(%rsi),%ymm6,%ymm6
1414	vpxor	32(%rsi),%ymm8,%ymm8
1415	vpxor	64(%rsi),%ymm1,%ymm1
1416	vpxor	96(%rsi),%ymm5,%ymm5
1417	vpxor	128(%rsi),%ymm12,%ymm12
1418	vpxor	160(%rsi),%ymm13,%ymm13
1419	vmovdqu	%ymm6,0(%rdi)
1420	vmovdqu	%ymm8,32(%rdi)
1421	vmovdqu	%ymm1,64(%rdi)
1422	vmovdqu	%ymm5,96(%rdi)
1423	vmovdqu	%ymm12,128(%rdi)
1424	vmovdqu	%ymm13,160(%rdi)
# ZF still comes from cmpq $192,%rdx in L$tail8x: vpxor and vmovdqu do not
# write EFLAGS, so je is taken exactly when rdx == 192 and nothing is left.
1425	je	L$done8x
1426
# Between 1 and 63 bytes remain. Step both pointers past the 192 bytes already
# written, clear the byte index r10, reduce rdx to the leftover count, and
# spill the next 64 bytes of ymm data (ymm10, ymm15) to 0(%rsp)/32(%rsp),
# where L$oop_tail8x reads it. vmovdqa is the aligned store form.
# NOTE(review): rsp is presumably 32-byte aligned by the prologue, which is
# outside this view -- confirm.
1427	leaq	192(%rsi),%rsi
1428	xorq	%r10,%r10
1429	vmovdqa	%ymm10,0(%rsp)
1430	leaq	192(%rdi),%rdi
1431	subq	$192,%rdx
1432	vmovdqa	%ymm15,32(%rsp)
1433	jmp	L$oop_tail8x
1434
# L$256_or_more8x: reached from L$tail8x with 256 <= rdx < 320.
# XOR the first 256 input bytes at rsi with ymm6, ymm8, ymm1, ymm5, ymm12,
# ymm13, ymm10, ymm15 and store them to rdi.
# The label is aligned to 32 bytes (2^5).
1435.p2align	5
1436L$256_or_more8x:
1437	vpxor	0(%rsi),%ymm6,%ymm6
1438	vpxor	32(%rsi),%ymm8,%ymm8
1439	vpxor	64(%rsi),%ymm1,%ymm1
1440	vpxor	96(%rsi),%ymm5,%ymm5
1441	vpxor	128(%rsi),%ymm12,%ymm12
1442	vpxor	160(%rsi),%ymm13,%ymm13
1443	vpxor	192(%rsi),%ymm10,%ymm10
1444	vpxor	224(%rsi),%ymm15,%ymm15
1445	vmovdqu	%ymm6,0(%rdi)
1446	vmovdqu	%ymm8,32(%rdi)
1447	vmovdqu	%ymm1,64(%rdi)
1448	vmovdqu	%ymm5,96(%rdi)
1449	vmovdqu	%ymm12,128(%rdi)
1450	vmovdqu	%ymm13,160(%rdi)
1451	vmovdqu	%ymm10,192(%rdi)
1452	vmovdqu	%ymm15,224(%rdi)
# ZF still comes from cmpq $256,%rdx in L$tail8x: vpxor and vmovdqu do not
# write EFLAGS, so je is taken exactly when rdx == 256 and nothing is left.
1453	je	L$done8x
1454
# Between 1 and 63 bytes remain. Step both pointers past the 256 bytes already
# written, clear the byte index r10, reduce rdx to the leftover count, and
# spill the next 64 bytes of ymm data (ymm14, ymm2) to 0(%rsp)/32(%rsp),
# where L$oop_tail8x reads it. vmovdqa is the aligned store form.
# NOTE(review): rsp is presumably 32-byte aligned by the prologue, which is
# outside this view -- confirm.
1455	leaq	256(%rsi),%rsi
1456	xorq	%r10,%r10
1457	vmovdqa	%ymm14,0(%rsp)
1458	leaq	256(%rdi),%rdi
1459	subq	$256,%rdx
1460	vmovdqa	%ymm2,32(%rsp)
1461	jmp	L$oop_tail8x
1462
# L$320_or_more8x: reached from L$tail8x with 320 <= rdx < 384.
# XOR the first 320 input bytes at rsi with ymm6, ymm8, ymm1, ymm5, ymm12,
# ymm13, ymm10, ymm15, ymm14, ymm2 and store them to rdi.
# The label is aligned to 32 bytes (2^5).
1463.p2align	5
1464L$320_or_more8x:
1465	vpxor	0(%rsi),%ymm6,%ymm6
1466	vpxor	32(%rsi),%ymm8,%ymm8
1467	vpxor	64(%rsi),%ymm1,%ymm1
1468	vpxor	96(%rsi),%ymm5,%ymm5
1469	vpxor	128(%rsi),%ymm12,%ymm12
1470	vpxor	160(%rsi),%ymm13,%ymm13
1471	vpxor	192(%rsi),%ymm10,%ymm10
1472	vpxor	224(%rsi),%ymm15,%ymm15
1473	vpxor	256(%rsi),%ymm14,%ymm14
1474	vpxor	288(%rsi),%ymm2,%ymm2
1475	vmovdqu	%ymm6,0(%rdi)
1476	vmovdqu	%ymm8,32(%rdi)
1477	vmovdqu	%ymm1,64(%rdi)
1478	vmovdqu	%ymm5,96(%rdi)
1479	vmovdqu	%ymm12,128(%rdi)
1480	vmovdqu	%ymm13,160(%rdi)
1481	vmovdqu	%ymm10,192(%rdi)
1482	vmovdqu	%ymm15,224(%rdi)
1483	vmovdqu	%ymm14,256(%rdi)
1484	vmovdqu	%ymm2,288(%rdi)
# ZF still comes from cmpq $320,%rdx in L$tail8x: vpxor and vmovdqu do not
# write EFLAGS, so je is taken exactly when rdx == 320 and nothing is left.
1485	je	L$done8x
1486
# Between 1 and 63 bytes remain. Step both pointers past the 320 bytes already
# written, clear the byte index r10, reduce rdx to the leftover count, and
# spill the next 64 bytes of ymm data (ymm3, ymm7) to 0(%rsp)/32(%rsp),
# where L$oop_tail8x reads it. vmovdqa is the aligned store form.
# NOTE(review): rsp is presumably 32-byte aligned by the prologue, which is
# outside this view -- confirm.
1487	leaq	320(%rsi),%rsi
1488	xorq	%r10,%r10
1489	vmovdqa	%ymm3,0(%rsp)
1490	leaq	320(%rdi),%rdi
1491	subq	$320,%rdx
1492	vmovdqa	%ymm7,32(%rsp)
1493	jmp	L$oop_tail8x
1494
# L$384_or_more8x: reached from L$tail8x with 384 <= rdx < 448.
# XOR the first 384 input bytes at rsi with ymm6, ymm8, ymm1, ymm5, ymm12,
# ymm13, ymm10, ymm15, ymm14, ymm2, ymm3, ymm7 and store them to rdi.
# The label is aligned to 32 bytes (2^5).
1495.p2align	5
1496L$384_or_more8x:
1497	vpxor	0(%rsi),%ymm6,%ymm6
1498	vpxor	32(%rsi),%ymm8,%ymm8
1499	vpxor	64(%rsi),%ymm1,%ymm1
1500	vpxor	96(%rsi),%ymm5,%ymm5
1501	vpxor	128(%rsi),%ymm12,%ymm12
1502	vpxor	160(%rsi),%ymm13,%ymm13
1503	vpxor	192(%rsi),%ymm10,%ymm10
1504	vpxor	224(%rsi),%ymm15,%ymm15
1505	vpxor	256(%rsi),%ymm14,%ymm14
1506	vpxor	288(%rsi),%ymm2,%ymm2
1507	vpxor	320(%rsi),%ymm3,%ymm3
1508	vpxor	352(%rsi),%ymm7,%ymm7
1509	vmovdqu	%ymm6,0(%rdi)
1510	vmovdqu	%ymm8,32(%rdi)
1511	vmovdqu	%ymm1,64(%rdi)
1512	vmovdqu	%ymm5,96(%rdi)
1513	vmovdqu	%ymm12,128(%rdi)
1514	vmovdqu	%ymm13,160(%rdi)
1515	vmovdqu	%ymm10,192(%rdi)
1516	vmovdqu	%ymm15,224(%rdi)
1517	vmovdqu	%ymm14,256(%rdi)
1518	vmovdqu	%ymm2,288(%rdi)
1519	vmovdqu	%ymm3,320(%rdi)
1520	vmovdqu	%ymm7,352(%rdi)
# ZF still comes from cmpq $384,%rdx in L$tail8x: vpxor and vmovdqu do not
# write EFLAGS, so je is taken exactly when rdx == 384 and nothing is left.
1521	je	L$done8x
1522
# Between 1 and 63 bytes remain. Step both pointers past the 384 bytes already
# written, clear the byte index r10, reduce rdx to the leftover count, and
# spill the next 64 bytes of ymm data (ymm11, ymm9) to 0(%rsp)/32(%rsp),
# where L$oop_tail8x reads it. vmovdqa is the aligned store form.
# NOTE(review): rsp is presumably 32-byte aligned by the prologue, which is
# outside this view -- confirm.
1523	leaq	384(%rsi),%rsi
1524	xorq	%r10,%r10
1525	vmovdqa	%ymm11,0(%rsp)
1526	leaq	384(%rdi),%rdi
1527	subq	$384,%rdx
1528	vmovdqa	%ymm9,32(%rsp)
1529	jmp	L$oop_tail8x
1530
# L$448_or_more8x: reached from L$tail8x with rdx >= 448. The visible entry to
# L$tail8x is the jb after cmpq $512,%rdx, so rdx is also below 512 here.
# XOR the first 448 input bytes at rsi with ymm6, ymm8, ymm1, ymm5, ymm12,
# ymm13, ymm10, ymm15, ymm14, ymm2, ymm3, ymm7, ymm11, ymm9 and store them to
# rdi. The label is aligned to 32 bytes (2^5).
1531.p2align	5
1532L$448_or_more8x:
1533	vpxor	0(%rsi),%ymm6,%ymm6
1534	vpxor	32(%rsi),%ymm8,%ymm8
1535	vpxor	64(%rsi),%ymm1,%ymm1
1536	vpxor	96(%rsi),%ymm5,%ymm5
1537	vpxor	128(%rsi),%ymm12,%ymm12
1538	vpxor	160(%rsi),%ymm13,%ymm13
1539	vpxor	192(%rsi),%ymm10,%ymm10
1540	vpxor	224(%rsi),%ymm15,%ymm15
1541	vpxor	256(%rsi),%ymm14,%ymm14
1542	vpxor	288(%rsi),%ymm2,%ymm2
1543	vpxor	320(%rsi),%ymm3,%ymm3
1544	vpxor	352(%rsi),%ymm7,%ymm7
1545	vpxor	384(%rsi),%ymm11,%ymm11
1546	vpxor	416(%rsi),%ymm9,%ymm9
1547	vmovdqu	%ymm6,0(%rdi)
1548	vmovdqu	%ymm8,32(%rdi)
1549	vmovdqu	%ymm1,64(%rdi)
1550	vmovdqu	%ymm5,96(%rdi)
1551	vmovdqu	%ymm12,128(%rdi)
1552	vmovdqu	%ymm13,160(%rdi)
1553	vmovdqu	%ymm10,192(%rdi)
1554	vmovdqu	%ymm15,224(%rdi)
1555	vmovdqu	%ymm14,256(%rdi)
1556	vmovdqu	%ymm2,288(%rdi)
1557	vmovdqu	%ymm3,320(%rdi)
1558	vmovdqu	%ymm7,352(%rdi)
1559	vmovdqu	%ymm11,384(%rdi)
1560	vmovdqu	%ymm9,416(%rdi)
# ZF still comes from cmpq $448,%rdx in L$tail8x: vpxor and vmovdqu do not
# write EFLAGS, so je is taken exactly when rdx == 448 and nothing is left.
1561	je	L$done8x
1562
# Between 1 and 63 bytes remain. Step both pointers past the 448 bytes already
# written, clear the byte index r10, reduce rdx to the leftover count, and
# spill the last 64 bytes of ymm data (ymm0, ymm4) to 0(%rsp)/32(%rsp).
# There is no jmp: execution falls through into L$oop_tail8x, which follows.
# vmovdqa is the aligned store form. NOTE(review): rsp is presumably 32-byte
# aligned by the prologue, which is outside this view -- confirm.
1563	leaq	448(%rsi),%rsi
1564	xorq	%r10,%r10
1565	vmovdqa	%ymm0,0(%rsp)
1566	leaq	448(%rdi),%rdi
1567	subq	$448,%rdx
1568	vmovdqa	%ymm4,32(%rsp)
1569
# L$oop_tail8x: byte-at-a-time XOR of the last rdx bytes.
#   rsi, rdi  = input and output positions for the leftover bytes
#   r10       = byte index; every visible path that reaches here zeroes it
#   0(%rsp).. = 64 bytes of ymm data spilled by the path that came here
#   rdx       = bytes still to process
# The count is tested only after an iteration, so rdx must be at least 1 on
# entry. The visible paths arrive with rdx below 64, which keeps the reads
# inside the 64 spilled bytes.
1570L$oop_tail8x:
1571	movzbl	(%rsi,%r10,1),%eax	# eax = input byte at index r10
1572	movzbl	(%rsp,%r10,1),%ecx	# ecx = spilled byte at the same index
1573	leaq	1(%r10),%r10	# advance the index
1574	xorl	%ecx,%eax
1575	movb	%al,-1(%rdi,%r10,1)	# the -1 undoes the increment: store at the index just read
1576	decq	%rdx
1577	jnz	L$oop_tail8x	# repeat until the count reaches zero
1578
# L$done8x: common exit of the 8x path, reached from the whole-chunk loop, from
# every partial-chunk handler when nothing is left, and by falling out of the
# byte loop.
1579L$done8x:
# Zero all ymm registers, so the values used for the XORs above do not stay in
# vector registers after the return.
1580	vzeroall
# Reload rsp from r9. NOTE(review): r9 presumably holds the stack pointer the
# prologue saved before reserving and aligning the frame; that code is outside
# this view -- confirm.
1581	leaq	(%r9),%rsp
1582L$8x_epilogue:
# Raw bytes F3 C3: a REP prefix followed by the RET opcode, i.e. rep ret.
1583	.byte	0xf3,0xc3
1584
1585#endif
1586