• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# This file is generated from a similarly-named Perl script in the BoringSSL
2# source tree. Do not edit by hand.
3
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
11#include "ring_core_generated/prefix_symbols_asm.h"
12.text
13
14
15
# Constant tables used by the ChaCha20 code paths in this file.
# Each .long table below is consumed by paddd/vpaddd, each .byte table by
# pshufb/vpshufb or as an aligned 16-byte load.
16.p2align	6
# L$zero: four zero dwords; not referenced in the visible part of this file.
17L$zero:
18.long	0,0,0,0
# L$one: added to the row holding words 12..15 by the integer and SSSE3
# paths, which bumps only the lowest dword (the block counter) by one.
19L$one:
20.long	1,0,0,0
# L$inc: added to the broadcast counter word by the 4x path so the four
# lanes carry counter+0 .. counter+3.
21L$inc:
22.long	0,1,2,3
# L$four: per-iteration counter step of the 4x path (four blocks per pass).
23L$four:
24.long	4,4,4,4
# L$incy: per-lane counter offsets added once by the 8x path.
25L$incy:
26.long	0,2,4,6,1,3,5,7
# L$eight: per-iteration counter step of the 8x path.
27L$eight:
28.long	8,8,8,8,8,8,8,8
# L$rot16: byte-shuffle mask; within every dword it swaps the two 16-bit
# halves, which is a rotate by 16 bits.
29L$rot16:
30.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
# L$rot24: byte-shuffle mask; within every dword result byte 0 comes from
# source byte 3 and the remaining bytes move up one place, which is a
# rotate left by 8 bits.
31L$rot24:
32.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
# L$sigma: ASCII "expand 32-byte k" followed by a zero byte; loaded as
# state words 0..3 by the vector paths.
33L$sigma:
34.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
35.p2align	6
# L$zeroz, L$fourz, L$incz and L$sixteen are 16-dword tables that no code in
# the visible part of this file references.
# NOTE(review): presumably left over from a wider-vector code path that is
# not emitted here; confirm against the generator script.
36L$zeroz:
37.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
38L$fourz:
39.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
40L$incz:
41.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
42L$sixteen:
43.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
# Identification string, zero terminated:
# "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
44.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
# _ChaCha20_ctr32: XOR a byte stream with a ChaCha20 keystream.
#
# Register inputs, as the code below consumes them:
#   %rdi = destination buffer (written)
#   %rsi = source buffer (read)
#   %rdx = byte count (zero returns immediately)
#   %rcx = pointer to 32 bytes loaded as state words 4..11
#   %r8  = pointer to 16 bytes loaded as state words 12..15
# NOTE(review): presumably %rcx is the key and %r8 the counter plus nonce
# block; confirm against the C prototype.
#
# Entry dispatch: a set bit 9 in the dword at _OPENSSL_ia32cap_P+4 sends
# control to L$ChaCha20_ssse3.
# NOTE(review): that bit is presumably the SSSE3 feature flag; confirm
# against the capability vector documentation.
#
# The fallback below keeps the 16 state words in integer registers:
#   words 0..3   in %eax, %ebx, %ecx, %edx
#   words 4..7   in %r8d..%r11d
#   words 8..11  two at a time in %esi/%edi, the other pair parked at
#                32..47(%rsp)
#   words 12..15 in %r12d..%r15d
# Frame after the six pushes and the subq:
#   [rsp+16..31]  input words 4..7
#   [rsp+32..47]  words 8..11 (working copy during rounds, input copy
#                 between blocks)
#   [rsp+48..63]  input words 12..15; word 12 is bumped by one per block
#   [rsp+64..87]  saved remaining length, source pointer, destination pointer
#   [rsp+0..63]   reused as the keystream buffer for a final partial block
45.globl	_ChaCha20_ctr32
46.private_extern _ChaCha20_ctr32
47
48.p2align	6
49_ChaCha20_ctr32:
50
# Zero byte count: return without touching anything.
51	cmpq	$0,%rdx
52	je	L$no_data
# %r10 keeps the 8 capability bytes; the 4x code reached through the SSSE3
# entry inspects both halves of it again.
53	movq	_OPENSSL_ia32cap_P+4(%rip),%r10
54	testl	$512,%r10d
55	jnz	L$ChaCha20_ssse3
56
# Save the callee-saved registers this path overwrites.
57	pushq	%rbx
58
59	pushq	%rbp
60
61	pushq	%r12
62
63	pushq	%r13
64
65	pushq	%r14
66
67	pushq	%r15
68
# Reserve 88 bytes: 64 for state or keystream plus three 8-byte save slots.
69	subq	$64+24,%rsp
70
71L$ctr32_body:
72
73
# %xmm1/%xmm2 = words 4..7 / 8..11, %xmm3 = words 12..15, %xmm4 = counter step.
74	movdqu	(%rcx),%xmm1
75	movdqu	16(%rcx),%xmm2
76	movdqu	(%r8),%xmm3
77	movdqa	L$one(%rip),%xmm4
78
79
# Keep the input state on the stack; %rbp carries the remaining byte count.
80	movdqa	%xmm1,16(%rsp)
81	movdqa	%xmm2,32(%rsp)
82	movdqa	%xmm3,48(%rsp)
83	movq	%rdx,%rbp
84	jmp	L$oop_outer
85
86.p2align	5
# One pass per 64-byte block: load the working state into registers.
87L$oop_outer:
# Words 0..3 are immediates; as little-endian bytes they spell the same text
# as L$sigma.
88	movl	$0x61707865,%eax
89	movl	$0x3320646e,%ebx
90	movl	$0x79622d32,%ecx
91	movl	$0x6b206574,%edx
92	movl	16(%rsp),%r8d
93	movl	20(%rsp),%r9d
94	movl	24(%rsp),%r10d
95	movl	28(%rsp),%r11d
# Word 12 (current block counter) comes from %xmm3, words 13..15 from the stack.
96	movd	%xmm3,%r12d
97	movl	52(%rsp),%r13d
98	movl	56(%rsp),%r14d
99	movl	60(%rsp),%r15d
100
# Park length and buffer pointers so %rbp, %rsi, %rdi can be used in rounds.
101	movq	%rbp,64+0(%rsp)
# %ebp = number of passes through L$oop.
102	movl	$10,%ebp
103	movq	%rsi,64+8(%rsp)
# The raw bytes below encode movq %xmm2,%rsi: %rsi = words 8 and 9.
104.byte	102,72,15,126,214
105	movq	%rdi,64+16(%rsp)
# Split the pair: %esi = word 8, %edi = word 9.
106	movq	%rsi,%rdi
107	shrq	$32,%rdi
108	jmp	L$oop
109
110.p2align	5
# Each pass applies eight quarter-rounds: four on columns, then four on
# diagonals. Every quarter-round is add, xor, rotate by 16, 12, 8 and 7.
111L$oop:
# Column quarter-rounds on words (0,4,8,12) and (1,5,9,13), interleaved.
112	addl	%r8d,%eax
113	xorl	%eax,%r12d
114	roll	$16,%r12d
115	addl	%r9d,%ebx
116	xorl	%ebx,%r13d
117	roll	$16,%r13d
118	addl	%r12d,%esi
119	xorl	%esi,%r8d
120	roll	$12,%r8d
121	addl	%r13d,%edi
122	xorl	%edi,%r9d
123	roll	$12,%r9d
124	addl	%r8d,%eax
125	xorl	%eax,%r12d
126	roll	$8,%r12d
127	addl	%r9d,%ebx
128	xorl	%ebx,%r13d
129	roll	$8,%r13d
130	addl	%r12d,%esi
131	xorl	%esi,%r8d
132	roll	$7,%r8d
133	addl	%r13d,%edi
134	xorl	%edi,%r9d
135	roll	$7,%r9d
# Swap the resident pair: park words 8,9 and fetch words 10,11.
136	movl	%esi,32(%rsp)
137	movl	%edi,36(%rsp)
138	movl	40(%rsp),%esi
139	movl	44(%rsp),%edi
# Column quarter-rounds on words (2,6,10,14) and (3,7,11,15).
140	addl	%r10d,%ecx
141	xorl	%ecx,%r14d
142	roll	$16,%r14d
143	addl	%r11d,%edx
144	xorl	%edx,%r15d
145	roll	$16,%r15d
146	addl	%r14d,%esi
147	xorl	%esi,%r10d
148	roll	$12,%r10d
149	addl	%r15d,%edi
150	xorl	%edi,%r11d
151	roll	$12,%r11d
152	addl	%r10d,%ecx
153	xorl	%ecx,%r14d
154	roll	$8,%r14d
155	addl	%r11d,%edx
156	xorl	%edx,%r15d
157	roll	$8,%r15d
158	addl	%r14d,%esi
159	xorl	%esi,%r10d
160	roll	$7,%r10d
161	addl	%r15d,%edi
162	xorl	%edi,%r11d
163	roll	$7,%r11d
# Diagonal quarter-rounds on words (0,5,10,15) and (1,6,11,12).
164	addl	%r9d,%eax
165	xorl	%eax,%r15d
166	roll	$16,%r15d
167	addl	%r10d,%ebx
168	xorl	%ebx,%r12d
169	roll	$16,%r12d
170	addl	%r15d,%esi
171	xorl	%esi,%r9d
172	roll	$12,%r9d
173	addl	%r12d,%edi
174	xorl	%edi,%r10d
175	roll	$12,%r10d
176	addl	%r9d,%eax
177	xorl	%eax,%r15d
178	roll	$8,%r15d
179	addl	%r10d,%ebx
180	xorl	%ebx,%r12d
181	roll	$8,%r12d
182	addl	%r15d,%esi
183	xorl	%esi,%r9d
184	roll	$7,%r9d
185	addl	%r12d,%edi
186	xorl	%edi,%r10d
187	roll	$7,%r10d
# Swap back: park words 10,11 and fetch words 8,9.
188	movl	%esi,40(%rsp)
189	movl	%edi,44(%rsp)
190	movl	32(%rsp),%esi
191	movl	36(%rsp),%edi
# Diagonal quarter-rounds on words (2,7,8,13) and (3,4,9,14).
192	addl	%r11d,%ecx
193	xorl	%ecx,%r13d
194	roll	$16,%r13d
195	addl	%r8d,%edx
196	xorl	%edx,%r14d
197	roll	$16,%r14d
198	addl	%r13d,%esi
199	xorl	%esi,%r11d
200	roll	$12,%r11d
201	addl	%r14d,%edi
202	xorl	%edi,%r8d
203	roll	$12,%r8d
204	addl	%r11d,%ecx
205	xorl	%ecx,%r13d
206	roll	$8,%r13d
207	addl	%r8d,%edx
208	xorl	%edx,%r14d
209	roll	$8,%r14d
210	addl	%r13d,%esi
211	xorl	%esi,%r11d
212	roll	$7,%r11d
213	addl	%r14d,%edi
214	xorl	%edi,%r8d
215	roll	$7,%r8d
216	decl	%ebp
217	jnz	L$oop
# Rounds done: complete the working copy of words 8..11 at 32..47(%rsp),
# restore length and pointers, and advance the counter kept in %xmm3.
218	movl	%edi,36(%rsp)
219	movl	%esi,32(%rsp)
220	movq	64(%rsp),%rbp
221	movdqa	%xmm2,%xmm1
222	movq	64+8(%rsp),%rsi
223	paddd	%xmm4,%xmm3
224	movq	64+16(%rsp),%rdi
225
# Add the input state to the working state to form the keystream block.
226	addl	$0x61707865,%eax
227	addl	$0x3320646e,%ebx
228	addl	$0x79622d32,%ecx
229	addl	$0x6b206574,%edx
230	addl	16(%rsp),%r8d
231	addl	20(%rsp),%r9d
232	addl	24(%rsp),%r10d
233	addl	28(%rsp),%r11d
# 48(%rsp) still holds the counter value this block started with.
234	addl	48(%rsp),%r12d
235	addl	52(%rsp),%r13d
236	addl	56(%rsp),%r14d
237	addl	60(%rsp),%r15d
# %xmm1 = input words 8..11 (copied from %xmm2) plus their working values.
238	paddd	32(%rsp),%xmm1
239
# Fewer than 64 bytes left: finish byte by byte.
240	cmpq	$64,%rbp
241	jb	L$tail
242
# Full block: XOR 64 source bytes with the keystream.
243	xorl	0(%rsi),%eax
244	xorl	4(%rsi),%ebx
245	xorl	8(%rsi),%ecx
246	xorl	12(%rsi),%edx
247	xorl	16(%rsi),%r8d
248	xorl	20(%rsi),%r9d
249	xorl	24(%rsi),%r10d
250	xorl	28(%rsi),%r11d
251	movdqu	32(%rsi),%xmm0
252	xorl	48(%rsi),%r12d
253	xorl	52(%rsi),%r13d
254	xorl	56(%rsi),%r14d
255	xorl	60(%rsi),%r15d
256	leaq	64(%rsi),%rsi
257	pxor	%xmm1,%xmm0
258
# Re-arm the stack copy for the next block: input words 8..11 and the
# already incremented counter word.
259	movdqa	%xmm2,32(%rsp)
260	movd	%xmm3,48(%rsp)
261
# Write the 64 result bytes.
262	movl	%eax,0(%rdi)
263	movl	%ebx,4(%rdi)
264	movl	%ecx,8(%rdi)
265	movl	%edx,12(%rdi)
266	movl	%r8d,16(%rdi)
267	movl	%r9d,20(%rdi)
268	movl	%r10d,24(%rdi)
269	movl	%r11d,28(%rdi)
270	movdqu	%xmm0,32(%rdi)
271	movl	%r12d,48(%rdi)
272	movl	%r13d,52(%rdi)
273	movl	%r14d,56(%rdi)
274	movl	%r15d,60(%rdi)
275	leaq	64(%rdi),%rdi
276
277	subq	$64,%rbp
278	jnz	L$oop_outer
279
280	jmp	L$done
281
282.p2align	4
# Final partial block: spill the 64 keystream bytes to 0..63(%rsp), which
# overwrites the saved input state (no further block follows), and use
# %rbx as the byte index.
283L$tail:
284	movl	%eax,0(%rsp)
285	movl	%ebx,4(%rsp)
286	xorq	%rbx,%rbx
287	movl	%ecx,8(%rsp)
288	movl	%edx,12(%rsp)
289	movl	%r8d,16(%rsp)
290	movl	%r9d,20(%rsp)
291	movl	%r10d,24(%rsp)
292	movl	%r11d,28(%rsp)
293	movdqa	%xmm1,32(%rsp)
294	movl	%r12d,48(%rsp)
295	movl	%r13d,52(%rsp)
296	movl	%r14d,56(%rsp)
297	movl	%r15d,60(%rsp)
298
# XOR one byte per pass until the remaining count in %rbp reaches zero.
299L$oop_tail:
300	movzbl	(%rsi,%rbx,1),%eax
301	movzbl	(%rsp,%rbx,1),%edx
302	leaq	1(%rbx),%rbx
303	xorl	%edx,%eax
304	movb	%al,-1(%rdi,%rbx,1)
305	decq	%rbp
306	jnz	L$oop_tail
307
# Epilogue: %rsi = stack pointer value before the six pushes; reload the
# saved registers relative to it and release the frame.
308L$done:
309	leaq	64+24+48(%rsp),%rsi
310	movq	-48(%rsi),%r15
311
312	movq	-40(%rsi),%r14
313
314	movq	-32(%rsi),%r13
315
316	movq	-24(%rsi),%r12
317
318	movq	-16(%rsi),%rbp
319
320	movq	-8(%rsi),%rbx
321
322	leaq	(%rsi),%rsp
323
324L$no_data:
# The two bytes below encode "rep ret".
325	.byte	0xf3,0xc3
326
327
328
# ChaCha20_ssse3: ChaCha20 keystream XOR, one 64-byte block per outer pass,
# with each of the four state rows held in one 128-bit register.
#
# Reached only by jumps: from _ChaCha20_ctr32 at L$ChaCha20_ssse3, and from
# the 4x code at L$do_sse3_after_all (which has already set %r9 = %rsp).
# Register inputs are those of _ChaCha20_ctr32: %rdi destination, %rsi
# source, %rdx byte count, %rcx pointer to words 4..11, %r8 pointer to
# words 12..15. Requests above 128 bytes are handed to L$ChaCha20_4x.
#
# Register roles inside the loops:
#   %xmm0..%xmm3  rows of the state (words 0..3, 4..7, 8..11, 12..15)
#   %xmm4, %xmm5  scratch
#   %xmm6, %xmm7  shuffle masks L$rot16 and L$rot24
#   %r8           pass counter for L$oop_ssse3 (the pointer it held has
#                 been consumed by then)
#   %r9           stack pointer value on entry, restored at exit
# Frame: 0..63(%rsp) holds the input state with the current counter row at
# 48(%rsp); a final partial block reuses it as the keystream buffer.
329.p2align	5
330ChaCha20_ssse3:
331L$ChaCha20_ssse3:
332
333	movq	%rsp,%r9
334
335	cmpq	$128,%rdx
336	ja	L$ChaCha20_4x
337
338L$do_sse3_after_all:
339	subq	$64+8,%rsp
340	movdqa	L$sigma(%rip),%xmm0
341	movdqu	(%rcx),%xmm1
342	movdqu	16(%rcx),%xmm2
343	movdqu	(%r8),%xmm3
344	movdqa	L$rot16(%rip),%xmm6
345	movdqa	L$rot24(%rip),%xmm7
346
# Keep a copy of the input state for the final addition and for reloads.
347	movdqa	%xmm0,0(%rsp)
348	movdqa	%xmm1,16(%rsp)
349	movdqa	%xmm2,32(%rsp)
350	movdqa	%xmm3,48(%rsp)
351	movq	$10,%r8
352	jmp	L$oop_ssse3
353
354.p2align	5
# Subsequent blocks: reload rows 0..2, add L$one to the stored counter row
# and write the new counter row back.
355L$oop_outer_ssse3:
356	movdqa	L$one(%rip),%xmm3
357	movdqa	0(%rsp),%xmm0
358	movdqa	16(%rsp),%xmm1
359	movdqa	32(%rsp),%xmm2
360	paddd	48(%rsp),%xmm3
361	movq	$10,%r8
362	movdqa	%xmm3,48(%rsp)
363	jmp	L$oop_ssse3
364
365.p2align	5
# Each pass runs the quarter-round on all four columns at once, rotates
# rows 1..3 so the diagonals line up as columns, runs it again, and rotates
# the rows back. Rotations by 16 and 8 bits use pshufb with %xmm6/%xmm7;
# rotations by 12 and 7 bits use a shift pair combined with por.
# The raw byte sequences 102,15,56,0,222 and 102,15,56,0,223 encode
# pshufb %xmm6,%xmm3 and pshufb %xmm7,%xmm3.
366L$oop_ssse3:
367	paddd	%xmm1,%xmm0
368	pxor	%xmm0,%xmm3
369.byte	102,15,56,0,222
370	paddd	%xmm3,%xmm2
371	pxor	%xmm2,%xmm1
372	movdqa	%xmm1,%xmm4
373	psrld	$20,%xmm1
374	pslld	$12,%xmm4
375	por	%xmm4,%xmm1
376	paddd	%xmm1,%xmm0
377	pxor	%xmm0,%xmm3
378.byte	102,15,56,0,223
379	paddd	%xmm3,%xmm2
380	pxor	%xmm2,%xmm1
381	movdqa	%xmm1,%xmm4
382	psrld	$25,%xmm1
383	pslld	$7,%xmm4
384	por	%xmm4,%xmm1
# Rotate row 2 by two dwords, row 1 by one and row 3 by three.
385	pshufd	$78,%xmm2,%xmm2
386	pshufd	$57,%xmm1,%xmm1
387	pshufd	$147,%xmm3,%xmm3
388	nop
389	paddd	%xmm1,%xmm0
390	pxor	%xmm0,%xmm3
391.byte	102,15,56,0,222
392	paddd	%xmm3,%xmm2
393	pxor	%xmm2,%xmm1
394	movdqa	%xmm1,%xmm4
395	psrld	$20,%xmm1
396	pslld	$12,%xmm4
397	por	%xmm4,%xmm1
398	paddd	%xmm1,%xmm0
399	pxor	%xmm0,%xmm3
400.byte	102,15,56,0,223
401	paddd	%xmm3,%xmm2
402	pxor	%xmm2,%xmm1
403	movdqa	%xmm1,%xmm4
404	psrld	$25,%xmm1
405	pslld	$7,%xmm4
406	por	%xmm4,%xmm1
# Undo the row rotations (immediates for rows 1 and 3 are exchanged).
407	pshufd	$78,%xmm2,%xmm2
408	pshufd	$147,%xmm1,%xmm1
409	pshufd	$57,%xmm3,%xmm3
410	decq	%r8
411	jnz	L$oop_ssse3
# Add the input state: %xmm0..%xmm3 now hold 64 keystream bytes.
412	paddd	0(%rsp),%xmm0
413	paddd	16(%rsp),%xmm1
414	paddd	32(%rsp),%xmm2
415	paddd	48(%rsp),%xmm3
416
417	cmpq	$64,%rdx
418	jb	L$tail_ssse3
419
# Full block: XOR 64 source bytes and store them.
420	movdqu	0(%rsi),%xmm4
421	movdqu	16(%rsi),%xmm5
422	pxor	%xmm4,%xmm0
423	movdqu	32(%rsi),%xmm4
424	pxor	%xmm5,%xmm1
425	movdqu	48(%rsi),%xmm5
426	leaq	64(%rsi),%rsi
427	pxor	%xmm4,%xmm2
428	pxor	%xmm5,%xmm3
429
430	movdqu	%xmm0,0(%rdi)
431	movdqu	%xmm1,16(%rdi)
432	movdqu	%xmm2,32(%rdi)
433	movdqu	%xmm3,48(%rdi)
434	leaq	64(%rdi),%rdi
435
436	subq	$64,%rdx
437	jnz	L$oop_outer_ssse3
438
439	jmp	L$done_ssse3
440
441.p2align	4
# Final partial block: spill the keystream over the saved state and XOR
# byte by byte, with %r8 as the index and %rdx as the remaining count.
442L$tail_ssse3:
443	movdqa	%xmm0,0(%rsp)
444	movdqa	%xmm1,16(%rsp)
445	movdqa	%xmm2,32(%rsp)
446	movdqa	%xmm3,48(%rsp)
447	xorq	%r8,%r8
448
449L$oop_tail_ssse3:
450	movzbl	(%rsi,%r8,1),%eax
451	movzbl	(%rsp,%r8,1),%ecx
452	leaq	1(%r8),%r8
453	xorl	%ecx,%eax
454	movb	%al,-1(%rdi,%r8,1)
455	decq	%rdx
456	jnz	L$oop_tail_ssse3
457
# Release the frame by restoring the entry stack pointer from %r9.
458L$done_ssse3:
459	leaq	(%r9),%rsp
460
461L$ssse3_epilogue:
# The two bytes below encode "rep ret".
462	.byte	0xf3,0xc3
463
464
465
# ChaCha20_4x: ChaCha20 keystream XOR producing four 64-byte blocks per
# outer pass. Every 128-bit register holds the same state word for four
# consecutive blocks, one block per 32-bit lane.
#
# Reached by jump from the SSSE3 entry when the byte count exceeds 128.
# Register inputs: %rdi destination, %rsi source, %rdx byte count, %rcx
# pointer to words 4..11, %r8 pointer to words 12..15, %r10 the 8 capability
# bytes loaded by _ChaCha20_ctr32.
#
# Dispatch performed first:
#   - bit 5 of the upper capability dword set: jump to L$ChaCha20_8x
#   - otherwise, for at most 192 bytes, when of capability bits 22 and 26
#     only bit 22 is set: fall back to the one-block code at
#     L$do_sse3_after_all
# NOTE(review): presumably bit 5 is AVX2 and bits 22/26 are MOVBE/XSAVE,
# the pair upstream uses to recognise Atom-class cores; confirm against the
# capability vector documentation.
#
# Register roles inside the round loop:
#   %xmm8..%xmm11   words 0..3
#   %xmm12..%xmm15  words 4..7
#   %xmm4, %xmm5    two of words 8..11; the other pair sits on the stack
#   %xmm0..%xmm3    words 12..15 (%xmm0 carries the per-lane block counters)
#   %xmm6, %xmm7    alternately shuffle masks and shift temporaries
#   %r10, %r11      addresses of L$rot16 and L$rot24
#   %rcx            %rsp+256, base for the upper part of the frame
#   %r9             stack pointer value on entry, restored at exit
# Frame (0x140+8 bytes):
#   [rsp+0..63]     scratch: parked words 8..11 during rounds, then first
#                   16 bytes of each block, then tail keystream buffer
#   [rsp+64..127]   broadcast words 0..3
#   [rsp+128..255]  broadcast words 4..11 (addressed as N-256(%rcx))
#   [rsp+256..319]  per-lane counters, then broadcast words 13..15
466.p2align	5
467ChaCha20_4x:
468L$ChaCha20_4x:
469
470	movq	%rsp,%r9
471
# %r11 keeps the full capability value; %r10 becomes its upper dword.
472	movq	%r10,%r11
473	shrq	$32,%r10
474	testq	$32,%r10
475	jnz	L$ChaCha20_8x
476	cmpq	$192,%rdx
477	ja	L$proceed4x
478
# 71303168 = (1<<26)|(1<<22) and 4194304 = 1<<22.
479	andq	$71303168,%r11
480	cmpq	$4194304,%r11
481	je	L$do_sse3_after_all
482
483L$proceed4x:
484	subq	$0x140+8,%rsp
485	movdqa	L$sigma(%rip),%xmm11
486	movdqu	(%rcx),%xmm15
487	movdqu	16(%rcx),%xmm7
488	movdqu	(%r8),%xmm3
489	leaq	256(%rsp),%rcx
490	leaq	L$rot16(%rip),%r10
491	leaq	L$rot24(%rip),%r11
492
# Broadcast each dword of words 0..3 into its own register and save it.
493	pshufd	$0x00,%xmm11,%xmm8
494	pshufd	$0x55,%xmm11,%xmm9
495	movdqa	%xmm8,64(%rsp)
496	pshufd	$0xaa,%xmm11,%xmm10
497	movdqa	%xmm9,80(%rsp)
498	pshufd	$0xff,%xmm11,%xmm11
499	movdqa	%xmm10,96(%rsp)
500	movdqa	%xmm11,112(%rsp)
501
# Same for words 4..7.
502	pshufd	$0x00,%xmm15,%xmm12
503	pshufd	$0x55,%xmm15,%xmm13
504	movdqa	%xmm12,128-256(%rcx)
505	pshufd	$0xaa,%xmm15,%xmm14
506	movdqa	%xmm13,144-256(%rcx)
507	pshufd	$0xff,%xmm15,%xmm15
508	movdqa	%xmm14,160-256(%rcx)
509	movdqa	%xmm15,176-256(%rcx)
510
# Same for words 8..11.
511	pshufd	$0x00,%xmm7,%xmm4
512	pshufd	$0x55,%xmm7,%xmm5
513	movdqa	%xmm4,192-256(%rcx)
514	pshufd	$0xaa,%xmm7,%xmm6
515	movdqa	%xmm5,208-256(%rcx)
516	pshufd	$0xff,%xmm7,%xmm7
517	movdqa	%xmm6,224-256(%rcx)
518	movdqa	%xmm7,240-256(%rcx)
519
# Words 12..15: the broadcast counter word gets lane offsets 0,1,2,3; it is
# stored to the frame at L$oop_enter4x.
520	pshufd	$0x00,%xmm3,%xmm0
521	pshufd	$0x55,%xmm3,%xmm1
522	paddd	L$inc(%rip),%xmm0
523	pshufd	$0xaa,%xmm3,%xmm2
524	movdqa	%xmm1,272-256(%rcx)
525	pshufd	$0xff,%xmm3,%xmm3
526	movdqa	%xmm2,288-256(%rcx)
527	movdqa	%xmm3,304-256(%rcx)
528
529	jmp	L$oop_enter4x
530
531.p2align	5
# Subsequent passes: reload the broadcast input state and advance all four
# lane counters by four.
532L$oop_outer4x:
533	movdqa	64(%rsp),%xmm8
534	movdqa	80(%rsp),%xmm9
535	movdqa	96(%rsp),%xmm10
536	movdqa	112(%rsp),%xmm11
537	movdqa	128-256(%rcx),%xmm12
538	movdqa	144-256(%rcx),%xmm13
539	movdqa	160-256(%rcx),%xmm14
540	movdqa	176-256(%rcx),%xmm15
541	movdqa	192-256(%rcx),%xmm4
542	movdqa	208-256(%rcx),%xmm5
543	movdqa	224-256(%rcx),%xmm6
544	movdqa	240-256(%rcx),%xmm7
545	movdqa	256-256(%rcx),%xmm0
546	movdqa	272-256(%rcx),%xmm1
547	movdqa	288-256(%rcx),%xmm2
548	movdqa	304-256(%rcx),%xmm3
549	paddd	L$four(%rip),%xmm0
550
# Park words 10 and 11, preload the rotate-by-16 mask into %xmm7, record
# the counters used by this pass, and set the pass count.
551L$oop_enter4x:
552	movdqa	%xmm6,32(%rsp)
553	movdqa	%xmm7,48(%rsp)
554	movdqa	(%r10),%xmm7
555	movl	$10,%eax
556	movdqa	%xmm0,256-256(%rcx)
557	jmp	L$oop4x
558
559.p2align	5
# Each pass applies four column quarter-rounds then four diagonal ones, two
# quarter-rounds interleaved at a time. The raw sequences
# .byte 102,15,56,0,N encode pshufb: N of 199/207/215/223 shuffles
# %xmm0/%xmm1/%xmm2/%xmm3 by %xmm7, and N of 198/206/214/222 shuffles the
# same registers by %xmm6. The masks are reloaded from (%r10) and (%r11)
# after each use of %xmm6/%xmm7 as shift temporaries.
560L$oop4x:
# Columns (0,4,8,12) and (1,5,9,13).
561	paddd	%xmm12,%xmm8
562	paddd	%xmm13,%xmm9
563	pxor	%xmm8,%xmm0
564	pxor	%xmm9,%xmm1
565.byte	102,15,56,0,199
566.byte	102,15,56,0,207
567	paddd	%xmm0,%xmm4
568	paddd	%xmm1,%xmm5
569	pxor	%xmm4,%xmm12
570	pxor	%xmm5,%xmm13
571	movdqa	%xmm12,%xmm6
572	pslld	$12,%xmm12
573	psrld	$20,%xmm6
574	movdqa	%xmm13,%xmm7
575	pslld	$12,%xmm13
576	por	%xmm6,%xmm12
577	psrld	$20,%xmm7
578	movdqa	(%r11),%xmm6
579	por	%xmm7,%xmm13
580	paddd	%xmm12,%xmm8
581	paddd	%xmm13,%xmm9
582	pxor	%xmm8,%xmm0
583	pxor	%xmm9,%xmm1
584.byte	102,15,56,0,198
585.byte	102,15,56,0,206
586	paddd	%xmm0,%xmm4
587	paddd	%xmm1,%xmm5
588	pxor	%xmm4,%xmm12
589	pxor	%xmm5,%xmm13
590	movdqa	%xmm12,%xmm7
591	pslld	$7,%xmm12
592	psrld	$25,%xmm7
593	movdqa	%xmm13,%xmm6
594	pslld	$7,%xmm13
595	por	%xmm7,%xmm12
596	psrld	$25,%xmm6
597	movdqa	(%r10),%xmm7
598	por	%xmm6,%xmm13
# Park words 8,9 and fetch words 10,11.
599	movdqa	%xmm4,0(%rsp)
600	movdqa	%xmm5,16(%rsp)
601	movdqa	32(%rsp),%xmm4
602	movdqa	48(%rsp),%xmm5
# Columns (2,6,10,14) and (3,7,11,15).
603	paddd	%xmm14,%xmm10
604	paddd	%xmm15,%xmm11
605	pxor	%xmm10,%xmm2
606	pxor	%xmm11,%xmm3
607.byte	102,15,56,0,215
608.byte	102,15,56,0,223
609	paddd	%xmm2,%xmm4
610	paddd	%xmm3,%xmm5
611	pxor	%xmm4,%xmm14
612	pxor	%xmm5,%xmm15
613	movdqa	%xmm14,%xmm6
614	pslld	$12,%xmm14
615	psrld	$20,%xmm6
616	movdqa	%xmm15,%xmm7
617	pslld	$12,%xmm15
618	por	%xmm6,%xmm14
619	psrld	$20,%xmm7
620	movdqa	(%r11),%xmm6
621	por	%xmm7,%xmm15
622	paddd	%xmm14,%xmm10
623	paddd	%xmm15,%xmm11
624	pxor	%xmm10,%xmm2
625	pxor	%xmm11,%xmm3
626.byte	102,15,56,0,214
627.byte	102,15,56,0,222
628	paddd	%xmm2,%xmm4
629	paddd	%xmm3,%xmm5
630	pxor	%xmm4,%xmm14
631	pxor	%xmm5,%xmm15
632	movdqa	%xmm14,%xmm7
633	pslld	$7,%xmm14
634	psrld	$25,%xmm7
635	movdqa	%xmm15,%xmm6
636	pslld	$7,%xmm15
637	por	%xmm7,%xmm14
638	psrld	$25,%xmm6
639	movdqa	(%r10),%xmm7
640	por	%xmm6,%xmm15
# Diagonals (0,5,10,15) and (1,6,11,12).
641	paddd	%xmm13,%xmm8
642	paddd	%xmm14,%xmm9
643	pxor	%xmm8,%xmm3
644	pxor	%xmm9,%xmm0
645.byte	102,15,56,0,223
646.byte	102,15,56,0,199
647	paddd	%xmm3,%xmm4
648	paddd	%xmm0,%xmm5
649	pxor	%xmm4,%xmm13
650	pxor	%xmm5,%xmm14
651	movdqa	%xmm13,%xmm6
652	pslld	$12,%xmm13
653	psrld	$20,%xmm6
654	movdqa	%xmm14,%xmm7
655	pslld	$12,%xmm14
656	por	%xmm6,%xmm13
657	psrld	$20,%xmm7
658	movdqa	(%r11),%xmm6
659	por	%xmm7,%xmm14
660	paddd	%xmm13,%xmm8
661	paddd	%xmm14,%xmm9
662	pxor	%xmm8,%xmm3
663	pxor	%xmm9,%xmm0
664.byte	102,15,56,0,222
665.byte	102,15,56,0,198
666	paddd	%xmm3,%xmm4
667	paddd	%xmm0,%xmm5
668	pxor	%xmm4,%xmm13
669	pxor	%xmm5,%xmm14
670	movdqa	%xmm13,%xmm7
671	pslld	$7,%xmm13
672	psrld	$25,%xmm7
673	movdqa	%xmm14,%xmm6
674	pslld	$7,%xmm14
675	por	%xmm7,%xmm13
676	psrld	$25,%xmm6
677	movdqa	(%r10),%xmm7
678	por	%xmm6,%xmm14
# Park words 10,11 and fetch words 8,9.
679	movdqa	%xmm4,32(%rsp)
680	movdqa	%xmm5,48(%rsp)
681	movdqa	0(%rsp),%xmm4
682	movdqa	16(%rsp),%xmm5
# Diagonals (2,7,8,13) and (3,4,9,14).
683	paddd	%xmm15,%xmm10
684	paddd	%xmm12,%xmm11
685	pxor	%xmm10,%xmm1
686	pxor	%xmm11,%xmm2
687.byte	102,15,56,0,207
688.byte	102,15,56,0,215
689	paddd	%xmm1,%xmm4
690	paddd	%xmm2,%xmm5
691	pxor	%xmm4,%xmm15
692	pxor	%xmm5,%xmm12
693	movdqa	%xmm15,%xmm6
694	pslld	$12,%xmm15
695	psrld	$20,%xmm6
696	movdqa	%xmm12,%xmm7
697	pslld	$12,%xmm12
698	por	%xmm6,%xmm15
699	psrld	$20,%xmm7
700	movdqa	(%r11),%xmm6
701	por	%xmm7,%xmm12
702	paddd	%xmm15,%xmm10
703	paddd	%xmm12,%xmm11
704	pxor	%xmm10,%xmm1
705	pxor	%xmm11,%xmm2
706.byte	102,15,56,0,206
707.byte	102,15,56,0,214
708	paddd	%xmm1,%xmm4
709	paddd	%xmm2,%xmm5
710	pxor	%xmm4,%xmm15
711	pxor	%xmm5,%xmm12
712	movdqa	%xmm15,%xmm7
713	pslld	$7,%xmm15
714	psrld	$25,%xmm7
715	movdqa	%xmm12,%xmm6
716	pslld	$7,%xmm12
717	por	%xmm7,%xmm15
718	psrld	$25,%xmm6
719	movdqa	(%r10),%xmm7
720	por	%xmm6,%xmm12
721	decl	%eax
722	jnz	L$oop4x
723
# Add the input state back, four words at a time, and transpose each group
# of four registers so a register holds 16 consecutive keystream bytes of a
# single block instead of one word of four blocks.
724	paddd	64(%rsp),%xmm8
725	paddd	80(%rsp),%xmm9
726	paddd	96(%rsp),%xmm10
727	paddd	112(%rsp),%xmm11
728
729	movdqa	%xmm8,%xmm6
730	punpckldq	%xmm9,%xmm8
731	movdqa	%xmm10,%xmm7
732	punpckldq	%xmm11,%xmm10
733	punpckhdq	%xmm9,%xmm6
734	punpckhdq	%xmm11,%xmm7
735	movdqa	%xmm8,%xmm9
736	punpcklqdq	%xmm10,%xmm8
737	movdqa	%xmm6,%xmm11
738	punpcklqdq	%xmm7,%xmm6
739	punpckhqdq	%xmm10,%xmm9
740	punpckhqdq	%xmm7,%xmm11
741	paddd	128-256(%rcx),%xmm12
742	paddd	144-256(%rcx),%xmm13
743	paddd	160-256(%rcx),%xmm14
744	paddd	176-256(%rcx),%xmm15
745
# Store bytes 0..15 of blocks 0 and 1, then pick up the parked words 10,11.
746	movdqa	%xmm8,0(%rsp)
747	movdqa	%xmm9,16(%rsp)
748	movdqa	32(%rsp),%xmm8
749	movdqa	48(%rsp),%xmm9
750
751	movdqa	%xmm12,%xmm10
752	punpckldq	%xmm13,%xmm12
753	movdqa	%xmm14,%xmm7
754	punpckldq	%xmm15,%xmm14
755	punpckhdq	%xmm13,%xmm10
756	punpckhdq	%xmm15,%xmm7
757	movdqa	%xmm12,%xmm13
758	punpcklqdq	%xmm14,%xmm12
759	movdqa	%xmm10,%xmm15
760	punpcklqdq	%xmm7,%xmm10
761	punpckhqdq	%xmm14,%xmm13
762	punpckhqdq	%xmm7,%xmm15
763	paddd	192-256(%rcx),%xmm4
764	paddd	208-256(%rcx),%xmm5
765	paddd	224-256(%rcx),%xmm8
766	paddd	240-256(%rcx),%xmm9
767
# Store bytes 0..15 of blocks 2 and 3.
768	movdqa	%xmm6,32(%rsp)
769	movdqa	%xmm11,48(%rsp)
770
771	movdqa	%xmm4,%xmm14
772	punpckldq	%xmm5,%xmm4
773	movdqa	%xmm8,%xmm7
774	punpckldq	%xmm9,%xmm8
775	punpckhdq	%xmm5,%xmm14
776	punpckhdq	%xmm9,%xmm7
777	movdqa	%xmm4,%xmm5
778	punpcklqdq	%xmm8,%xmm4
779	movdqa	%xmm14,%xmm9
780	punpcklqdq	%xmm7,%xmm14
781	punpckhqdq	%xmm8,%xmm5
782	punpckhqdq	%xmm7,%xmm9
783	paddd	256-256(%rcx),%xmm0
784	paddd	272-256(%rcx),%xmm1
785	paddd	288-256(%rcx),%xmm2
786	paddd	304-256(%rcx),%xmm3
787
788	movdqa	%xmm0,%xmm8
789	punpckldq	%xmm1,%xmm0
790	movdqa	%xmm2,%xmm7
791	punpckldq	%xmm3,%xmm2
792	punpckhdq	%xmm1,%xmm8
793	punpckhdq	%xmm3,%xmm7
794	movdqa	%xmm0,%xmm1
795	punpcklqdq	%xmm2,%xmm0
796	movdqa	%xmm8,%xmm3
797	punpcklqdq	%xmm7,%xmm8
798	punpckhqdq	%xmm2,%xmm1
799	punpckhqdq	%xmm7,%xmm3
# Keystream layout at this point, 16 bytes per item:
#   block 0: 0(%rsp),  %xmm12, %xmm4,  %xmm0
#   block 1: 16(%rsp), %xmm13, %xmm5,  %xmm1
#   block 2: 32(%rsp), %xmm10, %xmm14, %xmm8
#   block 3: 48(%rsp), %xmm15, %xmm9,  %xmm3
800	cmpq	$256,%rdx
801	jb	L$tail4x
802
# Full 256 bytes: XOR and store, with loads of the next 64 source bytes
# interleaved between the stores of the current 64.
803	movdqu	0(%rsi),%xmm6
804	movdqu	16(%rsi),%xmm11
805	movdqu	32(%rsi),%xmm2
806	movdqu	48(%rsi),%xmm7
807	pxor	0(%rsp),%xmm6
808	pxor	%xmm12,%xmm11
809	pxor	%xmm4,%xmm2
810	pxor	%xmm0,%xmm7
811
812	movdqu	%xmm6,0(%rdi)
813	movdqu	64(%rsi),%xmm6
814	movdqu	%xmm11,16(%rdi)
815	movdqu	80(%rsi),%xmm11
816	movdqu	%xmm2,32(%rdi)
817	movdqu	96(%rsi),%xmm2
818	movdqu	%xmm7,48(%rdi)
819	movdqu	112(%rsi),%xmm7
820	leaq	128(%rsi),%rsi
821	pxor	16(%rsp),%xmm6
822	pxor	%xmm13,%xmm11
823	pxor	%xmm5,%xmm2
824	pxor	%xmm1,%xmm7
825
826	movdqu	%xmm6,64(%rdi)
827	movdqu	0(%rsi),%xmm6
828	movdqu	%xmm11,80(%rdi)
829	movdqu	16(%rsi),%xmm11
830	movdqu	%xmm2,96(%rdi)
831	movdqu	32(%rsi),%xmm2
832	movdqu	%xmm7,112(%rdi)
833	leaq	128(%rdi),%rdi
834	movdqu	48(%rsi),%xmm7
835	pxor	32(%rsp),%xmm6
836	pxor	%xmm10,%xmm11
837	pxor	%xmm14,%xmm2
838	pxor	%xmm8,%xmm7
839
840	movdqu	%xmm6,0(%rdi)
841	movdqu	64(%rsi),%xmm6
842	movdqu	%xmm11,16(%rdi)
843	movdqu	80(%rsi),%xmm11
844	movdqu	%xmm2,32(%rdi)
845	movdqu	96(%rsi),%xmm2
846	movdqu	%xmm7,48(%rdi)
847	movdqu	112(%rsi),%xmm7
848	leaq	128(%rsi),%rsi
849	pxor	48(%rsp),%xmm6
850	pxor	%xmm15,%xmm11
851	pxor	%xmm9,%xmm2
852	pxor	%xmm3,%xmm7
853	movdqu	%xmm6,64(%rdi)
854	movdqu	%xmm11,80(%rdi)
855	movdqu	%xmm2,96(%rdi)
856	movdqu	%xmm7,112(%rdi)
857	leaq	128(%rdi),%rdi
858
859	subq	$256,%rdx
860	jnz	L$oop_outer4x
861
862	jmp	L$done4x
863
# Fewer than 256 bytes remain. Handle as many whole 64-byte blocks as fit,
# then leave the next block of keystream at 0..63(%rsp) for the byte loop.
# In each N_or_more branch the "je L$done4x" still tests the flags of the
# cmpq that selected the branch (the vector instructions in between do not
# write flags), so it fires when the count is exactly N.
864L$tail4x:
865	cmpq	$192,%rdx
866	jae	L$192_or_more4x
867	cmpq	$128,%rdx
868	jae	L$128_or_more4x
869	cmpq	$64,%rdx
870	jae	L$64_or_more4x
871
872
# Under 64 bytes: complete block 0 on the stack (its first 16 bytes are
# already at 0(%rsp)); %r10 becomes the byte index.
873	xorq	%r10,%r10
874
875	movdqa	%xmm12,16(%rsp)
876	movdqa	%xmm4,32(%rsp)
877	movdqa	%xmm0,48(%rsp)
878	jmp	L$oop_tail4x
879
880.p2align	5
# One whole block, then block 1 becomes the tail keystream.
881L$64_or_more4x:
882	movdqu	0(%rsi),%xmm6
883	movdqu	16(%rsi),%xmm11
884	movdqu	32(%rsi),%xmm2
885	movdqu	48(%rsi),%xmm7
886	pxor	0(%rsp),%xmm6
887	pxor	%xmm12,%xmm11
888	pxor	%xmm4,%xmm2
889	pxor	%xmm0,%xmm7
890	movdqu	%xmm6,0(%rdi)
891	movdqu	%xmm11,16(%rdi)
892	movdqu	%xmm2,32(%rdi)
893	movdqu	%xmm7,48(%rdi)
894	je	L$done4x
895
896	movdqa	16(%rsp),%xmm6
897	leaq	64(%rsi),%rsi
898	xorq	%r10,%r10
899	movdqa	%xmm6,0(%rsp)
900	movdqa	%xmm13,16(%rsp)
901	leaq	64(%rdi),%rdi
902	movdqa	%xmm5,32(%rsp)
903	subq	$64,%rdx
904	movdqa	%xmm1,48(%rsp)
905	jmp	L$oop_tail4x
906
907.p2align	5
# Two whole blocks, then block 2 becomes the tail keystream.
908L$128_or_more4x:
909	movdqu	0(%rsi),%xmm6
910	movdqu	16(%rsi),%xmm11
911	movdqu	32(%rsi),%xmm2
912	movdqu	48(%rsi),%xmm7
913	pxor	0(%rsp),%xmm6
914	pxor	%xmm12,%xmm11
915	pxor	%xmm4,%xmm2
916	pxor	%xmm0,%xmm7
917
918	movdqu	%xmm6,0(%rdi)
919	movdqu	64(%rsi),%xmm6
920	movdqu	%xmm11,16(%rdi)
921	movdqu	80(%rsi),%xmm11
922	movdqu	%xmm2,32(%rdi)
923	movdqu	96(%rsi),%xmm2
924	movdqu	%xmm7,48(%rdi)
925	movdqu	112(%rsi),%xmm7
926	pxor	16(%rsp),%xmm6
927	pxor	%xmm13,%xmm11
928	pxor	%xmm5,%xmm2
929	pxor	%xmm1,%xmm7
930	movdqu	%xmm6,64(%rdi)
931	movdqu	%xmm11,80(%rdi)
932	movdqu	%xmm2,96(%rdi)
933	movdqu	%xmm7,112(%rdi)
934	je	L$done4x
935
936	movdqa	32(%rsp),%xmm6
937	leaq	128(%rsi),%rsi
938	xorq	%r10,%r10
939	movdqa	%xmm6,0(%rsp)
940	movdqa	%xmm10,16(%rsp)
941	leaq	128(%rdi),%rdi
942	movdqa	%xmm14,32(%rsp)
943	subq	$128,%rdx
944	movdqa	%xmm8,48(%rsp)
945	jmp	L$oop_tail4x
946
947.p2align	5
# Three whole blocks, then block 3 becomes the tail keystream. The pointers
# are advanced by 128 inside the block and by a further 64 afterwards.
948L$192_or_more4x:
949	movdqu	0(%rsi),%xmm6
950	movdqu	16(%rsi),%xmm11
951	movdqu	32(%rsi),%xmm2
952	movdqu	48(%rsi),%xmm7
953	pxor	0(%rsp),%xmm6
954	pxor	%xmm12,%xmm11
955	pxor	%xmm4,%xmm2
956	pxor	%xmm0,%xmm7
957
958	movdqu	%xmm6,0(%rdi)
959	movdqu	64(%rsi),%xmm6
960	movdqu	%xmm11,16(%rdi)
961	movdqu	80(%rsi),%xmm11
962	movdqu	%xmm2,32(%rdi)
963	movdqu	96(%rsi),%xmm2
964	movdqu	%xmm7,48(%rdi)
965	movdqu	112(%rsi),%xmm7
966	leaq	128(%rsi),%rsi
967	pxor	16(%rsp),%xmm6
968	pxor	%xmm13,%xmm11
969	pxor	%xmm5,%xmm2
970	pxor	%xmm1,%xmm7
971
972	movdqu	%xmm6,64(%rdi)
973	movdqu	0(%rsi),%xmm6
974	movdqu	%xmm11,80(%rdi)
975	movdqu	16(%rsi),%xmm11
976	movdqu	%xmm2,96(%rdi)
977	movdqu	32(%rsi),%xmm2
978	movdqu	%xmm7,112(%rdi)
979	leaq	128(%rdi),%rdi
980	movdqu	48(%rsi),%xmm7
981	pxor	32(%rsp),%xmm6
982	pxor	%xmm10,%xmm11
983	pxor	%xmm14,%xmm2
984	pxor	%xmm8,%xmm7
985	movdqu	%xmm6,0(%rdi)
986	movdqu	%xmm11,16(%rdi)
987	movdqu	%xmm2,32(%rdi)
988	movdqu	%xmm7,48(%rdi)
989	je	L$done4x
990
991	movdqa	48(%rsp),%xmm6
992	leaq	64(%rsi),%rsi
993	xorq	%r10,%r10
994	movdqa	%xmm6,0(%rsp)
995	movdqa	%xmm15,16(%rsp)
996	leaq	64(%rdi),%rdi
997	movdqa	%xmm9,32(%rsp)
998	subq	$192,%rdx
999	movdqa	%xmm3,48(%rsp)
1000
# XOR the remaining bytes one at a time against the keystream at (%rsp);
# %r10 is the index and %rdx the remaining count.
1001L$oop_tail4x:
1002	movzbl	(%rsi,%r10,1),%eax
1003	movzbl	(%rsp,%r10,1),%ecx
1004	leaq	1(%r10),%r10
1005	xorl	%ecx,%eax
1006	movb	%al,-1(%rdi,%r10,1)
1007	decq	%rdx
1008	jnz	L$oop_tail4x
1009
# Release the frame by restoring the entry stack pointer from %r9.
1010L$done4x:
1011	leaq	(%r9),%rsp
1012
1013L$4x_epilogue:
# The two bytes below encode "rep ret".
1014	.byte	0xf3,0xc3
1015
1016
1017
1018.p2align	5
1019ChaCha20_8x:
1020L$ChaCha20_8x:
1021
1022	movq	%rsp,%r9
1023
1024	subq	$0x280+8,%rsp
1025	andq	$-32,%rsp
1026	vzeroupper
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037	vbroadcasti128	L$sigma(%rip),%ymm11
1038	vbroadcasti128	(%rcx),%ymm3
1039	vbroadcasti128	16(%rcx),%ymm15
1040	vbroadcasti128	(%r8),%ymm7
1041	leaq	256(%rsp),%rcx
1042	leaq	512(%rsp),%rax
1043	leaq	L$rot16(%rip),%r10
1044	leaq	L$rot24(%rip),%r11
1045
1046	vpshufd	$0x00,%ymm11,%ymm8
1047	vpshufd	$0x55,%ymm11,%ymm9
1048	vmovdqa	%ymm8,128-256(%rcx)
1049	vpshufd	$0xaa,%ymm11,%ymm10
1050	vmovdqa	%ymm9,160-256(%rcx)
1051	vpshufd	$0xff,%ymm11,%ymm11
1052	vmovdqa	%ymm10,192-256(%rcx)
1053	vmovdqa	%ymm11,224-256(%rcx)
1054
1055	vpshufd	$0x00,%ymm3,%ymm0
1056	vpshufd	$0x55,%ymm3,%ymm1
1057	vmovdqa	%ymm0,256-256(%rcx)
1058	vpshufd	$0xaa,%ymm3,%ymm2
1059	vmovdqa	%ymm1,288-256(%rcx)
1060	vpshufd	$0xff,%ymm3,%ymm3
1061	vmovdqa	%ymm2,320-256(%rcx)
1062	vmovdqa	%ymm3,352-256(%rcx)
1063
1064	vpshufd	$0x00,%ymm15,%ymm12
1065	vpshufd	$0x55,%ymm15,%ymm13
1066	vmovdqa	%ymm12,384-512(%rax)
1067	vpshufd	$0xaa,%ymm15,%ymm14
1068	vmovdqa	%ymm13,416-512(%rax)
1069	vpshufd	$0xff,%ymm15,%ymm15
1070	vmovdqa	%ymm14,448-512(%rax)
1071	vmovdqa	%ymm15,480-512(%rax)
1072
1073	vpshufd	$0x00,%ymm7,%ymm4
1074	vpshufd	$0x55,%ymm7,%ymm5
1075	vpaddd	L$incy(%rip),%ymm4,%ymm4
1076	vpshufd	$0xaa,%ymm7,%ymm6
1077	vmovdqa	%ymm5,544-512(%rax)
1078	vpshufd	$0xff,%ymm7,%ymm7
1079	vmovdqa	%ymm6,576-512(%rax)
1080	vmovdqa	%ymm7,608-512(%rax)
1081
1082	jmp	L$oop_enter8x
1083
1084.p2align	5
1085L$oop_outer8x:
1086	vmovdqa	128-256(%rcx),%ymm8
1087	vmovdqa	160-256(%rcx),%ymm9
1088	vmovdqa	192-256(%rcx),%ymm10
1089	vmovdqa	224-256(%rcx),%ymm11
1090	vmovdqa	256-256(%rcx),%ymm0
1091	vmovdqa	288-256(%rcx),%ymm1
1092	vmovdqa	320-256(%rcx),%ymm2
1093	vmovdqa	352-256(%rcx),%ymm3
1094	vmovdqa	384-512(%rax),%ymm12
1095	vmovdqa	416-512(%rax),%ymm13
1096	vmovdqa	448-512(%rax),%ymm14
1097	vmovdqa	480-512(%rax),%ymm15
1098	vmovdqa	512-512(%rax),%ymm4
1099	vmovdqa	544-512(%rax),%ymm5
1100	vmovdqa	576-512(%rax),%ymm6
1101	vmovdqa	608-512(%rax),%ymm7
1102	vpaddd	L$eight(%rip),%ymm4,%ymm4
1103
1104L$oop_enter8x:
1105	vmovdqa	%ymm14,64(%rsp)
1106	vmovdqa	%ymm15,96(%rsp)
1107	vbroadcasti128	(%r10),%ymm15
1108	vmovdqa	%ymm4,512-512(%rax)
1109	movl	$10,%eax
1110	jmp	L$oop8x
1111
1112.p2align	5
1113L$oop8x:
1114	vpaddd	%ymm0,%ymm8,%ymm8
1115	vpxor	%ymm4,%ymm8,%ymm4
1116	vpshufb	%ymm15,%ymm4,%ymm4
1117	vpaddd	%ymm1,%ymm9,%ymm9
1118	vpxor	%ymm5,%ymm9,%ymm5
1119	vpshufb	%ymm15,%ymm5,%ymm5
1120	vpaddd	%ymm4,%ymm12,%ymm12
1121	vpxor	%ymm0,%ymm12,%ymm0
1122	vpslld	$12,%ymm0,%ymm14
1123	vpsrld	$20,%ymm0,%ymm0
1124	vpor	%ymm0,%ymm14,%ymm0
1125	vbroadcasti128	(%r11),%ymm14
1126	vpaddd	%ymm5,%ymm13,%ymm13
1127	vpxor	%ymm1,%ymm13,%ymm1
1128	vpslld	$12,%ymm1,%ymm15
1129	vpsrld	$20,%ymm1,%ymm1
1130	vpor	%ymm1,%ymm15,%ymm1
1131	vpaddd	%ymm0,%ymm8,%ymm8
1132	vpxor	%ymm4,%ymm8,%ymm4
1133	vpshufb	%ymm14,%ymm4,%ymm4
1134	vpaddd	%ymm1,%ymm9,%ymm9
1135	vpxor	%ymm5,%ymm9,%ymm5
1136	vpshufb	%ymm14,%ymm5,%ymm5
1137	vpaddd	%ymm4,%ymm12,%ymm12
1138	vpxor	%ymm0,%ymm12,%ymm0
1139	vpslld	$7,%ymm0,%ymm15
1140	vpsrld	$25,%ymm0,%ymm0
1141	vpor	%ymm0,%ymm15,%ymm0
1142	vbroadcasti128	(%r10),%ymm15
1143	vpaddd	%ymm5,%ymm13,%ymm13
1144	vpxor	%ymm1,%ymm13,%ymm1
1145	vpslld	$7,%ymm1,%ymm14
1146	vpsrld	$25,%ymm1,%ymm1
1147	vpor	%ymm1,%ymm14,%ymm1
1148	vmovdqa	%ymm12,0(%rsp)
1149	vmovdqa	%ymm13,32(%rsp)
1150	vmovdqa	64(%rsp),%ymm12
1151	vmovdqa	96(%rsp),%ymm13
1152	vpaddd	%ymm2,%ymm10,%ymm10
1153	vpxor	%ymm6,%ymm10,%ymm6
1154	vpshufb	%ymm15,%ymm6,%ymm6
1155	vpaddd	%ymm3,%ymm11,%ymm11
1156	vpxor	%ymm7,%ymm11,%ymm7
1157	vpshufb	%ymm15,%ymm7,%ymm7
1158	vpaddd	%ymm6,%ymm12,%ymm12
1159	vpxor	%ymm2,%ymm12,%ymm2
1160	vpslld	$12,%ymm2,%ymm14
1161	vpsrld	$20,%ymm2,%ymm2
1162	vpor	%ymm2,%ymm14,%ymm2
1163	vbroadcasti128	(%r11),%ymm14
1164	vpaddd	%ymm7,%ymm13,%ymm13
1165	vpxor	%ymm3,%ymm13,%ymm3
1166	vpslld	$12,%ymm3,%ymm15
1167	vpsrld	$20,%ymm3,%ymm3
1168	vpor	%ymm3,%ymm15,%ymm3
1169	vpaddd	%ymm2,%ymm10,%ymm10
1170	vpxor	%ymm6,%ymm10,%ymm6
1171	vpshufb	%ymm14,%ymm6,%ymm6
1172	vpaddd	%ymm3,%ymm11,%ymm11
1173	vpxor	%ymm7,%ymm11,%ymm7
1174	vpshufb	%ymm14,%ymm7,%ymm7
1175	vpaddd	%ymm6,%ymm12,%ymm12
1176	vpxor	%ymm2,%ymm12,%ymm2
1177	vpslld	$7,%ymm2,%ymm15
1178	vpsrld	$25,%ymm2,%ymm2
1179	vpor	%ymm2,%ymm15,%ymm2
1180	vbroadcasti128	(%r10),%ymm15
1181	vpaddd	%ymm7,%ymm13,%ymm13
1182	vpxor	%ymm3,%ymm13,%ymm3
1183	vpslld	$7,%ymm3,%ymm14
1184	vpsrld	$25,%ymm3,%ymm3
1185	vpor	%ymm3,%ymm14,%ymm3
1186	vpaddd	%ymm1,%ymm8,%ymm8
1187	vpxor	%ymm7,%ymm8,%ymm7
1188	vpshufb	%ymm15,%ymm7,%ymm7
1189	vpaddd	%ymm2,%ymm9,%ymm9
1190	vpxor	%ymm4,%ymm9,%ymm4
1191	vpshufb	%ymm15,%ymm4,%ymm4
1192	vpaddd	%ymm7,%ymm12,%ymm12
1193	vpxor	%ymm1,%ymm12,%ymm1
1194	vpslld	$12,%ymm1,%ymm14
1195	vpsrld	$20,%ymm1,%ymm1
1196	vpor	%ymm1,%ymm14,%ymm1
1197	vbroadcasti128	(%r11),%ymm14
1198	vpaddd	%ymm4,%ymm13,%ymm13
1199	vpxor	%ymm2,%ymm13,%ymm2
1200	vpslld	$12,%ymm2,%ymm15
1201	vpsrld	$20,%ymm2,%ymm2
1202	vpor	%ymm2,%ymm15,%ymm2
1203	vpaddd	%ymm1,%ymm8,%ymm8
1204	vpxor	%ymm7,%ymm8,%ymm7
1205	vpshufb	%ymm14,%ymm7,%ymm7
1206	vpaddd	%ymm2,%ymm9,%ymm9
1207	vpxor	%ymm4,%ymm9,%ymm4
1208	vpshufb	%ymm14,%ymm4,%ymm4
1209	vpaddd	%ymm7,%ymm12,%ymm12
1210	vpxor	%ymm1,%ymm12,%ymm1
1211	vpslld	$7,%ymm1,%ymm15
1212	vpsrld	$25,%ymm1,%ymm1
1213	vpor	%ymm1,%ymm15,%ymm1
1214	vbroadcasti128	(%r10),%ymm15
1215	vpaddd	%ymm4,%ymm13,%ymm13
1216	vpxor	%ymm2,%ymm13,%ymm2
1217	vpslld	$7,%ymm2,%ymm14
1218	vpsrld	$25,%ymm2,%ymm2
1219	vpor	%ymm2,%ymm14,%ymm2
1220	vmovdqa	%ymm12,64(%rsp)
1221	vmovdqa	%ymm13,96(%rsp)
1222	vmovdqa	0(%rsp),%ymm12
1223	vmovdqa	32(%rsp),%ymm13
1224	vpaddd	%ymm3,%ymm10,%ymm10
1225	vpxor	%ymm5,%ymm10,%ymm5
1226	vpshufb	%ymm15,%ymm5,%ymm5
1227	vpaddd	%ymm0,%ymm11,%ymm11
1228	vpxor	%ymm6,%ymm11,%ymm6
1229	vpshufb	%ymm15,%ymm6,%ymm6
1230	vpaddd	%ymm5,%ymm12,%ymm12
1231	vpxor	%ymm3,%ymm12,%ymm3
1232	vpslld	$12,%ymm3,%ymm14
1233	vpsrld	$20,%ymm3,%ymm3
1234	vpor	%ymm3,%ymm14,%ymm3
1235	vbroadcasti128	(%r11),%ymm14
1236	vpaddd	%ymm6,%ymm13,%ymm13
1237	vpxor	%ymm0,%ymm13,%ymm0
1238	vpslld	$12,%ymm0,%ymm15
1239	vpsrld	$20,%ymm0,%ymm0
1240	vpor	%ymm0,%ymm15,%ymm0
1241	vpaddd	%ymm3,%ymm10,%ymm10
1242	vpxor	%ymm5,%ymm10,%ymm5
1243	vpshufb	%ymm14,%ymm5,%ymm5
1244	vpaddd	%ymm0,%ymm11,%ymm11
1245	vpxor	%ymm6,%ymm11,%ymm6
1246	vpshufb	%ymm14,%ymm6,%ymm6
1247	vpaddd	%ymm5,%ymm12,%ymm12
1248	vpxor	%ymm3,%ymm12,%ymm3
1249	vpslld	$7,%ymm3,%ymm15
1250	vpsrld	$25,%ymm3,%ymm3
1251	vpor	%ymm3,%ymm15,%ymm3
1252	vbroadcasti128	(%r10),%ymm15
1253	vpaddd	%ymm6,%ymm13,%ymm13
1254	vpxor	%ymm0,%ymm13,%ymm0
1255	vpslld	$7,%ymm0,%ymm14
1256	vpsrld	$25,%ymm0,%ymm0
1257	vpor	%ymm0,%ymm14,%ymm0
1258	decl	%eax
1259	jnz	L$oop8x
1260
1261	leaq	512(%rsp),%rax
1262	vpaddd	128-256(%rcx),%ymm8,%ymm8
1263	vpaddd	160-256(%rcx),%ymm9,%ymm9
1264	vpaddd	192-256(%rcx),%ymm10,%ymm10
1265	vpaddd	224-256(%rcx),%ymm11,%ymm11
1266
1267	vpunpckldq	%ymm9,%ymm8,%ymm14
1268	vpunpckldq	%ymm11,%ymm10,%ymm15
1269	vpunpckhdq	%ymm9,%ymm8,%ymm8
1270	vpunpckhdq	%ymm11,%ymm10,%ymm10
1271	vpunpcklqdq	%ymm15,%ymm14,%ymm9
1272	vpunpckhqdq	%ymm15,%ymm14,%ymm14
1273	vpunpcklqdq	%ymm10,%ymm8,%ymm11
1274	vpunpckhqdq	%ymm10,%ymm8,%ymm8
1275	vpaddd	256-256(%rcx),%ymm0,%ymm0
1276	vpaddd	288-256(%rcx),%ymm1,%ymm1
1277	vpaddd	320-256(%rcx),%ymm2,%ymm2
1278	vpaddd	352-256(%rcx),%ymm3,%ymm3
1279
1280	vpunpckldq	%ymm1,%ymm0,%ymm10
1281	vpunpckldq	%ymm3,%ymm2,%ymm15
1282	vpunpckhdq	%ymm1,%ymm0,%ymm0
1283	vpunpckhdq	%ymm3,%ymm2,%ymm2
1284	vpunpcklqdq	%ymm15,%ymm10,%ymm1
1285	vpunpckhqdq	%ymm15,%ymm10,%ymm10
1286	vpunpcklqdq	%ymm2,%ymm0,%ymm3
1287	vpunpckhqdq	%ymm2,%ymm0,%ymm0
1288	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
1289	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
1290	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
1291	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
1292	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
1293	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
1294	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
1295	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
1296	vmovdqa	%ymm15,0(%rsp)
1297	vmovdqa	%ymm9,32(%rsp)
1298	vmovdqa	64(%rsp),%ymm15
1299	vmovdqa	96(%rsp),%ymm9
1300
1301	vpaddd	384-512(%rax),%ymm12,%ymm12
1302	vpaddd	416-512(%rax),%ymm13,%ymm13
1303	vpaddd	448-512(%rax),%ymm15,%ymm15
1304	vpaddd	480-512(%rax),%ymm9,%ymm9
1305
1306	vpunpckldq	%ymm13,%ymm12,%ymm2
1307	vpunpckldq	%ymm9,%ymm15,%ymm8
1308	vpunpckhdq	%ymm13,%ymm12,%ymm12
1309	vpunpckhdq	%ymm9,%ymm15,%ymm15
1310	vpunpcklqdq	%ymm8,%ymm2,%ymm13
1311	vpunpckhqdq	%ymm8,%ymm2,%ymm2
1312	vpunpcklqdq	%ymm15,%ymm12,%ymm9
1313	vpunpckhqdq	%ymm15,%ymm12,%ymm12
1314	vpaddd	512-512(%rax),%ymm4,%ymm4
1315	vpaddd	544-512(%rax),%ymm5,%ymm5
1316	vpaddd	576-512(%rax),%ymm6,%ymm6
1317	vpaddd	608-512(%rax),%ymm7,%ymm7
1318
1319	vpunpckldq	%ymm5,%ymm4,%ymm15
1320	vpunpckldq	%ymm7,%ymm6,%ymm8
1321	vpunpckhdq	%ymm5,%ymm4,%ymm4
1322	vpunpckhdq	%ymm7,%ymm6,%ymm6
1323	vpunpcklqdq	%ymm8,%ymm15,%ymm5
1324	vpunpckhqdq	%ymm8,%ymm15,%ymm15
1325	vpunpcklqdq	%ymm6,%ymm4,%ymm7
1326	vpunpckhqdq	%ymm6,%ymm4,%ymm4
1327	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
1328	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
1329	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
1330	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
1331	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
1332	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
1333	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
1334	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
1335	vmovdqa	0(%rsp),%ymm6
1336	vmovdqa	32(%rsp),%ymm12
1337
1338	cmpq	$512,%rdx
1339	jb	L$tail8x
1340
1341	vpxor	0(%rsi),%ymm6,%ymm6
1342	vpxor	32(%rsi),%ymm8,%ymm8
1343	vpxor	64(%rsi),%ymm1,%ymm1
1344	vpxor	96(%rsi),%ymm5,%ymm5
1345	leaq	128(%rsi),%rsi
1346	vmovdqu	%ymm6,0(%rdi)
1347	vmovdqu	%ymm8,32(%rdi)
1348	vmovdqu	%ymm1,64(%rdi)
1349	vmovdqu	%ymm5,96(%rdi)
1350	leaq	128(%rdi),%rdi
1351
1352	vpxor	0(%rsi),%ymm12,%ymm12
1353	vpxor	32(%rsi),%ymm13,%ymm13
1354	vpxor	64(%rsi),%ymm10,%ymm10
1355	vpxor	96(%rsi),%ymm15,%ymm15
1356	leaq	128(%rsi),%rsi
1357	vmovdqu	%ymm12,0(%rdi)
1358	vmovdqu	%ymm13,32(%rdi)
1359	vmovdqu	%ymm10,64(%rdi)
1360	vmovdqu	%ymm15,96(%rdi)
1361	leaq	128(%rdi),%rdi
1362
1363	vpxor	0(%rsi),%ymm14,%ymm14
1364	vpxor	32(%rsi),%ymm2,%ymm2
1365	vpxor	64(%rsi),%ymm3,%ymm3
1366	vpxor	96(%rsi),%ymm7,%ymm7
1367	leaq	128(%rsi),%rsi
1368	vmovdqu	%ymm14,0(%rdi)
1369	vmovdqu	%ymm2,32(%rdi)
1370	vmovdqu	%ymm3,64(%rdi)
1371	vmovdqu	%ymm7,96(%rdi)
1372	leaq	128(%rdi),%rdi
1373
1374	vpxor	0(%rsi),%ymm11,%ymm11
1375	vpxor	32(%rsi),%ymm9,%ymm9
1376	vpxor	64(%rsi),%ymm0,%ymm0
1377	vpxor	96(%rsi),%ymm4,%ymm4
1378	leaq	128(%rsi),%rsi
1379	vmovdqu	%ymm11,0(%rdi)
1380	vmovdqu	%ymm9,32(%rdi)
1381	vmovdqu	%ymm0,64(%rdi)
1382	vmovdqu	%ymm4,96(%rdi)
1383	leaq	128(%rdi),%rdi
1384
1385	subq	$512,%rdx
1386	jnz	L$oop_outer8x
1387
1388	jmp	L$done8x
1389
# Tail dispatch for a final partial chunk. On the path visible in this part
# of the file it is reached through the jb that follows cmpq $512,%rdx, so
# the byte count in %rdx is below 512 here.
# Thresholds are tested from largest to smallest with unsigned jae. Each
# handler is therefore entered with the flags of the cmpq against its own
# threshold still set, and the handlers rely on that for their je test.
1390L$tail8x:
1391	cmpq	$448,%rdx
1392	jae	L$448_or_more8x
1393	cmpq	$384,%rdx
1394	jae	L$384_or_more8x
1395	cmpq	$320,%rdx
1396	jae	L$320_or_more8x
1397	cmpq	$256,%rdx
1398	jae	L$256_or_more8x
1399	cmpq	$192,%rdx
1400	jae	L$192_or_more8x
1401	cmpq	$128,%rdx
1402	jae	L$128_or_more8x
1403	cmpq	$64,%rdx
1404	jae	L$64_or_more8x
1405
# Fewer than 64 bytes remain, so no whole 64-byte piece can be handled.
# Spill %ymm6 and %ymm8 (the values every handler XORs with input bytes
# zero through 63) to the first 64 bytes at (%rsp), clear the byte index
# in %r10, and let L$oop_tail8x finish one byte at a time.
# NOTE(review): vmovdqa needs (%rsp) to be 32-byte aligned; that alignment
# is set up outside this chunk - confirm against the prologue.
1406	xorq	%r10,%r10
1407	vmovdqa	%ymm6,0(%rsp)
1408	vmovdqa	%ymm8,32(%rsp)
1409	jmp	L$oop_tail8x
1410
1411.p2align	5
# Tail handler for at least 64 bytes. When dispatched from L$tail8x the
# count in %rdx lies in the range 64-127.
# XOR the first 64 input bytes at (%rsi) with %ymm6/%ymm8 and store the
# result at (%rdi).
1412L$64_or_more8x:
1413	vpxor	0(%rsi),%ymm6,%ymm6
1414	vpxor	32(%rsi),%ymm8,%ymm8
1415	vmovdqu	%ymm6,0(%rdi)
1416	vmovdqu	%ymm8,32(%rdi)
# ZF is still the result of the cmpq $64,%rdx in L$tail8x that selected this
# handler, because vpxor and vmovdqu do not modify the flags. Equal means
# exactly 64 bytes were left and all of them have been written.
1417	je	L$done8x
1418
# Otherwise (1-63 bytes left on the L$tail8x path): step both pointers past
# the 64 bytes just handled, reduce the count, zero the byte index %r10 and
# stage the next 64 bytes of XOR data (%ymm1, %ymm5) at (%rsp) for the
# byte loop.
1419	leaq	64(%rsi),%rsi
1420	xorq	%r10,%r10
1421	vmovdqa	%ymm1,0(%rsp)
1422	leaq	64(%rdi),%rdi
1423	subq	$64,%rdx
1424	vmovdqa	%ymm5,32(%rsp)
1425	jmp	L$oop_tail8x
1426
1427.p2align	5
# Tail handler for at least 128 bytes. When dispatched from L$tail8x the
# count in %rdx lies in the range 128-191.
# XOR the first 128 input bytes at (%rsi) with %ymm6, %ymm8, %ymm1, %ymm5
# (in that order) and store the result at (%rdi).
1428L$128_or_more8x:
1429	vpxor	0(%rsi),%ymm6,%ymm6
1430	vpxor	32(%rsi),%ymm8,%ymm8
1431	vpxor	64(%rsi),%ymm1,%ymm1
1432	vpxor	96(%rsi),%ymm5,%ymm5
1433	vmovdqu	%ymm6,0(%rdi)
1434	vmovdqu	%ymm8,32(%rdi)
1435	vmovdqu	%ymm1,64(%rdi)
1436	vmovdqu	%ymm5,96(%rdi)
# ZF is still the result of the cmpq $128,%rdx in L$tail8x that selected
# this handler, because vpxor and vmovdqu do not modify the flags. Equal
# means exactly 128 bytes were left and all of them have been written.
1437	je	L$done8x
1438
# Otherwise (1-63 bytes left on the L$tail8x path): step both pointers past
# the 128 bytes just handled, reduce the count, zero the byte index %r10 and
# stage the next 64 bytes of XOR data (%ymm12, %ymm13) at (%rsp) for the
# byte loop.
1439	leaq	128(%rsi),%rsi
1440	xorq	%r10,%r10
1441	vmovdqa	%ymm12,0(%rsp)
1442	leaq	128(%rdi),%rdi
1443	subq	$128,%rdx
1444	vmovdqa	%ymm13,32(%rsp)
1445	jmp	L$oop_tail8x
1446
1447.p2align	5
# Tail handler for at least 192 bytes. When dispatched from L$tail8x the
# count in %rdx lies in the range 192-255.
# XOR the first 192 input bytes at (%rsi) with %ymm6, %ymm8, %ymm1, %ymm5,
# %ymm12, %ymm13 (in that order) and store the result at (%rdi).
1448L$192_or_more8x:
1449	vpxor	0(%rsi),%ymm6,%ymm6
1450	vpxor	32(%rsi),%ymm8,%ymm8
1451	vpxor	64(%rsi),%ymm1,%ymm1
1452	vpxor	96(%rsi),%ymm5,%ymm5
1453	vpxor	128(%rsi),%ymm12,%ymm12
1454	vpxor	160(%rsi),%ymm13,%ymm13
1455	vmovdqu	%ymm6,0(%rdi)
1456	vmovdqu	%ymm8,32(%rdi)
1457	vmovdqu	%ymm1,64(%rdi)
1458	vmovdqu	%ymm5,96(%rdi)
1459	vmovdqu	%ymm12,128(%rdi)
1460	vmovdqu	%ymm13,160(%rdi)
# ZF is still the result of the cmpq $192,%rdx in L$tail8x that selected
# this handler, because vpxor and vmovdqu do not modify the flags. Equal
# means exactly 192 bytes were left and all of them have been written.
1461	je	L$done8x
1462
# Otherwise (1-63 bytes left on the L$tail8x path): step both pointers past
# the 192 bytes just handled, reduce the count, zero the byte index %r10 and
# stage the next 64 bytes of XOR data (%ymm10, %ymm15) at (%rsp) for the
# byte loop.
1463	leaq	192(%rsi),%rsi
1464	xorq	%r10,%r10
1465	vmovdqa	%ymm10,0(%rsp)
1466	leaq	192(%rdi),%rdi
1467	subq	$192,%rdx
1468	vmovdqa	%ymm15,32(%rsp)
1469	jmp	L$oop_tail8x
1470
1471.p2align	5
# Tail handler for at least 256 bytes. When dispatched from L$tail8x the
# count in %rdx lies in the range 256-319.
# XOR the first 256 input bytes at (%rsi) with %ymm6, %ymm8, %ymm1, %ymm5,
# %ymm12, %ymm13, %ymm10, %ymm15 (in that order) and store the result at
# (%rdi).
1472L$256_or_more8x:
1473	vpxor	0(%rsi),%ymm6,%ymm6
1474	vpxor	32(%rsi),%ymm8,%ymm8
1475	vpxor	64(%rsi),%ymm1,%ymm1
1476	vpxor	96(%rsi),%ymm5,%ymm5
1477	vpxor	128(%rsi),%ymm12,%ymm12
1478	vpxor	160(%rsi),%ymm13,%ymm13
1479	vpxor	192(%rsi),%ymm10,%ymm10
1480	vpxor	224(%rsi),%ymm15,%ymm15
1481	vmovdqu	%ymm6,0(%rdi)
1482	vmovdqu	%ymm8,32(%rdi)
1483	vmovdqu	%ymm1,64(%rdi)
1484	vmovdqu	%ymm5,96(%rdi)
1485	vmovdqu	%ymm12,128(%rdi)
1486	vmovdqu	%ymm13,160(%rdi)
1487	vmovdqu	%ymm10,192(%rdi)
1488	vmovdqu	%ymm15,224(%rdi)
# ZF is still the result of the cmpq $256,%rdx in L$tail8x that selected
# this handler, because vpxor and vmovdqu do not modify the flags. Equal
# means exactly 256 bytes were left and all of them have been written.
1489	je	L$done8x
1490
# Otherwise (1-63 bytes left on the L$tail8x path): step both pointers past
# the 256 bytes just handled, reduce the count, zero the byte index %r10 and
# stage the next 64 bytes of XOR data (%ymm14, %ymm2) at (%rsp) for the
# byte loop.
1491	leaq	256(%rsi),%rsi
1492	xorq	%r10,%r10
1493	vmovdqa	%ymm14,0(%rsp)
1494	leaq	256(%rdi),%rdi
1495	subq	$256,%rdx
1496	vmovdqa	%ymm2,32(%rsp)
1497	jmp	L$oop_tail8x
1498
1499.p2align	5
# Tail handler for at least 320 bytes. When dispatched from L$tail8x the
# count in %rdx lies in the range 320-383.
# XOR the first 320 input bytes at (%rsi) with %ymm6, %ymm8, %ymm1, %ymm5,
# %ymm12, %ymm13, %ymm10, %ymm15, %ymm14, %ymm2 (in that order) and store
# the result at (%rdi).
1500L$320_or_more8x:
1501	vpxor	0(%rsi),%ymm6,%ymm6
1502	vpxor	32(%rsi),%ymm8,%ymm8
1503	vpxor	64(%rsi),%ymm1,%ymm1
1504	vpxor	96(%rsi),%ymm5,%ymm5
1505	vpxor	128(%rsi),%ymm12,%ymm12
1506	vpxor	160(%rsi),%ymm13,%ymm13
1507	vpxor	192(%rsi),%ymm10,%ymm10
1508	vpxor	224(%rsi),%ymm15,%ymm15
1509	vpxor	256(%rsi),%ymm14,%ymm14
1510	vpxor	288(%rsi),%ymm2,%ymm2
1511	vmovdqu	%ymm6,0(%rdi)
1512	vmovdqu	%ymm8,32(%rdi)
1513	vmovdqu	%ymm1,64(%rdi)
1514	vmovdqu	%ymm5,96(%rdi)
1515	vmovdqu	%ymm12,128(%rdi)
1516	vmovdqu	%ymm13,160(%rdi)
1517	vmovdqu	%ymm10,192(%rdi)
1518	vmovdqu	%ymm15,224(%rdi)
1519	vmovdqu	%ymm14,256(%rdi)
1520	vmovdqu	%ymm2,288(%rdi)
# ZF is still the result of the cmpq $320,%rdx in L$tail8x that selected
# this handler, because vpxor and vmovdqu do not modify the flags. Equal
# means exactly 320 bytes were left and all of them have been written.
1521	je	L$done8x
1522
# Otherwise (1-63 bytes left on the L$tail8x path): step both pointers past
# the 320 bytes just handled, reduce the count, zero the byte index %r10 and
# stage the next 64 bytes of XOR data (%ymm3, %ymm7) at (%rsp) for the
# byte loop.
1523	leaq	320(%rsi),%rsi
1524	xorq	%r10,%r10
1525	vmovdqa	%ymm3,0(%rsp)
1526	leaq	320(%rdi),%rdi
1527	subq	$320,%rdx
1528	vmovdqa	%ymm7,32(%rsp)
1529	jmp	L$oop_tail8x
1530
1531.p2align	5
# Tail handler for at least 384 bytes. When dispatched from L$tail8x the
# count in %rdx lies in the range 384-447.
# XOR the first 384 input bytes at (%rsi) with %ymm6, %ymm8, %ymm1, %ymm5,
# %ymm12, %ymm13, %ymm10, %ymm15, %ymm14, %ymm2, %ymm3, %ymm7 (in that
# order) and store the result at (%rdi).
1532L$384_or_more8x:
1533	vpxor	0(%rsi),%ymm6,%ymm6
1534	vpxor	32(%rsi),%ymm8,%ymm8
1535	vpxor	64(%rsi),%ymm1,%ymm1
1536	vpxor	96(%rsi),%ymm5,%ymm5
1537	vpxor	128(%rsi),%ymm12,%ymm12
1538	vpxor	160(%rsi),%ymm13,%ymm13
1539	vpxor	192(%rsi),%ymm10,%ymm10
1540	vpxor	224(%rsi),%ymm15,%ymm15
1541	vpxor	256(%rsi),%ymm14,%ymm14
1542	vpxor	288(%rsi),%ymm2,%ymm2
1543	vpxor	320(%rsi),%ymm3,%ymm3
1544	vpxor	352(%rsi),%ymm7,%ymm7
1545	vmovdqu	%ymm6,0(%rdi)
1546	vmovdqu	%ymm8,32(%rdi)
1547	vmovdqu	%ymm1,64(%rdi)
1548	vmovdqu	%ymm5,96(%rdi)
1549	vmovdqu	%ymm12,128(%rdi)
1550	vmovdqu	%ymm13,160(%rdi)
1551	vmovdqu	%ymm10,192(%rdi)
1552	vmovdqu	%ymm15,224(%rdi)
1553	vmovdqu	%ymm14,256(%rdi)
1554	vmovdqu	%ymm2,288(%rdi)
1555	vmovdqu	%ymm3,320(%rdi)
1556	vmovdqu	%ymm7,352(%rdi)
# ZF is still the result of the cmpq $384,%rdx in L$tail8x that selected
# this handler, because vpxor and vmovdqu do not modify the flags. Equal
# means exactly 384 bytes were left and all of them have been written.
1557	je	L$done8x
1558
# Otherwise (1-63 bytes left on the L$tail8x path): step both pointers past
# the 384 bytes just handled, reduce the count, zero the byte index %r10 and
# stage the next 64 bytes of XOR data (%ymm11, %ymm9) at (%rsp) for the
# byte loop.
1559	leaq	384(%rsi),%rsi
1560	xorq	%r10,%r10
1561	vmovdqa	%ymm11,0(%rsp)
1562	leaq	384(%rdi),%rdi
1563	subq	$384,%rdx
1564	vmovdqa	%ymm9,32(%rsp)
1565	jmp	L$oop_tail8x
1566
1567.p2align	5
# Tail handler for at least 448 bytes. When dispatched from L$tail8x on the
# path visible in this part of the file, the count in %rdx lies in the
# range 448-511.
# XOR the first 448 input bytes at (%rsi) with %ymm6, %ymm8, %ymm1, %ymm5,
# %ymm12, %ymm13, %ymm10, %ymm15, %ymm14, %ymm2, %ymm3, %ymm7, %ymm11, %ymm9
# (in that order) and store the result at (%rdi).
1568L$448_or_more8x:
1569	vpxor	0(%rsi),%ymm6,%ymm6
1570	vpxor	32(%rsi),%ymm8,%ymm8
1571	vpxor	64(%rsi),%ymm1,%ymm1
1572	vpxor	96(%rsi),%ymm5,%ymm5
1573	vpxor	128(%rsi),%ymm12,%ymm12
1574	vpxor	160(%rsi),%ymm13,%ymm13
1575	vpxor	192(%rsi),%ymm10,%ymm10
1576	vpxor	224(%rsi),%ymm15,%ymm15
1577	vpxor	256(%rsi),%ymm14,%ymm14
1578	vpxor	288(%rsi),%ymm2,%ymm2
1579	vpxor	320(%rsi),%ymm3,%ymm3
1580	vpxor	352(%rsi),%ymm7,%ymm7
1581	vpxor	384(%rsi),%ymm11,%ymm11
1582	vpxor	416(%rsi),%ymm9,%ymm9
1583	vmovdqu	%ymm6,0(%rdi)
1584	vmovdqu	%ymm8,32(%rdi)
1585	vmovdqu	%ymm1,64(%rdi)
1586	vmovdqu	%ymm5,96(%rdi)
1587	vmovdqu	%ymm12,128(%rdi)
1588	vmovdqu	%ymm13,160(%rdi)
1589	vmovdqu	%ymm10,192(%rdi)
1590	vmovdqu	%ymm15,224(%rdi)
1591	vmovdqu	%ymm14,256(%rdi)
1592	vmovdqu	%ymm2,288(%rdi)
1593	vmovdqu	%ymm3,320(%rdi)
1594	vmovdqu	%ymm7,352(%rdi)
1595	vmovdqu	%ymm11,384(%rdi)
1596	vmovdqu	%ymm9,416(%rdi)
# ZF is still the result of the cmpq $448,%rdx in L$tail8x that selected
# this handler, because vpxor and vmovdqu do not modify the flags. Equal
# means exactly 448 bytes were left and all of them have been written.
1597	je	L$done8x
1598
# Otherwise step both pointers past the 448 bytes just handled, reduce the
# count, zero the byte index %r10 and stage the last 64 bytes of XOR data
# (%ymm0, %ymm4) at (%rsp). There is no jmp here: execution falls straight
# through into L$oop_tail8x, which follows this handler.
1599	leaq	448(%rsi),%rsi
1600	xorq	%r10,%r10
1601	vmovdqa	%ymm0,0(%rsp)
1602	leaq	448(%rdi),%rdi
1603	subq	$448,%rdx
1604	vmovdqa	%ymm4,32(%rsp)
1605
# Byte-at-a-time finish. For each index in %r10, counting up, one input byte
# from (%rsi) is XORed with the byte at the same offset from (%rsp) and
# written to (%rdi); %rdx counts the remaining bytes down to zero.
# The paths in this part of the file that reach the loop zero %r10 and stage
# 64 bytes at (%rsp) beforehand.
# NOTE(review): the loop is bottom-tested, so it presumes %rdx is nonzero on
# entry - confirm that no path can arrive here with a zero count.
1606L$oop_tail8x:
1607	movzbl	(%rsi,%r10,1),%eax
1608	movzbl	(%rsp,%r10,1),%ecx
# Advance the index before the store; the store below compensates with a
# displacement of -1.
1609	leaq	1(%r10),%r10
1610	xorl	%ecx,%eax
1611	movb	%al,-1(%rdi,%r10,1)
1612	decq	%rdx
1613	jnz	L$oop_tail8x
1614
# Common exit for the 8x path. vzeroall zeroes every ymm register before
# returning.
# NOTE(review): presumably this is meant to scrub cipher state from the
# vector registers - confirm intent.
1615L$done8x:
1616	vzeroall
# Reload the stack pointer from %r9.
# NOTE(review): %r9 is expected to hold the stack pointer saved by the
# prologue, which lies outside this part of the file - confirm.
1617	leaq	(%r9),%rsp
1618
1619L$8x_epilogue:
# Bytes F3 C3 encode rep ret, which performs the function return.
1620	.byte	0xf3,0xc3
1621
1622
1623#endif
1624