• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; This file is generated from a similarly-named Perl script in the BoringSSL
2; source tree. Do not edit by hand.
3
; Everything that follows is assembled only when NASM's output format is win64.
4%ifidn __OUTPUT_FORMAT__, win64
; Symbol references inside memory operands default to RIP-relative addressing.
5default	rel
; The operand-size words used on memory operands in this file are defined to
; nothing, so they disappear during preprocessing.
6%define XMMWORD
7%define YMMWORD
8%define ZMMWORD
; _CET_ENDBR is also defined empty here, so each use of it emits no bytes.
9%define _CET_ENDBR
10
; The symbol-prefix include is pulled in only when BORINGSSL_PREFIX is defined.
11%ifdef BORINGSSL_PREFIX
12%include "boringssl_prefix_symbols_nasm.inc"
13%endif
14section	.text code align=64
15
16
; Read-only constant tables used by the ChaCha20 routines in this file.
; The vector constants are loaded with aligned moves (movdqa) or used as
; memory operands of paddd/vpaddd, hence the 64-byte alignment.
17section	.rdata rdata align=8
18ALIGN	64
; $L$zero is not referenced in the part of the file visible here.
19$L$zero:
20	DD	0,0,0,0
; Adds 1 to the lowest dword only (block-counter increment in the
; one-block-at-a-time routines).
21$L$one:
22	DD	1,0,0,0
; Per-lane counter offsets 0..3 for the 4-lane routine.
23$L$inc:
24	DD	0,1,2,3
; Counter step for the 4-lane routine: every lane advances by 4.
25$L$four:
26	DD	4,4,4,4
; Per-lane counter offsets for the 8-lane (ymm) routine.
27$L$incy:
28	DD	0,2,4,6,1,3,5,7
; Counter step for the 8-lane routine: every lane advances by 8.
29$L$eight:
30	DD	8,8,8,8,8,8,8,8
; pshufb control mask: within each 32-bit lane picks bytes 2,3,0,1,
; i.e. rotates every dword by 16 bits.
31$L$rot16:
32	DB	0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
; pshufb control mask: within each 32-bit lane picks bytes 3,0,1,2,
; i.e. rotates every dword left by 8 bits (equivalently right by 24).
33$L$rot24:
34	DB	0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
; The 16 ASCII bytes "expand 32-byte k" followed by a NUL; loaded as the
; first row of the state by the SSSE3 routines.
35$L$sigma:
36	DB	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
37	DB	0
38ALIGN	64
; The four 16-dword tables below are not referenced in the part of the file
; visible here.
39$L$zeroz:
40	DD	0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
41$L$fourz:
42	DD	4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
43$L$incz:
44	DD	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
45$L$sixteen:
46	DD	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
; NUL-terminated ASCII banner: "ChaCha20 for x86_64, CRYPTOGAMS by" followed
; by an e-mail address in angle brackets.
47	DB	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
48	DB	95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
49	DB	98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
50	DB	108,46,111,114,103,62,0
51section	.text
52
;-----------------------------------------------------------------------
; ChaCha20_ctr32_nohw -- ChaCha20 keystream XOR with the sixteen state
; words kept in general-purpose registers (SSE is used only for loading,
; counting and one 16-byte row of the output).
;
; Entry (Win64): the caller's rdi/rsi are parked at [rsp+8]/[rsp+16], then
; rcx, rdx, r8, r9 and the fifth argument at [rsp+40] are remapped to
; rdi, rsi, rdx, rcx, r8:
;   rdi = destination (written 64 bytes at a time, byte-wise in the tail)
;   rsi = source that is XORed with the keystream
;   rdx = byte count (kept in rbp; not checked for zero here)
;   rcx = points to 32 bytes loaded as state words 4..11 (presumably the key)
;   r8  = points to 16 bytes loaded as state words 12..15 (presumably
;         counter and nonce; only the lowest dword is incremented per block)
;
; Frame after the six pushes and `sub rsp,64+24`:
;   [rsp+0  .. rsp+63]  input-state copy / keystream scratch
;   [rsp+64]            saved byte count
;   [rsp+72]            saved source pointer
;   [rsp+80]            saved destination pointer
;
; NOTE(review): a zero byte count would reach $L$tail and decrement rbp
; from 0; callers presumably never pass 0 -- confirm. Nothing in this
; function jumps to $L$no_data.
;-----------------------------------------------------------------------
53global	ChaCha20_ctr32_nohw
54
55ALIGN	64
56ChaCha20_ctr32_nohw:
57	mov	QWORD[8+rsp],rdi	;WIN64 prologue
58	mov	QWORD[16+rsp],rsi
59	mov	rax,rsp
60$L$SEH_begin_ChaCha20_ctr32_nohw:
61	mov	rdi,rcx
62	mov	rsi,rdx
63	mov	rdx,r8
64	mov	rcx,r9
65	mov	r8,QWORD[40+rsp]
66
67
68
69_CET_ENDBR
; Save the callee-saved integer registers that the round code uses.
70	push	rbx
71
72	push	rbp
73
74	push	r12
75
76	push	r13
77
78	push	r14
79
80	push	r15
81
; 48 bytes of pushes plus 88 here keep rsp 16-byte aligned for the movdqa
; stores below, assuming the usual rsp % 16 == 8 at entry.
82	sub	rsp,64+24
83
84$L$ctr32_body:
85
86
; xmm1/xmm2 = the 32 bytes at rcx, xmm3 = the 16 bytes at r8,
; xmm4 = {1,0,0,0} used to bump the lowest dword of xmm3 per block.
87	movdqu	xmm1,XMMWORD[rcx]
88	movdqu	xmm2,XMMWORD[16+rcx]
89	movdqu	xmm3,XMMWORD[r8]
90	movdqa	xmm4,XMMWORD[$L$one]
91
92
; Keep a copy of state words 4..15 on the stack; rbp = remaining byte count.
93	movdqa	XMMWORD[16+rsp],xmm1
94	movdqa	XMMWORD[32+rsp],xmm2
95	movdqa	XMMWORD[48+rsp],xmm3
96	mov	rbp,rdx
97	jmp	NEAR $L$oop_outer
98
99ALIGN	32
; ---- per-block setup --------------------------------------------------
; Working registers: eax,ebx,ecx,edx = words 0..3 (the four constants),
; r8d..r11d = words 4..7, r12d..r15d = words 12..15. Words 8..11 share
; esi/edi two at a time, with the other pair spilled at [rsp+32..47].
100$L$oop_outer:
101	mov	eax,0x61707865
102	mov	ebx,0x3320646e
103	mov	ecx,0x79622d32
104	mov	edx,0x6b206574
105	mov	r8d,DWORD[16+rsp]
106	mov	r9d,DWORD[20+rsp]
107	mov	r10d,DWORD[24+rsp]
108	mov	r11d,DWORD[28+rsp]
; Word 12 comes from xmm3, which carries the current block counter.
109	movd	r12d,xmm3
110	mov	r13d,DWORD[52+rsp]
111	mov	r14d,DWORD[56+rsp]
112	mov	r15d,DWORD[60+rsp]
113
; rbp, rsi and rdi are reused inside the round loop, so park them first.
114	mov	QWORD[((64+0))+rsp],rbp
; ebp = iteration count: 10 passes through $L$oop.
115	mov	ebp,10
116	mov	QWORD[((64+8))+rsp],rsi
; Raw encoding 66 48 0F 7E D6 = movq rsi,xmm2: esi = word 8, and after the
; shift below edi = word 9.
117DB	102,72,15,126,214
118	mov	QWORD[((64+16))+rsp],rdi
119	mov	rdi,rsi
120	shr	rdi,32
121	jmp	NEAR $L$oop
122
123ALIGN	32
; ---- one pass = eight quarter-rounds ----------------------------------
; Each quarter-round is add/xor/rol with rotate amounts 16, 12, 8, 7.
; First two quarter-rounds, interleaved: (eax,r8d,esi,r12d) and
; (ebx,r9d,edi,r13d).
124$L$oop:
125	add	eax,r8d
126	xor	r12d,eax
127	rol	r12d,16
128	add	ebx,r9d
129	xor	r13d,ebx
130	rol	r13d,16
131	add	esi,r12d
132	xor	r8d,esi
133	rol	r8d,12
134	add	edi,r13d
135	xor	r9d,edi
136	rol	r9d,12
137	add	eax,r8d
138	xor	r12d,eax
139	rol	r12d,8
140	add	ebx,r9d
141	xor	r13d,ebx
142	rol	r13d,8
143	add	esi,r12d
144	xor	r8d,esi
145	rol	r8d,7
146	add	edi,r13d
147	xor	r9d,edi
148	rol	r9d,7
; Swap the pair held in esi/edi: spill words 8,9 and load words 10,11.
149	mov	DWORD[32+rsp],esi
150	mov	DWORD[36+rsp],edi
151	mov	esi,DWORD[40+rsp]
152	mov	edi,DWORD[44+rsp]
; Next two quarter-rounds: (ecx,r10d,esi,r14d) and (edx,r11d,edi,r15d).
153	add	ecx,r10d
154	xor	r14d,ecx
155	rol	r14d,16
156	add	edx,r11d
157	xor	r15d,edx
158	rol	r15d,16
159	add	esi,r14d
160	xor	r10d,esi
161	rol	r10d,12
162	add	edi,r15d
163	xor	r11d,edi
164	rol	r11d,12
165	add	ecx,r10d
166	xor	r14d,ecx
167	rol	r14d,8
168	add	edx,r11d
169	xor	r15d,edx
170	rol	r15d,8
171	add	esi,r14d
172	xor	r10d,esi
173	rol	r10d,7
174	add	edi,r15d
175	xor	r11d,edi
176	rol	r11d,7
; Second half of the pass uses shifted pairings; esi/edi still hold
; words 10,11: (eax,r9d,esi,r15d) and (ebx,r10d,edi,r12d).
177	add	eax,r9d
178	xor	r15d,eax
179	rol	r15d,16
180	add	ebx,r10d
181	xor	r12d,ebx
182	rol	r12d,16
183	add	esi,r15d
184	xor	r9d,esi
185	rol	r9d,12
186	add	edi,r12d
187	xor	r10d,edi
188	rol	r10d,12
189	add	eax,r9d
190	xor	r15d,eax
191	rol	r15d,8
192	add	ebx,r10d
193	xor	r12d,ebx
194	rol	r12d,8
195	add	esi,r15d
196	xor	r9d,esi
197	rol	r9d,7
198	add	edi,r12d
199	xor	r10d,edi
200	rol	r10d,7
; Swap back: spill words 10,11 and reload words 8,9.
201	mov	DWORD[40+rsp],esi
202	mov	DWORD[44+rsp],edi
203	mov	esi,DWORD[32+rsp]
204	mov	edi,DWORD[36+rsp]
; Last two quarter-rounds: (ecx,r11d,esi,r13d) and (edx,r8d,edi,r14d).
205	add	ecx,r11d
206	xor	r13d,ecx
207	rol	r13d,16
208	add	edx,r8d
209	xor	r14d,edx
210	rol	r14d,16
211	add	esi,r13d
212	xor	r11d,esi
213	rol	r11d,12
214	add	edi,r14d
215	xor	r8d,edi
216	rol	r8d,12
217	add	ecx,r11d
218	xor	r13d,ecx
219	rol	r13d,8
220	add	edx,r8d
221	xor	r14d,edx
222	rol	r14d,8
223	add	esi,r13d
224	xor	r11d,esi
225	rol	r11d,7
226	add	edi,r14d
227	xor	r8d,edi
228	rol	r8d,7
229	dec	ebp
230	jnz	NEAR $L$oop
; ---- finish the block -------------------------------------------------
; Flush words 8,9 so [rsp+32..47] holds all of the worked words 8..11, then
; restore the byte count and the source/destination pointers.
231	mov	DWORD[36+rsp],edi
232	mov	DWORD[32+rsp],esi
233	mov	rbp,QWORD[64+rsp]
; xmm1 = original words 8..11 (xmm2 is never modified); the worked words
; are added to it below.
234	movdqa	xmm1,xmm2
235	mov	rsi,QWORD[((64+8))+rsp]
; Advance the block counter kept in xmm3 by one.
236	paddd	xmm3,xmm4
237	mov	rdi,QWORD[((64+16))+rsp]
238
; Add the input state to the worked state. [rsp+48] still holds the counter
; value this block started with.
239	add	eax,0x61707865
240	add	ebx,0x3320646e
241	add	ecx,0x79622d32
242	add	edx,0x6b206574
243	add	r8d,DWORD[16+rsp]
244	add	r9d,DWORD[20+rsp]
245	add	r10d,DWORD[24+rsp]
246	add	r11d,DWORD[28+rsp]
247	add	r12d,DWORD[48+rsp]
248	add	r13d,DWORD[52+rsp]
249	add	r14d,DWORD[56+rsp]
250	add	r15d,DWORD[60+rsp]
251	paddd	xmm1,XMMWORD[32+rsp]
252
; Fewer than 64 bytes left: finish byte by byte in $L$tail.
253	cmp	rbp,64
254	jb	NEAR $L$tail
255
; Full block: XOR the 64 keystream bytes with the source ...
256	xor	eax,DWORD[rsi]
257	xor	ebx,DWORD[4+rsi]
258	xor	ecx,DWORD[8+rsi]
259	xor	edx,DWORD[12+rsi]
260	xor	r8d,DWORD[16+rsi]
261	xor	r9d,DWORD[20+rsi]
262	xor	r10d,DWORD[24+rsi]
263	xor	r11d,DWORD[28+rsi]
264	movdqu	xmm0,XMMWORD[32+rsi]
265	xor	r12d,DWORD[48+rsi]
266	xor	r13d,DWORD[52+rsi]
267	xor	r14d,DWORD[56+rsi]
268	xor	r15d,DWORD[60+rsi]
269	lea	rsi,[64+rsi]
270	pxor	xmm0,xmm1
271
; ... re-arm the stack copy for the next block: original words 8..11 and
; the incremented counter dword ...
272	movdqa	XMMWORD[32+rsp],xmm2
273	movd	DWORD[48+rsp],xmm3
274
; ... and write the 64 result bytes to the destination.
275	mov	DWORD[rdi],eax
276	mov	DWORD[4+rdi],ebx
277	mov	DWORD[8+rdi],ecx
278	mov	DWORD[12+rdi],edx
279	mov	DWORD[16+rdi],r8d
280	mov	DWORD[20+rdi],r9d
281	mov	DWORD[24+rdi],r10d
282	mov	DWORD[28+rdi],r11d
283	movdqu	XMMWORD[32+rdi],xmm0
284	mov	DWORD[48+rdi],r12d
285	mov	DWORD[52+rdi],r13d
286	mov	DWORD[56+rdi],r14d
287	mov	DWORD[60+rdi],r15d
288	lea	rdi,[64+rdi]
289
290	sub	rbp,64
291	jnz	NEAR $L$oop_outer
292
293	jmp	NEAR $L$done
294
295ALIGN	16
; ---- partial final block ----------------------------------------------
; Lay the 64 keystream bytes out at [rsp+0..63] and zero rbx as byte index.
296$L$tail:
297	mov	DWORD[rsp],eax
298	mov	DWORD[4+rsp],ebx
299	xor	rbx,rbx
300	mov	DWORD[8+rsp],ecx
301	mov	DWORD[12+rsp],edx
302	mov	DWORD[16+rsp],r8d
303	mov	DWORD[20+rsp],r9d
304	mov	DWORD[24+rsp],r10d
305	mov	DWORD[28+rsp],r11d
306	movdqa	XMMWORD[32+rsp],xmm1
307	mov	DWORD[48+rsp],r12d
308	mov	DWORD[52+rsp],r13d
309	mov	DWORD[56+rsp],r14d
310	mov	DWORD[60+rsp],r15d
311
; dst[i] = src[i] XOR keystream[i] for the rbp remaining bytes.
312$L$oop_tail:
313	movzx	eax,BYTE[rbx*1+rsi]
314	movzx	edx,BYTE[rbx*1+rsp]
315	lea	rbx,[1+rbx]
316	xor	eax,edx
317	mov	BYTE[((-1))+rbx*1+rdi],al
318	dec	rbp
319	jnz	NEAR $L$oop_tail
320
; ---- epilogue ---------------------------------------------------------
; rsi = rsp as it was before the six pushes; reload the saved registers
; relative to it and then restore rsp.
321$L$done:
322	lea	rsi,[((64+24+48))+rsp]
323	mov	r15,QWORD[((-48))+rsi]
324
325	mov	r14,QWORD[((-40))+rsi]
326
327	mov	r13,QWORD[((-32))+rsi]
328
329	mov	r12,QWORD[((-24))+rsi]
330
331	mov	rbp,QWORD[((-16))+rsi]
332
333	mov	rbx,QWORD[((-8))+rsi]
334
335	lea	rsp,[rsi]
336
; Restore the caller's rdi/rsi from the slots written in the prologue.
337$L$no_data:
338	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
339	mov	rsi,QWORD[16+rsp]
340	ret
341
342$L$SEH_end_ChaCha20_ctr32_nohw:
;-----------------------------------------------------------------------
; ChaCha20_ctr32_ssse3 -- one 64-byte block per outer iteration, with the
; 4x4 state held as four rows in xmm0..xmm3.
;
; Entry (Win64): same register remapping as the prologue below shows:
;   rdi = destination, rsi = source, rdx = byte count (not checked for
;   zero here), rcx = points to 32 bytes loaded as rows 1 and 2 (presumably
;   the key), r8 = points to 16 bytes loaded as row 3 (presumably counter
;   and nonce).
; r9 keeps the entry rsp; xmm6/xmm7 are saved at [r9-40]/[r9-24] and
; [rsp+0..63] holds the input state (later the keystream in the tail).
; xmm6 = $L$rot16 mask, xmm7 = $L$rot24 mask, xmm4/xmm5 = temporaries.
;-----------------------------------------------------------------------
343global	ChaCha20_ctr32_ssse3
344
345ALIGN	32
346ChaCha20_ctr32_ssse3:
347	mov	QWORD[8+rsp],rdi	;WIN64 prologue
348	mov	QWORD[16+rsp],rsi
349	mov	rax,rsp
350$L$SEH_begin_ChaCha20_ctr32_ssse3:
351	mov	rdi,rcx
352	mov	rsi,rdx
353	mov	rdx,r8
354	mov	rcx,r9
355	mov	r8,QWORD[40+rsp]
356
357
358
359_CET_ENDBR
; r9 = frame anchor used by the epilogue to restore xmm6/xmm7 and rsp.
360	mov	r9,rsp
361
362	sub	rsp,64+40
363	movaps	XMMWORD[(-40)+r9],xmm6
364	movaps	XMMWORD[(-24)+r9],xmm7
365$L$ssse3_body:
; Build the input state: row 0 = $L$sigma, rows 1..2 from rcx, row 3 from r8.
366	movdqa	xmm0,XMMWORD[$L$sigma]
367	movdqu	xmm1,XMMWORD[rcx]
368	movdqu	xmm2,XMMWORD[16+rcx]
369	movdqu	xmm3,XMMWORD[r8]
370	movdqa	xmm6,XMMWORD[$L$rot16]
371	movdqa	xmm7,XMMWORD[$L$rot24]
372
; Keep the input state on the stack for the final add and for later blocks.
373	movdqa	XMMWORD[rsp],xmm0
374	movdqa	XMMWORD[16+rsp],xmm1
375	movdqa	XMMWORD[32+rsp],xmm2
376	movdqa	XMMWORD[48+rsp],xmm3
; r8 = iteration count: 10 passes through $L$oop_ssse3. The first block
; enters the loop directly, without a counter increment.
377	mov	r8,10
378	jmp	NEAR $L$oop_ssse3
379
380ALIGN	32
; Every block after the first: reload the saved state, add {1,0,0,0} to
; row 3 (counter + 1) and write the new row 3 back.
381$L$oop_outer_ssse3:
382	movdqa	xmm3,XMMWORD[$L$one]
383	movdqa	xmm0,XMMWORD[rsp]
384	movdqa	xmm1,XMMWORD[16+rsp]
385	movdqa	xmm2,XMMWORD[32+rsp]
386	paddd	xmm3,XMMWORD[48+rsp]
387	mov	r8,10
388	movdqa	XMMWORD[48+rsp],xmm3
389	jmp	NEAR $L$oop_ssse3
390
391ALIGN	32
; ---- one pass = two rounds --------------------------------------------
; Four quarter-rounds at once, one per 32-bit lane of the rows.
392$L$oop_ssse3:
393	paddd	xmm0,xmm1
394	pxor	xmm3,xmm0
; Raw encoding 66 0F 38 00 DE = pshufb xmm3,xmm6 (rotate each dword by 16).
395DB	102,15,56,0,222
396	paddd	xmm2,xmm3
397	pxor	xmm1,xmm2
; Rotate each dword of xmm1 left by 12 via shift-right 20 / shift-left 12 / or.
398	movdqa	xmm4,xmm1
399	psrld	xmm1,20
400	pslld	xmm4,12
401	por	xmm1,xmm4
402	paddd	xmm0,xmm1
403	pxor	xmm3,xmm0
; Raw encoding 66 0F 38 00 DF = pshufb xmm3,xmm7 (rotate each dword left by 8).
404DB	102,15,56,0,223
405	paddd	xmm2,xmm3
406	pxor	xmm1,xmm2
; Rotate each dword of xmm1 left by 7.
407	movdqa	xmm4,xmm1
408	psrld	xmm1,25
409	pslld	xmm4,7
410	por	xmm1,xmm4
; Rotate the lanes of rows 1, 2 and 3 by different amounts so that the
; second set of quarter-rounds works on the diagonals.
411	pshufd	xmm2,xmm2,78
412	pshufd	xmm1,xmm1,57
413	pshufd	xmm3,xmm3,147
414	nop
; Second set of quarter-rounds: same instruction sequence as the first.
415	paddd	xmm0,xmm1
416	pxor	xmm3,xmm0
417DB	102,15,56,0,222
418	paddd	xmm2,xmm3
419	pxor	xmm1,xmm2
420	movdqa	xmm4,xmm1
421	psrld	xmm1,20
422	pslld	xmm4,12
423	por	xmm1,xmm4
424	paddd	xmm0,xmm1
425	pxor	xmm3,xmm0
426DB	102,15,56,0,223
427	paddd	xmm2,xmm3
428	pxor	xmm1,xmm2
429	movdqa	xmm4,xmm1
430	psrld	xmm1,25
431	pslld	xmm4,7
432	por	xmm1,xmm4
; Undo the lane rotation (the 57/147 immediates are swapped relative to above).
433	pshufd	xmm2,xmm2,78
434	pshufd	xmm1,xmm1,147
435	pshufd	xmm3,xmm3,57
436	dec	r8
437	jnz	NEAR $L$oop_ssse3
; Add the input state to the worked state to obtain 64 keystream bytes.
438	paddd	xmm0,XMMWORD[rsp]
439	paddd	xmm1,XMMWORD[16+rsp]
440	paddd	xmm2,XMMWORD[32+rsp]
441	paddd	xmm3,XMMWORD[48+rsp]
442
; Fewer than 64 bytes left: finish byte by byte in $L$tail_ssse3.
443	cmp	rdx,64
444	jb	NEAR $L$tail_ssse3
445
; Full block: XOR with 64 source bytes and store to the destination.
446	movdqu	xmm4,XMMWORD[rsi]
447	movdqu	xmm5,XMMWORD[16+rsi]
448	pxor	xmm0,xmm4
449	movdqu	xmm4,XMMWORD[32+rsi]
450	pxor	xmm1,xmm5
451	movdqu	xmm5,XMMWORD[48+rsi]
452	lea	rsi,[64+rsi]
453	pxor	xmm2,xmm4
454	pxor	xmm3,xmm5
455
456	movdqu	XMMWORD[rdi],xmm0
457	movdqu	XMMWORD[16+rdi],xmm1
458	movdqu	XMMWORD[32+rdi],xmm2
459	movdqu	XMMWORD[48+rdi],xmm3
460	lea	rdi,[64+rdi]
461
462	sub	rdx,64
463	jnz	NEAR $L$oop_outer_ssse3
464
465	jmp	NEAR $L$done_ssse3
466
467ALIGN	16
; ---- partial final block ----------------------------------------------
; Spill the keystream over the saved state (no further block follows) and
; zero r8 as the byte index.
468$L$tail_ssse3:
469	movdqa	XMMWORD[rsp],xmm0
470	movdqa	XMMWORD[16+rsp],xmm1
471	movdqa	XMMWORD[32+rsp],xmm2
472	movdqa	XMMWORD[48+rsp],xmm3
473	xor	r8,r8
474
; dst[i] = src[i] XOR keystream[i] for the rdx remaining bytes.
475$L$oop_tail_ssse3:
476	movzx	eax,BYTE[r8*1+rsi]
477	movzx	ecx,BYTE[r8*1+rsp]
478	lea	r8,[1+r8]
479	xor	eax,ecx
480	mov	BYTE[((-1))+r8*1+rdi],al
481	dec	rdx
482	jnz	NEAR $L$oop_tail_ssse3
483
; ---- epilogue: restore xmm6/xmm7, rsp and the caller's rdi/rsi ---------
484$L$done_ssse3:
485	movaps	xmm6,XMMWORD[((-40))+r9]
486	movaps	xmm7,XMMWORD[((-24))+r9]
487	lea	rsp,[r9]
488
489$L$ssse3_epilogue:
490	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
491	mov	rsi,QWORD[16+rsp]
492	ret
493
494$L$SEH_end_ChaCha20_ctr32_ssse3:
;-----------------------------------------------------------------------
; ChaCha20_ctr32_ssse3_4x -- computes four 64-byte keystream blocks per
; outer iteration; each XMM register holds one state word for four blocks,
; one block per 32-bit lane.
;
; Entry (Win64): same register remapping as the prologue below shows:
;   rdi = destination, rsi = source, rdx = byte count (not checked for
;   zero here), rcx = points to 32 bytes (state words 4..11, presumably the
;   key), r8 = points to 16 bytes (state words 12..15, presumably counter
;   and nonce).
; r9 keeps the entry rsp; xmm6..xmm15 are saved just below it.
; After setup rcx = rsp+256 is the base for the saved, splatted state:
;   [rsp+0   .. rsp+63 ]  scratch: spilled words during the rounds, then
;                          keystream rows for the output / tail code
;   [rsp+64  .. rsp+127]  state words 0..3  (from $L$sigma)
;   [rsp+128 .. rsp+191]  state words 4..7
;   [rsp+192 .. rsp+255]  state words 8..11
;   [rsp+256 .. rsp+319]  state words 12..15 (word 12 = per-lane counters)
; r10 = address of $L$rot16, r11 = address of $L$rot24.
;-----------------------------------------------------------------------
495global	ChaCha20_ctr32_ssse3_4x
496
497ALIGN	32
498ChaCha20_ctr32_ssse3_4x:
499	mov	QWORD[8+rsp],rdi	;WIN64 prologue
500	mov	QWORD[16+rsp],rsi
501	mov	rax,rsp
502$L$SEH_begin_ChaCha20_ctr32_ssse3_4x:
503	mov	rdi,rcx
504	mov	rsi,rdx
505	mov	rdx,r8
506	mov	rcx,r9
507	mov	r8,QWORD[40+rsp]
508
509
510
511_CET_ENDBR
512	mov	r9,rsp
513
; NOTE(review): r11 is reloaded by the lea further down before any visible
; read, so this copy looks like a leftover -- confirm before relying on it.
514	mov	r11,r10
515	sub	rsp,0x140+168
; Save xmm6..xmm15, all of which are overwritten below.
516	movaps	XMMWORD[(-168)+r9],xmm6
517	movaps	XMMWORD[(-152)+r9],xmm7
518	movaps	XMMWORD[(-136)+r9],xmm8
519	movaps	XMMWORD[(-120)+r9],xmm9
520	movaps	XMMWORD[(-104)+r9],xmm10
521	movaps	XMMWORD[(-88)+r9],xmm11
522	movaps	XMMWORD[(-72)+r9],xmm12
523	movaps	XMMWORD[(-56)+r9],xmm13
524	movaps	XMMWORD[(-40)+r9],xmm14
525	movaps	XMMWORD[(-24)+r9],xmm15
526$L$4x_body:
; Load the four input rows, then repoint rcx/r10/r11 at the state area and
; the two shuffle masks.
527	movdqa	xmm11,XMMWORD[$L$sigma]
528	movdqu	xmm15,XMMWORD[rcx]
529	movdqu	xmm7,XMMWORD[16+rcx]
530	movdqu	xmm3,XMMWORD[r8]
531	lea	rcx,[256+rsp]
532	lea	r10,[$L$rot16]
533	lea	r11,[$L$rot24]
534
; Splat each dword of a row across all four lanes (pshufd 0x00/0x55/0xaa/
; 0xff) and save the splatted copy. Words 0..3 go to xmm8..xmm11.
535	pshufd	xmm8,xmm11,0x00
536	pshufd	xmm9,xmm11,0x55
537	movdqa	XMMWORD[64+rsp],xmm8
538	pshufd	xmm10,xmm11,0xaa
539	movdqa	XMMWORD[80+rsp],xmm9
540	pshufd	xmm11,xmm11,0xff
541	movdqa	XMMWORD[96+rsp],xmm10
542	movdqa	XMMWORD[112+rsp],xmm11
543
; Words 4..7 go to xmm12..xmm15.
544	pshufd	xmm12,xmm15,0x00
545	pshufd	xmm13,xmm15,0x55
546	movdqa	XMMWORD[(128-256)+rcx],xmm12
547	pshufd	xmm14,xmm15,0xaa
548	movdqa	XMMWORD[(144-256)+rcx],xmm13
549	pshufd	xmm15,xmm15,0xff
550	movdqa	XMMWORD[(160-256)+rcx],xmm14
551	movdqa	XMMWORD[(176-256)+rcx],xmm15
552
; Words 8..11 go to xmm4..xmm7.
553	pshufd	xmm4,xmm7,0x00
554	pshufd	xmm5,xmm7,0x55
555	movdqa	XMMWORD[(192-256)+rcx],xmm4
556	pshufd	xmm6,xmm7,0xaa
557	movdqa	XMMWORD[(208-256)+rcx],xmm5
558	pshufd	xmm7,xmm7,0xff
559	movdqa	XMMWORD[(224-256)+rcx],xmm6
560	movdqa	XMMWORD[(240-256)+rcx],xmm7
561
; Words 12..15 go to xmm0..xmm3. Word 12 gets {0,1,2,3} added so each lane
; has its own block counter; it is stored at $L$oop_enter4x.
562	pshufd	xmm0,xmm3,0x00
563	pshufd	xmm1,xmm3,0x55
564	paddd	xmm0,XMMWORD[$L$inc]
565	pshufd	xmm2,xmm3,0xaa
566	movdqa	XMMWORD[(272-256)+rcx],xmm1
567	pshufd	xmm3,xmm3,0xff
568	movdqa	XMMWORD[(288-256)+rcx],xmm2
569	movdqa	XMMWORD[(304-256)+rcx],xmm3
570
571	jmp	NEAR $L$oop_enter4x
572
573ALIGN	32
; Every group of four blocks after the first: reload the splatted state and
; advance all four lane counters by 4.
574$L$oop_outer4x:
575	movdqa	xmm8,XMMWORD[64+rsp]
576	movdqa	xmm9,XMMWORD[80+rsp]
577	movdqa	xmm10,XMMWORD[96+rsp]
578	movdqa	xmm11,XMMWORD[112+rsp]
579	movdqa	xmm12,XMMWORD[((128-256))+rcx]
580	movdqa	xmm13,XMMWORD[((144-256))+rcx]
581	movdqa	xmm14,XMMWORD[((160-256))+rcx]
582	movdqa	xmm15,XMMWORD[((176-256))+rcx]
583	movdqa	xmm4,XMMWORD[((192-256))+rcx]
584	movdqa	xmm5,XMMWORD[((208-256))+rcx]
585	movdqa	xmm6,XMMWORD[((224-256))+rcx]
586	movdqa	xmm7,XMMWORD[((240-256))+rcx]
587	movdqa	xmm0,XMMWORD[((256-256))+rcx]
588	movdqa	xmm1,XMMWORD[((272-256))+rcx]
589	movdqa	xmm2,XMMWORD[((288-256))+rcx]
590	movdqa	xmm3,XMMWORD[((304-256))+rcx]
591	paddd	xmm0,XMMWORD[$L$four]
592
; Spill words 10 and 11 so xmm6/xmm7 can serve as temporaries and shuffle
; masks; xmm7 starts as the rot16 mask. Save the current counter vector and
; set eax = 10 passes through $L$oop4x.
593$L$oop_enter4x:
594	movdqa	XMMWORD[32+rsp],xmm6
595	movdqa	XMMWORD[48+rsp],xmm7
596	movdqa	xmm7,XMMWORD[r10]
597	mov	eax,10
598	movdqa	XMMWORD[(256-256)+rcx],xmm0
599	jmp	NEAR $L$oop4x
600
601ALIGN	32
; ---- one pass = eight quarter-rounds, each on four blocks at once ------
; xmm6/xmm7 alternate between rotate temporaries and pshufb masks: xmm7 is
; reloaded from [r10] (rot16) and xmm6 from [r11] (rot24) as needed.
; The DB lines are raw pshufb encodings (66 0F 38 00 /r); the last byte
; selects the operands, e.g. 199 = pshufb xmm0,xmm7, 198 = pshufb xmm0,xmm6.
; Quarter-rounds on words (0,4,8,12) and (1,5,9,13), interleaved.
602$L$oop4x:
603	paddd	xmm8,xmm12
604	paddd	xmm9,xmm13
605	pxor	xmm0,xmm8
606	pxor	xmm1,xmm9
607DB	102,15,56,0,199
608DB	102,15,56,0,207
609	paddd	xmm4,xmm0
610	paddd	xmm5,xmm1
611	pxor	xmm12,xmm4
612	pxor	xmm13,xmm5
613	movdqa	xmm6,xmm12
614	pslld	xmm12,12
615	psrld	xmm6,20
616	movdqa	xmm7,xmm13
617	pslld	xmm13,12
618	por	xmm12,xmm6
619	psrld	xmm7,20
620	movdqa	xmm6,XMMWORD[r11]
621	por	xmm13,xmm7
622	paddd	xmm8,xmm12
623	paddd	xmm9,xmm13
624	pxor	xmm0,xmm8
625	pxor	xmm1,xmm9
626DB	102,15,56,0,198
627DB	102,15,56,0,206
628	paddd	xmm4,xmm0
629	paddd	xmm5,xmm1
630	pxor	xmm12,xmm4
631	pxor	xmm13,xmm5
632	movdqa	xmm7,xmm12
633	pslld	xmm12,7
634	psrld	xmm7,25
635	movdqa	xmm6,xmm13
636	pslld	xmm13,7
637	por	xmm12,xmm7
638	psrld	xmm6,25
639	movdqa	xmm7,XMMWORD[r10]
640	por	xmm13,xmm6
; Swap the pair held in xmm4/xmm5: spill words 8,9 and load words 10,11.
641	movdqa	XMMWORD[rsp],xmm4
642	movdqa	XMMWORD[16+rsp],xmm5
643	movdqa	xmm4,XMMWORD[32+rsp]
644	movdqa	xmm5,XMMWORD[48+rsp]
; Quarter-rounds on words (2,6,10,14) and (3,7,11,15).
645	paddd	xmm10,xmm14
646	paddd	xmm11,xmm15
647	pxor	xmm2,xmm10
648	pxor	xmm3,xmm11
649DB	102,15,56,0,215
650DB	102,15,56,0,223
651	paddd	xmm4,xmm2
652	paddd	xmm5,xmm3
653	pxor	xmm14,xmm4
654	pxor	xmm15,xmm5
655	movdqa	xmm6,xmm14
656	pslld	xmm14,12
657	psrld	xmm6,20
658	movdqa	xmm7,xmm15
659	pslld	xmm15,12
660	por	xmm14,xmm6
661	psrld	xmm7,20
662	movdqa	xmm6,XMMWORD[r11]
663	por	xmm15,xmm7
664	paddd	xmm10,xmm14
665	paddd	xmm11,xmm15
666	pxor	xmm2,xmm10
667	pxor	xmm3,xmm11
668DB	102,15,56,0,214
669DB	102,15,56,0,222
670	paddd	xmm4,xmm2
671	paddd	xmm5,xmm3
672	pxor	xmm14,xmm4
673	pxor	xmm15,xmm5
674	movdqa	xmm7,xmm14
675	pslld	xmm14,7
676	psrld	xmm7,25
677	movdqa	xmm6,xmm15
678	pslld	xmm15,7
679	por	xmm14,xmm7
680	psrld	xmm6,25
681	movdqa	xmm7,XMMWORD[r10]
682	por	xmm15,xmm6
; Second half of the pass, shifted pairings; xmm4/xmm5 still hold words
; 10,11: quarter-rounds on (0,5,10,15) and (1,6,11,12).
683	paddd	xmm8,xmm13
684	paddd	xmm9,xmm14
685	pxor	xmm3,xmm8
686	pxor	xmm0,xmm9
687DB	102,15,56,0,223
688DB	102,15,56,0,199
689	paddd	xmm4,xmm3
690	paddd	xmm5,xmm0
691	pxor	xmm13,xmm4
692	pxor	xmm14,xmm5
693	movdqa	xmm6,xmm13
694	pslld	xmm13,12
695	psrld	xmm6,20
696	movdqa	xmm7,xmm14
697	pslld	xmm14,12
698	por	xmm13,xmm6
699	psrld	xmm7,20
700	movdqa	xmm6,XMMWORD[r11]
701	por	xmm14,xmm7
702	paddd	xmm8,xmm13
703	paddd	xmm9,xmm14
704	pxor	xmm3,xmm8
705	pxor	xmm0,xmm9
706DB	102,15,56,0,222
707DB	102,15,56,0,198
708	paddd	xmm4,xmm3
709	paddd	xmm5,xmm0
710	pxor	xmm13,xmm4
711	pxor	xmm14,xmm5
712	movdqa	xmm7,xmm13
713	pslld	xmm13,7
714	psrld	xmm7,25
715	movdqa	xmm6,xmm14
716	pslld	xmm14,7
717	por	xmm13,xmm7
718	psrld	xmm6,25
719	movdqa	xmm7,XMMWORD[r10]
720	por	xmm14,xmm6
; Swap back: spill words 10,11 and reload words 8,9.
721	movdqa	XMMWORD[32+rsp],xmm4
722	movdqa	XMMWORD[48+rsp],xmm5
723	movdqa	xmm4,XMMWORD[rsp]
724	movdqa	xmm5,XMMWORD[16+rsp]
; Quarter-rounds on words (2,7,8,13) and (3,4,9,14).
725	paddd	xmm10,xmm15
726	paddd	xmm11,xmm12
727	pxor	xmm1,xmm10
728	pxor	xmm2,xmm11
729DB	102,15,56,0,207
730DB	102,15,56,0,215
731	paddd	xmm4,xmm1
732	paddd	xmm5,xmm2
733	pxor	xmm15,xmm4
734	pxor	xmm12,xmm5
735	movdqa	xmm6,xmm15
736	pslld	xmm15,12
737	psrld	xmm6,20
738	movdqa	xmm7,xmm12
739	pslld	xmm12,12
740	por	xmm15,xmm6
741	psrld	xmm7,20
742	movdqa	xmm6,XMMWORD[r11]
743	por	xmm12,xmm7
744	paddd	xmm10,xmm15
745	paddd	xmm11,xmm12
746	pxor	xmm1,xmm10
747	pxor	xmm2,xmm11
748DB	102,15,56,0,206
749DB	102,15,56,0,214
750	paddd	xmm4,xmm1
751	paddd	xmm5,xmm2
752	pxor	xmm15,xmm4
753	pxor	xmm12,xmm5
754	movdqa	xmm7,xmm15
755	pslld	xmm15,7
756	psrld	xmm7,25
757	movdqa	xmm6,xmm12
758	pslld	xmm12,7
759	por	xmm15,xmm7
760	psrld	xmm6,25
761	movdqa	xmm7,XMMWORD[r10]
762	por	xmm12,xmm6
763	dec	eax
764	jnz	NEAR $L$oop4x
765
; ---- add the input state and transpose --------------------------------
; Each group of four word-registers is added to its saved input and then
; transposed with punpck{l,h}dq / punpck{l,h}qdq, so each resulting
; register holds 16 consecutive keystream bytes of a single block.
; Words 0..3:
766	paddd	xmm8,XMMWORD[64+rsp]
767	paddd	xmm9,XMMWORD[80+rsp]
768	paddd	xmm10,XMMWORD[96+rsp]
769	paddd	xmm11,XMMWORD[112+rsp]
770
771	movdqa	xmm6,xmm8
772	punpckldq	xmm8,xmm9
773	movdqa	xmm7,xmm10
774	punpckldq	xmm10,xmm11
775	punpckhdq	xmm6,xmm9
776	punpckhdq	xmm7,xmm11
777	movdqa	xmm9,xmm8
778	punpcklqdq	xmm8,xmm10
779	movdqa	xmm11,xmm6
780	punpcklqdq	xmm6,xmm7
781	punpckhqdq	xmm9,xmm10
782	punpckhqdq	xmm11,xmm7
; Words 4..7:
783	paddd	xmm12,XMMWORD[((128-256))+rcx]
784	paddd	xmm13,XMMWORD[((144-256))+rcx]
785	paddd	xmm14,XMMWORD[((160-256))+rcx]
786	paddd	xmm15,XMMWORD[((176-256))+rcx]
787
; Park the first rows of blocks 0 and 1, then fetch the spilled words 10,11
; into the registers just freed.
788	movdqa	XMMWORD[rsp],xmm8
789	movdqa	XMMWORD[16+rsp],xmm9
790	movdqa	xmm8,XMMWORD[32+rsp]
791	movdqa	xmm9,XMMWORD[48+rsp]
792
793	movdqa	xmm10,xmm12
794	punpckldq	xmm12,xmm13
795	movdqa	xmm7,xmm14
796	punpckldq	xmm14,xmm15
797	punpckhdq	xmm10,xmm13
798	punpckhdq	xmm7,xmm15
799	movdqa	xmm13,xmm12
800	punpcklqdq	xmm12,xmm14
801	movdqa	xmm15,xmm10
802	punpcklqdq	xmm10,xmm7
803	punpckhqdq	xmm13,xmm14
804	punpckhqdq	xmm15,xmm7
; Words 8..11 (8,9 in xmm4/xmm5; 10,11 in xmm8/xmm9):
805	paddd	xmm4,XMMWORD[((192-256))+rcx]
806	paddd	xmm5,XMMWORD[((208-256))+rcx]
807	paddd	xmm8,XMMWORD[((224-256))+rcx]
808	paddd	xmm9,XMMWORD[((240-256))+rcx]
809
; Park the first rows of blocks 2 and 3.
810	movdqa	XMMWORD[32+rsp],xmm6
811	movdqa	XMMWORD[48+rsp],xmm11
812
813	movdqa	xmm14,xmm4
814	punpckldq	xmm4,xmm5
815	movdqa	xmm7,xmm8
816	punpckldq	xmm8,xmm9
817	punpckhdq	xmm14,xmm5
818	punpckhdq	xmm7,xmm9
819	movdqa	xmm5,xmm4
820	punpcklqdq	xmm4,xmm8
821	movdqa	xmm9,xmm14
822	punpcklqdq	xmm14,xmm7
823	punpckhqdq	xmm5,xmm8
824	punpckhqdq	xmm9,xmm7
; Words 12..15:
825	paddd	xmm0,XMMWORD[((256-256))+rcx]
826	paddd	xmm1,XMMWORD[((272-256))+rcx]
827	paddd	xmm2,XMMWORD[((288-256))+rcx]
828	paddd	xmm3,XMMWORD[((304-256))+rcx]
829
830	movdqa	xmm8,xmm0
831	punpckldq	xmm0,xmm1
832	movdqa	xmm7,xmm2
833	punpckldq	xmm2,xmm3
834	punpckhdq	xmm8,xmm1
835	punpckhdq	xmm7,xmm3
836	movdqa	xmm1,xmm0
837	punpcklqdq	xmm0,xmm2
838	movdqa	xmm3,xmm8
839	punpcklqdq	xmm8,xmm7
840	punpckhqdq	xmm1,xmm2
841	punpckhqdq	xmm3,xmm7
; Keystream layout at this point (four 16-byte rows per block):
;   block 0: [rsp],    xmm12, xmm4,  xmm0
;   block 1: [rsp+16], xmm13, xmm5,  xmm1
;   block 2: [rsp+32], xmm10, xmm14, xmm8
;   block 3: [rsp+48], xmm15, xmm9,  xmm3
; Fewer than 256 bytes left: handle the remainder in $L$tail4x.
842	cmp	rdx,64*4
843	jb	NEAR $L$tail4x
844
; ---- full 256-byte group: XOR four blocks and store ---------------------
; xmm6, xmm11, xmm2, xmm7 carry the source data; loads for the next block
; are interleaved with stores of the previous one.
845	movdqu	xmm6,XMMWORD[rsi]
846	movdqu	xmm11,XMMWORD[16+rsi]
847	movdqu	xmm2,XMMWORD[32+rsi]
848	movdqu	xmm7,XMMWORD[48+rsi]
849	pxor	xmm6,XMMWORD[rsp]
850	pxor	xmm11,xmm12
851	pxor	xmm2,xmm4
852	pxor	xmm7,xmm0
853
854	movdqu	XMMWORD[rdi],xmm6
855	movdqu	xmm6,XMMWORD[64+rsi]
856	movdqu	XMMWORD[16+rdi],xmm11
857	movdqu	xmm11,XMMWORD[80+rsi]
858	movdqu	XMMWORD[32+rdi],xmm2
859	movdqu	xmm2,XMMWORD[96+rsi]
860	movdqu	XMMWORD[48+rdi],xmm7
861	movdqu	xmm7,XMMWORD[112+rsi]
862	lea	rsi,[128+rsi]
863	pxor	xmm6,XMMWORD[16+rsp]
864	pxor	xmm11,xmm13
865	pxor	xmm2,xmm5
866	pxor	xmm7,xmm1
867
868	movdqu	XMMWORD[64+rdi],xmm6
869	movdqu	xmm6,XMMWORD[rsi]
870	movdqu	XMMWORD[80+rdi],xmm11
871	movdqu	xmm11,XMMWORD[16+rsi]
872	movdqu	XMMWORD[96+rdi],xmm2
873	movdqu	xmm2,XMMWORD[32+rsi]
874	movdqu	XMMWORD[112+rdi],xmm7
875	lea	rdi,[128+rdi]
876	movdqu	xmm7,XMMWORD[48+rsi]
877	pxor	xmm6,XMMWORD[32+rsp]
878	pxor	xmm11,xmm10
879	pxor	xmm2,xmm14
880	pxor	xmm7,xmm8
881
882	movdqu	XMMWORD[rdi],xmm6
883	movdqu	xmm6,XMMWORD[64+rsi]
884	movdqu	XMMWORD[16+rdi],xmm11
885	movdqu	xmm11,XMMWORD[80+rsi]
886	movdqu	XMMWORD[32+rdi],xmm2
887	movdqu	xmm2,XMMWORD[96+rsi]
888	movdqu	XMMWORD[48+rdi],xmm7
889	movdqu	xmm7,XMMWORD[112+rsi]
890	lea	rsi,[128+rsi]
891	pxor	xmm6,XMMWORD[48+rsp]
892	pxor	xmm11,xmm15
893	pxor	xmm2,xmm9
894	pxor	xmm7,xmm3
895	movdqu	XMMWORD[64+rdi],xmm6
896	movdqu	XMMWORD[80+rdi],xmm11
897	movdqu	XMMWORD[96+rdi],xmm2
898	movdqu	XMMWORD[112+rdi],xmm7
899	lea	rdi,[128+rdi]
900
901	sub	rdx,64*4
902	jnz	NEAR $L$oop_outer4x
903
904	jmp	NEAR $L$done4x
905
; ---- remainder shorter than 256 bytes -----------------------------------
; Dispatch on how many whole 64-byte blocks remain. The flags of the compare
; that takes a branch are still live at the `je` in the branch target,
; because only SSE moves/xors and lea execute in between.
906$L$tail4x:
907	cmp	rdx,192
908	jae	NEAR $L$192_or_more4x
909	cmp	rdx,128
910	jae	NEAR $L$128_or_more4x
911	cmp	rdx,64
912	jae	NEAR $L$64_or_more4x
913
914
; Fewer than 64 bytes: complete block 0's keystream at [rsp+0..63]
; ([rsp] already holds its first row) and go to the byte loop.
915	xor	r10,r10
916
917	movdqa	XMMWORD[16+rsp],xmm12
918	movdqa	XMMWORD[32+rsp],xmm4
919	movdqa	XMMWORD[48+rsp],xmm0
920	jmp	NEAR $L$oop_tail4x
921
922ALIGN	32
; 64..127 bytes: emit block 0 in full; if bytes remain, stage block 1's
; keystream at [rsp+0..63] for the byte loop.
923$L$64_or_more4x:
924	movdqu	xmm6,XMMWORD[rsi]
925	movdqu	xmm11,XMMWORD[16+rsi]
926	movdqu	xmm2,XMMWORD[32+rsi]
927	movdqu	xmm7,XMMWORD[48+rsi]
928	pxor	xmm6,XMMWORD[rsp]
929	pxor	xmm11,xmm12
930	pxor	xmm2,xmm4
931	pxor	xmm7,xmm0
932	movdqu	XMMWORD[rdi],xmm6
933	movdqu	XMMWORD[16+rdi],xmm11
934	movdqu	XMMWORD[32+rdi],xmm2
935	movdqu	XMMWORD[48+rdi],xmm7
; Exactly 64 bytes were left (flags from `cmp rdx,64`): nothing more to do.
936	je	NEAR $L$done4x
937
938	movdqa	xmm6,XMMWORD[16+rsp]
939	lea	rsi,[64+rsi]
940	xor	r10,r10
941	movdqa	XMMWORD[rsp],xmm6
942	movdqa	XMMWORD[16+rsp],xmm13
943	lea	rdi,[64+rdi]
944	movdqa	XMMWORD[32+rsp],xmm5
945	sub	rdx,64
946	movdqa	XMMWORD[48+rsp],xmm1
947	jmp	NEAR $L$oop_tail4x
948
949ALIGN	32
; 128..191 bytes: emit blocks 0 and 1 in full; if bytes remain, stage
; block 2's keystream at [rsp+0..63].
950$L$128_or_more4x:
951	movdqu	xmm6,XMMWORD[rsi]
952	movdqu	xmm11,XMMWORD[16+rsi]
953	movdqu	xmm2,XMMWORD[32+rsi]
954	movdqu	xmm7,XMMWORD[48+rsi]
955	pxor	xmm6,XMMWORD[rsp]
956	pxor	xmm11,xmm12
957	pxor	xmm2,xmm4
958	pxor	xmm7,xmm0
959
960	movdqu	XMMWORD[rdi],xmm6
961	movdqu	xmm6,XMMWORD[64+rsi]
962	movdqu	XMMWORD[16+rdi],xmm11
963	movdqu	xmm11,XMMWORD[80+rsi]
964	movdqu	XMMWORD[32+rdi],xmm2
965	movdqu	xmm2,XMMWORD[96+rsi]
966	movdqu	XMMWORD[48+rdi],xmm7
967	movdqu	xmm7,XMMWORD[112+rsi]
968	pxor	xmm6,XMMWORD[16+rsp]
969	pxor	xmm11,xmm13
970	pxor	xmm2,xmm5
971	pxor	xmm7,xmm1
972	movdqu	XMMWORD[64+rdi],xmm6
973	movdqu	XMMWORD[80+rdi],xmm11
974	movdqu	XMMWORD[96+rdi],xmm2
975	movdqu	XMMWORD[112+rdi],xmm7
; Exactly 128 bytes were left (flags from `cmp rdx,128`).
976	je	NEAR $L$done4x
977
978	movdqa	xmm6,XMMWORD[32+rsp]
979	lea	rsi,[128+rsi]
980	xor	r10,r10
981	movdqa	XMMWORD[rsp],xmm6
982	movdqa	XMMWORD[16+rsp],xmm10
983	lea	rdi,[128+rdi]
984	movdqa	XMMWORD[32+rsp],xmm14
985	sub	rdx,128
986	movdqa	XMMWORD[48+rsp],xmm8
987	jmp	NEAR $L$oop_tail4x
988
989ALIGN	32
; 192..255 bytes: emit blocks 0, 1 and 2 in full; if bytes remain, stage
; block 3's keystream at [rsp+0..63] and fall through to the byte loop.
990$L$192_or_more4x:
991	movdqu	xmm6,XMMWORD[rsi]
992	movdqu	xmm11,XMMWORD[16+rsi]
993	movdqu	xmm2,XMMWORD[32+rsi]
994	movdqu	xmm7,XMMWORD[48+rsi]
995	pxor	xmm6,XMMWORD[rsp]
996	pxor	xmm11,xmm12
997	pxor	xmm2,xmm4
998	pxor	xmm7,xmm0
999
1000	movdqu	XMMWORD[rdi],xmm6
1001	movdqu	xmm6,XMMWORD[64+rsi]
1002	movdqu	XMMWORD[16+rdi],xmm11
1003	movdqu	xmm11,XMMWORD[80+rsi]
1004	movdqu	XMMWORD[32+rdi],xmm2
1005	movdqu	xmm2,XMMWORD[96+rsi]
1006	movdqu	XMMWORD[48+rdi],xmm7
1007	movdqu	xmm7,XMMWORD[112+rsi]
1008	lea	rsi,[128+rsi]
1009	pxor	xmm6,XMMWORD[16+rsp]
1010	pxor	xmm11,xmm13
1011	pxor	xmm2,xmm5
1012	pxor	xmm7,xmm1
1013
1014	movdqu	XMMWORD[64+rdi],xmm6
1015	movdqu	xmm6,XMMWORD[rsi]
1016	movdqu	XMMWORD[80+rdi],xmm11
1017	movdqu	xmm11,XMMWORD[16+rsi]
1018	movdqu	XMMWORD[96+rdi],xmm2
1019	movdqu	xmm2,XMMWORD[32+rsi]
1020	movdqu	XMMWORD[112+rdi],xmm7
1021	lea	rdi,[128+rdi]
1022	movdqu	xmm7,XMMWORD[48+rsi]
1023	pxor	xmm6,XMMWORD[32+rsp]
1024	pxor	xmm11,xmm10
1025	pxor	xmm2,xmm14
1026	pxor	xmm7,xmm8
1027	movdqu	XMMWORD[rdi],xmm6
1028	movdqu	XMMWORD[16+rdi],xmm11
1029	movdqu	XMMWORD[32+rdi],xmm2
1030	movdqu	XMMWORD[48+rdi],xmm7
; Exactly 192 bytes were left (flags from `cmp rdx,192`).
1031	je	NEAR $L$done4x
1032
1033	movdqa	xmm6,XMMWORD[48+rsp]
1034	lea	rsi,[64+rsi]
1035	xor	r10,r10
1036	movdqa	XMMWORD[rsp],xmm6
1037	movdqa	XMMWORD[16+rsp],xmm15
1038	lea	rdi,[64+rdi]
1039	movdqa	XMMWORD[32+rsp],xmm9
1040	sub	rdx,192
1041	movdqa	XMMWORD[48+rsp],xmm3
1042
; dst[i] = src[i] XOR keystream[i] for the rdx remaining bytes; r10 is the
; byte index and the keystream is read from [rsp+0..63].
1043$L$oop_tail4x:
1044	movzx	eax,BYTE[r10*1+rsi]
1045	movzx	ecx,BYTE[r10*1+rsp]
1046	lea	r10,[1+r10]
1047	xor	eax,ecx
1048	mov	BYTE[((-1))+r10*1+rdi],al
1049	dec	rdx
1050	jnz	NEAR $L$oop_tail4x
1051
; ---- epilogue: restore xmm6..xmm15, rsp and the caller's rdi/rsi --------
1052$L$done4x:
1053	movaps	xmm6,XMMWORD[((-168))+r9]
1054	movaps	xmm7,XMMWORD[((-152))+r9]
1055	movaps	xmm8,XMMWORD[((-136))+r9]
1056	movaps	xmm9,XMMWORD[((-120))+r9]
1057	movaps	xmm10,XMMWORD[((-104))+r9]
1058	movaps	xmm11,XMMWORD[((-88))+r9]
1059	movaps	xmm12,XMMWORD[((-72))+r9]
1060	movaps	xmm13,XMMWORD[((-56))+r9]
1061	movaps	xmm14,XMMWORD[((-40))+r9]
1062	movaps	xmm15,XMMWORD[((-24))+r9]
1063	lea	rsp,[r9]
1064
1065$L$4x_epilogue:
1066	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1067	mov	rsi,QWORD[16+rsp]
1068	ret
1069
1070$L$SEH_end_ChaCha20_ctr32_ssse3_4x:
1071global	ChaCha20_ctr32_avx2
1072
1073ALIGN	32
1074ChaCha20_ctr32_avx2:
1075	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1076	mov	QWORD[16+rsp],rsi
1077	mov	rax,rsp
1078$L$SEH_begin_ChaCha20_ctr32_avx2:
1079	mov	rdi,rcx
1080	mov	rsi,rdx
1081	mov	rdx,r8
1082	mov	rcx,r9
1083	mov	r8,QWORD[40+rsp]
1084
1085
1086
1087_CET_ENDBR
1088	mov	r9,rsp
1089
1090	sub	rsp,0x280+168
1091	and	rsp,-32
1092	movaps	XMMWORD[(-168)+r9],xmm6
1093	movaps	XMMWORD[(-152)+r9],xmm7
1094	movaps	XMMWORD[(-136)+r9],xmm8
1095	movaps	XMMWORD[(-120)+r9],xmm9
1096	movaps	XMMWORD[(-104)+r9],xmm10
1097	movaps	XMMWORD[(-88)+r9],xmm11
1098	movaps	XMMWORD[(-72)+r9],xmm12
1099	movaps	XMMWORD[(-56)+r9],xmm13
1100	movaps	XMMWORD[(-40)+r9],xmm14
1101	movaps	XMMWORD[(-24)+r9],xmm15
1102$L$8x_body:
1103	vzeroupper
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114	vbroadcasti128	ymm11,XMMWORD[$L$sigma]
1115	vbroadcasti128	ymm3,XMMWORD[rcx]
1116	vbroadcasti128	ymm15,XMMWORD[16+rcx]
1117	vbroadcasti128	ymm7,XMMWORD[r8]
1118	lea	rcx,[256+rsp]
1119	lea	rax,[512+rsp]
1120	lea	r10,[$L$rot16]
1121	lea	r11,[$L$rot24]
1122
1123	vpshufd	ymm8,ymm11,0x00
1124	vpshufd	ymm9,ymm11,0x55
1125	vmovdqa	YMMWORD[(128-256)+rcx],ymm8
1126	vpshufd	ymm10,ymm11,0xaa
1127	vmovdqa	YMMWORD[(160-256)+rcx],ymm9
1128	vpshufd	ymm11,ymm11,0xff
1129	vmovdqa	YMMWORD[(192-256)+rcx],ymm10
1130	vmovdqa	YMMWORD[(224-256)+rcx],ymm11
1131
1132	vpshufd	ymm0,ymm3,0x00
1133	vpshufd	ymm1,ymm3,0x55
1134	vmovdqa	YMMWORD[(256-256)+rcx],ymm0
1135	vpshufd	ymm2,ymm3,0xaa
1136	vmovdqa	YMMWORD[(288-256)+rcx],ymm1
1137	vpshufd	ymm3,ymm3,0xff
1138	vmovdqa	YMMWORD[(320-256)+rcx],ymm2
1139	vmovdqa	YMMWORD[(352-256)+rcx],ymm3
1140
1141	vpshufd	ymm12,ymm15,0x00
1142	vpshufd	ymm13,ymm15,0x55
1143	vmovdqa	YMMWORD[(384-512)+rax],ymm12
1144	vpshufd	ymm14,ymm15,0xaa
1145	vmovdqa	YMMWORD[(416-512)+rax],ymm13
1146	vpshufd	ymm15,ymm15,0xff
1147	vmovdqa	YMMWORD[(448-512)+rax],ymm14
1148	vmovdqa	YMMWORD[(480-512)+rax],ymm15
1149
1150	vpshufd	ymm4,ymm7,0x00
1151	vpshufd	ymm5,ymm7,0x55
1152	vpaddd	ymm4,ymm4,YMMWORD[$L$incy]
1153	vpshufd	ymm6,ymm7,0xaa
1154	vmovdqa	YMMWORD[(544-512)+rax],ymm5
1155	vpshufd	ymm7,ymm7,0xff
1156	vmovdqa	YMMWORD[(576-512)+rax],ymm6
1157	vmovdqa	YMMWORD[(608-512)+rax],ymm7
1158
1159	jmp	NEAR $L$oop_enter8x
1160
1161ALIGN	32
1162$L$oop_outer8x:
1163	vmovdqa	ymm8,YMMWORD[((128-256))+rcx]
1164	vmovdqa	ymm9,YMMWORD[((160-256))+rcx]
1165	vmovdqa	ymm10,YMMWORD[((192-256))+rcx]
1166	vmovdqa	ymm11,YMMWORD[((224-256))+rcx]
1167	vmovdqa	ymm0,YMMWORD[((256-256))+rcx]
1168	vmovdqa	ymm1,YMMWORD[((288-256))+rcx]
1169	vmovdqa	ymm2,YMMWORD[((320-256))+rcx]
1170	vmovdqa	ymm3,YMMWORD[((352-256))+rcx]
1171	vmovdqa	ymm12,YMMWORD[((384-512))+rax]
1172	vmovdqa	ymm13,YMMWORD[((416-512))+rax]
1173	vmovdqa	ymm14,YMMWORD[((448-512))+rax]
1174	vmovdqa	ymm15,YMMWORD[((480-512))+rax]
1175	vmovdqa	ymm4,YMMWORD[((512-512))+rax]
1176	vmovdqa	ymm5,YMMWORD[((544-512))+rax]
1177	vmovdqa	ymm6,YMMWORD[((576-512))+rax]
1178	vmovdqa	ymm7,YMMWORD[((608-512))+rax]
1179	vpaddd	ymm4,ymm4,YMMWORD[$L$eight]
1180
1181$L$oop_enter8x:
1182	vmovdqa	YMMWORD[64+rsp],ymm14
1183	vmovdqa	YMMWORD[96+rsp],ymm15
1184	vbroadcasti128	ymm15,XMMWORD[r10]
1185	vmovdqa	YMMWORD[(512-512)+rax],ymm4
1186	mov	eax,10
1187	jmp	NEAR $L$oop8x
1188
1189ALIGN	32
1190$L$oop8x:
1191	vpaddd	ymm8,ymm8,ymm0
1192	vpxor	ymm4,ymm8,ymm4
1193	vpshufb	ymm4,ymm4,ymm15
1194	vpaddd	ymm9,ymm9,ymm1
1195	vpxor	ymm5,ymm9,ymm5
1196	vpshufb	ymm5,ymm5,ymm15
1197	vpaddd	ymm12,ymm12,ymm4
1198	vpxor	ymm0,ymm12,ymm0
1199	vpslld	ymm14,ymm0,12
1200	vpsrld	ymm0,ymm0,20
1201	vpor	ymm0,ymm14,ymm0
1202	vbroadcasti128	ymm14,XMMWORD[r11]
1203	vpaddd	ymm13,ymm13,ymm5
1204	vpxor	ymm1,ymm13,ymm1
1205	vpslld	ymm15,ymm1,12
1206	vpsrld	ymm1,ymm1,20
1207	vpor	ymm1,ymm15,ymm1
1208	vpaddd	ymm8,ymm8,ymm0
1209	vpxor	ymm4,ymm8,ymm4
1210	vpshufb	ymm4,ymm4,ymm14
1211	vpaddd	ymm9,ymm9,ymm1
1212	vpxor	ymm5,ymm9,ymm5
1213	vpshufb	ymm5,ymm5,ymm14
1214	vpaddd	ymm12,ymm12,ymm4
1215	vpxor	ymm0,ymm12,ymm0
1216	vpslld	ymm15,ymm0,7
1217	vpsrld	ymm0,ymm0,25
1218	vpor	ymm0,ymm15,ymm0
1219	vbroadcasti128	ymm15,XMMWORD[r10]
1220	vpaddd	ymm13,ymm13,ymm5
1221	vpxor	ymm1,ymm13,ymm1
1222	vpslld	ymm14,ymm1,7
1223	vpsrld	ymm1,ymm1,25
1224	vpor	ymm1,ymm14,ymm1
1225	vmovdqa	YMMWORD[rsp],ymm12
1226	vmovdqa	YMMWORD[32+rsp],ymm13
1227	vmovdqa	ymm12,YMMWORD[64+rsp]
1228	vmovdqa	ymm13,YMMWORD[96+rsp]
1229	vpaddd	ymm10,ymm10,ymm2
1230	vpxor	ymm6,ymm10,ymm6
1231	vpshufb	ymm6,ymm6,ymm15
1232	vpaddd	ymm11,ymm11,ymm3
1233	vpxor	ymm7,ymm11,ymm7
1234	vpshufb	ymm7,ymm7,ymm15
1235	vpaddd	ymm12,ymm12,ymm6
1236	vpxor	ymm2,ymm12,ymm2
1237	vpslld	ymm14,ymm2,12
1238	vpsrld	ymm2,ymm2,20
1239	vpor	ymm2,ymm14,ymm2
1240	vbroadcasti128	ymm14,XMMWORD[r11]
1241	vpaddd	ymm13,ymm13,ymm7
1242	vpxor	ymm3,ymm13,ymm3
1243	vpslld	ymm15,ymm3,12
1244	vpsrld	ymm3,ymm3,20
1245	vpor	ymm3,ymm15,ymm3
1246	vpaddd	ymm10,ymm10,ymm2
1247	vpxor	ymm6,ymm10,ymm6
1248	vpshufb	ymm6,ymm6,ymm14
1249	vpaddd	ymm11,ymm11,ymm3
1250	vpxor	ymm7,ymm11,ymm7
1251	vpshufb	ymm7,ymm7,ymm14
1252	vpaddd	ymm12,ymm12,ymm6
1253	vpxor	ymm2,ymm12,ymm2
1254	vpslld	ymm15,ymm2,7
1255	vpsrld	ymm2,ymm2,25
1256	vpor	ymm2,ymm15,ymm2
1257	vbroadcasti128	ymm15,XMMWORD[r10]
1258	vpaddd	ymm13,ymm13,ymm7
1259	vpxor	ymm3,ymm13,ymm3
1260	vpslld	ymm14,ymm3,7
1261	vpsrld	ymm3,ymm3,25
1262	vpor	ymm3,ymm14,ymm3
1263	vpaddd	ymm8,ymm8,ymm1
1264	vpxor	ymm7,ymm8,ymm7
1265	vpshufb	ymm7,ymm7,ymm15
1266	vpaddd	ymm9,ymm9,ymm2
1267	vpxor	ymm4,ymm9,ymm4
1268	vpshufb	ymm4,ymm4,ymm15
1269	vpaddd	ymm12,ymm12,ymm7
1270	vpxor	ymm1,ymm12,ymm1
1271	vpslld	ymm14,ymm1,12
1272	vpsrld	ymm1,ymm1,20
1273	vpor	ymm1,ymm14,ymm1
1274	vbroadcasti128	ymm14,XMMWORD[r11]
1275	vpaddd	ymm13,ymm13,ymm4
1276	vpxor	ymm2,ymm13,ymm2
1277	vpslld	ymm15,ymm2,12
1278	vpsrld	ymm2,ymm2,20
1279	vpor	ymm2,ymm15,ymm2
1280	vpaddd	ymm8,ymm8,ymm1
1281	vpxor	ymm7,ymm8,ymm7
1282	vpshufb	ymm7,ymm7,ymm14
1283	vpaddd	ymm9,ymm9,ymm2
1284	vpxor	ymm4,ymm9,ymm4
1285	vpshufb	ymm4,ymm4,ymm14
1286	vpaddd	ymm12,ymm12,ymm7
1287	vpxor	ymm1,ymm12,ymm1
1288	vpslld	ymm15,ymm1,7
1289	vpsrld	ymm1,ymm1,25
1290	vpor	ymm1,ymm15,ymm1
1291	vbroadcasti128	ymm15,XMMWORD[r10]
1292	vpaddd	ymm13,ymm13,ymm4
1293	vpxor	ymm2,ymm13,ymm2
1294	vpslld	ymm14,ymm2,7
1295	vpsrld	ymm2,ymm2,25
1296	vpor	ymm2,ymm14,ymm2
1297	vmovdqa	YMMWORD[64+rsp],ymm12
1298	vmovdqa	YMMWORD[96+rsp],ymm13
1299	vmovdqa	ymm12,YMMWORD[rsp]
1300	vmovdqa	ymm13,YMMWORD[32+rsp]
1301	vpaddd	ymm10,ymm10,ymm3
1302	vpxor	ymm5,ymm10,ymm5
1303	vpshufb	ymm5,ymm5,ymm15
1304	vpaddd	ymm11,ymm11,ymm0
1305	vpxor	ymm6,ymm11,ymm6
1306	vpshufb	ymm6,ymm6,ymm15
1307	vpaddd	ymm12,ymm12,ymm5
1308	vpxor	ymm3,ymm12,ymm3
1309	vpslld	ymm14,ymm3,12
1310	vpsrld	ymm3,ymm3,20
1311	vpor	ymm3,ymm14,ymm3
1312	vbroadcasti128	ymm14,XMMWORD[r11]
1313	vpaddd	ymm13,ymm13,ymm6
1314	vpxor	ymm0,ymm13,ymm0
1315	vpslld	ymm15,ymm0,12
1316	vpsrld	ymm0,ymm0,20
1317	vpor	ymm0,ymm15,ymm0
1318	vpaddd	ymm10,ymm10,ymm3
1319	vpxor	ymm5,ymm10,ymm5
1320	vpshufb	ymm5,ymm5,ymm14
1321	vpaddd	ymm11,ymm11,ymm0
1322	vpxor	ymm6,ymm11,ymm6
1323	vpshufb	ymm6,ymm6,ymm14
1324	vpaddd	ymm12,ymm12,ymm5
1325	vpxor	ymm3,ymm12,ymm3
1326	vpslld	ymm15,ymm3,7
1327	vpsrld	ymm3,ymm3,25
1328	vpor	ymm3,ymm15,ymm3
1329	vbroadcasti128	ymm15,XMMWORD[r10]
1330	vpaddd	ymm13,ymm13,ymm6
1331	vpxor	ymm0,ymm13,ymm0
1332	vpslld	ymm14,ymm0,7
1333	vpsrld	ymm0,ymm0,25
1334	vpor	ymm0,ymm14,ymm0
1335	dec	eax
1336	jnz	NEAR $L$oop8x
1337
1338	lea	rax,[512+rsp]
1339	vpaddd	ymm8,ymm8,YMMWORD[((128-256))+rcx]
1340	vpaddd	ymm9,ymm9,YMMWORD[((160-256))+rcx]
1341	vpaddd	ymm10,ymm10,YMMWORD[((192-256))+rcx]
1342	vpaddd	ymm11,ymm11,YMMWORD[((224-256))+rcx]
1343
1344	vpunpckldq	ymm14,ymm8,ymm9
1345	vpunpckldq	ymm15,ymm10,ymm11
1346	vpunpckhdq	ymm8,ymm8,ymm9
1347	vpunpckhdq	ymm10,ymm10,ymm11
1348	vpunpcklqdq	ymm9,ymm14,ymm15
1349	vpunpckhqdq	ymm14,ymm14,ymm15
1350	vpunpcklqdq	ymm11,ymm8,ymm10
1351	vpunpckhqdq	ymm8,ymm8,ymm10
1352	vpaddd	ymm0,ymm0,YMMWORD[((256-256))+rcx]
1353	vpaddd	ymm1,ymm1,YMMWORD[((288-256))+rcx]
1354	vpaddd	ymm2,ymm2,YMMWORD[((320-256))+rcx]
1355	vpaddd	ymm3,ymm3,YMMWORD[((352-256))+rcx]
1356
1357	vpunpckldq	ymm10,ymm0,ymm1
1358	vpunpckldq	ymm15,ymm2,ymm3
1359	vpunpckhdq	ymm0,ymm0,ymm1
1360	vpunpckhdq	ymm2,ymm2,ymm3
1361	vpunpcklqdq	ymm1,ymm10,ymm15
1362	vpunpckhqdq	ymm10,ymm10,ymm15
1363	vpunpcklqdq	ymm3,ymm0,ymm2
1364	vpunpckhqdq	ymm0,ymm0,ymm2
1365	vperm2i128	ymm15,ymm9,ymm1,0x20
1366	vperm2i128	ymm1,ymm9,ymm1,0x31
1367	vperm2i128	ymm9,ymm14,ymm10,0x20
1368	vperm2i128	ymm10,ymm14,ymm10,0x31
1369	vperm2i128	ymm14,ymm11,ymm3,0x20
1370	vperm2i128	ymm3,ymm11,ymm3,0x31
1371	vperm2i128	ymm11,ymm8,ymm0,0x20
1372	vperm2i128	ymm0,ymm8,ymm0,0x31
1373	vmovdqa	YMMWORD[rsp],ymm15
1374	vmovdqa	YMMWORD[32+rsp],ymm9
1375	vmovdqa	ymm15,YMMWORD[64+rsp]
1376	vmovdqa	ymm9,YMMWORD[96+rsp]
1377
1378	vpaddd	ymm12,ymm12,YMMWORD[((384-512))+rax]
1379	vpaddd	ymm13,ymm13,YMMWORD[((416-512))+rax]
1380	vpaddd	ymm15,ymm15,YMMWORD[((448-512))+rax]
1381	vpaddd	ymm9,ymm9,YMMWORD[((480-512))+rax]
1382
1383	vpunpckldq	ymm2,ymm12,ymm13
1384	vpunpckldq	ymm8,ymm15,ymm9
1385	vpunpckhdq	ymm12,ymm12,ymm13
1386	vpunpckhdq	ymm15,ymm15,ymm9
1387	vpunpcklqdq	ymm13,ymm2,ymm8
1388	vpunpckhqdq	ymm2,ymm2,ymm8
1389	vpunpcklqdq	ymm9,ymm12,ymm15
1390	vpunpckhqdq	ymm12,ymm12,ymm15
1391	vpaddd	ymm4,ymm4,YMMWORD[((512-512))+rax]
1392	vpaddd	ymm5,ymm5,YMMWORD[((544-512))+rax]
1393	vpaddd	ymm6,ymm6,YMMWORD[((576-512))+rax]
1394	vpaddd	ymm7,ymm7,YMMWORD[((608-512))+rax]
1395
1396	vpunpckldq	ymm15,ymm4,ymm5
1397	vpunpckldq	ymm8,ymm6,ymm7
1398	vpunpckhdq	ymm4,ymm4,ymm5
1399	vpunpckhdq	ymm6,ymm6,ymm7
1400	vpunpcklqdq	ymm5,ymm15,ymm8
1401	vpunpckhqdq	ymm15,ymm15,ymm8
1402	vpunpcklqdq	ymm7,ymm4,ymm6
1403	vpunpckhqdq	ymm4,ymm4,ymm6
1404	vperm2i128	ymm8,ymm13,ymm5,0x20
1405	vperm2i128	ymm5,ymm13,ymm5,0x31
1406	vperm2i128	ymm13,ymm2,ymm15,0x20
1407	vperm2i128	ymm15,ymm2,ymm15,0x31
1408	vperm2i128	ymm2,ymm9,ymm7,0x20
1409	vperm2i128	ymm7,ymm9,ymm7,0x31
1410	vperm2i128	ymm9,ymm12,ymm4,0x20
1411	vperm2i128	ymm4,ymm12,ymm4,0x31
1412	vmovdqa	ymm6,YMMWORD[rsp]
1413	vmovdqa	ymm12,YMMWORD[32+rsp]
1414
1415	cmp	rdx,64*8
1416	jb	NEAR $L$tail8x
1417
1418	vpxor	ymm6,ymm6,YMMWORD[rsi]
1419	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1420	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1421	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1422	lea	rsi,[128+rsi]
1423	vmovdqu	YMMWORD[rdi],ymm6
1424	vmovdqu	YMMWORD[32+rdi],ymm8
1425	vmovdqu	YMMWORD[64+rdi],ymm1
1426	vmovdqu	YMMWORD[96+rdi],ymm5
1427	lea	rdi,[128+rdi]
1428
1429	vpxor	ymm12,ymm12,YMMWORD[rsi]
1430	vpxor	ymm13,ymm13,YMMWORD[32+rsi]
1431	vpxor	ymm10,ymm10,YMMWORD[64+rsi]
1432	vpxor	ymm15,ymm15,YMMWORD[96+rsi]
1433	lea	rsi,[128+rsi]
1434	vmovdqu	YMMWORD[rdi],ymm12
1435	vmovdqu	YMMWORD[32+rdi],ymm13
1436	vmovdqu	YMMWORD[64+rdi],ymm10
1437	vmovdqu	YMMWORD[96+rdi],ymm15
1438	lea	rdi,[128+rdi]
1439
1440	vpxor	ymm14,ymm14,YMMWORD[rsi]
1441	vpxor	ymm2,ymm2,YMMWORD[32+rsi]
1442	vpxor	ymm3,ymm3,YMMWORD[64+rsi]
1443	vpxor	ymm7,ymm7,YMMWORD[96+rsi]
1444	lea	rsi,[128+rsi]
1445	vmovdqu	YMMWORD[rdi],ymm14
1446	vmovdqu	YMMWORD[32+rdi],ymm2
1447	vmovdqu	YMMWORD[64+rdi],ymm3
1448	vmovdqu	YMMWORD[96+rdi],ymm7
1449	lea	rdi,[128+rdi]
1450
1451	vpxor	ymm11,ymm11,YMMWORD[rsi]
1452	vpxor	ymm9,ymm9,YMMWORD[32+rsi]
1453	vpxor	ymm0,ymm0,YMMWORD[64+rsi]
1454	vpxor	ymm4,ymm4,YMMWORD[96+rsi]
1455	lea	rsi,[128+rsi]
1456	vmovdqu	YMMWORD[rdi],ymm11
1457	vmovdqu	YMMWORD[32+rdi],ymm9
1458	vmovdqu	YMMWORD[64+rdi],ymm0
1459	vmovdqu	YMMWORD[96+rdi],ymm4
1460	lea	rdi,[128+rdi]
1461
1462	sub	rdx,64*8
1463	jnz	NEAR $L$oop_outer8x
1464
1465	jmp	NEAR $L$done8x
1466
1467$L$tail8x:
1468	cmp	rdx,448
1469	jae	NEAR $L$448_or_more8x
1470	cmp	rdx,384
1471	jae	NEAR $L$384_or_more8x
1472	cmp	rdx,320
1473	jae	NEAR $L$320_or_more8x
1474	cmp	rdx,256
1475	jae	NEAR $L$256_or_more8x
1476	cmp	rdx,192
1477	jae	NEAR $L$192_or_more8x
1478	cmp	rdx,128
1479	jae	NEAR $L$128_or_more8x
1480	cmp	rdx,64
1481	jae	NEAR $L$64_or_more8x
1482
1483	xor	r10,r10
1484	vmovdqa	YMMWORD[rsp],ymm6
1485	vmovdqa	YMMWORD[32+rsp],ymm8
1486	jmp	NEAR $L$oop_tail8x
1487
1488ALIGN	32
1489$L$64_or_more8x:
1490	vpxor	ymm6,ymm6,YMMWORD[rsi]
1491	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1492	vmovdqu	YMMWORD[rdi],ymm6
1493	vmovdqu	YMMWORD[32+rdi],ymm8
1494	je	NEAR $L$done8x
1495
1496	lea	rsi,[64+rsi]
1497	xor	r10,r10
1498	vmovdqa	YMMWORD[rsp],ymm1
1499	lea	rdi,[64+rdi]
1500	sub	rdx,64
1501	vmovdqa	YMMWORD[32+rsp],ymm5
1502	jmp	NEAR $L$oop_tail8x
1503
1504ALIGN	32
1505$L$128_or_more8x:
1506	vpxor	ymm6,ymm6,YMMWORD[rsi]
1507	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1508	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1509	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1510	vmovdqu	YMMWORD[rdi],ymm6
1511	vmovdqu	YMMWORD[32+rdi],ymm8
1512	vmovdqu	YMMWORD[64+rdi],ymm1
1513	vmovdqu	YMMWORD[96+rdi],ymm5
1514	je	NEAR $L$done8x
1515
1516	lea	rsi,[128+rsi]
1517	xor	r10,r10
1518	vmovdqa	YMMWORD[rsp],ymm12
1519	lea	rdi,[128+rdi]
1520	sub	rdx,128
1521	vmovdqa	YMMWORD[32+rsp],ymm13
1522	jmp	NEAR $L$oop_tail8x
1523
1524ALIGN	32
1525$L$192_or_more8x:
1526	vpxor	ymm6,ymm6,YMMWORD[rsi]
1527	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1528	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1529	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1530	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1531	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1532	vmovdqu	YMMWORD[rdi],ymm6
1533	vmovdqu	YMMWORD[32+rdi],ymm8
1534	vmovdqu	YMMWORD[64+rdi],ymm1
1535	vmovdqu	YMMWORD[96+rdi],ymm5
1536	vmovdqu	YMMWORD[128+rdi],ymm12
1537	vmovdqu	YMMWORD[160+rdi],ymm13
1538	je	NEAR $L$done8x
1539
1540	lea	rsi,[192+rsi]
1541	xor	r10,r10
1542	vmovdqa	YMMWORD[rsp],ymm10
1543	lea	rdi,[192+rdi]
1544	sub	rdx,192
1545	vmovdqa	YMMWORD[32+rsp],ymm15
1546	jmp	NEAR $L$oop_tail8x
1547
1548ALIGN	32
1549$L$256_or_more8x:
1550	vpxor	ymm6,ymm6,YMMWORD[rsi]
1551	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1552	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1553	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1554	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1555	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1556	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1557	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1558	vmovdqu	YMMWORD[rdi],ymm6
1559	vmovdqu	YMMWORD[32+rdi],ymm8
1560	vmovdqu	YMMWORD[64+rdi],ymm1
1561	vmovdqu	YMMWORD[96+rdi],ymm5
1562	vmovdqu	YMMWORD[128+rdi],ymm12
1563	vmovdqu	YMMWORD[160+rdi],ymm13
1564	vmovdqu	YMMWORD[192+rdi],ymm10
1565	vmovdqu	YMMWORD[224+rdi],ymm15
1566	je	NEAR $L$done8x
1567
1568	lea	rsi,[256+rsi]
1569	xor	r10,r10
1570	vmovdqa	YMMWORD[rsp],ymm14
1571	lea	rdi,[256+rdi]
1572	sub	rdx,256
1573	vmovdqa	YMMWORD[32+rsp],ymm2
1574	jmp	NEAR $L$oop_tail8x
1575
1576ALIGN	32
1577$L$320_or_more8x:
1578	vpxor	ymm6,ymm6,YMMWORD[rsi]
1579	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1580	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1581	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1582	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1583	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1584	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1585	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1586	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1587	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1588	vmovdqu	YMMWORD[rdi],ymm6
1589	vmovdqu	YMMWORD[32+rdi],ymm8
1590	vmovdqu	YMMWORD[64+rdi],ymm1
1591	vmovdqu	YMMWORD[96+rdi],ymm5
1592	vmovdqu	YMMWORD[128+rdi],ymm12
1593	vmovdqu	YMMWORD[160+rdi],ymm13
1594	vmovdqu	YMMWORD[192+rdi],ymm10
1595	vmovdqu	YMMWORD[224+rdi],ymm15
1596	vmovdqu	YMMWORD[256+rdi],ymm14
1597	vmovdqu	YMMWORD[288+rdi],ymm2
1598	je	NEAR $L$done8x
1599
1600	lea	rsi,[320+rsi]
1601	xor	r10,r10
1602	vmovdqa	YMMWORD[rsp],ymm3
1603	lea	rdi,[320+rdi]
1604	sub	rdx,320
1605	vmovdqa	YMMWORD[32+rsp],ymm7
1606	jmp	NEAR $L$oop_tail8x
1607
1608ALIGN	32
1609$L$384_or_more8x:
1610	vpxor	ymm6,ymm6,YMMWORD[rsi]
1611	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1612	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1613	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1614	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1615	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1616	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1617	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1618	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1619	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1620	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
1621	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
1622	vmovdqu	YMMWORD[rdi],ymm6
1623	vmovdqu	YMMWORD[32+rdi],ymm8
1624	vmovdqu	YMMWORD[64+rdi],ymm1
1625	vmovdqu	YMMWORD[96+rdi],ymm5
1626	vmovdqu	YMMWORD[128+rdi],ymm12
1627	vmovdqu	YMMWORD[160+rdi],ymm13
1628	vmovdqu	YMMWORD[192+rdi],ymm10
1629	vmovdqu	YMMWORD[224+rdi],ymm15
1630	vmovdqu	YMMWORD[256+rdi],ymm14
1631	vmovdqu	YMMWORD[288+rdi],ymm2
1632	vmovdqu	YMMWORD[320+rdi],ymm3
1633	vmovdqu	YMMWORD[352+rdi],ymm7
1634	je	NEAR $L$done8x
1635
1636	lea	rsi,[384+rsi]
1637	xor	r10,r10
1638	vmovdqa	YMMWORD[rsp],ymm11
1639	lea	rdi,[384+rdi]
1640	sub	rdx,384
1641	vmovdqa	YMMWORD[32+rsp],ymm9
1642	jmp	NEAR $L$oop_tail8x
1643
1644ALIGN	32
1645$L$448_or_more8x:
1646	vpxor	ymm6,ymm6,YMMWORD[rsi]
1647	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1648	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1649	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1650	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1651	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1652	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1653	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1654	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1655	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1656	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
1657	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
1658	vpxor	ymm11,ymm11,YMMWORD[384+rsi]
1659	vpxor	ymm9,ymm9,YMMWORD[416+rsi]
1660	vmovdqu	YMMWORD[rdi],ymm6
1661	vmovdqu	YMMWORD[32+rdi],ymm8
1662	vmovdqu	YMMWORD[64+rdi],ymm1
1663	vmovdqu	YMMWORD[96+rdi],ymm5
1664	vmovdqu	YMMWORD[128+rdi],ymm12
1665	vmovdqu	YMMWORD[160+rdi],ymm13
1666	vmovdqu	YMMWORD[192+rdi],ymm10
1667	vmovdqu	YMMWORD[224+rdi],ymm15
1668	vmovdqu	YMMWORD[256+rdi],ymm14
1669	vmovdqu	YMMWORD[288+rdi],ymm2
1670	vmovdqu	YMMWORD[320+rdi],ymm3
1671	vmovdqu	YMMWORD[352+rdi],ymm7
1672	vmovdqu	YMMWORD[384+rdi],ymm11
1673	vmovdqu	YMMWORD[416+rdi],ymm9
1674	je	NEAR $L$done8x
1675
1676	lea	rsi,[448+rsi]
1677	xor	r10,r10
1678	vmovdqa	YMMWORD[rsp],ymm0
1679	lea	rdi,[448+rdi]
1680	sub	rdx,448
1681	vmovdqa	YMMWORD[32+rsp],ymm4
1682
1683$L$oop_tail8x:
1684	movzx	eax,BYTE[r10*1+rsi]
1685	movzx	ecx,BYTE[r10*1+rsp]
1686	lea	r10,[1+r10]
1687	xor	eax,ecx
1688	mov	BYTE[((-1))+r10*1+rdi],al
1689	dec	rdx
1690	jnz	NEAR $L$oop_tail8x
1691
1692$L$done8x:
1693	vzeroall
1694	movaps	xmm6,XMMWORD[((-168))+r9]
1695	movaps	xmm7,XMMWORD[((-152))+r9]
1696	movaps	xmm8,XMMWORD[((-136))+r9]
1697	movaps	xmm9,XMMWORD[((-120))+r9]
1698	movaps	xmm10,XMMWORD[((-104))+r9]
1699	movaps	xmm11,XMMWORD[((-88))+r9]
1700	movaps	xmm12,XMMWORD[((-72))+r9]
1701	movaps	xmm13,XMMWORD[((-56))+r9]
1702	movaps	xmm14,XMMWORD[((-40))+r9]
1703	movaps	xmm15,XMMWORD[((-24))+r9]
1704	lea	rsp,[r9]
1705
1706$L$8x_epilogue:
1707	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1708	mov	rsi,QWORD[16+rsp]
1709	ret
1710
1711$L$SEH_end_ChaCha20_ctr32_avx2:
1712EXTERN	__imp_RtlVirtualUnwind
1713
1714ALIGN	16
; se_handler -- Win64 exception handler referenced from the .xdata record of
; ChaCha20_ctr32_nohw.  It rewrites the context record so that it describes
; the caller's frame, copies it into the dispatcher's context record and then
; calls RtlVirtualUnwind.
;
; In:   r8 = context record, r9 = dispatcher record (rcx/rdx are never read).
; Out:  eax = 1.
; NOTE(review): the field names used in the comments below assume the Windows
; x64 CONTEXT / DISPATCHER_CONTEXT layouts -- confirm offsets against winnt.h.
;
; $L$common_seh_tail is shared: ssse3_handler and full_handler jump into it
; after pushing the same registers and reserving the same 64 bytes, so the
; push sequence here and the pop sequence after the call must stay matched
; with theirs.
1715se_handler:
1716	push	rsi
1717	push	rdi
1718	push	rbx
1719	push	rbp
1720	push	r12
1721	push	r13
1722	push	r14
1723	push	r15
1724	pushfq
1725	sub	rsp,64	; 32 bytes shadow space + 4 stack arguments for the call below
1726
1727	mov	rax,QWORD[120+r8]	; context->Rax (the nohw prologue copies its entry rsp into rax)
1728	mov	rbx,QWORD[248+r8]	; context->Rip
1729
1730	mov	rsi,QWORD[8+r9]	; disp->ImageBase (not used on this path)
1731	mov	r11,QWORD[56+r9]	; disp->HandlerData (not used on this path)
1732
1733	lea	r10,[$L$ctr32_body]
1734	cmp	rbx,r10	; Rip < $L$ctr32_body: still in the prologue,
1735	jb	NEAR $L$common_seh_tail	; so rax (entry rsp) already locates the frame
1736
1737	mov	rax,QWORD[152+r8]	; context->Rsp
1738
1739	lea	r10,[$L$no_data]	; NOTE(review): label lies outside this excerpt
1740	cmp	rbx,r10	; Rip >= $L$no_data: presumably the frame is already
1741	jae	NEAR $L$common_seh_tail	; released there, so Rsp is used unchanged
1742
1743	lea	rax,[((64+24+48))+rax]	; skip 64+24 bytes of locals and 6 pushed registers (48 bytes)
1744
; Reload the six registers pushed by the prologue (rbx first, r15 last) and
; publish them in the context record.
1745	mov	rbx,QWORD[((-8))+rax]
1746	mov	rbp,QWORD[((-16))+rax]
1747	mov	r12,QWORD[((-24))+rax]
1748	mov	r13,QWORD[((-32))+rax]
1749	mov	r14,QWORD[((-40))+rax]
1750	mov	r15,QWORD[((-48))+rax]
1751	mov	QWORD[144+r8],rbx	; context->Rbx
1752	mov	QWORD[160+r8],rbp	; context->Rbp
1753	mov	QWORD[216+r8],r12	; context->R12
1754	mov	QWORD[224+r8],r13	; context->R13
1755	mov	QWORD[232+r8],r14	; context->R14
1756	mov	QWORD[240+r8],r15	; context->R15
1757
; Shared tail.  Expects: rax = the protected function's rsp at entry,
; r8 = context record, r9 = dispatcher record.
1758$L$common_seh_tail:
1759	mov	rdi,QWORD[8+rax]	; rdi/rsi as spilled at [8+rsp]/[16+rsp] by the WIN64 prologue
1760	mov	rsi,QWORD[16+rax]
1761	mov	QWORD[152+r8],rax	; context->Rsp
1762	mov	QWORD[168+r8],rsi	; context->Rsi
1763	mov	QWORD[176+r8],rdi	; context->Rdi
1764
1765	mov	rdi,QWORD[40+r9]	; destination: disp->ContextRecord
1766	mov	rsi,r8	; source: the context record we just patched
1767	mov	ecx,154	; qword count: 154*8 = 1232 bytes (presumably sizeof(CONTEXT))
1768	DD	0xa548f3fc	; bytes FC F3 48 A5 = cld; rep movsq
1769
; Call RtlVirtualUnwind: four register arguments, four more on the stack
; above the 32-byte shadow area.
1770	mov	rsi,r9	; rsi = dispatcher record (r9 is about to be reused)
1771	xor	rcx,rcx	; arg1 = 0 (UNW_FLAG_NHANDLER per the SDK -- confirm)
1772	mov	rdx,QWORD[8+rsi]	; arg2 = disp->ImageBase
1773	mov	r8,QWORD[rsi]	; arg3 = disp->ControlPc
1774	mov	r9,QWORD[16+rsi]	; arg4 = disp->FunctionEntry
1775	mov	r10,QWORD[40+rsi]	; disp->ContextRecord
1776	lea	r11,[56+rsi]	; &disp->HandlerData
1777	lea	r12,[24+rsi]	; &disp->EstablisherFrame (r12 is restored by the pop below)
1778	mov	QWORD[32+rsp],r10	; arg5
1779	mov	QWORD[40+rsp],r11	; arg6
1780	mov	QWORD[48+rsp],r12	; arg7
1781	mov	QWORD[56+rsp],rcx	; arg8 = 0
1782	call	QWORD[__imp_RtlVirtualUnwind]
1783
1784	mov	eax,1	; return value 1 (ExceptionContinueSearch per the SDK -- confirm)
1785	add	rsp,64
1786	popfq
1787	pop	r15
1788	pop	r14
1789	pop	r13
1790	pop	r12
1791	pop	rbp
1792	pop	rbx
1793	pop	rdi
1794	pop	rsi
1795	ret
1796
1797
1798
1799ALIGN	16
; ssse3_handler -- Win64 exception handler referenced from the .xdata record
; of ChaCha20_ctr32_ssse3.  That record's handler data holds two
; image-relative offsets ($L$ssse3_body, $L$ssse3_epilogue); when the
; faulting Rip lies between them, 32 bytes saved below the function's frame
; pointer are copied back into the context record.  All paths finish in
; $L$common_seh_tail, which lives inside se_handler.
;
; In:   r8 = context record, r9 = dispatcher record.
; NOTE(review): field names in the comments assume the Windows x64 CONTEXT /
; DISPATCHER_CONTEXT layouts -- confirm offsets against winnt.h.
1800ssse3_handler:
; Same push order and 64-byte reservation as se_handler: the shared tail
; undoes them.
1801	push	rsi
1802	push	rdi
1803	push	rbx
1804	push	rbp
1805	push	r12
1806	push	r13
1807	push	r14
1808	push	r15
1809	pushfq
1810	sub	rsp,64
1811
1812	mov	rax,QWORD[120+r8]	; context->Rax
1813	mov	rbx,QWORD[248+r8]	; context->Rip
1814
1815	mov	rsi,QWORD[8+r9]	; disp->ImageBase
1816	mov	r11,QWORD[56+r9]	; disp->HandlerData -> {body offset, epilogue offset}
1817
1818	mov	r10d,DWORD[r11]	; HandlerData[0]: image-relative offset of the body label
1819	lea	r10,[r10*1+rsi]	; convert to an absolute address
1820	cmp	rbx,r10	; Rip < body label: tail runs with rax = context->Rax
1821	jb	NEAR $L$common_seh_tail
1822
1823	mov	rax,QWORD[192+r8]	; context->R9; NOTE(review): presumably the function keeps its entry rsp in r9 -- its code is outside this excerpt
1824
1825	mov	r10d,DWORD[4+r11]	; HandlerData[1]: image-relative offset of the epilogue label
1826	lea	r10,[r10*1+rsi]
1827	cmp	rbx,r10	; Rip >= epilogue label: nothing left to copy back
1828	jae	NEAR $L$common_seh_tail
1829
1830	lea	rsi,[((-40))+rax]	; source: save area 40 bytes below the value taken from R9
1831	lea	rdi,[512+r8]	; destination: context offset 512 (presumably CONTEXT.Xmm6)
1832	mov	ecx,4	; 4 qwords = 32 bytes, i.e. two 16-byte XMM slots
1833	DD	0xa548f3fc	; bytes FC F3 48 A5 = cld; rep movsq
1834
1835	jmp	NEAR $L$common_seh_tail
1836
1837
1838
1839ALIGN	16
; full_handler -- Win64 exception handler referenced from the .xdata records
; of ChaCha20_ctr32_ssse3_4x and ChaCha20_ctr32_avx2.  Each record's handler
; data holds two image-relative offsets (body label, epilogue label); when
; the faulting Rip lies between them, 160 bytes saved below the function's
; frame pointer are copied back into the context record.  All paths finish in
; $L$common_seh_tail, which lives inside se_handler.
;
; In:   r8 = context record, r9 = dispatcher record.
; NOTE(review): field names in the comments assume the Windows x64 CONTEXT /
; DISPATCHER_CONTEXT layouts -- confirm offsets against winnt.h.
1840full_handler:
; Same push order and 64-byte reservation as se_handler: the shared tail
; undoes them.
1841	push	rsi
1842	push	rdi
1843	push	rbx
1844	push	rbp
1845	push	r12
1846	push	r13
1847	push	r14
1848	push	r15
1849	pushfq
1850	sub	rsp,64
1851
1852	mov	rax,QWORD[120+r8]	; context->Rax
1853	mov	rbx,QWORD[248+r8]	; context->Rip
1854
1855	mov	rsi,QWORD[8+r9]	; disp->ImageBase
1856	mov	r11,QWORD[56+r9]	; disp->HandlerData -> {body offset, epilogue offset}
1857
1858	mov	r10d,DWORD[r11]	; HandlerData[0]: image-relative offset of the body label
1859	lea	r10,[r10*1+rsi]	; convert to an absolute address
1860	cmp	rbx,r10	; Rip < body label: tail runs with rax = context->Rax
1861	jb	NEAR $L$common_seh_tail
1862
1863	mov	rax,QWORD[192+r8]	; context->R9; the AVX2 epilogue in this file restores rsp from r9
1864
1865	mov	r10d,DWORD[4+r11]	; HandlerData[1]: image-relative offset of the epilogue label
1866	lea	r10,[r10*1+rsi]
1867	cmp	rbx,r10	; Rip >= epilogue label: nothing left to copy back
1868	jae	NEAR $L$common_seh_tail
1869
1870	lea	rsi,[((-168))+rax]	; source: the span the AVX2 epilogue reloads xmm6..xmm15 from
1871	lea	rdi,[512+r8]	; destination: context offset 512 (presumably CONTEXT.Xmm6)
1872	mov	ecx,20	; 20 qwords = 160 bytes, i.e. ten 16-byte XMM slots
1873	DD	0xa548f3fc	; bytes FC F3 48 A5 = cld; rep movsq
1874
1875	jmp	NEAR $L$common_seh_tail
1876
1877
; Function table: one entry of three image-relative DWORDs per function --
; start label, end label, and the matching $L$SEH_info_* record in .xdata.
; NOTE(review): this is the shape of the Win64 RUNTIME_FUNCTION entry;
; confirm against the Windows x64 exception-handling documentation.
1878section	.pdata rdata align=4
1879ALIGN	4
; ChaCha20_ctr32_nohw
1880	DD	$L$SEH_begin_ChaCha20_ctr32_nohw wrt ..imagebase
1881	DD	$L$SEH_end_ChaCha20_ctr32_nohw wrt ..imagebase
1882	DD	$L$SEH_info_ChaCha20_ctr32_nohw wrt ..imagebase
1883
; ChaCha20_ctr32_ssse3
1884	DD	$L$SEH_begin_ChaCha20_ctr32_ssse3 wrt ..imagebase
1885	DD	$L$SEH_end_ChaCha20_ctr32_ssse3 wrt ..imagebase
1886	DD	$L$SEH_info_ChaCha20_ctr32_ssse3 wrt ..imagebase
1887
; ChaCha20_ctr32_ssse3_4x
1888	DD	$L$SEH_begin_ChaCha20_ctr32_ssse3_4x wrt ..imagebase
1889	DD	$L$SEH_end_ChaCha20_ctr32_ssse3_4x wrt ..imagebase
1890	DD	$L$SEH_info_ChaCha20_ctr32_ssse3_4x wrt ..imagebase
; ChaCha20_ctr32_avx2
1891	DD	$L$SEH_begin_ChaCha20_ctr32_avx2 wrt ..imagebase
1892	DD	$L$SEH_end_ChaCha20_ctr32_avx2 wrt ..imagebase
1893	DD	$L$SEH_info_ChaCha20_ctr32_avx2 wrt ..imagebase
; Unwind-info records pointed to by the .pdata entries.  Each record is a
; 4-byte header, the image-relative address of its handler, and (for all but
; the nohw record) two image-relative label offsets that the handler reads as
; its handler data: [0] = body label, [4] = epilogue label.
; NOTE(review): header bytes 9,0,0,0 presumably decode as UNWIND_INFO
; version 1 with UNW_FLAG_EHANDLER, zero prolog size and zero unwind codes --
; confirm against the Windows x64 exception-handling documentation.
1894section	.xdata rdata align=8
1895ALIGN	8
1896$L$SEH_info_ChaCha20_ctr32_nohw:
1897	DB	9,0,0,0
1898	DD	se_handler wrt ..imagebase	; se_handler uses fixed labels, so no handler data follows
1899
1900$L$SEH_info_ChaCha20_ctr32_ssse3:
1901	DB	9,0,0,0
1902	DD	ssse3_handler wrt ..imagebase
1903	DD	$L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase	; handler data: body, epilogue
1904
1905$L$SEH_info_ChaCha20_ctr32_ssse3_4x:
1906	DB	9,0,0,0
1907	DD	full_handler wrt ..imagebase
1908	DD	$L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase	; handler data: body, epilogue
1909$L$SEH_info_ChaCha20_ctr32_avx2:
1910	DB	9,0,0,0
1911	DD	full_handler wrt ..imagebase
1912	DD	$L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase	; handler data: body, epilogue
1913%else
1914; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738
1915ret
1916%endif
1917