• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; This file is generated from a similarly-named Perl script in the BoringSSL
2; source tree. Do not edit by hand.
3
4default	rel
5%define XMMWORD
6%define YMMWORD
7%define ZMMWORD
8
9%ifdef BORINGSSL_PREFIX
10%include "boringssl_prefix_symbols_nasm.inc"
11%endif
12section	.text code align=64
13
14
15EXTERN	OPENSSL_ia32cap_P
16
; Constant pool shared by the ChaCha20 code paths below (64-byte aligned).
17ALIGN	64
; Four zero dwords; not referenced in the visible part of this file.
18$L$zero:
19	DD	0,0,0,0
; {1,0,0,0}: added to the counter/nonce row once per 64-byte block by the
; scalar and single-block SSE paths, so only the low dword is incremented.
20$L$one:
21	DD	1,0,0,0
; {0,1,2,3}: per-lane counter offsets added to the broadcast counter word
; when the four-block path is set up.
22$L$inc:
23	DD	0,1,2,3
; {4,4,4,4}: per-iteration counter step of the four-block path.
24$L$four:
25	DD	4,4,4,4
; Per-lane counter offsets used by the eight-block (ymm) path.
26$L$incy:
27	DD	0,2,4,6,1,3,5,7
; {8 x 8}: per-iteration counter step of the eight-block path.
28$L$eight:
29	DD	8,8,8,8,8,8,8,8
; pshufb mask: within each dword, bytes are permuted 2,3,0,1 -- a 16-bit
; rotation of every 32-bit lane.
30$L$rot16:
31DB	0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
; pshufb mask: within each dword, bytes are permuted 3,0,1,2 -- a rotate
; left by 8 bits (equivalently right by 24) of every 32-bit lane.
32$L$rot24:
33DB	0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
; The 16 bytes below are ASCII "expand 32-byte k" (state words 0..3),
; followed by a single zero byte.
34$L$sigma:
35DB	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
36DB	0
37ALIGN	64
; The four 16-dword tables below are not referenced in the visible part of
; this file. NOTE(review): their size suggests a 512-bit path -- confirm
; against the generating Perl script.
38$L$zeroz:
39	DD	0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
40$L$fourz:
41	DD	4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
42$L$incz:
43	DD	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
44$L$sixteen:
45	DD	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
; ASCII banner: "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>",
; NUL-terminated.
46DB	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
47DB	95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
48DB	98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
49DB	108,46,111,114,103,62,0
50global	ChaCha20_ctr32
51
52ALIGN	64
;-----------------------------------------------------------------------
; ChaCha20_ctr32 -- exported entry point. Implements the scalar
; (general-purpose register) ChaCha20 path and dispatches to the SIMD
; paths when the capability word allows it.
;
; ABI: Win64 on entry. The prologue copies the incoming arguments into
; the register roles used by the body:
;   rdi <- rcx       destination pointer (64 bytes written per block)
;   rsi <- rdx       source pointer (XORed with the keystream)
;   rdx <- r8        byte count; 0 returns immediately
;   rcx <- r9        pointer to 32 bytes loaded as state words 4..11
;                    (presumably the 256-bit key)
;   r8  <- [rsp+40]  pointer to 16 bytes loaded as state words 12..15;
;                    word 12 is incremented per block (presumably the
;                    block counter followed by the nonce)
;
; Stack frame after the six pushes and `sub rsp,64+24`:
;   [rsp+ 0..15]  used only by the tail path (keystream words 0..3)
;   [rsp+16..31]  input words 4..7
;   [rsp+32..47]  input words 8..11; doubles as spill space for that
;                 row inside the round loop and is rewritten afterwards
;   [rsp+48..63]  input words 12..15 (word 12 refreshed per block)
;   [rsp+64]      remaining byte count
;   [rsp+72]      source pointer
;   [rsp+80]      destination pointer
;
; rbx, rbp, r12-r15 are pushed and restored here; rdi/rsi are stored at
; [rsp+8]/[rsp+16] by the prologue and reloaded by the epilogue.
; NOTE(review): the $L$SEH_* / $L$ctr32_body labels are presumably
; consumed by unwind tables outside this view -- confirm before moving
; any of them.
;-----------------------------------------------------------------------
53ChaCha20_ctr32:
54	mov	QWORD[8+rsp],rdi	;WIN64 prologue
55	mov	QWORD[16+rsp],rsi
56	mov	rax,rsp
57$L$SEH_begin_ChaCha20_ctr32:
58	mov	rdi,rcx
59	mov	rsi,rdx
60	mov	rdx,r8
61	mov	rcx,r9
62	mov	r8,QWORD[40+rsp]
63
64
65
; Zero-length request: skip straight to the epilogue.
66	cmp	rdx,0
67	je	NEAR $L$no_data
; r10 = 64 bits of capability flags starting at OPENSSL_ia32cap_P+4.
; Bit 9 of the low dword selects the SSSE3 path (per the branch target's
; name). r10 stays live: the 4x dispatch reached through that path reads
; it again without reloading.
68	mov	r10,QWORD[((OPENSSL_ia32cap_P+4))]
69	test	r10d,512
70	jnz	NEAR $L$ChaCha20_ssse3
71
; Scalar path: save callee-saved GPRs, then reserve 64 bytes of state
; scratch plus 24 bytes for the saved length and pointers.
72	push	rbx
73
74	push	rbp
75
76	push	r12
77
78	push	r13
79
80	push	r14
81
82	push	r15
83
84	sub	rsp,64+24
85
86$L$ctr32_body:
87
88
; xmm1/xmm2 = state words 4..7 / 8..11, xmm3 = words 12..15,
; xmm4 = {1,0,0,0} used to step word 12 once per block.
89	movdqu	xmm1,XMMWORD[rcx]
90	movdqu	xmm2,XMMWORD[16+rcx]
91	movdqu	xmm3,XMMWORD[r8]
92	movdqa	xmm4,XMMWORD[$L$one]
93
94
95	movdqa	XMMWORD[16+rsp],xmm1
96	movdqa	XMMWORD[32+rsp],xmm2
97	movdqa	XMMWORD[48+rsp],xmm3
; rbp = bytes remaining (rdx is reused as a state word below).
98	mov	rbp,rdx
99	jmp	NEAR $L$oop_outer
100
101ALIGN	32
; Per-block setup. Working state lives in registers:
;   words 0..3   eax, ebx, ecx, edx  (the four immediates are the
;                little-endian dwords of "expand 32-byte k")
;   words 4..7   r8d..r11d
;   words 8..11  esi/edi hold two at a time; the other two sit in
;                [rsp+32..47]
;   words 12..15 r12d..r15d (word 12 comes from xmm3, the running copy)
102$L$oop_outer:
103	mov	eax,0x61707865
104	mov	ebx,0x3320646e
105	mov	ecx,0x79622d32
106	mov	edx,0x6b206574
107	mov	r8d,DWORD[16+rsp]
108	mov	r9d,DWORD[20+rsp]
109	mov	r10d,DWORD[24+rsp]
110	mov	r11d,DWORD[28+rsp]
111	movd	r12d,xmm3
112	mov	r13d,DWORD[52+rsp]
113	mov	r14d,DWORD[56+rsp]
114	mov	r15d,DWORD[60+rsp]
115
; rbp, rsi and rdi are needed as state/loop registers, so the length and
; both pointers are parked on the stack. ebp = 10 loop iterations, each
; doing a column round plus a diagonal round.
116	mov	QWORD[((64+0))+rsp],rbp
117	mov	ebp,10
118	mov	QWORD[((64+8))+rsp],rsi
; The bytes below encode `movq rsi,xmm2`: rsi = state words 8 and 9.
119DB	102,72,15,126,214
120	mov	QWORD[((64+16))+rsp],rdi
; Split the pair: esi = word 8, edi = word 9.
121	mov	rdi,rsi
122	shr	rdi,32
123	jmp	NEAR $L$oop
124
125ALIGN	32
; One double round. Each quarter-round is add/xor/rol with rotation
; counts 16, 12, 8, 7; two quarter-rounds are interleaved at a time.
126$L$oop:
; Column round, columns 0 and 1: (eax,r8d,esi,r12d) and (ebx,r9d,edi,r13d).
127	add	eax,r8d
128	xor	r12d,eax
129	rol	r12d,16
130	add	ebx,r9d
131	xor	r13d,ebx
132	rol	r13d,16
133	add	esi,r12d
134	xor	r8d,esi
135	rol	r8d,12
136	add	edi,r13d
137	xor	r9d,edi
138	rol	r9d,12
139	add	eax,r8d
140	xor	r12d,eax
141	rol	r12d,8
142	add	ebx,r9d
143	xor	r13d,ebx
144	rol	r13d,8
145	add	esi,r12d
146	xor	r8d,esi
147	rol	r8d,7
148	add	edi,r13d
149	xor	r9d,edi
150	rol	r9d,7
; Swap the in-register half of row 8..11: park words 8,9 and fetch 10,11.
151	mov	DWORD[32+rsp],esi
152	mov	DWORD[36+rsp],edi
153	mov	esi,DWORD[40+rsp]
154	mov	edi,DWORD[44+rsp]
; Column round, columns 2 and 3: (ecx,r10d,esi,r14d) and (edx,r11d,edi,r15d).
155	add	ecx,r10d
156	xor	r14d,ecx
157	rol	r14d,16
158	add	edx,r11d
159	xor	r15d,edx
160	rol	r15d,16
161	add	esi,r14d
162	xor	r10d,esi
163	rol	r10d,12
164	add	edi,r15d
165	xor	r11d,edi
166	rol	r11d,12
167	add	ecx,r10d
168	xor	r14d,ecx
169	rol	r14d,8
170	add	edx,r11d
171	xor	r15d,edx
172	rol	r15d,8
173	add	esi,r14d
174	xor	r10d,esi
175	rol	r10d,7
176	add	edi,r15d
177	xor	r11d,edi
178	rol	r11d,7
; Diagonal round, first pair: (eax,r9d,esi,r15d) and (ebx,r10d,edi,r12d);
; esi/edi still hold words 10 and 11 here.
179	add	eax,r9d
180	xor	r15d,eax
181	rol	r15d,16
182	add	ebx,r10d
183	xor	r12d,ebx
184	rol	r12d,16
185	add	esi,r15d
186	xor	r9d,esi
187	rol	r9d,12
188	add	edi,r12d
189	xor	r10d,edi
190	rol	r10d,12
191	add	eax,r9d
192	xor	r15d,eax
193	rol	r15d,8
194	add	ebx,r10d
195	xor	r12d,ebx
196	rol	r12d,8
197	add	esi,r15d
198	xor	r9d,esi
199	rol	r9d,7
200	add	edi,r12d
201	xor	r10d,edi
202	rol	r10d,7
; Swap back: park words 10,11 and fetch words 8,9.
203	mov	DWORD[40+rsp],esi
204	mov	DWORD[44+rsp],edi
205	mov	esi,DWORD[32+rsp]
206	mov	edi,DWORD[36+rsp]
; Diagonal round, second pair: (ecx,r11d,esi,r13d) and (edx,r8d,edi,r14d).
207	add	ecx,r11d
208	xor	r13d,ecx
209	rol	r13d,16
210	add	edx,r8d
211	xor	r14d,edx
212	rol	r14d,16
213	add	esi,r13d
214	xor	r11d,esi
215	rol	r11d,12
216	add	edi,r14d
217	xor	r8d,edi
218	rol	r8d,12
219	add	ecx,r11d
220	xor	r13d,ecx
221	rol	r13d,8
222	add	edx,r8d
223	xor	r14d,edx
224	rol	r14d,8
225	add	esi,r13d
226	xor	r11d,esi
227	rol	r11d,7
228	add	edi,r14d
229	xor	r8d,edi
230	rol	r8d,7
231	dec	ebp
232	jnz	NEAR $L$oop
; Rounds done. Flush words 8,9 so [rsp+32..47] holds the whole worked
; row, restore length and pointers, and step the counter in xmm3 for the
; next block ([rsp+48] still holds this block's counter for the add below).
233	mov	DWORD[36+rsp],edi
234	mov	DWORD[32+rsp],esi
235	mov	rbp,QWORD[64+rsp]
236	movdqa	xmm1,xmm2
237	mov	rsi,QWORD[((64+8))+rsp]
238	paddd	xmm3,xmm4
239	mov	rdi,QWORD[((64+16))+rsp]
240
; Feed-forward: add the input state to the worked state. Words 8..11 are
; handled in xmm1 (input copy from xmm2 + worked row from the stack).
241	add	eax,0x61707865
242	add	ebx,0x3320646e
243	add	ecx,0x79622d32
244	add	edx,0x6b206574
245	add	r8d,DWORD[16+rsp]
246	add	r9d,DWORD[20+rsp]
247	add	r10d,DWORD[24+rsp]
248	add	r11d,DWORD[28+rsp]
249	add	r12d,DWORD[48+rsp]
250	add	r13d,DWORD[52+rsp]
251	add	r14d,DWORD[56+rsp]
252	add	r15d,DWORD[60+rsp]
253	paddd	xmm1,XMMWORD[32+rsp]
254
; Less than a full block left: finish byte by byte.
255	cmp	rbp,64
256	jb	NEAR $L$tail
257
; Full block: XOR 64 source bytes with the keystream.
258	xor	eax,DWORD[rsi]
259	xor	ebx,DWORD[4+rsi]
260	xor	ecx,DWORD[8+rsi]
261	xor	edx,DWORD[12+rsi]
262	xor	r8d,DWORD[16+rsi]
263	xor	r9d,DWORD[20+rsi]
264	xor	r10d,DWORD[24+rsi]
265	xor	r11d,DWORD[28+rsi]
266	movdqu	xmm0,XMMWORD[32+rsi]
267	xor	r12d,DWORD[48+rsi]
268	xor	r13d,DWORD[52+rsi]
269	xor	r14d,DWORD[56+rsi]
270	xor	r15d,DWORD[60+rsi]
271	lea	rsi,[64+rsi]
272	pxor	xmm0,xmm1
273
; Put the stack copy back into input form for the next block: original
; words 8..11 and the already-incremented counter word.
274	movdqa	XMMWORD[32+rsp],xmm2
275	movd	DWORD[48+rsp],xmm3
276
277	mov	DWORD[rdi],eax
278	mov	DWORD[4+rdi],ebx
279	mov	DWORD[8+rdi],ecx
280	mov	DWORD[12+rdi],edx
281	mov	DWORD[16+rdi],r8d
282	mov	DWORD[20+rdi],r9d
283	mov	DWORD[24+rdi],r10d
284	mov	DWORD[28+rdi],r11d
285	movdqu	XMMWORD[32+rdi],xmm0
286	mov	DWORD[48+rdi],r12d
287	mov	DWORD[52+rdi],r13d
288	mov	DWORD[56+rdi],r14d
289	mov	DWORD[60+rdi],r15d
290	lea	rdi,[64+rdi]
291
292	sub	rbp,64
293	jnz	NEAR $L$oop_outer
294
295	jmp	NEAR $L$done
296
297ALIGN	16
; Tail: 1..63 bytes remain. Spill the finished keystream block to
; [rsp..rsp+63]; rbx (zeroed after ebx has been stored) is the byte index.
298$L$tail:
299	mov	DWORD[rsp],eax
300	mov	DWORD[4+rsp],ebx
301	xor	rbx,rbx
302	mov	DWORD[8+rsp],ecx
303	mov	DWORD[12+rsp],edx
304	mov	DWORD[16+rsp],r8d
305	mov	DWORD[20+rsp],r9d
306	mov	DWORD[24+rsp],r10d
307	mov	DWORD[28+rsp],r11d
308	movdqa	XMMWORD[32+rsp],xmm1
309	mov	DWORD[48+rsp],r12d
310	mov	DWORD[52+rsp],r13d
311	mov	DWORD[56+rsp],r14d
312	mov	DWORD[60+rsp],r15d
313
; dst[i] = src[i] ^ keystream[i] until rbp reaches zero. The store uses
; -1+rbx because rbx has already been advanced by the lea.
314$L$oop_tail:
315	movzx	eax,BYTE[rbx*1+rsi]
316	movzx	edx,BYTE[rbx*1+rsp]
317	lea	rbx,[1+rbx]
318	xor	eax,edx
319	mov	BYTE[((-1))+rbx*1+rdi],al
320	dec	rbp
321	jnz	NEAR $L$oop_tail
322
; rsi = rsp + 64+24+48, i.e. the address just above the six pushed
; registers; they are reloaded at negative offsets and rsp is reset there.
323$L$done:
324	lea	rsi,[((64+24+48))+rsp]
325	mov	r15,QWORD[((-48))+rsi]
326
327	mov	r14,QWORD[((-40))+rsi]
328
329	mov	r13,QWORD[((-32))+rsi]
330
331	mov	r12,QWORD[((-24))+rsi]
332
333	mov	rbp,QWORD[((-16))+rsi]
334
335	mov	rbx,QWORD[((-8))+rsi]
336
337	lea	rsp,[rsi]
338
; Common exit: reload the caller's rdi/rsi saved by the prologue.
339$L$no_data:
340	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
341	mov	rsi,QWORD[16+rsp]
342	DB	0F3h,0C3h		;repret
343
344$L$SEH_end_ChaCha20_ctr32:
345
346ALIGN	32
;-----------------------------------------------------------------------
; ChaCha20_ssse3 -- SSE path producing one 64-byte block per outer
; iteration, with pshufb doing the 16- and 8-bit rotations.
;
; Reached either through its own Win64 prologue or by a jump to
; $L$ChaCha20_ssse3 from ChaCha20_ctr32, where the arguments are already
; in place:
;   rdi = destination, rsi = source, rdx = byte count,
;   rcx = pointer to state words 4..11, r8 = pointer to words 12..15.
; $L$do_sse3_after_all is additionally entered from the 4x path, which
; has already set r9 = rsp.
;
; Requests longer than 128 bytes are forwarded to $L$ChaCha20_4x. That
; code reads r10, which is loaded in ChaCha20_ctr32 and not in this
; routine's own prologue.
;
; Frame: r9 = rsp on arrival; 64 bytes at [rsp..rsp+63] hold the input
; state (rows 0..3); xmm6/xmm7 are saved at r9-40 and r9-24 and restored
; before returning.
; Registers in the round loop: xmm0..xmm3 = state rows 0..3, xmm4/xmm5 =
; temporaries, xmm6 = $L$rot16 mask, xmm7 = $L$rot24 mask, r8 = round
; counter.
;-----------------------------------------------------------------------
347ChaCha20_ssse3:
348	mov	QWORD[8+rsp],rdi	;WIN64 prologue
349	mov	QWORD[16+rsp],rsi
350	mov	rax,rsp
351$L$SEH_begin_ChaCha20_ssse3:
352	mov	rdi,rcx
353	mov	rsi,rdx
354	mov	rdx,r8
355	mov	rcx,r9
356	mov	r8,QWORD[40+rsp]
357
358
359$L$ChaCha20_ssse3:
360
; r9 remembers the entry rsp for the xmm save slots and the epilogue.
361	mov	r9,rsp
362
363	cmp	rdx,128
364	ja	NEAR $L$ChaCha20_4x
365
366$L$do_sse3_after_all:
367	sub	rsp,64+40
368	movaps	XMMWORD[(-40)+r9],xmm6
369	movaps	XMMWORD[(-24)+r9],xmm7
370$L$ssse3_body:
; Build the initial state: row 0 = "expand 32-byte k", rows 1..2 from
; rcx, row 3 from r8; keep a copy on the stack for the feed-forward.
371	movdqa	xmm0,XMMWORD[$L$sigma]
372	movdqu	xmm1,XMMWORD[rcx]
373	movdqu	xmm2,XMMWORD[16+rcx]
374	movdqu	xmm3,XMMWORD[r8]
375	movdqa	xmm6,XMMWORD[$L$rot16]
376	movdqa	xmm7,XMMWORD[$L$rot24]
377
378	movdqa	XMMWORD[rsp],xmm0
379	movdqa	XMMWORD[16+rsp],xmm1
380	movdqa	XMMWORD[32+rsp],xmm2
381	movdqa	XMMWORD[48+rsp],xmm3
382	mov	r8,10
383	jmp	NEAR $L$oop_ssse3
384
385ALIGN	32
; Next block: reload rows 0..2 and bump the low dword of row 3 by one
; (xmm3 = {1,0,0,0} + saved row 3), storing the new row 3 back.
386$L$oop_outer_ssse3:
387	movdqa	xmm3,XMMWORD[$L$one]
388	movdqa	xmm0,XMMWORD[rsp]
389	movdqa	xmm1,XMMWORD[16+rsp]
390	movdqa	xmm2,XMMWORD[32+rsp]
391	paddd	xmm3,XMMWORD[48+rsp]
392	mov	r8,10
393	movdqa	XMMWORD[48+rsp],xmm3
394	jmp	NEAR $L$oop_ssse3
395
396ALIGN	32
; One double round per iteration, all four columns processed in parallel.
397$L$oop_ssse3:
; Column round. Rotations by 12 and 7 use a shift pair + por via xmm4.
398	paddd	xmm0,xmm1
399	pxor	xmm3,xmm0
; The bytes below encode `pshufb xmm3,xmm6` (rotate each dword by 16).
400DB	102,15,56,0,222
401	paddd	xmm2,xmm3
402	pxor	xmm1,xmm2
403	movdqa	xmm4,xmm1
404	psrld	xmm1,20
405	pslld	xmm4,12
406	por	xmm1,xmm4
407	paddd	xmm0,xmm1
408	pxor	xmm3,xmm0
; The bytes below encode `pshufb xmm3,xmm7` (rotate each dword left by 8).
409DB	102,15,56,0,223
410	paddd	xmm2,xmm3
411	pxor	xmm1,xmm2
412	movdqa	xmm4,xmm1
413	psrld	xmm1,25
414	pslld	xmm4,7
415	por	xmm1,xmm4
; Rotate the lanes of rows 1, 2, 3 so that the diagonals line up as
; columns for the second half of the double round.
416	pshufd	xmm2,xmm2,78
417	pshufd	xmm1,xmm1,57
418	pshufd	xmm3,xmm3,147
419	nop
; Diagonal round: same sequence on the rotated rows.
420	paddd	xmm0,xmm1
421	pxor	xmm3,xmm0
; `pshufb xmm3,xmm6`
422DB	102,15,56,0,222
423	paddd	xmm2,xmm3
424	pxor	xmm1,xmm2
425	movdqa	xmm4,xmm1
426	psrld	xmm1,20
427	pslld	xmm4,12
428	por	xmm1,xmm4
429	paddd	xmm0,xmm1
430	pxor	xmm3,xmm0
; `pshufb xmm3,xmm7`
431DB	102,15,56,0,223
432	paddd	xmm2,xmm3
433	pxor	xmm1,xmm2
434	movdqa	xmm4,xmm1
435	psrld	xmm1,25
436	pslld	xmm4,7
437	por	xmm1,xmm4
; Undo the lane rotation (rows 1 and 3 use the opposite shuffles).
438	pshufd	xmm2,xmm2,78
439	pshufd	xmm1,xmm1,147
440	pshufd	xmm3,xmm3,57
441	dec	r8
442	jnz	NEAR $L$oop_ssse3
; Feed-forward: add the saved input state to get the keystream block.
443	paddd	xmm0,XMMWORD[rsp]
444	paddd	xmm1,XMMWORD[16+rsp]
445	paddd	xmm2,XMMWORD[32+rsp]
446	paddd	xmm3,XMMWORD[48+rsp]
447
448	cmp	rdx,64
449	jb	NEAR $L$tail_ssse3
450
; Full block: XOR 64 source bytes with the keystream and store them.
451	movdqu	xmm4,XMMWORD[rsi]
452	movdqu	xmm5,XMMWORD[16+rsi]
453	pxor	xmm0,xmm4
454	movdqu	xmm4,XMMWORD[32+rsi]
455	pxor	xmm1,xmm5
456	movdqu	xmm5,XMMWORD[48+rsi]
457	lea	rsi,[64+rsi]
458	pxor	xmm2,xmm4
459	pxor	xmm3,xmm5
460
461	movdqu	XMMWORD[rdi],xmm0
462	movdqu	XMMWORD[16+rdi],xmm1
463	movdqu	XMMWORD[32+rdi],xmm2
464	movdqu	XMMWORD[48+rdi],xmm3
465	lea	rdi,[64+rdi]
466
467	sub	rdx,64
468	jnz	NEAR $L$oop_outer_ssse3
469
470	jmp	NEAR $L$done_ssse3
471
472ALIGN	16
; Tail: 1..63 bytes remain. The keystream block overwrites the saved
; state on the stack (no further block follows); r8 is the byte index.
473$L$tail_ssse3:
474	movdqa	XMMWORD[rsp],xmm0
475	movdqa	XMMWORD[16+rsp],xmm1
476	movdqa	XMMWORD[32+rsp],xmm2
477	movdqa	XMMWORD[48+rsp],xmm3
478	xor	r8,r8
479
; dst[i] = src[i] ^ keystream[i]; the store uses -1+r8 because r8 has
; already been advanced by the lea.
480$L$oop_tail_ssse3:
481	movzx	eax,BYTE[r8*1+rsi]
482	movzx	ecx,BYTE[r8*1+rsp]
483	lea	r8,[1+r8]
484	xor	eax,ecx
485	mov	BYTE[((-1))+r8*1+rdi],al
486	dec	rdx
487	jnz	NEAR $L$oop_tail_ssse3
488
; Restore xmm6/xmm7 and the entry stack pointer kept in r9.
489$L$done_ssse3:
490	movaps	xmm6,XMMWORD[((-40))+r9]
491	movaps	xmm7,XMMWORD[((-24))+r9]
492	lea	rsp,[r9]
493
494$L$ssse3_epilogue:
495	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
496	mov	rsi,QWORD[16+rsp]
497	DB	0F3h,0C3h		;repret
498
499$L$SEH_end_ChaCha20_ssse3:
500
501ALIGN	32
;-----------------------------------------------------------------------
; ChaCha20_4x -- SSE path computing four 64-byte blocks (256 bytes) per
; outer iteration. Every xmm register carries one state word for four
; consecutive blocks, one block per 32-bit lane; the results are
; transposed back to block order before being XORed with the input.
;
; Reached through its own Win64 prologue or by a jump to $L$ChaCha20_4x
; from the SSSE3 path. Register roles on arrival at $L$ChaCha20_4x:
;   rdi = destination, rsi = source, rdx = byte count,
;   rcx = pointer to state words 4..11, r8 = pointer to words 12..15,
;   r10 = capability qword loaded from OPENSSL_ia32cap_P+4 in
;         ChaCha20_ctr32 (it is not reloaded here).
;
; Frame (r9 = rsp on arrival, `sub rsp,0x140+168`):
;   [rsp+  0.. 63]  scratch: spill slots for words 8..11 inside the round
;                   loop, transposed words 0..3 afterwards, keystream
;                   block for the byte-wise tail
;   [rsp+ 64..127]  words 0..3, each broadcast to four lanes
;   [rsp+128..191]  words 4..7 broadcast   (addressed as rcx-128..)
;   [rsp+192..255]  words 8..11 broadcast  (addressed as rcx-64..)
;   [rsp+256..319]  words 12..15 broadcast (addressed as rcx+0..);
;                   word 12 holds the four per-lane counters
;   [r9-168..r9-9]  saved xmm6..xmm15
;
; Registers in the round loop: xmm8..11 = words 0..3, xmm12..15 = words
; 4..7, xmm4/xmm5 = two of words 8..11 (the other two are spilled),
; xmm0..3 = words 12..15, xmm6/xmm7 = shift temporaries that are reloaded
; with the $L$rot24 / $L$rot16 masks (via r11 / r10) between uses.
;-----------------------------------------------------------------------
502ChaCha20_4x:
503	mov	QWORD[8+rsp],rdi	;WIN64 prologue
504	mov	QWORD[16+rsp],rsi
505	mov	rax,rsp
506$L$SEH_begin_ChaCha20_4x:
507	mov	rdi,rcx
508	mov	rsi,rdx
509	mov	rdx,r8
510	mov	rcx,r9
511	mov	r8,QWORD[40+rsp]
512
513
514$L$ChaCha20_4x:
515
516	mov	r9,rsp
517
; Dispatch on the capability qword. Bit 5 of its upper dword (the dword
; at OPENSSL_ia32cap_P+8) selects the 8x path.
518	mov	r11,r10
519	shr	r10,32
520	test	r10,32
521	jnz	NEAR $L$ChaCha20_8x
522	cmp	rdx,192
523	ja	NEAR $L$proceed4x
524
; For 129..192 bytes: if bit 22 of the low capability dword is set and
; bit 26 is clear (mask 0x4400000, value 0x400000), fall back to the
; single-block SSSE3 code. NOTE(review): presumably a heuristic for a
; CPU family where the 4x code is slower on short inputs -- confirm
; against the generating Perl script.
525	and	r11,71303168
526	cmp	r11,4194304
527	je	NEAR $L$do_sse3_after_all
528
529$L$proceed4x:
530	sub	rsp,0x140+168
531	movaps	XMMWORD[(-168)+r9],xmm6
532	movaps	XMMWORD[(-152)+r9],xmm7
533	movaps	XMMWORD[(-136)+r9],xmm8
534	movaps	XMMWORD[(-120)+r9],xmm9
535	movaps	XMMWORD[(-104)+r9],xmm10
536	movaps	XMMWORD[(-88)+r9],xmm11
537	movaps	XMMWORD[(-72)+r9],xmm12
538	movaps	XMMWORD[(-56)+r9],xmm13
539	movaps	XMMWORD[(-40)+r9],xmm14
540	movaps	XMMWORD[(-24)+r9],xmm15
541$L$4x_body:
; Load the four input rows, then repoint rcx at rsp+256 (frame base for
; the upper rows) and r10/r11 at the two pshufb masks.
542	movdqa	xmm11,XMMWORD[$L$sigma]
543	movdqu	xmm15,XMMWORD[rcx]
544	movdqu	xmm7,XMMWORD[16+rcx]
545	movdqu	xmm3,XMMWORD[r8]
546	lea	rcx,[256+rsp]
547	lea	r10,[$L$rot16]
548	lea	r11,[$L$rot24]
549
; Broadcast each word of row 0 to a full register and save it.
550	pshufd	xmm8,xmm11,0x00
551	pshufd	xmm9,xmm11,0x55
552	movdqa	XMMWORD[64+rsp],xmm8
553	pshufd	xmm10,xmm11,0xaa
554	movdqa	XMMWORD[80+rsp],xmm9
555	pshufd	xmm11,xmm11,0xff
556	movdqa	XMMWORD[96+rsp],xmm10
557	movdqa	XMMWORD[112+rsp],xmm11
558
; Same for words 4..7.
559	pshufd	xmm12,xmm15,0x00
560	pshufd	xmm13,xmm15,0x55
561	movdqa	XMMWORD[(128-256)+rcx],xmm12
562	pshufd	xmm14,xmm15,0xaa
563	movdqa	XMMWORD[(144-256)+rcx],xmm13
564	pshufd	xmm15,xmm15,0xff
565	movdqa	XMMWORD[(160-256)+rcx],xmm14
566	movdqa	XMMWORD[(176-256)+rcx],xmm15
567
; Same for words 8..11.
568	pshufd	xmm4,xmm7,0x00
569	pshufd	xmm5,xmm7,0x55
570	movdqa	XMMWORD[(192-256)+rcx],xmm4
571	pshufd	xmm6,xmm7,0xaa
572	movdqa	XMMWORD[(208-256)+rcx],xmm5
573	pshufd	xmm7,xmm7,0xff
574	movdqa	XMMWORD[(224-256)+rcx],xmm6
575	movdqa	XMMWORD[(240-256)+rcx],xmm7
576
; Words 12..15. Word 12 gets the per-lane offsets {0,1,2,3}; it is not
; stored here but at $L$oop_enter4x, shared with later iterations.
577	pshufd	xmm0,xmm3,0x00
578	pshufd	xmm1,xmm3,0x55
579	paddd	xmm0,XMMWORD[$L$inc]
580	pshufd	xmm2,xmm3,0xaa
581	movdqa	XMMWORD[(272-256)+rcx],xmm1
582	pshufd	xmm3,xmm3,0xff
583	movdqa	XMMWORD[(288-256)+rcx],xmm2
584	movdqa	XMMWORD[(304-256)+rcx],xmm3
585
586	jmp	NEAR $L$oop_enter4x
587
588ALIGN	32
; Later iterations: reload the whole input state from the frame and step
; the four lane counters by 4.
589$L$oop_outer4x:
590	movdqa	xmm8,XMMWORD[64+rsp]
591	movdqa	xmm9,XMMWORD[80+rsp]
592	movdqa	xmm10,XMMWORD[96+rsp]
593	movdqa	xmm11,XMMWORD[112+rsp]
594	movdqa	xmm12,XMMWORD[((128-256))+rcx]
595	movdqa	xmm13,XMMWORD[((144-256))+rcx]
596	movdqa	xmm14,XMMWORD[((160-256))+rcx]
597	movdqa	xmm15,XMMWORD[((176-256))+rcx]
598	movdqa	xmm4,XMMWORD[((192-256))+rcx]
599	movdqa	xmm5,XMMWORD[((208-256))+rcx]
600	movdqa	xmm6,XMMWORD[((224-256))+rcx]
601	movdqa	xmm7,XMMWORD[((240-256))+rcx]
602	movdqa	xmm0,XMMWORD[((256-256))+rcx]
603	movdqa	xmm1,XMMWORD[((272-256))+rcx]
604	movdqa	xmm2,XMMWORD[((288-256))+rcx]
605	movdqa	xmm3,XMMWORD[((304-256))+rcx]
606	paddd	xmm0,XMMWORD[$L$four]
607
; Spill words 10 and 11 (xmm6/xmm7 are needed as temporaries), load the
; rot16 mask, record this iteration's lane counters, eax = 10 iterations.
608$L$oop_enter4x:
609	movdqa	XMMWORD[32+rsp],xmm6
610	movdqa	XMMWORD[48+rsp],xmm7
611	movdqa	xmm7,XMMWORD[r10]
612	mov	eax,10
613	movdqa	XMMWORD[(256-256)+rcx],xmm0
614	jmp	NEAR $L$oop4x
615
616ALIGN	32
; One double round per iteration; two quarter-rounds are interleaved at a
; time. The DB sequences are pshufb encodings (66 0F 38 00 /r).
617$L$oop4x:
; Column round, columns 0 and 1:
; (xmm8,xmm12,xmm4,xmm0) and (xmm9,xmm13,xmm5,xmm1).
618	paddd	xmm8,xmm12
619	paddd	xmm9,xmm13
620	pxor	xmm0,xmm8
621	pxor	xmm1,xmm9
; `pshufb xmm0,xmm7` / `pshufb xmm1,xmm7` (rot16 mask)
622DB	102,15,56,0,199
623DB	102,15,56,0,207
624	paddd	xmm4,xmm0
625	paddd	xmm5,xmm1
626	pxor	xmm12,xmm4
627	pxor	xmm13,xmm5
628	movdqa	xmm6,xmm12
629	pslld	xmm12,12
630	psrld	xmm6,20
631	movdqa	xmm7,xmm13
632	pslld	xmm13,12
633	por	xmm12,xmm6
634	psrld	xmm7,20
635	movdqa	xmm6,XMMWORD[r11]
636	por	xmm13,xmm7
637	paddd	xmm8,xmm12
638	paddd	xmm9,xmm13
639	pxor	xmm0,xmm8
640	pxor	xmm1,xmm9
; `pshufb xmm0,xmm6` / `pshufb xmm1,xmm6` (rot24 mask)
641DB	102,15,56,0,198
642DB	102,15,56,0,206
643	paddd	xmm4,xmm0
644	paddd	xmm5,xmm1
645	pxor	xmm12,xmm4
646	pxor	xmm13,xmm5
647	movdqa	xmm7,xmm12
648	pslld	xmm12,7
649	psrld	xmm7,25
650	movdqa	xmm6,xmm13
651	pslld	xmm13,7
652	por	xmm12,xmm7
653	psrld	xmm6,25
654	movdqa	xmm7,XMMWORD[r10]
655	por	xmm13,xmm6
; Swap the in-register half of words 8..11: park 8,9; fetch 10,11.
656	movdqa	XMMWORD[rsp],xmm4
657	movdqa	XMMWORD[16+rsp],xmm5
658	movdqa	xmm4,XMMWORD[32+rsp]
659	movdqa	xmm5,XMMWORD[48+rsp]
; Column round, columns 2 and 3:
; (xmm10,xmm14,xmm4,xmm2) and (xmm11,xmm15,xmm5,xmm3).
660	paddd	xmm10,xmm14
661	paddd	xmm11,xmm15
662	pxor	xmm2,xmm10
663	pxor	xmm3,xmm11
; `pshufb xmm2,xmm7` / `pshufb xmm3,xmm7`
664DB	102,15,56,0,215
665DB	102,15,56,0,223
666	paddd	xmm4,xmm2
667	paddd	xmm5,xmm3
668	pxor	xmm14,xmm4
669	pxor	xmm15,xmm5
670	movdqa	xmm6,xmm14
671	pslld	xmm14,12
672	psrld	xmm6,20
673	movdqa	xmm7,xmm15
674	pslld	xmm15,12
675	por	xmm14,xmm6
676	psrld	xmm7,20
677	movdqa	xmm6,XMMWORD[r11]
678	por	xmm15,xmm7
679	paddd	xmm10,xmm14
680	paddd	xmm11,xmm15
681	pxor	xmm2,xmm10
682	pxor	xmm3,xmm11
; `pshufb xmm2,xmm6` / `pshufb xmm3,xmm6`
683DB	102,15,56,0,214
684DB	102,15,56,0,222
685	paddd	xmm4,xmm2
686	paddd	xmm5,xmm3
687	pxor	xmm14,xmm4
688	pxor	xmm15,xmm5
689	movdqa	xmm7,xmm14
690	pslld	xmm14,7
691	psrld	xmm7,25
692	movdqa	xmm6,xmm15
693	pslld	xmm15,7
694	por	xmm14,xmm7
695	psrld	xmm6,25
696	movdqa	xmm7,XMMWORD[r10]
697	por	xmm15,xmm6
; Diagonal round, first pair (xmm4/xmm5 still hold words 10 and 11):
; (xmm8,xmm13,xmm4,xmm3) and (xmm9,xmm14,xmm5,xmm0).
698	paddd	xmm8,xmm13
699	paddd	xmm9,xmm14
700	pxor	xmm3,xmm8
701	pxor	xmm0,xmm9
; `pshufb xmm3,xmm7` / `pshufb xmm0,xmm7`
702DB	102,15,56,0,223
703DB	102,15,56,0,199
704	paddd	xmm4,xmm3
705	paddd	xmm5,xmm0
706	pxor	xmm13,xmm4
707	pxor	xmm14,xmm5
708	movdqa	xmm6,xmm13
709	pslld	xmm13,12
710	psrld	xmm6,20
711	movdqa	xmm7,xmm14
712	pslld	xmm14,12
713	por	xmm13,xmm6
714	psrld	xmm7,20
715	movdqa	xmm6,XMMWORD[r11]
716	por	xmm14,xmm7
717	paddd	xmm8,xmm13
718	paddd	xmm9,xmm14
719	pxor	xmm3,xmm8
720	pxor	xmm0,xmm9
; `pshufb xmm3,xmm6` / `pshufb xmm0,xmm6`
721DB	102,15,56,0,222
722DB	102,15,56,0,198
723	paddd	xmm4,xmm3
724	paddd	xmm5,xmm0
725	pxor	xmm13,xmm4
726	pxor	xmm14,xmm5
727	movdqa	xmm7,xmm13
728	pslld	xmm13,7
729	psrld	xmm7,25
730	movdqa	xmm6,xmm14
731	pslld	xmm14,7
732	por	xmm13,xmm7
733	psrld	xmm6,25
734	movdqa	xmm7,XMMWORD[r10]
735	por	xmm14,xmm6
; Swap back: park words 10,11; fetch words 8,9.
736	movdqa	XMMWORD[32+rsp],xmm4
737	movdqa	XMMWORD[48+rsp],xmm5
738	movdqa	xmm4,XMMWORD[rsp]
739	movdqa	xmm5,XMMWORD[16+rsp]
; Diagonal round, second pair:
; (xmm10,xmm15,xmm4,xmm1) and (xmm11,xmm12,xmm5,xmm2).
740	paddd	xmm10,xmm15
741	paddd	xmm11,xmm12
742	pxor	xmm1,xmm10
743	pxor	xmm2,xmm11
; `pshufb xmm1,xmm7` / `pshufb xmm2,xmm7`
744DB	102,15,56,0,207
745DB	102,15,56,0,215
746	paddd	xmm4,xmm1
747	paddd	xmm5,xmm2
748	pxor	xmm15,xmm4
749	pxor	xmm12,xmm5
750	movdqa	xmm6,xmm15
751	pslld	xmm15,12
752	psrld	xmm6,20
753	movdqa	xmm7,xmm12
754	pslld	xmm12,12
755	por	xmm15,xmm6
756	psrld	xmm7,20
757	movdqa	xmm6,XMMWORD[r11]
758	por	xmm12,xmm7
759	paddd	xmm10,xmm15
760	paddd	xmm11,xmm12
761	pxor	xmm1,xmm10
762	pxor	xmm2,xmm11
; `pshufb xmm1,xmm6` / `pshufb xmm2,xmm6`
763DB	102,15,56,0,206
764DB	102,15,56,0,214
765	paddd	xmm4,xmm1
766	paddd	xmm5,xmm2
767	pxor	xmm15,xmm4
768	pxor	xmm12,xmm5
769	movdqa	xmm7,xmm15
770	pslld	xmm15,7
771	psrld	xmm7,25
772	movdqa	xmm6,xmm12
773	pslld	xmm12,7
774	por	xmm15,xmm7
775	psrld	xmm6,25
776	movdqa	xmm7,XMMWORD[r10]
777	por	xmm12,xmm6
778	dec	eax
779	jnz	NEAR $L$oop4x
780
; Feed-forward for words 0..3, then a 4x4 dword transpose so each
; register holds 16 consecutive keystream bytes of one block:
; xmm8 / xmm9 / xmm6 / xmm11 = bytes 0..15 of blocks 0 / 1 / 2 / 3.
781	paddd	xmm8,XMMWORD[64+rsp]
782	paddd	xmm9,XMMWORD[80+rsp]
783	paddd	xmm10,XMMWORD[96+rsp]
784	paddd	xmm11,XMMWORD[112+rsp]
785
786	movdqa	xmm6,xmm8
787	punpckldq	xmm8,xmm9
788	movdqa	xmm7,xmm10
789	punpckldq	xmm10,xmm11
790	punpckhdq	xmm6,xmm9
791	punpckhdq	xmm7,xmm11
792	movdqa	xmm9,xmm8
793	punpcklqdq	xmm8,xmm10
794	movdqa	xmm11,xmm6
795	punpcklqdq	xmm6,xmm7
796	punpckhqdq	xmm9,xmm10
797	punpckhqdq	xmm11,xmm7
; Feed-forward for words 4..7.
798	paddd	xmm12,XMMWORD[((128-256))+rcx]
799	paddd	xmm13,XMMWORD[((144-256))+rcx]
800	paddd	xmm14,XMMWORD[((160-256))+rcx]
801	paddd	xmm15,XMMWORD[((176-256))+rcx]
802
; Park the block 0/1 pieces of words 0..3 and pull words 10,11 out of
; their spill slots before those slots are reused.
803	movdqa	XMMWORD[rsp],xmm8
804	movdqa	XMMWORD[16+rsp],xmm9
805	movdqa	xmm8,XMMWORD[32+rsp]
806	movdqa	xmm9,XMMWORD[48+rsp]
807
; Transpose words 4..7:
; xmm12 / xmm13 / xmm10 / xmm15 = bytes 16..31 of blocks 0 / 1 / 2 / 3.
808	movdqa	xmm10,xmm12
809	punpckldq	xmm12,xmm13
810	movdqa	xmm7,xmm14
811	punpckldq	xmm14,xmm15
812	punpckhdq	xmm10,xmm13
813	punpckhdq	xmm7,xmm15
814	movdqa	xmm13,xmm12
815	punpcklqdq	xmm12,xmm14
816	movdqa	xmm15,xmm10
817	punpcklqdq	xmm10,xmm7
818	punpckhqdq	xmm13,xmm14
819	punpckhqdq	xmm15,xmm7
; Feed-forward for words 8..11.
820	paddd	xmm4,XMMWORD[((192-256))+rcx]
821	paddd	xmm5,XMMWORD[((208-256))+rcx]
822	paddd	xmm8,XMMWORD[((224-256))+rcx]
823	paddd	xmm9,XMMWORD[((240-256))+rcx]
824
; Park the block 2/3 pieces of words 0..3.
825	movdqa	XMMWORD[32+rsp],xmm6
826	movdqa	XMMWORD[48+rsp],xmm11
827
; Transpose words 8..11:
; xmm4 / xmm5 / xmm14 / xmm9 = bytes 32..47 of blocks 0 / 1 / 2 / 3.
828	movdqa	xmm14,xmm4
829	punpckldq	xmm4,xmm5
830	movdqa	xmm7,xmm8
831	punpckldq	xmm8,xmm9
832	punpckhdq	xmm14,xmm5
833	punpckhdq	xmm7,xmm9
834	movdqa	xmm5,xmm4
835	punpcklqdq	xmm4,xmm8
836	movdqa	xmm9,xmm14
837	punpcklqdq	xmm14,xmm7
838	punpckhqdq	xmm5,xmm8
839	punpckhqdq	xmm9,xmm7
; Feed-forward for words 12..15 (word 12 uses this iteration's lane
; counters stored at $L$oop_enter4x).
840	paddd	xmm0,XMMWORD[((256-256))+rcx]
841	paddd	xmm1,XMMWORD[((272-256))+rcx]
842	paddd	xmm2,XMMWORD[((288-256))+rcx]
843	paddd	xmm3,XMMWORD[((304-256))+rcx]
844
; Transpose words 12..15:
; xmm0 / xmm1 / xmm8 / xmm3 = bytes 48..63 of blocks 0 / 1 / 2 / 3.
845	movdqa	xmm8,xmm0
846	punpckldq	xmm0,xmm1
847	movdqa	xmm7,xmm2
848	punpckldq	xmm2,xmm3
849	punpckhdq	xmm8,xmm1
850	punpckhdq	xmm7,xmm3
851	movdqa	xmm1,xmm0
852	punpcklqdq	xmm0,xmm2
853	movdqa	xmm3,xmm8
854	punpcklqdq	xmm8,xmm7
855	punpckhqdq	xmm1,xmm2
856	punpckhqdq	xmm3,xmm7
; Fewer than four full blocks left: handle the remainder separately.
857	cmp	rdx,64*4
858	jb	NEAR $L$tail4x
859
; Full 256 bytes: XOR block 0 with the input, loads for the next block
; interleaved with the stores of the previous one.
860	movdqu	xmm6,XMMWORD[rsi]
861	movdqu	xmm11,XMMWORD[16+rsi]
862	movdqu	xmm2,XMMWORD[32+rsi]
863	movdqu	xmm7,XMMWORD[48+rsi]
864	pxor	xmm6,XMMWORD[rsp]
865	pxor	xmm11,xmm12
866	pxor	xmm2,xmm4
867	pxor	xmm7,xmm0
868
; Store block 0, XOR block 1.
869	movdqu	XMMWORD[rdi],xmm6
870	movdqu	xmm6,XMMWORD[64+rsi]
871	movdqu	XMMWORD[16+rdi],xmm11
872	movdqu	xmm11,XMMWORD[80+rsi]
873	movdqu	XMMWORD[32+rdi],xmm2
874	movdqu	xmm2,XMMWORD[96+rsi]
875	movdqu	XMMWORD[48+rdi],xmm7
876	movdqu	xmm7,XMMWORD[112+rsi]
877	lea	rsi,[128+rsi]
878	pxor	xmm6,XMMWORD[16+rsp]
879	pxor	xmm11,xmm13
880	pxor	xmm2,xmm5
881	pxor	xmm7,xmm1
882
; Store block 1, XOR block 2.
883	movdqu	XMMWORD[64+rdi],xmm6
884	movdqu	xmm6,XMMWORD[rsi]
885	movdqu	XMMWORD[80+rdi],xmm11
886	movdqu	xmm11,XMMWORD[16+rsi]
887	movdqu	XMMWORD[96+rdi],xmm2
888	movdqu	xmm2,XMMWORD[32+rsi]
889	movdqu	XMMWORD[112+rdi],xmm7
890	lea	rdi,[128+rdi]
891	movdqu	xmm7,XMMWORD[48+rsi]
892	pxor	xmm6,XMMWORD[32+rsp]
893	pxor	xmm11,xmm10
894	pxor	xmm2,xmm14
895	pxor	xmm7,xmm8
896
; Store block 2, XOR and store block 3.
897	movdqu	XMMWORD[rdi],xmm6
898	movdqu	xmm6,XMMWORD[64+rsi]
899	movdqu	XMMWORD[16+rdi],xmm11
900	movdqu	xmm11,XMMWORD[80+rsi]
901	movdqu	XMMWORD[32+rdi],xmm2
902	movdqu	xmm2,XMMWORD[96+rsi]
903	movdqu	XMMWORD[48+rdi],xmm7
904	movdqu	xmm7,XMMWORD[112+rsi]
905	lea	rsi,[128+rsi]
906	pxor	xmm6,XMMWORD[48+rsp]
907	pxor	xmm11,xmm15
908	pxor	xmm2,xmm9
909	pxor	xmm7,xmm3
910	movdqu	XMMWORD[64+rdi],xmm6
911	movdqu	XMMWORD[80+rdi],xmm11
912	movdqu	XMMWORD[96+rdi],xmm2
913	movdqu	XMMWORD[112+rdi],xmm7
914	lea	rdi,[128+rdi]
915
916	sub	rdx,64*4
917	jnz	NEAR $L$oop_outer4x
918
919	jmp	NEAR $L$done4x
920
; Remainder of 1..255 bytes: process as many whole blocks as fit, then
; leave the next keystream block at [rsp..rsp+63] for the byte loop.
921$L$tail4x:
922	cmp	rdx,192
923	jae	NEAR $L$192_or_more4x
924	cmp	rdx,128
925	jae	NEAR $L$128_or_more4x
926	cmp	rdx,64
927	jae	NEAR $L$64_or_more4x
928
929
; Under 64 bytes: block 0's first 16 bytes are already at [rsp]; add the
; other three quarters. r10 becomes the byte index.
930	xor	r10,r10
931
932	movdqa	XMMWORD[16+rsp],xmm12
933	movdqa	XMMWORD[32+rsp],xmm4
934	movdqa	XMMWORD[48+rsp],xmm0
935	jmp	NEAR $L$oop_tail4x
936
937ALIGN	32
; 64..127 bytes: emit block 0, then stage block 1 for the byte loop.
938$L$64_or_more4x:
939	movdqu	xmm6,XMMWORD[rsi]
940	movdqu	xmm11,XMMWORD[16+rsi]
941	movdqu	xmm2,XMMWORD[32+rsi]
942	movdqu	xmm7,XMMWORD[48+rsi]
943	pxor	xmm6,XMMWORD[rsp]
944	pxor	xmm11,xmm12
945	pxor	xmm2,xmm4
946	pxor	xmm7,xmm0
947	movdqu	XMMWORD[rdi],xmm6
948	movdqu	XMMWORD[16+rdi],xmm11
949	movdqu	XMMWORD[32+rdi],xmm2
950	movdqu	XMMWORD[48+rdi],xmm7
; ZF still comes from `cmp rdx,64` in $L$tail4x -- the SSE moves and
; pxor in between do not modify EFLAGS. Exactly 64 bytes: done.
951	je	NEAR $L$done4x
952
953	movdqa	xmm6,XMMWORD[16+rsp]
954	lea	rsi,[64+rsi]
955	xor	r10,r10
956	movdqa	XMMWORD[rsp],xmm6
957	movdqa	XMMWORD[16+rsp],xmm13
958	lea	rdi,[64+rdi]
959	movdqa	XMMWORD[32+rsp],xmm5
960	sub	rdx,64
961	movdqa	XMMWORD[48+rsp],xmm1
962	jmp	NEAR $L$oop_tail4x
963
964ALIGN	32
; 128..191 bytes: emit blocks 0 and 1, then stage block 2.
965$L$128_or_more4x:
966	movdqu	xmm6,XMMWORD[rsi]
967	movdqu	xmm11,XMMWORD[16+rsi]
968	movdqu	xmm2,XMMWORD[32+rsi]
969	movdqu	xmm7,XMMWORD[48+rsi]
970	pxor	xmm6,XMMWORD[rsp]
971	pxor	xmm11,xmm12
972	pxor	xmm2,xmm4
973	pxor	xmm7,xmm0
974
975	movdqu	XMMWORD[rdi],xmm6
976	movdqu	xmm6,XMMWORD[64+rsi]
977	movdqu	XMMWORD[16+rdi],xmm11
978	movdqu	xmm11,XMMWORD[80+rsi]
979	movdqu	XMMWORD[32+rdi],xmm2
980	movdqu	xmm2,XMMWORD[96+rsi]
981	movdqu	XMMWORD[48+rdi],xmm7
982	movdqu	xmm7,XMMWORD[112+rsi]
983	pxor	xmm6,XMMWORD[16+rsp]
984	pxor	xmm11,xmm13
985	pxor	xmm2,xmm5
986	pxor	xmm7,xmm1
987	movdqu	XMMWORD[64+rdi],xmm6
988	movdqu	XMMWORD[80+rdi],xmm11
989	movdqu	XMMWORD[96+rdi],xmm2
990	movdqu	XMMWORD[112+rdi],xmm7
; ZF from `cmp rdx,128` in $L$tail4x; exactly 128 bytes: done.
991	je	NEAR $L$done4x
992
993	movdqa	xmm6,XMMWORD[32+rsp]
994	lea	rsi,[128+rsi]
995	xor	r10,r10
996	movdqa	XMMWORD[rsp],xmm6
997	movdqa	XMMWORD[16+rsp],xmm10
998	lea	rdi,[128+rdi]
999	movdqa	XMMWORD[32+rsp],xmm14
1000	sub	rdx,128
1001	movdqa	XMMWORD[48+rsp],xmm8
1002	jmp	NEAR $L$oop_tail4x
1003
1004ALIGN	32
; 192..255 bytes: emit blocks 0, 1 and 2, then stage block 3.
1005$L$192_or_more4x:
1006	movdqu	xmm6,XMMWORD[rsi]
1007	movdqu	xmm11,XMMWORD[16+rsi]
1008	movdqu	xmm2,XMMWORD[32+rsi]
1009	movdqu	xmm7,XMMWORD[48+rsi]
1010	pxor	xmm6,XMMWORD[rsp]
1011	pxor	xmm11,xmm12
1012	pxor	xmm2,xmm4
1013	pxor	xmm7,xmm0
1014
1015	movdqu	XMMWORD[rdi],xmm6
1016	movdqu	xmm6,XMMWORD[64+rsi]
1017	movdqu	XMMWORD[16+rdi],xmm11
1018	movdqu	xmm11,XMMWORD[80+rsi]
1019	movdqu	XMMWORD[32+rdi],xmm2
1020	movdqu	xmm2,XMMWORD[96+rsi]
1021	movdqu	XMMWORD[48+rdi],xmm7
1022	movdqu	xmm7,XMMWORD[112+rsi]
1023	lea	rsi,[128+rsi]
1024	pxor	xmm6,XMMWORD[16+rsp]
1025	pxor	xmm11,xmm13
1026	pxor	xmm2,xmm5
1027	pxor	xmm7,xmm1
1028
1029	movdqu	XMMWORD[64+rdi],xmm6
1030	movdqu	xmm6,XMMWORD[rsi]
1031	movdqu	XMMWORD[80+rdi],xmm11
1032	movdqu	xmm11,XMMWORD[16+rsi]
1033	movdqu	XMMWORD[96+rdi],xmm2
1034	movdqu	xmm2,XMMWORD[32+rsi]
1035	movdqu	XMMWORD[112+rdi],xmm7
1036	lea	rdi,[128+rdi]
1037	movdqu	xmm7,XMMWORD[48+rsi]
1038	pxor	xmm6,XMMWORD[32+rsp]
1039	pxor	xmm11,xmm10
1040	pxor	xmm2,xmm14
1041	pxor	xmm7,xmm8
1042	movdqu	XMMWORD[rdi],xmm6
1043	movdqu	XMMWORD[16+rdi],xmm11
1044	movdqu	XMMWORD[32+rdi],xmm2
1045	movdqu	XMMWORD[48+rdi],xmm7
; ZF from `cmp rdx,192` in $L$tail4x (lea does not touch flags either);
; exactly 192 bytes: done.
1046	je	NEAR $L$done4x
1047
1048	movdqa	xmm6,XMMWORD[48+rsp]
1049	lea	rsi,[64+rsi]
1050	xor	r10,r10
1051	movdqa	XMMWORD[rsp],xmm6
1052	movdqa	XMMWORD[16+rsp],xmm15
1053	lea	rdi,[64+rdi]
1054	movdqa	XMMWORD[32+rsp],xmm9
1055	sub	rdx,192
1056	movdqa	XMMWORD[48+rsp],xmm3
1057
; Byte loop: dst[i] = src[i] ^ keystream[i] with the keystream block at
; [rsp]; the store uses -1+r10 because r10 was already advanced.
1058$L$oop_tail4x:
1059	movzx	eax,BYTE[r10*1+rsi]
1060	movzx	ecx,BYTE[r10*1+rsp]
1061	lea	r10,[1+r10]
1062	xor	eax,ecx
1063	mov	BYTE[((-1))+r10*1+rdi],al
1064	dec	rdx
1065	jnz	NEAR $L$oop_tail4x
1066
; Restore xmm6..xmm15 and the entry stack pointer kept in r9.
1067$L$done4x:
1068	movaps	xmm6,XMMWORD[((-168))+r9]
1069	movaps	xmm7,XMMWORD[((-152))+r9]
1070	movaps	xmm8,XMMWORD[((-136))+r9]
1071	movaps	xmm9,XMMWORD[((-120))+r9]
1072	movaps	xmm10,XMMWORD[((-104))+r9]
1073	movaps	xmm11,XMMWORD[((-88))+r9]
1074	movaps	xmm12,XMMWORD[((-72))+r9]
1075	movaps	xmm13,XMMWORD[((-56))+r9]
1076	movaps	xmm14,XMMWORD[((-40))+r9]
1077	movaps	xmm15,XMMWORD[((-24))+r9]
1078	lea	rsp,[r9]
1079
1080$L$4x_epilogue:
1081	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1082	mov	rsi,QWORD[16+rsp]
1083	DB	0F3h,0C3h		;repret
1084
1085$L$SEH_end_ChaCha20_4x:
1086
1087ALIGN	32
1088ChaCha20_8x:
1089	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1090	mov	QWORD[16+rsp],rsi
1091	mov	rax,rsp
1092$L$SEH_begin_ChaCha20_8x:
1093	mov	rdi,rcx
1094	mov	rsi,rdx
1095	mov	rdx,r8
1096	mov	rcx,r9
1097	mov	r8,QWORD[40+rsp]
1098
1099
1100$L$ChaCha20_8x:
1101
1102	mov	r9,rsp
1103
1104	sub	rsp,0x280+168
1105	and	rsp,-32
1106	movaps	XMMWORD[(-168)+r9],xmm6
1107	movaps	XMMWORD[(-152)+r9],xmm7
1108	movaps	XMMWORD[(-136)+r9],xmm8
1109	movaps	XMMWORD[(-120)+r9],xmm9
1110	movaps	XMMWORD[(-104)+r9],xmm10
1111	movaps	XMMWORD[(-88)+r9],xmm11
1112	movaps	XMMWORD[(-72)+r9],xmm12
1113	movaps	XMMWORD[(-56)+r9],xmm13
1114	movaps	XMMWORD[(-40)+r9],xmm14
1115	movaps	XMMWORD[(-24)+r9],xmm15
1116$L$8x_body:
1117	vzeroupper
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128	vbroadcasti128	ymm11,XMMWORD[$L$sigma]
1129	vbroadcasti128	ymm3,XMMWORD[rcx]
1130	vbroadcasti128	ymm15,XMMWORD[16+rcx]
1131	vbroadcasti128	ymm7,XMMWORD[r8]
1132	lea	rcx,[256+rsp]
1133	lea	rax,[512+rsp]
1134	lea	r10,[$L$rot16]
1135	lea	r11,[$L$rot24]
1136
1137	vpshufd	ymm8,ymm11,0x00
1138	vpshufd	ymm9,ymm11,0x55
1139	vmovdqa	YMMWORD[(128-256)+rcx],ymm8
1140	vpshufd	ymm10,ymm11,0xaa
1141	vmovdqa	YMMWORD[(160-256)+rcx],ymm9
1142	vpshufd	ymm11,ymm11,0xff
1143	vmovdqa	YMMWORD[(192-256)+rcx],ymm10
1144	vmovdqa	YMMWORD[(224-256)+rcx],ymm11
1145
1146	vpshufd	ymm0,ymm3,0x00
1147	vpshufd	ymm1,ymm3,0x55
1148	vmovdqa	YMMWORD[(256-256)+rcx],ymm0
1149	vpshufd	ymm2,ymm3,0xaa
1150	vmovdqa	YMMWORD[(288-256)+rcx],ymm1
1151	vpshufd	ymm3,ymm3,0xff
1152	vmovdqa	YMMWORD[(320-256)+rcx],ymm2
1153	vmovdqa	YMMWORD[(352-256)+rcx],ymm3
1154
1155	vpshufd	ymm12,ymm15,0x00
1156	vpshufd	ymm13,ymm15,0x55
1157	vmovdqa	YMMWORD[(384-512)+rax],ymm12
1158	vpshufd	ymm14,ymm15,0xaa
1159	vmovdqa	YMMWORD[(416-512)+rax],ymm13
1160	vpshufd	ymm15,ymm15,0xff
1161	vmovdqa	YMMWORD[(448-512)+rax],ymm14
1162	vmovdqa	YMMWORD[(480-512)+rax],ymm15
1163
1164	vpshufd	ymm4,ymm7,0x00
1165	vpshufd	ymm5,ymm7,0x55
1166	vpaddd	ymm4,ymm4,YMMWORD[$L$incy]
1167	vpshufd	ymm6,ymm7,0xaa
1168	vmovdqa	YMMWORD[(544-512)+rax],ymm5
1169	vpshufd	ymm7,ymm7,0xff
1170	vmovdqa	YMMWORD[(576-512)+rax],ymm6
1171	vmovdqa	YMMWORD[(608-512)+rax],ymm7
1172
1173	jmp	NEAR $L$oop_enter8x
1174
1175ALIGN	32
1176$L$oop_outer8x:
1177	vmovdqa	ymm8,YMMWORD[((128-256))+rcx]
1178	vmovdqa	ymm9,YMMWORD[((160-256))+rcx]
1179	vmovdqa	ymm10,YMMWORD[((192-256))+rcx]
1180	vmovdqa	ymm11,YMMWORD[((224-256))+rcx]
1181	vmovdqa	ymm0,YMMWORD[((256-256))+rcx]
1182	vmovdqa	ymm1,YMMWORD[((288-256))+rcx]
1183	vmovdqa	ymm2,YMMWORD[((320-256))+rcx]
1184	vmovdqa	ymm3,YMMWORD[((352-256))+rcx]
1185	vmovdqa	ymm12,YMMWORD[((384-512))+rax]
1186	vmovdqa	ymm13,YMMWORD[((416-512))+rax]
1187	vmovdqa	ymm14,YMMWORD[((448-512))+rax]
1188	vmovdqa	ymm15,YMMWORD[((480-512))+rax]
1189	vmovdqa	ymm4,YMMWORD[((512-512))+rax]
1190	vmovdqa	ymm5,YMMWORD[((544-512))+rax]
1191	vmovdqa	ymm6,YMMWORD[((576-512))+rax]
1192	vmovdqa	ymm7,YMMWORD[((608-512))+rax]
1193	vpaddd	ymm4,ymm4,YMMWORD[$L$eight]
1194
1195$L$oop_enter8x:
1196	vmovdqa	YMMWORD[64+rsp],ymm14
1197	vmovdqa	YMMWORD[96+rsp],ymm15
1198	vbroadcasti128	ymm15,XMMWORD[r10]
1199	vmovdqa	YMMWORD[(512-512)+rax],ymm4
1200	mov	eax,10
1201	jmp	NEAR $L$oop8x
1202
1203ALIGN	32
1204$L$oop8x:
1205	vpaddd	ymm8,ymm8,ymm0
1206	vpxor	ymm4,ymm8,ymm4
1207	vpshufb	ymm4,ymm4,ymm15
1208	vpaddd	ymm9,ymm9,ymm1
1209	vpxor	ymm5,ymm9,ymm5
1210	vpshufb	ymm5,ymm5,ymm15
1211	vpaddd	ymm12,ymm12,ymm4
1212	vpxor	ymm0,ymm12,ymm0
1213	vpslld	ymm14,ymm0,12
1214	vpsrld	ymm0,ymm0,20
1215	vpor	ymm0,ymm14,ymm0
1216	vbroadcasti128	ymm14,XMMWORD[r11]
1217	vpaddd	ymm13,ymm13,ymm5
1218	vpxor	ymm1,ymm13,ymm1
1219	vpslld	ymm15,ymm1,12
1220	vpsrld	ymm1,ymm1,20
1221	vpor	ymm1,ymm15,ymm1
1222	vpaddd	ymm8,ymm8,ymm0
1223	vpxor	ymm4,ymm8,ymm4
1224	vpshufb	ymm4,ymm4,ymm14
1225	vpaddd	ymm9,ymm9,ymm1
1226	vpxor	ymm5,ymm9,ymm5
1227	vpshufb	ymm5,ymm5,ymm14
1228	vpaddd	ymm12,ymm12,ymm4
1229	vpxor	ymm0,ymm12,ymm0
1230	vpslld	ymm15,ymm0,7
1231	vpsrld	ymm0,ymm0,25
1232	vpor	ymm0,ymm15,ymm0
1233	vbroadcasti128	ymm15,XMMWORD[r10]
1234	vpaddd	ymm13,ymm13,ymm5
1235	vpxor	ymm1,ymm13,ymm1
1236	vpslld	ymm14,ymm1,7
1237	vpsrld	ymm1,ymm1,25
1238	vpor	ymm1,ymm14,ymm1
1239	vmovdqa	YMMWORD[rsp],ymm12
1240	vmovdqa	YMMWORD[32+rsp],ymm13
1241	vmovdqa	ymm12,YMMWORD[64+rsp]
1242	vmovdqa	ymm13,YMMWORD[96+rsp]
1243	vpaddd	ymm10,ymm10,ymm2
1244	vpxor	ymm6,ymm10,ymm6
1245	vpshufb	ymm6,ymm6,ymm15
1246	vpaddd	ymm11,ymm11,ymm3
1247	vpxor	ymm7,ymm11,ymm7
1248	vpshufb	ymm7,ymm7,ymm15
1249	vpaddd	ymm12,ymm12,ymm6
1250	vpxor	ymm2,ymm12,ymm2
1251	vpslld	ymm14,ymm2,12
1252	vpsrld	ymm2,ymm2,20
1253	vpor	ymm2,ymm14,ymm2
1254	vbroadcasti128	ymm14,XMMWORD[r11]
1255	vpaddd	ymm13,ymm13,ymm7
1256	vpxor	ymm3,ymm13,ymm3
1257	vpslld	ymm15,ymm3,12
1258	vpsrld	ymm3,ymm3,20
1259	vpor	ymm3,ymm15,ymm3
1260	vpaddd	ymm10,ymm10,ymm2
1261	vpxor	ymm6,ymm10,ymm6
1262	vpshufb	ymm6,ymm6,ymm14
1263	vpaddd	ymm11,ymm11,ymm3
1264	vpxor	ymm7,ymm11,ymm7
1265	vpshufb	ymm7,ymm7,ymm14
1266	vpaddd	ymm12,ymm12,ymm6
1267	vpxor	ymm2,ymm12,ymm2
1268	vpslld	ymm15,ymm2,7
1269	vpsrld	ymm2,ymm2,25
1270	vpor	ymm2,ymm15,ymm2
1271	vbroadcasti128	ymm15,XMMWORD[r10]
1272	vpaddd	ymm13,ymm13,ymm7
1273	vpxor	ymm3,ymm13,ymm3
1274	vpslld	ymm14,ymm3,7
1275	vpsrld	ymm3,ymm3,25
1276	vpor	ymm3,ymm14,ymm3
1277	vpaddd	ymm8,ymm8,ymm1
1278	vpxor	ymm7,ymm8,ymm7
1279	vpshufb	ymm7,ymm7,ymm15
1280	vpaddd	ymm9,ymm9,ymm2
1281	vpxor	ymm4,ymm9,ymm4
1282	vpshufb	ymm4,ymm4,ymm15
1283	vpaddd	ymm12,ymm12,ymm7
1284	vpxor	ymm1,ymm12,ymm1
1285	vpslld	ymm14,ymm1,12
1286	vpsrld	ymm1,ymm1,20
1287	vpor	ymm1,ymm14,ymm1
1288	vbroadcasti128	ymm14,XMMWORD[r11]
1289	vpaddd	ymm13,ymm13,ymm4
1290	vpxor	ymm2,ymm13,ymm2
1291	vpslld	ymm15,ymm2,12
1292	vpsrld	ymm2,ymm2,20
1293	vpor	ymm2,ymm15,ymm2
1294	vpaddd	ymm8,ymm8,ymm1
1295	vpxor	ymm7,ymm8,ymm7
1296	vpshufb	ymm7,ymm7,ymm14
1297	vpaddd	ymm9,ymm9,ymm2
1298	vpxor	ymm4,ymm9,ymm4
1299	vpshufb	ymm4,ymm4,ymm14
1300	vpaddd	ymm12,ymm12,ymm7
1301	vpxor	ymm1,ymm12,ymm1
1302	vpslld	ymm15,ymm1,7
1303	vpsrld	ymm1,ymm1,25
1304	vpor	ymm1,ymm15,ymm1
1305	vbroadcasti128	ymm15,XMMWORD[r10]
1306	vpaddd	ymm13,ymm13,ymm4
1307	vpxor	ymm2,ymm13,ymm2
1308	vpslld	ymm14,ymm2,7
1309	vpsrld	ymm2,ymm2,25
1310	vpor	ymm2,ymm14,ymm2
1311	vmovdqa	YMMWORD[64+rsp],ymm12
1312	vmovdqa	YMMWORD[96+rsp],ymm13
1313	vmovdqa	ymm12,YMMWORD[rsp]
1314	vmovdqa	ymm13,YMMWORD[32+rsp]
1315	vpaddd	ymm10,ymm10,ymm3
1316	vpxor	ymm5,ymm10,ymm5
1317	vpshufb	ymm5,ymm5,ymm15
1318	vpaddd	ymm11,ymm11,ymm0
1319	vpxor	ymm6,ymm11,ymm6
1320	vpshufb	ymm6,ymm6,ymm15
1321	vpaddd	ymm12,ymm12,ymm5
1322	vpxor	ymm3,ymm12,ymm3
1323	vpslld	ymm14,ymm3,12
1324	vpsrld	ymm3,ymm3,20
1325	vpor	ymm3,ymm14,ymm3
1326	vbroadcasti128	ymm14,XMMWORD[r11]
1327	vpaddd	ymm13,ymm13,ymm6
1328	vpxor	ymm0,ymm13,ymm0
1329	vpslld	ymm15,ymm0,12
1330	vpsrld	ymm0,ymm0,20
1331	vpor	ymm0,ymm15,ymm0
1332	vpaddd	ymm10,ymm10,ymm3
1333	vpxor	ymm5,ymm10,ymm5
1334	vpshufb	ymm5,ymm5,ymm14
1335	vpaddd	ymm11,ymm11,ymm0
1336	vpxor	ymm6,ymm11,ymm6
1337	vpshufb	ymm6,ymm6,ymm14
1338	vpaddd	ymm12,ymm12,ymm5
1339	vpxor	ymm3,ymm12,ymm3
1340	vpslld	ymm15,ymm3,7
1341	vpsrld	ymm3,ymm3,25
1342	vpor	ymm3,ymm15,ymm3
1343	vbroadcasti128	ymm15,XMMWORD[r10]
1344	vpaddd	ymm13,ymm13,ymm6
1345	vpxor	ymm0,ymm13,ymm0
1346	vpslld	ymm14,ymm0,7
1347	vpsrld	ymm0,ymm0,25
1348	vpor	ymm0,ymm14,ymm0
1349	dec	eax
1350	jnz	NEAR $L$oop8x
1351
1352	lea	rax,[512+rsp]
1353	vpaddd	ymm8,ymm8,YMMWORD[((128-256))+rcx]
1354	vpaddd	ymm9,ymm9,YMMWORD[((160-256))+rcx]
1355	vpaddd	ymm10,ymm10,YMMWORD[((192-256))+rcx]
1356	vpaddd	ymm11,ymm11,YMMWORD[((224-256))+rcx]
1357
1358	vpunpckldq	ymm14,ymm8,ymm9
1359	vpunpckldq	ymm15,ymm10,ymm11
1360	vpunpckhdq	ymm8,ymm8,ymm9
1361	vpunpckhdq	ymm10,ymm10,ymm11
1362	vpunpcklqdq	ymm9,ymm14,ymm15
1363	vpunpckhqdq	ymm14,ymm14,ymm15
1364	vpunpcklqdq	ymm11,ymm8,ymm10
1365	vpunpckhqdq	ymm8,ymm8,ymm10
1366	vpaddd	ymm0,ymm0,YMMWORD[((256-256))+rcx]
1367	vpaddd	ymm1,ymm1,YMMWORD[((288-256))+rcx]
1368	vpaddd	ymm2,ymm2,YMMWORD[((320-256))+rcx]
1369	vpaddd	ymm3,ymm3,YMMWORD[((352-256))+rcx]
1370
1371	vpunpckldq	ymm10,ymm0,ymm1
1372	vpunpckldq	ymm15,ymm2,ymm3
1373	vpunpckhdq	ymm0,ymm0,ymm1
1374	vpunpckhdq	ymm2,ymm2,ymm3
1375	vpunpcklqdq	ymm1,ymm10,ymm15
1376	vpunpckhqdq	ymm10,ymm10,ymm15
1377	vpunpcklqdq	ymm3,ymm0,ymm2
1378	vpunpckhqdq	ymm0,ymm0,ymm2
1379	vperm2i128	ymm15,ymm9,ymm1,0x20
1380	vperm2i128	ymm1,ymm9,ymm1,0x31
1381	vperm2i128	ymm9,ymm14,ymm10,0x20
1382	vperm2i128	ymm10,ymm14,ymm10,0x31
1383	vperm2i128	ymm14,ymm11,ymm3,0x20
1384	vperm2i128	ymm3,ymm11,ymm3,0x31
1385	vperm2i128	ymm11,ymm8,ymm0,0x20
1386	vperm2i128	ymm0,ymm8,ymm0,0x31
1387	vmovdqa	YMMWORD[rsp],ymm15
1388	vmovdqa	YMMWORD[32+rsp],ymm9
1389	vmovdqa	ymm15,YMMWORD[64+rsp]
1390	vmovdqa	ymm9,YMMWORD[96+rsp]
1391
1392	vpaddd	ymm12,ymm12,YMMWORD[((384-512))+rax]
1393	vpaddd	ymm13,ymm13,YMMWORD[((416-512))+rax]
1394	vpaddd	ymm15,ymm15,YMMWORD[((448-512))+rax]
1395	vpaddd	ymm9,ymm9,YMMWORD[((480-512))+rax]
1396
1397	vpunpckldq	ymm2,ymm12,ymm13
1398	vpunpckldq	ymm8,ymm15,ymm9
1399	vpunpckhdq	ymm12,ymm12,ymm13
1400	vpunpckhdq	ymm15,ymm15,ymm9
1401	vpunpcklqdq	ymm13,ymm2,ymm8
1402	vpunpckhqdq	ymm2,ymm2,ymm8
1403	vpunpcklqdq	ymm9,ymm12,ymm15
1404	vpunpckhqdq	ymm12,ymm12,ymm15
1405	vpaddd	ymm4,ymm4,YMMWORD[((512-512))+rax]
1406	vpaddd	ymm5,ymm5,YMMWORD[((544-512))+rax]
1407	vpaddd	ymm6,ymm6,YMMWORD[((576-512))+rax]
1408	vpaddd	ymm7,ymm7,YMMWORD[((608-512))+rax]
1409
1410	vpunpckldq	ymm15,ymm4,ymm5
1411	vpunpckldq	ymm8,ymm6,ymm7
1412	vpunpckhdq	ymm4,ymm4,ymm5
1413	vpunpckhdq	ymm6,ymm6,ymm7
1414	vpunpcklqdq	ymm5,ymm15,ymm8
1415	vpunpckhqdq	ymm15,ymm15,ymm8
1416	vpunpcklqdq	ymm7,ymm4,ymm6
1417	vpunpckhqdq	ymm4,ymm4,ymm6
1418	vperm2i128	ymm8,ymm13,ymm5,0x20
1419	vperm2i128	ymm5,ymm13,ymm5,0x31
1420	vperm2i128	ymm13,ymm2,ymm15,0x20
1421	vperm2i128	ymm15,ymm2,ymm15,0x31
1422	vperm2i128	ymm2,ymm9,ymm7,0x20
1423	vperm2i128	ymm7,ymm9,ymm7,0x31
1424	vperm2i128	ymm9,ymm12,ymm4,0x20
1425	vperm2i128	ymm4,ymm12,ymm4,0x31
1426	vmovdqa	ymm6,YMMWORD[rsp]
1427	vmovdqa	ymm12,YMMWORD[32+rsp]
1428
1429	cmp	rdx,64*8
1430	jb	NEAR $L$tail8x
1431
1432	vpxor	ymm6,ymm6,YMMWORD[rsi]
1433	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1434	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1435	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1436	lea	rsi,[128+rsi]
1437	vmovdqu	YMMWORD[rdi],ymm6
1438	vmovdqu	YMMWORD[32+rdi],ymm8
1439	vmovdqu	YMMWORD[64+rdi],ymm1
1440	vmovdqu	YMMWORD[96+rdi],ymm5
1441	lea	rdi,[128+rdi]
1442
1443	vpxor	ymm12,ymm12,YMMWORD[rsi]
1444	vpxor	ymm13,ymm13,YMMWORD[32+rsi]
1445	vpxor	ymm10,ymm10,YMMWORD[64+rsi]
1446	vpxor	ymm15,ymm15,YMMWORD[96+rsi]
1447	lea	rsi,[128+rsi]
1448	vmovdqu	YMMWORD[rdi],ymm12
1449	vmovdqu	YMMWORD[32+rdi],ymm13
1450	vmovdqu	YMMWORD[64+rdi],ymm10
1451	vmovdqu	YMMWORD[96+rdi],ymm15
1452	lea	rdi,[128+rdi]
1453
1454	vpxor	ymm14,ymm14,YMMWORD[rsi]
1455	vpxor	ymm2,ymm2,YMMWORD[32+rsi]
1456	vpxor	ymm3,ymm3,YMMWORD[64+rsi]
1457	vpxor	ymm7,ymm7,YMMWORD[96+rsi]
1458	lea	rsi,[128+rsi]
1459	vmovdqu	YMMWORD[rdi],ymm14
1460	vmovdqu	YMMWORD[32+rdi],ymm2
1461	vmovdqu	YMMWORD[64+rdi],ymm3
1462	vmovdqu	YMMWORD[96+rdi],ymm7
1463	lea	rdi,[128+rdi]
1464
1465	vpxor	ymm11,ymm11,YMMWORD[rsi]
1466	vpxor	ymm9,ymm9,YMMWORD[32+rsi]
1467	vpxor	ymm0,ymm0,YMMWORD[64+rsi]
1468	vpxor	ymm4,ymm4,YMMWORD[96+rsi]
1469	lea	rsi,[128+rsi]
1470	vmovdqu	YMMWORD[rdi],ymm11
1471	vmovdqu	YMMWORD[32+rdi],ymm9
1472	vmovdqu	YMMWORD[64+rdi],ymm0
1473	vmovdqu	YMMWORD[96+rdi],ymm4
1474	lea	rdi,[128+rdi]
1475
1476	sub	rdx,64*8
1477	jnz	NEAR $L$oop_outer8x
1478
1479	jmp	NEAR $L$done8x
1480
1481$L$tail8x:
1482	cmp	rdx,448
1483	jae	NEAR $L$448_or_more8x
1484	cmp	rdx,384
1485	jae	NEAR $L$384_or_more8x
1486	cmp	rdx,320
1487	jae	NEAR $L$320_or_more8x
1488	cmp	rdx,256
1489	jae	NEAR $L$256_or_more8x
1490	cmp	rdx,192
1491	jae	NEAR $L$192_or_more8x
1492	cmp	rdx,128
1493	jae	NEAR $L$128_or_more8x
1494	cmp	rdx,64
1495	jae	NEAR $L$64_or_more8x
1496
1497	xor	r10,r10
1498	vmovdqa	YMMWORD[rsp],ymm6
1499	vmovdqa	YMMWORD[32+rsp],ymm8
1500	jmp	NEAR $L$oop_tail8x
1501
1502ALIGN	32
1503$L$64_or_more8x:
1504	vpxor	ymm6,ymm6,YMMWORD[rsi]
1505	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1506	vmovdqu	YMMWORD[rdi],ymm6
1507	vmovdqu	YMMWORD[32+rdi],ymm8
1508	je	NEAR $L$done8x
1509
1510	lea	rsi,[64+rsi]
1511	xor	r10,r10
1512	vmovdqa	YMMWORD[rsp],ymm1
1513	lea	rdi,[64+rdi]
1514	sub	rdx,64
1515	vmovdqa	YMMWORD[32+rsp],ymm5
1516	jmp	NEAR $L$oop_tail8x
1517
1518ALIGN	32
1519$L$128_or_more8x:
1520	vpxor	ymm6,ymm6,YMMWORD[rsi]
1521	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1522	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1523	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1524	vmovdqu	YMMWORD[rdi],ymm6
1525	vmovdqu	YMMWORD[32+rdi],ymm8
1526	vmovdqu	YMMWORD[64+rdi],ymm1
1527	vmovdqu	YMMWORD[96+rdi],ymm5
1528	je	NEAR $L$done8x
1529
1530	lea	rsi,[128+rsi]
1531	xor	r10,r10
1532	vmovdqa	YMMWORD[rsp],ymm12
1533	lea	rdi,[128+rdi]
1534	sub	rdx,128
1535	vmovdqa	YMMWORD[32+rsp],ymm13
1536	jmp	NEAR $L$oop_tail8x
1537
1538ALIGN	32
1539$L$192_or_more8x:
1540	vpxor	ymm6,ymm6,YMMWORD[rsi]
1541	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1542	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1543	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1544	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1545	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1546	vmovdqu	YMMWORD[rdi],ymm6
1547	vmovdqu	YMMWORD[32+rdi],ymm8
1548	vmovdqu	YMMWORD[64+rdi],ymm1
1549	vmovdqu	YMMWORD[96+rdi],ymm5
1550	vmovdqu	YMMWORD[128+rdi],ymm12
1551	vmovdqu	YMMWORD[160+rdi],ymm13
1552	je	NEAR $L$done8x
1553
1554	lea	rsi,[192+rsi]
1555	xor	r10,r10
1556	vmovdqa	YMMWORD[rsp],ymm10
1557	lea	rdi,[192+rdi]
1558	sub	rdx,192
1559	vmovdqa	YMMWORD[32+rsp],ymm15
1560	jmp	NEAR $L$oop_tail8x
1561
1562ALIGN	32
1563$L$256_or_more8x:
1564	vpxor	ymm6,ymm6,YMMWORD[rsi]
1565	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1566	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1567	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1568	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1569	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1570	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1571	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1572	vmovdqu	YMMWORD[rdi],ymm6
1573	vmovdqu	YMMWORD[32+rdi],ymm8
1574	vmovdqu	YMMWORD[64+rdi],ymm1
1575	vmovdqu	YMMWORD[96+rdi],ymm5
1576	vmovdqu	YMMWORD[128+rdi],ymm12
1577	vmovdqu	YMMWORD[160+rdi],ymm13
1578	vmovdqu	YMMWORD[192+rdi],ymm10
1579	vmovdqu	YMMWORD[224+rdi],ymm15
1580	je	NEAR $L$done8x
1581
1582	lea	rsi,[256+rsi]
1583	xor	r10,r10
1584	vmovdqa	YMMWORD[rsp],ymm14
1585	lea	rdi,[256+rdi]
1586	sub	rdx,256
1587	vmovdqa	YMMWORD[32+rsp],ymm2
1588	jmp	NEAR $L$oop_tail8x
1589
1590ALIGN	32
1591$L$320_or_more8x:
1592	vpxor	ymm6,ymm6,YMMWORD[rsi]
1593	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1594	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1595	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1596	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1597	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1598	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1599	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1600	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1601	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1602	vmovdqu	YMMWORD[rdi],ymm6
1603	vmovdqu	YMMWORD[32+rdi],ymm8
1604	vmovdqu	YMMWORD[64+rdi],ymm1
1605	vmovdqu	YMMWORD[96+rdi],ymm5
1606	vmovdqu	YMMWORD[128+rdi],ymm12
1607	vmovdqu	YMMWORD[160+rdi],ymm13
1608	vmovdqu	YMMWORD[192+rdi],ymm10
1609	vmovdqu	YMMWORD[224+rdi],ymm15
1610	vmovdqu	YMMWORD[256+rdi],ymm14
1611	vmovdqu	YMMWORD[288+rdi],ymm2
1612	je	NEAR $L$done8x
1613
1614	lea	rsi,[320+rsi]
1615	xor	r10,r10
1616	vmovdqa	YMMWORD[rsp],ymm3
1617	lea	rdi,[320+rdi]
1618	sub	rdx,320
1619	vmovdqa	YMMWORD[32+rsp],ymm7
1620	jmp	NEAR $L$oop_tail8x
1621
1622ALIGN	32
1623$L$384_or_more8x:
1624	vpxor	ymm6,ymm6,YMMWORD[rsi]
1625	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1626	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1627	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1628	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1629	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1630	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1631	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1632	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1633	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1634	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
1635	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
1636	vmovdqu	YMMWORD[rdi],ymm6
1637	vmovdqu	YMMWORD[32+rdi],ymm8
1638	vmovdqu	YMMWORD[64+rdi],ymm1
1639	vmovdqu	YMMWORD[96+rdi],ymm5
1640	vmovdqu	YMMWORD[128+rdi],ymm12
1641	vmovdqu	YMMWORD[160+rdi],ymm13
1642	vmovdqu	YMMWORD[192+rdi],ymm10
1643	vmovdqu	YMMWORD[224+rdi],ymm15
1644	vmovdqu	YMMWORD[256+rdi],ymm14
1645	vmovdqu	YMMWORD[288+rdi],ymm2
1646	vmovdqu	YMMWORD[320+rdi],ymm3
1647	vmovdqu	YMMWORD[352+rdi],ymm7
1648	je	NEAR $L$done8x
1649
1650	lea	rsi,[384+rsi]
1651	xor	r10,r10
1652	vmovdqa	YMMWORD[rsp],ymm11
1653	lea	rdi,[384+rdi]
1654	sub	rdx,384
1655	vmovdqa	YMMWORD[32+rsp],ymm9
1656	jmp	NEAR $L$oop_tail8x
1657
1658ALIGN	32
1659$L$448_or_more8x:
1660	vpxor	ymm6,ymm6,YMMWORD[rsi]
1661	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1662	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1663	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1664	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1665	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1666	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1667	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1668	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1669	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1670	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
1671	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
1672	vpxor	ymm11,ymm11,YMMWORD[384+rsi]
1673	vpxor	ymm9,ymm9,YMMWORD[416+rsi]
1674	vmovdqu	YMMWORD[rdi],ymm6
1675	vmovdqu	YMMWORD[32+rdi],ymm8
1676	vmovdqu	YMMWORD[64+rdi],ymm1
1677	vmovdqu	YMMWORD[96+rdi],ymm5
1678	vmovdqu	YMMWORD[128+rdi],ymm12
1679	vmovdqu	YMMWORD[160+rdi],ymm13
1680	vmovdqu	YMMWORD[192+rdi],ymm10
1681	vmovdqu	YMMWORD[224+rdi],ymm15
1682	vmovdqu	YMMWORD[256+rdi],ymm14
1683	vmovdqu	YMMWORD[288+rdi],ymm2
1684	vmovdqu	YMMWORD[320+rdi],ymm3
1685	vmovdqu	YMMWORD[352+rdi],ymm7
1686	vmovdqu	YMMWORD[384+rdi],ymm11
1687	vmovdqu	YMMWORD[416+rdi],ymm9
1688	je	NEAR $L$done8x
1689
1690	lea	rsi,[448+rsi]
1691	xor	r10,r10
1692	vmovdqa	YMMWORD[rsp],ymm0
1693	lea	rdi,[448+rdi]
1694	sub	rdx,448
1695	vmovdqa	YMMWORD[32+rsp],ymm4
1696
1697$L$oop_tail8x:
1698	movzx	eax,BYTE[r10*1+rsi]
1699	movzx	ecx,BYTE[r10*1+rsp]
1700	lea	r10,[1+r10]
1701	xor	eax,ecx
1702	mov	BYTE[((-1))+r10*1+rdi],al
1703	dec	rdx
1704	jnz	NEAR $L$oop_tail8x
1705
1706$L$done8x:
1707	vzeroall
1708	movaps	xmm6,XMMWORD[((-168))+r9]
1709	movaps	xmm7,XMMWORD[((-152))+r9]
1710	movaps	xmm8,XMMWORD[((-136))+r9]
1711	movaps	xmm9,XMMWORD[((-120))+r9]
1712	movaps	xmm10,XMMWORD[((-104))+r9]
1713	movaps	xmm11,XMMWORD[((-88))+r9]
1714	movaps	xmm12,XMMWORD[((-72))+r9]
1715	movaps	xmm13,XMMWORD[((-56))+r9]
1716	movaps	xmm14,XMMWORD[((-40))+r9]
1717	movaps	xmm15,XMMWORD[((-24))+r9]
1718	lea	rsp,[r9]
1719
1720$L$8x_epilogue:
1721	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1722	mov	rsi,QWORD[16+rsp]
1723	DB	0F3h,0C3h		;repret
1724
1725$L$SEH_end_ChaCha20_8x:
1726EXTERN	__imp_RtlVirtualUnwind
1727
;-----------------------------------------------------------------------
; se_handler -- Win64 language-specific exception handler registered for
; ChaCha20_ctr32 (see $L$SEH_info_ChaCha20_ctr32 in .xdata).
;
; In (Win64 handler convention):
;   r8 = CONTEXT *            (register snapshot of the faulting frame)
;   r9 = DISPATCHER_CONTEXT *
;   (rcx/rdx are not read by this code)
; Out:
;   eax = 1 (ExceptionContinueSearch)
;
; The handler patches the CONTEXT so that it looks as if ChaCha20_ctr32
; had already returned to its caller-visible register state, copies the
; patched CONTEXT into disp->ContextRecord, and then calls
; RtlVirtualUnwind.  Field offsets used below follow the Win64 CONTEXT
; and DISPATCHER_CONTEXT layouts.
;
; $L$common_seh_tail is shared: ssse3_handler and full_handler jump into
; it with rax = frame base and r8/r9 still intact.
;-----------------------------------------------------------------------
1728ALIGN	16
1729se_handler:
; Preserve every register this handler touches, plus flags, then reserve
; 64 bytes: 32 bytes of shadow space + 4 stack-argument slots used by the
; RtlVirtualUnwind call at the end.
1730	push	rsi
1731	push	rdi
1732	push	rbx
1733	push	rbp
1734	push	r12
1735	push	r13
1736	push	r14
1737	push	r15
1738	pushfq
1739	sub	rsp,64
1740
; rax = context->Rax, rbx = context->Rip
1741	mov	rax,QWORD[120+r8]
1742	mov	rbx,QWORD[248+r8]
1743
; rsi = disp->ImageBase, r11 = disp->HandlerData.  Both are overwritten
; before being read on every path through this handler; the range checks
; below use the labels directly instead.
1744	mov	rsi,QWORD[8+r9]
1745	mov	r11,QWORD[56+r9]
1746
; Fault before $L$ctr32_body: the frame is not fully built yet.  Keep
; rax = context->Rax, which the prologue set to the entry rsp
; (`mov rax,rsp` at the top of ChaCha20_ctr32).
1747	lea	r10,[$L$ctr32_body]
1748	cmp	rbx,r10
1749	jb	NEAR $L$common_seh_tail
1750
; rax = context->Rsp
1751	mov	rax,QWORD[152+r8]
1752
; Fault at or after $L$no_data: use context->Rsp unchanged as the frame
; base.  NOTE(review): $L$no_data is defined outside this chunk;
; presumably it sits after the callee-saved registers have been popped --
; confirm against the ChaCha20_ctr32 epilogue.
1753	lea	r10,[$L$no_data]
1754	cmp	rbx,r10
1755	jae	NEAR $L$common_seh_tail
1756
; Fault inside the body: step over the local area (64+24, matching
; `sub rsp,64+24` in the prologue) and the six pushed registers (48) to
; recover the entry rsp.
1757	lea	rax,[((64+24+48))+rax]
1758
; Reload the six callee-saved GPRs from their push slots just below the
; entry rsp (push order in the prologue: rbx, rbp, r12, r13, r14, r15).
1759	mov	rbx,QWORD[((-8))+rax]
1760	mov	rbp,QWORD[((-16))+rax]
1761	mov	r12,QWORD[((-24))+rax]
1762	mov	r13,QWORD[((-32))+rax]
1763	mov	r14,QWORD[((-40))+rax]
1764	mov	r15,QWORD[((-48))+rax]
; ...and write them back into context->Rbx/Rbp/R12/R13/R14/R15.
1765	mov	QWORD[144+r8],rbx
1766	mov	QWORD[160+r8],rbp
1767	mov	QWORD[216+r8],r12
1768	mov	QWORD[224+r8],r13
1769	mov	QWORD[232+r8],r14
1770	mov	QWORD[240+r8],r15
1771
; Shared tail.  Expects: rax = rsp value at function entry, r8 = CONTEXT*,
; r9 = DISPATCHER_CONTEXT*.
1772$L$common_seh_tail:
; The WIN64 prologue stored the caller's rdi/rsi at [8+rsp]/[16+rsp];
; fetch them and publish context->Rsp, context->Rsi, context->Rdi.
1773	mov	rdi,QWORD[8+rax]
1774	mov	rsi,QWORD[16+rax]
1775	mov	QWORD[152+r8],rax
1776	mov	QWORD[168+r8],rsi
1777	mov	QWORD[176+r8],rdi
1778
; Copy the patched CONTEXT (154 qwords) into disp->ContextRecord.
; The DD below is the byte sequence FC F3 48 A5 = `cld` ; `rep movsq`.
1779	mov	rdi,QWORD[40+r9]
1780	mov	rsi,r8
1781	mov	ecx,154
1782	DD	0xa548f3fc
1783
; Set up RtlVirtualUnwind(HandlerType, ImageBase, ControlPc, FunctionEntry,
;                         ContextRecord, &HandlerData, &EstablisherFrame,
;                         ContextPointers):
;   rcx = 0                      HandlerType (UNW_FLAG_NHANDLER)
;   rdx = disp->ImageBase
;   r8  = disp->ControlPc
;   r9  = disp->FunctionEntry
;   [32+rsp] = disp->ContextRecord
;   [40+rsp] = &disp->HandlerData
;   [48+rsp] = &disp->EstablisherFrame
;   [56+rsp] = NULL              ContextPointers
1784	mov	rsi,r9
1785	xor	rcx,rcx
1786	mov	rdx,QWORD[8+rsi]
1787	mov	r8,QWORD[rsi]
1788	mov	r9,QWORD[16+rsi]
1789	mov	r10,QWORD[40+rsi]
1790	lea	r11,[56+rsi]
1791	lea	r12,[24+rsi]
1792	mov	QWORD[32+rsp],r10
1793	mov	QWORD[40+rsp],r11
1794	mov	QWORD[48+rsp],r12
1795	mov	QWORD[56+rsp],rcx
1796	call	QWORD[__imp_RtlVirtualUnwind]
1797
; Return ExceptionContinueSearch (1) and unwind this handler's own frame
; in exact reverse order of the pushes above.
1798	mov	eax,1
1799	add	rsp,64
1800	popfq
1801	pop	r15
1802	pop	r14
1803	pop	r13
1804	pop	r12
1805	pop	rbp
1806	pop	rbx
1807	pop	rdi
1808	pop	rsi
1809	DB	0F3h,0C3h		;repret
1810
1811
1812
;-----------------------------------------------------------------------
; ssse3_handler -- Win64 language-specific exception handler registered
; for ChaCha20_ssse3 (see $L$SEH_info_ChaCha20_ssse3 in .xdata).
;
; In:  r8 = CONTEXT *, r9 = DISPATCHER_CONTEXT *
; Out: eax = 1, set by the shared $L$common_seh_tail in se_handler, which
;      this routine always finishes through.
;
; HandlerData points at two image-relative DWORDs: [0] = body label,
; [4] = epilogue label.  Only when the faulting Rip lies in [body,
; epilogue) does the handler restore XMM state into the CONTEXT.
;-----------------------------------------------------------------------
1813ALIGN	16
1814ssse3_handler:
; Same register/flags save area and 64-byte scratch frame as se_handler;
; the matching pops are executed by $L$common_seh_tail.
1815	push	rsi
1816	push	rdi
1817	push	rbx
1818	push	rbp
1819	push	r12
1820	push	r13
1821	push	r14
1822	push	r15
1823	pushfq
1824	sub	rsp,64
1825
; rax = context->Rax, rbx = context->Rip
1826	mov	rax,QWORD[120+r8]
1827	mov	rbx,QWORD[248+r8]
1828
; rsi = disp->ImageBase, r11 = disp->HandlerData
1829	mov	rsi,QWORD[8+r9]
1830	mov	r11,QWORD[56+r9]
1831
; r10 = ImageBase + HandlerData[0] (absolute address of the body label).
; Rip below it: go to the tail with rax still equal to context->Rax.
1832	mov	r10d,DWORD[r11]
1833	lea	r10,[r10*1+rsi]
1834	cmp	rbx,r10
1835	jb	NEAR $L$common_seh_tail
1836
; rax = context->R9.  NOTE(review): presumably ChaCha20_ssse3 keeps its
; entry rsp in r9, as the visible ChaCha20_8x epilogue does
; (`lea rsp,[r9]`) -- confirm against the ssse3 prologue.
1837	mov	rax,QWORD[192+r8]
1838
; r10 = ImageBase + HandlerData[1] (absolute address of the epilogue
; label).  Rip at or past it: nothing more to restore here.
1839	mov	r10d,DWORD[4+r11]
1840	lea	r10,[r10*1+rsi]
1841	cmp	rbx,r10
1842	jae	NEAR $L$common_seh_tail
1843
; Copy 4 qwords (32 bytes = two XMM slots) from [rax-40] into the CONTEXT
; starting at offset 512 (context->Xmm6).  The DD is `cld` ; `rep movsq`.
1844	lea	rsi,[((-40))+rax]
1845	lea	rdi,[512+r8]
1846	mov	ecx,4
1847	DD	0xa548f3fc
1848
1849	jmp	NEAR $L$common_seh_tail
1850
1851
1852
;-----------------------------------------------------------------------
; full_handler -- Win64 language-specific exception handler shared by
; ChaCha20_4x and ChaCha20_8x (see their $L$SEH_info_* records in .xdata).
;
; In:  r8 = CONTEXT *, r9 = DISPATCHER_CONTEXT *
; Out: eax = 1, set by the shared $L$common_seh_tail in se_handler, which
;      this routine always finishes through.
;
; Identical in shape to ssse3_handler, except that it restores ten XMM
; slots (160 bytes) located 168 bytes below the frame base.
;-----------------------------------------------------------------------
1853ALIGN	16
1854full_handler:
; Same register/flags save area and 64-byte scratch frame as se_handler;
; the matching pops are executed by $L$common_seh_tail.
1855	push	rsi
1856	push	rdi
1857	push	rbx
1858	push	rbp
1859	push	r12
1860	push	r13
1861	push	r14
1862	push	r15
1863	pushfq
1864	sub	rsp,64
1865
; rax = context->Rax, rbx = context->Rip
1866	mov	rax,QWORD[120+r8]
1867	mov	rbx,QWORD[248+r8]
1868
; rsi = disp->ImageBase, r11 = disp->HandlerData
1869	mov	rsi,QWORD[8+r9]
1870	mov	r11,QWORD[56+r9]
1871
; r10 = ImageBase + HandlerData[0] (absolute address of the body label).
; Rip below it: go to the tail with rax still equal to context->Rax.
1872	mov	r10d,DWORD[r11]
1873	lea	r10,[r10*1+rsi]
1874	cmp	rbx,r10
1875	jb	NEAR $L$common_seh_tail
1876
; rax = context->R9; the 8x epilogue restores rsp from r9
; (`lea rsp,[r9]`), so r9 is the frame base there.
1877	mov	rax,QWORD[192+r8]
1878
; r10 = ImageBase + HandlerData[1] (absolute address of the epilogue
; label).  Rip at or past it: nothing more to restore here.
1879	mov	r10d,DWORD[4+r11]
1880	lea	r10,[r10*1+rsi]
1881	cmp	rbx,r10
1882	jae	NEAR $L$common_seh_tail
1883
; Copy 20 qwords (160 bytes = xmm6..xmm15) from [rax-168] into the CONTEXT
; starting at offset 512 (context->Xmm6).  This mirrors the ten movaps
; reloads at -168..-24 off r9 in $L$done8x.  The DD is `cld` ; `rep movsq`.
1884	lea	rsi,[((-168))+rax]
1885	lea	rdi,[512+r8]
1886	mov	ecx,20
1887	DD	0xa548f3fc
1888
1889	jmp	NEAR $L$common_seh_tail
1890
1891
; .pdata -- Win64 function table.  One three-DWORD entry per function, all
; image-relative (`wrt ..imagebase`):
;   DD  begin address, end address, unwind-info address
; Each unwind-info address refers to the matching $L$SEH_info_* record in
; .xdata.
1892section	.pdata rdata align=4
1893ALIGN	4
; ChaCha20_ctr32
1894	DD	$L$SEH_begin_ChaCha20_ctr32 wrt ..imagebase
1895	DD	$L$SEH_end_ChaCha20_ctr32 wrt ..imagebase
1896	DD	$L$SEH_info_ChaCha20_ctr32 wrt ..imagebase
1897
; ChaCha20_ssse3
1898	DD	$L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase
1899	DD	$L$SEH_end_ChaCha20_ssse3 wrt ..imagebase
1900	DD	$L$SEH_info_ChaCha20_ssse3 wrt ..imagebase
1901
; ChaCha20_4x
1902	DD	$L$SEH_begin_ChaCha20_4x wrt ..imagebase
1903	DD	$L$SEH_end_ChaCha20_4x wrt ..imagebase
1904	DD	$L$SEH_info_ChaCha20_4x wrt ..imagebase
; ChaCha20_8x
1905	DD	$L$SEH_begin_ChaCha20_8x wrt ..imagebase
1906	DD	$L$SEH_end_ChaCha20_8x wrt ..imagebase
1907	DD	$L$SEH_info_ChaCha20_8x wrt ..imagebase
; .xdata -- unwind-info records referenced from .pdata.
; Every record starts with the 4-byte header `DB 9,0,0,0`:
;   byte 0 = 9 -> version 1 (low 3 bits), flags 1 = UNW_FLAG_EHANDLER
;   byte 1 = 0 -> prolog size
;   byte 2 = 0 -> number of unwind codes
;   byte 3 = 0 -> no frame register
; followed by the image-relative address of the language-specific handler
; and, where present, the handler data that handler reads via
; disp->HandlerData.
1908section	.xdata rdata align=8
1909ALIGN	8
; ChaCha20_ctr32: se_handler compares Rip against labels directly, so no
; handler data follows.
1910$L$SEH_info_ChaCha20_ctr32:
1911DB	9,0,0,0
1912	DD	se_handler wrt ..imagebase
1913
; ChaCha20_ssse3: handler data = { body label, epilogue label }.
1914$L$SEH_info_ChaCha20_ssse3:
1915DB	9,0,0,0
1916	DD	ssse3_handler wrt ..imagebase
1917	DD	$L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase
1918
; ChaCha20_4x: handler data = { body label, epilogue label }.
1919$L$SEH_info_ChaCha20_4x:
1920DB	9,0,0,0
1921	DD	full_handler wrt ..imagebase
1922	DD	$L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase
; ChaCha20_8x: same handler as 4x, with its own body/epilogue labels.
1923$L$SEH_info_ChaCha20_8x:
1924DB	9,0,0,0
1925	DD	full_handler wrt ..imagebase
1926	DD	$L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase
1927