• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; This file is generated from a similarly-named Perl script in the BoringSSL
2; source tree. Do not edit by hand.
3
4default	rel
5%define XMMWORD
6%define YMMWORD
7%define ZMMWORD
8
9%include "ring_core_generated/prefix_symbols_nasm.inc"
10section	.text code align=64
11
12
13EXTERN	OPENSSL_ia32cap_P
14
; Read-only constant tables shared by the ChaCha20 code paths in this file.
15ALIGN	64
; Four zero dwords.
16$L$zero:
17	DD	0,0,0,0
; {1,0,0,0}: added to the counter/nonce row to step the low dword by one
; (used by the scalar and SSSE3 paths).
18$L$one:
19	DD	1,0,0,0
; {0,1,2,3}: added to the broadcast counter in the 4x path so each lane gets
; its own consecutive value.
20$L$inc:
21	DD	0,1,2,3
; {4,4,4,4}: per-iteration counter step of the 4x path.
22$L$four:
23	DD	4,4,4,4
; {0,2,4,6,1,3,5,7}: per-lane counter offsets added in the 8x path.
24$L$incy:
25	DD	0,2,4,6,1,3,5,7
; {8 x 8}: per-iteration counter step of the 8x path.
26$L$eight:
27	DD	8,8,8,8,8,8,8,8
; PSHUFB byte-shuffle mask: result byte 0 of each dword comes from source
; byte 2, i.e. every 32-bit lane is rotated by 16 bits.
28$L$rot16:
29DB	0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
; PSHUFB byte-shuffle mask: result byte 0 of each dword comes from source
; byte 3, i.e. every 32-bit lane is rotated left by 8 bits (right by 24).
30$L$rot24:
31DB	0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
; ASCII "expand 32-byte k" followed by a NUL; the first 16 bytes are loaded
; as the first row of the cipher state by the SIMD paths.
32$L$sigma:
33DB	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
34DB	0
35ALIGN	64
; The four 16-dword tables below are not referenced in the part of the file
; visible here. NOTE(review): presumably intended for a 512-bit code path -
; confirm against the generator script.
36$L$zeroz:
37	DD	0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
38$L$fourz:
39	DD	4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
40$L$incz:
41	DD	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
42$L$sixteen:
43	DD	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
; NUL-terminated ASCII identification string:
; "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
44DB	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
45DB	95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
46DB	98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
47DB	108,46,111,114,103,62,0
; ChaCha20_ctr32 - exported entry point; dispatches to the SSSE3 path or sets
; up the scalar (general-purpose register) path.
;
; Arguments arrive Win64-style in rcx, rdx, r8, r9 and [40+rsp] and are moved
; to rdi, rsi, rdx, rcx, r8. As used by the code in this file:
;   rdi = destination pointer      rsi = source pointer
;   rdx = byte count               rcx -> 32 bytes loaded into xmm1/xmm2
;   r8  -> 16 bytes loaded into xmm3 (counter/nonce row)
; NOTE(review): the C prototype is not visible in this file - confirm the
; argument meanings against the header.
48global	ChaCha20_ctr32
49
50ALIGN	64
51ChaCha20_ctr32:
; Caller's rdi/rsi are parked in the slots at [8+rsp]/[16+rsp] and reloaded
; from there by the epilogue at $L$no_data.
52	mov	QWORD[8+rsp],rdi	;WIN64 prologue
53	mov	QWORD[16+rsp],rsi
54	mov	rax,rsp
55$L$SEH_begin_ChaCha20_ctr32:
56	mov	rdi,rcx
57	mov	rsi,rdx
58	mov	rdx,r8
59	mov	rcx,r9
60	mov	r8,QWORD[40+rsp]
61
62
63
; Zero-length request: nothing to do.
64	cmp	rdx,0
65	je	NEAR $L$no_data
; r10 = 64 bits of capability flags starting at OPENSSL_ia32cap_P+4. Bit 9 of
; the low dword selects the SSSE3 path (presumably the SSSE3 feature bit);
; r10 stays live because $L$ChaCha20_4x inspects it again.
66	mov	r10,QWORD[((OPENSSL_ia32cap_P+4))]
67	test	r10d,512
68	jnz	NEAR $L$ChaCha20_ssse3
69
; Scalar path: save the six callee-saved GPRs this path uses.
70	push	rbx
71
72	push	rbp
73
74	push	r12
75
76	push	r13
77
78	push	r14
79
80	push	r15
81
; Frame: 64 bytes for a state/keystream block at [0..63+rsp] plus 24 bytes at
; [64..87+rsp] for three saved qwords (length, source, destination).
82	sub	rsp,64+24
83
84$L$ctr32_body:
85
86
; xmm1/xmm2 = the 32 bytes at rcx, xmm3 = the 16 bytes at r8,
; xmm4 = {1,0,0,0} counter increment.
87	movdqu	xmm1,XMMWORD[rcx]
88	movdqu	xmm2,XMMWORD[16+rcx]
89	movdqu	xmm3,XMMWORD[r8]
90	movdqa	xmm4,XMMWORD[$L$one]
91
92
; Keep the input rows at [16+rsp], [32+rsp] and [48+rsp]; [0..15+rsp] is only
; written by the tail code.
93	movdqa	XMMWORD[16+rsp],xmm1
94	movdqa	XMMWORD[32+rsp],xmm2
95	movdqa	XMMWORD[48+rsp],xmm3
; rbp = bytes remaining.
96	mov	rbp,rdx
97	jmp	NEAR $L$oop_outer
98
; $L$oop_outer - per-64-byte-block setup for the scalar path: load the 16
; state words into registers / stack slots and arm the round counter.
99ALIGN	32
100$L$oop_outer:
; Words 0..3: the four constant dwords (little-endian "expand 32-byte k").
101	mov	eax,0x61707865
102	mov	ebx,0x3320646e
103	mov	ecx,0x79622d32
104	mov	edx,0x6b206574
; Words 4..7 from the saved row at [16+rsp].
105	mov	r8d,DWORD[16+rsp]
106	mov	r9d,DWORD[20+rsp]
107	mov	r10d,DWORD[24+rsp]
108	mov	r11d,DWORD[28+rsp]
; Word 12 is taken from xmm3 (the live counter); words 13..15 from the stack.
109	movd	r12d,xmm3
110	mov	r13d,DWORD[52+rsp]
111	mov	r14d,DWORD[56+rsp]
112	mov	r15d,DWORD[60+rsp]
113
; rbp, rsi and rdi are needed as working registers, so the remaining length,
; source pointer and destination pointer are parked at [64..87+rsp].
114	mov	QWORD[((64+0))+rsp],rbp
; ebp = 10 loop iterations.
115	mov	ebp,10
116	mov	QWORD[((64+8))+rsp],rsi
; Encoded bytes are: movq rsi,xmm2  (low 8 bytes of row 2 = words 8 and 9).
117DB	102,72,15,126,214
118	mov	QWORD[((64+16))+rsp],rdi
; esi = word 8, edi = word 9; words 10 and 11 stay at [40+rsp]/[44+rsp].
119	mov	rdi,rsi
120	shr	rdi,32
121	jmp	NEAR $L$oop
122
; $L$oop - scalar round loop followed by the feed-forward and full-block XOR.
;
; Register roles inside the loop:
;   eax,ebx,ecx,edx = words 0..3     r8d..r11d  = words 4..7
;   r12d..r15d      = words 12..15
;   esi,edi         = two of words 8..11; the other two are spilled to
;                     [32+rsp]/[36+rsp] or [40+rsp]/[44+rsp] and swapped in.
; Each quarter-round is: a+=b; d^=a; d<<<=16; c+=d; b^=c; b<<<=12;
;                        a+=b; d^=a; d<<<=8;  c+=d; b^=c; b<<<=7.
; Two quarter-rounds are interleaved at a time. One loop iteration performs
; four column quarter-rounds and then four diagonal quarter-rounds.
123ALIGN	32
124$L$oop:
; Column quarter-rounds on (0,4,8,12) and (1,5,9,13).
125	add	eax,r8d
126	xor	r12d,eax
127	rol	r12d,16
128	add	ebx,r9d
129	xor	r13d,ebx
130	rol	r13d,16
131	add	esi,r12d
132	xor	r8d,esi
133	rol	r8d,12
134	add	edi,r13d
135	xor	r9d,edi
136	rol	r9d,12
137	add	eax,r8d
138	xor	r12d,eax
139	rol	r12d,8
140	add	ebx,r9d
141	xor	r13d,ebx
142	rol	r13d,8
143	add	esi,r12d
144	xor	r8d,esi
145	rol	r8d,7
146	add	edi,r13d
147	xor	r9d,edi
148	rol	r9d,7
; Swap: spill words 8,9 and bring in words 10,11.
149	mov	DWORD[32+rsp],esi
150	mov	DWORD[36+rsp],edi
151	mov	esi,DWORD[40+rsp]
152	mov	edi,DWORD[44+rsp]
; Column quarter-rounds on (2,6,10,14) and (3,7,11,15).
153	add	ecx,r10d
154	xor	r14d,ecx
155	rol	r14d,16
156	add	edx,r11d
157	xor	r15d,edx
158	rol	r15d,16
159	add	esi,r14d
160	xor	r10d,esi
161	rol	r10d,12
162	add	edi,r15d
163	xor	r11d,edi
164	rol	r11d,12
165	add	ecx,r10d
166	xor	r14d,ecx
167	rol	r14d,8
168	add	edx,r11d
169	xor	r15d,edx
170	rol	r15d,8
171	add	esi,r14d
172	xor	r10d,esi
173	rol	r10d,7
174	add	edi,r15d
175	xor	r11d,edi
176	rol	r11d,7
; Diagonal quarter-rounds on (0,5,10,15) and (1,6,11,12); esi/edi still hold
; words 10,11.
177	add	eax,r9d
178	xor	r15d,eax
179	rol	r15d,16
180	add	ebx,r10d
181	xor	r12d,ebx
182	rol	r12d,16
183	add	esi,r15d
184	xor	r9d,esi
185	rol	r9d,12
186	add	edi,r12d
187	xor	r10d,edi
188	rol	r10d,12
189	add	eax,r9d
190	xor	r15d,eax
191	rol	r15d,8
192	add	ebx,r10d
193	xor	r12d,ebx
194	rol	r12d,8
195	add	esi,r15d
196	xor	r9d,esi
197	rol	r9d,7
198	add	edi,r12d
199	xor	r10d,edi
200	rol	r10d,7
; Swap back: spill words 10,11 and reload words 8,9.
201	mov	DWORD[40+rsp],esi
202	mov	DWORD[44+rsp],edi
203	mov	esi,DWORD[32+rsp]
204	mov	edi,DWORD[36+rsp]
; Diagonal quarter-rounds on (2,7,8,13) and (3,4,9,14).
205	add	ecx,r11d
206	xor	r13d,ecx
207	rol	r13d,16
208	add	edx,r8d
209	xor	r14d,edx
210	rol	r14d,16
211	add	esi,r13d
212	xor	r11d,esi
213	rol	r11d,12
214	add	edi,r14d
215	xor	r8d,edi
216	rol	r8d,12
217	add	ecx,r11d
218	xor	r13d,ecx
219	rol	r13d,8
220	add	edx,r8d
221	xor	r14d,edx
222	rol	r14d,8
223	add	esi,r13d
224	xor	r11d,esi
225	rol	r11d,7
226	add	edi,r14d
227	xor	r8d,edi
228	rol	r8d,7
229	dec	ebp
230	jnz	NEAR $L$oop
; Rounds done. Write words 8,9 back so [32..47+rsp] holds working words
; 8..11, then recover the length and pointers parked by $L$oop_outer.
231	mov	DWORD[36+rsp],edi
232	mov	DWORD[32+rsp],esi
233	mov	rbp,QWORD[64+rsp]
; xmm1 = original words 8..11 (xmm2 is never modified in this path).
234	movdqa	xmm1,xmm2
235	mov	rsi,QWORD[((64+8))+rsp]
; Advance the live counter in xmm3 for the next block. The stack copy at
; [48+rsp] still holds this block's counter for the feed-forward below.
236	paddd	xmm3,xmm4
237	mov	rdi,QWORD[((64+16))+rsp]
238
; Feed-forward: add the original input words to the permuted state.
239	add	eax,0x61707865
240	add	ebx,0x3320646e
241	add	ecx,0x79622d32
242	add	edx,0x6b206574
243	add	r8d,DWORD[16+rsp]
244	add	r9d,DWORD[20+rsp]
245	add	r10d,DWORD[24+rsp]
246	add	r11d,DWORD[28+rsp]
247	add	r12d,DWORD[48+rsp]
248	add	r13d,DWORD[52+rsp]
249	add	r14d,DWORD[56+rsp]
250	add	r15d,DWORD[60+rsp]
; xmm1 = original words 8..11 + working words 8..11.
251	paddd	xmm1,XMMWORD[32+rsp]
252
; Fewer than 64 bytes left: finish byte-by-byte in $L$tail.
253	cmp	rbp,64
254	jb	NEAR $L$tail
255
; Full block: XOR the 64 keystream bytes with the source.
256	xor	eax,DWORD[rsi]
257	xor	ebx,DWORD[4+rsi]
258	xor	ecx,DWORD[8+rsi]
259	xor	edx,DWORD[12+rsi]
260	xor	r8d,DWORD[16+rsi]
261	xor	r9d,DWORD[20+rsi]
262	xor	r10d,DWORD[24+rsi]
263	xor	r11d,DWORD[28+rsi]
264	movdqu	xmm0,XMMWORD[32+rsi]
265	xor	r12d,DWORD[48+rsi]
266	xor	r13d,DWORD[52+rsi]
267	xor	r14d,DWORD[56+rsi]
268	xor	r15d,DWORD[60+rsi]
269	lea	rsi,[64+rsi]
270	pxor	xmm0,xmm1
271
; Restore the stack state for the next block: original words 8..11 (the loop
; used that row as scratch) and the incremented counter word.
272	movdqa	XMMWORD[32+rsp],xmm2
273	movd	DWORD[48+rsp],xmm3
274
; Store the 64 result bytes to the destination.
275	mov	DWORD[rdi],eax
276	mov	DWORD[4+rdi],ebx
277	mov	DWORD[8+rdi],ecx
278	mov	DWORD[12+rdi],edx
279	mov	DWORD[16+rdi],r8d
280	mov	DWORD[20+rdi],r9d
281	mov	DWORD[24+rdi],r10d
282	mov	DWORD[28+rdi],r11d
283	movdqu	XMMWORD[32+rdi],xmm0
284	mov	DWORD[48+rdi],r12d
285	mov	DWORD[52+rdi],r13d
286	mov	DWORD[56+rdi],r14d
287	mov	DWORD[60+rdi],r15d
288	lea	rdi,[64+rdi]
289
; Loop while bytes remain.
290	sub	rbp,64
291	jnz	NEAR $L$oop_outer
292
293	jmp	NEAR $L$done
294
; $L$tail - final partial block (1..63 bytes) of the scalar path.
; The whole 64-byte keystream block is written to [0..63+rsp], overwriting
; the saved input rows, and then XORed with the source one byte at a time.
295ALIGN	16
296$L$tail:
297	mov	DWORD[rsp],eax
298	mov	DWORD[4+rsp],ebx
; rbx = byte index, starting at 0 (ebx was stored just above).
299	xor	rbx,rbx
300	mov	DWORD[8+rsp],ecx
301	mov	DWORD[12+rsp],edx
302	mov	DWORD[16+rsp],r8d
303	mov	DWORD[20+rsp],r9d
304	mov	DWORD[24+rsp],r10d
305	mov	DWORD[28+rsp],r11d
306	movdqa	XMMWORD[32+rsp],xmm1
307	mov	DWORD[48+rsp],r12d
308	mov	DWORD[52+rsp],r13d
309	mov	DWORD[56+rsp],r14d
310	mov	DWORD[60+rsp],r15d
311
; dst[i] = src[i] ^ keystream[i] for the rbp remaining bytes.
312$L$oop_tail:
313	movzx	eax,BYTE[rbx*1+rsi]
314	movzx	edx,BYTE[rbx*1+rsp]
; lea advances the index without touching flags; the store below compensates
; with the -1 displacement.
315	lea	rbx,[1+rbx]
316	xor	eax,edx
317	mov	BYTE[((-1))+rbx*1+rdi],al
318	dec	rbp
319	jnz	NEAR $L$oop_tail
320
; $L$done - scalar-path epilogue.
; rsi = address just above the six pushed registers (64+24 bytes of frame plus
; 6*8 bytes of pushes); the registers are reloaded relative to it in reverse
; push order and rsp is then set to that address.
321$L$done:
322	lea	rsi,[((64+24+48))+rsp]
323	mov	r15,QWORD[((-48))+rsi]
324
325	mov	r14,QWORD[((-40))+rsi]
326
327	mov	r13,QWORD[((-32))+rsi]
328
329	mov	r12,QWORD[((-24))+rsi]
330
331	mov	rbp,QWORD[((-16))+rsi]
332
333	mov	rbx,QWORD[((-8))+rsi]
334
335	lea	rsp,[rsi]
336
; Shared exit, also reached directly for a zero-length request: reload the
; caller's rdi/rsi saved by the prologue and return.
337$L$no_data:
338	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
339	mov	rsi,QWORD[16+rsp]
340	DB	0F3h,0C3h		;repret
341
342$L$SEH_end_ChaCha20_ctr32:
343
; ChaCha20_ssse3 - one-block-at-a-time SSSE3 path.
; The code is normally entered at $L$ChaCha20_ssse3 by a jump from
; ChaCha20_ctr32, with the arguments already in rdi/rsi/rdx/rcx/r8 and r10
; holding the capability qword. The prologue before that label performs the
; same Win64 argument shuffle as ChaCha20_ctr32.
344ALIGN	32
345ChaCha20_ssse3:
346	mov	QWORD[8+rsp],rdi	;WIN64 prologue
347	mov	QWORD[16+rsp],rsi
348	mov	rax,rsp
349$L$SEH_begin_ChaCha20_ssse3:
350	mov	rdi,rcx
351	mov	rsi,rdx
352	mov	rdx,r8
353	mov	rcx,r9
354	mov	r8,QWORD[40+rsp]
355
356
357$L$ChaCha20_ssse3:
358
; r9 = entry rsp; used to address the XMM save area and to restore rsp.
359	mov	r9,rsp
360
; More than 128 bytes: hand over to the 4-block path.
361	cmp	rdx,128
362	ja	NEAR $L$ChaCha20_4x
363
; Also jumped to from $L$ChaCha20_4x when that path decides to fall back.
364$L$do_sse3_after_all:
; Frame: 64 bytes of state at [0..63+rsp] plus 40 bytes in which xmm6/xmm7
; (callee-saved under the Microsoft x64 convention) are preserved.
365	sub	rsp,64+40
366	movaps	XMMWORD[(-40)+r9],xmm6
367	movaps	XMMWORD[(-24)+r9],xmm7
368$L$ssse3_body:
; xmm0..xmm3 = the four 16-byte state rows: constants, the 32 bytes at rcx,
; and the 16 bytes at r8. xmm6/xmm7 = PSHUFB rotate masks.
369	movdqa	xmm0,XMMWORD[$L$sigma]
370	movdqu	xmm1,XMMWORD[rcx]
371	movdqu	xmm2,XMMWORD[16+rcx]
372	movdqu	xmm3,XMMWORD[r8]
373	movdqa	xmm6,XMMWORD[$L$rot16]
374	movdqa	xmm7,XMMWORD[$L$rot24]
375
; Keep a copy of the input state on the stack for the feed-forward.
376	movdqa	XMMWORD[rsp],xmm0
377	movdqa	XMMWORD[16+rsp],xmm1
378	movdqa	XMMWORD[32+rsp],xmm2
379	movdqa	XMMWORD[48+rsp],xmm3
; r8 = 10 loop iterations (the counter pointer is no longer needed).
380	mov	r8,10
381	jmp	NEAR $L$oop_ssse3
382
; $L$oop_outer_ssse3 - start of every block after the first: reload the saved
; state rows, add {1,0,0,0} to the counter row, store the new counter row
; back to [48+rsp] and re-arm the round counter.
383ALIGN	32
384$L$oop_outer_ssse3:
385	movdqa	xmm3,XMMWORD[$L$one]
386	movdqa	xmm0,XMMWORD[rsp]
387	movdqa	xmm1,XMMWORD[16+rsp]
388	movdqa	xmm2,XMMWORD[32+rsp]
389	paddd	xmm3,XMMWORD[48+rsp]
390	mov	r8,10
391	movdqa	XMMWORD[48+rsp],xmm3
392	jmp	NEAR $L$oop_ssse3
393
; $L$oop_ssse3 - SSSE3 round loop followed by the feed-forward and full-block
; XOR. xmm0..xmm3 hold state rows a, b, c, d, so each instruction performs one
; quarter-round step on four columns at once. xmm4 is a temporary for the
; shift-based rotates; xmm6/xmm7 are the PSHUFB rotate masks.
394ALIGN	32
395$L$oop_ssse3:
; First half: quarter-round on the rows as loaded.
396	paddd	xmm0,xmm1
397	pxor	xmm3,xmm0
; Encoded bytes are: pshufb xmm3,xmm6  (rotate d by 16)
398DB	102,15,56,0,222
399	paddd	xmm2,xmm3
400	pxor	xmm1,xmm2
; b <<<= 12, built from a right shift by 20 and a left shift by 12.
401	movdqa	xmm4,xmm1
402	psrld	xmm1,20
403	pslld	xmm4,12
404	por	xmm1,xmm4
405	paddd	xmm0,xmm1
406	pxor	xmm3,xmm0
; Encoded bytes are: pshufb xmm3,xmm7  (rotate d left by 8)
407DB	102,15,56,0,223
408	paddd	xmm2,xmm3
409	pxor	xmm1,xmm2
; b <<<= 7.
410	movdqa	xmm4,xmm1
411	psrld	xmm1,25
412	pslld	xmm4,7
413	por	xmm1,xmm4
; Rotate the dword lanes of rows c, b and d so that the second half operates
; on the diagonals.
414	pshufd	xmm2,xmm2,78
415	pshufd	xmm1,xmm1,57
416	pshufd	xmm3,xmm3,147
417	nop
; Second half: the same quarter-round on the rotated rows.
418	paddd	xmm0,xmm1
419	pxor	xmm3,xmm0
; Encoded bytes are: pshufb xmm3,xmm6
420DB	102,15,56,0,222
421	paddd	xmm2,xmm3
422	pxor	xmm1,xmm2
423	movdqa	xmm4,xmm1
424	psrld	xmm1,20
425	pslld	xmm4,12
426	por	xmm1,xmm4
427	paddd	xmm0,xmm1
428	pxor	xmm3,xmm0
; Encoded bytes are: pshufb xmm3,xmm7
429DB	102,15,56,0,223
430	paddd	xmm2,xmm3
431	pxor	xmm1,xmm2
432	movdqa	xmm4,xmm1
433	psrld	xmm1,25
434	pslld	xmm4,7
435	por	xmm1,xmm4
; Undo the lane rotation (b and d use the opposite shuffle constants).
436	pshufd	xmm2,xmm2,78
437	pshufd	xmm1,xmm1,147
438	pshufd	xmm3,xmm3,57
439	dec	r8
440	jnz	NEAR $L$oop_ssse3
; Feed-forward: add the saved input rows.
441	paddd	xmm0,XMMWORD[rsp]
442	paddd	xmm1,XMMWORD[16+rsp]
443	paddd	xmm2,XMMWORD[32+rsp]
444	paddd	xmm3,XMMWORD[48+rsp]
445
; Fewer than 64 bytes left: finish byte-by-byte.
446	cmp	rdx,64
447	jb	NEAR $L$tail_ssse3
448
; Full block: XOR 64 source bytes with the keystream and store them.
449	movdqu	xmm4,XMMWORD[rsi]
450	movdqu	xmm5,XMMWORD[16+rsi]
451	pxor	xmm0,xmm4
452	movdqu	xmm4,XMMWORD[32+rsi]
453	pxor	xmm1,xmm5
454	movdqu	xmm5,XMMWORD[48+rsi]
455	lea	rsi,[64+rsi]
456	pxor	xmm2,xmm4
457	pxor	xmm3,xmm5
458
459	movdqu	XMMWORD[rdi],xmm0
460	movdqu	XMMWORD[16+rdi],xmm1
461	movdqu	XMMWORD[32+rdi],xmm2
462	movdqu	XMMWORD[48+rdi],xmm3
463	lea	rdi,[64+rdi]
464
; Loop while bytes remain.
465	sub	rdx,64
466	jnz	NEAR $L$oop_outer_ssse3
467
468	jmp	NEAR $L$done_ssse3
469
; $L$tail_ssse3 - final partial block (1..63 bytes) of the SSSE3 path.
; The keystream block is spilled to [0..63+rsp] (replacing the saved state)
; and XORed with the source one byte at a time; r8 is the byte index.
470ALIGN	16
471$L$tail_ssse3:
472	movdqa	XMMWORD[rsp],xmm0
473	movdqa	XMMWORD[16+rsp],xmm1
474	movdqa	XMMWORD[32+rsp],xmm2
475	movdqa	XMMWORD[48+rsp],xmm3
476	xor	r8,r8
477
478$L$oop_tail_ssse3:
479	movzx	eax,BYTE[r8*1+rsi]
480	movzx	ecx,BYTE[r8*1+rsp]
; Index is advanced with lea before the store, hence the -1 displacement.
481	lea	r8,[1+r8]
482	xor	eax,ecx
483	mov	BYTE[((-1))+r8*1+rdi],al
484	dec	rdx
485	jnz	NEAR $L$oop_tail_ssse3
486
; $L$done_ssse3 - SSSE3-path epilogue: restore xmm6/xmm7 from the save area
; below r9, reset rsp to its entry value, reload the caller's rdi/rsi from
; [8+rsp]/[16+rsp] and return.
487$L$done_ssse3:
488	movaps	xmm6,XMMWORD[((-40))+r9]
489	movaps	xmm7,XMMWORD[((-24))+r9]
490	lea	rsp,[r9]
491
492$L$ssse3_epilogue:
493	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
494	mov	rsi,QWORD[16+rsp]
495	DB	0F3h,0C3h		;repret
496
497$L$SEH_end_ChaCha20_ssse3:
498
; ChaCha20_4x - SSSE3 path that computes four 64-byte blocks per iteration.
; Normally entered at $L$ChaCha20_4x by a jump from $L$ChaCha20_ssse3 with the
; arguments in rdi/rsi/rdx/rcx/r8 and r10 holding the capability qword read
; in ChaCha20_ctr32. The prologue before that label performs the same Win64
; argument shuffle as ChaCha20_ctr32.
499ALIGN	32
500ChaCha20_4x:
501	mov	QWORD[8+rsp],rdi	;WIN64 prologue
502	mov	QWORD[16+rsp],rsi
503	mov	rax,rsp
504$L$SEH_begin_ChaCha20_4x:
505	mov	rdi,rcx
506	mov	rsi,rdx
507	mov	rdx,r8
508	mov	rcx,r9
509	mov	r8,QWORD[40+rsp]
510
511
512$L$ChaCha20_4x:
513
; r9 = entry rsp; used to address the XMM save area and to restore rsp.
514	mov	r9,rsp
515
; r11 keeps the full capability qword; r10 is reduced to its upper dword.
; Bit 5 of that dword selects the 8x path (presumably the AVX2 feature bit).
516	mov	r11,r10
517	shr	r10,32
518	test	r10,32
519	jnz	NEAR $L$ChaCha20_8x
; More than 192 bytes: always use the 4x code.
520	cmp	rdx,192
521	ja	NEAR $L$proceed4x
522
; For 192 bytes or fewer, fall back to the one-block SSSE3 code when, of
; capability bits 22 and 26 (mask 71303168 = 0x4400000), only bit 22
; (4194304 = 0x400000) is set.
; NOTE(review): presumably a CPU-model heuristic - confirm against the
; generator script.
523	and	r11,71303168
524	cmp	r11,4194304
525	je	NEAR $L$do_sse3_after_all
526
527$L$proceed4x:
; Frame: 0x140 bytes of scratch/state at [0..0x13f+rsp] plus 168 bytes in
; which xmm6..xmm15 (callee-saved under the Microsoft x64 convention) are
; preserved, addressed relative to r9.
528	sub	rsp,0x140+168
529	movaps	XMMWORD[(-168)+r9],xmm6
530	movaps	XMMWORD[(-152)+r9],xmm7
531	movaps	XMMWORD[(-136)+r9],xmm8
532	movaps	XMMWORD[(-120)+r9],xmm9
533	movaps	XMMWORD[(-104)+r9],xmm10
534	movaps	XMMWORD[(-88)+r9],xmm11
535	movaps	XMMWORD[(-72)+r9],xmm12
536	movaps	XMMWORD[(-56)+r9],xmm13
537	movaps	XMMWORD[(-40)+r9],xmm14
538	movaps	XMMWORD[(-24)+r9],xmm15
539$L$4x_body:
; Load the four input rows: constants, the 32 bytes at rcx, the 16 bytes at r8.
540	movdqa	xmm11,XMMWORD[$L$sigma]
541	movdqu	xmm15,XMMWORD[rcx]
542	movdqu	xmm7,XMMWORD[16+rcx]
543	movdqu	xmm3,XMMWORD[r8]
; rcx = rsp+256, so an operand written (N-256)+rcx addresses rsp+N.
; r10/r11 = addresses of the PSHUFB rotate masks.
544	lea	rcx,[256+rsp]
545	lea	r10,[$L$rot16]
546	lea	r11,[$L$rot24]
547
; Broadcast every state word across a whole register (pshufd 0x00/0x55/0xaa/
; 0xff pick dword 0/1/2/3) and keep the broadcast copies on the stack:
; words 0..3 at rsp+64..rsp+112 ...
548	pshufd	xmm8,xmm11,0x00
549	pshufd	xmm9,xmm11,0x55
550	movdqa	XMMWORD[64+rsp],xmm8
551	pshufd	xmm10,xmm11,0xaa
552	movdqa	XMMWORD[80+rsp],xmm9
553	pshufd	xmm11,xmm11,0xff
554	movdqa	XMMWORD[96+rsp],xmm10
555	movdqa	XMMWORD[112+rsp],xmm11
556
; ... words 4..7 at rsp+128..rsp+176 ...
557	pshufd	xmm12,xmm15,0x00
558	pshufd	xmm13,xmm15,0x55
559	movdqa	XMMWORD[(128-256)+rcx],xmm12
560	pshufd	xmm14,xmm15,0xaa
561	movdqa	XMMWORD[(144-256)+rcx],xmm13
562	pshufd	xmm15,xmm15,0xff
563	movdqa	XMMWORD[(160-256)+rcx],xmm14
564	movdqa	XMMWORD[(176-256)+rcx],xmm15
565
; ... words 8..11 at rsp+192..rsp+240 ...
566	pshufd	xmm4,xmm7,0x00
567	pshufd	xmm5,xmm7,0x55
568	movdqa	XMMWORD[(192-256)+rcx],xmm4
569	pshufd	xmm6,xmm7,0xaa
570	movdqa	XMMWORD[(208-256)+rcx],xmm5
571	pshufd	xmm7,xmm7,0xff
572	movdqa	XMMWORD[(224-256)+rcx],xmm6
573	movdqa	XMMWORD[(240-256)+rcx],xmm7
574
; ... words 13..15 at rsp+272..rsp+304. Word 12 (xmm0) gets {0,1,2,3} added
; so the four lanes carry consecutive counter values; it is stored to rsp+256
; at $L$oop_enter4x.
575	pshufd	xmm0,xmm3,0x00
576	pshufd	xmm1,xmm3,0x55
577	paddd	xmm0,XMMWORD[$L$inc]
578	pshufd	xmm2,xmm3,0xaa
579	movdqa	XMMWORD[(272-256)+rcx],xmm1
580	pshufd	xmm3,xmm3,0xff
581	movdqa	XMMWORD[(288-256)+rcx],xmm2
582	movdqa	XMMWORD[(304-256)+rcx],xmm3
583
584	jmp	NEAR $L$oop_enter4x
585
; $L$oop_outer4x - start of every 4-block group after the first: reload the
; broadcast input state from the stack and add 4 to each counter lane.
586ALIGN	32
587$L$oop_outer4x:
588	movdqa	xmm8,XMMWORD[64+rsp]
589	movdqa	xmm9,XMMWORD[80+rsp]
590	movdqa	xmm10,XMMWORD[96+rsp]
591	movdqa	xmm11,XMMWORD[112+rsp]
592	movdqa	xmm12,XMMWORD[((128-256))+rcx]
593	movdqa	xmm13,XMMWORD[((144-256))+rcx]
594	movdqa	xmm14,XMMWORD[((160-256))+rcx]
595	movdqa	xmm15,XMMWORD[((176-256))+rcx]
596	movdqa	xmm4,XMMWORD[((192-256))+rcx]
597	movdqa	xmm5,XMMWORD[((208-256))+rcx]
598	movdqa	xmm6,XMMWORD[((224-256))+rcx]
599	movdqa	xmm7,XMMWORD[((240-256))+rcx]
600	movdqa	xmm0,XMMWORD[((256-256))+rcx]
601	movdqa	xmm1,XMMWORD[((272-256))+rcx]
602	movdqa	xmm2,XMMWORD[((288-256))+rcx]
603	movdqa	xmm3,XMMWORD[((304-256))+rcx]
604	paddd	xmm0,XMMWORD[$L$four]
605
; Common entry for the first and later groups.
606$L$oop_enter4x:
; Words 10 and 11 are spilled to [32+rsp]/[48+rsp]: xmm6/xmm7 are needed as
; temporaries and for the rotate masks inside the round loop.
607	movdqa	XMMWORD[32+rsp],xmm6
608	movdqa	XMMWORD[48+rsp],xmm7
; xmm7 = rot16 mask; eax = 10 loop iterations.
609	movdqa	xmm7,XMMWORD[r10]
610	mov	eax,10
; Save this group's counter lanes for the feed-forward and the next group.
611	movdqa	XMMWORD[(256-256)+rcx],xmm0
612	jmp	NEAR $L$oop4x
613
; $L$oop4x - round loop for four blocks in parallel, followed by the
; feed-forward, the transpose back to byte order and the 256-byte XOR.
;
; Lane i of every register belongs to block i. Register roles in the loop:
;   xmm8..xmm11  = words 0..3       xmm12..xmm15 = words 4..7
;   xmm0..xmm3   = words 12..15
;   xmm4,xmm5    = two of words 8..11; the other two are held in
;                  [0+rsp]/[16+rsp] or [32+rsp]/[48+rsp] and swapped in.
;   xmm6,xmm7    = alternately shift temporaries and the PSHUFB rotate masks
;                  reloaded from [r10] (rot16) and [r11] (rot24).
; Rotates by 12 and 7 are built from a shift pair plus por.
614ALIGN	32
615$L$oop4x:
; Column quarter-rounds on (0,4,8,12) and (1,5,9,13).
616	paddd	xmm8,xmm12
617	paddd	xmm9,xmm13
618	pxor	xmm0,xmm8
619	pxor	xmm1,xmm9
; Encoded bytes are: pshufb xmm0,xmm7 / pshufb xmm1,xmm7
620DB	102,15,56,0,199
621DB	102,15,56,0,207
622	paddd	xmm4,xmm0
623	paddd	xmm5,xmm1
624	pxor	xmm12,xmm4
625	pxor	xmm13,xmm5
626	movdqa	xmm6,xmm12
627	pslld	xmm12,12
628	psrld	xmm6,20
629	movdqa	xmm7,xmm13
630	pslld	xmm13,12
631	por	xmm12,xmm6
632	psrld	xmm7,20
633	movdqa	xmm6,XMMWORD[r11]
634	por	xmm13,xmm7
635	paddd	xmm8,xmm12
636	paddd	xmm9,xmm13
637	pxor	xmm0,xmm8
638	pxor	xmm1,xmm9
; Encoded bytes are: pshufb xmm0,xmm6 / pshufb xmm1,xmm6
639DB	102,15,56,0,198
640DB	102,15,56,0,206
641	paddd	xmm4,xmm0
642	paddd	xmm5,xmm1
643	pxor	xmm12,xmm4
644	pxor	xmm13,xmm5
645	movdqa	xmm7,xmm12
646	pslld	xmm12,7
647	psrld	xmm7,25
648	movdqa	xmm6,xmm13
649	pslld	xmm13,7
650	por	xmm12,xmm7
651	psrld	xmm6,25
652	movdqa	xmm7,XMMWORD[r10]
653	por	xmm13,xmm6
; Swap: spill words 8,9 and bring in words 10,11.
654	movdqa	XMMWORD[rsp],xmm4
655	movdqa	XMMWORD[16+rsp],xmm5
656	movdqa	xmm4,XMMWORD[32+rsp]
657	movdqa	xmm5,XMMWORD[48+rsp]
; Column quarter-rounds on (2,6,10,14) and (3,7,11,15).
658	paddd	xmm10,xmm14
659	paddd	xmm11,xmm15
660	pxor	xmm2,xmm10
661	pxor	xmm3,xmm11
; Encoded bytes are: pshufb xmm2,xmm7 / pshufb xmm3,xmm7
662DB	102,15,56,0,215
663DB	102,15,56,0,223
664	paddd	xmm4,xmm2
665	paddd	xmm5,xmm3
666	pxor	xmm14,xmm4
667	pxor	xmm15,xmm5
668	movdqa	xmm6,xmm14
669	pslld	xmm14,12
670	psrld	xmm6,20
671	movdqa	xmm7,xmm15
672	pslld	xmm15,12
673	por	xmm14,xmm6
674	psrld	xmm7,20
675	movdqa	xmm6,XMMWORD[r11]
676	por	xmm15,xmm7
677	paddd	xmm10,xmm14
678	paddd	xmm11,xmm15
679	pxor	xmm2,xmm10
680	pxor	xmm3,xmm11
; Encoded bytes are: pshufb xmm2,xmm6 / pshufb xmm3,xmm6
681DB	102,15,56,0,214
682DB	102,15,56,0,222
683	paddd	xmm4,xmm2
684	paddd	xmm5,xmm3
685	pxor	xmm14,xmm4
686	pxor	xmm15,xmm5
687	movdqa	xmm7,xmm14
688	pslld	xmm14,7
689	psrld	xmm7,25
690	movdqa	xmm6,xmm15
691	pslld	xmm15,7
692	por	xmm14,xmm7
693	psrld	xmm6,25
694	movdqa	xmm7,XMMWORD[r10]
695	por	xmm15,xmm6
; Diagonal quarter-rounds on (0,5,10,15) and (1,6,11,12); xmm4/xmm5 still
; hold words 10,11.
696	paddd	xmm8,xmm13
697	paddd	xmm9,xmm14
698	pxor	xmm3,xmm8
699	pxor	xmm0,xmm9
; Encoded bytes are: pshufb xmm3,xmm7 / pshufb xmm0,xmm7
700DB	102,15,56,0,223
701DB	102,15,56,0,199
702	paddd	xmm4,xmm3
703	paddd	xmm5,xmm0
704	pxor	xmm13,xmm4
705	pxor	xmm14,xmm5
706	movdqa	xmm6,xmm13
707	pslld	xmm13,12
708	psrld	xmm6,20
709	movdqa	xmm7,xmm14
710	pslld	xmm14,12
711	por	xmm13,xmm6
712	psrld	xmm7,20
713	movdqa	xmm6,XMMWORD[r11]
714	por	xmm14,xmm7
715	paddd	xmm8,xmm13
716	paddd	xmm9,xmm14
717	pxor	xmm3,xmm8
718	pxor	xmm0,xmm9
; Encoded bytes are: pshufb xmm3,xmm6 / pshufb xmm0,xmm6
719DB	102,15,56,0,222
720DB	102,15,56,0,198
721	paddd	xmm4,xmm3
722	paddd	xmm5,xmm0
723	pxor	xmm13,xmm4
724	pxor	xmm14,xmm5
725	movdqa	xmm7,xmm13
726	pslld	xmm13,7
727	psrld	xmm7,25
728	movdqa	xmm6,xmm14
729	pslld	xmm14,7
730	por	xmm13,xmm7
731	psrld	xmm6,25
732	movdqa	xmm7,XMMWORD[r10]
733	por	xmm14,xmm6
; Swap back: spill words 10,11 and reload words 8,9.
734	movdqa	XMMWORD[32+rsp],xmm4
735	movdqa	XMMWORD[48+rsp],xmm5
736	movdqa	xmm4,XMMWORD[rsp]
737	movdqa	xmm5,XMMWORD[16+rsp]
; Diagonal quarter-rounds on (2,7,8,13) and (3,4,9,14).
738	paddd	xmm10,xmm15
739	paddd	xmm11,xmm12
740	pxor	xmm1,xmm10
741	pxor	xmm2,xmm11
; Encoded bytes are: pshufb xmm1,xmm7 / pshufb xmm2,xmm7
742DB	102,15,56,0,207
743DB	102,15,56,0,215
744	paddd	xmm4,xmm1
745	paddd	xmm5,xmm2
746	pxor	xmm15,xmm4
747	pxor	xmm12,xmm5
748	movdqa	xmm6,xmm15
749	pslld	xmm15,12
750	psrld	xmm6,20
751	movdqa	xmm7,xmm12
752	pslld	xmm12,12
753	por	xmm15,xmm6
754	psrld	xmm7,20
755	movdqa	xmm6,XMMWORD[r11]
756	por	xmm12,xmm7
757	paddd	xmm10,xmm15
758	paddd	xmm11,xmm12
759	pxor	xmm1,xmm10
760	pxor	xmm2,xmm11
; Encoded bytes are: pshufb xmm1,xmm6 / pshufb xmm2,xmm6
761DB	102,15,56,0,206
762DB	102,15,56,0,214
763	paddd	xmm4,xmm1
764	paddd	xmm5,xmm2
765	pxor	xmm15,xmm4
766	pxor	xmm12,xmm5
767	movdqa	xmm7,xmm15
768	pslld	xmm15,7
769	psrld	xmm7,25
770	movdqa	xmm6,xmm12
771	pslld	xmm12,7
772	por	xmm15,xmm7
773	psrld	xmm6,25
774	movdqa	xmm7,XMMWORD[r10]
775	por	xmm12,xmm6
776	dec	eax
777	jnz	NEAR $L$oop4x
778
; Rounds done. For each group of four words: add the saved broadcast input
; (feed-forward), then transpose the 4x4 dword matrix with punpck* so that
; each register holds 16 consecutive keystream bytes of a single block.
; Words 0..3:
779	paddd	xmm8,XMMWORD[64+rsp]
780	paddd	xmm9,XMMWORD[80+rsp]
781	paddd	xmm10,XMMWORD[96+rsp]
782	paddd	xmm11,XMMWORD[112+rsp]
783
784	movdqa	xmm6,xmm8
785	punpckldq	xmm8,xmm9
786	movdqa	xmm7,xmm10
787	punpckldq	xmm10,xmm11
788	punpckhdq	xmm6,xmm9
789	punpckhdq	xmm7,xmm11
790	movdqa	xmm9,xmm8
791	punpcklqdq	xmm8,xmm10
792	movdqa	xmm11,xmm6
793	punpcklqdq	xmm6,xmm7
794	punpckhqdq	xmm9,xmm10
795	punpckhqdq	xmm11,xmm7
; Words 4..7:
796	paddd	xmm12,XMMWORD[((128-256))+rcx]
797	paddd	xmm13,XMMWORD[((144-256))+rcx]
798	paddd	xmm14,XMMWORD[((160-256))+rcx]
799	paddd	xmm15,XMMWORD[((176-256))+rcx]
800
; Park rows 0 of blocks 0 and 1 on the stack and fetch working words 10,11
; (spilled by the loop) before their slots are reused below.
801	movdqa	XMMWORD[rsp],xmm8
802	movdqa	XMMWORD[16+rsp],xmm9
803	movdqa	xmm8,XMMWORD[32+rsp]
804	movdqa	xmm9,XMMWORD[48+rsp]
805
806	movdqa	xmm10,xmm12
807	punpckldq	xmm12,xmm13
808	movdqa	xmm7,xmm14
809	punpckldq	xmm14,xmm15
810	punpckhdq	xmm10,xmm13
811	punpckhdq	xmm7,xmm15
812	movdqa	xmm13,xmm12
813	punpcklqdq	xmm12,xmm14
814	movdqa	xmm15,xmm10
815	punpcklqdq	xmm10,xmm7
816	punpckhqdq	xmm13,xmm14
817	punpckhqdq	xmm15,xmm7
; Words 8..11:
818	paddd	xmm4,XMMWORD[((192-256))+rcx]
819	paddd	xmm5,XMMWORD[((208-256))+rcx]
820	paddd	xmm8,XMMWORD[((224-256))+rcx]
821	paddd	xmm9,XMMWORD[((240-256))+rcx]
822
; Park rows 0 of blocks 2 and 3.
823	movdqa	XMMWORD[32+rsp],xmm6
824	movdqa	XMMWORD[48+rsp],xmm11
825
826	movdqa	xmm14,xmm4
827	punpckldq	xmm4,xmm5
828	movdqa	xmm7,xmm8
829	punpckldq	xmm8,xmm9
830	punpckhdq	xmm14,xmm5
831	punpckhdq	xmm7,xmm9
832	movdqa	xmm5,xmm4
833	punpcklqdq	xmm4,xmm8
834	movdqa	xmm9,xmm14
835	punpcklqdq	xmm14,xmm7
836	punpckhqdq	xmm5,xmm8
837	punpckhqdq	xmm9,xmm7
; Words 12..15 (the counter lanes saved at rsp+256 are this group's values):
838	paddd	xmm0,XMMWORD[((256-256))+rcx]
839	paddd	xmm1,XMMWORD[((272-256))+rcx]
840	paddd	xmm2,XMMWORD[((288-256))+rcx]
841	paddd	xmm3,XMMWORD[((304-256))+rcx]
842
843	movdqa	xmm8,xmm0
844	punpckldq	xmm0,xmm1
845	movdqa	xmm7,xmm2
846	punpckldq	xmm2,xmm3
847	punpckhdq	xmm8,xmm1
848	punpckhdq	xmm7,xmm3
849	movdqa	xmm1,xmm0
850	punpcklqdq	xmm0,xmm2
851	movdqa	xmm3,xmm8
852	punpcklqdq	xmm8,xmm7
853	punpckhqdq	xmm1,xmm2
854	punpckhqdq	xmm3,xmm7
; Keystream layout at this point (four 16-byte rows per block):
;   block 0: [rsp],    xmm12, xmm4,  xmm0
;   block 1: [16+rsp], xmm13, xmm5,  xmm1
;   block 2: [32+rsp], xmm10, xmm14, xmm8
;   block 3: [48+rsp], xmm15, xmm9,  xmm3
; Fewer than 256 bytes left: handled by $L$tail4x.
855	cmp	rdx,64*4
856	jb	NEAR $L$tail4x
857
; Full group: XOR 256 source bytes with the keystream, 64 bytes at a time.
; xmm6, xmm11, xmm2 and xmm7 carry the source data.
858	movdqu	xmm6,XMMWORD[rsi]
859	movdqu	xmm11,XMMWORD[16+rsi]
860	movdqu	xmm2,XMMWORD[32+rsi]
861	movdqu	xmm7,XMMWORD[48+rsi]
862	pxor	xmm6,XMMWORD[rsp]
863	pxor	xmm11,xmm12
864	pxor	xmm2,xmm4
865	pxor	xmm7,xmm0
866
867	movdqu	XMMWORD[rdi],xmm6
868	movdqu	xmm6,XMMWORD[64+rsi]
869	movdqu	XMMWORD[16+rdi],xmm11
870	movdqu	xmm11,XMMWORD[80+rsi]
871	movdqu	XMMWORD[32+rdi],xmm2
872	movdqu	xmm2,XMMWORD[96+rsi]
873	movdqu	XMMWORD[48+rdi],xmm7
874	movdqu	xmm7,XMMWORD[112+rsi]
875	lea	rsi,[128+rsi]
876	pxor	xmm6,XMMWORD[16+rsp]
877	pxor	xmm11,xmm13
878	pxor	xmm2,xmm5
879	pxor	xmm7,xmm1
880
881	movdqu	XMMWORD[64+rdi],xmm6
882	movdqu	xmm6,XMMWORD[rsi]
883	movdqu	XMMWORD[80+rdi],xmm11
884	movdqu	xmm11,XMMWORD[16+rsi]
885	movdqu	XMMWORD[96+rdi],xmm2
886	movdqu	xmm2,XMMWORD[32+rsi]
887	movdqu	XMMWORD[112+rdi],xmm7
888	lea	rdi,[128+rdi]
889	movdqu	xmm7,XMMWORD[48+rsi]
890	pxor	xmm6,XMMWORD[32+rsp]
891	pxor	xmm11,xmm10
892	pxor	xmm2,xmm14
893	pxor	xmm7,xmm8
894
895	movdqu	XMMWORD[rdi],xmm6
896	movdqu	xmm6,XMMWORD[64+rsi]
897	movdqu	XMMWORD[16+rdi],xmm11
898	movdqu	xmm11,XMMWORD[80+rsi]
899	movdqu	XMMWORD[32+rdi],xmm2
900	movdqu	xmm2,XMMWORD[96+rsi]
901	movdqu	XMMWORD[48+rdi],xmm7
902	movdqu	xmm7,XMMWORD[112+rsi]
903	lea	rsi,[128+rsi]
904	pxor	xmm6,XMMWORD[48+rsp]
905	pxor	xmm11,xmm15
906	pxor	xmm2,xmm9
907	pxor	xmm7,xmm3
908	movdqu	XMMWORD[64+rdi],xmm6
909	movdqu	XMMWORD[80+rdi],xmm11
910	movdqu	XMMWORD[96+rdi],xmm2
911	movdqu	XMMWORD[112+rdi],xmm7
912	lea	rdi,[128+rdi]
913
; Loop while bytes remain.
914	sub	rdx,64*4
915	jnz	NEAR $L$oop_outer4x
916
917	jmp	NEAR $L$done4x
918
; $L$tail4x - fewer than 256 bytes remain. Dispatch on how many whole 64-byte
; blocks are left. The flags of the deciding cmp are still live when the
; target block executes its "je $L$done4x".
919$L$tail4x:
920	cmp	rdx,192
921	jae	NEAR $L$192_or_more4x
922	cmp	rdx,128
923	jae	NEAR $L$128_or_more4x
924	cmp	rdx,64
925	jae	NEAR $L$64_or_more4x
926
927
; Fewer than 64 bytes: r10 = byte index 0. Complete block 0's keystream at
; [0..63+rsp] ([rsp] already holds its first row) for the byte loop.
928	xor	r10,r10
929
930	movdqa	XMMWORD[16+rsp],xmm12
931	movdqa	XMMWORD[32+rsp],xmm4
932	movdqa	XMMWORD[48+rsp],xmm0
933	jmp	NEAR $L$oop_tail4x
934
; $L$64_or_more4x - 64..127 bytes remain: process block 0 in full, then stage
; block 1's keystream on the stack for the byte loop.
935ALIGN	32
936$L$64_or_more4x:
937	movdqu	xmm6,XMMWORD[rsi]
938	movdqu	xmm11,XMMWORD[16+rsi]
939	movdqu	xmm2,XMMWORD[32+rsi]
940	movdqu	xmm7,XMMWORD[48+rsi]
941	pxor	xmm6,XMMWORD[rsp]
942	pxor	xmm11,xmm12
943	pxor	xmm2,xmm4
944	pxor	xmm7,xmm0
945	movdqu	XMMWORD[rdi],xmm6
946	movdqu	XMMWORD[16+rdi],xmm11
947	movdqu	XMMWORD[32+rdi],xmm2
948	movdqu	XMMWORD[48+rdi],xmm7
; Flags still come from "cmp rdx,64" in $L$tail4x (the SSE moves and pxor
; above do not modify them): exactly 64 bytes means we are done.
949	je	NEAR $L$done4x
950
; Copy block 1's keystream rows to [0..63+rsp], advance both pointers by 64,
; reset the byte index and reduce the remaining count.
951	movdqa	xmm6,XMMWORD[16+rsp]
952	lea	rsi,[64+rsi]
953	xor	r10,r10
954	movdqa	XMMWORD[rsp],xmm6
955	movdqa	XMMWORD[16+rsp],xmm13
956	lea	rdi,[64+rdi]
957	movdqa	XMMWORD[32+rsp],xmm5
958	sub	rdx,64
959	movdqa	XMMWORD[48+rsp],xmm1
960	jmp	NEAR $L$oop_tail4x
961
; $L$128_or_more4x - 128..191 bytes remain: process blocks 0 and 1 in full,
; then stage block 2's keystream on the stack for the byte loop.
962ALIGN	32
963$L$128_or_more4x:
964	movdqu	xmm6,XMMWORD[rsi]
965	movdqu	xmm11,XMMWORD[16+rsi]
966	movdqu	xmm2,XMMWORD[32+rsi]
967	movdqu	xmm7,XMMWORD[48+rsi]
968	pxor	xmm6,XMMWORD[rsp]
969	pxor	xmm11,xmm12
970	pxor	xmm2,xmm4
971	pxor	xmm7,xmm0
972
973	movdqu	XMMWORD[rdi],xmm6
974	movdqu	xmm6,XMMWORD[64+rsi]
975	movdqu	XMMWORD[16+rdi],xmm11
976	movdqu	xmm11,XMMWORD[80+rsi]
977	movdqu	XMMWORD[32+rdi],xmm2
978	movdqu	xmm2,XMMWORD[96+rsi]
979	movdqu	XMMWORD[48+rdi],xmm7
980	movdqu	xmm7,XMMWORD[112+rsi]
981	pxor	xmm6,XMMWORD[16+rsp]
982	pxor	xmm11,xmm13
983	pxor	xmm2,xmm5
984	pxor	xmm7,xmm1
985	movdqu	XMMWORD[64+rdi],xmm6
986	movdqu	XMMWORD[80+rdi],xmm11
987	movdqu	XMMWORD[96+rdi],xmm2
988	movdqu	XMMWORD[112+rdi],xmm7
; Flags still come from "cmp rdx,128" in $L$tail4x: exactly 128 bytes means
; we are done.
989	je	NEAR $L$done4x
990
; Copy block 2's keystream rows to [0..63+rsp], advance both pointers by 128,
; reset the byte index and reduce the remaining count.
991	movdqa	xmm6,XMMWORD[32+rsp]
992	lea	rsi,[128+rsi]
993	xor	r10,r10
994	movdqa	XMMWORD[rsp],xmm6
995	movdqa	XMMWORD[16+rsp],xmm10
996	lea	rdi,[128+rdi]
997	movdqa	XMMWORD[32+rsp],xmm14
998	sub	rdx,128
999	movdqa	XMMWORD[48+rsp],xmm8
1000	jmp	NEAR $L$oop_tail4x
1001
; $L$192_or_more4x - 192..255 bytes remain: process blocks 0, 1 and 2 in
; full, then stage block 3's keystream on the stack and fall through into
; the byte loop at $L$oop_tail4x.
1002ALIGN	32
1003$L$192_or_more4x:
1004	movdqu	xmm6,XMMWORD[rsi]
1005	movdqu	xmm11,XMMWORD[16+rsi]
1006	movdqu	xmm2,XMMWORD[32+rsi]
1007	movdqu	xmm7,XMMWORD[48+rsi]
1008	pxor	xmm6,XMMWORD[rsp]
1009	pxor	xmm11,xmm12
1010	pxor	xmm2,xmm4
1011	pxor	xmm7,xmm0
1012
1013	movdqu	XMMWORD[rdi],xmm6
1014	movdqu	xmm6,XMMWORD[64+rsi]
1015	movdqu	XMMWORD[16+rdi],xmm11
1016	movdqu	xmm11,XMMWORD[80+rsi]
1017	movdqu	XMMWORD[32+rdi],xmm2
1018	movdqu	xmm2,XMMWORD[96+rsi]
1019	movdqu	XMMWORD[48+rdi],xmm7
1020	movdqu	xmm7,XMMWORD[112+rsi]
; rsi now points at the third block of this group.
1021	lea	rsi,[128+rsi]
1022	pxor	xmm6,XMMWORD[16+rsp]
1023	pxor	xmm11,xmm13
1024	pxor	xmm2,xmm5
1025	pxor	xmm7,xmm1
1026
1027	movdqu	XMMWORD[64+rdi],xmm6
1028	movdqu	xmm6,XMMWORD[rsi]
1029	movdqu	XMMWORD[80+rdi],xmm11
1030	movdqu	xmm11,XMMWORD[16+rsi]
1031	movdqu	XMMWORD[96+rdi],xmm2
1032	movdqu	xmm2,XMMWORD[32+rsi]
1033	movdqu	XMMWORD[112+rdi],xmm7
; rdi now points at the third block of this group.
1034	lea	rdi,[128+rdi]
1035	movdqu	xmm7,XMMWORD[48+rsi]
1036	pxor	xmm6,XMMWORD[32+rsp]
1037	pxor	xmm11,xmm10
1038	pxor	xmm2,xmm14
1039	pxor	xmm7,xmm8
1040	movdqu	XMMWORD[rdi],xmm6
1041	movdqu	XMMWORD[16+rdi],xmm11
1042	movdqu	XMMWORD[32+rdi],xmm2
1043	movdqu	XMMWORD[48+rdi],xmm7
; Flags still come from "cmp rdx,192" in $L$tail4x (lea, the SSE moves and
; pxor above do not modify them): exactly 192 bytes means we are done.
1044	je	NEAR $L$done4x
1045
; Copy block 3's keystream rows to [0..63+rsp]; the pointers were already
; advanced by 128 above, so only another 64 is added here.
1046	movdqa	xmm6,XMMWORD[48+rsp]
1047	lea	rsi,[64+rsi]
1048	xor	r10,r10
1049	movdqa	XMMWORD[rsp],xmm6
1050	movdqa	XMMWORD[16+rsp],xmm15
1051	lea	rdi,[64+rdi]
1052	movdqa	XMMWORD[32+rsp],xmm9
1053	sub	rdx,192
1054	movdqa	XMMWORD[48+rsp],xmm3
1055
; $L$oop_tail4x - byte loop for the final partial block of the 4x path:
; dst[i] = src[i] ^ stack_keystream[i] for the rdx remaining bytes, with r10
; as the byte index (callers zero it and stage the keystream at [rsp]).
1056$L$oop_tail4x:
1057	movzx	eax,BYTE[r10*1+rsi]
1058	movzx	ecx,BYTE[r10*1+rsp]
; Index is advanced with lea before the store, hence the -1 displacement.
1059	lea	r10,[1+r10]
1060	xor	eax,ecx
1061	mov	BYTE[((-1))+r10*1+rdi],al
1062	dec	rdx
1063	jnz	NEAR $L$oop_tail4x
1064
; $L$done4x - 4x-path epilogue: restore xmm6..xmm15 from the save area below
; r9, reset rsp to its entry value, reload the caller's rdi/rsi from
; [8+rsp]/[16+rsp] and return.
1065$L$done4x:
1066	movaps	xmm6,XMMWORD[((-168))+r9]
1067	movaps	xmm7,XMMWORD[((-152))+r9]
1068	movaps	xmm8,XMMWORD[((-136))+r9]
1069	movaps	xmm9,XMMWORD[((-120))+r9]
1070	movaps	xmm10,XMMWORD[((-104))+r9]
1071	movaps	xmm11,XMMWORD[((-88))+r9]
1072	movaps	xmm12,XMMWORD[((-72))+r9]
1073	movaps	xmm13,XMMWORD[((-56))+r9]
1074	movaps	xmm14,XMMWORD[((-40))+r9]
1075	movaps	xmm15,XMMWORD[((-24))+r9]
1076	lea	rsp,[r9]
1077
1078$L$4x_epilogue:
1079	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1080	mov	rsi,QWORD[16+rsp]
1081	DB	0F3h,0C3h		;repret
1082
1083$L$SEH_end_ChaCha20_4x:
1084
1085ALIGN	32
1086ChaCha20_8x:
1087	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1088	mov	QWORD[16+rsp],rsi
1089	mov	rax,rsp
1090$L$SEH_begin_ChaCha20_8x:
1091	mov	rdi,rcx
1092	mov	rsi,rdx
1093	mov	rdx,r8
1094	mov	rcx,r9
1095	mov	r8,QWORD[40+rsp]
1096
1097
1098$L$ChaCha20_8x:
1099
1100	mov	r9,rsp
1101
1102	sub	rsp,0x280+168
1103	and	rsp,-32
1104	movaps	XMMWORD[(-168)+r9],xmm6
1105	movaps	XMMWORD[(-152)+r9],xmm7
1106	movaps	XMMWORD[(-136)+r9],xmm8
1107	movaps	XMMWORD[(-120)+r9],xmm9
1108	movaps	XMMWORD[(-104)+r9],xmm10
1109	movaps	XMMWORD[(-88)+r9],xmm11
1110	movaps	XMMWORD[(-72)+r9],xmm12
1111	movaps	XMMWORD[(-56)+r9],xmm13
1112	movaps	XMMWORD[(-40)+r9],xmm14
1113	movaps	XMMWORD[(-24)+r9],xmm15
1114$L$8x_body:
1115	vzeroupper
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126	vbroadcasti128	ymm11,XMMWORD[$L$sigma]
1127	vbroadcasti128	ymm3,XMMWORD[rcx]
1128	vbroadcasti128	ymm15,XMMWORD[16+rcx]
1129	vbroadcasti128	ymm7,XMMWORD[r8]
1130	lea	rcx,[256+rsp]
1131	lea	rax,[512+rsp]
1132	lea	r10,[$L$rot16]
1133	lea	r11,[$L$rot24]
1134
1135	vpshufd	ymm8,ymm11,0x00
1136	vpshufd	ymm9,ymm11,0x55
1137	vmovdqa	YMMWORD[(128-256)+rcx],ymm8
1138	vpshufd	ymm10,ymm11,0xaa
1139	vmovdqa	YMMWORD[(160-256)+rcx],ymm9
1140	vpshufd	ymm11,ymm11,0xff
1141	vmovdqa	YMMWORD[(192-256)+rcx],ymm10
1142	vmovdqa	YMMWORD[(224-256)+rcx],ymm11
1143
1144	vpshufd	ymm0,ymm3,0x00
1145	vpshufd	ymm1,ymm3,0x55
1146	vmovdqa	YMMWORD[(256-256)+rcx],ymm0
1147	vpshufd	ymm2,ymm3,0xaa
1148	vmovdqa	YMMWORD[(288-256)+rcx],ymm1
1149	vpshufd	ymm3,ymm3,0xff
1150	vmovdqa	YMMWORD[(320-256)+rcx],ymm2
1151	vmovdqa	YMMWORD[(352-256)+rcx],ymm3
1152
1153	vpshufd	ymm12,ymm15,0x00
1154	vpshufd	ymm13,ymm15,0x55
1155	vmovdqa	YMMWORD[(384-512)+rax],ymm12
1156	vpshufd	ymm14,ymm15,0xaa
1157	vmovdqa	YMMWORD[(416-512)+rax],ymm13
1158	vpshufd	ymm15,ymm15,0xff
1159	vmovdqa	YMMWORD[(448-512)+rax],ymm14
1160	vmovdqa	YMMWORD[(480-512)+rax],ymm15
1161
1162	vpshufd	ymm4,ymm7,0x00
1163	vpshufd	ymm5,ymm7,0x55
1164	vpaddd	ymm4,ymm4,YMMWORD[$L$incy]
1165	vpshufd	ymm6,ymm7,0xaa
1166	vmovdqa	YMMWORD[(544-512)+rax],ymm5
1167	vpshufd	ymm7,ymm7,0xff
1168	vmovdqa	YMMWORD[(576-512)+rax],ymm6
1169	vmovdqa	YMMWORD[(608-512)+rax],ymm7
1170
1171	jmp	NEAR $L$oop_enter8x
1172
1173ALIGN	32
1174$L$oop_outer8x:
1175	vmovdqa	ymm8,YMMWORD[((128-256))+rcx]
1176	vmovdqa	ymm9,YMMWORD[((160-256))+rcx]
1177	vmovdqa	ymm10,YMMWORD[((192-256))+rcx]
1178	vmovdqa	ymm11,YMMWORD[((224-256))+rcx]
1179	vmovdqa	ymm0,YMMWORD[((256-256))+rcx]
1180	vmovdqa	ymm1,YMMWORD[((288-256))+rcx]
1181	vmovdqa	ymm2,YMMWORD[((320-256))+rcx]
1182	vmovdqa	ymm3,YMMWORD[((352-256))+rcx]
1183	vmovdqa	ymm12,YMMWORD[((384-512))+rax]
1184	vmovdqa	ymm13,YMMWORD[((416-512))+rax]
1185	vmovdqa	ymm14,YMMWORD[((448-512))+rax]
1186	vmovdqa	ymm15,YMMWORD[((480-512))+rax]
1187	vmovdqa	ymm4,YMMWORD[((512-512))+rax]
1188	vmovdqa	ymm5,YMMWORD[((544-512))+rax]
1189	vmovdqa	ymm6,YMMWORD[((576-512))+rax]
1190	vmovdqa	ymm7,YMMWORD[((608-512))+rax]
1191	vpaddd	ymm4,ymm4,YMMWORD[$L$eight]
1192
1193$L$oop_enter8x:
1194	vmovdqa	YMMWORD[64+rsp],ymm14
1195	vmovdqa	YMMWORD[96+rsp],ymm15
1196	vbroadcasti128	ymm15,XMMWORD[r10]
1197	vmovdqa	YMMWORD[(512-512)+rax],ymm4
1198	mov	eax,10
1199	jmp	NEAR $L$oop8x
1200
1201ALIGN	32
1202$L$oop8x:
1203	vpaddd	ymm8,ymm8,ymm0
1204	vpxor	ymm4,ymm8,ymm4
1205	vpshufb	ymm4,ymm4,ymm15
1206	vpaddd	ymm9,ymm9,ymm1
1207	vpxor	ymm5,ymm9,ymm5
1208	vpshufb	ymm5,ymm5,ymm15
1209	vpaddd	ymm12,ymm12,ymm4
1210	vpxor	ymm0,ymm12,ymm0
1211	vpslld	ymm14,ymm0,12
1212	vpsrld	ymm0,ymm0,20
1213	vpor	ymm0,ymm14,ymm0
1214	vbroadcasti128	ymm14,XMMWORD[r11]
1215	vpaddd	ymm13,ymm13,ymm5
1216	vpxor	ymm1,ymm13,ymm1
1217	vpslld	ymm15,ymm1,12
1218	vpsrld	ymm1,ymm1,20
1219	vpor	ymm1,ymm15,ymm1
1220	vpaddd	ymm8,ymm8,ymm0
1221	vpxor	ymm4,ymm8,ymm4
1222	vpshufb	ymm4,ymm4,ymm14
1223	vpaddd	ymm9,ymm9,ymm1
1224	vpxor	ymm5,ymm9,ymm5
1225	vpshufb	ymm5,ymm5,ymm14
1226	vpaddd	ymm12,ymm12,ymm4
1227	vpxor	ymm0,ymm12,ymm0
1228	vpslld	ymm15,ymm0,7
1229	vpsrld	ymm0,ymm0,25
1230	vpor	ymm0,ymm15,ymm0
1231	vbroadcasti128	ymm15,XMMWORD[r10]
1232	vpaddd	ymm13,ymm13,ymm5
1233	vpxor	ymm1,ymm13,ymm1
1234	vpslld	ymm14,ymm1,7
1235	vpsrld	ymm1,ymm1,25
1236	vpor	ymm1,ymm14,ymm1
1237	vmovdqa	YMMWORD[rsp],ymm12
1238	vmovdqa	YMMWORD[32+rsp],ymm13
1239	vmovdqa	ymm12,YMMWORD[64+rsp]
1240	vmovdqa	ymm13,YMMWORD[96+rsp]
1241	vpaddd	ymm10,ymm10,ymm2
1242	vpxor	ymm6,ymm10,ymm6
1243	vpshufb	ymm6,ymm6,ymm15
1244	vpaddd	ymm11,ymm11,ymm3
1245	vpxor	ymm7,ymm11,ymm7
1246	vpshufb	ymm7,ymm7,ymm15
1247	vpaddd	ymm12,ymm12,ymm6
1248	vpxor	ymm2,ymm12,ymm2
1249	vpslld	ymm14,ymm2,12
1250	vpsrld	ymm2,ymm2,20
1251	vpor	ymm2,ymm14,ymm2
1252	vbroadcasti128	ymm14,XMMWORD[r11]
1253	vpaddd	ymm13,ymm13,ymm7
1254	vpxor	ymm3,ymm13,ymm3
1255	vpslld	ymm15,ymm3,12
1256	vpsrld	ymm3,ymm3,20
1257	vpor	ymm3,ymm15,ymm3
1258	vpaddd	ymm10,ymm10,ymm2
1259	vpxor	ymm6,ymm10,ymm6
1260	vpshufb	ymm6,ymm6,ymm14
1261	vpaddd	ymm11,ymm11,ymm3
1262	vpxor	ymm7,ymm11,ymm7
1263	vpshufb	ymm7,ymm7,ymm14
1264	vpaddd	ymm12,ymm12,ymm6
1265	vpxor	ymm2,ymm12,ymm2
1266	vpslld	ymm15,ymm2,7
1267	vpsrld	ymm2,ymm2,25
1268	vpor	ymm2,ymm15,ymm2
1269	vbroadcasti128	ymm15,XMMWORD[r10]
1270	vpaddd	ymm13,ymm13,ymm7
1271	vpxor	ymm3,ymm13,ymm3
1272	vpslld	ymm14,ymm3,7
1273	vpsrld	ymm3,ymm3,25
1274	vpor	ymm3,ymm14,ymm3
1275	vpaddd	ymm8,ymm8,ymm1
1276	vpxor	ymm7,ymm8,ymm7
1277	vpshufb	ymm7,ymm7,ymm15
1278	vpaddd	ymm9,ymm9,ymm2
1279	vpxor	ymm4,ymm9,ymm4
1280	vpshufb	ymm4,ymm4,ymm15
1281	vpaddd	ymm12,ymm12,ymm7
1282	vpxor	ymm1,ymm12,ymm1
1283	vpslld	ymm14,ymm1,12
1284	vpsrld	ymm1,ymm1,20
1285	vpor	ymm1,ymm14,ymm1
1286	vbroadcasti128	ymm14,XMMWORD[r11]
1287	vpaddd	ymm13,ymm13,ymm4
1288	vpxor	ymm2,ymm13,ymm2
1289	vpslld	ymm15,ymm2,12
1290	vpsrld	ymm2,ymm2,20
1291	vpor	ymm2,ymm15,ymm2
1292	vpaddd	ymm8,ymm8,ymm1
1293	vpxor	ymm7,ymm8,ymm7
1294	vpshufb	ymm7,ymm7,ymm14
1295	vpaddd	ymm9,ymm9,ymm2
1296	vpxor	ymm4,ymm9,ymm4
1297	vpshufb	ymm4,ymm4,ymm14
1298	vpaddd	ymm12,ymm12,ymm7
1299	vpxor	ymm1,ymm12,ymm1
1300	vpslld	ymm15,ymm1,7
1301	vpsrld	ymm1,ymm1,25
1302	vpor	ymm1,ymm15,ymm1
1303	vbroadcasti128	ymm15,XMMWORD[r10]
1304	vpaddd	ymm13,ymm13,ymm4
1305	vpxor	ymm2,ymm13,ymm2
1306	vpslld	ymm14,ymm2,7
1307	vpsrld	ymm2,ymm2,25
1308	vpor	ymm2,ymm14,ymm2
1309	vmovdqa	YMMWORD[64+rsp],ymm12
1310	vmovdqa	YMMWORD[96+rsp],ymm13
1311	vmovdqa	ymm12,YMMWORD[rsp]
1312	vmovdqa	ymm13,YMMWORD[32+rsp]
1313	vpaddd	ymm10,ymm10,ymm3
1314	vpxor	ymm5,ymm10,ymm5
1315	vpshufb	ymm5,ymm5,ymm15
1316	vpaddd	ymm11,ymm11,ymm0
1317	vpxor	ymm6,ymm11,ymm6
1318	vpshufb	ymm6,ymm6,ymm15
1319	vpaddd	ymm12,ymm12,ymm5
1320	vpxor	ymm3,ymm12,ymm3
1321	vpslld	ymm14,ymm3,12
1322	vpsrld	ymm3,ymm3,20
1323	vpor	ymm3,ymm14,ymm3
1324	vbroadcasti128	ymm14,XMMWORD[r11]
1325	vpaddd	ymm13,ymm13,ymm6
1326	vpxor	ymm0,ymm13,ymm0
1327	vpslld	ymm15,ymm0,12
1328	vpsrld	ymm0,ymm0,20
1329	vpor	ymm0,ymm15,ymm0
1330	vpaddd	ymm10,ymm10,ymm3
1331	vpxor	ymm5,ymm10,ymm5
1332	vpshufb	ymm5,ymm5,ymm14
1333	vpaddd	ymm11,ymm11,ymm0
1334	vpxor	ymm6,ymm11,ymm6
1335	vpshufb	ymm6,ymm6,ymm14
1336	vpaddd	ymm12,ymm12,ymm5
1337	vpxor	ymm3,ymm12,ymm3
1338	vpslld	ymm15,ymm3,7
1339	vpsrld	ymm3,ymm3,25
1340	vpor	ymm3,ymm15,ymm3
1341	vbroadcasti128	ymm15,XMMWORD[r10]
1342	vpaddd	ymm13,ymm13,ymm6
1343	vpxor	ymm0,ymm13,ymm0
1344	vpslld	ymm14,ymm0,7
1345	vpsrld	ymm0,ymm0,25
1346	vpor	ymm0,ymm14,ymm0
1347	dec	eax
1348	jnz	NEAR $L$oop8x
1349
1350	lea	rax,[512+rsp]
1351	vpaddd	ymm8,ymm8,YMMWORD[((128-256))+rcx]
1352	vpaddd	ymm9,ymm9,YMMWORD[((160-256))+rcx]
1353	vpaddd	ymm10,ymm10,YMMWORD[((192-256))+rcx]
1354	vpaddd	ymm11,ymm11,YMMWORD[((224-256))+rcx]
1355
1356	vpunpckldq	ymm14,ymm8,ymm9
1357	vpunpckldq	ymm15,ymm10,ymm11
1358	vpunpckhdq	ymm8,ymm8,ymm9
1359	vpunpckhdq	ymm10,ymm10,ymm11
1360	vpunpcklqdq	ymm9,ymm14,ymm15
1361	vpunpckhqdq	ymm14,ymm14,ymm15
1362	vpunpcklqdq	ymm11,ymm8,ymm10
1363	vpunpckhqdq	ymm8,ymm8,ymm10
1364	vpaddd	ymm0,ymm0,YMMWORD[((256-256))+rcx]
1365	vpaddd	ymm1,ymm1,YMMWORD[((288-256))+rcx]
1366	vpaddd	ymm2,ymm2,YMMWORD[((320-256))+rcx]
1367	vpaddd	ymm3,ymm3,YMMWORD[((352-256))+rcx]
1368
1369	vpunpckldq	ymm10,ymm0,ymm1
1370	vpunpckldq	ymm15,ymm2,ymm3
1371	vpunpckhdq	ymm0,ymm0,ymm1
1372	vpunpckhdq	ymm2,ymm2,ymm3
1373	vpunpcklqdq	ymm1,ymm10,ymm15
1374	vpunpckhqdq	ymm10,ymm10,ymm15
1375	vpunpcklqdq	ymm3,ymm0,ymm2
1376	vpunpckhqdq	ymm0,ymm0,ymm2
1377	vperm2i128	ymm15,ymm9,ymm1,0x20
1378	vperm2i128	ymm1,ymm9,ymm1,0x31
1379	vperm2i128	ymm9,ymm14,ymm10,0x20
1380	vperm2i128	ymm10,ymm14,ymm10,0x31
1381	vperm2i128	ymm14,ymm11,ymm3,0x20
1382	vperm2i128	ymm3,ymm11,ymm3,0x31
1383	vperm2i128	ymm11,ymm8,ymm0,0x20
1384	vperm2i128	ymm0,ymm8,ymm0,0x31
1385	vmovdqa	YMMWORD[rsp],ymm15
1386	vmovdqa	YMMWORD[32+rsp],ymm9
1387	vmovdqa	ymm15,YMMWORD[64+rsp]
1388	vmovdqa	ymm9,YMMWORD[96+rsp]
1389
1390	vpaddd	ymm12,ymm12,YMMWORD[((384-512))+rax]
1391	vpaddd	ymm13,ymm13,YMMWORD[((416-512))+rax]
1392	vpaddd	ymm15,ymm15,YMMWORD[((448-512))+rax]
1393	vpaddd	ymm9,ymm9,YMMWORD[((480-512))+rax]
1394
1395	vpunpckldq	ymm2,ymm12,ymm13
1396	vpunpckldq	ymm8,ymm15,ymm9
1397	vpunpckhdq	ymm12,ymm12,ymm13
1398	vpunpckhdq	ymm15,ymm15,ymm9
1399	vpunpcklqdq	ymm13,ymm2,ymm8
1400	vpunpckhqdq	ymm2,ymm2,ymm8
1401	vpunpcklqdq	ymm9,ymm12,ymm15
1402	vpunpckhqdq	ymm12,ymm12,ymm15
1403	vpaddd	ymm4,ymm4,YMMWORD[((512-512))+rax]
1404	vpaddd	ymm5,ymm5,YMMWORD[((544-512))+rax]
1405	vpaddd	ymm6,ymm6,YMMWORD[((576-512))+rax]
1406	vpaddd	ymm7,ymm7,YMMWORD[((608-512))+rax]
1407
1408	vpunpckldq	ymm15,ymm4,ymm5
1409	vpunpckldq	ymm8,ymm6,ymm7
1410	vpunpckhdq	ymm4,ymm4,ymm5
1411	vpunpckhdq	ymm6,ymm6,ymm7
1412	vpunpcklqdq	ymm5,ymm15,ymm8
1413	vpunpckhqdq	ymm15,ymm15,ymm8
1414	vpunpcklqdq	ymm7,ymm4,ymm6
1415	vpunpckhqdq	ymm4,ymm4,ymm6
1416	vperm2i128	ymm8,ymm13,ymm5,0x20
1417	vperm2i128	ymm5,ymm13,ymm5,0x31
1418	vperm2i128	ymm13,ymm2,ymm15,0x20
1419	vperm2i128	ymm15,ymm2,ymm15,0x31
1420	vperm2i128	ymm2,ymm9,ymm7,0x20
1421	vperm2i128	ymm7,ymm9,ymm7,0x31
1422	vperm2i128	ymm9,ymm12,ymm4,0x20
1423	vperm2i128	ymm4,ymm12,ymm4,0x31
1424	vmovdqa	ymm6,YMMWORD[rsp]
1425	vmovdqa	ymm12,YMMWORD[32+rsp]
1426
1427	cmp	rdx,64*8
1428	jb	NEAR $L$tail8x
1429
1430	vpxor	ymm6,ymm6,YMMWORD[rsi]
1431	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1432	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1433	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1434	lea	rsi,[128+rsi]
1435	vmovdqu	YMMWORD[rdi],ymm6
1436	vmovdqu	YMMWORD[32+rdi],ymm8
1437	vmovdqu	YMMWORD[64+rdi],ymm1
1438	vmovdqu	YMMWORD[96+rdi],ymm5
1439	lea	rdi,[128+rdi]
1440
1441	vpxor	ymm12,ymm12,YMMWORD[rsi]
1442	vpxor	ymm13,ymm13,YMMWORD[32+rsi]
1443	vpxor	ymm10,ymm10,YMMWORD[64+rsi]
1444	vpxor	ymm15,ymm15,YMMWORD[96+rsi]
1445	lea	rsi,[128+rsi]
1446	vmovdqu	YMMWORD[rdi],ymm12
1447	vmovdqu	YMMWORD[32+rdi],ymm13
1448	vmovdqu	YMMWORD[64+rdi],ymm10
1449	vmovdqu	YMMWORD[96+rdi],ymm15
1450	lea	rdi,[128+rdi]
1451
1452	vpxor	ymm14,ymm14,YMMWORD[rsi]
1453	vpxor	ymm2,ymm2,YMMWORD[32+rsi]
1454	vpxor	ymm3,ymm3,YMMWORD[64+rsi]
1455	vpxor	ymm7,ymm7,YMMWORD[96+rsi]
1456	lea	rsi,[128+rsi]
1457	vmovdqu	YMMWORD[rdi],ymm14
1458	vmovdqu	YMMWORD[32+rdi],ymm2
1459	vmovdqu	YMMWORD[64+rdi],ymm3
1460	vmovdqu	YMMWORD[96+rdi],ymm7
1461	lea	rdi,[128+rdi]
1462
1463	vpxor	ymm11,ymm11,YMMWORD[rsi]
1464	vpxor	ymm9,ymm9,YMMWORD[32+rsi]
1465	vpxor	ymm0,ymm0,YMMWORD[64+rsi]
1466	vpxor	ymm4,ymm4,YMMWORD[96+rsi]
1467	lea	rsi,[128+rsi]
1468	vmovdqu	YMMWORD[rdi],ymm11
1469	vmovdqu	YMMWORD[32+rdi],ymm9
1470	vmovdqu	YMMWORD[64+rdi],ymm0
1471	vmovdqu	YMMWORD[96+rdi],ymm4
1472	lea	rdi,[128+rdi]
1473
1474	sub	rdx,64*8
1475	jnz	NEAR $L$oop_outer8x
1476
1477	jmp	NEAR $L$done8x
1478
1479$L$tail8x:
1480	cmp	rdx,448
1481	jae	NEAR $L$448_or_more8x
1482	cmp	rdx,384
1483	jae	NEAR $L$384_or_more8x
1484	cmp	rdx,320
1485	jae	NEAR $L$320_or_more8x
1486	cmp	rdx,256
1487	jae	NEAR $L$256_or_more8x
1488	cmp	rdx,192
1489	jae	NEAR $L$192_or_more8x
1490	cmp	rdx,128
1491	jae	NEAR $L$128_or_more8x
1492	cmp	rdx,64
1493	jae	NEAR $L$64_or_more8x
1494
1495	xor	r10,r10
1496	vmovdqa	YMMWORD[rsp],ymm6
1497	vmovdqa	YMMWORD[32+rsp],ymm8
1498	jmp	NEAR $L$oop_tail8x
1499
1500ALIGN	32
1501$L$64_or_more8x:
1502	vpxor	ymm6,ymm6,YMMWORD[rsi]
1503	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1504	vmovdqu	YMMWORD[rdi],ymm6
1505	vmovdqu	YMMWORD[32+rdi],ymm8
1506	je	NEAR $L$done8x
1507
1508	lea	rsi,[64+rsi]
1509	xor	r10,r10
1510	vmovdqa	YMMWORD[rsp],ymm1
1511	lea	rdi,[64+rdi]
1512	sub	rdx,64
1513	vmovdqa	YMMWORD[32+rsp],ymm5
1514	jmp	NEAR $L$oop_tail8x
1515
1516ALIGN	32
1517$L$128_or_more8x:
1518	vpxor	ymm6,ymm6,YMMWORD[rsi]
1519	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1520	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1521	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1522	vmovdqu	YMMWORD[rdi],ymm6
1523	vmovdqu	YMMWORD[32+rdi],ymm8
1524	vmovdqu	YMMWORD[64+rdi],ymm1
1525	vmovdqu	YMMWORD[96+rdi],ymm5
1526	je	NEAR $L$done8x
1527
1528	lea	rsi,[128+rsi]
1529	xor	r10,r10
1530	vmovdqa	YMMWORD[rsp],ymm12
1531	lea	rdi,[128+rdi]
1532	sub	rdx,128
1533	vmovdqa	YMMWORD[32+rsp],ymm13
1534	jmp	NEAR $L$oop_tail8x
1535
1536ALIGN	32
1537$L$192_or_more8x:
1538	vpxor	ymm6,ymm6,YMMWORD[rsi]
1539	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1540	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1541	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1542	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1543	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1544	vmovdqu	YMMWORD[rdi],ymm6
1545	vmovdqu	YMMWORD[32+rdi],ymm8
1546	vmovdqu	YMMWORD[64+rdi],ymm1
1547	vmovdqu	YMMWORD[96+rdi],ymm5
1548	vmovdqu	YMMWORD[128+rdi],ymm12
1549	vmovdqu	YMMWORD[160+rdi],ymm13
1550	je	NEAR $L$done8x
1551
1552	lea	rsi,[192+rsi]
1553	xor	r10,r10
1554	vmovdqa	YMMWORD[rsp],ymm10
1555	lea	rdi,[192+rdi]
1556	sub	rdx,192
1557	vmovdqa	YMMWORD[32+rsp],ymm15
1558	jmp	NEAR $L$oop_tail8x
1559
1560ALIGN	32
1561$L$256_or_more8x:
1562	vpxor	ymm6,ymm6,YMMWORD[rsi]
1563	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1564	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1565	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1566	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1567	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1568	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1569	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1570	vmovdqu	YMMWORD[rdi],ymm6
1571	vmovdqu	YMMWORD[32+rdi],ymm8
1572	vmovdqu	YMMWORD[64+rdi],ymm1
1573	vmovdqu	YMMWORD[96+rdi],ymm5
1574	vmovdqu	YMMWORD[128+rdi],ymm12
1575	vmovdqu	YMMWORD[160+rdi],ymm13
1576	vmovdqu	YMMWORD[192+rdi],ymm10
1577	vmovdqu	YMMWORD[224+rdi],ymm15
1578	je	NEAR $L$done8x
1579
1580	lea	rsi,[256+rsi]
1581	xor	r10,r10
1582	vmovdqa	YMMWORD[rsp],ymm14
1583	lea	rdi,[256+rdi]
1584	sub	rdx,256
1585	vmovdqa	YMMWORD[32+rsp],ymm2
1586	jmp	NEAR $L$oop_tail8x
1587
1588ALIGN	32
1589$L$320_or_more8x:
1590	vpxor	ymm6,ymm6,YMMWORD[rsi]
1591	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1592	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1593	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1594	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1595	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1596	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1597	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1598	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1599	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1600	vmovdqu	YMMWORD[rdi],ymm6
1601	vmovdqu	YMMWORD[32+rdi],ymm8
1602	vmovdqu	YMMWORD[64+rdi],ymm1
1603	vmovdqu	YMMWORD[96+rdi],ymm5
1604	vmovdqu	YMMWORD[128+rdi],ymm12
1605	vmovdqu	YMMWORD[160+rdi],ymm13
1606	vmovdqu	YMMWORD[192+rdi],ymm10
1607	vmovdqu	YMMWORD[224+rdi],ymm15
1608	vmovdqu	YMMWORD[256+rdi],ymm14
1609	vmovdqu	YMMWORD[288+rdi],ymm2
1610	je	NEAR $L$done8x
1611
1612	lea	rsi,[320+rsi]
1613	xor	r10,r10
1614	vmovdqa	YMMWORD[rsp],ymm3
1615	lea	rdi,[320+rdi]
1616	sub	rdx,320
1617	vmovdqa	YMMWORD[32+rsp],ymm7
1618	jmp	NEAR $L$oop_tail8x
1619
1620ALIGN	32
1621$L$384_or_more8x:
1622	vpxor	ymm6,ymm6,YMMWORD[rsi]
1623	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1624	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1625	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1626	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1627	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1628	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1629	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1630	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1631	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1632	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
1633	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
1634	vmovdqu	YMMWORD[rdi],ymm6
1635	vmovdqu	YMMWORD[32+rdi],ymm8
1636	vmovdqu	YMMWORD[64+rdi],ymm1
1637	vmovdqu	YMMWORD[96+rdi],ymm5
1638	vmovdqu	YMMWORD[128+rdi],ymm12
1639	vmovdqu	YMMWORD[160+rdi],ymm13
1640	vmovdqu	YMMWORD[192+rdi],ymm10
1641	vmovdqu	YMMWORD[224+rdi],ymm15
1642	vmovdqu	YMMWORD[256+rdi],ymm14
1643	vmovdqu	YMMWORD[288+rdi],ymm2
1644	vmovdqu	YMMWORD[320+rdi],ymm3
1645	vmovdqu	YMMWORD[352+rdi],ymm7
1646	je	NEAR $L$done8x
1647
1648	lea	rsi,[384+rsi]
1649	xor	r10,r10
1650	vmovdqa	YMMWORD[rsp],ymm11
1651	lea	rdi,[384+rdi]
1652	sub	rdx,384
1653	vmovdqa	YMMWORD[32+rsp],ymm9
1654	jmp	NEAR $L$oop_tail8x
1655
1656ALIGN	32
1657$L$448_or_more8x:
1658	vpxor	ymm6,ymm6,YMMWORD[rsi]
1659	vpxor	ymm8,ymm8,YMMWORD[32+rsi]
1660	vpxor	ymm1,ymm1,YMMWORD[64+rsi]
1661	vpxor	ymm5,ymm5,YMMWORD[96+rsi]
1662	vpxor	ymm12,ymm12,YMMWORD[128+rsi]
1663	vpxor	ymm13,ymm13,YMMWORD[160+rsi]
1664	vpxor	ymm10,ymm10,YMMWORD[192+rsi]
1665	vpxor	ymm15,ymm15,YMMWORD[224+rsi]
1666	vpxor	ymm14,ymm14,YMMWORD[256+rsi]
1667	vpxor	ymm2,ymm2,YMMWORD[288+rsi]
1668	vpxor	ymm3,ymm3,YMMWORD[320+rsi]
1669	vpxor	ymm7,ymm7,YMMWORD[352+rsi]
1670	vpxor	ymm11,ymm11,YMMWORD[384+rsi]
1671	vpxor	ymm9,ymm9,YMMWORD[416+rsi]
1672	vmovdqu	YMMWORD[rdi],ymm6
1673	vmovdqu	YMMWORD[32+rdi],ymm8
1674	vmovdqu	YMMWORD[64+rdi],ymm1
1675	vmovdqu	YMMWORD[96+rdi],ymm5
1676	vmovdqu	YMMWORD[128+rdi],ymm12
1677	vmovdqu	YMMWORD[160+rdi],ymm13
1678	vmovdqu	YMMWORD[192+rdi],ymm10
1679	vmovdqu	YMMWORD[224+rdi],ymm15
1680	vmovdqu	YMMWORD[256+rdi],ymm14
1681	vmovdqu	YMMWORD[288+rdi],ymm2
1682	vmovdqu	YMMWORD[320+rdi],ymm3
1683	vmovdqu	YMMWORD[352+rdi],ymm7
1684	vmovdqu	YMMWORD[384+rdi],ymm11
1685	vmovdqu	YMMWORD[416+rdi],ymm9
1686	je	NEAR $L$done8x
1687
1688	lea	rsi,[448+rsi]
1689	xor	r10,r10
1690	vmovdqa	YMMWORD[rsp],ymm0
1691	lea	rdi,[448+rdi]
1692	sub	rdx,448
1693	vmovdqa	YMMWORD[32+rsp],ymm4
1694
1695$L$oop_tail8x:
1696	movzx	eax,BYTE[r10*1+rsi]
1697	movzx	ecx,BYTE[r10*1+rsp]
1698	lea	r10,[1+r10]
1699	xor	eax,ecx
1700	mov	BYTE[((-1))+r10*1+rdi],al
1701	dec	rdx
1702	jnz	NEAR $L$oop_tail8x
1703
1704$L$done8x:
1705	vzeroall
1706	movaps	xmm6,XMMWORD[((-168))+r9]
1707	movaps	xmm7,XMMWORD[((-152))+r9]
1708	movaps	xmm8,XMMWORD[((-136))+r9]
1709	movaps	xmm9,XMMWORD[((-120))+r9]
1710	movaps	xmm10,XMMWORD[((-104))+r9]
1711	movaps	xmm11,XMMWORD[((-88))+r9]
1712	movaps	xmm12,XMMWORD[((-72))+r9]
1713	movaps	xmm13,XMMWORD[((-56))+r9]
1714	movaps	xmm14,XMMWORD[((-40))+r9]
1715	movaps	xmm15,XMMWORD[((-24))+r9]
1716	lea	rsp,[r9]
1717
1718$L$8x_epilogue:
1719	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1720	mov	rsi,QWORD[16+rsp]
1721	DB	0F3h,0C3h		;repret
1722
1723$L$SEH_end_ChaCha20_8x:
1724EXTERN	__imp_RtlVirtualUnwind
1725
; se_handler -- language-specific exception handler registered for
; ChaCha20_ctr32 (see $L$SEH_info_ChaCha20_ctr32 in .xdata).
;
; In (as used below; presumably the Win64 language-handler arguments --
; verify against the Microsoft x64 exception-handling documentation):
;   r8 = pointer to a register-context record; the offsets used here
;        (120, 144..176, 216..248) look like Rax, Rbx/Rsp/Rbp/Rsi/Rdi,
;        R12..R15 and Rip of the x64 CONTEXT -- TODO confirm vs winnt.h
;   r9 = pointer to a dispatcher record; fields at 0/8/16/24/40/56 are read
; Out:
;   eax = 1 (presumably ExceptionContinueSearch)
;
; The handler rebuilds the caller-visible rsp, rdi, rsi and, when the fault
; lies inside the function body, rbx/rbp/r12-r15 in the context record,
; copies the record to [40+r9] and calls RtlVirtualUnwind.
; $L$common_seh_tail is also the jump target of ssse3_handler and
; full_handler, so the tail and the epilogue below are shared by all three.
1726ALIGN	16
1727se_handler:
	; Save every register this handler touches, plus flags, then reserve
	; 64 bytes: slots [32..56+rsp] carry the stack arguments of the call below.
1728	push	rsi
1729	push	rdi
1730	push	rbx
1731	push	rbp
1732	push	r12
1733	push	r13
1734	push	r14
1735	push	r15
1736	pushfq
1737	sub	rsp,64
1738
	; rax = context value at +120, rbx = context value at +248 (the address
	; compared against the code labels below).
1739	mov	rax,QWORD[120+r8]
1740	mov	rbx,QWORD[248+r8]
1741
	; Neither rsi nor r11 is consumed on this handler's own path: both are
	; overwritten in $L$common_seh_tail before being read.
1742	mov	rsi,QWORD[8+r9]
1743	mov	r11,QWORD[56+r9]
1744
	; Fault before $L$ctr32_body: the prologue did `mov rax,rsp` on entry, so
	; the saved rax is already the entry stack pointer -- go straight to the tail.
1745	lea	r10,[$L$ctr32_body]
1746	cmp	rbx,r10
1747	jb	NEAR $L$common_seh_tail
1748
	; From here on work from the context value at +152 instead.
1749	mov	rax,QWORD[152+r8]
1750
	; Fault at or after $L$no_data: use that value unmodified (presumably the
	; frame has already been released there -- label not visible in this chunk).
1751	lea	r10,[$L$no_data]
1752	cmp	rbx,r10
1753	jae	NEAR $L$common_seh_tail
1754
	; Inside the body: step over the `sub rsp,64+24` local area and the six
	; 8-byte register pushes (48) made by the ChaCha20_ctr32 prologue.
1755	lea	rax,[((64+24+48))+rax]
1756
	; Reload the pushed rbx, rbp, r12..r15 from just below that address and
	; write them into the context record.
1757	mov	rbx,QWORD[((-8))+rax]
1758	mov	rbp,QWORD[((-16))+rax]
1759	mov	r12,QWORD[((-24))+rax]
1760	mov	r13,QWORD[((-32))+rax]
1761	mov	r14,QWORD[((-40))+rax]
1762	mov	r15,QWORD[((-48))+rax]
1763	mov	QWORD[144+r8],rbx
1764	mov	QWORD[160+r8],rbp
1765	mov	QWORD[216+r8],r12
1766	mov	QWORD[224+r8],r13
1767	mov	QWORD[232+r8],r14
1768	mov	QWORD[240+r8],r15
1769
	; Shared tail. rax = stack pointer as it was on function entry.
	; [8+rax] and [16+rax] are the slots where the WIN64 prologue stored
	; rdi and rsi; publish them together with rax into the context record.
1770$L$common_seh_tail:
1771	mov	rdi,QWORD[8+rax]
1772	mov	rsi,QWORD[16+rax]
1773	mov	QWORD[152+r8],rax
1774	mov	QWORD[168+r8],rsi
1775	mov	QWORD[176+r8],rdi
1776
	; Copy 154 qwords (1232 bytes) from the context record at r8 to the buffer
	; whose pointer is stored at [40+r9]. The DD emits bytes FC F3 48 A5,
	; i.e. `cld` followed by `rep movsq`.
1777	mov	rdi,QWORD[40+r9]
1778	mov	rsi,r8
1779	mov	ecx,154
1780	DD	0xa548f3fc
1781
	; Set up the eight arguments for RtlVirtualUnwind from the dispatcher
	; record: rcx = 0, rdx = [8], r8 = [0], r9 = [16], then on the stack
	; [40], &[56], &[24] and a final 0. (Parameter meanings per the
	; RtlVirtualUnwind documentation -- not restated here.)
1782	mov	rsi,r9
1783	xor	rcx,rcx
1784	mov	rdx,QWORD[8+rsi]
1785	mov	r8,QWORD[rsi]
1786	mov	r9,QWORD[16+rsi]
1787	mov	r10,QWORD[40+rsi]
1788	lea	r11,[56+rsi]
1789	lea	r12,[24+rsi]
1790	mov	QWORD[32+rsp],r10
1791	mov	QWORD[40+rsp],r11
1792	mov	QWORD[48+rsp],r12
1793	mov	QWORD[56+rsp],rcx
1794	call	QWORD[__imp_RtlVirtualUnwind]
1795
	; Return 1 and undo the handler's own frame in reverse push order.
1796	mov	eax,1
1797	add	rsp,64
1798	popfq
1799	pop	r15
1800	pop	r14
1801	pop	r13
1802	pop	r12
1803	pop	rbp
1804	pop	rbx
1805	pop	rdi
1806	pop	rsi
1807	DB	0F3h,0C3h		;repret
1808
1809
1810
; ssse3_handler -- exception handler registered for the SSSE3 code path
; (see $L$SEH_info_ChaCha20_ssse3 in .xdata).
;
; Takes the same r8 (context record) and r9 (dispatcher record) inputs as
; se_handler. Instead of hard-coded labels it reads two image-relative
; 32-bit offsets from the record pointed to by [56+r9] and adds [8+r9] to
; each; the .xdata entry supplies $L$ssse3_body and $L$ssse3_epilogue there.
; When the faulting address lies in [body, epilogue) it copies 4 qwords
; (32 bytes) from 40 bytes below the frame address into the context record
; at +512 (presumably the Xmm6/Xmm7 slots -- TODO confirm vs winnt.h).
; There is no epilogue here: every path ends in se_handler's
; $L$common_seh_tail, which unwinds the frame pushed below.
1811ALIGN	16
1812ssse3_handler:
	; Same register/flags save and 64-byte reservation as se_handler; the
	; shared tail relies on this exact frame shape.
1813	push	rsi
1814	push	rdi
1815	push	rbx
1816	push	rbp
1817	push	r12
1818	push	r13
1819	push	r14
1820	push	r15
1821	pushfq
1822	sub	rsp,64
1823
	; rax = context value at +120, rbx = context value at +248 (fault address).
1824	mov	rax,QWORD[120+r8]
1825	mov	rbx,QWORD[248+r8]
1826
	; rsi = base added to the image-relative offsets, r11 = handler data.
1827	mov	rsi,QWORD[8+r9]
1828	mov	r11,QWORD[56+r9]
1829
	; First handler-data dword + base = start of the protected body.
	; Below it, the saved rax is used as the frame address unchanged.
1830	mov	r10d,DWORD[r11]
1831	lea	r10,[r10*1+rsi]
1832	cmp	rbx,r10
1833	jb	NEAR $L$common_seh_tail
1834
	; Inside or past the body the frame address comes from the context value
	; at +192 (presumably the saved r9, which the visible 8x epilogue uses as
	; its frame base; the SSSE3 body itself is outside this chunk).
1835	mov	rax,QWORD[192+r8]
1836
	; Second handler-data dword + base = start of the epilogue; at or past
	; it there is nothing further to copy.
1837	mov	r10d,DWORD[4+r11]
1838	lea	r10,[r10*1+rsi]
1839	cmp	rbx,r10
1840	jae	NEAR $L$common_seh_tail
1841
	; Copy 4 qwords from [rax-40] to [r8+512]. The DD emits bytes
	; FC F3 48 A5, i.e. `cld` followed by `rep movsq`.
1842	lea	rsi,[((-40))+rax]
1843	lea	rdi,[512+r8]
1844	mov	ecx,4
1845	DD	0xa548f3fc
1846
1847	jmp	NEAR $L$common_seh_tail
1848
1849
1850
; full_handler -- exception handler registered for both the 4x and the 8x
; code paths (see $L$SEH_info_ChaCha20_4x / _8x in .xdata).
;
; Identical in structure to ssse3_handler, but copies 20 qwords (160 bytes)
; starting 168 bytes below the frame address. That matches the ten 16-byte
; slots at -168..-24 from r9 that the 8x epilogue reloads into xmm6..xmm15.
; The destination is the context record at +512 (presumably the Xmm6..Xmm15
; slots -- TODO confirm vs winnt.h).
; The protected range [body, epilogue) comes from the two image-relative
; dwords in the handler data at [56+r9]. Every path ends in se_handler's
; $L$common_seh_tail, which also unwinds the frame pushed below.
1851ALIGN	16
1852full_handler:
	; Same register/flags save and 64-byte reservation as se_handler; the
	; shared tail relies on this exact frame shape.
1853	push	rsi
1854	push	rdi
1855	push	rbx
1856	push	rbp
1857	push	r12
1858	push	r13
1859	push	r14
1860	push	r15
1861	pushfq
1862	sub	rsp,64
1863
	; rax = context value at +120, rbx = context value at +248 (fault address).
1864	mov	rax,QWORD[120+r8]
1865	mov	rbx,QWORD[248+r8]
1866
	; rsi = base added to the image-relative offsets, r11 = handler data.
1867	mov	rsi,QWORD[8+r9]
1868	mov	r11,QWORD[56+r9]
1869
	; Fault before the body label: use the saved rax as the frame address.
1870	mov	r10d,DWORD[r11]
1871	lea	r10,[r10*1+rsi]
1872	cmp	rbx,r10
1873	jb	NEAR $L$common_seh_tail
1874
	; Otherwise take the frame address from the context value at +192
	; (presumably the saved r9, the frame base used by `lea rsp,[r9]`).
1875	mov	rax,QWORD[192+r8]
1876
	; Fault at or after the epilogue label: nothing further to copy.
1877	mov	r10d,DWORD[4+r11]
1878	lea	r10,[r10*1+rsi]
1879	cmp	rbx,r10
1880	jae	NEAR $L$common_seh_tail
1881
	; Copy 20 qwords from [rax-168] to [r8+512]. The DD emits bytes
	; FC F3 48 A5, i.e. `cld` followed by `rep movsq`.
1882	lea	rsi,[((-168))+rax]
1883	lea	rdi,[512+r8]
1884	mov	ecx,20
1885	DD	0xa548f3fc
1886
1887	jmp	NEAR $L$common_seh_tail
1888
1889
; .pdata -- one three-dword record per function, each dword an
; image-relative address (`wrt ..imagebase`): begin label, end label, and
; the matching $L$SEH_info_* record in .xdata. (Presumably the Win64
; RUNTIME_FUNCTION layout -- see the Microsoft x64 exception-handling docs.)
1890section	.pdata rdata align=4
1891ALIGN	4
	; ChaCha20_ctr32 (handled by se_handler)
1892	DD	$L$SEH_begin_ChaCha20_ctr32 wrt ..imagebase
1893	DD	$L$SEH_end_ChaCha20_ctr32 wrt ..imagebase
1894	DD	$L$SEH_info_ChaCha20_ctr32 wrt ..imagebase
1895
	; ChaCha20_ssse3 (handled by ssse3_handler)
1896	DD	$L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase
1897	DD	$L$SEH_end_ChaCha20_ssse3 wrt ..imagebase
1898	DD	$L$SEH_info_ChaCha20_ssse3 wrt ..imagebase
1899
	; ChaCha20_4x (handled by full_handler)
1900	DD	$L$SEH_begin_ChaCha20_4x wrt ..imagebase
1901	DD	$L$SEH_end_ChaCha20_4x wrt ..imagebase
1902	DD	$L$SEH_info_ChaCha20_4x wrt ..imagebase
	; ChaCha20_8x (handled by full_handler)
1903	DD	$L$SEH_begin_ChaCha20_8x wrt ..imagebase
1904	DD	$L$SEH_end_ChaCha20_8x wrt ..imagebase
1905	DD	$L$SEH_info_ChaCha20_8x wrt ..imagebase
; .xdata -- one unwind-info record per .pdata entry. Each record is a
; 4-byte header `9,0,0,0` followed by the image-relative address of the
; handler routine. (Presumably an UNWIND_INFO header with version 1 and the
; exception-handler flag set, and no prologue/unwind codes -- TODO confirm
; against the Microsoft x64 exception-handling docs.)
; The ssse3/4x/8x records append two more image-relative addresses: the body
; and epilogue labels that ssse3_handler / full_handler read as dwords 0 and
; 1 of their handler data. The ctr32 record has none, because se_handler
; compares against $L$ctr32_body / $L$no_data directly.
1906section	.xdata rdata align=8
1907ALIGN	8
1908$L$SEH_info_ChaCha20_ctr32:
1909DB	9,0,0,0
1910	DD	se_handler wrt ..imagebase
1911
1912$L$SEH_info_ChaCha20_ssse3:
1913DB	9,0,0,0
1914	DD	ssse3_handler wrt ..imagebase
	; handler data: [0] = body start, [1] = epilogue start
1915	DD	$L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase
1916
1917$L$SEH_info_ChaCha20_4x:
1918DB	9,0,0,0
1919	DD	full_handler wrt ..imagebase
	; handler data: [0] = body start, [1] = epilogue start
1920	DD	$L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase
1921$L$SEH_info_ChaCha20_8x:
1922DB	9,0,0,0
1923	DD	full_handler wrt ..imagebase
	; handler data: [0] = body start, [1] = epilogue start
1924	DD	$L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase
1925