• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
; NASM source. `default rel` makes memory operands without a base register
; RIP-relative by default.
1default	rel
; The size keywords are defined to nothing, so XMMWORD[...] / YMMWORD[...]
; below assemble as plain [...] memory operands.
2%define XMMWORD
3%define YMMWORD
4%define ZMMWORD
5section	.text code align=64
6
; Capability vector defined elsewhere; the select routines test bit 5 of the
; dword at offset 8 to choose their AVX2 variants.
7EXTERN	OPENSSL_ia32cap_P
8
9
10ALIGN	64
; 256-bit modulus p, least-significant 64-bit limb first:
;   p = 2^256 - 2^224 + 2^192 + 2^96 - 1
; The arithmetic routines load limb 1 (+8) and limb 3 (+24) into registers and
; use the immediates -1 and 0 for limbs 0 and 2.
11$L$poly:
12	DQ	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
13
; Eight-dword vectors of 1, 2 and 3: initial values and increments of the
; per-lane entry counters in the table-select routines.
14$L$One:
15	DD	1,1,1,1,1,1,1,1
16$L$Two:
17	DD	2,2,2,2,2,2,2,2
18$L$Three:
19	DD	3,3,3,3,3,3,3,3
; The value 2^256 - p (limbs least-significant first); per the label this is
; presumably the constant 1 in Montgomery form. Not referenced in the part of
; the file visible here.
20$L$ONE_mont:
21	DQ	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
22
23
24
;-----------------------------------------------------------------------
; ecp_nistz256_neg: modular negation of a 4-limb (256-bit) value.
; ABI:   Win64 entry; rcx = result pointer, rdx = input pointer
;        (copied to rdi/rsi, whose caller values are parked in the
;        caller-provided home slots at [8+rsp] / [16+rsp]).
; Out:   [result] = 0 - a, plus the modulus at $L$poly when the
;        subtraction borrowed (i.e. when a != 0).
; Clobb: rax, rcx, rdx, r8-r11, flags; r12/r13 are saved and restored.
;-----------------------------------------------------------------------
25global	ecp_nistz256_neg
26
27ALIGN	32
28ecp_nistz256_neg:
29	mov	QWORD[8+rsp],rdi	;WIN64 prologue
30	mov	QWORD[16+rsp],rsi
31	mov	rax,rsp
32$L$SEH_begin_ecp_nistz256_neg:
33	mov	rdi,rcx
34	mov	rsi,rdx
35
36
37	push	r12
38	push	r13
39
; r8..r11 = 0 (minuend), r13 = 0 (will capture the borrow).
40	xor	r8,r8
41	xor	r9,r9
42	xor	r10,r10
43	xor	r11,r11
44	xor	r13,r13
45
; r8..r11 = 0 - a; copies of the raw difference are kept in rax, rdx, rcx, r12.
46	sub	r8,QWORD[rsi]
47	sbb	r9,QWORD[8+rsi]
48	sbb	r10,QWORD[16+rsi]
49	mov	rax,r8
50	sbb	r11,QWORD[24+rsi]
51	lea	rsi,[$L$poly]	; rsi now points at the modulus
52	mov	rdx,r9
53	sbb	r13,0	; r13 = -1 if the subtraction borrowed, else 0
54
; Speculatively add the modulus; mov does not disturb the carry chain.
55	add	r8,QWORD[rsi]
56	mov	rcx,r10
57	adc	r9,QWORD[8+rsi]
58	adc	r10,QWORD[16+rsi]
59	mov	r12,r11
60	adc	r11,QWORD[24+rsi]
61	test	r13,r13
62
; No borrow (r13 == 0): keep the un-adjusted difference instead.
63	cmovz	r8,rax
64	cmovz	r9,rdx
65	mov	QWORD[rdi],r8
66	cmovz	r10,rcx
67	mov	QWORD[8+rdi],r9
68	cmovz	r11,r12
69	mov	QWORD[16+rdi],r10
70	mov	QWORD[24+rdi],r11
71
72	pop	r13
73	pop	r12
74	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
75	mov	rsi,QWORD[16+rsp]
76	DB	0F3h,0C3h		;repret
77$L$SEH_end_ecp_nistz256_neg:
78
79
80
81
82
83
;-----------------------------------------------------------------------
; ecp_nistz256_mul_mont: Win64 wrapper around __ecp_nistz256_mul_montq.
; In:    rcx = result pointer, rdx = pointer to operand a,
;        r8  = pointer to operand b (remapped to rdi, rsi, rdx).
; Out:   four limbs written to [result] by the helper.
; Saves rbp, rbx, r12-r15 (all clobbered by the helper) and preserves the
; caller's rdi/rsi through the home slots at [8+rsp] / [16+rsp].
;-----------------------------------------------------------------------
84global	ecp_nistz256_mul_mont
85
86ALIGN	32
87ecp_nistz256_mul_mont:
88	mov	QWORD[8+rsp],rdi	;WIN64 prologue
89	mov	QWORD[16+rsp],rsi
90	mov	rax,rsp
91$L$SEH_begin_ecp_nistz256_mul_mont:
92	mov	rdi,rcx
93	mov	rsi,rdx
94	mov	rdx,r8
95
96
97$L$mul_mont:
98	push	rbp
99	push	rbx
100	push	r12
101	push	r13
102	push	r14
103	push	r15
; Set up the helper's register contract: rbx -> b, rax = b[0],
; r9..r12 = a[0..3], rsi -> a, rdi -> result.
104	mov	rbx,rdx
105	mov	rax,QWORD[rdx]
106	mov	r9,QWORD[rsi]
107	mov	r10,QWORD[8+rsi]
108	mov	r11,QWORD[16+rsi]
109	mov	r12,QWORD[24+rsi]
110
111	call	__ecp_nistz256_mul_montq
112$L$mul_mont_done:
113	pop	r15
114	pop	r14
115	pop	r13
116	pop	r12
117	pop	rbx
118	pop	rbp
119	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
120	mov	rsi,QWORD[16+rsp]
121	DB	0F3h,0C3h		;repret
122$L$SEH_end_ecp_nistz256_mul_mont:
123
124
;-----------------------------------------------------------------------
; __ecp_nistz256_mul_montq: interleaved multiply-and-reduce of two
; 4-limb values modulo the modulus at $L$poly (internal helper).
; In:    rax = b[0], rbx -> b, rsi -> a, r9..r12 = a[0..3], rdi -> result
; Out:   result limbs in r12, r13, r8, r9 and stored at [rdi];
;        r14 = $L$poly limb 1, r15 = $L$poly limb 3 (callers reuse these).
; Clobb: rax, rbx, rcx, rdx, rbp, r8-r15, flags.
; Structure: four rounds, one per limb b[i]. Each round adds a*b[i] to a
; rotating accumulator and then performs one reduction step that adds
; acc0*p. Because p = 2^256 - 2^224 + 2^192 + 2^96 - 1, acc0*p equals
; acc0*2^96 + acc0*p[3]*2^192 - acc0, so the low limb cancels and only a
; shift pair plus one multiply by p[3] is needed.
;-----------------------------------------------------------------------
125ALIGN	32
126__ecp_nistz256_mul_montq:
127
128
; Round 0: accumulator r8, r9, r10, r11, r12 = a * b[0].
129	mov	rbp,rax
130	mul	r9
131	mov	r14,QWORD[(($L$poly+8))]
132	mov	r8,rax
133	mov	rax,rbp
134	mov	r9,rdx
135
136	mul	r10
137	mov	r15,QWORD[(($L$poly+24))]
138	add	r9,rax
139	mov	rax,rbp
140	adc	rdx,0
141	mov	r10,rdx
142
143	mul	r11
144	add	r10,rax
145	mov	rax,rbp
146	adc	rdx,0
147	mov	r11,rdx
148
149	mul	r12
150	add	r11,rax
151	mov	rax,r8
152	adc	rdx,0
153	xor	r13,r13
154	mov	r12,rdx
155
156
157
158
159
160
161
162
163
164
; Reduction step 0: fold limb r8 into r9..r13 (r8 << 32 at limb 1,
; r8 >> 32 at limb 2, r8 * p[3] at limbs 3-4), then recycle r8 as the
; new top limb. The load of b[1] is interleaved with the carry chain.
165	mov	rbp,r8
166	shl	r8,32
167	mul	r15
168	shr	rbp,32
169	add	r9,r8
170	adc	r10,rbp
171	adc	r11,rax
172	mov	rax,QWORD[8+rbx]
173	adc	r12,rdx
174	adc	r13,0
175	xor	r8,r8
176
177
178
; Round 1: accumulator r9, r10, r11, r12, r13, r8 += a * b[1].
179	mov	rbp,rax
180	mul	QWORD[rsi]
181	add	r9,rax
182	mov	rax,rbp
183	adc	rdx,0
184	mov	rcx,rdx
185
186	mul	QWORD[8+rsi]
187	add	r10,rcx
188	adc	rdx,0
189	add	r10,rax
190	mov	rax,rbp
191	adc	rdx,0
192	mov	rcx,rdx
193
194	mul	QWORD[16+rsi]
195	add	r11,rcx
196	adc	rdx,0
197	add	r11,rax
198	mov	rax,rbp
199	adc	rdx,0
200	mov	rcx,rdx
201
202	mul	QWORD[24+rsi]
203	add	r12,rcx
204	adc	rdx,0
205	add	r12,rax
206	mov	rax,r9
207	adc	r13,rdx
208	adc	r8,0
209
210
211
; Reduction step 1: fold limb r9; r9 becomes the new top limb.
212	mov	rbp,r9
213	shl	r9,32
214	mul	r15
215	shr	rbp,32
216	add	r10,r9
217	adc	r11,rbp
218	adc	r12,rax
219	mov	rax,QWORD[16+rbx]
220	adc	r13,rdx
221	adc	r8,0
222	xor	r9,r9
223
224
225
; Round 2: accumulator r10, r11, r12, r13, r8, r9 += a * b[2].
226	mov	rbp,rax
227	mul	QWORD[rsi]
228	add	r10,rax
229	mov	rax,rbp
230	adc	rdx,0
231	mov	rcx,rdx
232
233	mul	QWORD[8+rsi]
234	add	r11,rcx
235	adc	rdx,0
236	add	r11,rax
237	mov	rax,rbp
238	adc	rdx,0
239	mov	rcx,rdx
240
241	mul	QWORD[16+rsi]
242	add	r12,rcx
243	adc	rdx,0
244	add	r12,rax
245	mov	rax,rbp
246	adc	rdx,0
247	mov	rcx,rdx
248
249	mul	QWORD[24+rsi]
250	add	r13,rcx
251	adc	rdx,0
252	add	r13,rax
253	mov	rax,r10
254	adc	r8,rdx
255	adc	r9,0
256
257
258
; Reduction step 2: fold limb r10; r10 becomes the new top limb.
259	mov	rbp,r10
260	shl	r10,32
261	mul	r15
262	shr	rbp,32
263	add	r11,r10
264	adc	r12,rbp
265	adc	r13,rax
266	mov	rax,QWORD[24+rbx]
267	adc	r8,rdx
268	adc	r9,0
269	xor	r10,r10
270
271
272
; Round 3: accumulator r11, r12, r13, r8, r9, r10 += a * b[3].
273	mov	rbp,rax
274	mul	QWORD[rsi]
275	add	r11,rax
276	mov	rax,rbp
277	adc	rdx,0
278	mov	rcx,rdx
279
280	mul	QWORD[8+rsi]
281	add	r12,rcx
282	adc	rdx,0
283	add	r12,rax
284	mov	rax,rbp
285	adc	rdx,0
286	mov	rcx,rdx
287
288	mul	QWORD[16+rsi]
289	add	r13,rcx
290	adc	rdx,0
291	add	r13,rax
292	mov	rax,rbp
293	adc	rdx,0
294	mov	rcx,rdx
295
296	mul	QWORD[24+rsi]
297	add	r8,rcx
298	adc	rdx,0
299	add	r8,rax
300	mov	rax,r11
301	adc	r9,rdx
302	adc	r10,0
303
304
305
; Reduction step 3: fold limb r11. The result is r12, r13, r8, r9 with a
; carry limb in r10; copies of the low limbs are taken (rcx, rbp) for the
; conditional subtraction below.
306	mov	rbp,r11
307	shl	r11,32
308	mul	r15
309	shr	rbp,32
310	add	r12,r11
311	adc	r13,rbp
312	mov	rcx,r12
313	adc	r8,rax
314	adc	r9,rdx
315	mov	rbp,r13
316	adc	r10,0
317
318
319
; Trial subtraction of p (limbs -1, r14, 0, r15) across the 5-limb value.
320	sub	r12,-1
321	mov	rbx,r8
322	sbb	r13,r14
323	sbb	r8,0
324	mov	rdx,r9
325	sbb	r9,r15
326	sbb	r10,0
327
; Borrow means the value was already below p: restore the saved copy.
328	cmovc	r12,rcx
329	cmovc	r13,rbp
330	mov	QWORD[rdi],r12
331	cmovc	r8,rbx
332	mov	QWORD[8+rdi],r13
333	cmovc	r9,rdx
334	mov	QWORD[16+rdi],r8
335	mov	QWORD[24+rdi],r9
336
337	DB	0F3h,0C3h		;repret
338
339
340
341
342
343
344
345
346
;-----------------------------------------------------------------------
; ecp_nistz256_sqr_mont: Win64 wrapper around __ecp_nistz256_sqr_montq.
; In:    rcx = result pointer, rdx = pointer to operand a
;        (remapped to rdi, rsi).
; Out:   four limbs written to [result] by the helper.
; Saves rbp, rbx, r12-r15 and preserves the caller's rdi/rsi through the
; home slots at [8+rsp] / [16+rsp].
;-----------------------------------------------------------------------
347global	ecp_nistz256_sqr_mont
348
349ALIGN	32
350ecp_nistz256_sqr_mont:
351	mov	QWORD[8+rsp],rdi	;WIN64 prologue
352	mov	QWORD[16+rsp],rsi
353	mov	rax,rsp
354$L$SEH_begin_ecp_nistz256_sqr_mont:
355	mov	rdi,rcx
356	mov	rsi,rdx
357
358
359	push	rbp
360	push	rbx
361	push	r12
362	push	r13
363	push	r14
364	push	r15
; Helper contract: rax = a[0], r14 = a[1], r15 = a[2], r8 = a[3],
; rsi -> a, rdi -> result.
365	mov	rax,QWORD[rsi]
366	mov	r14,QWORD[8+rsi]
367	mov	r15,QWORD[16+rsi]
368	mov	r8,QWORD[24+rsi]
369
370	call	__ecp_nistz256_sqr_montq
371$L$sqr_mont_done:
372	pop	r15
373	pop	r14
374	pop	r13
375	pop	r12
376	pop	rbx
377	pop	rbp
378	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
379	mov	rsi,QWORD[16+rsp]
380	DB	0F3h,0C3h		;repret
381$L$SEH_end_ecp_nistz256_sqr_mont:
382
383
;-----------------------------------------------------------------------
; __ecp_nistz256_sqr_montq: square-and-reduce of a 4-limb value modulo
; the modulus at $L$poly (internal helper).
; In:    rax = a[0], r14 = a[1], r15 = a[2], r8 = a[3],
;        rsi -> a (re-read for the diagonal terms), rdi -> result
; Out:   result limbs in r12, r13, r14, r15 and stored at [rdi];
;        rsi = $L$poly limb 1, rbp = $L$poly limb 3 on return
;        (ecp_nistz256_point_double reads both after calling this).
; Clobb: rax, rcx, rdx, rbp, rsi, r8-r15, flags.
;-----------------------------------------------------------------------
384ALIGN	32
385__ecp_nistz256_sqr_montq:
; Off-diagonal products, accumulated into r9..r14:
; a0*a1, a0*a2, a0*a3 ...
386	mov	r13,rax
387	mul	r14
388	mov	r9,rax
389	mov	rax,r15
390	mov	r10,rdx
391
392	mul	r13
393	add	r10,rax
394	mov	rax,r8
395	adc	rdx,0
396	mov	r11,rdx
397
398	mul	r13
399	add	r11,rax
400	mov	rax,r15
401	adc	rdx,0
402	mov	r12,rdx
403
404
; ... a1*a2, a1*a3 ...
405	mul	r14
406	add	r11,rax
407	mov	rax,r8
408	adc	rdx,0
409	mov	rbp,rdx
410
411	mul	r14
412	add	r12,rax
413	mov	rax,r8
414	adc	rdx,0
415	add	r12,rbp
416	mov	r13,rdx
417	adc	r13,0
418
419
; ... a2*a3. r15 is zeroed to receive the carry of the doubling below.
420	mul	r15
421	xor	r15,r15
422	add	r13,rax
423	mov	rax,QWORD[rsi]
424	mov	r14,rdx
425	adc	r14,0
426
; Double the off-diagonal sum (r9..r14, carry into r15).
427	add	r9,r9
428	adc	r10,r10
429	adc	r11,r11
430	adc	r12,r12
431	adc	r13,r13
432	adc	r14,r14
433	adc	r15,0
434
; Add the diagonal squares a[i]^2; the full 8-limb square ends up in
; r8, r9, r10, r11, r12, r13, r14, r15.
435	mul	rax
436	mov	r8,rax
437	mov	rax,QWORD[8+rsi]
438	mov	rcx,rdx
439
440	mul	rax
441	add	r9,rcx
442	adc	r10,rax
443	mov	rax,QWORD[16+rsi]
444	adc	rdx,0
445	mov	rcx,rdx
446
447	mul	rax
448	add	r11,rcx
449	adc	r12,rax
450	mov	rax,QWORD[24+rsi]
451	adc	rdx,0
452	mov	rcx,rdx
453
454	mul	rax
455	add	r13,rcx
456	adc	r14,rax
457	mov	rax,r8
458	adc	r15,rdx
459
; The input pointer is no longer needed: rsi/rbp now hold modulus limbs.
460	mov	rsi,QWORD[(($L$poly+8))]
461	mov	rbp,QWORD[(($L$poly+24))]
462
463
464
465
; Four reduction steps on the low half. Each folds the current lowest limb
; into the next three (limb << 32, limb >> 32, limb * p[3]); the high part
; of the p[3] product is carried forward in rdx.
466	mov	rcx,r8
467	shl	r8,32
468	mul	rbp
469	shr	rcx,32
470	add	r9,r8
471	adc	r10,rcx
472	adc	r11,rax
473	mov	rax,r9
474	adc	rdx,0
475
476
477
478	mov	rcx,r9
479	shl	r9,32
480	mov	r8,rdx
481	mul	rbp
482	shr	rcx,32
483	add	r10,r9
484	adc	r11,rcx
485	adc	r8,rax
486	mov	rax,r10
487	adc	rdx,0
488
489
490
491	mov	rcx,r10
492	shl	r10,32
493	mov	r9,rdx
494	mul	rbp
495	shr	rcx,32
496	add	r11,r10
497	adc	r8,rcx
498	adc	r9,rax
499	mov	rax,r11
500	adc	rdx,0
501
502
503
504	mov	rcx,r11
505	shl	r11,32
506	mov	r10,rdx
507	mul	rbp
508	shr	rcx,32
509	add	r8,r11
510	adc	r9,rcx
511	adc	r10,rax
512	adc	rdx,0
513	xor	r11,r11
514
515
516
; Add the reduced low half (r8, r9, r10, rdx) to the high half; r11 takes
; the carry. Copies are kept for the conditional subtraction.
517	add	r12,r8
518	adc	r13,r9
519	mov	r8,r12
520	adc	r14,r10
521	adc	r15,rdx
522	mov	r9,r13
523	adc	r11,0
524
; Trial subtraction of p (limbs -1, rsi, 0, rbp).
525	sub	r12,-1
526	mov	r10,r14
527	sbb	r13,rsi
528	sbb	r14,0
529	mov	rcx,r15
530	sbb	r15,rbp
531	sbb	r11,0
532
; Borrow: the value was already below p, so restore the saved copy.
533	cmovc	r12,r8
534	cmovc	r13,r9
535	mov	QWORD[rdi],r12
536	cmovc	r14,r10
537	mov	QWORD[8+rdi],r13
538	cmovc	r15,rcx
539	mov	QWORD[16+rdi],r14
540	mov	QWORD[24+rdi],r15
541
542	DB	0F3h,0C3h		;repret
543
544
545
;-----------------------------------------------------------------------
; ecp_nistz256_select_w5: copy one 96-byte entry out of a 16-entry table.
; ABI:   Win64, arguments used in place: rcx = output (96 bytes, may be
;        unaligned), rdx = table (read with movdqa, so it must be
;        16-byte aligned), r8d = 1-based entry number.
; Every entry is loaded and masked on every call, so the memory access
; pattern does not depend on the index. An index matching no counter value
; (e.g. 0) yields an all-zero output.
; Dispatches to $L$avx2_select_w5 when bit 5 of the dword at
; OPENSSL_ia32cap_P+8 is set.
; Clobb: rax, rdx, xmm0-xmm5, flags; xmm6-xmm15 are saved and restored.
;-----------------------------------------------------------------------
546global	ecp_nistz256_select_w5
547
548ALIGN	32
549ecp_nistz256_select_w5:
550	lea	rax,[OPENSSL_ia32cap_P]
551	mov	rax,QWORD[8+rax]
552	test	eax,32
553	jnz	NEAR $L$avx2_select_w5
554	lea	rax,[((-136))+rsp]
555$L$SEH_begin_ecp_nistz256_select_w5:
; Hand-encoded prologue: lea rsp,[rax-0x20] followed by movaps stores of
; xmm6..xmm15 at [rax-0x20] .. [rax+0x70]; mirrored by the restores below.
556DB	0x48,0x8d,0x60,0xe0
557DB	0x0f,0x29,0x70,0xe0
558DB	0x0f,0x29,0x78,0xf0
559DB	0x44,0x0f,0x29,0x00
560DB	0x44,0x0f,0x29,0x48,0x10
561DB	0x44,0x0f,0x29,0x50,0x20
562DB	0x44,0x0f,0x29,0x58,0x30
563DB	0x44,0x0f,0x29,0x60,0x40
564DB	0x44,0x0f,0x29,0x68,0x50
565DB	0x44,0x0f,0x29,0x70,0x60
566DB	0x44,0x0f,0x29,0x78,0x70
567	movdqa	xmm0,XMMWORD[$L$One]	; per-iteration counter increment
568	movd	xmm1,r8d
569
; xmm2..xmm7 accumulate the selected 96 bytes.
570	pxor	xmm2,xmm2
571	pxor	xmm3,xmm3
572	pxor	xmm4,xmm4
573	pxor	xmm5,xmm5
574	pxor	xmm6,xmm6
575	pxor	xmm7,xmm7
576
577	movdqa	xmm8,xmm0	; entry counter, starts at 1
578	pshufd	xmm1,xmm1,0	; broadcast the index to all four dwords
579
580	mov	rax,16	; number of table entries
581$L$select_loop_sse_w5:
582
; xmm15 = all-ones when the counter equals the index, else zero.
583	movdqa	xmm15,xmm8
584	paddd	xmm8,xmm0
585	pcmpeqd	xmm15,xmm1
586
587	movdqa	xmm9,XMMWORD[rdx]
588	movdqa	xmm10,XMMWORD[16+rdx]
589	movdqa	xmm11,XMMWORD[32+rdx]
590	movdqa	xmm12,XMMWORD[48+rdx]
591	movdqa	xmm13,XMMWORD[64+rdx]
592	movdqa	xmm14,XMMWORD[80+rdx]
593	lea	rdx,[96+rdx]
594
; Mask the entry and merge it into the accumulators.
595	pand	xmm9,xmm15
596	pand	xmm10,xmm15
597	por	xmm2,xmm9
598	pand	xmm11,xmm15
599	por	xmm3,xmm10
600	pand	xmm12,xmm15
601	por	xmm4,xmm11
602	pand	xmm13,xmm15
603	por	xmm5,xmm12
604	pand	xmm14,xmm15
605	por	xmm6,xmm13
606	por	xmm7,xmm14
607
608	dec	rax
609	jnz	NEAR $L$select_loop_sse_w5
610
611	movdqu	XMMWORD[rcx],xmm2
612	movdqu	XMMWORD[16+rcx],xmm3
613	movdqu	XMMWORD[32+rcx],xmm4
614	movdqu	XMMWORD[48+rcx],xmm5
615	movdqu	XMMWORD[64+rcx],xmm6
616	movdqu	XMMWORD[80+rcx],xmm7
; Restore the saved vector registers and release the 168-byte frame.
617	movaps	xmm6,XMMWORD[rsp]
618	movaps	xmm7,XMMWORD[16+rsp]
619	movaps	xmm8,XMMWORD[32+rsp]
620	movaps	xmm9,XMMWORD[48+rsp]
621	movaps	xmm10,XMMWORD[64+rsp]
622	movaps	xmm11,XMMWORD[80+rsp]
623	movaps	xmm12,XMMWORD[96+rsp]
624	movaps	xmm13,XMMWORD[112+rsp]
625	movaps	xmm14,XMMWORD[128+rsp]
626	movaps	xmm15,XMMWORD[144+rsp]
627	lea	rsp,[168+rsp]
628$L$SEH_end_ecp_nistz256_select_w5:
629	DB	0F3h,0C3h		;repret
630
631
632
633
;-----------------------------------------------------------------------
; ecp_nistz256_select_w7: copy one 64-byte entry out of a 64-entry table.
; ABI:   Win64, arguments used in place: rcx = output (64 bytes, may be
;        unaligned), rdx = table (read with movdqa, so it must be
;        16-byte aligned), r8d = 1-based entry number.
; Every entry is loaded and masked on every call; an index matching no
; counter value (e.g. 0) yields an all-zero output.
; Dispatches to $L$avx2_select_w7 when bit 5 of the dword at
; OPENSSL_ia32cap_P+8 is set.
; Clobb: rax, rdx, xmm0-xmm5, flags; xmm6-xmm15 are saved and restored.
;-----------------------------------------------------------------------
634global	ecp_nistz256_select_w7
635
636ALIGN	32
637ecp_nistz256_select_w7:
638	lea	rax,[OPENSSL_ia32cap_P]
639	mov	rax,QWORD[8+rax]
640	test	eax,32
641	jnz	NEAR $L$avx2_select_w7
642	lea	rax,[((-136))+rsp]
643$L$SEH_begin_ecp_nistz256_select_w7:
; Hand-encoded prologue: lea rsp,[rax-0x20] followed by movaps stores of
; xmm6..xmm15 at [rax-0x20] .. [rax+0x70]; mirrored by the restores below.
644DB	0x48,0x8d,0x60,0xe0
645DB	0x0f,0x29,0x70,0xe0
646DB	0x0f,0x29,0x78,0xf0
647DB	0x44,0x0f,0x29,0x00
648DB	0x44,0x0f,0x29,0x48,0x10
649DB	0x44,0x0f,0x29,0x50,0x20
650DB	0x44,0x0f,0x29,0x58,0x30
651DB	0x44,0x0f,0x29,0x60,0x40
652DB	0x44,0x0f,0x29,0x68,0x50
653DB	0x44,0x0f,0x29,0x70,0x60
654DB	0x44,0x0f,0x29,0x78,0x70
655	movdqa	xmm8,XMMWORD[$L$One]	; entry counter, starts at 1
656	movd	xmm1,r8d
657
; xmm2..xmm5 accumulate the selected 64 bytes.
658	pxor	xmm2,xmm2
659	pxor	xmm3,xmm3
660	pxor	xmm4,xmm4
661	pxor	xmm5,xmm5
662
663	movdqa	xmm0,xmm8	; per-iteration counter increment (1)
664	pshufd	xmm1,xmm1,0	; broadcast the index to all four dwords
665	mov	rax,64	; number of table entries
666
667$L$select_loop_sse_w7:
; xmm15 = all-ones when the counter equals the index, else zero.
668	movdqa	xmm15,xmm8
669	paddd	xmm8,xmm0
670	movdqa	xmm9,XMMWORD[rdx]
671	movdqa	xmm10,XMMWORD[16+rdx]
672	pcmpeqd	xmm15,xmm1
673	movdqa	xmm11,XMMWORD[32+rdx]
674	movdqa	xmm12,XMMWORD[48+rdx]
675	lea	rdx,[64+rdx]
676
677	pand	xmm9,xmm15
678	pand	xmm10,xmm15
679	por	xmm2,xmm9
680	pand	xmm11,xmm15
681	por	xmm3,xmm10
682	pand	xmm12,xmm15
683	por	xmm4,xmm11
684	prefetcht0	[255+rdx]	; prefetch ahead of the already-advanced table pointer
685	por	xmm5,xmm12
686
687	dec	rax
688	jnz	NEAR $L$select_loop_sse_w7
689
690	movdqu	XMMWORD[rcx],xmm2
691	movdqu	XMMWORD[16+rcx],xmm3
692	movdqu	XMMWORD[32+rcx],xmm4
693	movdqu	XMMWORD[48+rcx],xmm5
; Restore the saved vector registers and release the 168-byte frame.
694	movaps	xmm6,XMMWORD[rsp]
695	movaps	xmm7,XMMWORD[16+rsp]
696	movaps	xmm8,XMMWORD[32+rsp]
697	movaps	xmm9,XMMWORD[48+rsp]
698	movaps	xmm10,XMMWORD[64+rsp]
699	movaps	xmm11,XMMWORD[80+rsp]
700	movaps	xmm12,XMMWORD[96+rsp]
701	movaps	xmm13,XMMWORD[112+rsp]
702	movaps	xmm14,XMMWORD[128+rsp]
703	movaps	xmm15,XMMWORD[144+rsp]
704	lea	rsp,[168+rsp]
705$L$SEH_end_ecp_nistz256_select_w7:
706	DB	0F3h,0C3h		;repret
707
708
709
710
;-----------------------------------------------------------------------
; ecp_nistz256_avx2_select_w5: AVX2 variant of ecp_nistz256_select_w5
; (not exported; reached via $L$avx2_select_w5 from that routine).
; In:    rcx = output (96 bytes, may be unaligned), rdx = table (read
;        with vmovdqa ymm loads, so it must be 32-byte aligned),
;        r8d = 1-based entry number.
; Processes two 96-byte entries per iteration, 8 iterations = 16 entries.
; Every entry is loaded and masked on every call.
; Clobb: rax, rdx, ymm0-ymm5, flags; xmm6-xmm15 are saved and restored.
;-----------------------------------------------------------------------
711ALIGN	32
712ecp_nistz256_avx2_select_w5:
713$L$avx2_select_w5:
714	vzeroupper
715	lea	rax,[((-136))+rsp]
716$L$SEH_begin_ecp_nistz256_avx2_select_w5:
; Hand-encoded prologue: lea rsp,[rax-0x20] followed by VEX-encoded
; vmovaps stores of xmm6..xmm15 at [rax-0x20] .. [rax+0x70].
717DB	0x48,0x8d,0x60,0xe0
718DB	0xc5,0xf8,0x29,0x70,0xe0
719DB	0xc5,0xf8,0x29,0x78,0xf0
720DB	0xc5,0x78,0x29,0x40,0x00
721DB	0xc5,0x78,0x29,0x48,0x10
722DB	0xc5,0x78,0x29,0x50,0x20
723DB	0xc5,0x78,0x29,0x58,0x30
724DB	0xc5,0x78,0x29,0x60,0x40
725DB	0xc5,0x78,0x29,0x68,0x50
726DB	0xc5,0x78,0x29,0x70,0x60
727DB	0xc5,0x78,0x29,0x78,0x70
728	vmovdqa	ymm0,YMMWORD[$L$Two]	; counter increment: two entries per iteration
729
; ymm2..ymm4 accumulate the selected 96 bytes.
730	vpxor	ymm2,ymm2,ymm2
731	vpxor	ymm3,ymm3,ymm3
732	vpxor	ymm4,ymm4,ymm4
733
734	vmovdqa	ymm5,YMMWORD[$L$One]	; counter for the first entry of each pair (1,3,5,...)
735	vmovdqa	ymm10,YMMWORD[$L$Two]	; counter for the second entry of each pair (2,4,6,...)
736
737	vmovd	xmm1,r8d
738	vpermd	ymm1,ymm2,ymm1	; ymm2 is all zero here, so this broadcasts dword 0 (the index)
739
740	mov	rax,8
741$L$select_loop_avx2_w5:
742
743	vmovdqa	ymm6,YMMWORD[rdx]
744	vmovdqa	ymm7,YMMWORD[32+rdx]
745	vmovdqa	ymm8,YMMWORD[64+rdx]
746
747	vmovdqa	ymm11,YMMWORD[96+rdx]
748	vmovdqa	ymm12,YMMWORD[128+rdx]
749	vmovdqa	ymm13,YMMWORD[160+rdx]
750
; Masks: all-ones where the respective counter equals the index.
751	vpcmpeqd	ymm9,ymm5,ymm1
752	vpcmpeqd	ymm14,ymm10,ymm1
753
754	vpaddd	ymm5,ymm5,ymm0
755	vpaddd	ymm10,ymm10,ymm0
756	lea	rdx,[192+rdx]
757
758	vpand	ymm6,ymm6,ymm9
759	vpand	ymm7,ymm7,ymm9
760	vpand	ymm8,ymm8,ymm9
761	vpand	ymm11,ymm11,ymm14
762	vpand	ymm12,ymm12,ymm14
763	vpand	ymm13,ymm13,ymm14
764
; The counters never repeat, so at most one masked entry is non-zero over
; the whole loop and XOR accumulation merges it unchanged.
765	vpxor	ymm2,ymm2,ymm6
766	vpxor	ymm3,ymm3,ymm7
767	vpxor	ymm4,ymm4,ymm8
768	vpxor	ymm2,ymm2,ymm11
769	vpxor	ymm3,ymm3,ymm12
770	vpxor	ymm4,ymm4,ymm13
771
772	dec	rax
773	jnz	NEAR $L$select_loop_avx2_w5
774
775	vmovdqu	YMMWORD[rcx],ymm2
776	vmovdqu	YMMWORD[32+rcx],ymm3
777	vmovdqu	YMMWORD[64+rcx],ymm4
778	vzeroupper	; clear upper ymm halves before the legacy-SSE restores below
779	movaps	xmm6,XMMWORD[rsp]
780	movaps	xmm7,XMMWORD[16+rsp]
781	movaps	xmm8,XMMWORD[32+rsp]
782	movaps	xmm9,XMMWORD[48+rsp]
783	movaps	xmm10,XMMWORD[64+rsp]
784	movaps	xmm11,XMMWORD[80+rsp]
785	movaps	xmm12,XMMWORD[96+rsp]
786	movaps	xmm13,XMMWORD[112+rsp]
787	movaps	xmm14,XMMWORD[128+rsp]
788	movaps	xmm15,XMMWORD[144+rsp]
789	lea	rsp,[168+rsp]
790$L$SEH_end_ecp_nistz256_avx2_select_w5:
791	DB	0F3h,0C3h		;repret
792
793
794
795
;-----------------------------------------------------------------------
; ecp_nistz256_avx2_select_w7: AVX2 variant of ecp_nistz256_select_w7;
; exported, and also reached via $L$avx2_select_w7 from that routine.
; In:    rcx = output (64 bytes, may be unaligned), rdx = table (read
;        with vmovdqa ymm loads, so it must be 32-byte aligned),
;        r8d = 1-based entry number.
; Processes three 64-byte entries per iteration for 21 iterations
; (entries 1..63) and handles entry 64 after the loop.
; Every entry is loaded and masked on every call.
; Clobb: rax, rdx, ymm0-ymm5, flags; xmm6-xmm15 are saved and restored.
;-----------------------------------------------------------------------
796global	ecp_nistz256_avx2_select_w7
797
798ALIGN	32
799ecp_nistz256_avx2_select_w7:
800$L$avx2_select_w7:
801	vzeroupper
802	lea	rax,[((-136))+rsp]
803$L$SEH_begin_ecp_nistz256_avx2_select_w7:
; Hand-encoded prologue: lea rsp,[rax-0x20] followed by VEX-encoded
; vmovaps stores of xmm6..xmm15 at [rax-0x20] .. [rax+0x70].
804DB	0x48,0x8d,0x60,0xe0
805DB	0xc5,0xf8,0x29,0x70,0xe0
806DB	0xc5,0xf8,0x29,0x78,0xf0
807DB	0xc5,0x78,0x29,0x40,0x00
808DB	0xc5,0x78,0x29,0x48,0x10
809DB	0xc5,0x78,0x29,0x50,0x20
810DB	0xc5,0x78,0x29,0x58,0x30
811DB	0xc5,0x78,0x29,0x60,0x40
812DB	0xc5,0x78,0x29,0x68,0x50
813DB	0xc5,0x78,0x29,0x70,0x60
814DB	0xc5,0x78,0x29,0x78,0x70
815	vmovdqa	ymm0,YMMWORD[$L$Three]	; counter increment: three entries per iteration
816
; ymm2/ymm3 accumulate the selected 64 bytes.
817	vpxor	ymm2,ymm2,ymm2
818	vpxor	ymm3,ymm3,ymm3
819
; Counters for the three entries handled per iteration: 1, 2, 3 (+3 each time).
820	vmovdqa	ymm4,YMMWORD[$L$One]
821	vmovdqa	ymm8,YMMWORD[$L$Two]
822	vmovdqa	ymm12,YMMWORD[$L$Three]
823
824	vmovd	xmm1,r8d
825	vpermd	ymm1,ymm2,ymm1	; ymm2 is all zero here, so this broadcasts dword 0 (the index)
826
827
828	mov	rax,21
829$L$select_loop_avx2_w7:
830
831	vmovdqa	ymm5,YMMWORD[rdx]
832	vmovdqa	ymm6,YMMWORD[32+rdx]
833
834	vmovdqa	ymm9,YMMWORD[64+rdx]
835	vmovdqa	ymm10,YMMWORD[96+rdx]
836
837	vmovdqa	ymm13,YMMWORD[128+rdx]
838	vmovdqa	ymm14,YMMWORD[160+rdx]
839
; Masks: all-ones where the respective counter equals the index.
840	vpcmpeqd	ymm7,ymm4,ymm1
841	vpcmpeqd	ymm11,ymm8,ymm1
842	vpcmpeqd	ymm15,ymm12,ymm1
843
844	vpaddd	ymm4,ymm4,ymm0
845	vpaddd	ymm8,ymm8,ymm0
846	vpaddd	ymm12,ymm12,ymm0
847	lea	rdx,[192+rdx]
848
849	vpand	ymm5,ymm5,ymm7
850	vpand	ymm6,ymm6,ymm7
851	vpand	ymm9,ymm9,ymm11
852	vpand	ymm10,ymm10,ymm11
853	vpand	ymm13,ymm13,ymm15
854	vpand	ymm14,ymm14,ymm15
855
; At most one masked entry is non-zero, so XOR merges it unchanged.
856	vpxor	ymm2,ymm2,ymm5
857	vpxor	ymm3,ymm3,ymm6
858	vpxor	ymm2,ymm2,ymm9
859	vpxor	ymm3,ymm3,ymm10
860	vpxor	ymm2,ymm2,ymm13
861	vpxor	ymm3,ymm3,ymm14
862
863	dec	rax
864	jnz	NEAR $L$select_loop_avx2_w7
865
866
; Tail: the 64th entry. ymm4 has advanced to 1 + 21*3 = 64 at this point.
867	vmovdqa	ymm5,YMMWORD[rdx]
868	vmovdqa	ymm6,YMMWORD[32+rdx]
869
870	vpcmpeqd	ymm7,ymm4,ymm1
871
872	vpand	ymm5,ymm5,ymm7
873	vpand	ymm6,ymm6,ymm7
874
875	vpxor	ymm2,ymm2,ymm5
876	vpxor	ymm3,ymm3,ymm6
877
878	vmovdqu	YMMWORD[rcx],ymm2
879	vmovdqu	YMMWORD[32+rcx],ymm3
880	vzeroupper	; clear upper ymm halves before the legacy-SSE restores below
881	movaps	xmm6,XMMWORD[rsp]
882	movaps	xmm7,XMMWORD[16+rsp]
883	movaps	xmm8,XMMWORD[32+rsp]
884	movaps	xmm9,XMMWORD[48+rsp]
885	movaps	xmm10,XMMWORD[64+rsp]
886	movaps	xmm11,XMMWORD[80+rsp]
887	movaps	xmm12,XMMWORD[96+rsp]
888	movaps	xmm13,XMMWORD[112+rsp]
889	movaps	xmm14,XMMWORD[128+rsp]
890	movaps	xmm15,XMMWORD[144+rsp]
891	lea	rsp,[168+rsp]
892$L$SEH_end_ecp_nistz256_avx2_select_w7:
893	DB	0F3h,0C3h		;repret
894
895
;-----------------------------------------------------------------------
; __ecp_nistz256_add_toq: modular addition (internal helper).
; In:    r12, r13, r8, r9 = a (limbs 0..3), rbx -> b, rdi -> result,
;        r14 / r15 = modulus limbs 1 / 3 (limbs 0 and 2 are the
;        immediates -1 and 0, matching $L$poly).
; Out:   a + b, minus the modulus when that does not borrow; returned in
;        r12, r13, r8, r9 and stored at [rdi].
; Clobb: rax, rbp, rcx, r10, r11, flags.
;-----------------------------------------------------------------------
896ALIGN	32
897__ecp_nistz256_add_toq:
898	xor	r11,r11
899	add	r12,QWORD[rbx]
900	adc	r13,QWORD[8+rbx]
901	mov	rax,r12
902	adc	r8,QWORD[16+rbx]
903	adc	r9,QWORD[24+rbx]
904	mov	rbp,r13
905	adc	r11,0	; r11 = carry out of the 256-bit sum
906
; Trial subtraction of the modulus; rax, rbp, rcx, r10 keep the raw sum.
907	sub	r12,-1
908	mov	rcx,r8
909	sbb	r13,r14
910	sbb	r8,0
911	mov	r10,r9
912	sbb	r9,r15
913	sbb	r11,0
914
; Borrow: the raw sum was already below the modulus, so restore it.
915	cmovc	r12,rax
916	cmovc	r13,rbp
917	mov	QWORD[rdi],r12
918	cmovc	r8,rcx
919	mov	QWORD[8+rdi],r13
920	cmovc	r9,r10
921	mov	QWORD[16+rdi],r8
922	mov	QWORD[24+rdi],r9
923
924	DB	0F3h,0C3h		;repret
925
926
927
;-----------------------------------------------------------------------
; __ecp_nistz256_sub_fromq: modular subtraction with a memory
; subtrahend (internal helper).
; In:    r12, r13, r8, r9 = a (limbs 0..3), rbx -> b, rdi -> result,
;        r14 / r15 = modulus limbs 1 / 3.
; Out:   a - b, plus the modulus when the subtraction borrowed; returned
;        in r12, r13, r8, r9 and stored at [rdi].
; Clobb: rax, rbp, rcx, r10, r11, flags.
;-----------------------------------------------------------------------
928ALIGN	32
929__ecp_nistz256_sub_fromq:
930	sub	r12,QWORD[rbx]
931	sbb	r13,QWORD[8+rbx]
932	mov	rax,r12
933	sbb	r8,QWORD[16+rbx]
934	sbb	r9,QWORD[24+rbx]
935	mov	rbp,r13
936	sbb	r11,r11	; r11 = -1 if the subtraction borrowed, else 0
937
; Speculatively add the modulus; rax, rbp, rcx, r10 keep the raw difference.
938	add	r12,-1
939	mov	rcx,r8
940	adc	r13,r14
941	adc	r8,0
942	mov	r10,r9
943	adc	r9,r15
944	test	r11,r11
945
; No borrow: keep the raw difference.
946	cmovz	r12,rax
947	cmovz	r13,rbp
948	mov	QWORD[rdi],r12
949	cmovz	r8,rcx
950	mov	QWORD[8+rdi],r13
951	cmovz	r9,r10
952	mov	QWORD[16+rdi],r8
953	mov	QWORD[24+rdi],r9
954
955	DB	0F3h,0C3h		;repret
956
957
958
;-----------------------------------------------------------------------
; __ecp_nistz256_subq: register-only modular subtraction (internal
; helper).
; In:    rax, rbp, rcx, r10 = minuend (limbs 0..3),
;        r12, r13, r8, r9   = subtrahend (limbs 0..3),
;        r14 / r15 = modulus limbs 1 / 3.
; Out:   minuend - subtrahend, plus the modulus when the subtraction
;        borrowed, in r12, r13, r8, r9. Nothing is written to memory;
;        callers store the result themselves when they need it.
; Clobb: rax, rbp, rcx, r10, r11, flags.
;-----------------------------------------------------------------------
959ALIGN	32
960__ecp_nistz256_subq:
961	sub	rax,r12
962	sbb	rbp,r13
963	mov	r12,rax
964	sbb	rcx,r8
965	sbb	r10,r9
966	mov	r13,rbp
967	sbb	r11,r11	; r11 = -1 if the subtraction borrowed, else 0
968
; r12, r13, r8, r9 take the raw difference while rax, rbp, rcx, r10 become
; the difference plus the modulus.
969	add	rax,-1
970	mov	r8,rcx
971	adc	rbp,r14
972	adc	rcx,0
973	mov	r9,r10
974	adc	r10,r15
975	test	r11,r11
976
; Borrow: select the modulus-corrected value.
977	cmovnz	r12,rax
978	cmovnz	r13,rbp
979	cmovnz	r8,rcx
980	cmovnz	r9,r10
981
982	DB	0F3h,0C3h		;repret
983
984
985
;-----------------------------------------------------------------------
; __ecp_nistz256_mul_by_2q: modular doubling (internal helper).
; In:    r12, r13, r8, r9 = a (limbs 0..3), rdi -> result,
;        r14 / r15 = modulus limbs 1 / 3.
; Out:   2*a, minus the modulus when that does not borrow; returned in
;        r12, r13, r8, r9 and stored at [rdi].
; Clobb: rax, rbp, rcx, r10, r11, flags.
;-----------------------------------------------------------------------
986ALIGN	32
987__ecp_nistz256_mul_by_2q:
988	xor	r11,r11
989	add	r12,r12
990	adc	r13,r13
991	mov	rax,r12
992	adc	r8,r8
993	adc	r9,r9
994	mov	rbp,r13
995	adc	r11,0	; r11 = bit shifted out of the top limb
996
; Trial subtraction of the modulus; rax, rbp, rcx, r10 keep the raw double.
997	sub	r12,-1
998	mov	rcx,r8
999	sbb	r13,r14
1000	sbb	r8,0
1001	mov	r10,r9
1002	sbb	r9,r15
1003	sbb	r11,0
1004
; Borrow: the raw double was already below the modulus, so restore it.
1005	cmovc	r12,rax
1006	cmovc	r13,rbp
1007	mov	QWORD[rdi],r12
1008	cmovc	r8,rcx
1009	mov	QWORD[8+rdi],r13
1010	cmovc	r9,r10
1011	mov	QWORD[16+rdi],r8
1012	mov	QWORD[24+rdi],r9
1013
1014	DB	0F3h,0C3h		;repret
1015
;-----------------------------------------------------------------------
; ecp_nistz256_point_double: point doubling built from the field helpers.
; ABI:   Win64; rcx = result point, rdx = input point (remapped to
;        rdi / rsi). A point is three 32-byte field elements at offsets
;        0, 32 and 64, called x, y, z below (naming assumed).
; Frame: 32*5+8 bytes; five 32-byte scratch slots at rsp+0, +32, +64,
;        +96 and +128. Given an ABI-aligned entry, the six pushes plus
;        the extra 8 bytes leave rsp 16-byte aligned for the movdqa
;        stores.
; xmm0 / xmm1 / xmm2 hold result+0 / result+32 / result+64 across the
; helper calls (the helpers clobber rdi's role as output pointer).
; $L$point_double_shortcutq is also entered from ecp_nistz256_point_add
; ($L$add_doubleq) after that routine shrinks its larger frame.
; All "products" below are outputs of __ecp_nistz256_mul_montq /
; __ecp_nistz256_sqr_montq.
;-----------------------------------------------------------------------
1016global	ecp_nistz256_point_double
1017
1018ALIGN	32
1019ecp_nistz256_point_double:
1020	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1021	mov	QWORD[16+rsp],rsi
1022	mov	rax,rsp
1023$L$SEH_begin_ecp_nistz256_point_double:
1024	mov	rdi,rcx
1025	mov	rsi,rdx
1026
1027
1028	push	rbp
1029	push	rbx
1030	push	r12
1031	push	r13
1032	push	r14
1033	push	r15
1034	sub	rsp,32*5+8
1035
1036$L$point_double_shortcutq:
; Copy in.x to slot 96, load in.y into r12, r13, r8, r9 and the modulus
; limbs into r14 / r15; rbx keeps the input pointer.
1037	movdqu	xmm0,XMMWORD[rsi]
1038	mov	rbx,rsi
1039	movdqu	xmm1,XMMWORD[16+rsi]
1040	mov	r12,QWORD[((32+0))+rsi]
1041	mov	r13,QWORD[((32+8))+rsi]
1042	mov	r8,QWORD[((32+16))+rsi]
1043	mov	r9,QWORD[((32+24))+rsi]
1044	mov	r14,QWORD[(($L$poly+8))]
1045	mov	r15,QWORD[(($L$poly+24))]
1046	movdqa	XMMWORD[96+rsp],xmm0
1047	movdqa	XMMWORD[(96+16)+rsp],xmm1
1048	lea	r10,[32+rdi]
1049	lea	r11,[64+rdi]
; Hand-encoded: movq xmm0,rdi / movq xmm1,r10 / movq xmm2,r11
; (stash result+0, result+32, result+64).
1050DB	102,72,15,110,199
1051DB	102,73,15,110,202
1052DB	102,73,15,110,211
1053
; slot0 = 2*in.y
1054	lea	rdi,[rsp]
1055	call	__ecp_nistz256_mul_by_2q
1056
; slot64 = in.z^2
1057	mov	rax,QWORD[((64+0))+rsi]
1058	mov	r14,QWORD[((64+8))+rsi]
1059	mov	r15,QWORD[((64+16))+rsi]
1060	mov	r8,QWORD[((64+24))+rsi]
1061	lea	rsi,[((64-0))+rsi]
1062	lea	rdi,[64+rsp]
1063	call	__ecp_nistz256_sqr_montq
1064
; slot0 = slot0^2
1065	mov	rax,QWORD[((0+0))+rsp]
1066	mov	r14,QWORD[((8+0))+rsp]
1067	lea	rsi,[((0+0))+rsp]
1068	mov	r15,QWORD[((16+0))+rsp]
1069	mov	r8,QWORD[((24+0))+rsp]
1070	lea	rdi,[rsp]
1071	call	__ecp_nistz256_sqr_montq
1072
; result.z = 2 * (in.z * in.y); both helpers store through rdi = result+64.
1073	mov	rax,QWORD[32+rbx]
1074	mov	r9,QWORD[((64+0))+rbx]
1075	mov	r10,QWORD[((64+8))+rbx]
1076	mov	r11,QWORD[((64+16))+rbx]
1077	mov	r12,QWORD[((64+24))+rbx]
1078	lea	rsi,[((64-0))+rbx]
1079	lea	rbx,[32+rbx]
; Hand-encoded: movq rdi,xmm2
1080DB	102,72,15,126,215
1081	call	__ecp_nistz256_mul_montq
1082	call	__ecp_nistz256_mul_by_2q
1083
; slot32 = in.x + slot64
1084	mov	r12,QWORD[((96+0))+rsp]
1085	mov	r13,QWORD[((96+8))+rsp]
1086	lea	rbx,[64+rsp]
1087	mov	r8,QWORD[((96+16))+rsp]
1088	mov	r9,QWORD[((96+24))+rsp]
1089	lea	rdi,[32+rsp]
1090	call	__ecp_nistz256_add_toq
1091
; slot64 = in.x - slot64
1092	mov	r12,QWORD[((96+0))+rsp]
1093	mov	r13,QWORD[((96+8))+rsp]
1094	lea	rbx,[64+rsp]
1095	mov	r8,QWORD[((96+16))+rsp]
1096	mov	r9,QWORD[((96+24))+rsp]
1097	lea	rdi,[64+rsp]
1098	call	__ecp_nistz256_sub_fromq
1099
; result.y = slot0^2 (written through rdi = result+32) ...
1100	mov	rax,QWORD[((0+0))+rsp]
1101	mov	r14,QWORD[((8+0))+rsp]
1102	lea	rsi,[((0+0))+rsp]
1103	mov	r15,QWORD[((16+0))+rsp]
1104	mov	r8,QWORD[((24+0))+rsp]
; Hand-encoded: movq rdi,xmm1
1105DB	102,72,15,126,207
1106	call	__ecp_nistz256_sqr_montq
; ... then halved in place. The square is still in r12..r15, and rsi / rbp
; hold modulus limbs 1 / 3 as left by the square helper. If the value is
; odd the modulus is added first (carry in r9) so the right shift is exact.
1107	xor	r9,r9
1108	mov	rax,r12
1109	add	r12,-1
1110	mov	r10,r13
1111	adc	r13,rsi
1112	mov	rcx,r14
1113	adc	r14,0
1114	mov	r8,r15
1115	adc	r15,rbp
1116	adc	r9,0
1117	xor	rsi,rsi
1118	test	rax,1
1119
; Even value: discard the speculative addition (and its carry).
1120	cmovz	r12,rax
1121	cmovz	r13,r10
1122	cmovz	r14,rcx
1123	cmovz	r15,r8
1124	cmovz	r9,rsi
1125
; 257-bit right shift by one across r12, r13, r14, r15, r9; store to result.y.
1126	mov	rax,r13
1127	shr	r12,1
1128	shl	rax,63
1129	mov	r10,r14
1130	shr	r13,1
1131	or	r12,rax
1132	shl	r10,63
1133	mov	rcx,r15
1134	shr	r14,1
1135	or	r13,r10
1136	shl	rcx,63
1137	mov	QWORD[rdi],r12
1138	shr	r15,1
1139	mov	QWORD[8+rdi],r13
1140	shl	r9,63
1141	or	r14,rcx
1142	or	r15,r9
1143	mov	QWORD[16+rdi],r14
1144	mov	QWORD[24+rdi],r15
; slot32 = slot32 * slot64
1145	mov	rax,QWORD[64+rsp]
1146	lea	rbx,[64+rsp]
1147	mov	r9,QWORD[((0+32))+rsp]
1148	mov	r10,QWORD[((8+32))+rsp]
1149	lea	rsi,[((0+32))+rsp]
1150	mov	r11,QWORD[((16+32))+rsp]
1151	mov	r12,QWORD[((24+32))+rsp]
1152	lea	rdi,[32+rsp]
1153	call	__ecp_nistz256_mul_montq
1154
; slot128 = 2*slot32, then slot32 = slot128 + slot32 (i.e. 3 * product).
1155	lea	rdi,[128+rsp]
1156	call	__ecp_nistz256_mul_by_2q
1157
1158	lea	rbx,[32+rsp]
1159	lea	rdi,[32+rsp]
1160	call	__ecp_nistz256_add_toq
1161
; slot0 = slot0 * in.x
1162	mov	rax,QWORD[96+rsp]
1163	lea	rbx,[96+rsp]
1164	mov	r9,QWORD[((0+0))+rsp]
1165	mov	r10,QWORD[((8+0))+rsp]
1166	lea	rsi,[((0+0))+rsp]
1167	mov	r11,QWORD[((16+0))+rsp]
1168	mov	r12,QWORD[((24+0))+rsp]
1169	lea	rdi,[rsp]
1170	call	__ecp_nistz256_mul_montq
1171
; slot128 = 2*slot0
1172	lea	rdi,[128+rsp]
1173	call	__ecp_nistz256_mul_by_2q
1174
; result.x = slot32^2 ...
1175	mov	rax,QWORD[((0+32))+rsp]
1176	mov	r14,QWORD[((8+32))+rsp]
1177	lea	rsi,[((0+32))+rsp]
1178	mov	r15,QWORD[((16+32))+rsp]
1179	mov	r8,QWORD[((24+32))+rsp]
; Hand-encoded: movq rdi,xmm0
1180DB	102,72,15,126,199
1181	call	__ecp_nistz256_sqr_montq
1182
; ... minus slot128. The square helper returns its result in r12..r15 and
; the modulus limbs in rsi / rbp, so both are moved into the registers the
; add/sub helpers expect (value in r12, r13, r8, r9; modulus in r14 / r15).
1183	lea	rbx,[128+rsp]
1184	mov	r8,r14
1185	mov	r9,r15
1186	mov	r14,rsi
1187	mov	r15,rbp
1188	call	__ecp_nistz256_sub_fromq
1189
; r12, r13, r8, r9 = slot0 - result.x (register-only helper).
1190	mov	rax,QWORD[((0+0))+rsp]
1191	mov	rbp,QWORD[((0+8))+rsp]
1192	mov	rcx,QWORD[((0+16))+rsp]
1193	mov	r10,QWORD[((0+24))+rsp]
1194	lea	rdi,[rsp]
1195	call	__ecp_nistz256_subq
1196
; Store that difference to slot0 and, interleaved, load it into r9..r12 as
; the multiplicand for slot0 = slot0 * slot32. `xor ecx,ecx` sets ZF and
; nothing before the two cmovz changes flags, so they act as plain moves.
1197	mov	rax,QWORD[32+rsp]
1198	lea	rbx,[32+rsp]
1199	mov	r14,r12
1200	xor	ecx,ecx
1201	mov	QWORD[((0+0))+rsp],r12
1202	mov	r10,r13
1203	mov	QWORD[((0+8))+rsp],r13
1204	cmovz	r11,r8
1205	mov	QWORD[((0+16))+rsp],r8
1206	lea	rsi,[((0-0))+rsp]
1207	cmovz	r12,r9
1208	mov	QWORD[((0+24))+rsp],r9
1209	mov	r9,r14
1210	lea	rdi,[rsp]
1211	call	__ecp_nistz256_mul_montq
1212
; result.y = slot0 - result.y. The product is still in r12, r13, r8, r9 and
; r14 / r15 hold the modulus limbs as left by the multiply helper.
; Hand-encoded: movq rbx,xmm1 / movq rdi,xmm1
1213DB	102,72,15,126,203
1214DB	102,72,15,126,207
1215	call	__ecp_nistz256_sub_fromq
1216
1217	add	rsp,32*5+8
1218	pop	r15
1219	pop	r14
1220	pop	r13
1221	pop	r12
1222	pop	rbx
1223	pop	rbp
1224	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1225	mov	rsi,QWORD[16+rsp]
1226	DB	0F3h,0C3h		;repret
1227$L$SEH_end_ecp_nistz256_point_double:
;-----------------------------------------------------------------------
; ecp_nistz256_point_add: addition of two points, each given as three
; 32-byte field elements at offsets 0, 32 and 64 (called x, y, z below;
; naming assumed).
; ABI:   Win64; rcx = result, rdx = first input (in1), r8 = second
;        input (in2), remapped to rdi / rsi / rdx.
; Frame: 32*18+8 bytes, eighteen 32-byte slots. Slot usage in this body:
;          0   H  = U2 - U1            32  in1.z^2, later H^2
;         64   R  = S2 - S1            96  in2.z^2, later R^2
;        128   H^3                    160  U1 = in1.x * in2.z^2
;        192   U2 = in2.x * in1.z^2, later U1 * H^2
;        224   S1 = in1.y * in2.z^3   256  S2 = in2.y * in1.z^3, later S1*H^3
;        288 / 320 / 352   computed x / y / z
;        384 / 416 / 448   copy of in1 x / y / z
;        480 / 512 / 544   copy of in2 x / y / z
; Masks: xmm5 = all-ones iff in1.z is all zero, xmm4 = likewise for in2.z.
;        The final stores pick in2 when xmm5 is set, in1 when xmm4 is set,
;        and the computed value otherwise.
; xmm0 holds the result pointer and xmm1 the in1 pointer across calls.
; Special cases when H == 0 and neither z is zero: R == 0 jumps into the
; point-doubling body; R != 0 writes an all-zero result.
; All "products" are outputs of __ecp_nistz256_mul_montq / _sqr_montq.
;-----------------------------------------------------------------------
1228global	ecp_nistz256_point_add
1229
1230ALIGN	32
1231ecp_nistz256_point_add:
1232	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1233	mov	QWORD[16+rsp],rsi
1234	mov	rax,rsp
1235$L$SEH_begin_ecp_nistz256_point_add:
1236	mov	rdi,rcx
1237	mov	rsi,rdx
1238	mov	rdx,r8
1239
1240
1241	push	rbp
1242	push	rbx
1243	push	r12
1244	push	r13
1245	push	r14
1246	push	r15
1247	sub	rsp,32*18+8
1248
; Copy in1 to slots 384..479; rbx keeps the in1 pointer, rsi becomes in2.
1249	movdqu	xmm0,XMMWORD[rsi]
1250	movdqu	xmm1,XMMWORD[16+rsi]
1251	movdqu	xmm2,XMMWORD[32+rsi]
1252	movdqu	xmm3,XMMWORD[48+rsi]
1253	movdqu	xmm4,XMMWORD[64+rsi]
1254	movdqu	xmm5,XMMWORD[80+rsi]
1255	mov	rbx,rsi
1256	mov	rsi,rdx
1257	movdqa	XMMWORD[384+rsp],xmm0
1258	movdqa	XMMWORD[(384+16)+rsp],xmm1
1259	movdqa	XMMWORD[416+rsp],xmm2
1260	movdqa	XMMWORD[(416+16)+rsp],xmm3
1261	movdqa	XMMWORD[448+rsp],xmm4
1262	movdqa	XMMWORD[(448+16)+rsp],xmm5
1263	por	xmm5,xmm4	; start OR-folding in1.z (for the is-zero mask)
1264
; Copy in2 to slots 480..575, interleaved with the in1.z fold; in2.z is
; also loaded into rax, r14, r15, r8 for the first squaring.
1265	movdqu	xmm0,XMMWORD[rsi]
1266	pshufd	xmm3,xmm5,0xb1
1267	movdqu	xmm1,XMMWORD[16+rsi]
1268	movdqu	xmm2,XMMWORD[32+rsi]
1269	por	xmm5,xmm3
1270	movdqu	xmm3,XMMWORD[48+rsi]
1271	mov	rax,QWORD[((64+0))+rsi]
1272	mov	r14,QWORD[((64+8))+rsi]
1273	mov	r15,QWORD[((64+16))+rsi]
1274	mov	r8,QWORD[((64+24))+rsi]
1275	movdqa	XMMWORD[480+rsp],xmm0
1276	pshufd	xmm4,xmm5,0x1e
1277	movdqa	XMMWORD[(480+16)+rsp],xmm1
1278	movdqu	xmm0,XMMWORD[64+rsi]
1279	movdqu	xmm1,XMMWORD[80+rsi]
1280	movdqa	XMMWORD[512+rsp],xmm2
1281	movdqa	XMMWORD[(512+16)+rsp],xmm3
1282	por	xmm5,xmm4	; dword 0 of xmm5 = OR of all eight dwords of in1.z
1283	pxor	xmm4,xmm4
1284	por	xmm1,xmm0	; start OR-folding in2.z
; Hand-encoded: movq xmm0,rdi (stash the result pointer)
1285DB	102,72,15,110,199
1286
; slot96 = in2.z^2; in2.z is also copied to slot 544.
1287	lea	rsi,[((64-0))+rsi]
1288	mov	QWORD[((544+0))+rsp],rax
1289	mov	QWORD[((544+8))+rsp],r14
1290	mov	QWORD[((544+16))+rsp],r15
1291	mov	QWORD[((544+24))+rsp],r8
1292	lea	rdi,[96+rsp]
1293	call	__ecp_nistz256_sqr_montq
1294
; Finish the masks: xmm5 = broadcast(in1.z == 0), xmm4 = broadcast(in2.z == 0).
1295	pcmpeqd	xmm5,xmm4
1296	pshufd	xmm4,xmm1,0xb1
1297	por	xmm4,xmm1
1298	pshufd	xmm5,xmm5,0
1299	pshufd	xmm3,xmm4,0x1e
1300	por	xmm4,xmm3
1301	pxor	xmm3,xmm3
1302	pcmpeqd	xmm4,xmm3
1303	pshufd	xmm4,xmm4,0
1304	mov	rax,QWORD[((64+0))+rbx]
1305	mov	r14,QWORD[((64+8))+rbx]
1306	mov	r15,QWORD[((64+16))+rbx]
1307	mov	r8,QWORD[((64+24))+rbx]
; Hand-encoded: movq xmm1,rbx (stash the in1 pointer)
1308DB	102,72,15,110,203
1309
; slot32 = in1.z^2
1310	lea	rsi,[((64-0))+rbx]
1311	lea	rdi,[32+rsp]
1312	call	__ecp_nistz256_sqr_montq
1313
; slot224 = slot96 * in2.z   (in2.z^3)
1314	mov	rax,QWORD[544+rsp]
1315	lea	rbx,[544+rsp]
1316	mov	r9,QWORD[((0+96))+rsp]
1317	mov	r10,QWORD[((8+96))+rsp]
1318	lea	rsi,[((0+96))+rsp]
1319	mov	r11,QWORD[((16+96))+rsp]
1320	mov	r12,QWORD[((24+96))+rsp]
1321	lea	rdi,[224+rsp]
1322	call	__ecp_nistz256_mul_montq
1323
; slot256 = slot32 * in1.z   (in1.z^3)
1324	mov	rax,QWORD[448+rsp]
1325	lea	rbx,[448+rsp]
1326	mov	r9,QWORD[((0+32))+rsp]
1327	mov	r10,QWORD[((8+32))+rsp]
1328	lea	rsi,[((0+32))+rsp]
1329	mov	r11,QWORD[((16+32))+rsp]
1330	mov	r12,QWORD[((24+32))+rsp]
1331	lea	rdi,[256+rsp]
1332	call	__ecp_nistz256_mul_montq
1333
; slot224 = S1 = slot224 * in1.y
1334	mov	rax,QWORD[416+rsp]
1335	lea	rbx,[416+rsp]
1336	mov	r9,QWORD[((0+224))+rsp]
1337	mov	r10,QWORD[((8+224))+rsp]
1338	lea	rsi,[((0+224))+rsp]
1339	mov	r11,QWORD[((16+224))+rsp]
1340	mov	r12,QWORD[((24+224))+rsp]
1341	lea	rdi,[224+rsp]
1342	call	__ecp_nistz256_mul_montq
1343
; slot256 = S2 = slot256 * in2.y
1344	mov	rax,QWORD[512+rsp]
1345	lea	rbx,[512+rsp]
1346	mov	r9,QWORD[((0+256))+rsp]
1347	mov	r10,QWORD[((8+256))+rsp]
1348	lea	rsi,[((0+256))+rsp]
1349	mov	r11,QWORD[((16+256))+rsp]
1350	mov	r12,QWORD[((24+256))+rsp]
1351	lea	rdi,[256+rsp]
1352	call	__ecp_nistz256_mul_montq
1353
; slot64 = R = S2 - S1 (S2 is still in r12, r13, r8, r9 from the multiply).
1354	lea	rbx,[224+rsp]
1355	lea	rdi,[64+rsp]
1356	call	__ecp_nistz256_sub_fromq
1357
; r12 = OR of R's limbs (non-zero iff R != 0), saved in xmm3;
; xmm2 = xmm4 | xmm5 (either input has zero z).
1358	or	r12,r13
1359	movdqa	xmm2,xmm4
1360	or	r12,r8
1361	or	r12,r9
1362	por	xmm2,xmm5
; Hand-encoded: movq xmm3,r12
1363DB	102,73,15,110,220
1364
; slot160 = U1 = slot96 * in1.x
1365	mov	rax,QWORD[384+rsp]
1366	lea	rbx,[384+rsp]
1367	mov	r9,QWORD[((0+96))+rsp]
1368	mov	r10,QWORD[((8+96))+rsp]
1369	lea	rsi,[((0+96))+rsp]
1370	mov	r11,QWORD[((16+96))+rsp]
1371	mov	r12,QWORD[((24+96))+rsp]
1372	lea	rdi,[160+rsp]
1373	call	__ecp_nistz256_mul_montq
1374
; slot192 = U2 = slot32 * in2.x
1375	mov	rax,QWORD[480+rsp]
1376	lea	rbx,[480+rsp]
1377	mov	r9,QWORD[((0+32))+rsp]
1378	mov	r10,QWORD[((8+32))+rsp]
1379	lea	rsi,[((0+32))+rsp]
1380	mov	r11,QWORD[((16+32))+rsp]
1381	mov	r12,QWORD[((24+32))+rsp]
1382	lea	rdi,[192+rsp]
1383	call	__ecp_nistz256_mul_montq
1384
; slot0 = H = U2 - U1
1385	lea	rbx,[160+rsp]
1386	lea	rdi,[rsp]
1387	call	__ecp_nistz256_sub_fromq
1388
; ZF = (H == 0)
1389	or	r12,r13
1390	or	r12,r8
1391	or	r12,r9
1392
; 0x3e is a prefix byte applied to the following jnz (branch hint).
1393DB	0x3e
1394	jnz	NEAR $L$add_proceedq
; H == 0. Hand-encoded: movq r8,xmm2 (either-z-zero mask) and
; movq r9,xmm3 (R != 0 indicator).
1395DB	102,73,15,126,208
1396DB	102,73,15,126,217
1397	test	r8,r8
1398	jnz	NEAR $L$add_proceedq
1399	test	r9,r9
1400	jz	NEAR $L$add_doubleq
1401
; H == 0, R != 0, both z non-zero: write an all-zero result.
; Hand-encoded: movq rdi,xmm0
1402DB	102,72,15,126,199
1403	pxor	xmm0,xmm0
1404	movdqu	XMMWORD[rdi],xmm0
1405	movdqu	XMMWORD[16+rdi],xmm0
1406	movdqu	XMMWORD[32+rdi],xmm0
1407	movdqu	XMMWORD[48+rdi],xmm0
1408	movdqu	XMMWORD[64+rdi],xmm0
1409	movdqu	XMMWORD[80+rdi],xmm0
1410	jmp	NEAR $L$add_doneq
1411
1412ALIGN	32
1413$L$add_doubleq:
; H == 0 and R == 0: restore the in1 and result pointers, shrink the frame
; by 416 = 32*(18-5) bytes to match point_double's, and continue there.
; Hand-encoded: movq rsi,xmm1 / movq rdi,xmm0
1414DB	102,72,15,126,206
1415DB	102,72,15,126,199
1416	add	rsp,416
1417	jmp	NEAR $L$point_double_shortcutq
1418
1419ALIGN	32
1420$L$add_proceedq:
; slot96 = R^2
1421	mov	rax,QWORD[((0+64))+rsp]
1422	mov	r14,QWORD[((8+64))+rsp]
1423	lea	rsi,[((0+64))+rsp]
1424	mov	r15,QWORD[((16+64))+rsp]
1425	mov	r8,QWORD[((24+64))+rsp]
1426	lea	rdi,[96+rsp]
1427	call	__ecp_nistz256_sqr_montq
1428
; slot352 = H * in1.z
1429	mov	rax,QWORD[448+rsp]
1430	lea	rbx,[448+rsp]
1431	mov	r9,QWORD[((0+0))+rsp]
1432	mov	r10,QWORD[((8+0))+rsp]
1433	lea	rsi,[((0+0))+rsp]
1434	mov	r11,QWORD[((16+0))+rsp]
1435	mov	r12,QWORD[((24+0))+rsp]
1436	lea	rdi,[352+rsp]
1437	call	__ecp_nistz256_mul_montq
1438
; slot32 = H^2
1439	mov	rax,QWORD[((0+0))+rsp]
1440	mov	r14,QWORD[((8+0))+rsp]
1441	lea	rsi,[((0+0))+rsp]
1442	mov	r15,QWORD[((16+0))+rsp]
1443	mov	r8,QWORD[((24+0))+rsp]
1444	lea	rdi,[32+rsp]
1445	call	__ecp_nistz256_sqr_montq
1446
; slot352 = computed z = slot352 * in2.z
1447	mov	rax,QWORD[544+rsp]
1448	lea	rbx,[544+rsp]
1449	mov	r9,QWORD[((0+352))+rsp]
1450	mov	r10,QWORD[((8+352))+rsp]
1451	lea	rsi,[((0+352))+rsp]
1452	mov	r11,QWORD[((16+352))+rsp]
1453	mov	r12,QWORD[((24+352))+rsp]
1454	lea	rdi,[352+rsp]
1455	call	__ecp_nistz256_mul_montq
1456
; slot128 = H^3 = H^2 * H
1457	mov	rax,QWORD[rsp]
1458	lea	rbx,[rsp]
1459	mov	r9,QWORD[((0+32))+rsp]
1460	mov	r10,QWORD[((8+32))+rsp]
1461	lea	rsi,[((0+32))+rsp]
1462	mov	r11,QWORD[((16+32))+rsp]
1463	mov	r12,QWORD[((24+32))+rsp]
1464	lea	rdi,[128+rsp]
1465	call	__ecp_nistz256_mul_montq
1466
; slot192 = U1 * H^2 (also left in r12, r13, r8, r9)
1467	mov	rax,QWORD[160+rsp]
1468	lea	rbx,[160+rsp]
1469	mov	r9,QWORD[((0+32))+rsp]
1470	mov	r10,QWORD[((8+32))+rsp]
1471	lea	rsi,[((0+32))+rsp]
1472	mov	r11,QWORD[((16+32))+rsp]
1473	mov	r12,QWORD[((24+32))+rsp]
1474	lea	rdi,[192+rsp]
1475	call	__ecp_nistz256_mul_montq
1476
1477
1478
1479
; Inline modular doubling of that product (same sequence as
; __ecp_nistz256_mul_by_2q but without the store); r14 / r15 still hold the
; modulus limbs from the multiply helper.
1480	xor	r11,r11
1481	add	r12,r12
1482	lea	rsi,[96+rsp]
1483	adc	r13,r13
1484	mov	rax,r12
1485	adc	r8,r8
1486	adc	r9,r9
1487	mov	rbp,r13
1488	adc	r11,0
1489
1490	sub	r12,-1
1491	mov	rcx,r8
1492	sbb	r13,r14
1493	sbb	r8,0
1494	mov	r10,r9
1495	sbb	r9,r15
1496	sbb	r11,0
1497
; Select the reduced double while loading R^2 (slot 96) into the minuend
; registers of __ecp_nistz256_subq.
1498	cmovc	r12,rax
1499	mov	rax,QWORD[rsi]
1500	cmovc	r13,rbp
1501	mov	rbp,QWORD[8+rsi]
1502	cmovc	r8,rcx
1503	mov	rcx,QWORD[16+rsi]
1504	cmovc	r9,r10
1505	mov	r10,QWORD[24+rsi]
1506
; r12.. = R^2 - 2*U1*H^2
1507	call	__ecp_nistz256_subq
1508
; slot288 = computed x = (R^2 - 2*U1*H^2) - H^3
1509	lea	rbx,[128+rsp]
1510	lea	rdi,[288+rsp]
1511	call	__ecp_nistz256_sub_fromq
1512
; r12.. = U1*H^2 - x; the register-only helper does not store, so the
; result is written to slot 320 explicitly.
1513	mov	rax,QWORD[((192+0))+rsp]
1514	mov	rbp,QWORD[((192+8))+rsp]
1515	mov	rcx,QWORD[((192+16))+rsp]
1516	mov	r10,QWORD[((192+24))+rsp]
1517	lea	rdi,[320+rsp]
1518
1519	call	__ecp_nistz256_subq
1520
1521	mov	QWORD[rdi],r12
1522	mov	QWORD[8+rdi],r13
1523	mov	QWORD[16+rdi],r8
1524	mov	QWORD[24+rdi],r9
; slot256 = S1 * H^3
1525	mov	rax,QWORD[128+rsp]
1526	lea	rbx,[128+rsp]
1527	mov	r9,QWORD[((0+224))+rsp]
1528	mov	r10,QWORD[((8+224))+rsp]
1529	lea	rsi,[((0+224))+rsp]
1530	mov	r11,QWORD[((16+224))+rsp]
1531	mov	r12,QWORD[((24+224))+rsp]
1532	lea	rdi,[256+rsp]
1533	call	__ecp_nistz256_mul_montq
1534
; slot320 = R * (U1*H^2 - x)
1535	mov	rax,QWORD[320+rsp]
1536	lea	rbx,[320+rsp]
1537	mov	r9,QWORD[((0+64))+rsp]
1538	mov	r10,QWORD[((8+64))+rsp]
1539	lea	rsi,[((0+64))+rsp]
1540	mov	r11,QWORD[((16+64))+rsp]
1541	mov	r12,QWORD[((24+64))+rsp]
1542	lea	rdi,[320+rsp]
1543	call	__ecp_nistz256_mul_montq
1544
; slot320 = computed y = slot320 - S1*H^3
1545	lea	rbx,[256+rsp]
1546	lea	rdi,[320+rsp]
1547	call	__ecp_nistz256_sub_fromq
1548
; Hand-encoded: movq rdi,xmm0 (restore the result pointer)
1549DB	102,72,15,126,199
1550
; result.z: computed z, replaced by in2.z where xmm5 is set ...
1551	movdqa	xmm0,xmm5
1552	movdqa	xmm1,xmm5
1553	pandn	xmm0,XMMWORD[352+rsp]
1554	movdqa	xmm2,xmm5
1555	pandn	xmm1,XMMWORD[((352+16))+rsp]
1556	movdqa	xmm3,xmm5
1557	pand	xmm2,XMMWORD[544+rsp]
1558	pand	xmm3,XMMWORD[((544+16))+rsp]
1559	por	xmm2,xmm0
1560	por	xmm3,xmm1
1561
; ... and then by in1.z where xmm4 is set.
1562	movdqa	xmm0,xmm4
1563	movdqa	xmm1,xmm4
1564	pandn	xmm0,xmm2
1565	movdqa	xmm2,xmm4
1566	pandn	xmm1,xmm3
1567	movdqa	xmm3,xmm4
1568	pand	xmm2,XMMWORD[448+rsp]
1569	pand	xmm3,XMMWORD[((448+16))+rsp]
1570	por	xmm2,xmm0
1571	por	xmm3,xmm1
1572	movdqu	XMMWORD[64+rdi],xmm2
1573	movdqu	XMMWORD[80+rdi],xmm3
1574
; result.x: computed x / in2.x / in1.x, same selection.
1575	movdqa	xmm0,xmm5
1576	movdqa	xmm1,xmm5
1577	pandn	xmm0,XMMWORD[288+rsp]
1578	movdqa	xmm2,xmm5
1579	pandn	xmm1,XMMWORD[((288+16))+rsp]
1580	movdqa	xmm3,xmm5
1581	pand	xmm2,XMMWORD[480+rsp]
1582	pand	xmm3,XMMWORD[((480+16))+rsp]
1583	por	xmm2,xmm0
1584	por	xmm3,xmm1
1585
1586	movdqa	xmm0,xmm4
1587	movdqa	xmm1,xmm4
1588	pandn	xmm0,xmm2
1589	movdqa	xmm2,xmm4
1590	pandn	xmm1,xmm3
1591	movdqa	xmm3,xmm4
1592	pand	xmm2,XMMWORD[384+rsp]
1593	pand	xmm3,XMMWORD[((384+16))+rsp]
1594	por	xmm2,xmm0
1595	por	xmm3,xmm1
1596	movdqu	XMMWORD[rdi],xmm2
1597	movdqu	XMMWORD[16+rdi],xmm3
1598
; result.y: computed y / in2.y / in1.y, same selection.
1599	movdqa	xmm0,xmm5
1600	movdqa	xmm1,xmm5
1601	pandn	xmm0,XMMWORD[320+rsp]
1602	movdqa	xmm2,xmm5
1603	pandn	xmm1,XMMWORD[((320+16))+rsp]
1604	movdqa	xmm3,xmm5
1605	pand	xmm2,XMMWORD[512+rsp]
1606	pand	xmm3,XMMWORD[((512+16))+rsp]
1607	por	xmm2,xmm0
1608	por	xmm3,xmm1
1609
1610	movdqa	xmm0,xmm4
1611	movdqa	xmm1,xmm4
1612	pandn	xmm0,xmm2
1613	movdqa	xmm2,xmm4
1614	pandn	xmm1,xmm3
1615	movdqa	xmm3,xmm4
1616	pand	xmm2,XMMWORD[416+rsp]
1617	pand	xmm3,XMMWORD[((416+16))+rsp]
1618	por	xmm2,xmm0
1619	por	xmm3,xmm1
1620	movdqu	XMMWORD[32+rdi],xmm2
1621	movdqu	XMMWORD[48+rdi],xmm3
1622
1623$L$add_doneq:
1624	add	rsp,32*18+8
1625	pop	r15
1626	pop	r14
1627	pop	r13
1628	pop	r12
1629	pop	rbx
1630	pop	rbp
1631	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1632	mov	rsi,QWORD[16+rsp]
1633	DB	0F3h,0C3h		;repret
1634$L$SEH_end_ecp_nistz256_point_add:
;-----------------------------------------------------------------------
; ecp_nistz256_point_add_affine
;
; ABI:   Microsoft x64 entry point. The three integer arguments arrive in
;        rcx, rdx, r8 and are copied to rdi, rsi, rdx; the caller's rdi/rsi
;        (callee-saved on Win64) are parked in the home slots [8+rsp] and
;        [16+rsp] and reloaded in the epilogue.
; In:    rcx = destination; 96 bytes are stored at [dst+0 .. dst+95]
;        rdx = operand 1;   96 bytes are loaded (three 32-byte fields)
;        r8  = operand 2;   64 bytes are loaded (two 32-byte fields)
;        Operands are read with movdqu, the destination is written with
;        movdqu, so no alignment is required of the three pointers.
; Saves: rbp, rbx, r12-r15 (push/pop). Only xmm0-xmm5 are used, all of which
;        are volatile in the Win64 ABI.
; Frame: 32*15+8 bytes. With rsp = 8 (mod 16) at entry (ABI), six pushes and
;        this adjustment leave rsp 16-byte aligned, which the movdqa/pand/
;        pandn accesses to the frame rely on. Slots, by offset from rsp:
;          0,32,64,96,128,160,192  temporaries passed to/from the helpers
;          224,256,288             computed result fields (selected at end)
;          320,352,384             copy of operand 1, fields 1..3
;          416,448                 copy of operand 2, fields 1..2
; NOTE(review): the helpers (__ecp_nistz256_sqr_montq / _mul_montq / _subq /
;        _sub_fromq) are not visible in this chunk. Going by their names
;        (Montgomery square / multiply, modular subtract) and by which slots
;        are passed in, the sequence looks like the usual mixed
;        Jacobian + affine addition:
;          slot32 = Z1^2          slot0  = X2*Z1^2 (U2)   slot64 = U2 - X1 (H)
;          slot32 = Z1^3          slot288= H*Z1           slot32 = Y2*Z1^3 (S2)
;          slot96 = S2 - Y1 (R)   slot128= H^2            slot192= R^2
;          slot160= H^3           slot0  = X1*H^2
;          slot224= R^2 - 2*X1*H^2 - H^3
;          slot256= R*(X1*H^2 - slot224) - Y1*H^3
;        Treat these formulas as an assumption to confirm against the helpers.
; NOTE(review): the code also relies on the helpers leaving xmm0, xmm4 and
;        xmm5 untouched across every call — confirm.
;-----------------------------------------------------------------------
1635global	ecp_nistz256_point_add_affine
1636
1637ALIGN	32
1638ecp_nistz256_point_add_affine:
1639	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1640	mov	QWORD[16+rsp],rsi
; rax = entry rsp; it is not read in this function before rax is reloaded
; with the first limb of operand 1's third field below.
1641	mov	rax,rsp
1642$L$SEH_begin_ecp_nistz256_point_add_affine:
; Win64 argument registers -> the register assignment used by the body.
1643	mov	rdi,rcx
1644	mov	rsi,rdx
1645	mov	rdx,r8
1646
1647
1648	push	rbp
1649	push	rbx
1650	push	r12
1651	push	r13
1652	push	r14
1653	push	r15
1654	sub	rsp,32*15+8
1655
; Load all 96 bytes of operand 1 into xmm0..xmm5 and keep the operand 2
; pointer in rbx. The four limbs of operand 1's third field (offset 64) are
; also loaded into rax, r14, r15, r8 ahead of the first helper call.
1656	movdqu	xmm0,XMMWORD[rsi]
1657	mov	rbx,rdx
1658	movdqu	xmm1,XMMWORD[16+rsi]
1659	movdqu	xmm2,XMMWORD[32+rsi]
1660	movdqu	xmm3,XMMWORD[48+rsi]
1661	movdqu	xmm4,XMMWORD[64+rsi]
1662	movdqu	xmm5,XMMWORD[80+rsi]
1663	mov	rax,QWORD[((64+0))+rsi]
1664	mov	r14,QWORD[((64+8))+rsi]
1665	mov	r15,QWORD[((64+16))+rsi]
1666	mov	r8,QWORD[((64+24))+rsi]
; Private copy of operand 1 at frame offsets 320 / 352 / 384.
1667	movdqa	XMMWORD[320+rsp],xmm0
1668	movdqa	XMMWORD[(320+16)+rsp],xmm1
1669	movdqa	XMMWORD[352+rsp],xmm2
1670	movdqa	XMMWORD[(352+16)+rsp],xmm3
1671	movdqa	XMMWORD[384+rsp],xmm4
1672	movdqa	XMMWORD[(384+16)+rsp],xmm5
; Start OR-reducing operand 1's third field: xmm5 = low half | high half.
1673	por	xmm5,xmm4
1674
; Load the 64 bytes of operand 2, copy them to frame offsets 416 / 448, and
; OR all of them together into xmm3. Interleaved with that, the two
; pshufd/por pairs (0xb1 swaps adjacent dwords, 0x1e brings the other qword
; down) leave every dword of xmm5 equal to the OR of all eight dwords of
; operand 1's third field.
1675	movdqu	xmm0,XMMWORD[rbx]
1676	pshufd	xmm3,xmm5,0xb1
1677	movdqu	xmm1,XMMWORD[16+rbx]
1678	movdqu	xmm2,XMMWORD[32+rbx]
1679	por	xmm5,xmm3
1680	movdqu	xmm3,XMMWORD[48+rbx]
1681	movdqa	XMMWORD[416+rsp],xmm0
1682	pshufd	xmm4,xmm5,0x1e
1683	movdqa	XMMWORD[(416+16)+rsp],xmm1
1684	por	xmm1,xmm0
; Bytes 66 48 0F 6E C7 = movq xmm0,rdi: stash the destination pointer in
; xmm0, because rdi is reused as the helpers' output pointer from here on.
1685DB	102,72,15,110,199
1686	movdqa	XMMWORD[448+rsp],xmm2
1687	movdqa	XMMWORD[(448+16)+rsp],xmm3
1688	por	xmm3,xmm2
1689	por	xmm5,xmm4
1690	pxor	xmm4,xmm4
1691	por	xmm3,xmm1
1692
; Helper call: rsi -> operand 1's third field (limbs already in rax, r14,
; r15, r8), rdi -> frame slot 32.
; NOTE(review): presumably slot32 = field3^2 — helper not visible; confirm.
1693	lea	rsi,[((64-0))+rsi]
1694	lea	rdi,[32+rsp]
1695	call	__ecp_nistz256_sqr_montq
1696
; xmm5 := all-ones if operand 1's third field is entirely zero, else zero
; (compare against the zeroed xmm4, then broadcast dword 0).
; xmm4 := all-ones if all 64 bytes of operand 2 are zero, else zero (same
; dword OR-reduction applied to xmm3, compare with zero, broadcast).
; Interleaved integer setup for the next call: rax = first limb at [rbx]
; (rbx still points at operand 2), and r12..r15 are copied to r9..r12.
; NOTE(review): r12..r15 are presumably the limbs returned by the square
; helper, re-homed into the multiply helper's input registers — confirm.
1697	pcmpeqd	xmm5,xmm4
1698	pshufd	xmm4,xmm3,0xb1
1699	mov	rax,QWORD[rbx]
1700
1701	mov	r9,r12
1702	por	xmm4,xmm3
1703	pshufd	xmm5,xmm5,0
1704	pshufd	xmm3,xmm4,0x1e
1705	mov	r10,r13
1706	por	xmm4,xmm3
1707	pxor	xmm3,xmm3
1708	mov	r11,r14
1709	pcmpeqd	xmm4,xmm3
1710	pshufd	xmm4,xmm4,0
1711
; Helper call: rsi -> slot 32, rbx -> operand 2 field 1, rdi -> slot 0.
1712	lea	rsi,[((32-0))+rsp]
1713	mov	r12,r15
1714	lea	rdi,[rsp]
1715	call	__ecp_nistz256_mul_montq
1716
; Helper call: rbx -> slot 320 (operand 1 field 1), rdi -> slot 64.
; NOTE(review): no operand registers are loaded here, so the helper
; presumably subtracts [rbx] from the value the previous call left in
; registers and stores to [rdi] — confirm.
1717	lea	rbx,[320+rsp]
1718	lea	rdi,[64+rsp]
1719	call	__ecp_nistz256_sub_fromq
1720
; Helper call: rbx -> slot 384 (operand 1 field 3, first limb in rax),
; rsi -> slot 32 (limbs in r9..r12), rdi -> slot 32 (in place).
1721	mov	rax,QWORD[384+rsp]
1722	lea	rbx,[384+rsp]
1723	mov	r9,QWORD[((0+32))+rsp]
1724	mov	r10,QWORD[((8+32))+rsp]
1725	lea	rsi,[((0+32))+rsp]
1726	mov	r11,QWORD[((16+32))+rsp]
1727	mov	r12,QWORD[((24+32))+rsp]
1728	lea	rdi,[32+rsp]
1729	call	__ecp_nistz256_mul_montq
1730
; Helper call: rbx -> slot 384, rsi -> slot 64, rdi -> slot 288. Slot 288
; is later the "computed" candidate for output bytes 64..95.
1731	mov	rax,QWORD[384+rsp]
1732	lea	rbx,[384+rsp]
1733	mov	r9,QWORD[((0+64))+rsp]
1734	mov	r10,QWORD[((8+64))+rsp]
1735	lea	rsi,[((0+64))+rsp]
1736	mov	r11,QWORD[((16+64))+rsp]
1737	mov	r12,QWORD[((24+64))+rsp]
1738	lea	rdi,[288+rsp]
1739	call	__ecp_nistz256_mul_montq
1740
; Helper call: rbx -> slot 448 (operand 2 field 2), rsi -> slot 32,
; rdi -> slot 32 (in place).
1741	mov	rax,QWORD[448+rsp]
1742	lea	rbx,[448+rsp]
1743	mov	r9,QWORD[((0+32))+rsp]
1744	mov	r10,QWORD[((8+32))+rsp]
1745	lea	rsi,[((0+32))+rsp]
1746	mov	r11,QWORD[((16+32))+rsp]
1747	mov	r12,QWORD[((24+32))+rsp]
1748	lea	rdi,[32+rsp]
1749	call	__ecp_nistz256_mul_montq
1750
; Helper call: rbx -> slot 352 (operand 1 field 2), rdi -> slot 96.
1751	lea	rbx,[352+rsp]
1752	lea	rdi,[96+rsp]
1753	call	__ecp_nistz256_sub_fromq
1754
; Helper call: square of slot 64 (limbs in rax, r14, r15, r8; rsi -> slot
; 64), rdi -> slot 128.
1755	mov	rax,QWORD[((0+64))+rsp]
1756	mov	r14,QWORD[((8+64))+rsp]
1757	lea	rsi,[((0+64))+rsp]
1758	mov	r15,QWORD[((16+64))+rsp]
1759	mov	r8,QWORD[((24+64))+rsp]
1760	lea	rdi,[128+rsp]
1761	call	__ecp_nistz256_sqr_montq
1762
; Helper call: square of slot 96, rdi -> slot 192.
1763	mov	rax,QWORD[((0+96))+rsp]
1764	mov	r14,QWORD[((8+96))+rsp]
1765	lea	rsi,[((0+96))+rsp]
1766	mov	r15,QWORD[((16+96))+rsp]
1767	mov	r8,QWORD[((24+96))+rsp]
1768	lea	rdi,[192+rsp]
1769	call	__ecp_nistz256_sqr_montq
1770
; Helper call: rbx -> slot 128, rsi -> slot 64, rdi -> slot 160.
1771	mov	rax,QWORD[128+rsp]
1772	lea	rbx,[128+rsp]
1773	mov	r9,QWORD[((0+64))+rsp]
1774	mov	r10,QWORD[((8+64))+rsp]
1775	lea	rsi,[((0+64))+rsp]
1776	mov	r11,QWORD[((16+64))+rsp]
1777	mov	r12,QWORD[((24+64))+rsp]
1778	lea	rdi,[160+rsp]
1779	call	__ecp_nistz256_mul_montq
1780
; Helper call: rbx -> slot 320 (operand 1 field 1), rsi -> slot 128,
; rdi -> slot 0 (overwrites the earlier slot-0 value).
1781	mov	rax,QWORD[320+rsp]
1782	lea	rbx,[320+rsp]
1783	mov	r9,QWORD[((0+128))+rsp]
1784	mov	r10,QWORD[((8+128))+rsp]
1785	lea	rsi,[((0+128))+rsp]
1786	mov	r11,QWORD[((16+128))+rsp]
1787	mov	r12,QWORD[((24+128))+rsp]
1788	lea	rdi,[rsp]
1789	call	__ecp_nistz256_mul_montq
1790
1791
1792
1793
; Inline modular doubling of the 4-limb value in r12, r13, r8, r9:
;   1. add/adc doubles it, with r11 catching the carry out of the top limb;
;   2. sub/sbb subtracts the 4-limb constant (-1, r14, 0, r15), borrowing
;      through r11; limbs 0 and 2 match $L$poly's 0xffffffffffffffff and 0;
;   3. if that subtraction borrowed, cmovc restores the saved doubled limbs
;      (rax, rbp, rcx, r10), so the selection is branch-free.
; The interleaved lea/mov instructions do not modify flags, which is what
; lets them sit inside the carry chains. After each cmovc the temp register
; is immediately reloaded with a limb of slot 192 (rsi -> slot 192) as the
; register operand for the following subtract helper.
; NOTE(review): r12, r13, r8, r9 are presumably the product limbs left by
; the preceding multiply, and r14 / r15 presumably hold $L$poly limbs 1 and
; 3 on return from that helper — neither is set in this function; confirm.
1794	xor	r11,r11
1795	add	r12,r12
1796	lea	rsi,[192+rsp]
1797	adc	r13,r13
1798	mov	rax,r12
1799	adc	r8,r8
1800	adc	r9,r9
1801	mov	rbp,r13
1802	adc	r11,0
1803
1804	sub	r12,-1
1805	mov	rcx,r8
1806	sbb	r13,r14
1807	sbb	r8,0
1808	mov	r10,r9
1809	sbb	r9,r15
1810	sbb	r11,0
1811
1812	cmovc	r12,rax
1813	mov	rax,QWORD[rsi]
1814	cmovc	r13,rbp
1815	mov	rbp,QWORD[8+rsi]
1816	cmovc	r8,rcx
1817	mov	rcx,QWORD[16+rsi]
1818	cmovc	r9,r10
1819	mov	r10,QWORD[24+rsi]
1820
; Helper call with both operands in registers: (rax, rbp, rcx, r10) = slot
; 192 and (r12, r13, r8, r9) = the doubled value.
; NOTE(review): presumably returns slot192 - doubled in r12, r13, r8, r9
; without storing it — inferred from the explicit stores after the second
; __ecp_nistz256_subq call below; confirm.
1821	call	__ecp_nistz256_subq
1822
; Helper call: rbx -> slot 160, rdi -> slot 224. Slot 224 is later the
; "computed" candidate for output bytes 0..31.
1823	lea	rbx,[160+rsp]
1824	lea	rdi,[224+rsp]
1825	call	__ecp_nistz256_sub_fromq
1826
; Helper call: (rax, rbp, rcx, r10) = slot 0, rdi -> slot 64; the second
; operand is whatever the previous call left in r12, r13, r8, r9.
1827	mov	rax,QWORD[((0+0))+rsp]
1828	mov	rbp,QWORD[((0+8))+rsp]
1829	mov	rcx,QWORD[((0+16))+rsp]
1830	mov	r10,QWORD[((0+24))+rsp]
1831	lea	rdi,[64+rsp]
1832
1833	call	__ecp_nistz256_subq
1834
; Store the register result (r12, r13, r8, r9) to slot 64 via rdi, then set
; up the next multiply: rbx -> slot 352 (operand 1 field 2), rsi -> slot
; 160, rdi -> slot 32.
1835	mov	QWORD[rdi],r12
1836	mov	QWORD[8+rdi],r13
1837	mov	QWORD[16+rdi],r8
1838	mov	QWORD[24+rdi],r9
1839	mov	rax,QWORD[352+rsp]
1840	lea	rbx,[352+rsp]
1841	mov	r9,QWORD[((0+160))+rsp]
1842	mov	r10,QWORD[((8+160))+rsp]
1843	lea	rsi,[((0+160))+rsp]
1844	mov	r11,QWORD[((16+160))+rsp]
1845	mov	r12,QWORD[((24+160))+rsp]
1846	lea	rdi,[32+rsp]
1847	call	__ecp_nistz256_mul_montq
1848
; Helper call: rbx -> slot 96, rsi -> slot 64, rdi -> slot 64 (in place).
1849	mov	rax,QWORD[96+rsp]
1850	lea	rbx,[96+rsp]
1851	mov	r9,QWORD[((0+64))+rsp]
1852	mov	r10,QWORD[((8+64))+rsp]
1853	lea	rsi,[((0+64))+rsp]
1854	mov	r11,QWORD[((16+64))+rsp]
1855	mov	r12,QWORD[((24+64))+rsp]
1856	lea	rdi,[64+rsp]
1857	call	__ecp_nistz256_mul_montq
1858
; Helper call: rbx -> slot 32, rdi -> slot 256. Slot 256 is later the
; "computed" candidate for output bytes 32..63.
1859	lea	rbx,[32+rsp]
1860	lea	rdi,[256+rsp]
1861	call	__ecp_nistz256_sub_fromq
1862
; Bytes 66 48 0F 7E C7 = movq rdi,xmm0: recover the destination pointer
; stashed before the first helper call. xmm0 becomes scratch again below.
1863DB	102,72,15,126,199
1864
; Branch-free selection of the three 32-byte output fields. Each field is
; built in two pand/pandn/por stages, 16 bytes per register pair:
;   stage 1 (mask xmm5, operand 1 field 3 all zero): mask ? alt : computed
;   stage 2 (mask xmm4, operand 2 all zero):         mask ? operand 1 copy
;                                                         : stage-1 result
;
; Output bytes 64..95:
;   xmm4 ? slot 384 : (xmm5 ? $L$ONE_mont : slot 288)
1865	movdqa	xmm0,xmm5
1866	movdqa	xmm1,xmm5
1867	pandn	xmm0,XMMWORD[288+rsp]
1868	movdqa	xmm2,xmm5
1869	pandn	xmm1,XMMWORD[((288+16))+rsp]
1870	movdqa	xmm3,xmm5
1871	pand	xmm2,XMMWORD[$L$ONE_mont]
1872	pand	xmm3,XMMWORD[(($L$ONE_mont+16))]
1873	por	xmm2,xmm0
1874	por	xmm3,xmm1
1875
1876	movdqa	xmm0,xmm4
1877	movdqa	xmm1,xmm4
1878	pandn	xmm0,xmm2
1879	movdqa	xmm2,xmm4
1880	pandn	xmm1,xmm3
1881	movdqa	xmm3,xmm4
1882	pand	xmm2,XMMWORD[384+rsp]
1883	pand	xmm3,XMMWORD[((384+16))+rsp]
1884	por	xmm2,xmm0
1885	por	xmm3,xmm1
1886	movdqu	XMMWORD[64+rdi],xmm2
1887	movdqu	XMMWORD[80+rdi],xmm3
1888
; Output bytes 0..31:
;   xmm4 ? slot 320 : (xmm5 ? slot 416 : slot 224)
1889	movdqa	xmm0,xmm5
1890	movdqa	xmm1,xmm5
1891	pandn	xmm0,XMMWORD[224+rsp]
1892	movdqa	xmm2,xmm5
1893	pandn	xmm1,XMMWORD[((224+16))+rsp]
1894	movdqa	xmm3,xmm5
1895	pand	xmm2,XMMWORD[416+rsp]
1896	pand	xmm3,XMMWORD[((416+16))+rsp]
1897	por	xmm2,xmm0
1898	por	xmm3,xmm1
1899
1900	movdqa	xmm0,xmm4
1901	movdqa	xmm1,xmm4
1902	pandn	xmm0,xmm2
1903	movdqa	xmm2,xmm4
1904	pandn	xmm1,xmm3
1905	movdqa	xmm3,xmm4
1906	pand	xmm2,XMMWORD[320+rsp]
1907	pand	xmm3,XMMWORD[((320+16))+rsp]
1908	por	xmm2,xmm0
1909	por	xmm3,xmm1
1910	movdqu	XMMWORD[rdi],xmm2
1911	movdqu	XMMWORD[16+rdi],xmm3
1912
; Output bytes 32..63:
;   xmm4 ? slot 352 : (xmm5 ? slot 448 : slot 256)
1913	movdqa	xmm0,xmm5
1914	movdqa	xmm1,xmm5
1915	pandn	xmm0,XMMWORD[256+rsp]
1916	movdqa	xmm2,xmm5
1917	pandn	xmm1,XMMWORD[((256+16))+rsp]
1918	movdqa	xmm3,xmm5
1919	pand	xmm2,XMMWORD[448+rsp]
1920	pand	xmm3,XMMWORD[((448+16))+rsp]
1921	por	xmm2,xmm0
1922	por	xmm3,xmm1
1923
1924	movdqa	xmm0,xmm4
1925	movdqa	xmm1,xmm4
1926	pandn	xmm0,xmm2
1927	movdqa	xmm2,xmm4
1928	pandn	xmm1,xmm3
1929	movdqa	xmm3,xmm4
1930	pand	xmm2,XMMWORD[352+rsp]
1931	pand	xmm3,XMMWORD[((352+16))+rsp]
1932	por	xmm2,xmm0
1933	por	xmm3,xmm1
1934	movdqu	XMMWORD[32+rdi],xmm2
1935	movdqu	XMMWORD[48+rdi],xmm3
1936
; Release the frame (same 32*15+8 as the prologue) and pop the callee-saved
; registers in reverse push order; rdi/rsi come back from the home slots.
1937	add	rsp,32*15+8
1938	pop	r15
1939	pop	r14
1940	pop	r13
1941	pop	r12
1942	pop	rbx
1943	pop	rbp
1944	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1945	mov	rsi,QWORD[16+rsp]
1946	DB	0F3h,0C3h		;repret
1947$L$SEH_end_ecp_nistz256_point_add_affine:
1948