;=======================================================================
; Montgomery multiplication for x86_64.
; Syntax: NASM (Intel operand order).  Target ABI: Microsoft x64 (WIN64).
; The description above is taken from the ASCII identification string
; that is embedded as DB bytes after bn_sqr8x_mont.
;
; Exported symbol : bn_mul_mont
; Imported symbol : OPENSSL_ia32cap_P (declared here; no reference to it
;                   appears in the code that follows in this listing)
;=======================================================================
default	rel
; The size keywords below are defined as empty so that operands written
; as XMMWORD[...] / YMMWORD[...] / ZMMWORD[...] assemble as plain [...].
%define XMMWORD
%define YMMWORD
%define ZMMWORD
section	.text code align=64


EXTERN	OPENSSL_ia32cap_P

global	bn_mul_mont

;-----------------------------------------------------------------------
; bn_mul_mont -- Montgomery multiplication, one-word-at-a-time path,
;                plus the dispatcher for the 4x and 8x-squaring paths.
;
; ABI:  Microsoft x64.  The prologue parks rdi/rsi in the caller's home
;       space and then rearranges the incoming arguments:
;         rdi <- rcx        result vector (written in $L$sub / $L$copy)
;         rsi <- rdx        first operand vector
;         rdx <- r8         second operand vector (kept in r12)
;         rcx <- r9         vector subtracted in $L$sub (the modulus)
;         r8  <- [40+rsp]   pointer; the word it points at is loaded
;                           into r8 and multiplies the low limb of every
;                           row to form the reduction factor
;                           (presumably the Montgomery n0 constant --
;                           confirm against the C caller)
;         r9  <- [48+rsp]   number of 64-bit words (low 32 bits used)
; Out:  rax = 1
; Saves/restores: rbx, rbp, r12-r15 (pushed), rdi, rsi (home space).
;
; Stack frame after $L$mul_page_walk_done (num = r9):
;         [rsp + 8*i], i = 0..num   scratch vector "tp" (num+1 words)
;         [rsp + 8*num + 8]         rsp value at function entry
;-----------------------------------------------------------------------
ALIGN	16
bn_mul_mont:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_bn_mul_mont:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]

	mov	r9d,r9d			; zero-extend the 32-bit word count
	mov	rax,rsp			; rax = entry rsp (used by epilogue and SEH)

; Dispatch on the word count and on whether both operands are the same:
;   num not a multiple of 4, or num < 8   -> generic loop below
;   operands differ                       -> $L$mul4x_enter
;   same operand and num multiple of 8    -> $L$sqr8x_enter
;   same operand otherwise                -> $L$mul4x_enter
	test	r9d,3
	jnz	NEAR $L$mul_enter
	cmp	r9d,8
	jb	NEAR $L$mul_enter
	cmp	rdx,rsi
	jne	NEAR $L$mul4x_enter
	test	r9d,7
	jz	NEAR $L$sqr8x_enter
	jmp	NEAR $L$mul4x_enter

ALIGN	16
$L$mul_enter:
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15

; r10 = (rsp - 8*num - 16) rounded down to a 1024-byte boundary: the new
; stack bottom.  r9 is negated only for the address computation.
	neg	r9
	mov	r11,rsp
	lea	r10,[((-16))+r9*8+rsp]
	neg	r9
	and	r10,-1024

; Move rsp down to r10 while touching one word in every 4096-byte page
; on the way, so the stack is extended page by page.
	sub	r11,r10
	and	r11,-4096
	lea	rsp,[r11*1+r10]
	mov	r11,QWORD[rsp]
	cmp	rsp,r10
	ja	NEAR $L$mul_page_walk
	jmp	NEAR $L$mul_page_walk_done

ALIGN	16
$L$mul_page_walk:
	lea	rsp,[((-4096))+rsp]
	mov	r11,QWORD[rsp]
	cmp	rsp,r10
	ja	NEAR $L$mul_page_walk
$L$mul_page_walk_done:

	mov	QWORD[8+r9*8+rsp],rax	; remember entry rsp above tp[num]

$L$mul_body:
; ---- first row: tp = a * b[0], reduced by one word --------------------
; r12 = b, rbx = b[0], r8 = reduction constant, r14 = row index,
; r15 = column index.
	mov	r12,rdx
	mov	r8,QWORD[r8]
	mov	rbx,QWORD[r12]
	mov	rax,QWORD[rsi]

	xor	r14,r14
	xor	r15,r15

	mov	rbp,r8
	mul	rbx			; rdx:rax = a[0]*b[0]
	mov	r10,rax
	mov	rax,QWORD[rcx]

	imul	rbp,r10			; rbp = low64(tp[0] * constant): row multiplier
	mov	r11,rdx

	mul	rbp			; n[0] * multiplier
	add	r10,rax			; sum is not stored; only its carry is kept
	mov	rax,QWORD[8+rsi]
	adc	rdx,0
	mov	r13,rdx

	lea	r15,[1+r15]
	jmp	NEAR $L$1st_enter

ALIGN	16
$L$1st:
	add	r13,rax
	mov	rax,QWORD[r15*8+rsi]
	adc	rdx,0
	add	r13,r11
	mov	r11,r10
	adc	rdx,0
	mov	QWORD[((-16))+r15*8+rsp],r13	; tp[j-2] (r15 already advanced)
	mov	r13,rdx

$L$1st_enter:
	mul	rbx			; a[j]*b[0]
	add	r11,rax
	mov	rax,QWORD[r15*8+rcx]
	adc	rdx,0
	lea	r15,[1+r15]
	mov	r10,rdx

	mul	rbp			; n[j]*multiplier
	cmp	r15,r9
	jne	NEAR $L$1st

; Tail of the first row: store the last two words and the top carry.
	add	r13,rax
	mov	rax,QWORD[rsi]		; preload a[0] for the next row
	adc	rdx,0
	add	r13,r11
	adc	rdx,0
	mov	QWORD[((-16))+r15*8+rsp],r13
	mov	r13,rdx
	mov	r11,r10

	xor	rdx,rdx
	add	r13,r11
	adc	rdx,0
	mov	QWORD[((-8))+r9*8+rsp],r13	; tp[num-1]
	mov	QWORD[r9*8+rsp],rdx		; tp[num] = top carry

	lea	r14,[1+r14]
	jmp	NEAR $L$outer
ALIGN	16
$L$outer:
; ---- rows 1..num-1: tp = (tp + a*b[i]) reduced by one word -------------
	mov	rbx,QWORD[r14*8+r12]	; rbx = b[i]
	xor	r15,r15
	mov	rbp,r8
	mov	r10,QWORD[rsp]		; tp[0]
	mul	rbx
	add	r10,rax
	mov	rax,QWORD[rcx]
	adc	rdx,0

	imul	rbp,r10			; row multiplier from the new low word
	mov	r11,rdx

	mul	rbp
	add	r10,rax			; sum is not stored; only its carry is kept
	mov	rax,QWORD[8+rsi]
	adc	rdx,0
	mov	r10,QWORD[8+rsp]	; tp[1]
	mov	r13,rdx

	lea	r15,[1+r15]
	jmp	NEAR $L$inner_enter

ALIGN	16
$L$inner:
	add	r13,rax
	mov	rax,QWORD[r15*8+rsi]
	adc	rdx,0
	add	r13,r10
	mov	r10,QWORD[r15*8+rsp]
	adc	rdx,0
	mov	QWORD[((-16))+r15*8+rsp],r13
	mov	r13,rdx

$L$inner_enter:
	mul	rbx
	add	r11,rax
	mov	rax,QWORD[r15*8+rcx]
	adc	rdx,0
	add	r10,r11
	mov	r11,rdx
	adc	r11,0
	lea	r15,[1+r15]

	mul	rbp
	cmp	r15,r9
	jne	NEAR $L$inner

	add	r13,rax
	mov	rax,QWORD[rsi]		; preload a[0] for the next row
	adc	rdx,0
	add	r13,r10
	mov	r10,QWORD[r15*8+rsp]	; previous tp[num]
	adc	rdx,0
	mov	QWORD[((-16))+r15*8+rsp],r13
	mov	r13,rdx

	xor	rdx,rdx
	add	r13,r11
	adc	rdx,0
	add	r13,r10
	adc	rdx,0
	mov	QWORD[((-8))+r9*8+rsp],r13
	mov	QWORD[r9*8+rsp],rdx

	lea	r14,[1+r14]
	cmp	r14,r9
	jb	NEAR $L$outer

; ---- result[] = tp[] - n[] with borrow propagation --------------------
; The xor clears CF for the first sbb; mov/lea/jmp do not touch flags
; and dec leaves CF alone, so the borrow carries across iterations.
	xor	r14,r14
	mov	rax,QWORD[rsp]
	lea	rsi,[rsp]		; rsi = tp
	mov	r15,r9
	jmp	NEAR $L$sub
ALIGN	16
$L$sub:
	sbb	rax,QWORD[r14*8+rcx]
	mov	QWORD[r14*8+rdi],rax
	mov	rax,QWORD[8+r14*8+rsi]
	lea	r14,[1+r14]
	dec	r15
	jnz	NEAR $L$sub

; rax = tp[num] - borrow.  Build the copy source without branching:
; rsi = (tp & rax) | (result & ~rax), i.e. tp when rax is all ones and
; the already-subtracted result when rax is zero.
	sbb	rax,0
	xor	r14,r14
	and	rsi,rax
	not	rax
	mov	rcx,rdi
	and	rcx,rax
	mov	r15,r9
	or	rsi,rcx
ALIGN	16
$L$copy:
	mov	rax,QWORD[r14*8+rsi]
	mov	QWORD[r14*8+rsp],r14	; overwrite the scratch word (with the index)
	mov	QWORD[r14*8+rdi],rax
	lea	r14,[1+r14]
	sub	r15,1
	jnz	NEAR $L$copy

; ---- epilogue: rsi = entry rsp, reload the six pushed registers -------
	mov	rsi,QWORD[8+r9*8+rsp]

	mov	rax,1
	mov	r15,QWORD[((-48))+rsi]
	mov	r14,QWORD[((-40))+rsi]
	mov	r13,QWORD[((-32))+rsi]
	mov	r12,QWORD[((-24))+rsi]
	mov	rbp,QWORD[((-16))+rsi]
	mov	rbx,QWORD[((-8))+rsi]
	lea	rsp,[rsi]

$L$mul_epilogue:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret

$L$SEH_end_bn_mul_mont:

;-----------------------------------------------------------------------
; bn_mul4x_mont -- Montgomery multiplication, inner loops unrolled 4x.
;
; Not declared global in this listing; bn_mul_mont branches to
; $L$mul4x_enter when the word count is a multiple of 4 and at least 8.
; The loop counters below ((num-4)>>2 iterations with dec/jnz) rely on
; that precondition.
;
; ABI and argument shuffle are the same as bn_mul_mont:
;         rdi = result, rsi = a, rdx = b (kept in r12), rcx = n,
;         r8  = pointer to the reduction constant word, r9 = num.
; Out:  rax = 1
;
; Stack frame after $L$mul4x_page_walk_done:
;         [rsp + 8*i], i = 0..num   scratch vector "tp"
;         [rsp + 8*num + 8]         rsp value at function entry
;         [rsp + 8*num + 16]        saved result pointer (rdi is reused
;                                   as an accumulator inside the loops)
;-----------------------------------------------------------------------
ALIGN	16
bn_mul4x_mont:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_bn_mul4x_mont:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]

	mov	r9d,r9d			; zero-extend the 32-bit word count
	mov	rax,rsp			; rax = entry rsp

$L$mul4x_enter:
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15

; r10 = (rsp - 8*num - 32) rounded down to 1024 bytes = new stack bottom.
	neg	r9
	mov	r11,rsp
	lea	r10,[((-32))+r9*8+rsp]
	neg	r9
	and	r10,-1024

; Lower rsp to r10, touching every 4096-byte page on the way down.
	sub	r11,r10
	and	r11,-4096
	lea	rsp,[r11*1+r10]
	mov	r11,QWORD[rsp]
	cmp	rsp,r10
	ja	NEAR $L$mul4x_page_walk
	jmp	NEAR $L$mul4x_page_walk_done

$L$mul4x_page_walk:
	lea	rsp,[((-4096))+rsp]
	mov	r11,QWORD[rsp]
	cmp	rsp,r10
	ja	NEAR $L$mul4x_page_walk
$L$mul4x_page_walk_done:

	mov	QWORD[8+r9*8+rsp],rax	; remember entry rsp

$L$mul4x_body:
	mov	QWORD[16+r9*8+rsp],rdi	; remember result pointer
	mov	r12,rdx			; r12 = b
	mov	r8,QWORD[r8]		; r8 = reduction constant
	mov	rbx,QWORD[r12]		; rbx = b[0]
	mov	rax,QWORD[rsi]

	xor	r14,r14			; r14 = row index
	xor	r15,r15			; r15 = column index

; ---- first row, columns 0 and 1 ---------------------------------------
	mov	rbp,r8
	mul	rbx
	mov	r10,rax
	mov	rax,QWORD[rcx]

	imul	rbp,r10			; row multiplier
	mov	r11,rdx

	mul	rbp
	add	r10,rax			; sum is not stored; only its carry is kept
	mov	rax,QWORD[8+rsi]
	adc	rdx,0
	mov	rdi,rdx

	mul	rbx
	add	r11,rax
	mov	rax,QWORD[8+rcx]
	adc	rdx,0
	mov	r10,rdx

	mul	rbp
	add	rdi,rax
	mov	rax,QWORD[16+rsi]
	adc	rdx,0
	add	rdi,r11
	lea	r15,[4+r15]
	adc	rdx,0
	mov	QWORD[rsp],rdi		; tp[0]
	mov	r13,rdx
	jmp	NEAR $L$1st4x
ALIGN	16
$L$1st4x:
; ---- first row, four columns per iteration ----------------------------
; Two interleaved chains: r10/r11 carry a[j]*b[0], r13/rdi carry
; n[j]*multiplier plus the previous sum.
	mul	rbx
	add	r10,rax
	mov	rax,QWORD[((-16))+r15*8+rcx]
	adc	rdx,0
	mov	r11,rdx

	mul	rbp
	add	r13,rax
	mov	rax,QWORD[((-8))+r15*8+rsi]
	adc	rdx,0
	add	r13,r10
	adc	rdx,0
	mov	QWORD[((-24))+r15*8+rsp],r13
	mov	rdi,rdx

	mul	rbx
	add	r11,rax
	mov	rax,QWORD[((-8))+r15*8+rcx]
	adc	rdx,0
	mov	r10,rdx

	mul	rbp
	add	rdi,rax
	mov	rax,QWORD[r15*8+rsi]
	adc	rdx,0
	add	rdi,r11
	adc	rdx,0
	mov	QWORD[((-16))+r15*8+rsp],rdi
	mov	r13,rdx

	mul	rbx
	add	r10,rax
	mov	rax,QWORD[r15*8+rcx]
	adc	rdx,0
	mov	r11,rdx

	mul	rbp
	add	r13,rax
	mov	rax,QWORD[8+r15*8+rsi]
	adc	rdx,0
	add	r13,r10
	adc	rdx,0
	mov	QWORD[((-8))+r15*8+rsp],r13
	mov	rdi,rdx

	mul	rbx
	add	r11,rax
	mov	rax,QWORD[8+r15*8+rcx]
	adc	rdx,0
	lea	r15,[4+r15]
	mov	r10,rdx

	mul	rbp
	add	rdi,rax
	mov	rax,QWORD[((-16))+r15*8+rsi]
	adc	rdx,0
	add	rdi,r11
	adc	rdx,0
	mov	QWORD[((-32))+r15*8+rsp],rdi
	mov	r13,rdx
	cmp	r15,r9
	jb	NEAR $L$1st4x

; ---- first row, last two columns and top carry ------------------------
	mul	rbx
	add	r10,rax
	mov	rax,QWORD[((-16))+r15*8+rcx]
	adc	rdx,0
	mov	r11,rdx

	mul	rbp
	add	r13,rax
	mov	rax,QWORD[((-8))+r15*8+rsi]
	adc	rdx,0
	add	r13,r10
	adc	rdx,0
	mov	QWORD[((-24))+r15*8+rsp],r13
	mov	rdi,rdx

	mul	rbx
	add	r11,rax
	mov	rax,QWORD[((-8))+r15*8+rcx]
	adc	rdx,0
	mov	r10,rdx

	mul	rbp
	add	rdi,rax
	mov	rax,QWORD[rsi]		; preload a[0] for the next row
	adc	rdx,0
	add	rdi,r11
	adc	rdx,0
	mov	QWORD[((-16))+r15*8+rsp],rdi
	mov	r13,rdx

	xor	rdi,rdi
	add	r13,r10
	adc	rdi,0
	mov	QWORD[((-8))+r15*8+rsp],r13
	mov	QWORD[r15*8+rsp],rdi	; tp[num] = top carry

	lea	r14,[1+r14]
ALIGN	4
$L$outer4x:
; ---- rows 1..num-1: columns 0 and 1, accumulating into tp -------------
	mov	rbx,QWORD[r14*8+r12]	; rbx = b[i]
	xor	r15,r15
	mov	r10,QWORD[rsp]
	mov	rbp,r8
	mul	rbx
	add	r10,rax
	mov	rax,QWORD[rcx]
	adc	rdx,0

	imul	rbp,r10			; row multiplier
	mov	r11,rdx

	mul	rbp
	add	r10,rax			; sum is not stored; only its carry is kept
	mov	rax,QWORD[8+rsi]
	adc	rdx,0
	mov	rdi,rdx

	mul	rbx
	add	r11,rax
	mov	rax,QWORD[8+rcx]
	adc	rdx,0
	add	r11,QWORD[8+rsp]
	adc	rdx,0
	mov	r10,rdx

	mul	rbp
	add	rdi,rax
	mov	rax,QWORD[16+rsi]
	adc	rdx,0
	add	rdi,r11
	lea	r15,[4+r15]
	adc	rdx,0
	mov	QWORD[rsp],rdi
	mov	r13,rdx
	jmp	NEAR $L$inner4x
ALIGN	16
$L$inner4x:
; ---- rows 1..num-1: four columns per iteration ------------------------
	mul	rbx
	add	r10,rax
	mov	rax,QWORD[((-16))+r15*8+rcx]
	adc	rdx,0
	add	r10,QWORD[((-16))+r15*8+rsp]
	adc	rdx,0
	mov	r11,rdx

	mul	rbp
	add	r13,rax
	mov	rax,QWORD[((-8))+r15*8+rsi]
	adc	rdx,0
	add	r13,r10
	adc	rdx,0
	mov	QWORD[((-24))+r15*8+rsp],r13
	mov	rdi,rdx

	mul	rbx
	add	r11,rax
	mov	rax,QWORD[((-8))+r15*8+rcx]
	adc	rdx,0
	add	r11,QWORD[((-8))+r15*8+rsp]
	adc	rdx,0
	mov	r10,rdx

	mul	rbp
	add	rdi,rax
	mov	rax,QWORD[r15*8+rsi]
	adc	rdx,0
	add	rdi,r11
	adc	rdx,0
	mov	QWORD[((-16))+r15*8+rsp],rdi
	mov	r13,rdx

	mul	rbx
	add	r10,rax
	mov	rax,QWORD[r15*8+rcx]
	adc	rdx,0
	add	r10,QWORD[r15*8+rsp]
	adc	rdx,0
	mov	r11,rdx

	mul	rbp
	add	r13,rax
	mov	rax,QWORD[8+r15*8+rsi]
	adc	rdx,0
	add	r13,r10
	adc	rdx,0
	mov	QWORD[((-8))+r15*8+rsp],r13
	mov	rdi,rdx

	mul	rbx
	add	r11,rax
	mov	rax,QWORD[8+r15*8+rcx]
	adc	rdx,0
	add	r11,QWORD[8+r15*8+rsp]
	adc	rdx,0
	lea	r15,[4+r15]
	mov	r10,rdx

	mul	rbp
	add	rdi,rax
	mov	rax,QWORD[((-16))+r15*8+rsi]
	adc	rdx,0
	add	rdi,r11
	adc	rdx,0
	mov	QWORD[((-32))+r15*8+rsp],rdi
	mov	r13,rdx
	cmp	r15,r9
	jb	NEAR $L$inner4x

; ---- rows 1..num-1: last two columns and top carry --------------------
	mul	rbx
	add	r10,rax
	mov	rax,QWORD[((-16))+r15*8+rcx]
	adc	rdx,0
	add	r10,QWORD[((-16))+r15*8+rsp]
	adc	rdx,0
	mov	r11,rdx

	mul	rbp
	add	r13,rax
	mov	rax,QWORD[((-8))+r15*8+rsi]
	adc	rdx,0
	add	r13,r10
	adc	rdx,0
	mov	QWORD[((-24))+r15*8+rsp],r13
	mov	rdi,rdx

	mul	rbx
	add	r11,rax
	mov	rax,QWORD[((-8))+r15*8+rcx]
	adc	rdx,0
	add	r11,QWORD[((-8))+r15*8+rsp]
	adc	rdx,0
	lea	r14,[1+r14]		; next row
	mov	r10,rdx

	mul	rbp
	add	rdi,rax
	mov	rax,QWORD[rsi]		; preload a[0] for the next row
	adc	rdx,0
	add	rdi,r11
	adc	rdx,0
	mov	QWORD[((-16))+r15*8+rsp],rdi
	mov	r13,rdx

	xor	rdi,rdi
	add	r13,r10
	adc	rdi,0
	add	r13,QWORD[r9*8+rsp]	; add previous tp[num]
	adc	rdi,0
	mov	QWORD[((-8))+r15*8+rsp],r13
	mov	QWORD[r15*8+rsp],rdi

	cmp	r14,r9
	jb	NEAR $L$outer4x

; ---- result[] = tp[] - n[], four words per iteration ------------------
; r15 = (num-4)/4 loop iterations; the remaining words are handled by
; the straight-line code before and after the loop.  lea and dec leave
; CF alone, so the borrow chain runs through the whole vector.
	mov	rdi,QWORD[16+r9*8+rsp]	; reload result pointer
	lea	r15,[((-4))+r9]
	mov	rax,QWORD[rsp]
	pxor	xmm0,xmm0		; zero used later to wipe tp
	mov	rdx,QWORD[8+rsp]
	shr	r15,2
	lea	rsi,[rsp]		; rsi = tp
	xor	r14,r14

	sub	rax,QWORD[rcx]
	mov	rbx,QWORD[16+rsi]
	mov	rbp,QWORD[24+rsi]
	sbb	rdx,QWORD[8+rcx]
	jmp	NEAR $L$sub4x
ALIGN	16
$L$sub4x:
	mov	QWORD[r14*8+rdi],rax
	mov	QWORD[8+r14*8+rdi],rdx
	sbb	rbx,QWORD[16+r14*8+rcx]
	mov	rax,QWORD[32+r14*8+rsi]
	mov	rdx,QWORD[40+r14*8+rsi]
	sbb	rbp,QWORD[24+r14*8+rcx]
	mov	QWORD[16+r14*8+rdi],rbx
	mov	QWORD[24+r14*8+rdi],rbp
	sbb	rax,QWORD[32+r14*8+rcx]
	mov	rbx,QWORD[48+r14*8+rsi]
	mov	rbp,QWORD[56+r14*8+rsi]
	sbb	rdx,QWORD[40+r14*8+rcx]
	lea	r14,[4+r14]
	dec	r15
	jnz	NEAR $L$sub4x

	mov	QWORD[r14*8+rdi],rax
	mov	rax,QWORD[32+r14*8+rsi]	; rax = tp[num]
	sbb	rbx,QWORD[16+r14*8+rcx]
	mov	QWORD[8+r14*8+rdi],rdx
	sbb	rbp,QWORD[24+r14*8+rcx]
	mov	QWORD[16+r14*8+rdi],rbx

; rax = tp[num] - borrow.  Branch-free source select:
; rsi = (tp & rax) | (result & ~rax).
	sbb	rax,0
	mov	QWORD[24+r14*8+rdi],rbp
	xor	r14,r14
	and	rsi,rax
	not	rax
	mov	rcx,rdi
	and	rcx,rax
	lea	r15,[((-4))+r9]
	or	rsi,rcx
	shr	r15,2

; Copy the selected vector to the result 16 bytes at a time while
; zeroing tp.  r14 is a byte offset here.
	movdqu	xmm1,XMMWORD[rsi]
	movdqa	XMMWORD[rsp],xmm0
	movdqu	XMMWORD[rdi],xmm1
	jmp	NEAR $L$copy4x
ALIGN	16
$L$copy4x:
	movdqu	xmm2,XMMWORD[16+r14*1+rsi]
	movdqu	xmm1,XMMWORD[32+r14*1+rsi]
	movdqa	XMMWORD[16+r14*1+rsp],xmm0
	movdqu	XMMWORD[16+r14*1+rdi],xmm2
	movdqa	XMMWORD[32+r14*1+rsp],xmm0
	movdqu	XMMWORD[32+r14*1+rdi],xmm1
	lea	r14,[32+r14]
	dec	r15
	jnz	NEAR $L$copy4x

	movdqu	xmm2,XMMWORD[16+r14*1+rsi]
	movdqa	XMMWORD[16+r14*1+rsp],xmm0
	movdqu	XMMWORD[16+r14*1+rdi],xmm2

; ---- epilogue: rsi = entry rsp, reload the six pushed registers -------
	mov	rsi,QWORD[8+r9*8+rsp]

	mov	rax,1
	mov	r15,QWORD[((-48))+rsi]
	mov	r14,QWORD[((-40))+rsi]
	mov	r13,QWORD[((-32))+rsi]
	mov	r12,QWORD[((-24))+rsi]
	mov	rbp,QWORD[((-16))+rsi]
	mov	rbx,QWORD[((-8))+rsi]
	lea	rsp,[rsi]

$L$mul4x_epilogue:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret

$L$SEH_end_bn_mul4x_mont:
; The squaring core lives in another object file.
EXTERN	bn_sqr8x_internal


;-----------------------------------------------------------------------
; bn_sqr8x_mont -- Montgomery squaring front end around
;                  bn_sqr8x_internal.
;
; Not declared global in this listing; bn_mul_mont branches to
; $L$sqr8x_enter when both operand pointers are equal and the word
; count is a multiple of 8.
;
; ABI and argument shuffle are the same as bn_mul_mont:
;         rdi = result, rsi = a, rdx = b, rcx = n,
;         r8  = pointer to the reduction constant word, r9 = num.
; Out:  rax = 1
;
; Stack frame after $L$sqr8x_page_walk_done (rsp is 64-byte aligned):
;         [rsp + 32]   reduction constant word
;         [rsp + 40]   rsp value at function entry
;
; NOTE(review): the register state on return from bn_sqr8x_internal
; (rax, rbp, rdi and the sign of r9) is defined by that routine, which
; is not visible here.  The comments after the call describe only what
; this code does with those registers -- confirm against the callee.
;-----------------------------------------------------------------------
ALIGN	32
bn_sqr8x_mont:
	mov	QWORD[8+rsp],rdi	;WIN64 prologue
	mov	QWORD[16+rsp],rsi
	mov	rax,rsp
$L$SEH_begin_bn_sqr8x_mont:
	mov	rdi,rcx
	mov	rsi,rdx
	mov	rdx,r8
	mov	rcx,r9
	mov	r8,QWORD[40+rsp]
	mov	r9,QWORD[48+rsp]

	mov	rax,rsp			; rax = entry rsp

$L$sqr8x_enter:
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
$L$sqr8x_prologue:

	mov	r10d,r9d
	shl	r9d,3			; r9  = num*8  (bytes)
	shl	r10,3+2			; r10 = num*32
	neg	r9			; r9  = -num*8

; Choose the frame address rbp.  r11 = (rsp - 64 - 2*num*8 - rsi) mod
; 4096 is the page offset of the tentative frame relative to the input
; vector.  NOTE(review): this looks intended to keep the frame from
; aliasing the input modulo 4096 -- confirm the exact policy.
	lea	r11,[((-64))+r9*2+rsp]
	mov	rbp,rsp
	mov	r8,QWORD[r8]		; r8 = reduction constant
	sub	r11,rsi
	and	r11,4095
	cmp	r10,r11
	jb	NEAR $L$sqr8x_sp_alt
	sub	rbp,r11
	lea	rbp,[((-64))+r9*2+rbp]
	jmp	NEAR $L$sqr8x_sp_done

ALIGN	32
$L$sqr8x_sp_alt:
	lea	r10,[((4096-64))+r9*2]
	lea	rbp,[((-64))+r9*2+rbp]
	sub	r11,r10
	mov	r10,0			; mov keeps CF from the sub for cmovc
	cmovc	r11,r10			; clamp the adjustment to 0 on borrow
	sub	rbp,r11
$L$sqr8x_sp_done:
	and	rbp,-64			; 64-byte align the frame

; Lower rsp to rbp, touching every 4096-byte page on the way down.
	mov	r11,rsp
	sub	r11,rbp
	and	r11,-4096
	lea	rsp,[rbp*1+r11]
	mov	r10,QWORD[rsp]
	cmp	rsp,rbp
	ja	NEAR $L$sqr8x_page_walk
	jmp	NEAR $L$sqr8x_page_walk_done

ALIGN	16
$L$sqr8x_page_walk:
	lea	rsp,[((-4096))+rsp]
	mov	r10,QWORD[rsp]
	cmp	rsp,rbp
	ja	NEAR $L$sqr8x_page_walk
$L$sqr8x_page_walk_done:

	mov	r10,r9			; r10 = -num*8
	neg	r9			; r9  = +num*8

	mov	QWORD[32+rsp],r8
	mov	QWORD[40+rsp],rax

$L$sqr8x_body:

; Park values in SSE registers across the call.  The instructions are
; emitted as raw bytes; decoded they are the movq forms shown.
DB	102,72,15,110,209		; movq xmm2,rcx
	pxor	xmm0,xmm0
DB	102,72,15,110,207		; movq xmm1,rdi  (result pointer)
DB	102,73,15,110,218		; movq xmm3,r10
	call	bn_sqr8x_internal

; ---- result[] = t[] - n[], four words per iteration -------------------
; rcx = r9 >> 5 is used as an up-counter (inc/jnz), so r9 is expected
; to be negative here.  lea and inc leave CF alone, so the borrow chain
; runs through the whole vector.
	lea	rbx,[r9*1+rdi]
	mov	rcx,r9
	mov	rdx,r9
DB	102,72,15,126,207		; movq rdi,xmm1  (restore result pointer)
	sar	rcx,3+2
	jmp	NEAR $L$sqr8x_sub

ALIGN	32
$L$sqr8x_sub:
	mov	r12,QWORD[rbx]
	mov	r13,QWORD[8+rbx]
	mov	r14,QWORD[16+rbx]
	mov	r15,QWORD[24+rbx]
	lea	rbx,[32+rbx]
	sbb	r12,QWORD[rbp]
	sbb	r13,QWORD[8+rbp]
	sbb	r14,QWORD[16+rbp]
	sbb	r15,QWORD[24+rbp]
	lea	rbp,[32+rbp]
	mov	QWORD[rdi],r12
	mov	QWORD[8+rdi],r13
	mov	QWORD[16+rdi],r14
	mov	QWORD[24+rdi],r15
	lea	rdi,[32+rdi]
	inc	rcx
	jnz	NEAR $L$sqr8x_sub

	sbb	rax,0			; fold the final borrow into rax
	lea	rbx,[r9*1+rbx]		; rewind both pointers by r9
	lea	rdi,[r9*1+rdi]

; xmm1 = low dword of rax broadcast to all four lanes: the select mask.
DB	102,72,15,110,200		; movq xmm1,rax
	pxor	xmm0,xmm0
	pshufd	xmm1,xmm1,0
	mov	rsi,QWORD[40+rsp]	; rsi = entry rsp, for the epilogue

	jmp	NEAR $L$sqr8x_cond_copy

ALIGN	32
$L$sqr8x_cond_copy:
; Per 32 bytes: result = (t & mask) | (result & (mask == 0)), where
; pcmpeqd against zero builds the complementary mask.  The two 32-byte
; scratch areas at rbx-32 and rbx+rdx-32 are zeroed in the same pass.
	movdqa	xmm2,XMMWORD[rbx]
	movdqa	xmm3,XMMWORD[16+rbx]
	lea	rbx,[32+rbx]
	movdqu	xmm4,XMMWORD[rdi]
	movdqu	xmm5,XMMWORD[16+rdi]
	lea	rdi,[32+rdi]
	movdqa	XMMWORD[(-32)+rbx],xmm0
	movdqa	XMMWORD[(-16)+rbx],xmm0
	movdqa	XMMWORD[(-32)+rdx*1+rbx],xmm0
	movdqa	XMMWORD[(-16)+rdx*1+rbx],xmm0
	pcmpeqd	xmm0,xmm1
	pand	xmm2,xmm1
	pand	xmm3,xmm1
	pand	xmm4,xmm0
	pand	xmm5,xmm0
	pxor	xmm0,xmm0
	por	xmm4,xmm2
	por	xmm5,xmm3
	movdqu	XMMWORD[(-32)+rdi],xmm4
	movdqu	XMMWORD[(-16)+rdi],xmm5
	add	r9,32
	jnz	NEAR $L$sqr8x_cond_copy

; ---- epilogue: reload the six pushed registers from the entry frame ---
	mov	rax,1
	mov	r15,QWORD[((-48))+rsi]
	mov	r14,QWORD[((-40))+rsi]
	mov	r13,QWORD[((-32))+rsi]
	mov	r12,QWORD[((-24))+rsi]
	mov	rbp,QWORD[((-16))+rsi]
	mov	rbx,QWORD[((-8))+rsi]
	lea	rsp,[rsi]

$L$sqr8x_epilogue:
	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
	mov	rsi,QWORD[16+rsp]
	DB	0F3h,0C3h		;repret

$L$SEH_end_bn_sqr8x_mont:
; NUL-terminated ASCII identification string:
; "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
DB	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
DB	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
DB	54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83
DB	32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
DB	115,108,46,111,114,103,62,0
ALIGN	16
; Import-table slot for RtlVirtualUnwind; called indirectly from the
; shared tail of the SEH handlers below.
EXTERN	__imp_RtlVirtualUnwind

;-----------------------------------------------------------------------
; mul_handler -- Win64 language-specific exception handler for
;                bn_mul_mont and bn_mul4x_mont (see the .xdata records).
;
; In:   r8 = context record, r9 = dispatcher context (the only two
;       arguments this code reads).
;       Offsets used follow the Win64 CONTEXT / DISPATCHER_CONTEXT
;       layouts: context +120 Rax, +152 Rsp, +192 R9, +248 Rip;
;       dispatcher +8 ImageBase, +56 HandlerData.
;       HandlerData[0] = RVA of the body label,
;       HandlerData[1] = RVA of the epilogue label.
;
; Works out which stack pointer value the interrupted function had on
; entry and leaves it in rax:
;   Rip <  body      : prologue still running, entry rsp is in Rax
;   Rip >= epilogue  : frame already torn down, entry rsp is Rsp
;   otherwise        : entry rsp was stored at [Rsp + 8*num + 8], with
;                      num taken from the context's R9
; and then joins the shared code in sqr_handler ($L$common_pop_regs or
; $L$common_seh_tail), which also undoes the pushes made here.
;-----------------------------------------------------------------------
ALIGN	16
mul_handler:
	push	rsi
	push	rdi
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	pushfq
	sub	rsp,64

	mov	rax,QWORD[120+r8]	; context Rax
	mov	rbx,QWORD[248+r8]	; context Rip

	mov	rsi,QWORD[8+r9]		; image base
	mov	r11,QWORD[56+r9]	; handler data

	mov	r10d,DWORD[r11]		; body label RVA
	lea	r10,[r10*1+rsi]
	cmp	rbx,r10
	jb	NEAR $L$common_seh_tail

	mov	rax,QWORD[152+r8]	; context Rsp

	mov	r10d,DWORD[4+r11]	; epilogue label RVA
	lea	r10,[r10*1+rsi]
	cmp	rbx,r10
	jae	NEAR $L$common_seh_tail

	mov	r10,QWORD[192+r8]	; context R9 = num
	mov	rax,QWORD[8+r10*8+rax]	; entry rsp saved in the frame

	jmp	NEAR $L$common_pop_regs



;-----------------------------------------------------------------------
; sqr_handler -- Win64 language-specific exception handler for
;                bn_sqr8x_mont, followed by the tail shared with
;                mul_handler.
;
; In:   r8 = context record, r9 = dispatcher context.
;       HandlerData[0] = RVA of the prologue label (after the pushes),
;       HandlerData[1] = RVA of the body label,
;       HandlerData[2] = RVA of the epilogue label.
;
;   Rip <  prologue  : nothing pushed yet, entry rsp is in Rax
;   Rip <  body      : pushes done, entry rsp still in Rax
;   Rip >= epilogue  : frame already torn down, entry rsp is Rsp
;   otherwise        : entry rsp was stored at [Rsp + 40]
;
; $L$common_pop_regs  : rax = entry rsp; copies the six registers saved
;                       just below it into the context record.
; $L$common_seh_tail  : rax = entry rsp; restores Rsp/Rsi/Rdi in the
;                       context, copies the context to the dispatcher's
;                       context record, calls RtlVirtualUnwind and
;                       returns 1 in eax.
;-----------------------------------------------------------------------
ALIGN	16
sqr_handler:
	push	rsi
	push	rdi
	push	rbx
	push	rbp
	push	r12
	push	r13
	push	r14
	push	r15
	pushfq
	sub	rsp,64

	mov	rax,QWORD[120+r8]	; context Rax
	mov	rbx,QWORD[248+r8]	; context Rip

	mov	rsi,QWORD[8+r9]		; image base
	mov	r11,QWORD[56+r9]	; handler data

	mov	r10d,DWORD[r11]		; prologue label RVA
	lea	r10,[r10*1+rsi]
	cmp	rbx,r10
	jb	NEAR $L$common_seh_tail

	mov	r10d,DWORD[4+r11]	; body label RVA
	lea	r10,[r10*1+rsi]
	cmp	rbx,r10
	jb	NEAR $L$common_pop_regs

	mov	rax,QWORD[152+r8]	; context Rsp

	mov	r10d,DWORD[8+r11]	; epilogue label RVA
	lea	r10,[r10*1+rsi]
	cmp	rbx,r10
	jae	NEAR $L$common_seh_tail

	mov	rax,QWORD[40+rax]	; entry rsp saved in the frame

$L$common_pop_regs:
; Fetch the registers pushed right below the entry rsp and write them
; into the context record slots for Rbx, Rbp and R12-R15.
	mov	rbx,QWORD[((-8))+rax]
	mov	rbp,QWORD[((-16))+rax]
	mov	r12,QWORD[((-24))+rax]
	mov	r13,QWORD[((-32))+rax]
	mov	r14,QWORD[((-40))+rax]
	mov	r15,QWORD[((-48))+rax]
	mov	QWORD[144+r8],rbx
	mov	QWORD[160+r8],rbp
	mov	QWORD[216+r8],r12
	mov	QWORD[224+r8],r13
	mov	QWORD[232+r8],r14
	mov	QWORD[240+r8],r15

$L$common_seh_tail:
; rdi/rsi were parked at [entry rsp + 8] and [entry rsp + 16] by the
; WIN64 prologue of the interrupted function.
	mov	rdi,QWORD[8+rax]
	mov	rsi,QWORD[16+rax]
	mov	QWORD[152+r8],rax	; context Rsp
	mov	QWORD[168+r8],rsi	; context Rsi
	mov	QWORD[176+r8],rdi	; context Rdi

; Copy 154 quadwords from the context record (r8) to the dispatcher's
; context record.  The DD encodes: cld ; rep movsq.
	mov	rdi,QWORD[40+r9]
	mov	rsi,r8
	mov	ecx,154
	DD	0xa548f3fc

; RtlVirtualUnwind(0, ImageBase, ControlPc, FunctionEntry,
;                  ContextRecord, &HandlerData, &EstablisherFrame, NULL)
; with the first four arguments in rcx, rdx, r8, r9 and the rest in the
; stack slots above the 32-byte shadow space.
	mov	rsi,r9
	xor	rcx,rcx
	mov	rdx,QWORD[8+rsi]
	mov	r8,QWORD[rsi]
	mov	r9,QWORD[16+rsi]
	mov	r10,QWORD[40+rsi]
	lea	r11,[56+rsi]
	lea	r12,[24+rsi]
	mov	QWORD[32+rsp],r10
	mov	QWORD[40+rsp],r11
	mov	QWORD[48+rsp],r12
	mov	QWORD[56+rsp],rcx
	call	QWORD[__imp_RtlVirtualUnwind]

	mov	eax,1			; handler return value
	add	rsp,64
	popfq
	pop	r15
	pop	r14
	pop	r13
	pop	r12
	pop	rbp
	pop	rbx
	pop	rdi
	pop	rsi
	DB	0F3h,0C3h		;repret


;-----------------------------------------------------------------------
; Win64 structured-exception-handling tables.
;
; .pdata: one record per function, three image-relative addresses each:
;         function begin, function end, unwind info.
; .xdata: one unwind-info record per function.  The leading byte 9 is
;         version 1 with the exception-handler flag set; the three
;         following zero bytes mean no prologue unwind codes.  It is
;         followed by the handler's RVA and the label RVAs that the
;         handler reads as its HandlerData.
;-----------------------------------------------------------------------
section	.pdata rdata align=4
ALIGN	4
	DD	$L$SEH_begin_bn_mul_mont wrt ..imagebase
	DD	$L$SEH_end_bn_mul_mont wrt ..imagebase
	DD	$L$SEH_info_bn_mul_mont wrt ..imagebase

	DD	$L$SEH_begin_bn_mul4x_mont wrt ..imagebase
	DD	$L$SEH_end_bn_mul4x_mont wrt ..imagebase
	DD	$L$SEH_info_bn_mul4x_mont wrt ..imagebase

	DD	$L$SEH_begin_bn_sqr8x_mont wrt ..imagebase
	DD	$L$SEH_end_bn_sqr8x_mont wrt ..imagebase
	DD	$L$SEH_info_bn_sqr8x_mont wrt ..imagebase
section	.xdata rdata align=8
ALIGN	8
$L$SEH_info_bn_mul_mont:
DB	9,0,0,0
	DD	mul_handler wrt ..imagebase
; HandlerData for mul_handler: body, epilogue
	DD	$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
$L$SEH_info_bn_mul4x_mont:
DB	9,0,0,0
	DD	mul_handler wrt ..imagebase
; HandlerData for mul_handler: body, epilogue
	DD	$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
$L$SEH_info_bn_sqr8x_mont:
DB	9,0,0,0
	DD	sqr_handler wrt ..imagebase
; HandlerData for sqr_handler: prologue, body, epilogue
	DD	$L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase
ALIGN	8

1073