• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; This file is generated from a similarly-named Perl script in the BoringSSL
2; source tree. Do not edit by hand.
3
4default	rel
5%define XMMWORD
6%define YMMWORD
7%define ZMMWORD
8
9%include "ring_core_generated/prefix_symbols_nasm.inc"
10section	.text code align=64
11
12
13EXTERN	OPENSSL_ia32cap_P
14
15global	bn_mul_mont_gather5
16
17ALIGN	64
;-----------------------------------------------------------------------
; bn_mul_mont_gather5
; ABI:  Microsoft x64. The incoming rcx, rdx, r8, r9 and the two stack
;       arguments at [40+rsp] / [48+rsp] are moved into rdi, rsi, rdx,
;       rcx, r8, r9. A seventh 32-bit argument is read from [56+rsp].
; Register roles after that shuffle, as used by the code below:
;   rdi  = destination words (written in $L$sub and $L$copy)
;   rsi  = first operand, read as QWORD[i*8+rsi]
;   rdx  = base of the table the second operand is gathered from
;   rcx  = words that get multiplied by the per-pass factor in rbp
;   r8   = pointer to a 64-bit constant, dereferenced once
;   r9   = word count (zero-extended from r9d)
;   xmm5 = 32-bit table index from [56+rsp]
; "tp" in the comments below means the temporary word array at [rsp].
; NOTE(review): judging by the symbol name this is Montgomery
; multiplication with the multiplier gathered from a table -- confirm
; against the generating Perl script.
; Out:  rax = 1 on the generic path. rbx, rbp, r12-r15, rdi and rsi are
;       restored before returning.
;-----------------------------------------------------------------------
18bn_mul_mont_gather5:
19	mov	QWORD[8+rsp],rdi	;WIN64 prologue
20	mov	QWORD[16+rsp],rsi
21	mov	rax,rsp
22$L$SEH_begin_bn_mul_mont_gather5:
23	mov	rdi,rcx
24	mov	rsi,rdx
25	mov	rdx,r8
26	mov	rcx,r9
27	mov	r8,QWORD[40+rsp]
28	mov	r9,QWORD[48+rsp]
29
30
31
; Writing r9d clears the upper 32 bits of r9.
32	mov	r9d,r9d
33	mov	rax,rsp
34
; Word counts that are not a multiple of 8 take the generic loop below.
; Otherwise r11d = DWORD[8+OPENSSL_ia32cap_P] and control transfers to
; $L$mul4x_enter inside bn_mul4x_mont_gather5.
35	test	r9d,7
36	jnz	NEAR $L$mul_enter
37	lea	r11,[OPENSSL_ia32cap_P]
38	mov	r11d,DWORD[8+r11]
39	jmp	NEAR $L$mul4x_enter
40
41ALIGN	16
42$L$mul_enter:
; xmm5 = seventh argument, read before the pushes move rsp.
43	movd	xmm5,DWORD[56+rsp]
44	push	rbx
45
46	push	rbp
47
48	push	r12
49
50	push	r13
51
52	push	r14
53
54	push	r15
55
56
; r10 = (rsp - 280 - 8*r9) rounded down to a 1024-byte boundary; this
; becomes the new stack pointer. r9 is negated only for the lea.
57	neg	r9
58	mov	r11,rsp
59	lea	r10,[((-280))+r9*8+rsp]
60	neg	r9
61	and	r10,-1024
62
63
64
65
66
67
68
69
70
; Lower rsp to r10 in 4096-byte steps, loading from every step on the
; way down. The loaded value in r11 is not used.
71	sub	r11,r10
72	and	r11,-4096
73	lea	rsp,[r11*1+r10]
74	mov	r11,QWORD[rsp]
75	cmp	rsp,r10
76	ja	NEAR $L$mul_page_walk
77	jmp	NEAR $L$mul_page_walk_done
78
79$L$mul_page_walk:
80	lea	rsp,[((-4096))+rsp]
81	mov	r11,QWORD[rsp]
82	cmp	rsp,r10
83	ja	NEAR $L$mul_page_walk
84$L$mul_page_walk_done:
85
; r10 = address of $L$inc (defined outside this view).
; The entry rsp (rax) is saved in the slot right after tp[0..r9].
86	lea	r10,[$L$inc]
87	mov	QWORD[8+r9*8+rsp],rax
88
89$L$mul_body:
90
; r12 = table base + 128, so the 16 vectors of a row are addressed with
; displacements -128 .. +112.
; xmm0 / xmm1 = the two 16-byte vectors at $L$inc; xmm4 keeps a copy of
; the second one and is added at every step below.
; r10 is re-pointed so that [112+r10] .. [352+r10] is a 16-byte aligned
; area above the saved-rsp slot.
91	lea	r12,[128+rdx]
92	movdqa	xmm0,XMMWORD[r10]
93	movdqa	xmm1,XMMWORD[16+r10]
94	lea	r10,[((24-112))+r9*8+rsp]
95	and	r10,-16
96
; Broadcast the index to all four dword lanes of xmm5, then build 16
; masks: each step forms the next counter vector (previous + xmm4),
; compares the previous counter with xmm5 (pcmpeqd sets matching lanes
; to all-ones) and stores the mask. The stores are interleaved with the
; arithmetic; the last four masks also stay live in xmm0..xmm3.
97	pshufd	xmm5,xmm5,0
98	movdqa	xmm4,xmm1
99	movdqa	xmm2,xmm1
100	paddd	xmm1,xmm0
101	pcmpeqd	xmm0,xmm5
; NOTE(review): 0x67 is an address-size prefix byte; presumably emitted
; as padding by the generator -- confirm in the Perl source.
102DB	0x67
103	movdqa	xmm3,xmm4
104	paddd	xmm2,xmm1
105	pcmpeqd	xmm1,xmm5
106	movdqa	XMMWORD[112+r10],xmm0
107	movdqa	xmm0,xmm4
108
109	paddd	xmm3,xmm2
110	pcmpeqd	xmm2,xmm5
111	movdqa	XMMWORD[128+r10],xmm1
112	movdqa	xmm1,xmm4
113
114	paddd	xmm0,xmm3
115	pcmpeqd	xmm3,xmm5
116	movdqa	XMMWORD[144+r10],xmm2
117	movdqa	xmm2,xmm4
118
119	paddd	xmm1,xmm0
120	pcmpeqd	xmm0,xmm5
121	movdqa	XMMWORD[160+r10],xmm3
122	movdqa	xmm3,xmm4
123	paddd	xmm2,xmm1
124	pcmpeqd	xmm1,xmm5
125	movdqa	XMMWORD[176+r10],xmm0
126	movdqa	xmm0,xmm4
127
128	paddd	xmm3,xmm2
129	pcmpeqd	xmm2,xmm5
130	movdqa	XMMWORD[192+r10],xmm1
131	movdqa	xmm1,xmm4
132
133	paddd	xmm0,xmm3
134	pcmpeqd	xmm3,xmm5
135	movdqa	XMMWORD[208+r10],xmm2
136	movdqa	xmm2,xmm4
137
138	paddd	xmm1,xmm0
139	pcmpeqd	xmm0,xmm5
140	movdqa	XMMWORD[224+r10],xmm3
141	movdqa	xmm3,xmm4
142	paddd	xmm2,xmm1
143	pcmpeqd	xmm1,xmm5
144	movdqa	XMMWORD[240+r10],xmm0
145	movdqa	xmm0,xmm4
146
147	paddd	xmm3,xmm2
148	pcmpeqd	xmm2,xmm5
149	movdqa	XMMWORD[256+r10],xmm1
150	movdqa	xmm1,xmm4
151
152	paddd	xmm0,xmm3
153	pcmpeqd	xmm3,xmm5
154	movdqa	XMMWORD[272+r10],xmm2
155	movdqa	xmm2,xmm4
156
157	paddd	xmm1,xmm0
158	pcmpeqd	xmm0,xmm5
159	movdqa	XMMWORD[288+r10],xmm3
160	movdqa	xmm3,xmm4
161	paddd	xmm2,xmm1
162	pcmpeqd	xmm1,xmm5
163	movdqa	XMMWORD[304+r10],xmm0
164
165	paddd	xmm3,xmm2
166DB	0x67
167	pcmpeqd	xmm2,xmm5
168	movdqa	XMMWORD[320+r10],xmm1
169
170	pcmpeqd	xmm3,xmm5
171	movdqa	XMMWORD[336+r10],xmm2
; Gather of the first multiplier word. All 16 vectors of the row at
; [r12-128 .. r12+112] are loaded, ANDed with their mask and ORed
; together, so the set of addresses read is the same for every index.
; The last four vectors use the masks still held in xmm0..xmm3.
172	pand	xmm0,XMMWORD[64+r12]
173
174	pand	xmm1,XMMWORD[80+r12]
175	pand	xmm2,XMMWORD[96+r12]
176	movdqa	XMMWORD[352+r10],xmm3
177	pand	xmm3,XMMWORD[112+r12]
178	por	xmm0,xmm2
179	por	xmm1,xmm3
180	movdqa	xmm4,XMMWORD[((-128))+r12]
181	movdqa	xmm5,XMMWORD[((-112))+r12]
182	movdqa	xmm2,XMMWORD[((-96))+r12]
183	pand	xmm4,XMMWORD[112+r10]
184	movdqa	xmm3,XMMWORD[((-80))+r12]
185	pand	xmm5,XMMWORD[128+r10]
186	por	xmm0,xmm4
187	pand	xmm2,XMMWORD[144+r10]
188	por	xmm1,xmm5
189	pand	xmm3,XMMWORD[160+r10]
190	por	xmm0,xmm2
191	por	xmm1,xmm3
192	movdqa	xmm4,XMMWORD[((-64))+r12]
193	movdqa	xmm5,XMMWORD[((-48))+r12]
194	movdqa	xmm2,XMMWORD[((-32))+r12]
195	pand	xmm4,XMMWORD[176+r10]
196	movdqa	xmm3,XMMWORD[((-16))+r12]
197	pand	xmm5,XMMWORD[192+r10]
198	por	xmm0,xmm4
199	pand	xmm2,XMMWORD[208+r10]
200	por	xmm1,xmm5
201	pand	xmm3,XMMWORD[224+r10]
202	por	xmm0,xmm2
203	por	xmm1,xmm3
204	movdqa	xmm4,XMMWORD[r12]
205	movdqa	xmm5,XMMWORD[16+r12]
206	movdqa	xmm2,XMMWORD[32+r12]
207	pand	xmm4,XMMWORD[240+r10]
208	movdqa	xmm3,XMMWORD[48+r12]
209	pand	xmm5,XMMWORD[256+r10]
210	por	xmm0,xmm4
211	pand	xmm2,XMMWORD[272+r10]
212	por	xmm1,xmm5
213	pand	xmm3,XMMWORD[288+r10]
214	por	xmm0,xmm2
215	por	xmm1,xmm3
; Fold: OR the two accumulators, then OR the high qword onto the low
; one (pshufd 0x4e swaps the qwords). r12 advances to the next 256-byte
; row. The DB bytes 66 48 0F 7E C3 encode "movq rbx,xmm0".
216	por	xmm0,xmm1
217	pshufd	xmm1,xmm0,0x4e
218	por	xmm0,xmm1
219	lea	r12,[256+r12]
220DB	102,72,15,126,195
221
; r8 = the 64-bit constant r8 pointed to; rax = first word at rsi.
222	mov	r8,QWORD[r8]
223	mov	rax,QWORD[rsi]
224
; r14 = outer (pass) counter, r15 = inner (word) counter.
225	xor	r14,r14
226	xor	r15,r15
227
; First pass. rbp = r8 * low64([rsi][0] * rbx) is this pass's factor for
; the rcx operand. The low word of the combined sum is only used for
; its carry; results are stored to tp one word below the operand index.
228	mov	rbp,r8
229	mul	rbx
230	mov	r10,rax
231	mov	rax,QWORD[rcx]
232
233	imul	rbp,r10
234	mov	r11,rdx
235
236	mul	rbp
237	add	r10,rax
238	mov	rax,QWORD[8+rsi]
239	adc	rdx,0
240	mov	r13,rdx
241
242	lea	r15,[1+r15]
243	jmp	NEAR $L$1st_enter
244
245ALIGN	16
246$L$1st:
247	add	r13,rax
248	mov	rax,QWORD[r15*8+rsi]
249	adc	rdx,0
250	add	r13,r11
251	mov	r11,r10
252	adc	rdx,0
253	mov	QWORD[((-16))+r15*8+rsp],r13
254	mov	r13,rdx
255
256$L$1st_enter:
257	mul	rbx
258	add	r11,rax
259	mov	rax,QWORD[r15*8+rcx]
260	adc	rdx,0
261	lea	r15,[1+r15]
262	mov	r10,rdx
263
264	mul	rbp
265	cmp	r15,r9
266	jne	NEAR $L$1st
267
268
; Tail of the first pass: store tp[r9-2], then tp[r9-1] and the final
; carry in tp[r9].
269	add	r13,rax
270	adc	rdx,0
271	add	r13,r11
272	adc	rdx,0
273	mov	QWORD[((-16))+r9*8+rsp],r13
274	mov	r13,rdx
275	mov	r11,r10
276
277	xor	rdx,rdx
278	add	r13,r11
279	adc	rdx,0
280	mov	QWORD[((-8))+r9*8+rsp],r13
281	mov	QWORD[r9*8+rsp],rdx
282
283	lea	r14,[1+r14]
284	jmp	NEAR $L$outer
285ALIGN	16
286$L$outer:
; One pass per remaining multiplier word. rdx is set so that
; [rdx-128] .. [rdx+112] addresses the 16 masks stored earlier; the
; same read-everything AND/OR gather is repeated on the row at r12.
287	lea	rdx,[((24+128))+r9*8+rsp]
288	and	rdx,-16
289	pxor	xmm4,xmm4
290	pxor	xmm5,xmm5
291	movdqa	xmm0,XMMWORD[((-128))+r12]
292	movdqa	xmm1,XMMWORD[((-112))+r12]
293	movdqa	xmm2,XMMWORD[((-96))+r12]
294	movdqa	xmm3,XMMWORD[((-80))+r12]
295	pand	xmm0,XMMWORD[((-128))+rdx]
296	pand	xmm1,XMMWORD[((-112))+rdx]
297	por	xmm4,xmm0
298	pand	xmm2,XMMWORD[((-96))+rdx]
299	por	xmm5,xmm1
300	pand	xmm3,XMMWORD[((-80))+rdx]
301	por	xmm4,xmm2
302	por	xmm5,xmm3
303	movdqa	xmm0,XMMWORD[((-64))+r12]
304	movdqa	xmm1,XMMWORD[((-48))+r12]
305	movdqa	xmm2,XMMWORD[((-32))+r12]
306	movdqa	xmm3,XMMWORD[((-16))+r12]
307	pand	xmm0,XMMWORD[((-64))+rdx]
308	pand	xmm1,XMMWORD[((-48))+rdx]
309	por	xmm4,xmm0
310	pand	xmm2,XMMWORD[((-32))+rdx]
311	por	xmm5,xmm1
312	pand	xmm3,XMMWORD[((-16))+rdx]
313	por	xmm4,xmm2
314	por	xmm5,xmm3
315	movdqa	xmm0,XMMWORD[r12]
316	movdqa	xmm1,XMMWORD[16+r12]
317	movdqa	xmm2,XMMWORD[32+r12]
318	movdqa	xmm3,XMMWORD[48+r12]
319	pand	xmm0,XMMWORD[rdx]
320	pand	xmm1,XMMWORD[16+rdx]
321	por	xmm4,xmm0
322	pand	xmm2,XMMWORD[32+rdx]
323	por	xmm5,xmm1
324	pand	xmm3,XMMWORD[48+rdx]
325	por	xmm4,xmm2
326	por	xmm5,xmm3
327	movdqa	xmm0,XMMWORD[64+r12]
328	movdqa	xmm1,XMMWORD[80+r12]
329	movdqa	xmm2,XMMWORD[96+r12]
330	movdqa	xmm3,XMMWORD[112+r12]
331	pand	xmm0,XMMWORD[64+rdx]
332	pand	xmm1,XMMWORD[80+rdx]
333	por	xmm4,xmm0
334	pand	xmm2,XMMWORD[96+rdx]
335	por	xmm5,xmm1
336	pand	xmm3,XMMWORD[112+rdx]
337	por	xmm4,xmm2
338	por	xmm5,xmm3
339	por	xmm4,xmm5
340	pshufd	xmm0,xmm4,0x4e
341	por	xmm0,xmm4
342	lea	r12,[256+r12]
343
; rax = first word at rsi; the DB bytes encode "movq rbx,xmm0".
344	mov	rax,QWORD[rsi]
345DB	102,72,15,126,195
346
; Same shape as the first pass, except that the previous contents of tp
; are added in (r10 is loaded from tp before each step).
347	xor	r15,r15
348	mov	rbp,r8
349	mov	r10,QWORD[rsp]
350
351	mul	rbx
352	add	r10,rax
353	mov	rax,QWORD[rcx]
354	adc	rdx,0
355
356	imul	rbp,r10
357	mov	r11,rdx
358
359	mul	rbp
360	add	r10,rax
361	mov	rax,QWORD[8+rsi]
362	adc	rdx,0
363	mov	r10,QWORD[8+rsp]
364	mov	r13,rdx
365
366	lea	r15,[1+r15]
367	jmp	NEAR $L$inner_enter
368
369ALIGN	16
370$L$inner:
371	add	r13,rax
372	mov	rax,QWORD[r15*8+rsi]
373	adc	rdx,0
374	add	r13,r10
375	mov	r10,QWORD[r15*8+rsp]
376	adc	rdx,0
377	mov	QWORD[((-16))+r15*8+rsp],r13
378	mov	r13,rdx
379
380$L$inner_enter:
381	mul	rbx
382	add	r11,rax
383	mov	rax,QWORD[r15*8+rcx]
384	adc	rdx,0
385	add	r10,r11
386	mov	r11,rdx
387	adc	r11,0
388	lea	r15,[1+r15]
389
390	mul	rbp
391	cmp	r15,r9
392	jne	NEAR $L$inner
393
; Tail of the pass: r10 picks up the previous top word tp[r9], which is
; added into tp[r9-1]; the new carry goes to tp[r9].
394	add	r13,rax
395	adc	rdx,0
396	add	r13,r10
397	mov	r10,QWORD[r9*8+rsp]
398	adc	rdx,0
399	mov	QWORD[((-16))+r9*8+rsp],r13
400	mov	r13,rdx
401
402	xor	rdx,rdx
403	add	r13,r11
404	adc	rdx,0
405	add	r13,r10
406	adc	rdx,0
407	mov	QWORD[((-8))+r9*8+rsp],r13
408	mov	QWORD[r9*8+rsp],rdx
409
; Repeat until r14 reaches the word count.
410	lea	r14,[1+r14]
411	cmp	r14,r9
412	jb	NEAR $L$outer
413
; Final subtraction: [rdi][i] = tp[i] - [rcx][i] with borrow.
; "xor r14,r14" leaves CF clear for the first sbb; the lea / mov / dec
; in the loop do not modify CF, so the borrow carries between
; iterations. On exit rax holds tp[r9].
414	xor	r14,r14
415	mov	rax,QWORD[rsp]
416	lea	rsi,[rsp]
417	mov	r15,r9
418	jmp	NEAR $L$sub
419ALIGN	16
420$L$sub:	sbb	rax,QWORD[r14*8+rcx]
421	mov	QWORD[r14*8+rdi],rax
422	mov	rax,QWORD[8+r14*8+rsi]
423	lea	r14,[1+r14]
424	dec	r15
425	jnz	NEAR $L$sub
426
; rax = tp[r9] - borrow; rbx = NOT rax. The pair is used as select
; masks below (rax keeps tp[i], rbx keeps the subtracted value).
; NOTE(review): this assumes rax is either 0 or all-ones here -- confirm.
427	sbb	rax,0
428	mov	rbx,-1
429	xor	rbx,rax
430	xor	r14,r14
431	mov	r15,r9
432
433$L$copy:
; [rdi][i] = ([rdi][i] AND rbx) OR (tp[i] AND rax), without branching on
; the masks. Each tp[i] is overwritten with the loop index r14 after it
; has been read.
434	mov	rcx,QWORD[r14*8+rdi]
435	mov	rdx,QWORD[r14*8+rsp]
436	and	rcx,rbx
437	and	rdx,rax
438	mov	QWORD[r14*8+rsp],r14
439	or	rdx,rcx
440	mov	QWORD[r14*8+rdi],rdx
441	lea	r14,[1+r14]
442	sub	r15,1
443	jnz	NEAR $L$copy
444
; rsi = entry rsp saved after the page walk; the six pushed registers
; sit directly below that address.
445	mov	rsi,QWORD[8+r9*8+rsp]
446
447	mov	rax,1
448
449	mov	r15,QWORD[((-48))+rsi]
450
451	mov	r14,QWORD[((-40))+rsi]
452
453	mov	r13,QWORD[((-32))+rsi]
454
455	mov	r12,QWORD[((-24))+rsi]
456
457	mov	rbp,QWORD[((-16))+rsi]
458
459	mov	rbx,QWORD[((-8))+rsi]
460
461	lea	rsp,[rsi]
462
463$L$mul_epilogue:
; Reload the rdi / rsi values stored by the prologue, then return.
464	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
465	mov	rsi,QWORD[16+rsp]
466	DB	0F3h,0C3h		;repret
467
468$L$SEH_end_bn_mul_mont_gather5:
469
470ALIGN	32
;-----------------------------------------------------------------------
; bn_mul4x_mont_gather5
; ABI:  Microsoft x64; same argument shuffle as bn_mul_mont_gather5
;       (rcx, rdx, r8, r9, [40+rsp], [48+rsp] -> rdi, rsi, rdx, rcx,
;       r8, r9).
; This code is reached from bn_mul_mont_gather5 through $L$mul4x_enter
; with r11d = DWORD[8+OPENSSL_ia32cap_P] and rax = entry rsp.
; NOTE(review): no "global" directive for this symbol is visible here,
; and on the fall-through path from the function label r11d is tested
; without being loaded first -- presumably only the $L$mul4x_enter
; entry is used; confirm.
; It allocates an aligned stack frame, calls mul4x_internal and returns
; rax = 1 with rbx, rbp, r12-r15, rdi and rsi restored.
;-----------------------------------------------------------------------
471bn_mul4x_mont_gather5:
472	mov	QWORD[8+rsp],rdi	;WIN64 prologue
473	mov	QWORD[16+rsp],rsi
474	mov	rax,rsp
475$L$SEH_begin_bn_mul4x_mont_gather5:
476	mov	rdi,rcx
477	mov	rsi,rdx
478	mov	rdx,r8
479	mov	rcx,r9
480	mov	r8,QWORD[40+rsp]
481	mov	r9,QWORD[48+rsp]
482
483
484
485DB	0x67
486	mov	rax,rsp
487
488$L$mul4x_enter:
; If all bits of 0x80108 are set in r11d, branch to $L$mulx4x_enter
; (defined outside this view).
; NOTE(review): presumably these are the BMI1 / BMI2 / ADX feature bits
; -- confirm against the OPENSSL_ia32cap_P layout.
489	and	r11d,0x80108
490	cmp	r11d,0x80108
491	je	NEAR $L$mulx4x_enter
492	push	rbx
493
494	push	rbp
495
496	push	r12
497
498	push	r13
499
500	push	r14
501
502	push	r15
503
504$L$mul4x_prologue:
505
; r9 = word count * 8 (bytes); r10 = 3 * r9; r9 is then negated so it
; can be used as a negative displacement below.
506DB	0x67
507	shl	r9d,3
508	lea	r10,[r9*2+r9]
509	neg	r9
510
511
512
513
514
515
516
517
518
519
; Frame placement. r11 = (rsp - 320 - 2*bytes - rdi) mod 4096, i.e. the
; page-offset distance between the candidate frame and the destination
; buffer. Depending on how r11 compares with r10 one of two
; adjustments is applied to rbp.
; NOTE(review): presumably this keeps the frame from sharing page
; offsets with the output buffer -- confirm in the Perl source.
520	lea	r11,[((-320))+r9*2+rsp]
521	mov	rbp,rsp
522	sub	r11,rdi
523	and	r11,4095
524	cmp	r10,r11
525	jb	NEAR $L$mul4xsp_alt
526	sub	rbp,r11
527	lea	rbp,[((-320))+r9*2+rbp]
528	jmp	NEAR $L$mul4xsp_done
529
530ALIGN	32
531$L$mul4xsp_alt:
; r11 -= (4096 - 320 - 2*bytes), replaced with 0 if that subtraction
; borrows (cmovc), and rbp is lowered by the result.
532	lea	r10,[((4096-320))+r9*2]
533	lea	rbp,[((-320))+r9*2+rbp]
534	sub	r11,r10
535	mov	r10,0
536	cmovc	r11,r10
537	sub	rbp,r11
538$L$mul4xsp_done:
; Align the frame base to 64 bytes, then lower rsp to it in 4096-byte
; steps, loading from every step (the value in r10 is not used).
539	and	rbp,-64
540	mov	r11,rsp
541	sub	r11,rbp
542	and	r11,-4096
543	lea	rsp,[rbp*1+r11]
544	mov	r10,QWORD[rsp]
545	cmp	rsp,rbp
546	ja	NEAR $L$mul4x_page_walk
547	jmp	NEAR $L$mul4x_page_walk_done
548
549$L$mul4x_page_walk:
550	lea	rsp,[((-4096))+rsp]
551	mov	r10,QWORD[rsp]
552	cmp	rsp,rbp
553	ja	NEAR $L$mul4x_page_walk
554$L$mul4x_page_walk_done:
555
; r9 back to the positive byte count.
556	neg	r9
557
; Save the entry rsp (rax) in the frame; the epilogue reloads it.
558	mov	QWORD[40+rsp],rax
559
560$L$mul4x_body:
561
; mul4x_internal reads the seventh argument through rax ([56+rax]).
562	call	mul4x_internal
563
; rsi = saved entry rsp; the six pushed registers sit directly below it.
564	mov	rsi,QWORD[40+rsp]
565
566	mov	rax,1
567
568	mov	r15,QWORD[((-48))+rsi]
569
570	mov	r14,QWORD[((-40))+rsi]
571
572	mov	r13,QWORD[((-32))+rsi]
573
574	mov	r12,QWORD[((-24))+rsi]
575
576	mov	rbp,QWORD[((-16))+rsi]
577
578	mov	rbx,QWORD[((-8))+rsi]
579
580	lea	rsp,[rsi]
581
582$L$mul4x_epilogue:
; Reload the rdi / rsi values stored by the prologue, then return.
583	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
584	mov	rsi,QWORD[16+rsp]
585	DB	0F3h,0C3h		;repret
586
587$L$SEH_end_bn_mul4x_mont_gather5:
588
589
590ALIGN	32
;-----------------------------------------------------------------------
; mul4x_internal -- internal helper, called from bn_mul4x_mont_gather5
; and bn_power5 in this file. Not a standard-ABI function: everything
; is passed in registers set up by the caller.
; In (as read by the code below):
;   rax = caller's entry rsp; the 32-bit table index is at [56+rax]
;   rdx = table base
;   rsi = first operand words
;   rcx = words multiplied by the per-pass factor in rbp
;   r8  = pointer to a 64-bit constant, dereferenced once
;   r9  = operand size in bytes
;   rdi = destination pointer (saved to the frame, reloaded at the end)
; Frame slots, relative to rsp inside this routine (i.e. including the
; return address pushed by the call):
;   [16+8+rsp]  end-of-table marker used to stop the outer loop
;   [56+8+rsp]  saved rdi
;   [64+8+rsp]  start of the temporary result words
;   the 16 gather masks are stored from [88+r9+rsp] upward
; Exit: does not return directly; it jumps to $L$sqr4x_sub_entry, which
; is defined outside this view.
;-----------------------------------------------------------------------
591mul4x_internal:
592
; r13 = rdx + 128 + 32*r9, the value r12 reaches after the last row.
; r9 is shifted back to its original value afterwards.
593	shl	r9,5
594	movd	xmm5,DWORD[56+rax]
595	lea	rax,[$L$inc]
596	lea	r13,[128+r9*1+rdx]
597	shr	r9,5
598	movdqa	xmm0,XMMWORD[rax]
599	movdqa	xmm1,XMMWORD[16+rax]
600	lea	r10,[((88-112))+r9*1+rsp]
601	lea	r12,[128+rdx]
602
; Build 16 compare masks exactly as in bn_mul_mont_gather5: xmm5 holds
; the broadcast index, each counter vector is the previous one plus
; xmm4, and pcmpeqd turns the matching lanes into all-ones. Masks are
; stored at [112+r10] .. [352+r10].
603	pshufd	xmm5,xmm5,0
604	movdqa	xmm4,xmm1
605DB	0x67,0x67
606	movdqa	xmm2,xmm1
607	paddd	xmm1,xmm0
608	pcmpeqd	xmm0,xmm5
609DB	0x67
610	movdqa	xmm3,xmm4
611	paddd	xmm2,xmm1
612	pcmpeqd	xmm1,xmm5
613	movdqa	XMMWORD[112+r10],xmm0
614	movdqa	xmm0,xmm4
615
616	paddd	xmm3,xmm2
617	pcmpeqd	xmm2,xmm5
618	movdqa	XMMWORD[128+r10],xmm1
619	movdqa	xmm1,xmm4
620
621	paddd	xmm0,xmm3
622	pcmpeqd	xmm3,xmm5
623	movdqa	XMMWORD[144+r10],xmm2
624	movdqa	xmm2,xmm4
625
626	paddd	xmm1,xmm0
627	pcmpeqd	xmm0,xmm5
628	movdqa	XMMWORD[160+r10],xmm3
629	movdqa	xmm3,xmm4
630	paddd	xmm2,xmm1
631	pcmpeqd	xmm1,xmm5
632	movdqa	XMMWORD[176+r10],xmm0
633	movdqa	xmm0,xmm4
634
635	paddd	xmm3,xmm2
636	pcmpeqd	xmm2,xmm5
637	movdqa	XMMWORD[192+r10],xmm1
638	movdqa	xmm1,xmm4
639
640	paddd	xmm0,xmm3
641	pcmpeqd	xmm3,xmm5
642	movdqa	XMMWORD[208+r10],xmm2
643	movdqa	xmm2,xmm4
644
645	paddd	xmm1,xmm0
646	pcmpeqd	xmm0,xmm5
647	movdqa	XMMWORD[224+r10],xmm3
648	movdqa	xmm3,xmm4
649	paddd	xmm2,xmm1
650	pcmpeqd	xmm1,xmm5
651	movdqa	XMMWORD[240+r10],xmm0
652	movdqa	xmm0,xmm4
653
654	paddd	xmm3,xmm2
655	pcmpeqd	xmm2,xmm5
656	movdqa	XMMWORD[256+r10],xmm1
657	movdqa	xmm1,xmm4
658
659	paddd	xmm0,xmm3
660	pcmpeqd	xmm3,xmm5
661	movdqa	XMMWORD[272+r10],xmm2
662	movdqa	xmm2,xmm4
663
664	paddd	xmm1,xmm0
665	pcmpeqd	xmm0,xmm5
666	movdqa	XMMWORD[288+r10],xmm3
667	movdqa	xmm3,xmm4
668	paddd	xmm2,xmm1
669	pcmpeqd	xmm1,xmm5
670	movdqa	XMMWORD[304+r10],xmm0
671
672	paddd	xmm3,xmm2
673DB	0x67
674	pcmpeqd	xmm2,xmm5
675	movdqa	XMMWORD[320+r10],xmm1
676
677	pcmpeqd	xmm3,xmm5
678	movdqa	XMMWORD[336+r10],xmm2
; Gather of the first multiplier word: all 16 vectors of the row at
; [r12-128 .. r12+112] are loaded, ANDed with their masks and ORed
; together, so the addresses read do not depend on the index.
679	pand	xmm0,XMMWORD[64+r12]
680
681	pand	xmm1,XMMWORD[80+r12]
682	pand	xmm2,XMMWORD[96+r12]
683	movdqa	XMMWORD[352+r10],xmm3
684	pand	xmm3,XMMWORD[112+r12]
685	por	xmm0,xmm2
686	por	xmm1,xmm3
687	movdqa	xmm4,XMMWORD[((-128))+r12]
688	movdqa	xmm5,XMMWORD[((-112))+r12]
689	movdqa	xmm2,XMMWORD[((-96))+r12]
690	pand	xmm4,XMMWORD[112+r10]
691	movdqa	xmm3,XMMWORD[((-80))+r12]
692	pand	xmm5,XMMWORD[128+r10]
693	por	xmm0,xmm4
694	pand	xmm2,XMMWORD[144+r10]
695	por	xmm1,xmm5
696	pand	xmm3,XMMWORD[160+r10]
697	por	xmm0,xmm2
698	por	xmm1,xmm3
699	movdqa	xmm4,XMMWORD[((-64))+r12]
700	movdqa	xmm5,XMMWORD[((-48))+r12]
701	movdqa	xmm2,XMMWORD[((-32))+r12]
702	pand	xmm4,XMMWORD[176+r10]
703	movdqa	xmm3,XMMWORD[((-16))+r12]
704	pand	xmm5,XMMWORD[192+r10]
705	por	xmm0,xmm4
706	pand	xmm2,XMMWORD[208+r10]
707	por	xmm1,xmm5
708	pand	xmm3,XMMWORD[224+r10]
709	por	xmm0,xmm2
710	por	xmm1,xmm3
711	movdqa	xmm4,XMMWORD[r12]
712	movdqa	xmm5,XMMWORD[16+r12]
713	movdqa	xmm2,XMMWORD[32+r12]
714	pand	xmm4,XMMWORD[240+r10]
715	movdqa	xmm3,XMMWORD[48+r12]
716	pand	xmm5,XMMWORD[256+r10]
717	por	xmm0,xmm4
718	pand	xmm2,XMMWORD[272+r10]
719	por	xmm1,xmm5
720	pand	xmm3,XMMWORD[288+r10]
721	por	xmm0,xmm2
722	por	xmm1,xmm3
; Fold both accumulators and both qword halves into the low qword of
; xmm0; advance r12 to the next row. The DB bytes encode
; "movq rbx,xmm0".
723	por	xmm0,xmm1
724	pshufd	xmm1,xmm0,0x4e
725	por	xmm0,xmm1
726	lea	r12,[256+r12]
727DB	102,72,15,126,195
728
; Save the end-of-table marker and the destination pointer in the frame.
729	mov	QWORD[((16+8))+rsp],r13
730	mov	QWORD[((56+8))+rsp],rdi
731
; r8 = the constant; rsi is moved to the end of its operand and r9 is
; negated, so [k+r9+rsi] addresses byte offset k of that operand.
732	mov	r8,QWORD[r8]
733	mov	rax,QWORD[rsi]
734	lea	rsi,[r9*1+rsi]
735	neg	r9
736
; First pass. rbp = r8 * low64(word0 * rbx) is the factor applied to
; the rcx operand. r14 walks the temporary result area.
737	mov	rbp,r8
738	mul	rbx
739	mov	r10,rax
740	mov	rax,QWORD[rcx]
741
742	imul	rbp,r10
743	lea	r14,[((64+8))+rsp]
744	mov	r11,rdx
745
746	mul	rbp
747	add	r10,rax
748	mov	rax,QWORD[8+r9*1+rsi]
749	adc	rdx,0
750	mov	rdi,rdx
751
752	mul	rbx
753	add	r11,rax
754	mov	rax,QWORD[8+rcx]
755	adc	rdx,0
756	mov	r10,rdx
757
758	mul	rbp
759	add	rdi,rax
760	mov	rax,QWORD[16+r9*1+rsi]
761	adc	rdx,0
762	add	rdi,r11
763	lea	r15,[32+r9]
764	lea	rcx,[32+rcx]
765	adc	rdx,0
766	mov	QWORD[r14],rdi
767	mov	r13,rdx
768	jmp	NEAR $L$1st4x
769
770ALIGN	32
771$L$1st4x:
; Four words per iteration. r15 is a negative byte offset from the end
; of the rsi operand that counts up to zero; rcx and r14 advance by 32
; bytes per iteration.
772	mul	rbx
773	add	r10,rax
774	mov	rax,QWORD[((-16))+rcx]
775	lea	r14,[32+r14]
776	adc	rdx,0
777	mov	r11,rdx
778
779	mul	rbp
780	add	r13,rax
781	mov	rax,QWORD[((-8))+r15*1+rsi]
782	adc	rdx,0
783	add	r13,r10
784	adc	rdx,0
785	mov	QWORD[((-24))+r14],r13
786	mov	rdi,rdx
787
788	mul	rbx
789	add	r11,rax
790	mov	rax,QWORD[((-8))+rcx]
791	adc	rdx,0
792	mov	r10,rdx
793
794	mul	rbp
795	add	rdi,rax
796	mov	rax,QWORD[r15*1+rsi]
797	adc	rdx,0
798	add	rdi,r11
799	adc	rdx,0
800	mov	QWORD[((-16))+r14],rdi
801	mov	r13,rdx
802
803	mul	rbx
804	add	r10,rax
805	mov	rax,QWORD[rcx]
806	adc	rdx,0
807	mov	r11,rdx
808
809	mul	rbp
810	add	r13,rax
811	mov	rax,QWORD[8+r15*1+rsi]
812	adc	rdx,0
813	add	r13,r10
814	adc	rdx,0
815	mov	QWORD[((-8))+r14],r13
816	mov	rdi,rdx
817
818	mul	rbx
819	add	r11,rax
820	mov	rax,QWORD[8+rcx]
821	adc	rdx,0
822	mov	r10,rdx
823
824	mul	rbp
825	add	rdi,rax
826	mov	rax,QWORD[16+r15*1+rsi]
827	adc	rdx,0
828	add	rdi,r11
829	lea	rcx,[32+rcx]
830	adc	rdx,0
831	mov	QWORD[r14],rdi
832	mov	r13,rdx
833
834	add	r15,32
835	jnz	NEAR $L$1st4x
836
; Tail of the first pass: the last two words. rax is reloaded with the
; first word of the rsi operand ([r9+rsi]) for the next pass.
837	mul	rbx
838	add	r10,rax
839	mov	rax,QWORD[((-16))+rcx]
840	lea	r14,[32+r14]
841	adc	rdx,0
842	mov	r11,rdx
843
844	mul	rbp
845	add	r13,rax
846	mov	rax,QWORD[((-8))+rsi]
847	adc	rdx,0
848	add	r13,r10
849	adc	rdx,0
850	mov	QWORD[((-24))+r14],r13
851	mov	rdi,rdx
852
853	mul	rbx
854	add	r11,rax
855	mov	rax,QWORD[((-8))+rcx]
856	adc	rdx,0
857	mov	r10,rdx
858
859	mul	rbp
860	add	rdi,rax
861	mov	rax,QWORD[r9*1+rsi]
862	adc	rdx,0
863	add	rdi,r11
864	adc	rdx,0
865	mov	QWORD[((-16))+r14],rdi
866	mov	r13,rdx
867
; r9 is negative here, so this rewinds rcx to the start of its operand.
868	lea	rcx,[r9*1+rcx]
869
; rdi = carry out of the top word; it is stored at the start of the
; next pass.
870	xor	rdi,rdi
871	add	r13,r10
872	adc	rdi,0
873	mov	QWORD[((-8))+r14],r13
874
875	jmp	NEAR $L$outer4x
876
877ALIGN	32
878$L$outer4x:
; One pass per remaining table row. rdx is set so that [rdx-128] ..
; [rdx+112] addresses the 16 stored masks; the same read-everything
; AND/OR gather as above produces the next multiplier word.
879	lea	rdx,[((16+128))+r14]
880	pxor	xmm4,xmm4
881	pxor	xmm5,xmm5
882	movdqa	xmm0,XMMWORD[((-128))+r12]
883	movdqa	xmm1,XMMWORD[((-112))+r12]
884	movdqa	xmm2,XMMWORD[((-96))+r12]
885	movdqa	xmm3,XMMWORD[((-80))+r12]
886	pand	xmm0,XMMWORD[((-128))+rdx]
887	pand	xmm1,XMMWORD[((-112))+rdx]
888	por	xmm4,xmm0
889	pand	xmm2,XMMWORD[((-96))+rdx]
890	por	xmm5,xmm1
891	pand	xmm3,XMMWORD[((-80))+rdx]
892	por	xmm4,xmm2
893	por	xmm5,xmm3
894	movdqa	xmm0,XMMWORD[((-64))+r12]
895	movdqa	xmm1,XMMWORD[((-48))+r12]
896	movdqa	xmm2,XMMWORD[((-32))+r12]
897	movdqa	xmm3,XMMWORD[((-16))+r12]
898	pand	xmm0,XMMWORD[((-64))+rdx]
899	pand	xmm1,XMMWORD[((-48))+rdx]
900	por	xmm4,xmm0
901	pand	xmm2,XMMWORD[((-32))+rdx]
902	por	xmm5,xmm1
903	pand	xmm3,XMMWORD[((-16))+rdx]
904	por	xmm4,xmm2
905	por	xmm5,xmm3
906	movdqa	xmm0,XMMWORD[r12]
907	movdqa	xmm1,XMMWORD[16+r12]
908	movdqa	xmm2,XMMWORD[32+r12]
909	movdqa	xmm3,XMMWORD[48+r12]
910	pand	xmm0,XMMWORD[rdx]
911	pand	xmm1,XMMWORD[16+rdx]
912	por	xmm4,xmm0
913	pand	xmm2,XMMWORD[32+rdx]
914	por	xmm5,xmm1
915	pand	xmm3,XMMWORD[48+rdx]
916	por	xmm4,xmm2
917	por	xmm5,xmm3
918	movdqa	xmm0,XMMWORD[64+r12]
919	movdqa	xmm1,XMMWORD[80+r12]
920	movdqa	xmm2,XMMWORD[96+r12]
921	movdqa	xmm3,XMMWORD[112+r12]
922	pand	xmm0,XMMWORD[64+rdx]
923	pand	xmm1,XMMWORD[80+rdx]
924	por	xmm4,xmm0
925	pand	xmm2,XMMWORD[96+rdx]
926	por	xmm5,xmm1
927	pand	xmm3,XMMWORD[112+rdx]
928	por	xmm4,xmm2
929	por	xmm5,xmm3
930	por	xmm4,xmm5
931	pshufd	xmm0,xmm4,0x4e
932	por	xmm0,xmm4
933	lea	r12,[256+r12]
; The DB bytes encode "movq rbx,xmm0".
934DB	102,72,15,126,195
935
; r10 = first temporary word ([r9+r14], r9 negative). The carry word
; of the previous pass (rdi) is stored at [r14] before r14 is rewound
; to the start of the temporary area.
936	mov	r10,QWORD[r9*1+r14]
937	mov	rbp,r8
938	mul	rbx
939	add	r10,rax
940	mov	rax,QWORD[rcx]
941	adc	rdx,0
942
943	imul	rbp,r10
944	mov	r11,rdx
945	mov	QWORD[r14],rdi
946
947	lea	r14,[r9*1+r14]
948
949	mul	rbp
950	add	r10,rax
951	mov	rax,QWORD[8+r9*1+rsi]
952	adc	rdx,0
953	mov	rdi,rdx
954
955	mul	rbx
956	add	r11,rax
957	mov	rax,QWORD[8+rcx]
958	adc	rdx,0
959	add	r11,QWORD[8+r14]
960	adc	rdx,0
961	mov	r10,rdx
962
963	mul	rbp
964	add	rdi,rax
965	mov	rax,QWORD[16+r9*1+rsi]
966	adc	rdx,0
967	add	rdi,r11
968	lea	r15,[32+r9]
969	lea	rcx,[32+rcx]
970	adc	rdx,0
971	mov	r13,rdx
972	jmp	NEAR $L$inner4x
973
974ALIGN	32
975$L$inner4x:
; Same four-word step as $L$1st4x, with the previous temporary words
; added in (the "add ...,QWORD[..+r14]" lines). Results are stored one
; word lower than where the matching old word was read.
976	mul	rbx
977	add	r10,rax
978	mov	rax,QWORD[((-16))+rcx]
979	adc	rdx,0
980	add	r10,QWORD[16+r14]
981	lea	r14,[32+r14]
982	adc	rdx,0
983	mov	r11,rdx
984
985	mul	rbp
986	add	r13,rax
987	mov	rax,QWORD[((-8))+r15*1+rsi]
988	adc	rdx,0
989	add	r13,r10
990	adc	rdx,0
991	mov	QWORD[((-32))+r14],rdi
992	mov	rdi,rdx
993
994	mul	rbx
995	add	r11,rax
996	mov	rax,QWORD[((-8))+rcx]
997	adc	rdx,0
998	add	r11,QWORD[((-8))+r14]
999	adc	rdx,0
1000	mov	r10,rdx
1001
1002	mul	rbp
1003	add	rdi,rax
1004	mov	rax,QWORD[r15*1+rsi]
1005	adc	rdx,0
1006	add	rdi,r11
1007	adc	rdx,0
1008	mov	QWORD[((-24))+r14],r13
1009	mov	r13,rdx
1010
1011	mul	rbx
1012	add	r10,rax
1013	mov	rax,QWORD[rcx]
1014	adc	rdx,0
1015	add	r10,QWORD[r14]
1016	adc	rdx,0
1017	mov	r11,rdx
1018
1019	mul	rbp
1020	add	r13,rax
1021	mov	rax,QWORD[8+r15*1+rsi]
1022	adc	rdx,0
1023	add	r13,r10
1024	adc	rdx,0
1025	mov	QWORD[((-16))+r14],rdi
1026	mov	rdi,rdx
1027
1028	mul	rbx
1029	add	r11,rax
1030	mov	rax,QWORD[8+rcx]
1031	adc	rdx,0
1032	add	r11,QWORD[8+r14]
1033	adc	rdx,0
1034	mov	r10,rdx
1035
1036	mul	rbp
1037	add	rdi,rax
1038	mov	rax,QWORD[16+r15*1+rsi]
1039	adc	rdx,0
1040	add	rdi,r11
1041	lea	rcx,[32+rcx]
1042	adc	rdx,0
1043	mov	QWORD[((-8))+r14],r13
1044	mov	r13,rdx
1045
1046	add	r15,32
1047	jnz	NEAR $L$inner4x
1048
; Tail of the pass: the last two words.
1049	mul	rbx
1050	add	r10,rax
1051	mov	rax,QWORD[((-16))+rcx]
1052	adc	rdx,0
1053	add	r10,QWORD[16+r14]
1054	lea	r14,[32+r14]
1055	adc	rdx,0
1056	mov	r11,rdx
1057
1058	mul	rbp
1059	add	r13,rax
1060	mov	rax,QWORD[((-8))+rsi]
1061	adc	rdx,0
1062	add	r13,r10
1063	adc	rdx,0
1064	mov	QWORD[((-32))+r14],rdi
1065	mov	rdi,rdx
1066
; The roles are swapped for the last product: the factor moves to rax
; and rbp is loaded with the last word of the rcx operand, so rbp still
; holds that word after the loop (it is used in the comparison below).
1067	mul	rbx
1068	add	r11,rax
1069	mov	rax,rbp
1070	mov	rbp,QWORD[((-8))+rcx]
1071	adc	rdx,0
1072	add	r11,QWORD[((-8))+r14]
1073	adc	rdx,0
1074	mov	r10,rdx
1075
1076	mul	rbp
1077	add	rdi,rax
1078	mov	rax,QWORD[r9*1+rsi]
1079	adc	rdx,0
1080	add	rdi,r11
1081	adc	rdx,0
1082	mov	QWORD[((-24))+r14],r13
1083	mov	r13,rdx
1084
1085	mov	QWORD[((-16))+r14],rdi
; Rewind rcx to the start of its operand (r9 is negative).
1086	lea	rcx,[r9*1+rcx]
1087
; Top word = r13 + r10 + old carry word at [r14]; rdi accumulates the
; carries out of those two additions.
1088	xor	rdi,rdi
1089	add	r13,r10
1090	adc	rdi,0
1091	add	r13,QWORD[r14]
1092	adc	rdi,0
1093	mov	QWORD[((-8))+r14],r13
1094
; Loop until r12 reaches the end-of-table marker saved at entry.
1095	cmp	r12,QWORD[((16+8))+rsp]
1096	jb	NEAR $L$outer4x
; Build a mask in rax. r15 is zero after the inner loop, so
; "adc r15,r15" captures the borrow of (last rcx-operand word minus top
; result word); that bit is ORed with the carry in rdi and negated, so
; rax is 0 when both are zero.
; NOTE(review): presumably all-ones in rax requests the final
; subtraction in $L$sqr4x_sub_entry -- confirm there.
1097	xor	rax,rax
1098	sub	rbp,r13
1099	adc	r15,r15
1100	or	rdi,r15
1101	sub	rax,rdi
; Register set-up for $L$sqr4x_sub_entry (outside this view):
;   rbx = start of the temporary words, rbp = start of the rcx operand,
;   rcx = r9 >> 5 (arithmetic; r9 is the negative byte count),
;   rdi = saved destination, r12 = first operand word minus 1,
;   r13..r15 = the next three operand words, r10 = 0.
1102	lea	rbx,[r9*1+r14]
1103	mov	r12,QWORD[rcx]
1104	lea	rbp,[rcx]
1105	mov	rcx,r9
1106	sar	rcx,3+2
1107	mov	rdi,QWORD[((56+8))+rsp]
1108	dec	r12
1109	xor	r10,r10
1110	mov	r13,QWORD[8+rbp]
1111	mov	r14,QWORD[16+rbp]
1112	mov	r15,QWORD[24+rbp]
1113	jmp	NEAR $L$sqr4x_sub_entry
1114
1115
1116global	bn_power5
1117
1118ALIGN	32
;-----------------------------------------------------------------------
; bn_power5
; ABI:  Microsoft x64. rcx, rdx, r8, r9 and the stack arguments at
;       [40+rsp] / [48+rsp] are moved into rdi, rsi, rdx, rcx, r8, r9;
;       mul4x_internal later reads a seventh 32-bit argument from
;       [56 + entry rsp].
; Flow: five rounds of __bn_sqr8x_internal + __bn_post4x_internal,
;       followed by one mul4x_internal call (table in rdx, index from
;       the seventh argument).
; NOTE(review): going by the name this presumably raises the input to
; the 32nd power and multiplies by a gathered table entry -- confirm
; against the generating Perl script.
; Out:  rax = 1. rbx, rbp, r12-r15, rdi and rsi are restored.
;-----------------------------------------------------------------------
1119bn_power5:
1120	mov	QWORD[8+rsp],rdi	;WIN64 prologue
1121	mov	QWORD[16+rsp],rsi
1122	mov	rax,rsp
1123$L$SEH_begin_bn_power5:
1124	mov	rdi,rcx
1125	mov	rsi,rdx
1126	mov	rdx,r8
1127	mov	rcx,r9
1128	mov	r8,QWORD[40+rsp]
1129	mov	r9,QWORD[48+rsp]
1130
1131
1132
1133	mov	rax,rsp
1134
; If all bits of 0x80108 are set in DWORD[8+OPENSSL_ia32cap_P], branch
; to $L$powerx5_enter (defined outside this view).
; NOTE(review): presumably the BMI1 / BMI2 / ADX feature bits --
; confirm against the OPENSSL_ia32cap_P layout.
1135	lea	r11,[OPENSSL_ia32cap_P]
1136	mov	r11d,DWORD[8+r11]
1137	and	r11d,0x80108
1138	cmp	r11d,0x80108
1139	je	NEAR $L$powerx5_enter
1140	push	rbx
1141
1142	push	rbp
1143
1144	push	r12
1145
1146	push	r13
1147
1148	push	r14
1149
1150	push	r15
1151
1152$L$power5_prologue:
1153
; r9 = word count * 8 (bytes); r10 = 3 * r9 (the 32-bit lea zero-extends
; into r10); r9 is negated for use as a negative displacement;
; r8 = the 64-bit constant r8 pointed to.
1154	shl	r9d,3
1155	lea	r10d,[r9*2+r9]
1156	neg	r9
1157	mov	r8,QWORD[r8]
1158
1159
1160
1161
1162
1163
1164
1165
; Frame placement, same scheme as in bn_mul4x_mont_gather5:
; r11 = (rsp - 320 - 2*bytes - rdi) mod 4096 decides which of the two
; adjustments is applied to rbp.
1166	lea	r11,[((-320))+r9*2+rsp]
1167	mov	rbp,rsp
1168	sub	r11,rdi
1169	and	r11,4095
1170	cmp	r10,r11
1171	jb	NEAR $L$pwr_sp_alt
1172	sub	rbp,r11
1173	lea	rbp,[((-320))+r9*2+rbp]
1174	jmp	NEAR $L$pwr_sp_done
1175
1176ALIGN	32
1177$L$pwr_sp_alt:
; r11 -= (4096 - 320 - 2*bytes), replaced with 0 if that subtraction
; borrows (cmovc), and rbp is lowered by the result.
1178	lea	r10,[((4096-320))+r9*2]
1179	lea	rbp,[((-320))+r9*2+rbp]
1180	sub	r11,r10
1181	mov	r10,0
1182	cmovc	r11,r10
1183	sub	rbp,r11
1184$L$pwr_sp_done:
; Align the frame base to 64 bytes, then lower rsp to it in 4096-byte
; steps, loading from every step (the value in r10 is not used).
1185	and	rbp,-64
1186	mov	r11,rsp
1187	sub	r11,rbp
1188	and	r11,-4096
1189	lea	rsp,[rbp*1+r11]
1190	mov	r10,QWORD[rsp]
1191	cmp	rsp,rbp
1192	ja	NEAR $L$pwr_page_walk
1193	jmp	NEAR $L$pwr_page_walk_done
1194
1195$L$pwr_page_walk:
1196	lea	rsp,[((-4096))+rsp]
1197	mov	r10,QWORD[rsp]
1198	cmp	rsp,rbp
1199	ja	NEAR $L$pwr_page_walk
1200$L$pwr_page_walk_done:
1201
; r10 = negative byte count, r9 = positive byte count.
1202	mov	r10,r9
1203	neg	r9
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
; Frame slots: [32+rsp] = the constant loaded from r8,
;              [40+rsp] = entry rsp (rax), reloaded by the epilogue.
1214	mov	QWORD[32+rsp],r8
1215	mov	QWORD[40+rsp],rax
1216
1217$L$power5_body:
; The four DB lines encode, in order:
;   movq xmm1,rdi    movq xmm2,rcx    movq xmm3,r10    movq xmm4,rdx
; i.e. these four registers are parked in XMM registers across the calls.
1218DB	102,72,15,110,207
1219DB	102,72,15,110,209
1220DB	102,73,15,110,218
1221DB	102,72,15,110,226
1222
; Five identical rounds. __bn_post4x_internal is defined outside this
; view.
1223	call	__bn_sqr8x_internal
1224	call	__bn_post4x_internal
1225	call	__bn_sqr8x_internal
1226	call	__bn_post4x_internal
1227	call	__bn_sqr8x_internal
1228	call	__bn_post4x_internal
1229	call	__bn_sqr8x_internal
1230	call	__bn_post4x_internal
1231	call	__bn_sqr8x_internal
1232	call	__bn_post4x_internal
1233
; The two DB lines encode "movq rcx,xmm2" and "movq rdx,xmm4", restoring
; rcx and rdx from the copies parked above.
; Set-up for mul4x_internal: rdi = rsi, rax = saved entry rsp (so the
; table index can be read from [56+rax]), r8 = address of the frame
; slot holding the constant.
1234DB	102,72,15,126,209
1235DB	102,72,15,126,226
1236	mov	rdi,rsi
1237	mov	rax,QWORD[40+rsp]
1238	lea	r8,[32+rsp]
1239
1240	call	mul4x_internal
1241
; rsi = saved entry rsp; the six pushed registers sit directly below it.
1242	mov	rsi,QWORD[40+rsp]
1243
1244	mov	rax,1
1245	mov	r15,QWORD[((-48))+rsi]
1246
1247	mov	r14,QWORD[((-40))+rsi]
1248
1249	mov	r13,QWORD[((-32))+rsi]
1250
1251	mov	r12,QWORD[((-24))+rsi]
1252
1253	mov	rbp,QWORD[((-16))+rsi]
1254
1255	mov	rbx,QWORD[((-8))+rsi]
1256
1257	lea	rsp,[rsi]
1258
1259$L$power5_epilogue:
; Reload the rdi / rsi values stored by the prologue, then return.
1260	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
1261	mov	rsi,QWORD[16+rsp]
1262	DB	0F3h,0C3h		;repret
1263
1264$L$SEH_end_bn_power5:
1265
1266global	bn_sqr8x_internal
1267
1268
1269ALIGN	32
1270bn_sqr8x_internal:
1271__bn_sqr8x_internal:
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346	lea	rbp,[32+r10]
1347	lea	rsi,[r9*1+rsi]
1348
1349	mov	rcx,r9
1350
1351
1352	mov	r14,QWORD[((-32))+rbp*1+rsi]
1353	lea	rdi,[((48+8))+r9*2+rsp]
1354	mov	rax,QWORD[((-24))+rbp*1+rsi]
1355	lea	rdi,[((-32))+rbp*1+rdi]
1356	mov	rbx,QWORD[((-16))+rbp*1+rsi]
1357	mov	r15,rax
1358
1359	mul	r14
1360	mov	r10,rax
1361	mov	rax,rbx
1362	mov	r11,rdx
1363	mov	QWORD[((-24))+rbp*1+rdi],r10
1364
1365	mul	r14
1366	add	r11,rax
1367	mov	rax,rbx
1368	adc	rdx,0
1369	mov	QWORD[((-16))+rbp*1+rdi],r11
1370	mov	r10,rdx
1371
1372
1373	mov	rbx,QWORD[((-8))+rbp*1+rsi]
1374	mul	r15
1375	mov	r12,rax
1376	mov	rax,rbx
1377	mov	r13,rdx
1378
1379	lea	rcx,[rbp]
1380	mul	r14
1381	add	r10,rax
1382	mov	rax,rbx
1383	mov	r11,rdx
1384	adc	r11,0
1385	add	r10,r12
1386	adc	r11,0
1387	mov	QWORD[((-8))+rcx*1+rdi],r10
1388	jmp	NEAR $L$sqr4x_1st
1389
1390ALIGN	32
1391$L$sqr4x_1st:
1392	mov	rbx,QWORD[rcx*1+rsi]
1393	mul	r15
1394	add	r13,rax
1395	mov	rax,rbx
1396	mov	r12,rdx
1397	adc	r12,0
1398
1399	mul	r14
1400	add	r11,rax
1401	mov	rax,rbx
1402	mov	rbx,QWORD[8+rcx*1+rsi]
1403	mov	r10,rdx
1404	adc	r10,0
1405	add	r11,r13
1406	adc	r10,0
1407
1408
1409	mul	r15
1410	add	r12,rax
1411	mov	rax,rbx
1412	mov	QWORD[rcx*1+rdi],r11
1413	mov	r13,rdx
1414	adc	r13,0
1415
1416	mul	r14
1417	add	r10,rax
1418	mov	rax,rbx
1419	mov	rbx,QWORD[16+rcx*1+rsi]
1420	mov	r11,rdx
1421	adc	r11,0
1422	add	r10,r12
1423	adc	r11,0
1424
1425	mul	r15
1426	add	r13,rax
1427	mov	rax,rbx
1428	mov	QWORD[8+rcx*1+rdi],r10
1429	mov	r12,rdx
1430	adc	r12,0
1431
1432	mul	r14
1433	add	r11,rax
1434	mov	rax,rbx
1435	mov	rbx,QWORD[24+rcx*1+rsi]
1436	mov	r10,rdx
1437	adc	r10,0
1438	add	r11,r13
1439	adc	r10,0
1440
1441
1442	mul	r15
1443	add	r12,rax
1444	mov	rax,rbx
1445	mov	QWORD[16+rcx*1+rdi],r11
1446	mov	r13,rdx
1447	adc	r13,0
1448	lea	rcx,[32+rcx]
1449
1450	mul	r14
1451	add	r10,rax
1452	mov	rax,rbx
1453	mov	r11,rdx
1454	adc	r11,0
1455	add	r10,r12
1456	adc	r11,0
1457	mov	QWORD[((-8))+rcx*1+rdi],r10
1458
1459	cmp	rcx,0
1460	jne	NEAR $L$sqr4x_1st
1461
1462	mul	r15
1463	add	r13,rax
1464	lea	rbp,[16+rbp]
1465	adc	rdx,0
1466	add	r13,r11
1467	adc	rdx,0
1468
1469	mov	QWORD[rdi],r13
1470	mov	r12,rdx
1471	mov	QWORD[8+rdi],rdx
1472	jmp	NEAR $L$sqr4x_outer
1473
1474ALIGN	32
1475$L$sqr4x_outer:
1476	mov	r14,QWORD[((-32))+rbp*1+rsi]
1477	lea	rdi,[((48+8))+r9*2+rsp]
1478	mov	rax,QWORD[((-24))+rbp*1+rsi]
1479	lea	rdi,[((-32))+rbp*1+rdi]
1480	mov	rbx,QWORD[((-16))+rbp*1+rsi]
1481	mov	r15,rax
1482
1483	mul	r14
1484	mov	r10,QWORD[((-24))+rbp*1+rdi]
1485	add	r10,rax
1486	mov	rax,rbx
1487	adc	rdx,0
1488	mov	QWORD[((-24))+rbp*1+rdi],r10
1489	mov	r11,rdx
1490
1491	mul	r14
1492	add	r11,rax
1493	mov	rax,rbx
1494	adc	rdx,0
1495	add	r11,QWORD[((-16))+rbp*1+rdi]
1496	mov	r10,rdx
1497	adc	r10,0
1498	mov	QWORD[((-16))+rbp*1+rdi],r11
1499
1500	xor	r12,r12
1501
1502	mov	rbx,QWORD[((-8))+rbp*1+rsi]
1503	mul	r15
1504	add	r12,rax
1505	mov	rax,rbx
1506	adc	rdx,0
1507	add	r12,QWORD[((-8))+rbp*1+rdi]
1508	mov	r13,rdx
1509	adc	r13,0
1510
1511	mul	r14
1512	add	r10,rax
1513	mov	rax,rbx
1514	adc	rdx,0
1515	add	r10,r12
1516	mov	r11,rdx
1517	adc	r11,0
1518	mov	QWORD[((-8))+rbp*1+rdi],r10
1519
1520	lea	rcx,[rbp]
1521	jmp	NEAR $L$sqr4x_inner
1522
1523ALIGN	32
1524$L$sqr4x_inner:
1525	mov	rbx,QWORD[rcx*1+rsi]
1526	mul	r15
1527	add	r13,rax
1528	mov	rax,rbx
1529	mov	r12,rdx
1530	adc	r12,0
1531	add	r13,QWORD[rcx*1+rdi]
1532	adc	r12,0
1533
1534DB	0x67
1535	mul	r14
1536	add	r11,rax
1537	mov	rax,rbx
1538	mov	rbx,QWORD[8+rcx*1+rsi]
1539	mov	r10,rdx
1540	adc	r10,0
1541	add	r11,r13
1542	adc	r10,0
1543
1544	mul	r15
1545	add	r12,rax
1546	mov	QWORD[rcx*1+rdi],r11
1547	mov	rax,rbx
1548	mov	r13,rdx
1549	adc	r13,0
1550	add	r12,QWORD[8+rcx*1+rdi]
1551	lea	rcx,[16+rcx]
1552	adc	r13,0
1553
1554	mul	r14
1555	add	r10,rax
1556	mov	rax,rbx
1557	adc	rdx,0
1558	add	r10,r12
1559	mov	r11,rdx
1560	adc	r11,0
1561	mov	QWORD[((-8))+rcx*1+rdi],r10
1562
1563	cmp	rcx,0
1564	jne	NEAR $L$sqr4x_inner
1565
1566DB	0x67
1567	mul	r15
1568	add	r13,rax
1569	adc	rdx,0
1570	add	r13,r11
1571	adc	rdx,0
1572
1573	mov	QWORD[rdi],r13
1574	mov	r12,rdx
1575	mov	QWORD[8+rdi],rdx
1576
1577	add	rbp,16
1578	jnz	NEAR $L$sqr4x_outer
1579
1580
1581	mov	r14,QWORD[((-32))+rsi]
1582	lea	rdi,[((48+8))+r9*2+rsp]
1583	mov	rax,QWORD[((-24))+rsi]
1584	lea	rdi,[((-32))+rbp*1+rdi]
1585	mov	rbx,QWORD[((-16))+rsi]
1586	mov	r15,rax
1587
1588	mul	r14
1589	add	r10,rax
1590	mov	rax,rbx
1591	mov	r11,rdx
1592	adc	r11,0
1593
1594	mul	r14
1595	add	r11,rax
1596	mov	rax,rbx
1597	mov	QWORD[((-24))+rdi],r10
1598	mov	r10,rdx
1599	adc	r10,0
1600	add	r11,r13
1601	mov	rbx,QWORD[((-8))+rsi]
1602	adc	r10,0
1603
1604	mul	r15
1605	add	r12,rax
1606	mov	rax,rbx
1607	mov	QWORD[((-16))+rdi],r11
1608	mov	r13,rdx
1609	adc	r13,0
1610
1611	mul	r14
1612	add	r10,rax
1613	mov	rax,rbx
1614	mov	r11,rdx
1615	adc	r11,0
1616	add	r10,r12
1617	adc	r11,0
1618	mov	QWORD[((-8))+rdi],r10
1619
1620	mul	r15
1621	add	r13,rax
1622	mov	rax,QWORD[((-16))+rsi]
1623	adc	rdx,0
1624	add	r13,r11
1625	adc	rdx,0
1626
1627	mov	QWORD[rdi],r13
1628	mov	r12,rdx
1629	mov	QWORD[8+rdi],rdx
1630
1631	mul	rbx
1632	add	rbp,16
1633	xor	r14,r14
1634	sub	rbp,r9
1635	xor	r15,r15
1636
1637	add	rax,r12
1638	adc	rdx,0
1639	mov	QWORD[8+rdi],rax
1640	mov	QWORD[16+rdi],rdx
1641	mov	QWORD[24+rdi],r15
1642
1643	mov	rax,QWORD[((-16))+rbp*1+rsi]
1644	lea	rdi,[((48+8))+rsp]
1645	xor	r10,r10
1646	mov	r11,QWORD[8+rdi]
1647
1648	lea	r12,[r10*2+r14]
1649	shr	r10,63
1650	lea	r13,[r11*2+rcx]
1651	shr	r11,63
1652	or	r13,r10
1653	mov	r10,QWORD[16+rdi]
1654	mov	r14,r11
1655	mul	rax
1656	neg	r15
1657	mov	r11,QWORD[24+rdi]
1658	adc	r12,rax
1659	mov	rax,QWORD[((-8))+rbp*1+rsi]
1660	mov	QWORD[rdi],r12
1661	adc	r13,rdx
1662
1663	lea	rbx,[r10*2+r14]
1664	mov	QWORD[8+rdi],r13
1665	sbb	r15,r15
1666	shr	r10,63
1667	lea	r8,[r11*2+rcx]
1668	shr	r11,63
1669	or	r8,r10
1670	mov	r10,QWORD[32+rdi]
1671	mov	r14,r11
1672	mul	rax
1673	neg	r15
1674	mov	r11,QWORD[40+rdi]
1675	adc	rbx,rax
1676	mov	rax,QWORD[rbp*1+rsi]
1677	mov	QWORD[16+rdi],rbx
1678	adc	r8,rdx
1679	lea	rbp,[16+rbp]
1680	mov	QWORD[24+rdi],r8
1681	sbb	r15,r15
1682	lea	rdi,[64+rdi]
1683	jmp	NEAR $L$sqr4x_shift_n_add
1684
1685ALIGN	32
1686$L$sqr4x_shift_n_add:
1687	lea	r12,[r10*2+r14]
1688	shr	r10,63
1689	lea	r13,[r11*2+rcx]
1690	shr	r11,63
1691	or	r13,r10
1692	mov	r10,QWORD[((-16))+rdi]
1693	mov	r14,r11
1694	mul	rax
1695	neg	r15
1696	mov	r11,QWORD[((-8))+rdi]
1697	adc	r12,rax
1698	mov	rax,QWORD[((-8))+rbp*1+rsi]
1699	mov	QWORD[((-32))+rdi],r12
1700	adc	r13,rdx
1701
1702	lea	rbx,[r10*2+r14]
1703	mov	QWORD[((-24))+rdi],r13
1704	sbb	r15,r15
1705	shr	r10,63
1706	lea	r8,[r11*2+rcx]
1707	shr	r11,63
1708	or	r8,r10
1709	mov	r10,QWORD[rdi]
1710	mov	r14,r11
1711	mul	rax
1712	neg	r15
1713	mov	r11,QWORD[8+rdi]
1714	adc	rbx,rax
1715	mov	rax,QWORD[rbp*1+rsi]
1716	mov	QWORD[((-16))+rdi],rbx
1717	adc	r8,rdx
1718
1719	lea	r12,[r10*2+r14]
1720	mov	QWORD[((-8))+rdi],r8
1721	sbb	r15,r15
1722	shr	r10,63
1723	lea	r13,[r11*2+rcx]
1724	shr	r11,63
1725	or	r13,r10
1726	mov	r10,QWORD[16+rdi]
1727	mov	r14,r11
1728	mul	rax
1729	neg	r15
1730	mov	r11,QWORD[24+rdi]
1731	adc	r12,rax
1732	mov	rax,QWORD[8+rbp*1+rsi]
1733	mov	QWORD[rdi],r12
1734	adc	r13,rdx
1735
1736	lea	rbx,[r10*2+r14]
1737	mov	QWORD[8+rdi],r13
1738	sbb	r15,r15
1739	shr	r10,63
1740	lea	r8,[r11*2+rcx]
1741	shr	r11,63
1742	or	r8,r10
1743	mov	r10,QWORD[32+rdi]
1744	mov	r14,r11
1745	mul	rax
1746	neg	r15
1747	mov	r11,QWORD[40+rdi]
1748	adc	rbx,rax
1749	mov	rax,QWORD[16+rbp*1+rsi]
1750	mov	QWORD[16+rdi],rbx
1751	adc	r8,rdx
1752	mov	QWORD[24+rdi],r8
1753	sbb	r15,r15
1754	lea	rdi,[64+rdi]
1755	add	rbp,32
1756	jnz	NEAR $L$sqr4x_shift_n_add
1757
1758	lea	r12,[r10*2+r14]
1759DB	0x67
1760	shr	r10,63
1761	lea	r13,[r11*2+rcx]
1762	shr	r11,63
1763	or	r13,r10
1764	mov	r10,QWORD[((-16))+rdi]
1765	mov	r14,r11
1766	mul	rax
1767	neg	r15
1768	mov	r11,QWORD[((-8))+rdi]
1769	adc	r12,rax
1770	mov	rax,QWORD[((-8))+rsi]
1771	mov	QWORD[((-32))+rdi],r12
1772	adc	r13,rdx
1773
1774	lea	rbx,[r10*2+r14]
1775	mov	QWORD[((-24))+rdi],r13
1776	sbb	r15,r15
1777	shr	r10,63
1778	lea	r8,[r11*2+rcx]
1779	shr	r11,63
1780	or	r8,r10
1781	mul	rax
1782	neg	r15
1783	adc	rbx,rax
1784	adc	r8,rdx
1785	mov	QWORD[((-16))+rdi],rbx
1786	mov	QWORD[((-8))+rdi],r8
1787DB	102,72,15,126,213
; ---------------------------------------------------------------------------
; __bn_sqr8x_reduction -- internal routine.  Reached by fall-through from the
; code directly above and by `call` from bn_from_mont8x in this file.
;
; Walks a qword vector held in the caller's stack frame in 64-byte (8-qword)
; windows.  For every window it runs eight multiply-accumulate passes against
; the first 8 qwords at rbp ($L$8x_reduce) and then applies the eight saved
; multipliers to each further 8-qword group at rbp ($L$8x_tail).
; NOTE(review): presumably the word-by-word Montgomery reduction step --
; confirm against the generating script.
;
; In (as read by the code below):
;   rbp         base of the qword vector used as multiplicand
;   r9          byte length; rbp+r9 is kept as the end sentinel
;   (32+8)+rsp  per-word factor; callers in this file store it at 32+rsp
;               before the call, the +8 skips the return address
;   xmm2, xmm3  copied back into rbp and r9 at the end of every window
; Out:
;   rax         carry accumulated by the last window
;   rcx         qword read from [-8+rbp] before rbp is reloaded
; Scratch: rbx, rdx, rsi, rdi, r8-r15, flags.
; ---------------------------------------------------------------------------
1788__bn_sqr8x_reduction:
; rax = 0: carry word handed from one window to the next.
1789	xor	rax,rax
; rcx = rbp + r9, saved at (0+8)+rsp as the end sentinel for the rbp cursor.
1790	lea	rcx,[rbp*1+r9]
; rdx = rsp + 56 + 2*r9, saved at (8+8)+rsp; rdi is compared against it to
; end the outer loop, and [rdx] holds the carry word between windows.
1791	lea	rdx,[((48+8))+r9*2+rsp]
1792	mov	QWORD[((0+8))+rsp],rcx
1793	lea	rdi,[((48+8))+r9*1+rsp]
1794	mov	QWORD[((8+8))+rsp],rdx
; r9 is negated so the `lea rdi,[r9+rdi]` at the loop head steps rdi back.
1795	neg	r9
1796	jmp	NEAR $L$8x_reduction_loop
1797
; --- outer loop: one 8-qword window per iteration ---------------------------
1798ALIGN	32
1799$L$8x_reduction_loop:
1800	lea	rdi,[r9*1+rdi]
; 0x66 is a prefix byte emitted ahead of the next instruction (presumably
; padding for code alignment).
1801DB	0x66
; Load the window into rbx, r9..r15 (r9 is reused as data from here on).
1802	mov	rbx,QWORD[rdi]
1803	mov	r9,QWORD[8+rdi]
1804	mov	r10,QWORD[16+rdi]
1805	mov	r11,QWORD[24+rdi]
1806	mov	r12,QWORD[32+rdi]
1807	mov	r13,QWORD[40+rdi]
1808	mov	r14,QWORD[48+rdi]
1809	mov	r15,QWORD[56+rdi]
; Park the carry from the previous window in the slot at [rdx].
1810	mov	QWORD[rdx],rax
1811	lea	rdi,[64+rdi]
1812
1813DB	0x67
; r8 keeps the lowest window word; rbx becomes that word times the factor at
; (32+8)+rsp (low 64 bits only).
1814	mov	r8,rbx
1815	imul	rbx,QWORD[((32+8))+rsp]
1816	mov	rax,QWORD[rbp]
; ecx counts the eight passes of $L$8x_reduce.
1817	mov	ecx,8
1818	jmp	NEAR $L$8x_reduce
1819
; --- eight passes: accumulate rbx * [rbp .. rbp+56] into r8..r15 ------------
1820ALIGN	32
1821$L$8x_reduce:
1822	mul	rbx
1823	mov	rax,QWORD[8+rbp]
; `neg r8` is used only for its flag result: CF = (r8 != 0).  The low product
; word is discarded (presumably it cancels r8 exactly -- confirm).
1824	neg	r8
1825	mov	r8,rdx
1826	adc	r8,0
1827
1828	mul	rbx
1829	add	r9,rax
1830	mov	rax,QWORD[16+rbp]
1831	adc	rdx,0
1832	add	r8,r9
; Save this pass's multiplier at rsp + 48 + 8*rcx (rcx runs 8..1) for reuse
; in $L$8x_tail.
1833	mov	QWORD[((48-8+8))+rcx*8+rsp],rbx
1834	mov	r9,rdx
1835	adc	r9,0
1836
1837	mul	rbx
1838	add	r10,rax
1839	mov	rax,QWORD[24+rbp]
1840	adc	rdx,0
1841	add	r9,r10
; Reload the per-word factor; it is multiplied by the new r8 below to form
; the multiplier for the next pass.
1842	mov	rsi,QWORD[((32+8))+rsp]
1843	mov	r10,rdx
1844	adc	r10,0
1845
1846	mul	rbx
1847	add	r11,rax
1848	mov	rax,QWORD[32+rbp]
1849	adc	rdx,0
1850	imul	rsi,r8
1851	add	r10,r11
1852	mov	r11,rdx
1853	adc	r11,0
1854
1855	mul	rbx
1856	add	r12,rax
1857	mov	rax,QWORD[40+rbp]
1858	adc	rdx,0
1859	add	r11,r12
1860	mov	r12,rdx
1861	adc	r12,0
1862
1863	mul	rbx
1864	add	r13,rax
1865	mov	rax,QWORD[48+rbp]
1866	adc	rdx,0
1867	add	r12,r13
1868	mov	r13,rdx
1869	adc	r13,0
1870
1871	mul	rbx
1872	add	r14,rax
1873	mov	rax,QWORD[56+rbp]
1874	adc	rdx,0
1875	add	r13,r14
1876	mov	r14,rdx
1877	adc	r14,0
1878
1879	mul	rbx
; Switch to the multiplier computed for the next pass.
1880	mov	rbx,rsi
1881	add	r15,rax
1882	mov	rax,QWORD[rbp]
1883	adc	rdx,0
1884	add	r14,r15
1885	mov	r15,rdx
1886	adc	r15,0
1887
1888	dec	ecx
1889	jnz	NEAR $L$8x_reduce
1890
; Advance rbp by one group.  If it has reached the end sentinel there is no
; tail; `jae` is taken with CF clear, which $L$8x_no_tail relies on.
1891	lea	rbp,[64+rbp]
1892	xor	rax,rax
1893	mov	rdx,QWORD[((8+8))+rsp]
1894	cmp	rbp,QWORD[((0+8))+rsp]
1895	jae	NEAR $L$8x_no_tail
1896
1897DB	0x66
; Fold the next 8 qwords at rdi into the accumulators; the carry out is kept
; in rsi as 0 / -1.
1898	add	r8,QWORD[rdi]
1899	adc	r9,QWORD[8+rdi]
1900	adc	r10,QWORD[16+rdi]
1901	adc	r11,QWORD[24+rdi]
1902	adc	r12,QWORD[32+rdi]
1903	adc	r13,QWORD[40+rdi]
1904	adc	r14,QWORD[48+rdi]
1905	adc	r15,QWORD[56+rdi]
1906	sbb	rsi,rsi
1907
; rbx = multiplier saved by the first reduce pass (slot 48 + 8*8 = 112+rsp).
1908	mov	rbx,QWORD[((48+56+8))+rsp]
1909	mov	ecx,8
1910	mov	rax,QWORD[rbp]
1911	jmp	NEAR $L$8x_tail
1912
; --- tail: apply the eight saved multipliers to the group at rbp ------------
; Each pass stores one finished low word at [rdi] and advances rdi by 8.
1913ALIGN	32
1914$L$8x_tail:
1915	mul	rbx
1916	add	r8,rax
1917	mov	rax,QWORD[8+rbp]
1918	mov	QWORD[rdi],r8
1919	mov	r8,rdx
1920	adc	r8,0
1921
1922	mul	rbx
1923	add	r9,rax
1924	mov	rax,QWORD[16+rbp]
1925	adc	rdx,0
1926	add	r8,r9
1927	lea	rdi,[8+rdi]
1928	mov	r9,rdx
1929	adc	r9,0
1930
1931	mul	rbx
1932	add	r10,rax
1933	mov	rax,QWORD[24+rbp]
1934	adc	rdx,0
1935	add	r9,r10
1936	mov	r10,rdx
1937	adc	r10,0
1938
1939	mul	rbx
1940	add	r11,rax
1941	mov	rax,QWORD[32+rbp]
1942	adc	rdx,0
1943	add	r10,r11
1944	mov	r11,rdx
1945	adc	r11,0
1946
1947	mul	rbx
1948	add	r12,rax
1949	mov	rax,QWORD[40+rbp]
1950	adc	rdx,0
1951	add	r11,r12
1952	mov	r12,rdx
1953	adc	r12,0
1954
1955	mul	rbx
1956	add	r13,rax
1957	mov	rax,QWORD[48+rbp]
1958	adc	rdx,0
1959	add	r12,r13
1960	mov	r13,rdx
1961	adc	r13,0
1962
1963	mul	rbx
1964	add	r14,rax
1965	mov	rax,QWORD[56+rbp]
1966	adc	rdx,0
1967	add	r13,r14
1968	mov	r14,rdx
1969	adc	r14,0
1970
1971	mul	rbx
; Next saved multiplier: slot rsp + 40 + 8*rcx, i.e. the one stored by the
; following reduce pass.
1972	mov	rbx,QWORD[((48-16+8))+rcx*8+rsp]
1973	add	r15,rax
1974	adc	rdx,0
1975	add	r14,r15
1976	mov	rax,QWORD[rbp]
1977	mov	r15,rdx
1978	adc	r15,0
1979
1980	dec	ecx
1981	jnz	NEAR $L$8x_tail
1982
1983	lea	rbp,[64+rbp]
1984	mov	rdx,QWORD[((8+8))+rsp]
1985	cmp	rbp,QWORD[((0+8))+rsp]
1986	jae	NEAR $L$8x_tail_done
1987
; More groups remain: restart with the first saved multiplier, turn the
; 0 / -1 value in rsi back into CF, fold in the next 8 qwords at rdi and save
; the new carry in rsi again.
1988	mov	rbx,QWORD[((48+56+8))+rsp]
1989	neg	rsi
1990	mov	rax,QWORD[rbp]
1991	adc	r8,QWORD[rdi]
1992	adc	r9,QWORD[8+rdi]
1993	adc	r10,QWORD[16+rdi]
1994	adc	r11,QWORD[24+rdi]
1995	adc	r12,QWORD[32+rdi]
1996	adc	r13,QWORD[40+rdi]
1997	adc	r14,QWORD[48+rdi]
1998	adc	r15,QWORD[56+rdi]
1999	sbb	rsi,rsi
2000
2001	mov	ecx,8
2002	jmp	NEAR $L$8x_tail
2003
; --- end of the tail for this window ----------------------------------------
; Add the carry word parked at [rdx] and ripple it through r8..r15; rax
; collects the carry out.  `neg rsi` then restores the saved carry into CF
; for the adc chain that follows.
2004ALIGN	32
2005$L$8x_tail_done:
2006	xor	rax,rax
2007	add	r8,QWORD[rdx]
2008	adc	r9,0
2009	adc	r10,0
2010	adc	r11,0
2011	adc	r12,0
2012	adc	r13,0
2013	adc	r14,0
2014	adc	r15,0
2015	adc	rax,0
2016
2017	neg	rsi
; Entered either from above (CF = saved carry) or from the `jae` after the
; reduce passes (CF clear, rax = 0).
2018$L$8x_no_tail:
2019	adc	r8,QWORD[rdi]
2020	adc	r9,QWORD[8+rdi]
2021	adc	r10,QWORD[16+rdi]
2022	adc	r11,QWORD[24+rdi]
2023	adc	r12,QWORD[32+rdi]
2024	adc	r13,QWORD[40+rdi]
2025	adc	r14,QWORD[48+rdi]
2026	adc	r15,QWORD[56+rdi]
2027	adc	rax,0
2028	mov	rcx,QWORD[((-8))+rbp]
2029	xor	rsi,rsi
2030
; The bytes below encode `movq rbp,xmm2`: rbp is reset from xmm2.
2031DB	102,72,15,126,213
2032
2033	mov	QWORD[rdi],r8
2034	mov	QWORD[8+rdi],r9
; The bytes below encode `movq r9,xmm3`: r9 is reloaded from xmm3 for the
; `lea rdi,[r9+rdi]` at the loop head.
2035DB	102,73,15,126,217
2036	mov	QWORD[16+rdi],r10
2037	mov	QWORD[24+rdi],r11
2038	mov	QWORD[32+rdi],r12
2039	mov	QWORD[40+rdi],r13
2040	mov	QWORD[48+rdi],r14
2041	mov	QWORD[56+rdi],r15
2042	lea	rdi,[64+rdi]
2043
; Repeat until rdi reaches the limit kept in rdx.
2044	cmp	rdi,rdx
2045	jb	NEAR $L$8x_reduction_loop
2046	DB	0F3h,0C3h		;repret
2047
2048
2049
2050ALIGN	32
; ---------------------------------------------------------------------------
; __bn_post4x_internal -- internal routine, entered by `call` from
; bn_from_mont8x in this file.
;
; Handles four qwords per iteration: each qword loaded from rbp is
; complemented, ANDed with a mask and added with carry to the matching qword
; at rbx; the sums are stored at rdi.  With an all-ones mask this yields
; [rbx] - [rbp] in two's complement (the `dec r12` ahead of the first `not`
; supplies the +1); with a zero mask the qwords at rbx are copied unchanged.
;
; In (as read by the code below):
;   rax   negated on entry to form the mask (so 0/1 becomes 0/all-ones)
;   rbp   qword vector that is complemented and masked
;   rdi   rdi + r9 is the source pointer placed in rbx
;   r9    byte count; r9 >> 5 (arithmetic) is incremented up to zero, so it
;         is expected to be negative here
;   xmm1  copied into both rdi (store pointer) and rsi
; Out:
;   r10 = r9 as on entry, r9 = -r9; rbp, rbx and rdi advanced past the data.
; Scratch: rcx, r12-r15, flags.
; ---------------------------------------------------------------------------
2051__bn_post4x_internal:
2052
2053	mov	r12,QWORD[rbp]
2054	lea	rbx,[r9*1+rdi]
2055	mov	rcx,r9
; The bytes below encode `movq rdi,xmm1`.
2056DB	102,72,15,126,207
2057	neg	rax
; The bytes below encode `movq rsi,xmm1`.
2058DB	102,72,15,126,206
; rcx = r9 / 32: one count per 4-qword group.
2059	sar	rcx,3+2
; Only the very first qword gets the -1 adjustment (see header).
2060	dec	r12
; r10 = 0: no carry pending for the first group.
2061	xor	r10,r10
2062	mov	r13,QWORD[8+rbp]
2063	mov	r14,QWORD[16+rbp]
2064	mov	r15,QWORD[24+rbp]
2065	jmp	NEAR $L$sqr4x_sub_entry
2066
2067ALIGN	16
2068$L$sqr4x_sub:
2069	mov	r12,QWORD[rbp]
2070	mov	r13,QWORD[8+rbp]
2071	mov	r14,QWORD[16+rbp]
2072	mov	r15,QWORD[24+rbp]
2073$L$sqr4x_sub_entry:
2074	lea	rbp,[32+rbp]
2075	not	r12
2076	not	r13
2077	not	r14
2078	not	r15
2079	and	r12,rax
2080	and	r13,rax
2081	and	r14,rax
2082	and	r15,rax
2083
; `neg r10` turns the saved 0 / -1 in r10 back into CF for the adc chain.
2084	neg	r10
2085	adc	r12,QWORD[rbx]
2086	adc	r13,QWORD[8+rbx]
2087	adc	r14,QWORD[16+rbx]
2088	adc	r15,QWORD[24+rbx]
2089	mov	QWORD[rdi],r12
2090	lea	rbx,[32+rbx]
2091	mov	QWORD[8+rdi],r13
; Save the carry out of this group as 0 / -1 (mov and lea leave CF intact).
2092	sbb	r10,r10
2093	mov	QWORD[16+rdi],r14
2094	mov	QWORD[24+rdi],r15
2095	lea	rdi,[32+rdi]
2096
2097	inc	rcx
2098	jnz	NEAR $L$sqr4x_sub
2099
2100	mov	r10,r9
2101	neg	r9
2102	DB	0F3h,0C3h		;repret
2103
2104
2105global	bn_from_montgomery
2106
2107ALIGN	32
; ---------------------------------------------------------------------------
; bn_from_montgomery -- exported dispatcher.
; Tests the low three bits of the DWORD at 48+rsp, the same stack slot that
; bn_from_mont8x loads into r9.  When all three are zero, control jumps
; straight to bn_from_mont8x with registers and stack untouched; otherwise
; the routine returns with eax = 0.
; ---------------------------------------------------------------------------
2108bn_from_montgomery:
2109
2110	test	DWORD[48+rsp],7
2111	jz	NEAR bn_from_mont8x
2112	xor	eax,eax
2113	DB	0F3h,0C3h		;repret
2114
2115
2116
2117
2118ALIGN	32
; ---------------------------------------------------------------------------
; bn_from_mont8x -- reached by the tail-jump in bn_from_montgomery.
;
; The WIN64 prologue saves rdi/rsi at 8+rsp / 16+rsp and moves the incoming
; rcx, rdx, r8, r9, [40+rsp], [48+rsp] into rdi, rsi, rdx, rcx, r8, r9.
; After that remap the code below uses:
;   rdi  only as an address in the frame-placement arithmetic; stashed in xmm1
;   rsi  source of the 64-byte-chunk copy in $L$mul_by_1
;   rcx  copied to rbp and xmm2 before the reduction routine is called
;   r8   pointer; the qword it points to is stored at 32+rsp
;   r9   count, scaled by 8 (shl r9d,3) into a byte length
; Returns rax = 1.  rbx, rbp, r12-r15 are pushed and restored; rdi and rsi
; are restored by the WIN64 epilogue.
; ---------------------------------------------------------------------------
2119bn_from_mont8x:
2120	mov	QWORD[8+rsp],rdi	;WIN64 prologue
2121	mov	QWORD[16+rsp],rsi
2122	mov	rax,rsp
2123$L$SEH_begin_bn_from_mont8x:
2124	mov	rdi,rcx
2125	mov	rsi,rdx
2126	mov	rdx,r8
2127	mov	rcx,r9
2128	mov	r8,QWORD[40+rsp]
2129	mov	r9,QWORD[48+rsp]
2130
2131
2132
; 0x67 is a prefix byte emitted ahead of the next instruction (presumably
; padding for code alignment).
2133DB	0x67
; rax keeps the entry rsp; it is stored at 40+rsp once the frame exists.
2134	mov	rax,rsp
2135
2136	push	rbx
2137
2138	push	rbp
2139
2140	push	r12
2141
2142	push	r13
2143
2144	push	r14
2145
2146	push	r15
2147
2148$L$from_prologue:
2149
; r9 = byte length, r10 = 3 * byte length, then r9 is negated; r8 is replaced
; by the qword it points to.
2150	shl	r9d,3
2151	lea	r10,[r9*2+r9]
2152	neg	r9
2153	mov	r8,QWORD[r8]
2154
2155
2156
2157
2158
2159
2160
2161
; Frame placement: r11 = (rsp - 320 - 2*len - rdi) mod 4096 is compared with
; 3*len to choose between two ways of computing the frame base in rbp.
; NOTE(review): presumably this keeps the frame from aliasing the buffer at
; rdi modulo 4096 -- confirm against the generating script.
2162	lea	r11,[((-320))+r9*2+rsp]
2163	mov	rbp,rsp
2164	sub	r11,rdi
2165	and	r11,4095
2166	cmp	r10,r11
2167	jb	NEAR $L$from_sp_alt
2168	sub	rbp,r11
2169	lea	rbp,[((-320))+r9*2+rbp]
2170	jmp	NEAR $L$from_sp_done
2171
2172ALIGN	32
2173$L$from_sp_alt:
2174	lea	r10,[((4096-320))+r9*2]
2175	lea	rbp,[((-320))+r9*2+rbp]
2176	sub	r11,r10
; If the subtraction borrowed, the adjustment is clamped to zero.
2177	mov	r10,0
2178	cmovc	r11,r10
2179	sub	rbp,r11
2180$L$from_sp_done:
; Align the frame base to 64 bytes, then move rsp down to it in 4096-byte
; steps, reading one qword at every step so each page in between is touched.
2181	and	rbp,-64
2182	mov	r11,rsp
2183	sub	r11,rbp
2184	and	r11,-4096
2185	lea	rsp,[rbp*1+r11]
2186	mov	r10,QWORD[rsp]
2187	cmp	rsp,rbp
2188	ja	NEAR $L$from_page_walk
2189	jmp	NEAR $L$from_page_walk_done
2190
2191$L$from_page_walk:
2192	lea	rsp,[((-4096))+rsp]
2193	mov	r10,QWORD[rsp]
2194	cmp	rsp,rbp
2195	ja	NEAR $L$from_page_walk
2196$L$from_page_walk_done:
2197
; r10 = -len, r9 = +len.
2198	mov	r10,r9
2199	neg	r9
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
; Frame slots: 32+rsp = qword loaded through r8, 40+rsp = entry rsp.
2210	mov	QWORD[32+rsp],r8
2211	mov	QWORD[40+rsp],rax
2212
2213$L$from_body:
2214	mov	r11,r9
2215	lea	rax,[48+rsp]
2216	pxor	xmm0,xmm0
2217	jmp	NEAR $L$mul_by_1
2218
; Copy loop: every iteration loads 64 bytes from rsi (unaligned loads),
; stores them at rax, and writes 64 zero bytes at rax + r9.  r11 starts at
; the byte length and drops by 64 per iteration.
2219ALIGN	32
2220$L$mul_by_1:
2221	movdqu	xmm1,XMMWORD[rsi]
2222	movdqu	xmm2,XMMWORD[16+rsi]
2223	movdqu	xmm3,XMMWORD[32+rsi]
2224	movdqa	XMMWORD[r9*1+rax],xmm0
2225	movdqu	xmm4,XMMWORD[48+rsi]
2226	movdqa	XMMWORD[16+r9*1+rax],xmm0
; The bytes below encode `lea rsi,[64+rsi]` with a 32-bit displacement.
2227DB	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
2228	movdqa	XMMWORD[rax],xmm1
2229	movdqa	XMMWORD[32+r9*1+rax],xmm0
2230	movdqa	XMMWORD[16+rax],xmm2
2231	movdqa	XMMWORD[48+r9*1+rax],xmm0
2232	movdqa	XMMWORD[32+rax],xmm3
2233	movdqa	XMMWORD[48+rax],xmm4
2234	lea	rax,[64+rax]
2235	sub	r11,64
2236	jnz	NEAR $L$mul_by_1
2237
; The DB lines below encode `movq xmm1,rdi`, `movq xmm2,rcx` and
; `movq xmm3,r10`; the routines called next read these registers back.
2238DB	102,72,15,110,207
2239DB	102,72,15,110,209
2240DB	0x67
2241	mov	rbp,rcx
2242DB	102,73,15,110,218
; Pick the code path from the DWORD at OPENSSL_ia32cap_P+8: the sqrx/postx
; routines are used only when all bits of 0x80108 are set.
; NOTE(review): presumably the BMI1, BMI2 and ADX feature bits -- confirm
; against the capability-vector layout.
2243	lea	r11,[OPENSSL_ia32cap_P]
2244	mov	r11d,DWORD[8+r11]
2245	and	r11d,0x80108
2246	cmp	r11d,0x80108
2247	jne	NEAR $L$from_mont_nox
2248
2249	lea	rdi,[r9*1+rax]
2250	call	__bn_sqrx8x_reduction
2251	call	__bn_postx4x_internal
2252
2253	pxor	xmm0,xmm0
2254	lea	rax,[48+rsp]
2255	jmp	NEAR $L$from_mont_zero
2256
2257ALIGN	32
2258$L$from_mont_nox:
2259	call	__bn_sqr8x_reduction
2260	call	__bn_post4x_internal
2261
2262	pxor	xmm0,xmm0
2263	lea	rax,[48+rsp]
2264	jmp	NEAR $L$from_mont_zero
2265
; Wipe loop: stores 64 zero bytes per iteration starting at 48+rsp while r9
; decreases by 32, i.e. two bytes are cleared per unit of r9.  rsi is
; reloaded with the entry rsp saved at 40+rsp.
2266ALIGN	32
2267$L$from_mont_zero:
2268	mov	rsi,QWORD[40+rsp]
2269
2270	movdqa	XMMWORD[rax],xmm0
2271	movdqa	XMMWORD[16+rax],xmm0
2272	movdqa	XMMWORD[32+rax],xmm0
2273	movdqa	XMMWORD[48+rax],xmm0
2274	lea	rax,[64+rax]
2275	sub	r9,32
2276	jnz	NEAR $L$from_mont_zero
2277
; Return value 1; reload the six pushed registers relative to the entry rsp
; and restore rsp itself.
2278	mov	rax,1
2279	mov	r15,QWORD[((-48))+rsi]
2280
2281	mov	r14,QWORD[((-40))+rsi]
2282
2283	mov	r13,QWORD[((-32))+rsi]
2284
2285	mov	r12,QWORD[((-24))+rsi]
2286
2287	mov	rbp,QWORD[((-16))+rsi]
2288
2289	mov	rbx,QWORD[((-8))+rsi]
2290
2291	lea	rsp,[rsi]
2292
2293$L$from_epilogue:
2294	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
2295	mov	rsi,QWORD[16+rsp]
2296	DB	0F3h,0C3h		;repret
2297
2298$L$SEH_end_bn_from_mont8x:
2299
2300ALIGN	32
; ---------------------------------------------------------------------------
; bn_mulx4x_mont_gather5 -- frame set-up wrapper around mulx4x_internal.
;
; The WIN64 prologue saves rdi/rsi at 8+rsp / 16+rsp and moves the incoming
; rcx, rdx, r8, r9, [40+rsp], [48+rsp] into rdi, rsi, rdx, rcx, r8, r9.
; The routine then carves a 64-byte-aligned frame, stores the qword loaded
; through r8 at 32+rsp and the entry rsp at 40+rsp, and calls
; mulx4x_internal with rax = entry rsp and r9 = negated byte length.
; Returns rax = 1.  rbx, rbp, r12-r15 are pushed and restored; rdi and rsi
; are restored by the WIN64 epilogue.
; The label $L$mulx4x_enter is a second entry point placed after the argument
; remap; its users are not visible in this part of the file.
; ---------------------------------------------------------------------------
2301bn_mulx4x_mont_gather5:
2302	mov	QWORD[8+rsp],rdi	;WIN64 prologue
2303	mov	QWORD[16+rsp],rsi
2304	mov	rax,rsp
2305$L$SEH_begin_bn_mulx4x_mont_gather5:
2306	mov	rdi,rcx
2307	mov	rsi,rdx
2308	mov	rdx,r8
2309	mov	rcx,r9
2310	mov	r8,QWORD[40+rsp]
2311	mov	r9,QWORD[48+rsp]
2312
2313
2314
; rax keeps the entry rsp for the frame slot at 40+rsp.
2315	mov	rax,rsp
2316
2317$L$mulx4x_enter:
2318	push	rbx
2319
2320	push	rbp
2321
2322	push	r12
2323
2324	push	r13
2325
2326	push	r14
2327
2328	push	r15
2329
2330$L$mulx4x_prologue:
2331
; r9 = byte length (count * 8), r10 = 3 * byte length, then r9 is negated;
; r8 is replaced by the qword it points to.
2332	shl	r9d,3
2333	lea	r10,[r9*2+r9]
2334	neg	r9
2335	mov	r8,QWORD[r8]
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
; Frame placement: r11 = (rsp - 320 - 2*len - rdi) mod 4096 is compared with
; 3*len to choose between two ways of computing the frame base in rbp.
; NOTE(review): presumably this keeps the frame from aliasing the buffer at
; rdi modulo 4096 -- confirm against the generating script.
2346	lea	r11,[((-320))+r9*2+rsp]
2347	mov	rbp,rsp
2348	sub	r11,rdi
2349	and	r11,4095
2350	cmp	r10,r11
2351	jb	NEAR $L$mulx4xsp_alt
2352	sub	rbp,r11
2353	lea	rbp,[((-320))+r9*2+rbp]
2354	jmp	NEAR $L$mulx4xsp_done
2355
2356$L$mulx4xsp_alt:
2357	lea	r10,[((4096-320))+r9*2]
2358	lea	rbp,[((-320))+r9*2+rbp]
2359	sub	r11,r10
; If the subtraction borrowed, the adjustment is clamped to zero.
2360	mov	r10,0
2361	cmovc	r11,r10
2362	sub	rbp,r11
2363$L$mulx4xsp_done:
; Align the frame base to 64 bytes, then move rsp down to it in 4096-byte
; steps, reading one qword at every step so each page in between is touched.
2364	and	rbp,-64
2365	mov	r11,rsp
2366	sub	r11,rbp
2367	and	r11,-4096
2368	lea	rsp,[rbp*1+r11]
2369	mov	r10,QWORD[rsp]
2370	cmp	rsp,rbp
2371	ja	NEAR $L$mulx4x_page_walk
2372	jmp	NEAR $L$mulx4x_page_walk_done
2373
2374$L$mulx4x_page_walk:
2375	lea	rsp,[((-4096))+rsp]
2376	mov	r10,QWORD[rsp]
2377	cmp	rsp,rbp
2378	ja	NEAR $L$mulx4x_page_walk
2379$L$mulx4x_page_walk_done:
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
; Frame slots: 32+rsp = qword loaded through r8, 40+rsp = entry rsp.
2393	mov	QWORD[32+rsp],r8
2394	mov	QWORD[40+rsp],rax
2395
2396$L$mulx4x_body:
2397	call	mulx4x_internal
2398
; rsi = entry rsp saved in the frame.
2399	mov	rsi,QWORD[40+rsp]
2400
2401	mov	rax,1
2402
; Reload the six pushed registers relative to the entry rsp, then restore
; rsp itself.
2403	mov	r15,QWORD[((-48))+rsi]
2404
2405	mov	r14,QWORD[((-40))+rsi]
2406
2407	mov	r13,QWORD[((-32))+rsi]
2408
2409	mov	r12,QWORD[((-24))+rsi]
2410
2411	mov	rbp,QWORD[((-16))+rsi]
2412
2413	mov	rbx,QWORD[((-8))+rsi]
2414
2415	lea	rsp,[rsi]
2416
2417$L$mulx4x_epilogue:
2418	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
2419	mov	rsi,QWORD[16+rsp]
2420	DB	0F3h,0C3h		;repret
2421
2422$L$SEH_end_bn_mulx4x_mont_gather5:
2423
2424
2425ALIGN	32
; ---------------------------------------------------------------------------
; mulx4x_internal -- shared worker, entered by `call` from
; bn_mulx4x_mont_gather5 and bn_powerx5 in this file.  Built on MULX with
; ADCX/ADOX carry chains.  It has no `ret` of its own: it ends by jumping to
; $L$sqrx4x_sub_entry, which is defined outside this routine.
;
; In (as read by the code below):
;   rsi  qword vector multiplied by the gathered word, 4 qwords per step
;   rcx  qword vector multiplied by the per-row factor kept in r8
;        (NOTE(review): presumably the modulus -- confirm)
;   rdx  base of the table the multiplier words are gathered from; one row
;        is 256 bytes
;   rdi  saved at (56+8)+rsp and handed on in rdx at the final jump
;   r9   byte length as passed by the callers (negated); stored at 8+rsp
;   rax  caller's entry rsp; DWORD[56+rax] is the index used for the gather
;   (32+8)+rsp  per-word factor the callers store at 32+rsp (the +8 skips
;        the return address)
; Frame slots written here: 8+rsp, (8+8)+rsp table cursor, (16+8)+rsp table
; end, (24+8)+rsp inner-loop count, (56+8)+rsp saved rdi; 16 comparison masks
; at 112+r10 .. 352+r10.
; ---------------------------------------------------------------------------
2426mulx4x_internal:
2427
2428	mov	QWORD[8+rsp],r9
2429	mov	r10,r9
2430	neg	r9
2431	shl	r9,5
2432	neg	r10
; r13 = rdx + 128 + 32*len: limit the table cursor is compared against.
2433	lea	r13,[128+r9*1+rdx]
; r9 = len/32 - 1 after the next shift and subtract: inner-loop count.
2434	shr	r9,5+5
2435	movd	xmm5,DWORD[56+rax]
2436	sub	r9,1
2437	lea	rax,[$L$inc]
2438	mov	QWORD[((16+8))+rsp],r13
2439	mov	QWORD[((24+8))+rsp],r9
2440	mov	QWORD[((56+8))+rsp],rdi
; xmm0 / xmm1 come from $L$inc and $L$inc+16 (defined elsewhere): xmm0 is the
; first counter vector and xmm1 is used both as the next counter seed and as
; the per-step increment (copied to xmm4).
2441	movdqa	xmm0,XMMWORD[rax]
2442	movdqa	xmm1,XMMWORD[16+rax]
2443	lea	r10,[((88-112))+r10*1+rsp]
; rdi = table cursor, biased by +128 so row offsets run from -128 to +112.
2444	lea	rdi,[128+rdx]
2445
; Broadcast the index dword to all four lanes, then build 16 masks: each step
; advances a counter vector with paddd, compares it with the index using
; pcmpeqd, and stores the result at 112+r10, 128+r10, ... 352+r10.
2446	pshufd	xmm5,xmm5,0
2447	movdqa	xmm4,xmm1
; 0x67 is a prefix byte emitted ahead of the next instruction (presumably
; padding for code alignment); the same holds for the other DB 0x67 lines.
2448DB	0x67
2449	movdqa	xmm2,xmm1
2450DB	0x67
2451	paddd	xmm1,xmm0
2452	pcmpeqd	xmm0,xmm5
2453	movdqa	xmm3,xmm4
2454	paddd	xmm2,xmm1
2455	pcmpeqd	xmm1,xmm5
2456	movdqa	XMMWORD[112+r10],xmm0
2457	movdqa	xmm0,xmm4
2458
2459	paddd	xmm3,xmm2
2460	pcmpeqd	xmm2,xmm5
2461	movdqa	XMMWORD[128+r10],xmm1
2462	movdqa	xmm1,xmm4
2463
2464	paddd	xmm0,xmm3
2465	pcmpeqd	xmm3,xmm5
2466	movdqa	XMMWORD[144+r10],xmm2
2467	movdqa	xmm2,xmm4
2468
2469	paddd	xmm1,xmm0
2470	pcmpeqd	xmm0,xmm5
2471	movdqa	XMMWORD[160+r10],xmm3
2472	movdqa	xmm3,xmm4
2473	paddd	xmm2,xmm1
2474	pcmpeqd	xmm1,xmm5
2475	movdqa	XMMWORD[176+r10],xmm0
2476	movdqa	xmm0,xmm4
2477
2478	paddd	xmm3,xmm2
2479	pcmpeqd	xmm2,xmm5
2480	movdqa	XMMWORD[192+r10],xmm1
2481	movdqa	xmm1,xmm4
2482
2483	paddd	xmm0,xmm3
2484	pcmpeqd	xmm3,xmm5
2485	movdqa	XMMWORD[208+r10],xmm2
2486	movdqa	xmm2,xmm4
2487
2488	paddd	xmm1,xmm0
2489	pcmpeqd	xmm0,xmm5
2490	movdqa	XMMWORD[224+r10],xmm3
2491	movdqa	xmm3,xmm4
2492	paddd	xmm2,xmm1
2493	pcmpeqd	xmm1,xmm5
2494	movdqa	XMMWORD[240+r10],xmm0
2495	movdqa	xmm0,xmm4
2496
2497	paddd	xmm3,xmm2
2498	pcmpeqd	xmm2,xmm5
2499	movdqa	XMMWORD[256+r10],xmm1
2500	movdqa	xmm1,xmm4
2501
2502	paddd	xmm0,xmm3
2503	pcmpeqd	xmm3,xmm5
2504	movdqa	XMMWORD[272+r10],xmm2
2505	movdqa	xmm2,xmm4
2506
2507	paddd	xmm1,xmm0
2508	pcmpeqd	xmm0,xmm5
2509	movdqa	XMMWORD[288+r10],xmm3
2510	movdqa	xmm3,xmm4
2511DB	0x67
2512	paddd	xmm2,xmm1
2513	pcmpeqd	xmm1,xmm5
2514	movdqa	XMMWORD[304+r10],xmm0
2515
2516	paddd	xmm3,xmm2
2517	pcmpeqd	xmm2,xmm5
2518	movdqa	XMMWORD[320+r10],xmm1
2519
2520	pcmpeqd	xmm3,xmm5
2521	movdqa	XMMWORD[336+r10],xmm2
2522
; First gather: every 16-byte slot of the 256-byte row is read and ANDed with
; its mask, and the results are merged, so the load addresses do not depend
; on the index.  The last four masks are still in xmm0..xmm3 and are applied
; to slots 64..112 first.
2523	pand	xmm0,XMMWORD[64+rdi]
2524	pand	xmm1,XMMWORD[80+rdi]
2525	pand	xmm2,XMMWORD[96+rdi]
2526	movdqa	XMMWORD[352+r10],xmm3
2527	pand	xmm3,XMMWORD[112+rdi]
2528	por	xmm0,xmm2
2529	por	xmm1,xmm3
2530	movdqa	xmm4,XMMWORD[((-128))+rdi]
2531	movdqa	xmm5,XMMWORD[((-112))+rdi]
2532	movdqa	xmm2,XMMWORD[((-96))+rdi]
2533	pand	xmm4,XMMWORD[112+r10]
2534	movdqa	xmm3,XMMWORD[((-80))+rdi]
2535	pand	xmm5,XMMWORD[128+r10]
2536	por	xmm0,xmm4
2537	pand	xmm2,XMMWORD[144+r10]
2538	por	xmm1,xmm5
2539	pand	xmm3,XMMWORD[160+r10]
2540	por	xmm0,xmm2
2541	por	xmm1,xmm3
2542	movdqa	xmm4,XMMWORD[((-64))+rdi]
2543	movdqa	xmm5,XMMWORD[((-48))+rdi]
2544	movdqa	xmm2,XMMWORD[((-32))+rdi]
2545	pand	xmm4,XMMWORD[176+r10]
2546	movdqa	xmm3,XMMWORD[((-16))+rdi]
2547	pand	xmm5,XMMWORD[192+r10]
2548	por	xmm0,xmm4
2549	pand	xmm2,XMMWORD[208+r10]
2550	por	xmm1,xmm5
2551	pand	xmm3,XMMWORD[224+r10]
2552	por	xmm0,xmm2
2553	por	xmm1,xmm3
2554	movdqa	xmm4,XMMWORD[rdi]
2555	movdqa	xmm5,XMMWORD[16+rdi]
2556	movdqa	xmm2,XMMWORD[32+rdi]
2557	pand	xmm4,XMMWORD[240+r10]
2558	movdqa	xmm3,XMMWORD[48+rdi]
2559	pand	xmm5,XMMWORD[256+r10]
2560	por	xmm0,xmm4
2561	pand	xmm2,XMMWORD[272+r10]
2562	por	xmm1,xmm5
2563	pand	xmm3,XMMWORD[288+r10]
2564	por	xmm0,xmm2
2565	por	xmm1,xmm3
; Merge the two partial results, then fold the high qword onto the low one
; (pshufd 0x4e swaps the two qword halves).
2566	pxor	xmm0,xmm1
2567	pshufd	xmm1,xmm0,0x4e
2568	por	xmm0,xmm1
; Advance the table cursor to the next 256-byte row.
2569	lea	rdi,[256+rdi]
; The bytes below encode `movq rdx,xmm0`: the gathered word becomes the
; implicit MULX operand.
2570DB	102,72,15,126,194
; rbx = store cursor into the result area of the frame.
2571	lea	rbx,[((64+32+8))+rsp]
2572
; --- first row: products of the gathered word (kept in r9) with [rsi] ------
2573	mov	r9,rdx
2574	mulx	rax,r8,QWORD[rsi]
2575	mulx	r12,r11,QWORD[8+rsi]
2576	add	r11,rax
2577	mulx	r13,rax,QWORD[16+rsi]
2578	adc	r12,rax
2579	adc	r13,0
2580	mulx	r14,rax,QWORD[24+rsi]
2581
; r15 = lowest product word; r8 = that word times the factor at (32+8)+rsp
; (low 64 bits), which multiplies the words at rcx.
2582	mov	r15,r8
2583	imul	r8,QWORD[((32+8))+rsp]
; Zero rbp and clear both CF and OF ahead of the ADCX/ADOX chains.
2584	xor	rbp,rbp
2585	mov	rdx,r8
2586
; Save the table cursor; rdi is reused as the loop counter below.
2587	mov	QWORD[((8+8))+rsp],rdi
2588
2589	lea	rsi,[32+rsi]
2590	adcx	r13,rax
2591	adcx	r14,rbp
2592
; ADCX uses only CF and ADOX only OF, so the two carry chains interleave.
2593	mulx	r10,rax,QWORD[rcx]
2594	adcx	r15,rax
2595	adox	r10,r11
2596	mulx	r11,rax,QWORD[8+rcx]
2597	adcx	r10,rax
2598	adox	r11,r12
2599	mulx	r12,rax,QWORD[16+rcx]
2600	mov	rdi,QWORD[((24+8))+rsp]
2601	mov	QWORD[((-32))+rbx],r10
2602	adcx	r11,rax
2603	adox	r12,r13
2604	mulx	r15,rax,QWORD[24+rcx]
2605	mov	rdx,r9
2606	mov	QWORD[((-24))+rbx],r11
2607	adcx	r12,rax
2608	adox	r15,rbp
2609	lea	rcx,[32+rcx]
2610	mov	QWORD[((-16))+rbx],r12
2611	jmp	NEAR $L$mulx4x_1st
2612
; First-row loop: four more qwords of [rsi] times r9 and of [rcx] times r8
; per iteration; rdi counts the iterations down.
2613ALIGN	32
2614$L$mulx4x_1st:
2615	adcx	r15,rbp
2616	mulx	rax,r10,QWORD[rsi]
2617	adcx	r10,r14
2618	mulx	r14,r11,QWORD[8+rsi]
2619	adcx	r11,rax
2620	mulx	rax,r12,QWORD[16+rsi]
2621	adcx	r12,r14
2622	mulx	r14,r13,QWORD[24+rsi]
2623DB	0x67,0x67
2624	mov	rdx,r8
2625	adcx	r13,rax
2626	adcx	r14,rbp
2627	lea	rsi,[32+rsi]
2628	lea	rbx,[32+rbx]
2629
2630	adox	r10,r15
2631	mulx	r15,rax,QWORD[rcx]
2632	adcx	r10,rax
2633	adox	r11,r15
2634	mulx	r15,rax,QWORD[8+rcx]
2635	adcx	r11,rax
2636	adox	r12,r15
2637	mulx	r15,rax,QWORD[16+rcx]
2638	mov	QWORD[((-40))+rbx],r10
2639	adcx	r12,rax
2640	mov	QWORD[((-32))+rbx],r11
2641	adox	r13,r15
2642	mulx	r15,rax,QWORD[24+rcx]
2643	mov	rdx,r9
2644	mov	QWORD[((-24))+rbx],r12
2645	adcx	r13,rax
2646	adox	r15,rbp
2647	lea	rcx,[32+rcx]
2648	mov	QWORD[((-16))+rbx],r13
2649
2650	dec	rdi
2651	jnz	NEAR $L$mulx4x_1st
2652
; rax = length value saved at 8+rsp on entry; adding it rewinds rsi to the
; start of its vector.  rdi gets the saved table cursor back and rbp collects
; the top carry of the row.
2653	mov	rax,QWORD[8+rsp]
2654	adc	r15,rbp
2655	lea	rsi,[rax*1+rsi]
2656	add	r14,r15
2657	mov	rdi,QWORD[((8+8))+rsp]
2658	adc	rbp,rbp
2659	mov	QWORD[((-8))+rbx],r14
2660	jmp	NEAR $L$mulx4x_outer
2661
; --- outer loop: one table row (one gathered word) per iteration -----------
2662ALIGN	32
2663$L$mulx4x_outer:
; r10 is set so that the 16 stored masks sit at 256+r10 .. 496+r10.
2664	lea	r10,[((16-256))+rbx]
2665	pxor	xmm4,xmm4
2666DB	0x67,0x67
2667	pxor	xmm5,xmm5
; Gather: all 16 slots of the row at rdi are read, masked and merged.
2668	movdqa	xmm0,XMMWORD[((-128))+rdi]
2669	movdqa	xmm1,XMMWORD[((-112))+rdi]
2670	movdqa	xmm2,XMMWORD[((-96))+rdi]
2671	pand	xmm0,XMMWORD[256+r10]
2672	movdqa	xmm3,XMMWORD[((-80))+rdi]
2673	pand	xmm1,XMMWORD[272+r10]
2674	por	xmm4,xmm0
2675	pand	xmm2,XMMWORD[288+r10]
2676	por	xmm5,xmm1
2677	pand	xmm3,XMMWORD[304+r10]
2678	por	xmm4,xmm2
2679	por	xmm5,xmm3
2680	movdqa	xmm0,XMMWORD[((-64))+rdi]
2681	movdqa	xmm1,XMMWORD[((-48))+rdi]
2682	movdqa	xmm2,XMMWORD[((-32))+rdi]
2683	pand	xmm0,XMMWORD[320+r10]
2684	movdqa	xmm3,XMMWORD[((-16))+rdi]
2685	pand	xmm1,XMMWORD[336+r10]
2686	por	xmm4,xmm0
2687	pand	xmm2,XMMWORD[352+r10]
2688	por	xmm5,xmm1
2689	pand	xmm3,XMMWORD[368+r10]
2690	por	xmm4,xmm2
2691	por	xmm5,xmm3
2692	movdqa	xmm0,XMMWORD[rdi]
2693	movdqa	xmm1,XMMWORD[16+rdi]
2694	movdqa	xmm2,XMMWORD[32+rdi]
2695	pand	xmm0,XMMWORD[384+r10]
2696	movdqa	xmm3,XMMWORD[48+rdi]
2697	pand	xmm1,XMMWORD[400+r10]
2698	por	xmm4,xmm0
2699	pand	xmm2,XMMWORD[416+r10]
2700	por	xmm5,xmm1
2701	pand	xmm3,XMMWORD[432+r10]
2702	por	xmm4,xmm2
2703	por	xmm5,xmm3
2704	movdqa	xmm0,XMMWORD[64+rdi]
2705	movdqa	xmm1,XMMWORD[80+rdi]
2706	movdqa	xmm2,XMMWORD[96+rdi]
2707	pand	xmm0,XMMWORD[448+r10]
2708	movdqa	xmm3,XMMWORD[112+rdi]
2709	pand	xmm1,XMMWORD[464+r10]
2710	por	xmm4,xmm0
2711	pand	xmm2,XMMWORD[480+r10]
2712	por	xmm5,xmm1
2713	pand	xmm3,XMMWORD[496+r10]
2714	por	xmm4,xmm2
2715	por	xmm5,xmm3
2716	por	xmm4,xmm5
2717	pshufd	xmm0,xmm4,0x4e
2718	por	xmm0,xmm4
2719	lea	rdi,[256+rdi]
; The bytes below encode `movq rdx,xmm0`: new gathered word for MULX.
2720DB	102,72,15,126,194
2721
; Store the top carry of the previous row, then rewind rbx to the start of
; the result area (rax holds the length value loaded from 8+rsp) plus 32.
2722	mov	QWORD[rbx],rbp
2723	lea	rbx,[32+rax*1+rbx]
2724	mulx	r11,r8,QWORD[rsi]
; Zero rbp and clear CF/OF for the two carry chains.
2725	xor	rbp,rbp
2726	mov	r9,rdx
2727	mulx	r12,r14,QWORD[8+rsi]
; The ADOX chain adds in the previous row's result words from the frame.
2728	adox	r8,QWORD[((-32))+rbx]
2729	adcx	r11,r14
2730	mulx	r13,r15,QWORD[16+rsi]
2731	adox	r11,QWORD[((-24))+rbx]
2732	adcx	r12,r15
2733	mulx	r14,rdx,QWORD[24+rsi]
2734	adox	r12,QWORD[((-16))+rbx]
2735	adcx	r13,rdx
; Rewind rcx to the start of its vector.
2736	lea	rcx,[rax*1+rcx]
2737	lea	rsi,[32+rsi]
2738	adox	r13,QWORD[((-8))+rbx]
2739	adcx	r14,rbp
2740	adox	r14,rbp
2741
; New per-row factor: lowest accumulated word times the value at (32+8)+rsp.
2742	mov	r15,r8
2743	imul	r8,QWORD[((32+8))+rsp]
2744
2745	mov	rdx,r8
2746	xor	rbp,rbp
2747	mov	QWORD[((8+8))+rsp],rdi
2748
2749	mulx	r10,rax,QWORD[rcx]
2750	adcx	r15,rax
2751	adox	r10,r11
2752	mulx	r11,rax,QWORD[8+rcx]
2753	adcx	r10,rax
2754	adox	r11,r12
2755	mulx	r12,rax,QWORD[16+rcx]
2756	adcx	r11,rax
2757	adox	r12,r13
2758	mulx	r15,rax,QWORD[24+rcx]
2759	mov	rdx,r9
2760	mov	rdi,QWORD[((24+8))+rsp]
2761	mov	QWORD[((-32))+rbx],r10
2762	adcx	r12,rax
2763	mov	QWORD[((-24))+rbx],r11
2764	adox	r15,rbp
2765	mov	QWORD[((-16))+rbx],r12
2766	lea	rcx,[32+rcx]
2767	jmp	NEAR $L$mulx4x_inner
2768
; Inner loop: like $L$mulx4x_1st, but the ADCX chain also adds the previous
; row's words read from [rbx].
2769ALIGN	32
2770$L$mulx4x_inner:
2771	mulx	rax,r10,QWORD[rsi]
2772	adcx	r15,rbp
2773	adox	r10,r14
2774	mulx	r14,r11,QWORD[8+rsi]
2775	adcx	r10,QWORD[rbx]
2776	adox	r11,rax
2777	mulx	rax,r12,QWORD[16+rsi]
2778	adcx	r11,QWORD[8+rbx]
2779	adox	r12,r14
2780	mulx	r14,r13,QWORD[24+rsi]
2781	mov	rdx,r8
2782	adcx	r12,QWORD[16+rbx]
2783	adox	r13,rax
2784	adcx	r13,QWORD[24+rbx]
2785	adox	r14,rbp
2786	lea	rsi,[32+rsi]
2787	lea	rbx,[32+rbx]
2788	adcx	r14,rbp
2789
2790	adox	r10,r15
2791	mulx	r15,rax,QWORD[rcx]
2792	adcx	r10,rax
2793	adox	r11,r15
2794	mulx	r15,rax,QWORD[8+rcx]
2795	adcx	r11,rax
2796	adox	r12,r15
2797	mulx	r15,rax,QWORD[16+rcx]
2798	mov	QWORD[((-40))+rbx],r10
2799	adcx	r12,rax
2800	adox	r13,r15
2801	mov	QWORD[((-32))+rbx],r11
2802	mulx	r15,rax,QWORD[24+rcx]
2803	mov	rdx,r9
2804	lea	rcx,[32+rcx]
2805	mov	QWORD[((-24))+rbx],r12
2806	adcx	r13,rax
2807	adox	r15,rbp
2808	mov	QWORD[((-16))+rbx],r13
2809
2810	dec	rdi
2811	jnz	NEAR $L$mulx4x_inner
2812
2813	mov	rax,QWORD[((0+8))+rsp]
2814	adc	r15,rbp
; rdi is zero here (it was the loop counter), so this subtraction only sets
; CF when the qword at [rbx] is non-zero; rdi is overwritten on the next line.
2815	sub	rdi,QWORD[rbx]
2816	mov	rdi,QWORD[((8+8))+rsp]
2817	mov	r10,QWORD[((16+8))+rsp]
2818	adc	r14,r15
2819	lea	rsi,[rax*1+rsi]
2820	adc	rbp,rbp
2821	mov	QWORD[((-8))+rbx],r14
2822
; Loop while the table cursor is below the limit saved at (16+8)+rsp.
2823	cmp	rdi,r10
2824	jb	NEAR $L$mulx4x_outer
2825
; --- hand-off to $L$sqrx4x_sub_entry (defined outside this routine) ---------
; r8 = top carry OR the borrow from ([-8+rcx] - r14), and rax = 0 - r8 turns
; that into a 0 / all-ones value.  rbp and rdi are rewound by the length in
; rax, rcx = length >> 5, rdx = the rdi saved on entry, r12 = [rbp] - 1 and
; r13..r15 hold the next three qwords at rbp.
2826	mov	r10,QWORD[((-8))+rcx]
2827	mov	r8,rbp
2828	mov	r12,QWORD[rax*1+rcx]
2829	lea	rbp,[rax*1+rcx]
2830	mov	rcx,rax
2831	lea	rdi,[rax*1+rbx]
2832	xor	eax,eax
2833	xor	r15,r15
2834	sub	r10,r14
2835	adc	r15,r15
2836	or	r8,r15
2837	sar	rcx,3+2
2838	sub	rax,r8
2839	mov	rdx,QWORD[((56+8))+rsp]
2840	dec	r12
2841	mov	r13,QWORD[8+rbp]
2842	xor	r8,r8
2843	mov	r14,QWORD[16+rbp]
2844	mov	r15,QWORD[24+rbp]
2845	jmp	NEAR $L$sqrx4x_sub_entry
2846
2847
2848
2849ALIGN	32
; ---------------------------------------------------------------------------
; bn_powerx5 -- frame set-up plus a fixed call sequence: five rounds of
; __bn_sqrx8x_internal followed by __bn_postx4x_internal, then one call to
; mulx4x_internal.
;
; The WIN64 prologue saves rdi/rsi at 8+rsp / 16+rsp and moves the incoming
; rcx, rdx, r8, r9, [40+rsp], [48+rsp] into rdi, rsi, rdx, rcx, r8, r9.
; rdi, rcx, r10 (negated byte length) and rdx are stashed in xmm1..xmm4 so
; they survive the calls; the qword loaded through r8 goes to 32+rsp and the
; entry rsp to 40+rsp.
; Returns rax = 1.  rbx, rbp, r12-r15 are pushed and restored; rdi and rsi
; are restored by the WIN64 epilogue.
; The label $L$powerx5_enter is a second entry point placed after the
; argument remap; its users are not visible in this part of the file.
; ---------------------------------------------------------------------------
2850bn_powerx5:
2851	mov	QWORD[8+rsp],rdi	;WIN64 prologue
2852	mov	QWORD[16+rsp],rsi
2853	mov	rax,rsp
2854$L$SEH_begin_bn_powerx5:
2855	mov	rdi,rcx
2856	mov	rsi,rdx
2857	mov	rdx,r8
2858	mov	rcx,r9
2859	mov	r8,QWORD[40+rsp]
2860	mov	r9,QWORD[48+rsp]
2861
2862
2863
; rax keeps the entry rsp for the frame slot at 40+rsp.
2864	mov	rax,rsp
2865
2866$L$powerx5_enter:
2867	push	rbx
2868
2869	push	rbp
2870
2871	push	r12
2872
2873	push	r13
2874
2875	push	r14
2876
2877	push	r15
2878
2879$L$powerx5_prologue:
2880
; r9 = byte length (count * 8), r10 = 3 * byte length, then r9 is negated;
; r8 is replaced by the qword it points to.
2881	shl	r9d,3
2882	lea	r10,[r9*2+r9]
2883	neg	r9
2884	mov	r8,QWORD[r8]
2885
2886
2887
2888
2889
2890
2891
2892
; Frame placement: r11 = (rsp - 320 - 2*len - rdi) mod 4096 is compared with
; 3*len to choose between two ways of computing the frame base in rbp.
; NOTE(review): presumably this keeps the frame from aliasing the buffer at
; rdi modulo 4096 -- confirm against the generating script.
2893	lea	r11,[((-320))+r9*2+rsp]
2894	mov	rbp,rsp
2895	sub	r11,rdi
2896	and	r11,4095
2897	cmp	r10,r11
2898	jb	NEAR $L$pwrx_sp_alt
2899	sub	rbp,r11
2900	lea	rbp,[((-320))+r9*2+rbp]
2901	jmp	NEAR $L$pwrx_sp_done
2902
2903ALIGN	32
2904$L$pwrx_sp_alt:
2905	lea	r10,[((4096-320))+r9*2]
2906	lea	rbp,[((-320))+r9*2+rbp]
2907	sub	r11,r10
; If the subtraction borrowed, the adjustment is clamped to zero.
2908	mov	r10,0
2909	cmovc	r11,r10
2910	sub	rbp,r11
2911$L$pwrx_sp_done:
; Align the frame base to 64 bytes, then move rsp down to it in 4096-byte
; steps, reading one qword at every step so each page in between is touched.
2912	and	rbp,-64
2913	mov	r11,rsp
2914	sub	r11,rbp
2915	and	r11,-4096
2916	lea	rsp,[rbp*1+r11]
2917	mov	r10,QWORD[rsp]
2918	cmp	rsp,rbp
2919	ja	NEAR $L$pwrx_page_walk
2920	jmp	NEAR $L$pwrx_page_walk_done
2921
2922$L$pwrx_page_walk:
2923	lea	rsp,[((-4096))+rsp]
2924	mov	r10,QWORD[rsp]
2925	cmp	rsp,rbp
2926	ja	NEAR $L$pwrx_page_walk
2927$L$pwrx_page_walk_done:
2928
; r10 = -len, r9 = +len.
2929	mov	r10,r9
2930	neg	r9
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943	pxor	xmm0,xmm0
; The four DB lines below encode `movq xmm1,rdi`, `movq xmm2,rcx`,
; `movq xmm3,r10` and `movq xmm4,rdx`.
2944DB	102,72,15,110,207
2945DB	102,72,15,110,209
2946DB	102,73,15,110,218
2947DB	102,72,15,110,226
; Frame slots: 32+rsp = qword loaded through r8, 40+rsp = entry rsp.
2948	mov	QWORD[32+rsp],r8
2949	mov	QWORD[40+rsp],rax
2950
2951$L$powerx5_body:
2952
; Five identical rounds of the two internal routines (both are defined
; outside this routine).
2953	call	__bn_sqrx8x_internal
2954	call	__bn_postx4x_internal
2955	call	__bn_sqrx8x_internal
2956	call	__bn_postx4x_internal
2957	call	__bn_sqrx8x_internal
2958	call	__bn_postx4x_internal
2959	call	__bn_sqrx8x_internal
2960	call	__bn_postx4x_internal
2961	call	__bn_sqrx8x_internal
2962	call	__bn_postx4x_internal
2963
; Register set-up for mulx4x_internal: r9 = r10, rdi = rsi, and the DB lines
; encode `movq rcx,xmm2` and `movq rdx,xmm4`; rax = entry rsp from the frame.
2964	mov	r9,r10
2965	mov	rdi,rsi
2966DB	102,72,15,126,209
2967DB	102,72,15,126,226
2968	mov	rax,QWORD[40+rsp]
2969
2970	call	mulx4x_internal
2971
; rsi = entry rsp saved in the frame.
2972	mov	rsi,QWORD[40+rsp]
2973
2974	mov	rax,1
2975
; Reload the six pushed registers relative to the entry rsp, then restore
; rsp itself.
2976	mov	r15,QWORD[((-48))+rsi]
2977
2978	mov	r14,QWORD[((-40))+rsi]
2979
2980	mov	r13,QWORD[((-32))+rsi]
2981
2982	mov	r12,QWORD[((-24))+rsi]
2983
2984	mov	rbp,QWORD[((-16))+rsi]
2985
2986	mov	rbx,QWORD[((-8))+rsi]
2987
2988	lea	rsp,[rsi]
2989
2990$L$powerx5_epilogue:
2991	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
2992	mov	rsi,QWORD[16+rsp]
2993	DB	0F3h,0C3h		;repret
2994
2995$L$SEH_end_bn_powerx5:
2996
2997global	bn_sqrx8x_internal
2998
2999ALIGN	32
3000bn_sqrx8x_internal:
3001__bn_sqrx8x_internal:
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043	lea	rdi,[((48+8))+rsp]
3044	lea	rbp,[r9*1+rsi]
3045	mov	QWORD[((0+8))+rsp],r9
3046	mov	QWORD[((8+8))+rsp],rbp
3047	jmp	NEAR $L$sqr8x_zero_start
3048
3049ALIGN	32
3050DB	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
3051$L$sqrx8x_zero:
3052DB	0x3e
3053	movdqa	XMMWORD[rdi],xmm0
3054	movdqa	XMMWORD[16+rdi],xmm0
3055	movdqa	XMMWORD[32+rdi],xmm0
3056	movdqa	XMMWORD[48+rdi],xmm0
3057$L$sqr8x_zero_start:
3058	movdqa	XMMWORD[64+rdi],xmm0
3059	movdqa	XMMWORD[80+rdi],xmm0
3060	movdqa	XMMWORD[96+rdi],xmm0
3061	movdqa	XMMWORD[112+rdi],xmm0
3062	lea	rdi,[128+rdi]
3063	sub	r9,64
3064	jnz	NEAR $L$sqrx8x_zero
3065
3066	mov	rdx,QWORD[rsi]
3067
3068	xor	r10,r10
3069	xor	r11,r11
3070	xor	r12,r12
3071	xor	r13,r13
3072	xor	r14,r14
3073	xor	r15,r15
3074	lea	rdi,[((48+8))+rsp]
3075	xor	rbp,rbp
3076	jmp	NEAR $L$sqrx8x_outer_loop
3077
3078ALIGN	32
3079$L$sqrx8x_outer_loop:
3080	mulx	rax,r8,QWORD[8+rsi]
3081	adcx	r8,r9
3082	adox	r10,rax
3083	mulx	rax,r9,QWORD[16+rsi]
3084	adcx	r9,r10
3085	adox	r11,rax
3086DB	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
3087	adcx	r10,r11
3088	adox	r12,rax
3089DB	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
3090	adcx	r11,r12
3091	adox	r13,rax
3092	mulx	rax,r12,QWORD[40+rsi]
3093	adcx	r12,r13
3094	adox	r14,rax
3095	mulx	rax,r13,QWORD[48+rsi]
3096	adcx	r13,r14
3097	adox	rax,r15
3098	mulx	r15,r14,QWORD[56+rsi]
3099	mov	rdx,QWORD[8+rsi]
3100	adcx	r14,rax
3101	adox	r15,rbp
3102	adc	r15,QWORD[64+rdi]
3103	mov	QWORD[8+rdi],r8
3104	mov	QWORD[16+rdi],r9
3105	sbb	rcx,rcx
3106	xor	rbp,rbp
3107
3108
3109	mulx	rbx,r8,QWORD[16+rsi]
3110	mulx	rax,r9,QWORD[24+rsi]
3111	adcx	r8,r10
3112	adox	r9,rbx
3113	mulx	rbx,r10,QWORD[32+rsi]
3114	adcx	r9,r11
3115	adox	r10,rax
3116DB	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
3117	adcx	r10,r12
3118	adox	r11,rbx
3119DB	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
3120	adcx	r11,r13
3121	adox	r12,r14
3122DB	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
3123	mov	rdx,QWORD[16+rsi]
3124	adcx	r12,rax
3125	adox	r13,rbx
3126	adcx	r13,r15
3127	adox	r14,rbp
3128	adcx	r14,rbp
3129
3130	mov	QWORD[24+rdi],r8
3131	mov	QWORD[32+rdi],r9
3132
3133	mulx	rbx,r8,QWORD[24+rsi]
3134	mulx	rax,r9,QWORD[32+rsi]
3135	adcx	r8,r10
3136	adox	r9,rbx
3137	mulx	rbx,r10,QWORD[40+rsi]
3138	adcx	r9,r11
3139	adox	r10,rax
3140DB	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
3141	adcx	r10,r12
3142	adox	r11,r13
3143DB	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
3144DB	0x3e
3145	mov	rdx,QWORD[24+rsi]
3146	adcx	r11,rbx
3147	adox	r12,rax
3148	adcx	r12,r14
3149	mov	QWORD[40+rdi],r8
3150	mov	QWORD[48+rdi],r9
3151	mulx	rax,r8,QWORD[32+rsi]
3152	adox	r13,rbp
3153	adcx	r13,rbp
3154
3155	mulx	rbx,r9,QWORD[40+rsi]
3156	adcx	r8,r10
3157	adox	r9,rax
3158	mulx	rax,r10,QWORD[48+rsi]
3159	adcx	r9,r11
3160	adox	r10,r12
3161	mulx	r12,r11,QWORD[56+rsi]
3162	mov	rdx,QWORD[32+rsi]
3163	mov	r14,QWORD[40+rsi]
3164	adcx	r10,rbx
3165	adox	r11,rax
3166	mov	r15,QWORD[48+rsi]
3167	adcx	r11,r13
3168	adox	r12,rbp
3169	adcx	r12,rbp
3170
3171	mov	QWORD[56+rdi],r8
3172	mov	QWORD[64+rdi],r9
3173
3174	mulx	rax,r9,r14
3175	mov	r8,QWORD[56+rsi]
3176	adcx	r9,r10
3177	mulx	rbx,r10,r15
3178	adox	r10,rax
3179	adcx	r10,r11
3180	mulx	rax,r11,r8
3181	mov	rdx,r14
3182	adox	r11,rbx
3183	adcx	r11,r12
3184
3185	adcx	rax,rbp
3186
3187	mulx	rbx,r14,r15
3188	mulx	r13,r12,r8
3189	mov	rdx,r15
3190	lea	rsi,[64+rsi]
3191	adcx	r11,r14
3192	adox	r12,rbx
3193	adcx	r12,rax
3194	adox	r13,rbp
3195
3196DB	0x67,0x67
3197	mulx	r14,r8,r8
3198	adcx	r13,r8
3199	adcx	r14,rbp
3200
3201	cmp	rsi,QWORD[((8+8))+rsp]
3202	je	NEAR $L$sqrx8x_outer_break
3203
3204	neg	rcx
3205	mov	rcx,-8
3206	mov	r15,rbp
3207	mov	r8,QWORD[64+rdi]
3208	adcx	r9,QWORD[72+rdi]
3209	adcx	r10,QWORD[80+rdi]
3210	adcx	r11,QWORD[88+rdi]
3211	adc	r12,QWORD[96+rdi]
3212	adc	r13,QWORD[104+rdi]
3213	adc	r14,QWORD[112+rdi]
3214	adc	r15,QWORD[120+rdi]
3215	lea	rbp,[rsi]
3216	lea	rdi,[128+rdi]
3217	sbb	rax,rax
3218
3219	mov	rdx,QWORD[((-64))+rsi]
3220	mov	QWORD[((16+8))+rsp],rax
3221	mov	QWORD[((24+8))+rsp],rdi
3222
3223
3224	xor	eax,eax
3225	jmp	NEAR $L$sqrx8x_loop
3226
3227ALIGN	32
3228$L$sqrx8x_loop:
3229	mov	rbx,r8
3230	mulx	r8,rax,QWORD[rbp]
3231	adcx	rbx,rax
3232	adox	r8,r9
3233
3234	mulx	r9,rax,QWORD[8+rbp]
3235	adcx	r8,rax
3236	adox	r9,r10
3237
3238	mulx	r10,rax,QWORD[16+rbp]
3239	adcx	r9,rax
3240	adox	r10,r11
3241
3242	mulx	r11,rax,QWORD[24+rbp]
3243	adcx	r10,rax
3244	adox	r11,r12
3245
3246DB	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3247	adcx	r11,rax
3248	adox	r12,r13
3249
3250	mulx	r13,rax,QWORD[40+rbp]
3251	adcx	r12,rax
3252	adox	r13,r14
3253
3254	mulx	r14,rax,QWORD[48+rbp]
3255	mov	QWORD[rcx*8+rdi],rbx
3256	mov	ebx,0
3257	adcx	r13,rax
3258	adox	r14,r15
3259
3260DB	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3261	mov	rdx,QWORD[8+rcx*8+rsi]
3262	adcx	r14,rax
3263	adox	r15,rbx
3264	adcx	r15,rbx
3265
3266DB	0x67
3267	inc	rcx
3268	jnz	NEAR $L$sqrx8x_loop
3269
3270	lea	rbp,[64+rbp]
3271	mov	rcx,-8
3272	cmp	rbp,QWORD[((8+8))+rsp]
3273	je	NEAR $L$sqrx8x_break
3274
3275	sub	rbx,QWORD[((16+8))+rsp]
3276DB	0x66
3277	mov	rdx,QWORD[((-64))+rsi]
3278	adcx	r8,QWORD[rdi]
3279	adcx	r9,QWORD[8+rdi]
3280	adc	r10,QWORD[16+rdi]
3281	adc	r11,QWORD[24+rdi]
3282	adc	r12,QWORD[32+rdi]
3283	adc	r13,QWORD[40+rdi]
3284	adc	r14,QWORD[48+rdi]
3285	adc	r15,QWORD[56+rdi]
3286	lea	rdi,[64+rdi]
3287DB	0x67
3288	sbb	rax,rax
3289	xor	ebx,ebx
3290	mov	QWORD[((16+8))+rsp],rax
3291	jmp	NEAR $L$sqrx8x_loop
3292
3293ALIGN	32
3294$L$sqrx8x_break:
3295	xor	rbp,rbp
3296	sub	rbx,QWORD[((16+8))+rsp]
3297	adcx	r8,rbp
3298	mov	rcx,QWORD[((24+8))+rsp]
3299	adcx	r9,rbp
3300	mov	rdx,QWORD[rsi]
3301	adc	r10,0
3302	mov	QWORD[rdi],r8
3303	adc	r11,0
3304	adc	r12,0
3305	adc	r13,0
3306	adc	r14,0
3307	adc	r15,0
3308	cmp	rdi,rcx
3309	je	NEAR $L$sqrx8x_outer_loop
3310
3311	mov	QWORD[8+rdi],r9
3312	mov	r9,QWORD[8+rcx]
3313	mov	QWORD[16+rdi],r10
3314	mov	r10,QWORD[16+rcx]
3315	mov	QWORD[24+rdi],r11
3316	mov	r11,QWORD[24+rcx]
3317	mov	QWORD[32+rdi],r12
3318	mov	r12,QWORD[32+rcx]
3319	mov	QWORD[40+rdi],r13
3320	mov	r13,QWORD[40+rcx]
3321	mov	QWORD[48+rdi],r14
3322	mov	r14,QWORD[48+rcx]
3323	mov	QWORD[56+rdi],r15
3324	mov	r15,QWORD[56+rcx]
3325	mov	rdi,rcx
3326	jmp	NEAR $L$sqrx8x_outer_loop
3327
3328ALIGN	32
3329$L$sqrx8x_outer_break:
3330	mov	QWORD[72+rdi],r9
3331DB	102,72,15,126,217
3332	mov	QWORD[80+rdi],r10
3333	mov	QWORD[88+rdi],r11
3334	mov	QWORD[96+rdi],r12
3335	mov	QWORD[104+rdi],r13
3336	mov	QWORD[112+rdi],r14
3337	lea	rdi,[((48+8))+rsp]
3338	mov	rdx,QWORD[rcx*1+rsi]
3339
3340	mov	r11,QWORD[8+rdi]
3341	xor	r10,r10
3342	mov	r9,QWORD[((0+8))+rsp]
3343	adox	r11,r11
3344	mov	r12,QWORD[16+rdi]
3345	mov	r13,QWORD[24+rdi]
3346
3347
3348ALIGN	32
3349$L$sqrx4x_shift_n_add:
3350	mulx	rbx,rax,rdx
3351	adox	r12,r12
3352	adcx	rax,r10
3353DB	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3354DB	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3355	adox	r13,r13
3356	adcx	rbx,r11
3357	mov	r11,QWORD[40+rdi]
3358	mov	QWORD[rdi],rax
3359	mov	QWORD[8+rdi],rbx
3360
3361	mulx	rbx,rax,rdx
3362	adox	r10,r10
3363	adcx	rax,r12
3364	mov	rdx,QWORD[16+rcx*1+rsi]
3365	mov	r12,QWORD[48+rdi]
3366	adox	r11,r11
3367	adcx	rbx,r13
3368	mov	r13,QWORD[56+rdi]
3369	mov	QWORD[16+rdi],rax
3370	mov	QWORD[24+rdi],rbx
3371
3372	mulx	rbx,rax,rdx
3373	adox	r12,r12
3374	adcx	rax,r10
3375	mov	rdx,QWORD[24+rcx*1+rsi]
3376	lea	rcx,[32+rcx]
3377	mov	r10,QWORD[64+rdi]
3378	adox	r13,r13
3379	adcx	rbx,r11
3380	mov	r11,QWORD[72+rdi]
3381	mov	QWORD[32+rdi],rax
3382	mov	QWORD[40+rdi],rbx
3383
3384	mulx	rbx,rax,rdx
3385	adox	r10,r10
3386	adcx	rax,r12
3387	jrcxz	$L$sqrx4x_shift_n_add_break
3388DB	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3389	adox	r11,r11
3390	adcx	rbx,r13
3391	mov	r12,QWORD[80+rdi]
3392	mov	r13,QWORD[88+rdi]
3393	mov	QWORD[48+rdi],rax
3394	mov	QWORD[56+rdi],rbx
3395	lea	rdi,[64+rdi]
3396	nop
3397	jmp	NEAR $L$sqrx4x_shift_n_add
3398
3399ALIGN	32
3400$L$sqrx4x_shift_n_add_break:
3401	adcx	rbx,r13
3402	mov	QWORD[48+rdi],rax
3403	mov	QWORD[56+rdi],rbx
3404	lea	rdi,[64+rdi]
3405DB	102,72,15,126,213
; __bn_sqrx8x_reduction
;
; Word-by-word reduction over the buffer that starts at [48+8+rsp], handling
; eight 64-bit words per outer pass.  Each outer pass runs $L$sqrx8x_reduce
; once (derives eight multipliers and applies them to the first eight words at
; rbp), then $L$sqrx8x_tail for every further group of eight words at rbp.
;
; Inputs read here:
;   rbp          pointer to the words used as mulx memory operands
;                (presumably the modulus n[] - confirm against callers)
;   r9           byte length; rbp+r9-64 is stored as the end marker
;   rdi          upper limit for the outer loop (stored at [8+8+rsp])
;   [32+8+rsp]   factor used to derive each multiplier
;                (presumably n0 - confirm against callers)
;   xmm2, xmm3   reloaded into rbp and rcx at the end of every outer pass
;                (set up by code outside this view)
; Stack slots written:
;   [0+8+rsp]    end marker for rbp
;   [8+8+rsp]    saved rdi (outer loop limit)
;   [16+8+rsp]   0 / -1 carry carried between tail passes
;   [24+8+rsp]   top carry handed from one outer pass to the next
;   [48+8+rsp]   .. [48+8+56+rsp]: the eight multipliers of the current pass
; On the final fall-through rax holds the last top carry (0 or 1) and rsi the
; word at [56+rbp]; rsi is not consumed inside this routine.
3406__bn_sqrx8x_reduction:
3407	xor	eax,eax	; rax = 0: top carry going into the first pass
3408	mov	rbx,QWORD[((32+8))+rsp]	; rbx = multiplier-derivation factor
3409	mov	rdx,QWORD[((48+8))+rsp]	; rdx = first word of the buffer
3410	lea	rcx,[((-64))+r9*1+rbp]	; rcx = rbp + r9 - 64
3411
3412	mov	QWORD[((0+8))+rsp],rcx	; end marker compared against rbp below
3413	mov	QWORD[((8+8))+rsp],rdi	; limit compared against r8 at the loop bottom
3414
3415	lea	rdi,[((48+8))+rsp]	; rdi = start of the buffer
3416	jmp	NEAR $L$sqrx8x_reduction_loop
3417
3418ALIGN	32
; Outer pass: load eight words (word 0 arrives in rdx) into r8..r15.
3419$L$sqrx8x_reduction_loop:
3420	mov	r9,QWORD[8+rdi]
3421	mov	r10,QWORD[16+rdi]
3422	mov	r11,QWORD[24+rdi]
3423	mov	r12,QWORD[32+rdi]
3424	mov	r8,rdx	; r8 = word 0
3425	imul	rdx,rbx	; rdx = first multiplier = word 0 * factor (low 64 bits)
3426	mov	r13,QWORD[40+rdi]
3427	mov	r14,QWORD[48+rdi]
3428	mov	r15,QWORD[56+rdi]
3429	mov	QWORD[((24+8))+rsp],rax	; park the top carry of the previous pass
3430
3431	lea	rdi,[64+rdi]
3432	xor	rsi,rsi	; rsi = 0, also clears CF and OF for the adcx/adox chains
3433	mov	rcx,-8	; eight iterations, counted up to zero
3434	jmp	NEAR $L$sqrx8x_reduce
3435
3436ALIGN	32
; One iteration: add rdx * (eight words at rbp) into r8..r15, shifting the
; window down by one word.  adcx carries through CF, adox through OF.
3437$L$sqrx8x_reduce:
3438	mov	rbx,r8
3439	mulx	r8,rax,QWORD[rbp]
3440	adcx	rax,rbx	; only the carry-out matters; rax is overwritten below
3441	adox	r8,r9
3442
3443	mulx	r9,rbx,QWORD[8+rbp]
3444	adcx	r8,rbx
3445	adox	r9,r10
3446
3447	mulx	r10,rbx,QWORD[16+rbp]
3448	adcx	r9,rbx
3449	adox	r10,r11
3450
3451	mulx	r11,rbx,QWORD[24+rbp]
3452	adcx	r10,rbx
3453	adox	r11,r12
3454
3455DB	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	; mulx r12,rbx,QWORD[32+rbp] (disp32 form)
3456	mov	rax,rdx	; keep the current multiplier
3457	mov	rdx,r8	; rdx = new lowest word
3458	adcx	r11,rbx
3459	adox	r12,r13
3460
3461	mulx	rdx,rbx,QWORD[((32+8))+rsp]	; rbx = next multiplier (low half); high half in rdx is discarded
3462	mov	rdx,rax	; back to the current multiplier
3463	mov	QWORD[((64+48+8))+rcx*8+rsp],rax	; save it for the tail passes
3464
3465	mulx	r13,rax,QWORD[40+rbp]
3466	adcx	r12,rax
3467	adox	r13,r14
3468
3469	mulx	r14,rax,QWORD[48+rbp]
3470	adcx	r13,rax
3471	adox	r14,r15
3472
3473	mulx	r15,rax,QWORD[56+rbp]
3474	mov	rdx,rbx	; switch to the next multiplier
3475	adcx	r14,rax
3476	adox	r15,rsi	; rsi = 0: fold the OF chain into the top word
3477	adcx	r15,rsi	; rsi = 0: fold the CF chain into the top word
3478
3479DB	0x67,0x67,0x67	; prefix bytes attached to the inc below (presumably alignment padding)
3480	inc	rcx
3481	jnz	NEAR $L$sqrx8x_reduce
3482
3483	mov	rax,rsi	; rax = 0
3484	cmp	rbp,QWORD[((0+8))+rsp]
3485	jae	NEAR $L$sqrx8x_no_tail	; no further words at rbp; CF = 0 on this path
3486
; Set up the first tail pass: add in the next eight buffer words and advance
; rbp to the next eight multiplicand words.
3487	mov	rdx,QWORD[((48+8))+rsp]	; first saved multiplier
3488	add	r8,QWORD[rdi]
3489	lea	rbp,[64+rbp]
3490	mov	rcx,-8
3491	adcx	r9,QWORD[8+rdi]
3492	adcx	r10,QWORD[16+rdi]
3493	adc	r11,QWORD[24+rdi]
3494	adc	r12,QWORD[32+rdi]
3495	adc	r13,QWORD[40+rdi]
3496	adc	r14,QWORD[48+rdi]
3497	adc	r15,QWORD[56+rdi]
3498	lea	rdi,[64+rdi]
3499	sbb	rax,rax	; rax = 0 or -1 according to the carry out
3500
3501	xor	rsi,rsi	; rsi = 0, clears CF and OF
3502	mov	QWORD[((16+8))+rsp],rax	; park the carry
3503	jmp	NEAR $L$sqrx8x_tail
3504
3505ALIGN	32
; Tail iteration: same multiply-accumulate as above, but the multiplier comes
; from the saved list and the lowest result word is written back to the buffer.
3506$L$sqrx8x_tail:
3507	mov	rbx,r8
3508	mulx	r8,rax,QWORD[rbp]
3509	adcx	rbx,rax
3510	adox	r8,r9
3511
3512	mulx	r9,rax,QWORD[8+rbp]
3513	adcx	r8,rax
3514	adox	r9,r10
3515
3516	mulx	r10,rax,QWORD[16+rbp]
3517	adcx	r9,rax
3518	adox	r10,r11
3519
3520	mulx	r11,rax,QWORD[24+rbp]
3521	adcx	r10,rax
3522	adox	r11,r12
3523
3524DB	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	; mulx r12,rax,QWORD[32+rbp] (disp32 form)
3525	adcx	r11,rax
3526	adox	r12,r13
3527
3528	mulx	r13,rax,QWORD[40+rbp]
3529	adcx	r12,rax
3530	adox	r13,r14
3531
3532	mulx	r14,rax,QWORD[48+rbp]
3533	adcx	r13,rax
3534	adox	r14,r15
3535
3536	mulx	r15,rax,QWORD[56+rbp]
3537	mov	rdx,QWORD[((72+48+8))+rcx*8+rsp]	; next saved multiplier
3538	adcx	r14,rax
3539	adox	r15,rsi
3540	mov	QWORD[rcx*8+rdi],rbx	; store the finished low word
3541	mov	rbx,r8
3542	adcx	r15,rsi
3543
3544	inc	rcx
3545	jnz	NEAR $L$sqrx8x_tail
3546
3547	cmp	rbp,QWORD[((0+8))+rsp]
3548	jae	NEAR $L$sqrx8x_tail_done
3549
; More words remain: restore the parked carry, add the next eight buffer
; words, advance rbp and rdi, and run another tail pass.
3550	sub	rsi,QWORD[((16+8))+rsp]	; rsi = 0, so CF = 1 iff the parked value is -1
3551	mov	rdx,QWORD[((48+8))+rsp]	; first saved multiplier again
3552	lea	rbp,[64+rbp]
3553	adc	r8,QWORD[rdi]
3554	adc	r9,QWORD[8+rdi]
3555	adc	r10,QWORD[16+rdi]
3556	adc	r11,QWORD[24+rdi]
3557	adc	r12,QWORD[32+rdi]
3558	adc	r13,QWORD[40+rdi]
3559	adc	r14,QWORD[48+rdi]
3560	adc	r15,QWORD[56+rdi]
3561	lea	rdi,[64+rdi]
3562	sbb	rax,rax
3563	sub	rcx,8	; rcx is 0 here, so this resets the counter to -8
3564
3565	xor	rsi,rsi
3566	mov	QWORD[((16+8))+rsp],rax
3567	jmp	NEAR $L$sqrx8x_tail
3568
3569ALIGN	32
; End of the last tail pass: add the top carry left by the previous outer
; pass, then re-create CF from the parked tail carry.
3570$L$sqrx8x_tail_done:
3571	xor	rax,rax
3572	add	r8,QWORD[((24+8))+rsp]
3573	adc	r9,0
3574	adc	r10,0
3575	adc	r11,0
3576	adc	r12,0
3577	adc	r13,0
3578	adc	r14,0
3579	adc	r15,0
3580	adc	rax,0
3581
3582	sub	rsi,QWORD[((16+8))+rsp]	; CF = 1 iff the parked value is -1
; Add the eight buffer words at rdi into r8..r15, store the result there, and
; step to the next outer pass.
3583$L$sqrx8x_no_tail:
3584	adc	r8,QWORD[rdi]
3585DB	102,72,15,126,217	; movq rcx,xmm3
3586	adc	r9,QWORD[8+rdi]
3587	mov	rsi,QWORD[56+rbp]	; loaded before rbp is reset; not used again in this routine
3588DB	102,72,15,126,213	; movq rbp,xmm2
3589	adc	r10,QWORD[16+rdi]
3590	adc	r11,QWORD[24+rdi]
3591	adc	r12,QWORD[32+rdi]
3592	adc	r13,QWORD[40+rdi]
3593	adc	r14,QWORD[48+rdi]
3594	adc	r15,QWORD[56+rdi]
3595	adc	rax,0	; rax = top carry of this pass
3596
3597	mov	rbx,QWORD[((32+8))+rsp]	; reload the multiplier-derivation factor
3598	mov	rdx,QWORD[64+rcx*1+rdi]	; word 0 of the next pass
3599
3600	mov	QWORD[rdi],r8
3601	lea	r8,[64+rdi]	; r8 = end of the words just written
3602	mov	QWORD[8+rdi],r9
3603	mov	QWORD[16+rdi],r10
3604	mov	QWORD[24+rdi],r11
3605	mov	QWORD[32+rdi],r12
3606	mov	QWORD[40+rdi],r13
3607	mov	QWORD[48+rdi],r14
3608	mov	QWORD[56+rdi],r15
3609
3610	lea	rdi,[64+rcx*1+rdi]	; rdi = window for the next pass
3611	cmp	r8,QWORD[((8+8))+rsp]
3612	jb	NEAR $L$sqrx8x_reduction_loop	; continue until r8 reaches the saved limit
3613	DB	0F3h,0C3h		;repret
3614
3615
3616ALIGN	32
3617
; __bn_postx4x_internal
;
; Four words per iteration, computes
;     out[i] = buf[i] + (~src[i] & mask) + carry
; where src is read through rbp, buf through rdi, out is written through rdx
; and mask = -rax.  Word 0 of src is decremented before the complement, so the
; complemented words together form the two's-complement negation of src; with
; an all-ones mask the loop therefore stores buf - src, and with a zero mask it
; stores buf unchanged.  (mask is all-ones or zero if rax is 1 or 0 on entry -
; presumably the carry left by the preceding reduction; confirm at call sites.)
;
; Inputs:  rbp = src, rdi = buf, rax = selector, xmm1 = output pointer,
;          rcx = loop length; it is shifted right by 5 and incremented until
;          it reaches zero, so it is expected to be a negative byte count.
; Outputs: r9 = -rcx(entry), r10 = rcx(entry), rsi = xmm1,
;          rdx/rdi/rbp advanced past the processed words.
3618__bn_postx4x_internal:
3619
3620	mov	r12,QWORD[rbp]
3621	mov	r10,rcx	; keep the entry value of rcx
3622	mov	r9,rcx	; second copy, negated before returning
3623	neg	rax	; rax = mask for the andn instructions below
3624	sar	rcx,3+2	; bytes -> number of 4-word groups (sign preserved)
3625
3626DB	102,72,15,126,202	; movq rdx,xmm1
3627DB	102,72,15,126,206	; movq rsi,xmm1
3628	dec	r12	; ~(src[0]-1) == -src[0]
3629	mov	r13,QWORD[8+rbp]
3630	xor	r8,r8	; r8 = 0: no carry into the first group
3631	mov	r14,QWORD[16+rbp]
3632	mov	r15,QWORD[24+rbp]
3633	jmp	NEAR $L$sqrx4x_sub_entry
3634
3635ALIGN	16
3636$L$sqrx4x_sub:
3637	mov	r12,QWORD[rbp]
3638	mov	r13,QWORD[8+rbp]
3639	mov	r14,QWORD[16+rbp]
3640	mov	r15,QWORD[24+rbp]
3641$L$sqrx4x_sub_entry:
3642	andn	r12,r12,rax	; r12 = ~r12 & mask
3643	lea	rbp,[32+rbp]
3644	andn	r13,r13,rax
3645	andn	r14,r14,rax
3646	andn	r15,r15,rax
3647
3648	neg	r8	; r8 is 0 or -1: sets CF = (r8 != 0), i.e. restores the carry
3649	adc	r12,QWORD[rdi]
3650	adc	r13,QWORD[8+rdi]
3651	adc	r14,QWORD[16+rdi]
3652	adc	r15,QWORD[24+rdi]
3653	mov	QWORD[rdx],r12
3654	lea	rdi,[32+rdi]
3655	mov	QWORD[8+rdx],r13
3656	sbb	r8,r8	; save the carry as 0 / -1 (inc below does not preserve it for the next adc)
3657	mov	QWORD[16+rdx],r14
3658	mov	QWORD[24+rdx],r15
3659	lea	rdx,[32+rdx]
3660
3661	inc	rcx
3662	jnz	NEAR $L$sqrx4x_sub
3663
3664	neg	r9	; r9 = -(entry rcx)
3665
3666	DB	0F3h,0C3h		;repret
3667
3668
3669global	bn_scatter5
3670
3671ALIGN	16
; bn_scatter5
;
; Copies edx consecutive 64-bit words from [rcx] into a table at r8, placing
; word i at byte offset i*256 + r9*8.  With 8-byte entries a 256-byte row
; holds 32 entries, so r9 selects the column written in each row.
;
; In (Win64 register arguments, used directly):
;   rcx = source words, edx = word count, r8 = table base, r9 = column index
; Returns immediately when edx is zero.
; Clobbers rax, rcx, edx, r8 and flags.
3672bn_scatter5:
3673
3674	cmp	edx,0
3675	jz	NEAR $L$scatter_epilogue	; nothing to copy
3676	lea	r8,[r9*8+r8]	; r8 = &table[column r9] in row 0
3677$L$scatter:
3678	mov	rax,QWORD[rcx]
3679	lea	rcx,[8+rcx]	; next source word
3680	mov	QWORD[r8],rax
3681	lea	r8,[256+r8]	; same column, next 256-byte row
3682	sub	edx,1
3683	jnz	NEAR $L$scatter
3684$L$scatter_epilogue:
3685	DB	0F3h,0C3h		;repret
3686
3687
3688
3689global	bn_gather5
3690
3691ALIGN	32
; bn_gather5
;
; Inverse of bn_scatter5: for each of edx rows of 256 bytes starting at r8,
; extracts the 64-bit entry whose column number equals r9d and stores it to
; consecutive words at [rcx].
;
; The entry is selected with masks rather than with an index-dependent
; address: sixteen 16-byte masks are built on the stack (mask k is all-ones in
; its low qword when r9d == 2k and in its high qword when r9d == 2k+1), every
; row is read in full, ANDed with the masks and ORed together.  The addresses
; touched therefore depend only on r8, rcx and edx, not on r9d.
;
; In (Win64 register arguments, used directly):
;   rcx = output words, edx = row count, r8 = table base, r9d = column index
; Requirements visible in the code:
;   - the table is read with movdqa, so r8 must be 16-byte aligned;
;   - the row count is tested only after the first row has been processed, so
;     edx must be non-zero (there is no early exit as in bn_scatter5).
; Clobbers rax, rcx, edx, r10, r11, xmm0-xmm5 and flags.
; The prologue is hand-encoded so that its byte layout matches the unwind
; codes in $L$SEH_info_bn_gather5.
3692bn_gather5:
3693
3694$L$SEH_begin_bn_gather5:
3695
3696DB	0x4c,0x8d,0x14,0x24	; lea r10,[rsp]  (r10 keeps the entry rsp)
3697
3698DB	0x48,0x81,0xec,0x08,0x01,0x00,0x00	; sub rsp,0x108
3699	lea	rax,[$L$inc]
3700	and	rsp,-16	; 16-byte align the mask area for movdqa
3701
3702	movd	xmm5,r9d
3703	movdqa	xmm0,XMMWORD[rax]	; xmm0 = {0,0,1,1}
3704	movdqa	xmm1,XMMWORD[16+rax]	; xmm1 = {2,2,2,2}
3705	lea	r11,[128+r8]	; table pointer biased by +128 (displacements then span -128..112)
3706	lea	rax,[128+rsp]	; mask pointer biased the same way
3707
3708	pshufd	xmm5,xmm5,0	; broadcast the index to all four dwords
3709	movdqa	xmm4,xmm1	; xmm4 = {2,2,2,2}: step between successive masks
3710	movdqa	xmm2,xmm1
; Each group below advances one counter by the step and turns the previous
; counter into a mask by comparing it with the broadcast index.
3711	paddd	xmm1,xmm0
3712	pcmpeqd	xmm0,xmm5
3713	movdqa	xmm3,xmm4
3714
3715	paddd	xmm2,xmm1
3716	pcmpeqd	xmm1,xmm5
3717	movdqa	XMMWORD[(-128)+rax],xmm0	; mask for columns 0,1
3718	movdqa	xmm0,xmm4
3719
3720	paddd	xmm3,xmm2
3721	pcmpeqd	xmm2,xmm5
3722	movdqa	XMMWORD[(-112)+rax],xmm1	; columns 2,3
3723	movdqa	xmm1,xmm4
3724
3725	paddd	xmm0,xmm3
3726	pcmpeqd	xmm3,xmm5
3727	movdqa	XMMWORD[(-96)+rax],xmm2	; columns 4,5
3728	movdqa	xmm2,xmm4
3729	paddd	xmm1,xmm0
3730	pcmpeqd	xmm0,xmm5
3731	movdqa	XMMWORD[(-80)+rax],xmm3	; columns 6,7
3732	movdqa	xmm3,xmm4
3733
3734	paddd	xmm2,xmm1
3735	pcmpeqd	xmm1,xmm5
3736	movdqa	XMMWORD[(-64)+rax],xmm0	; columns 8,9
3737	movdqa	xmm0,xmm4
3738
3739	paddd	xmm3,xmm2
3740	pcmpeqd	xmm2,xmm5
3741	movdqa	XMMWORD[(-48)+rax],xmm1	; columns 10,11
3742	movdqa	xmm1,xmm4
3743
3744	paddd	xmm0,xmm3
3745	pcmpeqd	xmm3,xmm5
3746	movdqa	XMMWORD[(-32)+rax],xmm2	; columns 12,13
3747	movdqa	xmm2,xmm4
3748	paddd	xmm1,xmm0
3749	pcmpeqd	xmm0,xmm5
3750	movdqa	XMMWORD[(-16)+rax],xmm3	; columns 14,15
3751	movdqa	xmm3,xmm4
3752
3753	paddd	xmm2,xmm1
3754	pcmpeqd	xmm1,xmm5
3755	movdqa	XMMWORD[rax],xmm0	; columns 16,17
3756	movdqa	xmm0,xmm4
3757
3758	paddd	xmm3,xmm2
3759	pcmpeqd	xmm2,xmm5
3760	movdqa	XMMWORD[16+rax],xmm1	; columns 18,19
3761	movdqa	xmm1,xmm4
3762
3763	paddd	xmm0,xmm3
3764	pcmpeqd	xmm3,xmm5
3765	movdqa	XMMWORD[32+rax],xmm2	; columns 20,21
3766	movdqa	xmm2,xmm4
3767	paddd	xmm1,xmm0
3768	pcmpeqd	xmm0,xmm5
3769	movdqa	XMMWORD[48+rax],xmm3	; columns 22,23
3770	movdqa	xmm3,xmm4
3771
3772	paddd	xmm2,xmm1
3773	pcmpeqd	xmm1,xmm5
3774	movdqa	XMMWORD[64+rax],xmm0	; columns 24,25
3775	movdqa	xmm0,xmm4
3776
3777	paddd	xmm3,xmm2
3778	pcmpeqd	xmm2,xmm5
3779	movdqa	XMMWORD[80+rax],xmm1	; columns 26,27
3780	movdqa	xmm1,xmm4
3781
3782	paddd	xmm0,xmm3
3783	pcmpeqd	xmm3,xmm5
3784	movdqa	XMMWORD[96+rax],xmm2	; columns 28,29
3785	movdqa	xmm2,xmm4
3786	movdqa	XMMWORD[112+rax],xmm3	; columns 30,31
3787	jmp	NEAR $L$gather
3788
3789ALIGN	32
; One row per iteration: xmm4 and xmm5 are two OR accumulators over the
; sixteen masked 16-byte chunks of the row.
3790$L$gather:
3791	pxor	xmm4,xmm4
3792	pxor	xmm5,xmm5
3793	movdqa	xmm0,XMMWORD[((-128))+r11]
3794	movdqa	xmm1,XMMWORD[((-112))+r11]
3795	movdqa	xmm2,XMMWORD[((-96))+r11]
3796	pand	xmm0,XMMWORD[((-128))+rax]
3797	movdqa	xmm3,XMMWORD[((-80))+r11]
3798	pand	xmm1,XMMWORD[((-112))+rax]
3799	por	xmm4,xmm0
3800	pand	xmm2,XMMWORD[((-96))+rax]
3801	por	xmm5,xmm1
3802	pand	xmm3,XMMWORD[((-80))+rax]
3803	por	xmm4,xmm2
3804	por	xmm5,xmm3
3805	movdqa	xmm0,XMMWORD[((-64))+r11]
3806	movdqa	xmm1,XMMWORD[((-48))+r11]
3807	movdqa	xmm2,XMMWORD[((-32))+r11]
3808	pand	xmm0,XMMWORD[((-64))+rax]
3809	movdqa	xmm3,XMMWORD[((-16))+r11]
3810	pand	xmm1,XMMWORD[((-48))+rax]
3811	por	xmm4,xmm0
3812	pand	xmm2,XMMWORD[((-32))+rax]
3813	por	xmm5,xmm1
3814	pand	xmm3,XMMWORD[((-16))+rax]
3815	por	xmm4,xmm2
3816	por	xmm5,xmm3
3817	movdqa	xmm0,XMMWORD[r11]
3818	movdqa	xmm1,XMMWORD[16+r11]
3819	movdqa	xmm2,XMMWORD[32+r11]
3820	pand	xmm0,XMMWORD[rax]
3821	movdqa	xmm3,XMMWORD[48+r11]
3822	pand	xmm1,XMMWORD[16+rax]
3823	por	xmm4,xmm0
3824	pand	xmm2,XMMWORD[32+rax]
3825	por	xmm5,xmm1
3826	pand	xmm3,XMMWORD[48+rax]
3827	por	xmm4,xmm2
3828	por	xmm5,xmm3
3829	movdqa	xmm0,XMMWORD[64+r11]
3830	movdqa	xmm1,XMMWORD[80+r11]
3831	movdqa	xmm2,XMMWORD[96+r11]
3832	pand	xmm0,XMMWORD[64+rax]
3833	movdqa	xmm3,XMMWORD[112+r11]
3834	pand	xmm1,XMMWORD[80+rax]
3835	por	xmm4,xmm0
3836	pand	xmm2,XMMWORD[96+rax]
3837	por	xmm5,xmm1
3838	pand	xmm3,XMMWORD[112+rax]
3839	por	xmm4,xmm2
3840	por	xmm5,xmm3
3841	por	xmm4,xmm5	; merge the two accumulators
3842	lea	r11,[256+r11]	; next row
3843	pshufd	xmm0,xmm4,0x4e	; swap the two qwords
3844	por	xmm0,xmm4	; low qword = even-column OR odd-column result
3845	movq	QWORD[rcx],xmm0
3846	lea	rcx,[8+rcx]
3847	sub	edx,1
3848	jnz	NEAR $L$gather
3849
3850	lea	rsp,[r10]	; restore the entry rsp saved by the prologue
3851
3852	DB	0F3h,0C3h		;repret
3854
3855
3856ALIGN	64
; $L$inc: constants for the gather mask generators in this file.
;   first 16 bytes  - starting dword lanes {0,0,1,1}
;   next 16 bytes   - per-step increment {2,2,2,2}
; They are followed by a NUL-terminated ASCII banner that decodes to
; "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by
; <appro@openssl.org>".
3857$L$inc:
3858	DD	0,0,1,1
3859	DD	2,2,2,2
3860DB	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
3861DB	112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115
3862DB	99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111
3863DB	114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79
3864DB	71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111
3865DB	112,101,110,115,115,108,46,111,114,103,62,0
3866EXTERN	__imp_RtlVirtualUnwind
3867
3868ALIGN	16
; mul_handler
;
; Win64 language-specific exception handler shared by the functions whose
; .xdata entries name it.  It repairs the CONTEXT record so that
; unwinding can continue past a function that has replaced rsp with a
; computed value, then asks RtlVirtualUnwind to perform the unwind.
;
; In:  r8 = CONTEXT*, r9 = DISPATCHER_CONTEXT* (third and fourth handler
;      arguments).  The numeric displacements below are field offsets in those
;      two structures: CONTEXT Rax=120, Rbx=144, Rsp=152, Rbp=160, Rsi=168,
;      Rdi=176, R9=192, R12..R15=216..240, Rip=248; DISPATCHER_CONTEXT
;      ControlPc=0, ImageBase=8, FunctionEntry=16, EstablisherFrame=24,
;      ContextRecord=40, HandlerData=56.
;      HandlerData points at three image-relative label addresses
;      (see the .xdata entries): HandlerData[0], [1] and [2].
; Out: eax = 1 (ExceptionContinueSearch).
;
; Cases, by faulting Rip:
;   Rip <  HandlerData[0]            nothing pushed yet; the frame base is the
;                                    value in context Rax
;   Rip <  HandlerData[1]            registers pushed, rsp not yet replaced;
;                                    frame base is still context Rax
;   Rip >= HandlerData[2]            past the epilogue label; use context Rsp
;   otherwise (function body)        frame base is the stack pointer the
;                                    function saved in its own frame
3869mul_handler:
3870	push	rsi
3871	push	rdi
3872	push	rbx
3873	push	rbp
3874	push	r12
3875	push	r13
3876	push	r14
3877	push	r15
3878	pushfq
3879	sub	rsp,64	; 32 bytes shadow space + 4 stack arguments for the call below
3880
3881	mov	rax,QWORD[120+r8]	; context->Rax
3882	mov	rbx,QWORD[248+r8]	; context->Rip
3883
3884	mov	rsi,QWORD[8+r9]	; disp->ImageBase
3885	mov	r11,QWORD[56+r9]	; disp->HandlerData
3886
3887	mov	r10d,DWORD[r11]	; HandlerData[0]
3888	lea	r10,[r10*1+rsi]	; image-relative -> absolute address
3889	cmp	rbx,r10
3890	jb	NEAR $L$common_seh_tail
3891
3892	mov	r10d,DWORD[4+r11]	; HandlerData[1]
3893	lea	r10,[r10*1+rsi]
3894	cmp	rbx,r10
3895	jb	NEAR $L$common_pop_regs
3896
3897	mov	rax,QWORD[152+r8]	; context->Rsp
3898
3899	mov	r10d,DWORD[8+r11]	; HandlerData[2]
3900	lea	r10,[r10*1+rsi]
3901	cmp	rbx,r10
3902	jae	NEAR $L$common_seh_tail
3903
3904	lea	r10,[$L$mul_epilogue]
3905	cmp	rbx,r10
3906	ja	NEAR $L$body_40	; Rip lies beyond bn_mul_mont_gather5's epilogue label
3907
3908	mov	r10,QWORD[192+r8]	; context->R9
3909	mov	rax,QWORD[8+r10*8+rax]	; saved stack pointer at [8+r9*8+rsp], as stored before $L$mul_body
3910
3911	jmp	NEAR $L$common_pop_regs
3912
3913$L$body_40:
3914	mov	rax,QWORD[40+rax]	; the remaining functions keep the saved stack pointer at [40+rsp]
; rax = frame base: the six pushed registers sit immediately below it.
3915$L$common_pop_regs:
3916	mov	rbx,QWORD[((-8))+rax]
3917	mov	rbp,QWORD[((-16))+rax]
3918	mov	r12,QWORD[((-24))+rax]
3919	mov	r13,QWORD[((-32))+rax]
3920	mov	r14,QWORD[((-40))+rax]
3921	mov	r15,QWORD[((-48))+rax]
3922	mov	QWORD[144+r8],rbx	; context->Rbx
3923	mov	QWORD[160+r8],rbp	; context->Rbp
3924	mov	QWORD[216+r8],r12	; context->R12
3925	mov	QWORD[224+r8],r13	; context->R13
3926	mov	QWORD[232+r8],r14	; context->R14
3927	mov	QWORD[240+r8],r15	; context->R15
3928
; rdi and rsi were spilled to the caller-provided slots at [8+rsp] and
; [16+rsp] by the WIN64 prologue; recover them relative to the frame base.
3929$L$common_seh_tail:
3930	mov	rdi,QWORD[8+rax]
3931	mov	rsi,QWORD[16+rax]
3932	mov	QWORD[152+r8],rax	; context->Rsp
3933	mov	QWORD[168+r8],rsi	; context->Rsi
3934	mov	QWORD[176+r8],rdi	; context->Rdi
3935
3936	mov	rdi,QWORD[40+r9]	; destination: disp->ContextRecord
3937	mov	rsi,r8	; source: the repaired context
3938	mov	ecx,154	; 154 qwords = 1232 bytes
3939	DD	0xa548f3fc	; bytes fc f3 48 a5: cld ; rep movsq
3940
; RtlVirtualUnwind(0, ImageBase, ControlPc, FunctionEntry, ContextRecord,
;                  &HandlerData, &EstablisherFrame, NULL)
3941	mov	rsi,r9
3942	xor	rcx,rcx	; arg1: HandlerType = 0
3943	mov	rdx,QWORD[8+rsi]	; arg2: disp->ImageBase
3944	mov	r8,QWORD[rsi]	; arg3: disp->ControlPc
3945	mov	r9,QWORD[16+rsi]	; arg4: disp->FunctionEntry
3946	mov	r10,QWORD[40+rsi]	; disp->ContextRecord
3947	lea	r11,[56+rsi]	; &disp->HandlerData
3948	lea	r12,[24+rsi]	; &disp->EstablisherFrame
3949	mov	QWORD[32+rsp],r10	; arg5
3950	mov	QWORD[40+rsp],r11	; arg6
3951	mov	QWORD[48+rsp],r12	; arg7
3952	mov	QWORD[56+rsp],rcx	; arg8: NULL
3953	call	QWORD[__imp_RtlVirtualUnwind]
3954
3955	mov	eax,1	; ExceptionContinueSearch
3956	add	rsp,64
3957	popfq
3958	pop	r15
3959	pop	r14
3960	pop	r13
3961	pop	r12
3962	pop	rbp
3963	pop	rbx
3964	pop	rdi
3965	pop	rsi
3966	DB	0F3h,0C3h		;repret
3967
3968
; .pdata: Win64 function table.  One three-dword entry per function, all
; image-relative: start address, end address, address of its unwind info in
; .xdata.
3969section	.pdata rdata align=4
3970ALIGN	4
3971	DD	$L$SEH_begin_bn_mul_mont_gather5 wrt ..imagebase
3972	DD	$L$SEH_end_bn_mul_mont_gather5 wrt ..imagebase
3973	DD	$L$SEH_info_bn_mul_mont_gather5 wrt ..imagebase
3974
3975	DD	$L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase
3976	DD	$L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase
3977	DD	$L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase
3978
3979	DD	$L$SEH_begin_bn_power5 wrt ..imagebase
3980	DD	$L$SEH_end_bn_power5 wrt ..imagebase
3981	DD	$L$SEH_info_bn_power5 wrt ..imagebase
3982
3983	DD	$L$SEH_begin_bn_from_mont8x wrt ..imagebase
3984	DD	$L$SEH_end_bn_from_mont8x wrt ..imagebase
3985	DD	$L$SEH_info_bn_from_mont8x wrt ..imagebase
3986	DD	$L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase
3987	DD	$L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase
3988	DD	$L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase
3989
3990	DD	$L$SEH_begin_bn_powerx5 wrt ..imagebase
3991	DD	$L$SEH_end_bn_powerx5 wrt ..imagebase
3992	DD	$L$SEH_info_bn_powerx5 wrt ..imagebase
3993	DD	$L$SEH_begin_bn_gather5 wrt ..imagebase
3994	DD	$L$SEH_end_bn_gather5 wrt ..imagebase
3995	DD	$L$SEH_info_bn_gather5 wrt ..imagebase
3996
; .xdata: unwind information referenced from .pdata.
;
; The first six entries share one shape:
;   DB 9,0,0,0     version 1 with the exception-handler flag set (9 = 1 | 1<<3),
;                  prologue size 0, no unwind codes, no frame register
;   DD handler     image-relative address of mul_handler
;   DD a,b,c       handler data: the three label addresses mul_handler reads
;                  as HandlerData[0], HandlerData[1] and HandlerData[2]
; The bn_gather5 entry instead describes its prologue with unwind codes and
; has no handler.
3997section	.xdata rdata align=8
3998ALIGN	8
3999$L$SEH_info_bn_mul_mont_gather5:
4000DB	9,0,0,0
4001	DD	mul_handler wrt ..imagebase
4002	DD	$L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
4003ALIGN	8
4004$L$SEH_info_bn_mul4x_mont_gather5:
4005DB	9,0,0,0
4006	DD	mul_handler wrt ..imagebase
4007	DD	$L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
4008ALIGN	8
4009$L$SEH_info_bn_power5:
4010DB	9,0,0,0
4011	DD	mul_handler wrt ..imagebase
4012	DD	$L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase
4013ALIGN	8
4014$L$SEH_info_bn_from_mont8x:
4015DB	9,0,0,0
4016	DD	mul_handler wrt ..imagebase
4017	DD	$L$from_prologue wrt ..imagebase,$L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase
4018ALIGN	8
4019$L$SEH_info_bn_mulx4x_mont_gather5:
4020DB	9,0,0,0
4021	DD	mul_handler wrt ..imagebase
4022	DD	$L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase
4023ALIGN	8
4024$L$SEH_info_bn_powerx5:
4025DB	9,0,0,0
4026	DD	mul_handler wrt ..imagebase
4027	DD	$L$powerx5_prologue wrt ..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase
4028ALIGN	8
4029$L$SEH_info_bn_gather5:
4030DB	0x01,0x0b,0x03,0x0a	; version 1, no flags; prologue is 0x0b bytes; 3 code slots; frame register r10, offset 0
4031DB	0x0b,0x01,0x21,0x00	; at prologue offset 0x0b: large allocation of 0x21*8 = 0x108 bytes (the sub rsp,0x108)
4032DB	0x04,0xa3,0x00,0x00	; at prologue offset 0x04: frame register established (the lea r10,[rsp]); last slot is padding
4033ALIGN	8
4034