• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
2.text
3
4.extern	OPENSSL_ia32cap_P
5.hidden OPENSSL_ia32cap_P
6
// bn_mul_mont: Montgomery multiplication, generic one-word-at-a-time path
// plus the dispatcher for the unrolled variants.
//
// Inputs, as consumed by the code (System V AMD64 argument registers):
//   %rdi  rp   result buffer, num 64-bit words (written)
//   %rsi  ap   first operand, num words (read)
//   %rdx  bp   second operand, num words (read)
//   %rcx  np   modulus, num words (read)
//   %r8        pointer to one 64-bit word n0 (read once)
//   %r9d  num  word count, treated as unsigned 32-bit
// Output: %rax = 1. %rbx, %rbp and %r12-%r15 are saved and restored.
// Scratch: tp[0..num] on the stack plus one slot holding the entry %rsp.
// NOTE(review): n0 is presumably -np[0]^-1 mod 2^64 and the operands are
// presumably already reduced; nothing here checks either - confirm in caller.
// NOTE(review): the generic loop preloads ap[1] and exits on index == num,
// so it needs num >= 2 - confirm the caller never passes less.
7.globl	bn_mul_mont
8.hidden bn_mul_mont
9.type	bn_mul_mont,@function
10.align	16
11bn_mul_mont:
12.cfi_startproc
13	movl	%r9d,%r9d	// zero-extend num into the full %r9
14	movq	%rsp,%rax	// %rax = entry %rsp until it is spilled to the frame
15.cfi_def_cfa_register	%rax
// Dispatch on num and on whether both operands are the same buffer:
//   num % 4 != 0 or num < 8     -> generic loop (.Lmul_enter)
//   bp != ap                    -> .Lmul4x_enter (inside bn_mul4x_mont)
//   bp == ap and num % 8 == 0   -> .Lsqr8x_enter (inside bn_sqr8x_mont)
//   bp == ap otherwise          -> .Lmul4x_enter
// Both foreign labels sit after their function's own "movq %rsp,%rax", so
// %rax must already hold the entry %rsp when jumping to them.
16	testl	$3,%r9d
17	jnz	.Lmul_enter
18	cmpl	$8,%r9d
19	jb	.Lmul_enter
20	cmpq	%rsi,%rdx
21	jne	.Lmul4x_enter
22	testl	$7,%r9d
23	jz	.Lsqr8x_enter
24	jmp	.Lmul4x_enter
25
26.align	16
27.Lmul_enter:
// Save the callee-saved registers just below the entry %rsp.
28	pushq	%rbx
29.cfi_offset	%rbx,-16
30	pushq	%rbp
31.cfi_offset	%rbp,-24
32	pushq	%r12
33.cfi_offset	%r12,-32
34	pushq	%r13
35.cfi_offset	%r13,-40
36	pushq	%r14
37.cfi_offset	%r14,-48
38	pushq	%r15
39.cfi_offset	%r15,-56
40
// Compute the frame base: %r10 = (%rsp - 8*num - 16) rounded down to a
// multiple of 1024, i.e. room for num+2 qwords.
41	negq	%r9
42	movq	%rsp,%r11
43	leaq	-16(%rsp,%r9,8),%r10
44	negq	%r9		// restore num
45	andq	$-1024,%r10
46
47
48
49
50
51
52
53
54
// Move %rsp down to %r10 in steps of at most 4096 bytes, reading one word
// at every step (presumably so each stack page, including a guard page, is
// touched in order). The first step covers the sub-4096 remainder.
55	subq	%r10,%r11
56	andq	$-4096,%r11
57	leaq	(%r10,%r11,1),%rsp
58	movq	(%rsp),%r11
59	cmpq	%r10,%rsp
60	ja	.Lmul_page_walk
61	jmp	.Lmul_page_walk_done
62
63.align	16
64.Lmul_page_walk:
65	leaq	-4096(%rsp),%rsp
66	movq	(%rsp),%r11
67	cmpq	%r10,%rsp
68	ja	.Lmul_page_walk
69.Lmul_page_walk_done:
70
71	movq	%rax,8(%rsp,%r9,8)	// tp[num+1] = entry %rsp
// DW_CFA_def_cfa_expression: CFA = *(%rsp + 8 + %r9*8) + 8, i.e. the saved
// entry %rsp plus the return-address slot.
72.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
73.Lmul_body:
// First row (i = 0): tp = (ap * bp[0] + m * np) >> 64, with m chosen so the
// lowest word of the sum is discarded.
// Register roles from here on:
//   %r12 bp, %rbx bp[i], %r8 n0 value, %rbp m = low word * n0,
//   %r14 i (outer index), %r15 j (inner index),
//   %r10/%r11 low/high halves of ap[j]*bp[i], %r13 running np[j]*m column.
74	movq	%rdx,%r12
75	movq	(%r8),%r8	// replace the pointer with the n0 word itself
76	movq	(%r12),%rbx
77	movq	(%rsi),%rax
78
79	xorq	%r14,%r14
80	xorq	%r15,%r15
81
82	movq	%r8,%rbp
83	mulq	%rbx		// %rdx:%rax = ap[0]*bp[0]
84	movq	%rax,%r10
85	movq	(%rcx),%rax
86
87	imulq	%r10,%rbp	// m = low(ap[0]*bp[0]) * n0 (mod 2^64)
88	movq	%rdx,%r11
89
90	mulq	%rbp		// %rdx:%rax = np[0]*m
91	addq	%rax,%r10	// low word is dropped; only the carry matters
92	movq	8(%rsi),%rax
93	adcq	$0,%rdx
94	movq	%rdx,%r13
95
96	leaq	1(%r15),%r15	// j = 1
97	jmp	.L1st_enter
98
99.align	16
100.L1st:
// Finish column j-1: add the np*m low half and the ap*bp column, then store.
// %r15 already holds j+1 here, so -16(%rsp,%r15,8) addresses tp[j-1].
101	addq	%rax,%r13
102	movq	(%rsi,%r15,8),%rax
103	adcq	$0,%rdx
104	addq	%r11,%r13
105	movq	%r10,%r11
106	adcq	$0,%rdx
107	movq	%r13,-16(%rsp,%r15,8)
108	movq	%rdx,%r13
109
110.L1st_enter:
111	mulq	%rbx		// ap[j]*bp[0]
112	addq	%rax,%r11
113	movq	(%rcx,%r15,8),%rax
114	adcq	$0,%rdx
115	leaq	1(%r15),%r15	// lea keeps the flags untouched
116	movq	%rdx,%r10
117
118	mulq	%rbp		// np[j]*m
119	cmpq	%r9,%r15
120	jne	.L1st
121
// Row tail: store tp[num-2], then tp[num-1] and the carry word tp[num].
// ap[0] is reloaded into %rax for the next row.
122	addq	%rax,%r13
123	movq	(%rsi),%rax
124	adcq	$0,%rdx
125	addq	%r11,%r13
126	adcq	$0,%rdx
127	movq	%r13,-16(%rsp,%r15,8)
128	movq	%rdx,%r13
129	movq	%r10,%r11
130
131	xorq	%rdx,%rdx
132	addq	%r11,%r13
133	adcq	$0,%rdx
134	movq	%r13,-8(%rsp,%r9,8)
135	movq	%rdx,(%rsp,%r9,8)
136
137	leaq	1(%r14),%r14	// i = 1
138	jmp	.Louter
139.align	16
140.Louter:
// Rows i = 1..num-1: tp = (tp + ap * bp[i] + m * np) >> 64. Same shape as
// the first row, with the previous tp[j] folded in through %r10.
141	movq	(%r12,%r14,8),%rbx
142	xorq	%r15,%r15
143	movq	%r8,%rbp
144	movq	(%rsp),%r10	// tp[0]
145	mulq	%rbx		// ap[0]*bp[i] (%rax was preloaded with ap[0])
146	addq	%rax,%r10
147	movq	(%rcx),%rax
148	adcq	$0,%rdx
149
150	imulq	%r10,%rbp	// m = (tp[0] + low(ap[0]*bp[i])) * n0
151	movq	%rdx,%r11
152
153	mulq	%rbp
154	addq	%rax,%r10	// low word dropped, carry kept
155	movq	8(%rsi),%rax
156	adcq	$0,%rdx
157	movq	8(%rsp),%r10	// tp[1]
158	movq	%rdx,%r13
159
160	leaq	1(%r15),%r15
161	jmp	.Linner_enter
162
163.align	16
164.Linner:
165	addq	%rax,%r13
166	movq	(%rsi,%r15,8),%rax
167	adcq	$0,%rdx
168	addq	%r10,%r13
169	movq	(%rsp,%r15,8),%r10	// next tp[j], read before its slot is reused
170	adcq	$0,%rdx
171	movq	%r13,-16(%rsp,%r15,8)
172	movq	%rdx,%r13
173
174.Linner_enter:
175	mulq	%rbx
176	addq	%rax,%r11
177	movq	(%rcx,%r15,8),%rax
178	adcq	$0,%rdx
179	addq	%r11,%r10
180	movq	%rdx,%r11
181	adcq	$0,%r11
182	leaq	1(%r15),%r15
183
184	mulq	%rbp
185	cmpq	%r9,%r15
186	jne	.Linner
187
188	addq	%rax,%r13
189	movq	(%rsi),%rax	// reload ap[0] for the next row
190	adcq	$0,%rdx
191	addq	%r10,%r13
192	movq	(%rsp,%r15,8),%r10	// previous carry word tp[num]
193	adcq	$0,%rdx
194	movq	%r13,-16(%rsp,%r15,8)
195	movq	%rdx,%r13
196
197	xorq	%rdx,%rdx
198	addq	%r11,%r13
199	adcq	$0,%rdx
200	addq	%r10,%r13
201	adcq	$0,%rdx
202	movq	%r13,-8(%rsp,%r9,8)
203	movq	%rdx,(%rsp,%r9,8)
204
205	leaq	1(%r14),%r14
206	cmpq	%r9,%r14
207	jb	.Louter
208
// Final reduction: rp = tp - np with borrow. The xor also clears CF for the
// first sbb; lea and dec inside the loop leave CF alone.
209	xorq	%r14,%r14
210	movq	(%rsp),%rax
211	leaq	(%rsp),%rsi	// %rsi = tp (ap is no longer needed)
212	movq	%r9,%r15
213	jmp	.Lsub
214.align	16
215.Lsub:
216	sbbq	(%rcx,%r14,8),%rax
217	movq	%rax,(%rdi,%r14,8)
218	movq	8(%rsi,%r14,8),%rax
219	leaq	1(%r14),%r14
220	decq	%r15
221	jnz	.Lsub
222
// %rax now holds tp[num]; subtracting the final borrow yields a mask:
// all ones if tp - np underflowed, zero otherwise. The source pointer is
// then chosen with and/or instead of a branch: tp when the mask is set,
// rp (the difference just written) when it is clear.
223	sbbq	$0,%rax
224	xorq	%r14,%r14
225	andq	%rax,%rsi
226	notq	%rax
227	movq	%rdi,%rcx
228	andq	%rax,%rcx
229	movq	%r9,%r15
230	orq	%rcx,%rsi
231.align	16
232.Lcopy:
// Copy the selected value into rp while overwriting every tp[i] with its
// index i, so the intermediate result does not stay on the stack. The load
// comes first because the source may be tp itself.
233	movq	(%rsi,%r14,8),%rax
234	movq	%r14,(%rsp,%r14,8)
235	movq	%rax,(%rdi,%r14,8)
236	leaq	1(%r14),%r14
237	subq	$1,%r15
238	jnz	.Lcopy
239
// Epilogue: fetch the entry %rsp from tp[num+1], restore the callee-saved
// registers relative to it, and return 1.
240	movq	8(%rsp,%r9,8),%rsi
241.cfi_def_cfa	%rsi,8
242	movq	$1,%rax
243	movq	-48(%rsi),%r15
244.cfi_restore	%r15
245	movq	-40(%rsi),%r14
246.cfi_restore	%r14
247	movq	-32(%rsi),%r13
248.cfi_restore	%r13
249	movq	-24(%rsi),%r12
250.cfi_restore	%r12
251	movq	-16(%rsi),%rbp
252.cfi_restore	%rbp
253	movq	-8(%rsi),%rbx
254.cfi_restore	%rbx
255	leaq	(%rsi),%rsp
256.cfi_def_cfa_register	%rsp
257.Lmul_epilogue:
258	.byte	0xf3,0xc3	// F3 C3 = "rep ret"
259.cfi_endproc
260.size	bn_mul_mont,.-bn_mul_mont
// bn_mul4x_mont: Montgomery multiplication with the row loops unrolled to
// four words per iteration. No .globl is emitted for this symbol here;
// bn_mul_mont jumps to .Lmul4x_enter with %r9 zero-extended and
// %rax = entry %rsp (the two instructions before that label establish the
// same state for a direct call).
//
// Inputs, as consumed by the code:
//   %rdi  rp   result, num 64-bit words (spilled to the frame, because
//              %rdi doubles as an accumulator in the loops)
//   %rsi  ap   first operand      %rdx  bp   second operand
//   %rcx  np   modulus            %r8        pointer to the n0 word
//   %r9   num  word count
// Output: %rax = 1. %rbx, %rbp and %r12-%r15 are saved and restored.
// The unrolled loops run at least once and finish on index == num, so num
// must be a multiple of 4 and at least 8; the bn_mul_mont dispatcher only
// jumps here under those conditions.
// NOTE(review): n0 is presumably -np[0]^-1 mod 2^64 - confirm in caller.
261.type	bn_mul4x_mont,@function
262.align	16
263bn_mul4x_mont:
264.cfi_startproc
265	movl	%r9d,%r9d	// zero-extend num
266	movq	%rsp,%rax	// %rax = entry %rsp
267.cfi_def_cfa_register	%rax
268.Lmul4x_enter:
269	pushq	%rbx
270.cfi_offset	%rbx,-16
271	pushq	%rbp
272.cfi_offset	%rbp,-24
273	pushq	%r12
274.cfi_offset	%r12,-32
275	pushq	%r13
276.cfi_offset	%r13,-40
277	pushq	%r14
278.cfi_offset	%r14,-48
279	pushq	%r15
280.cfi_offset	%r15,-56
281
// Frame base: %r10 = (%rsp - 8*num - 32) rounded down to a multiple of 1024,
// i.e. room for num+4 qwords.
282	negq	%r9
283	movq	%rsp,%r11
284	leaq	-32(%rsp,%r9,8),%r10
285	negq	%r9
286	andq	$-1024,%r10
287
// Lower %rsp to %r10 in steps of at most 4096 bytes, reading one word per
// step (presumably to touch every stack page in order).
288	subq	%r10,%r11
289	andq	$-4096,%r11
290	leaq	(%r10,%r11,1),%rsp
291	movq	(%rsp),%r11
292	cmpq	%r10,%rsp
293	ja	.Lmul4x_page_walk
294	jmp	.Lmul4x_page_walk_done
295
296.Lmul4x_page_walk:
297	leaq	-4096(%rsp),%rsp
298	movq	(%rsp),%r11
299	cmpq	%r10,%rsp
300	ja	.Lmul4x_page_walk
301.Lmul4x_page_walk_done:
302
303	movq	%rax,8(%rsp,%r9,8)	// tp[num+1] = entry %rsp
// DW_CFA_def_cfa_expression: CFA = *(%rsp + 8 + %r9*8) + 8.
304.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
305.Lmul4x_body:
// First row (i = 0). Register roles:
//   %r12 bp, %rbx bp[i], %r8 n0 value, %rbp m, %r14 i, %r15 j,
//   %r10/%r11 alternate as carry of the ap[j]*bp[i] column,
//   %rdi/%r13 alternate as carry of the np[j]*m column.
306	movq	%rdi,16(%rsp,%r9,8)	// tp[num+2] = rp
307	movq	%rdx,%r12
308	movq	(%r8),%r8
309	movq	(%r12),%rbx
310	movq	(%rsi),%rax
311
312	xorq	%r14,%r14
313	xorq	%r15,%r15
314
315	movq	%r8,%rbp
316	mulq	%rbx		// ap[0]*bp[0]
317	movq	%rax,%r10
318	movq	(%rcx),%rax
319
320	imulq	%r10,%rbp	// m = low(ap[0]*bp[0]) * n0
321	movq	%rdx,%r11
322
323	mulq	%rbp		// np[0]*m
324	addq	%rax,%r10	// low word dropped, carry kept
325	movq	8(%rsi),%rax
326	adcq	$0,%rdx
327	movq	%rdx,%rdi
328
329	mulq	%rbx		// ap[1]*bp[0]
330	addq	%rax,%r11
331	movq	8(%rcx),%rax
332	adcq	$0,%rdx
333	movq	%rdx,%r10
334
335	mulq	%rbp		// np[1]*m
336	addq	%rax,%rdi
337	movq	16(%rsi),%rax
338	adcq	$0,%rdx
339	addq	%r11,%rdi
340	leaq	4(%r15),%r15	// j = 4
341	adcq	$0,%rdx
342	movq	%rdi,(%rsp)	// tp[0]
343	movq	%rdx,%r13
344	jmp	.L1st4x
345.align	16
346.L1st4x:
// Each pass handles ap[j-2..j+1] / np[j-2..j+1] and stores tp[j-3..j];
// %rax enters holding ap[j-2].
347	mulq	%rbx
348	addq	%rax,%r10
349	movq	-16(%rcx,%r15,8),%rax
350	adcq	$0,%rdx
351	movq	%rdx,%r11
352
353	mulq	%rbp
354	addq	%rax,%r13
355	movq	-8(%rsi,%r15,8),%rax
356	adcq	$0,%rdx
357	addq	%r10,%r13
358	adcq	$0,%rdx
359	movq	%r13,-24(%rsp,%r15,8)	// tp[j-3]
360	movq	%rdx,%rdi
361
362	mulq	%rbx
363	addq	%rax,%r11
364	movq	-8(%rcx,%r15,8),%rax
365	adcq	$0,%rdx
366	movq	%rdx,%r10
367
368	mulq	%rbp
369	addq	%rax,%rdi
370	movq	(%rsi,%r15,8),%rax
371	adcq	$0,%rdx
372	addq	%r11,%rdi
373	adcq	$0,%rdx
374	movq	%rdi,-16(%rsp,%r15,8)	// tp[j-2]
375	movq	%rdx,%r13
376
377	mulq	%rbx
378	addq	%rax,%r10
379	movq	(%rcx,%r15,8),%rax
380	adcq	$0,%rdx
381	movq	%rdx,%r11
382
383	mulq	%rbp
384	addq	%rax,%r13
385	movq	8(%rsi,%r15,8),%rax
386	adcq	$0,%rdx
387	addq	%r10,%r13
388	adcq	$0,%rdx
389	movq	%r13,-8(%rsp,%r15,8)	// tp[j-1]
390	movq	%rdx,%rdi
391
392	mulq	%rbx
393	addq	%rax,%r11
394	movq	8(%rcx,%r15,8),%rax
395	adcq	$0,%rdx
396	leaq	4(%r15),%r15	// j += 4; offsets below use the new j
397	movq	%rdx,%r10
398
399	mulq	%rbp
400	addq	%rax,%rdi
401	movq	-16(%rsi,%r15,8),%rax
402	adcq	$0,%rdx
403	addq	%r11,%rdi
404	adcq	$0,%rdx
405	movq	%rdi,-32(%rsp,%r15,8)	// tp[old j]
406	movq	%rdx,%r13
407	cmpq	%r9,%r15
408	jb	.L1st4x
409
// Row tail (j == num): last two words, then tp[num-1] and carry tp[num].
410	mulq	%rbx
411	addq	%rax,%r10
412	movq	-16(%rcx,%r15,8),%rax
413	adcq	$0,%rdx
414	movq	%rdx,%r11
415
416	mulq	%rbp
417	addq	%rax,%r13
418	movq	-8(%rsi,%r15,8),%rax
419	adcq	$0,%rdx
420	addq	%r10,%r13
421	adcq	$0,%rdx
422	movq	%r13,-24(%rsp,%r15,8)
423	movq	%rdx,%rdi
424
425	mulq	%rbx
426	addq	%rax,%r11
427	movq	-8(%rcx,%r15,8),%rax
428	adcq	$0,%rdx
429	movq	%rdx,%r10
430
431	mulq	%rbp
432	addq	%rax,%rdi
433	movq	(%rsi),%rax	// reload ap[0] for the next row
434	adcq	$0,%rdx
435	addq	%r11,%rdi
436	adcq	$0,%rdx
437	movq	%rdi,-16(%rsp,%r15,8)
438	movq	%rdx,%r13
439
440	xorq	%rdi,%rdi
441	addq	%r10,%r13
442	adcq	$0,%rdi
443	movq	%r13,-8(%rsp,%r15,8)
444	movq	%rdi,(%rsp,%r15,8)
445
446	leaq	1(%r14),%r14	// i = 1
447.align	4
448.Louter4x:
// Rows i = 1..num-1: same schedule as the first row, with the previous
// tp[j] added into each ap[j]*bp[i] column before it is consumed.
449	movq	(%r12,%r14,8),%rbx
450	xorq	%r15,%r15
451	movq	(%rsp),%r10
452	movq	%r8,%rbp
453	mulq	%rbx
454	addq	%rax,%r10
455	movq	(%rcx),%rax
456	adcq	$0,%rdx
457
458	imulq	%r10,%rbp	// m = (tp[0] + low(ap[0]*bp[i])) * n0
459	movq	%rdx,%r11
460
461	mulq	%rbp
462	addq	%rax,%r10	// low word dropped, carry kept
463	movq	8(%rsi),%rax
464	adcq	$0,%rdx
465	movq	%rdx,%rdi
466
467	mulq	%rbx
468	addq	%rax,%r11
469	movq	8(%rcx),%rax
470	adcq	$0,%rdx
471	addq	8(%rsp),%r11	// + tp[1]
472	adcq	$0,%rdx
473	movq	%rdx,%r10
474
475	mulq	%rbp
476	addq	%rax,%rdi
477	movq	16(%rsi),%rax
478	adcq	$0,%rdx
479	addq	%r11,%rdi
480	leaq	4(%r15),%r15
481	adcq	$0,%rdx
482	movq	%rdi,(%rsp)
483	movq	%rdx,%r13
484	jmp	.Linner4x
485.align	16
486.Linner4x:
487	mulq	%rbx
488	addq	%rax,%r10
489	movq	-16(%rcx,%r15,8),%rax
490	adcq	$0,%rdx
491	addq	-16(%rsp,%r15,8),%r10	// + tp[j-2]
492	adcq	$0,%rdx
493	movq	%rdx,%r11
494
495	mulq	%rbp
496	addq	%rax,%r13
497	movq	-8(%rsi,%r15,8),%rax
498	adcq	$0,%rdx
499	addq	%r10,%r13
500	adcq	$0,%rdx
501	movq	%r13,-24(%rsp,%r15,8)
502	movq	%rdx,%rdi
503
504	mulq	%rbx
505	addq	%rax,%r11
506	movq	-8(%rcx,%r15,8),%rax
507	adcq	$0,%rdx
508	addq	-8(%rsp,%r15,8),%r11	// + tp[j-1]
509	adcq	$0,%rdx
510	movq	%rdx,%r10
511
512	mulq	%rbp
513	addq	%rax,%rdi
514	movq	(%rsi,%r15,8),%rax
515	adcq	$0,%rdx
516	addq	%r11,%rdi
517	adcq	$0,%rdx
518	movq	%rdi,-16(%rsp,%r15,8)
519	movq	%rdx,%r13
520
521	mulq	%rbx
522	addq	%rax,%r10
523	movq	(%rcx,%r15,8),%rax
524	adcq	$0,%rdx
525	addq	(%rsp,%r15,8),%r10	// + tp[j]
526	adcq	$0,%rdx
527	movq	%rdx,%r11
528
529	mulq	%rbp
530	addq	%rax,%r13
531	movq	8(%rsi,%r15,8),%rax
532	adcq	$0,%rdx
533	addq	%r10,%r13
534	adcq	$0,%rdx
535	movq	%r13,-8(%rsp,%r15,8)
536	movq	%rdx,%rdi
537
538	mulq	%rbx
539	addq	%rax,%r11
540	movq	8(%rcx,%r15,8),%rax
541	adcq	$0,%rdx
542	addq	8(%rsp,%r15,8),%r11	// + tp[j+1]
543	adcq	$0,%rdx
544	leaq	4(%r15),%r15
545	movq	%rdx,%r10
546
547	mulq	%rbp
548	addq	%rax,%rdi
549	movq	-16(%rsi,%r15,8),%rax
550	adcq	$0,%rdx
551	addq	%r11,%rdi
552	adcq	$0,%rdx
553	movq	%rdi,-32(%rsp,%r15,8)
554	movq	%rdx,%r13
555	cmpq	%r9,%r15
556	jb	.Linner4x
557
// Row tail (j == num); the outer index is bumped in the middle of it.
558	mulq	%rbx
559	addq	%rax,%r10
560	movq	-16(%rcx,%r15,8),%rax
561	adcq	$0,%rdx
562	addq	-16(%rsp,%r15,8),%r10
563	adcq	$0,%rdx
564	movq	%rdx,%r11
565
566	mulq	%rbp
567	addq	%rax,%r13
568	movq	-8(%rsi,%r15,8),%rax
569	adcq	$0,%rdx
570	addq	%r10,%r13
571	adcq	$0,%rdx
572	movq	%r13,-24(%rsp,%r15,8)
573	movq	%rdx,%rdi
574
575	mulq	%rbx
576	addq	%rax,%r11
577	movq	-8(%rcx,%r15,8),%rax
578	adcq	$0,%rdx
579	addq	-8(%rsp,%r15,8),%r11
580	adcq	$0,%rdx
581	leaq	1(%r14),%r14	// i += 1
582	movq	%rdx,%r10
583
584	mulq	%rbp
585	addq	%rax,%rdi
586	movq	(%rsi),%rax	// reload ap[0] for the next row
587	adcq	$0,%rdx
588	addq	%r11,%rdi
589	adcq	$0,%rdx
590	movq	%rdi,-16(%rsp,%r15,8)
591	movq	%rdx,%r13
592
593	xorq	%rdi,%rdi
594	addq	%r10,%r13
595	adcq	$0,%rdi
596	addq	(%rsp,%r9,8),%r13	// + previous carry word tp[num]
597	adcq	$0,%rdi
598	movq	%r13,-8(%rsp,%r15,8)
599	movq	%rdi,(%rsp,%r15,8)
600
601	cmpq	%r9,%r14
602	jb	.Louter4x
// Final reduction: rp = tp - np, four words per pass. The sub/sbb chain
// carries the borrow across iterations; lea and dec do not touch CF.
603	movq	16(%rsp,%r9,8),%rdi	// recover rp
604	leaq	-4(%r9),%r15
605	movq	0(%rsp),%rax
606	pxor	%xmm0,%xmm0	// zero vector used to wipe tp below
607	movq	8(%rsp),%rdx
608	shrq	$2,%r15		// (num-4)/4 full passes, remainder in the tail
609	leaq	(%rsp),%rsi	// %rsi = tp
610	xorq	%r14,%r14
611
612	subq	0(%rcx),%rax
613	movq	16(%rsi),%rbx
614	movq	24(%rsi),%rbp
615	sbbq	8(%rcx),%rdx
616	jmp	.Lsub4x
617.align	16
618.Lsub4x:
619	movq	%rax,0(%rdi,%r14,8)
620	movq	%rdx,8(%rdi,%r14,8)
621	sbbq	16(%rcx,%r14,8),%rbx
622	movq	32(%rsi,%r14,8),%rax
623	movq	40(%rsi,%r14,8),%rdx
624	sbbq	24(%rcx,%r14,8),%rbp
625	movq	%rbx,16(%rdi,%r14,8)
626	movq	%rbp,24(%rdi,%r14,8)
627	sbbq	32(%rcx,%r14,8),%rax
628	movq	48(%rsi,%r14,8),%rbx
629	movq	56(%rsi,%r14,8),%rbp
630	sbbq	40(%rcx,%r14,8),%rdx
631	leaq	4(%r14),%r14
632	decq	%r15
633	jnz	.Lsub4x
634
635	movq	%rax,0(%rdi,%r14,8)
636	movq	32(%rsi,%r14,8),%rax	// tp[num], the carry word
637	sbbq	16(%rcx,%r14,8),%rbx
638	movq	%rdx,8(%rdi,%r14,8)
639	sbbq	24(%rcx,%r14,8),%rbp
640	movq	%rbx,16(%rdi,%r14,8)
641
// %rax = tp[num] - borrow: all ones if the subtraction underflowed, else
// zero. Pick the copy source without branching: tp if set, rp if clear.
642	sbbq	$0,%rax
643	movq	%rbp,24(%rdi,%r14,8)
644	xorq	%r14,%r14
645	andq	%rax,%rsi
646	notq	%rax
647	movq	%rdi,%rcx
648	andq	%rax,%rcx
649	leaq	-4(%r9),%r15
650	orq	%rcx,%rsi
651	shrq	$2,%r15
652
// Copy the selected value to rp 16 bytes at a time and zero tp behind it.
// %r14 is a byte offset here (index scale 1). Loads precede the zeroing
// stores because the source may be tp itself.
653	movdqu	(%rsi),%xmm1
654	movdqa	%xmm0,(%rsp)
655	movdqu	%xmm1,(%rdi)
656	jmp	.Lcopy4x
657.align	16
658.Lcopy4x:
659	movdqu	16(%rsi,%r14,1),%xmm2
660	movdqu	32(%rsi,%r14,1),%xmm1
661	movdqa	%xmm0,16(%rsp,%r14,1)
662	movdqu	%xmm2,16(%rdi,%r14,1)
663	movdqa	%xmm0,32(%rsp,%r14,1)
664	movdqu	%xmm1,32(%rdi,%r14,1)
665	leaq	32(%r14),%r14
666	decq	%r15
667	jnz	.Lcopy4x
668
669	movdqu	16(%rsi,%r14,1),%xmm2
670	movdqa	%xmm0,16(%rsp,%r14,1)
671	movdqu	%xmm2,16(%rdi,%r14,1)
// Epilogue: reload the entry %rsp, restore callee-saved registers, return 1.
672	movq	8(%rsp,%r9,8),%rsi
673.cfi_def_cfa	%rsi, 8
674	movq	$1,%rax
675	movq	-48(%rsi),%r15
676.cfi_restore	%r15
677	movq	-40(%rsi),%r14
678.cfi_restore	%r14
679	movq	-32(%rsi),%r13
680.cfi_restore	%r13
681	movq	-24(%rsi),%r12
682.cfi_restore	%r12
683	movq	-16(%rsi),%rbp
684.cfi_restore	%rbp
685	movq	-8(%rsi),%rbx
686.cfi_restore	%rbx
687	leaq	(%rsi),%rsp
688.cfi_def_cfa_register	%rsp
689.Lmul4x_epilogue:
690	.byte	0xf3,0xc3	// F3 C3 = "rep ret"
691.cfi_endproc
692.size	bn_mul4x_mont,.-bn_mul4x_mont
// bn_sqr8x_mont: Montgomery squaring front end. It builds an aligned stack
// frame, calls the external bn_sqr8x_internal, then subtracts the modulus
// and copies the correct candidate into rp without branching on the data.
// No .globl is emitted for this symbol here; bn_mul_mont jumps to
// .Lsqr8x_enter when bp == ap and num is a multiple of 8, with %r9
// zero-extended and %rax = entry %rsp.
//
// Inputs, as consumed by the code:
//   %rdi  rp   result (parked in %xmm1 across the call)
//   %rsi  ap   operand (only its address is used here, for frame placement)
//   %rcx  np   modulus pointer (copied to %xmm2 before the call)
//   %r8        pointer to the n0 word (loaded and stored at 32(%rsp))
//   %r9d  num  word count
// Output: %rax = 1. %rbx, %rbp and %r12-%r15 are saved and restored.
// NOTE(review): bn_sqr8x_internal is not visible in this file; every
// statement below about the state it leaves behind is inferred from how the
// registers are used afterwards - confirm against that routine.
693.extern	bn_sqr8x_internal
694.hidden bn_sqr8x_internal
695
696.type	bn_sqr8x_mont,@function
697.align	32
698bn_sqr8x_mont:
699.cfi_startproc
700	movq	%rsp,%rax	// %rax = entry %rsp
701.cfi_def_cfa_register	%rax
702.Lsqr8x_enter:
703	pushq	%rbx
704.cfi_offset	%rbx,-16
705	pushq	%rbp
706.cfi_offset	%rbp,-24
707	pushq	%r12
708.cfi_offset	%r12,-32
709	pushq	%r13
710.cfi_offset	%r13,-40
711	pushq	%r14
712.cfi_offset	%r14,-48
713	pushq	%r15
714.cfi_offset	%r15,-56
715.Lsqr8x_prologue:
716
// %r9 = -8*num (negated byte length), %r10 = 32*num.
717	movl	%r9d,%r10d
718	shll	$3,%r9d
719	shlq	$3+2,%r10
720	negq	%r9
721
722
723
724
725
726
// Choose the frame base in %rbp. The frame is 16*num + 64 bytes; %r11 is the
// distance (mod 4096) between the candidate base and ap, and the base is
// shifted down depending on how that compares with 32*num.
// NOTE(review): presumably this keeps the scratch area from aliasing ap
// modulo the 4096-byte page size - confirm the intent.
727	leaq	-64(%rsp,%r9,2),%r11
728	movq	%rsp,%rbp
729	movq	(%r8),%r8	// replace the pointer with the n0 word itself
730	subq	%rsi,%r11
731	andq	$4095,%r11
732	cmpq	%r11,%r10
733	jb	.Lsqr8x_sp_alt
734	subq	%r11,%rbp	// base becomes congruent to ap mod 4096
735	leaq	-64(%rbp,%r9,2),%rbp
736	jmp	.Lsqr8x_sp_done
737
738.align	32
739.Lsqr8x_sp_alt:
// %r10 = 4096 - frame size; lower the base by max(%r11 - %r10, 0).
740	leaq	4096-64(,%r9,2),%r10
741	leaq	-64(%rbp,%r9,2),%rbp
742	subq	%r10,%r11
743	movq	$0,%r10		// mov (not xor) keeps CF from the sub for cmovc
744	cmovcq	%r10,%r11
745	subq	%r11,%rbp
746.Lsqr8x_sp_done:
747	andq	$-64,%rbp	// 64-byte align the frame base
// Lower %rsp to %rbp in steps of at most 4096 bytes, reading one word per
// step (presumably to touch every stack page in order).
748	movq	%rsp,%r11
749	subq	%rbp,%r11
750	andq	$-4096,%r11
751	leaq	(%r11,%rbp,1),%rsp
752	movq	(%rsp),%r10
753	cmpq	%rbp,%rsp
754	ja	.Lsqr8x_page_walk
755	jmp	.Lsqr8x_page_walk_done
756
757.align	16
758.Lsqr8x_page_walk:
759	leaq	-4096(%rsp),%rsp
760	movq	(%rsp),%r10
761	cmpq	%rbp,%rsp
762	ja	.Lsqr8x_page_walk
763.Lsqr8x_page_walk_done:
764
765	movq	%r9,%r10	// %r10 = -8*num
766	negq	%r9		// %r9  =  8*num
767
768	movq	%r8,32(%rsp)	// n0 word
769	movq	%rax,40(%rsp)	// entry %rsp
// DW_CFA_def_cfa_expression: CFA = *(%rsp + 40) + 8.
770.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
771.Lsqr8x_body:
772
// The .byte sequences below are hand-encoded 64-bit movq transfers between
// general-purpose and XMM registers.
773.byte	102,72,15,110,209	// movq %rcx,%xmm2  (np)
774	pxor	%xmm0,%xmm0
775.byte	102,72,15,110,207	// movq %rdi,%xmm1  (rp)
776.byte	102,73,15,110,218	// movq %r10,%xmm3  (-8*num)
777	call	bn_sqr8x_internal
778
779
780
781
// After the call the code relies on (NOTE(review): confirm):
//   %r9  negative byte count (the loops below count it up to zero),
//   %rdi + %r9 = start of the value to reduce, %rbp = modulus words,
//   %rax = top carry word.
782	leaq	(%rdi,%r9,1),%rbx
783	movq	%r9,%rcx
784	movq	%r9,%rdx
785.byte	102,72,15,126,207	// movq %xmm1,%rdi  (restore rp)
// sar sets CF to the last bit shifted out, so the sbb chain below starts
// with CF = 0 only if the low five bits of %r9 are zero.
786	sarq	$3+2,%rcx	// -(number of 4-word groups)
787	jmp	.Lsqr8x_sub
788
789.align	32
790.Lsqr8x_sub:
// rp = t - modulus, four words per pass; lea and inc leave CF untouched so
// the borrow carries across iterations.
791	movq	0(%rbx),%r12
792	movq	8(%rbx),%r13
793	movq	16(%rbx),%r14
794	movq	24(%rbx),%r15
795	leaq	32(%rbx),%rbx
796	sbbq	0(%rbp),%r12
797	sbbq	8(%rbp),%r13
798	sbbq	16(%rbp),%r14
799	sbbq	24(%rbp),%r15
800	leaq	32(%rbp),%rbp
801	movq	%r12,0(%rdi)
802	movq	%r13,8(%rdi)
803	movq	%r14,16(%rdi)
804	movq	%r15,24(%rdi)
805	leaq	32(%rdi),%rdi
806	incq	%rcx
807	jnz	.Lsqr8x_sub
808
// %rax = carry word - borrow: the selection mask (all ones keeps t, zero
// keeps the difference already in rp). Rewind both pointers by %r9.
809	sbbq	$0,%rax
810	leaq	(%rbx,%r9,1),%rbx
811	leaq	(%rdi,%r9,1),%rdi
812
813.byte	102,72,15,110,200	// movq %rax,%xmm1
814	pxor	%xmm0,%xmm0
815	pshufd	$0,%xmm1,%xmm1	// broadcast the mask's low dword to all lanes
816	movq	40(%rsp),%rsi	// entry %rsp, for the epilogue
817.cfi_def_cfa	%rsi,8
818	jmp	.Lsqr8x_cond_copy
819
820.align	32
821.Lsqr8x_cond_copy:
// Per 32 bytes: rp = (t & mask) | (rp & ~mask), then zero the t chunk just
// read and the chunk %rdx bytes away from it (%rdx = %r9 as it was on return
// from the call), so the scratch area is wiped as it is consumed.
822	movdqa	0(%rbx),%xmm2
823	movdqa	16(%rbx),%xmm3
824	leaq	32(%rbx),%rbx
825	movdqu	0(%rdi),%xmm4
826	movdqu	16(%rdi),%xmm5
827	leaq	32(%rdi),%rdi
828	movdqa	%xmm0,-32(%rbx)
829	movdqa	%xmm0,-16(%rbx)
830	movdqa	%xmm0,-32(%rbx,%rdx,1)
831	movdqa	%xmm0,-16(%rbx,%rdx,1)
832	pcmpeqd	%xmm1,%xmm0	// %xmm0 was zero, so this yields ~mask per dword
833	pand	%xmm1,%xmm2
834	pand	%xmm1,%xmm3
835	pand	%xmm0,%xmm4
836	pand	%xmm0,%xmm5
837	pxor	%xmm0,%xmm0	// back to zero for the next pass
838	por	%xmm2,%xmm4
839	por	%xmm3,%xmm5
840	movdqu	%xmm4,-32(%rdi)
841	movdqu	%xmm5,-16(%rdi)
842	addq	$32,%r9
843	jnz	.Lsqr8x_cond_copy
844
// Epilogue: restore callee-saved registers relative to the entry %rsp held
// in %rsi and return 1.
845	movq	$1,%rax
846	movq	-48(%rsi),%r15
847.cfi_restore	%r15
848	movq	-40(%rsi),%r14
849.cfi_restore	%r14
850	movq	-32(%rsi),%r13
851.cfi_restore	%r13
852	movq	-24(%rsi),%r12
853.cfi_restore	%r12
854	movq	-16(%rsi),%rbp
855.cfi_restore	%rbp
856	movq	-8(%rsi),%rbx
857.cfi_restore	%rbx
858	leaq	(%rsi),%rsp
859.cfi_def_cfa_register	%rsp
860.Lsqr8x_epilogue:
861	.byte	0xf3,0xc3	// F3 C3 = "rep ret"
862.cfi_endproc
863.size	bn_sqr8x_mont,.-bn_sqr8x_mont
// NUL-terminated ASCII banner:
// "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
864.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
865.align	16
866#endif
867