# NOTE(review): removed HTML code-viewer navigation chrome ("Home", "Line#",
# "Scopes#", "Navigate", "Raw", "Download") left over from a web scrape of
# this file; it is not valid assembler input.
# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
#
# x86-64 assembly, AT&T syntax, ELF/System V AMD64 ABI.
# NOTE(review): the scraped copy had viewer line numbers fused onto every
# line; they have been stripped so the file assembles again.

# Under MemorySanitizer, fall back to the C implementation (MSan cannot see
# into hand-written assembly), by forcing OPENSSL_NO_ASM on.
#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#include "ring_core_generated/prefix_symbols_asm.h"
.text

# CPU capability vector, filled in at startup by the C side.
.extern	OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

#-----------------------------------------------------------------------
# bn_mul_mont: Montgomery multiplication.
# In (SysV):  %rdi = rp (result), %rsi = ap, %rdx = bp, %rcx = np (modulus),
#             %r8 = &n0 (Montgomery constant), %r9 = num (limb count)
#             -- NOTE(review): inferred from the CRYPTOGAMS convention and
#             the loads below; confirm against the C prototype.
# Out:        %rax = 1
# Dispatches: num divisible by 4 and >= 8 takes the 4x path
#             (.Lmul4x_enter); squaring (ap == bp) with num % 8 == 0 takes
#             .Lsqr8x_enter; otherwise the scalar loop below runs.
#-----------------------------------------------------------------------
.globl	bn_mul_mont
.hidden bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
.cfi_startproc
	movl	%r9d,%r9d		# zero-extend num to 64 bits
	movq	%rsp,%rax		# remember caller %rsp for the epilogue
.cfi_def_cfa_register	%rax
	testl	$3,%r9d
	jnz	.Lmul_enter		# num % 4 != 0 -> scalar path
	cmpl	$8,%r9d
	jb	.Lmul_enter		# num < 8 -> scalar path
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d	# capability word for 4x path
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter
	testl	$7,%r9d
	jz	.Lsqr8x_enter		# squaring, num % 8 == 0
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	# Carve out num+2 qwords of scratch below %rsp, 1024-byte aligned.
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	# Touch every page between old and new %rsp so the guard page is
	# grown one page at a time (stack probing).
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	# stash caller %rsp above the tp[] vector
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:
	# Register roles: %r12=bp, %rbx=bp[i], %r8=n0, %rbp=m (= tp[0]*n0),
	# %r14=i (outer), %r15=j (inner), (%rsp)=tp[].
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	# First outer iteration: tp[] = ap[]*bp[0] + m*np[] (reduction fused).
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		# m = tp[0] * n0 mod 2^64
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10		# discard tp[0] (known to become 0)
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx			# ap[j] * bp[0]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp			# np[j] * m
	cmpq	%r9,%r15
	jne	.L1st

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)	# top carry word tp[num]

	leaq	1(%r14),%r14
	jmp	.Louter
.align	16
.Louter:
	# Outer loop: tp[] = (tp[] + ap[]*bp[i] + m*np[]) / 2^64, for each i.
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m = (tp[0] + ap[0]*bp[i]) * n0
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx			# ap[j] * bp[i]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp			# np[j] * m
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13		# fold in previous top carry
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	# Final subtraction: rp[] = tp[] - np[] (borrow decides which to keep).
	xorq	%r14,%r14
	movq	(%rsp),%rax
	movq	%r9,%r15

.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)	# rp[i] = tp[i] - np[i], tentatively
	movq	8(%rsp,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax			# borrow out of top word
	movq	$-1,%rbx
	xorq	%rax,%rbx		# rbx = ~mask, rax = mask
	xorq	%r14,%r14
	movq	%r9,%r15

.Lcopy:
	# Constant-time select between tp[] and the subtracted rp[],
	# wiping tp[] as we go (no secret-dependent branches).
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r9,(%rsp,%r14,8)	# zap tp[i] with public value
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi	# recover caller %rsp
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		# rep ret (avoids branch-target penalty)
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
#-----------------------------------------------------------------------
# bn_mul4x_mont: Montgomery multiplication, 4 limbs per inner iteration.
# Reached via .Lmul4x_enter from bn_mul_mont with %r11d holding
# OPENSSL_ia32cap_P[2]; if the $0x80100 capability bits are all set
# (BMI2+ADX per the CRYPTOGAMS convention -- NOTE(review): confirm),
# control transfers to the MULX implementation instead.
# Same register contract as bn_mul_mont; %rdi is spilled and reused as a
# scratch accumulator inside the loops.
#-----------------------------------------------------------------------
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
.cfi_startproc
	movl	%r9d,%r9d		# zero-extend num
	movq	%rsp,%rax		# remember caller %rsp
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	andl	$0x80100,%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx4x_enter		# both capability bits set -> MULX path
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	# Scratch: num+4 qwords below %rsp, 1024-byte aligned.
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	# Stack probing, one page at a time.
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	# saved caller %rsp
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul4x_body:
	movq	%rdi,16(%rsp,%r9,8)	# spill rp; %rdi becomes scratch
	movq	%rdx,%r12		# %r12 = bp
	movq	(%r8),%r8		# %r8 = n0
	movq	(%r12),%rbx		# %rbx = bp[0]
	movq	(%rsi),%rax

	xorq	%r14,%r14		# i
	xorq	%r15,%r15		# j

	# First outer iteration, head (limbs 0..1):
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		# m = tp[0]*n0
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:
	# 4-limb unrolled: tp[j..j+3] = ap[j..j+3]*bp[0] + np[j..j+3]*m.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	# Tail: last two limbs of the first outer iteration.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	# top carry tp[num]

	leaq	1(%r14),%r14
.align	4
.Louter4x:
	# Outer loop over bp[i], same 4x structure but accumulating into tp[].
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	(%rsp),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m = (tp[0]+ap[0]*bp[i])*n0
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	# Tail of the outer iteration.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14		# i++
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13	# fold in previous top carry
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jb	.Louter4x
	# Final subtraction rp[] = tp[] - np[], 4 limbs per iteration.
	movq	16(%rsp,%r9,8),%rdi	# restore rp
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdx
	shrq	$2,%r15			# loop count = num/4 - 1
	leaq	(%rsp),%rsi
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx

.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	sbbq	$0,%rax			# %rax = borrow mask (0 or -1)
	movq	%rbp,24(%rdi,%r14,8)
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,224		# movq %rax,%xmm4 (REX-encoded; assembler-version workaround)
	pcmpeqd	%xmm5,%xmm5
	pshufd	$0,%xmm4,%xmm4		# broadcast mask
	movq	%r9,%r15
	pxor	%xmm4,%xmm5		# %xmm5 = ~mask
	shrq	$2,%r15
	xorl	%eax,%eax

	jmp	.Lcopy4x
.align	16
.Lcopy4x:
	# Constant-time select between tp[] and subtracted rp[], two
	# 16-byte lanes per iteration, wiping tp[] behind us.
	movdqa	(%rsp,%rax,1),%xmm1
	movdqu	(%rdi,%rax,1),%xmm2
	pand	%xmm4,%xmm1
	pand	%xmm5,%xmm2
	movdqa	16(%rsp,%rax,1),%xmm3
	movdqa	%xmm0,(%rsp,%rax,1)
	por	%xmm2,%xmm1
	movdqu	16(%rdi,%rax,1),%xmm2
	movdqu	%xmm1,(%rdi,%rax,1)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax,1)
	por	%xmm2,%xmm3
	movdqu	%xmm3,16(%rdi,%rax,1)
	leaq	32(%rax),%rax
	decq	%r15
	jnz	.Lcopy4x
	movq	8(%rsp,%r9,8),%rsi	# recover caller %rsp
.cfi_def_cfa	%rsi, 8
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
# Squaring back-ends, defined in the companion x86_64-mont5 module.
.extern	bn_sqrx8x_internal
.hidden bn_sqrx8x_internal
.extern	bn_sqr8x_internal
.hidden bn_sqr8x_internal

#-----------------------------------------------------------------------
# bn_sqr8x_mont: Montgomery squaring (ap == bp, num % 8 == 0, num >= 8).
# Reached via .Lsqr8x_enter from bn_mul_mont.  Allocates a 2*num-qword,
# 64-byte-aligned scratch frame, calls bn_sqr8x_internal (or the MULX
# variant bn_sqrx8x_internal when the $0x80100 capability bits are set),
# then performs the final subtraction and constant-time copy-back.
#-----------------------------------------------------------------------
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
.cfi_startproc
	movq	%rsp,%rax		# remember caller %rsp
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lsqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d			# num in bytes
	shlq	$3+2,%r10		# 4*num in bytes
	negq	%r9

	# Choose a frame that does not alias ap modulo 4096 (avoids cache
	# bank/alias conflicts between the stack and the input).
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8		# n0
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lsqr8x_sp_done:
	andq	$-64,%rbp
	# Stack probing down to the chosen frame.
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)		# save n0
	movq	%rax,40(%rsp)		# save caller %rsp
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lsqr8x_body:

.byte	102,72,15,110,209		# movq %rcx,%xmm2 (np, REX-encoded via .byte)
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		# movq %rdi,%xmm1 (rp)
.byte	102,73,15,110,218		# movq %r10,%xmm3 (num in bytes)
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
	andl	$0x80100,%eax
	cmpl	$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# MULX/ADX variant

	# MULX back-end returns with %rcx = -num*8 and %r8 = end of result.
	leaq	(%r8,%rcx,1),%rbx
	movq	%rcx,%r9
	movq	%rcx,%rdx
.byte	102,72,15,126,207		# movq %xmm1,%rdi (restore rp)
	sarq	$3+2,%rcx		# iteration count for .Lsqr8x_sub
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
	call	bn_sqr8x_internal	# classic mul/adc variant

	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207		# movq %xmm1,%rdi (restore rp)
	sarq	$3+2,%rcx
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	# rp[] = result[] - np[], 4 limbs per iteration; borrow kept in CF.
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	.Lsqr8x_sub

	sbbq	$0,%rax			# %rax = borrow mask
	leaq	(%rbx,%r9,1),%rbx	# rewind pointers
	leaq	(%rdi,%r9,1),%rdi

.byte	102,72,15,110,200		# movq %rax,%xmm1 (mask)
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		# broadcast mask
	movq	40(%rsp),%rsi		# recover caller %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:
	# Constant-time select between raw result and subtracted copy,
	# wiping the scratch frame as we go.
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)		# zap lower half of frame
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)	# zap upper half of frame
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0		# %xmm0 = ~mask (all-0 or all-1)
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	.Lsqr8x_cond_copy

	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
#-----------------------------------------------------------------------
# bn_mulx4x_mont: Montgomery multiplication using BMI2 MULX plus ADX
# ADCX/ADOX dual carry chains.  Reached via .Lmulx4x_enter from
# bn_mul4x_mont when the $0x80100 capability bits are set.
# Frame layout (established below):
#   0(%rsp)=num*8  8(%rsp)=&bp[i]  16(%rsp)=end of bp  24(%rsp)=n0
#   32(%rsp)=rp    40(%rsp)=caller %rsp  48(%rsp)=inner loop count
#   64(%rsp)...    = tp[] accumulator
#-----------------------------------------------------------------------
.type	bn_mulx4x_mont,@function
.align	32
bn_mulx4x_mont:
.cfi_startproc
	movq	%rsp,%rax		# remember caller %rsp
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

	shll	$3,%r9d			# num in bytes
	xorq	%r10,%r10
	subq	%r9,%r10		# -num*8
	movq	(%r8),%r8		# n0
	leaq	-72(%rsp,%r10,1),%rbp	# frame: num*8 + 72 bytes, 128-aligned
	andq	$-128,%rbp
	# Stack probing down to the frame.
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	leaq	(%rdx,%r9,1),%r10	# end of bp[]

	movq	%r9,0(%rsp)
	shrq	$5,%r9
	movq	%r10,16(%rsp)
	subq	$1,%r9			# inner loop count = num/4 - 1
	movq	%r8,24(%rsp)
	movq	%rdi,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
	movq	%r9,48(%rsp)
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
	# First outer iteration head.  %r9 caches bp[0] (multiplier for
	# mulx); %r8 becomes m = lo64 * n0; %rbp is the constant 0 used to
	# terminate the ADCX/ADOX carry chains.
	leaq	8(%rdx),%rdi
	movq	(%rdx),%rdx		# %rdx = bp[0] (implicit mulx operand)
	leaq	64+32(%rsp),%rbx	# %rbx = &tp[4]
	movq	%rdx,%r9

	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r14
	addq	%rax,%r11
	movq	%rdi,8(%rsp)
	mulxq	16(%rsi),%r12,%r13
	adcq	%r14,%r12
	adcq	$0,%r13

	movq	%r8,%rdi
	imulq	24(%rsp),%r8		# m = tp[0]*n0
	xorq	%rbp,%rbp		# clear CF/OF and zero %rbp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx		# switch multiplier to m
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%rdi		# tp[0] becomes 0, carry kept
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulx 16(%rcx),%rax,%r12 (raw VEX encoding)
	movq	48(%rsp),%rdi		# inner loop counter
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# back to bp[0]
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	# tp[] = ap[]*bp[0] + m*np[], 4 limbs per iteration, two
	# interleaved carry chains (ADCX=CF, ADOX=OF).
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67			# address-size padding (alignment hint)
	movq	%r8,%rdx		# switch to m
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# back to bp[0]
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	0(%rsp),%rax		# num*8
	movq	8(%rsp),%rdi		# &bp[1]
	adcq	%rbp,%r15
	addq	%r15,%r14
	sbbq	%r15,%r15		# top-of-column carry as mask
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	# Outer loop over bp[i]: tp[] = (tp[] + ap[]*bp[i] + m*np[]) / 2^64.
	movq	(%rdi),%rdx		# bp[i]
	leaq	8(%rdi),%rdi
	subq	%rax,%rsi		# rewind ap
	movq	%r15,(%rbx)		# stash top carry
	leaq	64+32(%rsp),%rbx
	subq	%rax,%rcx		# rewind np

	mulxq	0(%rsi),%r8,%r11
	xorl	%ebp,%ebp		# clear CF/OF, zero %rbp
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	-16(%rbx),%r12
	adcxq	%rbp,%r13
	adoxq	%rbp,%r13

	movq	%rdi,8(%rsp)
	movq	%r8,%r15
	imulq	24(%rsp),%r8		# m = tp[0]*n0
	xorl	%ebp,%ebp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx		# switch to m
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15		# tp[0] becomes 0
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# back to bp[i]
	movq	%r11,-24(%rbx)
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	movq	48(%rsp),%rdi		# inner loop counter
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx		# switch to m
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# back to bp[i]
	movq	%r11,-32(%rbx)
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0(%rsp),%rax		# num*8
	movq	8(%rsp),%rdi		# &bp[i+1]
	adcq	%rbp,%r15
	subq	0(%rbx),%rbp		# pull previous top carry into CF
	adcq	%r15,%r14
	sbbq	%r15,%r15		# new top carry as mask
	movq	%r14,-8(%rbx)

	cmpq	16(%rsp),%rdi
	jne	.Lmulx4x_outer

	# Final subtraction rp[] = tp[] - np[].
	leaq	64(%rsp),%rbx
	subq	%rax,%rcx		# rewind np
	negq	%r15			# CF = top carry
	movq	%rax,%rdx
	shrq	$3+2,%rax		# iterations = num/4
	movq	32(%rsp),%rdi		# rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax
	jnz	.Lmulx4x_sub

	sbbq	$0,%r15			# %r15 = borrow mask
	leaq	64(%rsp),%rbx
	subq	%rdx,%rdi		# rewind rp

.byte	102,73,15,110,207		# movq %r15,%xmm1 (mask; REX-encoded via .byte)
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		# broadcast mask
	movq	40(%rsp),%rsi		# recover caller %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:
	# Constant-time select between tp[] and subtracted rp[], wiping tp[].
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0		# %xmm0 = ~mask
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx
	jnz	.Lmulx4x_cond_copy

	movq	%rdx,(%rbx)		# %rdx == 0 here: zap last tp word

	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
# ASCII: "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>\0"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16
#endif
# Mark the stack non-executable (GNU toolchains).
.section	.note.GNU-stack,"",@progbits