• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# This file is generated from a similarly-named Perl script in the BoringSSL
2# source tree. Do not edit by hand.
3
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
11#if defined(BORINGSSL_PREFIX)
12#include <boringssl_prefix_symbols_asm.h>
13#endif
14.text
15
16
17
# _bn_mul_mont -- exported entry point plus the generic one-word-per-step path.
# The ident string at the end of this file calls the code Montgomery
# Multiplication for x86_64.
#
# Registers as used below:
#   %rdi  destination word array (written in L$sub and L$copy)
#   %rsi  first source word array
#   %rdx  second source word array; one word of it is consumed per outer pass
#   %rcx  word array that is multiplied by the per-pass factor in %rbp and
#         subtracted from the result in L$sub (presumably the modulus --
#         TODO confirm against the C prototype)
#   %r8   pointer to a single 64-bit word, loaded once at L$mul_body
#         (presumably the n0 constant -- TODO confirm)
#   %r9d  word count
# Result: %rax = 1. Callee-saved registers pushed at L$mul_enter are reloaded
# through the saved entry stack pointer before returning.
18.globl	_bn_mul_mont
19.private_extern _bn_mul_mont
20
21.p2align	4
22_bn_mul_mont:
23
# Zero-extend the 32-bit word count and remember the entry stack pointer.
24	movl	%r9d,%r9d
25	movq	%rsp,%rax
26
# Counts that are not a multiple of four, or that are below eight, stay on
# the generic path at L$mul_enter.
27	testl	$3,%r9d
28	jnz	L$mul_enter
29	cmpl	$8,%r9d
30	jb	L$mul_enter
# Fetch the 32-bit word at byte offset 8 of _OPENSSL_ia32cap_P into %r11d;
# L$mul4x_enter tests it to choose between the mulq and mulx variants.
31	leaq	_OPENSSL_ia32cap_P(%rip),%r11
32	movl	8(%r11),%r11d
# Different source pointers go to the four-word multiply. Equal pointers use
# the squaring path when the count is also a multiple of eight, otherwise
# the four-word multiply as well.
33	cmpq	%rsi,%rdx
34	jne	L$mul4x_enter
35	testl	$7,%r9d
36	jz	L$sqr8x_enter
37	jmp	L$mul4x_enter
38
39.p2align	4
40L$mul_enter:
# Generic path: save the callee-saved registers this routine uses.
41	pushq	%rbx
42
43	pushq	%rbp
44
45	pushq	%r12
46
47	pushq	%r13
48
49	pushq	%r14
50
51	pushq	%r15
52
53
# Compute %r10 = %rsp - 16 - 8*count, rounded down to a multiple of 1024.
# This becomes the base of the on-stack scratch array. %r9 is negated only
# for the address computation and restored right after.
54	negq	%r9
55	movq	%rsp,%r11
56	leaq	-16(%rsp,%r9,8),%r10
57	negq	%r9
58	andq	$-1024,%r10
59
60
61
62
63
64
65
66
67
# Move %rsp down to %r10 in steps of at most 4096 bytes, reading one word at
# each step so that every page of the new frame is touched in descending
# order.
68	subq	%r10,%r11
69	andq	$-4096,%r11
70	leaq	(%r10,%r11,1),%rsp
71	movq	(%rsp),%r11
72	cmpq	%r10,%rsp
73	ja	L$mul_page_walk
74	jmp	L$mul_page_walk_done
75
76.p2align	4
77L$mul_page_walk:
78	leaq	-4096(%rsp),%rsp
79	movq	(%rsp),%r11
80	cmpq	%r10,%rsp
81	ja	L$mul_page_walk
82L$mul_page_walk_done:
83
# Keep the entry stack pointer in the slot just above scratch[count].
84	movq	%rax,8(%rsp,%r9,8)
85
86L$mul_body:
# Set up: %r12 = second source array, %r8 = the word %r8 pointed to,
# %rbx = first word of the second source, %rax = first word of the first
# source.
87	movq	%rdx,%r12
88	movq	(%r8),%r8
89	movq	(%r12),%rbx
90	movq	(%rsi),%rax
91
# Outer index %r14 and inner index %r15 both start at zero.
92	xorq	%r14,%r14
93	xorq	%r15,%r15
94
# First product word goes to %r10; %rbp = low 64 bits of (%r10 * %r8). That
# factor multiplies every word of the %rcx array during this pass.
95	movq	%r8,%rbp
96	mulq	%rbx
97	movq	%rax,%r10
98	movq	(%rcx),%rax
99
100	imulq	%r10,%rbp
101	movq	%rdx,%r11
102
# The low word of the running sum is not stored; only its carry is kept
# (moved to %r13).
103	mulq	%rbp
104	addq	%rax,%r10
105	movq	8(%rsi),%rax
106	adcq	$0,%rdx
107	movq	%rdx,%r13
108
109	leaq	1(%r15),%r15
110	jmp	L$1st_enter
111
112.p2align	4
# First pass inner loop: each iteration finishes one word and stores it to
# the scratch array, two slots behind the index in %r15.
113L$1st:
114	addq	%rax,%r13
115	movq	(%rsi,%r15,8),%rax
116	adcq	$0,%rdx
117	addq	%r11,%r13
118	movq	%r10,%r11
119	adcq	$0,%rdx
120	movq	%r13,-16(%rsp,%r15,8)
121	movq	%rdx,%r13
122
123L$1st_enter:
124	mulq	%rbx
125	addq	%rax,%r11
126	movq	(%rcx,%r15,8),%rax
127	adcq	$0,%rdx
128	leaq	1(%r15),%r15
129	movq	%rdx,%r10
130
131	mulq	%rbp
132	cmpq	%r9,%r15
133	jne	L$1st
134
# Loop tail: store the last two result words of the pass and put the final
# carry in scratch[count]. %rax is reloaded with the first word of the first
# source for the next pass.
135	addq	%rax,%r13
136	movq	(%rsi),%rax
137	adcq	$0,%rdx
138	addq	%r11,%r13
139	adcq	$0,%rdx
140	movq	%r13,-16(%rsp,%r15,8)
141	movq	%rdx,%r13
142	movq	%r10,%r11
143
144	xorq	%rdx,%rdx
145	addq	%r11,%r13
146	adcq	$0,%rdx
147	movq	%r13,-8(%rsp,%r9,8)
148	movq	%rdx,(%rsp,%r9,8)
149
150	leaq	1(%r14),%r14
151	jmp	L$outer
152.p2align	4
# Outer loop: one pass per remaining word of the second source (index %r14).
# Each pass adds first_source * %rbx and rcx_array * %rbp on top of the
# scratch array. As in the first pass, the low word of the sum is dropped
# (%r10 is overwritten with scratch[1] right after the carry is taken).
153L$outer:
154	movq	(%r12,%r14,8),%rbx
155	xorq	%r15,%r15
156	movq	%r8,%rbp
157	movq	(%rsp),%r10
158	mulq	%rbx
159	addq	%rax,%r10
160	movq	(%rcx),%rax
161	adcq	$0,%rdx
162
163	imulq	%r10,%rbp
164	movq	%rdx,%r11
165
166	mulq	%rbp
167	addq	%rax,%r10
168	movq	8(%rsi),%rax
169	adcq	$0,%rdx
170	movq	8(%rsp),%r10
171	movq	%rdx,%r13
172
173	leaq	1(%r15),%r15
174	jmp	L$inner_enter
175
176.p2align	4
# Inner loop of a later pass: same shape as L$1st, with the previous scratch
# word (%r10) added in as well.
177L$inner:
178	addq	%rax,%r13
179	movq	(%rsi,%r15,8),%rax
180	adcq	$0,%rdx
181	addq	%r10,%r13
182	movq	(%rsp,%r15,8),%r10
183	adcq	$0,%rdx
184	movq	%r13,-16(%rsp,%r15,8)
185	movq	%rdx,%r13
186
187L$inner_enter:
188	mulq	%rbx
189	addq	%rax,%r11
190	movq	(%rcx,%r15,8),%rax
191	adcq	$0,%rdx
192	addq	%r11,%r10
193	movq	%rdx,%r11
194	adcq	$0,%r11
195	leaq	1(%r15),%r15
196
197	mulq	%rbp
198	cmpq	%r9,%r15
199	jne	L$inner
200
# Pass tail: finish the last words, adding in the old scratch[count] word
# (loaded into %r10), and store the new carry to scratch[count].
201	addq	%rax,%r13
202	movq	(%rsi),%rax
203	adcq	$0,%rdx
204	addq	%r10,%r13
205	movq	(%rsp,%r15,8),%r10
206	adcq	$0,%rdx
207	movq	%r13,-16(%rsp,%r15,8)
208	movq	%rdx,%r13
209
210	xorq	%rdx,%rdx
211	addq	%r11,%r13
212	adcq	$0,%rdx
213	addq	%r10,%r13
214	adcq	$0,%rdx
215	movq	%r13,-8(%rsp,%r9,8)
216	movq	%rdx,(%rsp,%r9,8)
217
218	leaq	1(%r14),%r14
219	cmpq	%r9,%r14
220	jb	L$outer
221
# Final subtraction: destination[i] = scratch[i] - rcx_array[i] with borrow.
# The xorq clears CF before the loop; movq, leaq and decq leave CF alone, so
# the borrow carries from one iteration to the next.
222	xorq	%r14,%r14
223	movq	(%rsp),%rax
224	movq	%r9,%r15
225
226.p2align	4
227L$sub:	sbbq	(%rcx,%r14,8),%rax
228	movq	%rax,(%rdi,%r14,8)
229	movq	8(%rsp,%r14,8),%rax
230	leaq	1(%r14),%r14
231	decq	%r15
232	jnz	L$sub
233
# %rax now holds scratch[count]; fold the final borrow into it and use the
# result as a select mask, with %rbx as its bitwise complement.
234	sbbq	$0,%rax
235	movq	$-1,%rbx
236	xorq	%rax,%rbx
237	xorq	%r14,%r14
238	movq	%r9,%r15
239
# Branch-free select, one word per iteration:
#   destination[i] = (destination[i] AND %rbx) OR (scratch[i] AND %rax)
# Each scratch word is overwritten with the value of %r9 (the word count)
# after it has been read, so the intermediate result does not stay on the
# stack.
240L$copy:
241	movq	(%rdi,%r14,8),%rcx
242	movq	(%rsp,%r14,8),%rdx
243	andq	%rbx,%rcx
244	andq	%rax,%rdx
245	movq	%r9,(%rsp,%r14,8)
246	orq	%rcx,%rdx
247	movq	%rdx,(%rdi,%r14,8)
248	leaq	1(%r14),%r14
249	subq	$1,%r15
250	jnz	L$copy
251
# Reload the entry stack pointer saved before L$mul_body, set the return
# value to 1, restore the callee-saved registers from below it and switch
# back to the entry stack.
252	movq	8(%rsp,%r9,8),%rsi
253
254	movq	$1,%rax
255	movq	-48(%rsi),%r15
256
257	movq	-40(%rsi),%r14
258
259	movq	-32(%rsi),%r13
260
261	movq	-24(%rsi),%r12
262
263	movq	-16(%rsi),%rbp
264
265	movq	-8(%rsi),%rbx
266
267	leaq	(%rsi),%rsp
268
269L$mul_epilogue:
# Bytes f3 c3 encode a REP-prefixed RET.
270	.byte	0xf3,0xc3
271
272
273
# bn_mul4x_mont -- multiply path that handles four words per loop iteration.
# The label is file-local (no .globl directive). In this file the code is
# reached through L$mul4x_enter from _bn_mul_mont, which has already put the
# entry stack pointer in %rax and a capability word in %r11d.
# NOTE(review): the two instructions at the bn_mul4x_mont label itself do not
# load %r11d, so an entry at that label would test whatever %r11d happens to
# hold -- confirm nothing enters there.
#
# Registers as used below:
#   %rdi  destination word array (saved to the frame, then reused as scratch)
#   %rsi  first source word array
#   %rdx  second source word array; one word is consumed per outer pass
#   %rcx  word array multiplied by the per-pass factor and subtracted at the
#         end (presumably the modulus -- TODO confirm)
#   %r8   pointer to a single 64-bit word, loaded once
#   %r9   word count; when reached from _bn_mul_mont it is a multiple of four
#         and at least eight
# Result: %rax = 1.
274.p2align	4
275bn_mul4x_mont:
276
277	movl	%r9d,%r9d
278	movq	%rsp,%rax
279
280L$mul4x_enter:
# Take the mulx variant when both bits of mask 0x80100 are set in the
# capability word (presumably the BMI2 and ADX feature bits, matching the
# mulx/adcx/adox instructions used there -- TODO confirm against the
# OPENSSL_ia32cap_P layout).
281	andl	$0x80100,%r11d
282	cmpl	$0x80100,%r11d
283	je	L$mulx4x_enter
# Save the callee-saved registers this routine uses.
284	pushq	%rbx
285
286	pushq	%rbp
287
288	pushq	%r12
289
290	pushq	%r13
291
292	pushq	%r14
293
294	pushq	%r15
295
296
# Compute %r10 = %rsp - 32 - 8*count, rounded down to a multiple of 1024;
# this becomes the base of the on-stack scratch array.
297	negq	%r9
298	movq	%rsp,%r11
299	leaq	-32(%rsp,%r9,8),%r10
300	negq	%r9
301	andq	$-1024,%r10
302
# Move %rsp down to %r10 in steps of at most 4096 bytes, reading one word at
# each step so that every page of the new frame is touched in descending
# order.
303	subq	%r10,%r11
304	andq	$-4096,%r11
305	leaq	(%r10,%r11,1),%rsp
306	movq	(%rsp),%r11
307	cmpq	%r10,%rsp
308	ja	L$mul4x_page_walk
309	jmp	L$mul4x_page_walk_done
310
311L$mul4x_page_walk:
312	leaq	-4096(%rsp),%rsp
313	movq	(%rsp),%r11
314	cmpq	%r10,%rsp
315	ja	L$mul4x_page_walk
316L$mul4x_page_walk_done:
317
# Keep the entry stack pointer in the slot just above scratch[count].
318	movq	%rax,8(%rsp,%r9,8)
319
320L$mul4x_body:
# The destination pointer is parked one slot higher because %rdi is used as
# an accumulator from here on.
321	movq	%rdi,16(%rsp,%r9,8)
322	movq	%rdx,%r12
323	movq	(%r8),%r8
324	movq	(%r12),%rbx
325	movq	(%rsi),%rax
326
# Outer index %r14 and inner index %r15 both start at zero.
327	xorq	%r14,%r14
328	xorq	%r15,%r15
329
# First product word goes to %r10; %rbp = low 64 bits of (%r10 * %r8). That
# factor multiplies every word of the %rcx array during this pass. The low
# word of the running sum is dropped and only its carry is kept.
330	movq	%r8,%rbp
331	mulq	%rbx
332	movq	%rax,%r10
333	movq	(%rcx),%rax
334
335	imulq	%r10,%rbp
336	movq	%rdx,%r11
337
338	mulq	%rbp
339	addq	%rax,%r10
340	movq	8(%rsi),%rax
341	adcq	$0,%rdx
342	movq	%rdx,%rdi
343
344	mulq	%rbx
345	addq	%rax,%r11
346	movq	8(%rcx),%rax
347	adcq	$0,%rdx
348	movq	%rdx,%r10
349
350	mulq	%rbp
351	addq	%rax,%rdi
352	movq	16(%rsi),%rax
353	adcq	$0,%rdx
354	addq	%r11,%rdi
355	leaq	4(%r15),%r15
356	adcq	$0,%rdx
357	movq	%rdi,(%rsp)
358	movq	%rdx,%r13
359	jmp	L$1st4x
360.p2align	4
# First pass, unrolled: every iteration produces four scratch words and
# advances the index %r15 by four. %rdi and %r13 alternate as accumulators,
# %r10 and %r11 alternate as carries.
361L$1st4x:
362	mulq	%rbx
363	addq	%rax,%r10
364	movq	-16(%rcx,%r15,8),%rax
365	adcq	$0,%rdx
366	movq	%rdx,%r11
367
368	mulq	%rbp
369	addq	%rax,%r13
370	movq	-8(%rsi,%r15,8),%rax
371	adcq	$0,%rdx
372	addq	%r10,%r13
373	adcq	$0,%rdx
374	movq	%r13,-24(%rsp,%r15,8)
375	movq	%rdx,%rdi
376
377	mulq	%rbx
378	addq	%rax,%r11
379	movq	-8(%rcx,%r15,8),%rax
380	adcq	$0,%rdx
381	movq	%rdx,%r10
382
383	mulq	%rbp
384	addq	%rax,%rdi
385	movq	(%rsi,%r15,8),%rax
386	adcq	$0,%rdx
387	addq	%r11,%rdi
388	adcq	$0,%rdx
389	movq	%rdi,-16(%rsp,%r15,8)
390	movq	%rdx,%r13
391
392	mulq	%rbx
393	addq	%rax,%r10
394	movq	(%rcx,%r15,8),%rax
395	adcq	$0,%rdx
396	movq	%rdx,%r11
397
398	mulq	%rbp
399	addq	%rax,%r13
400	movq	8(%rsi,%r15,8),%rax
401	adcq	$0,%rdx
402	addq	%r10,%r13
403	adcq	$0,%rdx
404	movq	%r13,-8(%rsp,%r15,8)
405	movq	%rdx,%rdi
406
407	mulq	%rbx
408	addq	%rax,%r11
409	movq	8(%rcx,%r15,8),%rax
410	adcq	$0,%rdx
411	leaq	4(%r15),%r15
412	movq	%rdx,%r10
413
414	mulq	%rbp
415	addq	%rax,%rdi
416	movq	-16(%rsi,%r15,8),%rax
417	adcq	$0,%rdx
418	addq	%r11,%rdi
419	adcq	$0,%rdx
420	movq	%rdi,-32(%rsp,%r15,8)
421	movq	%rdx,%r13
422	cmpq	%r9,%r15
423	jb	L$1st4x
424
# Tail of the first pass: the remaining two words, then the final carry is
# written to scratch[count]. %rax is reloaded with the first word of the
# first source for the next pass.
425	mulq	%rbx
426	addq	%rax,%r10
427	movq	-16(%rcx,%r15,8),%rax
428	adcq	$0,%rdx
429	movq	%rdx,%r11
430
431	mulq	%rbp
432	addq	%rax,%r13
433	movq	-8(%rsi,%r15,8),%rax
434	adcq	$0,%rdx
435	addq	%r10,%r13
436	adcq	$0,%rdx
437	movq	%r13,-24(%rsp,%r15,8)
438	movq	%rdx,%rdi
439
440	mulq	%rbx
441	addq	%rax,%r11
442	movq	-8(%rcx,%r15,8),%rax
443	adcq	$0,%rdx
444	movq	%rdx,%r10
445
446	mulq	%rbp
447	addq	%rax,%rdi
448	movq	(%rsi),%rax
449	adcq	$0,%rdx
450	addq	%r11,%rdi
451	adcq	$0,%rdx
452	movq	%rdi,-16(%rsp,%r15,8)
453	movq	%rdx,%r13
454
455	xorq	%rdi,%rdi
456	addq	%r10,%r13
457	adcq	$0,%rdi
458	movq	%r13,-8(%rsp,%r15,8)
459	movq	%rdi,(%rsp,%r15,8)
460
461	leaq	1(%r14),%r14
462.p2align	2
# Outer loop: one pass per remaining word of the second source (index %r14).
# Same structure as the first pass, with the previous contents of the
# scratch array added in.
463L$outer4x:
464	movq	(%r12,%r14,8),%rbx
465	xorq	%r15,%r15
466	movq	(%rsp),%r10
467	movq	%r8,%rbp
468	mulq	%rbx
469	addq	%rax,%r10
470	movq	(%rcx),%rax
471	adcq	$0,%rdx
472
473	imulq	%r10,%rbp
474	movq	%rdx,%r11
475
476	mulq	%rbp
477	addq	%rax,%r10
478	movq	8(%rsi),%rax
479	adcq	$0,%rdx
480	movq	%rdx,%rdi
481
482	mulq	%rbx
483	addq	%rax,%r11
484	movq	8(%rcx),%rax
485	adcq	$0,%rdx
486	addq	8(%rsp),%r11
487	adcq	$0,%rdx
488	movq	%rdx,%r10
489
490	mulq	%rbp
491	addq	%rax,%rdi
492	movq	16(%rsi),%rax
493	adcq	$0,%rdx
494	addq	%r11,%rdi
495	leaq	4(%r15),%r15
496	adcq	$0,%rdx
497	movq	%rdi,(%rsp)
498	movq	%rdx,%r13
499	jmp	L$inner4x
500.p2align	4
# Inner loop of a later pass, four words per iteration. Each step adds the
# old scratch word before the new one is stored one slot lower.
501L$inner4x:
502	mulq	%rbx
503	addq	%rax,%r10
504	movq	-16(%rcx,%r15,8),%rax
505	adcq	$0,%rdx
506	addq	-16(%rsp,%r15,8),%r10
507	adcq	$0,%rdx
508	movq	%rdx,%r11
509
510	mulq	%rbp
511	addq	%rax,%r13
512	movq	-8(%rsi,%r15,8),%rax
513	adcq	$0,%rdx
514	addq	%r10,%r13
515	adcq	$0,%rdx
516	movq	%r13,-24(%rsp,%r15,8)
517	movq	%rdx,%rdi
518
519	mulq	%rbx
520	addq	%rax,%r11
521	movq	-8(%rcx,%r15,8),%rax
522	adcq	$0,%rdx
523	addq	-8(%rsp,%r15,8),%r11
524	adcq	$0,%rdx
525	movq	%rdx,%r10
526
527	mulq	%rbp
528	addq	%rax,%rdi
529	movq	(%rsi,%r15,8),%rax
530	adcq	$0,%rdx
531	addq	%r11,%rdi
532	adcq	$0,%rdx
533	movq	%rdi,-16(%rsp,%r15,8)
534	movq	%rdx,%r13
535
536	mulq	%rbx
537	addq	%rax,%r10
538	movq	(%rcx,%r15,8),%rax
539	adcq	$0,%rdx
540	addq	(%rsp,%r15,8),%r10
541	adcq	$0,%rdx
542	movq	%rdx,%r11
543
544	mulq	%rbp
545	addq	%rax,%r13
546	movq	8(%rsi,%r15,8),%rax
547	adcq	$0,%rdx
548	addq	%r10,%r13
549	adcq	$0,%rdx
550	movq	%r13,-8(%rsp,%r15,8)
551	movq	%rdx,%rdi
552
553	mulq	%rbx
554	addq	%rax,%r11
555	movq	8(%rcx,%r15,8),%rax
556	adcq	$0,%rdx
557	addq	8(%rsp,%r15,8),%r11
558	adcq	$0,%rdx
559	leaq	4(%r15),%r15
560	movq	%rdx,%r10
561
562	mulq	%rbp
563	addq	%rax,%rdi
564	movq	-16(%rsi,%r15,8),%rax
565	adcq	$0,%rdx
566	addq	%r11,%rdi
567	adcq	$0,%rdx
568	movq	%rdi,-32(%rsp,%r15,8)
569	movq	%rdx,%r13
570	cmpq	%r9,%r15
571	jb	L$inner4x
572
# Pass tail: the remaining two words. The outer index %r14 is advanced here,
# and the old scratch[count] word is added before the new carry is stored.
573	mulq	%rbx
574	addq	%rax,%r10
575	movq	-16(%rcx,%r15,8),%rax
576	adcq	$0,%rdx
577	addq	-16(%rsp,%r15,8),%r10
578	adcq	$0,%rdx
579	movq	%rdx,%r11
580
581	mulq	%rbp
582	addq	%rax,%r13
583	movq	-8(%rsi,%r15,8),%rax
584	adcq	$0,%rdx
585	addq	%r10,%r13
586	adcq	$0,%rdx
587	movq	%r13,-24(%rsp,%r15,8)
588	movq	%rdx,%rdi
589
590	mulq	%rbx
591	addq	%rax,%r11
592	movq	-8(%rcx,%r15,8),%rax
593	adcq	$0,%rdx
594	addq	-8(%rsp,%r15,8),%r11
595	adcq	$0,%rdx
596	leaq	1(%r14),%r14
597	movq	%rdx,%r10
598
599	mulq	%rbp
600	addq	%rax,%rdi
601	movq	(%rsi),%rax
602	adcq	$0,%rdx
603	addq	%r11,%rdi
604	adcq	$0,%rdx
605	movq	%rdi,-16(%rsp,%r15,8)
606	movq	%rdx,%r13
607
608	xorq	%rdi,%rdi
609	addq	%r10,%r13
610	adcq	$0,%rdi
611	addq	(%rsp,%r9,8),%r13
612	adcq	$0,%rdi
613	movq	%r13,-8(%rsp,%r15,8)
614	movq	%rdi,(%rsp,%r15,8)
615
616	cmpq	%r9,%r14
617	jb	L$outer4x
# All passes done. Reload the destination pointer parked at L$mul4x_body and
# subtract the %rcx array from the scratch array into the destination, four
# words per iteration. %r15 = (count - 4) / 4 loop iterations; the last four
# words are finished after the loop. The subq starts the borrow chain, and
# movq, leaq and decq leave CF alone so it carries through the loop.
618	movq	16(%rsp,%r9,8),%rdi
619	leaq	-4(%r9),%r15
620	movq	0(%rsp),%rax
621	movq	8(%rsp),%rdx
622	shrq	$2,%r15
623	leaq	(%rsp),%rsi
624	xorq	%r14,%r14
625
626	subq	0(%rcx),%rax
627	movq	16(%rsi),%rbx
628	movq	24(%rsi),%rbp
629	sbbq	8(%rcx),%rdx
630
631L$sub4x:
632	movq	%rax,0(%rdi,%r14,8)
633	movq	%rdx,8(%rdi,%r14,8)
634	sbbq	16(%rcx,%r14,8),%rbx
635	movq	32(%rsi,%r14,8),%rax
636	movq	40(%rsi,%r14,8),%rdx
637	sbbq	24(%rcx,%r14,8),%rbp
638	movq	%rbx,16(%rdi,%r14,8)
639	movq	%rbp,24(%rdi,%r14,8)
640	sbbq	32(%rcx,%r14,8),%rax
641	movq	48(%rsi,%r14,8),%rbx
642	movq	56(%rsi,%r14,8),%rbp
643	sbbq	40(%rcx,%r14,8),%rdx
644	leaq	4(%r14),%r14
645	decq	%r15
646	jnz	L$sub4x
647
# Finish the last four words; %rax is then loaded with the scratch word just
# above the result (scratch[count]).
648	movq	%rax,0(%rdi,%r14,8)
649	movq	32(%rsi,%r14,8),%rax
650	sbbq	16(%rcx,%r14,8),%rbx
651	movq	%rdx,8(%rdi,%r14,8)
652	sbbq	24(%rcx,%r14,8),%rbp
653	movq	%rbx,16(%rdi,%r14,8)
654
# Fold the final borrow into %rax and build a 128-bit select mask from it:
# the .byte sequence encodes movq %rax,%xmm4, pshufd broadcasts its low
# dword, and %xmm5 becomes the bitwise complement. %xmm0 is zero and is used
# to wipe the scratch array. %r15 = count / 4 is the iteration count, and
# %rax is cleared for reuse as a byte offset.
655	sbbq	$0,%rax
656	movq	%rbp,24(%rdi,%r14,8)
657	pxor	%xmm0,%xmm0
658.byte	102,72,15,110,224
659	pcmpeqd	%xmm5,%xmm5
660	pshufd	$0,%xmm4,%xmm4
661	movq	%r9,%r15
662	pxor	%xmm4,%xmm5
663	shrq	$2,%r15
664	xorl	%eax,%eax
665
666	jmp	L$copy4x
667.p2align	4
# Branch-free select, 32 bytes per iteration:
#   destination = (scratch AND %xmm4) OR (destination AND %xmm5)
# Each scratch block is overwritten with zeros once it has been read. The
# aligned movdqa accesses rely on %rsp being a multiple of 1024 (see the
# frame setup above); the destination is accessed with unaligned movdqu.
668L$copy4x:
669	movdqa	(%rsp,%rax,1),%xmm1
670	movdqu	(%rdi,%rax,1),%xmm2
671	pand	%xmm4,%xmm1
672	pand	%xmm5,%xmm2
673	movdqa	16(%rsp,%rax,1),%xmm3
674	movdqa	%xmm0,(%rsp,%rax,1)
675	por	%xmm2,%xmm1
676	movdqu	16(%rdi,%rax,1),%xmm2
677	movdqu	%xmm1,(%rdi,%rax,1)
678	pand	%xmm4,%xmm3
679	pand	%xmm5,%xmm2
680	movdqa	%xmm0,16(%rsp,%rax,1)
681	por	%xmm2,%xmm3
682	movdqu	%xmm3,16(%rdi,%rax,1)
683	leaq	32(%rax),%rax
684	decq	%r15
685	jnz	L$copy4x
# Reload the entry stack pointer, set the return value to 1, restore the
# callee-saved registers from below it and switch back to the entry stack.
686	movq	8(%rsp,%r9,8),%rsi
687
688	movq	$1,%rax
689	movq	-48(%rsi),%r15
690
691	movq	-40(%rsi),%r14
692
693	movq	-32(%rsi),%r13
694
695	movq	-24(%rsi),%r12
696
697	movq	-16(%rsi),%rbp
698
699	movq	-8(%rsi),%rbx
700
701	leaq	(%rsi),%rsp
702
703L$mul4x_epilogue:
# Bytes f3 c3 encode a REP-prefixed RET.
704	.byte	0xf3,0xc3
705
706
707
708
709
710
# bn_sqr8x_mont -- squaring path. _bn_mul_mont branches to L$sqr8x_enter when
# both source pointers are equal and the word count is a multiple of eight
# (and at least eight). The arithmetic itself is done by _bn_sqrx8x_internal
# or _bn_sqr8x_internal, which are not defined in the visible source; this
# routine builds the frame, calls one of them, then performs the final
# subtraction and the masked copy into the destination.
#
# Registers on entry as used below:
#   %rdi  destination word array (parked in %xmm1 across the helper call)
#   %rsi  source word array
#   %rcx  word array copied to %xmm2 for the helper (presumably the modulus
#         -- TODO confirm)
#   %r8   pointer to a single 64-bit word, loaded once
#   %r9d  word count
#   %rax  entry stack pointer (set by the caller path or at the label)
# Result: %rax = 1.
711.p2align	5
712bn_sqr8x_mont:
713
714	movq	%rsp,%rax
715
716L$sqr8x_enter:
# Save the callee-saved registers this routine uses.
717	pushq	%rbx
718
719	pushq	%rbp
720
721	pushq	%r12
722
723	pushq	%r13
724
725	pushq	%r14
726
727	pushq	%r15
728
729L$sqr8x_prologue:
730
# Derive sizes from the word count: %r10 = count * 32 and
# %r9 = -(count * 8), the negated byte length.
731	movl	%r9d,%r10d
732	shll	$3,%r9d
733	shlq	$3+2,%r10
734	negq	%r9
735
736
737
738
739
740
# Choose the frame base in %rbp. %r11 = (candidate frame address - %rsi)
# AND 4095, where the candidate is %rsp - 64 - 2 * byte length. The frame is
# then shifted depending on how %r11 compares with %r10.
# NOTE(review): the reason for this adjustment is not stated here;
# presumably it keeps the frame from aliasing the source array modulo 4096
# -- TODO confirm against the generating Perl script.
741	leaq	-64(%rsp,%r9,2),%r11
742	movq	%rsp,%rbp
743	movq	(%r8),%r8
744	subq	%rsi,%r11
745	andq	$4095,%r11
746	cmpq	%r11,%r10
747	jb	L$sqr8x_sp_alt
748	subq	%r11,%rbp
749	leaq	-64(%rbp,%r9,2),%rbp
750	jmp	L$sqr8x_sp_done
751
752.p2align	5
# Alternative placement: subtract (4096 - 64 - 2 * byte length) from %r11
# and clamp the result to zero when that subtraction borrows.
753L$sqr8x_sp_alt:
754	leaq	4096-64(,%r9,2),%r10
755	leaq	-64(%rbp,%r9,2),%rbp
756	subq	%r10,%r11
757	movq	$0,%r10
758	cmovcq	%r10,%r11
759	subq	%r11,%rbp
760L$sqr8x_sp_done:
# Round the frame base down to a multiple of 64, then move %rsp down to it
# in steps of at most 4096 bytes, reading one word at each step so that
# every page of the new frame is touched in descending order.
761	andq	$-64,%rbp
762	movq	%rsp,%r11
763	subq	%rbp,%r11
764	andq	$-4096,%r11
765	leaq	(%r11,%rbp,1),%rsp
766	movq	(%rsp),%r10
767	cmpq	%rbp,%rsp
768	ja	L$sqr8x_page_walk
769	jmp	L$sqr8x_page_walk_done
770
771.p2align	4
772L$sqr8x_page_walk:
773	leaq	-4096(%rsp),%rsp
774	movq	(%rsp),%r10
775	cmpq	%rbp,%rsp
776	ja	L$sqr8x_page_walk
777L$sqr8x_page_walk_done:
778
# After this, %r10 = negated byte length and %r9 = positive byte length.
779	movq	%r9,%r10
780	negq	%r9
781
# Frame slots: the word loaded from (%r8) at offset 32 and the entry stack
# pointer at offset 40.
782	movq	%r8,32(%rsp)
783	movq	%rax,40(%rsp)
784
785L$sqr8x_body:
786
# The .byte sequences below encode movq %rcx,%xmm2, movq %rdi,%xmm1 and
# movq %r10,%xmm3. %xmm1 is read back after the helper returns; %xmm2 and
# %xmm3 are presumably consumed by the helper -- TODO confirm. %xmm0 is
# cleared.
787.byte	102,72,15,110,209
788	pxor	%xmm0,%xmm0
789.byte	102,72,15,110,207
790.byte	102,73,15,110,218
# Pick the helper: when both bits of mask 0x80100 are set in the 32-bit word
# at byte offset 8 of _OPENSSL_ia32cap_P, call _bn_sqrx8x_internal, otherwise
# _bn_sqr8x_internal (presumably the bits are BMI2 and ADX -- TODO confirm).
791	leaq	_OPENSSL_ia32cap_P(%rip),%rax
792	movl	8(%rax),%eax
793	andl	$0x80100,%eax
794	cmpl	$0x80100,%eax
795	jne	L$sqr8x_nox
796
797	call	_bn_sqrx8x_internal
798
799
800
801
# Prepare the subtraction loop from the registers this helper returns:
# %rbx = %r8 + %rcx, %r9 = %rdx = %rcx, and %rcx is shifted right by 5
# (arithmetic) to become the loop counter. The .byte sequence encodes
# movq %xmm1,%rdi, restoring the destination pointer.
# NOTE(review): the meaning of the returned %r8 and %rcx is defined by the
# helper, which is not visible here.
802	leaq	(%r8,%rcx,1),%rbx
803	movq	%rcx,%r9
804	movq	%rcx,%rdx
805.byte	102,72,15,126,207
806	sarq	$3+2,%rcx
807	jmp	L$sqr8x_sub
808
809.p2align	5
810L$sqr8x_nox:
811	call	_bn_sqr8x_internal
812
813
814
815
# Same preparation for this helper, which leaves its values in other
# registers: %rbx = %rdi + %r9, %rcx = %rdx = %r9, then %rdi is restored
# from %xmm1 and %rcx is shifted right by 5 (arithmetic).
816	leaq	(%rdi,%r9,1),%rbx
817	movq	%r9,%rcx
818	movq	%r9,%rdx
819.byte	102,72,15,126,207
820	sarq	$3+2,%rcx
821	jmp	L$sqr8x_sub
822
823.p2align	5
# Final subtraction, four words per iteration: destination = words at (%rbx)
# minus words at (%rbp), with borrow. %rcx counts up to zero. movq, leaq and
# incq leave CF alone, so the borrow carries between iterations; the borrow
# into the very first sbbq is the CF left by the sarq above (the last bit
# shifted out).
824L$sqr8x_sub:
825	movq	0(%rbx),%r12
826	movq	8(%rbx),%r13
827	movq	16(%rbx),%r14
828	movq	24(%rbx),%r15
829	leaq	32(%rbx),%rbx
830	sbbq	0(%rbp),%r12
831	sbbq	8(%rbp),%r13
832	sbbq	16(%rbp),%r14
833	sbbq	24(%rbp),%r15
834	leaq	32(%rbp),%rbp
835	movq	%r12,0(%rdi)
836	movq	%r13,8(%rdi)
837	movq	%r14,16(%rdi)
838	movq	%r15,24(%rdi)
839	leaq	32(%rdi),%rdi
840	incq	%rcx
841	jnz	L$sqr8x_sub
842
# Fold the final borrow into %rax (its prior value comes from the helper --
# presumably the top carry word, TODO confirm) and step %rbx and %rdi by
# %r9 to return to the start of both arrays.
843	sbbq	$0,%rax
844	leaq	(%rbx,%r9,1),%rbx
845	leaq	(%rdi,%r9,1),%rdi
846
# Build the select mask: the .byte sequence encodes movq %rax,%xmm1 and
# pshufd broadcasts its low dword. %rsi receives the entry stack pointer
# saved at frame offset 40.
847.byte	102,72,15,110,200
848	pxor	%xmm0,%xmm0
849	pshufd	$0,%xmm1,%xmm1
850	movq	40(%rsp),%rsi
851
852	jmp	L$sqr8x_cond_copy
853
854.p2align	5
# Branch-free select, 32 bytes per iteration:
#   destination = (source at %rbx AND %xmm1) OR (destination AND mask2)
# where mask2 is produced by pcmpeqd of the zero register with %xmm1 (all
# ones in each dword lane where %xmm1 is zero). The 32 bytes just read from
# (%rbx), and the 32 bytes at the same position offset by %rdx, are
# overwritten with zeros. %r9 counts up by 32 until it reaches zero.
855L$sqr8x_cond_copy:
856	movdqa	0(%rbx),%xmm2
857	movdqa	16(%rbx),%xmm3
858	leaq	32(%rbx),%rbx
859	movdqu	0(%rdi),%xmm4
860	movdqu	16(%rdi),%xmm5
861	leaq	32(%rdi),%rdi
862	movdqa	%xmm0,-32(%rbx)
863	movdqa	%xmm0,-16(%rbx)
864	movdqa	%xmm0,-32(%rbx,%rdx,1)
865	movdqa	%xmm0,-16(%rbx,%rdx,1)
866	pcmpeqd	%xmm1,%xmm0
867	pand	%xmm1,%xmm2
868	pand	%xmm1,%xmm3
869	pand	%xmm0,%xmm4
870	pand	%xmm0,%xmm5
871	pxor	%xmm0,%xmm0
872	por	%xmm2,%xmm4
873	por	%xmm3,%xmm5
874	movdqu	%xmm4,-32(%rdi)
875	movdqu	%xmm5,-16(%rdi)
876	addq	$32,%r9
877	jnz	L$sqr8x_cond_copy
878
# Set the return value to 1, restore the callee-saved registers from below
# the entry stack pointer in %rsi and switch back to the entry stack.
879	movq	$1,%rax
880	movq	-48(%rsi),%r15
881
882	movq	-40(%rsi),%r14
883
884	movq	-32(%rsi),%r13
885
886	movq	-24(%rsi),%r12
887
888	movq	-16(%rsi),%rbp
889
890	movq	-8(%rsi),%rbx
891
892	leaq	(%rsi),%rsp
893
894L$sqr8x_epilogue:
# Bytes f3 c3 encode a REP-prefixed RET.
895	.byte	0xf3,0xc3
896
897
898
# bn_mulx4x_mont -- four-words-per-iteration multiply built on mulx, with
# adcx (carry chain in CF) and adox (carry chain in OF) running side by
# side. Reached through L$mulx4x_enter from L$mul4x_enter when the
# capability test there succeeds; %rax then already holds the entry stack
# pointer.
#
# Registers on entry as used below:
#   %rdi  destination word array
#   %rsi  first source word array
#   %rdx  second source word array; one word is consumed per outer pass
#   %rcx  word array multiplied by the per-pass factor and subtracted at the
#         end (presumably the modulus -- TODO confirm)
#   %r8   pointer to a single 64-bit word, loaded once
#   %r9d  word count
# Result: %rax = 1.
#
# Frame layout set up below (byte offsets from %rsp):
#   at  0  byte length of the operands (count * 8)
#   at  8  pointer to the next word of the second source
#   at 16  end of the second source (start + byte length)
#   at 24  the word loaded from (%r8)
#   at 32  destination pointer
#   at 40  entry stack pointer
#   at 48  inner loop iteration count (byte length / 32 - 1)
#   at 64  start of the scratch result array
899.p2align	5
900bn_mulx4x_mont:
901
902	movq	%rsp,%rax
903
904L$mulx4x_enter:
# Save the callee-saved registers this routine uses.
905	pushq	%rbx
906
907	pushq	%rbp
908
909	pushq	%r12
910
911	pushq	%r13
912
913	pushq	%r14
914
915	pushq	%r15
916
917L$mulx4x_prologue:
918
# Convert the count to a byte length in %r9, load the word behind %r8, and
# compute the frame base %rbp = %rsp - 72 - byte length, rounded down to a
# multiple of 128.
919	shll	$3,%r9d
920	xorq	%r10,%r10
921	subq	%r9,%r10
922	movq	(%r8),%r8
923	leaq	-72(%rsp,%r10,1),%rbp
924	andq	$-128,%rbp
# Move %rsp down to %rbp in steps of at most 4096 bytes, reading one word at
# each step so that every page of the new frame is touched in descending
# order.
925	movq	%rsp,%r11
926	subq	%rbp,%r11
927	andq	$-4096,%r11
928	leaq	(%r11,%rbp,1),%rsp
929	movq	(%rsp),%r10
930	cmpq	%rbp,%rsp
931	ja	L$mulx4x_page_walk
932	jmp	L$mulx4x_page_walk_done
933
934.p2align	4
935L$mulx4x_page_walk:
936	leaq	-4096(%rsp),%rsp
937	movq	(%rsp),%r10
938	cmpq	%rbp,%rsp
939	ja	L$mulx4x_page_walk
940L$mulx4x_page_walk_done:
941
# %r10 = end of the second source array.
942	leaq	(%rdx,%r9,1),%r10
943
944
945
946
947
948
949
950
951
952
953
954
# Fill the frame slots listed in the header comment. %r9 is turned from the
# byte length into the inner loop count (byte length / 32 - 1) on the way.
955	movq	%r9,0(%rsp)
956	shrq	$5,%r9
957	movq	%r10,16(%rsp)
958	subq	$1,%r9
959	movq	%r8,24(%rsp)
960	movq	%rdi,32(%rsp)
961	movq	%rax,40(%rsp)
962
963	movq	%r9,48(%rsp)
964	jmp	L$mulx4x_body
965
966.p2align	5
# First pass, first four words. %rdx is the implicit multiplicand of mulx:
# it alternates between the current second-source word (kept in %r9) and the
# per-pass factor (kept in %r8). %rbx points 32 bytes into the scratch
# array, so stores at -32(%rbx) hit its first word.
967L$mulx4x_body:
968	leaq	8(%rdx),%rdi
969	movq	(%rdx),%rdx
970	leaq	64+32(%rsp),%rbx
971	movq	%rdx,%r9
972
973	mulxq	0(%rsi),%r8,%rax
974	mulxq	8(%rsi),%r11,%r14
975	addq	%rax,%r11
976	movq	%rdi,8(%rsp)
977	mulxq	16(%rsi),%r12,%r13
978	adcq	%r14,%r12
979	adcq	$0,%r13
980
# Per-pass factor: %r8 = low 64 bits of (lowest product word * the word
# saved at frame offset 24). The xorq zeroes %rbp and clears both CF and OF
# ahead of the adcx/adox chains.
981	movq	%r8,%rdi
982	imulq	24(%rsp),%r8
983	xorq	%rbp,%rbp
984
985	mulxq	24(%rsi),%rax,%r14
986	movq	%r8,%rdx
987	leaq	32(%rsi),%rsi
988	adcxq	%rax,%r13
989	adcxq	%rbp,%r14
990
991	mulxq	0(%rcx),%rax,%r10
992	adcxq	%rax,%rdi
993	adoxq	%r11,%r10
994	mulxq	8(%rcx),%rax,%r11
995	adcxq	%rax,%r10
996	adoxq	%r12,%r11
# The .byte sequence below is the encoded form of mulxq 16(%rcx),%rax,%r12
# (compare the same step written out in L$mulx4x_outer).
997.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
998	movq	48(%rsp),%rdi
999	movq	%r10,-32(%rbx)
1000	adcxq	%rax,%r11
1001	adoxq	%r13,%r12
1002	mulxq	24(%rcx),%rax,%r15
1003	movq	%r9,%rdx
1004	movq	%r11,-24(%rbx)
1005	adcxq	%rax,%r12
1006	adoxq	%rbp,%r15
1007	leaq	32(%rcx),%rcx
1008	movq	%r12,-16(%rbx)
1009
1010	jmp	L$mulx4x_1st
1011
1012.p2align	5
# First pass, remaining words: four per iteration, %rdi iterations (loaded
# from frame offset 48). Source, %rcx array and scratch pointers all advance
# by 32 bytes per iteration.
1013L$mulx4x_1st:
1014	adcxq	%rbp,%r15
1015	mulxq	0(%rsi),%r10,%rax
1016	adcxq	%r14,%r10
1017	mulxq	8(%rsi),%r11,%r14
1018	adcxq	%rax,%r11
1019	mulxq	16(%rsi),%r12,%rax
1020	adcxq	%r14,%r12
1021	mulxq	24(%rsi),%r13,%r14
# Two 0x67 prefix bytes are placed in front of the next instruction;
# presumably padding for code alignment -- TODO confirm.
1022.byte	0x67,0x67
1023	movq	%r8,%rdx
1024	adcxq	%rax,%r13
1025	adcxq	%rbp,%r14
1026	leaq	32(%rsi),%rsi
1027	leaq	32(%rbx),%rbx
1028
1029	adoxq	%r15,%r10
1030	mulxq	0(%rcx),%rax,%r15
1031	adcxq	%rax,%r10
1032	adoxq	%r15,%r11
1033	mulxq	8(%rcx),%rax,%r15
1034	adcxq	%rax,%r11
1035	adoxq	%r15,%r12
1036	mulxq	16(%rcx),%rax,%r15
1037	movq	%r10,-40(%rbx)
1038	adcxq	%rax,%r12
1039	movq	%r11,-32(%rbx)
1040	adoxq	%r15,%r13
1041	mulxq	24(%rcx),%rax,%r15
1042	movq	%r9,%rdx
1043	movq	%r12,-24(%rbx)
1044	adcxq	%rax,%r13
1045	adoxq	%rbp,%r15
1046	leaq	32(%rcx),%rcx
1047	movq	%r13,-16(%rbx)
1048
1049	decq	%rdi
1050	jnz	L$mulx4x_1st
1051
# End of the first pass: %rax = byte length, %rdi = pointer to the next
# second-source word. The remaining carries are folded into the top word,
# and %r15 becomes 0 or -1 according to the carry out (sbbq of a register
# with itself).
1052	movq	0(%rsp),%rax
1053	movq	8(%rsp),%rdi
1054	adcq	%rbp,%r15
1055	addq	%r15,%r14
1056	sbbq	%r15,%r15
1057	movq	%r14,-8(%rbx)
1058	jmp	L$mulx4x_outer
1059
1060.p2align	5
# Outer loop: one pass per remaining word of the second source. The source
# and %rcx array pointers are rewound by the byte length, the carry word in
# %r15 is stored just above the scratch words written so far, and %rbx is
# reset to 32 bytes into the scratch array.
1061L$mulx4x_outer:
1062	movq	(%rdi),%rdx
1063	leaq	8(%rdi),%rdi
1064	subq	%rax,%rsi
1065	movq	%r15,(%rbx)
1066	leaq	64+32(%rsp),%rbx
1067	subq	%rax,%rcx
1068
# First four words of the pass: products of the new second-source word are
# accumulated with adcx while the old scratch words are added with adox.
# The xorl zeroes %rbp and clears CF and OF.
1069	mulxq	0(%rsi),%r8,%r11
1070	xorl	%ebp,%ebp
1071	movq	%rdx,%r9
1072	mulxq	8(%rsi),%r14,%r12
1073	adoxq	-32(%rbx),%r8
1074	adcxq	%r14,%r11
1075	mulxq	16(%rsi),%r15,%r13
1076	adoxq	-24(%rbx),%r11
1077	adcxq	%r15,%r12
1078	adoxq	-16(%rbx),%r12
1079	adcxq	%rbp,%r13
1080	adoxq	%rbp,%r13
1081
# Save the advanced second-source pointer and compute the per-pass factor in
# %r8 exactly as in the first pass.
1082	movq	%rdi,8(%rsp)
1083	movq	%r8,%r15
1084	imulq	24(%rsp),%r8
1085	xorl	%ebp,%ebp
1086
1087	mulxq	24(%rsi),%rax,%r14
1088	movq	%r8,%rdx
1089	adcxq	%rax,%r13
1090	adoxq	-8(%rbx),%r13
1091	adcxq	%rbp,%r14
1092	leaq	32(%rsi),%rsi
1093	adoxq	%rbp,%r14
1094
1095	mulxq	0(%rcx),%rax,%r10
1096	adcxq	%rax,%r15
1097	adoxq	%r11,%r10
1098	mulxq	8(%rcx),%rax,%r11
1099	adcxq	%rax,%r10
1100	adoxq	%r12,%r11
1101	mulxq	16(%rcx),%rax,%r12
1102	movq	%r10,-32(%rbx)
1103	adcxq	%rax,%r11
1104	adoxq	%r13,%r12
1105	mulxq	24(%rcx),%rax,%r15
1106	movq	%r9,%rdx
1107	movq	%r11,-24(%rbx)
1108	leaq	32(%rcx),%rcx
1109	adcxq	%rax,%r12
1110	adoxq	%rbp,%r15
1111	movq	48(%rsp),%rdi
1112	movq	%r12,-16(%rbx)
1113
1114	jmp	L$mulx4x_inner
1115
1116.p2align	5
# Inner loop of a later pass: four words per iteration, %rdi iterations.
# Same as L$mulx4x_1st with the old scratch words at 0..24(%rbx) added in.
1117L$mulx4x_inner:
1118	mulxq	0(%rsi),%r10,%rax
1119	adcxq	%rbp,%r15
1120	adoxq	%r14,%r10
1121	mulxq	8(%rsi),%r11,%r14
1122	adcxq	0(%rbx),%r10
1123	adoxq	%rax,%r11
1124	mulxq	16(%rsi),%r12,%rax
1125	adcxq	8(%rbx),%r11
1126	adoxq	%r14,%r12
1127	mulxq	24(%rsi),%r13,%r14
1128	movq	%r8,%rdx
1129	adcxq	16(%rbx),%r12
1130	adoxq	%rax,%r13
1131	adcxq	24(%rbx),%r13
1132	adoxq	%rbp,%r14
1133	leaq	32(%rsi),%rsi
1134	leaq	32(%rbx),%rbx
1135	adcxq	%rbp,%r14
1136
1137	adoxq	%r15,%r10
1138	mulxq	0(%rcx),%rax,%r15
1139	adcxq	%rax,%r10
1140	adoxq	%r15,%r11
1141	mulxq	8(%rcx),%rax,%r15
1142	adcxq	%rax,%r11
1143	adoxq	%r15,%r12
1144	mulxq	16(%rcx),%rax,%r15
1145	movq	%r10,-40(%rbx)
1146	adcxq	%rax,%r12
1147	adoxq	%r15,%r13
1148	mulxq	24(%rcx),%rax,%r15
1149	movq	%r9,%rdx
1150	movq	%r11,-32(%rbx)
1151	movq	%r12,-24(%rbx)
1152	adcxq	%rax,%r13
1153	adoxq	%rbp,%r15
1154	leaq	32(%rcx),%rcx
1155	movq	%r13,-16(%rbx)
1156
1157	decq	%rdi
1158	jnz	L$mulx4x_inner
1159
# End of a pass. The subq from the zero register %rbp turns the carry word
# stored at (%rbx) by the previous pass back into CF, which the following
# adcq adds to the top word; %r15 again becomes 0 or -1 from the new carry.
1160	movq	0(%rsp),%rax
1161	movq	8(%rsp),%rdi
1162	adcq	%rbp,%r15
1163	subq	0(%rbx),%rbp
1164	adcq	%r15,%r14
1165	sbbq	%r15,%r15
1166	movq	%r14,-8(%rbx)
1167
# Repeat until the second-source pointer reaches the end saved at frame
# offset 16.
1168	cmpq	16(%rsp),%rdi
1169	jne	L$mulx4x_outer
1170
# Prepare the final subtraction: %rbx = start of the scratch array, %rcx
# rewound by the byte length, %r15 negated (0 or 1), %rdx = byte length,
# %rax = byte length / 32 iterations, %rdi = destination pointer.
1171	leaq	64(%rsp),%rbx
1172	subq	%rax,%rcx
1173	negq	%r15
1174	movq	%rax,%rdx
1175	shrq	$3+2,%rax
1176	movq	32(%rsp),%rdi
1177	jmp	L$mulx4x_sub
1178
1179.p2align	5
# Final subtraction, four words per iteration: destination = scratch minus
# the %rcx array, with borrow. movq, leaq and decq leave CF alone, so the
# borrow carries between iterations; the borrow into the very first sbbq is
# the CF left by the shrq above (the last bit shifted out).
1180L$mulx4x_sub:
1181	movq	0(%rbx),%r11
1182	movq	8(%rbx),%r12
1183	movq	16(%rbx),%r13
1184	movq	24(%rbx),%r14
1185	leaq	32(%rbx),%rbx
1186	sbbq	0(%rcx),%r11
1187	sbbq	8(%rcx),%r12
1188	sbbq	16(%rcx),%r13
1189	sbbq	24(%rcx),%r14
1190	leaq	32(%rcx),%rcx
1191	movq	%r11,0(%rdi)
1192	movq	%r12,8(%rdi)
1193	movq	%r13,16(%rdi)
1194	movq	%r14,24(%rdi)
1195	leaq	32(%rdi),%rdi
1196	decq	%rax
1197	jnz	L$mulx4x_sub
1198
# Fold the final borrow into the carry word %r15, then reset %rbx to the
# start of the scratch array and rewind %rdi by the byte length.
1199	sbbq	$0,%r15
1200	leaq	64(%rsp),%rbx
1201	subq	%rdx,%rdi
1202
# Build the select mask: the .byte sequence encodes movq %r15,%xmm1 and
# pshufd broadcasts its low dword. %rsi receives the entry stack pointer
# saved at frame offset 40.
1203.byte	102,73,15,110,207
1204	pxor	%xmm0,%xmm0
1205	pshufd	$0,%xmm1,%xmm1
1206	movq	40(%rsp),%rsi
1207
1208	jmp	L$mulx4x_cond_copy
1209
1210.p2align	5
# Branch-free select, 32 bytes per iteration:
#   destination = (scratch AND %xmm1) OR (destination AND mask2)
# where mask2 is produced by pcmpeqd of the zero register with %xmm1 (all
# ones in each dword lane where %xmm1 is zero). The scratch bytes just read
# are overwritten with zeros. %rdx counts the byte length down by 32.
1211L$mulx4x_cond_copy:
1212	movdqa	0(%rbx),%xmm2
1213	movdqa	16(%rbx),%xmm3
1214	leaq	32(%rbx),%rbx
1215	movdqu	0(%rdi),%xmm4
1216	movdqu	16(%rdi),%xmm5
1217	leaq	32(%rdi),%rdi
1218	movdqa	%xmm0,-32(%rbx)
1219	movdqa	%xmm0,-16(%rbx)
1220	pcmpeqd	%xmm1,%xmm0
1221	pand	%xmm1,%xmm2
1222	pand	%xmm1,%xmm3
1223	pand	%xmm0,%xmm4
1224	pand	%xmm0,%xmm5
1225	pxor	%xmm0,%xmm0
1226	por	%xmm2,%xmm4
1227	por	%xmm3,%xmm5
1228	movdqu	%xmm4,-32(%rdi)
1229	movdqu	%xmm5,-16(%rdi)
1230	subq	$32,%rdx
1231	jnz	L$mulx4x_cond_copy
1232
# %rdx is zero here (loop exit condition), so this clears the word that
# follows the scratch array.
1233	movq	%rdx,(%rbx)
1234
# Set the return value to 1, restore the callee-saved registers from below
# the entry stack pointer in %rsi and switch back to the entry stack.
1235	movq	$1,%rax
1236	movq	-48(%rsi),%r15
1237
1238	movq	-40(%rsi),%r14
1239
1240	movq	-32(%rsi),%r13
1241
1242	movq	-24(%rsi),%r12
1243
1244	movq	-16(%rsi),%rbp
1245
1246	movq	-8(%rsi),%rbx
1247
1248	leaq	(%rsi),%rsp
1249
1250L$mulx4x_epilogue:
# Bytes f3 c3 encode a REP-prefixed RET.
1251	.byte	0xf3,0xc3
1252
1253
# NUL-terminated ASCII identification string emitted into the text section.
# Decoded, the bytes read: Montgomery Multiplication for x86_64, CRYPTOGAMS
# by (author e-mail address in angle brackets).
1254.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1255.p2align	4
1256#endif
1257