• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# This file is generated from a similarly-named Perl script in the BoringSSL
2# source tree. Do not edit by hand.
3
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
11#include "ring_core_generated/prefix_symbols_asm.h"
12.text
13
14
15
# _bn_mul_mont: exported entry point of the Montgomery multiplication code
# (the .byte string at the end of this file spells out that name).
# Register roles as used by the code below:
#   %rdi  result words, written in L$sub and L$copy        (rp)
#   %rsi  first operand words                              (ap)
#   %rdx  second operand words, one word per outer pass    (bp)
#   %rcx  words multiplied by m and subtracted at the end  (np)
#   %r8   pointer to a single 64-bit word                  (n0)
#   %r9d  number of 64-bit words, zero-extended on entry   (num)
# Returns 1 in %rax. tp denotes the temporary word array at (%rsp).
# NOTE(review): the names rp/ap/bp/np/n0/num are assumed from the usual
# bn_mul_mont C prototype; confirm against the declaring header.
16.globl	_bn_mul_mont
17.private_extern _bn_mul_mont
18
19.p2align	4
20_bn_mul_mont:
21
# Zero-extend num and keep the entry %rsp in %rax for the epilogue.
22	movl	%r9d,%r9d
23	movq	%rsp,%rax
24
# Dispatch. num not a multiple of 4, or num < 8, takes the generic path.
# Otherwise %r11d is loaded with the 32-bit word at _OPENSSL_ia32cap_P+8
# (tested later at L$mul4x_enter) and control goes to L$sqr8x_enter when
# ap == bp and num is a multiple of 8, else to L$mul4x_enter.
25	testl	$3,%r9d
26	jnz	L$mul_enter
27	cmpl	$8,%r9d
28	jb	L$mul_enter
29	movl	_OPENSSL_ia32cap_P+8(%rip),%r11d
30	cmpq	%rsi,%rdx
31	jne	L$mul4x_enter
32	testl	$7,%r9d
33	jz	L$sqr8x_enter
34	jmp	L$mul4x_enter
35
36.p2align	4
# Generic path: save the six callee-saved registers that are used below.
37L$mul_enter:
38	pushq	%rbx
39
40	pushq	%rbp
41
42	pushq	%r12
43
44	pushq	%r13
45
46	pushq	%r14
47
48	pushq	%r15
49
50
# %r10 = (%rsp - 16 - 8*num) rounded down to a multiple of 1024: the new
# stack pointer, leaving room for num+2 words of tp.
51	negq	%r9
52	movq	%rsp,%r11
53	leaq	-16(%rsp,%r9,8),%r10
54	negq	%r9
55	andq	$-1024,%r10
56
57
58
59
60
61
62
63
64
# Move %rsp down to %r10 in steps of 4096 bytes, reading one word at each
# step (presumably so that every intervening stack page is touched in order).
65	subq	%r10,%r11
66	andq	$-4096,%r11
67	leaq	(%r10,%r11,1),%rsp
68	movq	(%rsp),%r11
69	cmpq	%r10,%rsp
70	ja	L$mul_page_walk
71	jmp	L$mul_page_walk_done
72
73.p2align	4
74L$mul_page_walk:
75	leaq	-4096(%rsp),%rsp
76	movq	(%rsp),%r11
77	cmpq	%r10,%rsp
78	ja	L$mul_page_walk
79L$mul_page_walk_done:
80
# Remember the entry %rsp in tp[num+1]; the epilogue reloads it from there.
81	movq	%rax,8(%rsp,%r9,8)
82
83L$mul_body:
# %r12 = bp, %r8 = value of n0, %rbx = bp[0], %rax = ap[0].
84	movq	%rdx,%r12
85	movq	(%r8),%r8
86	movq	(%r12),%rbx
87	movq	(%rsi),%rax
88
# %r14 = outer index i (into bp), %r15 = inner index j (into ap and np).
89	xorq	%r14,%r14
90	xorq	%r15,%r15
91
# m = low 64 bits of (low word of ap[0]*bp[0]) * n0, kept in %rbp.
92	movq	%r8,%rbp
93	mulq	%rbx
94	movq	%rax,%r10
95	movq	(%rcx),%rax
96
97	imulq	%r10,%rbp
98	movq	%rdx,%r11
99
100	mulq	%rbp
101	addq	%rax,%r10
102	movq	8(%rsi),%rax
103	adcq	$0,%rdx
104	movq	%rdx,%r13
105
106	leaq	1(%r15),%r15
107	jmp	L$1st_enter
108
109.p2align	4
# First pass (bp[0]): accumulate ap[j]*bp[0] + np[j]*m one word per
# iteration. Each sum word is stored one slot lower, at tp[j-1]; the lowest
# sum word is never stored.
110L$1st:
111	addq	%rax,%r13
112	movq	(%rsi,%r15,8),%rax
113	adcq	$0,%rdx
114	addq	%r11,%r13
115	movq	%r10,%r11
116	adcq	$0,%rdx
117	movq	%r13,-16(%rsp,%r15,8)
118	movq	%rdx,%r13
119
120L$1st_enter:
121	mulq	%rbx
122	addq	%rax,%r11
123	movq	(%rcx,%r15,8),%rax
124	adcq	$0,%rdx
125	leaq	1(%r15),%r15
126	movq	%rdx,%r10
127
128	mulq	%rbp
129	cmpq	%r9,%r15
130	jne	L$1st
131
# Tail of the first pass; %rax is reloaded with ap[0] for the next pass.
132	addq	%rax,%r13
133	movq	(%rsi),%rax
134	adcq	$0,%rdx
135	addq	%r11,%r13
136	adcq	$0,%rdx
137	movq	%r13,-16(%rsp,%r15,8)
138	movq	%rdx,%r13
139	movq	%r10,%r11
140
# Store the two top words: tp[num-1] and the carry word tp[num].
141	xorq	%rdx,%rdx
142	addq	%r11,%r13
143	adcq	$0,%rdx
144	movq	%r13,-8(%rsp,%r9,8)
145	movq	%rdx,(%rsp,%r9,8)
146
147	leaq	1(%r14),%r14
148	jmp	L$outer
149.p2align	4
# Outer loop over bp[i] for i = 1 .. num-1. Each pass recomputes
# m = low 64 bits of (tp[0] + low word of ap[0]*bp[i]) * n0.
150L$outer:
151	movq	(%r12,%r14,8),%rbx
152	xorq	%r15,%r15
153	movq	%r8,%rbp
154	movq	(%rsp),%r10
155	mulq	%rbx
156	addq	%rax,%r10
157	movq	(%rcx),%rax
158	adcq	$0,%rdx
159
160	imulq	%r10,%rbp
161	movq	%rdx,%r11
162
163	mulq	%rbp
164	addq	%rax,%r10
165	movq	8(%rsi),%rax
166	adcq	$0,%rdx
167	movq	8(%rsp),%r10
168	movq	%rdx,%r13
169
170	leaq	1(%r15),%r15
171	jmp	L$inner_enter
172
173.p2align	4
# Inner loop: add ap[j]*bp[i] + np[j]*m to the previous tp[j] and store the
# sum one slot lower, as in the first pass.
174L$inner:
175	addq	%rax,%r13
176	movq	(%rsi,%r15,8),%rax
177	adcq	$0,%rdx
178	addq	%r10,%r13
179	movq	(%rsp,%r15,8),%r10
180	adcq	$0,%rdx
181	movq	%r13,-16(%rsp,%r15,8)
182	movq	%rdx,%r13
183
184L$inner_enter:
185	mulq	%rbx
186	addq	%rax,%r11
187	movq	(%rcx,%r15,8),%rax
188	adcq	$0,%rdx
189	addq	%r11,%r10
190	movq	%rdx,%r11
191	adcq	$0,%r11
192	leaq	1(%r15),%r15
193
194	mulq	%rbp
195	cmpq	%r9,%r15
196	jne	L$inner
197
198	addq	%rax,%r13
199	movq	(%rsi),%rax
200	adcq	$0,%rdx
201	addq	%r10,%r13
202	movq	(%rsp,%r15,8),%r10
203	adcq	$0,%rdx
204	movq	%r13,-16(%rsp,%r15,8)
205	movq	%rdx,%r13
206
# Fold in the previous top word (%r10 = old tp[num]) and store the new
# tp[num-1] and carry word tp[num].
207	xorq	%rdx,%rdx
208	addq	%r11,%r13
209	adcq	$0,%rdx
210	addq	%r10,%r13
211	adcq	$0,%rdx
212	movq	%r13,-8(%rsp,%r9,8)
213	movq	%rdx,(%rsp,%r9,8)
214
215	leaq	1(%r14),%r14
216	cmpq	%r9,%r14
217	jb	L$outer
218
# Final subtraction: rp[j] = tp[j] - np[j] with a borrow chain. The xorq
# clears CF before the first sbbq; mov, lea and dec leave CF unchanged.
219	xorq	%r14,%r14
220	movq	(%rsp),%rax
221	movq	%r9,%r15
222
223.p2align	4
224L$sub:	sbbq	(%rcx,%r14,8),%rax
225	movq	%rax,(%rdi,%r14,8)
226	movq	8(%rsp,%r14,8),%rax
227	leaq	1(%r14),%r14
228	decq	%r15
229	jnz	L$sub
230
# %rax = tp[num] minus the final borrow, used as a select mask; %rbx is its
# bitwise complement (expected to be all-zeros / all-ones values).
231	sbbq	$0,%rax
232	movq	$-1,%rbx
233	xorq	%rax,%rbx
234	xorq	%r14,%r14
235	movq	%r9,%r15
236
# Branch-free select: rp[j] = (rp[j] & %rbx) | (tp[j] & %rax). Each tp[j] is
# overwritten with the value in %r9 after it has been read.
237L$copy:
238	movq	(%rdi,%r14,8),%rcx
239	movq	(%rsp,%r14,8),%rdx
240	andq	%rbx,%rcx
241	andq	%rax,%rdx
242	movq	%r9,(%rsp,%r14,8)
243	orq	%rcx,%rdx
244	movq	%rdx,(%rdi,%r14,8)
245	leaq	1(%r14),%r14
246	subq	$1,%r15
247	jnz	L$copy
248
# Epilogue: reload the entry %rsp saved in tp[num+1], set the return value
# to 1, restore the saved registers from below that address, reset %rsp.
249	movq	8(%rsp,%r9,8),%rsi
250
251	movq	$1,%rax
252	movq	-48(%rsi),%r15
253
254	movq	-40(%rsi),%r14
255
256	movq	-32(%rsi),%r13
257
258	movq	-24(%rsi),%r12
259
260	movq	-16(%rsi),%rbp
261
262	movq	-8(%rsi),%rbx
263
264	leaq	(%rsi),%rsp
265
266L$mul_epilogue:
# Bytes 0xf3,0xc3 encode a rep-prefixed ret.
267	.byte	0xf3,0xc3
268
269
270
# bn_mul4x_mont: Montgomery multiplication processing four words per loop
# iteration. Same register roles as _bn_mul_mont: %rdi = rp, %rsi = ap,
# %rdx = bp, %rcx = np, %r8 = pointer to n0, %r9 = num; returns 1 in %rax.
# The visible code reaches this routine through L$mul4x_enter (jumped to
# from _bn_mul_mont with %rax = entry %rsp and %r11d = the word at
# _OPENSSL_ia32cap_P+8); the bn_mul4x_mont label itself is not referenced
# in the visible code. tp denotes the temporary word array at (%rsp).
271.p2align	4
272bn_mul4x_mont:
273
274	movl	%r9d,%r9d
275	movq	%rsp,%rax
276
277L$mul4x_enter:
# When both bits of mask 0x80100 are set in %r11d, use the mulx path.
# NOTE(review): presumably these are the BMI2 and ADX feature bits, matching
# the mulx/adcx/adox instructions used there; confirm against the
# OPENSSL_ia32cap_P layout.
278	andl	$0x80100,%r11d
279	cmpl	$0x80100,%r11d
280	je	L$mulx4x_enter
281	pushq	%rbx
282
283	pushq	%rbp
284
285	pushq	%r12
286
287	pushq	%r13
288
289	pushq	%r14
290
291	pushq	%r15
292
293
# %r10 = (%rsp - 32 - 8*num) rounded down to a multiple of 1024: new %rsp.
294	negq	%r9
295	movq	%rsp,%r11
296	leaq	-32(%rsp,%r9,8),%r10
297	negq	%r9
298	andq	$-1024,%r10
299
# Move %rsp down to %r10 in 4096-byte steps, reading one word per step.
300	subq	%r10,%r11
301	andq	$-4096,%r11
302	leaq	(%r10,%r11,1),%rsp
303	movq	(%rsp),%r11
304	cmpq	%r10,%rsp
305	ja	L$mul4x_page_walk
306	jmp	L$mul4x_page_walk_done
307
308L$mul4x_page_walk:
309	leaq	-4096(%rsp),%rsp
310	movq	(%rsp),%r11
311	cmpq	%r10,%rsp
312	ja	L$mul4x_page_walk
313L$mul4x_page_walk_done:
314
# Save the entry %rsp in tp[num+1].
315	movq	%rax,8(%rsp,%r9,8)
316
317L$mul4x_body:
# Save rp in tp[num+2]; %rdi is reused as an accumulator until L$sub4x.
318	movq	%rdi,16(%rsp,%r9,8)
319	movq	%rdx,%r12
320	movq	(%r8),%r8
321	movq	(%r12),%rbx
322	movq	(%rsi),%rax
323
# %r14 = outer index i into bp, %r15 = inner word index j.
324	xorq	%r14,%r14
325	xorq	%r15,%r15
326
# m = low 64 bits of (low word of ap[0]*bp[0]) * n0, kept in %rbp.
327	movq	%r8,%rbp
328	mulq	%rbx
329	movq	%rax,%r10
330	movq	(%rcx),%rax
331
332	imulq	%r10,%rbp
333	movq	%rdx,%r11
334
335	mulq	%rbp
336	addq	%rax,%r10
337	movq	8(%rsi),%rax
338	adcq	$0,%rdx
339	movq	%rdx,%rdi
340
341	mulq	%rbx
342	addq	%rax,%r11
343	movq	8(%rcx),%rax
344	adcq	$0,%rdx
345	movq	%rdx,%r10
346
347	mulq	%rbp
348	addq	%rax,%rdi
349	movq	16(%rsi),%rax
350	adcq	$0,%rdx
351	addq	%r11,%rdi
352	leaq	4(%r15),%r15
353	adcq	$0,%rdx
354	movq	%rdi,(%rsp)
355	movq	%rdx,%r13
356	jmp	L$1st4x
357.p2align	4
# First pass (bp[0]): four ap/np words per iteration, alternating the
# accumulator pairs %r10/%r11 (ap products) and %r13/%rdi (np products).
358L$1st4x:
359	mulq	%rbx
360	addq	%rax,%r10
361	movq	-16(%rcx,%r15,8),%rax
362	adcq	$0,%rdx
363	movq	%rdx,%r11
364
365	mulq	%rbp
366	addq	%rax,%r13
367	movq	-8(%rsi,%r15,8),%rax
368	adcq	$0,%rdx
369	addq	%r10,%r13
370	adcq	$0,%rdx
371	movq	%r13,-24(%rsp,%r15,8)
372	movq	%rdx,%rdi
373
374	mulq	%rbx
375	addq	%rax,%r11
376	movq	-8(%rcx,%r15,8),%rax
377	adcq	$0,%rdx
378	movq	%rdx,%r10
379
380	mulq	%rbp
381	addq	%rax,%rdi
382	movq	(%rsi,%r15,8),%rax
383	adcq	$0,%rdx
384	addq	%r11,%rdi
385	adcq	$0,%rdx
386	movq	%rdi,-16(%rsp,%r15,8)
387	movq	%rdx,%r13
388
389	mulq	%rbx
390	addq	%rax,%r10
391	movq	(%rcx,%r15,8),%rax
392	adcq	$0,%rdx
393	movq	%rdx,%r11
394
395	mulq	%rbp
396	addq	%rax,%r13
397	movq	8(%rsi,%r15,8),%rax
398	adcq	$0,%rdx
399	addq	%r10,%r13
400	adcq	$0,%rdx
401	movq	%r13,-8(%rsp,%r15,8)
402	movq	%rdx,%rdi
403
404	mulq	%rbx
405	addq	%rax,%r11
406	movq	8(%rcx,%r15,8),%rax
407	adcq	$0,%rdx
408	leaq	4(%r15),%r15
409	movq	%rdx,%r10
410
411	mulq	%rbp
412	addq	%rax,%rdi
413	movq	-16(%rsi,%r15,8),%rax
414	adcq	$0,%rdx
415	addq	%r11,%rdi
416	adcq	$0,%rdx
417	movq	%rdi,-32(%rsp,%r15,8)
418	movq	%rdx,%r13
419	cmpq	%r9,%r15
420	jb	L$1st4x
421
# Tail of the first pass: the last two words, then %rax = ap[0] again.
422	mulq	%rbx
423	addq	%rax,%r10
424	movq	-16(%rcx,%r15,8),%rax
425	adcq	$0,%rdx
426	movq	%rdx,%r11
427
428	mulq	%rbp
429	addq	%rax,%r13
430	movq	-8(%rsi,%r15,8),%rax
431	adcq	$0,%rdx
432	addq	%r10,%r13
433	adcq	$0,%rdx
434	movq	%r13,-24(%rsp,%r15,8)
435	movq	%rdx,%rdi
436
437	mulq	%rbx
438	addq	%rax,%r11
439	movq	-8(%rcx,%r15,8),%rax
440	adcq	$0,%rdx
441	movq	%rdx,%r10
442
443	mulq	%rbp
444	addq	%rax,%rdi
445	movq	(%rsi),%rax
446	adcq	$0,%rdx
447	addq	%r11,%rdi
448	adcq	$0,%rdx
449	movq	%rdi,-16(%rsp,%r15,8)
450	movq	%rdx,%r13
451
# Store the two top words of tp (here %r15 == num).
452	xorq	%rdi,%rdi
453	addq	%r10,%r13
454	adcq	$0,%rdi
455	movq	%r13,-8(%rsp,%r15,8)
456	movq	%rdi,(%rsp,%r15,8)
457
458	leaq	1(%r14),%r14
459.p2align	2
# Outer loop over bp[i], i = 1 .. num-1. Each pass recomputes
# m = low 64 bits of (tp[0] + low word of ap[0]*bp[i]) * n0 in %rbp.
460L$outer4x:
461	movq	(%r12,%r14,8),%rbx
462	xorq	%r15,%r15
463	movq	(%rsp),%r10
464	movq	%r8,%rbp
465	mulq	%rbx
466	addq	%rax,%r10
467	movq	(%rcx),%rax
468	adcq	$0,%rdx
469
470	imulq	%r10,%rbp
471	movq	%rdx,%r11
472
473	mulq	%rbp
474	addq	%rax,%r10
475	movq	8(%rsi),%rax
476	adcq	$0,%rdx
477	movq	%rdx,%rdi
478
479	mulq	%rbx
480	addq	%rax,%r11
481	movq	8(%rcx),%rax
482	adcq	$0,%rdx
483	addq	8(%rsp),%r11
484	adcq	$0,%rdx
485	movq	%rdx,%r10
486
487	mulq	%rbp
488	addq	%rax,%rdi
489	movq	16(%rsi),%rax
490	adcq	$0,%rdx
491	addq	%r11,%rdi
492	leaq	4(%r15),%r15
493	adcq	$0,%rdx
494	movq	%rdi,(%rsp)
495	movq	%rdx,%r13
496	jmp	L$inner4x
497.p2align	4
# Inner loop: same shape as L$1st4x, additionally adding the previous tp
# words into the ap-product accumulators.
498L$inner4x:
499	mulq	%rbx
500	addq	%rax,%r10
501	movq	-16(%rcx,%r15,8),%rax
502	adcq	$0,%rdx
503	addq	-16(%rsp,%r15,8),%r10
504	adcq	$0,%rdx
505	movq	%rdx,%r11
506
507	mulq	%rbp
508	addq	%rax,%r13
509	movq	-8(%rsi,%r15,8),%rax
510	adcq	$0,%rdx
511	addq	%r10,%r13
512	adcq	$0,%rdx
513	movq	%r13,-24(%rsp,%r15,8)
514	movq	%rdx,%rdi
515
516	mulq	%rbx
517	addq	%rax,%r11
518	movq	-8(%rcx,%r15,8),%rax
519	adcq	$0,%rdx
520	addq	-8(%rsp,%r15,8),%r11
521	adcq	$0,%rdx
522	movq	%rdx,%r10
523
524	mulq	%rbp
525	addq	%rax,%rdi
526	movq	(%rsi,%r15,8),%rax
527	adcq	$0,%rdx
528	addq	%r11,%rdi
529	adcq	$0,%rdx
530	movq	%rdi,-16(%rsp,%r15,8)
531	movq	%rdx,%r13
532
533	mulq	%rbx
534	addq	%rax,%r10
535	movq	(%rcx,%r15,8),%rax
536	adcq	$0,%rdx
537	addq	(%rsp,%r15,8),%r10
538	adcq	$0,%rdx
539	movq	%rdx,%r11
540
541	mulq	%rbp
542	addq	%rax,%r13
543	movq	8(%rsi,%r15,8),%rax
544	adcq	$0,%rdx
545	addq	%r10,%r13
546	adcq	$0,%rdx
547	movq	%r13,-8(%rsp,%r15,8)
548	movq	%rdx,%rdi
549
550	mulq	%rbx
551	addq	%rax,%r11
552	movq	8(%rcx,%r15,8),%rax
553	adcq	$0,%rdx
554	addq	8(%rsp,%r15,8),%r11
555	adcq	$0,%rdx
556	leaq	4(%r15),%r15
557	movq	%rdx,%r10
558
559	mulq	%rbp
560	addq	%rax,%rdi
561	movq	-16(%rsi,%r15,8),%rax
562	adcq	$0,%rdx
563	addq	%r11,%rdi
564	adcq	$0,%rdx
565	movq	%rdi,-32(%rsp,%r15,8)
566	movq	%rdx,%r13
567	cmpq	%r9,%r15
568	jb	L$inner4x
569
# Tail of the pass: last two words; the outer index %r14 is advanced here.
570	mulq	%rbx
571	addq	%rax,%r10
572	movq	-16(%rcx,%r15,8),%rax
573	adcq	$0,%rdx
574	addq	-16(%rsp,%r15,8),%r10
575	adcq	$0,%rdx
576	movq	%rdx,%r11
577
578	mulq	%rbp
579	addq	%rax,%r13
580	movq	-8(%rsi,%r15,8),%rax
581	adcq	$0,%rdx
582	addq	%r10,%r13
583	adcq	$0,%rdx
584	movq	%r13,-24(%rsp,%r15,8)
585	movq	%rdx,%rdi
586
587	mulq	%rbx
588	addq	%rax,%r11
589	movq	-8(%rcx,%r15,8),%rax
590	adcq	$0,%rdx
591	addq	-8(%rsp,%r15,8),%r11
592	adcq	$0,%rdx
593	leaq	1(%r14),%r14
594	movq	%rdx,%r10
595
596	mulq	%rbp
597	addq	%rax,%rdi
598	movq	(%rsi),%rax
599	adcq	$0,%rdx
600	addq	%r11,%rdi
601	adcq	$0,%rdx
602	movq	%rdi,-16(%rsp,%r15,8)
603	movq	%rdx,%r13
604
# Fold in the previous top word tp[num] and store the new two top words.
605	xorq	%rdi,%rdi
606	addq	%r10,%r13
607	adcq	$0,%rdi
608	addq	(%rsp,%r9,8),%r13
609	adcq	$0,%rdi
610	movq	%r13,-8(%rsp,%r15,8)
611	movq	%rdi,(%rsp,%r15,8)
612
613	cmpq	%r9,%r14
614	jb	L$outer4x
# Final subtraction rp = tp - np. rp is reloaded from tp[num+2]; the loop
# runs (num-4)/4 times, four words per iteration, and the last four words
# are finished after the loop. The subq starts the borrow chain.
615	movq	16(%rsp,%r9,8),%rdi
616	leaq	-4(%r9),%r15
617	movq	0(%rsp),%rax
618	movq	8(%rsp),%rdx
619	shrq	$2,%r15
620	leaq	(%rsp),%rsi
621	xorq	%r14,%r14
622
623	subq	0(%rcx),%rax
624	movq	16(%rsi),%rbx
625	movq	24(%rsi),%rbp
626	sbbq	8(%rcx),%rdx
627
628L$sub4x:
629	movq	%rax,0(%rdi,%r14,8)
630	movq	%rdx,8(%rdi,%r14,8)
631	sbbq	16(%rcx,%r14,8),%rbx
632	movq	32(%rsi,%r14,8),%rax
633	movq	40(%rsi,%r14,8),%rdx
634	sbbq	24(%rcx,%r14,8),%rbp
635	movq	%rbx,16(%rdi,%r14,8)
636	movq	%rbp,24(%rdi,%r14,8)
637	sbbq	32(%rcx,%r14,8),%rax
638	movq	48(%rsi,%r14,8),%rbx
639	movq	56(%rsi,%r14,8),%rbp
640	sbbq	40(%rcx,%r14,8),%rdx
641	leaq	4(%r14),%r14
642	decq	%r15
643	jnz	L$sub4x
644
645	movq	%rax,0(%rdi,%r14,8)
646	movq	32(%rsi,%r14,8),%rax
647	sbbq	16(%rcx,%r14,8),%rbx
648	movq	%rdx,8(%rdi,%r14,8)
649	sbbq	24(%rcx,%r14,8),%rbp
650	movq	%rbx,16(%rdi,%r14,8)
651
# %rax = tp[num] minus the final borrow. The .byte sequence below encodes
# movq %rax,%xmm4; the mask is then broadcast (pshufd $0) into %xmm4 and
# %xmm5 is set to its bitwise complement.
652	sbbq	$0,%rax
653	movq	%rbp,24(%rdi,%r14,8)
654	pxor	%xmm0,%xmm0
655.byte	102,72,15,110,224
656	pcmpeqd	%xmm5,%xmm5
657	pshufd	$0,%xmm4,%xmm4
658	movq	%r9,%r15
659	pxor	%xmm4,%xmm5
660	shrq	$2,%r15
661	xorl	%eax,%eax
662
663	jmp	L$copy4x
664.p2align	4
# Branch-free select, 32 bytes per iteration for num/4 iterations:
# rp = (tp & %xmm4) | (rp & %xmm5). tp is zeroed (from %xmm0) as it is read.
# %rax is the byte offset.
665L$copy4x:
666	movdqa	(%rsp,%rax,1),%xmm1
667	movdqu	(%rdi,%rax,1),%xmm2
668	pand	%xmm4,%xmm1
669	pand	%xmm5,%xmm2
670	movdqa	16(%rsp,%rax,1),%xmm3
671	movdqa	%xmm0,(%rsp,%rax,1)
672	por	%xmm2,%xmm1
673	movdqu	16(%rdi,%rax,1),%xmm2
674	movdqu	%xmm1,(%rdi,%rax,1)
675	pand	%xmm4,%xmm3
676	pand	%xmm5,%xmm2
677	movdqa	%xmm0,16(%rsp,%rax,1)
678	por	%xmm2,%xmm3
679	movdqu	%xmm3,16(%rdi,%rax,1)
680	leaq	32(%rax),%rax
681	decq	%r15
682	jnz	L$copy4x
# Epilogue: reload the entry %rsp from tp[num+1], return 1, restore the
# saved registers and %rsp.
683	movq	8(%rsp,%r9,8),%rsi
684
685	movq	$1,%rax
686	movq	-48(%rsi),%r15
687
688	movq	-40(%rsi),%r14
689
690	movq	-32(%rsi),%r13
691
692	movq	-24(%rsi),%r12
693
694	movq	-16(%rsi),%rbp
695
696	movq	-8(%rsi),%rbx
697
698	leaq	(%rsi),%rsp
699
700L$mul4x_epilogue:
# Bytes 0xf3,0xc3 encode a rep-prefixed ret.
701	.byte	0xf3,0xc3
702
703
704
705
706
707
# bn_sqr8x_mont: squaring path. The visible code reaches it through
# L$sqr8x_enter, jumped to from _bn_mul_mont when ap == bp and num is a
# multiple of 8, with %rax = entry %rsp. Register roles on entry match
# _bn_mul_mont (%rdi = rp, %rsi = ap, %rcx = np, %r8 = pointer to n0,
# %r9 = num). The multiply itself is done by _bn_sqrx8x_internal or
# _bn_sqr8x_internal, which are not defined in the visible code; this
# wrapper builds the stack frame, then performs the final subtraction and
# the masked copy into rp. Returns 1 in %rax.
708.p2align	5
709bn_sqr8x_mont:
710
711	movq	%rsp,%rax
712
713L$sqr8x_enter:
714	pushq	%rbx
715
716	pushq	%rbp
717
718	pushq	%r12
719
720	pushq	%r13
721
722	pushq	%r14
723
724	pushq	%r15
725
726L$sqr8x_prologue:
727
# %r10 = 32*num, %r9 = -(8*num), the negated operand size in bytes.
728	movl	%r9d,%r10d
729	shll	$3,%r9d
730	shlq	$3+2,%r10
731	negq	%r9
732
733
734
735
736
737
# Choose the frame base in %rbp: start from %rsp - 64 - 16*num and adjust
# it by the distance to %rsi taken modulo 4096.
# NOTE(review): presumably this keeps the frame from aliasing ap at
# 4096-byte granularity; confirm against the upstream perlasm comments.
738	leaq	-64(%rsp,%r9,2),%r11
739	movq	%rsp,%rbp
740	movq	(%r8),%r8
741	subq	%rsi,%r11
742	andq	$4095,%r11
743	cmpq	%r11,%r10
744	jb	L$sqr8x_sp_alt
745	subq	%r11,%rbp
746	leaq	-64(%rbp,%r9,2),%rbp
747	jmp	L$sqr8x_sp_done
748
749.p2align	5
750L$sqr8x_sp_alt:
751	leaq	4096-64(,%r9,2),%r10
752	leaq	-64(%rbp,%r9,2),%rbp
753	subq	%r10,%r11
754	movq	$0,%r10
755	cmovcq	%r10,%r11
756	subq	%r11,%rbp
757L$sqr8x_sp_done:
# Align the frame to 64 bytes, then move %rsp down to it in 4096-byte
# steps, reading one word per step.
758	andq	$-64,%rbp
759	movq	%rsp,%r11
760	subq	%rbp,%r11
761	andq	$-4096,%r11
762	leaq	(%r11,%rbp,1),%rsp
763	movq	(%rsp),%r10
764	cmpq	%rbp,%rsp
765	ja	L$sqr8x_page_walk
766	jmp	L$sqr8x_page_walk_done
767
768.p2align	4
769L$sqr8x_page_walk:
770	leaq	-4096(%rsp),%rsp
771	movq	(%rsp),%r10
772	cmpq	%rbp,%rsp
773	ja	L$sqr8x_page_walk
774L$sqr8x_page_walk_done:
775
# %r10 = -(8*num), %r9 = 8*num.
776	movq	%r9,%r10
777	negq	%r9
778
# Frame slots: 32(%rsp) = value of n0, 40(%rsp) = entry %rsp.
779	movq	%r8,32(%rsp)
780	movq	%rax,40(%rsp)
781
782L$sqr8x_body:
783
# The three .byte sequences below encode movq %rcx,%xmm2, movq %rdi,%xmm1
# and movq %r10,%xmm3, parking np, rp and -(8*num) in xmm registers across
# the call.
784.byte	102,72,15,110,209
785	pxor	%xmm0,%xmm0
786.byte	102,72,15,110,207
787.byte	102,73,15,110,218
# Same 0x80100 test on the word at _OPENSSL_ia32cap_P+8 as L$mul4x_enter
# uses; it selects between the two internal squaring routines.
788	movl	_OPENSSL_ia32cap_P+8(%rip),%eax
789	andl	$0x80100,%eax
790	cmpl	$0x80100,%eax
791	jne	L$sqr8x_nox
792
793	call	_bn_sqrx8x_internal
794
795
796
797
# The code below consumes %r8, %rcx, %rbp and %rax as left by the callee
# (defined elsewhere). The .byte sequence encodes movq %xmm1,%rdi,
# restoring rp.
798	leaq	(%r8,%rcx,1),%rbx
799	movq	%rcx,%r9
800	movq	%rcx,%rdx
801.byte	102,72,15,126,207
802	sarq	$3+2,%rcx
803	jmp	L$sqr8x_sub
804
805.p2align	5
806L$sqr8x_nox:
807	call	_bn_sqr8x_internal
808
809
810
811
# The code below consumes %rdi, %r9, %rbp and %rax as left by the callee
# (defined elsewhere). The .byte sequence encodes movq %xmm1,%rdi,
# restoring rp.
812	leaq	(%rdi,%r9,1),%rbx
813	movq	%r9,%rcx
814	movq	%r9,%rdx
815.byte	102,72,15,126,207
816	sarq	$3+2,%rcx
817	jmp	L$sqr8x_sub
818
819.p2align	5
# Subtract the words at (%rbp) from the words at (%rbx) into rp, four words
# per iteration, with a borrow chain. %rcx counts up to zero.
# NOTE(review): the first sbbq consumes CF as left by the sarq above; this
# looks like it relies on the shifted-out bit being zero - confirm.
820L$sqr8x_sub:
821	movq	0(%rbx),%r12
822	movq	8(%rbx),%r13
823	movq	16(%rbx),%r14
824	movq	24(%rbx),%r15
825	leaq	32(%rbx),%rbx
826	sbbq	0(%rbp),%r12
827	sbbq	8(%rbp),%r13
828	sbbq	16(%rbp),%r14
829	sbbq	24(%rbp),%r15
830	leaq	32(%rbp),%rbp
831	movq	%r12,0(%rdi)
832	movq	%r13,8(%rdi)
833	movq	%r14,16(%rdi)
834	movq	%r15,24(%rdi)
835	leaq	32(%rdi),%rdi
836	incq	%rcx
837	jnz	L$sqr8x_sub
838
# Fold the final borrow into %rax, then rewind %rbx and %rdi by adding %r9.
839	sbbq	$0,%rax
840	leaq	(%rbx,%r9,1),%rbx
841	leaq	(%rdi,%r9,1),%rdi
842
# The .byte sequence encodes movq %rax,%xmm1; the mask is then broadcast
# with pshufd $0. %rsi = entry %rsp from the frame.
843.byte	102,72,15,110,200
844	pxor	%xmm0,%xmm0
845	pshufd	$0,%xmm1,%xmm1
846	movq	40(%rsp),%rsi
847
848	jmp	L$sqr8x_cond_copy
849
850.p2align	5
# Branch-free select, 32 bytes per iteration: the words read from (%rbx)
# are kept under mask %xmm1 and the words already in rp under the mask
# produced by pcmpeqd (dwords of %xmm1 equal to zero). The source words,
# and the words %rdx bytes beyond them, are zeroed. %r9 counts up to zero.
851L$sqr8x_cond_copy:
852	movdqa	0(%rbx),%xmm2
853	movdqa	16(%rbx),%xmm3
854	leaq	32(%rbx),%rbx
855	movdqu	0(%rdi),%xmm4
856	movdqu	16(%rdi),%xmm5
857	leaq	32(%rdi),%rdi
858	movdqa	%xmm0,-32(%rbx)
859	movdqa	%xmm0,-16(%rbx)
860	movdqa	%xmm0,-32(%rbx,%rdx,1)
861	movdqa	%xmm0,-16(%rbx,%rdx,1)
862	pcmpeqd	%xmm1,%xmm0
863	pand	%xmm1,%xmm2
864	pand	%xmm1,%xmm3
865	pand	%xmm0,%xmm4
866	pand	%xmm0,%xmm5
867	pxor	%xmm0,%xmm0
868	por	%xmm2,%xmm4
869	por	%xmm3,%xmm5
870	movdqu	%xmm4,-32(%rdi)
871	movdqu	%xmm5,-16(%rdi)
872	addq	$32,%r9
873	jnz	L$sqr8x_cond_copy
874
# Epilogue: return 1, restore the saved registers from below the entry
# %rsp held in %rsi, then reset %rsp.
875	movq	$1,%rax
876	movq	-48(%rsi),%r15
877
878	movq	-40(%rsi),%r14
879
880	movq	-32(%rsi),%r13
881
882	movq	-24(%rsi),%r12
883
884	movq	-16(%rsi),%rbp
885
886	movq	-8(%rsi),%rbx
887
888	leaq	(%rsi),%rsp
889
890L$sqr8x_epilogue:
# Bytes 0xf3,0xc3 encode a rep-prefixed ret.
891	.byte	0xf3,0xc3
892
893
894
# bn_mulx4x_mont: Montgomery multiplication using mulx with two independent
# carry chains (adcx uses CF, adox uses OF), four words per iteration.
# The visible code reaches it through L$mulx4x_enter, jumped to from
# L$mul4x_enter with %rax = entry %rsp. Register roles on entry match
# _bn_mul_mont (%rdi = rp, %rsi = ap, %rdx = bp, %rcx = np, %r8 = pointer
# to n0, %r9 = num). Returns 1 in %rax.
# Stack frame written below:
#    0(%rsp)  8*num (operand size in bytes)
#    8(%rsp)  address of the next bp word
#   16(%rsp)  bp + 8*num (end of bp)
#   24(%rsp)  value of n0
#   32(%rsp)  rp
#   40(%rsp)  entry %rsp
#   48(%rsp)  num/4 - 1 (inner loop count)
#   64(%rsp)  start of the temporary word array tp
895.p2align	5
896bn_mulx4x_mont:
897
898	movq	%rsp,%rax
899
900L$mulx4x_enter:
901	pushq	%rbx
902
903	pushq	%rbp
904
905	pushq	%r12
906
907	pushq	%r13
908
909	pushq	%r14
910
911	pushq	%r15
912
913L$mulx4x_prologue:
914
# %r9 = 8*num, %r10 = -(8*num), %r8 = value of n0.
# %rbp = (%rsp - 72 - 8*num) rounded down to a multiple of 128: new %rsp.
915	shll	$3,%r9d
916	xorq	%r10,%r10
917	subq	%r9,%r10
918	movq	(%r8),%r8
919	leaq	-72(%rsp,%r10,1),%rbp
920	andq	$-128,%rbp
# Move %rsp down to %rbp in 4096-byte steps, reading one word per step.
921	movq	%rsp,%r11
922	subq	%rbp,%r11
923	andq	$-4096,%r11
924	leaq	(%r11,%rbp,1),%rsp
925	movq	(%rsp),%r10
926	cmpq	%rbp,%rsp
927	ja	L$mulx4x_page_walk
928	jmp	L$mulx4x_page_walk_done
929
930.p2align	4
931L$mulx4x_page_walk:
932	leaq	-4096(%rsp),%rsp
933	movq	(%rsp),%r10
934	cmpq	%rbp,%rsp
935	ja	L$mulx4x_page_walk
936L$mulx4x_page_walk_done:
937
# %r10 = bp + 8*num, used as the end-of-bp marker for the outer loop.
938	leaq	(%rdx,%r9,1),%r10
939
940
941
942
943
944
945
946
947
948
949
950
# Fill in the frame slots listed in the header comment.
951	movq	%r9,0(%rsp)
952	shrq	$5,%r9
953	movq	%r10,16(%rsp)
954	subq	$1,%r9
955	movq	%r8,24(%rsp)
956	movq	%rdi,32(%rsp)
957	movq	%rax,40(%rsp)
958
959	movq	%r9,48(%rsp)
960	jmp	L$mulx4x_body
961
962.p2align	5
# First pass, words 0..3. %rdx = bp[0] is the implicit mulx multiplier and
# %r9 keeps a copy of it; %rdi = address of bp[1]; %rbx points into tp.
963L$mulx4x_body:
964	leaq	8(%rdx),%rdi
965	movq	(%rdx),%rdx
966	leaq	64+32(%rsp),%rbx
967	movq	%rdx,%r9
968
969	mulxq	0(%rsi),%r8,%rax
970	mulxq	8(%rsi),%r11,%r14
971	addq	%rax,%r11
972	movq	%rdi,8(%rsp)
973	mulxq	16(%rsi),%r12,%r13
974	adcq	%r14,%r12
975	adcq	$0,%r13
976
# m = low 64 bits of (low word of ap[0]*bp[0]) * n0, kept in %r8. The xorq
# zeroes %rbp and clears both CF and OF for the adcx/adox chains.
977	movq	%r8,%rdi
978	imulq	24(%rsp),%r8
979	xorq	%rbp,%rbp
980
981	mulxq	24(%rsi),%rax,%r14
982	movq	%r8,%rdx
983	leaq	32(%rsi),%rsi
984	adcxq	%rax,%r13
985	adcxq	%rbp,%r14
986
# Multiply np[0..3] by m and accumulate into the same words.
987	mulxq	0(%rcx),%rax,%r10
988	adcxq	%rax,%rdi
989	adoxq	%r11,%r10
990	mulxq	8(%rcx),%rax,%r11
991	adcxq	%rax,%r10
992	adoxq	%r12,%r11
# The .byte sequence below encodes mulxq 16(%rcx),%rax,%r12.
993.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
994	movq	48(%rsp),%rdi
995	movq	%r10,-32(%rbx)
996	adcxq	%rax,%r11
997	adoxq	%r13,%r12
998	mulxq	24(%rcx),%rax,%r15
999	movq	%r9,%rdx
1000	movq	%r11,-24(%rbx)
1001	adcxq	%rax,%r12
1002	adoxq	%rbp,%r15
1003	leaq	32(%rcx),%rcx
1004	movq	%r12,-16(%rbx)
1005
1006	jmp	L$mulx4x_1st
1007
1008.p2align	5
# Remaining words of the first pass, four per iteration; %rdi counts down
# from num/4 - 1. %rdx alternates between bp[0] (from %r9) and m (from %r8).
1009L$mulx4x_1st:
1010	adcxq	%rbp,%r15
1011	mulxq	0(%rsi),%r10,%rax
1012	adcxq	%r14,%r10
1013	mulxq	8(%rsi),%r11,%r14
1014	adcxq	%rax,%r11
1015	mulxq	16(%rsi),%r12,%rax
1016	adcxq	%r14,%r12
1017	mulxq	24(%rsi),%r13,%r14
# Two 0x67 prefix bytes are emitted in front of the next instruction
# (presumably padding for code alignment).
1018.byte	0x67,0x67
1019	movq	%r8,%rdx
1020	adcxq	%rax,%r13
1021	adcxq	%rbp,%r14
1022	leaq	32(%rsi),%rsi
1023	leaq	32(%rbx),%rbx
1024
1025	adoxq	%r15,%r10
1026	mulxq	0(%rcx),%rax,%r15
1027	adcxq	%rax,%r10
1028	adoxq	%r15,%r11
1029	mulxq	8(%rcx),%rax,%r15
1030	adcxq	%rax,%r11
1031	adoxq	%r15,%r12
1032	mulxq	16(%rcx),%rax,%r15
1033	movq	%r10,-40(%rbx)
1034	adcxq	%rax,%r12
1035	movq	%r11,-32(%rbx)
1036	adoxq	%r15,%r13
1037	mulxq	24(%rcx),%rax,%r15
1038	movq	%r9,%rdx
1039	movq	%r12,-24(%rbx)
1040	adcxq	%rax,%r13
1041	adoxq	%rbp,%r15
1042	leaq	32(%rcx),%rcx
1043	movq	%r13,-16(%rbx)
1044
1045	decq	%rdi
1046	jnz	L$mulx4x_1st
1047
# End of the first pass: %rax = 8*num, %rdi = address of the next bp word.
# The top word is stored and %r15 is left as 0 or -1 from the last carry.
1048	movq	0(%rsp),%rax
1049	movq	8(%rsp),%rdi
1050	adcq	%rbp,%r15
1051	addq	%r15,%r14
1052	sbbq	%r15,%r15
1053	movq	%r14,-8(%rbx)
1054	jmp	L$mulx4x_outer
1055
1056.p2align	5
# Outer loop, one pass per remaining bp word. %rsi and %rcx are rewound by
# 8*num, the carry word from the previous pass is stored at (%rbx), and
# %rbx is reset to the start of tp. xorl %ebp,%ebp clears CF and OF.
1057L$mulx4x_outer:
1058	movq	(%rdi),%rdx
1059	leaq	8(%rdi),%rdi
1060	subq	%rax,%rsi
1061	movq	%r15,(%rbx)
1062	leaq	64+32(%rsp),%rbx
1063	subq	%rax,%rcx
1064
1065	mulxq	0(%rsi),%r8,%r11
1066	xorl	%ebp,%ebp
1067	movq	%rdx,%r9
1068	mulxq	8(%rsi),%r14,%r12
1069	adoxq	-32(%rbx),%r8
1070	adcxq	%r14,%r11
1071	mulxq	16(%rsi),%r15,%r13
1072	adoxq	-24(%rbx),%r11
1073	adcxq	%r15,%r12
1074	adoxq	-16(%rbx),%r12
1075	adcxq	%rbp,%r13
1076	adoxq	%rbp,%r13
1077
# Save the advanced bp pointer; m = low 64 bits of %r8 * n0 for this pass.
1078	movq	%rdi,8(%rsp)
1079	movq	%r8,%r15
1080	imulq	24(%rsp),%r8
1081	xorl	%ebp,%ebp
1082
1083	mulxq	24(%rsi),%rax,%r14
1084	movq	%r8,%rdx
1085	adcxq	%rax,%r13
1086	adoxq	-8(%rbx),%r13
1087	adcxq	%rbp,%r14
1088	leaq	32(%rsi),%rsi
1089	adoxq	%rbp,%r14
1090
1091	mulxq	0(%rcx),%rax,%r10
1092	adcxq	%rax,%r15
1093	adoxq	%r11,%r10
1094	mulxq	8(%rcx),%rax,%r11
1095	adcxq	%rax,%r10
1096	adoxq	%r12,%r11
1097	mulxq	16(%rcx),%rax,%r12
1098	movq	%r10,-32(%rbx)
1099	adcxq	%rax,%r11
1100	adoxq	%r13,%r12
1101	mulxq	24(%rcx),%rax,%r15
1102	movq	%r9,%rdx
1103	movq	%r11,-24(%rbx)
1104	leaq	32(%rcx),%rcx
1105	adcxq	%rax,%r12
1106	adoxq	%rbp,%r15
1107	movq	48(%rsp),%rdi
1108	movq	%r12,-16(%rbx)
1109
1110	jmp	L$mulx4x_inner
1111
1112.p2align	5
# Inner loop: same shape as L$mulx4x_1st, additionally adding the previous
# tp words (read through %rbx) into the ap-product chain.
1113L$mulx4x_inner:
1114	mulxq	0(%rsi),%r10,%rax
1115	adcxq	%rbp,%r15
1116	adoxq	%r14,%r10
1117	mulxq	8(%rsi),%r11,%r14
1118	adcxq	0(%rbx),%r10
1119	adoxq	%rax,%r11
1120	mulxq	16(%rsi),%r12,%rax
1121	adcxq	8(%rbx),%r11
1122	adoxq	%r14,%r12
1123	mulxq	24(%rsi),%r13,%r14
1124	movq	%r8,%rdx
1125	adcxq	16(%rbx),%r12
1126	adoxq	%rax,%r13
1127	adcxq	24(%rbx),%r13
1128	adoxq	%rbp,%r14
1129	leaq	32(%rsi),%rsi
1130	leaq	32(%rbx),%rbx
1131	adcxq	%rbp,%r14
1132
1133	adoxq	%r15,%r10
1134	mulxq	0(%rcx),%rax,%r15
1135	adcxq	%rax,%r10
1136	adoxq	%r15,%r11
1137	mulxq	8(%rcx),%rax,%r15
1138	adcxq	%rax,%r11
1139	adoxq	%r15,%r12
1140	mulxq	16(%rcx),%rax,%r15
1141	movq	%r10,-40(%rbx)
1142	adcxq	%rax,%r12
1143	adoxq	%r15,%r13
1144	mulxq	24(%rcx),%rax,%r15
1145	movq	%r9,%rdx
1146	movq	%r11,-32(%rbx)
1147	movq	%r12,-24(%rbx)
1148	adcxq	%rax,%r13
1149	adoxq	%rbp,%r15
1150	leaq	32(%rcx),%rcx
1151	movq	%r13,-16(%rbx)
1152
1153	decq	%rdi
1154	jnz	L$mulx4x_inner
1155
# End of the pass: fold in the carry word stored at (%rbx) by the outer loop
# (the subq from zero %rbp turns it into CF), store the top word, and leave
# %r15 as 0 or -1.
1156	movq	0(%rsp),%rax
1157	movq	8(%rsp),%rdi
1158	adcq	%rbp,%r15
1159	subq	0(%rbx),%rbp
1160	adcq	%r15,%r14
1161	sbbq	%r15,%r15
1162	movq	%r14,-8(%rbx)
1163
# Loop until the bp pointer reaches the end marker at 16(%rsp).
1164	cmpq	16(%rsp),%rdi
1165	jne	L$mulx4x_outer
1166
# Final subtraction setup: %rbx = tp, %rcx rewound to the start of np,
# %rdx = 8*num, %rax = num/4 iterations, %rdi = rp.
# NOTE(review): the first sbbq in the loop consumes CF as left by the shrq
# below; this looks like it relies on the shifted-out bit being zero -
# confirm.
1167	leaq	64(%rsp),%rbx
1168	subq	%rax,%rcx
1169	negq	%r15
1170	movq	%rax,%rdx
1171	shrq	$3+2,%rax
1172	movq	32(%rsp),%rdi
1173	jmp	L$mulx4x_sub
1174
1175.p2align	5
# rp = tp - np, four words per iteration, with a borrow chain.
1176L$mulx4x_sub:
1177	movq	0(%rbx),%r11
1178	movq	8(%rbx),%r12
1179	movq	16(%rbx),%r13
1180	movq	24(%rbx),%r14
1181	leaq	32(%rbx),%rbx
1182	sbbq	0(%rcx),%r11
1183	sbbq	8(%rcx),%r12
1184	sbbq	16(%rcx),%r13
1185	sbbq	24(%rcx),%r14
1186	leaq	32(%rcx),%rcx
1187	movq	%r11,0(%rdi)
1188	movq	%r12,8(%rdi)
1189	movq	%r13,16(%rdi)
1190	movq	%r14,24(%rdi)
1191	leaq	32(%rdi),%rdi
1192	decq	%rax
1193	jnz	L$mulx4x_sub
1194
# Fold the final borrow into %r15, reset %rbx to tp and %rdi to rp.
1195	sbbq	$0,%r15
1196	leaq	64(%rsp),%rbx
1197	subq	%rdx,%rdi
1198
# The .byte sequence encodes movq %r15,%xmm1; the mask is then broadcast
# with pshufd $0. %rsi = entry %rsp from the frame.
1199.byte	102,73,15,110,207
1200	pxor	%xmm0,%xmm0
1201	pshufd	$0,%xmm1,%xmm1
1202	movq	40(%rsp),%rsi
1203
1204	jmp	L$mulx4x_cond_copy
1205
1206.p2align	5
# Branch-free select, 32 bytes per iteration: tp words are kept under mask
# %xmm1 and the words already in rp under the mask produced by pcmpeqd
# (dwords of %xmm1 equal to zero). tp is zeroed as it is read. %rdx counts
# the remaining bytes down to zero.
1207L$mulx4x_cond_copy:
1208	movdqa	0(%rbx),%xmm2
1209	movdqa	16(%rbx),%xmm3
1210	leaq	32(%rbx),%rbx
1211	movdqu	0(%rdi),%xmm4
1212	movdqu	16(%rdi),%xmm5
1213	leaq	32(%rdi),%rdi
1214	movdqa	%xmm0,-32(%rbx)
1215	movdqa	%xmm0,-16(%rbx)
1216	pcmpeqd	%xmm1,%xmm0
1217	pand	%xmm1,%xmm2
1218	pand	%xmm1,%xmm3
1219	pand	%xmm0,%xmm4
1220	pand	%xmm0,%xmm5
1221	pxor	%xmm0,%xmm0
1222	por	%xmm2,%xmm4
1223	por	%xmm3,%xmm5
1224	movdqu	%xmm4,-32(%rdi)
1225	movdqu	%xmm5,-16(%rdi)
1226	subq	$32,%rdx
1227	jnz	L$mulx4x_cond_copy
1228
# %rdx is zero here, so this clears the word just past the copied tp area.
1229	movq	%rdx,(%rbx)
1230
# Epilogue: return 1, restore the saved registers from below the entry
# %rsp held in %rsi, then reset %rsp.
1231	movq	$1,%rax
1232	movq	-48(%rsi),%r15
1233
1234	movq	-40(%rsi),%r14
1235
1236	movq	-32(%rsi),%r13
1237
1238	movq	-24(%rsi),%r12
1239
1240	movq	-16(%rsi),%rbp
1241
1242	movq	-8(%rsi),%rbx
1243
1244	leaq	(%rsi),%rsp
1245
1246L$mulx4x_epilogue:
# Bytes 0xf3,0xc3 encode a rep-prefixed ret.
1247	.byte	0xf3,0xc3
1248
1249
# NUL-terminated ASCII identification string:
# "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
1250.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1251.p2align	4
1252#endif
1253