• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# This file is generated from a similarly-named Perl script in the BoringSSL
2# source tree. Do not edit by hand.
3
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
11.text
12
13
14
15.globl	_GFp_bn_mul_mont
16.private_extern _GFp_bn_mul_mont
17
18.p2align	4
# _GFp_bn_mul_mont - exported entry point plus the generic word-by-word path.
# Register roles as used by the code below:
#   %rdi  destination word array (written in L$sub and L$copy)
#   %rsi  first source word array
#   %rdx  second source word array (copied to %r12)
#   %rcx  word array that is multiplied in on every pass and subtracted at the end
#   %r8   pointer to a single word that is loaded once and used as a multiplier
#   %r9d  32-bit word count
# Returns 1 in %rax. Saves and restores %rbx, %rbp and %r12-%r15.
# NOTE(review): presumably these are rp, ap, bp, np, n0 and num of a
# Montgomery multiplication (see the banner string at the end of the file);
# confirm against the C prototype.
19_GFp_bn_mul_mont:
20
21	movl	%r9d,%r9d	# zero-extend the 32-bit count into %r9
22	movq	%rsp,%rax	# keep the entry %rsp; it is stored in the frame later
23
24	testl	$3,%r9d	# count not a multiple of 4 ...
25	jnz	L$mul_enter	# ... generic path
26	cmpl	$8,%r9d	# count below 8 ...
27	jb	L$mul_enter	# ... generic path
28	movl	_GFp_ia32cap_P+8(%rip),%r11d	# word that L$mul4x_enter masks with 0x80100
29	cmpq	%rsi,%rdx	# the two source pointers differ ...
30	jne	L$mul4x_enter	# ... 4-way multiply path
31	testl	$7,%r9d	# same pointer and count a multiple of 8 ...
32	jz	L$sqr8x_enter	# ... squaring path
33	jmp	L$mul4x_enter
34
35.p2align	4
# Generic path: save the six callee-saved registers used below.
36L$mul_enter:
37	pushq	%rbx
38
39	pushq	%rbp
40
41	pushq	%r12
42
43	pushq	%r13
44
45	pushq	%r14
46
47	pushq	%r15
48
49
50	negq	%r9
51	movq	%rsp,%r11
52	leaq	-16(%rsp,%r9,8),%r10	# %r10 = %rsp - 8*count - 16: room for count+2 words
53	negq	%r9	# back to the positive count
54	andq	$-1024,%r10	# round the frame base down to a 1024-byte boundary
55
56
57
58
59
60
61
62
63
# Move %rsp down to %r10 in steps of at most 4096 bytes, reading one word at
# every step, so each page between the old and new %rsp is touched in order.
64	subq	%r10,%r11
65	andq	$-4096,%r11
66	leaq	(%r10,%r11,1),%rsp
67	movq	(%rsp),%r11
68	cmpq	%r10,%rsp
69	ja	L$mul_page_walk
70	jmp	L$mul_page_walk_done
71
72.p2align	4
73L$mul_page_walk:
74	leaq	-4096(%rsp),%rsp
75	movq	(%rsp),%r11
76	cmpq	%r10,%rsp
77	ja	L$mul_page_walk
78L$mul_page_walk_done:
79
80	movq	%rax,8(%rsp,%r9,8)	# scratch[count+1] = entry %rsp, read back in the epilogue
81
# First pass: scratch[] (at %rsp) = %rsi[] * first word of %r12[] plus
# %rcx[] * %rbp, shifted down by one word.
82L$mul_body:
83	movq	%rdx,%r12	# free %rdx, which mulq overwrites
84	movq	(%r8),%r8	# %r8 = the word it pointed to
85	movq	(%r12),%rbx	# %rbx = word 0 of the second source
86	movq	(%rsi),%rax	# %rax = word 0 of the first source
87
88	xorq	%r14,%r14	# outer index = 0
89	xorq	%r15,%r15	# inner index = 0
90
91	movq	%r8,%rbp
92	mulq	%rbx	# %rdx:%rax = %rsi[0] * %rbx
93	movq	%rax,%r10
94	movq	(%rcx),%rax
95
96	imulq	%r10,%rbp	# %rbp = low product word * %r8 (low 64 bits): multiplier for %rcx[]
97	movq	%rdx,%r11
98
99	mulq	%rbp	# %rdx:%rax = %rcx[0] * %rbp
100	addq	%rax,%r10	# only the carry of this sum is used; %r10 is overwritten before any store
101	movq	8(%rsi),%rax
102	adcq	$0,%rdx
103	movq	%rdx,%r13
104
105	leaq	1(%r15),%r15
106	jmp	L$1st_enter
107
108.p2align	4
# Inner loop of the first pass: one word of %rsi[] and one word of %rcx[]
# per iteration, result word stored one slot below the current index.
109L$1st:
110	addq	%rax,%r13
111	movq	(%rsi,%r15,8),%rax
112	adcq	$0,%rdx
113	addq	%r11,%r13
114	movq	%r10,%r11
115	adcq	$0,%rdx
116	movq	%r13,-16(%rsp,%r15,8)
117	movq	%rdx,%r13
118
119L$1st_enter:
120	mulq	%rbx
121	addq	%rax,%r11
122	movq	(%rcx,%r15,8),%rax
123	adcq	$0,%rdx
124	leaq	1(%r15),%r15
125	movq	%rdx,%r10
126
127	mulq	%rbp
128	cmpq	%r9,%r15
129	jne	L$1st
130
131	addq	%rax,%r13
132	movq	(%rsi),%rax	# reload %rsi[0] for the next pass
133	adcq	$0,%rdx
134	addq	%r11,%r13
135	adcq	$0,%rdx
136	movq	%r13,-16(%rsp,%r15,8)
137	movq	%rdx,%r13
138	movq	%r10,%r11
139
140	xorq	%rdx,%rdx
141	addq	%r11,%r13
142	adcq	$0,%rdx
143	movq	%r13,-8(%rsp,%r9,8)	# scratch[count-1]
144	movq	%rdx,(%rsp,%r9,8)	# scratch[count] = top carry word
145
146	leaq	1(%r14),%r14
147	jmp	L$outer
148.p2align	4
# Outer loop: one pass per remaining word of the second source. Same shape
# as the first pass, but the previous scratch[] words are added in as well.
149L$outer:
150	movq	(%r12,%r14,8),%rbx	# %rbx = word %r14 of the second source
151	xorq	%r15,%r15
152	movq	%r8,%rbp
153	movq	(%rsp),%r10	# scratch[0] from the previous pass
154	mulq	%rbx
155	addq	%rax,%r10
156	movq	(%rcx),%rax
157	adcq	$0,%rdx
158
159	imulq	%r10,%rbp	# multiplier for %rcx[] in this pass
160	movq	%rdx,%r11
161
162	mulq	%rbp
163	addq	%rax,%r10	# only the carry of this sum is used
164	movq	8(%rsi),%rax
165	adcq	$0,%rdx
166	movq	8(%rsp),%r10
167	movq	%rdx,%r13
168
169	leaq	1(%r15),%r15
170	jmp	L$inner_enter
171
172.p2align	4
173L$inner:
174	addq	%rax,%r13
175	movq	(%rsi,%r15,8),%rax
176	adcq	$0,%rdx
177	addq	%r10,%r13
178	movq	(%rsp,%r15,8),%r10
179	adcq	$0,%rdx
180	movq	%r13,-16(%rsp,%r15,8)
181	movq	%rdx,%r13
182
183L$inner_enter:
184	mulq	%rbx
185	addq	%rax,%r11
186	movq	(%rcx,%r15,8),%rax
187	adcq	$0,%rdx
188	addq	%r11,%r10
189	movq	%rdx,%r11
190	adcq	$0,%r11
191	leaq	1(%r15),%r15
192
193	mulq	%rbp
194	cmpq	%r9,%r15
195	jne	L$inner
196
197	addq	%rax,%r13
198	movq	(%rsi),%rax	# reload %rsi[0] for the next pass
199	adcq	$0,%rdx
200	addq	%r10,%r13
201	movq	(%rsp,%r15,8),%r10	# previous top word scratch[count]
202	adcq	$0,%rdx
203	movq	%r13,-16(%rsp,%r15,8)
204	movq	%rdx,%r13
205
206	xorq	%rdx,%rdx
207	addq	%r11,%r13
208	adcq	$0,%rdx
209	addq	%r10,%r13
210	adcq	$0,%rdx
211	movq	%r13,-8(%rsp,%r9,8)
212	movq	%rdx,(%rsp,%r9,8)
213
214	leaq	1(%r14),%r14
215	cmpq	%r9,%r14
216	jb	L$outer
217
218	xorq	%r14,%r14	# index = 0; also clears CF for the sbbq chain below
219	movq	(%rsp),%rax
220	movq	%r9,%r15
221
222.p2align	4
# %rdi[i] = scratch[i] - %rcx[i] - borrow. leaq and decq leave CF untouched,
# so the borrow is carried from one iteration to the next.
223L$sub:	sbbq	(%rcx,%r14,8),%rax
224	movq	%rax,(%rdi,%r14,8)
225	movq	8(%rsp,%r14,8),%rax
226	leaq	1(%r14),%r14
227	decq	%r15
228	jnz	L$sub
229
230	sbbq	$0,%rax	# %rax = scratch[count] minus the final borrow; used as an AND mask (expected 0 or all-ones)
231	movq	$-1,%rbx
232	xorq	%rax,%rbx	# %rbx = bitwise complement of the mask
233	xorq	%r14,%r14
234	movq	%r9,%r15
235
# Branch-free select: %rdi[i] = (%rdi[i] & %rbx) | (scratch[i] & %rax).
# Each scratch word is overwritten with the count value in %r9 as it is read.
236L$copy:
237	movq	(%rdi,%r14,8),%rcx
238	movq	(%rsp,%r14,8),%rdx
239	andq	%rbx,%rcx
240	andq	%rax,%rdx
241	movq	%r9,(%rsp,%r14,8)
242	orq	%rcx,%rdx
243	movq	%rdx,(%rdi,%r14,8)
244	leaq	1(%r14),%r14
245	subq	$1,%r15
246	jnz	L$copy
247
248	movq	8(%rsp,%r9,8),%rsi	# %rsi = entry %rsp saved before L$mul_body
249
250	movq	$1,%rax	# return value
251	movq	-48(%rsi),%r15	# restore the registers pushed at L$mul_enter
252
253	movq	-40(%rsi),%r14
254
255	movq	-32(%rsi),%r13
256
257	movq	-24(%rsi),%r12
258
259	movq	-16(%rsi),%rbp
260
261	movq	-8(%rsi),%rbx
262
263	leaq	(%rsi),%rsp	# drop the frame: %rsp = entry value
264
265L$mul_epilogue:
266	.byte	0xf3,0xc3	# encodes rep ret
267
268
269
270.p2align	4
# bn_mul4x_mont - same computation as the generic path of _GFp_bn_mul_mont,
# processing four words per inner-loop iteration. Within this view it is
# entered through the jumps to L$mul4x_enter, with %rax holding the entry
# %rsp and %r11d holding the word loaded from _GFp_ia32cap_P+8.
# Uses %rdi as a scratch accumulator, so the destination pointer is parked
# in the frame and reloaded before the final subtraction.
271bn_mul4x_mont:
272
273	movl	%r9d,%r9d	# zero-extend the 32-bit count
274	movq	%rsp,%rax	# keep the entry %rsp
275
276L$mul4x_enter:
277	andl	$0x80100,%r11d	# NOTE(review): presumably the BMI2 and ADX feature bits, matching the mulx/adcx/adox code - confirm
278	cmpl	$0x80100,%r11d
279	je	L$mulx4x_enter	# both bits set: use the mulx path
280	pushq	%rbx
281
282	pushq	%rbp
283
284	pushq	%r12
285
286	pushq	%r13
287
288	pushq	%r14
289
290	pushq	%r15
291
292
293	negq	%r9
294	movq	%rsp,%r11
295	leaq	-32(%rsp,%r9,8),%r10	# %r10 = %rsp - 8*count - 32: room for count+4 words
296	negq	%r9
297	andq	$-1024,%r10	# round the frame base down to a 1024-byte boundary
298
# Move %rsp down to %r10 in steps of at most 4096 bytes, reading one word at
# every step.
299	subq	%r10,%r11
300	andq	$-4096,%r11
301	leaq	(%r10,%r11,1),%rsp
302	movq	(%rsp),%r11
303	cmpq	%r10,%rsp
304	ja	L$mul4x_page_walk
305	jmp	L$mul4x_page_walk_done
306
307L$mul4x_page_walk:
308	leaq	-4096(%rsp),%rsp
309	movq	(%rsp),%r11
310	cmpq	%r10,%rsp
311	ja	L$mul4x_page_walk
312L$mul4x_page_walk_done:
313
314	movq	%rax,8(%rsp,%r9,8)	# scratch[count+1] = entry %rsp
315
316L$mul4x_body:
317	movq	%rdi,16(%rsp,%r9,8)	# scratch[count+2] = destination pointer; %rdi is reused below
318	movq	%rdx,%r12	# free %rdx, which mulq overwrites
319	movq	(%r8),%r8	# %r8 = the word it pointed to
320	movq	(%r12),%rbx	# %rbx = word 0 of the second source
321	movq	(%rsi),%rax	# %rax = word 0 of the first source
322
323	xorq	%r14,%r14	# outer index = 0
324	xorq	%r15,%r15	# inner index = 0
325
326	movq	%r8,%rbp
327	mulq	%rbx
328	movq	%rax,%r10
329	movq	(%rcx),%rax
330
331	imulq	%r10,%rbp	# %rbp = low product word * %r8 (low 64 bits): multiplier for %rcx[]
332	movq	%rdx,%r11
333
334	mulq	%rbp
335	addq	%rax,%r10	# only the carry of this sum is used
336	movq	8(%rsi),%rax
337	adcq	$0,%rdx
338	movq	%rdx,%rdi
339
340	mulq	%rbx
341	addq	%rax,%r11
342	movq	8(%rcx),%rax
343	adcq	$0,%rdx
344	movq	%rdx,%r10
345
346	mulq	%rbp
347	addq	%rax,%rdi
348	movq	16(%rsi),%rax
349	adcq	$0,%rdx
350	addq	%r11,%rdi
351	leaq	4(%r15),%r15
352	adcq	$0,%rdx
353	movq	%rdi,(%rsp)	# scratch[0]
354	movq	%rdx,%r13
355	jmp	L$1st4x
356.p2align	4
# First pass, four words per iteration: alternates %rsi[j] * %rbx and
# %rcx[j] * %rbp, with %r10/%r11 and %r13/%rdi as the two carry pairs.
357L$1st4x:
358	mulq	%rbx
359	addq	%rax,%r10
360	movq	-16(%rcx,%r15,8),%rax
361	adcq	$0,%rdx
362	movq	%rdx,%r11
363
364	mulq	%rbp
365	addq	%rax,%r13
366	movq	-8(%rsi,%r15,8),%rax
367	adcq	$0,%rdx
368	addq	%r10,%r13
369	adcq	$0,%rdx
370	movq	%r13,-24(%rsp,%r15,8)
371	movq	%rdx,%rdi
372
373	mulq	%rbx
374	addq	%rax,%r11
375	movq	-8(%rcx,%r15,8),%rax
376	adcq	$0,%rdx
377	movq	%rdx,%r10
378
379	mulq	%rbp
380	addq	%rax,%rdi
381	movq	(%rsi,%r15,8),%rax
382	adcq	$0,%rdx
383	addq	%r11,%rdi
384	adcq	$0,%rdx
385	movq	%rdi,-16(%rsp,%r15,8)
386	movq	%rdx,%r13
387
388	mulq	%rbx
389	addq	%rax,%r10
390	movq	(%rcx,%r15,8),%rax
391	adcq	$0,%rdx
392	movq	%rdx,%r11
393
394	mulq	%rbp
395	addq	%rax,%r13
396	movq	8(%rsi,%r15,8),%rax
397	adcq	$0,%rdx
398	addq	%r10,%r13
399	adcq	$0,%rdx
400	movq	%r13,-8(%rsp,%r15,8)
401	movq	%rdx,%rdi
402
403	mulq	%rbx
404	addq	%rax,%r11
405	movq	8(%rcx,%r15,8),%rax
406	adcq	$0,%rdx
407	leaq	4(%r15),%r15
408	movq	%rdx,%r10
409
410	mulq	%rbp
411	addq	%rax,%rdi
412	movq	-16(%rsi,%r15,8),%rax
413	adcq	$0,%rdx
414	addq	%r11,%rdi
415	adcq	$0,%rdx
416	movq	%rdi,-32(%rsp,%r15,8)
417	movq	%rdx,%r13
418	cmpq	%r9,%r15
419	jb	L$1st4x
420
# Tail of the first pass: the last two words and the top carry word.
421	mulq	%rbx
422	addq	%rax,%r10
423	movq	-16(%rcx,%r15,8),%rax
424	adcq	$0,%rdx
425	movq	%rdx,%r11
426
427	mulq	%rbp
428	addq	%rax,%r13
429	movq	-8(%rsi,%r15,8),%rax
430	adcq	$0,%rdx
431	addq	%r10,%r13
432	adcq	$0,%rdx
433	movq	%r13,-24(%rsp,%r15,8)
434	movq	%rdx,%rdi
435
436	mulq	%rbx
437	addq	%rax,%r11
438	movq	-8(%rcx,%r15,8),%rax
439	adcq	$0,%rdx
440	movq	%rdx,%r10
441
442	mulq	%rbp
443	addq	%rax,%rdi
444	movq	(%rsi),%rax	# reload %rsi[0] for the next pass
445	adcq	$0,%rdx
446	addq	%r11,%rdi
447	adcq	$0,%rdx
448	movq	%rdi,-16(%rsp,%r15,8)
449	movq	%rdx,%r13
450
451	xorq	%rdi,%rdi
452	addq	%r10,%r13
453	adcq	$0,%rdi
454	movq	%r13,-8(%rsp,%r15,8)
455	movq	%rdi,(%rsp,%r15,8)	# top carry word
456
457	leaq	1(%r14),%r14
458.p2align	2
# Outer loop: one pass per remaining word of the second source, adding the
# scratch[] words left by the previous pass.
459L$outer4x:
460	movq	(%r12,%r14,8),%rbx	# %rbx = word %r14 of the second source
461	xorq	%r15,%r15
462	movq	(%rsp),%r10
463	movq	%r8,%rbp
464	mulq	%rbx
465	addq	%rax,%r10
466	movq	(%rcx),%rax
467	adcq	$0,%rdx
468
469	imulq	%r10,%rbp	# multiplier for %rcx[] in this pass
470	movq	%rdx,%r11
471
472	mulq	%rbp
473	addq	%rax,%r10	# only the carry of this sum is used
474	movq	8(%rsi),%rax
475	adcq	$0,%rdx
476	movq	%rdx,%rdi
477
478	mulq	%rbx
479	addq	%rax,%r11
480	movq	8(%rcx),%rax
481	adcq	$0,%rdx
482	addq	8(%rsp),%r11
483	adcq	$0,%rdx
484	movq	%rdx,%r10
485
486	mulq	%rbp
487	addq	%rax,%rdi
488	movq	16(%rsi),%rax
489	adcq	$0,%rdx
490	addq	%r11,%rdi
491	leaq	4(%r15),%r15
492	adcq	$0,%rdx
493	movq	%rdi,(%rsp)
494	movq	%rdx,%r13
495	jmp	L$inner4x
496.p2align	4
497L$inner4x:
498	mulq	%rbx
499	addq	%rax,%r10
500	movq	-16(%rcx,%r15,8),%rax
501	adcq	$0,%rdx
502	addq	-16(%rsp,%r15,8),%r10
503	adcq	$0,%rdx
504	movq	%rdx,%r11
505
506	mulq	%rbp
507	addq	%rax,%r13
508	movq	-8(%rsi,%r15,8),%rax
509	adcq	$0,%rdx
510	addq	%r10,%r13
511	adcq	$0,%rdx
512	movq	%r13,-24(%rsp,%r15,8)
513	movq	%rdx,%rdi
514
515	mulq	%rbx
516	addq	%rax,%r11
517	movq	-8(%rcx,%r15,8),%rax
518	adcq	$0,%rdx
519	addq	-8(%rsp,%r15,8),%r11
520	adcq	$0,%rdx
521	movq	%rdx,%r10
522
523	mulq	%rbp
524	addq	%rax,%rdi
525	movq	(%rsi,%r15,8),%rax
526	adcq	$0,%rdx
527	addq	%r11,%rdi
528	adcq	$0,%rdx
529	movq	%rdi,-16(%rsp,%r15,8)
530	movq	%rdx,%r13
531
532	mulq	%rbx
533	addq	%rax,%r10
534	movq	(%rcx,%r15,8),%rax
535	adcq	$0,%rdx
536	addq	(%rsp,%r15,8),%r10
537	adcq	$0,%rdx
538	movq	%rdx,%r11
539
540	mulq	%rbp
541	addq	%rax,%r13
542	movq	8(%rsi,%r15,8),%rax
543	adcq	$0,%rdx
544	addq	%r10,%r13
545	adcq	$0,%rdx
546	movq	%r13,-8(%rsp,%r15,8)
547	movq	%rdx,%rdi
548
549	mulq	%rbx
550	addq	%rax,%r11
551	movq	8(%rcx,%r15,8),%rax
552	adcq	$0,%rdx
553	addq	8(%rsp,%r15,8),%r11
554	adcq	$0,%rdx
555	leaq	4(%r15),%r15
556	movq	%rdx,%r10
557
558	mulq	%rbp
559	addq	%rax,%rdi
560	movq	-16(%rsi,%r15,8),%rax
561	adcq	$0,%rdx
562	addq	%r11,%rdi
563	adcq	$0,%rdx
564	movq	%rdi,-32(%rsp,%r15,8)
565	movq	%rdx,%r13
566	cmpq	%r9,%r15
567	jb	L$inner4x
568
# Tail of an outer pass: last two words, then fold in the previous top word.
569	mulq	%rbx
570	addq	%rax,%r10
571	movq	-16(%rcx,%r15,8),%rax
572	adcq	$0,%rdx
573	addq	-16(%rsp,%r15,8),%r10
574	adcq	$0,%rdx
575	movq	%rdx,%r11
576
577	mulq	%rbp
578	addq	%rax,%r13
579	movq	-8(%rsi,%r15,8),%rax
580	adcq	$0,%rdx
581	addq	%r10,%r13
582	adcq	$0,%rdx
583	movq	%r13,-24(%rsp,%r15,8)
584	movq	%rdx,%rdi
585
586	mulq	%rbx
587	addq	%rax,%r11
588	movq	-8(%rcx,%r15,8),%rax
589	adcq	$0,%rdx
590	addq	-8(%rsp,%r15,8),%r11
591	adcq	$0,%rdx
592	leaq	1(%r14),%r14	# advance the outer index
593	movq	%rdx,%r10
594
595	mulq	%rbp
596	addq	%rax,%rdi
597	movq	(%rsi),%rax	# reload %rsi[0] for the next pass
598	adcq	$0,%rdx
599	addq	%r11,%rdi
600	adcq	$0,%rdx
601	movq	%rdi,-16(%rsp,%r15,8)
602	movq	%rdx,%r13
603
604	xorq	%rdi,%rdi
605	addq	%r10,%r13
606	adcq	$0,%rdi
607	addq	(%rsp,%r9,8),%r13	# add the top word left by the previous pass
608	adcq	$0,%rdi
609	movq	%r13,-8(%rsp,%r15,8)
610	movq	%rdi,(%rsp,%r15,8)
611
612	cmpq	%r9,%r14
613	jb	L$outer4x
# Final subtraction: destination[] = scratch[] - %rcx[], four words per
# iteration, with the loads and subtractions running ahead of the stores.
614	movq	16(%rsp,%r9,8),%rdi	# reload the destination pointer parked at L$mul4x_body
615	leaq	-4(%r9),%r15
616	movq	0(%rsp),%rax
617	movq	8(%rsp),%rdx
618	shrq	$2,%r15	# loop count = (count - 4) / 4; the last group is handled after the loop
619	leaq	(%rsp),%rsi	# %rsi = scratch base
620	xorq	%r14,%r14
621
622	subq	0(%rcx),%rax	# starts the borrow chain
623	movq	16(%rsi),%rbx
624	movq	24(%rsi),%rbp
625	sbbq	8(%rcx),%rdx
626
627L$sub4x:
628	movq	%rax,0(%rdi,%r14,8)
629	movq	%rdx,8(%rdi,%r14,8)
630	sbbq	16(%rcx,%r14,8),%rbx
631	movq	32(%rsi,%r14,8),%rax
632	movq	40(%rsi,%r14,8),%rdx
633	sbbq	24(%rcx,%r14,8),%rbp
634	movq	%rbx,16(%rdi,%r14,8)
635	movq	%rbp,24(%rdi,%r14,8)
636	sbbq	32(%rcx,%r14,8),%rax
637	movq	48(%rsi,%r14,8),%rbx
638	movq	56(%rsi,%r14,8),%rbp
639	sbbq	40(%rcx,%r14,8),%rdx
640	leaq	4(%r14),%r14	# leaq and decq leave CF untouched, so the borrow survives
641	decq	%r15
642	jnz	L$sub4x
643
644	movq	%rax,0(%rdi,%r14,8)
645	movq	32(%rsi,%r14,8),%rax	# %rax = top word scratch[count]
646	sbbq	16(%rcx,%r14,8),%rbx
647	movq	%rdx,8(%rdi,%r14,8)
648	sbbq	24(%rcx,%r14,8),%rbp
649	movq	%rbx,16(%rdi,%r14,8)
650
651	sbbq	$0,%rax	# top word minus the final borrow; used as the select mask (expected 0 or all-ones)
652	movq	%rbp,24(%rdi,%r14,8)
653	pxor	%xmm0,%xmm0	# zero, used to wipe the scratch area
654.byte	102,72,15,110,224	# encodes movq %rax,%xmm4
655	pcmpeqd	%xmm5,%xmm5	# %xmm5 = all ones
656	pshufd	$0,%xmm4,%xmm4	# broadcast the low 32 bits of the mask
657	movq	%r9,%r15
658	pxor	%xmm4,%xmm5	# %xmm5 = complement of the mask
659	shrq	$2,%r15	# count / 4 iterations of 32 bytes each
660	xorl	%eax,%eax	# byte offset = 0
661
662	jmp	L$copy4x
663.p2align	4
# Branch-free select, 32 bytes per iteration:
# destination = (scratch & %xmm4) | (destination & %xmm5); scratch is zeroed.
# The movdqa accesses use %rsp, which was rounded to a 1024-byte boundary.
664L$copy4x:
665	movdqa	(%rsp,%rax,1),%xmm1
666	movdqu	(%rdi,%rax,1),%xmm2
667	pand	%xmm4,%xmm1
668	pand	%xmm5,%xmm2
669	movdqa	16(%rsp,%rax,1),%xmm3
670	movdqa	%xmm0,(%rsp,%rax,1)
671	por	%xmm2,%xmm1
672	movdqu	16(%rdi,%rax,1),%xmm2
673	movdqu	%xmm1,(%rdi,%rax,1)
674	pand	%xmm4,%xmm3
675	pand	%xmm5,%xmm2
676	movdqa	%xmm0,16(%rsp,%rax,1)
677	por	%xmm2,%xmm3
678	movdqu	%xmm3,16(%rdi,%rax,1)
679	leaq	32(%rax),%rax
680	decq	%r15
681	jnz	L$copy4x
682	movq	8(%rsp,%r9,8),%rsi	# %rsi = entry %rsp saved before L$mul4x_body
683
684	movq	$1,%rax	# return value
685	movq	-48(%rsi),%r15	# restore the registers pushed at L$mul4x_enter
686
687	movq	-40(%rsi),%r14
688
689	movq	-32(%rsi),%r13
690
691	movq	-24(%rsi),%r12
692
693	movq	-16(%rsi),%rbp
694
695	movq	-8(%rsi),%rbx
696
697	leaq	(%rsi),%rsp	# drop the frame: %rsp = entry value
698
699L$mul4x_epilogue:
700	.byte	0xf3,0xc3	# encodes rep ret
701
702
703
704
705
706
707.p2align	5
# bn_sqr8x_mont - squaring path. Within this view it is entered through
# L$sqr8x_enter from _GFp_bn_mul_mont, when both source pointers are equal
# and the count is a multiple of 8, with %rax holding the entry %rsp.
# The multiplication itself is done by one of two helpers that are not
# defined in this view; this block builds the frame, calls the helper, then
# performs the final subtraction and masked copy into the destination.
708bn_sqr8x_mont:
709
710	movq	%rsp,%rax	# keep the entry %rsp
711
712L$sqr8x_enter:
713	pushq	%rbx
714
715	pushq	%rbp
716
717	pushq	%r12
718
719	pushq	%r13
720
721	pushq	%r14
722
723	pushq	%r15
724
725L$sqr8x_prologue:
726
727	movl	%r9d,%r10d
728	shll	$3,%r9d	# %r9 = count * 8 (length in bytes)
729	shlq	$3+2,%r10	# %r10 = count * 32
730	negq	%r9	# %r9 = -(length in bytes)
731
732
733
734
735
736
# NOTE(review): the frame base chosen below depends on the distance between
# the candidate frame and %rsi modulo 4096 - presumably to keep the scratch
# area from aliasing the input at 4 KiB granularity; confirm against the
# generator script.
737	leaq	-64(%rsp,%r9,2),%r11	# candidate base: %rsp - 2*length - 64
738	movq	%rsp,%rbp
739	movq	(%r8),%r8	# %r8 = the word it pointed to
740	subq	%rsi,%r11
741	andq	$4095,%r11	# offset of the candidate from %rsi, modulo 4096
742	cmpq	%r11,%r10
743	jb	L$sqr8x_sp_alt
744	subq	%r11,%rbp
745	leaq	-64(%rbp,%r9,2),%rbp
746	jmp	L$sqr8x_sp_done
747
748.p2align	5
749L$sqr8x_sp_alt:
750	leaq	4096-64(,%r9,2),%r10
751	leaq	-64(%rbp,%r9,2),%rbp
752	subq	%r10,%r11
753	movq	$0,%r10
754	cmovcq	%r10,%r11	# clamp the adjustment to 0 when the subtraction borrowed
755	subq	%r11,%rbp
756L$sqr8x_sp_done:
757	andq	$-64,%rbp	# round the frame base down to a 64-byte boundary
# Move %rsp down to %rbp in steps of at most 4096 bytes, reading one word at
# every step.
758	movq	%rsp,%r11
759	subq	%rbp,%r11
760	andq	$-4096,%r11
761	leaq	(%r11,%rbp,1),%rsp
762	movq	(%rsp),%r10
763	cmpq	%rbp,%rsp
764	ja	L$sqr8x_page_walk
765	jmp	L$sqr8x_page_walk_done
766
767.p2align	4
768L$sqr8x_page_walk:
769	leaq	-4096(%rsp),%rsp
770	movq	(%rsp),%r10
771	cmpq	%rbp,%rsp
772	ja	L$sqr8x_page_walk
773L$sqr8x_page_walk_done:
774
775	movq	%r9,%r10	# %r10 = -(length in bytes)
776	negq	%r9	# %r9 = +(length in bytes)
777
778	movq	%r8,32(%rsp)	# frame slot 32: the word loaded from (%r8)
779	movq	%rax,40(%rsp)	# frame slot 40: entry %rsp, read back before the epilogue
780
781L$sqr8x_body:
782
783.byte	102,72,15,110,209	# encodes movq %rcx,%xmm2
784	pxor	%xmm0,%xmm0
785.byte	102,72,15,110,207	# encodes movq %rdi,%xmm1 (destination pointer, read back after the call)
786.byte	102,73,15,110,218	# encodes movq %r10,%xmm3
787	movl	_GFp_ia32cap_P+8(%rip),%eax
788	andl	$0x80100,%eax	# same two-bit test as at L$mul4x_enter
789	cmpl	$0x80100,%eax
790	jne	L$sqr8x_nox
791
792	call	_GFp_bn_sqrx8x_internal	# not defined in this view
793
794
795
796
# The code below relies on the values the helper leaves in %r8, %rcx, %rbp and %rax.
797	leaq	(%r8,%rcx,1),%rbx
798	movq	%rcx,%r9
799	movq	%rcx,%rdx
800.byte	102,72,15,126,207	# encodes movq %xmm1,%rdi (destination pointer)
801	sarq	$3+2,%rcx	# loop counter for L$sqr8x_sub; its sbbq chain starts from the CF left here
802	jmp	L$sqr8x_sub
803
804.p2align	5
805L$sqr8x_nox:
806	call	_GFp_bn_sqr8x_internal	# not defined in this view
807
808
809
810
# The code below relies on the values the helper leaves in %rdi, %r9, %rbp and %rax.
811	leaq	(%rdi,%r9,1),%rbx
812	movq	%r9,%rcx
813	movq	%r9,%rdx
814.byte	102,72,15,126,207	# encodes movq %xmm1,%rdi (destination pointer)
815	sarq	$3+2,%rcx	# loop counter for L$sqr8x_sub; its sbbq chain starts from the CF left here
816	jmp	L$sqr8x_sub
817
818.p2align	5
# destination[] = %rbx[] - %rbp[], four words per iteration. movq, leaq and
# incq leave CF untouched, so the borrow is carried across iterations.
# The loop runs until %rcx, incremented each time, reaches zero.
819L$sqr8x_sub:
820	movq	0(%rbx),%r12
821	movq	8(%rbx),%r13
822	movq	16(%rbx),%r14
823	movq	24(%rbx),%r15
824	leaq	32(%rbx),%rbx
825	sbbq	0(%rbp),%r12
826	sbbq	8(%rbp),%r13
827	sbbq	16(%rbp),%r14
828	sbbq	24(%rbp),%r15
829	leaq	32(%rbp),%rbp
830	movq	%r12,0(%rdi)
831	movq	%r13,8(%rdi)
832	movq	%r14,16(%rdi)
833	movq	%r15,24(%rdi)
834	leaq	32(%rdi),%rdi
835	incq	%rcx
836	jnz	L$sqr8x_sub
837
838	sbbq	$0,%rax	# fold the final borrow into %rax; used as the select mask (expected 0 or all-ones)
839	leaq	(%rbx,%r9,1),%rbx	# step both pointers by %r9 to return to the start of their arrays
840	leaq	(%rdi,%r9,1),%rdi
841
842.byte	102,72,15,110,200	# encodes movq %rax,%xmm1
843	pxor	%xmm0,%xmm0
844	pshufd	$0,%xmm1,%xmm1	# broadcast the low 32 bits of the mask
845	movq	40(%rsp),%rsi	# %rsi = entry %rsp saved in the prologue
846
847	jmp	L$sqr8x_cond_copy
848
849.p2align	5
# Branch-free select, 32 bytes per iteration:
# destination = (%rbx[] & %xmm1) | (destination & complement of %xmm1).
# The words just read from %rbx[] are zeroed, and so are the 32 bytes at the
# same position offset by %rdx.
850L$sqr8x_cond_copy:
851	movdqa	0(%rbx),%xmm2
852	movdqa	16(%rbx),%xmm3
853	leaq	32(%rbx),%rbx
854	movdqu	0(%rdi),%xmm4
855	movdqu	16(%rdi),%xmm5
856	leaq	32(%rdi),%rdi
857	movdqa	%xmm0,-32(%rbx)
858	movdqa	%xmm0,-16(%rbx)
859	movdqa	%xmm0,-32(%rbx,%rdx,1)
860	movdqa	%xmm0,-16(%rbx,%rdx,1)
861	pcmpeqd	%xmm1,%xmm0	# %xmm0 was zero: now all-ones in every lane where the mask is zero
862	pand	%xmm1,%xmm2
863	pand	%xmm1,%xmm3
864	pand	%xmm0,%xmm4
865	pand	%xmm0,%xmm5
866	pxor	%xmm0,%xmm0	# back to zero for the next iteration
867	por	%xmm2,%xmm4
868	por	%xmm3,%xmm5
869	movdqu	%xmm4,-32(%rdi)
870	movdqu	%xmm5,-16(%rdi)
871	addq	$32,%r9	# %r9 counts up to zero
872	jnz	L$sqr8x_cond_copy
873
874	movq	$1,%rax	# return value
875	movq	-48(%rsi),%r15	# restore the registers pushed at L$sqr8x_enter
876
877	movq	-40(%rsi),%r14
878
879	movq	-32(%rsi),%r13
880
881	movq	-24(%rsi),%r12
882
883	movq	-16(%rsi),%rbp
884
885	movq	-8(%rsi),%rbx
886
887	leaq	(%rsi),%rsp	# drop the frame: %rsp = entry value
888
889L$sqr8x_epilogue:
890	.byte	0xf3,0xc3	# encodes rep ret
891
892
893
894.p2align	5
# bn_mulx4x_mont - four-words-per-iteration multiply path built on mulx with
# two independent carry chains (adcx uses CF, adox uses OF). Within this view
# it is entered through L$mulx4x_enter from L$mul4x_enter, with %rax holding
# the entry %rsp.
# Frame slots written below:
#    0(%rsp)  length in bytes (count * 8)
#    8(%rsp)  pointer to the next word of the second source
#   16(%rsp)  end of the second source
#   24(%rsp)  the word loaded from (%r8)
#   32(%rsp)  destination pointer
#   40(%rsp)  entry %rsp
#   48(%rsp)  inner loop count (count / 4 - 1)
#   64(%rsp)  start of the scratch words
895bn_mulx4x_mont:
896
897	movq	%rsp,%rax	# keep the entry %rsp
898
899L$mulx4x_enter:
900	pushq	%rbx
901
902	pushq	%rbp
903
904	pushq	%r12
905
906	pushq	%r13
907
908	pushq	%r14
909
910	pushq	%r15
911
912L$mulx4x_prologue:
913
914	shll	$3,%r9d	# %r9 = count * 8 (length in bytes)
915	xorq	%r10,%r10
916	subq	%r9,%r10	# %r10 = -(length in bytes)
917	movq	(%r8),%r8	# %r8 = the word it pointed to
918	leaq	-72(%rsp,%r10,1),%rbp	# frame base: %rsp - length - 72
919	andq	$-128,%rbp	# rounded down to a 128-byte boundary
# Move %rsp down to %rbp in steps of at most 4096 bytes, reading one word at
# every step.
920	movq	%rsp,%r11
921	subq	%rbp,%r11
922	andq	$-4096,%r11
923	leaq	(%r11,%rbp,1),%rsp
924	movq	(%rsp),%r10
925	cmpq	%rbp,%rsp
926	ja	L$mulx4x_page_walk
927	jmp	L$mulx4x_page_walk_done
928
929.p2align	4
930L$mulx4x_page_walk:
931	leaq	-4096(%rsp),%rsp
932	movq	(%rsp),%r10
933	cmpq	%rbp,%rsp
934	ja	L$mulx4x_page_walk
935L$mulx4x_page_walk_done:
936
937	leaq	(%rdx,%r9,1),%r10	# %r10 = end of the second source
938
939
940
941
942
943
944
945
946
947
948
949
950	movq	%r9,0(%rsp)
951	shrq	$5,%r9
952	movq	%r10,16(%rsp)
953	subq	$1,%r9	# %r9 = length / 32 - 1
954	movq	%r8,24(%rsp)
955	movq	%rdi,32(%rsp)
956	movq	%rax,40(%rsp)
957
958	movq	%r9,48(%rsp)
959	jmp	L$mulx4x_body
960
961.p2align	5
# First pass, first four words. mulx takes its implicit multiplicand from
# %rdx, which alternates between the second-source word (kept in %r9) and the
# per-pass multiplier for %rcx[] (kept in %r8).
962L$mulx4x_body:
963	leaq	8(%rdx),%rdi	# pointer to the next second-source word
964	movq	(%rdx),%rdx	# %rdx = word 0 of the second source
965	leaq	64+32(%rsp),%rbx	# %rbx = scratch cursor, 32 bytes past the scratch start
966	movq	%rdx,%r9
967
968	mulxq	0(%rsi),%r8,%rax
969	mulxq	8(%rsi),%r11,%r14
970	addq	%rax,%r11
971	movq	%rdi,8(%rsp)
972	mulxq	16(%rsi),%r12,%r13
973	adcq	%r14,%r12
974	adcq	$0,%r13
975
976	movq	%r8,%rdi
977	imulq	24(%rsp),%r8	# %r8 = low product word * saved (%r8) word (low 64 bits)
978	xorq	%rbp,%rbp	# %rbp = 0; also clears CF and OF for the adcx/adox chains
979
980	mulxq	24(%rsi),%rax,%r14
981	movq	%r8,%rdx
982	leaq	32(%rsi),%rsi
983	adcxq	%rax,%r13
984	adcxq	%rbp,%r14
985
986	mulxq	0(%rcx),%rax,%r10
987	adcxq	%rax,%rdi	# only the carry of this sum is used; %rdi is reloaded below
988	adoxq	%r11,%r10
989	mulxq	8(%rcx),%rax,%r11
990	adcxq	%rax,%r10
991	adoxq	%r12,%r11
992.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# encodes mulxq 16(%rcx),%rax,%r12 with a 32-bit displacement
993	movq	48(%rsp),%rdi	# %rdi = inner loop count
994	movq	%r10,-32(%rbx)
995	adcxq	%rax,%r11
996	adoxq	%r13,%r12
997	mulxq	24(%rcx),%rax,%r15
998	movq	%r9,%rdx
999	movq	%r11,-24(%rbx)
1000	adcxq	%rax,%r12
1001	adoxq	%rbp,%r15
1002	leaq	32(%rcx),%rcx
1003	movq	%r12,-16(%rbx)
1004
1005	jmp	L$mulx4x_1st
1006
1007.p2align	5
# First pass, remaining groups of four words.
1008L$mulx4x_1st:
1009	adcxq	%rbp,%r15
1010	mulxq	0(%rsi),%r10,%rax
1011	adcxq	%r14,%r10
1012	mulxq	8(%rsi),%r11,%r14
1013	adcxq	%rax,%r11
1014	mulxq	16(%rsi),%r12,%rax
1015	adcxq	%r14,%r12
1016	mulxq	24(%rsi),%r13,%r14
1017.byte	0x67,0x67	# two 0x67 prefix bytes in front of the next instruction; presumably padding
1018	movq	%r8,%rdx
1019	adcxq	%rax,%r13
1020	adcxq	%rbp,%r14
1021	leaq	32(%rsi),%rsi
1022	leaq	32(%rbx),%rbx
1023
1024	adoxq	%r15,%r10
1025	mulxq	0(%rcx),%rax,%r15
1026	adcxq	%rax,%r10
1027	adoxq	%r15,%r11
1028	mulxq	8(%rcx),%rax,%r15
1029	adcxq	%rax,%r11
1030	adoxq	%r15,%r12
1031	mulxq	16(%rcx),%rax,%r15
1032	movq	%r10,-40(%rbx)
1033	adcxq	%rax,%r12
1034	movq	%r11,-32(%rbx)
1035	adoxq	%r15,%r13
1036	mulxq	24(%rcx),%rax,%r15
1037	movq	%r9,%rdx
1038	movq	%r12,-24(%rbx)
1039	adcxq	%rax,%r13
1040	adoxq	%rbp,%r15
1041	leaq	32(%rcx),%rcx
1042	movq	%r13,-16(%rbx)
1043
1044	decq	%rdi
1045	jnz	L$mulx4x_1st
1046
1047	movq	0(%rsp),%rax	# %rax = length in bytes
1048	movq	8(%rsp),%rdi	# %rdi = pointer to the next second-source word
1049	adcq	%rbp,%r15
1050	addq	%r15,%r14
1051	sbbq	%r15,%r15	# %r15 = 0 or -1 from the carry out of the top word
1052	movq	%r14,-8(%rbx)
1053	jmp	L$mulx4x_outer
1054
1055.p2align	5
# Outer loop: one pass per remaining word of the second source, adding the
# scratch words left by the previous pass.
1056L$mulx4x_outer:
1057	movq	(%rdi),%rdx	# %rdx = next second-source word
1058	leaq	8(%rdi),%rdi
1059	subq	%rax,%rsi	# rewind %rsi by the length
1060	movq	%r15,(%rbx)	# store the top carry word of the previous pass
1061	leaq	64+32(%rsp),%rbx	# reset the scratch cursor
1062	subq	%rax,%rcx	# rewind %rcx by the length
1063
1064	mulxq	0(%rsi),%r8,%r11
1065	xorl	%ebp,%ebp	# %rbp = 0; clears CF and OF
1066	movq	%rdx,%r9
1067	mulxq	8(%rsi),%r14,%r12
1068	adoxq	-32(%rbx),%r8
1069	adcxq	%r14,%r11
1070	mulxq	16(%rsi),%r15,%r13
1071	adoxq	-24(%rbx),%r11
1072	adcxq	%r15,%r12
1073	adoxq	-16(%rbx),%r12
1074	adcxq	%rbp,%r13
1075	adoxq	%rbp,%r13
1076
1077	movq	%rdi,8(%rsp)
1078	movq	%r8,%r15
1079	imulq	24(%rsp),%r8	# multiplier for %rcx[] in this pass
1080	xorl	%ebp,%ebp	# restart both carry chains
1081
1082	mulxq	24(%rsi),%rax,%r14
1083	movq	%r8,%rdx
1084	adcxq	%rax,%r13
1085	adoxq	-8(%rbx),%r13
1086	adcxq	%rbp,%r14
1087	leaq	32(%rsi),%rsi
1088	adoxq	%rbp,%r14
1089
1090	mulxq	0(%rcx),%rax,%r10
1091	adcxq	%rax,%r15	# only the carry of this sum is used; %r15 is overwritten below
1092	adoxq	%r11,%r10
1093	mulxq	8(%rcx),%rax,%r11
1094	adcxq	%rax,%r10
1095	adoxq	%r12,%r11
1096	mulxq	16(%rcx),%rax,%r12
1097	movq	%r10,-32(%rbx)
1098	adcxq	%rax,%r11
1099	adoxq	%r13,%r12
1100	mulxq	24(%rcx),%rax,%r15
1101	movq	%r9,%rdx
1102	movq	%r11,-24(%rbx)
1103	leaq	32(%rcx),%rcx
1104	adcxq	%rax,%r12
1105	adoxq	%rbp,%r15
1106	movq	48(%rsp),%rdi	# %rdi = inner loop count
1107	movq	%r12,-16(%rbx)
1108
1109	jmp	L$mulx4x_inner
1110
1111.p2align	5
1112L$mulx4x_inner:
1113	mulxq	0(%rsi),%r10,%rax
1114	adcxq	%rbp,%r15
1115	adoxq	%r14,%r10
1116	mulxq	8(%rsi),%r11,%r14
1117	adcxq	0(%rbx),%r10
1118	adoxq	%rax,%r11
1119	mulxq	16(%rsi),%r12,%rax
1120	adcxq	8(%rbx),%r11
1121	adoxq	%r14,%r12
1122	mulxq	24(%rsi),%r13,%r14
1123	movq	%r8,%rdx
1124	adcxq	16(%rbx),%r12
1125	adoxq	%rax,%r13
1126	adcxq	24(%rbx),%r13
1127	adoxq	%rbp,%r14
1128	leaq	32(%rsi),%rsi
1129	leaq	32(%rbx),%rbx
1130	adcxq	%rbp,%r14
1131
1132	adoxq	%r15,%r10
1133	mulxq	0(%rcx),%rax,%r15
1134	adcxq	%rax,%r10
1135	adoxq	%r15,%r11
1136	mulxq	8(%rcx),%rax,%r15
1137	adcxq	%rax,%r11
1138	adoxq	%r15,%r12
1139	mulxq	16(%rcx),%rax,%r15
1140	movq	%r10,-40(%rbx)
1141	adcxq	%rax,%r12
1142	adoxq	%r15,%r13
1143	mulxq	24(%rcx),%rax,%r15
1144	movq	%r9,%rdx
1145	movq	%r11,-32(%rbx)
1146	movq	%r12,-24(%rbx)
1147	adcxq	%rax,%r13
1148	adoxq	%rbp,%r15
1149	leaq	32(%rcx),%rcx
1150	movq	%r13,-16(%rbx)
1151
1152	decq	%rdi
1153	jnz	L$mulx4x_inner
1154
1155	movq	0(%rsp),%rax	# %rax = length in bytes
1156	movq	8(%rsp),%rdi	# %rdi = pointer to the next second-source word
1157	adcq	%rbp,%r15
1158	subq	0(%rbx),%rbp	# sets CF when the stored top word of the previous pass is nonzero
1159	adcq	%r15,%r14
1160	sbbq	%r15,%r15	# %r15 = 0 or -1 from the carry out of the top word
1161	movq	%r14,-8(%rbx)
1162
1163	cmpq	16(%rsp),%rdi	# more second-source words left ...
1164	jne	L$mulx4x_outer	# ... run another pass
1165
1166	leaq	64(%rsp),%rbx	# %rbx = scratch start
1167	subq	%rax,%rcx	# rewind %rcx by the length
1168	negq	%r15	# top carry as 0 or 1
1169	movq	%rax,%rdx	# %rdx = length in bytes, used as the copy loop counter
1170	shrq	$3+2,%rax	# %rax = length / 32 iterations; the sbbq chain below starts from the CF left here
1171	movq	32(%rsp),%rdi	# %rdi = destination pointer
1172	jmp	L$mulx4x_sub
1173
1174.p2align	5
# destination[] = scratch[] - %rcx[], four words per iteration. movq, leaq
# and decq leave CF untouched, so the borrow is carried across iterations.
1175L$mulx4x_sub:
1176	movq	0(%rbx),%r11
1177	movq	8(%rbx),%r12
1178	movq	16(%rbx),%r13
1179	movq	24(%rbx),%r14
1180	leaq	32(%rbx),%rbx
1181	sbbq	0(%rcx),%r11
1182	sbbq	8(%rcx),%r12
1183	sbbq	16(%rcx),%r13
1184	sbbq	24(%rcx),%r14
1185	leaq	32(%rcx),%rcx
1186	movq	%r11,0(%rdi)
1187	movq	%r12,8(%rdi)
1188	movq	%r13,16(%rdi)
1189	movq	%r14,24(%rdi)
1190	leaq	32(%rdi),%rdi
1191	decq	%rax
1192	jnz	L$mulx4x_sub
1193
1194	sbbq	$0,%r15	# top carry minus the final borrow; used as the select mask (expected 0 or all-ones)
1195	leaq	64(%rsp),%rbx	# back to the scratch start
1196	subq	%rdx,%rdi	# back to the destination start
1197
1198.byte	102,73,15,110,207	# encodes movq %r15,%xmm1
1199	pxor	%xmm0,%xmm0
1200	pshufd	$0,%xmm1,%xmm1	# broadcast the low 32 bits of the mask
1201	movq	40(%rsp),%rsi	# %rsi = entry %rsp saved in the prologue
1202
1203	jmp	L$mulx4x_cond_copy
1204
1205.p2align	5
# Branch-free select, 32 bytes per iteration:
# destination = (scratch & %xmm1) | (destination & complement of %xmm1).
# The scratch words just read are zeroed.
1206L$mulx4x_cond_copy:
1207	movdqa	0(%rbx),%xmm2
1208	movdqa	16(%rbx),%xmm3
1209	leaq	32(%rbx),%rbx
1210	movdqu	0(%rdi),%xmm4
1211	movdqu	16(%rdi),%xmm5
1212	leaq	32(%rdi),%rdi
1213	movdqa	%xmm0,-32(%rbx)
1214	movdqa	%xmm0,-16(%rbx)
1215	pcmpeqd	%xmm1,%xmm0	# %xmm0 was zero: now all-ones in every lane where the mask is zero
1216	pand	%xmm1,%xmm2
1217	pand	%xmm1,%xmm3
1218	pand	%xmm0,%xmm4
1219	pand	%xmm0,%xmm5
1220	pxor	%xmm0,%xmm0	# back to zero for the next iteration
1221	por	%xmm2,%xmm4
1222	por	%xmm3,%xmm5
1223	movdqu	%xmm4,-32(%rdi)
1224	movdqu	%xmm5,-16(%rdi)
1225	subq	$32,%rdx
1226	jnz	L$mulx4x_cond_copy
1227
1228	movq	%rdx,(%rbx)	# %rdx is zero here: also clear the word just past the copied scratch area
1229
1230	movq	$1,%rax	# return value
1231	movq	-48(%rsi),%r15	# restore the registers pushed at L$mulx4x_enter
1232
1233	movq	-40(%rsi),%r14
1234
1235	movq	-32(%rsi),%r13
1236
1237	movq	-24(%rsi),%r12
1238
1239	movq	-16(%rsi),%rbp
1240
1241	movq	-8(%rsi),%rbx
1242
1243	leaq	(%rsi),%rsp	# drop the frame: %rsp = entry value
1244
1245L$mulx4x_epilogue:
1246	.byte	0xf3,0xc3	# encodes rep ret
1247
1248
# NUL-terminated ASCII banner placed after the code. The bytes spell
# Montgomery Multiplication for x86_64, CRYPTOGAMS by appro at openssl.org
# (the address is written in angle brackets with an at-sign in the data).
1249.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1250.p2align	4
1251#endif
1252