# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.
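#
# x86_64 Montgomery multiplication/squaring with a 5-bit scatter-gather
# window table. Table entries are selected with SSE2 compare masks and
# AND/OR folding rather than indexed loads, so the memory access pattern
# is independent of the (secret) table index.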

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.extern	OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

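# bn_mul_mont_gather5(rp, ap, table, np, n0, num, power) -- per the
# OpenSSL prototype: %rdi=rp, %rsi=ap, %rdx=table, %rcx=np, %r8=&n0,
# %r9d=num, with the table index "power" at 8(%rsp). If num is a
# multiple of 8 control branches to the unrolled 4x (or MULX) path;
# otherwise the generic one-word-at-a-time loop below is used.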
.globl	bn_mul_mont_gather5
.hidden bn_mul_mont_gather5
.type	bn_mul_mont_gather5,@function
.align	64
bn_mul_mont_gather5:
.cfi_startproc
	movl	%r9d,%r9d
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	testl	$7,%r9d
	jnz	.Lmul_enter
	leaq	OPENSSL_ia32cap_P(%rip),%r11
	movl	8(%r11),%r11d
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	movd	8(%rsp),%xmm5
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	negq	%r9
	movq	%rsp,%r11
	leaq	-280(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10









	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

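# Probe each 4096-byte page of the freshly reserved stack area so the
# guard page can't be skipped over (stack-clash style hardening).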
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	leaq	.Linc(%rip),%r10
	movq	%rax,8(%rsp,%r9,8)
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:

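# Build pcmpeqd masks for indices 0..31 against the broadcast "power"
# in %xmm5 (stored at 112(%r10) and up), then AND them with the 32
# interleaved table rows and OR everything together: a constant-time
# gather of table[power]. The .byte 102,72,15,126,195 sequences below
# encode movq %xmm0,%rbx, landing the gathered word in %rbx.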
	leaq	128(%rdx),%r12
	movdqa	0(%r10),%xmm0
	movdqa	16(%r10),%xmm1
	leaq	24-112(%rsp,%r9,8),%r10
	andq	$-16,%r10

	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195

	movq	(%r8),%r8
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

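# First pass: t[] = ap[]*b0 + np[]*m, where %rbx holds the gathered b
# word and %rbp = m = t[0]*n0 mod 2^64; the two mulq carry chains are
# interleaved one word at a time.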
.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st


	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	.Louter
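# Each outer iteration gathers the next b word (same mask-and-OR
# pattern) and folds one more Montgomery step into the t[] scratch.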
.align	16
.Louter:
	leaq	24+128(%rsp,%r9,8),%rdx
	andq	$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12

	movq	(%rsi),%rax
.byte	102,72,15,126,195

	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r9,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

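# The result may still exceed np: subtract the modulus once, keeping
# the final borrow in %rax as an all-zero/all-one selection mask.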
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax
	movq	$-1,%rbx
	xorq	%rax,%rbx
	xorq	%r14,%r14
	movq	%r9,%r15

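# Constant-time select between the subtracted (rp[]) and original (t[])
# values via the complementary masks in %rbx/%rax; the stack scratch is
# overwritten (movq %r14,...) as it is consumed.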
.Lcopy:
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r14,(%rsp,%r14,8)
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
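# 4x-unrolled variant, used when num is a multiple of 8. %r11d holds
# OPENSSL_ia32cap_P word 2; if the BMI1/BMI2/ADX feature bits (mask
# 0x80108) are all set, the MULX implementation is taken instead.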
.type	bn_mul4x_mont_gather5,@function
.align	32
bn_mul4x_mont_gather5:
.cfi_startproc
.byte	0x67
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	andl	$0x80108,%r11d
	cmpl	$0x80108,%r11d
	je	.Lmulx4x_enter
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmul4x_prologue:

.byte	0x67
	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9










	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmul4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmul4xsp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	negq	%r9

	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmul4x_body:

	call	mul4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5

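# mul4x_internal: common core for bn_mul4x_mont_gather5 and bn_power5.
# %r9 = num*8 on entry; %rax still points at the caller's original
# stack, so 8(%rax) is the 7th argument ("power") used for the gather.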
.type	mul4x_internal,@function
.align	32
mul4x_internal:
.cfi_startproc
	shlq	$5,%r9
	movd	8(%rax),%xmm5
	leaq	.Linc(%rip),%rax
	leaq	128(%rdx,%r9,1),%r13
	shrq	$5,%r9
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r9,1),%r10
	leaq	128(%rdx),%r12

	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67,0x67
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195

	movq	%r13,16+8(%rsp)
	movq	%rdi,56+8(%rsp)

	movq	(%r8),%r8
	movq	(%rsi),%rax
	leaq	(%rsi,%r9,1),%rsi
	negq	%r9

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	leaq	64+8(%rsp),%r14
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13
	jmp	.L1st4x

.align	32
.L1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.L1st4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	leaq	(%rcx,%r9,1),%rcx

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	jmp	.Louter4x

.align	32
.Louter4x:
	leaq	16+128(%r14),%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195

	movq	(%r14,%r9,1),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11
	movq	%rdi,(%r14)

	leaq	(%r14,%r9,1),%r14

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	.Linner4x

.align	32
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	addq	(%r14),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.Linner4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	%rbp,%rax
	movq	-8(%rcx),%rbp
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	movq	%rdi,-16(%r14)
	leaq	(%rcx,%r9,1),%rcx

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%r14),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	cmpq	16+8(%rsp),%r12
	jb	.Louter4x
	xorq	%rax,%rax
	subq	%r13,%rbp
	adcq	%r15,%r15
	orq	%r15,%rdi
	subq	%rdi,%rax
	leaq	(%r14,%r9,1),%rbx
	movq	(%rcx),%r12
	leaq	(%rcx),%rbp
	movq	%r9,%rcx
	sarq	$3+2,%rcx
	movq	56+8(%rsp),%rdi
	decq	%r12
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry
.cfi_endproc
.size	mul4x_internal,.-mul4x_internal
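# bn_power5: five back-to-back Montgomery squarings followed by one
# Montgomery multiplication by the gathered table entry -- one step of
# a fixed 5-bit-window exponentiation.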
.globl	bn_power5
.hidden bn_power5
.type	bn_power5,@function
.align	32
bn_power5:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	leaq	OPENSSL_ia32cap_P(%rip),%r11
	movl	8(%r11),%r11d
	andl	$0x80108,%r11d
	cmpl	$0x80108,%r11d
	je	.Lpowerx5_enter
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lpower5_prologue:

	shll	$3,%r9d
	leal	(%r9,%r9,2),%r10d
	negq	%r9
	movq	(%r8),%r8








	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwr_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwr_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwr_page_walk
	jmp	.Lpwr_page_walk_done

.Lpwr_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwr_page_walk
.Lpwr_page_walk_done:

	movq	%r9,%r10
	negq	%r9










	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpower5_body:
.byte	102,72,15,110,207
.byte	102,72,15,110,209
.byte	102,73,15,110,218
.byte	102,72,15,110,226

	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal

.byte	102,72,15,126,209
.byte	102,72,15,126,226
	movq	%rsi,%rdi
	movq	40(%rsp),%rax
	leaq	32(%rsp),%r8

	call	mul4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpower5_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_power5,.-bn_power5

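# bn_sqr8x_internal: Montgomery squaring core. Computes the 2n-word
# square (cross products, then the doubling pass .Lsqr4x_shift_n_add
# that folds in the squared diagonal terms) and falls through into
# __bn_sqr8x_reduction for the modular reduction.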
.globl	bn_sqr8x_internal
.hidden bn_sqr8x_internal
.hidden	bn_sqr8x_internal
.type	bn_sqr8x_internal,@function
.align	32
bn_sqr8x_internal:
__bn_sqr8x_internal:
.cfi_startprocедин   








































































	leaq	32(%r10),%rbp
	leaq	(%rsi,%r9,1),%rsi

	movq	%r9,%rcx


	movq	-32(%rsi,%rbp,1),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	movq	%r10,-24(%rdi,%rbp,1)

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r11,-16(%rdi,%rbp,1)
	movq	%rdx,%r10


	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	movq	%rax,%r12
	movq	%rbx,%rax
	movq	%rdx,%r13

	leaq	(%rbp),%rcx
	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)
	jmp	.Lsqr4x_1st

.align	32
.Lsqr4x_1st:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10


	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	16(%rsi,%rcx,1),%rbx
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11

	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%r10,8(%rdi,%rcx,1)
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	24(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10


	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,16(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13
	leaq	32(%rcx),%rcx

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	.Lsqr4x_1st

	mulq	%r15
	addq	%rax,%r13
	leaq	16(%rbp),%rbp
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)
	jmp	.Lsqr4x_outer

.align	32
.Lsqr4x_outer:
	movq	-32(%rsi,%rbp,1),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	-24(%rdi,%rbp,1),%r10
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r10,-24(%rdi,%rbp,1)
	movq	%rdx,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-16(%rdi,%rbp,1),%r11
	movq	%rdx,%r10
	adcq	$0,%r10
	movq	%r11,-16(%rdi,%rbp,1)

	xorq	%r12,%r12

	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-8(%rdi,%rbp,1),%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rbp,1)

	leaq	(%rbp),%rcx
	jmp	.Lsqr4x_inner

.align	32
.Lsqr4x_inner:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12
	addq	(%rdi,%rcx,1),%r13
	adcq	$0,%r12

.byte	0x67
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%r11,(%rdi,%rcx,1)
	movq	%rbx,%rax
	movq	%rdx,%r13
	adcq	$0,%r13
	addq	8(%rdi,%rcx,1),%r12
	leaq	16(%rcx),%rcx
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	.Lsqr4x_inner

.byte	0x67
	mulq	%r15
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	addq	$16,%rbp
	jnz	.Lsqr4x_outer


	movq	-32(%rsi),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi),%rbx
	movq	%rax,%r15

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	%r10,-24(%rdi)
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	movq	-8(%rsi),%rbx
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,-16(%rdi)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi)

	mulq	%r15
	addq	%rax,%r13
	movq	-16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	mulq	%rbx
	addq	$16,%rbp
	xorq	%r14,%r14
	subq	%r9,%rbp
	xorq	%r15,%r15

	addq	%r12,%rax
	adcq	$0,%rdx
	movq	%rax,8(%rdi)
	movq	%rdx,16(%rdi)
	movq	%r15,24(%rdi)

	movq	-16(%rsi,%rbp,1),%rax
	leaq	48+8(%rsp),%rdi
	xorq	%r10,%r10
	movq	8(%rdi),%r11

	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	leaq	16(%rbp),%rbp
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	jmp	.Lsqr4x_shift_n_add

.align	32
.Lsqr4x_shift_n_add:
	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	0(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	8(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,-16(%rdi)
	adcq	%rdx,%r8

	leaq	(%r14,%r10,2),%r12
	movq	%r8,-8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	8(%rsi,%rbp,1),%rax
	movq	%r12,0(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	16(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	addq	$32,%rbp
	jnz	.Lsqr4x_shift_n_add

	leaq	(%r14,%r10,2),%r12
.byte	0x67
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	mulq	%rax
	negq	%r15
	adcq	%rax,%rbx
	adcq	%rdx,%r8
	movq	%rbx,-16(%rdi)
	movq	%r8,-8(%rdi)
.byte	102,72,15,126,213
__bn_sqr8x_reduction:
	xorq	%rax,%rax
	leaq	(%r9,%rbp,1),%rcx
	leaq	48+8(%rsp,%r9,2),%rdx
	movq	%rcx,0+8(%rsp)
	leaq	48+8(%rsp,%r9,1),%rdi
	movq	%rdx,8+8(%rsp)
	negq	%r9
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
	leaq	(%rdi,%r9,1),%rdi
.byte	0x66
	movq	0(%rdi),%rbx
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	.L8x_reduce

.align	32
.L8x_reduce:
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8)
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_reduce

	leaq	64(%rbp),%rbp
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_no_tail

.byte	0x66
	addq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movq	48+56+8(%rsp),%rbx
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	.L8x_tail

.align	32
.L8x_tail:
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rbp),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	64(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_tail_done

	movq	48+56+8(%rsp),%rbx
	negq	%rsi
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	xorq	%rax,%rax
	addq	(%rdx),%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	negq	%rsi
.L8x_no_tail:
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax
	movq	-8(%rbp),%rcx
	xorq	%rsi,%rsi

.byte	102,72,15,126,213

	movq	%r8,0(%rdi)
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi
	jb	.L8x_reduction_loop
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
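# __bn_post4x_internal: final conditional subtraction of np after the
# 8x reduction, processed four limbs per iteration under the all-zero/
# all-one mask in %rax (shared with the mul4x tail at .Lsqr4x_sub_entry).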
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
.cfi_startproc
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
.byte	102,72,15,126,207
	negq	%rax
.byte	102,72,15,126,206
	sarq	$3+2,%rcx
	decq	%r12
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9
	.byte	0xf3,0xc3
.cfi_endproc
.size	__bn_post4x_internal,.-__bn_post4x_internal
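# bn_from_montgomery: apparently rp = ap * 2^(-64*num) mod np, i.e. a
# conversion out of the Montgomery domain (multiply by 1, then reduce).
# Only num divisible by 8 is handled; otherwise it returns 0.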
.globl	bn_from_montgomery
.hidden bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
.cfi_startproc
	testl	$7,%r9d
	jz	bn_from_mont8x
	xorl	%eax,%eax
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_from_montgomery,.-bn_from_montgomery

.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.cfi_startproc
.byte	0x67
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lfrom_prologue:

	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8








	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

.Lfrom_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10
	negq	%r9










	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
.Lmul_by_1:
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207
.byte	102,72,15,110,209
.byte	0x67
	movq	%rcx,%rbp
.byte	102,73,15,110,218
	leaq	OPENSSL_ia32cap_P(%rip),%r11
	movl	8(%r11),%r11d
	andl	$0x80108,%r11d
	cmpl	$0x80108,%r11d
	jne	.Lfrom_mont_nox

	leaq	(%rax,%r9,1),%rdi
	call	__bn_sqrx8x_reduction
	call	__bn_postx4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_nox:
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_zero:
	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lfrom_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_from_mont8x,.-bn_from_mont8x
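# MULX/ADCX/ADOX (BMI2+ADX) flavour of the 4x multiply: adcx and adox
# maintain two independent carry chains (CF and OF), so the ap[]*b and
# np[]*m accumulation streams need not serialize on one flags register.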
.type	bn_mulx4x_mont_gather5,@function
.align	32
bn_mulx4x_mont_gather5:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8










	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmulx4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmulx4xsp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.Lmulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:













	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmulx4x_body:
	call	mulx4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5

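# mulx4x_internal: MULX-based core shared by bn_mulx4x_mont_gather5 and
# bn_powerx5. Gather works as in mul4x_internal; .byte 102,72,15,126,194
# is movq %xmm0,%rdx, since MULX takes its multiplier implicitly in %rdx.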
.type	mulx4x_internal,@function
.align	32
mulx4x_internal:
.cfi_startproc
	movq	%r9,8(%rsp)
	movq	%r9,%r10
	negq	%r9
	shlq	$5,%r9
	negq	%r10
	leaq	128(%rdx,%r9,1),%r13
	shrq	$5+5,%r9
	movd	8(%rax),%xmm5
	subq	$1,%r9
	leaq	.Linc(%rip),%rax
	movq	%r13,16+8(%rsp)
	movq	%r9,24+8(%rsp)
	movq	%rdi,56+8(%rsp)
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r10,1),%r10
	leaq	128(%rdx),%rdi

	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67
	movdqa	%xmm1,%xmm2
.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	pand	64(%rdi),%xmm0
	pand	80(%rdi),%xmm1
	pand	96(%rdi),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%rdi),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%rdi),%xmm4
	movdqa	-112(%rdi),%xmm5
	movdqa	-96(%rdi),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%rdi),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%rdi),%xmm4
	movdqa	-48(%rdi),%xmm5
	movdqa	-32(%rdi),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%rdi),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%rdi),%xmm4
	movdqa	16(%rdi),%xmm5
	movdqa	32(%rdi),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%rdi),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	pxor	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194
	leaq	64+32+8(%rsp),%rbx

	movq	%rdx,%r9
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r12
	addq	%rax,%r11
	mulxq	16(%rsi),%rax,%r13
	adcq	%rax,%r12
	adcq	$0,%r13
	mulxq	24(%rsi),%rax,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8
	xorq	%rbp,%rbp
	movq	%r8,%rdx

	movq	%rdi,8+8(%rsp)

	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)
	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	8(%rsp),%rax
	adcq	%rbp,%r15
	leaq	(%rsi,%rax,1),%rsi
	addq	%r15,%r14
	movq	8+8(%rsp),%rdi
	adcq	%rbp,%rbp
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	leaq	16-256(%rbx),%r10
	pxor	%xmm4,%xmm4
.byte	0x67,0x67
	pxor	%xmm5,%xmm5
	movdqa	-128(%rdi),%xmm0
	movdqa	-112(%rdi),%xmm1
	movdqa	-96(%rdi),%xmm2
	pand	256(%r10),%xmm0
	movdqa	-80(%rdi),%xmm3
	pand	272(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	288(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	304(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%rdi),%xmm0
	movdqa	-48(%rdi),%xmm1
	movdqa	-32(%rdi),%xmm2
	pand	320(%r10),%xmm0
	movdqa	-16(%rdi),%xmm3
	pand	336(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	352(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	368(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%rdi),%xmm0
	movdqa	16(%rdi),%xmm1
	movdqa	32(%rdi),%xmm2
	pand	384(%r10),%xmm0
	movdqa	48(%rdi),%xmm3
	pand	400(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	416(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	432(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%rdi),%xmm0
	movdqa	80(%rdi),%xmm1
	movdqa	96(%rdi),%xmm2
	pand	448(%r10),%xmm0
	movdqa	112(%rdi),%xmm3
	pand	464(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	480(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	496(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194

	movq	%rbp,(%rbx)
	leaq	32(%rbx,%rax,1),%rbx
	mulxq	0(%rsi),%r8,%r11
	xorq	%rbp,%rbp
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	mulxq	24(%rsi),%rdx,%r14
	adoxq	-16(%rbx),%r12
	adcxq	%rdx,%r13
	leaq	(%rcx,%rax,1),%rcx
	leaq	32(%rsi),%rsi
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	adoxq	%rbp,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8

	movq	%r8,%rdx
	xorq	%rbp,%rbp
	movq	%rdi,8+8(%rsp)

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-24(%rbx)
	adoxq	%rbp,%r15
	movq	%r12,-16(%rbx)
	leaq	32(%rcx),%rcx
	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	movq	%r11,-32(%rbx)
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	leaq	32(%rcx),%rcx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0+8(%rsp),%rax
	adcq	%rbp,%r15
	subq	0(%rbx),%rdi
	movq	8+8(%rsp),%rdi
	movq	16+8(%rsp),%r10
	adcq	%r15,%r14
	leaq	(%rsi,%rax,1),%rsi
	adcq	%rbp,%rbp
	movq	%r14,-8(%rbx)

	cmpq	%r10,%rdi
	jb	.Lmulx4x_outer

	movq	-8(%rcx),%r10
	movq	%rbp,%r8
	movq	(%rcx,%rax,1),%r12
	leaq	(%rcx,%rax,1),%rbp
	movq	%rax,%rcx
	leaq	(%rbx,%rax,1),%rdi
	xorl	%eax,%eax
	xorq	%r15,%r15
	subq	%r14,%r10
	adcq	%r15,%r15
	orq	%r15,%r8
	sarq	$3+2,%rcx
	subq	%r8,%rax
	movq	56+8(%rsp),%rdx
	decq	%r12
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry
.cfi_endproc
.size	mulx4x_internal,.-mulx4x_internal
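# bn_powerx5: MULX/ADX counterpart of bn_power5 (five squarings, then
# one multiplication by the gathered table entry).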
.type	bn_powerx5,@function
.align	32
bn_powerx5:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lpowerx5_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lpowerx5_prologue:

	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8








	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwrx_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwrx_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
	jmp	.Lpwrx_page_walk_done

.Lpwrx_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
.Lpwrx_page_walk_done:

	movq	%r9,%r10
	negq	%r9












	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207
.byte	102,72,15,110,209
.byte	102,73,15,110,218
.byte	102,72,15,110,226
	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpowerx5_body:

	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

	movq	%r10,%r9
	movq	%rsi,%rdi
.byte	102,72,15,126,209
.byte	102,72,15,126,226
	movq	40(%rsp),%rax

	call	mulx4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpowerx5_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_powerx5,.-bn_powerx5

2921.globl	bn_sqrx8x_internal
2922.hidden bn_sqrx8x_internal
.type	bn_sqrx8x_internal,@function
.align	32
bn_sqrx8x_internal:
__bn_sqrx8x_internal:
.cfi_startproc








































	leaq	48+8(%rsp),%rdi
	leaq	(%rsi,%r9,1),%rbp
	movq	%r9,0+8(%rsp)
	movq	%rbp,8+8(%rsp)
	jmp	.Lsqr8x_zero_start

.align	32
.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
.Lsqrx8x_zero:
.byte	0x3e
	movdqa	%xmm0,0(%rdi)
	movdqa	%xmm0,16(%rdi)
	movdqa	%xmm0,32(%rdi)
	movdqa	%xmm0,48(%rdi)
.Lsqr8x_zero_start:
	movdqa	%xmm0,64(%rdi)
	movdqa	%xmm0,80(%rdi)
	movdqa	%xmm0,96(%rdi)
	movdqa	%xmm0,112(%rdi)
	leaq	128(%rdi),%rdi
	subq	$64,%r9
	jnz	.Lsqrx8x_zero

	movq	0(%rsi),%rdx

	xorq	%r10,%r10
	xorq	%r11,%r11
	xorq	%r12,%r12
	xorq	%r13,%r13
	xorq	%r14,%r14
	xorq	%r15,%r15
	leaq	48+8(%rsp),%rdi
	xorq	%rbp,%rbp
	jmp	.Lsqrx8x_outer_loop

.align	32
.Lsqrx8x_outer_loop:
	mulxq	8(%rsi),%r8,%rax
	adcxq	%r9,%r8
	adoxq	%rax,%r10
	mulxq	16(%rsi),%r9,%rax
	adcxq	%r10,%r9
	adoxq	%rax,%r11
.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
	adcxq	%r11,%r10
	adoxq	%rax,%r12
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
	adcxq	%r12,%r11
	adoxq	%rax,%r13
	mulxq	40(%rsi),%r12,%rax
	adcxq	%r13,%r12
	adoxq	%rax,%r14
	mulxq	48(%rsi),%r13,%rax
	adcxq	%r14,%r13
	adoxq	%r15,%rax
	mulxq	56(%rsi),%r14,%r15
	movq	8(%rsi),%rdx
	adcxq	%rax,%r14
	adoxq	%rbp,%r15
	adcq	64(%rdi),%r15
	movq	%r8,8(%rdi)
	movq	%r9,16(%rdi)
	sbbq	%rcx,%rcx
	xorq	%rbp,%rbp


	mulxq	16(%rsi),%r8,%rbx
	mulxq	24(%rsi),%r9,%rax
	adcxq	%r10,%r8
	adoxq	%rbx,%r9
	mulxq	32(%rsi),%r10,%rbx
	adcxq	%r11,%r9
	adoxq	%rax,%r10
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
	adcxq	%r12,%r10
	adoxq	%rbx,%r11
.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
	adcxq	%r13,%r11
	adoxq	%r14,%r12
.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
	movq	16(%rsi),%rdx
	adcxq	%rax,%r12
	adoxq	%rbx,%r13
	adcxq	%r15,%r13
	adoxq	%rbp,%r14
	adcxq	%rbp,%r14

	movq	%r8,24(%rdi)
	movq	%r9,32(%rdi)

	mulxq	24(%rsi),%r8,%rbx
	mulxq	32(%rsi),%r9,%rax
	adcxq	%r10,%r8
	adoxq	%rbx,%r9
	mulxq	40(%rsi),%r10,%rbx
	adcxq	%r11,%r9
	adoxq	%rax,%r10
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
	adcxq	%r12,%r10
	adoxq	%r13,%r11
.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
.byte	0x3e
	movq	24(%rsi),%rdx
	adcxq	%rbx,%r11
	adoxq	%rax,%r12
	adcxq	%r14,%r12
	movq	%r8,40(%rdi)
	movq	%r9,48(%rdi)
	mulxq	32(%rsi),%r8,%rax
	adoxq	%rbp,%r13
	adcxq	%rbp,%r13

	mulxq	40(%rsi),%r9,%rbx
	adcxq	%r10,%r8
	adoxq	%rax,%r9
	mulxq	48(%rsi),%r10,%rax
	adcxq	%r11,%r9
	adoxq	%r12,%r10
	mulxq	56(%rsi),%r11,%r12
	movq	32(%rsi),%rdx
	movq	40(%rsi),%r14
	adcxq	%rbx,%r10
	adoxq	%rax,%r11
	movq	48(%rsi),%r15
	adcxq	%r13,%r11
	adoxq	%rbp,%r12
	adcxq	%rbp,%r12

	movq	%r8,56(%rdi)
	movq	%r9,64(%rdi)

	mulxq	%r14,%r9,%rax
	movq	56(%rsi),%r8
	adcxq	%r10,%r9
	mulxq	%r15,%r10,%rbx
	adoxq	%rax,%r10
	adcxq	%r11,%r10
	mulxq	%r8,%r11,%rax
	movq	%r14,%rdx
	adoxq	%rbx,%r11
	adcxq	%r12,%r11

	adcxq	%rbp,%rax

	mulxq	%r15,%r14,%rbx
	mulxq	%r8,%r12,%r13
	movq	%r15,%rdx
	leaq	64(%rsi),%rsi
	adcxq	%r14,%r11
	adoxq	%rbx,%r12
	adcxq	%rax,%r12
	adoxq	%rbp,%r13

.byte	0x67,0x67
	mulxq	%r8,%r8,%r14
	adcxq	%r8,%r13
	adcxq	%rbp,%r14

	cmpq	8+8(%rsp),%rsi
	je	.Lsqrx8x_outer_break

	negq	%rcx
	movq	$-8,%rcx
	movq	%rbp,%r15
	movq	64(%rdi),%r8
	adcxq	72(%rdi),%r9
	adcxq	80(%rdi),%r10
	adcxq	88(%rdi),%r11
	adcq	96(%rdi),%r12
	adcq	104(%rdi),%r13
	adcq	112(%rdi),%r14
	adcq	120(%rdi),%r15
	leaq	(%rsi),%rbp
	leaq	128(%rdi),%rdi
	sbbq	%rax,%rax

	movq	-64(%rsi),%rdx
	movq	%rax,16+8(%rsp)
	movq	%rdi,24+8(%rsp)


	xorl	%eax,%eax
	jmp	.Lsqrx8x_loop

.align	32
.Lsqrx8x_loop:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	movq	%rbx,(%rdi,%rcx,8)
	movl	$0,%ebx
	adcxq	%rax,%r13
	adoxq	%r15,%r14

.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
	movq	8(%rsi,%rcx,8),%rdx
	adcxq	%rax,%r14
	adoxq	%rbx,%r15
	adcxq	%rbx,%r15

.byte	0x67
	incq	%rcx
	jnz	.Lsqrx8x_loop

	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	cmpq	8+8(%rsp),%rbp
	je	.Lsqrx8x_break

	subq	16+8(%rsp),%rbx
.byte	0x66
	movq	-64(%rsi),%rdx
	adcxq	0(%rdi),%r8
	adcxq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
.byte	0x67
	sbbq	%rax,%rax
	xorl	%ebx,%ebx
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_loop

.align	32
.Lsqrx8x_break:
	xorq	%rbp,%rbp
	subq	16+8(%rsp),%rbx
	adcxq	%rbp,%r8
	movq	24+8(%rsp),%rcx
	adcxq	%rbp,%r9
	movq	0(%rsi),%rdx
	adcq	$0,%r10
	movq	%r8,0(%rdi)
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	cmpq	%rcx,%rdi
	je	.Lsqrx8x_outer_loop

	movq	%r9,8(%rdi)
	movq	8(%rcx),%r9
	movq	%r10,16(%rdi)
	movq	16(%rcx),%r10
	movq	%r11,24(%rdi)
	movq	24(%rcx),%r11
	movq	%r12,32(%rdi)
	movq	32(%rcx),%r12
	movq	%r13,40(%rdi)
	movq	40(%rcx),%r13
	movq	%r14,48(%rdi)
	movq	48(%rcx),%r14
	movq	%r15,56(%rdi)
	movq	56(%rcx),%r15
	movq	%rcx,%rdi
	jmp	.Lsqrx8x_outer_loop

.align	32
.Lsqrx8x_outer_break:
	movq	%r9,72(%rdi)
.byte	102,72,15,126,217
	movq	%r10,80(%rdi)
	movq	%r11,88(%rdi)
	movq	%r12,96(%rdi)
	movq	%r13,104(%rdi)
	movq	%r14,112(%rdi)
	leaq	48+8(%rsp),%rdi
	movq	(%rsi,%rcx,1),%rdx

	movq	8(%rdi),%r11
	xorq	%r10,%r10
	movq	0+8(%rsp),%r9
	adoxq	%r11,%r11
	movq	16(%rdi),%r12
	movq	24(%rdi),%r13


.align	32
.Lsqrx4x_shift_n_add:
	mulxq	%rdx,%rax,%rbx
	adoxq	%r12,%r12
	adcxq	%r10,%rax
.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	40(%rdi),%r11
	movq	%rax,0(%rdi)
	movq	%rbx,8(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	movq	16(%rsi,%rcx,1),%rdx
	movq	48(%rdi),%r12
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	56(%rdi),%r13
	movq	%rax,16(%rdi)
	movq	%rbx,24(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r12,%r12
	adcxq	%r10,%rax
	movq	24(%rsi,%rcx,1),%rdx
	leaq	32(%rcx),%rcx
	movq	64(%rdi),%r10
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	72(%rdi),%r11
	movq	%rax,32(%rdi)
	movq	%rbx,40(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	jrcxz	.Lsqrx4x_shift_n_add_break
.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	80(%rdi),%r12
	movq	88(%rdi),%r13
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
	nop
	jmp	.Lsqrx4x_shift_n_add

.align	32
.Lsqrx4x_shift_n_add_break:
	adcxq	%r13,%rbx
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
.byte	102,72,15,126,213
__bn_sqrx8x_reduction:
	xorl	%eax,%eax
	movq	32+8(%rsp),%rbx
	movq	48+8(%rsp),%rdx
	leaq	-64(%rbp,%r9,1),%rcx

	movq	%rcx,0+8(%rsp)
	movq	%rdi,8+8(%rsp)

	leaq	48+8(%rsp),%rdi
	jmp	.Lsqrx8x_reduction_loop

.align	32
.Lsqrx8x_reduction_loop:
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	%rdx,%r8
	imulq	%rbx,%rdx
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,24+8(%rsp)

	leaq	64(%rdi),%rdi
	xorq	%rsi,%rsi
	movq	$-8,%rcx
	jmp	.Lsqrx8x_reduce

.align	32
.Lsqrx8x_reduce:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rbx,%rax
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rbx,%r9
	adcxq	%rbx,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
	movq	%rdx,%rax
	movq	%r8,%rdx
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	32+8(%rsp),%rbx,%rdx
	movq	%rax,%rdx
	movq	%rax,64+48+8(%rsp,%rcx,8)

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	%rbx,%rdx
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	adcxq	%rsi,%r15

.byte	0x67,0x67,0x67
	incq	%rcx
	jnz	.Lsqrx8x_reduce

	movq	%rsi,%rax
	cmpq	0+8(%rsp),%rbp
	jae	.Lsqrx8x_no_tail

	movq	48+8(%rsp),%rdx
	addq	0(%rdi),%r8
	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	adcxq	8(%rdi),%r9
	adcxq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	72+48+8(%rsp,%rcx,8),%rdx
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	movq	%rbx,(%rdi,%rcx,8)
	movq	%r8,%rbx
	adcxq	%rsi,%r15

	incq	%rcx
	jnz	.Lsqrx8x_tail

	cmpq	0+8(%rsp),%rbp
	jae	.Lsqrx8x_tail_done

	subq	16+8(%rsp),%rsi
	movq	48+8(%rsp),%rdx
	leaq	64(%rbp),%rbp
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax
	subq	$8,%rcx

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail_done:
	xorq	%rax,%rax
	addq	24+8(%rsp),%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	subq	16+8(%rsp),%rsi
.Lsqrx8x_no_tail:
	adcq	0(%rdi),%r8
.byte	102,72,15,126,217
	adcq	8(%rdi),%r9
	movq	56(%rbp),%rsi
.byte	102,72,15,126,213
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax

	movq	32+8(%rsp),%rbx
	movq	64(%rdi,%rcx,1),%rdx

	movq	%r8,0(%rdi)
	leaq	64(%rdi),%r8
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	64(%rdi,%rcx,1),%rdi
	cmpq	8+8(%rsp),%r8
	jb	.Lsqrx8x_reduction_loop
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
.align	32
.type	__bn_postx4x_internal,@function
__bn_postx4x_internal:
.cfi_startproc
	movq	0(%rbp),%r12
	movq	%rcx,%r10
	movq	%rcx,%r9
	negq	%rax
	sarq	$3+2,%rcx

.byte	102,72,15,126,202
.byte	102,72,15,126,206
	decq	%r12
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqrx4x_sub_entry:
	andnq	%rax,%r12,%r12
	leaq	32(%rbp),%rbp
	andnq	%rax,%r13,%r13
	andnq	%rax,%r14,%r14
	andnq	%rax,%r15,%r15

	negq	%r8
	adcq	0(%rdi),%r12
	adcq	8(%rdi),%r13
	adcq	16(%rdi),%r14
	adcq	24(%rdi),%r15
	movq	%r12,0(%rdx)
	leaq	32(%rdi),%rdi
	movq	%r13,8(%rdx)
	sbbq	%r8,%r8
	movq	%r14,16(%rdx)
	movq	%r15,24(%rdx)
	leaq	32(%rdx),%rdx

	incq	%rcx
	jnz	.Lsqrx4x_sub

	negq	%r9

	.byte	0xf3,0xc3
.cfi_endproc
.size	__bn_postx4x_internal,.-__bn_postx4x_internal
.globl	bn_scatter5
.hidden bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
.cfi_startproc
	cmpl	$0,%esi
	jz	.Lscatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_scatter5,.-bn_scatter5

.globl	bn_gather5
.hidden bn_gather5
.type	bn_gather5,@function
.align	32
bn_gather5:
.cfi_startproc
.LSEH_begin_bn_gather5:

.byte	0x4c,0x8d,0x14,0x24
.cfi_def_cfa_register	%r10
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp

	movd	%ecx,%xmm5
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	128(%rdx),%r11
	leaq	128(%rsp),%rax

	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)
	jmp	.Lgather

.align	32
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	leaq	256(%r11),%r11
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp
.cfi_def_cfa_register	%rsp
	.byte	0xf3,0xc3
.LSEH_end_bn_gather5:
.cfi_endproc
.size	bn_gather5,.-bn_gather5
.align	64
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
#endif