# (removed: web code-viewer navigation chrome — "Home / Line# / Scopes# /
#  Navigate / Raw / Download" — not part of the assembly source)
1# This file is generated from a similarly-named Perl script in the BoringSSL
2# source tree. Do not edit by hand.
3
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
11#include "ring_core_generated/prefix_symbols_asm.h"
12.text
13
14.extern	OPENSSL_ia32cap_P
15.hidden OPENSSL_ia32cap_P
16
17.globl	bn_mul_mont_gather5
18.hidden bn_mul_mont_gather5
19.type	bn_mul_mont_gather5,@function
20.align	64
21bn_mul_mont_gather5:
22.cfi_startproc
23	movl	%r9d,%r9d
24	movq	%rsp,%rax
25.cfi_def_cfa_register	%rax
26	testl	$7,%r9d
27	jnz	.Lmul_enter
28	leaq	OPENSSL_ia32cap_P(%rip),%r11
29	movl	8(%r11),%r11d
30	jmp	.Lmul4x_enter
31
32.align	16
33.Lmul_enter:
34	movd	8(%rsp),%xmm5
35	pushq	%rbx
36.cfi_offset	%rbx,-16
37	pushq	%rbp
38.cfi_offset	%rbp,-24
39	pushq	%r12
40.cfi_offset	%r12,-32
41	pushq	%r13
42.cfi_offset	%r13,-40
43	pushq	%r14
44.cfi_offset	%r14,-48
45	pushq	%r15
46.cfi_offset	%r15,-56
47
48	negq	%r9
49	movq	%rsp,%r11
50	leaq	-280(%rsp,%r9,8),%r10
51	negq	%r9
52	andq	$-1024,%r10
53
54
55
56
57
58
59
60
61
62	subq	%r10,%r11
63	andq	$-4096,%r11
64	leaq	(%r10,%r11,1),%rsp
65	movq	(%rsp),%r11
66	cmpq	%r10,%rsp
67	ja	.Lmul_page_walk
68	jmp	.Lmul_page_walk_done
69
70.Lmul_page_walk:
71	leaq	-4096(%rsp),%rsp
72	movq	(%rsp),%r11
73	cmpq	%r10,%rsp
74	ja	.Lmul_page_walk
75.Lmul_page_walk_done:
76
77	leaq	.Linc(%rip),%r10
78	movq	%rax,8(%rsp,%r9,8)
79.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
80.Lmul_body:
81
82	leaq	128(%rdx),%r12
83	movdqa	0(%r10),%xmm0
84	movdqa	16(%r10),%xmm1
85	leaq	24-112(%rsp,%r9,8),%r10
86	andq	$-16,%r10
87
88	pshufd	$0,%xmm5,%xmm5
89	movdqa	%xmm1,%xmm4
90	movdqa	%xmm1,%xmm2
91	paddd	%xmm0,%xmm1
92	pcmpeqd	%xmm5,%xmm0
93.byte	0x67
94	movdqa	%xmm4,%xmm3
95	paddd	%xmm1,%xmm2
96	pcmpeqd	%xmm5,%xmm1
97	movdqa	%xmm0,112(%r10)
98	movdqa	%xmm4,%xmm0
99
100	paddd	%xmm2,%xmm3
101	pcmpeqd	%xmm5,%xmm2
102	movdqa	%xmm1,128(%r10)
103	movdqa	%xmm4,%xmm1
104
105	paddd	%xmm3,%xmm0
106	pcmpeqd	%xmm5,%xmm3
107	movdqa	%xmm2,144(%r10)
108	movdqa	%xmm4,%xmm2
109
110	paddd	%xmm0,%xmm1
111	pcmpeqd	%xmm5,%xmm0
112	movdqa	%xmm3,160(%r10)
113	movdqa	%xmm4,%xmm3
114	paddd	%xmm1,%xmm2
115	pcmpeqd	%xmm5,%xmm1
116	movdqa	%xmm0,176(%r10)
117	movdqa	%xmm4,%xmm0
118
119	paddd	%xmm2,%xmm3
120	pcmpeqd	%xmm5,%xmm2
121	movdqa	%xmm1,192(%r10)
122	movdqa	%xmm4,%xmm1
123
124	paddd	%xmm3,%xmm0
125	pcmpeqd	%xmm5,%xmm3
126	movdqa	%xmm2,208(%r10)
127	movdqa	%xmm4,%xmm2
128
129	paddd	%xmm0,%xmm1
130	pcmpeqd	%xmm5,%xmm0
131	movdqa	%xmm3,224(%r10)
132	movdqa	%xmm4,%xmm3
133	paddd	%xmm1,%xmm2
134	pcmpeqd	%xmm5,%xmm1
135	movdqa	%xmm0,240(%r10)
136	movdqa	%xmm4,%xmm0
137
138	paddd	%xmm2,%xmm3
139	pcmpeqd	%xmm5,%xmm2
140	movdqa	%xmm1,256(%r10)
141	movdqa	%xmm4,%xmm1
142
143	paddd	%xmm3,%xmm0
144	pcmpeqd	%xmm5,%xmm3
145	movdqa	%xmm2,272(%r10)
146	movdqa	%xmm4,%xmm2
147
148	paddd	%xmm0,%xmm1
149	pcmpeqd	%xmm5,%xmm0
150	movdqa	%xmm3,288(%r10)
151	movdqa	%xmm4,%xmm3
152	paddd	%xmm1,%xmm2
153	pcmpeqd	%xmm5,%xmm1
154	movdqa	%xmm0,304(%r10)
155
156	paddd	%xmm2,%xmm3
157.byte	0x67
158	pcmpeqd	%xmm5,%xmm2
159	movdqa	%xmm1,320(%r10)
160
161	pcmpeqd	%xmm5,%xmm3
162	movdqa	%xmm2,336(%r10)
163	pand	64(%r12),%xmm0
164
165	pand	80(%r12),%xmm1
166	pand	96(%r12),%xmm2
167	movdqa	%xmm3,352(%r10)
168	pand	112(%r12),%xmm3
169	por	%xmm2,%xmm0
170	por	%xmm3,%xmm1
171	movdqa	-128(%r12),%xmm4
172	movdqa	-112(%r12),%xmm5
173	movdqa	-96(%r12),%xmm2
174	pand	112(%r10),%xmm4
175	movdqa	-80(%r12),%xmm3
176	pand	128(%r10),%xmm5
177	por	%xmm4,%xmm0
178	pand	144(%r10),%xmm2
179	por	%xmm5,%xmm1
180	pand	160(%r10),%xmm3
181	por	%xmm2,%xmm0
182	por	%xmm3,%xmm1
183	movdqa	-64(%r12),%xmm4
184	movdqa	-48(%r12),%xmm5
185	movdqa	-32(%r12),%xmm2
186	pand	176(%r10),%xmm4
187	movdqa	-16(%r12),%xmm3
188	pand	192(%r10),%xmm5
189	por	%xmm4,%xmm0
190	pand	208(%r10),%xmm2
191	por	%xmm5,%xmm1
192	pand	224(%r10),%xmm3
193	por	%xmm2,%xmm0
194	por	%xmm3,%xmm1
195	movdqa	0(%r12),%xmm4
196	movdqa	16(%r12),%xmm5
197	movdqa	32(%r12),%xmm2
198	pand	240(%r10),%xmm4
199	movdqa	48(%r12),%xmm3
200	pand	256(%r10),%xmm5
201	por	%xmm4,%xmm0
202	pand	272(%r10),%xmm2
203	por	%xmm5,%xmm1
204	pand	288(%r10),%xmm3
205	por	%xmm2,%xmm0
206	por	%xmm3,%xmm1
207	por	%xmm1,%xmm0
208	pshufd	$0x4e,%xmm0,%xmm1
209	por	%xmm1,%xmm0
210	leaq	256(%r12),%r12
211.byte	102,72,15,126,195
212
213	movq	(%r8),%r8
214	movq	(%rsi),%rax
215
216	xorq	%r14,%r14
217	xorq	%r15,%r15
218
219	movq	%r8,%rbp
220	mulq	%rbx
221	movq	%rax,%r10
222	movq	(%rcx),%rax
223
224	imulq	%r10,%rbp
225	movq	%rdx,%r11
226
227	mulq	%rbp
228	addq	%rax,%r10
229	movq	8(%rsi),%rax
230	adcq	$0,%rdx
231	movq	%rdx,%r13
232
233	leaq	1(%r15),%r15
234	jmp	.L1st_enter
235
236.align	16
237.L1st:
238	addq	%rax,%r13
239	movq	(%rsi,%r15,8),%rax
240	adcq	$0,%rdx
241	addq	%r11,%r13
242	movq	%r10,%r11
243	adcq	$0,%rdx
244	movq	%r13,-16(%rsp,%r15,8)
245	movq	%rdx,%r13
246
247.L1st_enter:
248	mulq	%rbx
249	addq	%rax,%r11
250	movq	(%rcx,%r15,8),%rax
251	adcq	$0,%rdx
252	leaq	1(%r15),%r15
253	movq	%rdx,%r10
254
255	mulq	%rbp
256	cmpq	%r9,%r15
257	jne	.L1st
258
259
260	addq	%rax,%r13
261	adcq	$0,%rdx
262	addq	%r11,%r13
263	adcq	$0,%rdx
264	movq	%r13,-16(%rsp,%r9,8)
265	movq	%rdx,%r13
266	movq	%r10,%r11
267
268	xorq	%rdx,%rdx
269	addq	%r11,%r13
270	adcq	$0,%rdx
271	movq	%r13,-8(%rsp,%r9,8)
272	movq	%rdx,(%rsp,%r9,8)
273
274	leaq	1(%r14),%r14
275	jmp	.Louter
276.align	16
277.Louter:
278	leaq	24+128(%rsp,%r9,8),%rdx
279	andq	$-16,%rdx
280	pxor	%xmm4,%xmm4
281	pxor	%xmm5,%xmm5
282	movdqa	-128(%r12),%xmm0
283	movdqa	-112(%r12),%xmm1
284	movdqa	-96(%r12),%xmm2
285	movdqa	-80(%r12),%xmm3
286	pand	-128(%rdx),%xmm0
287	pand	-112(%rdx),%xmm1
288	por	%xmm0,%xmm4
289	pand	-96(%rdx),%xmm2
290	por	%xmm1,%xmm5
291	pand	-80(%rdx),%xmm3
292	por	%xmm2,%xmm4
293	por	%xmm3,%xmm5
294	movdqa	-64(%r12),%xmm0
295	movdqa	-48(%r12),%xmm1
296	movdqa	-32(%r12),%xmm2
297	movdqa	-16(%r12),%xmm3
298	pand	-64(%rdx),%xmm0
299	pand	-48(%rdx),%xmm1
300	por	%xmm0,%xmm4
301	pand	-32(%rdx),%xmm2
302	por	%xmm1,%xmm5
303	pand	-16(%rdx),%xmm3
304	por	%xmm2,%xmm4
305	por	%xmm3,%xmm5
306	movdqa	0(%r12),%xmm0
307	movdqa	16(%r12),%xmm1
308	movdqa	32(%r12),%xmm2
309	movdqa	48(%r12),%xmm3
310	pand	0(%rdx),%xmm0
311	pand	16(%rdx),%xmm1
312	por	%xmm0,%xmm4
313	pand	32(%rdx),%xmm2
314	por	%xmm1,%xmm5
315	pand	48(%rdx),%xmm3
316	por	%xmm2,%xmm4
317	por	%xmm3,%xmm5
318	movdqa	64(%r12),%xmm0
319	movdqa	80(%r12),%xmm1
320	movdqa	96(%r12),%xmm2
321	movdqa	112(%r12),%xmm3
322	pand	64(%rdx),%xmm0
323	pand	80(%rdx),%xmm1
324	por	%xmm0,%xmm4
325	pand	96(%rdx),%xmm2
326	por	%xmm1,%xmm5
327	pand	112(%rdx),%xmm3
328	por	%xmm2,%xmm4
329	por	%xmm3,%xmm5
330	por	%xmm5,%xmm4
331	pshufd	$0x4e,%xmm4,%xmm0
332	por	%xmm4,%xmm0
333	leaq	256(%r12),%r12
334
335	movq	(%rsi),%rax
336.byte	102,72,15,126,195
337
338	xorq	%r15,%r15
339	movq	%r8,%rbp
340	movq	(%rsp),%r10
341
342	mulq	%rbx
343	addq	%rax,%r10
344	movq	(%rcx),%rax
345	adcq	$0,%rdx
346
347	imulq	%r10,%rbp
348	movq	%rdx,%r11
349
350	mulq	%rbp
351	addq	%rax,%r10
352	movq	8(%rsi),%rax
353	adcq	$0,%rdx
354	movq	8(%rsp),%r10
355	movq	%rdx,%r13
356
357	leaq	1(%r15),%r15
358	jmp	.Linner_enter
359
360.align	16
361.Linner:
362	addq	%rax,%r13
363	movq	(%rsi,%r15,8),%rax
364	adcq	$0,%rdx
365	addq	%r10,%r13
366	movq	(%rsp,%r15,8),%r10
367	adcq	$0,%rdx
368	movq	%r13,-16(%rsp,%r15,8)
369	movq	%rdx,%r13
370
371.Linner_enter:
372	mulq	%rbx
373	addq	%rax,%r11
374	movq	(%rcx,%r15,8),%rax
375	adcq	$0,%rdx
376	addq	%r11,%r10
377	movq	%rdx,%r11
378	adcq	$0,%r11
379	leaq	1(%r15),%r15
380
381	mulq	%rbp
382	cmpq	%r9,%r15
383	jne	.Linner
384
385	addq	%rax,%r13
386	adcq	$0,%rdx
387	addq	%r10,%r13
388	movq	(%rsp,%r9,8),%r10
389	adcq	$0,%rdx
390	movq	%r13,-16(%rsp,%r9,8)
391	movq	%rdx,%r13
392
393	xorq	%rdx,%rdx
394	addq	%r11,%r13
395	adcq	$0,%rdx
396	addq	%r10,%r13
397	adcq	$0,%rdx
398	movq	%r13,-8(%rsp,%r9,8)
399	movq	%rdx,(%rsp,%r9,8)
400
401	leaq	1(%r14),%r14
402	cmpq	%r9,%r14
403	jb	.Louter
404
405	xorq	%r14,%r14
406	movq	(%rsp),%rax
407	leaq	(%rsp),%rsi
408	movq	%r9,%r15
409	jmp	.Lsub
410.align	16
411.Lsub:	sbbq	(%rcx,%r14,8),%rax
412	movq	%rax,(%rdi,%r14,8)
413	movq	8(%rsi,%r14,8),%rax
414	leaq	1(%r14),%r14
415	decq	%r15
416	jnz	.Lsub
417
418	sbbq	$0,%rax
419	movq	$-1,%rbx
420	xorq	%rax,%rbx
421	xorq	%r14,%r14
422	movq	%r9,%r15
423
424.Lcopy:
425	movq	(%rdi,%r14,8),%rcx
426	movq	(%rsp,%r14,8),%rdx
427	andq	%rbx,%rcx
428	andq	%rax,%rdx
429	movq	%r14,(%rsp,%r14,8)
430	orq	%rcx,%rdx
431	movq	%rdx,(%rdi,%r14,8)
432	leaq	1(%r14),%r14
433	subq	$1,%r15
434	jnz	.Lcopy
435
436	movq	8(%rsp,%r9,8),%rsi
437.cfi_def_cfa	%rsi,8
438	movq	$1,%rax
439
440	movq	-48(%rsi),%r15
441.cfi_restore	%r15
442	movq	-40(%rsi),%r14
443.cfi_restore	%r14
444	movq	-32(%rsi),%r13
445.cfi_restore	%r13
446	movq	-24(%rsi),%r12
447.cfi_restore	%r12
448	movq	-16(%rsi),%rbp
449.cfi_restore	%rbp
450	movq	-8(%rsi),%rbx
451.cfi_restore	%rbx
452	leaq	(%rsi),%rsp
453.cfi_def_cfa_register	%rsp
454.Lmul_epilogue:
455	.byte	0xf3,0xc3
456.cfi_endproc
457.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
458.type	bn_mul4x_mont_gather5,@function
459.align	32
# 4x-unrolled Montgomery multiplication front end: saves registers, sets
# up a 64-byte-aligned scratch frame (with stack page probing), then
# calls mul4x_internal.  Entered via .Lmul4x_enter from
# bn_mul_mont_gather5 with the CPU capability word already in %r11d.
460bn_mul4x_mont_gather5:
461.cfi_startproc
462.byte	0x67
463	movq	%rsp,%rax
464.cfi_def_cfa_register	%rax
465.Lmul4x_enter:
# If all capability bits in mask 0x80108 are set, branch to the
# mulx/adcx variant (.Lmulx4x_enter, defined elsewhere in this file).
# NOTE(review): 0x80108 is presumed to cover the BMI2/ADX feature bits
# of OPENSSL_ia32cap_P — confirm against the capability-word layout.
466	andl	$0x80108,%r11d
467	cmpl	$0x80108,%r11d
468	je	.Lmulx4x_enter
469	pushq	%rbx
470.cfi_offset	%rbx,-16
471	pushq	%rbp
472.cfi_offset	%rbp,-24
473	pushq	%r12
474.cfi_offset	%r12,-32
475	pushq	%r13
476.cfi_offset	%r13,-40
477	pushq	%r14
478.cfi_offset	%r14,-48
479	pushq	%r15
480.cfi_offset	%r15,-56
481.Lmul4x_prologue:
482
483.byte	0x67
484	shll	$3,%r9d
485	leaq	(%r9,%r9,2),%r10
486	negq	%r9
487
488
489
490
491
492
493
494
495
496
# Choose a scratch frame whose distance from the output buffer modulo
# 4KiB is controlled, to avoid pathological aliasing between loads from
# the frame and stores to the result.
497	leaq	-320(%rsp,%r9,2),%r11
498	movq	%rsp,%rbp
499	subq	%rdi,%r11
500	andq	$4095,%r11
501	cmpq	%r11,%r10
502	jb	.Lmul4xsp_alt
503	subq	%r11,%rbp
504	leaq	-320(%rbp,%r9,2),%rbp
505	jmp	.Lmul4xsp_done
506
507.align	32
508.Lmul4xsp_alt:
509	leaq	4096-320(,%r9,2),%r10
510	leaq	-320(%rbp,%r9,2),%rbp
511	subq	%r10,%r11
512	movq	$0,%r10
513	cmovcq	%r10,%r11
514	subq	%r11,%rbp
515.Lmul4xsp_done:
516	andq	$-64,%rbp
517	movq	%rsp,%r11
518	subq	%rbp,%r11
519	andq	$-4096,%r11
520	leaq	(%r11,%rbp,1),%rsp
521	movq	(%rsp),%r10
522	cmpq	%rbp,%rsp
523	ja	.Lmul4x_page_walk
524	jmp	.Lmul4x_page_walk_done
525
# Probe each newly claimed stack page so the guard page can grow.
526.Lmul4x_page_walk:
527	leaq	-4096(%rsp),%rsp
528	movq	(%rsp),%r10
529	cmpq	%rbp,%rsp
530	ja	.Lmul4x_page_walk
531.Lmul4x_page_walk_done:
532
533	negq	%r9
534
# Save the entry %rsp in the frame for the epilogue.
535	movq	%rax,40(%rsp)
536.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
537.Lmul4x_body:
538
539	call	mul4x_internal
540
# Restore callee-saved registers from the saved entry frame; return 1.
541	movq	40(%rsp),%rsi
542.cfi_def_cfa	%rsi,8
543	movq	$1,%rax
544
545	movq	-48(%rsi),%r15
546.cfi_restore	%r15
547	movq	-40(%rsi),%r14
548.cfi_restore	%r14
549	movq	-32(%rsi),%r13
550.cfi_restore	%r13
551	movq	-24(%rsi),%r12
552.cfi_restore	%r12
553	movq	-16(%rsi),%rbp
554.cfi_restore	%rbp
555	movq	-8(%rsi),%rbx
556.cfi_restore	%rbx
557	leaq	(%rsi),%rsp
558.cfi_def_cfa_register	%rsp
559.Lmul4x_epilogue:
# rep ret (raw encoding 0xf3,0xc3).
560	.byte	0xf3,0xc3
561.cfi_endproc
562.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
563
564.type	mul4x_internal,@function
565.align	32
# Core of the 4x-unrolled Montgomery multiplication.  Called with the
# scratch frame already established by bn_mul4x_mont_gather5 / bn_power5;
# on entry %r9 = 8*num, %rdx = window table, and %rax points into the
# caller's frame (table index read from 8(%rax)).  Falls through to the
# shared subtract/copy tail .Lsqr4x_sub_entry (defined elsewhere in
# this file) rather than returning directly from here.
566mul4x_internal:
567.cfi_startproc
568	shlq	$5,%r9
569	movd	8(%rax),%xmm5
570	leaq	.Linc(%rip),%rax
# %r13 = end-of-table sentinel; the outer loop below runs until the
# table pointer %r12 reaches it.
571	leaq	128(%rdx,%r9,1),%r13
572	shrq	$5,%r9
# Build the sixteen one-hot comparison masks for the constant-time
# gather (same pattern as in bn_mul_mont_gather5 earlier in this file).
573	movdqa	0(%rax),%xmm0
574	movdqa	16(%rax),%xmm1
575	leaq	88-112(%rsp,%r9,1),%r10
576	leaq	128(%rdx),%r12
577
578	pshufd	$0,%xmm5,%xmm5
579	movdqa	%xmm1,%xmm4
580.byte	0x67,0x67
581	movdqa	%xmm1,%xmm2
582	paddd	%xmm0,%xmm1
583	pcmpeqd	%xmm5,%xmm0
584.byte	0x67
585	movdqa	%xmm4,%xmm3
586	paddd	%xmm1,%xmm2
587	pcmpeqd	%xmm5,%xmm1
588	movdqa	%xmm0,112(%r10)
589	movdqa	%xmm4,%xmm0
590
591	paddd	%xmm2,%xmm3
592	pcmpeqd	%xmm5,%xmm2
593	movdqa	%xmm1,128(%r10)
594	movdqa	%xmm4,%xmm1
595
596	paddd	%xmm3,%xmm0
597	pcmpeqd	%xmm5,%xmm3
598	movdqa	%xmm2,144(%r10)
599	movdqa	%xmm4,%xmm2
600
601	paddd	%xmm0,%xmm1
602	pcmpeqd	%xmm5,%xmm0
603	movdqa	%xmm3,160(%r10)
604	movdqa	%xmm4,%xmm3
605	paddd	%xmm1,%xmm2
606	pcmpeqd	%xmm5,%xmm1
607	movdqa	%xmm0,176(%r10)
608	movdqa	%xmm4,%xmm0
609
610	paddd	%xmm2,%xmm3
611	pcmpeqd	%xmm5,%xmm2
612	movdqa	%xmm1,192(%r10)
613	movdqa	%xmm4,%xmm1
614
615	paddd	%xmm3,%xmm0
616	pcmpeqd	%xmm5,%xmm3
617	movdqa	%xmm2,208(%r10)
618	movdqa	%xmm4,%xmm2
619
620	paddd	%xmm0,%xmm1
621	pcmpeqd	%xmm5,%xmm0
622	movdqa	%xmm3,224(%r10)
623	movdqa	%xmm4,%xmm3
624	paddd	%xmm1,%xmm2
625	pcmpeqd	%xmm5,%xmm1
626	movdqa	%xmm0,240(%r10)
627	movdqa	%xmm4,%xmm0
628
629	paddd	%xmm2,%xmm3
630	pcmpeqd	%xmm5,%xmm2
631	movdqa	%xmm1,256(%r10)
632	movdqa	%xmm4,%xmm1
633
634	paddd	%xmm3,%xmm0
635	pcmpeqd	%xmm5,%xmm3
636	movdqa	%xmm2,272(%r10)
637	movdqa	%xmm4,%xmm2
638
639	paddd	%xmm0,%xmm1
640	pcmpeqd	%xmm5,%xmm0
641	movdqa	%xmm3,288(%r10)
642	movdqa	%xmm4,%xmm3
643	paddd	%xmm1,%xmm2
644	pcmpeqd	%xmm5,%xmm1
645	movdqa	%xmm0,304(%r10)
646
647	paddd	%xmm2,%xmm3
648.byte	0x67
649	pcmpeqd	%xmm5,%xmm2
650	movdqa	%xmm1,320(%r10)
651
652	pcmpeqd	%xmm5,%xmm3
653	movdqa	%xmm2,336(%r10)
# Constant-time gather of the first b word (mask-AND / OR-reduce).
654	pand	64(%r12),%xmm0
655
656	pand	80(%r12),%xmm1
657	pand	96(%r12),%xmm2
658	movdqa	%xmm3,352(%r10)
659	pand	112(%r12),%xmm3
660	por	%xmm2,%xmm0
661	por	%xmm3,%xmm1
662	movdqa	-128(%r12),%xmm4
663	movdqa	-112(%r12),%xmm5
664	movdqa	-96(%r12),%xmm2
665	pand	112(%r10),%xmm4
666	movdqa	-80(%r12),%xmm3
667	pand	128(%r10),%xmm5
668	por	%xmm4,%xmm0
669	pand	144(%r10),%xmm2
670	por	%xmm5,%xmm1
671	pand	160(%r10),%xmm3
672	por	%xmm2,%xmm0
673	por	%xmm3,%xmm1
674	movdqa	-64(%r12),%xmm4
675	movdqa	-48(%r12),%xmm5
676	movdqa	-32(%r12),%xmm2
677	pand	176(%r10),%xmm4
678	movdqa	-16(%r12),%xmm3
679	pand	192(%r10),%xmm5
680	por	%xmm4,%xmm0
681	pand	208(%r10),%xmm2
682	por	%xmm5,%xmm1
683	pand	224(%r10),%xmm3
684	por	%xmm2,%xmm0
685	por	%xmm3,%xmm1
686	movdqa	0(%r12),%xmm4
687	movdqa	16(%r12),%xmm5
688	movdqa	32(%r12),%xmm2
689	pand	240(%r10),%xmm4
690	movdqa	48(%r12),%xmm3
691	pand	256(%r10),%xmm5
692	por	%xmm4,%xmm0
693	pand	272(%r10),%xmm2
694	por	%xmm5,%xmm1
695	pand	288(%r10),%xmm3
696	por	%xmm2,%xmm0
697	por	%xmm3,%xmm1
698	por	%xmm1,%xmm0
699	pshufd	$0x4e,%xmm0,%xmm1
700	por	%xmm1,%xmm0
701	leaq	256(%r12),%r12
# Raw encoding for movq %xmm0,%rbx.
702.byte	102,72,15,126,195
703
# Stash the end-of-table sentinel and the output pointer in the frame.
704	movq	%r13,16+8(%rsp)
705	movq	%rdi,56+8(%rsp)
706
707	movq	(%r8),%r8
708	movq	(%rsi),%rax
709	leaq	(%rsi,%r9,1),%rsi
710	negq	%r9
711
# First pass (b word 0), four limbs per iteration of .L1st4x.
712	movq	%r8,%rbp
713	mulq	%rbx
714	movq	%rax,%r10
715	movq	(%rcx),%rax
716
717	imulq	%r10,%rbp
718	leaq	64+8(%rsp),%r14
719	movq	%rdx,%r11
720
721	mulq	%rbp
722	addq	%rax,%r10
723	movq	8(%rsi,%r9,1),%rax
724	adcq	$0,%rdx
725	movq	%rdx,%rdi
726
727	mulq	%rbx
728	addq	%rax,%r11
729	movq	8(%rcx),%rax
730	adcq	$0,%rdx
731	movq	%rdx,%r10
732
733	mulq	%rbp
734	addq	%rax,%rdi
735	movq	16(%rsi,%r9,1),%rax
736	adcq	$0,%rdx
737	addq	%r11,%rdi
738	leaq	32(%r9),%r15
739	leaq	32(%rcx),%rcx
740	adcq	$0,%rdx
741	movq	%rdi,(%r14)
742	movq	%rdx,%r13
743	jmp	.L1st4x
744
745.align	32
746.L1st4x:
747	mulq	%rbx
748	addq	%rax,%r10
749	movq	-16(%rcx),%rax
750	leaq	32(%r14),%r14
751	adcq	$0,%rdx
752	movq	%rdx,%r11
753
754	mulq	%rbp
755	addq	%rax,%r13
756	movq	-8(%rsi,%r15,1),%rax
757	adcq	$0,%rdx
758	addq	%r10,%r13
759	adcq	$0,%rdx
760	movq	%r13,-24(%r14)
761	movq	%rdx,%rdi
762
763	mulq	%rbx
764	addq	%rax,%r11
765	movq	-8(%rcx),%rax
766	adcq	$0,%rdx
767	movq	%rdx,%r10
768
769	mulq	%rbp
770	addq	%rax,%rdi
771	movq	(%rsi,%r15,1),%rax
772	adcq	$0,%rdx
773	addq	%r11,%rdi
774	adcq	$0,%rdx
775	movq	%rdi,-16(%r14)
776	movq	%rdx,%r13
777
778	mulq	%rbx
779	addq	%rax,%r10
780	movq	0(%rcx),%rax
781	adcq	$0,%rdx
782	movq	%rdx,%r11
783
784	mulq	%rbp
785	addq	%rax,%r13
786	movq	8(%rsi,%r15,1),%rax
787	adcq	$0,%rdx
788	addq	%r10,%r13
789	adcq	$0,%rdx
790	movq	%r13,-8(%r14)
791	movq	%rdx,%rdi
792
793	mulq	%rbx
794	addq	%rax,%r11
795	movq	8(%rcx),%rax
796	adcq	$0,%rdx
797	movq	%rdx,%r10
798
799	mulq	%rbp
800	addq	%rax,%rdi
801	movq	16(%rsi,%r15,1),%rax
802	adcq	$0,%rdx
803	addq	%r11,%rdi
804	leaq	32(%rcx),%rcx
805	adcq	$0,%rdx
806	movq	%rdi,(%r14)
807	movq	%rdx,%r13
808
809	addq	$32,%r15
810	jnz	.L1st4x
811
# Tail of the first pass (last four limbs, outside the loop).
812	mulq	%rbx
813	addq	%rax,%r10
814	movq	-16(%rcx),%rax
815	leaq	32(%r14),%r14
816	adcq	$0,%rdx
817	movq	%rdx,%r11
818
819	mulq	%rbp
820	addq	%rax,%r13
821	movq	-8(%rsi),%rax
822	adcq	$0,%rdx
823	addq	%r10,%r13
824	adcq	$0,%rdx
825	movq	%r13,-24(%r14)
826	movq	%rdx,%rdi
827
828	mulq	%rbx
829	addq	%rax,%r11
830	movq	-8(%rcx),%rax
831	adcq	$0,%rdx
832	movq	%rdx,%r10
833
834	mulq	%rbp
835	addq	%rax,%rdi
836	movq	(%rsi,%r9,1),%rax
837	adcq	$0,%rdx
838	addq	%r11,%rdi
839	adcq	$0,%rdx
840	movq	%rdi,-16(%r14)
841	movq	%rdx,%r13
842
843	leaq	(%rcx,%r9,1),%rcx
844
845	xorq	%rdi,%rdi
846	addq	%r10,%r13
847	adcq	$0,%rdi
848	movq	%r13,-8(%r14)
849
850	jmp	.Louter4x
851
852.align	32
# Outer loop: constant-time gather of the next b word, then another 4x
# multiply-accumulate/reduction pass over the partial result.
853.Louter4x:
854	leaq	16+128(%r14),%rdx
855	pxor	%xmm4,%xmm4
856	pxor	%xmm5,%xmm5
857	movdqa	-128(%r12),%xmm0
858	movdqa	-112(%r12),%xmm1
859	movdqa	-96(%r12),%xmm2
860	movdqa	-80(%r12),%xmm3
861	pand	-128(%rdx),%xmm0
862	pand	-112(%rdx),%xmm1
863	por	%xmm0,%xmm4
864	pand	-96(%rdx),%xmm2
865	por	%xmm1,%xmm5
866	pand	-80(%rdx),%xmm3
867	por	%xmm2,%xmm4
868	por	%xmm3,%xmm5
869	movdqa	-64(%r12),%xmm0
870	movdqa	-48(%r12),%xmm1
871	movdqa	-32(%r12),%xmm2
872	movdqa	-16(%r12),%xmm3
873	pand	-64(%rdx),%xmm0
874	pand	-48(%rdx),%xmm1
875	por	%xmm0,%xmm4
876	pand	-32(%rdx),%xmm2
877	por	%xmm1,%xmm5
878	pand	-16(%rdx),%xmm3
879	por	%xmm2,%xmm4
880	por	%xmm3,%xmm5
881	movdqa	0(%r12),%xmm0
882	movdqa	16(%r12),%xmm1
883	movdqa	32(%r12),%xmm2
884	movdqa	48(%r12),%xmm3
885	pand	0(%rdx),%xmm0
886	pand	16(%rdx),%xmm1
887	por	%xmm0,%xmm4
888	pand	32(%rdx),%xmm2
889	por	%xmm1,%xmm5
890	pand	48(%rdx),%xmm3
891	por	%xmm2,%xmm4
892	por	%xmm3,%xmm5
893	movdqa	64(%r12),%xmm0
894	movdqa	80(%r12),%xmm1
895	movdqa	96(%r12),%xmm2
896	movdqa	112(%r12),%xmm3
897	pand	64(%rdx),%xmm0
898	pand	80(%rdx),%xmm1
899	por	%xmm0,%xmm4
900	pand	96(%rdx),%xmm2
901	por	%xmm1,%xmm5
902	pand	112(%rdx),%xmm3
903	por	%xmm2,%xmm4
904	por	%xmm3,%xmm5
905	por	%xmm5,%xmm4
906	pshufd	$0x4e,%xmm4,%xmm0
907	por	%xmm4,%xmm0
908	leaq	256(%r12),%r12
# Raw encoding for movq %xmm0,%rbx.
909.byte	102,72,15,126,195
910
911	movq	(%r14,%r9,1),%r10
912	movq	%r8,%rbp
913	mulq	%rbx
914	addq	%rax,%r10
915	movq	(%rcx),%rax
916	adcq	$0,%rdx
917
918	imulq	%r10,%rbp
919	movq	%rdx,%r11
920	movq	%rdi,(%r14)
921
922	leaq	(%r14,%r9,1),%r14
923
924	mulq	%rbp
925	addq	%rax,%r10
926	movq	8(%rsi,%r9,1),%rax
927	adcq	$0,%rdx
928	movq	%rdx,%rdi
929
930	mulq	%rbx
931	addq	%rax,%r11
932	movq	8(%rcx),%rax
933	adcq	$0,%rdx
934	addq	8(%r14),%r11
935	adcq	$0,%rdx
936	movq	%rdx,%r10
937
938	mulq	%rbp
939	addq	%rax,%rdi
940	movq	16(%rsi,%r9,1),%rax
941	adcq	$0,%rdx
942	addq	%r11,%rdi
943	leaq	32(%r9),%r15
944	leaq	32(%rcx),%rcx
945	adcq	$0,%rdx
946	movq	%rdx,%r13
947	jmp	.Linner4x
948
949.align	32
950.Linner4x:
951	mulq	%rbx
952	addq	%rax,%r10
953	movq	-16(%rcx),%rax
954	adcq	$0,%rdx
955	addq	16(%r14),%r10
956	leaq	32(%r14),%r14
957	adcq	$0,%rdx
958	movq	%rdx,%r11
959
960	mulq	%rbp
961	addq	%rax,%r13
962	movq	-8(%rsi,%r15,1),%rax
963	adcq	$0,%rdx
964	addq	%r10,%r13
965	adcq	$0,%rdx
966	movq	%rdi,-32(%r14)
967	movq	%rdx,%rdi
968
969	mulq	%rbx
970	addq	%rax,%r11
971	movq	-8(%rcx),%rax
972	adcq	$0,%rdx
973	addq	-8(%r14),%r11
974	adcq	$0,%rdx
975	movq	%rdx,%r10
976
977	mulq	%rbp
978	addq	%rax,%rdi
979	movq	(%rsi,%r15,1),%rax
980	adcq	$0,%rdx
981	addq	%r11,%rdi
982	adcq	$0,%rdx
983	movq	%r13,-24(%r14)
984	movq	%rdx,%r13
985
986	mulq	%rbx
987	addq	%rax,%r10
988	movq	0(%rcx),%rax
989	adcq	$0,%rdx
990	addq	(%r14),%r10
991	adcq	$0,%rdx
992	movq	%rdx,%r11
993
994	mulq	%rbp
995	addq	%rax,%r13
996	movq	8(%rsi,%r15,1),%rax
997	adcq	$0,%rdx
998	addq	%r10,%r13
999	adcq	$0,%rdx
1000	movq	%rdi,-16(%r14)
1001	movq	%rdx,%rdi
1002
1003	mulq	%rbx
1004	addq	%rax,%r11
1005	movq	8(%rcx),%rax
1006	adcq	$0,%rdx
1007	addq	8(%r14),%r11
1008	adcq	$0,%rdx
1009	movq	%rdx,%r10
1010
1011	mulq	%rbp
1012	addq	%rax,%rdi
1013	movq	16(%rsi,%r15,1),%rax
1014	adcq	$0,%rdx
1015	addq	%r11,%rdi
1016	leaq	32(%rcx),%rcx
1017	adcq	$0,%rdx
1018	movq	%r13,-8(%r14)
1019	movq	%rdx,%r13
1020
1021	addq	$32,%r15
1022	jnz	.Linner4x
1023
# Tail of the inner pass.
1024	mulq	%rbx
1025	addq	%rax,%r10
1026	movq	-16(%rcx),%rax
1027	adcq	$0,%rdx
1028	addq	16(%r14),%r10
1029	leaq	32(%r14),%r14
1030	adcq	$0,%rdx
1031	movq	%rdx,%r11
1032
1033	mulq	%rbp
1034	addq	%rax,%r13
1035	movq	-8(%rsi),%rax
1036	adcq	$0,%rdx
1037	addq	%r10,%r13
1038	adcq	$0,%rdx
1039	movq	%rdi,-32(%r14)
1040	movq	%rdx,%rdi
1041
1042	mulq	%rbx
1043	addq	%rax,%r11
1044	movq	%rbp,%rax
1045	movq	-8(%rcx),%rbp
1046	adcq	$0,%rdx
1047	addq	-8(%r14),%r11
1048	adcq	$0,%rdx
1049	movq	%rdx,%r10
1050
1051	mulq	%rbp
1052	addq	%rax,%rdi
1053	movq	(%rsi,%r9,1),%rax
1054	adcq	$0,%rdx
1055	addq	%r11,%rdi
1056	adcq	$0,%rdx
1057	movq	%r13,-24(%r14)
1058	movq	%rdx,%r13
1059
1060	movq	%rdi,-16(%r14)
1061	leaq	(%rcx,%r9,1),%rcx
1062
1063	xorq	%rdi,%rdi
1064	addq	%r10,%r13
1065	adcq	$0,%rdi
1066	addq	(%r14),%r13
1067	adcq	$0,%rdi
1068	movq	%r13,-8(%r14)
1069
# Loop until the table pointer reaches the sentinel saved at 16+8(%rsp).
1070	cmpq	16+8(%rsp),%r12
1071	jb	.Louter4x
# Done: derive a branchless borrow mask in %rax, reload the output
# pointer, and jump into the shared subtract/copy tail
# (.Lsqr4x_sub_entry, defined elsewhere in this file).
1072	xorq	%rax,%rax
1073	subq	%r13,%rbp
1074	adcq	%r15,%r15
1075	orq	%r15,%rdi
1076	subq	%rdi,%rax
1077	leaq	(%r14,%r9,1),%rbx
1078	movq	(%rcx),%r12
1079	leaq	(%rcx),%rbp
1080	movq	%r9,%rcx
1081	sarq	$3+2,%rcx
1082	movq	56+8(%rsp),%rdi
1083	decq	%r12
1084	xorq	%r10,%r10
1085	movq	8(%rbp),%r13
1086	movq	16(%rbp),%r14
1087	movq	24(%rbp),%r15
1088	jmp	.Lsqr4x_sub_entry
1089.cfi_endproc
1090.size	mul4x_internal,.-mul4x_internal
1091.globl	bn_power5
1092.hidden bn_power5
1093.type	bn_power5,@function
1094.align	32
# bn_power5: five consecutive Montgomery squarings (__bn_sqr8x_internal
# + __bn_post4x_internal each time) followed by one Montgomery
# multiplication (mul4x_internal) — the 2^5-window step used by
# fixed-window modular exponentiation.
1095bn_power5:
1096.cfi_startproc
1097	movq	%rsp,%rax
1098.cfi_def_cfa_register	%rax
# Dispatch to the mulx/adcx variant (.Lpowerx5_enter, defined elsewhere
# in this file) when all capability bits in mask 0x80108 are set.
# NOTE(review): 0x80108 presumed to be the BMI2/ADX bits — confirm.
1099	leaq	OPENSSL_ia32cap_P(%rip),%r11
1100	movl	8(%r11),%r11d
1101	andl	$0x80108,%r11d
1102	cmpl	$0x80108,%r11d
1103	je	.Lpowerx5_enter
1104	pushq	%rbx
1105.cfi_offset	%rbx,-16
1106	pushq	%rbp
1107.cfi_offset	%rbp,-24
1108	pushq	%r12
1109.cfi_offset	%r12,-32
1110	pushq	%r13
1111.cfi_offset	%r13,-40
1112	pushq	%r14
1113.cfi_offset	%r14,-48
1114	pushq	%r15
1115.cfi_offset	%r15,-56
1116.Lpower5_prologue:
1117
1118	shll	$3,%r9d
1119	leal	(%r9,%r9,2),%r10d
1120	negq	%r9
1121	movq	(%r8),%r8
1122
1123
1124
1125
1126
1127
1128
1129
# Choose a scratch frame that avoids 4KiB aliasing with the output
# buffer (same scheme as in bn_mul4x_mont_gather5 above).
1130	leaq	-320(%rsp,%r9,2),%r11
1131	movq	%rsp,%rbp
1132	subq	%rdi,%r11
1133	andq	$4095,%r11
1134	cmpq	%r11,%r10
1135	jb	.Lpwr_sp_alt
1136	subq	%r11,%rbp
1137	leaq	-320(%rbp,%r9,2),%rbp
1138	jmp	.Lpwr_sp_done
1139
1140.align	32
1141.Lpwr_sp_alt:
1142	leaq	4096-320(,%r9,2),%r10
1143	leaq	-320(%rbp,%r9,2),%rbp
1144	subq	%r10,%r11
1145	movq	$0,%r10
1146	cmovcq	%r10,%r11
1147	subq	%r11,%rbp
1148.Lpwr_sp_done:
1149	andq	$-64,%rbp
1150	movq	%rsp,%r11
1151	subq	%rbp,%r11
1152	andq	$-4096,%r11
1153	leaq	(%r11,%rbp,1),%rsp
1154	movq	(%rsp),%r10
1155	cmpq	%rbp,%rsp
1156	ja	.Lpwr_page_walk
1157	jmp	.Lpwr_page_walk_done
1158
# Probe the newly claimed stack pages one at a time.
1159.Lpwr_page_walk:
1160	leaq	-4096(%rsp),%rsp
1161	movq	(%rsp),%r10
1162	cmpq	%rbp,%rsp
1163	ja	.Lpwr_page_walk
1164.Lpwr_page_walk_done:
1165
1166	movq	%r9,%r10
1167	negq	%r9
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
# Save n0 and the entry %rsp in the frame for the calls/epilogue below.
1178	movq	%r8,32(%rsp)
1179	movq	%rax,40(%rsp)
1180.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
1181.Lpower5_body:
# Raw encodings — stash pointers in XMM registers across the calls:
# movq %rdi,%xmm1; movq %rcx,%xmm2; movq %r10,%xmm3; movq %rdx,%xmm4.
1182.byte	102,72,15,110,207
1183.byte	102,72,15,110,209
1184.byte	102,73,15,110,218
1185.byte	102,72,15,110,226
1186
# Five Montgomery squarings, each followed by its reduction tail.
1187	call	__bn_sqr8x_internal
1188	call	__bn_post4x_internal
1189	call	__bn_sqr8x_internal
1190	call	__bn_post4x_internal
1191	call	__bn_sqr8x_internal
1192	call	__bn_post4x_internal
1193	call	__bn_sqr8x_internal
1194	call	__bn_post4x_internal
1195	call	__bn_sqr8x_internal
1196	call	__bn_post4x_internal
1197
# Raw encodings: movq %xmm2,%rcx; movq %xmm4,%rdx — recover the stashed
# pointers, then finish with one Montgomery multiplication.
1198.byte	102,72,15,126,209
1199.byte	102,72,15,126,226
1200	movq	%rsi,%rdi
1201	movq	40(%rsp),%rax
1202	leaq	32(%rsp),%r8
1203
1204	call	mul4x_internal
1205
# Restore callee-saved registers from the saved entry frame; return 1.
1206	movq	40(%rsp),%rsi
1207.cfi_def_cfa	%rsi,8
1208	movq	$1,%rax
1209	movq	-48(%rsi),%r15
1210.cfi_restore	%r15
1211	movq	-40(%rsi),%r14
1212.cfi_restore	%r14
1213	movq	-32(%rsi),%r13
1214.cfi_restore	%r13
1215	movq	-24(%rsi),%r12
1216.cfi_restore	%r12
1217	movq	-16(%rsi),%rbp
1218.cfi_restore	%rbp
1219	movq	-8(%rsi),%rbx
1220.cfi_restore	%rbx
1221	leaq	(%rsi),%rsp
1222.cfi_def_cfa_register	%rsp
1223.Lpower5_epilogue:
# rep ret (raw encoding 0xf3,0xc3).
1224	.byte	0xf3,0xc3
1225.cfi_endproc
1226.size	bn_power5,.-bn_power5
1227
1228.globl	bn_sqr8x_internal
1229.hidden bn_sqr8x_internal
1230.hidden	bn_sqr8x_internal
1231.type	bn_sqr8x_internal,@function
1232.align	32
1233bn_sqr8x_internal:
1234__bn_sqr8x_internal:
1235.cfi_startproc
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309	leaq	32(%r10),%rbp
1310	leaq	(%rsi,%r9,1),%rsi
1311
1312	movq	%r9,%rcx
1313
1314
1315	movq	-32(%rsi,%rbp,1),%r14
1316	leaq	48+8(%rsp,%r9,2),%rdi
1317	movq	-24(%rsi,%rbp,1),%rax
1318	leaq	-32(%rdi,%rbp,1),%rdi
1319	movq	-16(%rsi,%rbp,1),%rbx
1320	movq	%rax,%r15
1321
1322	mulq	%r14
1323	movq	%rax,%r10
1324	movq	%rbx,%rax
1325	movq	%rdx,%r11
1326	movq	%r10,-24(%rdi,%rbp,1)
1327
1328	mulq	%r14
1329	addq	%rax,%r11
1330	movq	%rbx,%rax
1331	adcq	$0,%rdx
1332	movq	%r11,-16(%rdi,%rbp,1)
1333	movq	%rdx,%r10
1334
1335
1336	movq	-8(%rsi,%rbp,1),%rbx
1337	mulq	%r15
1338	movq	%rax,%r12
1339	movq	%rbx,%rax
1340	movq	%rdx,%r13
1341
1342	leaq	(%rbp),%rcx
1343	mulq	%r14
1344	addq	%rax,%r10
1345	movq	%rbx,%rax
1346	movq	%rdx,%r11
1347	adcq	$0,%r11
1348	addq	%r12,%r10
1349	adcq	$0,%r11
1350	movq	%r10,-8(%rdi,%rcx,1)
1351	jmp	.Lsqr4x_1st
1352
1353.align	32
1354.Lsqr4x_1st:
1355	movq	(%rsi,%rcx,1),%rbx
1356	mulq	%r15
1357	addq	%rax,%r13
1358	movq	%rbx,%rax
1359	movq	%rdx,%r12
1360	adcq	$0,%r12
1361
1362	mulq	%r14
1363	addq	%rax,%r11
1364	movq	%rbx,%rax
1365	movq	8(%rsi,%rcx,1),%rbx
1366	movq	%rdx,%r10
1367	adcq	$0,%r10
1368	addq	%r13,%r11
1369	adcq	$0,%r10
1370
1371
1372	mulq	%r15
1373	addq	%rax,%r12
1374	movq	%rbx,%rax
1375	movq	%r11,(%rdi,%rcx,1)
1376	movq	%rdx,%r13
1377	adcq	$0,%r13
1378
1379	mulq	%r14
1380	addq	%rax,%r10
1381	movq	%rbx,%rax
1382	movq	16(%rsi,%rcx,1),%rbx
1383	movq	%rdx,%r11
1384	adcq	$0,%r11
1385	addq	%r12,%r10
1386	adcq	$0,%r11
1387
1388	mulq	%r15
1389	addq	%rax,%r13
1390	movq	%rbx,%rax
1391	movq	%r10,8(%rdi,%rcx,1)
1392	movq	%rdx,%r12
1393	adcq	$0,%r12
1394
1395	mulq	%r14
1396	addq	%rax,%r11
1397	movq	%rbx,%rax
1398	movq	24(%rsi,%rcx,1),%rbx
1399	movq	%rdx,%r10
1400	adcq	$0,%r10
1401	addq	%r13,%r11
1402	adcq	$0,%r10
1403
1404
1405	mulq	%r15
1406	addq	%rax,%r12
1407	movq	%rbx,%rax
1408	movq	%r11,16(%rdi,%rcx,1)
1409	movq	%rdx,%r13
1410	adcq	$0,%r13
1411	leaq	32(%rcx),%rcx
1412
1413	mulq	%r14
1414	addq	%rax,%r10
1415	movq	%rbx,%rax
1416	movq	%rdx,%r11
1417	adcq	$0,%r11
1418	addq	%r12,%r10
1419	adcq	$0,%r11
1420	movq	%r10,-8(%rdi,%rcx,1)
1421
1422	cmpq	$0,%rcx
1423	jne	.Lsqr4x_1st
1424
1425	mulq	%r15
1426	addq	%rax,%r13
1427	leaq	16(%rbp),%rbp
1428	adcq	$0,%rdx
1429	addq	%r11,%r13
1430	adcq	$0,%rdx
1431
1432	movq	%r13,(%rdi)
1433	movq	%rdx,%r12
1434	movq	%rdx,8(%rdi)
1435	jmp	.Lsqr4x_outer
1436
1437.align	32
1438.Lsqr4x_outer:
1439	movq	-32(%rsi,%rbp,1),%r14
1440	leaq	48+8(%rsp,%r9,2),%rdi
1441	movq	-24(%rsi,%rbp,1),%rax
1442	leaq	-32(%rdi,%rbp,1),%rdi
1443	movq	-16(%rsi,%rbp,1),%rbx
1444	movq	%rax,%r15
1445
1446	mulq	%r14
1447	movq	-24(%rdi,%rbp,1),%r10
1448	addq	%rax,%r10
1449	movq	%rbx,%rax
1450	adcq	$0,%rdx
1451	movq	%r10,-24(%rdi,%rbp,1)
1452	movq	%rdx,%r11
1453
1454	mulq	%r14
1455	addq	%rax,%r11
1456	movq	%rbx,%rax
1457	adcq	$0,%rdx
1458	addq	-16(%rdi,%rbp,1),%r11
1459	movq	%rdx,%r10
1460	adcq	$0,%r10
1461	movq	%r11,-16(%rdi,%rbp,1)
1462
1463	xorq	%r12,%r12
1464
1465	movq	-8(%rsi,%rbp,1),%rbx
1466	mulq	%r15
1467	addq	%rax,%r12
1468	movq	%rbx,%rax
1469	adcq	$0,%rdx
1470	addq	-8(%rdi,%rbp,1),%r12
1471	movq	%rdx,%r13
1472	adcq	$0,%r13
1473
1474	mulq	%r14
1475	addq	%rax,%r10
1476	movq	%rbx,%rax
1477	adcq	$0,%rdx
1478	addq	%r12,%r10
1479	movq	%rdx,%r11
1480	adcq	$0,%r11
1481	movq	%r10,-8(%rdi,%rbp,1)
1482
1483	leaq	(%rbp),%rcx
1484	jmp	.Lsqr4x_inner
1485
1486.align	32
1487.Lsqr4x_inner:
1488	movq	(%rsi,%rcx,1),%rbx
1489	mulq	%r15
1490	addq	%rax,%r13
1491	movq	%rbx,%rax
1492	movq	%rdx,%r12
1493	adcq	$0,%r12
1494	addq	(%rdi,%rcx,1),%r13
1495	adcq	$0,%r12
1496
1497.byte	0x67
1498	mulq	%r14
1499	addq	%rax,%r11
1500	movq	%rbx,%rax
1501	movq	8(%rsi,%rcx,1),%rbx
1502	movq	%rdx,%r10
1503	adcq	$0,%r10
1504	addq	%r13,%r11
1505	adcq	$0,%r10
1506
1507	mulq	%r15
1508	addq	%rax,%r12
1509	movq	%r11,(%rdi,%rcx,1)
1510	movq	%rbx,%rax
1511	movq	%rdx,%r13
1512	adcq	$0,%r13
1513	addq	8(%rdi,%rcx,1),%r12
1514	leaq	16(%rcx),%rcx
1515	adcq	$0,%r13
1516
1517	mulq	%r14
1518	addq	%rax,%r10
1519	movq	%rbx,%rax
1520	adcq	$0,%rdx
1521	addq	%r12,%r10
1522	movq	%rdx,%r11
1523	adcq	$0,%r11
1524	movq	%r10,-8(%rdi,%rcx,1)
1525
1526	cmpq	$0,%rcx
1527	jne	.Lsqr4x_inner
1528
1529.byte	0x67
1530	mulq	%r15
1531	addq	%rax,%r13
1532	adcq	$0,%rdx
1533	addq	%r11,%r13
1534	adcq	$0,%rdx
1535
1536	movq	%r13,(%rdi)
1537	movq	%rdx,%r12
1538	movq	%rdx,8(%rdi)
1539
1540	addq	$16,%rbp
1541	jnz	.Lsqr4x_outer
1542
1543
1544	movq	-32(%rsi),%r14
1545	leaq	48+8(%rsp,%r9,2),%rdi
1546	movq	-24(%rsi),%rax
1547	leaq	-32(%rdi,%rbp,1),%rdi
1548	movq	-16(%rsi),%rbx
1549	movq	%rax,%r15
1550
1551	mulq	%r14
1552	addq	%rax,%r10
1553	movq	%rbx,%rax
1554	movq	%rdx,%r11
1555	adcq	$0,%r11
1556
1557	mulq	%r14
1558	addq	%rax,%r11
1559	movq	%rbx,%rax
1560	movq	%r10,-24(%rdi)
1561	movq	%rdx,%r10
1562	adcq	$0,%r10
1563	addq	%r13,%r11
1564	movq	-8(%rsi),%rbx
1565	adcq	$0,%r10
1566
1567	mulq	%r15
1568	addq	%rax,%r12
1569	movq	%rbx,%rax
1570	movq	%r11,-16(%rdi)
1571	movq	%rdx,%r13
1572	adcq	$0,%r13
1573
1574	mulq	%r14
1575	addq	%rax,%r10
1576	movq	%rbx,%rax
1577	movq	%rdx,%r11
1578	adcq	$0,%r11
1579	addq	%r12,%r10
1580	adcq	$0,%r11
1581	movq	%r10,-8(%rdi)
1582
1583	mulq	%r15
1584	addq	%rax,%r13
1585	movq	-16(%rsi),%rax
1586	adcq	$0,%rdx
1587	addq	%r11,%r13
1588	adcq	$0,%rdx
1589
1590	movq	%r13,(%rdi)
1591	movq	%rdx,%r12
1592	movq	%rdx,8(%rdi)
1593
1594	mulq	%rbx
1595	addq	$16,%rbp
1596	xorq	%r14,%r14
1597	subq	%r9,%rbp
1598	xorq	%r15,%r15
1599
1600	addq	%r12,%rax
1601	adcq	$0,%rdx
1602	movq	%rax,8(%rdi)
1603	movq	%rdx,16(%rdi)
1604	movq	%r15,24(%rdi)
1605
1606	movq	-16(%rsi,%rbp,1),%rax
1607	leaq	48+8(%rsp),%rdi
1608	xorq	%r10,%r10
1609	movq	8(%rdi),%r11
1610
1611	leaq	(%r14,%r10,2),%r12
1612	shrq	$63,%r10
1613	leaq	(%rcx,%r11,2),%r13
1614	shrq	$63,%r11
1615	orq	%r10,%r13
1616	movq	16(%rdi),%r10
1617	movq	%r11,%r14
1618	mulq	%rax
1619	negq	%r15
1620	movq	24(%rdi),%r11
1621	adcq	%rax,%r12
1622	movq	-8(%rsi,%rbp,1),%rax
1623	movq	%r12,(%rdi)
1624	adcq	%rdx,%r13
1625
1626	leaq	(%r14,%r10,2),%rbx
1627	movq	%r13,8(%rdi)
1628	sbbq	%r15,%r15
1629	shrq	$63,%r10
1630	leaq	(%rcx,%r11,2),%r8
1631	shrq	$63,%r11
1632	orq	%r10,%r8
1633	movq	32(%rdi),%r10
1634	movq	%r11,%r14
1635	mulq	%rax
1636	negq	%r15
1637	movq	40(%rdi),%r11
1638	adcq	%rax,%rbx
1639	movq	0(%rsi,%rbp,1),%rax
1640	movq	%rbx,16(%rdi)
1641	adcq	%rdx,%r8
1642	leaq	16(%rbp),%rbp
1643	movq	%r8,24(%rdi)
1644	sbbq	%r15,%r15
1645	leaq	64(%rdi),%rdi
1646	jmp	.Lsqr4x_shift_n_add
1647
1648.align	32
1649.Lsqr4x_shift_n_add:
1650	leaq	(%r14,%r10,2),%r12
1651	shrq	$63,%r10
1652	leaq	(%rcx,%r11,2),%r13
1653	shrq	$63,%r11
1654	orq	%r10,%r13
1655	movq	-16(%rdi),%r10
1656	movq	%r11,%r14
1657	mulq	%rax
1658	negq	%r15
1659	movq	-8(%rdi),%r11
1660	adcq	%rax,%r12
1661	movq	-8(%rsi,%rbp,1),%rax
1662	movq	%r12,-32(%rdi)
1663	adcq	%rdx,%r13
1664
1665	leaq	(%r14,%r10,2),%rbx
1666	movq	%r13,-24(%rdi)
1667	sbbq	%r15,%r15
1668	shrq	$63,%r10
1669	leaq	(%rcx,%r11,2),%r8
1670	shrq	$63,%r11
1671	orq	%r10,%r8
1672	movq	0(%rdi),%r10
1673	movq	%r11,%r14
1674	mulq	%rax
1675	negq	%r15
1676	movq	8(%rdi),%r11
1677	adcq	%rax,%rbx
1678	movq	0(%rsi,%rbp,1),%rax
1679	movq	%rbx,-16(%rdi)
1680	adcq	%rdx,%r8
1681
1682	leaq	(%r14,%r10,2),%r12
1683	movq	%r8,-8(%rdi)
1684	sbbq	%r15,%r15
1685	shrq	$63,%r10
1686	leaq	(%rcx,%r11,2),%r13
1687	shrq	$63,%r11
1688	orq	%r10,%r13
1689	movq	16(%rdi),%r10
1690	movq	%r11,%r14
1691	mulq	%rax
1692	negq	%r15
1693	movq	24(%rdi),%r11
1694	adcq	%rax,%r12
1695	movq	8(%rsi,%rbp,1),%rax
1696	movq	%r12,0(%rdi)
1697	adcq	%rdx,%r13
1698
1699	leaq	(%r14,%r10,2),%rbx
1700	movq	%r13,8(%rdi)
1701	sbbq	%r15,%r15
1702	shrq	$63,%r10
1703	leaq	(%rcx,%r11,2),%r8
1704	shrq	$63,%r11
1705	orq	%r10,%r8
1706	movq	32(%rdi),%r10
1707	movq	%r11,%r14
1708	mulq	%rax
1709	negq	%r15
1710	movq	40(%rdi),%r11
1711	adcq	%rax,%rbx
1712	movq	16(%rsi,%rbp,1),%rax
1713	movq	%rbx,16(%rdi)
1714	adcq	%rdx,%r8
1715	movq	%r8,24(%rdi)
1716	sbbq	%r15,%r15
1717	leaq	64(%rdi),%rdi
1718	addq	$32,%rbp
1719	jnz	.Lsqr4x_shift_n_add
1720
1721	leaq	(%r14,%r10,2),%r12
1722.byte	0x67
1723	shrq	$63,%r10
1724	leaq	(%rcx,%r11,2),%r13
1725	shrq	$63,%r11
1726	orq	%r10,%r13
1727	movq	-16(%rdi),%r10
1728	movq	%r11,%r14
1729	mulq	%rax
1730	negq	%r15
1731	movq	-8(%rdi),%r11
1732	adcq	%rax,%r12
1733	movq	-8(%rsi),%rax
1734	movq	%r12,-32(%rdi)
1735	adcq	%rdx,%r13
1736
1737	leaq	(%r14,%r10,2),%rbx
1738	movq	%r13,-24(%rdi)
1739	sbbq	%r15,%r15
1740	shrq	$63,%r10
1741	leaq	(%rcx,%r11,2),%r8
1742	shrq	$63,%r11
1743	orq	%r10,%r8
1744	mulq	%rax
1745	negq	%r15
1746	adcq	%rax,%rbx
1747	adcq	%rdx,%r8
1748	movq	%rbx,-16(%rdi)
1749	movq	%r8,-8(%rdi)
1750.byte	102,72,15,126,213
1751__bn_sqr8x_reduction:
1752	xorq	%rax,%rax
1753	leaq	(%r9,%rbp,1),%rcx
1754	leaq	48+8(%rsp,%r9,2),%rdx
1755	movq	%rcx,0+8(%rsp)
1756	leaq	48+8(%rsp,%r9,1),%rdi
1757	movq	%rdx,8+8(%rsp)
1758	negq	%r9
1759	jmp	.L8x_reduction_loop
1760
1761.align	32
1762.L8x_reduction_loop:
1763	leaq	(%rdi,%r9,1),%rdi
1764.byte	0x66
1765	movq	0(%rdi),%rbx
1766	movq	8(%rdi),%r9
1767	movq	16(%rdi),%r10
1768	movq	24(%rdi),%r11
1769	movq	32(%rdi),%r12
1770	movq	40(%rdi),%r13
1771	movq	48(%rdi),%r14
1772	movq	56(%rdi),%r15
1773	movq	%rax,(%rdx)
1774	leaq	64(%rdi),%rdi
1775
1776.byte	0x67
1777	movq	%rbx,%r8
1778	imulq	32+8(%rsp),%rbx
1779	movq	0(%rbp),%rax
1780	movl	$8,%ecx
1781	jmp	.L8x_reduce
1782
1783.align	32
1784.L8x_reduce:
1785	mulq	%rbx
1786	movq	8(%rbp),%rax
1787	negq	%r8
1788	movq	%rdx,%r8
1789	adcq	$0,%r8
1790
1791	mulq	%rbx
1792	addq	%rax,%r9
1793	movq	16(%rbp),%rax
1794	adcq	$0,%rdx
1795	addq	%r9,%r8
1796	movq	%rbx,48-8+8(%rsp,%rcx,8)
1797	movq	%rdx,%r9
1798	adcq	$0,%r9
1799
1800	mulq	%rbx
1801	addq	%rax,%r10
1802	movq	24(%rbp),%rax
1803	adcq	$0,%rdx
1804	addq	%r10,%r9
1805	movq	32+8(%rsp),%rsi
1806	movq	%rdx,%r10
1807	adcq	$0,%r10
1808
1809	mulq	%rbx
1810	addq	%rax,%r11
1811	movq	32(%rbp),%rax
1812	adcq	$0,%rdx
1813	imulq	%r8,%rsi
1814	addq	%r11,%r10
1815	movq	%rdx,%r11
1816	adcq	$0,%r11
1817
1818	mulq	%rbx
1819	addq	%rax,%r12
1820	movq	40(%rbp),%rax
1821	adcq	$0,%rdx
1822	addq	%r12,%r11
1823	movq	%rdx,%r12
1824	adcq	$0,%r12
1825
1826	mulq	%rbx
1827	addq	%rax,%r13
1828	movq	48(%rbp),%rax
1829	adcq	$0,%rdx
1830	addq	%r13,%r12
1831	movq	%rdx,%r13
1832	adcq	$0,%r13
1833
1834	mulq	%rbx
1835	addq	%rax,%r14
1836	movq	56(%rbp),%rax
1837	adcq	$0,%rdx
1838	addq	%r14,%r13
1839	movq	%rdx,%r14
1840	adcq	$0,%r14
1841
1842	mulq	%rbx
1843	movq	%rsi,%rbx
1844	addq	%rax,%r15
1845	movq	0(%rbp),%rax
1846	adcq	$0,%rdx
1847	addq	%r15,%r14
1848	movq	%rdx,%r15
1849	adcq	$0,%r15
1850
1851	decl	%ecx
1852	jnz	.L8x_reduce
1853
1854	leaq	64(%rbp),%rbp
1855	xorq	%rax,%rax
1856	movq	8+8(%rsp),%rdx
1857	cmpq	0+8(%rsp),%rbp
1858	jae	.L8x_no_tail
1859
1860.byte	0x66
1861	addq	0(%rdi),%r8
1862	adcq	8(%rdi),%r9
1863	adcq	16(%rdi),%r10
1864	adcq	24(%rdi),%r11
1865	adcq	32(%rdi),%r12
1866	adcq	40(%rdi),%r13
1867	adcq	48(%rdi),%r14
1868	adcq	56(%rdi),%r15
1869	sbbq	%rsi,%rsi
1870
1871	movq	48+56+8(%rsp),%rbx
1872	movl	$8,%ecx
1873	movq	0(%rbp),%rax
1874	jmp	.L8x_tail
1875
1876.align	32
1877.L8x_tail:
1878	mulq	%rbx
1879	addq	%rax,%r8
1880	movq	8(%rbp),%rax
1881	movq	%r8,(%rdi)
1882	movq	%rdx,%r8
1883	adcq	$0,%r8
1884
1885	mulq	%rbx
1886	addq	%rax,%r9
1887	movq	16(%rbp),%rax
1888	adcq	$0,%rdx
1889	addq	%r9,%r8
1890	leaq	8(%rdi),%rdi
1891	movq	%rdx,%r9
1892	adcq	$0,%r9
1893
1894	mulq	%rbx
1895	addq	%rax,%r10
1896	movq	24(%rbp),%rax
1897	adcq	$0,%rdx
1898	addq	%r10,%r9
1899	movq	%rdx,%r10
1900	adcq	$0,%r10
1901
1902	mulq	%rbx
1903	addq	%rax,%r11
1904	movq	32(%rbp),%rax
1905	adcq	$0,%rdx
1906	addq	%r11,%r10
1907	movq	%rdx,%r11
1908	adcq	$0,%r11
1909
1910	mulq	%rbx
1911	addq	%rax,%r12
1912	movq	40(%rbp),%rax
1913	adcq	$0,%rdx
1914	addq	%r12,%r11
1915	movq	%rdx,%r12
1916	adcq	$0,%r12
1917
1918	mulq	%rbx
1919	addq	%rax,%r13
1920	movq	48(%rbp),%rax
1921	adcq	$0,%rdx
1922	addq	%r13,%r12
1923	movq	%rdx,%r13
1924	adcq	$0,%r13
1925
1926	mulq	%rbx
1927	addq	%rax,%r14
1928	movq	56(%rbp),%rax
1929	adcq	$0,%rdx
1930	addq	%r14,%r13
1931	movq	%rdx,%r14
1932	adcq	$0,%r14
1933
1934	mulq	%rbx
1935	movq	48-16+8(%rsp,%rcx,8),%rbx
1936	addq	%rax,%r15
1937	adcq	$0,%rdx
1938	addq	%r15,%r14
1939	movq	0(%rbp),%rax
1940	movq	%rdx,%r15
1941	adcq	$0,%r15
1942
1943	decl	%ecx
1944	jnz	.L8x_tail
1945
1946	leaq	64(%rbp),%rbp
1947	movq	8+8(%rsp),%rdx
1948	cmpq	0+8(%rsp),%rbp
1949	jae	.L8x_tail_done
1950
1951	movq	48+56+8(%rsp),%rbx
1952	negq	%rsi
1953	movq	0(%rbp),%rax
1954	adcq	0(%rdi),%r8
1955	adcq	8(%rdi),%r9
1956	adcq	16(%rdi),%r10
1957	adcq	24(%rdi),%r11
1958	adcq	32(%rdi),%r12
1959	adcq	40(%rdi),%r13
1960	adcq	48(%rdi),%r14
1961	adcq	56(%rdi),%r15
1962	sbbq	%rsi,%rsi
1963
1964	movl	$8,%ecx
1965	jmp	.L8x_tail
1966
1967.align	32
1968.L8x_tail_done:
1969	xorq	%rax,%rax
1970	addq	(%rdx),%r8
1971	adcq	$0,%r9
1972	adcq	$0,%r10
1973	adcq	$0,%r11
1974	adcq	$0,%r12
1975	adcq	$0,%r13
1976	adcq	$0,%r14
1977	adcq	$0,%r15
1978	adcq	$0,%rax
1979
1980	negq	%rsi
1981.L8x_no_tail:
1982	adcq	0(%rdi),%r8
1983	adcq	8(%rdi),%r9
1984	adcq	16(%rdi),%r10
1985	adcq	24(%rdi),%r11
1986	adcq	32(%rdi),%r12
1987	adcq	40(%rdi),%r13
1988	adcq	48(%rdi),%r14
1989	adcq	56(%rdi),%r15
1990	adcq	$0,%rax
1991	movq	-8(%rbp),%rcx
1992	xorq	%rsi,%rsi
1993
1994.byte	102,72,15,126,213
1995
1996	movq	%r8,0(%rdi)
1997	movq	%r9,8(%rdi)
1998.byte	102,73,15,126,217
1999	movq	%r10,16(%rdi)
2000	movq	%r11,24(%rdi)
2001	movq	%r12,32(%rdi)
2002	movq	%r13,40(%rdi)
2003	movq	%r14,48(%rdi)
2004	movq	%r15,56(%rdi)
2005	leaq	64(%rdi),%rdi
2006
2007	cmpq	%rdx,%rdi
2008	jb	.L8x_reduction_loop
2009	.byte	0xf3,0xc3
2010.cfi_endproc
2011.size	bn_sqr8x_internal,.-bn_sqr8x_internal
.type	__bn_post4x_internal,@function
.align	32
/*
 * __bn_post4x_internal - final conditional subtraction for the 4x paths.
 *
 * Computes rp[] = tp[] - (np[] & mask) in constant time, 4 limbs per
 * iteration, where mask (derived from %rax below) selects whether the
 * modulus is subtracted.  Entry state is established by the callers
 * elsewhere in this file (bn_from_mont8x etc.):
 *   %rbp = np, %rdi = tp, %r9 = num*8, %rax = borrow/select flag,
 *   %xmm1 = saved rp (restored into %rdi/%rsi via the movq's below).
 * NOTE(review): roles above inferred from this block and its callers in
 * the same generated file -- confirm against the perlasm source.
 */
__bn_post4x_internal:
.cfi_startproc
	movq	0(%rbp),%r12		/* np[0] */
	leaq	(%rdi,%r9,1),%rbx	/* %rbx = tp + num (source pointer) */
	movq	%r9,%rcx
.byte	102,72,15,126,207		/* movq %xmm1,%rdi (output pointer) */
	negq	%rax			/* turn flag into all-ones/zero mask */
.byte	102,72,15,126,206		/* movq %xmm1,%rsi */
	sarq	$3+2,%rcx		/* %rcx = -(num/32): loop counter */
	decq	%r12			/* np[0]-1; with notq below gives -np[0],
					   supplying the +1 of two's complement */
	xorq	%r10,%r10		/* clear the borrow accumulator */
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	/* load next 4 limbs of the modulus */
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	/* rX = ~np[i] & mask; adding tp[i] + carry then equals tp - np
	   when mask is all-ones, or tp + 0 when mask is zero */
	notq	%r12
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10			/* restore CF from saved borrow */
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10		/* save borrow for next iteration */
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx			/* counter runs negative toward 0 */
	jnz	.Lsqr4x_sub

	movq	%r9,%r10		/* restore num*8 for the caller */
	negq	%r9
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	__bn_post4x_internal,.-__bn_post4x_internal
.globl	bn_from_montgomery
.hidden bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
/*
 * bn_from_montgomery - dispatcher for Montgomery-to-ordinary conversion.
 * The optimized path (bn_from_mont8x) only handles limb counts that are
 * a multiple of 8; otherwise return 0 so the caller takes a fallback.
 */
bn_from_montgomery:
.cfi_startproc
	testl	$7,%r9d			/* num % 8 == 0 ? */
	jz	bn_from_mont8x		/* tail-call the 8x implementation */
	xorl	%eax,%eax		/* return 0: size not supported here */
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	bn_from_montgomery,.-bn_from_montgomery
2080
.type	bn_from_mont8x,@function
.align	32
/*
 * bn_from_mont8x - convert from Montgomery representation, num % 8 == 0.
 *
 * Copies the input into a freshly allocated stack frame, then runs one
 * Montgomery reduction (effectively multiplying by 1) using either the
 * ADX/BMI path (__bn_sqrx8x_reduction) or the generic path
 * (__bn_sqr8x_reduction), finishing with the conditional subtraction.
 * The scratch area is wiped before returning.  Returns 1 in %rax.
 * Arguments follow the mont5 convention: %rdi=rp, %rsi=ap, %rdx=bp(table),
 * %rcx=np, %r8=&n0, %r9=num -- NOTE(review): inferred from sibling
 * functions in this generated file; confirm against the perlasm source.
 */
bn_from_mont8x:
.cfi_startproc
.byte	0x67				/* addr-size prefix: alignment padding */
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lfrom_prologue:

	shll	$3,%r9d			/* num *= 8: byte count of one vector */
	leaq	(%r9,%r9,2),%r10	/* %r10 = 3*num bytes */
	negq	%r9
	movq	(%r8),%r8		/* load n0 value */

	/*
	 * Pick a stack frame of 2*num+320 bytes whose 4K offset differs
	 * from %rdi, so frame and output do not collide in cache banks.
	 */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp		/* 64-byte align the new stack */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

.Lfrom_page_walk:
	/* touch each page when growing the stack, for guard pages */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	/* frame layout: +32 n0, +40 saved %rsp, +48.. temporary vector */
	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
.Lmul_by_1:
	/* copy input a[] into tp[] and zero the upper half of tp[] */
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	/* leaq 64(%rsi),%rsi */
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207		/* movq %rdi,%xmm1: stash rp */
.byte	102,72,15,110,209		/* movq %rcx,%xmm2: stash np */
.byte	0x67
	movq	%rcx,%rbp
.byte	102,73,15,110,218		/* movq %r10,%xmm3: stash num */
	leaq	OPENSSL_ia32cap_P(%rip),%r11
	movl	8(%r11),%r11d		/* capability word 2 */
	andl	$0x80108,%r11d		/* feature bits 3/8/19 (BMI1/BMI2/ADX) */
	cmpl	$0x80108,%r11d
	jne	.Lfrom_mont_nox

	/* MULX/ADCX/ADOX-capable CPU: use the x-suffixed routines */
	leaq	(%rax,%r9,1),%rdi
	call	__bn_sqrx8x_reduction
	call	__bn_postx4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_nox:
	/* generic mul/adc path */
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_zero:
	/* wipe the on-stack temporary before releasing the frame */
	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax			/* success */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lfrom_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	bn_from_mont8x,.-bn_from_mont8x
.type	bn_mulx4x_mont_gather5,@function
.align	32
/*
 * bn_mulx4x_mont_gather5 - MULX/ADX wrapper around mulx4x_internal.
 *
 * Sets up a 64-byte-aligned stack frame (sized and offset so it does not
 * alias the output modulo 4K), stores n0 and the saved stack pointer in
 * the frame, then delegates all arithmetic to mulx4x_internal.
 * Returns 1 in %rax.
 */
bn_mulx4x_mont_gather5:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

	shll	$3,%r9d			/* num in bytes */
	leaq	(%r9,%r9,2),%r10	/* 3*num */
	negq	%r9
	movq	(%r8),%r8		/* n0 value */

	/* choose frame of 2*num+320 bytes, avoiding 4K aliasing with rp */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmulx4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmulx4xsp_done:
	andq	$-64,%rbp		/* 64-byte alignment */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.Lmulx4x_page_walk:
	/* probe each new stack page */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	/* frame: +32 n0, +40 saved %rsp; rest used by mulx4x_internal */
	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmulx4x_body:
	call	mulx4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			/* success */

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2359
.type	mulx4x_internal,@function
.align	32
/*
 * mulx4x_internal - MULX/ADCX/ADOX Montgomery multiply-and-gather core.
 *
 * Processes 4 limbs of a[] per step with two interleaved carry chains
 * (CF via adcx, OF via adox).  The b[] operand is a table of 32 entries
 * ("gather5" powers); each outer iteration selects the next entry in
 * constant time by AND/OR-ing all 32 candidates against precomputed
 * equality masks instead of indexing memory with a secret value.
 * Falls through into the .Lsqrx4x_sub_entry tail (defined later in this
 * file) for the final conditional subtraction.
 * Do not reorder instructions: adcx/adox deliberately share the flags
 * register, and the stray 0x66/0x67 prefix bytes are alignment padding
 * from the generator.
 */
mulx4x_internal:
.cfi_startproc
	movq	%r9,8(%rsp)		/* save num (bytes) */
	movq	%r9,%r10
	negq	%r9
	shlq	$5,%r9
	negq	%r10
	leaq	128(%rdx,%r9,1),%r13	/* end of the power table */
	shrq	$5+5,%r9
	movd	8(%rax),%xmm5		/* requested table index */
	subq	$1,%r9
	leaq	.Linc(%rip),%rax	/* {0,1},{2,2} increment constants */
	movq	%r13,16+8(%rsp)		/* table end sentinel */
	movq	%r9,24+8(%rsp)		/* inner-loop trip count */
	movq	%rdi,56+8(%rsp)		/* saved rp */
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r10,1),%r10	/* mask scratch area */
	leaq	128(%rdx),%rdi		/* %rdi walks the power table */

	/*
	 * Build 16 16-byte masks at 112..352(%r10): mask[i] is all-ones
	 * iff i equals the secret index in %xmm5 (pcmpeqd), all-zero
	 * otherwise.  %xmm0/%xmm1 count 0,1,2,... via repeated paddd.
	 */
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67
	movdqa	%xmm1,%xmm2
.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	/* constant-time gather of b[0]: AND every table entry with its
	   mask and OR everything together; only the match survives */
	pand	64(%rdi),%xmm0
	pand	80(%rdi),%xmm1
	pand	96(%rdi),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%rdi),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%rdi),%xmm4
	movdqa	-112(%rdi),%xmm5
	movdqa	-96(%rdi),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%rdi),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%rdi),%xmm4
	movdqa	-48(%rdi),%xmm5
	movdqa	-32(%rdi),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%rdi),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%rdi),%xmm4
	movdqa	16(%rdi),%xmm5
	movdqa	32(%rdi),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%rdi),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	pxor	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1	/* fold high qword onto low */
	por	%xmm1,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194		/* movq %xmm0,%rdx: gathered b word */
	leaq	64+32+8(%rsp),%rbx

	/* first 4-limb column: t[] = a[0..3] * b[0] */
	movq	%rdx,%r9
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r12
	addq	%rax,%r11
	mulxq	16(%rsi),%rax,%r13
	adcq	%rax,%r12
	adcq	$0,%r13
	mulxq	24(%rsi),%rax,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		/* m = t[0] * n0 */
	xorq	%rbp,%rbp		/* %rbp = 0 also clears CF/OF */
	movq	%r8,%rdx

	movq	%rdi,8+8(%rsp)		/* save table cursor */

	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	/* add m * n[0..3], dropping the known-zero low limb */
	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	24+8(%rsp),%rdi		/* inner-loop counter */
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		/* restore b word */
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)
	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	/* first pass over the remaining limbs of a[] and n[] */
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67
	movq	%r8,%rdx		/* switch to multiplier m */
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		/* back to b word */
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	/* close the first column's carry chain */
	movq	8(%rsp),%rax		/* num */
	adcq	%rbp,%r15
	leaq	(%rsi,%rax,1),%rsi	/* rewind a[] */
	addq	%r15,%r14
	movq	8+8(%rsp),%rdi		/* table cursor */
	adcq	%rbp,%rbp		/* top carry into %rbp */
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	/* constant-time gather of the next b word (same mask trick) */
	leaq	16-256(%rbx),%r10
	pxor	%xmm4,%xmm4
.byte	0x67,0x67
	pxor	%xmm5,%xmm5
	movdqa	-128(%rdi),%xmm0
	movdqa	-112(%rdi),%xmm1
	movdqa	-96(%rdi),%xmm2
	pand	256(%r10),%xmm0
	movdqa	-80(%rdi),%xmm3
	pand	272(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	288(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	304(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%rdi),%xmm0
	movdqa	-48(%rdi),%xmm1
	movdqa	-32(%rdi),%xmm2
	pand	320(%r10),%xmm0
	movdqa	-16(%rdi),%xmm3
	pand	336(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	352(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	368(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%rdi),%xmm0
	movdqa	16(%rdi),%xmm1
	movdqa	32(%rdi),%xmm2
	pand	384(%r10),%xmm0
	movdqa	48(%rdi),%xmm3
	pand	400(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	416(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	432(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%rdi),%xmm0
	movdqa	80(%rdi),%xmm1
	movdqa	96(%rdi),%xmm2
	pand	448(%r10),%xmm0
	movdqa	112(%rdi),%xmm3
	pand	464(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	480(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	496(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194		/* movq %xmm0,%rdx: next b word */

	movq	%rbp,(%rbx)		/* store previous top carry */
	leaq	32(%rbx,%rax,1),%rbx	/* rewind t[] */
	mulxq	0(%rsi),%r8,%r11
	xorq	%rbp,%rbp		/* zero + clear CF/OF */
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8		/* accumulate previous column */
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	mulxq	24(%rsi),%rdx,%r14
	adoxq	-16(%rbx),%r12
	adcxq	%rdx,%r13
	leaq	(%rcx,%rax,1),%rcx	/* rewind n[] */
	leaq	32(%rsi),%rsi
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	adoxq	%rbp,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		/* m = t[0] * n0 */

	movq	%r8,%rdx
	xorq	%rbp,%rbp
	movq	%rdi,8+8(%rsp)		/* save table cursor */

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	24+8(%rsp),%rdi		/* inner-loop counter */
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-24(%rbx)
	adoxq	%rbp,%r15
	movq	%r12,-16(%rbx)
	leaq	32(%rcx),%rcx
	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	/* t[] += a[] * b_word + m * n[], 4 limbs per pass */
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx		/* switch to multiplier m */
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	movq	%r11,-32(%rbx)
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		/* back to b word */
	leaq	32(%rcx),%rcx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	/* close this column's carry chain */
	movq	0+8(%rsp),%rax		/* num */
	adcq	%rbp,%r15
	subq	0(%rbx),%rdi		/* %rdi was 0: sets CF from t[] */
	movq	8+8(%rsp),%rdi		/* table cursor */
	movq	16+8(%rsp),%r10		/* table end sentinel */
	adcq	%r15,%r14
	leaq	(%rsi,%rax,1),%rsi	/* rewind a[] */
	adcq	%rbp,%rbp		/* top carry */
	movq	%r14,-8(%rbx)

	cmpq	%r10,%rdi		/* all table entries consumed? */
	jb	.Lmulx4x_outer

	/* epilogue: set up the conditional-subtraction tail */
	movq	-8(%rcx),%r10		/* top limb of n[] */
	movq	%rbp,%r8
	movq	(%rcx,%rax,1),%r12
	leaq	(%rcx,%rax,1),%rbp	/* rewind n[] */
	movq	%rax,%rcx
	leaq	(%rbx,%rax,1),%rdi	/* rewind t[] */
	xorl	%eax,%eax
	xorq	%r15,%r15
	subq	%r14,%r10		/* compare top limbs */
	adcq	%r15,%r15
	orq	%r15,%r8
	sarq	$3+2,%rcx		/* loop count for the sub tail */
	subq	%r8,%rax		/* %rax = select mask for tail */
	movq	56+8(%rsp),%rdx		/* restore rp */
	decq	%r12			/* see __bn_post4x_internal note */
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry	/* defined later in this file */
.cfi_endproc
.size	mulx4x_internal,.-mulx4x_internal
.type	bn_powerx5,@function
.align	32
/*
 * bn_powerx5 - MULX/ADX variant of bn_power5: five Montgomery squarings
 * followed by one gathered Montgomery multiplication (a^32 step of a
 * fixed-window exponentiation).  Allocates the same anti-aliasing stack
 * frame as the other entry points, stashes rp/np/num/index in xmm
 * registers for the callees, then calls __bn_sqrx8x_internal /
 * __bn_postx4x_internal five times and finishes with mulx4x_internal.
 * Returns 1 in %rax.
 */
bn_powerx5:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lpowerx5_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lpowerx5_prologue:

	shll	$3,%r9d			/* num in bytes */
	leaq	(%r9,%r9,2),%r10	/* 3*num */
	negq	%r9
	movq	(%r8),%r8		/* n0 value */

	/* choose frame of 2*num+320 bytes, avoiding 4K aliasing with rp */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwrx_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwrx_sp_done:
	andq	$-64,%rbp		/* 64-byte alignment */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
	jmp	.Lpwrx_page_walk_done

.Lpwrx_page_walk:
	/* probe each new stack page */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
.Lpwrx_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	/* stash values the callees expect in xmm registers */
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		/* movq %rdi,%xmm1: rp */
.byte	102,72,15,110,209		/* movq %rcx,%xmm2: np */
.byte	102,73,15,110,218		/* movq %r10,%xmm3: num */
.byte	102,72,15,110,226		/* movq %rdx,%xmm4: bp/index */
	movq	%r8,32(%rsp)		/* frame: +32 n0, +40 saved %rsp */
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpowerx5_body:

	/* five Montgomery squarings: a -> a^32 (mod n) */
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

	movq	%r10,%r9
	movq	%rsi,%rdi
.byte	102,72,15,126,209		/* movq %xmm2,%rcx: np */
.byte	102,72,15,126,226		/* movq %xmm4,%rdx: bp/index */
	movq	40(%rsp),%rax

	call	mulx4x_internal		/* final gathered multiplication */

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			/* success */

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpowerx5_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	bn_powerx5,.-bn_powerx5
2918
2919.globl	bn_sqrx8x_internal
2920.hidden bn_sqrx8x_internal
2921.type	bn_sqrx8x_internal,@function
2922.align	32
2923bn_sqrx8x_internal:
2924__bn_sqrx8x_internal:
2925.cfi_startproc
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966	leaq	48+8(%rsp),%rdi
2967	leaq	(%rsi,%r9,1),%rbp
2968	movq	%r9,0+8(%rsp)
2969	movq	%rbp,8+8(%rsp)
2970	jmp	.Lsqr8x_zero_start
2971
2972.align	32
2973.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2974.Lsqrx8x_zero:
2975.byte	0x3e
2976	movdqa	%xmm0,0(%rdi)
2977	movdqa	%xmm0,16(%rdi)
2978	movdqa	%xmm0,32(%rdi)
2979	movdqa	%xmm0,48(%rdi)
2980.Lsqr8x_zero_start:
2981	movdqa	%xmm0,64(%rdi)
2982	movdqa	%xmm0,80(%rdi)
2983	movdqa	%xmm0,96(%rdi)
2984	movdqa	%xmm0,112(%rdi)
2985	leaq	128(%rdi),%rdi
2986	subq	$64,%r9
2987	jnz	.Lsqrx8x_zero
2988
2989	movq	0(%rsi),%rdx
2990
2991	xorq	%r10,%r10
2992	xorq	%r11,%r11
2993	xorq	%r12,%r12
2994	xorq	%r13,%r13
2995	xorq	%r14,%r14
2996	xorq	%r15,%r15
2997	leaq	48+8(%rsp),%rdi
2998	xorq	%rbp,%rbp
2999	jmp	.Lsqrx8x_outer_loop
3000
3001.align	32
3002.Lsqrx8x_outer_loop:
3003	mulxq	8(%rsi),%r8,%rax
3004	adcxq	%r9,%r8
3005	adoxq	%rax,%r10
3006	mulxq	16(%rsi),%r9,%rax
3007	adcxq	%r10,%r9
3008	adoxq	%rax,%r11
3009.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
3010	adcxq	%r11,%r10
3011	adoxq	%rax,%r12
3012.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
3013	adcxq	%r12,%r11
3014	adoxq	%rax,%r13
3015	mulxq	40(%rsi),%r12,%rax
3016	adcxq	%r13,%r12
3017	adoxq	%rax,%r14
3018	mulxq	48(%rsi),%r13,%rax
3019	adcxq	%r14,%r13
3020	adoxq	%r15,%rax
3021	mulxq	56(%rsi),%r14,%r15
3022	movq	8(%rsi),%rdx
3023	adcxq	%rax,%r14
3024	adoxq	%rbp,%r15
3025	adcq	64(%rdi),%r15
3026	movq	%r8,8(%rdi)
3027	movq	%r9,16(%rdi)
3028	sbbq	%rcx,%rcx
3029	xorq	%rbp,%rbp
3030
3031
3032	mulxq	16(%rsi),%r8,%rbx
3033	mulxq	24(%rsi),%r9,%rax
3034	adcxq	%r10,%r8
3035	adoxq	%rbx,%r9
3036	mulxq	32(%rsi),%r10,%rbx
3037	adcxq	%r11,%r9
3038	adoxq	%rax,%r10
3039.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
3040	adcxq	%r12,%r10
3041	adoxq	%rbx,%r11
3042.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
3043	adcxq	%r13,%r11
3044	adoxq	%r14,%r12
3045.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
3046	movq	16(%rsi),%rdx
3047	adcxq	%rax,%r12
3048	adoxq	%rbx,%r13
3049	adcxq	%r15,%r13
3050	adoxq	%rbp,%r14
3051	adcxq	%rbp,%r14
3052
3053	movq	%r8,24(%rdi)
3054	movq	%r9,32(%rdi)
3055
3056	mulxq	24(%rsi),%r8,%rbx
3057	mulxq	32(%rsi),%r9,%rax
3058	adcxq	%r10,%r8
3059	adoxq	%rbx,%r9
3060	mulxq	40(%rsi),%r10,%rbx
3061	adcxq	%r11,%r9
3062	adoxq	%rax,%r10
3063.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
3064	adcxq	%r12,%r10
3065	adoxq	%r13,%r11
3066.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
3067.byte	0x3e
3068	movq	24(%rsi),%rdx
3069	adcxq	%rbx,%r11
3070	adoxq	%rax,%r12
3071	adcxq	%r14,%r12
3072	movq	%r8,40(%rdi)
3073	movq	%r9,48(%rdi)
3074	mulxq	32(%rsi),%r8,%rax
3075	adoxq	%rbp,%r13
3076	adcxq	%rbp,%r13
3077
3078	mulxq	40(%rsi),%r9,%rbx
3079	adcxq	%r10,%r8
3080	adoxq	%rax,%r9
3081	mulxq	48(%rsi),%r10,%rax
3082	adcxq	%r11,%r9
3083	adoxq	%r12,%r10
3084	mulxq	56(%rsi),%r11,%r12
3085	movq	32(%rsi),%rdx
3086	movq	40(%rsi),%r14
3087	adcxq	%rbx,%r10
3088	adoxq	%rax,%r11
3089	movq	48(%rsi),%r15
3090	adcxq	%r13,%r11
3091	adoxq	%rbp,%r12
3092	adcxq	%rbp,%r12
3093
3094	movq	%r8,56(%rdi)
3095	movq	%r9,64(%rdi)
3096
3097	mulxq	%r14,%r9,%rax
3098	movq	56(%rsi),%r8
3099	adcxq	%r10,%r9
3100	mulxq	%r15,%r10,%rbx
3101	adoxq	%rax,%r10
3102	adcxq	%r11,%r10
3103	mulxq	%r8,%r11,%rax
3104	movq	%r14,%rdx
3105	adoxq	%rbx,%r11
3106	adcxq	%r12,%r11
3107
3108	adcxq	%rbp,%rax
3109
3110	mulxq	%r15,%r14,%rbx
3111	mulxq	%r8,%r12,%r13
3112	movq	%r15,%rdx
3113	leaq	64(%rsi),%rsi
3114	adcxq	%r14,%r11
3115	adoxq	%rbx,%r12
3116	adcxq	%rax,%r12
3117	adoxq	%rbp,%r13
3118
3119.byte	0x67,0x67
3120	mulxq	%r8,%r8,%r14
3121	adcxq	%r8,%r13
3122	adcxq	%rbp,%r14
3123
3124	cmpq	8+8(%rsp),%rsi
3125	je	.Lsqrx8x_outer_break
3126
3127	negq	%rcx
3128	movq	$-8,%rcx
3129	movq	%rbp,%r15
3130	movq	64(%rdi),%r8
3131	adcxq	72(%rdi),%r9
3132	adcxq	80(%rdi),%r10
3133	adcxq	88(%rdi),%r11
3134	adcq	96(%rdi),%r12
3135	adcq	104(%rdi),%r13
3136	adcq	112(%rdi),%r14
3137	adcq	120(%rdi),%r15
3138	leaq	(%rsi),%rbp
3139	leaq	128(%rdi),%rdi
3140	sbbq	%rax,%rax
3141
3142	movq	-64(%rsi),%rdx
3143	movq	%rax,16+8(%rsp)
3144	movq	%rdi,24+8(%rsp)
3145
3146
3147	xorl	%eax,%eax
3148	jmp	.Lsqrx8x_loop
3149
3150.align	32
3151.Lsqrx8x_loop:
3152	movq	%r8,%rbx
3153	mulxq	0(%rbp),%rax,%r8
3154	adcxq	%rax,%rbx
3155	adoxq	%r9,%r8
3156
3157	mulxq	8(%rbp),%rax,%r9
3158	adcxq	%rax,%r8
3159	adoxq	%r10,%r9
3160
	/*
	 * NOTE(review): tail of bn_sqrx8x_internal - the function entry and
	 * register setup are above this excerpt, so register roles are
	 * inferred from the visible code only; confirm against the prologue.
	 *
	 * The mulx/adcx/adox triples below maintain two independent carry
	 * chains (adcx touches only CF, adox only OF), so the multiply-
	 * accumulate pipeline never serializes on a single flag. %rdx is
	 * the implicit mulx multiplier; %rbp points into one operand array,
	 * %rsi into the other, %rdi into the output, %rcx is a negative
	 * word counter running up to zero.
	 */
	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

	/* Hand-encoded mulxq 32(%rbp),%rax,%r12 - the generator emits raw
	 * bytes to force a 4-byte displacement (code-size/alignment tuning). */
.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	movq	%rbx,(%rdi,%rcx,8)	/* store a finished result word */
	movl	$0,%ebx
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	/* Hand-encoded mulxq 56(%rbp),%rax,%r15 (forced 4-byte disp). */
.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
	movq	8(%rsi,%rcx,8),%rdx	/* next multiplier word */
	adcxq	%rax,%r14
	adoxq	%rbx,%r15		/* %rbx == 0: drain the OF chain */
	adcxq	%rbx,%r15		/* ... and the CF chain into %r15 */

.byte	0x67				/* addr-size prefix used as padding */
	incq	%rcx
	jnz	.Lsqrx8x_loop

	/* Inner loop done: advance to the next 64-byte (8-word) column. */
	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	cmpq	8+8(%rsp),%rbp		/* end of this pass? */
	je	.Lsqrx8x_break

	/* Fold the saved borrow (16+8(%rsp) holds 0 or -1) and add the
	 * partial results already stored at %rdi back into r8..r15. */
	subq	16+8(%rsp),%rbx
.byte	0x66				/* operand-size prefix as padding */
	movq	-64(%rsi),%rdx
	adcxq	0(%rdi),%r8
	adcxq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
.byte	0x67				/* padding */
	sbbq	%rax,%rax		/* %rax = -CF: capture carry-out */
	xorl	%ebx,%ebx
	movq	%rax,16+8(%rsp)		/* stash carry for the next pass */
	jmp	.Lsqrx8x_loop

.align	32
.Lsqrx8x_break:
	/* Flush the final carries of this outer iteration into r8..r15
	 * and either restart the outer loop at the next output chunk
	 * (pointer saved at 24+8(%rsp)) or fall through when done. */
	xorq	%rbp,%rbp
	subq	16+8(%rsp),%rbx		/* reload saved carry into CF */
	adcxq	%rbp,%r8
	movq	24+8(%rsp),%rcx
	adcxq	%rbp,%r9
	movq	0(%rsi),%rdx
	adcq	$0,%r10
	movq	%r8,0(%rdi)
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	cmpq	%rcx,%rdi
	je	.Lsqrx8x_outer_loop

	/* Store r9..r15 and reload the accumulators for the next chunk. */
	movq	%r9,8(%rdi)
	movq	8(%rcx),%r9
	movq	%r10,16(%rdi)
	movq	16(%rcx),%r10
	movq	%r11,24(%rdi)
	movq	24(%rcx),%r11
	movq	%r12,32(%rdi)
	movq	32(%rcx),%r12
	movq	%r13,40(%rdi)
	movq	40(%rcx),%r13
	movq	%r14,48(%rdi)
	movq	48(%rcx),%r14
	movq	%r15,56(%rdi)
	movq	56(%rcx),%r15
	movq	%rcx,%rdi
	jmp	.Lsqrx8x_outer_loop

.align	32
.Lsqrx8x_outer_break:
	/* Cross-product phase complete: store the trailing words, then
	 * double everything and add the squares a[i]^2 (shift-and-add). */
	movq	%r9,72(%rdi)
.byte	102,72,15,126,217		/* movq %xmm3,%rcx (hand-encoded) */
	movq	%r10,80(%rdi)
	movq	%r11,88(%rdi)
	movq	%r12,96(%rdi)
	movq	%r13,104(%rdi)
	movq	%r14,112(%rdi)
	leaq	48+8(%rsp),%rdi
	movq	(%rsi,%rcx,1),%rdx

	movq	8(%rdi),%r11
	xorq	%r10,%r10
	movq	0+8(%rsp),%r9
	adoxq	%r11,%r11		/* start doubling via the OF chain */
	movq	16(%rdi),%r12
	movq	24(%rdi),%r13


.align	32
.Lsqrx4x_shift_n_add:
	/* Per iteration: mulx %rdx,%rax,%rbx computes a[i]^2 (%rdx holds
	 * a[i]); adox reg,reg doubles the cross-product words; adcx folds
	 * the square into the doubled stream. Four words per round. */
	mulxq	%rdx,%rax,%rbx
	adoxq	%r12,%r12
	adcxq	%r10,%rax
	/* Hand-encoded movq 8(%rsi,%rcx,1),%rdx - next a[i]. */
.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
	/* Hand-encoded movq 32(%rdi),%r12. */
.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	40(%rdi),%r11
	movq	%rax,0(%rdi)
	movq	%rbx,8(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	movq	16(%rsi,%rcx,1),%rdx
	movq	48(%rdi),%r12
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	56(%rdi),%r13
	movq	%rax,16(%rdi)
	movq	%rbx,24(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r12,%r12
	adcxq	%r10,%rax
	movq	24(%rsi,%rcx,1),%rdx
	leaq	32(%rcx),%rcx
	movq	64(%rdi),%r10
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	72(%rdi),%r11
	movq	%rax,32(%rdi)
	movq	%rbx,40(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	jrcxz	.Lsqrx4x_shift_n_add_break	/* %rcx == 0: last round */
	/* Hand-encoded movq 0(%rsi,%rcx,1),%rdx. */
.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	80(%rdi),%r12
	movq	88(%rdi),%r13
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
	nop
	jmp	.Lsqrx4x_shift_n_add

.align	32
.Lsqrx4x_shift_n_add_break:
	adcxq	%r13,%rbx
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
.byte	102,72,15,126,213		/* movq %xmm2,%rbp (hand-encoded) */
__bn_sqrx8x_reduction:
	/*
	 * Montgomery reduction of the squaring result (mulx/adcx/adox
	 * flavour), 8 words per pass. Register roles as visible here:
	 *   %rbp - pointer into the modulus n[]
	 *   %rdi - pointer into the number being reduced (64-byte tubes)
	 *   %rbx - n0 (Montgomery constant), kept in frame slot 32+8(%rsp)
	 *   %rdx - current reduction multiplier m (implicit mulx operand)
	 *   %r8..%r15 - the 8-word accumulator window
	 * NOTE(review): roles inferred from the visible code; the caller's
	 * setup is outside this excerpt - confirm against the prologue.
	 */
	xorl	%eax,%eax
	movq	32+8(%rsp),%rbx		/* %rbx = n0 */
	movq	48+8(%rsp),%rdx		/* first word to annihilate */
	leaq	-64(%rbp,%r9,1),%rcx	/* sentinel: end of modulus */

	movq	%rcx,0+8(%rsp)
	movq	%rdi,8+8(%rsp)

	leaq	48+8(%rsp),%rdi
	jmp	.Lsqrx8x_reduction_loop

.align	32
.Lsqrx8x_reduction_loop:
	/* Load the next 8-word window and derive m = word * n0 mod 2^64. */
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	%rdx,%r8
	imulq	%rbx,%rdx		/* %rdx = m for this word */
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,24+8(%rsp)		/* save top-word carry */

	leaq	64(%rdi),%rdi
	xorq	%rsi,%rsi		/* %rsi = 0: carry sink, clears CF/OF */
	movq	$-8,%rcx
	jmp	.Lsqrx8x_reduce

.align	32
.Lsqrx8x_reduce:
	/* Add m * n[0..7]; low word (%rbx + low product) vanishes by
	 * construction of m; dual CF/OF carry chains as elsewhere. */
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rbx,%rax		/* dies (becomes 0 mod 2^64) */
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rbx,%r9
	adcxq	%rbx,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

	/* Hand-encoded mulxq 32(%rbp),%rbx,%r12 (forced 4-byte disp). */
.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
	movq	%rdx,%rax		/* save current m */
	movq	%r8,%rdx
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	32+8(%rsp),%rbx,%rdx	/* %rbx = next m = r8 * n0 */
	movq	%rax,%rdx		/* restore current m */
	movq	%rax,64+48+8(%rsp,%rcx,8)	/* record m for the tail pass */

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	%rbx,%rdx		/* switch to next m */
	adcxq	%rax,%r14
	adoxq	%rsi,%r15		/* drain OF chain (%rsi == 0) */
	adcxq	%rsi,%r15		/* drain CF chain */

.byte	0x67,0x67,0x67			/* padding prefixes (alignment) */
	incq	%rcx
	jnz	.Lsqrx8x_reduce

	movq	%rsi,%rax
	cmpq	0+8(%rsp),%rbp		/* reached end of modulus? */
	jae	.Lsqrx8x_no_tail

	/* Tail: propagate through the remaining modulus columns. */
	movq	48+8(%rsp),%rdx
	addq	0(%rdi),%r8
	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	adcxq	8(%rdi),%r9
	adcxq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax		/* %rax = -CF */

	xorq	%rsi,%rsi		/* clear CF/OF for the tail loop */
	movq	%rax,16+8(%rsp)		/* stash borrow */
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail:
	/* Same m*n accumulation, with m values replayed from the stack
	 * slots recorded during .Lsqrx8x_reduce. */
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

	/* Hand-encoded mulxq 32(%rbp),%rax,%r12 (forced 4-byte disp). */
.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	72+48+8(%rsp,%rcx,8),%rdx	/* replay next saved m */
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	movq	%rbx,(%rdi,%rcx,8)	/* store reduced word */
	movq	%r8,%rbx
	adcxq	%rsi,%r15

	incq	%rcx
	jnz	.Lsqrx8x_tail

	cmpq	0+8(%rsp),%rbp		/* end of modulus? */
	jae	.Lsqrx8x_tail_done

	/* More columns: fold saved borrow, absorb stored partials, loop. */
	subq	16+8(%rsp),%rsi		/* reload borrow into CF */
	movq	48+8(%rsp),%rdx
	leaq	64(%rbp),%rbp
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax
	subq	$8,%rcx

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail_done:
	/* Fold the saved top-word carry and collapse into %rax. */
	xorq	%rax,%rax
	addq	24+8(%rsp),%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	subq	16+8(%rsp),%rsi		/* final borrow into CF */
.Lsqrx8x_no_tail:
	adcq	0(%rdi),%r8
.byte	102,72,15,126,217		/* movq %xmm3,%rcx (hand-encoded) */
	adcq	8(%rdi),%r9
	movq	56(%rbp),%rsi		/* top modulus word */
.byte	102,72,15,126,213		/* movq %xmm2,%rbp (hand-encoded) */
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax			/* top-most carry of the result */

	movq	32+8(%rsp),%rbx		/* re-load n0 */
	movq	64(%rdi,%rcx,1),%rdx	/* re-load next word to annihilate */

	movq	%r8,0(%rdi)
	leaq	64(%rdi),%r8
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	64(%rdi,%rcx,1),%rdi
	cmpq	8+8(%rsp),%r8
	jb	.Lsqrx8x_reduction_loop
	.byte	0xf3,0xc3		/* rep ret (branch-predictor friendly) */
.cfi_endproc
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
.align	32
.type	__bn_postx4x_internal,@function
/*
 * Final conditional subtraction for the mulx path, 4 words per round,
 * constant time: %rax is turned into an all-ones/all-zero mask, each
 * modulus word is complemented under that mask (andn), and the adc
 * chain then computes either a - n (mask = -1) or a plain copy of a
 * (mask = 0) without any data-dependent branch.
 * NOTE(review): caller contract (which registers carry tp/np/rp/num)
 * is established outside this excerpt - inferred roles are commented
 * inline; confirm against the call sites.
 */
__bn_postx4x_internal:
.cfi_startproc
	movq	0(%rbp),%r12		/* first modulus word */
	movq	%rcx,%r10
	movq	%rcx,%r9
	negq	%rax			/* %rax -> 0 or -1 select mask */
	sarq	$3+2,%rcx		/* %rcx = -(num/32): 4-word rounds */

.byte	102,72,15,126,202		/* movq %xmm1,%rdx (hand-encoded) */
.byte	102,72,15,126,206		/* movq %xmm1,%rsi (hand-encoded) */
	decq	%r12			/* account for the implicit +1 below */
	movq	8(%rbp),%r13
	xorq	%r8,%r8			/* %r8 = -borrow, initially 0 */
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqrx4x_sub_entry:
	/* reg = ~n[i] & mask: either ~n[i] (subtract) or 0 (copy). */
	andnq	%rax,%r12,%r12
	leaq	32(%rbp),%rbp
	andnq	%rax,%r13,%r13
	andnq	%rax,%r14,%r14
	andnq	%rax,%r15,%r15

	negq	%r8			/* restore borrow into CF */
	adcq	0(%rdi),%r12		/* a[i] + (~n[i]|0) + carry */
	adcq	8(%rdi),%r13
	adcq	16(%rdi),%r14
	adcq	24(%rdi),%r15
	movq	%r12,0(%rdx)		/* write result */
	leaq	32(%rdi),%rdi
	movq	%r13,8(%rdx)
	sbbq	%r8,%r8			/* save borrow across iterations */
	movq	%r14,16(%rdx)
	movq	%r15,24(%rdx)
	leaq	32(%rdx),%rdx

	incq	%rcx
	jnz	.Lsqrx4x_sub

	negq	%r9			/* restore %r9 = num */

	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	__bn_postx4x_internal,.-__bn_postx4x_internal
.globl	bn_scatter5
.hidden bn_scatter5
.type	bn_scatter5,@function
.align	16
/*
 * void bn_scatter5(inp=%rdi, num=%esi, table=%rdx, idx=%rcx)
 * Store num 64-bit words from inp into the gather table at column idx:
 * word j goes to table + idx*8 + j*256. The 256-byte stride interleaves
 * the table so bn_gather5 can later read it in a cache-timing-safe way.
 */
bn_scatter5:
.cfi_startproc
	cmpl	$0,%esi			/* num == 0: nothing to do */
	jz	.Lscatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx	/* %rdx = &table[idx] */
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		/* next row, 256-byte stride */
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	bn_scatter5,.-bn_scatter5
3612
.globl	bn_gather5
.hidden bn_gather5
.type	bn_gather5,@function
.align	32
/*
 * void bn_gather5(out=%rdi, num=%esi, table=%rdx, idx=%ecx)
 * Constant-time gather: read column idx of a table written by
 * bn_scatter5 while touching every byte of each 256-byte table row,
 * so the memory access pattern is independent of idx (cache-timing
 * defense). A 256-byte mask area is built on the stack: sixteen xmm
 * slots, where only the lanes matching idx are all-ones (pcmpeqd
 * against the running counters seeded from .Linc).
 */
bn_gather5:
.cfi_startproc
.LSEH_begin_bn_gather5:

.byte	0x4c,0x8d,0x14,0x24		/* lea (%rsp),%r10 (hand-encoded) */
.cfi_def_cfa_register	%r10
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	/* subq $0x108,%rsp */
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp		/* 16-byte align for movdqa */

	movd	%ecx,%xmm5		/* xmm5 = idx broadcast source */
	movdqa	0(%rax),%xmm0		/* {0,0,1,1} */
	movdqa	16(%rax),%xmm1		/* {2,2,2,2} increment */
	leaq	128(%rdx),%r11		/* biased table pointer */
	leaq	128(%rsp),%rax		/* biased mask-area pointer */

	/* Build 16 mask vectors: counter += 2 each step, pcmpeqd marks
	 * the lane equal to idx with all-ones; store at -128..112(%rax). */
	pshufd	$0,%xmm5,%xmm5		/* broadcast idx to all lanes */
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)
	jmp	.Lgather

.align	32
.Lgather:
	/* Per output word: AND every 16-byte chunk of the 256-byte row
	 * with its mask and OR everything together - only the selected
	 * column survives, but all rows are read unconditionally. */
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	leaq	256(%r11),%r11		/* next table row */
	pshufd	$0x4e,%xmm4,%xmm0	/* fold high qword onto low */
	por	%xmm4,%xmm0
	movq	%xmm0,(%rdi)		/* emit gathered word */
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp		/* restore caller's stack pointer */
.cfi_def_cfa_register	%rsp
	.byte	0xf3,0xc3		/* rep ret */
.LSEH_end_bn_gather5:
.cfi_endproc
.size	bn_gather5,.-bn_gather5
.align	64
/* Lane counters used to build the pcmpeqd selection masks in
 * bn_gather5 (and the gather code in bn_mul_mont_gather5 above):
 * the first vector seeds the counters {0,0,1,1}, the second is the
 * per-step increment {2,2,2,2}. */
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
/* ASCII, NUL-terminated: "Montgomery Multiplication with scatter/gather
 * for x86_64, CRYPTOGAMS by <appro@openssl.org>" (credit string). */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3786#endif
3787.section	.note.GNU-stack,"",@progbits
3788