• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if defined(__has_feature)
5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
6#define OPENSSL_NO_ASM
7#endif
8#endif
9
10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) && defined(__ELF__)
11#if defined(BORINGSSL_PREFIX)
12#include <boringssl_prefix_symbols_asm.h>
13#endif
14.text
15
16.extern	OPENSSL_ia32cap_P
17.hidden OPENSSL_ia32cap_P
18
19.globl	bn_mul_mont_gather5
20.hidden bn_mul_mont_gather5
21.type	bn_mul_mont_gather5,@function
22.align	64
23bn_mul_mont_gather5:
24.cfi_startproc
25	movl	%r9d,%r9d
26	movq	%rsp,%rax
27.cfi_def_cfa_register	%rax
28	testl	$7,%r9d
29	jnz	.Lmul_enter
30	leaq	OPENSSL_ia32cap_P(%rip),%r11
31	movl	8(%r11),%r11d
32	jmp	.Lmul4x_enter
33
34.align	16
35.Lmul_enter:
36	movd	8(%rsp),%xmm5
37	pushq	%rbx
38.cfi_offset	%rbx,-16
39	pushq	%rbp
40.cfi_offset	%rbp,-24
41	pushq	%r12
42.cfi_offset	%r12,-32
43	pushq	%r13
44.cfi_offset	%r13,-40
45	pushq	%r14
46.cfi_offset	%r14,-48
47	pushq	%r15
48.cfi_offset	%r15,-56
49
50	negq	%r9
51	movq	%rsp,%r11
52	leaq	-280(%rsp,%r9,8),%r10
53	negq	%r9
54	andq	$-1024,%r10
55
56
57
58
59
60
61
62
63
64	subq	%r10,%r11
65	andq	$-4096,%r11
66	leaq	(%r10,%r11,1),%rsp
67	movq	(%rsp),%r11
68	cmpq	%r10,%rsp
69	ja	.Lmul_page_walk
70	jmp	.Lmul_page_walk_done
71
72.Lmul_page_walk:
73	leaq	-4096(%rsp),%rsp
74	movq	(%rsp),%r11
75	cmpq	%r10,%rsp
76	ja	.Lmul_page_walk
77.Lmul_page_walk_done:
78
79	leaq	.Linc(%rip),%r10
80	movq	%rax,8(%rsp,%r9,8)
81.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
82.Lmul_body:
83
84	leaq	128(%rdx),%r12
85	movdqa	0(%r10),%xmm0
86	movdqa	16(%r10),%xmm1
87	leaq	24-112(%rsp,%r9,8),%r10
88	andq	$-16,%r10
89
90	pshufd	$0,%xmm5,%xmm5
91	movdqa	%xmm1,%xmm4
92	movdqa	%xmm1,%xmm2
93	paddd	%xmm0,%xmm1
94	pcmpeqd	%xmm5,%xmm0
95.byte	0x67
96	movdqa	%xmm4,%xmm3
97	paddd	%xmm1,%xmm2
98	pcmpeqd	%xmm5,%xmm1
99	movdqa	%xmm0,112(%r10)
100	movdqa	%xmm4,%xmm0
101
102	paddd	%xmm2,%xmm3
103	pcmpeqd	%xmm5,%xmm2
104	movdqa	%xmm1,128(%r10)
105	movdqa	%xmm4,%xmm1
106
107	paddd	%xmm3,%xmm0
108	pcmpeqd	%xmm5,%xmm3
109	movdqa	%xmm2,144(%r10)
110	movdqa	%xmm4,%xmm2
111
112	paddd	%xmm0,%xmm1
113	pcmpeqd	%xmm5,%xmm0
114	movdqa	%xmm3,160(%r10)
115	movdqa	%xmm4,%xmm3
116	paddd	%xmm1,%xmm2
117	pcmpeqd	%xmm5,%xmm1
118	movdqa	%xmm0,176(%r10)
119	movdqa	%xmm4,%xmm0
120
121	paddd	%xmm2,%xmm3
122	pcmpeqd	%xmm5,%xmm2
123	movdqa	%xmm1,192(%r10)
124	movdqa	%xmm4,%xmm1
125
126	paddd	%xmm3,%xmm0
127	pcmpeqd	%xmm5,%xmm3
128	movdqa	%xmm2,208(%r10)
129	movdqa	%xmm4,%xmm2
130
131	paddd	%xmm0,%xmm1
132	pcmpeqd	%xmm5,%xmm0
133	movdqa	%xmm3,224(%r10)
134	movdqa	%xmm4,%xmm3
135	paddd	%xmm1,%xmm2
136	pcmpeqd	%xmm5,%xmm1
137	movdqa	%xmm0,240(%r10)
138	movdqa	%xmm4,%xmm0
139
140	paddd	%xmm2,%xmm3
141	pcmpeqd	%xmm5,%xmm2
142	movdqa	%xmm1,256(%r10)
143	movdqa	%xmm4,%xmm1
144
145	paddd	%xmm3,%xmm0
146	pcmpeqd	%xmm5,%xmm3
147	movdqa	%xmm2,272(%r10)
148	movdqa	%xmm4,%xmm2
149
150	paddd	%xmm0,%xmm1
151	pcmpeqd	%xmm5,%xmm0
152	movdqa	%xmm3,288(%r10)
153	movdqa	%xmm4,%xmm3
154	paddd	%xmm1,%xmm2
155	pcmpeqd	%xmm5,%xmm1
156	movdqa	%xmm0,304(%r10)
157
158	paddd	%xmm2,%xmm3
159.byte	0x67
160	pcmpeqd	%xmm5,%xmm2
161	movdqa	%xmm1,320(%r10)
162
163	pcmpeqd	%xmm5,%xmm3
164	movdqa	%xmm2,336(%r10)
165	pand	64(%r12),%xmm0
166
167	pand	80(%r12),%xmm1
168	pand	96(%r12),%xmm2
169	movdqa	%xmm3,352(%r10)
170	pand	112(%r12),%xmm3
171	por	%xmm2,%xmm0
172	por	%xmm3,%xmm1
173	movdqa	-128(%r12),%xmm4
174	movdqa	-112(%r12),%xmm5
175	movdqa	-96(%r12),%xmm2
176	pand	112(%r10),%xmm4
177	movdqa	-80(%r12),%xmm3
178	pand	128(%r10),%xmm5
179	por	%xmm4,%xmm0
180	pand	144(%r10),%xmm2
181	por	%xmm5,%xmm1
182	pand	160(%r10),%xmm3
183	por	%xmm2,%xmm0
184	por	%xmm3,%xmm1
185	movdqa	-64(%r12),%xmm4
186	movdqa	-48(%r12),%xmm5
187	movdqa	-32(%r12),%xmm2
188	pand	176(%r10),%xmm4
189	movdqa	-16(%r12),%xmm3
190	pand	192(%r10),%xmm5
191	por	%xmm4,%xmm0
192	pand	208(%r10),%xmm2
193	por	%xmm5,%xmm1
194	pand	224(%r10),%xmm3
195	por	%xmm2,%xmm0
196	por	%xmm3,%xmm1
197	movdqa	0(%r12),%xmm4
198	movdqa	16(%r12),%xmm5
199	movdqa	32(%r12),%xmm2
200	pand	240(%r10),%xmm4
201	movdqa	48(%r12),%xmm3
202	pand	256(%r10),%xmm5
203	por	%xmm4,%xmm0
204	pand	272(%r10),%xmm2
205	por	%xmm5,%xmm1
206	pand	288(%r10),%xmm3
207	por	%xmm2,%xmm0
208	por	%xmm3,%xmm1
209	por	%xmm1,%xmm0
210
211	pshufd	$0x4e,%xmm0,%xmm1
212	por	%xmm1,%xmm0
213	leaq	256(%r12),%r12
214.byte	102,72,15,126,195
215
216	movq	(%r8),%r8
217	movq	(%rsi),%rax
218
219	xorq	%r14,%r14
220	xorq	%r15,%r15
221
222	movq	%r8,%rbp
223	mulq	%rbx
224	movq	%rax,%r10
225	movq	(%rcx),%rax
226
227	imulq	%r10,%rbp
228	movq	%rdx,%r11
229
230	mulq	%rbp
231	addq	%rax,%r10
232	movq	8(%rsi),%rax
233	adcq	$0,%rdx
234	movq	%rdx,%r13
235
236	leaq	1(%r15),%r15
237	jmp	.L1st_enter
238
239.align	16
240.L1st:
241	addq	%rax,%r13
242	movq	(%rsi,%r15,8),%rax
243	adcq	$0,%rdx
244	addq	%r11,%r13
245	movq	%r10,%r11
246	adcq	$0,%rdx
247	movq	%r13,-16(%rsp,%r15,8)
248	movq	%rdx,%r13
249
250.L1st_enter:
251	mulq	%rbx
252	addq	%rax,%r11
253	movq	(%rcx,%r15,8),%rax
254	adcq	$0,%rdx
255	leaq	1(%r15),%r15
256	movq	%rdx,%r10
257
258	mulq	%rbp
259	cmpq	%r9,%r15
260	jne	.L1st
261
262
263	addq	%rax,%r13
264	adcq	$0,%rdx
265	addq	%r11,%r13
266	adcq	$0,%rdx
267	movq	%r13,-16(%rsp,%r9,8)
268	movq	%rdx,%r13
269	movq	%r10,%r11
270
271	xorq	%rdx,%rdx
272	addq	%r11,%r13
273	adcq	$0,%rdx
274	movq	%r13,-8(%rsp,%r9,8)
275	movq	%rdx,(%rsp,%r9,8)
276
277	leaq	1(%r14),%r14
278	jmp	.Louter
279.align	16
280.Louter:
281	leaq	24+128(%rsp,%r9,8),%rdx
282	andq	$-16,%rdx
283	pxor	%xmm4,%xmm4
284	pxor	%xmm5,%xmm5
285	movdqa	-128(%r12),%xmm0
286	movdqa	-112(%r12),%xmm1
287	movdqa	-96(%r12),%xmm2
288	movdqa	-80(%r12),%xmm3
289	pand	-128(%rdx),%xmm0
290	pand	-112(%rdx),%xmm1
291	por	%xmm0,%xmm4
292	pand	-96(%rdx),%xmm2
293	por	%xmm1,%xmm5
294	pand	-80(%rdx),%xmm3
295	por	%xmm2,%xmm4
296	por	%xmm3,%xmm5
297	movdqa	-64(%r12),%xmm0
298	movdqa	-48(%r12),%xmm1
299	movdqa	-32(%r12),%xmm2
300	movdqa	-16(%r12),%xmm3
301	pand	-64(%rdx),%xmm0
302	pand	-48(%rdx),%xmm1
303	por	%xmm0,%xmm4
304	pand	-32(%rdx),%xmm2
305	por	%xmm1,%xmm5
306	pand	-16(%rdx),%xmm3
307	por	%xmm2,%xmm4
308	por	%xmm3,%xmm5
309	movdqa	0(%r12),%xmm0
310	movdqa	16(%r12),%xmm1
311	movdqa	32(%r12),%xmm2
312	movdqa	48(%r12),%xmm3
313	pand	0(%rdx),%xmm0
314	pand	16(%rdx),%xmm1
315	por	%xmm0,%xmm4
316	pand	32(%rdx),%xmm2
317	por	%xmm1,%xmm5
318	pand	48(%rdx),%xmm3
319	por	%xmm2,%xmm4
320	por	%xmm3,%xmm5
321	movdqa	64(%r12),%xmm0
322	movdqa	80(%r12),%xmm1
323	movdqa	96(%r12),%xmm2
324	movdqa	112(%r12),%xmm3
325	pand	64(%rdx),%xmm0
326	pand	80(%rdx),%xmm1
327	por	%xmm0,%xmm4
328	pand	96(%rdx),%xmm2
329	por	%xmm1,%xmm5
330	pand	112(%rdx),%xmm3
331	por	%xmm2,%xmm4
332	por	%xmm3,%xmm5
333	por	%xmm5,%xmm4
334
335	pshufd	$0x4e,%xmm4,%xmm0
336	por	%xmm4,%xmm0
337	leaq	256(%r12),%r12
338
339	movq	(%rsi),%rax
340.byte	102,72,15,126,195
341
342	xorq	%r15,%r15
343	movq	%r8,%rbp
344	movq	(%rsp),%r10
345
346	mulq	%rbx
347	addq	%rax,%r10
348	movq	(%rcx),%rax
349	adcq	$0,%rdx
350
351	imulq	%r10,%rbp
352	movq	%rdx,%r11
353
354	mulq	%rbp
355	addq	%rax,%r10
356	movq	8(%rsi),%rax
357	adcq	$0,%rdx
358	movq	8(%rsp),%r10
359	movq	%rdx,%r13
360
361	leaq	1(%r15),%r15
362	jmp	.Linner_enter
363
364.align	16
365.Linner:
366	addq	%rax,%r13
367	movq	(%rsi,%r15,8),%rax
368	adcq	$0,%rdx
369	addq	%r10,%r13
370	movq	(%rsp,%r15,8),%r10
371	adcq	$0,%rdx
372	movq	%r13,-16(%rsp,%r15,8)
373	movq	%rdx,%r13
374
375.Linner_enter:
376	mulq	%rbx
377	addq	%rax,%r11
378	movq	(%rcx,%r15,8),%rax
379	adcq	$0,%rdx
380	addq	%r11,%r10
381	movq	%rdx,%r11
382	adcq	$0,%r11
383	leaq	1(%r15),%r15
384
385	mulq	%rbp
386	cmpq	%r9,%r15
387	jne	.Linner
388
389	addq	%rax,%r13
390	adcq	$0,%rdx
391	addq	%r10,%r13
392	movq	(%rsp,%r9,8),%r10
393	adcq	$0,%rdx
394	movq	%r13,-16(%rsp,%r9,8)
395	movq	%rdx,%r13
396
397	xorq	%rdx,%rdx
398	addq	%r11,%r13
399	adcq	$0,%rdx
400	addq	%r10,%r13
401	adcq	$0,%rdx
402	movq	%r13,-8(%rsp,%r9,8)
403	movq	%rdx,(%rsp,%r9,8)
404
405	leaq	1(%r14),%r14
406	cmpq	%r9,%r14
407	jb	.Louter
408
409	xorq	%r14,%r14
410	movq	(%rsp),%rax
411	leaq	(%rsp),%rsi
412	movq	%r9,%r15
413	jmp	.Lsub
414.align	16
415.Lsub:	sbbq	(%rcx,%r14,8),%rax
416	movq	%rax,(%rdi,%r14,8)
417	movq	8(%rsi,%r14,8),%rax
418	leaq	1(%r14),%r14
419	decq	%r15
420	jnz	.Lsub
421
422	sbbq	$0,%rax
423	movq	$-1,%rbx
424	xorq	%rax,%rbx
425	xorq	%r14,%r14
426	movq	%r9,%r15
427
428.Lcopy:
429	movq	(%rdi,%r14,8),%rcx
430	movq	(%rsp,%r14,8),%rdx
431	andq	%rbx,%rcx
432	andq	%rax,%rdx
433	movq	%r14,(%rsp,%r14,8)
434	orq	%rcx,%rdx
435	movq	%rdx,(%rdi,%r14,8)
436	leaq	1(%r14),%r14
437	subq	$1,%r15
438	jnz	.Lcopy
439
440	movq	8(%rsp,%r9,8),%rsi
441.cfi_def_cfa	%rsi,8
442	movq	$1,%rax
443
444	movq	-48(%rsi),%r15
445.cfi_restore	%r15
446	movq	-40(%rsi),%r14
447.cfi_restore	%r14
448	movq	-32(%rsi),%r13
449.cfi_restore	%r13
450	movq	-24(%rsi),%r12
451.cfi_restore	%r12
452	movq	-16(%rsi),%rbp
453.cfi_restore	%rbp
454	movq	-8(%rsi),%rbx
455.cfi_restore	%rbx
456	leaq	(%rsi),%rsp
457.cfi_def_cfa_register	%rsp
458.Lmul_epilogue:
459	.byte	0xf3,0xc3
460.cfi_endproc
461.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
# -----------------------------------------------------------------------------
# bn_mul4x_mont_gather5 — 4x-unrolled variant of bn_mul_mont_gather5 for
# num divisible by 8.  Entered via .Lmul4x_enter with %r11d holding the
# capability word loaded by the caller; if the masked bits equal 0x80108 the
# MULX/ADX implementation (.Lmulx4x_enter, defined elsewhere in this file)
# is used instead.  Sets up the stack frame, then delegates to mul4x_internal.
# NOTE(review): 0x80108 presumably selects BMI2+ADX-capable CPUs — confirm
# against the OPENSSL_ia32cap bit layout.
# -----------------------------------------------------------------------------
462.type	bn_mul4x_mont_gather5,@function
463.align	32
464bn_mul4x_mont_gather5:
465.cfi_startproc
466.byte	0x67	# padding prefix, no semantic effect here
467	movq	%rsp,%rax
468.cfi_def_cfa_register	%rax
469.Lmul4x_enter:
470	andl	$0x80108,%r11d
471	cmpl	$0x80108,%r11d
472	je	.Lmulx4x_enter
473	pushq	%rbx
474.cfi_offset	%rbx,-16
475	pushq	%rbp
476.cfi_offset	%rbp,-24
477	pushq	%r12
478.cfi_offset	%r12,-32
479	pushq	%r13
480.cfi_offset	%r13,-40
481	pushq	%r14
482.cfi_offset	%r14,-48
483	pushq	%r15
484.cfi_offset	%r15,-56
485.Lmul4x_prologue:
486
487.byte	0x67
488	shll	$3,%r9d	# r9 = num in bytes
489	leaq	(%r9,%r9,2),%r10	# r10 = 3*num bytes (frame estimate)
490	negq	%r9
491
492
493
494
495
496
497
498
499
500
# Choose the scratch frame so its 4096-offset relative to rp (%rdi) avoids
# aliasing between the stack and the output buffer.
501	leaq	-320(%rsp,%r9,2),%r11
502	movq	%rsp,%rbp
503	subq	%rdi,%r11
504	andq	$4095,%r11
505	cmpq	%r11,%r10
506	jb	.Lmul4xsp_alt
507	subq	%r11,%rbp
508	leaq	-320(%rbp,%r9,2),%rbp
509	jmp	.Lmul4xsp_done
510
511.align	32
512.Lmul4xsp_alt:
513	leaq	4096-320(,%r9,2),%r10
514	leaq	-320(%rbp,%r9,2),%rbp
515	subq	%r10,%r11
516	movq	$0,%r10
517	cmovcq	%r10,%r11	# clamp adjustment at zero on underflow
518	subq	%r11,%rbp
519.Lmul4xsp_done:
520	andq	$-64,%rbp	# 64-byte align the frame
521	movq	%rsp,%r11
522	subq	%rbp,%r11
523	andq	$-4096,%r11
524	leaq	(%r11,%rbp,1),%rsp
525	movq	(%rsp),%r10
526	cmpq	%rbp,%rsp
527	ja	.Lmul4x_page_walk
528	jmp	.Lmul4x_page_walk_done
529
# Probe each intervening 4 KiB page so the guard page is hit in order.
530.Lmul4x_page_walk:
531	leaq	-4096(%rsp),%rsp
532	movq	(%rsp),%r10
533	cmpq	%rbp,%rsp
534	ja	.Lmul4x_page_walk
535.Lmul4x_page_walk_done:
536
537	negq	%r9	# back to positive num*8
538
539	movq	%rax,40(%rsp)	# save caller's rsp for the epilogue
540.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
# ^ hand-encoded DWARF: CFA = *(rsp+40) + 8
541.Lmul4x_body:
542
543	call	mul4x_internal
544
545	movq	40(%rsp),%rsi
546.cfi_def_cfa	%rsi,8
547	movq	$1,%rax	# return 1 (success)
548
549	movq	-48(%rsi),%r15
550.cfi_restore	%r15
551	movq	-40(%rsi),%r14
552.cfi_restore	%r14
553	movq	-32(%rsi),%r13
554.cfi_restore	%r13
555	movq	-24(%rsi),%r12
556.cfi_restore	%r12
557	movq	-16(%rsi),%rbp
558.cfi_restore	%rbp
559	movq	-8(%rsi),%rbx
560.cfi_restore	%rbx
561	leaq	(%rsi),%rsp
562.cfi_def_cfa_register	%rsp
563.Lmul4x_epilogue:
564	.byte	0xf3,0xc3	# rep ret
565.cfi_endproc
566.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
567
# -----------------------------------------------------------------------------
# mul4x_internal — body of the 4x-unrolled Montgomery multiplication.
# On entry (set up by bn_mul4x_mont_gather5 / bn_power5): rdi=rp, rsi=ap,
# rdx=bp (scattered table), rcx=np, r8=&n0, r9=num*8 (bytes), rax=caller's
# original rsp (so 8(%rax) is the stack "power" argument).
# Uses the same SSE2 constant-time gather as the 1x path, then runs an inner
# loop unrolled four limbs at a time.  Falls through to .Lsqr4x_sub_entry
# (shared final-subtraction code, defined elsewhere in this file).
# -----------------------------------------------------------------------------
568.type	mul4x_internal,@function
569.align	32
570mul4x_internal:
571.cfi_startproc
572	shlq	$5,%r9	# temporarily scale num to index the full table
573	movd	8(%rax),%xmm5	# "power" argument from the caller's stack frame
574	leaq	.Linc(%rip),%rax
575	leaq	128(%rdx,%r9,1),%r13	# r13 = end-of-table sentinel
576	shrq	$5,%r9	# restore num*8
577	movdqa	0(%rax),%xmm0
578	movdqa	16(%rax),%xmm1
579	leaq	88-112(%rsp,%r9,1),%r10	# mask scratch area
580	leaq	128(%rdx),%r12	# r12 = biased table pointer
581
# Build compare masks for the secret index (same scheme as the 1x path).
582	pshufd	$0,%xmm5,%xmm5
583	movdqa	%xmm1,%xmm4
584.byte	0x67,0x67	# padding prefixes, no semantic effect
585	movdqa	%xmm1,%xmm2
586	paddd	%xmm0,%xmm1
587	pcmpeqd	%xmm5,%xmm0
588.byte	0x67
589	movdqa	%xmm4,%xmm3
590	paddd	%xmm1,%xmm2
591	pcmpeqd	%xmm5,%xmm1
592	movdqa	%xmm0,112(%r10)
593	movdqa	%xmm4,%xmm0
594
595	paddd	%xmm2,%xmm3
596	pcmpeqd	%xmm5,%xmm2
597	movdqa	%xmm1,128(%r10)
598	movdqa	%xmm4,%xmm1
599
600	paddd	%xmm3,%xmm0
601	pcmpeqd	%xmm5,%xmm3
602	movdqa	%xmm2,144(%r10)
603	movdqa	%xmm4,%xmm2
604
605	paddd	%xmm0,%xmm1
606	pcmpeqd	%xmm5,%xmm0
607	movdqa	%xmm3,160(%r10)
608	movdqa	%xmm4,%xmm3
609	paddd	%xmm1,%xmm2
610	pcmpeqd	%xmm5,%xmm1
611	movdqa	%xmm0,176(%r10)
612	movdqa	%xmm4,%xmm0
613
614	paddd	%xmm2,%xmm3
615	pcmpeqd	%xmm5,%xmm2
616	movdqa	%xmm1,192(%r10)
617	movdqa	%xmm4,%xmm1
618
619	paddd	%xmm3,%xmm0
620	pcmpeqd	%xmm5,%xmm3
621	movdqa	%xmm2,208(%r10)
622	movdqa	%xmm4,%xmm2
623
624	paddd	%xmm0,%xmm1
625	pcmpeqd	%xmm5,%xmm0
626	movdqa	%xmm3,224(%r10)
627	movdqa	%xmm4,%xmm3
628	paddd	%xmm1,%xmm2
629	pcmpeqd	%xmm5,%xmm1
630	movdqa	%xmm0,240(%r10)
631	movdqa	%xmm4,%xmm0
632
633	paddd	%xmm2,%xmm3
634	pcmpeqd	%xmm5,%xmm2
635	movdqa	%xmm1,256(%r10)
636	movdqa	%xmm4,%xmm1
637
638	paddd	%xmm3,%xmm0
639	pcmpeqd	%xmm5,%xmm3
640	movdqa	%xmm2,272(%r10)
641	movdqa	%xmm4,%xmm2
642
643	paddd	%xmm0,%xmm1
644	pcmpeqd	%xmm5,%xmm0
645	movdqa	%xmm3,288(%r10)
646	movdqa	%xmm4,%xmm3
647	paddd	%xmm1,%xmm2
648	pcmpeqd	%xmm5,%xmm1
649	movdqa	%xmm0,304(%r10)
650
651	paddd	%xmm2,%xmm3
652.byte	0x67
653	pcmpeqd	%xmm5,%xmm2
654	movdqa	%xmm1,320(%r10)
655
656	pcmpeqd	%xmm5,%xmm3
657	movdqa	%xmm2,336(%r10)
# Constant-time gather of b[0]: every stripe of the table participates.
658	pand	64(%r12),%xmm0
659
660	pand	80(%r12),%xmm1
661	pand	96(%r12),%xmm2
662	movdqa	%xmm3,352(%r10)
663	pand	112(%r12),%xmm3
664	por	%xmm2,%xmm0
665	por	%xmm3,%xmm1
666	movdqa	-128(%r12),%xmm4
667	movdqa	-112(%r12),%xmm5
668	movdqa	-96(%r12),%xmm2
669	pand	112(%r10),%xmm4
670	movdqa	-80(%r12),%xmm3
671	pand	128(%r10),%xmm5
672	por	%xmm4,%xmm0
673	pand	144(%r10),%xmm2
674	por	%xmm5,%xmm1
675	pand	160(%r10),%xmm3
676	por	%xmm2,%xmm0
677	por	%xmm3,%xmm1
678	movdqa	-64(%r12),%xmm4
679	movdqa	-48(%r12),%xmm5
680	movdqa	-32(%r12),%xmm2
681	pand	176(%r10),%xmm4
682	movdqa	-16(%r12),%xmm3
683	pand	192(%r10),%xmm5
684	por	%xmm4,%xmm0
685	pand	208(%r10),%xmm2
686	por	%xmm5,%xmm1
687	pand	224(%r10),%xmm3
688	por	%xmm2,%xmm0
689	por	%xmm3,%xmm1
690	movdqa	0(%r12),%xmm4
691	movdqa	16(%r12),%xmm5
692	movdqa	32(%r12),%xmm2
693	pand	240(%r10),%xmm4
694	movdqa	48(%r12),%xmm3
695	pand	256(%r10),%xmm5
696	por	%xmm4,%xmm0
697	pand	272(%r10),%xmm2
698	por	%xmm5,%xmm1
699	pand	288(%r10),%xmm3
700	por	%xmm2,%xmm0
701	por	%xmm3,%xmm1
702	por	%xmm1,%xmm0
703
704	pshufd	$0x4e,%xmm0,%xmm1
705	por	%xmm1,%xmm0
706	leaq	256(%r12),%r12
707.byte	102,72,15,126,195	# movq %xmm0,%rbx : rbx = gathered b[0]
708
709	movq	%r13,16+8(%rsp)	# table-end sentinel: outer-loop bound
710	movq	%rdi,56+8(%rsp)	# stash rp; rdi is reused as a carry register
711
712	movq	(%r8),%r8	# r8 = n0
713	movq	(%rsi),%rax	# rax = ap[0]
714	leaq	(%rsi,%r9,1),%rsi	# rsi = &ap[num]; indexed with negative r9
715	negq	%r9
716
# First outer pass (i = 0), 4 limbs per iteration; rdi/r13 carry partial sums.
717	movq	%r8,%rbp
718	mulq	%rbx
719	movq	%rax,%r10
720	movq	(%rcx),%rax
721
722	imulq	%r10,%rbp	# m0 = (a0*b0) * n0 mod 2^64
723	leaq	64+8(%rsp),%r14	# r14 = tp write pointer
724	movq	%rdx,%r11
725
726	mulq	%rbp
727	addq	%rax,%r10
728	movq	8(%rsi,%r9,1),%rax
729	adcq	$0,%rdx
730	movq	%rdx,%rdi
731
732	mulq	%rbx
733	addq	%rax,%r11
734	movq	8(%rcx),%rax
735	adcq	$0,%rdx
736	movq	%rdx,%r10
737
738	mulq	%rbp
739	addq	%rax,%rdi
740	movq	16(%rsi,%r9,1),%rax
741	adcq	$0,%rdx
742	addq	%r11,%rdi
743	leaq	32(%r9),%r15	# r15 = loop counter (negative byte offset)
744	leaq	32(%rcx),%rcx
745	adcq	$0,%rdx
746	movq	%rdi,(%r14)
747	movq	%rdx,%r13
748	jmp	.L1st4x
749
750.align	32
751.L1st4x:
752	mulq	%rbx
753	addq	%rax,%r10
754	movq	-16(%rcx),%rax
755	leaq	32(%r14),%r14
756	adcq	$0,%rdx
757	movq	%rdx,%r11
758
759	mulq	%rbp
760	addq	%rax,%r13
761	movq	-8(%rsi,%r15,1),%rax
762	adcq	$0,%rdx
763	addq	%r10,%r13
764	adcq	$0,%rdx
765	movq	%r13,-24(%r14)
766	movq	%rdx,%rdi
767
768	mulq	%rbx
769	addq	%rax,%r11
770	movq	-8(%rcx),%rax
771	adcq	$0,%rdx
772	movq	%rdx,%r10
773
774	mulq	%rbp
775	addq	%rax,%rdi
776	movq	(%rsi,%r15,1),%rax
777	adcq	$0,%rdx
778	addq	%r11,%rdi
779	adcq	$0,%rdx
780	movq	%rdi,-16(%r14)
781	movq	%rdx,%r13
782
783	mulq	%rbx
784	addq	%rax,%r10
785	movq	0(%rcx),%rax
786	adcq	$0,%rdx
787	movq	%rdx,%r11
788
789	mulq	%rbp
790	addq	%rax,%r13
791	movq	8(%rsi,%r15,1),%rax
792	adcq	$0,%rdx
793	addq	%r10,%r13
794	adcq	$0,%rdx
795	movq	%r13,-8(%r14)
796	movq	%rdx,%rdi
797
798	mulq	%rbx
799	addq	%rax,%r11
800	movq	8(%rcx),%rax
801	adcq	$0,%rdx
802	movq	%rdx,%r10
803
804	mulq	%rbp
805	addq	%rax,%rdi
806	movq	16(%rsi,%r15,1),%rax
807	adcq	$0,%rdx
808	addq	%r11,%rdi
809	leaq	32(%rcx),%rcx
810	adcq	$0,%rdx
811	movq	%rdi,(%r14)
812	movq	%rdx,%r13
813
814	addq	$32,%r15	# advance 4 limbs; zero means done
815	jnz	.L1st4x
816
# Tail of the first pass: last two limb pairs plus the top carry.
817	mulq	%rbx
818	addq	%rax,%r10
819	movq	-16(%rcx),%rax
820	leaq	32(%r14),%r14
821	adcq	$0,%rdx
822	movq	%rdx,%r11
823
824	mulq	%rbp
825	addq	%rax,%r13
826	movq	-8(%rsi),%rax
827	adcq	$0,%rdx
828	addq	%r10,%r13
829	adcq	$0,%rdx
830	movq	%r13,-24(%r14)
831	movq	%rdx,%rdi
832
833	mulq	%rbx
834	addq	%rax,%r11
835	movq	-8(%rcx),%rax
836	adcq	$0,%rdx
837	movq	%rdx,%r10
838
839	mulq	%rbp
840	addq	%rax,%rdi
841	movq	(%rsi,%r9,1),%rax
842	adcq	$0,%rdx
843	addq	%r11,%rdi
844	adcq	$0,%rdx
845	movq	%rdi,-16(%r14)
846	movq	%rdx,%r13
847
848	leaq	(%rcx,%r9,1),%rcx	# rewind np pointer
849
850	xorq	%rdi,%rdi	# rdi = top carry for the next pass
851	addq	%r10,%r13
852	adcq	$0,%rdi
853	movq	%r13,-8(%r14)
854
855	jmp	.Louter4x
856
857.align	32
# Outer loop: gather the next multiplier word b[i] in constant time (masks
# reread from the scratch area via rdx), then tp += ap*b_i + m_i*np, 4 limbs
# per inner iteration.
858.Louter4x:
859	leaq	16+128(%r14),%rdx
860	pxor	%xmm4,%xmm4
861	pxor	%xmm5,%xmm5
862	movdqa	-128(%r12),%xmm0
863	movdqa	-112(%r12),%xmm1
864	movdqa	-96(%r12),%xmm2
865	movdqa	-80(%r12),%xmm3
866	pand	-128(%rdx),%xmm0
867	pand	-112(%rdx),%xmm1
868	por	%xmm0,%xmm4
869	pand	-96(%rdx),%xmm2
870	por	%xmm1,%xmm5
871	pand	-80(%rdx),%xmm3
872	por	%xmm2,%xmm4
873	por	%xmm3,%xmm5
874	movdqa	-64(%r12),%xmm0
875	movdqa	-48(%r12),%xmm1
876	movdqa	-32(%r12),%xmm2
877	movdqa	-16(%r12),%xmm3
878	pand	-64(%rdx),%xmm0
879	pand	-48(%rdx),%xmm1
880	por	%xmm0,%xmm4
881	pand	-32(%rdx),%xmm2
882	por	%xmm1,%xmm5
883	pand	-16(%rdx),%xmm3
884	por	%xmm2,%xmm4
885	por	%xmm3,%xmm5
886	movdqa	0(%r12),%xmm0
887	movdqa	16(%r12),%xmm1
888	movdqa	32(%r12),%xmm2
889	movdqa	48(%r12),%xmm3
890	pand	0(%rdx),%xmm0
891	pand	16(%rdx),%xmm1
892	por	%xmm0,%xmm4
893	pand	32(%rdx),%xmm2
894	por	%xmm1,%xmm5
895	pand	48(%rdx),%xmm3
896	por	%xmm2,%xmm4
897	por	%xmm3,%xmm5
898	movdqa	64(%r12),%xmm0
899	movdqa	80(%r12),%xmm1
900	movdqa	96(%r12),%xmm2
901	movdqa	112(%r12),%xmm3
902	pand	64(%rdx),%xmm0
903	pand	80(%rdx),%xmm1
904	por	%xmm0,%xmm4
905	pand	96(%rdx),%xmm2
906	por	%xmm1,%xmm5
907	pand	112(%rdx),%xmm3
908	por	%xmm2,%xmm4
909	por	%xmm3,%xmm5
910	por	%xmm5,%xmm4
911
912	pshufd	$0x4e,%xmm4,%xmm0
913	por	%xmm4,%xmm0
914	leaq	256(%r12),%r12	# next table slice
915.byte	102,72,15,126,195	# movq %xmm0,%rbx : rbx = gathered b[i]
916
917	movq	(%r14,%r9,1),%r10	# tp[0]
918	movq	%r8,%rbp
919	mulq	%rbx
920	addq	%rax,%r10
921	movq	(%rcx),%rax
922	adcq	$0,%rdx
923
924	imulq	%r10,%rbp	# m_i
925	movq	%rdx,%r11
926	movq	%rdi,(%r14)	# commit previous pass's top carry
927
928	leaq	(%r14,%r9,1),%r14	# rewind tp write pointer
929
930	mulq	%rbp
931	addq	%rax,%r10
932	movq	8(%rsi,%r9,1),%rax
933	adcq	$0,%rdx
934	movq	%rdx,%rdi
935
936	mulq	%rbx
937	addq	%rax,%r11
938	movq	8(%rcx),%rax
939	adcq	$0,%rdx
940	addq	8(%r14),%r11	# += tp[1]
941	adcq	$0,%rdx
942	movq	%rdx,%r10
943
944	mulq	%rbp
945	addq	%rax,%rdi
946	movq	16(%rsi,%r9,1),%rax
947	adcq	$0,%rdx
948	addq	%r11,%rdi
949	leaq	32(%r9),%r15
950	leaq	32(%rcx),%rcx
951	adcq	$0,%rdx
952	movq	%rdx,%r13
953	jmp	.Linner4x
954
955.align	32
956.Linner4x:
957	mulq	%rbx
958	addq	%rax,%r10
959	movq	-16(%rcx),%rax
960	adcq	$0,%rdx
961	addq	16(%r14),%r10
962	leaq	32(%r14),%r14
963	adcq	$0,%rdx
964	movq	%rdx,%r11
965
966	mulq	%rbp
967	addq	%rax,%r13
968	movq	-8(%rsi,%r15,1),%rax
969	adcq	$0,%rdx
970	addq	%r10,%r13
971	adcq	$0,%rdx
972	movq	%rdi,-32(%r14)
973	movq	%rdx,%rdi
974
975	mulq	%rbx
976	addq	%rax,%r11
977	movq	-8(%rcx),%rax
978	adcq	$0,%rdx
979	addq	-8(%r14),%r11
980	adcq	$0,%rdx
981	movq	%rdx,%r10
982
983	mulq	%rbp
984	addq	%rax,%rdi
985	movq	(%rsi,%r15,1),%rax
986	adcq	$0,%rdx
987	addq	%r11,%rdi
988	adcq	$0,%rdx
989	movq	%r13,-24(%r14)
990	movq	%rdx,%r13
991
992	mulq	%rbx
993	addq	%rax,%r10
994	movq	0(%rcx),%rax
995	adcq	$0,%rdx
996	addq	(%r14),%r10
997	adcq	$0,%rdx
998	movq	%rdx,%r11
999
1000	mulq	%rbp
1001	addq	%rax,%r13
1002	movq	8(%rsi,%r15,1),%rax
1003	adcq	$0,%rdx
1004	addq	%r10,%r13
1005	adcq	$0,%rdx
1006	movq	%rdi,-16(%r14)
1007	movq	%rdx,%rdi
1008
1009	mulq	%rbx
1010	addq	%rax,%r11
1011	movq	8(%rcx),%rax
1012	adcq	$0,%rdx
1013	addq	8(%r14),%r11
1014	adcq	$0,%rdx
1015	movq	%rdx,%r10
1016
1017	mulq	%rbp
1018	addq	%rax,%rdi
1019	movq	16(%rsi,%r15,1),%rax
1020	adcq	$0,%rdx
1021	addq	%r11,%rdi
1022	leaq	32(%rcx),%rcx
1023	adcq	$0,%rdx
1024	movq	%r13,-8(%r14)
1025	movq	%rdx,%r13
1026
1027	addq	$32,%r15
1028	jnz	.Linner4x
1029
# Tail of the inner loop for this outer iteration.
1030	mulq	%rbx
1031	addq	%rax,%r10
1032	movq	-16(%rcx),%rax
1033	adcq	$0,%rdx
1034	addq	16(%r14),%r10
1035	leaq	32(%r14),%r14
1036	adcq	$0,%rdx
1037	movq	%rdx,%r11
1038
1039	mulq	%rbp
1040	addq	%rax,%r13
1041	movq	-8(%rsi),%rax
1042	adcq	$0,%rdx
1043	addq	%r10,%r13
1044	adcq	$0,%rdx
1045	movq	%rdi,-32(%r14)
1046	movq	%rdx,%rdi
1047
1048	mulq	%rbx
1049	addq	%rax,%r11
1050	movq	%rbp,%rax	# swap roles: rax <- m_i, rbp <- np[last]
1051	movq	-8(%rcx),%rbp
1052	adcq	$0,%rdx
1053	addq	-8(%r14),%r11
1054	adcq	$0,%rdx
1055	movq	%rdx,%r10
1056
1057	mulq	%rbp	# np[last] * m_i
1058	addq	%rax,%rdi
1059	movq	(%rsi,%r9,1),%rax
1060	adcq	$0,%rdx
1061	addq	%r11,%rdi
1062	adcq	$0,%rdx
1063	movq	%r13,-24(%r14)
1064	movq	%rdx,%r13
1065
1066	movq	%rdi,-16(%r14)
1067	leaq	(%rcx,%r9,1),%rcx	# rewind np
1068
1069	xorq	%rdi,%rdi
1070	addq	%r10,%r13
1071	adcq	$0,%rdi
1072	addq	(%r14),%r13	# fold previous top carry
1073	adcq	$0,%rdi
1074	movq	%r13,-8(%r14)
1075
1076	cmpq	16+8(%rsp),%r12	# consumed the whole scattered table?
1077	jb	.Louter4x
# Set up the final constant-time subtraction and tail-jump into the shared
# .Lsqr4x_sub_entry code (defined elsewhere in this file): rax becomes a
# 0/-1 borrow mask, rbx points at tp, rbp at np, rcx = limb count / 4.
1078	xorq	%rax,%rax
1079	subq	%r13,%rbp
1080	adcq	%r15,%r15
1081	orq	%r15,%rdi
1082	subq	%rdi,%rax
1083	leaq	(%r14,%r9,1),%rbx
1084	movq	(%rcx),%r12
1085	leaq	(%rcx),%rbp
1086	movq	%r9,%rcx
1087	sarq	$3+2,%rcx	# num*8 bytes -> number of 4-limb groups
1088	movq	56+8(%rsp),%rdi	# restore rp
1089	decq	%r12
1090	xorq	%r10,%r10
1091	movq	8(%rbp),%r13
1092	movq	16(%rbp),%r14
1093	movq	24(%rbp),%r15
1094	jmp	.Lsqr4x_sub_entry
1095.cfi_endproc
1096.size	mul4x_internal,.-mul4x_internal
# -----------------------------------------------------------------------------
# bn_power5(rp=%rdi, ap=%rsi, table=%rdx, np=%rcx, n0=%r8, num=%r9d, pwr)
# Performs five Montgomery squarings (__bn_sqr8x_internal/__bn_post4x_internal,
# defined elsewhere in this file) followed by one Montgomery multiplication by
# the constant-time-gathered table entry via mul4x_internal — i.e. it combines
# a 5-bit exponent window step into one call.
# NOTE(review): exact operand semantics (a^(2^5) * tbl[pwr] mod n, Montgomery
# domain) inferred from the call sequence — confirm against the perlasm source.
# -----------------------------------------------------------------------------
1097.globl	bn_power5
1098.hidden bn_power5
1099.type	bn_power5,@function
1100.align	32
1101bn_power5:
1102.cfi_startproc
1103	movq	%rsp,%rax	# keep caller's rsp
1104.cfi_def_cfa_register	%rax
1105	leaq	OPENSSL_ia32cap_P(%rip),%r11
1106	movl	8(%r11),%r11d	# capability word; 0x80108 -> MULX/ADX path
1107	andl	$0x80108,%r11d
1108	cmpl	$0x80108,%r11d
1109	je	.Lpowerx5_enter	# defined elsewhere in this file
1110	pushq	%rbx
1111.cfi_offset	%rbx,-16
1112	pushq	%rbp
1113.cfi_offset	%rbp,-24
1114	pushq	%r12
1115.cfi_offset	%r12,-32
1116	pushq	%r13
1117.cfi_offset	%r13,-40
1118	pushq	%r14
1119.cfi_offset	%r14,-48
1120	pushq	%r15
1121.cfi_offset	%r15,-56
1122.Lpower5_prologue:
1123
1124	shll	$3,%r9d	# num in bytes
1125	leal	(%r9,%r9,2),%r10d	# 3*num: frame-size estimate
1126	negq	%r9
1127	movq	(%r8),%r8	# n0 value
1128
1129
1130
1131
1132
1133
1134
1135
# Pick the frame so its page offset avoids aliasing with rp (same scheme as
# bn_mul4x_mont_gather5).
1136	leaq	-320(%rsp,%r9,2),%r11
1137	movq	%rsp,%rbp
1138	subq	%rdi,%r11
1139	andq	$4095,%r11
1140	cmpq	%r11,%r10
1141	jb	.Lpwr_sp_alt
1142	subq	%r11,%rbp
1143	leaq	-320(%rbp,%r9,2),%rbp
1144	jmp	.Lpwr_sp_done
1145
1146.align	32
1147.Lpwr_sp_alt:
1148	leaq	4096-320(,%r9,2),%r10
1149	leaq	-320(%rbp,%r9,2),%rbp
1150	subq	%r10,%r11
1151	movq	$0,%r10
1152	cmovcq	%r10,%r11
1153	subq	%r11,%rbp
1154.Lpwr_sp_done:
1155	andq	$-64,%rbp
1156	movq	%rsp,%r11
1157	subq	%rbp,%r11
1158	andq	$-4096,%r11
1159	leaq	(%r11,%rbp,1),%rsp
1160	movq	(%rsp),%r10
1161	cmpq	%rbp,%rsp
1162	ja	.Lpwr_page_walk
1163	jmp	.Lpwr_page_walk_done
1164
# Probe each 4 KiB page in order so the guard page is not skipped.
1165.Lpwr_page_walk:
1166	leaq	-4096(%rsp),%rsp
1167	movq	(%rsp),%r10
1168	cmpq	%rbp,%rsp
1169	ja	.Lpwr_page_walk
1170.Lpwr_page_walk_done:
1171
1172	movq	%r9,%r10
1173	negq	%r9
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184	movq	%r8,32(%rsp)	# n0 at a fixed slot for the internal routines
1185	movq	%rax,40(%rsp)	# caller's rsp for the epilogue
1186.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
# ^ hand-encoded DWARF: CFA = *(rsp+40) + 8
1187.Lpower5_body:
# Stash pointer args in xmm registers so they survive the internal calls:
# movq %rdi,%xmm1 / %rcx,%xmm2 / %r10,%xmm3 / %rdx,%xmm4 (hand-encoded).
1188.byte	102,72,15,110,207
1189.byte	102,72,15,110,209
1190.byte	102,73,15,110,218
1191.byte	102,72,15,110,226
1192
# Five modular squarings: each __bn_sqr8x_internal is followed by the
# __bn_post4x_internal reduction fix-up.
1193	call	__bn_sqr8x_internal
1194	call	__bn_post4x_internal
1195	call	__bn_sqr8x_internal
1196	call	__bn_post4x_internal
1197	call	__bn_sqr8x_internal
1198	call	__bn_post4x_internal
1199	call	__bn_sqr8x_internal
1200	call	__bn_post4x_internal
1201	call	__bn_sqr8x_internal
1202	call	__bn_post4x_internal
1203
# Recover np (xmm2->rcx) and the table pointer (xmm4->rdx), hand-encoded movq.
1204.byte	102,72,15,126,209
1205.byte	102,72,15,126,226
1206	movq	%rsi,%rdi	# multiply in place: rp = ap buffer
1207	movq	40(%rsp),%rax	# caller's rsp, so mul4x_internal finds "pwr"
1208	leaq	32(%rsp),%r8	# &n0 slot saved above
1209
1210	call	mul4x_internal
1211
1212	movq	40(%rsp),%rsi
1213.cfi_def_cfa	%rsi,8
1214	movq	$1,%rax	# return 1 (success)
1215	movq	-48(%rsi),%r15
1216.cfi_restore	%r15
1217	movq	-40(%rsi),%r14
1218.cfi_restore	%r14
1219	movq	-32(%rsi),%r13
1220.cfi_restore	%r13
1221	movq	-24(%rsi),%r12
1222.cfi_restore	%r12
1223	movq	-16(%rsi),%rbp
1224.cfi_restore	%rbp
1225	movq	-8(%rsi),%rbx
1226.cfi_restore	%rbx
1227	leaq	(%rsi),%rsp
1228.cfi_def_cfa_register	%rsp
1229.Lpower5_epilogue:
1230	.byte	0xf3,0xc3	# rep ret
1231.cfi_endproc
1232.size	bn_power5,.-bn_power5
1233
1234.globl	bn_sqr8x_internal
1235.hidden bn_sqr8x_internal
1236.hidden	bn_sqr8x_internal
1237.type	bn_sqr8x_internal,@function
1238.align	32
1239bn_sqr8x_internal:
1240__bn_sqr8x_internal:
1241.cfi_startproc
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315	leaq	32(%r10),%rbp
1316	leaq	(%rsi,%r9,1),%rsi
1317
1318	movq	%r9,%rcx
1319
1320
1321	movq	-32(%rsi,%rbp,1),%r14
1322	leaq	48+8(%rsp,%r9,2),%rdi
1323	movq	-24(%rsi,%rbp,1),%rax
1324	leaq	-32(%rdi,%rbp,1),%rdi
1325	movq	-16(%rsi,%rbp,1),%rbx
1326	movq	%rax,%r15
1327
1328	mulq	%r14
1329	movq	%rax,%r10
1330	movq	%rbx,%rax
1331	movq	%rdx,%r11
1332	movq	%r10,-24(%rdi,%rbp,1)
1333
1334	mulq	%r14
1335	addq	%rax,%r11
1336	movq	%rbx,%rax
1337	adcq	$0,%rdx
1338	movq	%r11,-16(%rdi,%rbp,1)
1339	movq	%rdx,%r10
1340
1341
1342	movq	-8(%rsi,%rbp,1),%rbx
1343	mulq	%r15
1344	movq	%rax,%r12
1345	movq	%rbx,%rax
1346	movq	%rdx,%r13
1347
1348	leaq	(%rbp),%rcx
1349	mulq	%r14
1350	addq	%rax,%r10
1351	movq	%rbx,%rax
1352	movq	%rdx,%r11
1353	adcq	$0,%r11
1354	addq	%r12,%r10
1355	adcq	$0,%r11
1356	movq	%r10,-8(%rdi,%rcx,1)
1357	jmp	.Lsqr4x_1st
1358
1359.align	32
1360.Lsqr4x_1st:
1361	movq	(%rsi,%rcx,1),%rbx
1362	mulq	%r15
1363	addq	%rax,%r13
1364	movq	%rbx,%rax
1365	movq	%rdx,%r12
1366	adcq	$0,%r12
1367
1368	mulq	%r14
1369	addq	%rax,%r11
1370	movq	%rbx,%rax
1371	movq	8(%rsi,%rcx,1),%rbx
1372	movq	%rdx,%r10
1373	adcq	$0,%r10
1374	addq	%r13,%r11
1375	adcq	$0,%r10
1376
1377
1378	mulq	%r15
1379	addq	%rax,%r12
1380	movq	%rbx,%rax
1381	movq	%r11,(%rdi,%rcx,1)
1382	movq	%rdx,%r13
1383	adcq	$0,%r13
1384
1385	mulq	%r14
1386	addq	%rax,%r10
1387	movq	%rbx,%rax
1388	movq	16(%rsi,%rcx,1),%rbx
1389	movq	%rdx,%r11
1390	adcq	$0,%r11
1391	addq	%r12,%r10
1392	adcq	$0,%r11
1393
1394	mulq	%r15
1395	addq	%rax,%r13
1396	movq	%rbx,%rax
1397	movq	%r10,8(%rdi,%rcx,1)
1398	movq	%rdx,%r12
1399	adcq	$0,%r12
1400
1401	mulq	%r14
1402	addq	%rax,%r11
1403	movq	%rbx,%rax
1404	movq	24(%rsi,%rcx,1),%rbx
1405	movq	%rdx,%r10
1406	adcq	$0,%r10
1407	addq	%r13,%r11
1408	adcq	$0,%r10
1409
1410
1411	mulq	%r15
1412	addq	%rax,%r12
1413	movq	%rbx,%rax
1414	movq	%r11,16(%rdi,%rcx,1)
1415	movq	%rdx,%r13
1416	adcq	$0,%r13
1417	leaq	32(%rcx),%rcx
1418
1419	mulq	%r14
1420	addq	%rax,%r10
1421	movq	%rbx,%rax
1422	movq	%rdx,%r11
1423	adcq	$0,%r11
1424	addq	%r12,%r10
1425	adcq	$0,%r11
1426	movq	%r10,-8(%rdi,%rcx,1)
1427
1428	cmpq	$0,%rcx
1429	jne	.Lsqr4x_1st
1430
1431	mulq	%r15
1432	addq	%rax,%r13
1433	leaq	16(%rbp),%rbp
1434	adcq	$0,%rdx
1435	addq	%r11,%r13
1436	adcq	$0,%rdx
1437
1438	movq	%r13,(%rdi)
1439	movq	%rdx,%r12
1440	movq	%rdx,8(%rdi)
1441	jmp	.Lsqr4x_outer
1442
1443.align	32
1444.Lsqr4x_outer:
1445	movq	-32(%rsi,%rbp,1),%r14
1446	leaq	48+8(%rsp,%r9,2),%rdi
1447	movq	-24(%rsi,%rbp,1),%rax
1448	leaq	-32(%rdi,%rbp,1),%rdi
1449	movq	-16(%rsi,%rbp,1),%rbx
1450	movq	%rax,%r15
1451
1452	mulq	%r14
1453	movq	-24(%rdi,%rbp,1),%r10
1454	addq	%rax,%r10
1455	movq	%rbx,%rax
1456	adcq	$0,%rdx
1457	movq	%r10,-24(%rdi,%rbp,1)
1458	movq	%rdx,%r11
1459
1460	mulq	%r14
1461	addq	%rax,%r11
1462	movq	%rbx,%rax
1463	adcq	$0,%rdx
1464	addq	-16(%rdi,%rbp,1),%r11
1465	movq	%rdx,%r10
1466	adcq	$0,%r10
1467	movq	%r11,-16(%rdi,%rbp,1)
1468
1469	xorq	%r12,%r12
1470
1471	movq	-8(%rsi,%rbp,1),%rbx
1472	mulq	%r15
1473	addq	%rax,%r12
1474	movq	%rbx,%rax
1475	adcq	$0,%rdx
1476	addq	-8(%rdi,%rbp,1),%r12
1477	movq	%rdx,%r13
1478	adcq	$0,%r13
1479
1480	mulq	%r14
1481	addq	%rax,%r10
1482	movq	%rbx,%rax
1483	adcq	$0,%rdx
1484	addq	%r12,%r10
1485	movq	%rdx,%r11
1486	adcq	$0,%r11
1487	movq	%r10,-8(%rdi,%rbp,1)
1488
1489	leaq	(%rbp),%rcx
1490	jmp	.Lsqr4x_inner
1491
1492.align	32
1493.Lsqr4x_inner:
1494	movq	(%rsi,%rcx,1),%rbx
1495	mulq	%r15
1496	addq	%rax,%r13
1497	movq	%rbx,%rax
1498	movq	%rdx,%r12
1499	adcq	$0,%r12
1500	addq	(%rdi,%rcx,1),%r13
1501	adcq	$0,%r12
1502
1503.byte	0x67
1504	mulq	%r14
1505	addq	%rax,%r11
1506	movq	%rbx,%rax
1507	movq	8(%rsi,%rcx,1),%rbx
1508	movq	%rdx,%r10
1509	adcq	$0,%r10
1510	addq	%r13,%r11
1511	adcq	$0,%r10
1512
1513	mulq	%r15
1514	addq	%rax,%r12
1515	movq	%r11,(%rdi,%rcx,1)
1516	movq	%rbx,%rax
1517	movq	%rdx,%r13
1518	adcq	$0,%r13
1519	addq	8(%rdi,%rcx,1),%r12
1520	leaq	16(%rcx),%rcx
1521	adcq	$0,%r13
1522
1523	mulq	%r14
1524	addq	%rax,%r10
1525	movq	%rbx,%rax
1526	adcq	$0,%rdx
1527	addq	%r12,%r10
1528	movq	%rdx,%r11
1529	adcq	$0,%r11
1530	movq	%r10,-8(%rdi,%rcx,1)
1531
1532	cmpq	$0,%rcx
1533	jne	.Lsqr4x_inner
1534
1535.byte	0x67
1536	mulq	%r15
1537	addq	%rax,%r13
1538	adcq	$0,%rdx
1539	addq	%r11,%r13
1540	adcq	$0,%rdx
1541
1542	movq	%r13,(%rdi)
1543	movq	%rdx,%r12
1544	movq	%rdx,8(%rdi)
1545
1546	addq	$16,%rbp
1547	jnz	.Lsqr4x_outer
1548
1549
1550	movq	-32(%rsi),%r14
1551	leaq	48+8(%rsp,%r9,2),%rdi
1552	movq	-24(%rsi),%rax
1553	leaq	-32(%rdi,%rbp,1),%rdi
1554	movq	-16(%rsi),%rbx
1555	movq	%rax,%r15
1556
1557	mulq	%r14
1558	addq	%rax,%r10
1559	movq	%rbx,%rax
1560	movq	%rdx,%r11
1561	adcq	$0,%r11
1562
1563	mulq	%r14
1564	addq	%rax,%r11
1565	movq	%rbx,%rax
1566	movq	%r10,-24(%rdi)
1567	movq	%rdx,%r10
1568	adcq	$0,%r10
1569	addq	%r13,%r11
1570	movq	-8(%rsi),%rbx
1571	adcq	$0,%r10
1572
1573	mulq	%r15
1574	addq	%rax,%r12
1575	movq	%rbx,%rax
1576	movq	%r11,-16(%rdi)
1577	movq	%rdx,%r13
1578	adcq	$0,%r13
1579
1580	mulq	%r14
1581	addq	%rax,%r10
1582	movq	%rbx,%rax
1583	movq	%rdx,%r11
1584	adcq	$0,%r11
1585	addq	%r12,%r10
1586	adcq	$0,%r11
1587	movq	%r10,-8(%rdi)
1588
1589	mulq	%r15
1590	addq	%rax,%r13
1591	movq	-16(%rsi),%rax
1592	adcq	$0,%rdx
1593	addq	%r11,%r13
1594	adcq	$0,%rdx
1595
1596	movq	%r13,(%rdi)
1597	movq	%rdx,%r12
1598	movq	%rdx,8(%rdi)
1599
1600	mulq	%rbx
1601	addq	$16,%rbp
1602	xorq	%r14,%r14
1603	subq	%r9,%rbp
1604	xorq	%r15,%r15
1605
1606	addq	%r12,%rax
1607	adcq	$0,%rdx
1608	movq	%rax,8(%rdi)
1609	movq	%rdx,16(%rdi)
1610	movq	%r15,24(%rdi)
1611
1612	movq	-16(%rsi,%rbp,1),%rax
1613	leaq	48+8(%rsp),%rdi
1614	xorq	%r10,%r10
1615	movq	8(%rdi),%r11
1616
1617	leaq	(%r14,%r10,2),%r12
1618	shrq	$63,%r10
1619	leaq	(%rcx,%r11,2),%r13
1620	shrq	$63,%r11
1621	orq	%r10,%r13
1622	movq	16(%rdi),%r10
1623	movq	%r11,%r14
1624	mulq	%rax
1625	negq	%r15
1626	movq	24(%rdi),%r11
1627	adcq	%rax,%r12
1628	movq	-8(%rsi,%rbp,1),%rax
1629	movq	%r12,(%rdi)
1630	adcq	%rdx,%r13
1631
1632	leaq	(%r14,%r10,2),%rbx
1633	movq	%r13,8(%rdi)
1634	sbbq	%r15,%r15
1635	shrq	$63,%r10
1636	leaq	(%rcx,%r11,2),%r8
1637	shrq	$63,%r11
1638	orq	%r10,%r8
1639	movq	32(%rdi),%r10
1640	movq	%r11,%r14
1641	mulq	%rax
1642	negq	%r15
1643	movq	40(%rdi),%r11
1644	adcq	%rax,%rbx
1645	movq	0(%rsi,%rbp,1),%rax
1646	movq	%rbx,16(%rdi)
1647	adcq	%rdx,%r8
1648	leaq	16(%rbp),%rbp
1649	movq	%r8,24(%rdi)
1650	sbbq	%r15,%r15
1651	leaq	64(%rdi),%rdi
1652	jmp	.Lsqr4x_shift_n_add
1653
1654.align	32
1655.Lsqr4x_shift_n_add:
1656	leaq	(%r14,%r10,2),%r12
1657	shrq	$63,%r10
1658	leaq	(%rcx,%r11,2),%r13
1659	shrq	$63,%r11
1660	orq	%r10,%r13
1661	movq	-16(%rdi),%r10
1662	movq	%r11,%r14
1663	mulq	%rax
1664	negq	%r15
1665	movq	-8(%rdi),%r11
1666	adcq	%rax,%r12
1667	movq	-8(%rsi,%rbp,1),%rax
1668	movq	%r12,-32(%rdi)
1669	adcq	%rdx,%r13
1670
1671	leaq	(%r14,%r10,2),%rbx
1672	movq	%r13,-24(%rdi)
1673	sbbq	%r15,%r15
1674	shrq	$63,%r10
1675	leaq	(%rcx,%r11,2),%r8
1676	shrq	$63,%r11
1677	orq	%r10,%r8
1678	movq	0(%rdi),%r10
1679	movq	%r11,%r14
1680	mulq	%rax
1681	negq	%r15
1682	movq	8(%rdi),%r11
1683	adcq	%rax,%rbx
1684	movq	0(%rsi,%rbp,1),%rax
1685	movq	%rbx,-16(%rdi)
1686	adcq	%rdx,%r8
1687
1688	leaq	(%r14,%r10,2),%r12
1689	movq	%r8,-8(%rdi)
1690	sbbq	%r15,%r15
1691	shrq	$63,%r10
1692	leaq	(%rcx,%r11,2),%r13
1693	shrq	$63,%r11
1694	orq	%r10,%r13
1695	movq	16(%rdi),%r10
1696	movq	%r11,%r14
1697	mulq	%rax
1698	negq	%r15
1699	movq	24(%rdi),%r11
1700	adcq	%rax,%r12
1701	movq	8(%rsi,%rbp,1),%rax
1702	movq	%r12,0(%rdi)
1703	adcq	%rdx,%r13
1704
1705	leaq	(%r14,%r10,2),%rbx
1706	movq	%r13,8(%rdi)
1707	sbbq	%r15,%r15
1708	shrq	$63,%r10
1709	leaq	(%rcx,%r11,2),%r8
1710	shrq	$63,%r11
1711	orq	%r10,%r8
1712	movq	32(%rdi),%r10
1713	movq	%r11,%r14
1714	mulq	%rax
1715	negq	%r15
1716	movq	40(%rdi),%r11
1717	adcq	%rax,%rbx
1718	movq	16(%rsi,%rbp,1),%rax
1719	movq	%rbx,16(%rdi)
1720	adcq	%rdx,%r8
1721	movq	%r8,24(%rdi)
1722	sbbq	%r15,%r15
1723	leaq	64(%rdi),%rdi
1724	addq	$32,%rbp
1725	jnz	.Lsqr4x_shift_n_add
1726
1727	leaq	(%r14,%r10,2),%r12
1728.byte	0x67
1729	shrq	$63,%r10
1730	leaq	(%rcx,%r11,2),%r13
1731	shrq	$63,%r11
1732	orq	%r10,%r13
1733	movq	-16(%rdi),%r10
1734	movq	%r11,%r14
1735	mulq	%rax
1736	negq	%r15
1737	movq	-8(%rdi),%r11
1738	adcq	%rax,%r12
1739	movq	-8(%rsi),%rax
1740	movq	%r12,-32(%rdi)
1741	adcq	%rdx,%r13
1742
1743	leaq	(%r14,%r10,2),%rbx
1744	movq	%r13,-24(%rdi)
1745	sbbq	%r15,%r15
1746	shrq	$63,%r10
1747	leaq	(%rcx,%r11,2),%r8
1748	shrq	$63,%r11
1749	orq	%r10,%r8
1750	mulq	%rax
1751	negq	%r15
1752	adcq	%rax,%rbx
1753	adcq	%rdx,%r8
1754	movq	%rbx,-16(%rdi)
1755	movq	%r8,-8(%rdi)
1756.byte	102,72,15,126,213
1757__bn_sqr8x_reduction:
1758	xorq	%rax,%rax
1759	leaq	(%r9,%rbp,1),%rcx
1760	leaq	48+8(%rsp,%r9,2),%rdx
1761	movq	%rcx,0+8(%rsp)
1762	leaq	48+8(%rsp,%r9,1),%rdi
1763	movq	%rdx,8+8(%rsp)
1764	negq	%r9
1765	jmp	.L8x_reduction_loop
1766
1767.align	32
1768.L8x_reduction_loop:
1769	leaq	(%rdi,%r9,1),%rdi
1770.byte	0x66
1771	movq	0(%rdi),%rbx
1772	movq	8(%rdi),%r9
1773	movq	16(%rdi),%r10
1774	movq	24(%rdi),%r11
1775	movq	32(%rdi),%r12
1776	movq	40(%rdi),%r13
1777	movq	48(%rdi),%r14
1778	movq	56(%rdi),%r15
1779	movq	%rax,(%rdx)
1780	leaq	64(%rdi),%rdi
1781
1782.byte	0x67
1783	movq	%rbx,%r8
1784	imulq	32+8(%rsp),%rbx
1785	movq	0(%rbp),%rax
1786	movl	$8,%ecx
1787	jmp	.L8x_reduce
1788
1789.align	32
1790.L8x_reduce:
1791	mulq	%rbx
1792	movq	8(%rbp),%rax
1793	negq	%r8
1794	movq	%rdx,%r8
1795	adcq	$0,%r8
1796
1797	mulq	%rbx
1798	addq	%rax,%r9
1799	movq	16(%rbp),%rax
1800	adcq	$0,%rdx
1801	addq	%r9,%r8
1802	movq	%rbx,48-8+8(%rsp,%rcx,8)
1803	movq	%rdx,%r9
1804	adcq	$0,%r9
1805
1806	mulq	%rbx
1807	addq	%rax,%r10
1808	movq	24(%rbp),%rax
1809	adcq	$0,%rdx
1810	addq	%r10,%r9
1811	movq	32+8(%rsp),%rsi
1812	movq	%rdx,%r10
1813	adcq	$0,%r10
1814
1815	mulq	%rbx
1816	addq	%rax,%r11
1817	movq	32(%rbp),%rax
1818	adcq	$0,%rdx
1819	imulq	%r8,%rsi
1820	addq	%r11,%r10
1821	movq	%rdx,%r11
1822	adcq	$0,%r11
1823
1824	mulq	%rbx
1825	addq	%rax,%r12
1826	movq	40(%rbp),%rax
1827	adcq	$0,%rdx
1828	addq	%r12,%r11
1829	movq	%rdx,%r12
1830	adcq	$0,%r12
1831
1832	mulq	%rbx
1833	addq	%rax,%r13
1834	movq	48(%rbp),%rax
1835	adcq	$0,%rdx
1836	addq	%r13,%r12
1837	movq	%rdx,%r13
1838	adcq	$0,%r13
1839
1840	mulq	%rbx
1841	addq	%rax,%r14
1842	movq	56(%rbp),%rax
1843	adcq	$0,%rdx
1844	addq	%r14,%r13
1845	movq	%rdx,%r14
1846	adcq	$0,%r14
1847
1848	mulq	%rbx
1849	movq	%rsi,%rbx
1850	addq	%rax,%r15
1851	movq	0(%rbp),%rax
1852	adcq	$0,%rdx
1853	addq	%r15,%r14
1854	movq	%rdx,%r15
1855	adcq	$0,%r15
1856
1857	decl	%ecx
1858	jnz	.L8x_reduce
1859
1860	leaq	64(%rbp),%rbp
1861	xorq	%rax,%rax
1862	movq	8+8(%rsp),%rdx
1863	cmpq	0+8(%rsp),%rbp
1864	jae	.L8x_no_tail
1865
1866.byte	0x66
1867	addq	0(%rdi),%r8
1868	adcq	8(%rdi),%r9
1869	adcq	16(%rdi),%r10
1870	adcq	24(%rdi),%r11
1871	adcq	32(%rdi),%r12
1872	adcq	40(%rdi),%r13
1873	adcq	48(%rdi),%r14
1874	adcq	56(%rdi),%r15
1875	sbbq	%rsi,%rsi
1876
1877	movq	48+56+8(%rsp),%rbx
1878	movl	$8,%ecx
1879	movq	0(%rbp),%rax
1880	jmp	.L8x_tail
1881
1882.align	32
1883.L8x_tail:
1884	mulq	%rbx
1885	addq	%rax,%r8
1886	movq	8(%rbp),%rax
1887	movq	%r8,(%rdi)
1888	movq	%rdx,%r8
1889	adcq	$0,%r8
1890
1891	mulq	%rbx
1892	addq	%rax,%r9
1893	movq	16(%rbp),%rax
1894	adcq	$0,%rdx
1895	addq	%r9,%r8
1896	leaq	8(%rdi),%rdi
1897	movq	%rdx,%r9
1898	adcq	$0,%r9
1899
1900	mulq	%rbx
1901	addq	%rax,%r10
1902	movq	24(%rbp),%rax
1903	adcq	$0,%rdx
1904	addq	%r10,%r9
1905	movq	%rdx,%r10
1906	adcq	$0,%r10
1907
1908	mulq	%rbx
1909	addq	%rax,%r11
1910	movq	32(%rbp),%rax
1911	adcq	$0,%rdx
1912	addq	%r11,%r10
1913	movq	%rdx,%r11
1914	adcq	$0,%r11
1915
1916	mulq	%rbx
1917	addq	%rax,%r12
1918	movq	40(%rbp),%rax
1919	adcq	$0,%rdx
1920	addq	%r12,%r11
1921	movq	%rdx,%r12
1922	adcq	$0,%r12
1923
1924	mulq	%rbx
1925	addq	%rax,%r13
1926	movq	48(%rbp),%rax
1927	adcq	$0,%rdx
1928	addq	%r13,%r12
1929	movq	%rdx,%r13
1930	adcq	$0,%r13
1931
1932	mulq	%rbx
1933	addq	%rax,%r14
1934	movq	56(%rbp),%rax
1935	adcq	$0,%rdx
1936	addq	%r14,%r13
1937	movq	%rdx,%r14
1938	adcq	$0,%r14
1939
1940	mulq	%rbx
1941	movq	48-16+8(%rsp,%rcx,8),%rbx
1942	addq	%rax,%r15
1943	adcq	$0,%rdx
1944	addq	%r15,%r14
1945	movq	0(%rbp),%rax
1946	movq	%rdx,%r15
1947	adcq	$0,%r15
1948
1949	decl	%ecx
1950	jnz	.L8x_tail
1951
1952	leaq	64(%rbp),%rbp
1953	movq	8+8(%rsp),%rdx
1954	cmpq	0+8(%rsp),%rbp
1955	jae	.L8x_tail_done
1956
1957	movq	48+56+8(%rsp),%rbx
1958	negq	%rsi
1959	movq	0(%rbp),%rax
1960	adcq	0(%rdi),%r8
1961	adcq	8(%rdi),%r9
1962	adcq	16(%rdi),%r10
1963	adcq	24(%rdi),%r11
1964	adcq	32(%rdi),%r12
1965	adcq	40(%rdi),%r13
1966	adcq	48(%rdi),%r14
1967	adcq	56(%rdi),%r15
1968	sbbq	%rsi,%rsi
1969
1970	movl	$8,%ecx
1971	jmp	.L8x_tail
1972
1973.align	32
1974.L8x_tail_done:
1975	xorq	%rax,%rax
1976	addq	(%rdx),%r8
1977	adcq	$0,%r9
1978	adcq	$0,%r10
1979	adcq	$0,%r11
1980	adcq	$0,%r12
1981	adcq	$0,%r13
1982	adcq	$0,%r14
1983	adcq	$0,%r15
1984	adcq	$0,%rax
1985
1986	negq	%rsi
1987.L8x_no_tail:
1988	adcq	0(%rdi),%r8
1989	adcq	8(%rdi),%r9
1990	adcq	16(%rdi),%r10
1991	adcq	24(%rdi),%r11
1992	adcq	32(%rdi),%r12
1993	adcq	40(%rdi),%r13
1994	adcq	48(%rdi),%r14
1995	adcq	56(%rdi),%r15
1996	adcq	$0,%rax
1997	movq	-8(%rbp),%rcx
1998	xorq	%rsi,%rsi
1999
2000.byte	102,72,15,126,213
2001
2002	movq	%r8,0(%rdi)
2003	movq	%r9,8(%rdi)
2004.byte	102,73,15,126,217
2005	movq	%r10,16(%rdi)
2006	movq	%r11,24(%rdi)
2007	movq	%r12,32(%rdi)
2008	movq	%r13,40(%rdi)
2009	movq	%r14,48(%rdi)
2010	movq	%r15,56(%rdi)
2011	leaq	64(%rdi),%rdi
2012
2013	cmpq	%rdx,%rdi
2014	jb	.L8x_reduction_loop
2015	.byte	0xf3,0xc3
2016.cfi_endproc
2017.size	bn_sqr8x_internal,.-bn_sqr8x_internal
.type	__bn_post4x_internal,@function
.align	32
# __bn_post4x_internal
#
# Final conditional subtraction for the 4x Montgomery code paths: computes
# result -= modulus under a data-independent mask, 4 limbs (32 bytes) per
# iteration.  The modulus limbs are complemented and AND-ed with the mask in
# %rax (0 or all-ones), then added with carry -- i.e. the subtraction is
# always executed with the same memory/instruction pattern regardless of
# whether it is needed, which keeps the routine constant-time.
#
# In (inferred from this block; confirm against x86_64-mont5.pl callers):
#   %rbp = modulus pointer, %rdi = scratch/top pointer, %r9 = size in bytes,
#   %rax = 0/1 subtract flag (negated below into a 0/-1 mask),
#   %xmm1 = saved output pointer (restored via the .byte movq sequences).
# Clobbers: %rbx, %rcx, %r10, %r12-%r15, flags.
__bn_post4x_internal:
.cfi_startproc
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
.byte	102,72,15,126,207		# movq %xmm1,%rdi (restore saved pointer)
	negq	%rax			# 0/1 -> 0/-1 all-ones mask
.byte	102,72,15,126,206		# movq %xmm1,%rsi (restore saved pointer)
	sarq	$3+2,%rcx		# byte count -> count of 4-limb groups
	decq	%r12			# so NOT below yields -n[0], not ~n[0]
	xorq	%r10,%r10		# clear the inter-group borrow
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12		# mask: either -n[i]/~n[i] or 0
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10			# reload CF from saved borrow in %r10
	adcq	0(%rbx),%r12		# t[i] + (~n[i] & mask) + carry == t - n
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10		# save borrow for the next group
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx			# %rcx counts up from a negative value
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9			# restore %r9's sign for the caller
	.byte	0xf3,0xc3		# rep ret (AMD branch-predictor friendly)
.cfi_endproc
.size	__bn_post4x_internal,.-__bn_post4x_internal
.type	bn_mulx4x_mont_gather5,@function
.align	32
# bn_mulx4x_mont_gather5
#
# BMI2/ADX (mulx/adcx/adox) entry point for Montgomery multiplication with a
# power-table gather.  This wrapper only sets up the frame and scratch stack;
# all arithmetic is done by mulx4x_internal.
#
# SysV args (bn_mul_mont_gather5 convention -- confirm against the C
# prototype): %rdi = rp, %rsi = ap, %rdx = table of b powers, %rcx = np,
# %r8 = &n0, %r9 = number of limbs.  Returns 1 in %rax.
bn_mulx4x_mont_gather5:
.cfi_startproc
	movq	%rsp,%rax		# %rax = original %rsp, saved for unwind
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx			# save all callee-saved registers
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

	shll	$3,%r9d			# limb count -> byte count
	leaq	(%r9,%r9,2),%r10	# %r10 = 3*num bytes
	negq	%r9
	movq	(%r8),%r8		# load n0 value

	# Carve out the scratch frame below %rsp.  The frame start is biased
	# by the output pointer (%rdi) modulo the 4KB page size so that the
	# scratch area and the output do not map to the same cache set/page
	# offset (mitigates cache-bank/alias effects).

	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmulx4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	leaq	4096-320(,%r9,2),%r10	# alternative: frame doesn't fit, cap bias
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmulx4xsp_done:
	andq	$-64,%rbp		# 64-byte align the new stack
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		# probe the page we landed on
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.Lmulx4x_page_walk:
	# Touch every 4KB page down to the frame bottom so the OS guard page
	# is hit in order (large frames must not skip the guard page).
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	# Frame layout used by mulx4x_internal (offsets from %rsp):
	#   32(%rsp) = n0, 40(%rsp) = original %rsp (for epilogue/unwind).

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmulx4x_body:
	call	mulx4x_internal

	movq	40(%rsp),%rsi		# recover original stack pointer
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			# return value: success

	movq	-48(%rsi),%r15		# restore callee-saved registers
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2184
.type	mulx4x_internal,@function
.align	32
# mulx4x_internal
#
# Core of the BMI2/ADX Montgomery multiply.  Uses mulx (flag-free multiply)
# plus the two independent carry chains of adcx (CF) and adox (OF) to run the
# a[]*b multiply and the n[]*m reduction in parallel, 4 limbs at a time.
#
# The multiplier b[i] is fetched from a 32-entry power table with a
# constant-time gather: a one-hot SSE mask is built by comparing 0..31
# against the secret index (%xmm5), every table entry is read and AND-ed
# with its mask slice, and the results are OR-ed together -- so the memory
# access pattern is independent of the index.
#
# In (inferred; confirm against x86_64-mont5.pl): %rsi = ap, %rdx = power
# table, %rcx = np, %rdi = rp, %r9 = byte length, %rax = frame pointer for
# the saved index, 32+8(%rsp) = n0.  Falls through into the shared tail
# .Lsqrx4x_sub_entry for the final conditional subtraction.
mulx4x_internal:
.cfi_startproc
	movq	%r9,8(%rsp)		# stash num (bytes)
	movq	%r9,%r10
	negq	%r9
	shlq	$5,%r9
	negq	%r10
	leaq	128(%rdx,%r9,1),%r13
	shrq	$5+5,%r9		# iterations of the outer loop
	movd	8(%rax),%xmm5		# %xmm5 = gather index (broadcast below)
	subq	$1,%r9
	leaq	.Linc(%rip),%rax
	movq	%r13,16+8(%rsp)		# end-of-table sentinel
	movq	%r9,24+8(%rsp)		# inner-loop counter reload value
	movq	%rdi,56+8(%rsp)		# saved rp for the tail
	movdqa	0(%rax),%xmm0		# %xmm0 = {0,1}, %xmm1 = {2,2} from .Linc
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r10,1),%r10
	leaq	128(%rdx),%rdi

	# Build the 1-of-32 selection mask at 112..352(%r10): compare the
	# broadcast index against the running counter {0,1},{2,3},... -- only
	# the matching lane produces all-ones.
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67
	movdqa	%xmm1,%xmm2
.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	# Constant-time gather of b[0]: read ALL 32 table entries, AND each
	# with its mask slice, OR everything together.
	pand	64(%rdi),%xmm0
	pand	80(%rdi),%xmm1
	pand	96(%rdi),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%rdi),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%rdi),%xmm4
	movdqa	-112(%rdi),%xmm5
	movdqa	-96(%rdi),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%rdi),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%rdi),%xmm4
	movdqa	-48(%rdi),%xmm5
	movdqa	-32(%rdi),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%rdi),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%rdi),%xmm4
	movdqa	16(%rdi),%xmm5
	movdqa	32(%rdi),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%rdi),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	pxor	%xmm1,%xmm0

	pshufd	$0x4e,%xmm0,%xmm1	# fold the two 64-bit halves together
	por	%xmm1,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194		# movq %xmm0,%rdx -- gathered b[0]
	leaq	64+32+8(%rsp),%rbx

	# First 4-limb column: t = a[0..3]*b[0]; compute the Montgomery
	# multiplier m = t[0]*n0 and start the n[]*m reduction chain.
	movq	%rdx,%r9
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r12
	addq	%rax,%r11
	mulxq	16(%rsi),%rax,%r13
	adcq	%rax,%r12
	adcq	$0,%r13
	mulxq	24(%rsi),%rax,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		# m = t[0] * n0
	xorq	%rbp,%rbp		# %rbp = 0: clears CF and OF for adcx/adox
	movq	%r8,%rdx

	movq	%rdi,8+8(%rsp)		# save table position across the loop

	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15		# discards t[0] (becomes zero by design)
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# back to multiplying by b[0]
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)
	jmp	.Lmulx4x_1st

.align	32
# First pass over a[]: CF chain accumulates a[]*b[0], OF chain folds in
# n[]*m.  One iteration handles 4 limbs.
.Lmulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67
	movq	%r8,%rdx		# switch multiplier to m for reduction
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# switch multiplier back to b[0]
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	8(%rsp),%rax		# num (bytes); rewind ap
	adcq	%rbp,%r15
	leaq	(%rsi,%rax,1),%rsi
	addq	%r15,%r14
	movq	8+8(%rsp),%rdi		# restore table pointer
	adcq	%rbp,%rbp		# %rbp = top carry
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
# Outer loop: gather the next b[i] (same constant-time mask technique; the
# mask block was stored relative to %rbx earlier, hence the 16-256 bias),
# then run the a[]*b[i] + t[] + n[]*m inner loop.
.Lmulx4x_outer:
	leaq	16-256(%rbx),%r10
	pxor	%xmm4,%xmm4
.byte	0x67,0x67
	pxor	%xmm5,%xmm5
	movdqa	-128(%rdi),%xmm0
	movdqa	-112(%rdi),%xmm1
	movdqa	-96(%rdi),%xmm2
	pand	256(%r10),%xmm0
	movdqa	-80(%rdi),%xmm3
	pand	272(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	288(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	304(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%rdi),%xmm0
	movdqa	-48(%rdi),%xmm1
	movdqa	-32(%rdi),%xmm2
	pand	320(%r10),%xmm0
	movdqa	-16(%rdi),%xmm3
	pand	336(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	352(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	368(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%rdi),%xmm0
	movdqa	16(%rdi),%xmm1
	movdqa	32(%rdi),%xmm2
	pand	384(%r10),%xmm0
	movdqa	48(%rdi),%xmm3
	pand	400(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	416(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	432(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%rdi),%xmm0
	movdqa	80(%rdi),%xmm1
	movdqa	96(%rdi),%xmm2
	pand	448(%r10),%xmm0
	movdqa	112(%rdi),%xmm3
	pand	464(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	480(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	496(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4

	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194		# movq %xmm0,%rdx -- gathered b[i]

	movq	%rbp,(%rbx)		# store previous top carry
	leaq	32(%rbx,%rax,1),%rbx	# rewind t[]
	mulxq	0(%rsi),%r8,%r11
	xorq	%rbp,%rbp		# zero %rbp, clear CF and OF
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8		# add previous round's t[]
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	mulxq	24(%rsi),%rdx,%r14
	adoxq	-16(%rbx),%r12
	adcxq	%rdx,%r13
	leaq	(%rcx,%rax,1),%rcx	# rewind np
	leaq	32(%rsi),%rsi
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	adoxq	%rbp,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		# m = t[0] * n0

	movq	%r8,%rdx
	xorq	%rbp,%rbp		# clear CF and OF again for reduction
	movq	%rdi,8+8(%rsp)

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15		# t[0] annihilated
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	24+8(%rsp),%rdi		# reload inner-loop counter
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-24(%rbx)
	adoxq	%rbp,%r15
	movq	%r12,-16(%rbx)
	leaq	32(%rcx),%rcx
	jmp	.Lmulx4x_inner

.align	32
# Inner loop: like .Lmulx4x_1st but also adds the previous round's t[]
# limbs (read via the CF chain from (%rbx)).
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx		# switch to reduction multiplier m
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	movq	%r11,-32(%rbx)
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# back to b[i]
	leaq	32(%rcx),%rcx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0+8(%rsp),%rax		# num; used to rewind the pointers
	adcq	%rbp,%r15
	subq	0(%rbx),%rdi		# %rdi is zero here; this just sets CF
	movq	8+8(%rsp),%rdi		# restore table pointer
	movq	16+8(%rsp),%r10		# end-of-table sentinel
	adcq	%r15,%r14
	leaq	(%rsi,%rax,1),%rsi	# rewind ap
	adcq	%rbp,%rbp		# top carry
	movq	%r14,-8(%rbx)

	cmpq	%r10,%rdi
	jb	.Lmulx4x_outer		# more b[i] powers left in the table

	# Done multiplying: set up the mask and pointers for the shared
	# conditional-subtract tail (.Lsqrx4x_sub_entry, defined elsewhere in
	# this file) which computes rp = t - n if t >= n else t.
	movq	-8(%rcx),%r10
	movq	%rbp,%r8
	movq	(%rcx,%rax,1),%r12
	leaq	(%rcx,%rax,1),%rbp	# rewind np
	movq	%rax,%rcx
	leaq	(%rbx,%rax,1),%rdi	# rewind t[]
	xorl	%eax,%eax
	xorq	%r15,%r15
	subq	%r14,%r10		# compare top limbs to decide subtract
	adcq	%r15,%r15
	orq	%r15,%r8
	sarq	$3+2,%rcx		# byte count -> 4-limb group count
	subq	%r8,%rax		# %rax = 0 or -1 subtract mask
	movq	56+8(%rsp),%rdx		# saved rp
	decq	%r12			# so NOT yields -n[0] in the tail
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry
.cfi_endproc
.size	mulx4x_internal,.-mulx4x_internal
.type	bn_powerx5,@function
.align	32
# bn_powerx5
#
# BMI2/ADX variant of bn_power5: performs five Montgomery squarings
# (__bn_sqrx8x_internal + __bn_postx4x_internal, five times) followed by one
# Montgomery multiplication by a gathered table entry (mulx4x_internal) --
# i.e. one 5-bit window step of a fixed-window modular exponentiation.
# Frame setup mirrors bn_mulx4x_mont_gather5.
bn_powerx5:
.cfi_startproc
	movq	%rsp,%rax		# save original %rsp for unwind/epilogue
.cfi_def_cfa_register	%rax
.Lpowerx5_enter:
	pushq	%rbx			# save all callee-saved registers
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lpowerx5_prologue:

	shll	$3,%r9d			# limb count -> byte count
	leaq	(%r9,%r9,2),%r10	# 3*num bytes
	negq	%r9
	movq	(%r8),%r8		# load n0 value

	# Scratch frame carve-out, biased by rp modulo the page size to avoid
	# aliasing with the output (same scheme as bn_mulx4x_mont_gather5).

	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwrx_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	leaq	4096-320(,%r9,2),%r10	# frame doesn't fit; cap the bias
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwrx_sp_done:
	andq	$-64,%rbp		# 64-byte align the new stack
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		# probe the landing page
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
	jmp	.Lpwrx_page_walk_done

.Lpwrx_page_walk:
	# Touch each 4KB page downward so the guard page is hit in order.
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
.Lpwrx_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	# Stash arguments in xmm registers so the internal routines (which
	# clobber the argument registers) can recover them:
	#   %xmm1 = rp, %xmm2 = np, %xmm3 = table ptr, %xmm4 = original table.

	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		# movq %rdi,%xmm1
.byte	102,72,15,110,209		# movq %rcx,%xmm2
.byte	102,73,15,110,218		# movq %r10,%xmm3
.byte	102,72,15,110,226		# movq %rdx,%xmm4
	movq	%r8,32(%rsp)		# frame: n0
	movq	%rax,40(%rsp)		# frame: original %rsp
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpowerx5_body:

	call	__bn_sqrx8x_internal	# five squarings, each followed by the
	call	__bn_postx4x_internal	# conditional final subtraction
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

	movq	%r10,%r9
	movq	%rsi,%rdi
.byte	102,72,15,126,209		# movq %xmm2,%rcx (restore np)
.byte	102,72,15,126,226		# movq %xmm4,%rdx (restore table ptr)
	movq	40(%rsp),%rax

	call	mulx4x_internal		# final multiply by the window entry

	movq	40(%rsp),%rsi		# recover original stack pointer
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			# return value: success

	movq	-48(%rsi),%r15		# restore callee-saved registers
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpowerx5_epilogue:
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	bn_powerx5,.-bn_powerx5
2745
2746.globl	bn_sqrx8x_internal
2747.hidden bn_sqrx8x_internal
2748.hidden	bn_sqrx8x_internal
2749.type	bn_sqrx8x_internal,@function
2750.align	32
2751bn_sqrx8x_internal:
2752__bn_sqrx8x_internal:
2753.cfi_startproc
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794	leaq	48+8(%rsp),%rdi
2795	leaq	(%rsi,%r9,1),%rbp
2796	movq	%r9,0+8(%rsp)
2797	movq	%rbp,8+8(%rsp)
2798	jmp	.Lsqr8x_zero_start
2799
2800.align	32
2801.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2802.Lsqrx8x_zero:
2803.byte	0x3e
2804	movdqa	%xmm0,0(%rdi)
2805	movdqa	%xmm0,16(%rdi)
2806	movdqa	%xmm0,32(%rdi)
2807	movdqa	%xmm0,48(%rdi)
2808.Lsqr8x_zero_start:
2809	movdqa	%xmm0,64(%rdi)
2810	movdqa	%xmm0,80(%rdi)
2811	movdqa	%xmm0,96(%rdi)
2812	movdqa	%xmm0,112(%rdi)
2813	leaq	128(%rdi),%rdi
2814	subq	$64,%r9
2815	jnz	.Lsqrx8x_zero
2816
2817	movq	0(%rsi),%rdx
2818
2819	xorq	%r10,%r10
2820	xorq	%r11,%r11
2821	xorq	%r12,%r12
2822	xorq	%r13,%r13
2823	xorq	%r14,%r14
2824	xorq	%r15,%r15
2825	leaq	48+8(%rsp),%rdi
2826	xorq	%rbp,%rbp
2827	jmp	.Lsqrx8x_outer_loop
2828
2829.align	32
2830.Lsqrx8x_outer_loop:
2831	mulxq	8(%rsi),%r8,%rax
2832	adcxq	%r9,%r8
2833	adoxq	%rax,%r10
2834	mulxq	16(%rsi),%r9,%rax
2835	adcxq	%r10,%r9
2836	adoxq	%rax,%r11
2837.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2838	adcxq	%r11,%r10
2839	adoxq	%rax,%r12
2840.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2841	adcxq	%r12,%r11
2842	adoxq	%rax,%r13
2843	mulxq	40(%rsi),%r12,%rax
2844	adcxq	%r13,%r12
2845	adoxq	%rax,%r14
2846	mulxq	48(%rsi),%r13,%rax
2847	adcxq	%r14,%r13
2848	adoxq	%r15,%rax
2849	mulxq	56(%rsi),%r14,%r15
2850	movq	8(%rsi),%rdx
2851	adcxq	%rax,%r14
2852	adoxq	%rbp,%r15
2853	adcq	64(%rdi),%r15
2854	movq	%r8,8(%rdi)
2855	movq	%r9,16(%rdi)
2856	sbbq	%rcx,%rcx
2857	xorq	%rbp,%rbp
2858
2859
2860	mulxq	16(%rsi),%r8,%rbx
2861	mulxq	24(%rsi),%r9,%rax
2862	adcxq	%r10,%r8
2863	adoxq	%rbx,%r9
2864	mulxq	32(%rsi),%r10,%rbx
2865	adcxq	%r11,%r9
2866	adoxq	%rax,%r10
2867.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
2868	adcxq	%r12,%r10
2869	adoxq	%rbx,%r11
2870.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
2871	adcxq	%r13,%r11
2872	adoxq	%r14,%r12
2873.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
2874	movq	16(%rsi),%rdx
2875	adcxq	%rax,%r12
2876	adoxq	%rbx,%r13
2877	adcxq	%r15,%r13
2878	adoxq	%rbp,%r14
2879	adcxq	%rbp,%r14
2880
2881	movq	%r8,24(%rdi)
2882	movq	%r9,32(%rdi)
2883
2884	mulxq	24(%rsi),%r8,%rbx
2885	mulxq	32(%rsi),%r9,%rax
2886	adcxq	%r10,%r8
2887	adoxq	%rbx,%r9
2888	mulxq	40(%rsi),%r10,%rbx
2889	adcxq	%r11,%r9
2890	adoxq	%rax,%r10
2891.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
2892	adcxq	%r12,%r10
2893	adoxq	%r13,%r11
2894.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
2895.byte	0x3e
2896	movq	24(%rsi),%rdx
2897	adcxq	%rbx,%r11
2898	adoxq	%rax,%r12
2899	adcxq	%r14,%r12
2900	movq	%r8,40(%rdi)
2901	movq	%r9,48(%rdi)
2902	mulxq	32(%rsi),%r8,%rax
2903	adoxq	%rbp,%r13
2904	adcxq	%rbp,%r13
2905
2906	mulxq	40(%rsi),%r9,%rbx
2907	adcxq	%r10,%r8
2908	adoxq	%rax,%r9
2909	mulxq	48(%rsi),%r10,%rax
2910	adcxq	%r11,%r9
2911	adoxq	%r12,%r10
2912	mulxq	56(%rsi),%r11,%r12
2913	movq	32(%rsi),%rdx
2914	movq	40(%rsi),%r14
2915	adcxq	%rbx,%r10
2916	adoxq	%rax,%r11
2917	movq	48(%rsi),%r15
2918	adcxq	%r13,%r11
2919	adoxq	%rbp,%r12
2920	adcxq	%rbp,%r12
2921
2922	movq	%r8,56(%rdi)
2923	movq	%r9,64(%rdi)
2924
2925	mulxq	%r14,%r9,%rax
2926	movq	56(%rsi),%r8
2927	adcxq	%r10,%r9
2928	mulxq	%r15,%r10,%rbx
2929	adoxq	%rax,%r10
2930	adcxq	%r11,%r10
2931	mulxq	%r8,%r11,%rax
2932	movq	%r14,%rdx
2933	adoxq	%rbx,%r11
2934	adcxq	%r12,%r11
2935
2936	adcxq	%rbp,%rax
2937
2938	mulxq	%r15,%r14,%rbx
2939	mulxq	%r8,%r12,%r13
2940	movq	%r15,%rdx
2941	leaq	64(%rsi),%rsi
2942	adcxq	%r14,%r11
2943	adoxq	%rbx,%r12
2944	adcxq	%rax,%r12
2945	adoxq	%rbp,%r13
2946
2947.byte	0x67,0x67
2948	mulxq	%r8,%r8,%r14
2949	adcxq	%r8,%r13
2950	adcxq	%rbp,%r14
2951
2952	cmpq	8+8(%rsp),%rsi
2953	je	.Lsqrx8x_outer_break
2954
2955	negq	%rcx
2956	movq	$-8,%rcx
2957	movq	%rbp,%r15
2958	movq	64(%rdi),%r8
2959	adcxq	72(%rdi),%r9
2960	adcxq	80(%rdi),%r10
2961	adcxq	88(%rdi),%r11
2962	adcq	96(%rdi),%r12
2963	adcq	104(%rdi),%r13
2964	adcq	112(%rdi),%r14
2965	adcq	120(%rdi),%r15
2966	leaq	(%rsi),%rbp
2967	leaq	128(%rdi),%rdi
2968	sbbq	%rax,%rax
2969
2970	movq	-64(%rsi),%rdx
2971	movq	%rax,16+8(%rsp)
2972	movq	%rdi,24+8(%rsp)
2973
2974
2975	xorl	%eax,%eax
2976	jmp	.Lsqrx8x_loop
2977
2978.align	32
2979.Lsqrx8x_loop:
2980	movq	%r8,%rbx
2981	mulxq	0(%rbp),%rax,%r8
2982	adcxq	%rax,%rbx
2983	adoxq	%r9,%r8
2984
2985	mulxq	8(%rbp),%rax,%r9
2986	adcxq	%rax,%r8
2987	adoxq	%r10,%r9
2988
2989	mulxq	16(%rbp),%rax,%r10
2990	adcxq	%rax,%r9
2991	adoxq	%r11,%r10
2992
2993	mulxq	24(%rbp),%rax,%r11
2994	adcxq	%rax,%r10
2995	adoxq	%r12,%r11
2996
2997.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
2998	adcxq	%rax,%r11
2999	adoxq	%r13,%r12
3000
3001	mulxq	40(%rbp),%rax,%r13
3002	adcxq	%rax,%r12
3003	adoxq	%r14,%r13
3004
3005	mulxq	48(%rbp),%rax,%r14
3006	movq	%rbx,(%rdi,%rcx,8)
3007	movl	$0,%ebx
3008	adcxq	%rax,%r13
3009	adoxq	%r15,%r14
3010
3011.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3012	movq	8(%rsi,%rcx,8),%rdx
3013	adcxq	%rax,%r14
3014	adoxq	%rbx,%r15
3015	adcxq	%rbx,%r15
3016
3017.byte	0x67
3018	incq	%rcx
3019	jnz	.Lsqrx8x_loop
3020
3021	leaq	64(%rbp),%rbp
3022	movq	$-8,%rcx
3023	cmpq	8+8(%rsp),%rbp
3024	je	.Lsqrx8x_break
3025
3026	subq	16+8(%rsp),%rbx
3027.byte	0x66
3028	movq	-64(%rsi),%rdx
3029	adcxq	0(%rdi),%r8
3030	adcxq	8(%rdi),%r9
3031	adcq	16(%rdi),%r10
3032	adcq	24(%rdi),%r11
3033	adcq	32(%rdi),%r12
3034	adcq	40(%rdi),%r13
3035	adcq	48(%rdi),%r14
3036	adcq	56(%rdi),%r15
3037	leaq	64(%rdi),%rdi
3038.byte	0x67
3039	sbbq	%rax,%rax
3040	xorl	%ebx,%ebx
3041	movq	%rax,16+8(%rsp)
3042	jmp	.Lsqrx8x_loop
3043
3044.align	32
3045.Lsqrx8x_break:
3046	xorq	%rbp,%rbp
3047	subq	16+8(%rsp),%rbx
3048	adcxq	%rbp,%r8
3049	movq	24+8(%rsp),%rcx
3050	adcxq	%rbp,%r9
3051	movq	0(%rsi),%rdx
3052	adcq	$0,%r10
3053	movq	%r8,0(%rdi)
3054	adcq	$0,%r11
3055	adcq	$0,%r12
3056	adcq	$0,%r13
3057	adcq	$0,%r14
3058	adcq	$0,%r15
3059	cmpq	%rcx,%rdi
3060	je	.Lsqrx8x_outer_loop
3061
3062	movq	%r9,8(%rdi)
3063	movq	8(%rcx),%r9
3064	movq	%r10,16(%rdi)
3065	movq	16(%rcx),%r10
3066	movq	%r11,24(%rdi)
3067	movq	24(%rcx),%r11
3068	movq	%r12,32(%rdi)
3069	movq	32(%rcx),%r12
3070	movq	%r13,40(%rdi)
3071	movq	40(%rcx),%r13
3072	movq	%r14,48(%rdi)
3073	movq	48(%rcx),%r14
3074	movq	%r15,56(%rdi)
3075	movq	56(%rcx),%r15
3076	movq	%rcx,%rdi
3077	jmp	.Lsqrx8x_outer_loop
3078
3079.align	32
3080.Lsqrx8x_outer_break:
3081	movq	%r9,72(%rdi)
3082.byte	102,72,15,126,217
3083	movq	%r10,80(%rdi)
3084	movq	%r11,88(%rdi)
3085	movq	%r12,96(%rdi)
3086	movq	%r13,104(%rdi)
3087	movq	%r14,112(%rdi)
3088	leaq	48+8(%rsp),%rdi
3089	movq	(%rsi,%rcx,1),%rdx
3090
3091	movq	8(%rdi),%r11
3092	xorq	%r10,%r10
3093	movq	0+8(%rsp),%r9
3094	adoxq	%r11,%r11
3095	movq	16(%rdi),%r12
3096	movq	24(%rdi),%r13
3097
3098
3099.align	32
3100.Lsqrx4x_shift_n_add:
3101	mulxq	%rdx,%rax,%rbx
3102	adoxq	%r12,%r12
3103	adcxq	%r10,%rax
3104.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3105.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3106	adoxq	%r13,%r13
3107	adcxq	%r11,%rbx
3108	movq	40(%rdi),%r11
3109	movq	%rax,0(%rdi)
3110	movq	%rbx,8(%rdi)
3111
3112	mulxq	%rdx,%rax,%rbx
3113	adoxq	%r10,%r10
3114	adcxq	%r12,%rax
3115	movq	16(%rsi,%rcx,1),%rdx
3116	movq	48(%rdi),%r12
3117	adoxq	%r11,%r11
3118	adcxq	%r13,%rbx
3119	movq	56(%rdi),%r13
3120	movq	%rax,16(%rdi)
3121	movq	%rbx,24(%rdi)
3122
3123	mulxq	%rdx,%rax,%rbx
3124	adoxq	%r12,%r12
3125	adcxq	%r10,%rax
3126	movq	24(%rsi,%rcx,1),%rdx
3127	leaq	32(%rcx),%rcx
3128	movq	64(%rdi),%r10
3129	adoxq	%r13,%r13
3130	adcxq	%r11,%rbx
3131	movq	72(%rdi),%r11
3132	movq	%rax,32(%rdi)
3133	movq	%rbx,40(%rdi)
3134
3135	mulxq	%rdx,%rax,%rbx
3136	adoxq	%r10,%r10
3137	adcxq	%r12,%rax
3138	jrcxz	.Lsqrx4x_shift_n_add_break
3139.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3140	adoxq	%r11,%r11
3141	adcxq	%r13,%rbx
3142	movq	80(%rdi),%r12
3143	movq	88(%rdi),%r13
3144	movq	%rax,48(%rdi)
3145	movq	%rbx,56(%rdi)
3146	leaq	64(%rdi),%rdi
3147	nop
3148	jmp	.Lsqrx4x_shift_n_add
3149
3150.align	32
3151.Lsqrx4x_shift_n_add_break:
3152	adcxq	%r13,%rbx
3153	movq	%rax,48(%rdi)
3154	movq	%rbx,56(%rdi)
3155	leaq	64(%rdi),%rdi
// Montgomery reduction phase of bn_sqrx8x_internal (MULX/ADX code path).
// The prologue and squaring phase precede this chunk; here the 2*num-word
// square held in t[] on the stack is reduced modulo n (pointer in %rbp).
// Stack-slot roles, inferred from their use below — confirm against the
// prologue earlier in the file:
//   0+8(%rsp)  = end-of-modulus loop bound
//   8+8(%rsp)  = saved t[]/output cursor bound
//   16+8(%rsp) = saved inter-chunk carry mask (0 or -1)
//   24+8(%rsp) = parked top-most carry word
//   32+8(%rsp) = n0 = -1/n[0] mod 2^64
//   48+8(%rsp) = t[] window / saved per-iteration multipliers
// NOTE(review): perlasm-generated; do not reorder instructions — the
// interleaved adcx (CF) and adox (OF) carry chains depend on exact order.
.byte	102,72,15,126,213	// movq %xmm2,%rbp (raw-encoded; presumably restores the modulus pointer stashed in xmm2 by the prologue — confirm)
__bn_sqrx8x_reduction:
	xorl	%eax,%eax		// clear top-most carry accumulator
	movq	32+8(%rsp),%rbx		// rbx = n0
	movq	48+8(%rsp),%rdx		// rdx = t[0] of the first window
	leaq	-64(%rbp,%r9,1),%rcx	// rcx = n + r9 - 64: end-of-modulus bound (assumes r9 = num in bytes — confirm)

	movq	%rcx,0+8(%rsp)
	movq	%rdi,8+8(%rsp)

	leaq	48+8(%rsp),%rdi		// rdi = t[] read/write cursor
	jmp	.Lsqrx8x_reduction_loop

.align	32
// Outer loop: reduce the next 8-word window of t[] against all of n[].
.Lsqrx8x_reduction_loop:
	movq	8(%rdi),%r9		// r8..r15 = 8 words of the current t[] window
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	%rdx,%r8		// r8 = t[0]
	imulq	%rbx,%rdx		// rdx = m = t[0]*n0 mod 2^64 (reduction multiplier)
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,24+8(%rsp)		// park the top-most carry for .Lsqrx8x_tail_done

	leaq	64(%rdi),%rdi
	xorq	%rsi,%rsi		// rsi = constant 0; also clears CF and OF
	movq	$-8,%rcx		// 8 inner iterations, counting up to zero
	jmp	.Lsqrx8x_reduce

.align	32
// Inner loop: t += m*n[0..7]; a fresh m is derived from the newly exposed
// low word each round and saved on the stack for the tail pass below.
.Lsqrx8x_reduce:
	movq	%r8,%rbx		// rbx = word being annihilated this round
	mulxq	0(%rbp),%rax,%r8	// m*n[0]
	adcxq	%rbx,%rax		// low half + t-word: zero mod 2^64 by choice of m (result discarded)
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rbx,%r9	// m*n[1]
	adcxq	%rbx,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10	// m*n[2]
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11	// m*n[3]
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	// mulxq 32(%rbp),%rbx,%r12 (raw-encoded by the generator)
	movq	%rdx,%rax		// rax = current m (rdx is about to be reused)
	movq	%r8,%rdx		// rdx = next exposed low word
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	32+8(%rsp),%rbx,%rdx	// rbx = next m = r8*n0 (high half written to rdx, then discarded)
	movq	%rax,%rdx		// restore current m
	movq	%rax,64+48+8(%rsp,%rcx,8)	// save m for the tail pass over the remaining n[] chunks

	mulxq	40(%rbp),%rax,%r13	// m*n[5]
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14	// m*n[6]
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15	// m*n[7]
	movq	%rbx,%rdx		// switch to the next m
	adcxq	%rax,%r14
	adoxq	%rsi,%r15		// flush both carry chains into r15 (rsi = 0)
	adcxq	%rsi,%r15

.byte	0x67,0x67,0x67			// address-size prefixes as padding — perlasm alignment/decoder tuning, no semantic effect
	incq	%rcx
	jnz	.Lsqrx8x_reduce

	movq	%rsi,%rax		// rax = 0
	cmpq	0+8(%rsp),%rbp		// reached the end of the modulus?
	jae	.Lsqrx8x_no_tail

	// Tail setup: fold the 8 saved m values over the next n[] chunk.
	movq	48+8(%rsp),%rdx		// rdx = first saved m
	addq	0(%rdi),%r8		// add the next 8 t[] words into the accumulators
	leaq	64(%rbp),%rbp		// advance modulus pointer to the next chunk
	movq	$-8,%rcx
	adcxq	8(%rdi),%r9
	adcxq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax		// rax = -(carry out of the additions): 0 or -1

	xorq	%rsi,%rsi		// rsi = 0; clear CF/OF for the adcx/adox chains
	movq	%rax,16+8(%rsp)		// save the carry mask across the tail loop
	jmp	.Lsqrx8x_tail

.align	32
// Tail inner loop: accumulate saved-m[i] * current n-chunk, 8 products
// per iteration, writing reduced words back through rdi.
.Lsqrx8x_tail:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	// mulxq 32(%rbp),%rax,%r12 (raw-encoded)
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	72+48+8(%rsp,%rcx,8),%rdx	// rdx = next saved m
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	movq	%rbx,(%rdi,%rcx,8)	// store one reduced word
	movq	%r8,%rbx
	adcxq	%rsi,%r15

	incq	%rcx
	jnz	.Lsqrx8x_tail

	cmpq	0+8(%rsp),%rbp		// more modulus chunks left?
	jae	.Lsqrx8x_tail_done

	subq	16+8(%rsp),%rsi		// 0 - mask: restores saved CF (rsi becomes 0 or 1, re-zeroed below)
	movq	48+8(%rsp),%rdx
	leaq	64(%rbp),%rbp
	adcq	0(%rdi),%r8		// fold in the next 8 t[] words with the restored carry
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax		// capture the new carry as 0/-1
	subq	$8,%rcx			// rcx back to -8 for another tail pass

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
// All modulus chunks processed: propagate the parked top-most carry.
.Lsqrx8x_tail_done:
	xorq	%rax,%rax
	addq	24+8(%rsp),%r8		// fold in the carry parked at loop entry
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax			// rax = new top-most carry

	subq	16+8(%rsp),%rsi		// 0 - mask: restore saved CF before the final addition
.Lsqrx8x_no_tail:
	adcq	0(%rdi),%r8		// add the upper half of t[] in place
.byte	102,72,15,126,217		// movq %xmm3,%rcx (raw-encoded; restores a value stashed in xmm3 by the prologue — confirm)
	adcq	8(%rdi),%r9
	movq	56(%rbp),%rsi		// top word of the current modulus chunk (kept per the generator; not visibly consumed in this chunk)
.byte	102,72,15,126,213		// movq %xmm2,%rbp (raw-encoded; restore pointer stashed in xmm2 — confirm)
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax			// rax = final top-most carry of this window

	movq	32+8(%rsp),%rbx		// rbx = n0 for the next window
	movq	64(%rdi,%rcx,1),%rdx	// rdx = t[0] of the next window

	movq	%r8,0(%rdi)		// write back the 8 reduced words
	leaq	64(%rdi),%r8
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	64(%rdi,%rcx,1),%rdi	// advance the t[] window
	cmpq	8+8(%rsp),%r8		// processed everything up to the saved bound?
	jb	.Lsqrx8x_reduction_loop
	.byte	0xf3,0xc3		// rep ret (branch-predictor-friendly return)
.cfi_endproc
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
.align	32
.type	__bn_postx4x_internal,@function
// Final step of the MULX-path Montgomery routines: constant-time
// conditional subtraction of the modulus.  Computes out[] = t[] - n[]
// when the mask derived from %rax is all-ones, else out[] = t[],
// four words per iteration, with no branch on the secret condition.
// In (inferred from use — confirm against callers earlier in the file):
//   rbp = n[] (modulus), rdi = t[], rax = top-most carry (0 or 1),
//   rcx = negative byte count so the inc/jnz loop terminates at zero;
//   rdx and rsi are reloaded from xmm1 below.
__bn_postx4x_internal:
.cfi_startproc
	movq	0(%rbp),%r12		// r12 = n[0]
	movq	%rcx,%r10		// preserve the count
	movq	%rcx,%r9
	negq	%rax			// rax = 0 or all-ones "subtract" mask
	sarq	$3+2,%rcx		// rcx = count of 4-word (32-byte) groups, sign preserved

.byte	102,72,15,126,202	// movq %xmm1,%rdx (raw-encoded; rdx = output pointer stashed in xmm1 — confirm)
.byte	102,72,15,126,206	// movq %xmm1,%rsi (raw-encoded; rsi = same stashed value, kept for the caller)
	decq	%r12			// r12 = n[0]-1 so andn below yields ~(n[0]-1) = -n[0]: first group needs no incoming carry
	movq	8(%rbp),%r13
	xorq	%r8,%r8			// r8 = running borrow, maintained as a 0/-1 mask
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	movq	0(%rbp),%r12		// next 4 modulus words
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqrx4x_sub_entry:
	andnq	%rax,%r12,%r12		// r12 = ~n & mask: adding the complement implements the subtraction
	leaq	32(%rbp),%rbp
	andnq	%rax,%r13,%r13
	andnq	%rax,%r14,%r14
	andnq	%rax,%r15,%r15

	negq	%r8			// CF = borrow carried over from the previous group
	adcq	0(%rdi),%r12		// t + (~n & mask) + CF  ==  t - n (masked)
	adcq	8(%rdi),%r13
	adcq	16(%rdi),%r14
	adcq	24(%rdi),%r15
	movq	%r12,0(%rdx)		// store 4 result words
	leaq	32(%rdi),%rdi
	movq	%r13,8(%rdx)
	sbbq	%r8,%r8			// recapture the carry as 0/-1 for the next group
	movq	%r14,16(%rdx)
	movq	%r15,24(%rdx)
	leaq	32(%rdx),%rdx

	incq	%rcx
	jnz	.Lsqrx4x_sub

	negq	%r9			// r9 = positive count for the caller

	.byte	0xf3,0xc3		// rep ret
.cfi_endproc
.size	__bn_postx4x_internal,.-__bn_postx4x_internal
.globl	bn_scatter5
.hidden bn_scatter5
.type	bn_scatter5,@function
.align	16
// bn_scatter5(inp, num, tbl, idx) — SysV AMD64.
// Writes the num-word value at rdi into column ecx/idx of the power
// table at rdx: word i is stored at tbl + idx*8 + i*256.  The 256-byte
// stride interleaves the 32 table columns so that bn_gather5 can read
// them back with a secret-index-independent access pattern.
// In:  rdi = inp, esi = num (32-bit count), rdx = tbl, rcx = idx (0..31)
// Clobbers: rax, rdx, rcx, rdi, esi, flags.
bn_scatter5:
.cfi_startproc
	cmpl	$0,%esi
	jz	.Lscatter_epilogue	// num == 0: nothing to store









	leaq	(%rdx,%rcx,8),%rdx	// rdx = &tbl[idx], start of column idx
.Lscatter:
	movq	(%rdi),%rax		// copy one source word ...
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		// ... into the next 256-byte-strided slot
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3		// rep ret
.cfi_endproc
.size	bn_scatter5,.-bn_scatter5
3449
3450.globl	bn_gather5
3451.hidden bn_gather5
3452.type	bn_gather5,@function
3453.align	32
3454bn_gather5:
3455.cfi_startproc
3456.LSEH_begin_bn_gather5:
3457
3458.byte	0x4c,0x8d,0x14,0x24
3459.cfi_def_cfa_register	%r10
3460.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
3461	leaq	.Linc(%rip),%rax
3462	andq	$-16,%rsp
3463
3464	movd	%ecx,%xmm5
3465	movdqa	0(%rax),%xmm0
3466	movdqa	16(%rax),%xmm1
3467	leaq	128(%rdx),%r11
3468	leaq	128(%rsp),%rax
3469
3470	pshufd	$0,%xmm5,%xmm5
3471	movdqa	%xmm1,%xmm4
3472	movdqa	%xmm1,%xmm2
3473	paddd	%xmm0,%xmm1
3474	pcmpeqd	%xmm5,%xmm0
3475	movdqa	%xmm4,%xmm3
3476
3477	paddd	%xmm1,%xmm2
3478	pcmpeqd	%xmm5,%xmm1
3479	movdqa	%xmm0,-128(%rax)
3480	movdqa	%xmm4,%xmm0
3481
3482	paddd	%xmm2,%xmm3
3483	pcmpeqd	%xmm5,%xmm2
3484	movdqa	%xmm1,-112(%rax)
3485	movdqa	%xmm4,%xmm1
3486
3487	paddd	%xmm3,%xmm0
3488	pcmpeqd	%xmm5,%xmm3
3489	movdqa	%xmm2,-96(%rax)
3490	movdqa	%xmm4,%xmm2
3491	paddd	%xmm0,%xmm1
3492	pcmpeqd	%xmm5,%xmm0
3493	movdqa	%xmm3,-80(%rax)
3494	movdqa	%xmm4,%xmm3
3495
3496	paddd	%xmm1,%xmm2
3497	pcmpeqd	%xmm5,%xmm1
3498	movdqa	%xmm0,-64(%rax)
3499	movdqa	%xmm4,%xmm0
3500
3501	paddd	%xmm2,%xmm3
3502	pcmpeqd	%xmm5,%xmm2
3503	movdqa	%xmm1,-48(%rax)
3504	movdqa	%xmm4,%xmm1
3505
3506	paddd	%xmm3,%xmm0
3507	pcmpeqd	%xmm5,%xmm3
3508	movdqa	%xmm2,-32(%rax)
3509	movdqa	%xmm4,%xmm2
3510	paddd	%xmm0,%xmm1
3511	pcmpeqd	%xmm5,%xmm0
3512	movdqa	%xmm3,-16(%rax)
3513	movdqa	%xmm4,%xmm3
3514
3515	paddd	%xmm1,%xmm2
3516	pcmpeqd	%xmm5,%xmm1
3517	movdqa	%xmm0,0(%rax)
3518	movdqa	%xmm4,%xmm0
3519
3520	paddd	%xmm2,%xmm3
3521	pcmpeqd	%xmm5,%xmm2
3522	movdqa	%xmm1,16(%rax)
3523	movdqa	%xmm4,%xmm1
3524
3525	paddd	%xmm3,%xmm0
3526	pcmpeqd	%xmm5,%xmm3
3527	movdqa	%xmm2,32(%rax)
3528	movdqa	%xmm4,%xmm2
3529	paddd	%xmm0,%xmm1
3530	pcmpeqd	%xmm5,%xmm0
3531	movdqa	%xmm3,48(%rax)
3532	movdqa	%xmm4,%xmm3
3533
3534	paddd	%xmm1,%xmm2
3535	pcmpeqd	%xmm5,%xmm1
3536	movdqa	%xmm0,64(%rax)
3537	movdqa	%xmm4,%xmm0
3538
3539	paddd	%xmm2,%xmm3
3540	pcmpeqd	%xmm5,%xmm2
3541	movdqa	%xmm1,80(%rax)
3542	movdqa	%xmm4,%xmm1
3543
3544	paddd	%xmm3,%xmm0
3545	pcmpeqd	%xmm5,%xmm3
3546	movdqa	%xmm2,96(%rax)
3547	movdqa	%xmm4,%xmm2
3548	movdqa	%xmm3,112(%rax)
3549	jmp	.Lgather
3550
3551.align	32
3552.Lgather:
3553	pxor	%xmm4,%xmm4
3554	pxor	%xmm5,%xmm5
3555	movdqa	-128(%r11),%xmm0
3556	movdqa	-112(%r11),%xmm1
3557	movdqa	-96(%r11),%xmm2
3558	pand	-128(%rax),%xmm0
3559	movdqa	-80(%r11),%xmm3
3560	pand	-112(%rax),%xmm1
3561	por	%xmm0,%xmm4
3562	pand	-96(%rax),%xmm2
3563	por	%xmm1,%xmm5
3564	pand	-80(%rax),%xmm3
3565	por	%xmm2,%xmm4
3566	por	%xmm3,%xmm5
3567	movdqa	-64(%r11),%xmm0
3568	movdqa	-48(%r11),%xmm1
3569	movdqa	-32(%r11),%xmm2
3570	pand	-64(%rax),%xmm0
3571	movdqa	-16(%r11),%xmm3
3572	pand	-48(%rax),%xmm1
3573	por	%xmm0,%xmm4
3574	pand	-32(%rax),%xmm2
3575	por	%xmm1,%xmm5
3576	pand	-16(%rax),%xmm3
3577	por	%xmm2,%xmm4
3578	por	%xmm3,%xmm5
3579	movdqa	0(%r11),%xmm0
3580	movdqa	16(%r11),%xmm1
3581	movdqa	32(%r11),%xmm2
3582	pand	0(%rax),%xmm0
3583	movdqa	48(%r11),%xmm3
3584	pand	16(%rax),%xmm1
3585	por	%xmm0,%xmm4
3586	pand	32(%rax),%xmm2
3587	por	%xmm1,%xmm5
3588	pand	48(%rax),%xmm3
3589	por	%xmm2,%xmm4
3590	por	%xmm3,%xmm5
3591	movdqa	64(%r11),%xmm0
3592	movdqa	80(%r11),%xmm1
3593	movdqa	96(%r11),%xmm2
3594	pand	64(%rax),%xmm0
3595	movdqa	112(%r11),%xmm3
3596	pand	80(%rax),%xmm1
3597	por	%xmm0,%xmm4
3598	pand	96(%rax),%xmm2
3599	por	%xmm1,%xmm5
3600	pand	112(%rax),%xmm3
3601	por	%xmm2,%xmm4
3602	por	%xmm3,%xmm5
3603	por	%xmm5,%xmm4
3604	leaq	256(%r11),%r11
3605
3606	pshufd	$0x4e,%xmm4,%xmm0
3607	por	%xmm4,%xmm0
3608	movq	%xmm0,(%rdi)
3609	leaq	8(%rdi),%rdi
3610	subl	$1,%esi
3611	jnz	.Lgather
3612
3613	leaq	(%r10),%rsp
3614.cfi_def_cfa_register	%rsp
3615	.byte	0xf3,0xc3
3616.LSEH_end_bn_gather5:
3617.cfi_endproc
3618.size	bn_gather5,.-bn_gather5
.section	.rodata
.align	64
// SSE2 seed/increment vectors for the selection-mask generation in
// bn_gather5 (and the gather-capable Montgomery entry points above):
// {0,0,1,1} seeds entry numbers 0 and 1 (one per 64-bit half), and
// {2,2,2,2} advances both halves by 2 each unrolled step.
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
// ASCII, NUL-terminated: "Montgomery Multiplication with scatter/gather
// for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.text
3626#endif
3627#if defined(__ELF__)
3628// See https://www.airs.com/blog/archives/518.
3629.section .note.GNU-stack,"",%progbits
3630#endif
3631