• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1.text
2
# ----------------------------------------------------------------------
# bn_mul_mont_gather5 -- word-serial multiply/reduce loop whose second
# operand is gathered from a strided table (the layout bn_scatter5
# writes).  Per the routine name and the ident string at the end of
# this file, this is Montgomery multiplication.
#
# Syntax: GNU as, AT&T.  Register usage matches the SysV AMD64
# convention:
#   In:   %rdi    = result, num qwords written
#         %rsi    = first operand, num qwords read
#         %rdx    = gather table base
#         %rcx    = modulus operand, num qwords read
#         %r8     = pointer; the qword at (%r8) is the per-word
#                   reduction factor
#         %r9d    = num, count of 64-bit words (zero-extended below)
#         8(%rsp) = 32-bit table index (first stack argument)
#   Out:  %rax    = 1
#   Saved/restored: %rbx, %rbp, %r12-%r15
#   Clobbered: %rcx, %rdx, %rsi, %r8, %r10, %r11, %xmm0-%xmm7, flags
#   Stack: (num+2) qwords of scratch below the pushes, with %rsp
#          rounded down to a 1 KiB boundary.
#
# When num is a multiple of 4 and at least 8, control is handed to the
# unrolled path at .Lmul4x_enter.
#
# NOTE(review): the gather load address includes (index & 7)*8, i.e. an
# index-dependent offset inside a 64-byte line.  Confirm this is
# acceptable for the callers' side-channel requirements.
# ----------------------------------------------------------------------
.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,@function
.align	64
bn_mul_mont_gather5:
	testl	$3,%r9d			# num not a multiple of 4 -> generic path
	jnz	.Lmul_enter
	cmpl	$8,%r9d			# num < 8 -> generic path
	jb	.Lmul_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	movl	%r9d,%r9d		# zero-extend num to 64 bits
	movl	8(%rsp),%r10d		# table index, read before the pushes move %rsp
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movq	%rsp,%rax		# remember %rsp as it is after the pushes
	leaq	2(%r9),%r11
	negq	%r11
	leaq	(%rsp,%r11,8),%rsp	# reserve num+2 qwords of scratch (tp)
	andq	$-1024,%rsp		# round the scratch base down to 1 KiB

	movq	%rax,8(%rsp,%r9,8)	# tp[num+1] = saved %rsp
.Lmul_body:
	# Gather set-up: %r11 = index & 7, %r10 = ~(index >> 3) & 3.
	# %xmm4..%xmm7 receive four consecutive qwords of .Lmagic_masks;
	# they gate the four candidate loads issued for every gathered word.
	movq	%rdx,%r12
	movq	%r10,%r11
	shrq	$3,%r10
	andq	$7,%r11
	notq	%r10
	leaq	.Lmagic_masks(%rip),%rax
	andq	$3,%r10
	leaq	96(%r12,%r11,8),%r12	# %r12 = table + 96 + (index & 7)*8
	movq	0(%rax,%r10,8),%xmm4
	movq	8(%rax,%r10,8),%xmm5
	movq	16(%rax,%r10,8),%xmm6
	movq	24(%rax,%r10,8),%xmm7

	# Gather word 0: load four candidates 64 bytes apart, mask, OR.
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	256(%r12),%r12		# next table row is 256 bytes on
	por	%xmm3,%xmm0

.byte	102,72,15,126,195		# hand-encoded movq %xmm0,%rbx

	movq	(%r8),%r8		# %r8 = reduction factor word
	movq	(%rsi),%rax		# %rax = first word of the %rsi operand

	xorq	%r14,%r14		# %r14 = outer index
	xorq	%r15,%r15		# %r15 = inner index

	# The gather of the next word is interleaved with the arithmetic.
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	movq	%r8,%rbp
	mulq	%rbx			# %rdx:%rax = %rax * gathered word
	movq	%rax,%r10
	movq	(%rcx),%rax

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp		# %rbp = low product word * reduction factor
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10		# low word is dropped; only the carry matters
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:					# first pass: tp is written, not accumulated
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st

.byte	102,72,15,126,195		# movq %xmm0,%rbx: next gathered word

	addq	%rax,%r13
	movq	(%rsi),%rax		# reload first word for the next pass
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	# tp[num-1]
	movq	%rdx,(%rsp,%r9,8)	# tp[num] = top carry

	leaq	1(%r14),%r14
	jmp	.Louter
.align	16
.Louter:				# later passes: accumulate into tp
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10		# tp[0]

	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10		# tp[1]
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

.byte	102,72,15,126,195		# movq %xmm0,%rbx: next gathered word

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10	# previous top carry, tp[num]
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jl	.Louter

	# Final step: rp = tp - (%rcx operand), then keep either that
	# difference or tp itself depending on the borrow.
	xorq	%r14,%r14		# index = 0; also clears CF for the sbb chain
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi		# %rsi = tp
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14		# lea/dec leave CF alone, keeping the borrow
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax			# %rax = tp[num] - borrow, used as a select mask
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	movq	%r9,%r15
	orq	%rcx,%rsi		# %rsi = (mask & tp) | (~mask & rp), branch-free
.align	16
.Lcopy:
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)	# overwrite tp[i] with the loop index
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi	# %rsi = %rsp value saved in the prologue
	movq	$1,%rax			# return value
	movq	(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbp
	movq	40(%rsi),%rbx
	leaq	48(%rsi),%rsp		# drop the six saved registers
.Lmul_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
# ----------------------------------------------------------------------
# bn_mul4x_mont_gather5 -- four-way unrolled variant of the gathered
# multiply/reduce loop.  The symbol is not exported (.type only, no
# .globl); in this file it is reached through .Lmul4x_enter from
# bn_mul_mont_gather5 when num is a multiple of 4 and at least 8.
#
# Syntax: GNU as, AT&T.  Same register interface as the caller:
#   In:   %rdi    = result, num qwords written
#         %rsi    = first operand
#         %rdx    = gather table base
#         %rcx    = modulus operand
#         %r8     = pointer to the reduction factor word
#         %r9d    = num (zero-extended below)
#         8(%rsp) = 32-bit table index
#   Out:  %rax    = 1
#   Saved/restored: %rbx, %rbp, %r12-%r15
#   Stack: (num+4) qwords of scratch, %rsp rounded down to 1 KiB
#          (so the movdqa stores to the scratch area are aligned).
#
# %rdi doubles as an accumulator inside the loops, so the result
# pointer is parked in the scratch frame and reloaded before the final
# subtraction.
#
# NOTE(review): the gather load address includes (index & 7)*8, i.e. an
# index-dependent offset inside a 64-byte line.  Confirm this is
# acceptable for the callers' side-channel requirements.
# ----------------------------------------------------------------------
.type	bn_mul4x_mont_gather5,@function
.align	16
bn_mul4x_mont_gather5:
.Lmul4x_enter:
	movl	%r9d,%r9d		# zero-extend num to 64 bits
	movl	8(%rsp),%r10d		# table index, read before the pushes move %rsp
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movq	%rsp,%rax		# remember %rsp as it is after the pushes
	leaq	4(%r9),%r11
	negq	%r11
	leaq	(%rsp,%r11,8),%rsp	# reserve num+4 qwords of scratch (tp)
	andq	$-1024,%rsp		# round the scratch base down to 1 KiB

	movq	%rax,8(%rsp,%r9,8)	# tp[num+1] = saved %rsp
.Lmul4x_body:
	movq	%rdi,16(%rsp,%r9,8)	# tp[num+2] = result pointer
	# Gather set-up: %r11 = index & 7, %r10 = ~(index >> 3) & 3;
	# %xmm4..%xmm7 = four consecutive qwords of .Lmagic_masks.
	movq	%rdx,%r12
	movq	%r10,%r11
	shrq	$3,%r10
	andq	$7,%r11
	notq	%r10
	leaq	.Lmagic_masks(%rip),%rax
	andq	$3,%r10
	leaq	96(%r12,%r11,8),%r12	# %r12 = table + 96 + (index & 7)*8
	movq	0(%rax,%r10,8),%xmm4
	movq	8(%rax,%r10,8),%xmm5
	movq	16(%rax,%r10,8),%xmm6
	movq	24(%rax,%r10,8),%xmm7

	# Gather word 0: four candidate loads 64 bytes apart, masked and ORed.
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	256(%r12),%r12		# next table row is 256 bytes on
	por	%xmm3,%xmm0

.byte	102,72,15,126,195		# hand-encoded movq %xmm0,%rbx
	movq	(%r8),%r8		# %r8 = reduction factor word
	movq	(%rsi),%rax

	xorq	%r14,%r14		# %r14 = outer index
	xorq	%r15,%r15		# %r15 = inner index

	# Gather of the next word, interleaved with the arithmetic.
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp		# %rbp = low product word * reduction factor
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10		# low word is dropped; only the carry matters
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)		# tp[0]
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:				# first pass, four words per iteration
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jl	.L1st4x

	# Tail of the first pass: the last two words.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		# reload first word for the next pass
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.byte	102,72,15,126,195		# movq %xmm0,%rbx: next gathered word

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)	# top carry word

	leaq	1(%r14),%r14
.align	4
.Louter4x:				# later passes: accumulate into tp
	xorq	%r15,%r15
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	movq	(%rsp),%r10		# tp[0]
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11		# + tp[1]
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-40(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jl	.Linner4x

	# Tail of the pass: the last two words.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14		# advance outer index
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		# reload first word for the next pass
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%r13

.byte	102,72,15,126,195		# movq %xmm0,%rbx: next gathered word
	movq	%rdi,-16(%rsp,%r15,8)

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13	# + previous top carry, tp[num]
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jl	.Louter4x
	# Final step: rp = tp - (%rcx operand), four words per iteration,
	# then keep either the difference or tp depending on the borrow.
	movq	16(%rsp,%r9,8),%rdi	# reload the result pointer
	movq	0(%rsp),%rax
	pxor	%xmm0,%xmm0		# zero, stored over tp while copying
	movq	8(%rsp),%rdx
	shrq	$2,%r9			# %r9 = num/4 (restored by shlq below)
	leaq	(%rsp),%rsi		# %rsi = tp
	xorq	%r14,%r14

	subq	0(%rcx),%rax		# starts the borrow chain
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx
	leaq	-1(%r9),%r15		# num/4 - 1 loop iterations
	jmp	.Lsub4x
.align	16
.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14		# lea/dec leave CF alone, keeping the borrow
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax	# top carry word, tp[num]
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	sbbq	$0,%rax			# %rax = tp[num] - borrow, used as a select mask
	movq	%rbp,24(%rdi,%r14,8)
	xorq	%r14,%r14		# %r14 becomes a byte offset from here on
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	leaq	-1(%r9),%r15
	orq	%rcx,%rsi		# %rsi = (mask & tp) | (~mask & rp), branch-free

	movdqu	(%rsi),%xmm1
	movdqa	%xmm0,(%rsp)		# zero tp behind the copy
	movdqu	%xmm1,(%rdi)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:				# 32 bytes per iteration
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqu	32(%rsi,%r14,1),%xmm1
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movdqa	%xmm0,32(%rsp,%r14,1)
	movdqu	%xmm1,32(%rdi,%r14,1)
	leaq	32(%r14),%r14
	decq	%r15
	jnz	.Lcopy4x

	shlq	$2,%r9			# back to num, to locate the saved %rsp
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movq	8(%rsp,%r9,8),%rsi	# %rsi = %rsp value saved in the prologue
	movq	$1,%rax			# return value
	movq	(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbp
	movq	40(%rsi),%rbx
	leaq	48(%rsi),%rsp		# drop the six saved registers
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
# ----------------------------------------------------------------------
# bn_scatter5 -- store a run of 64-bit words into column `index` of a
# table whose rows are 256 bytes apart.
#
# Syntax: GNU as, AT&T.  Register usage matches SysV AMD64:
#   In:   %rdi = source words
#         %rsi = number of words (0 is allowed: nothing is written)
#         %rdx = table base
#         %rcx = column index; word j lands at table + index*8 + j*256
#   Out:  none defined (%rax holds the last word copied)
#   Clobbered: %rax, %rdx, %rdi, %rsi, flags
#   Leaf routine, no stack use.
# ----------------------------------------------------------------------
.globl	bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
	testq	%rsi,%rsi		# empty input: return at once
	jz	.Lscatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx	# %rdx = &table[0][index]
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		# step to the next row
	subq	$1,%rsi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_scatter5,.-bn_scatter5
741
# ----------------------------------------------------------------------
# bn_gather5 -- read column `index` back out of a table laid out by
# bn_scatter5 (rows 256 bytes apart).
#
# For every output word four candidates 64 bytes apart are loaded and
# combined through the masks taken from .Lmagic_masks, so the choice
# among those four is made by masking rather than by address.  The
# offset inside each 64-byte group, (index & 7)*8, is still part of the
# address.
# NOTE(review): confirm that is acceptable for the callers'
# side-channel requirements.
#
# Syntax: GNU as, AT&T.  Register usage matches SysV AMD64:
#   In:   %rdi = destination words
#         %rsi = number of words (0 is allowed: nothing is written)
#         %rdx = table base
#         %rcx = column index (only bits 0-4 take part in the selection)
#   Out:  none defined
#   Clobbered: %rax, %rcx, %rdx, %rdi, %rsi, %r11, %xmm0-%xmm7, flags
#   Leaf routine, no stack use.
# ----------------------------------------------------------------------
.globl	bn_gather5
.type	bn_gather5,@function
.align	16
bn_gather5:
	# A zero count would otherwise wrap in the subq/jnz loop below and
	# run 2^64 iterations; bail out early, as bn_scatter5 does.
	testq	%rsi,%rsi
	jz	.Lgather_epilogue
	movq	%rcx,%r11
	shrq	$3,%rcx
	andq	$7,%r11			# %r11 = index & 7
	notq	%rcx
	leaq	.Lmagic_masks(%rip),%rax
	andq	$3,%rcx			# %rcx = ~(index >> 3) & 3
	leaq	96(%rdx,%r11,8),%rdx	# %rdx = table + 96 + (index & 7)*8
	movq	0(%rax,%rcx,8),%xmm4
	movq	8(%rax,%rcx,8),%xmm5
	movq	16(%rax,%rcx,8),%xmm6
	movq	24(%rax,%rcx,8),%xmm7
	jmp	.Lgather
.align	16
.Lgather:
	movq	-96(%rdx),%xmm0
	movq	-32(%rdx),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%rdx),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%rdx),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	256(%rdx),%rdx		# step to the next row
	por	%xmm3,%xmm0

	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subq	$1,%rsi
	jnz	.Lgather
.Lgather_epilogue:
	.byte	0xf3,0xc3		# rep ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
# Mask table for the gather code: eight qwords, of which only the
# fourth (byte offset 24) is all-ones.  The gather routines load four
# consecutive qwords starting at index (~(idx >> 3) & 3), so exactly one
# of %xmm4..%xmm7 ends up all-ones and the other three are zero.
.align	64
.Lmagic_masks:
.long	0,0, 0,0, 0,0, -1,-1
.long	0,0, 0,0, 0,0,  0,0
# NUL-terminated ASCII ident string:
# "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
785