/*
 * Field and point arithmetic modulo the P-256 prime (.Lpoly) for x86-64.
 * GNU as, AT&T syntax; arguments arrive in %rdi, %rsi, %rdx.
 */
#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text
.extern	OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

/*
 * Constant pool. It starts on a 64-byte boundary and every table is a multiple of
 * 32 bytes long, so each label below is 16-byte aligned (required by the movdqa /
 * pand memory operands that read them).
 */
.align	64
/* p = 2^256 - 2^224 + 2^192 + 2^96 - 1, four 64-bit limbs, least-significant first. */
.Lpoly:
.quad	0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001

/* Eight 32-bit lanes of 1 / 2 / 3. Only .LOne is referenced by the code in view. */
.LOne:
.long	1,1,1,1,1,1,1,1
.LTwo:
.long	2,2,2,2,2,2,2,2
.LThree:
.long	3,3,3,3,3,3,3,3
/* 2^256 mod p, i.e. the value 1 in Montgomery form. */
.LONE_mont:
.quad	0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe

/*
 * ecp_nistz256_neg: res = (-a) mod p, with p = .Lpoly.
 *
 * In:       %rdi = res, %rsi = a (four 64-bit limbs each, least-significant first)
 * Out:      res[0..3] written
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags (%r12, %r13 are saved/restored)
 */
.globl	ecp_nistz256_neg
.hidden ecp_nistz256_neg
.type	ecp_nistz256_neg,@function
.align	32
ecp_nistz256_neg:
	pushq	%r12
	pushq	%r13

	/* %r11:%r10:%r9:%r8 = 0; %r13 = 0 and will record the borrow. */
	xorq	%r8,%r8
	xorq	%r9,%r9
	xorq	%r10,%r10
	xorq	%r11,%r11
	xorq	%r13,%r13

	/* t = 0 - a; a copy of t is kept in %r12:%rcx:%rdx:%rax. */
	subq	0(%rsi),%r8
	sbbq	8(%rsi),%r9
	sbbq	16(%rsi),%r10
	movq	%r8,%rax
	sbbq	24(%rsi),%r11
	leaq	.Lpoly(%rip),%rsi
	movq	%r9,%rdx
	sbbq	$0,%r13	/* %r13 = -1 if 0 - a borrowed (a != 0), else 0 */

	/* t + p, which equals p - a whenever the subtraction borrowed. */
	addq	0(%rsi),%r8
	movq	%r10,%rcx
	adcq	8(%rsi),%r9
	adcq	16(%rsi),%r10
	movq	%r11,%r12
	adcq	24(%rsi),%r11
	testq	%r13,%r13

	/* No borrow: keep t itself. Selected with cmov, so there is no branch. */
	cmovzq	%rax,%r8
	cmovzq	%rdx,%r9
	movq	%r8,0(%rdi)
	cmovzq	%rcx,%r10
	movq	%r9,8(%rdi)
	cmovzq	%r12,%r11
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)

	popq	%r13
	popq	%r12
	.byte	0xf3,0xc3	/* rep ret */
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

/*
 * ecp_nistz256_mul_mont: public wrapper around __ecp_nistz256_mul_montq.
 *
 * In:   %rdi = res, %rsi = a, %rdx = b (four 64-bit limbs each)
 * Out:  res written by the helper
 * Saves %rbp, %rbx, %r12-%r15 because the helper overwrites all of them.
 */
.globl	ecp_nistz256_mul_mont
.hidden ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,@function
.align	32
ecp_nistz256_mul_mont:
.Lmul_mont:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	/* Set up the helper's register contract: %rbx = b, %rax = b[0], %r9..%r12 = a[0..3]. */
	movq	%rdx,%rbx
	movq	0(%rdx),%rax
	movq	0(%rsi),%r9
	movq	8(%rsi),%r10
	movq	16(%rsi),%r11
	movq	24(%rsi),%r12

	call	__ecp_nistz256_mul_montq
.Lmul_mont_done:
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3	/* rep ret */
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

/*
 * __ecp_nistz256_mul_montq: res = a * b * 2^-256 mod p (Montgomery product).
 *
 * In:       %rsi = a, %rbx = b, %rdi = res,
 *           %rax = b[0], %r9,%r10,%r11,%r12 = a[0..3]
 * Out:      res stored at (%rdi) and left in %r12,%r13,%r8,%r9 (limbs 0..3);
 *           %r14 = p[1], %r15 = p[3]
 * Clobbers: %rax, %rbx, %rcx, %rdx, %rbp, %r8-%r15, flags
 *
 * Each row of the product (a * b[i]) is followed by one reduction step. The step
 * adds acc[0] * p and drops the low limb, which becomes zero. Since p = -1 mod 2^64
 * the multiplier is acc[0] itself, and
 *     acc[0]*p + acc[0] = acc[0]*2^96 + acc[0]*p[3]*2^192,
 * so a step costs two 32-bit shifts and a single mulq by p[3] (%r15).
 */
.type	__ecp_nistz256_mul_montq,@function
.align	32
__ecp_nistz256_mul_montq:

	/* Row 0: acc (%r8..%r12) = a * b[0]; %rbp holds b[0]. */
	movq	%rax,%rbp
	mulq	%r9
	movq	.Lpoly+8(%rip),%r14
	movq	%rax,%r8
	movq	%rbp,%rax
	movq	%rdx,%r9

	mulq	%r10
	movq	.Lpoly+24(%rip),%r15
	addq	%rax,%r9
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%r11
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%r12
	addq	%rax,%r11
	movq	%r8,%rax
	adcq	$0,%rdx
	xorq	%r13,%r13
	movq	%rdx,%r12

	/* Reduction step 1 on acc[0] = %r8; acc becomes %r9..%r13. Also fetches b[1]. */
	movq	%r8,%rbp
	shlq	$32,%r8
	mulq	%r15
	shrq	$32,%rbp
	addq	%r8,%r9
	adcq	%rbp,%r10
	adcq	%rax,%r11
	movq	8(%rbx),%rax
	adcq	%rdx,%r12
	adcq	$0,%r13
	xorq	%r8,%r8

	/* Row 1: acc += a * b[1]; a is re-read from memory, %rcx carries the high half. */
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r9
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r10
	adcq	$0,%rdx
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%r9,%rax
	adcq	%rdx,%r13
	adcq	$0,%r8

	/* Reduction step 2 on acc[0] = %r9; acc becomes %r10..%r13,%r8. Fetches b[2]. */
	movq	%r9,%rbp
	shlq	$32,%r9
	mulq	%r15
	shrq	$32,%rbp
	addq	%r9,%r10
	adcq	%rbp,%r11
	adcq	%rax,%r12
	movq	16(%rbx),%rax
	adcq	%rdx,%r13
	adcq	$0,%r8
	xorq	%r9,%r9

	/* Row 2: acc += a * b[2]. */
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r10
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r11
	adcq	$0,%rdx
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r13
	adcq	$0,%rdx
	addq	%rax,%r13
	movq	%r10,%rax
	adcq	%rdx,%r8
	adcq	$0,%r9

	/* Reduction step 3 on acc[0] = %r10; acc becomes %r11..%r13,%r8,%r9. Fetches b[3]. */
	movq	%r10,%rbp
	shlq	$32,%r10
	mulq	%r15
	shrq	$32,%rbp
	addq	%r10,%r11
	adcq	%rbp,%r12
	adcq	%rax,%r13
	movq	24(%rbx),%rax
	adcq	%rdx,%r8
	adcq	$0,%r9
	xorq	%r10,%r10

	/* Row 3: acc += a * b[3]. */
	movq	%rax,%rbp
	mulq	0(%rsi)
	addq	%rax,%r11
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	8(%rsi)
	addq	%rcx,%r12
	adcq	$0,%rdx
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	16(%rsi)
	addq	%rcx,%r13
	adcq	$0,%rdx
	addq	%rax,%r13
	movq	%rbp,%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	24(%rsi)
	addq	%rcx,%r8
	adcq	$0,%rdx
	addq	%rax,%r8
	movq	%r11,%rax
	adcq	%rdx,%r9
	adcq	$0,%r10

	/*
	 * Reduction step 4 on acc[0] = %r11; the result is %r12,%r13,%r8,%r9 with a
	 * carry limb in %r10. Copies of the low two limbs go to %rcx,%rbp.
	 */
	movq	%r11,%rbp
	shlq	$32,%r11
	mulq	%r15
	shrq	$32,%rbp
	addq	%r11,%r12
	adcq	%rbp,%r13
	movq	%r12,%rcx
	adcq	%rax,%r8
	adcq	%rdx,%r9
	movq	%r13,%rbp
	adcq	$0,%r10

	/* Trial subtraction of p (limbs -1, %r14, 0, %r15); copies of limbs 2,3 in %rbx,%rdx. */
	subq	$-1,%r12
	movq	%r8,%rbx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%rdx
	sbbq	%r15,%r9
	sbbq	$0,%r10

	/* Borrow out means the value was already below p: restore the saved copy. */
	cmovcq	%rcx,%r12
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rbx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%rdx,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3	/* rep ret */
.size	__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq

/*
 * ecp_nistz256_sqr_mont: public wrapper around __ecp_nistz256_sqr_montq.
 *
 * In:   %rdi = res, %rsi = a (four 64-bit limbs each)
 * Out:  res written by the helper
 * Saves %rbp, %rbx, %r12-%r15 around the call.
 */
.globl	ecp_nistz256_sqr_mont
.hidden ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,@function
.align	32
ecp_nistz256_sqr_mont:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	/* Helper contract: %rax,%r14,%r15,%r8 = a[0..3]; %rsi keeps pointing at a. */
	movq	0(%rsi),%rax
	movq	8(%rsi),%r14
	movq	16(%rsi),%r15
	movq	24(%rsi),%r8

	call	__ecp_nistz256_sqr_montq
.Lsqr_mont_done:
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3	/* rep ret */
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

/*
 * __ecp_nistz256_sqr_montq: res = a * a * 2^-256 mod p (Montgomery square).
 *
 * In:       %rsi = a, %rdi = res, %rax,%r14,%r15,%r8 = a[0..3]
 * Out:      res stored at (%rdi) and left in %r12,%r13,%r14,%r15 (limbs 0..3);
 *           %rsi = p[1], %rbp = p[3]
 * Clobbers: %rax, %rcx, %rdx, %rbp, %rsi, %r8-%r15, flags
 *
 * The 512-bit square is built first (cross products, doubled, plus the four
 * diagonal squares); then four reduction steps consume the low four limbs and
 * the upper half is added in.
 */
.type	__ecp_nistz256_sqr_montq,@function
.align	32
__ecp_nistz256_sqr_montq:
	/* Cross products with a[0]: a[0]*a[1], a[0]*a[2], a[0]*a[3] into %r9..%r12. */
	movq	%rax,%r13
	mulq	%r14
	movq	%rax,%r9
	movq	%r15,%rax
	movq	%rdx,%r10

	mulq	%r13
	addq	%rax,%r10
	movq	%r8,%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%r13
	addq	%rax,%r11
	movq	%r15,%rax
	adcq	$0,%rdx
	movq	%rdx,%r12

	/* Cross products with a[1]: a[1]*a[2], a[1]*a[3]. */
	mulq	%r14
	addq	%rax,%r11
	movq	%r8,%rax
	adcq	$0,%rdx
	movq	%rdx,%rbp

	mulq	%r14
	addq	%rax,%r12
	movq	%r8,%rax
	adcq	$0,%rdx
	addq	%rbp,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	/* Last cross product a[2]*a[3]; %r15 is cleared to receive the top carry. */
	mulq	%r15
	xorq	%r15,%r15
	addq	%rax,%r13
	movq	0(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	/* Double the cross-product sum (%r9..%r14, carry into %r15). */
	addq	%r9,%r9
	adcq	%r10,%r10
	adcq	%r11,%r11
	adcq	%r12,%r12
	adcq	%r13,%r13
	adcq	%r14,%r14
	adcq	$0,%r15

	/* Add the diagonal terms a[i]^2; the full square ends up in %r8..%r15. */
	mulq	%rax
	movq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r9
	adcq	%rax,%r10
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r11
	adcq	%rax,%r12
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rcx

	mulq	%rax
	addq	%rcx,%r13
	adcq	%rax,%r14
	movq	%r8,%rax
	adcq	%rdx,%r15

	/* From here on %rsi no longer points at a: %rsi = p[1], %rbp = p[3]. */
	movq	.Lpoly+8(%rip),%rsi
	movq	.Lpoly+24(%rip),%rbp

	/*
	 * Reduction step 1 on the low limb %r8: add limb<<32 and limb>>32 to the next
	 * two limbs and limb * p[3] above them; the high half stays in %rdx.
	 */
	movq	%r8,%rcx
	shlq	$32,%r8
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r8,%r9
	adcq	%rcx,%r10
	adcq	%rax,%r11
	movq	%r9,%rax
	adcq	$0,%rdx

	/* Reduction step 2 on %r9; the previous high half moves into %r8. */
	movq	%r9,%rcx
	shlq	$32,%r9
	movq	%rdx,%r8
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r9,%r10
	adcq	%rcx,%r11
	adcq	%rax,%r8
	movq	%r10,%rax
	adcq	$0,%rdx

	/* Reduction step 3 on %r10. */
	movq	%r10,%rcx
	shlq	$32,%r10
	movq	%rdx,%r9
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r10,%r11
	adcq	%rcx,%r8
	adcq	%rax,%r9
	movq	%r11,%rax
	adcq	$0,%rdx

	/* Reduction step 4 on %r11; reduced low half is now %r8,%r9,%r10,%rdx. */
	movq	%r11,%rcx
	shlq	$32,%r11
	movq	%rdx,%r10
	mulq	%rbp
	shrq	$32,%rcx
	addq	%r11,%r8
	adcq	%rcx,%r9
	adcq	%rax,%r10
	adcq	$0,%rdx
	xorq	%r11,%r11

	/* Add the upper half of the square; carry limb in %r11, copies in %r8,%r9. */
	addq	%r8,%r12
	adcq	%r9,%r13
	movq	%r12,%r8
	adcq	%r10,%r14
	adcq	%rdx,%r15
	movq	%r13,%r9
	adcq	$0,%r11

	/* Trial subtraction of p (limbs -1, %rsi, 0, %rbp); copies of limbs 2,3 in %r10,%rcx. */
	subq	$-1,%r12
	movq	%r14,%r10
	sbbq	%rsi,%r13
	sbbq	$0,%r14
	movq	%r15,%rcx
	sbbq	%rbp,%r15
	sbbq	$0,%r11

	/* Borrow out: the value was already below p, so restore the saved copy. */
	cmovcq	%r8,%r12
	cmovcq	%r9,%r13
	movq	%r12,0(%rdi)
	cmovcq	%r10,%r14
	movq	%r13,8(%rdi)
	cmovcq	%rcx,%r15
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)

	.byte	0xf3,0xc3	/* rep ret */
.size	__ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq

/*
 * ecp_nistz256_select_w5: copy one 96-byte entry out of a 16-entry table, reading
 * every entry so the memory access pattern does not depend on the index.
 *
 * In:       %rdi = out (96 bytes, written with unaligned stores)
 *           %rsi = table, 16 entries of 96 bytes; read with movdqa, so it must be
 *                  16-byte aligned
 *           %edx = index; entries are numbered from 1, any value that matches no
 *                  entry (e.g. 0) produces an all-zero output
 * Clobbers: %rax, %rsi, %xmm0-%xmm15, flags
 */
.globl	ecp_nistz256_select_w5
.hidden ecp_nistz256_select_w5
.type	ecp_nistz256_select_w5,@function
.align	32
ecp_nistz256_select_w5:
	movdqa	.LOne(%rip),%xmm0
	movd	%edx,%xmm1

	/* %xmm2..%xmm7 accumulate the selected entry. */
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7

	/* %xmm8 = running entry number (starts at 1); %xmm1 = index in all four lanes. */
	movdqa	%xmm0,%xmm8
	pshufd	$0,%xmm1,%xmm1

	movq	$16,%rax
.Lselect_loop_sse_w5:

	/* %xmm15 = all-ones if this entry's number equals the index, else zero. */
	movdqa	%xmm8,%xmm15
	paddd	%xmm0,%xmm8
	pcmpeqd	%xmm1,%xmm15

	movdqa	0(%rsi),%xmm9
	movdqa	16(%rsi),%xmm10
	movdqa	32(%rsi),%xmm11
	movdqa	48(%rsi),%xmm12
	movdqa	64(%rsi),%xmm13
	movdqa	80(%rsi),%xmm14
	leaq	96(%rsi),%rsi

	/* Mask the entry and OR it into the accumulators. */
	pand	%xmm15,%xmm9
	pand	%xmm15,%xmm10
	por	%xmm9,%xmm2
	pand	%xmm15,%xmm11
	por	%xmm10,%xmm3
	pand	%xmm15,%xmm12
	por	%xmm11,%xmm4
	pand	%xmm15,%xmm13
	por	%xmm12,%xmm5
	pand	%xmm15,%xmm14
	por	%xmm13,%xmm6
	por	%xmm14,%xmm7

	decq	%rax
	jnz	.Lselect_loop_sse_w5

	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)
	movdqu	%xmm4,32(%rdi)
	movdqu	%xmm5,48(%rdi)
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm7,80(%rdi)
	.byte	0xf3,0xc3	/* rep ret */
.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5

/*
 * ecp_nistz256_select_w7: copy one 64-byte entry out of a 64-entry table, reading
 * every entry so the memory access pattern does not depend on the index.
 *
 * In:       %rdi = out (64 bytes, written with unaligned stores)
 *           %rsi = table, 64 entries of 64 bytes; read with movdqa, so it must be
 *                  16-byte aligned
 *           %edx = index; entries are numbered from 1, any value that matches no
 *                  entry (e.g. 0) produces an all-zero output
 * Clobbers: %rax, %rsi, %xmm0-%xmm5, %xmm8-%xmm12, %xmm15, flags
 */
.globl	ecp_nistz256_select_w7
.hidden ecp_nistz256_select_w7
.type	ecp_nistz256_select_w7,@function
.align	32
ecp_nistz256_select_w7:
	movdqa	.LOne(%rip),%xmm8
	movd	%edx,%xmm1

	/* %xmm2..%xmm5 accumulate the selected entry. */
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5

	/* %xmm0 = increment (1 per lane), %xmm8 = running entry number, %xmm1 = index. */
	movdqa	%xmm8,%xmm0
	pshufd	$0,%xmm1,%xmm1
	movq	$64,%rax

.Lselect_loop_sse_w7:
	movdqa	%xmm8,%xmm15
	paddd	%xmm0,%xmm8
	movdqa	0(%rsi),%xmm9
	movdqa	16(%rsi),%xmm10
	pcmpeqd	%xmm1,%xmm15	/* mask: all-ones if entry number == index */
	movdqa	32(%rsi),%xmm11
	movdqa	48(%rsi),%xmm12
	leaq	64(%rsi),%rsi

	pand	%xmm15,%xmm9
	pand	%xmm15,%xmm10
	por	%xmm9,%xmm2
	pand	%xmm15,%xmm11
	por	%xmm10,%xmm3
	pand	%xmm15,%xmm12
	por	%xmm11,%xmm4
	prefetcht0	255(%rsi)	/* fetch ahead of the entries about to be read */
	por	%xmm12,%xmm5

	decq	%rax
	jnz	.Lselect_loop_sse_w7

	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)
	movdqu	%xmm4,32(%rdi)
	movdqu	%xmm5,48(%rdi)
	.byte	0xf3,0xc3	/* rep ret */
.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
/*
 * ecp_nistz256_avx2_select_w7: placeholder only. The body is a ud2, so calling it
 * raises an invalid-opcode exception; the rep ret after it is not reached.
 */
.globl	ecp_nistz256_avx2_select_w7
.hidden ecp_nistz256_avx2_select_w7
.type	ecp_nistz256_avx2_select_w7,@function
.align	32
ecp_nistz256_avx2_select_w7:
.byte	0x0f,0x0b	/* ud2 */
	.byte	0xf3,0xc3	/* rep ret */
.size	ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
/*
 * __ecp_nistz256_add_toq: res = (a + b) mod p.
 *
 * In:       %r12,%r13,%r8,%r9 = a[0..3], %rbx = b, %rdi = res,
 *           %r14 = p[1], %r15 = p[3]
 * Out:      res stored at (%rdi) and left in %r12,%r13,%r8,%r9
 * Clobbers: %rax, %rbp, %rcx, %r10, %r11, flags
 */
.type	__ecp_nistz256_add_toq,@function
.align	32
__ecp_nistz256_add_toq:
	/* sum = a + b with the carry in %r11; copy of the sum in %rax,%rbp,%rcx,%r10. */
	xorq	%r11,%r11
	addq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	movq	%r12,%rax
	adcq	16(%rbx),%r8
	adcq	24(%rbx),%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	/* Trial subtraction of p (limbs -1, %r14, 0, %r15). */
	subq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	/* Borrow out: the sum was below p, keep the unsubtracted copy. */
	cmovcq	%rax,%r12
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3	/* rep ret */
.size	__ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq

/*
 * __ecp_nistz256_sub_fromq: res = (a - b) mod p.
 *
 * In:       %r12,%r13,%r8,%r9 = a[0..3], %rbx = b, %rdi = res,
 *           %r14 = p[1], %r15 = p[3]
 * Out:      res stored at (%rdi) and left in %r12,%r13,%r8,%r9
 * Clobbers: %rax, %rbp, %rcx, %r10, %r11, flags
 */
.type	__ecp_nistz256_sub_fromq,@function
.align	32
__ecp_nistz256_sub_fromq:
	/* diff = a - b; %r11 = -1 if it borrowed, else 0; copy in %rax,%rbp,%rcx,%r10. */
	subq	0(%rbx),%r12
	sbbq	8(%rbx),%r13
	movq	%r12,%rax
	sbbq	16(%rbx),%r8
	sbbq	24(%rbx),%r9
	movq	%r13,%rbp
	sbbq	%r11,%r11

	/* diff + p (limbs -1, %r14, 0, %r15). */
	addq	$-1,%r12
	movq	%r8,%rcx
	adcq	%r14,%r13
	adcq	$0,%r8
	movq	%r9,%r10
	adcq	%r15,%r9
	testq	%r11,%r11

	/* No borrow: the plain difference was already the answer. */
	cmovzq	%rax,%r12
	cmovzq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovzq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovzq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3	/* rep ret */
.size	__ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq

/*
 * __ecp_nistz256_subq: register-only (a - b) mod p; nothing is stored to memory.
 *
 * In:       %rax,%rbp,%rcx,%r10 = a[0..3], %r12,%r13,%r8,%r9 = b[0..3],
 *           %r14 = p[1], %r15 = p[3]
 * Out:      %r12,%r13,%r8,%r9 = result
 * Clobbers: %rax, %rbp, %rcx, %r10, %r11, flags
 */
.type	__ecp_nistz256_subq,@function
.align	32
__ecp_nistz256_subq:
	/* diff = a - b, copied into the output registers; %r11 = borrow mask. */
	subq	%r12,%rax
	sbbq	%r13,%rbp
	movq	%rax,%r12
	sbbq	%r8,%rcx
	sbbq	%r9,%r10
	movq	%rbp,%r13
	sbbq	%r11,%r11

	/* diff + p in %rax,%rbp,%rcx,%r10. */
	addq	$-1,%rax
	movq	%rcx,%r8
	adcq	%r14,%rbp
	adcq	$0,%rcx
	movq	%r10,%r9
	adcq	%r15,%r10
	testq	%r11,%r11

	/* Borrowed: take diff + p instead of diff. */
	cmovnzq	%rax,%r12
	cmovnzq	%rbp,%r13
	cmovnzq	%rcx,%r8
	cmovnzq	%r10,%r9

	.byte	0xf3,0xc3	/* rep ret */
.size	__ecp_nistz256_subq,.-__ecp_nistz256_subq

/*
 * __ecp_nistz256_mul_by_2q: res = (2 * a) mod p.
 *
 * In:       %r12,%r13,%r8,%r9 = a[0..3], %rdi = res, %r14 = p[1], %r15 = p[3]
 * Out:      res stored at (%rdi) and left in %r12,%r13,%r8,%r9
 * Clobbers: %rax, %rbp, %rcx, %r10, %r11, flags
 */
.type	__ecp_nistz256_mul_by_2q,@function
.align	32
__ecp_nistz256_mul_by_2q:
	/* a + a with the carry in %r11; copy of the sum in %rax,%rbp,%rcx,%r10. */
	xorq	%r11,%r11
	addq	%r12,%r12
	adcq	%r13,%r13
	movq	%r12,%rax
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	/* Trial subtraction of p (limbs -1, %r14, 0, %r15). */
	subq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	/* Borrow out: the doubled value was below p, keep the unsubtracted copy. */
	cmovcq	%rax,%r12
	cmovcq	%rbp,%r13
	movq	%r12,0(%rdi)
	cmovcq	%rcx,%r8
	movq	%r13,8(%rdi)
	cmovcq	%r10,%r9
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)

	.byte	0xf3,0xc3	/* rep ret */
.size	__ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
/*
 * ecp_nistz256_point_double: out = 2 * in.
 *
 * In:   %rdi = out, %rsi = in. Each point is three 32-byte field elements at
 *       offsets 0, 32, 64, called X, Y, Z below.
 * Out:  out.X = M^2 - 2*S
 *       out.Y = M*(S - out.X) - 8*Y^4
 *       out.Z = 2*Y*Z
 *       where S = 4*X*Y^2 and M = 3*(X + Z^2)*(X - Z^2); all products are
 *       Montgomery products computed by the helpers in this file.
 *
 * Frame (168 bytes, offsets from %rsp; the same slot is reused as noted):
 *     0: 2*Y -> 4*Y^2 -> S -> S - out.X -> M*(S - out.X)
 *    32: X + Z^2 -> (X+Z^2)*(X-Z^2) -> M
 *    64: Z^2 -> X - Z^2
 *    96: copy of X
 *   128: scratch for the doubled values
 * %xmm0, %xmm1, %xmm2 hold out, out+32, out+64 across the helper calls.
 *
 * .Lpoint_double_shortcutq is also entered by a jmp from ecp_nistz256_point_add,
 * which arrives with the same six registers pushed and the same frame size.
 */
.globl	ecp_nistz256_point_double
.hidden ecp_nistz256_point_double
.type	ecp_nistz256_point_double,@function
.align	32
ecp_nistz256_point_double:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$160+8,%rsp

.Lpoint_double_shortcutq:
	/* Copy X to the frame, load Y into registers and p[1], p[3] into %r14, %r15. */
	movdqu	0(%rsi),%xmm0
	movq	%rsi,%rbx
	movdqu	16(%rsi),%xmm1
	movq	32+0(%rsi),%r12
	movq	32+8(%rsi),%r13
	movq	32+16(%rsi),%r8
	movq	32+24(%rsi),%r9
	movq	.Lpoly+8(%rip),%r14
	movq	.Lpoly+24(%rip),%r15
	movdqa	%xmm0,96(%rsp)
	movdqa	%xmm1,96+16(%rsp)
	leaq	32(%rdi),%r10
	leaq	64(%rdi),%r11
.byte	102,72,15,110,199	/* movq %rdi,%xmm0  (out)    */
.byte	102,73,15,110,202	/* movq %r10,%xmm1  (out+32) */
.byte	102,73,15,110,211	/* movq %r11,%xmm2  (out+64) */

	/* [0] = 2*Y */
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_by_2q

	/* [64] = Z^2 */
	movq	64+0(%rsi),%rax
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	leaq	64-0(%rsi),%rsi
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	/* [0] = (2*Y)^2 = 4*Y^2 */
	movq	0+0(%rsp),%rax
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	/* out.Z = 2 * (Z * Y); the product is doubled in place at out+64. */
	movq	32(%rbx),%rax
	movq	64+0(%rbx),%r9
	movq	64+8(%rbx),%r10
	movq	64+16(%rbx),%r11
	movq	64+24(%rbx),%r12
	leaq	64-0(%rbx),%rsi
	leaq	32(%rbx),%rbx
.byte	102,72,15,126,215	/* movq %xmm2,%rdi  (out+64) */
	call	__ecp_nistz256_mul_montq
	call	__ecp_nistz256_mul_by_2q

	/* [32] = X + Z^2 */
	movq	96+0(%rsp),%r12
	movq	96+8(%rsp),%r13
	leaq	64(%rsp),%rbx
	movq	96+16(%rsp),%r8
	movq	96+24(%rsp),%r9
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_add_toq

	/* [64] = X - Z^2 */
	movq	96+0(%rsp),%r12
	movq	96+8(%rsp),%r13
	leaq	64(%rsp),%rbx
	movq	96+16(%rsp),%r8
	movq	96+24(%rsp),%r9
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	/* (4*Y^2)^2 = 16*Y^4, left in %r12..%r15 and stored at out+32. */
	movq	0+0(%rsp),%rax
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
.byte	102,72,15,126,207	/* movq %xmm1,%rdi  (out+32) */
	call	__ecp_nistz256_sqr_montq
	/*
	 * Halve modulo p: v + p is formed in %r9:%r15:%r14:%r13:%r12 (the squaring
	 * left p[1] in %rsi and p[3] in %rbp), the original v is kept when it is
	 * even, and the 257-bit value is then shifted right by one.
	 */
	xorq	%r9,%r9
	movq	%r12,%rax
	addq	$-1,%r12
	movq	%r13,%r10
	adcq	%rsi,%r13
	movq	%r14,%rcx
	adcq	$0,%r14
	movq	%r15,%r8
	adcq	%rbp,%r15
	adcq	$0,%r9
	xorq	%rsi,%rsi
	testq	$1,%rax

	cmovzq	%rax,%r12
	cmovzq	%r10,%r13
	cmovzq	%rcx,%r14
	cmovzq	%r8,%r15
	cmovzq	%rsi,%r9

	movq	%r13,%rax
	shrq	$1,%r12
	shlq	$63,%rax
	movq	%r14,%r10
	shrq	$1,%r13
	orq	%rax,%r12
	shlq	$63,%r10
	movq	%r15,%rcx
	shrq	$1,%r14
	orq	%r10,%r13
	shlq	$63,%rcx
	movq	%r12,0(%rdi)
	shrq	$1,%r15
	movq	%r13,8(%rdi)
	shlq	$63,%r9
	orq	%rcx,%r14
	orq	%r9,%r15
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)	/* out+32 now holds 8*Y^4 */
	/* [32] = (X + Z^2) * (X - Z^2) */
	movq	64(%rsp),%rax
	leaq	64(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [128] = 2 * [32], then [32] = [128] + [32] = M */
	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_mul_by_2q

	leaq	32(%rsp),%rbx
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_add_toq

	/* [0] = 4*Y^2 * X = S */
	movq	96(%rsp),%rax
	leaq	96(%rsp),%rbx
	movq	0+0(%rsp),%r9
	movq	8+0(%rsp),%r10
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r11
	movq	24+0(%rsp),%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [128] = 2*S */
	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_mul_by_2q

	/* M^2, written to out.X for now. */
	movq	0+32(%rsp),%rax
	movq	8+32(%rsp),%r14
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r15
	movq	24+32(%rsp),%r8
.byte	102,72,15,126,199	/* movq %xmm0,%rdi  (out) */
	call	__ecp_nistz256_sqr_montq

	/*
	 * out.X = M^2 - 2*S. The square came back in %r12..%r15; move its upper limbs
	 * to %r8,%r9 and reload p[1], p[3] into %r14, %r15 from %rsi, %rbp.
	 */
	leaq	128(%rsp),%rbx
	movq	%r14,%r8
	movq	%r15,%r9
	movq	%rsi,%r14
	movq	%rbp,%r15
	call	__ecp_nistz256_sub_fromq

	/* %r12,%r13,%r8,%r9 = S - out.X */
	movq	0+0(%rsp),%rax
	movq	0+8(%rsp),%rbp
	movq	0+16(%rsp),%rcx
	movq	0+24(%rsp),%r10
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_subq

	/*
	 * [0] = (S - out.X) * M. The difference is stored to [0] and shuffled into the
	 * multiplier's input registers. xorl sets ZF, so both cmovz always move.
	 */
	movq	32(%rsp),%rax
	leaq	32(%rsp),%rbx
	movq	%r12,%r14
	xorl	%ecx,%ecx
	movq	%r12,0+0(%rsp)
	movq	%r13,%r10
	movq	%r13,0+8(%rsp)
	cmovzq	%r8,%r11
	movq	%r8,0+16(%rsp)
	leaq	0-0(%rsp),%rsi
	cmovzq	%r9,%r12
	movq	%r9,0+24(%rsp)
	movq	%r14,%r9
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* out.Y = M*(S - out.X) - 8*Y^4 (the subtrahend is read from out+32). */
.byte	102,72,15,126,203	/* movq %xmm1,%rbx  (out+32) */
.byte	102,72,15,126,207	/* movq %xmm1,%rdi  (out+32) */
	call	__ecp_nistz256_sub_fromq

	addq	$160+8,%rsp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3	/* rep ret */
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
/*
 * ecp_nistz256_point_add: out = in1 + in2.
 *
 * In:   %rdi = out, %rsi = in1, %rdx = in2. Each point is three 32-byte field
 *       elements (X, Y, Z) at offsets 0, 32, 64.
 *
 * With U1 = X1*Z2^2, U2 = X2*Z1^2, S1 = Y1*Z2^3, S2 = Y2*Z1^3, H = U2 - U1 and
 * R = S2 - S1, the general case computes
 *       res_z = H*Z1*Z2
 *       res_x = R^2 - 2*U1*H^2 - H^3
 *       res_y = R*(U1*H^2 - res_x) - S1*H^3
 * Special cases:
 *   - Z1 == 0: out = in2; Z2 == 0: out = in1 (masked selection at the end).
 *   - H == 0, R == 0, neither Z zero: jump into ecp_nistz256_point_double.
 *   - H == 0, R != 0, neither Z zero: out is set to 96 zero bytes.
 *
 * Frame (584 bytes, offsets from %rsp, 32 bytes per slot):
 *     0 H            32 Z1^2 -> H^2     64 R            96 Z2^2 -> R^2
 *   128 H^3         160 U1             192 U2 -> U1*H^2
 *   224 Z2^3 -> S1  256 Z1^3 -> S2 -> S1*H^3
 *   288 res_x       320 res_y          352 res_z
 *   384/416/448 copy of in1 X/Y/Z      480/512/544 copy of in2 X/Y/Z
 * %xmm0 = out, %xmm1 = in1 pointer, %xmm5 = all-ones if Z1 == 0,
 * %xmm4 = all-ones if Z2 == 0.
 */
.globl	ecp_nistz256_point_add
.hidden ecp_nistz256_point_add
.type	ecp_nistz256_point_add,@function
.align	32
ecp_nistz256_point_add:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$576+8,%rsp

	/* Copy in1 to the frame; start OR-folding Z1 in %xmm5 for the zero test. */
	movdqu	0(%rsi),%xmm0
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm3
	movdqu	64(%rsi),%xmm4
	movdqu	80(%rsi),%xmm5
	movq	%rsi,%rbx
	movq	%rdx,%rsi
	movdqa	%xmm0,384(%rsp)
	movdqa	%xmm1,384+16(%rsp)
	movdqa	%xmm2,416(%rsp)
	movdqa	%xmm3,416+16(%rsp)
	movdqa	%xmm4,448(%rsp)
	movdqa	%xmm5,448+16(%rsp)
	por	%xmm4,%xmm5

	/* Copy in2 to the frame (Z2 also goes to registers for the squaring). */
	movdqu	0(%rsi),%xmm0
	pshufd	$0xb1,%xmm5,%xmm3
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	por	%xmm3,%xmm5
	movdqu	48(%rsi),%xmm3
	movq	64+0(%rsi),%rax
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	movdqa	%xmm0,480(%rsp)
	pshufd	$0x1e,%xmm5,%xmm4
	movdqa	%xmm1,480+16(%rsp)
	movdqu	64(%rsi),%xmm0
	movdqu	80(%rsi),%xmm1
	movdqa	%xmm2,512(%rsp)
	movdqa	%xmm3,512+16(%rsp)
	por	%xmm4,%xmm5
	pxor	%xmm4,%xmm4
	por	%xmm0,%xmm1
.byte	102,72,15,110,199	/* movq %rdi,%xmm0  (save out) */

	/* [96] = Z2^2 */
	leaq	64-0(%rsi),%rsi
	movq	%rax,544+0(%rsp)
	movq	%r14,544+8(%rsp)
	movq	%r15,544+16(%rsp)
	movq	%r8,544+24(%rsp)
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	/* Finish the masks: %xmm5 = (Z1 == 0), %xmm4 = (Z2 == 0), broadcast to all lanes. */
	pcmpeqd	%xmm4,%xmm5
	pshufd	$0xb1,%xmm1,%xmm4
	por	%xmm1,%xmm4
	pshufd	$0,%xmm5,%xmm5
	pshufd	$0x1e,%xmm4,%xmm3
	por	%xmm3,%xmm4
	pxor	%xmm3,%xmm3
	pcmpeqd	%xmm3,%xmm4
	pshufd	$0,%xmm4,%xmm4
	movq	64+0(%rbx),%rax
	movq	64+8(%rbx),%r14
	movq	64+16(%rbx),%r15
	movq	64+24(%rbx),%r8
.byte	102,72,15,110,203	/* movq %rbx,%xmm1  (save in1 pointer) */

	/* [32] = Z1^2 */
	leaq	64-0(%rbx),%rsi
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	/* [224] = Z2^2 * Z2 = Z2^3 */
	movq	544(%rsp),%rax
	leaq	544(%rsp),%rbx
	movq	0+96(%rsp),%r9
	movq	8+96(%rsp),%r10
	leaq	0+96(%rsp),%rsi
	movq	16+96(%rsp),%r11
	movq	24+96(%rsp),%r12
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [256] = Z1^2 * Z1 = Z1^3 */
	movq	448(%rsp),%rax
	leaq	448(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [224] = Z2^3 * Y1 = S1 */
	movq	416(%rsp),%rax
	leaq	416(%rsp),%rbx
	movq	0+224(%rsp),%r9
	movq	8+224(%rsp),%r10
	leaq	0+224(%rsp),%rsi
	movq	16+224(%rsp),%r11
	movq	24+224(%rsp),%r12
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [256] = Z1^3 * Y2 = S2 */
	movq	512(%rsp),%rax
	leaq	512(%rsp),%rbx
	movq	0+256(%rsp),%r9
	movq	8+256(%rsp),%r10
	leaq	0+256(%rsp),%rsi
	movq	16+256(%rsp),%r11
	movq	24+256(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [64] = S2 - S1 = R */
	leaq	224(%rsp),%rbx
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	/* %xmm3 = OR of R's limbs (zero iff R == 0); %xmm2 = (Z1 == 0) | (Z2 == 0). */
	orq	%r13,%r12
	movdqa	%xmm4,%xmm2
	orq	%r8,%r12
	orq	%r9,%r12
	por	%xmm5,%xmm2
.byte	102,73,15,110,220	/* movq %r12,%xmm3 */

	/* [160] = Z2^2 * X1 = U1 */
	movq	384(%rsp),%rax
	leaq	384(%rsp),%rbx
	movq	0+96(%rsp),%r9
	movq	8+96(%rsp),%r10
	leaq	0+96(%rsp),%rsi
	movq	16+96(%rsp),%r11
	movq	24+96(%rsp),%r12
	leaq	160(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [192] = Z1^2 * X2 = U2 */
	movq	480(%rsp),%rax
	leaq	480(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [0] = U2 - U1 = H */
	leaq	160(%rsp),%rbx
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	/* ZF = (H == 0). */
	orq	%r13,%r12
	orq	%r8,%r12
	orq	%r9,%r12

.byte	0x3e	/* DS prefix on the jnz below (branch-taken hint encoding) */
	jnz	.Ladd_proceedq	/* H != 0: general case */
.byte	102,73,15,126,208	/* movq %xmm2,%r8  (either input has Z == 0) */
.byte	102,73,15,126,217	/* movq %xmm3,%r9  (R nonzero flag) */
	testq	%r8,%r8
	jnz	.Ladd_proceedq	/* the final selection picks the other input */
	testq	%r9,%r9
	jz	.Ladd_doubleq	/* H == 0 and R == 0: same point, double it */

	/* H == 0, R != 0: write 96 zero bytes to out. */
.byte	102,72,15,126,199	/* movq %xmm0,%rdi  (out) */
	pxor	%xmm0,%xmm0
	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm0,16(%rdi)
	movdqu	%xmm0,32(%rdi)
	movdqu	%xmm0,48(%rdi)
	movdqu	%xmm0,64(%rdi)
	movdqu	%xmm0,80(%rdi)
	jmp	.Ladd_doneq

.align	32
.Ladd_doubleq:
	/*
	 * Restore the original arguments and shrink the frame from 576+8 to 160+8
	 * bytes so it matches ecp_nistz256_point_double, whose epilogue then returns
	 * to our caller.
	 */
.byte	102,72,15,126,206	/* movq %xmm1,%rsi  (in1) */
.byte	102,72,15,126,199	/* movq %xmm0,%rdi  (out) */
	addq	$416,%rsp
	jmp	.Lpoint_double_shortcutq

.align	32
.Ladd_proceedq:
	/* [96] = R^2 */
	movq	0+64(%rsp),%rax
	movq	8+64(%rsp),%r14
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r15
	movq	24+64(%rsp),%r8
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	/* [352] = H * Z1 */
	movq	448(%rsp),%rax
	leaq	448(%rsp),%rbx
	movq	0+0(%rsp),%r9
	movq	8+0(%rsp),%r10
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r11
	movq	24+0(%rsp),%r12
	leaq	352(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [32] = H^2 */
	movq	0+0(%rsp),%rax
	movq	8+0(%rsp),%r14
	leaq	0+0(%rsp),%rsi
	movq	16+0(%rsp),%r15
	movq	24+0(%rsp),%r8
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	/* [352] = H * Z1 * Z2 = res_z */
	movq	544(%rsp),%rax
	leaq	544(%rsp),%rbx
	movq	0+352(%rsp),%r9
	movq	8+352(%rsp),%r10
	leaq	0+352(%rsp),%rsi
	movq	16+352(%rsp),%r11
	movq	24+352(%rsp),%r12
	leaq	352(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [128] = H^2 * H = H^3 */
	movq	0(%rsp),%rax
	leaq	0(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [192] = H^2 * U1 */
	movq	160(%rsp),%rax
	leaq	160(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/*
	 * Inline doubling modulo p of the product just computed (2*U1*H^2), kept in
	 * registers, interleaved with loading R^2 into %rax,%rbp,%rcx,%r10.
	 */
	xorq	%r11,%r11
	addq	%r12,%r12
	leaq	96(%rsp),%rsi
	adcq	%r13,%r13
	movq	%r12,%rax
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	cmovcq	%rax,%r12
	movq	0(%rsi),%rax
	cmovcq	%rbp,%r13
	movq	8(%rsi),%rbp
	cmovcq	%rcx,%r8
	movq	16(%rsi),%rcx
	cmovcq	%r10,%r9
	movq	24(%rsi),%r10

	/* registers = R^2 - 2*U1*H^2 */
	call	__ecp_nistz256_subq

	/* [288] = R^2 - 2*U1*H^2 - H^3 = res_x */
	leaq	128(%rsp),%rbx
	leaq	288(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	/* [320] = U1*H^2 - res_x (the helper does not store, so store by hand). */
	movq	192+0(%rsp),%rax
	movq	192+8(%rsp),%rbp
	movq	192+16(%rsp),%rcx
	movq	192+24(%rsp),%r10
	leaq	320(%rsp),%rdi

	call	__ecp_nistz256_subq

	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)
	/* [256] = S1 * H^3 */
	movq	128(%rsp),%rax
	leaq	128(%rsp),%rbx
	movq	0+224(%rsp),%r9
	movq	8+224(%rsp),%r10
	leaq	0+224(%rsp),%rsi
	movq	16+224(%rsp),%r11
	movq	24+224(%rsp),%r12
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [320] = R * (U1*H^2 - res_x) */
	movq	320(%rsp),%rax
	leaq	320(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	320(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [320] = R*(U1*H^2 - res_x) - S1*H^3 = res_y */
	leaq	256(%rsp),%rbx
	leaq	320(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

.byte	102,72,15,126,199	/* movq %xmm0,%rdi  (out) */

	/*
	 * Masked selection, same pattern for each coordinate:
	 *   t   = (Z1 == 0) ? in2 : computed
	 *   out = (Z2 == 0) ? in1 : t
	 * Z coordinate first.
	 */
	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	352(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	352+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	544(%rsp),%xmm2
	pand	544+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	448(%rsp),%xmm2
	pand	448+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,64(%rdi)
	movdqu	%xmm3,80(%rdi)

	/* X coordinate. */
	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	288(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	288+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	480(%rsp),%xmm2
	pand	480+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	384(%rsp),%xmm2
	pand	384+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)

	/* Y coordinate. */
	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	320(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	320+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	512(%rsp),%xmm2
	pand	512+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	416(%rsp),%xmm2
	pand	416+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)

.Ladd_doneq:
	addq	$576+8,%rsp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3	/* rep ret */
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
/*
 * ecp_nistz256_point_add_affine: out = in1 + in2, where in2 has no Z coordinate.
 *
 * In:   %rdi = out (X, Y, Z at offsets 0, 32, 64)
 *       %rsi = in1 (X1, Y1, Z1 at offsets 0, 32, 64)
 *       %rdx = in2 (X2, Y2 at offsets 0, 32)
 *
 * With U2 = X2*Z1^2, S2 = Y2*Z1^3, H = U2 - X1 and R = S2 - Y1:
 *       res_z = H*Z1
 *       res_x = R^2 - 2*X1*H^2 - H^3
 *       res_y = R*(X1*H^2 - res_x) - Y1*H^3
 * If Z1 == 0 the output is (X2, Y2, .LONE_mont); if X2 and Y2 are both all-zero
 * the output is in1. There is no separate path for H == 0 in this routine.
 *
 * Frame (488 bytes, offsets from %rsp, 32 bytes per slot):
 *     0 U2 -> X1*H^2     32 Z1^2 -> Z1^3 -> S2 -> Y1*H^3
 *    64 H -> X1*H^2 - res_x -> R*(X1*H^2 - res_x)
 *    96 R               128 H^2            160 H^3          192 R^2
 *   224 res_x           256 res_y          288 res_z
 *   320/352/384 copy of in1 X/Y/Z          416/448 copy of in2 X/Y
 * %xmm0 = out, %xmm5 = all-ones if Z1 == 0, %xmm4 = all-ones if X2 | Y2 == 0.
 */
.globl	ecp_nistz256_point_add_affine
.hidden ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,@function
.align	32
ecp_nistz256_point_add_affine:
	pushq	%rbp
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	subq	$480+8,%rsp

	/* Copy in1 to the frame, load Z1 for squaring, start folding Z1 in %xmm5. */
	movdqu	0(%rsi),%xmm0
	movq	%rdx,%rbx
	movdqu	16(%rsi),%xmm1
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm3
	movdqu	64(%rsi),%xmm4
	movdqu	80(%rsi),%xmm5
	movq	64+0(%rsi),%rax
	movq	64+8(%rsi),%r14
	movq	64+16(%rsi),%r15
	movq	64+24(%rsi),%r8
	movdqa	%xmm0,320(%rsp)
	movdqa	%xmm1,320+16(%rsp)
	movdqa	%xmm2,352(%rsp)
	movdqa	%xmm3,352+16(%rsp)
	movdqa	%xmm4,384(%rsp)
	movdqa	%xmm5,384+16(%rsp)
	por	%xmm4,%xmm5

	/* Copy in2 to the frame; %xmm3 collects X2 | Y2 for its zero test. */
	movdqu	0(%rbx),%xmm0
	pshufd	$0xb1,%xmm5,%xmm3
	movdqu	16(%rbx),%xmm1
	movdqu	32(%rbx),%xmm2
	por	%xmm3,%xmm5
	movdqu	48(%rbx),%xmm3
	movdqa	%xmm0,416(%rsp)
	pshufd	$0x1e,%xmm5,%xmm4
	movdqa	%xmm1,416+16(%rsp)
	por	%xmm0,%xmm1
.byte	102,72,15,110,199	/* movq %rdi,%xmm0  (save out) */
	movdqa	%xmm2,448(%rsp)
	movdqa	%xmm3,448+16(%rsp)
	por	%xmm2,%xmm3
	por	%xmm4,%xmm5
	pxor	%xmm4,%xmm4
	por	%xmm1,%xmm3

	/* [32] = Z1^2 (also left in %r12..%r15) */
	leaq	64-0(%rsi),%rsi
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	/*
	 * Finish the two masks while moving Z1^2 from %r12..%r15 into the
	 * multiplier's input registers %r9..%r12 and loading X2[0] into %rax.
	 */
	pcmpeqd	%xmm4,%xmm5
	pshufd	$0xb1,%xmm3,%xmm4
	movq	0(%rbx),%rax

	movq	%r12,%r9
	por	%xmm3,%xmm4
	pshufd	$0,%xmm5,%xmm5
	pshufd	$0x1e,%xmm4,%xmm3
	movq	%r13,%r10
	por	%xmm3,%xmm4
	pxor	%xmm3,%xmm3
	movq	%r14,%r11
	pcmpeqd	%xmm3,%xmm4
	pshufd	$0,%xmm4,%xmm4

	/* [0] = Z1^2 * X2 = U2 */
	leaq	32-0(%rsp),%rsi
	movq	%r15,%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [64] = U2 - X1 = H */
	leaq	320(%rsp),%rbx
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	/* [32] = Z1^2 * Z1 = Z1^3 */
	movq	384(%rsp),%rax
	leaq	384(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [288] = H * Z1 = res_z */
	movq	384(%rsp),%rax
	leaq	384(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	288(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [32] = Z1^3 * Y2 = S2 */
	movq	448(%rsp),%rax
	leaq	448(%rsp),%rbx
	movq	0+32(%rsp),%r9
	movq	8+32(%rsp),%r10
	leaq	0+32(%rsp),%rsi
	movq	16+32(%rsp),%r11
	movq	24+32(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [96] = S2 - Y1 = R */
	leaq	352(%rsp),%rbx
	leaq	96(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	/* [128] = H^2 */
	movq	0+64(%rsp),%rax
	movq	8+64(%rsp),%r14
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r15
	movq	24+64(%rsp),%r8
	leaq	128(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	/* [192] = R^2 */
	movq	0+96(%rsp),%rax
	movq	8+96(%rsp),%r14
	leaq	0+96(%rsp),%rsi
	movq	16+96(%rsp),%r15
	movq	24+96(%rsp),%r8
	leaq	192(%rsp),%rdi
	call	__ecp_nistz256_sqr_montq

	/* [160] = H^2 * H = H^3 */
	movq	128(%rsp),%rax
	leaq	128(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	160(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [0] = H^2 * X1 */
	movq	320(%rsp),%rax
	leaq	320(%rsp),%rbx
	movq	0+128(%rsp),%r9
	movq	8+128(%rsp),%r10
	leaq	0+128(%rsp),%rsi
	movq	16+128(%rsp),%r11
	movq	24+128(%rsp),%r12
	leaq	0(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/*
	 * Inline doubling modulo p of the product just computed (2*X1*H^2), kept in
	 * registers, interleaved with loading R^2 into %rax,%rbp,%rcx,%r10.
	 */
	xorq	%r11,%r11
	addq	%r12,%r12
	leaq	192(%rsp),%rsi
	adcq	%r13,%r13
	movq	%r12,%rax
	adcq	%r8,%r8
	adcq	%r9,%r9
	movq	%r13,%rbp
	adcq	$0,%r11

	subq	$-1,%r12
	movq	%r8,%rcx
	sbbq	%r14,%r13
	sbbq	$0,%r8
	movq	%r9,%r10
	sbbq	%r15,%r9
	sbbq	$0,%r11

	cmovcq	%rax,%r12
	movq	0(%rsi),%rax
	cmovcq	%rbp,%r13
	movq	8(%rsi),%rbp
	cmovcq	%rcx,%r8
	movq	16(%rsi),%rcx
	cmovcq	%r10,%r9
	movq	24(%rsi),%r10

	/* registers = R^2 - 2*X1*H^2 */
	call	__ecp_nistz256_subq

	/* [224] = R^2 - 2*X1*H^2 - H^3 = res_x */
	leaq	160(%rsp),%rbx
	leaq	224(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

	/* [64] = X1*H^2 - res_x (the helper does not store, so store by hand). */
	movq	0+0(%rsp),%rax
	movq	0+8(%rsp),%rbp
	movq	0+16(%rsp),%rcx
	movq	0+24(%rsp),%r10
	leaq	64(%rsp),%rdi

	call	__ecp_nistz256_subq

	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r8,16(%rdi)
	movq	%r9,24(%rdi)
	/* [32] = Y1 * H^3 */
	movq	352(%rsp),%rax
	leaq	352(%rsp),%rbx
	movq	0+160(%rsp),%r9
	movq	8+160(%rsp),%r10
	leaq	0+160(%rsp),%rsi
	movq	16+160(%rsp),%r11
	movq	24+160(%rsp),%r12
	leaq	32(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [64] = R * (X1*H^2 - res_x) */
	movq	96(%rsp),%rax
	leaq	96(%rsp),%rbx
	movq	0+64(%rsp),%r9
	movq	8+64(%rsp),%r10
	leaq	0+64(%rsp),%rsi
	movq	16+64(%rsp),%r11
	movq	24+64(%rsp),%r12
	leaq	64(%rsp),%rdi
	call	__ecp_nistz256_mul_montq

	/* [256] = R*(X1*H^2 - res_x) - Y1*H^3 = res_y */
	leaq	32(%rsp),%rbx
	leaq	256(%rsp),%rdi
	call	__ecp_nistz256_sub_fromq

.byte	102,72,15,126,199	/* movq %xmm0,%rdi  (out) */

	/*
	 * Masked selection, same pattern for each coordinate:
	 *   t   = (Z1 == 0) ? in2 value : computed
	 *   out = (X2 | Y2 == 0) ? in1 value : t
	 * Z coordinate first; in2 has no Z, so .LONE_mont stands in for it.
	 */
	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	288(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	288+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	.LONE_mont(%rip),%xmm2
	pand	.LONE_mont+16(%rip),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	384(%rsp),%xmm2
	pand	384+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,64(%rdi)
	movdqu	%xmm3,80(%rdi)

	/* X coordinate. */
	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	224(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	224+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	416(%rsp),%xmm2
	pand	416+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	320(%rsp),%xmm2
	pand	320+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,0(%rdi)
	movdqu	%xmm3,16(%rdi)

	/* Y coordinate. */
	movdqa	%xmm5,%xmm0
	movdqa	%xmm5,%xmm1
	pandn	256(%rsp),%xmm0
	movdqa	%xmm5,%xmm2
	pandn	256+16(%rsp),%xmm1
	movdqa	%xmm5,%xmm3
	pand	448(%rsp),%xmm2
	pand	448+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3

	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	pandn	%xmm2,%xmm0
	movdqa	%xmm4,%xmm2
	pandn	%xmm3,%xmm1
	movdqa	%xmm4,%xmm3
	pand	352(%rsp),%xmm2
	pand	352+16(%rsp),%xmm3
	por	%xmm0,%xmm2
	por	%xmm1,%xmm3
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)

	addq	$480+8,%rsp
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	popq	%rbp
	.byte	0xf3,0xc3	/* rep ret */
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
#endif