1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(__aarch64__)
13#if defined(BORINGSSL_PREFIX)
14#include <boringssl_prefix_symbols_asm.h>
15#endif
16#include "openssl/arm_arch.h"
17
18.text
19.align	5
20.Lpoly:
21.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
22.LRR:	//	2^512 mod P precomputed for NIST P256 polynomial
23.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
24.Lone_mont:
25.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
26.Lone:
27.quad	1,0,0,0
28.Lord:
29.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
30.LordK:
31.quad	0xccd1c8aaee00bc4f
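// The constants above, as little-endian 64-bit limbs:
//   .Lpoly     - the NIST P-256 prime  p = 2^256 - 2^224 + 2^192 + 2^96 - 1
//   .LRR       - 2^512 mod p, for converting values into the Montgomery domain
//   .Lone_mont - 1 in Montgomery form, i.e. 2^256 mod p
//   .Lone      - the integer 1
//   .Lord      - the order n of the P-256 base point
//   .LordK     - the 64-bit Montgomery reduction constant for n, used by the
//                ecp_nistz256_ord_* routines below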
32.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
33.align	2
34
35// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
36//					     const BN_ULONG x2[4]);
37.globl	ecp_nistz256_mul_mont
38.hidden	ecp_nistz256_mul_mont
39.type	ecp_nistz256_mul_mont,%function
40.align	4
41ecp_nistz256_mul_mont:
42	AARCH64_SIGN_LINK_REGISTER
43	stp	x29,x30,[sp,#-32]!
44	add	x29,sp,#0
45	stp	x19,x20,[sp,#16]
46
47	ldr	x3,[x2]		// bp[0]
48	ldp	x4,x5,[x1]
49	ldp	x6,x7,[x1,#16]
50	ldr	x12,.Lpoly+8
51	ldr	x13,.Lpoly+24
52
53	bl	__ecp_nistz256_mul_mont
54
55	ldp	x19,x20,[sp,#16]
56	ldp	x29,x30,[sp],#32
57	AARCH64_VALIDATE_LINK_REGISTER
58	ret
59.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
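// ecp_nistz256_mul_mont computes the Montgomery product r = a*b*2^-256 mod p,
// so a plain modular multiplication is obtained by passing through the
// Montgomery domain via .LRR = 2^512 mod p.  A minimal C-level sketch of that
// calling pattern (illustrative only; the helper name p256_mul is hypothetical,
// BN_ULONG is uint64_t on AArch64, and the prototype is as in the comment above):
//
//	static const BN_ULONG RR[4] = {		// same value as .LRR
//		0x0000000000000003, 0xfffffffbffffffff,
//		0xfffffffffffffffe, 0x00000004fffffffd };
//	static const BN_ULONG one[4] = { 1, 0, 0, 0 };
//
//	void p256_mul(BN_ULONG r[4], const BN_ULONG a[4], const BN_ULONG b[4]) {
//		BN_ULONG am[4], bm[4], rm[4];
//		ecp_nistz256_mul_mont(am, a, RR);	// am = a * 2^256 mod p
//		ecp_nistz256_mul_mont(bm, b, RR);	// bm = b * 2^256 mod p
//		ecp_nistz256_mul_mont(rm, am, bm);	// rm = a*b * 2^256 mod p
//		ecp_nistz256_mul_mont(r, rm, one);	// r  = a*b mod p
//	}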
60
61// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
62.globl	ecp_nistz256_sqr_mont
63.hidden	ecp_nistz256_sqr_mont
64.type	ecp_nistz256_sqr_mont,%function
65.align	4
66ecp_nistz256_sqr_mont:
67	AARCH64_SIGN_LINK_REGISTER
68	stp	x29,x30,[sp,#-32]!
69	add	x29,sp,#0
70	stp	x19,x20,[sp,#16]
71
72	ldp	x4,x5,[x1]
73	ldp	x6,x7,[x1,#16]
74	ldr	x12,.Lpoly+8
75	ldr	x13,.Lpoly+24
76
77	bl	__ecp_nistz256_sqr_mont
78
79	ldp	x19,x20,[sp,#16]
80	ldp	x29,x30,[sp],#32
81	AARCH64_VALIDATE_LINK_REGISTER
82	ret
83.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
84
85// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
86.globl	ecp_nistz256_div_by_2
87.hidden	ecp_nistz256_div_by_2
88.type	ecp_nistz256_div_by_2,%function
89.align	4
90ecp_nistz256_div_by_2:
91	AARCH64_SIGN_LINK_REGISTER
92	stp	x29,x30,[sp,#-16]!
93	add	x29,sp,#0
94
95	ldp	x14,x15,[x1]
96	ldp	x16,x17,[x1,#16]
97	ldr	x12,.Lpoly+8
98	ldr	x13,.Lpoly+24
99
100	bl	__ecp_nistz256_div_by_2
101
102	ldp	x29,x30,[sp],#16
103	AARCH64_VALIDATE_LINK_REGISTER
104	ret
105.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
106
107// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
108.globl	ecp_nistz256_mul_by_2
109.hidden	ecp_nistz256_mul_by_2
110.type	ecp_nistz256_mul_by_2,%function
111.align	4
112ecp_nistz256_mul_by_2:
113	AARCH64_SIGN_LINK_REGISTER
114	stp	x29,x30,[sp,#-16]!
115	add	x29,sp,#0
116
117	ldp	x14,x15,[x1]
118	ldp	x16,x17,[x1,#16]
119	ldr	x12,.Lpoly+8
120	ldr	x13,.Lpoly+24
121	mov	x8,x14
122	mov	x9,x15
123	mov	x10,x16
124	mov	x11,x17
125
126	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a
127
128	ldp	x29,x30,[sp],#16
129	AARCH64_VALIDATE_LINK_REGISTER
130	ret
131.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
132
133// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
134.globl	ecp_nistz256_mul_by_3
135.hidden	ecp_nistz256_mul_by_3
136.type	ecp_nistz256_mul_by_3,%function
137.align	4
138ecp_nistz256_mul_by_3:
139	AARCH64_SIGN_LINK_REGISTER
140	stp	x29,x30,[sp,#-16]!
141	add	x29,sp,#0
142
143	ldp	x14,x15,[x1]
144	ldp	x16,x17,[x1,#16]
145	ldr	x12,.Lpoly+8
146	ldr	x13,.Lpoly+24
147	mov	x8,x14
148	mov	x9,x15
149	mov	x10,x16
150	mov	x11,x17
151	mov	x4,x14
152	mov	x5,x15
153	mov	x6,x16
154	mov	x7,x17
155
156	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a
157
158	mov	x8,x4
159	mov	x9,x5
160	mov	x10,x6
161	mov	x11,x7
162
163	bl	__ecp_nistz256_add_to	// ret += a	// 2*a+a=3*a
164
165	ldp	x29,x30,[sp],#16
166	AARCH64_VALIDATE_LINK_REGISTER
167	ret
168.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
169
170// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
171//				        const BN_ULONG x2[4]);
172.globl	ecp_nistz256_sub
173.hidden	ecp_nistz256_sub
174.type	ecp_nistz256_sub,%function
175.align	4
176ecp_nistz256_sub:
177	AARCH64_SIGN_LINK_REGISTER
178	stp	x29,x30,[sp,#-16]!
179	add	x29,sp,#0
180
181	ldp	x14,x15,[x1]
182	ldp	x16,x17,[x1,#16]
183	ldr	x12,.Lpoly+8
184	ldr	x13,.Lpoly+24
185
186	bl	__ecp_nistz256_sub_from
187
188	ldp	x29,x30,[sp],#16
189	AARCH64_VALIDATE_LINK_REGISTER
190	ret
191.size	ecp_nistz256_sub,.-ecp_nistz256_sub
192
193// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
194.globl	ecp_nistz256_neg
195.hidden	ecp_nistz256_neg
196.type	ecp_nistz256_neg,%function
197.align	4
198ecp_nistz256_neg:
199	AARCH64_SIGN_LINK_REGISTER
200	stp	x29,x30,[sp,#-16]!
201	add	x29,sp,#0
202
203	mov	x2,x1
204	mov	x14,xzr		// a = 0
205	mov	x15,xzr
206	mov	x16,xzr
207	mov	x17,xzr
208	ldr	x12,.Lpoly+8
209	ldr	x13,.Lpoly+24
210
211	bl	__ecp_nistz256_sub_from
212
213	ldp	x29,x30,[sp],#16
214	AARCH64_VALIDATE_LINK_REGISTER
215	ret
216.size	ecp_nistz256_neg,.-ecp_nistz256_neg
217
218// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
219// to x4-x7, b[0] to x3 (with x2 still pointing at b), and .Lpoly+8/+24 to x12/x13
220.type	__ecp_nistz256_mul_mont,%function
221.align	4
222__ecp_nistz256_mul_mont:
223	mul	x14,x4,x3		// a[0]*b[0]
224	umulh	x8,x4,x3
225
226	mul	x15,x5,x3		// a[1]*b[0]
227	umulh	x9,x5,x3
228
229	mul	x16,x6,x3		// a[2]*b[0]
230	umulh	x10,x6,x3
231
232	mul	x17,x7,x3		// a[3]*b[0]
233	umulh	x11,x7,x3
234	ldr	x3,[x2,#8]		// b[1]
235
236	adds	x15,x15,x8		// accumulate high parts of multiplication
237	lsl	x8,x14,#32
238	adcs	x16,x16,x9
239	lsr	x9,x14,#32
240	adcs	x17,x17,x10
241	adc	x19,xzr,x11
242	mov	x20,xzr
243	subs	x10,x14,x8		// "*0xffff0001"
244	sbc	x11,x14,x9
245	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
246	mul	x8,x4,x3		// lo(a[0]*b[i])
247	adcs	x15,x16,x9
248	mul	x9,x5,x3		// lo(a[1]*b[i])
249	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
250	mul	x10,x6,x3		// lo(a[2]*b[i])
251	adcs	x17,x19,x11
252	mul	x11,x7,x3		// lo(a[3]*b[i])
253	adc	x19,x20,xzr
254
255	adds	x14,x14,x8		// accumulate low parts of multiplication
256	umulh	x8,x4,x3		// hi(a[0]*b[i])
257	adcs	x15,x15,x9
258	umulh	x9,x5,x3		// hi(a[1]*b[i])
259	adcs	x16,x16,x10
260	umulh	x10,x6,x3		// hi(a[2]*b[i])
261	adcs	x17,x17,x11
262	umulh	x11,x7,x3		// hi(a[3]*b[i])
263	adc	x19,x19,xzr
264	ldr	x3,[x2,#8*(1+1)]	// b[1+1]
265	adds	x15,x15,x8		// accumulate high parts of multiplication
266	lsl	x8,x14,#32
267	adcs	x16,x16,x9
268	lsr	x9,x14,#32
269	adcs	x17,x17,x10
270	adcs	x19,x19,x11
271	adc	x20,xzr,xzr
272	subs	x10,x14,x8		// "*0xffff0001"
273	sbc	x11,x14,x9
274	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
275	mul	x8,x4,x3		// lo(a[0]*b[i])
276	adcs	x15,x16,x9
277	mul	x9,x5,x3		// lo(a[1]*b[i])
278	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
279	mul	x10,x6,x3		// lo(a[2]*b[i])
280	adcs	x17,x19,x11
281	mul	x11,x7,x3		// lo(a[3]*b[i])
282	adc	x19,x20,xzr
283
284	adds	x14,x14,x8		// accumulate low parts of multiplication
285	umulh	x8,x4,x3		// hi(a[0]*b[i])
286	adcs	x15,x15,x9
287	umulh	x9,x5,x3		// hi(a[1]*b[i])
288	adcs	x16,x16,x10
289	umulh	x10,x6,x3		// hi(a[2]*b[i])
290	adcs	x17,x17,x11
291	umulh	x11,x7,x3		// hi(a[3]*b[i])
292	adc	x19,x19,xzr
293	ldr	x3,[x2,#8*(2+1)]	// b[2+1]
294	adds	x15,x15,x8		// accumulate high parts of multiplication
295	lsl	x8,x14,#32
296	adcs	x16,x16,x9
297	lsr	x9,x14,#32
298	adcs	x17,x17,x10
299	adcs	x19,x19,x11
300	adc	x20,xzr,xzr
301	subs	x10,x14,x8		// "*0xffff0001"
302	sbc	x11,x14,x9
303	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
304	mul	x8,x4,x3		// lo(a[0]*b[i])
305	adcs	x15,x16,x9
306	mul	x9,x5,x3		// lo(a[1]*b[i])
307	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
308	mul	x10,x6,x3		// lo(a[2]*b[i])
309	adcs	x17,x19,x11
310	mul	x11,x7,x3		// lo(a[3]*b[i])
311	adc	x19,x20,xzr
312
313	adds	x14,x14,x8		// accumulate low parts of multiplication
314	umulh	x8,x4,x3		// hi(a[0]*b[i])
315	adcs	x15,x15,x9
316	umulh	x9,x5,x3		// hi(a[1]*b[i])
317	adcs	x16,x16,x10
318	umulh	x10,x6,x3		// hi(a[2]*b[i])
319	adcs	x17,x17,x11
320	umulh	x11,x7,x3		// hi(a[3]*b[i])
321	adc	x19,x19,xzr
322	adds	x15,x15,x8		// accumulate high parts of multiplication
323	lsl	x8,x14,#32
324	adcs	x16,x16,x9
325	lsr	x9,x14,#32
326	adcs	x17,x17,x10
327	adcs	x19,x19,x11
328	adc	x20,xzr,xzr
329	// last reduction
330	subs	x10,x14,x8		// "*0xffff0001"
331	sbc	x11,x14,x9
332	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
333	adcs	x15,x16,x9
334	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
335	adcs	x17,x19,x11
336	adc	x19,x20,xzr
337
338	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
339	sbcs	x9,x15,x12
340	sbcs	x10,x16,xzr
341	sbcs	x11,x17,x13
342	sbcs	xzr,x19,xzr		// did it borrow?
343
344	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
345	csel	x15,x15,x9,lo
346	csel	x16,x16,x10,lo
347	stp	x14,x15,[x0]
348	csel	x17,x17,x11,lo
349	stp	x16,x17,[x0,#16]
350
351	ret
352.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
353
354// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
355// to x4-x7
356.type	__ecp_nistz256_sqr_mont,%function
357.align	4
358__ecp_nistz256_sqr_mont:
359	//  |  |  |  |  |  |a1*a0|  |
360	//  |  |  |  |  |a2*a0|  |  |
361	//  |  |a3*a2|a3*a0|  |  |  |
362	//  |  |  |  |a2*a1|  |  |  |
363	//  |  |  |a3*a1|  |  |  |  |
364	// *|  |  |  |  |  |  |  | 2|
365	// +|a3*a3|a2*a2|a1*a1|a0*a0|
366	//  |--+--+--+--+--+--+--+--|
367//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax denotes 64-bit word x of the 512-bit result
368	//
369	//  "can't overflow" below mark carrying into high part of
370	//  multiplication result, which can't overflow, because it
371	//  can never be all ones.
372
373	mul	x15,x5,x4		// a[1]*a[0]
374	umulh	x9,x5,x4
375	mul	x16,x6,x4		// a[2]*a[0]
376	umulh	x10,x6,x4
377	mul	x17,x7,x4		// a[3]*a[0]
378	umulh	x19,x7,x4
379
380	adds	x16,x16,x9		// accumulate high parts of multiplication
381	mul	x8,x6,x5		// a[2]*a[1]
382	umulh	x9,x6,x5
383	adcs	x17,x17,x10
384	mul	x10,x7,x5		// a[3]*a[1]
385	umulh	x11,x7,x5
386	adc	x19,x19,xzr		// can't overflow
387
388	mul	x20,x7,x6		// a[3]*a[2]
389	umulh	x1,x7,x6
390
391	adds	x9,x9,x10		// accumulate high parts of multiplication
392	mul	x14,x4,x4		// a[0]*a[0]
393	adc	x10,x11,xzr		// can't overflow
394
395	adds	x17,x17,x8		// accumulate low parts of multiplication
396	umulh	x4,x4,x4
397	adcs	x19,x19,x9
398	mul	x9,x5,x5		// a[1]*a[1]
399	adcs	x20,x20,x10
400	umulh	x5,x5,x5
401	adc	x1,x1,xzr		// can't overflow
402
403	adds	x15,x15,x15	// acc[1-6]*=2
404	mul	x10,x6,x6		// a[2]*a[2]
405	adcs	x16,x16,x16
406	umulh	x6,x6,x6
407	adcs	x17,x17,x17
408	mul	x11,x7,x7		// a[3]*a[3]
409	adcs	x19,x19,x19
410	umulh	x7,x7,x7
411	adcs	x20,x20,x20
412	adcs	x1,x1,x1
413	adc	x2,xzr,xzr
414
415	adds	x15,x15,x4		// +a[i]*a[i]
416	adcs	x16,x16,x9
417	adcs	x17,x17,x5
418	adcs	x19,x19,x10
419	adcs	x20,x20,x6
420	lsl	x8,x14,#32
421	adcs	x1,x1,x11
422	lsr	x9,x14,#32
423	adc	x2,x2,x7
424	subs	x10,x14,x8		// "*0xffff0001"
425	sbc	x11,x14,x9
426	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
427	adcs	x15,x16,x9
428	lsl	x8,x14,#32
429	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
430	lsr	x9,x14,#32
431	adc	x17,x11,xzr		// can't overflow
432	subs	x10,x14,x8		// "*0xffff0001"
433	sbc	x11,x14,x9
434	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
435	adcs	x15,x16,x9
436	lsl	x8,x14,#32
437	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
438	lsr	x9,x14,#32
439	adc	x17,x11,xzr		// can't overflow
440	subs	x10,x14,x8		// "*0xffff0001"
441	sbc	x11,x14,x9
442	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
443	adcs	x15,x16,x9
444	lsl	x8,x14,#32
445	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
446	lsr	x9,x14,#32
447	adc	x17,x11,xzr		// can't overflow
448	subs	x10,x14,x8		// "*0xffff0001"
449	sbc	x11,x14,x9
450	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
451	adcs	x15,x16,x9
452	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
453	adc	x17,x11,xzr		// can't overflow
454
455	adds	x14,x14,x19	// accumulate upper half
456	adcs	x15,x15,x20
457	adcs	x16,x16,x1
458	adcs	x17,x17,x2
459	adc	x19,xzr,xzr
460
461	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
462	sbcs	x9,x15,x12
463	sbcs	x10,x16,xzr
464	sbcs	x11,x17,x13
465	sbcs	xzr,x19,xzr		// did it borrow?
466
467	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
468	csel	x15,x15,x9,lo
469	csel	x16,x16,x10,lo
470	stp	x14,x15,[x0]
471	csel	x17,x17,x11,lo
472	stp	x16,x17,[x0,#16]
473
474	ret
475.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
476
477// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded,
478// to x14-x17 and x8-x11. This is done because it's used in multiple
479// contexts, e.g. in multiplication by 2 and 3...
480.type	__ecp_nistz256_add_to,%function
481.align	4
482__ecp_nistz256_add_to:
483	adds	x14,x14,x8		// ret = a+b
484	adcs	x15,x15,x9
485	adcs	x16,x16,x10
486	adcs	x17,x17,x11
487	adc	x1,xzr,xzr		// zap x1
488
489	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
490	sbcs	x9,x15,x12
491	sbcs	x10,x16,xzr
492	sbcs	x11,x17,x13
493	sbcs	xzr,x1,xzr		// did subtraction borrow?
494
495	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
496	csel	x15,x15,x9,lo
497	csel	x16,x16,x10,lo
498	stp	x14,x15,[x0]
499	csel	x17,x17,x11,lo
500	stp	x16,x17,[x0,#16]
501
502	ret
503.size	__ecp_nistz256_add_to,.-__ecp_nistz256_add_to
504
505.type	__ecp_nistz256_sub_from,%function
506.align	4
507__ecp_nistz256_sub_from:
508	ldp	x8,x9,[x2]
509	ldp	x10,x11,[x2,#16]
510	subs	x14,x14,x8		// ret = a-b
511	sbcs	x15,x15,x9
512	sbcs	x16,x16,x10
513	sbcs	x17,x17,x11
514	sbc	x1,xzr,xzr		// zap x1
515
516	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = ret+modulus
517	adcs	x9,x15,x12
518	adcs	x10,x16,xzr
519	adc	x11,x17,x13
520	cmp	x1,xzr			// did subtraction borrow?
521
522	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
523	csel	x15,x15,x9,eq
524	csel	x16,x16,x10,eq
525	stp	x14,x15,[x0]
526	csel	x17,x17,x11,eq
527	stp	x16,x17,[x0,#16]
528
529	ret
530.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
531
532.type	__ecp_nistz256_sub_morf,%function
533.align	4
534__ecp_nistz256_sub_morf:
535	ldp	x8,x9,[x2]
536	ldp	x10,x11,[x2,#16]
537	subs	x14,x8,x14		// ret = b-a
538	sbcs	x15,x9,x15
539	sbcs	x16,x10,x16
540	sbcs	x17,x11,x17
541	sbc	x1,xzr,xzr		// zap x1
542
543	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = ret+modulus
544	adcs	x9,x15,x12
545	adcs	x10,x16,xzr
546	adc	x11,x17,x13
547	cmp	x1,xzr			// did subtraction borrow?
548
549	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
550	csel	x15,x15,x9,eq
551	csel	x16,x16,x10,eq
552	stp	x14,x15,[x0]
553	csel	x17,x17,x11,eq
554	stp	x16,x17,[x0,#16]
555
556	ret
557.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
558
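// __ecp_nistz256_div_by_2 halves a reduced field element held in x14-x17:
// x8-x11 (plus the carry in x1) are set to a+p, the conditional selects keep
// a when it is even and a+p when it is odd, and the selected 257-bit value is
// then shifted right by one.  Either way the result equals a * 2^-1 mod p.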
559.type	__ecp_nistz256_div_by_2,%function
560.align	4
561__ecp_nistz256_div_by_2:
562	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = a+modulus
563	adcs	x9,x15,x12
564	adcs	x10,x16,xzr
565	adcs	x11,x17,x13
566	adc	x1,xzr,xzr		// zap x1
567	tst	x14,#1		// is a even?
568
569	csel	x14,x14,x8,eq	// ret = even ? a : a+modulus
570	csel	x15,x15,x9,eq
571	csel	x16,x16,x10,eq
572	csel	x17,x17,x11,eq
573	csel	x1,xzr,x1,eq
574
575	lsr	x14,x14,#1		// ret >>= 1
576	orr	x14,x14,x15,lsl#63
577	lsr	x15,x15,#1
578	orr	x15,x15,x16,lsl#63
579	lsr	x16,x16,#1
580	orr	x16,x16,x17,lsl#63
581	lsr	x17,x17,#1
582	stp	x14,x15,[x0]
583	orr	x17,x17,x1,lsl#63
584	stp	x16,x17,[x0,#16]
585
586	ret
587.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
588.globl	ecp_nistz256_point_double
589.hidden	ecp_nistz256_point_double
590.type	ecp_nistz256_point_double,%function
591.align	5
592ecp_nistz256_point_double:
593	AARCH64_SIGN_LINK_REGISTER
594	stp	x29,x30,[sp,#-96]!
595	add	x29,sp,#0
596	stp	x19,x20,[sp,#16]
597	stp	x21,x22,[sp,#32]
598	sub	sp,sp,#32*4
599
600.Ldouble_shortcut:
601	ldp	x14,x15,[x1,#32]
602	mov	x21,x0
603	ldp	x16,x17,[x1,#48]
604	mov	x22,x1
605	ldr	x12,.Lpoly+8
606	mov	x8,x14
607	ldr	x13,.Lpoly+24
608	mov	x9,x15
609	ldp	x4,x5,[x22,#64]	// forward load for p256_sqr_mont
610	mov	x10,x16
611	mov	x11,x17
612	ldp	x6,x7,[x22,#64+16]
613	add	x0,sp,#0
614	bl	__ecp_nistz256_add_to	// p256_mul_by_2(S, in_y);
615
616	add	x0,sp,#64
617	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);
618
619	ldp	x8,x9,[x22]
620	ldp	x10,x11,[x22,#16]
621	mov	x4,x14		// put Zsqr aside for p256_sub
622	mov	x5,x15
623	mov	x6,x16
624	mov	x7,x17
625	add	x0,sp,#32
626	bl	__ecp_nistz256_add_to	// p256_add(M, Zsqr, in_x);
627
628	add	x2,x22,#0
629	mov	x14,x4		// restore Zsqr
630	mov	x15,x5
631	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
632	mov	x16,x6
633	mov	x17,x7
634	ldp	x6,x7,[sp,#0+16]
635	add	x0,sp,#64
636	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);
637
638	add	x0,sp,#0
639	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);
640
641	ldr	x3,[x22,#32]
642	ldp	x4,x5,[x22,#64]
643	ldp	x6,x7,[x22,#64+16]
644	add	x2,x22,#32
645	add	x0,sp,#96
646	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);
647
648	mov	x8,x14
649	mov	x9,x15
650	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
651	mov	x10,x16
652	mov	x11,x17
653	ldp	x6,x7,[sp,#0+16]
654	add	x0,x21,#64
655	bl	__ecp_nistz256_add_to	// p256_mul_by_2(res_z, tmp0);
656
657	add	x0,sp,#96
658	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);
659
660	ldr	x3,[sp,#64]		// forward load for p256_mul_mont
661	ldp	x4,x5,[sp,#32]
662	ldp	x6,x7,[sp,#32+16]
663	add	x0,x21,#32
664	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);
665
666	add	x2,sp,#64
667	add	x0,sp,#32
668	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);
669
670	mov	x8,x14		// duplicate M
671	mov	x9,x15
672	mov	x10,x16
673	mov	x11,x17
674	mov	x4,x14		// put M aside
675	mov	x5,x15
676	mov	x6,x16
677	mov	x7,x17
678	add	x0,sp,#32
679	bl	__ecp_nistz256_add_to
680	mov	x8,x4			// restore M
681	mov	x9,x5
682	ldr	x3,[x22]		// forward load for p256_mul_mont
683	mov	x10,x6
684	ldp	x4,x5,[sp,#0]
685	mov	x11,x7
686	ldp	x6,x7,[sp,#0+16]
687	bl	__ecp_nistz256_add_to	// p256_mul_by_3(M, M);
688
689	add	x2,x22,#0
690	add	x0,sp,#0
691	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);
692
693	mov	x8,x14
694	mov	x9,x15
695	ldp	x4,x5,[sp,#32]	// forward load for p256_sqr_mont
696	mov	x10,x16
697	mov	x11,x17
698	ldp	x6,x7,[sp,#32+16]
699	add	x0,sp,#96
700	bl	__ecp_nistz256_add_to	// p256_mul_by_2(tmp0, S);
701
702	add	x0,x21,#0
703	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);
704
705	add	x2,sp,#96
706	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);
707
708	add	x2,sp,#0
709	add	x0,sp,#0
710	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);
711
712	ldr	x3,[sp,#32]
713	mov	x4,x14		// copy S
714	mov	x5,x15
715	mov	x6,x16
716	mov	x7,x17
717	add	x2,sp,#32
718	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);
719
720	add	x2,x21,#32
721	add	x0,x21,#32
722	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);
723
724	add	sp,x29,#0		// destroy frame
725	ldp	x19,x20,[x29,#16]
726	ldp	x21,x22,[x29,#32]
727	ldp	x29,x30,[sp],#96
728	AARCH64_VALIDATE_LINK_REGISTER
729	ret
730.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
731.globl	ecp_nistz256_point_add
732.hidden	ecp_nistz256_point_add
733.type	ecp_nistz256_point_add,%function
734.align	5
735ecp_nistz256_point_add:
736	AARCH64_SIGN_LINK_REGISTER
737	stp	x29,x30,[sp,#-96]!
738	add	x29,sp,#0
739	stp	x19,x20,[sp,#16]
740	stp	x21,x22,[sp,#32]
741	stp	x23,x24,[sp,#48]
742	stp	x25,x26,[sp,#64]
743	stp	x27,x28,[sp,#80]
744	sub	sp,sp,#32*12
745
746	ldp	x4,x5,[x2,#64]	// in2_z
747	ldp	x6,x7,[x2,#64+16]
748	mov	x21,x0
749	mov	x22,x1
750	mov	x23,x2
751	ldr	x12,.Lpoly+8
752	ldr	x13,.Lpoly+24
753	orr	x8,x4,x5
754	orr	x10,x6,x7
755	orr	x25,x8,x10
756	cmp	x25,#0
757	csetm	x25,ne		// ~in2infty
758	add	x0,sp,#192
759	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);
760
761	ldp	x4,x5,[x22,#64]	// in1_z
762	ldp	x6,x7,[x22,#64+16]
763	orr	x8,x4,x5
764	orr	x10,x6,x7
765	orr	x24,x8,x10
766	cmp	x24,#0
767	csetm	x24,ne		// ~in1infty
768	add	x0,sp,#128
769	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
770
771	ldr	x3,[x23,#64]
772	ldp	x4,x5,[sp,#192]
773	ldp	x6,x7,[sp,#192+16]
774	add	x2,x23,#64
775	add	x0,sp,#320
776	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);
777
778	ldr	x3,[x22,#64]
779	ldp	x4,x5,[sp,#128]
780	ldp	x6,x7,[sp,#128+16]
781	add	x2,x22,#64
782	add	x0,sp,#352
783	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
784
785	ldr	x3,[x22,#32]
786	ldp	x4,x5,[sp,#320]
787	ldp	x6,x7,[sp,#320+16]
788	add	x2,x22,#32
789	add	x0,sp,#320
790	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);
791
792	ldr	x3,[x23,#32]
793	ldp	x4,x5,[sp,#352]
794	ldp	x6,x7,[sp,#352+16]
795	add	x2,x23,#32
796	add	x0,sp,#352
797	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
798
799	add	x2,sp,#320
800	ldr	x3,[sp,#192]	// forward load for p256_mul_mont
801	ldp	x4,x5,[x22]
802	ldp	x6,x7,[x22,#16]
803	add	x0,sp,#160
804	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);
805
806	orr	x14,x14,x15	// see if result is zero
807	orr	x16,x16,x17
808	orr	x26,x14,x16	// ~is_equal(S1,S2)
809
810	add	x2,sp,#192
811	add	x0,sp,#256
812	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);
813
814	ldr	x3,[sp,#128]
815	ldp	x4,x5,[x23]
816	ldp	x6,x7,[x23,#16]
817	add	x2,sp,#128
818	add	x0,sp,#288
819	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);
820
821	add	x2,sp,#256
822	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
823	ldp	x6,x7,[sp,#160+16]
824	add	x0,sp,#96
825	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);
826
827	orr	x14,x14,x15	// see if result is zero
828	orr	x16,x16,x17
829	orr	x14,x14,x16	// ~is_equal(U1,U2)
830
831	mvn	x27,x24	// -1/0 -> 0/-1
832	mvn	x28,x25	// -1/0 -> 0/-1
833	orr	x14,x14,x27
834	orr	x14,x14,x28
835	orr	x14,x14,x26
836	cbnz	x14,.Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
837
838.Ladd_double:
839	mov	x1,x22
840	mov	x0,x21
841	ldp	x23,x24,[x29,#48]
842	ldp	x25,x26,[x29,#64]
843	ldp	x27,x28,[x29,#80]
844	add	sp,sp,#256	// #256 is from #32*(12-4). difference in stack frames
845	b	.Ldouble_shortcut
846
847.align	4
848.Ladd_proceed:
849	add	x0,sp,#192
850	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
851
852	ldr	x3,[x22,#64]
853	ldp	x4,x5,[sp,#96]
854	ldp	x6,x7,[sp,#96+16]
855	add	x2,x22,#64
856	add	x0,sp,#64
857	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
858
859	ldp	x4,x5,[sp,#96]
860	ldp	x6,x7,[sp,#96+16]
861	add	x0,sp,#128
862	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
863
864	ldr	x3,[x23,#64]
865	ldp	x4,x5,[sp,#64]
866	ldp	x6,x7,[sp,#64+16]
867	add	x2,x23,#64
868	add	x0,sp,#64
869	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);
870
871	ldr	x3,[sp,#96]
872	ldp	x4,x5,[sp,#128]
873	ldp	x6,x7,[sp,#128+16]
874	add	x2,sp,#96
875	add	x0,sp,#224
876	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
877
878	ldr	x3,[sp,#128]
879	ldp	x4,x5,[sp,#256]
880	ldp	x6,x7,[sp,#256+16]
881	add	x2,sp,#128
882	add	x0,sp,#288
883	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);
884
885	mov	x8,x14
886	mov	x9,x15
887	mov	x10,x16
888	mov	x11,x17
889	add	x0,sp,#128
890	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);
891
892	add	x2,sp,#192
893	add	x0,sp,#0
894	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
895
896	add	x2,sp,#224
897	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
898
899	add	x2,sp,#288
900	ldr	x3,[sp,#224]		// forward load for p256_mul_mont
901	ldp	x4,x5,[sp,#320]
902	ldp	x6,x7,[sp,#320+16]
903	add	x0,sp,#32
904	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
905
906	add	x2,sp,#224
907	add	x0,sp,#352
908	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);
909
910	ldr	x3,[sp,#160]
911	ldp	x4,x5,[sp,#32]
912	ldp	x6,x7,[sp,#32+16]
913	add	x2,sp,#160
914	add	x0,sp,#32
915	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
916
917	add	x2,sp,#352
918	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
919
920	ldp	x4,x5,[sp,#0]		// res
921	ldp	x6,x7,[sp,#0+16]
922	ldp	x8,x9,[x23]		// in2
923	ldp	x10,x11,[x23,#16]
924	ldp	x14,x15,[x22,#0]	// in1
925	cmp	x24,#0			// ~, remember?
926	ldp	x16,x17,[x22,#0+16]
927	csel	x8,x4,x8,ne
928	csel	x9,x5,x9,ne
929	ldp	x4,x5,[sp,#0+0+32]	// res
930	csel	x10,x6,x10,ne
931	csel	x11,x7,x11,ne
932	cmp	x25,#0			// ~, remember?
933	ldp	x6,x7,[sp,#0+0+48]
934	csel	x14,x8,x14,ne
935	csel	x15,x9,x15,ne
936	ldp	x8,x9,[x23,#0+32]	// in2
937	csel	x16,x10,x16,ne
938	csel	x17,x11,x17,ne
939	ldp	x10,x11,[x23,#0+48]
940	stp	x14,x15,[x21,#0]
941	stp	x16,x17,[x21,#0+16]
942	ldp	x14,x15,[x22,#32]	// in1
943	cmp	x24,#0			// ~, remember?
944	ldp	x16,x17,[x22,#32+16]
945	csel	x8,x4,x8,ne
946	csel	x9,x5,x9,ne
947	ldp	x4,x5,[sp,#0+32+32]	// res
948	csel	x10,x6,x10,ne
949	csel	x11,x7,x11,ne
950	cmp	x25,#0			// ~, remember?
951	ldp	x6,x7,[sp,#0+32+48]
952	csel	x14,x8,x14,ne
953	csel	x15,x9,x15,ne
954	ldp	x8,x9,[x23,#32+32]	// in2
955	csel	x16,x10,x16,ne
956	csel	x17,x11,x17,ne
957	ldp	x10,x11,[x23,#32+48]
958	stp	x14,x15,[x21,#32]
959	stp	x16,x17,[x21,#32+16]
960	ldp	x14,x15,[x22,#64]	// in1
961	cmp	x24,#0			// ~, remember?
962	ldp	x16,x17,[x22,#64+16]
963	csel	x8,x4,x8,ne
964	csel	x9,x5,x9,ne
965	csel	x10,x6,x10,ne
966	csel	x11,x7,x11,ne
967	cmp	x25,#0			// ~, remember?
968	csel	x14,x8,x14,ne
969	csel	x15,x9,x15,ne
970	csel	x16,x10,x16,ne
971	csel	x17,x11,x17,ne
972	stp	x14,x15,[x21,#64]
973	stp	x16,x17,[x21,#64+16]
974
975.Ladd_done:
976	add	sp,x29,#0		// destroy frame
977	ldp	x19,x20,[x29,#16]
978	ldp	x21,x22,[x29,#32]
979	ldp	x23,x24,[x29,#48]
980	ldp	x25,x26,[x29,#64]
981	ldp	x27,x28,[x29,#80]
982	ldp	x29,x30,[sp],#96
983	AARCH64_VALIDATE_LINK_REGISTER
984	ret
985.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
986.globl	ecp_nistz256_point_add_affine
987.hidden	ecp_nistz256_point_add_affine
988.type	ecp_nistz256_point_add_affine,%function
989.align	5
990ecp_nistz256_point_add_affine:
991	AARCH64_SIGN_LINK_REGISTER
992	stp	x29,x30,[sp,#-80]!
993	add	x29,sp,#0
994	stp	x19,x20,[sp,#16]
995	stp	x21,x22,[sp,#32]
996	stp	x23,x24,[sp,#48]
997	stp	x25,x26,[sp,#64]
998	sub	sp,sp,#32*10
999
1000	mov	x21,x0
1001	mov	x22,x1
1002	mov	x23,x2
1003	ldr	x12,.Lpoly+8
1004	ldr	x13,.Lpoly+24
1005
1006	ldp	x4,x5,[x1,#64]	// in1_z
1007	ldp	x6,x7,[x1,#64+16]
1008	orr	x8,x4,x5
1009	orr	x10,x6,x7
1010	orr	x24,x8,x10
1011	cmp	x24,#0
1012	csetm	x24,ne		// ~in1infty
1013
1014	ldp	x14,x15,[x2]	// in2_x
1015	ldp	x16,x17,[x2,#16]
1016	ldp	x8,x9,[x2,#32]	// in2_y
1017	ldp	x10,x11,[x2,#48]
1018	orr	x14,x14,x15
1019	orr	x16,x16,x17
1020	orr	x8,x8,x9
1021	orr	x10,x10,x11
1022	orr	x14,x14,x16
1023	orr	x8,x8,x10
1024	orr	x25,x14,x8
1025	cmp	x25,#0
1026	csetm	x25,ne		// ~in2infty
1027
1028	add	x0,sp,#128
1029	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);
1030
1031	mov	x4,x14
1032	mov	x5,x15
1033	mov	x6,x16
1034	mov	x7,x17
1035	ldr	x3,[x23]
1036	add	x2,x23,#0
1037	add	x0,sp,#96
1038	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);
1039
1040	add	x2,x22,#0
1041	ldr	x3,[x22,#64]	// forward load for p256_mul_mont
1042	ldp	x4,x5,[sp,#128]
1043	ldp	x6,x7,[sp,#128+16]
1044	add	x0,sp,#160
1045	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);
1046
1047	add	x2,x22,#64
1048	add	x0,sp,#128
1049	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);
1050
1051	ldr	x3,[x22,#64]
1052	ldp	x4,x5,[sp,#160]
1053	ldp	x6,x7,[sp,#160+16]
1054	add	x2,x22,#64
1055	add	x0,sp,#64
1056	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);
1057
1058	ldr	x3,[x23,#32]
1059	ldp	x4,x5,[sp,#128]
1060	ldp	x6,x7,[sp,#128+16]
1061	add	x2,x23,#32
1062	add	x0,sp,#128
1063	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);
1064
1065	add	x2,x22,#32
1066	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
1067	ldp	x6,x7,[sp,#160+16]
1068	add	x0,sp,#192
1069	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);
1070
1071	add	x0,sp,#224
1072	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);
1073
1074	ldp	x4,x5,[sp,#192]
1075	ldp	x6,x7,[sp,#192+16]
1076	add	x0,sp,#288
1077	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);
1078
1079	ldr	x3,[sp,#160]
1080	ldp	x4,x5,[sp,#224]
1081	ldp	x6,x7,[sp,#224+16]
1082	add	x2,sp,#160
1083	add	x0,sp,#256
1084	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);
1085
1086	ldr	x3,[x22]
1087	ldp	x4,x5,[sp,#224]
1088	ldp	x6,x7,[sp,#224+16]
1089	add	x2,x22,#0
1090	add	x0,sp,#96
1091	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);
1092
1093	mov	x8,x14
1094	mov	x9,x15
1095	mov	x10,x16
1096	mov	x11,x17
1097	add	x0,sp,#224
1098	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);
1099
1100	add	x2,sp,#288
1101	add	x0,sp,#0
1102	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);
1103
1104	add	x2,sp,#256
1105	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);
1106
1107	add	x2,sp,#96
1108	ldr	x3,[x22,#32]	// forward load for p256_mul_mont
1109	ldp	x4,x5,[sp,#256]
1110	ldp	x6,x7,[sp,#256+16]
1111	add	x0,sp,#32
1112	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);
1113
1114	add	x2,x22,#32
1115	add	x0,sp,#128
1116	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);
1117
1118	ldr	x3,[sp,#192]
1119	ldp	x4,x5,[sp,#32]
1120	ldp	x6,x7,[sp,#32+16]
1121	add	x2,sp,#192
1122	add	x0,sp,#32
1123	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);
1124
1125	add	x2,sp,#128
1126	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);
1127
1128	ldp	x4,x5,[sp,#0]		// res
1129	ldp	x6,x7,[sp,#0+16]
1130	ldp	x8,x9,[x23]		// in2
1131	ldp	x10,x11,[x23,#16]
1132	ldp	x14,x15,[x22,#0]	// in1
1133	cmp	x24,#0			// ~, remember?
1134	ldp	x16,x17,[x22,#0+16]
1135	csel	x8,x4,x8,ne
1136	csel	x9,x5,x9,ne
1137	ldp	x4,x5,[sp,#0+0+32]	// res
1138	csel	x10,x6,x10,ne
1139	csel	x11,x7,x11,ne
1140	cmp	x25,#0			// ~, remember?
1141	ldp	x6,x7,[sp,#0+0+48]
1142	csel	x14,x8,x14,ne
1143	csel	x15,x9,x15,ne
1144	ldp	x8,x9,[x23,#0+32]	// in2
1145	csel	x16,x10,x16,ne
1146	csel	x17,x11,x17,ne
1147	ldp	x10,x11,[x23,#0+48]
1148	stp	x14,x15,[x21,#0]
1149	stp	x16,x17,[x21,#0+16]
1150	adr	x23,.Lone_mont-64
1151	ldp	x14,x15,[x22,#32]	// in1
1152	cmp	x24,#0			// ~, remember?
1153	ldp	x16,x17,[x22,#32+16]
1154	csel	x8,x4,x8,ne
1155	csel	x9,x5,x9,ne
1156	ldp	x4,x5,[sp,#0+32+32]	// res
1157	csel	x10,x6,x10,ne
1158	csel	x11,x7,x11,ne
1159	cmp	x25,#0			// ~, remember?
1160	ldp	x6,x7,[sp,#0+32+48]
1161	csel	x14,x8,x14,ne
1162	csel	x15,x9,x15,ne
1163	ldp	x8,x9,[x23,#32+32]	// in2
1164	csel	x16,x10,x16,ne
1165	csel	x17,x11,x17,ne
1166	ldp	x10,x11,[x23,#32+48]
1167	stp	x14,x15,[x21,#32]
1168	stp	x16,x17,[x21,#32+16]
1169	ldp	x14,x15,[x22,#64]	// in1
1170	cmp	x24,#0			// ~, remember?
1171	ldp	x16,x17,[x22,#64+16]
1172	csel	x8,x4,x8,ne
1173	csel	x9,x5,x9,ne
1174	csel	x10,x6,x10,ne
1175	csel	x11,x7,x11,ne
1176	cmp	x25,#0			// ~, remember?
1177	csel	x14,x8,x14,ne
1178	csel	x15,x9,x15,ne
1179	csel	x16,x10,x16,ne
1180	csel	x17,x11,x17,ne
1181	stp	x14,x15,[x21,#64]
1182	stp	x16,x17,[x21,#64+16]
1183
1184	add	sp,x29,#0		// destroy frame
1185	ldp	x19,x20,[x29,#16]
1186	ldp	x21,x22,[x29,#32]
1187	ldp	x23,x24,[x29,#48]
1188	ldp	x25,x26,[x29,#64]
1189	ldp	x29,x30,[sp],#80
1190	AARCH64_VALIDATE_LINK_REGISTER
1191	ret
1192.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1193////////////////////////////////////////////////////////////////////////
1194// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
1195//                                uint64_t b[4]);
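// Like ecp_nistz256_mul_mont above, the routine below computes a Montgomery
// product, but reduced modulo the group order .Lord instead of the field
// prime, using the 64-bit constant .LordK in the per-word reduction:
// res = a*b*2^-256 mod n.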
1196.globl	ecp_nistz256_ord_mul_mont
1197.hidden	ecp_nistz256_ord_mul_mont
1198.type	ecp_nistz256_ord_mul_mont,%function
1199.align	4
1200ecp_nistz256_ord_mul_mont:
1201	AARCH64_VALID_CALL_TARGET
1202	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is never reloaded, so the return address does not need to be signed.
1203	stp	x29,x30,[sp,#-64]!
1204	add	x29,sp,#0
1205	stp	x19,x20,[sp,#16]
1206	stp	x21,x22,[sp,#32]
1207	stp	x23,x24,[sp,#48]
1208
1209	adr	x23,.Lord
1210	ldr	x3,[x2]		// bp[0]
1211	ldp	x4,x5,[x1]
1212	ldp	x6,x7,[x1,#16]
1213
1214	ldp	x12,x13,[x23,#0]
1215	ldp	x21,x22,[x23,#16]
1216	ldr	x23,[x23,#32]
1217
1218	mul	x14,x4,x3		// a[0]*b[0]
1219	umulh	x8,x4,x3
1220
1221	mul	x15,x5,x3		// a[1]*b[0]
1222	umulh	x9,x5,x3
1223
1224	mul	x16,x6,x3		// a[2]*b[0]
1225	umulh	x10,x6,x3
1226
1227	mul	x17,x7,x3		// a[3]*b[0]
1228	umulh	x19,x7,x3
1229
1230	mul	x24,x14,x23
1231
1232	adds	x15,x15,x8		// accumulate high parts of multiplication
1233	adcs	x16,x16,x9
1234	adcs	x17,x17,x10
1235	adc	x19,x19,xzr
1236	mov	x20,xzr
1237	ldr	x3,[x2,#8*1]		// b[i]
1238
1239	lsl	x8,x24,#32
1240	subs	x16,x16,x24
1241	lsr	x9,x24,#32
1242	sbcs	x17,x17,x8
1243	sbcs	x19,x19,x9
1244	sbc	x20,x20,xzr
1245
1246	subs	xzr,x14,#1
1247	umulh	x9,x12,x24
1248	mul	x10,x13,x24
1249	umulh	x11,x13,x24
1250
1251	adcs	x10,x10,x9
1252	mul	x8,x4,x3
1253	adc	x11,x11,xzr
1254	mul	x9,x5,x3
1255
1256	adds	x14,x15,x10
1257	mul	x10,x6,x3
1258	adcs	x15,x16,x11
1259	mul	x11,x7,x3
1260	adcs	x16,x17,x24
1261	adcs	x17,x19,x24
1262	adc	x19,x20,xzr
1263
1264	adds	x14,x14,x8		// accumulate low parts
1265	umulh	x8,x4,x3
1266	adcs	x15,x15,x9
1267	umulh	x9,x5,x3
1268	adcs	x16,x16,x10
1269	umulh	x10,x6,x3
1270	adcs	x17,x17,x11
1271	umulh	x11,x7,x3
1272	adc	x19,x19,xzr
1273	mul	x24,x14,x23
1274	adds	x15,x15,x8		// accumulate high parts
1275	adcs	x16,x16,x9
1276	adcs	x17,x17,x10
1277	adcs	x19,x19,x11
1278	adc	x20,xzr,xzr
1279	ldr	x3,[x2,#8*2]		// b[i]
1280
1281	lsl	x8,x24,#32
1282	subs	x16,x16,x24
1283	lsr	x9,x24,#32
1284	sbcs	x17,x17,x8
1285	sbcs	x19,x19,x9
1286	sbc	x20,x20,xzr
1287
1288	subs	xzr,x14,#1
1289	umulh	x9,x12,x24
1290	mul	x10,x13,x24
1291	umulh	x11,x13,x24
1292
1293	adcs	x10,x10,x9
1294	mul	x8,x4,x3
1295	adc	x11,x11,xzr
1296	mul	x9,x5,x3
1297
1298	adds	x14,x15,x10
1299	mul	x10,x6,x3
1300	adcs	x15,x16,x11
1301	mul	x11,x7,x3
1302	adcs	x16,x17,x24
1303	adcs	x17,x19,x24
1304	adc	x19,x20,xzr
1305
1306	adds	x14,x14,x8		// accumulate low parts
1307	umulh	x8,x4,x3
1308	adcs	x15,x15,x9
1309	umulh	x9,x5,x3
1310	adcs	x16,x16,x10
1311	umulh	x10,x6,x3
1312	adcs	x17,x17,x11
1313	umulh	x11,x7,x3
1314	adc	x19,x19,xzr
1315	mul	x24,x14,x23
1316	adds	x15,x15,x8		// accumulate high parts
1317	adcs	x16,x16,x9
1318	adcs	x17,x17,x10
1319	adcs	x19,x19,x11
1320	adc	x20,xzr,xzr
1321	ldr	x3,[x2,#8*3]		// b[i]
1322
1323	lsl	x8,x24,#32
1324	subs	x16,x16,x24
1325	lsr	x9,x24,#32
1326	sbcs	x17,x17,x8
1327	sbcs	x19,x19,x9
1328	sbc	x20,x20,xzr
1329
1330	subs	xzr,x14,#1
1331	umulh	x9,x12,x24
1332	mul	x10,x13,x24
1333	umulh	x11,x13,x24
1334
1335	adcs	x10,x10,x9
1336	mul	x8,x4,x3
1337	adc	x11,x11,xzr
1338	mul	x9,x5,x3
1339
1340	adds	x14,x15,x10
1341	mul	x10,x6,x3
1342	adcs	x15,x16,x11
1343	mul	x11,x7,x3
1344	adcs	x16,x17,x24
1345	adcs	x17,x19,x24
1346	adc	x19,x20,xzr
1347
1348	adds	x14,x14,x8		// accumulate low parts
1349	umulh	x8,x4,x3
1350	adcs	x15,x15,x9
1351	umulh	x9,x5,x3
1352	adcs	x16,x16,x10
1353	umulh	x10,x6,x3
1354	adcs	x17,x17,x11
1355	umulh	x11,x7,x3
1356	adc	x19,x19,xzr
1357	mul	x24,x14,x23
1358	adds	x15,x15,x8		// accumulate high parts
1359	adcs	x16,x16,x9
1360	adcs	x17,x17,x10
1361	adcs	x19,x19,x11
1362	adc	x20,xzr,xzr
1363	lsl	x8,x24,#32		// last reduction
1364	subs	x16,x16,x24
1365	lsr	x9,x24,#32
1366	sbcs	x17,x17,x8
1367	sbcs	x19,x19,x9
1368	sbc	x20,x20,xzr
1369
1370	subs	xzr,x14,#1
1371	umulh	x9,x12,x24
1372	mul	x10,x13,x24
1373	umulh	x11,x13,x24
1374
1375	adcs	x10,x10,x9
1376	adc	x11,x11,xzr
1377
1378	adds	x14,x15,x10
1379	adcs	x15,x16,x11
1380	adcs	x16,x17,x24
1381	adcs	x17,x19,x24
1382	adc	x19,x20,xzr
1383
1384	subs	x8,x14,x12		// ret -= modulus
1385	sbcs	x9,x15,x13
1386	sbcs	x10,x16,x21
1387	sbcs	x11,x17,x22
1388	sbcs	xzr,x19,xzr
1389
1390	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
1391	csel	x15,x15,x9,lo
1392	csel	x16,x16,x10,lo
1393	stp	x14,x15,[x0]
1394	csel	x17,x17,x11,lo
1395	stp	x16,x17,[x0,#16]
1396
1397	ldp	x19,x20,[sp,#16]
1398	ldp	x21,x22,[sp,#32]
1399	ldp	x23,x24,[sp,#48]
1400	ldr	x29,[sp],#64
1401	ret
1402.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
1403
1404////////////////////////////////////////////////////////////////////////
1405// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
1406//                                int rep);
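// The routine below performs rep consecutive Montgomery squarings modulo the
// group order: for an input in Montgomery form (a*2^256 mod n) the result is
// a^(2^rep)*2^256 mod n.  Callers chain such blocks to build constant-time
// exponentiation, e.g. for scalar inversion.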
1407.globl	ecp_nistz256_ord_sqr_mont
1408.hidden	ecp_nistz256_ord_sqr_mont
1409.type	ecp_nistz256_ord_sqr_mont,%function
1410.align	4
1411ecp_nistz256_ord_sqr_mont:
1412	AARCH64_VALID_CALL_TARGET
1413	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is never reloaded, so the return address does not need to be signed.
1414	stp	x29,x30,[sp,#-64]!
1415	add	x29,sp,#0
1416	stp	x19,x20,[sp,#16]
1417	stp	x21,x22,[sp,#32]
1418	stp	x23,x24,[sp,#48]
1419
1420	adr	x23,.Lord
1421	ldp	x4,x5,[x1]
1422	ldp	x6,x7,[x1,#16]
1423
1424	ldp	x12,x13,[x23,#0]
1425	ldp	x21,x22,[x23,#16]
1426	ldr	x23,[x23,#32]
1427	b	.Loop_ord_sqr
1428
1429.align	4
1430.Loop_ord_sqr:
1431	sub	x2,x2,#1
1432	////////////////////////////////////////////////////////////////
1433	//  |  |  |  |  |  |a1*a0|  |
1434	//  |  |  |  |  |a2*a0|  |  |
1435	//  |  |a3*a2|a3*a0|  |  |  |
1436	//  |  |  |  |a2*a1|  |  |  |
1437	//  |  |  |a3*a1|  |  |  |  |
1438	// *|  |  |  |  |  |  |  | 2|
1439	// +|a3*a3|a2*a2|a1*a1|a0*a0|
1440	//  |--+--+--+--+--+--+--+--|
1441	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow
1442	//
1443	//  "can't overflow" below mark carrying into high part of
1444	//  multiplication result, which can't overflow, because it
1445	//  can never be all ones.
1446
1447	mul	x15,x5,x4		// a[1]*a[0]
1448	umulh	x9,x5,x4
1449	mul	x16,x6,x4		// a[2]*a[0]
1450	umulh	x10,x6,x4
1451	mul	x17,x7,x4		// a[3]*a[0]
1452	umulh	x19,x7,x4
1453
1454	adds	x16,x16,x9		// accumulate high parts of multiplication
1455	mul	x8,x6,x5		// a[2]*a[1]
1456	umulh	x9,x6,x5
1457	adcs	x17,x17,x10
1458	mul	x10,x7,x5		// a[3]*a[1]
1459	umulh	x11,x7,x5
1460	adc	x19,x19,xzr		// can't overflow
1461
1462	mul	x20,x7,x6		// a[3]*a[2]
1463	umulh	x1,x7,x6
1464
1465	adds	x9,x9,x10		// accumulate high parts of multiplication
1466	mul	x14,x4,x4		// a[0]*a[0]
1467	adc	x10,x11,xzr		// can't overflow
1468
1469	adds	x17,x17,x8		// accumulate low parts of multiplication
1470	umulh	x4,x4,x4
1471	adcs	x19,x19,x9
1472	mul	x9,x5,x5		// a[1]*a[1]
1473	adcs	x20,x20,x10
1474	umulh	x5,x5,x5
1475	adc	x1,x1,xzr		// can't overflow
1476
1477	adds	x15,x15,x15	// acc[1-6]*=2
1478	mul	x10,x6,x6		// a[2]*a[2]
1479	adcs	x16,x16,x16
1480	umulh	x6,x6,x6
1481	adcs	x17,x17,x17
1482	mul	x11,x7,x7		// a[3]*a[3]
1483	adcs	x19,x19,x19
1484	umulh	x7,x7,x7
1485	adcs	x20,x20,x20
1486	adcs	x1,x1,x1
1487	adc	x3,xzr,xzr
1488
1489	adds	x15,x15,x4		// +a[i]*a[i]
1490	mul	x24,x14,x23
1491	adcs	x16,x16,x9
1492	adcs	x17,x17,x5
1493	adcs	x19,x19,x10
1494	adcs	x20,x20,x6
1495	adcs	x1,x1,x11
1496	adc	x3,x3,x7
1497	subs	xzr,x14,#1
1498	umulh	x9,x12,x24
1499	mul	x10,x13,x24
1500	umulh	x11,x13,x24
1501
1502	adcs	x10,x10,x9
1503	adc	x11,x11,xzr
1504
1505	adds	x14,x15,x10
1506	adcs	x15,x16,x11
1507	adcs	x16,x17,x24
1508	adc	x17,xzr,x24		// can't overflow
1509	mul	x11,x14,x23
1510	lsl	x8,x24,#32
1511	subs	x15,x15,x24
1512	lsr	x9,x24,#32
1513	sbcs	x16,x16,x8
1514	sbc	x17,x17,x9		// can't borrow
1515	subs	xzr,x14,#1
1516	umulh	x9,x12,x11
1517	mul	x10,x13,x11
1518	umulh	x24,x13,x11
1519
1520	adcs	x10,x10,x9
1521	adc	x24,x24,xzr
1522
1523	adds	x14,x15,x10
1524	adcs	x15,x16,x24
1525	adcs	x16,x17,x11
1526	adc	x17,xzr,x11		// can't overflow
1527	mul	x24,x14,x23
1528	lsl	x8,x11,#32
1529	subs	x15,x15,x11
1530	lsr	x9,x11,#32
1531	sbcs	x16,x16,x8
1532	sbc	x17,x17,x9		// can't borrow
1533	subs	xzr,x14,#1
1534	umulh	x9,x12,x24
1535	mul	x10,x13,x24
1536	umulh	x11,x13,x24
1537
1538	adcs	x10,x10,x9
1539	adc	x11,x11,xzr
1540
1541	adds	x14,x15,x10
1542	adcs	x15,x16,x11
1543	adcs	x16,x17,x24
1544	adc	x17,xzr,x24		// can't overflow
1545	mul	x11,x14,x23
1546	lsl	x8,x24,#32
1547	subs	x15,x15,x24
1548	lsr	x9,x24,#32
1549	sbcs	x16,x16,x8
1550	sbc	x17,x17,x9		// can't borrow
1551	subs	xzr,x14,#1
1552	umulh	x9,x12,x11
1553	mul	x10,x13,x11
1554	umulh	x24,x13,x11
1555
1556	adcs	x10,x10,x9
1557	adc	x24,x24,xzr
1558
1559	adds	x14,x15,x10
1560	adcs	x15,x16,x24
1561	adcs	x16,x17,x11
1562	adc	x17,xzr,x11		// can't overflow
1563	lsl	x8,x11,#32
1564	subs	x15,x15,x11
1565	lsr	x9,x11,#32
1566	sbcs	x16,x16,x8
1567	sbc	x17,x17,x9		// can't borrow
1568	adds	x14,x14,x19	// accumulate upper half
1569	adcs	x15,x15,x20
1570	adcs	x16,x16,x1
1571	adcs	x17,x17,x3
1572	adc	x19,xzr,xzr
1573
1574	subs	x8,x14,x12		// ret -= modulus
1575	sbcs	x9,x15,x13
1576	sbcs	x10,x16,x21
1577	sbcs	x11,x17,x22
1578	sbcs	xzr,x19,xzr
1579
1580	csel	x4,x14,x8,lo	// ret = borrow ? ret : ret-modulus
1581	csel	x5,x15,x9,lo
1582	csel	x6,x16,x10,lo
1583	csel	x7,x17,x11,lo
1584
1585	cbnz	x2,.Loop_ord_sqr
1586
1587	stp	x4,x5,[x0]
1588	stp	x6,x7,[x0,#16]
1589
1590	ldp	x19,x20,[sp,#16]
1591	ldp	x21,x22,[sp,#32]
1592	ldp	x23,x24,[sp,#48]
1593	ldr	x29,[sp],#64
1594	ret
1595.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
1596////////////////////////////////////////////////////////////////////////
1597// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
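// The routine below is a constant-time table lookup: every one of the 16
// entries is read and the requested one is masked into the accumulator, so the
// memory access pattern does not depend on index.  A functionally equivalent
// C sketch (illustrative only; the reference name select_w5_ref is made up,
// entries are 3*256 bits = 12 words, index is 1-based, index 0 yields zeros):
//
//	#include <stdint.h>
//
//	void select_w5_ref(uint64_t val[12], const uint64_t in_t[16*12], int index) {
//		for (int i = 0; i < 12; i++) val[i] = 0;
//		for (int j = 0; j < 16; j++) {
//			// all-ones mask iff entry j+1 is the requested one
//			uint64_t mask = (uint64_t)0 - (uint64_t)(j + 1 == index);
//			for (int i = 0; i < 12; i++) val[i] |= in_t[j*12 + i] & mask;
//		}
//	}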
1598.globl	ecp_nistz256_select_w5
1599.hidden	ecp_nistz256_select_w5
1600.type	ecp_nistz256_select_w5,%function
1601.align	4
1602ecp_nistz256_select_w5:
1603	AARCH64_VALID_CALL_TARGET
1604
1605    // x10 := x0
1606    // w9 := 0; loop counter and incremented internal index
1607	mov	x10, x0
1608	mov	w9, #0
1609
1610    // [v16-v21] := 0
1611	movi	v16.16b, #0
1612	movi	v17.16b, #0
1613	movi	v18.16b, #0
1614	movi	v19.16b, #0
1615	movi	v20.16b, #0
1616	movi	v21.16b, #0
1617
1618.Lselect_w5_loop:
1619    // Loop 16 times.
1620
1621    // Increment index (loop counter); tested at the end of the loop
1622	add	w9, w9, #1
1623
1624    // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
1625    //  and advance x1 to point to the next entry
1626	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
1627
1628    // x11 := (w9 == w2)? All 1s : All 0s
1629	cmp	w9, w2
1630	csetm	x11, eq
1631
1632    // continue loading ...
1633	ld1	{v26.2d, v27.2d}, [x1],#32
1634
1635    // duplicate mask_64 into Mask (all 0s or all 1s)
1636	dup	v3.2d, x11
1637
1638    // [v16-v21] := (Mask == all 1s)? [v22-v27] : [v16-v21]
1639    // i.e., values in output registers will remain the same if w9 != w2
1640	bit	v16.16b, v22.16b, v3.16b
1641	bit	v17.16b, v23.16b, v3.16b
1642
1643	bit	v18.16b, v24.16b, v3.16b
1644	bit	v19.16b, v25.16b, v3.16b
1645
1646	bit	v20.16b, v26.16b, v3.16b
1647	bit	v21.16b, v27.16b, v3.16b
1648
1649    // If bit #4 is 0 (i.e. idx_ctr < 16) loop back
1650	tbz	w9, #4, .Lselect_w5_loop
1651
1652    // Write [v16-v21] to memory at the output pointer
1653	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
1654	st1	{v20.2d, v21.2d}, [x10]
1655
1656	ret
1657.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
1658
1659
1660////////////////////////////////////////////////////////////////////////
1661// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
1662.globl	ecp_nistz256_select_w7
1663.hidden	ecp_nistz256_select_w7
1664.type	ecp_nistz256_select_w7,%function
1665.align	4
1666ecp_nistz256_select_w7:
1667	AARCH64_VALID_CALL_TARGET
1668
1669    // w9 := 0; loop counter and incremented internal index
1670	mov	w9, #0
1671
1672    // [v16-v19] := 0
1673	movi	v16.16b, #0
1674	movi	v17.16b, #0
1675	movi	v18.16b, #0
1676	movi	v19.16b, #0
1677
1678.Lselect_w7_loop:
1679    // Loop 64 times.
1680
1681    // Increment index (loop counter); tested at the end of the loop
1682	add	w9, w9, #1
1683
1684    // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
1685    //  and advance x1 to point to the next entry
1686	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64
1687
1688    // x11 := (w9 == w2)? All 1s : All 0s
1689	cmp	w9, w2
1690	csetm	x11, eq
1691
1692    // duplicate mask_64 into Mask (all 0s or all 1s)
1693	dup	v3.2d, x11
1694
1695    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
1696    // i.e., values in output registers will remain the same if w9 != w2
1697	bit	v16.16b, v22.16b, v3.16b
1698	bit	v17.16b, v23.16b, v3.16b
1699
1700	bit	v18.16b, v24.16b, v3.16b
1701	bit	v19.16b, v25.16b, v3.16b
1702
1703    // If bit #6 is 0 (i.e. idx_ctr < 64) loop back
1704	tbz	w9, #6, .Lselect_w7_loop
1705
1706    // Write [v16-v19] to memory at the output pointer
1707	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x0]
1708
1709	ret
1710.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
1711#endif
1712#endif  // !OPENSSL_NO_ASM
1713.section	.note.GNU-stack,"",%progbits
1714