// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include "openssl/arm_arch.h"

.text
.align	5
Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
LRR:	//	2^512 mod P precomputed for NIST P256 prime
.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
Lone:
.quad	1,0,0,0
Lord:
.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
LordK:
.quad	0xccd1c8aaee00bc4f
.byte	69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

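// For reference: Lpoly is the NIST P-256 prime
// p = 2^256 - 2^224 + 2^192 + 2^96 - 1 (little-endian 64-bit words), LRR is
// 2^512 mod p, used to map inputs into the Montgomery domain, Lone_mont is
// 1 in Montgomery form (2^256 mod p), Lord is the group order n, and LordK
// is -n^-1 mod 2^64, the per-word Montgomery constant for the ord_*
// routines below.
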
// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont

.def ecp_nistz256_mul_mont
   .type 32
.endef
.align	4
ecp_nistz256_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	x3,[x2]		// bp[0]
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]
	ldr	x12,Lpoly+8
	ldr	x13,Lpoly+24

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont

.def ecp_nistz256_sqr_mont
   .type 32
.endef
.align	4
ecp_nistz256_sqr_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]
	ldr	x12,Lpoly+8
	ldr	x13,Lpoly+24

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_div_by_2

.def ecp_nistz256_div_by_2
   .type 32
.endef
.align	4
ecp_nistz256_div_by_2:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]
	ldr	x12,Lpoly+8
	ldr	x13,Lpoly+24

	bl	__ecp_nistz256_div_by_2

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_2

.def ecp_nistz256_mul_by_2
   .type 32
.endef
.align	4
ecp_nistz256_mul_by_2:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]
	ldr	x12,Lpoly+8
	ldr	x13,Lpoly+24
	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17

	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_3

.def ecp_nistz256_mul_by_3
   .type 32
.endef
.align	4
ecp_nistz256_mul_by_3:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]
	ldr	x12,Lpoly+8
	ldr	x13,Lpoly+24
	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	mov	x4,x14
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17

	bl	__ecp_nistz256_add_to	// ret = a+a	// 2*a

	mov	x8,x4
	mov	x9,x5
	mov	x10,x6
	mov	x11,x7

	bl	__ecp_nistz256_add_to	// ret += a	// 2*a+a=3*a

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
//				        const BN_ULONG x2[4]);
.globl	ecp_nistz256_sub

.def ecp_nistz256_sub
   .type 32
.endef
.align	4
ecp_nistz256_sub:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ldp	x14,x15,[x1]
	ldp	x16,x17,[x1,#16]
	ldr	x12,Lpoly+8
	ldr	x13,Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg

.def ecp_nistz256_neg
   .type 32
.endef
.align	4
ecp_nistz256_neg:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x2,x1
	mov	x14,xzr		// a = 0
	mov	x15,xzr
	mov	x16,xzr
	mov	x17,xzr
	ldr	x12,Lpoly+8
	ldr	x13,Lpoly+24

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to x4-x7 and b[0] - to x3
.def __ecp_nistz256_mul_mont
   .type 32
.endef
.align	4
__ecp_nistz256_mul_mont:
	mul	x14,x4,x3		// a[0]*b[0]
	umulh	x8,x4,x3

	mul	x15,x5,x3		// a[1]*b[0]
	umulh	x9,x5,x3

	mul	x16,x6,x3		// a[2]*b[0]
	umulh	x10,x6,x3

	mul	x17,x7,x3		// a[3]*b[0]
	umulh	x11,x7,x3
	ldr	x3,[x2,#8]		// b[1]

	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adc	x19,xzr,x11
	mov	x20,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(1+1)]	// b[1+1]
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	ldr	x3,[x2,#8*(2+1)]	// b[2+1]
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	mul	x8,x4,x3		// lo(a[0]*b[i])
	adcs	x15,x16,x9
	mul	x9,x5,x3		// lo(a[1]*b[i])
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	mul	x10,x6,x3		// lo(a[2]*b[i])
	adcs	x17,x19,x11
	mul	x11,x7,x3		// lo(a[3]*b[i])
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts of multiplication
	umulh	x8,x4,x3		// hi(a[0]*b[i])
	adcs	x15,x15,x9
	umulh	x9,x5,x3		// hi(a[1]*b[i])
	adcs	x16,x16,x10
	umulh	x10,x6,x3		// hi(a[2]*b[i])
	adcs	x17,x17,x11
	umulh	x11,x7,x3		// hi(a[3]*b[i])
	adc	x19,x19,xzr
	adds	x15,x15,x8		// accumulate high parts of multiplication
	lsl	x8,x14,#32
	adcs	x16,x16,x9
	lsr	x9,x14,#32
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	// last reduction
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	adcs	x17,x19,x11
	adc	x19,x20,xzr

	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x19,xzr		// did it borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret

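// An illustrative C reference model of the routine above (a sketch, not
// BoringSSL code): word-by-word Montgomery multiplication modulo
// p = 2^256 - 2^224 + 2^192 + 2^96 - 1.  Because p == -1 (mod 2^64), the
// REDC multiplier of each step is simply acc[0]; because p is sparse, the
// assembly replaces the m*p product with shifts (the "*0xffff0001" lines),
// and the final csel block is the conditional subtraction at the end here.
//
//	#include <stdint.h>
//	typedef unsigned __int128 u128;
//	static const uint64_t P[4] = {0xffffffffffffffffu, 0x00000000ffffffffu,
//	                              0u, 0xffffffff00000001u};
//	// r = a*b*2^-256 mod p, for a,b < p (Montgomery form in and out).
//	static void mont_mul_p256(uint64_t r[4], const uint64_t a[4],
//	                          const uint64_t b[4]) {
//	  uint64_t acc[5] = {0, 0, 0, 0, 0};
//	  for (int i = 0; i < 4; i++) {
//	    u128 c = 0;
//	    for (int j = 0; j < 4; j++) {	// acc += a * b[i]
//	      c += (u128)a[j] * b[i] + acc[j];
//	      acc[j] = (uint64_t)c;
//	      c >>= 64;
//	    }
//	    acc[4] += (uint64_t)c;		// in range: acc stays below 2^320
//	    uint64_t m = acc[0];		// m = acc[0]*(-p^-1 mod 2^64) = acc[0]
//	    c = ((u128)m * P[0] + acc[0]) >> 64;	// low word cancels to zero
//	    for (int j = 1; j < 4; j++) {	// acc = (acc + m*p) / 2^64
//	      c += (u128)m * P[j] + acc[j];
//	      acc[j - 1] = (uint64_t)c;
//	      c >>= 64;
//	    }
//	    c += acc[4];
//	    acc[3] = (uint64_t)c;
//	    acc[4] = (uint64_t)(c >> 64);
//	  }
//	  uint64_t t[4], bw = 0;		// conditional subtraction of p
//	  for (int j = 0; j < 4; j++) {
//	    u128 d = (u128)acc[j] - P[j] - bw;
//	    t[j] = (uint64_t)d;
//	    bw = (uint64_t)(d >> 64) & 1;
//	  }
//	  for (int j = 0; j < 4; j++)	// borrow past the top => acc < p
//	    r[j] = (acc[4] < bw) ? acc[j] : t[j];
//	}
//
// The C model selects with a (non-constant-time) ternary for clarity; the
// assembly uses csel so the choice never branches on secret data.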

// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to x4-x7
.def __ecp_nistz256_sqr_mont
   .type 32
.endef
.align	4
__ecp_nistz256_sqr_mont:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is acc[x]
	//
	//  The "can't overflow" annotations below mark carries into the
	//  high part of a multiplication result, which cannot overflow
	//  because that high part can never be all ones.

	mul	x15,x5,x4		// a[1]*a[0]
	umulh	x9,x5,x4
	mul	x16,x6,x4		// a[2]*a[0]
	umulh	x10,x6,x4
	mul	x17,x7,x4		// a[3]*a[0]
	umulh	x19,x7,x4

	adds	x16,x16,x9		// accumulate high parts of multiplication
	mul	x8,x6,x5		// a[2]*a[1]
	umulh	x9,x6,x5
	adcs	x17,x17,x10
	mul	x10,x7,x5		// a[3]*a[1]
	umulh	x11,x7,x5
	adc	x19,x19,xzr		// can't overflow

	mul	x20,x7,x6		// a[3]*a[2]
	umulh	x1,x7,x6

	adds	x9,x9,x10		// accumulate high parts of multiplication
	mul	x14,x4,x4		// a[0]*a[0]
	adc	x10,x11,xzr		// can't overflow

	adds	x17,x17,x8		// accumulate low parts of multiplication
	umulh	x4,x4,x4
	adcs	x19,x19,x9
	mul	x9,x5,x5		// a[1]*a[1]
	adcs	x20,x20,x10
	umulh	x5,x5,x5
	adc	x1,x1,xzr		// can't overflow

	adds	x15,x15,x15	// acc[1-6]*=2
	mul	x10,x6,x6		// a[2]*a[2]
	adcs	x16,x16,x16
	umulh	x6,x6,x6
	adcs	x17,x17,x17
	mul	x11,x7,x7		// a[3]*a[3]
	adcs	x19,x19,x19
	umulh	x7,x7,x7
	adcs	x20,x20,x20
	adcs	x1,x1,x1
	adc	x2,xzr,xzr

	adds	x15,x15,x4		// +a[i]*a[i]
	adcs	x16,x16,x9
	adcs	x17,x17,x5
	adcs	x19,x19,x10
	adcs	x20,x20,x6
	lsl	x8,x14,#32
	adcs	x1,x1,x11
	lsr	x9,x14,#32
	adc	x2,x2,x7
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	lsl	x8,x14,#32
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	lsr	x9,x14,#32
	adc	x17,x11,xzr		// can't overflow
	subs	x10,x14,x8		// "*0xffff0001"
	sbc	x11,x14,x9
	adds	x14,x15,x8		// +=acc[0]<<96 and omit acc[0]
	adcs	x15,x16,x9
	adcs	x16,x17,x10		// +=acc[0]*0xffff0001
	adc	x17,x11,xzr		// can't overflow

	adds	x14,x14,x19	// accumulate upper half
	adcs	x15,x15,x20
	adcs	x16,x16,x1
	adcs	x17,x17,x2
	adc	x19,xzr,xzr

	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x19,xzr		// did it borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret

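// The squaring above follows the diagram: the six cross products a[i]*a[j]
// (i < j) are computed once, doubled (the "acc[1-6]*=2" block), and the
// four squares a[i]^2 are added, before four of the same "*0xffff0001"
// reduction steps as in __ecp_nistz256_mul_mont.  A C sketch of the
// pre-reduction 512-bit square, reusing u128 from the sketch above
// (illustrative only):
//
//	static void sqr_512(uint64_t r[8], const uint64_t a[4]) {
//	  uint64_t t[8] = {0, 0, 0, 0, 0, 0, 0, 0};
//	  u128 c;
//	  for (int i = 0; i < 4; i++) {	// cross products a[i]*a[j], i < j
//	    c = 0;
//	    for (int j = i + 1; j < 4; j++) {
//	      c += (u128)a[i] * a[j] + t[i + j];
//	      t[i + j] = (uint64_t)c;
//	      c >>= 64;
//	    }
//	    t[i + 4] = (uint64_t)c;
//	  }
//	  uint64_t top = 0;			// double them: t <<= 1
//	  for (int k = 0; k < 8; k++) {
//	    uint64_t v = t[k];
//	    t[k] = (v << 1) | top;
//	    top = v >> 63;
//	  }
//	  c = 0;				// add the squares a[i]*a[i]
//	  for (int i = 0; i < 4; i++) {
//	    u128 s = (u128)a[i] * a[i];
//	    c += (u128)t[2 * i] + (uint64_t)s;
//	    r[2 * i] = (uint64_t)c;
//	    c >>= 64;
//	    c += (u128)t[2 * i + 1] + (uint64_t)(s >> 64);
//	    r[2 * i + 1] = (uint64_t)c;
//	    c >>= 64;
//	  }
//	}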

// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
// x14-x17 and x8-x11. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.def __ecp_nistz256_add_to
   .type 32
.endef
.align	4
__ecp_nistz256_add_to:
	adds	x14,x14,x8		// ret = a+b
	adcs	x15,x15,x9
	adcs	x16,x16,x10
	adcs	x17,x17,x11
	adc	x1,xzr,xzr		// zap x1

	adds	x8,x14,#1		// subs	x8,x14,#-1 // tmp = ret-modulus
	sbcs	x9,x15,x12
	sbcs	x10,x16,xzr
	sbcs	x11,x17,x13
	sbcs	xzr,x1,xzr		// did subtraction borrow?

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ret

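// A C sketch of the operation above (illustrative, reusing P and u128 from
// the mont_mul sketch): add, keep the carry, then conditionally subtract p.
// Note the first subtraction step in the assembly: since the low word of p
// is all ones, subtracting it is issued as "adds x8,x14,#1", which yields
// the same result and the same carry flag.
//
//	static void add_mod_p256(uint64_t r[4], const uint64_t a[4],
//	                         const uint64_t b[4]) {
//	  uint64_t s[4], t[4], bw = 0;
//	  u128 c = 0;
//	  for (int i = 0; i < 4; i++) {	// s = a + b, carry kept in c
//	    c += (u128)a[i] + b[i];
//	    s[i] = (uint64_t)c;
//	    c >>= 64;
//	  }
//	  uint64_t carry = (uint64_t)c;
//	  for (int i = 0; i < 4; i++) {	// t = s - p, borrow kept in bw
//	    u128 d = (u128)s[i] - P[i] - bw;
//	    t[i] = (uint64_t)d;
//	    bw = (uint64_t)(d >> 64) & 1;
//	  }
//	  for (int i = 0; i < 4; i++)	// keep the sum only if it is < p
//	    r[i] = (carry < bw) ? s[i] : t[i];
//	}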

.def __ecp_nistz256_sub_from
   .type 32
.endef
.align	4
__ecp_nistz256_sub_from:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]
	subs	x14,x14,x8		// ret = a-b
	sbcs	x15,x15,x9
	sbcs	x16,x16,x10
	sbcs	x17,x17,x11
	sbc	x1,xzr,xzr		// zap x1

	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = ret+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adc	x11,x17,x13
	cmp	x1,xzr			// did subtraction borrow?

	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]

	ret


.def __ecp_nistz256_sub_morf
   .type 32
.endef
.align	4
__ecp_nistz256_sub_morf:
	ldp	x8,x9,[x2]
	ldp	x10,x11,[x2,#16]
	subs	x14,x8,x14		// ret = b-a
	sbcs	x15,x9,x15
	sbcs	x16,x10,x16
	sbcs	x17,x11,x17
	sbc	x1,xzr,xzr		// zap x1

	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = ret+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adc	x11,x17,x13
	cmp	x1,xzr			// did subtraction borrow?

	csel	x14,x14,x8,eq	// ret = borrow ? ret+modulus : ret
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	stp	x14,x15,[x0]
	csel	x17,x17,x11,eq
	stp	x16,x17,[x0,#16]

	ret

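// Both subtraction helpers share one tail: subtract, then add p back if the
// subtraction borrowed.  __ecp_nistz256_sub_morf computes b-a instead of
// a-b so callers can use whichever operand already sits in x14-x17.  A C
// sketch of __ecp_nistz256_sub_from (illustrative, reusing P and u128):
//
//	static void sub_mod_p256(uint64_t r[4], const uint64_t a[4],
//	                         const uint64_t b[4]) {	// r = a - b mod p
//	  uint64_t d[4], t[4], bw = 0;
//	  for (int i = 0; i < 4; i++) {
//	    u128 x = (u128)a[i] - b[i] - bw;
//	    d[i] = (uint64_t)x;
//	    bw = (uint64_t)(x >> 64) & 1;
//	  }
//	  u128 c = 0;			// t = d + p, wanted only on borrow
//	  for (int i = 0; i < 4; i++) {
//	    c += (u128)d[i] + P[i];
//	    t[i] = (uint64_t)c;
//	    c >>= 64;
//	  }
//	  for (int i = 0; i < 4; i++)
//	    r[i] = bw ? t[i] : d[i];
//	}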

.def __ecp_nistz256_div_by_2
   .type 32
.endef
.align	4
__ecp_nistz256_div_by_2:
	subs	x8,x14,#1		// adds	x8,x14,#-1 // tmp = a+modulus
	adcs	x9,x15,x12
	adcs	x10,x16,xzr
	adcs	x11,x17,x13
	adc	x1,xzr,xzr		// zap x1
	tst	x14,#1		// is a even?

	csel	x14,x14,x8,eq	// ret = even ? a : a+modulus
	csel	x15,x15,x9,eq
	csel	x16,x16,x10,eq
	csel	x17,x17,x11,eq
	csel	x1,xzr,x1,eq

	lsr	x14,x14,#1		// ret >>= 1
	orr	x14,x14,x15,lsl#63
	lsr	x15,x15,#1
	orr	x15,x15,x16,lsl#63
	lsr	x16,x16,#1
	orr	x16,x16,x17,lsl#63
	lsr	x17,x17,#1
	stp	x14,x15,[x0]
	orr	x17,x17,x1,lsl#63
	stp	x16,x17,[x0,#16]

	ret

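// Halving mod p: if a is even the result is a/2; if a is odd it is (a+p)/2,
// which is exact because a+p is then even.  The 257th bit of a+p is kept in
// x1 above and shifted into bit 63 of the top word.  A C sketch
// (illustrative, reusing P and u128):
//
//	static void div_by_2_mod_p256(uint64_t r[4], const uint64_t a[4]) {
//	  uint64_t t[5];
//	  u128 c = 0;
//	  for (int i = 0; i < 4; i++) {	// t = a + p (a 257-bit value)
//	    c += (u128)a[i] + P[i];
//	    t[i] = (uint64_t)c;
//	    c >>= 64;
//	  }
//	  t[4] = (uint64_t)c;
//	  if (!(a[0] & 1)) {			// even: halve a itself
//	    for (int i = 0; i < 4; i++) t[i] = a[i];
//	    t[4] = 0;
//	  }
//	  for (int i = 0; i < 4; i++)	// shift right one bit across words
//	    r[i] = (t[i] >> 1) | (t[i + 1] << 63);
//	}
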
.globl	ecp_nistz256_point_double

.def ecp_nistz256_point_double
   .type 32
.endef
.align	5
ecp_nistz256_point_double:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

Ldouble_shortcut:
	ldp	x14,x15,[x1,#32]
	mov	x21,x0
	ldp	x16,x17,[x1,#48]
	mov	x22,x1
	ldr	x12,Lpoly+8
	mov	x8,x14
	ldr	x13,Lpoly+24
	mov	x9,x15
	ldp	x4,x5,[x22,#64]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[x22,#64+16]
	add	x0,sp,#0
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(S, in_y);

	add	x0,sp,#64
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	x8,x9,[x22]
	ldp	x10,x11,[x22,#16]
	mov	x4,x14		// put Zsqr aside for p256_sub
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__ecp_nistz256_add_to	// p256_add(M, Zsqr, in_x);

	add	x2,x22,#0
	mov	x14,x4		// restore Zsqr
	mov	x15,x5
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x16,x6
	mov	x17,x7
	ldp	x6,x7,[sp,#0+16]
	add	x0,sp,#64
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	x0,sp,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	x3,[x22,#32]
	ldp	x4,x5,[x22,#64]
	ldp	x6,x7,[x22,#64+16]
	add	x2,x22,#32
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#0]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#0+16]
	add	x0,x21,#64
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(res_z, tmp0);

	add	x0,sp,#96
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	ldr	x3,[sp,#64]		// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x0,x21,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	x2,sp,#64
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	x8,x14		// duplicate M
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	mov	x4,x14		// put M aside
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x0,sp,#32
	bl	__ecp_nistz256_add_to
	mov	x8,x4			// restore M
	mov	x9,x5
	ldr	x3,[x22]		// forward load for p256_mul_mont
	mov	x10,x6
	ldp	x4,x5,[sp,#0]
	mov	x11,x7
	ldp	x6,x7,[sp,#0+16]
	bl	__ecp_nistz256_add_to	// p256_mul_by_3(M, M);

	add	x2,x22,#0
	add	x0,sp,#0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	x8,x14
	mov	x9,x15
	ldp	x4,x5,[sp,#32]	// forward load for p256_sqr_mont
	mov	x10,x16
	mov	x11,x17
	ldp	x6,x7,[sp,#32+16]
	add	x0,sp,#96
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(tmp0, S);

	add	x0,x21,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	x2,sp,#96
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	x2,sp,#0
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	x3,[sp,#32]
	mov	x4,x14		// copy S
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	add	x2,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	x2,x21,#32
	add	x0,x21,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

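// For reference, ecp_nistz256_point_double above computes the standard
// Jacobian doubling (all values in Montgomery form mod p):
//
//	S     = (2*Y1)^2			// 4*Y1^2
//	Zsqr  = Z1^2
//	M     = 3*(X1 + Zsqr)*(X1 - Zsqr)	// 3*(X1^2 - Z1^4)
//	res_z = 2*Y1*Z1
//	res_y'= (S^2)/2				// 8*Y1^4
//	S     = S*X1				// 4*X1*Y1^2
//	res_x = M^2 - 2*S
//	res_y = M*(S - res_x) - res_y'
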
.globl	ecp_nistz256_point_add

.def ecp_nistz256_point_add
   .type 32
.endef
.align	5
ecp_nistz256_point_add:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#32*12

	ldp	x4,x5,[x2,#64]	// in2_z
	ldp	x6,x7,[x2,#64+16]
	mov	x21,x0
	mov	x22,x1
	mov	x23,x2
	ldr	x12,Lpoly+8
	ldr	x13,Lpoly+24
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x25,x8,x10
	cmp	x25,#0
	csetm	x25,ne		// ~in2infty
	add	x0,sp,#192
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);

	ldp	x4,x5,[x22,#64]	// in1_z
	ldp	x6,x7,[x22,#64+16]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x24,x8,x10
	cmp	x24,#0
	csetm	x24,ne		// ~in1infty
	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	ldr	x3,[x23,#64]
	ldp	x4,x5,[sp,#192]
	ldp	x6,x7,[sp,#192+16]
	add	x2,x23,#64
	add	x0,sp,#320
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,x22,#64
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	x3,[x22,#32]
	ldp	x4,x5,[sp,#320]
	ldp	x6,x7,[sp,#320+16]
	add	x2,x22,#32
	add	x0,sp,#320
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);

	ldr	x3,[x23,#32]
	ldp	x4,x5,[sp,#352]
	ldp	x6,x7,[sp,#352+16]
	add	x2,x23,#32
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	x2,sp,#320
	ldr	x3,[sp,#192]	// forward load for p256_mul_mont
	ldp	x4,x5,[x22]
	ldp	x6,x7,[x22,#16]
	add	x0,sp,#160
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);

	orr	x14,x14,x15	// see if result is zero
	orr	x16,x16,x17
	orr	x26,x14,x16	// ~is_equal(S1,S2)

	add	x2,sp,#192
	add	x0,sp,#256
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);

	ldr	x3,[sp,#128]
	ldp	x4,x5,[x23]
	ldp	x6,x7,[x23,#16]
	add	x2,sp,#128
	add	x0,sp,#288
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);

	add	x2,sp,#256
	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
	ldp	x6,x7,[sp,#160+16]
	add	x0,sp,#96
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);

	orr	x14,x14,x15	// see if result is zero
	orr	x16,x16,x17
	orr	x14,x14,x16	// ~is_equal(U1,U2)

	mvn	x27,x24	// -1/0 -> 0/-1
	mvn	x28,x25	// -1/0 -> 0/-1
	orr	x14,x14,x27
	orr	x14,x14,x28
	orr	x14,x14,x26
	cbnz	x14,Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

Ladd_double:
	mov	x1,x22
	mov	x0,x21
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	add	sp,sp,#256	// #256 is from #32*(12-4). difference in stack frames
	b	Ldouble_shortcut

.align	4
Ladd_proceed:
	add	x0,sp,#192
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#96]
	ldp	x6,x7,[sp,#96+16]
	add	x2,x22,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldp	x4,x5,[sp,#96]
	ldp	x6,x7,[sp,#96+16]
	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldr	x3,[x23,#64]
	ldp	x4,x5,[sp,#64]
	ldp	x6,x7,[sp,#64+16]
	add	x2,x23,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);

	ldr	x3,[sp,#96]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,sp,#96
	add	x0,sp,#224
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	x3,[sp,#128]
	ldp	x4,x5,[sp,#256]
	ldp	x6,x7,[sp,#256+16]
	add	x2,sp,#128
	add	x0,sp,#288
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);

	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	add	x0,sp,#128
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);

	add	x2,sp,#192
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	x2,sp,#224
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	x2,sp,#288
	ldr	x3,[sp,#224]		// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#320]
	ldp	x6,x7,[sp,#320+16]
	add	x0,sp,#32
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	x2,sp,#224
	add	x0,sp,#352
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);

	ldr	x3,[sp,#160]
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x2,sp,#160
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	x2,sp,#352
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	x4,x5,[sp,#0]		// res
	ldp	x6,x7,[sp,#0+16]
	ldp	x8,x9,[x23]		// in2
	ldp	x10,x11,[x23,#16]
	ldp	x14,x15,[x22,#0]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#0+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+0+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+0+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#0+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#0+48]
	stp	x14,x15,[x21,#0]
	stp	x16,x17,[x21,#0+16]
	ldp	x14,x15,[x22,#32]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#32+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+32+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+32+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#32+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#32+48]
	stp	x14,x15,[x21,#32]
	stp	x16,x17,[x21,#32+16]
	ldp	x14,x15,[x22,#64]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#64+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	stp	x14,x15,[x21,#64]
	stp	x16,x17,[x21,#64+16]

Ladd_done:
	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

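// For reference, ecp_nistz256_point_add above computes the standard
// Jacobian addition: with Z1sqr = Z1^2 and Z2sqr = Z2^2,
//
//	U1 = X1*Z2sqr			S1 = Y1*Z2*Z2sqr
//	U2 = X2*Z1sqr			S2 = Y2*Z1*Z1sqr
//	H  = U2 - U1			R  = S2 - S1
//	res_x = R^2 - H^3 - 2*U1*H^2
//	res_y = R*(U1*H^2 - res_x) - S1*H^3
//	res_z = Z1*Z2*H
//
// When H and R are both zero and neither input is at infinity, the inputs
// represent the same point and the code branches to Ldouble_shortcut; the
// csel block before Ladd_done instead copies the other operand through
// whenever one input is the point at infinity.
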
.globl	ecp_nistz256_point_add_affine

.def ecp_nistz256_point_add_affine
   .type 32
.endef
.align	5
ecp_nistz256_point_add_affine:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	x21,x0
	mov	x22,x1
	mov	x23,x2
	ldr	x12,Lpoly+8
	ldr	x13,Lpoly+24

	ldp	x4,x5,[x1,#64]	// in1_z
	ldp	x6,x7,[x1,#64+16]
	orr	x8,x4,x5
	orr	x10,x6,x7
	orr	x24,x8,x10
	cmp	x24,#0
	csetm	x24,ne		// ~in1infty

	ldp	x14,x15,[x2]	// in2_x
	ldp	x16,x17,[x2,#16]
	ldp	x8,x9,[x2,#32]	// in2_y
	ldp	x10,x11,[x2,#48]
	orr	x14,x14,x15
	orr	x16,x16,x17
	orr	x8,x8,x9
	orr	x10,x10,x11
	orr	x14,x14,x16
	orr	x8,x8,x10
	orr	x25,x14,x8
	cmp	x25,#0
	csetm	x25,ne		// ~in2infty

	add	x0,sp,#128
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	x4,x14
	mov	x5,x15
	mov	x6,x16
	mov	x7,x17
	ldr	x3,[x23]
	add	x2,x23,#0
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	x2,x22,#0
	ldr	x3,[x22,#64]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x0,sp,#160
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	x2,x22,#64
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	x3,[x22,#64]
	ldp	x4,x5,[sp,#160]
	ldp	x6,x7,[sp,#160+16]
	add	x2,x22,#64
	add	x0,sp,#64
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	x3,[x23,#32]
	ldp	x4,x5,[sp,#128]
	ldp	x6,x7,[sp,#128+16]
	add	x2,x23,#32
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	x2,x22,#32
	ldp	x4,x5,[sp,#160]	// forward load for p256_sqr_mont
	ldp	x6,x7,[sp,#160+16]
	add	x0,sp,#192
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	x0,sp,#224
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	x4,x5,[sp,#192]
	ldp	x6,x7,[sp,#192+16]
	add	x0,sp,#288
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	x3,[sp,#160]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,sp,#160
	add	x0,sp,#256
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	x3,[x22]
	ldp	x4,x5,[sp,#224]
	ldp	x6,x7,[sp,#224+16]
	add	x2,x22,#0
	add	x0,sp,#96
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	x8,x14
	mov	x9,x15
	mov	x10,x16
	mov	x11,x17
	add	x0,sp,#224
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);

	add	x2,sp,#288
	add	x0,sp,#0
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	x2,sp,#256
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	x2,sp,#96
	ldr	x3,[x22,#32]	// forward load for p256_mul_mont
	ldp	x4,x5,[sp,#256]
	ldp	x6,x7,[sp,#256+16]
	add	x0,sp,#32
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	x2,x22,#32
	add	x0,sp,#128
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	x3,[sp,#192]
	ldp	x4,x5,[sp,#32]
	ldp	x6,x7,[sp,#32+16]
	add	x2,sp,#192
	add	x0,sp,#32
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	x2,sp,#128
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	x4,x5,[sp,#0]		// res
	ldp	x6,x7,[sp,#0+16]
	ldp	x8,x9,[x23]		// in2
	ldp	x10,x11,[x23,#16]
	ldp	x14,x15,[x22,#0]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#0+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+0+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+0+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#0+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#0+48]
	stp	x14,x15,[x21,#0]
	stp	x16,x17,[x21,#0+16]
	adr	x23,Lone_mont-64	// in2_z is implicitly one; repoint so the loads below fetch Lone_mont
	ldp	x14,x15,[x22,#32]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#32+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	ldp	x4,x5,[sp,#0+32+32]	// res
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	ldp	x6,x7,[sp,#0+32+48]
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	ldp	x8,x9,[x23,#32+32]	// in2
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	ldp	x10,x11,[x23,#32+48]
	stp	x14,x15,[x21,#32]
	stp	x16,x17,[x21,#32+16]
	ldp	x14,x15,[x22,#64]	// in1
	cmp	x24,#0			// ~, remember?
	ldp	x16,x17,[x22,#64+16]
	csel	x8,x4,x8,ne
	csel	x9,x5,x9,ne
	csel	x10,x6,x10,ne
	csel	x11,x7,x11,ne
	cmp	x25,#0			// ~, remember?
	csel	x14,x8,x14,ne
	csel	x15,x9,x15,ne
	csel	x16,x10,x16,ne
	csel	x17,x11,x17,ne
	stp	x14,x15,[x21,#64]
	stp	x16,x17,[x21,#64+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret

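// For reference, ecp_nistz256_point_add_affine is the same addition
// specialized to an affine second input (Z2 = 1): U1 = X1, S1 = Y1,
// U2 = X2*Z1sqr, S2 = Y2*Z1*Z1sqr, H = U2 - X1, R = S2 - Y1, and
// res_z = Z1*H, which removes all of the Z2 multiplications.  The
// "adr x23,Lone_mont-64" above substitutes Z = 1 in Montgomery form when
// in2 has to be copied through in the infinity cases.
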
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont

.def ecp_nistz256_ord_mul_mont
   .type 32
.endef
.align	4
ecp_nistz256_ord_mul_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	x23,Lord
	ldr	x3,[x2]		// bp[0]
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	ldp	x12,x13,[x23,#0]
	ldp	x21,x22,[x23,#16]
	ldr	x23,[x23,#32]

	mul	x14,x4,x3		// a[0]*b[0]
	umulh	x8,x4,x3

	mul	x15,x5,x3		// a[1]*b[0]
	umulh	x9,x5,x3

	mul	x16,x6,x3		// a[2]*b[0]
	umulh	x10,x6,x3

	mul	x17,x7,x3		// a[3]*b[0]
	umulh	x19,x7,x3

	mul	x24,x14,x23

	adds	x15,x15,x8		// accumulate high parts of multiplication
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adc	x19,x19,xzr
	mov	x20,xzr
	ldr	x3,[x2,#8*1]		// b[i]

	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	ldr	x3,[x2,#8*2]		// b[i]

	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	ldr	x3,[x2,#8*3]		// b[i]

	lsl	x8,x24,#32
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	mul	x8,x4,x3
	adc	x11,x11,xzr
	mul	x9,x5,x3

	adds	x14,x15,x10
	mul	x10,x6,x3
	adcs	x15,x16,x11
	mul	x11,x7,x3
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	adds	x14,x14,x8		// accumulate low parts
	umulh	x8,x4,x3
	adcs	x15,x15,x9
	umulh	x9,x5,x3
	adcs	x16,x16,x10
	umulh	x10,x6,x3
	adcs	x17,x17,x11
	umulh	x11,x7,x3
	adc	x19,x19,xzr
	mul	x24,x14,x23
	adds	x15,x15,x8		// accumulate high parts
	adcs	x16,x16,x9
	adcs	x17,x17,x10
	adcs	x19,x19,x11
	adc	x20,xzr,xzr
	lsl	x8,x24,#32		// last reduction
	subs	x16,x16,x24
	lsr	x9,x24,#32
	sbcs	x17,x17,x8
	sbcs	x19,x19,x9
	sbc	x20,x20,xzr

	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	adc	x11,x11,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x11
	adcs	x16,x17,x24
	adcs	x17,x19,x24
	adc	x19,x20,xzr

	subs	x8,x14,x12		// ret -= modulus
	sbcs	x9,x15,x13
	sbcs	x10,x16,x21
	sbcs	x11,x17,x22
	sbcs	xzr,x19,xzr

	csel	x14,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x15,x15,x9,lo
	csel	x16,x16,x10,lo
	stp	x14,x15,[x0]
	csel	x17,x17,x11,lo
	stp	x16,x17,[x0,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret

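// Unlike the field routines above, the group order n is not sparse, so the
// reduction in ecp_nistz256_ord_mul_mont uses the classic word-level
// Montgomery step with LordK = -n^-1 mod 2^64: m = acc[0]*LordK mod 2^64,
// then acc = (acc + m*n)/2^64, with n's special upper words
// (0xffffffffffffffff and 0xffffffff00000000) still letting shifts stand in
// for two of the four products.  An illustrative C sketch of one step
// (reusing u128; ecp_nistz256_ord_sqr_mont below runs the same reduction
// inside its rep loop):
//
//	static const uint64_t N[4] = {0xf3b9cac2fc632551u, 0xbce6faada7179e84u,
//	                              0xffffffffffffffffu, 0xffffffff00000000u};
//	static const uint64_t N0 = 0xccd1c8aaee00bc4fu;	// -n^-1 mod 2^64
//	// One Montgomery reduction step: acc = (acc + m*n) / 2^64.
//	static void ord_redc_step(uint64_t acc[5]) {
//	  uint64_t m = acc[0] * N0;
//	  u128 c = ((u128)m * N[0] + acc[0]) >> 64;	// low word cancels
//	  for (int j = 1; j < 4; j++) {
//	    c += (u128)m * N[j] + acc[j];
//	    acc[j - 1] = (uint64_t)c;
//	    c >>= 64;
//	  }
//	  c += acc[4];
//	  acc[3] = (uint64_t)c;
//	  acc[4] = (uint64_t)(c >> 64);
//	}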

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                int rep);
.globl	ecp_nistz256_ord_sqr_mont

.def ecp_nistz256_ord_sqr_mont
   .type 32
.endef
.align	4
ecp_nistz256_ord_sqr_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adr	x23,Lord
	ldp	x4,x5,[x1]
	ldp	x6,x7,[x1,#16]

	ldp	x12,x13,[x23,#0]
	ldp	x21,x22,[x23,#16]
	ldr	x23,[x23,#32]
	b	Loop_ord_sqr

.align	4
Loop_ord_sqr:
	sub	x2,x2,#1
	////////////////////////////////////////////////////////////////
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is acc[x]
	//
	//  The "can't overflow" annotations below mark carries into the
	//  high part of a multiplication result, which cannot overflow
	//  because that high part can never be all ones.

	mul	x15,x5,x4		// a[1]*a[0]
	umulh	x9,x5,x4
	mul	x16,x6,x4		// a[2]*a[0]
	umulh	x10,x6,x4
	mul	x17,x7,x4		// a[3]*a[0]
	umulh	x19,x7,x4

	adds	x16,x16,x9		// accumulate high parts of multiplication
	mul	x8,x6,x5		// a[2]*a[1]
	umulh	x9,x6,x5
	adcs	x17,x17,x10
	mul	x10,x7,x5		// a[3]*a[1]
	umulh	x11,x7,x5
	adc	x19,x19,xzr		// can't overflow

	mul	x20,x7,x6		// a[3]*a[2]
	umulh	x1,x7,x6

	adds	x9,x9,x10		// accumulate high parts of multiplication
	mul	x14,x4,x4		// a[0]*a[0]
	adc	x10,x11,xzr		// can't overflow

	adds	x17,x17,x8		// accumulate low parts of multiplication
	umulh	x4,x4,x4
	adcs	x19,x19,x9
	mul	x9,x5,x5		// a[1]*a[1]
	adcs	x20,x20,x10
	umulh	x5,x5,x5
	adc	x1,x1,xzr		// can't overflow

	adds	x15,x15,x15	// acc[1-6]*=2
	mul	x10,x6,x6		// a[2]*a[2]
	adcs	x16,x16,x16
	umulh	x6,x6,x6
	adcs	x17,x17,x17
	mul	x11,x7,x7		// a[3]*a[3]
	adcs	x19,x19,x19
	umulh	x7,x7,x7
	adcs	x20,x20,x20
	adcs	x1,x1,x1
	adc	x3,xzr,xzr

	adds	x15,x15,x4		// +a[i]*a[i]
	mul	x24,x14,x23
	adcs	x16,x16,x9
	adcs	x17,x17,x5
	adcs	x19,x19,x10
	adcs	x20,x20,x6
	adcs	x1,x1,x11
	adc	x3,x3,x7
	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	adc	x11,x11,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x11
	adcs	x16,x17,x24
	adc	x17,xzr,x24		// can't overflow
	mul	x11,x14,x23
	lsl	x8,x24,#32
	subs	x15,x15,x24
	lsr	x9,x24,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	subs	xzr,x14,#1
	umulh	x9,x12,x11
	mul	x10,x13,x11
	umulh	x24,x13,x11

	adcs	x10,x10,x9
	adc	x24,x24,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x24
	adcs	x16,x17,x11
	adc	x17,xzr,x11		// can't overflow
	mul	x24,x14,x23
	lsl	x8,x11,#32
	subs	x15,x15,x11
	lsr	x9,x11,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	subs	xzr,x14,#1
	umulh	x9,x12,x24
	mul	x10,x13,x24
	umulh	x11,x13,x24

	adcs	x10,x10,x9
	adc	x11,x11,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x11
	adcs	x16,x17,x24
	adc	x17,xzr,x24		// can't overflow
	mul	x11,x14,x23
	lsl	x8,x24,#32
	subs	x15,x15,x24
	lsr	x9,x24,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	subs	xzr,x14,#1
	umulh	x9,x12,x11
	mul	x10,x13,x11
	umulh	x24,x13,x11

	adcs	x10,x10,x9
	adc	x24,x24,xzr

	adds	x14,x15,x10
	adcs	x15,x16,x24
	adcs	x16,x17,x11
	adc	x17,xzr,x11		// can't overflow
	lsl	x8,x11,#32
	subs	x15,x15,x11
	lsr	x9,x11,#32
	sbcs	x16,x16,x8
	sbc	x17,x17,x9		// can't borrow
	adds	x14,x14,x19	// accumulate upper half
	adcs	x15,x15,x20
	adcs	x16,x16,x1
	adcs	x17,x17,x3
	adc	x19,xzr,xzr

	subs	x8,x14,x12		// ret -= modulus
	sbcs	x9,x15,x13
	sbcs	x10,x16,x21
	sbcs	x11,x17,x22
	sbcs	xzr,x19,xzr

	csel	x4,x14,x8,lo	// ret = borrow ? ret : ret-modulus
	csel	x5,x15,x9,lo
	csel	x6,x16,x10,lo
	csel	x7,x17,x11,lo

	cbnz	x2,Loop_ord_sqr

	stp	x4,x5,[x0]
	stp	x6,x7,[x0,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w5

.def ecp_nistz256_select_w5
   .type 32
.endef
.align	4
ecp_nistz256_select_w5:
	AARCH64_VALID_CALL_TARGET

    // x10 := x0
    // w9 := 0; loop counter and incremented internal index
	mov	x10, x0
	mov	w9, #0

    // [v16-v21] := 0
	movi	v16.16b, #0
	movi	v17.16b, #0
	movi	v18.16b, #0
	movi	v19.16b, #0
	movi	v20.16b, #0
	movi	v21.16b, #0

Lselect_w5_loop:
    // Loop 16 times.

    // Increment index (loop counter); tested at the end of the loop
	add	w9, w9, #1

    // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
    //  and advance x1 to point to the next entry
	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

    // x11 := (w9 == w2)? All 1s : All 0s
	cmp	w9, w2
	csetm	x11, eq

    // continue loading ...
	ld1	{v26.2d, v27.2d}, [x1],#32

    // duplicate mask_64 into Mask (all 0s or all 1s)
	dup	v3.2d, x11

    // [v16-v21] := (Mask == all 1s)? [v22-v27] : [v16-v21]
    // i.e., values in output registers will remain the same if w9 != w2
	bit	v16.16b, v22.16b, v3.16b
	bit	v17.16b, v23.16b, v3.16b

	bit	v18.16b, v24.16b, v3.16b
	bit	v19.16b, v25.16b, v3.16b

	bit	v20.16b, v26.16b, v3.16b
	bit	v21.16b, v27.16b, v3.16b

    // If bit #4 is 0 (i.e. idx_ctr < 16) loop back
	tbz	w9, #4, Lselect_w5_loop

    // Write [v16-v21] to memory at the output pointer
	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
	st1	{v20.2d, v21.2d}, [x10]

	ret

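// This routine and ecp_nistz256_select_w7 below implement a constant-time
// table lookup: every entry is read, and the csetm/dup/bit mask accumulates
// only the entry whose index matches, so the memory access pattern is
// independent of the secret index.  A scalar C model of the idea
// (illustrative only):
//
//	#include <stdint.h>
//	#include <string.h>
//	// Copy table entry idx (1-based; idx == 0 selects nothing) into out
//	// without an idx-dependent access pattern; width is in 64-bit words.
//	static void select_ct(uint64_t *out, const uint64_t *table,
//	                      unsigned n, unsigned idx, unsigned width) {
//	  memset(out, 0, width * sizeof(uint64_t));
//	  for (unsigned i = 1; i <= n; i++) {
//	    uint64_t mask = (uint64_t)0 - (uint64_t)(i == idx);
//	    for (unsigned w = 0; w < width; w++)
//	      out[w] |= table[(i - 1) * width + w] & mask;
//	  }
//	}
//
// ecp_nistz256_select_w5 uses n = 16 with width = 12 (a projective point);
// ecp_nistz256_select_w7 below uses n = 64 with width = 8 (an affine point).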


////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w7

.def ecp_nistz256_select_w7
   .type 32
.endef
.align	4
ecp_nistz256_select_w7:
	AARCH64_VALID_CALL_TARGET

    // w9 := 0; loop counter and incremented internal index
	mov	w9, #0

    // [v16-v19] := 0
	movi	v16.16b, #0
	movi	v17.16b, #0
	movi	v18.16b, #0
	movi	v19.16b, #0

Lselect_w7_loop:
    // Loop 64 times.

    // Increment index (loop counter); tested at the end of the loop
	add	w9, w9, #1

    // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
    //  and advance x1 to point to the next entry
	ld1	{v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

    // x11 := (w9 == w2)? All 1s : All 0s
	cmp	w9, w2
	csetm	x11, eq

    // duplicate mask_64 into Mask (all 0s or all 1s)
	dup	v3.2d, x11

    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
    // i.e., values in output registers will remain the same if w9 != w2
	bit	v16.16b, v22.16b, v3.16b
	bit	v17.16b, v23.16b, v3.16b

	bit	v18.16b, v24.16b, v3.16b
	bit	v19.16b, v25.16b, v3.16b

    // If bit #6 is 0 (i.e. idx_ctr < 64) loop back
	tbz	w9, #6, Lselect_w7_loop

    // Write [v16-v19] to memory at the output pointer
	st1	{v16.2d, v17.2d, v18.2d, v19.2d}, [x0]

	ret

#endif
#endif  // !OPENSSL_NO_ASM