#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv8.
#
# February 2015.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# Apple A7		+190-360%
# Cortex-A53		+190-400%
# Cortex-A57		+190-350%
# Denver		+230-400%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +400% means 5x improvement.

# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
  Two arguments are necessary: the flavour and the output file path."; }

$flavour = shift;
$output = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

{
my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) =
    map("x$_",(0..17,19,20));

my ($acc6,$acc7)=($ap,$bp);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
#include "ring-core/arm_arch.h"

.section .rodata
.align	5
.Lpoly:
.quad	0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.LRR:	// 2^512 mod P precomputed for NIST P256 polynomial
.quad	0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
.Lone_mont:
.quad	0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad	1,0,0,0
.Lord:
.quad	0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK:
.quad	0xccd1c8aaee00bc4f
.asciz	"ECP_NISTZ256 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.text

// void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//					     const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	adrp	$poly3,:pg_hi21:.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]

	bl	__ecp_nistz256_mul_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

// void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-32]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]

	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]
	adrp	$poly3,:pg_hi21:.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]

	bl	__ecp_nistz256_sqr_mont

	ldp	x19,x20,[sp,#16]
	ldp	x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

// void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	$bp,$ap
	mov	$acc0,xzr		// a = 0
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	adrp	$poly3,:pg_hi21:.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]

	bl	__ecp_nistz256_sub_from

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to $a0-$a3 and b[0] to $bi
.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,#8]		// b[1]

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	 lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	 lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adc	$acc4,xzr,$t3
	mov	$acc5,xzr
___
for($i=1;$i<4;$i++) {
        # Reduction iteration is normally performed by accumulating
        # result of multiplication of modulus by "magic" digit [and
        # omitting least significant word, which is guaranteed to
        # be 0], but thanks to special form of modulus and "magic"
        # digit being equal to least significant word, it can be
        # performed with additions and subtractions alone. Indeed:
        #
        #            ffff0001.00000000.0000ffff.ffffffff
        # *                                     abcdefgh
        # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
        #
        # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
        # rewrite above as:
        #
        #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
        # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
        # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
        #
        # or marking redundant operations:
        #
        #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
        # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
        # - 0000abcd.efgh0000.--------.--------.--------
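        #
        # Concretely, with $t0 = acc[0]<<32 and $t1 = acc[0]>>32, the
        # subs/sbc pair below computes $t3:$t2 = acc[0]*0xffffffff00000001
        # (= acc[0]*2^64 - acc[0]*2^32 + acc[0]), so one reduction step is
        #
        #   acc = (acc + acc[0]*modulus) / 2^64
        #       = (acc >> 64) + $t0 + $t1*2^64 + $t2*2^128 + $t3*2^192
        #
        # which is exactly the adds/adcs chain that follows.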
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	 mul	$t0,$a0,$bi		// lo(a[0]*b[i])
	adcs	$acc1,$acc2,$t1
	 mul	$t1,$a1,$bi		// lo(a[1]*b[i])
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	 mul	$t2,$a2,$bi		// lo(a[2]*b[i])
	adcs	$acc3,$acc4,$t3
	 mul	$t3,$a3,$bi		// lo(a[3]*b[i])
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts of multiplication
	 umulh	$t0,$a0,$bi		// hi(a[0]*b[i])
	adcs	$acc1,$acc1,$t1
	 umulh	$t1,$a1,$bi		// hi(a[1]*b[i])
	adcs	$acc2,$acc2,$t2
	 umulh	$t2,$a2,$bi		// hi(a[2]*b[i])
	adcs	$acc3,$acc3,$t3
	 umulh	$t3,$a3,$bi		// hi(a[3]*b[i])
	adc	$acc4,$acc4,xzr
___
$code.=<<___	if ($i<3);
	ldr	$bi,[$bp,#8*($i+1)]	// b[$i+1]
___
$code.=<<___;
	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	 lsl	$t0,$acc0,#32
	adcs	$acc2,$acc2,$t1
	 lsr	$t1,$acc0,#32
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	// last reduction
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adcs	$acc3,$acc4,$t3
	adc	$acc4,$acc5,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to $a0-$a3
.type	__ecp_nistz256_sqr_mont,%function
.align	4
__ecp_nistz256_sqr_mont:
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	//  The "can't overflow" notes below mark carries into the high part
	//  of a multiplication result, which cannot overflow because the
	//  high part can never be all ones.
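	//
	//  In formula form, for a = a3*2^192 + a2*2^128 + a1*2^64 + a0:
	//
	//    a^2 = sum_{i} a_i^2 * 2^(128*i)
	//        + 2 * sum_{i<j} a_i*a_j * 2^(64*(i+j))
	//
	//  so the cross products are accumulated once and doubled, then
	//  the squares a_i*a_i are added in, as below.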

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	 mul	$t0,$a2,$a1		// a[2]*a[1]
	 umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	 mul	$t2,$a3,$a1		// a[3]*a[1]
	 umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	 mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	 umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	 mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	 umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	 mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	 umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	 mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	 umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	 lsl	$t0,$acc0,#32
	adcs	$acc6,$acc6,$t3
	 lsr	$t1,$acc0,#32
	adc	$acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {			# reductions, see commentary in
					# multiplication for details
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	 lsl	$t0,$acc0,#32
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	 lsr	$t1,$acc0,#32
	adc	$acc3,$t3,xzr		// can't overflow
___
}
$code.=<<___;
	subs	$t2,$acc0,$t0		// "*0xffff0001"
	sbc	$t3,$acc0,$t1
	adds	$acc0,$acc1,$t0		// +=acc[0]<<96 and omit acc[0]
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2		// +=acc[0]*0xffff0001
	adc	$acc3,$t3,xzr		// can't overflow

	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	adds	$t0,$acc0,#1		// subs	$t0,$acc0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$acc4,xzr		// did it borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
// $a0-$a3 and $t0-$t3. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
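// For example, p256_mul_by_2(r, a) is obtained by loading a into both
// $acc0-$acc3 and $t0-$t3 before the call, as done repeatedly in
// ecp_nistz256_point_double below.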
.type	__ecp_nistz256_add_to,%function
.align	4
__ecp_nistz256_add_to:
	adds	$acc0,$acc0,$t0		// ret = a+b
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	adc	$ap,xzr,xzr		// zap $ap

	adds	$t0,$acc0,#1		// subs	$t0,$a0,#-1 // tmp = ret-modulus
	sbcs	$t1,$acc1,$poly1
	sbcs	$t2,$acc2,xzr
	sbcs	$t3,$acc3,$poly3
	sbcs	xzr,$ap,xzr		// did subtraction borrow?

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_add_to,.-__ecp_nistz256_add_to

.type	__ecp_nistz256_sub_from,%function
.align	4
__ecp_nistz256_sub_from:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$acc0,$t0		// ret = a-b
	sbcs	$acc1,$acc1,$t1
	sbcs	$acc2,$acc2,$t2
	sbcs	$acc3,$acc3,$t3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	4
__ecp_nistz256_sub_morf:
	ldp	$t0,$t1,[$bp]
	ldp	$t2,$t3,[$bp,#16]
	subs	$acc0,$t0,$acc0		// ret = b-a
	sbcs	$acc1,$t1,$acc1
	sbcs	$acc2,$t2,$acc2
	sbcs	$acc3,$t3,$acc3
	sbc	$ap,xzr,xzr		// zap $ap

	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = ret+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adc	$t3,$acc3,$poly3
	cmp	$ap,xzr			// did subtraction borrow?

	csel	$acc0,$acc0,$t0,eq	// ret = borrow ? ret+modulus : ret
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,eq
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

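// __ecp_nistz256_div_by_2 returns a/2 mod P: if a is odd, a+P is even and
// (a+P)/2 is congruent to a/2 mod P. The conditional addition below may
// carry out of 256 bits, so the carry is kept in $ap and shifted back in
// as the top bit of the result.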
.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	subs	$t0,$acc0,#1		// adds	$t0,$a0,#-1 // tmp = a+modulus
	adcs	$t1,$acc1,$poly1
	adcs	$t2,$acc2,xzr
	adcs	$t3,$acc3,$poly3
	adc	$ap,xzr,xzr		// zap $ap
	tst	$acc0,#1		// is a even?

	csel	$acc0,$acc0,$t0,eq	// ret = even ? a : a+modulus
	csel	$acc1,$acc1,$t1,eq
	csel	$acc2,$acc2,$t2,eq
	csel	$acc3,$acc3,$t3,eq
	csel	$ap,xzr,$ap,eq

	lsr	$acc0,$acc0,#1		// ret >>= 1
	orr	$acc0,$acc0,$acc1,lsl#63
	lsr	$acc1,$acc1,#1
	orr	$acc1,$acc1,$acc2,lsl#63
	lsr	$acc2,$acc2,#1
	orr	$acc2,$acc2,$acc3,lsl#63
	lsr	$acc3,$acc3,#1
	stp	$acc0,$acc1,[$rp]
	orr	$acc3,$acc3,$ap,lsl#63
	stp	$acc2,$acc3,[$rp,#16]

	ret
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# The following subroutines are "literal" implementations of those found
# in ecp_nistz256.c.
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
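# The chain of calls below evaluates the usual Jacobian doubling
# formulas (a bookkeeping sketch; ecp_nistz256.c is the reference):
#
#	S  = 4*X*Y^2
#	M  = 3*(X + Z^2)*(X - Z^2)
#	X' = M^2 - 2*S
#	Y' = M*(S - X') - 8*Y^4
#	Z' = 2*Y*Z
#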
{
my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("x$_",(21,22));

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	sub	sp,sp,#32*4

.Ldouble_shortcut:
	ldp	$acc0,$acc1,[$ap,#32]
	 mov	$rp_real,$rp
	ldp	$acc2,$acc3,[$ap,#48]
	 mov	$ap_real,$ap
	 adrp	$poly3,:pg_hi21:.Lpoly
	 add	$poly3,$poly3,:lo12:.Lpoly
	 ldr	$poly1,[$poly3,#8]
	mov	$t0,$acc0
	 ldr	$poly3,[$poly3,#24]
	mov	$t1,$acc1
	 ldp	$a0,$a1,[$ap_real,#64]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	 ldp	$a2,$a3,[$ap_real,#64+16]
	add	$rp,sp,#$S
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(S, in_y);

	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Zsqr, in_z);

	ldp	$t0,$t1,[$ap_real]
	ldp	$t2,$t3,[$ap_real,#16]
	mov	$a0,$acc0		// put Zsqr aside for p256_sub
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add_to	// p256_add(M, Zsqr, in_x);

	add	$bp,$ap_real,#0
	mov	$acc0,$a0		// restore Zsqr
	mov	$acc1,$a1
	 ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$acc2,$a2
	mov	$acc3,$a3
	 ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,sp,#$Zsqr
	bl	__ecp_nistz256_sub_morf	// p256_sub(Zsqr, in_x, Zsqr);

	add	$rp,sp,#$S
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(S, S);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[$ap_real,#64]
	ldp	$a2,$a3,[$ap_real,#64+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(tmp0, in_z, in_y);

	mov	$t0,$acc0
	mov	$t1,$acc1
	 ldp	$a0,$a1,[sp,#$S]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	 ldp	$a2,$a3,[sp,#$S+16]
	add	$rp,$rp_real,#64
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(res_z, tmp0);

	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(tmp0, S);

	 ldr	$bi,[sp,#$Zsqr]		// forward load for p256_mul_mont
	 ldp	$a0,$a1,[sp,#$M]
	 ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_div_by_2	// p256_div_by_2(res_y, tmp0);

	add	$bp,sp,#$Zsqr
	add	$rp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(M, M, Zsqr);

	mov	$t0,$acc0		// duplicate M
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	mov	$a0,$acc0		// put M aside
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$rp,sp,#$M
	bl	__ecp_nistz256_add_to
	mov	$t0,$a0			// restore M
	mov	$t1,$a1
	 ldr	$bi,[$ap_real]		// forward load for p256_mul_mont
	mov	$t2,$a2
	 ldp	$a0,$a1,[sp,#$S]
	mov	$t3,$a3
	 ldp	$a2,$a3,[sp,#$S+16]
	bl	__ecp_nistz256_add_to	// p256_mul_by_3(M, M);

	add	$bp,$ap_real,#0
	add	$rp,sp,#$S
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, in_x);

	mov	$t0,$acc0
	mov	$t1,$acc1
	 ldp	$a0,$a1,[sp,#$M]	// forward load for p256_sqr_mont
	mov	$t2,$acc2
	mov	$t3,$acc3
	 ldp	$a2,$a3,[sp,#$M+16]
	add	$rp,sp,#$tmp0
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(tmp0, S);

	add	$rp,$rp_real,#0
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(res_x, M);

	add	$bp,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	// p256_sub(res_x, res_x, tmp0);

	add	$bp,sp,#$S
	add	$rp,sp,#$S
	bl	__ecp_nistz256_sub_morf	// p256_sub(S, S, res_x);

	ldr	$bi,[sp,#$M]
	mov	$a0,$acc0		// copy S
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	add	$bp,sp,#$M
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S, S, M);

	add	$bp,$rp_real,#32
	add	$rp,$rp_real,#32
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, S, res_y);

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28));

$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#32*12

	ldp	$a0,$a1,[$bp,#64]	// in2_z
	ldp	$a2,$a3,[$bp,#64+16]
	 mov	$rp_real,$rp
	 mov	$ap_real,$ap
	 mov	$bp_real,$bp
	 adrp	$poly3,:pg_hi21:.Lpoly
	 add	$poly3,$poly3,:lo12:.Lpoly
	 ldr	$poly1,[$poly3,#8]
	 ldr	$poly3,[$poly3,#24]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in2infty,$t0,$t2
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// ~in2infty
	add	$rp,sp,#$Z2sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z2sqr, in2_z);

	ldp	$a0,$a1,[$ap_real,#64]	// in1_z
	ldp	$a2,$a3,[$ap_real,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// ~in1infty
	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$Z2sqr]
	ldp	$a2,$a3,[sp,#$Z2sqr+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, Z2sqr, in2_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$Z1sqr]
	ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#32]
	ldp	$a0,$a1,[sp,#$S1]
	ldp	$a2,$a3,[sp,#$S1+16]
	add	$bp,$ap_real,#32
	add	$rp,sp,#$S1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S1, S1, in1_y);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,sp,#$S1
	 ldr	$bi,[sp,#$Z2sqr]	// forward load for p256_mul_mont
	 ldp	$a0,$a1,[$ap_real]
	 ldp	$a2,$a3,[$ap_real,#16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, S1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$temp0,$acc0,$acc2	// ~is_equal(S1,S2)

	add	$bp,sp,#$Z2sqr
	add	$rp,sp,#$U1
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U1, in1_x, Z2sqr);

	ldr	$bi,[sp,#$Z1sqr]
	ldp	$a0,$a1,[$bp_real]
	ldp	$a2,$a3,[$bp_real,#16]
	add	$bp,sp,#$Z1sqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in2_x, Z1sqr);

	add	$bp,sp,#$U1
	 ldp	$a0,$a1,[sp,#$R]	// forward load for p256_sqr_mont
	 ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, U1);

	orr	$acc0,$acc0,$acc1	// see if result is zero
	orr	$acc2,$acc2,$acc3
	orr	$acc0,$acc0,$acc2	// ~is_equal(U1,U2)

	mvn	$temp1,$in1infty	// -1/0 -> 0/-1
	mvn	$temp2,$in2infty	// -1/0 -> 0/-1
	orr	$acc0,$acc0,$temp1
	orr	$acc0,$acc0,$temp2
	orr	$acc0,$acc0,$temp0
	cbnz	$acc0,.Ladd_proceed	// if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

.Ladd_double:
	mov	$ap,$ap_real
	mov	$rp,$rp_real
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	add	sp,sp,#256	// #256 is from #32*(12-4). difference in stack frames
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldr	$bi,[$bp_real,#64]
	ldp	$a0,$a1,[sp,#$res_z]
	ldp	$a2,$a3,[sp,#$res_z+16]
	add	$bp,$bp_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, res_z, in2_z);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[sp,#$Hsqr]
	ldp	$a0,$a1,[sp,#$U1]
	ldp	$a2,$a3,[sp,#$U1+16]
	add	$bp,sp,#$Hsqr
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, U1, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	 ldr	$bi,[sp,#$Hcub]		// forward load for p256_mul_mont
	 ldp	$a0,$a1,[sp,#$S1]
	 ldp	$a2,$a3,[sp,#$S1+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,sp,#$Hcub
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S1, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]		// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]		// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// ~$in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// ~$in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

.Ladd_done:
	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26));

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	sub	sp,sp,#32*10

	mov	$rp_real,$rp
	mov	$ap_real,$ap
	mov	$bp_real,$bp
	adrp	$poly3,:pg_hi21:.Lpoly
	add	$poly3,$poly3,:lo12:.Lpoly
	ldr	$poly1,[$poly3,#8]
	ldr	$poly3,[$poly3,#24]

	ldp	$a0,$a1,[$ap,#64]	// in1_z
	ldp	$a2,$a3,[$ap,#64+16]
	orr	$t0,$a0,$a1
	orr	$t2,$a2,$a3
	orr	$in1infty,$t0,$t2
	cmp	$in1infty,#0
	csetm	$in1infty,ne		// ~in1infty

	ldp	$acc0,$acc1,[$bp]	// in2_x
	ldp	$acc2,$acc3,[$bp,#16]
	ldp	$t0,$t1,[$bp,#32]	// in2_y
	ldp	$t2,$t3,[$bp,#48]
	orr	$acc0,$acc0,$acc1
	orr	$acc2,$acc2,$acc3
	orr	$t0,$t0,$t1
	orr	$t2,$t2,$t3
	orr	$acc0,$acc0,$acc2
	orr	$t0,$t0,$t2
	orr	$in2infty,$acc0,$t0
	cmp	$in2infty,#0
	csetm	$in2infty,ne		// ~in2infty

	add	$rp,sp,#$Z1sqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Z1sqr, in1_z);

	mov	$a0,$acc0
	mov	$a1,$acc1
	mov	$a2,$acc2
	mov	$a3,$acc3
	ldr	$bi,[$bp_real]
	add	$bp,$bp_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, Z1sqr, in2_x);

	add	$bp,$ap_real,#0
	 ldr	$bi,[$ap_real,#64]	// forward load for p256_mul_mont
	 ldp	$a0,$a1,[sp,#$Z1sqr]
	 ldp	$a2,$a3,[sp,#$Z1sqr+16]
	add	$rp,sp,#$H
	bl	__ecp_nistz256_sub_from	// p256_sub(H, U2, in1_x);

	add	$bp,$ap_real,#64
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, Z1sqr, in1_z);

	ldr	$bi,[$ap_real,#64]
	ldp	$a0,$a1,[sp,#$H]
	ldp	$a2,$a3,[sp,#$H+16]
	add	$bp,$ap_real,#64
	add	$rp,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_z, H, in1_z);

	ldr	$bi,[$bp_real,#32]
	ldp	$a0,$a1,[sp,#$S2]
	ldp	$a2,$a3,[sp,#$S2+16]
	add	$bp,$bp_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, S2, in2_y);

	add	$bp,$ap_real,#32
	 ldp	$a0,$a1,[sp,#$H]	// forward load for p256_sqr_mont
	 ldp	$a2,$a3,[sp,#$H+16]
	add	$rp,sp,#$R
	bl	__ecp_nistz256_sub_from	// p256_sub(R, S2, in1_y);

	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Hsqr, H);

	ldp	$a0,$a1,[sp,#$R]
	ldp	$a2,$a3,[sp,#$R+16]
	add	$rp,sp,#$Rsqr
	bl	__ecp_nistz256_sqr_mont	// p256_sqr_mont(Rsqr, R);

	ldr	$bi,[sp,#$H]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,sp,#$H
	add	$rp,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(Hcub, Hsqr, H);

	ldr	$bi,[$ap_real]
	ldp	$a0,$a1,[sp,#$Hsqr]
	ldp	$a2,$a3,[sp,#$Hsqr+16]
	add	$bp,$ap_real,#0
	add	$rp,sp,#$U2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(U2, in1_x, Hsqr);

	mov	$t0,$acc0
	mov	$t1,$acc1
	mov	$t2,$acc2
	mov	$t3,$acc3
	add	$rp,sp,#$Hsqr
	bl	__ecp_nistz256_add_to	// p256_mul_by_2(Hsqr, U2);

	add	$bp,sp,#$Rsqr
	add	$rp,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_x, Rsqr, Hsqr);

	add	$bp,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	//  p256_sub(res_x, res_x, Hcub);

	add	$bp,sp,#$U2
	 ldr	$bi,[$ap_real,#32]	// forward load for p256_mul_mont
	 ldp	$a0,$a1,[sp,#$Hcub]
	 ldp	$a2,$a3,[sp,#$Hcub+16]
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	// p256_sub(res_y, U2, res_x);

	add	$bp,$ap_real,#32
	add	$rp,sp,#$S2
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(S2, in1_y, Hcub);

	ldr	$bi,[sp,#$R]
	ldp	$a0,$a1,[sp,#$res_y]
	ldp	$a2,$a3,[sp,#$res_y+16]
	add	$bp,sp,#$R
	add	$rp,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	// p256_mul_mont(res_y, res_y, R);

	add	$bp,sp,#$S2
	bl	__ecp_nistz256_sub_from	// p256_sub(res_y, res_y, S2);

	ldp	$a0,$a1,[sp,#$res_x]		// res
	ldp	$a2,$a3,[sp,#$res_x+16]
	ldp	$t0,$t1,[$bp_real]		// in2
	ldp	$t2,$t3,[$bp_real,#16]
___
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	ldp	$a0,$a1,[sp,#$res_x+$i+32]	// res
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// ~$in2infty, remember?
	ldp	$a2,$a3,[sp,#$res_x+$i+48]
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	ldp	$t0,$t1,[$bp_real,#$i+32]	// in2
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	ldp	$t2,$t3,[$bp_real,#$i+48]
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]
___
$code.=<<___	if ($i == 0);
	adrp	$bp_real,:pg_hi21:.Lone_mont-64
	add	$bp_real,$bp_real,:lo12:.Lone_mont-64
___
}
$code.=<<___;
	ldp	$acc0,$acc1,[$ap_real,#$i]	// in1
	cmp	$in1infty,#0			// ~$in1infty, remember?
	ldp	$acc2,$acc3,[$ap_real,#$i+16]
	csel	$t0,$a0,$t0,ne
	csel	$t1,$a1,$t1,ne
	csel	$t2,$a2,$t2,ne
	csel	$t3,$a3,$t3,ne
	cmp	$in2infty,#0			// ~$in2infty, remember?
	csel	$acc0,$t0,$acc0,ne
	csel	$acc1,$t1,$acc1,ne
	csel	$acc2,$t2,$acc2,ne
	csel	$acc3,$t3,$acc3,ne
	stp	$acc0,$acc1,[$rp_real,#$i]
	stp	$acc2,$acc3,[$rp_real,#$i+16]

	add	sp,x29,#0		// destroy frame
	ldp	x19,x20,[x29,#16]
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
if (1) {
my ($ord0,$ord1) = ($poly1,$poly3);
my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24));
my $acc7 = $bi;

$code.=<<___;
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.type	ecp_nistz256_ord_mul_mont,%function
.align	4
ecp_nistz256_ord_mul_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	$ordk,:pg_hi21:.Lord
	add	$ordk,$ordk,:lo12:.Lord
	ldr	$bi,[$bp]		// bp[0]
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]

	mul	$acc0,$a0,$bi		// a[0]*b[0]
	umulh	$t0,$a0,$bi

	mul	$acc1,$a1,$bi		// a[1]*b[0]
	umulh	$t1,$a1,$bi

	mul	$acc2,$a2,$bi		// a[2]*b[0]
	umulh	$t2,$a2,$bi

	mul	$acc3,$a3,$bi		// a[3]*b[0]
	umulh	$acc4,$a3,$bi

	mul	$t4,$acc0,$ordk

	adds	$acc1,$acc1,$t0		// accumulate high parts of multiplication
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adc	$acc4,$acc4,xzr
	mov	$acc5,xzr
___
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
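	#
	# Concretely, with $t4 = acc[0]*ordk mod 2^64 and the two top limbs
	# of the group order being 2^64-1 and 2^64-2^32, t4*ord splits into
	#
	#   t4*ord0 + t4*ord1*2^64 + t4*(2^192 + 2^256) - t4*(2^128 + 2^224)
	#
	# The subtractive half is applied first (the subs/sbcs below); the
	# additive half rides along with the shift down by one limb. The
	# low limb acc[0] + lo(t4*ord0) vanishes by construction of $t4,
	# so only its carry, i.e. acc[0] != 0, enters the chain via
	# `subs xzr,$acc0,#1`.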
$code.=<<___;
	ldr	$bi,[$bp,#8*$i]		// b[i]

	lsl	$t0,$t4,#32
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	 mul	$t0,$a0,$bi
	adc	$t3,$t3,xzr
	 mul	$t1,$a1,$bi

	adds	$acc0,$acc1,$t2
	 mul	$t2,$a2,$bi
	adcs	$acc1,$acc2,$t3
	 mul	$t3,$a3,$bi
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	adds	$acc0,$acc0,$t0		// accumulate low parts
	umulh	$t0,$a0,$bi
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,$acc4,xzr
	mul	$t4,$acc0,$ordk
	adds	$acc1,$acc1,$t0		// accumulate high parts
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$t2
	adcs	$acc4,$acc4,$t3
	adc	$acc5,xzr,xzr
___
}
$code.=<<___;
	lsl	$t0,$t4,#32		// last reduction
	subs	$acc2,$acc2,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc3,$acc3,$t0
	sbcs	$acc4,$acc4,$t1
	sbc	$acc5,$acc5,xzr

	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adcs	$acc3,$acc4,$t4
	adc	$acc4,$acc5,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$acc0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$acc1,$acc1,$t1,lo
	csel	$acc2,$acc2,$t2,lo
	stp	$acc0,$acc1,[$rp]
	csel	$acc3,$acc3,$t3,lo
	stp	$acc2,$acc3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t rep);
.globl	ecp_nistz256_ord_sqr_mont
.type	ecp_nistz256_ord_sqr_mont,%function
.align	4
ecp_nistz256_ord_sqr_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	adrp	$ordk,:pg_hi21:.Lord
	add	$ordk,$ordk,:lo12:.Lord
	ldp	$a0,$a1,[$ap]
	ldp	$a2,$a3,[$ap,#16]

	ldp	$ord0,$ord1,[$ordk,#0]
	ldp	$ord2,$ord3,[$ordk,#16]
	ldr	$ordk,[$ordk,#32]
	b	.Loop_ord_sqr

.align	4
.Loop_ord_sqr:
	sub	$bp,$bp,#1
	////////////////////////////////////////////////////////////////
	//  |  |  |  |  |  |a1*a0|  |
	//  |  |  |  |  |a2*a0|  |  |
	//  |  |a3*a2|a3*a0|  |  |  |
	//  |  |  |  |a2*a1|  |  |  |
	//  |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	//  |--+--+--+--+--+--+--+--|
	//  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	//
	//  The "can't overflow" notes below mark carries into the high part
	//  of a multiplication result, which cannot overflow because the
	//  high part can never be all ones.

	mul	$acc1,$a1,$a0		// a[1]*a[0]
	umulh	$t1,$a1,$a0
	mul	$acc2,$a2,$a0		// a[2]*a[0]
	umulh	$t2,$a2,$a0
	mul	$acc3,$a3,$a0		// a[3]*a[0]
	umulh	$acc4,$a3,$a0

	adds	$acc2,$acc2,$t1		// accumulate high parts of multiplication
	 mul	$t0,$a2,$a1		// a[2]*a[1]
	 umulh	$t1,$a2,$a1
	adcs	$acc3,$acc3,$t2
	 mul	$t2,$a3,$a1		// a[3]*a[1]
	 umulh	$t3,$a3,$a1
	adc	$acc4,$acc4,xzr		// can't overflow

	mul	$acc5,$a3,$a2		// a[3]*a[2]
	umulh	$acc6,$a3,$a2

	adds	$t1,$t1,$t2		// accumulate high parts of multiplication
	 mul	$acc0,$a0,$a0		// a[0]*a[0]
	adc	$t2,$t3,xzr		// can't overflow

	adds	$acc3,$acc3,$t0		// accumulate low parts of multiplication
	 umulh	$a0,$a0,$a0
	adcs	$acc4,$acc4,$t1
	 mul	$t1,$a1,$a1		// a[1]*a[1]
	adcs	$acc5,$acc5,$t2
	 umulh	$a1,$a1,$a1
	adc	$acc6,$acc6,xzr		// can't overflow

	adds	$acc1,$acc1,$acc1	// acc[1-6]*=2
	 mul	$t2,$a2,$a2		// a[2]*a[2]
	adcs	$acc2,$acc2,$acc2
	 umulh	$a2,$a2,$a2
	adcs	$acc3,$acc3,$acc3
	 mul	$t3,$a3,$a3		// a[3]*a[3]
	adcs	$acc4,$acc4,$acc4
	 umulh	$a3,$a3,$a3
	adcs	$acc5,$acc5,$acc5
	adcs	$acc6,$acc6,$acc6
	adc	$acc7,xzr,xzr

	adds	$acc1,$acc1,$a0		// +a[i]*a[i]
	 mul	$t4,$acc0,$ordk
	adcs	$acc2,$acc2,$t1
	adcs	$acc3,$acc3,$a1
	adcs	$acc4,$acc4,$t2
	adcs	$acc5,$acc5,$a2
	adcs	$acc6,$acc6,$t3
	adc	$acc7,$acc7,$a3
___
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	subs	xzr,$acc0,#1
	umulh	$t1,$ord0,$t4
	mul	$t2,$ord1,$t4
	umulh	$t3,$ord1,$t4

	adcs	$t2,$t2,$t1
	adc	$t3,$t3,xzr

	adds	$acc0,$acc1,$t2
	adcs	$acc1,$acc2,$t3
	adcs	$acc2,$acc3,$t4
	adc	$acc3,xzr,$t4		// can't overflow
___
$code.=<<___	if ($i<3);
	mul	$t3,$acc0,$ordk
___
$code.=<<___;
	lsl	$t0,$t4,#32
	subs	$acc1,$acc1,$t4
	lsr	$t1,$t4,#32
	sbcs	$acc2,$acc2,$t0
	sbc	$acc3,$acc3,$t1		// can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
$code.=<<___;
	adds	$acc0,$acc0,$acc4	// accumulate upper half
	adcs	$acc1,$acc1,$acc5
	adcs	$acc2,$acc2,$acc6
	adcs	$acc3,$acc3,$acc7
	adc	$acc4,xzr,xzr

	subs	$t0,$acc0,$ord0		// ret -= modulus
	sbcs	$t1,$acc1,$ord1
	sbcs	$t2,$acc2,$ord2
	sbcs	$t3,$acc3,$ord3
	sbcs	xzr,$acc4,xzr

	csel	$a0,$acc0,$t0,lo	// ret = borrow ? ret : ret-modulus
	csel	$a1,$acc1,$t1,lo
	csel	$a2,$acc2,$t2,lo
	csel	$a3,$acc3,$t3,lo

	cbnz	$bp,.Loop_ord_sqr

	stp	$a0,$a1,[$rp]
	stp	$a2,$a3,[$rp,#16]

	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldr	x29,[sp],#64
	ret
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
}	}

########################################################################
# select subroutines
# These select functions are similar to those in p256-x86_64-asm.pl.
# They load all points in the lookup table, keeping in the output only
# the one corresponding to the input index, as sketched below.
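#
# A functional model of the pattern (not part of the generated code):
# every table entry is read unconditionally and merged under an
# all-ones/all-zeros mask, so the memory access pattern does not depend
# on the secret index:
#
#	out = 0;
#	for (i = 1; i <= n; i++) {
#		mask = (i == index) ? ~0 : 0;	// csetm + dup below
#		out |= table[i-1] & mask;	// bit below
#	}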
{
my ($val,$in_t)=map("x$_",(0..1));
my ($index)=("w2");
my ($Idx_ctr,$Val_in, $Mask_64)=("w9", "x10", "x11");
my ($Mask)=("v3");
my ($Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("v$_",(16..21));
my ($T0a,$T0b,$T0c,$T0d,$T0e,$T0f)=map("v$_",(22..27));
$code.=<<___;
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w5
.type	ecp_nistz256_select_w5,%function
.align	4
ecp_nistz256_select_w5:
    AARCH64_VALID_CALL_TARGET

    // $Val_in := $val
    // $Idx_ctr := 0; loop counter and incremented internal index
    mov     $Val_in, $val
    mov     $Idx_ctr, #0

    // [$Ra-$Rf] := 0
    movi    $Ra.16b, #0
    movi    $Rb.16b, #0
    movi    $Rc.16b, #0
    movi    $Rd.16b, #0
    movi    $Re.16b, #0
    movi    $Rf.16b, #0

.Lselect_w5_loop:
    // Loop 16 times.

    // Increment index (loop counter); tested at the end of the loop
    add $Idx_ctr, $Idx_ctr, #1

    // [$T0a-$T0f] := Load a (3*256-bit = 6*128-bit) table entry starting at $in_t
    //  and advance $in_t to point to the next entry
    ld1     {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64

    // $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s
    cmp     $Idx_ctr, $index
    csetm   $Mask_64, eq

    // continue loading ...
    ld1     {$T0e.2d, $T0f.2d}, [$in_t],#32

    // duplicate mask_64 into Mask (all 0s or all 1s)
    dup     $Mask.2d, $Mask_64

    // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd]
    // i.e., values in output registers will remain the same if $Idx_ctr != $index
    bit     $Ra.16b, $T0a.16b, $Mask.16b
    bit     $Rb.16b, $T0b.16b, $Mask.16b

    bit     $Rc.16b, $T0c.16b, $Mask.16b
    bit     $Rd.16b, $T0d.16b, $Mask.16b

    bit     $Re.16b, $T0e.16b, $Mask.16b
    bit     $Rf.16b, $T0f.16b, $Mask.16b

    // If bit #4 is 0 (i.e. idx_ctr < 16) loop back
    tbz    $Idx_ctr, #4, .Lselect_w5_loop

    // Write [$Ra-$Rf] to memory at the output pointer
    st1     {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$Val_in],#64
    st1     {$Re.2d, $Rf.2d}, [$Val_in]

	ret
.size	ecp_nistz256_select_w5,.-ecp_nistz256_select_w5


////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_select_w7
.type	ecp_nistz256_select_w7,%function
.align	4
ecp_nistz256_select_w7:
    AARCH64_VALID_CALL_TARGET

    // $Idx_ctr := 0; loop counter and incremented internal index
    mov     $Idx_ctr, #0

    // [$Ra-$Rd] := 0
    movi    $Ra.16b, #0
    movi    $Rb.16b, #0
    movi    $Rc.16b, #0
    movi    $Rd.16b, #0

.Lselect_w7_loop:
    // Loop 64 times.

    // Increment index (loop counter); tested at the end of the loop
    add $Idx_ctr, $Idx_ctr, #1

    // [$T0a-$T0d] := Load a (2*256-bit = 4*128-bit) table entry starting at $in_t
    //  and advance $in_t to point to the next entry
    ld1     {$T0a.2d, $T0b.2d, $T0c.2d, $T0d.2d}, [$in_t],#64

    // $Mask_64 := ($Idx_ctr == $index)? All 1s : All 0s
    cmp     $Idx_ctr, $index
    csetm   $Mask_64, eq

    // duplicate mask_64 into Mask (all 0s or all 1s)
    dup     $Mask.2d, $Mask_64

    // [$Ra-$Rd] := (Mask == all 1s)? [$T0a-$T0d] : [$Ra-$Rd]
    // i.e., values in output registers will remain the same if $Idx_ctr != $index
    bit     $Ra.16b, $T0a.16b, $Mask.16b
    bit     $Rb.16b, $T0b.16b, $Mask.16b

    bit     $Rc.16b, $T0c.16b, $Mask.16b
    bit     $Rd.16b, $T0d.16b, $Mask.16b

    // If bit #6 is 0 (i.e. idx_ctr < 64) loop back
    tbz    $Idx_ctr, #6, .Lselect_w7_loop

    // Write [$Ra-$Rd] to memory at the output pointer
    st1     {$Ra.2d, $Rb.2d, $Rc.2d, $Rd.2d}, [$val]

	ret
.size	ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";	# enforce flush
