#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations; ABI and instruction syntax are identical.

# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because the original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because the gain on
# others outweighs the marginal loss on Cortex-A9.

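# For readers following the assembly, below is a minimal reference model of
# the operation being implemented, rp[]=ap[]*bp[]*R^-1 mod np[], computed
# limb by limb with interleaved Montgomery reduction. It is illustrative
# only and is never called by this script; it uses 16-bit limbs [rather
# than the 32-bit limbs the assembly works on] so that every intermediate
# value stays within perl's native integer range, and expects
# n0 = -np[0]^-1 mod 2^16 and R = 2^(16*num).

sub bn_mul_mont_ref {
	my ($rp,$ap,$bp,$np,$n0,$num)=@_;	# array refs + scalar n0, num
	my @tp=(0) x ($num+2);

	for (my $i=0;$i<$num;$i++) {
		my ($c,$v,$m)=(0);

		for (my $j=0;$j<$num;$j++) {	# tp[] += ap[]*bp[i]
			$v=$tp[$j]+$ap->[$j]*$bp->[$i]+$c;
			$tp[$j]=$v&0xffff;	$c=$v>>16;
		}
		$v=$tp[$num]+$c;
		$tp[$num]=$v&0xffff;	$tp[$num+1]=$v>>16;

		$m=($tp[0]*$n0)&0xffff;		# "tp[0]"*n0
		$c=($tp[0]+$m*$np->[0])>>16;	# lowest limb is annihilated
		for (my $j=1;$j<$num;$j++) {	# tp[] += np[]*m, shifted down a limb
			$v=$tp[$j]+$m*$np->[$j]+$c;
			$tp[$j-1]=$v&0xffff;	$c=$v>>16;
		}
		$v=$tp[$num]+$c;
		$tp[$num-1]=$v&0xffff;
		$tp[$num]=$tp[$num+1]+($v>>16);
	}

	my ($b,@sub)=(0);			# conditional final subtraction
	for (my $j=0;$j<$num;$j++) {
		my $d=$tp[$j]-$np->[$j]-$b;
		$b=$d<0?1:0;	$d+=0x10000 if ($d<0);
		push @sub,$d;
	}
	@{$rp}=($tp[$num]-$b<0) ? @tp[0..$num-1] : @sub;
}
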
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;

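# For reference, once the prologue below has pushed {r0,r2} and {r4-r12,lr}
# and carved out num+1 words of tp[], the frame looks like this, with the
# $num register pointing at &tp[num-1]:
#
#	$num+0		tp[num-1]
#	$num+4		tp[num]			(the extra dword)
#	$num+8..+44	saved r4-r12,lr		(10 words)
#	$num+12*4	rp			($_rp)
#	$num+13*4	bp			($_bp, current &bp[i] later on)
#	$num+14*4	&n0			($_n0, n0 itself later on)
#	$num+15*4	num			($_num, reused as the $_bpend sentinel)
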
$code=<<___;
#include "arm_arch.h"

.text
.code	32

#if __ARM_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-bn_mul_mont
#endif

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,bn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
	tst	r0,#1			@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

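	@ 1st pass: accumulate ap[j]*bp[0] and the Montgomery reduction term
	@ np[j]*n0 in parallel; their sum is stored one word down, which is
	@ the division by 2^32 inherent to Montgomery multiplication.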
.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

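	@ Outer loop: for each subsequent bp[i], add ap[j]*bp[i] and the
	@ reduction term np[j]*n0 into the running tp[], again dropping the
	@ lowest word.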
.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

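	@ Final reduction: write tp[]-np[] out to rp[]; if the subtraction
	@ borrows, tp[] was already fully reduced and is copied instead.
	@ Either way tp[] is wiped before the stack is released.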
	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	and	$ap,$tp,$nhi
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
___
{
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero=&Dlo($Z);
my $temp=&Dlo($Temp);

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));

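# A note on the NEON strategy: bp[i] and the per-iteration Montgomery
# factor are widened to 16-bit halves (vzip.16 with a zero register), so
# each vmull/vmlal.u32 partial product is at most 48 bits wide and can be
# accumulated in the 64-bit lanes of q6-q13 without intermediate carry
# propagation; the redundant result is folded 16 bits at a time in
# .LNEON_tail below.
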
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block

	sub		$toutptr,sp,#16
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	sub		$toutptr,$toutptr,$num,lsl#4
	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
	and		$toutptr,$toutptr,#-64
	vld1.32		{${M0}[0]}, [$n0,:32]
	mov		sp,$toutptr			@ alloca
	veor		$zero,$zero,$zero
	subs		$inner,$num,#8
	vzip.16		$Bi,$zero

	vmull.u32	$A0xB,$Bi,${A0}[0]
	vmull.u32	$A1xB,$Bi,${A0}[1]
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmull.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor		$zero,$zero,$zero
	vmul.u32	$Ni,$temp,$M0

	vmull.u32	$A4xB,$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	@ special case for num=8, everything is in register bank...

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	sub		$outer,$num,#1
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov		$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov		$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov		$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov		$A2xB,$A3xB
	vmov		$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov		$A4xB,$A5xB
	vmov		$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov		$A6xB,$A7xB
	veor		$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor		$zero,$zero,$zero
	subs		$outer,$outer,#1
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov		$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov		$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov		$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov		$A2xB,$A3xB
	vmov		$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov		$A4xB,$A5xB
	vmov		$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov		$A6xB,$A7xB
	veor		$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer8

	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	mov		$toutptr,sp
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	mov		$inner,$num
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	add		$tinptr,sp,#16
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

	b	.LNEON_tail2

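	@ First pass over the remaining limbs: fold in the reduction factor,
	@ spill the completed 64-bit accumulators to the scratch area and
	@ start the next block of bp[0] products within the same iteration.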
.align	4
.LNEON_1st:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	 vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	subs		$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	 vld1.32	{$N0-$N1}, [$nptr]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!

	vmull.u32	$A0xB,$Bi,${A0}[0]
	 vld1.32	{$N2-$N3}, [$nptr]!
	vmull.u32	$A1xB,$Bi,${A0}[1]
	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vmull.u32	$A3xB,$Bi,${A1}[1]
	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!

	vmull.u32	$A4xB,$Bi,${A2}[0]
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add		$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	 vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	sub		$outer,$num,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vshr.u64	$temp,$temp,#16
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	veor		$Z,$Z,$Z
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vst1.64		{$Z},          [$toutptr,:128]
	vshr.u64	$temp,$temp,#16

	b		.LNEON_outer

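	@ Outer loop for num>8: pick up the next bp[i], rewind np, reload the
	@ first block of ap and accumulate on top of the previous iteration's
	@ result, which is streamed back in from the scratch area.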
.align	4
.LNEON_outer:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	sub		$nptr,$nptr,$num,lsl#2		@ rewind $nptr
	vld1.32		{$A0-$A3},  [$aptr]!
	veor		$zero,$zero,$zero
	mov		$toutptr,sp
	vzip.16		$Bi,$zero
	sub		$inner,$num,#8
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	 vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	 vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	veor		$zero,$zero,$zero
	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	 vld1.64	{$A7xB},[$tinptr,:128]!
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

.LNEON_inner:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	 vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	 subs		$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	 vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	 vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]
	 vld1.32	{$N0-$N3}, [$nptr]!

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	 vld1.64	{$A7xB},       [$tinptr, :128]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_inner

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add		$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	 vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	subs		$outer,$outer,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vshr.u64	$temp,$temp,#16
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer

	mov		$toutptr,sp
	mov		$inner,$num

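	@ Carry propagation: each 64-bit lane now holds a 16-bit digit of the
	@ result plus accumulated carries; fold the carries 16 bits at a time
	@ and emit one 32-bit word of the result per accumulator.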
.LNEON_tail:
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	vld1.64		{$A3xB-$A4xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	vld1.64		{$A5xB-$A6xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vld1.64		{$A7xB},       [$tinptr, :128]!
	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

.LNEON_tail2:
	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
	vst1.32		{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
	vzip.16		`&Dlo("$A1xB")`,`&Dhi("$A1xB")`

	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
	vst1.32		{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
	vzip.16		`&Dlo("$A2xB")`,`&Dhi("$A2xB")`

	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
	vst1.32		{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
	vzip.16		`&Dlo("$A3xB")`,`&Dhi("$A3xB")`

	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
	vst1.32		{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
	vzip.16		`&Dlo("$A4xB")`,`&Dhi("$A4xB")`

	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
	vst1.32		{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
	vzip.16		`&Dlo("$A5xB")`,`&Dhi("$A5xB")`

	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
	vst1.32		{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
	vld1.64		{$A0xB}, [$tinptr, :128]!
	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
	vzip.16		`&Dlo("$A6xB")`,`&Dhi("$A6xB")`

	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
	vst1.32		{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
	vld1.64		{$A1xB-$A2xB},	[$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
	vzip.16		`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
	subs		$inner,$inner,#8
	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!

	bne	.LNEON_tail

	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
	subs	$aptr,sp,#0				@ clear carry flag
	add	$bptr,sp,$num,lsl#2

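	@ Subtract the modulus from the result sitting at sp; the final borrow
	@ decides below whether the difference or the original value is copied
	@ to rp[], while the scratch area is wiped.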
.LNEON_sub:
	ldmia	$aptr!, {r4-r7}
	ldmia	$nptr!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [$aptr]				@ load top-most bit
	veor	q0,q0,q0
	sub	r11,$bptr,sp				@ this is num*4
	veor	q1,q1,q1
	mov	$aptr,sp
	sub	$rptr,$rptr,r11				@ rewind $rptr
	mov	$nptr,$bptr				@ second 3/4th of frame
	sbcs	r10,r10,#0				@ result is carry flag

.LNEON_copy_n_zap:
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr,  {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r11,r7
	ldmia	$aptr, {r4-r7}
	stmia	$rptr!, {r8-r11}
	sub	$aptr,$aptr,#16
	ldmia	$rptr, {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r11,r7
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap

	sub	sp,ip,#96
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r11}
	ret						@ bx lr
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;
print $code;
close STDOUT;