#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide a +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to the pre-bn_mul_mont code
# base and compiler-generated code with inlined umull and even umlal
# instructions. The latter means that this code didn't really have the
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with the Thumb ISA and is rather compact,
# less than 1/2KB. A Windows CE port would be trivial, as it's
# exclusively about decorations; ABI and instruction syntax are identical.

# November 2013
#
# Add a NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 the improvement was measured to vary from ~70% to
# an incredible ~380%, i.e. 4.8x faster, for RSA4096 sign. But this is
# rather because the original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than with the integer-only code. The NEON code is
# nevertheless chosen for execution on all NEON-capable processors,
# because the gain on others outweighs the marginal loss on Cortex-A9.

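# For reference, below is a minimal Perl model (not used by this script) of
# what bn_mul_mont computes, namely a*b*R^(-1) mod n with R=2^(32*num),
# assuming a,b < n and that the word pointed to by the n0 argument holds
# -n^(-1) mod 2^32. It only illustrates the word-by-word reduction that the
# assembly below implements; the sub name and variable names are illustrative.

use Math::BigInt;

sub bn_mul_mont_ref {
    my ($a, $b, $n, $num) = @_;		# Math::BigInt values, $n is $num 32-bit words
    my $w    = Math::BigInt->bone()->blsft(32);			# 2^32
    my $mask = $w->copy()->bdec();				# 2^32-1
    my $n0   = $w->copy()->bsub($n->copy()->bmodinv($w));	# -n^(-1) mod 2^32
    my $t    = $a->copy()->bmul($b);
    for (1..$num) {
	my $m = $t->copy()->band($mask)->bmul($n0)->band($mask);
	$t->badd($m->bmul($n))->brsft(32);	# t+m*n is divisible by 2^32
    }
    $t->bsub($n) if ($t->bcmp($n) >= 0);	# final conditional subtraction
    return $t;					# == a*b*2^(-32*num) mod n
}
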
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
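# Typical invocation (illustrative; the exact flavour strings come from the
# build system and are interpreted by arm-xlate.pl):
#
#	perl armv4-mont.pl linux32 armv4-mont.S
#
# A flavour of "void" (or a bare output file name) bypasses the translator
# and the generated code is written to the output file as-is.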

$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;
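#
# The full frame as set up by the prologue below, offsets relative to
# $num (== &tp[num-1]):
#
#	$num+15*4	num	(6th argument, passed on the caller's stack)
#	$num+14*4	&n0	(5th argument, passed on the caller's stack)
#	$num+13*4	bp	(saved by "stmdb sp!,{r0,r2}")
#	$num+12*4	rp	(saved by "stmdb sp!,{r0,r2}")
#	$num+2*4	r4-r12,lr (saved by "stmdb sp!,{r4-r12,lr}")
#	$num+4		tp[num]
#	sp..$num	tp[0..num-1] (alloca-ed temporary vector)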

$code=<<___;
#include <openssl/arm_arch.h>

.text
.code	32

#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lbn_mul_mont
#endif

.global	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
.Lbn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,bn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
#ifdef	__APPLE__
	ldr	r0,[r0]
#endif
	tst	r0,#1			@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	and	$ap,$tp,$nhi
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
___
{
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
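# Dlo/Dhi map a NEON quad register to the two double registers it aliases,
# e.g. Dlo("q1") is "d2" and Dhi("q1") is "d3".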

my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero=&Dlo($Z);
my $temp=&Dlo($Temp);

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));

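# A note on the scheme below: bp[i] (and the Montgomery factor Ni) appear to
# be split into 16-bit halves by interleaving with zero (vzip.16), so each
# 64-bit lane of the vmull.u32/vmlal.u32 accumulators holds 32x16-bit
# products with 16 bits of headroom, and carries are resolved only once per
# pass, in 16-bit steps, by the .LNEON_tail code (vshr.u64 #16 + vzip.16).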
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block

	sub		$toutptr,sp,#16
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	sub		$toutptr,$toutptr,$num,lsl#4
	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
	and		$toutptr,$toutptr,#-64
	vld1.32		{${M0}[0]}, [$n0,:32]
	mov		sp,$toutptr			@ alloca
	veor		$zero,$zero,$zero
	subs		$inner,$num,#8
	vzip.16		$Bi,$zero

	vmull.u32	$A0xB,$Bi,${A0}[0]
	vmull.u32	$A1xB,$Bi,${A0}[1]
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmull.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor		$zero,$zero,$zero
	vmul.u32	$Ni,$temp,$M0

	vmull.u32	$A4xB,$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	@ special case for num=8, everything is in register bank...

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	sub		$outer,$num,#1
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov		$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov		$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov		$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov		$A2xB,$A3xB
	vmov		$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov		$A4xB,$A5xB
	vmov		$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov		$A6xB,$A7xB
	veor		$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	veor		$zero,$zero,$zero
	subs		$outer,$outer,#1
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vmov		$Temp,$A0xB
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vmov		$A0xB,$A1xB
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmov		$A1xB,$A2xB
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vmov		$A2xB,$A3xB
	vmov		$A3xB,$A4xB
	vshr.u64	$temp,$temp,#16
	vmov		$A4xB,$A5xB
	vmov		$A5xB,$A6xB
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vmov		$A6xB,$A7xB
	veor		$A7xB,$A7xB
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer8

	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	mov		$toutptr,sp
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	mov		$inner,$num
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	add		$tinptr,sp,#16
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

	b	.LNEON_tail2

.align	4
.LNEON_1st:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	 vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	subs		$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	 vld1.32	{$N0-$N1}, [$nptr]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!

	vmull.u32	$A0xB,$Bi,${A0}[0]
	 vld1.32	{$N2-$N3}, [$nptr]!
	vmull.u32	$A1xB,$Bi,${A0}[1]
	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
	vmull.u32	$A2xB,$Bi,${A1}[0]
	vmull.u32	$A3xB,$Bi,${A1}[1]
	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!

	vmull.u32	$A4xB,$Bi,${A2}[0]
	vmull.u32	$A5xB,$Bi,${A2}[1]
	vmull.u32	$A6xB,$Bi,${A3}[0]
	vmull.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_1st

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add		$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	 vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	sub		$outer,$num,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vshr.u64	$temp,$temp,#16
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	veor		$Z,$Z,$Z
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vst1.64		{$Z},          [$toutptr,:128]
	vshr.u64	$temp,$temp,#16

	b		.LNEON_outer

.align	4
.LNEON_outer:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	sub		$nptr,$nptr,$num,lsl#2		@ rewind $nptr
	vld1.32		{$A0-$A3},  [$aptr]!
	veor		$zero,$zero,$zero
	mov		$toutptr,sp
	vzip.16		$Bi,$zero
	sub		$inner,$num,#8
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	 vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	 vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]

	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
	veor		$zero,$zero,$zero
	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
	 vld1.64	{$A7xB},[$tinptr,:128]!
	vmul.u32	$Ni,$temp,$M0

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	$A7xB,$Bi,${A3}[1]

.LNEON_inner:
	vmlal.u32	$A0xB,$Ni,${N0}[0]
	 vld1.32	{$A0-$A3}, [$aptr]!
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	 subs		$inner,$inner,#8
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]
	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!

	vmlal.u32	$A0xB,$Bi,${A0}[0]
	 vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
	vmlal.u32	$A1xB,$Bi,${A0}[1]
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	vmlal.u32	$A2xB,$Bi,${A1}[0]
	 vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
	vmlal.u32	$A3xB,$Bi,${A1}[1]
	 vld1.32	{$N0-$N3}, [$nptr]!

	vmlal.u32	$A4xB,$Bi,${A2}[0]
	 vld1.64	{$A7xB},       [$tinptr, :128]!
	vmlal.u32	$A5xB,$Bi,${A2}[1]
	vmlal.u32	$A6xB,$Bi,${A3}[0]
	vmlal.u32	$A7xB,$Bi,${A3}[1]

	bne	.LNEON_inner

	vmlal.u32	$A0xB,$Ni,${N0}[0]
	add		$tinptr,sp,#16
	vmlal.u32	$A1xB,$Ni,${N0}[1]
	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
	vmlal.u32	$A2xB,$Ni,${N1}[0]
	 vld1.64	{$Temp}, [sp,:128]
	vmlal.u32	$A3xB,$Ni,${N1}[1]
	subs		$outer,$outer,#1

	vmlal.u32	$A4xB,$Ni,${N2}[0]
	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
	vmlal.u32	$A5xB,$Ni,${N2}[1]
	 vld1.64	{$A0xB},       [$tinptr, :128]!
	vshr.u64	$temp,$temp,#16
	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
	vmlal.u32	$A6xB,$Ni,${N3}[0]
	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
	vmlal.u32	$A7xB,$Ni,${N3}[1]

	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer

	mov		$toutptr,sp
	mov		$inner,$num

.LNEON_tail:
	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
	vld1.64		{$A3xB-$A4xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
	vld1.64		{$A5xB-$A6xB}, [$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
	vld1.64		{$A7xB},       [$tinptr, :128]!
	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`

.LNEON_tail2:
	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
	vst1.32		{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
	vzip.16		`&Dlo("$A1xB")`,`&Dhi("$A1xB")`

	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
	vst1.32		{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
	vzip.16		`&Dlo("$A2xB")`,`&Dhi("$A2xB")`

	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
	vst1.32		{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
	vzip.16		`&Dlo("$A3xB")`,`&Dhi("$A3xB")`

	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
	vst1.32		{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
	vzip.16		`&Dlo("$A4xB")`,`&Dhi("$A4xB")`

	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
	vst1.32		{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
	vzip.16		`&Dlo("$A5xB")`,`&Dhi("$A5xB")`

	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
	vst1.32		{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
	vld1.64		{$A0xB}, [$tinptr, :128]!
	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
	vzip.16		`&Dlo("$A6xB")`,`&Dhi("$A6xB")`

	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
	vst1.32		{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
	vld1.64		{$A1xB-$A2xB},	[$tinptr, :256]!
	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
	vzip.16		`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
	subs		$inner,$inner,#8
	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!

	bne	.LNEON_tail

	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
	subs	$aptr,sp,#0				@ clear carry flag
	add	$bptr,sp,$num,lsl#2

.LNEON_sub:
	ldmia	$aptr!, {r4-r7}
	ldmia	$nptr!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [$aptr]				@ load top-most bit
	veor	q0,q0,q0
	sub	r11,$bptr,sp				@ this is num*4
	veor	q1,q1,q1
	mov	$aptr,sp
	sub	$rptr,$rptr,r11				@ rewind $rptr
	mov	$nptr,$bptr				@ second 3/4th of frame
	sbcs	r10,r10,#0				@ result is carry flag

.LNEON_copy_n_zap:
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr,  {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r11,r7
	ldmia	$aptr, {r4-r7}
	stmia	$rptr!, {r8-r11}
	sub	$aptr,$aptr,#16
	ldmia	$rptr, {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	movcc	r11,r7
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap

	sub	sp,ip,#96
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r11}
	ret						@ bx lr
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
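# 0xe12fff1e is the machine encoding of "bx lr"; emitting it as data keeps an
# -march=armv4 assembler happy, while the instruction itself is only reached
# when the caller runs in Thumb state, i.e. on a core that implements it.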
$code =~ s/\bret\b/bx	lr/gm;
print $code;
close STDOUT;