#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide a +65-35% improvement
# (depending on key length; less for longer keys) on ARM920T, and
# +115-80% on Intel IXP425, compared to the pre-bn_mul_mont code base
# and compiler-generated code with inlined umull and even umlal
# instructions. The latter means that this code didn't really have the
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with the Thumb ISA and is rather compact,
# less than 1/2KB. A Windows CE port would be trivial, as it would be
# exclusively about decorations; the ABI and instruction syntax are
# identical.

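# For reference, the word-level Montgomery multiplication that both code
# paths below implement is, in rough Perl, as follows. This is a minimal
# sketch of the textbook word-serial algorithm, using 16-bit words so
# native Perl integers hold all products exactly; it is not the exact
# pipeline of the assembly below:
#
#	sub mont_mul {
#	    my ($ap, $bp, $np, $n0, $num) = @_;	# $n0 == -1/$np->[0] mod 2^16
#	    my @tp = (0) x ($num + 1);
#	    for my $i (0 .. $num - 1) {
#		my $c = 0;
#		for my $j (0 .. $num - 1) {	# tp += ap*bp[i]
#		    my $v = $tp[$j] + $ap->[$j] * $bp->[$i] + $c;
#		    $tp[$j] = $v & 0xffff; $c = $v >> 16;
#		}
#		my $v = $tp[$num] + $c;
#		$tp[$num] = $v & 0xffff; my $hi = $v >> 16;
#		my $m = ($tp[0] * $n0) & 0xffff;
#		$c = 0;
#		for my $j (0 .. $num - 1) {	# tp += m*np, zeroing tp[0]
#		    $v = $tp[$j] + $m * $np->[$j] + $c;
#		    $tp[$j] = $v & 0xffff; $c = $v >> 16;
#		}
#		$v = $tp[$num] + $c;
#		$tp[$num] = $v & 0xffff; $hi += $v >> 16;
#		shift @tp; push @tp, $hi;	# tp[0] is zero: divide by 2^16
#	    }
#	    # one final conditional subtraction of np (cf. .Lsub below) omitted
#	    return @tp[0 .. $num - 1];
#	}
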
# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 the improvement was measured to vary from ~70% to
# an incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because the original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for the integer-only code. The NEON code is
# nevertheless chosen for execution on all NEON-capable processors,
# because the gain on others outweighs the marginal loss on Cortex-A9.

# September 2015
#
# Align Cortex-A9 performance with the November 2013 improvements, i.e.
# the NEON code is now ~20-105% faster than the integer-only one on this
# processor. This optimization further improved performance even on
# other processors: the NEON code path is ~45-180% faster than the
# original integer-only code on Cortex-A8, ~10-210% on Cortex-A15, and
# ~70-450% on Snapdragon S4.

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" $xlate $flavour $output";
    *STDOUT=*OUT;
} else {
    open OUT,">$output";
    *STDOUT=*OUT;
}
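
# Example invocations (a hedged sketch; the flavour and output file name
# are normally chosen by the build system driving this script, and
# "armv4-mont.pl" is just what this file is assumed to be saved as):
#
#	perl armv4-mont.pl linux32 armv4-mont.S	# translate via arm-xlate.pl
#	perl armv4-mont.pl void    armv4-mont.S	# emit untranslated perlasm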

$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;

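# The resulting frame (a sketch derived from the offsets above and the
# prologue below; higher addresses first):
#
#	$num+15*4:	num argument, reused to hold &bp[num] ($_bpend)
#	$num+14*4:	&n0
#	$num+13*4:	bp	\ pushed by "stmdb sp!,{r0,r2}"
#	$num+12*4:	rp	/
#	$num+2*4:	saved r4-r12,lr (ten words, up to $num+11*4)
#	$num+4:		tp[num] (the extra dword)
#	$num:		tp[num-1]
#	sp:		tp[0]
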
$code=<<___;
#include <GFp/arm_arch.h>

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch  armv7-a

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

#if __ARM_MAX_ARCH__>=7
.extern GFp_armcap_P
.hidden GFp_armcap_P
.align	5
.LOPENSSL_armcap:
.word	GFp_armcap_P-.Lbn_mul_mont
#endif

.global	GFp_bn_mul_mont
.type	GFp_bn_mul_mont,%function

.align	5
GFp_bn_mul_mont:
.Lbn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
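	@ Dispatch: take the NEON path only when num is a multiple of 8
	@ and GFp_armcap_P reports NEON support; otherwise fall through
	@ to the integer-only code at .Lialu.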
	tst	ip,#7
	bne	.Lialu
	adr	r0,.Lbn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
#ifdef	__APPLE__
	ldr	r0,[r0]
#endif
	tst	r0,#ARMV7_NEON		@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
#ifdef	__thumb2__
	ittt	lt
#endif
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

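@ First outer-loop iteration: tp[] = ap[]*bp[0] + m*np[], where
@ m = tp[0]*n0 mod 2^32 makes the least significant word of the sum
@ vanish, so it is dropped on the fly.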
.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	mov	$tj,sp
	str	$nhi,[$num,#4]		@ tp[num]=

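@ Outer loop: for each remaining bp[i], accumulate ap[]*bp[i] into tp[]
@ and reduce with m = tp[0]*n0 mod 2^32, shifting tp[] down one word
@ per iteration.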
.Louter:
	sub	$tj,$num,$tj		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
#ifdef	__thumb2__
	itt	ne
#endif
	movne	$tj,sp
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	mov	$aj,sp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,$aj		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

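@ Final reduction: compute rp[] = tp[] - np[] with borrow; if the
@ subtraction borrows (tp < np), .Lcopy copies tp[] over rp[] instead.
@ tp[] is zapped either way.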
	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

.Lcopy:	ldr	$tj,[$tp]		@ conditional copy
	ldr	$aj,[$rp]
	str	sp,[$tp],#4		@ zap tp
#ifdef	__thumb2__
	it	cc
#endif
	movcc	$aj,$tj
	str	$aj,[$rp],#4
	teq	$tp,$num		@ preserve carry
	bne	.Lcopy

	mov	sp,$num
	add	sp,sp,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	GFp_bn_mul_mont,.-GFp_bn_mul_mont
___
{
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero="$Z#lo";
my $temp="$Temp#lo";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block
	mov	ip,sp

	cmp	$num,#8
	bhi	.LNEON_8n

	@ special case for $num==8, everything is in register bank...

	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	sub		$toutptr,sp,$num,lsl#4
	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
	and		$toutptr,$toutptr,#-64
	vld1.32		{${M0}[0]}, [$n0,:32]
	mov		sp,$toutptr			@ alloca
	vzip.16		$Bi,$zero

	vmull.u32	@ACC[0],$Bi,${A0}[0]
	vmull.u32	@ACC[1],$Bi,${A0}[1]
	vmull.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmull.u32	@ACC[3],$Bi,${A1}[1]

	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	veor		$zero,$zero,$zero
	vmul.u32	$Ni,$Ni,$M0

	vmull.u32	@ACC[4],$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmull.u32	@ACC[5],$Bi,${A2}[1]
	vmull.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmull.u32	@ACC[7],$Bi,${A3}[1]

	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	sub		$outer,$num,#1
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]

	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmov		$Temp,@ACC[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmov		@ACC[0],@ACC[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmov		@ACC[1],@ACC[2]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vmov		@ACC[2],@ACC[3]
	vmov		@ACC[3],@ACC[4]
	vshr.u64	$temp,$temp,#16
	vmov		@ACC[4],@ACC[5]
	vmov		@ACC[5],@ACC[6]
	vadd.u64	$temp,$temp,$Temp#hi
	vmov		@ACC[6],@ACC[7]
	veor		@ACC[7],@ACC[7]
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp

	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]

	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	veor		$zero,$zero,$zero
	subs		$outer,$outer,#1
	vmul.u32	$Ni,$Ni,$M0

	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]

	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]

	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmov		$Temp,@ACC[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmov		@ACC[0],@ACC[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmov		@ACC[1],@ACC[2]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vmov		@ACC[2],@ACC[3]
	vmov		@ACC[3],@ACC[4]
	vshr.u64	$temp,$temp,#16
	vmov		@ACC[4],@ACC[5]
	vmov		@ACC[5],@ACC[6]
	vadd.u64	$temp,$temp,$Temp#hi
	vmov		@ACC[6],@ACC[7]
	veor		@ACC[7],@ACC[7]
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer8

	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	mov		$toutptr,sp
	vshr.u64	$temp,@ACC[0]#lo,#16
	mov		$inner,$num
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	add		$tinptr,sp,#96
	vshr.u64	$temp,@ACC[0]#hi,#16
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

	b	.LNEON_tail_entry

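@ General case: num is a larger multiple of 8. tp[] lives in the stack
@ frame, and the "smashed" (split into 16-bit halves by vzip.16) b[]
@ words and m values are cached on the stack for reuse by the inner loop.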
.align	4
.LNEON_8n:
	veor		@ACC[0],@ACC[0],@ACC[0]
	 sub		$toutptr,sp,#128
	veor		@ACC[1],@ACC[1],@ACC[1]
	 sub		$toutptr,$toutptr,$num,lsl#4
	veor		@ACC[2],@ACC[2],@ACC[2]
	 and		$toutptr,$toutptr,#-64
	veor		@ACC[3],@ACC[3],@ACC[3]
	 mov		sp,$toutptr			@ alloca
	veor		@ACC[4],@ACC[4],@ACC[4]
	 add		$toutptr,$toutptr,#256
	veor		@ACC[5],@ACC[5],@ACC[5]
	 sub		$inner,$num,#8
	veor		@ACC[6],@ACC[6],@ACC[6]
	veor		@ACC[7],@ACC[7],@ACC[7]

.LNEON_8n_init:
	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	subs		$inner,$inner,#8
	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64		{@ACC[6]-@ACC[7]},[$toutptr,:256]!
	bne		.LNEON_8n_init

	add		$tinptr,sp,#256
	vld1.32		{$A0-$A3},[$aptr]!
	add		$bnptr,sp,#8
	vld1.32		{${M0}[0]},[$n0,:32]
	mov		$outer,$num
	b		.LNEON_8n_outer

.align	4
.LNEON_8n_outer:
	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	add		$toutptr,sp,#128
	vld1.32		{$N0-$N3},[$nptr]!

	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	 veor		$zero,$zero,$zero
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	 vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	 vadd.u64	$Ni,$Ni,@ACC[0]#lo
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	 vmul.u32	$Ni,$Ni,$M0
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vst1.32		{$Bi},[sp,:64]		@ put aside smashed b[8*i+0]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	 vzip.16	$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
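# The loop below emits seven unrolled rounds; push/shift rotates @ACC so
# that @ACC[0] always names the accumulator column about to be retired.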
for ($i=0; $i<7;) {
$code.=<<___;
	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	veor		$temp,$temp,$temp
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vzip.16		$Bi,$temp
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	 vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	 vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32		{$Ni},[$bnptr,:64]!	@ put aside smashed m[8*i+$i]
___
	push(@ACC,shift(@ACC));	$i++;
$code.=<<___;
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]!
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	 veor		$zero,$zero,$zero
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	 vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	 vadd.u64	$Ni,$Ni,@ACC[0]#lo
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	 vmul.u32	$Ni,$Ni,$M0
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vst1.32		{$Bi},[$bnptr,:64]!	@ put aside smashed b[8*i+$i]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	 vzip.16	$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32		{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	 vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	 vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32		{$Ni},[$bnptr,:64]	@ put aside smashed m[8*i+$i]
	add		$bnptr,sp,#8		@ rewind
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	sub		$inner,$num,#8
	b		.LNEON_8n_inner

.align	4
.LNEON_8n_inner:
	subs		$inner,$inner,#8
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+0]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vld1.32		{$N0-$N3},[$nptr]!
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	it		ne
	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
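# Seven more unrolled rounds for the remaining words of this chunk; as
# above, @ACC rotates so the retiring column is stored from @ACC[0].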
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vld1.32		{$Bi},[$bnptr,:64]!	@ pull smashed b[8*i+$i]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vst1.64		{@ACC[0]},[$toutptr,:128]!
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+$i]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	it		ne
	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	it		eq
	subeq		$aptr,$aptr,$num,lsl#2	@ rewind
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vld1.32		{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	add		$bnptr,sp,#8		@ rewind
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vst1.64		{@ACC[0]},[$toutptr,:128]!
	vmlal.u32	@ACC[7],$Ni,${N3}[1]

	bne		.LNEON_8n_inner
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	add		$tinptr,sp,#128
	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	veor		q2,q2,q2		@ $N0-$N1
	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	veor		q3,q3,q3		@ $N2-$N3
	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64		{@ACC[6]},[$toutptr,:128]

	subs		$outer,$outer,#8
	vld1.64		{@ACC[0]-@ACC[1]},[$tinptr,:256]!
	vld1.64		{@ACC[2]-@ACC[3]},[$tinptr,:256]!
	vld1.64		{@ACC[4]-@ACC[5]},[$tinptr,:256]!
	vld1.64		{@ACC[6]-@ACC[7]},[$tinptr,:256]!

	itt		ne
	subne		$nptr,$nptr,$num,lsl#2	@ rewind
	bne		.LNEON_8n_outer

	add		$toutptr,sp,#128
	vst1.64		{q2-q3}, [sp,:256]!	@ start wiping stack frame
	vshr.u64	$temp,@ACC[0]#lo,#16
	vst1.64		{q2-q3},[sp,:256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vst1.64		{q2-q3}, [sp,:256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vst1.64		{q2-q3}, [sp,:256]!
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

	mov		$inner,$num
	b		.LNEON_tail_entry

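@ Tail: each 64-bit accumulator lane holds a 16-bit digit of the result
@ plus accumulated carries (b[] and m values were split into 16-bit
@ halves above). Propagate carries in 16-bit steps and let vzip.16 merge
@ adjacent digits back into 32-bit output words.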
.align	4
.LNEON_tail:
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	vshr.u64	$temp,@ACC[0]#lo,#16
	vld1.64		{@ACC[2]-@ACC[3]}, [$tinptr, :256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vld1.64		{@ACC[4]-@ACC[5]}, [$tinptr, :256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vld1.64		{@ACC[6]-@ACC[7]}, [$tinptr, :256]!
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vadd.u64	@ACC[1]#lo,@ACC[1]#lo,$temp
	vst1.32		{@ACC[0]#lo[0]}, [$toutptr, :32]!
	vshr.u64	$temp,@ACC[1]#lo,#16
	vadd.u64	@ACC[1]#hi,@ACC[1]#hi,$temp
	vshr.u64	$temp,@ACC[1]#hi,#16
	vzip.16		@ACC[1]#lo,@ACC[1]#hi
___
	push(@ACC,shift(@ACC));
}
	push(@ACC,shift(@ACC));
$code.=<<___;
	vld1.64		{@ACC[0]-@ACC[1]}, [$tinptr, :256]!
	subs		$inner,$inner,#8
	vst1.32		{@ACC[7]#lo[0]},   [$toutptr, :32]!
	bne	.LNEON_tail

	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
	subs	$aptr,sp,#0				@ clear carry flag
	add	$bptr,sp,$num,lsl#2

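@ Final conditional subtraction and copy, same idea as .Lsub/.Lcopy in
@ the integer path, but four words at a time, wiping tp[] with NEON
@ stores as it goes.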
.LNEON_sub:
	ldmia	$aptr!, {r4-r7}
	ldmia	$nptr!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [$aptr]				@ load top-most bit
	mov	r11,sp
	veor	q0,q0,q0
	sub	r11,$bptr,r11				@ this is num*4
	veor	q1,q1,q1
	mov	$aptr,sp
	sub	$rptr,$rptr,r11				@ rewind $rptr
	mov	$nptr,$bptr				@ second 3/4th of frame
	sbcs	r10,r10,#0				@ result is carry flag

.LNEON_copy_n_zap:
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr,  {r8-r11}
	it	cc
	movcc	r8, r4
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	itt	cc
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	it	cc
	movcc	r11,r7
	ldmia	$aptr, {r4-r7}
	stmia	$rptr!, {r8-r11}
	sub	$aptr,$aptr,#16
	ldmia	$rptr, {r8-r11}
	it	cc
	movcc	r8, r4
	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
	itt	cc
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	it	cc
	movcc	r11,r7
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap

	mov	sp,ip
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r11}
	ret						@ bx lr
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

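	# e.g. "q6#lo" -> "d12" and "q6#hi" -> "d13": each NEON q register
	# aliases an even/odd pair of d registers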
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge	or
	s/\bret\b/bx    lr/g						or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT";