#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide a +35-65% improvement
# (depending on key length, less for longer keys) on ARM920T, and
# +80-115% on Intel IXP425. This is relative to the pre-bn_mul_mont code
# base, i.e. compiler-generated code with in-lined umull and even umlal
# instructions. In other words, this code does not owe its "advantage"
# to some "secret" instruction unavailable to the compiler.
#
# The code is interoperable with the Thumb ISA and is rather compact,
# less than 1/2KB. A Windows CE port would be trivial, as it is
# exclusively about decorations; ABI and instruction syntax are identical.
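#
# For reference, the operation computed by bn_mul_mont is, at the bignum
# level, r = a*b*R^-1 mod n with R = 2^(32*num) and n0 = -n^-1 mod 2^32.
# Below is a minimal sketch of that specification in plain Perl. It is not
# called anywhere in this generator and is kept only as documentation; the
# helper name is ours, and Math::BigInt is loaded lazily so normal runs of
# this script are unaffected.

sub __bn_mul_mont_ref {
	require Math::BigInt;
	my ($a,$b,$n,$num)=@_;			# Math::BigInt values, $num = 32-bit limb count
	my $R=Math::BigInt->new(1)->blsft(32*$num);
	my $Rinv=$R->copy()->bmodinv($n);	# R^-1 mod n
	# the assembly below computes the same thing word-serially on 32-bit
	# limbs, folding in multiples of n so it never needs a full division
	return $a->copy()->bmul($b)->bmul($Rinv)->bmod($n);
}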

# November 2013
#
# Add a NEON code path, which handles lengths divisible by 8. The RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 the improvement was measured to vary from ~70% to
# an incredible ~380% (yes, 4.8x faster) for RSA4096 sign, but that is
# largely because the original integer-only code seems to perform
# suboptimally on S4. The situation on Cortex-A9 is unfortunately
# different: it is still being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than with the integer-only code. The NEON path is
# nevertheless chosen for execution on all NEON-capable processors,
# because the gain on other cores outweighs the marginal loss on
# Cortex-A9.

# September 2015
#
# Align Cortex-A9 performance with the November 2013 improvements, i.e.
# the NEON code is now ~20-105% faster than the integer-only code on this
# processor. This optimization further improved performance even on the
# other processors: the NEON code path is ~45-180% faster than the original
# integer-only code on Cortex-A8, ~10-210% on Cortex-A15 and ~70-450% on
# Snapdragon S4.

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
    *STDOUT=*OUT;
} else {
    open OUT,">$output";
    *STDOUT=*OUT;
}
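
# Typical invocation, for reference (the flavour and output name here are
# only illustrative):
#
#	perl armv4-mont.pl linux32 armv4-mont.S
#
# When a flavour is given (anything that does not look like a file name),
# the output is piped through arm-xlate.pl above; otherwise the first
# file-like argument is opened and written to directly.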

$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;
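#
# In other words, once the prologue below has run, the stack frame looks
# roughly like this (a sketch derived from the code, low to high addresses):
#
#	sp+0 .. sp+num*4	tp[0..num]	num+1 temporary words
#	sp+(num+1)*4		saved r4-r12,lr	(10 words)
#	sp+(num+11)*4		saved rp	-> [$_rp]
#	sp+(num+12)*4		saved bp	-> [$_bp]
#	sp+(num+13)*4		&n0 argument	-> [$_n0], later holds n0 itself
#	sp+(num+14)*4		num argument	-> [$_num], aliased as [$_bpend]
#				for the outer-loop end pointer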

$code=<<___;
#include <openssl/arm_arch.h>

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
.arch  armv7-a

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lbn_mul_mont
#endif

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
.Lbn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,.Lbn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
#ifdef	__APPLE__
	ldr	r0,[r0]
#endif
	tst	r0,#ARMV7_NEON		@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	$num,ip			@ load num
#ifdef	__thumb2__
	ittt	lt
#endif
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

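@ First pass (.L1st): compute tp = (ap*bp[0] + np*m)/2^32, where
@ m = ap[0]*bp[0]*n0 mod 2^32 and n0 = -np[0]^-1 mod 2^32, so the dropped
@ low word is zero by construction. The two umlal chains below carry the
@ ap*bp[0] and np*m partial products side by side and store the sum one
@ word down (tp[j-1]=), which implements the division by 2^32 for free.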
.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	mov	$tj,sp
	str	$nhi,[$num,#4]		@ tp[num]=

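@ Outer loop (.Louter): for each subsequent bp[i], add ap*bp[i] to tp and
@ then np*m, where m = (tp[0]+ap[0]*bp[i])*n0 mod 2^32 makes the low word
@ vanish; the sum is again stored one word down, so tp stays num+1 words
@ long throughout. ap and np are rewound at the top of each iteration using
@ the frame geometry (the distance between sp and the tp[num-1] pointer).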
.Louter:
	sub	$tj,$num,$tj		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
#ifdef	__thumb2__
	itt	ne
#endif
	movne	$tj,sp
	bne	.Louter

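@ Final reduction: subtract np from tp once; the resulting borrow, combined
@ with tp[num], selects whether rp receives tp or tp-np. The copy loop also
@ zaps tp so no intermediate data is left on the stack.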
	ldr	$rp,[$_rp]		@ pull rp
	mov	$aj,sp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,$aj		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

.Lcopy:	ldr	$tj,[$tp]		@ conditional copy
	ldr	$aj,[$rp]
	str	sp,[$tp],#4		@ zap tp
#ifdef	__thumb2__
	it	cc
#endif
	movcc	$aj,$tj
	str	$aj,[$rp],#4
	teq	$tp,$num		@ preserve carry
	bne	.Lcopy

	mov	sp,$num
	add	sp,sp,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	ret				@ bx lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
___
{
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my @ACC=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero="$Z#lo";
my $temp="$Temp#lo";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("r$_",(6..11));

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block
	mov	ip,sp

	cmp	$num,#8
	bhi	.LNEON_8n

	@ special case for $num==8, everything is in register bank...

	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	sub		$toutptr,sp,$num,lsl#4
	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
	and		$toutptr,$toutptr,#-64
	vld1.32		{${M0}[0]}, [$n0,:32]
	mov		sp,$toutptr			@ alloca
	vzip.16		$Bi,$zero

	vmull.u32	@ACC[0],$Bi,${A0}[0]
	vmull.u32	@ACC[1],$Bi,${A0}[1]
	vmull.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmull.u32	@ACC[3],$Bi,${A1}[1]

	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	veor		$zero,$zero,$zero
	vmul.u32	$Ni,$Ni,$M0

	vmull.u32	@ACC[4],$Bi,${A2}[0]
	 vld1.32	{$N0-$N3}, [$nptr]!
	vmull.u32	@ACC[5],$Bi,${A2}[1]
	vmull.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmull.u32	@ACC[7],$Bi,${A3}[1]

	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	sub		$outer,$num,#1
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]

	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmov		$Temp,@ACC[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmov		@ACC[0],@ACC[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmov		@ACC[1],@ACC[2]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vmov		@ACC[2],@ACC[3]
	vmov		@ACC[3],@ACC[4]
	vshr.u64	$temp,$temp,#16
	vmov		@ACC[4],@ACC[5]
	vmov		@ACC[5],@ACC[6]
	vadd.u64	$temp,$temp,$Temp#hi
	vmov		@ACC[6],@ACC[7]
	veor		@ACC[7],@ACC[7]
	vshr.u64	$temp,$temp,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32		{${Bi}[0]}, [$bptr,:32]!
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp

	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]

	vadd.u64	$Ni,$Ni,@ACC[0]#lo
	veor		$zero,$zero,$zero
	subs		$outer,$outer,#1
	vmul.u32	$Ni,$Ni,$M0

	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vzip.16		$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]

	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]

	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmov		$Temp,@ACC[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmov		@ACC[0],@ACC[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmov		@ACC[1],@ACC[2]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vmov		@ACC[2],@ACC[3]
	vmov		@ACC[3],@ACC[4]
	vshr.u64	$temp,$temp,#16
	vmov		@ACC[4],@ACC[5]
	vmov		@ACC[5],@ACC[6]
	vadd.u64	$temp,$temp,$Temp#hi
	vmov		@ACC[6],@ACC[7]
	veor		@ACC[7],@ACC[7]
	vshr.u64	$temp,$temp,#16

	bne	.LNEON_outer8

	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	mov		$toutptr,sp
	vshr.u64	$temp,@ACC[0]#lo,#16
	mov		$inner,$num
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	add		$tinptr,sp,#96
	vshr.u64	$temp,@ACC[0]#hi,#16
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

	b	.LNEON_tail_entry

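@ General case (.LNEON_8n): operands are processed eight 32-bit words at a
@ time. For each group of eight b words, the widened ("smashed") copies of
@ those b words and of the per-word reduction multipliers are parked on the
@ stack, and the inner loop replays them against successive eight-word
@ chunks of a and n, sliding a window of eight q-register accumulators over
@ the temporary result.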
.align	4
.LNEON_8n:
	veor		@ACC[0],@ACC[0],@ACC[0]
	 sub		$toutptr,sp,#128
	veor		@ACC[1],@ACC[1],@ACC[1]
	 sub		$toutptr,$toutptr,$num,lsl#4
	veor		@ACC[2],@ACC[2],@ACC[2]
	 and		$toutptr,$toutptr,#-64
	veor		@ACC[3],@ACC[3],@ACC[3]
	 mov		sp,$toutptr			@ alloca
	veor		@ACC[4],@ACC[4],@ACC[4]
	 add		$toutptr,$toutptr,#256
	veor		@ACC[5],@ACC[5],@ACC[5]
	 sub		$inner,$num,#8
	veor		@ACC[6],@ACC[6],@ACC[6]
	veor		@ACC[7],@ACC[7],@ACC[7]

.LNEON_8n_init:
	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	subs		$inner,$inner,#8
	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64		{@ACC[6]-@ACC[7]},[$toutptr,:256]!
	bne		.LNEON_8n_init

	add		$tinptr,sp,#256
	vld1.32		{$A0-$A3},[$aptr]!
	add		$bnptr,sp,#8
	vld1.32		{${M0}[0]},[$n0,:32]
	mov		$outer,$num
	b		.LNEON_8n_outer

.align	4
.LNEON_8n_outer:
	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
	veor		$zero,$zero,$zero
	vzip.16		$Bi,$zero
	add		$toutptr,sp,#128
	vld1.32		{$N0-$N3},[$nptr]!

	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	 veor		$zero,$zero,$zero
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	 vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	 vadd.u64	$Ni,$Ni,@ACC[0]#lo
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	 vmul.u32	$Ni,$Ni,$M0
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vst1.32		{$Bi},[sp,:64]		@ put aside smashed b[8*i+0]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	 vzip.16	$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
for ($i=0; $i<7;) {
$code.=<<___;
	vld1.32		{${Bi}[0]},[$bptr,:32]!	@ *b++
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	veor		$temp,$temp,$temp
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vzip.16		$Bi,$temp
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	 vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	 vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32		{$Ni},[$bnptr,:64]!	@ put aside smashed m[8*i+$i]
___
	push(@ACC,shift(@ACC));	$i++;
$code.=<<___;
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]!
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	 veor		$zero,$zero,$zero
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	 vshl.i64	$Ni,@ACC[0]#hi,#16
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	 vadd.u64	$Ni,$Ni,@ACC[0]#lo
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	 vmul.u32	$Ni,$Ni,$M0
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vst1.32		{$Bi},[$bnptr,:64]!	@ put aside smashed b[8*i+$i]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	 vzip.16	$Ni,$zero
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32		{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	 vadd.u64	@ACC[0]#lo,@ACC[0]#lo,@ACC[0]#hi
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	 vshr.u64	@ACC[0]#lo,@ACC[0]#lo,#16
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	 vadd.u64	@ACC[1]#lo,@ACC[1]#lo,@ACC[0]#lo
	vst1.32		{$Ni},[$bnptr,:64]	@ put aside smashed m[8*i+$i]
	add		$bnptr,sp,#8		@ rewind
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	sub		$inner,$num,#8
	b		.LNEON_8n_inner

.align	4
.LNEON_8n_inner:
	subs		$inner,$inner,#8
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+0]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	vld1.32		{$N0-$N3},[$nptr]!
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	it		ne
	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vld1.32		{$Bi},[$bnptr,:64]!	@ pull smashed b[8*i+$i]
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vmlal.u32	@ACC[7],$Ni,${N3}[1]
	vst1.64		{@ACC[0]},[$toutptr,:128]!
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	vmlal.u32	@ACC[0],$Bi,${A0}[0]
	vld1.64		{@ACC[7]},[$tinptr,:128]
	vmlal.u32	@ACC[1],$Bi,${A0}[1]
	vld1.32		{$Ni},[$bnptr,:64]!	@ pull smashed m[8*i+$i]
	vmlal.u32	@ACC[2],$Bi,${A1}[0]
	it		ne
	addne		$tinptr,$tinptr,#16	@ don't advance in last iteration
	vmlal.u32	@ACC[3],$Bi,${A1}[1]
	vmlal.u32	@ACC[4],$Bi,${A2}[0]
	vmlal.u32	@ACC[5],$Bi,${A2}[1]
	vmlal.u32	@ACC[6],$Bi,${A3}[0]
	vmlal.u32	@ACC[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	it		eq
	subeq		$aptr,$aptr,$num,lsl#2	@ rewind
	vmlal.u32	@ACC[0],$Ni,${N0}[0]
	vld1.32		{$Bi},[sp,:64]		@ pull smashed b[8*i+0]
	vmlal.u32	@ACC[1],$Ni,${N0}[1]
	vld1.32		{$A0-$A3},[$aptr]!
	vmlal.u32	@ACC[2],$Ni,${N1}[0]
	add		$bnptr,sp,#8		@ rewind
	vmlal.u32	@ACC[3],$Ni,${N1}[1]
	vmlal.u32	@ACC[4],$Ni,${N2}[0]
	vmlal.u32	@ACC[5],$Ni,${N2}[1]
	vmlal.u32	@ACC[6],$Ni,${N3}[0]
	vst1.64		{@ACC[0]},[$toutptr,:128]!
	vmlal.u32	@ACC[7],$Ni,${N3}[1]

	bne		.LNEON_8n_inner
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	add		$tinptr,sp,#128
	vst1.64		{@ACC[0]-@ACC[1]},[$toutptr,:256]!
	veor		q2,q2,q2		@ $N0-$N1
	vst1.64		{@ACC[2]-@ACC[3]},[$toutptr,:256]!
	veor		q3,q3,q3		@ $N2-$N3
	vst1.64		{@ACC[4]-@ACC[5]},[$toutptr,:256]!
	vst1.64		{@ACC[6]},[$toutptr,:128]

	subs		$outer,$outer,#8
	vld1.64		{@ACC[0]-@ACC[1]},[$tinptr,:256]!
	vld1.64		{@ACC[2]-@ACC[3]},[$tinptr,:256]!
	vld1.64		{@ACC[4]-@ACC[5]},[$tinptr,:256]!
	vld1.64		{@ACC[6]-@ACC[7]},[$tinptr,:256]!

	itt		ne
	subne		$nptr,$nptr,$num,lsl#2	@ rewind
	bne		.LNEON_8n_outer

	add		$toutptr,sp,#128
	vst1.64		{q2-q3}, [sp,:256]!	@ start wiping stack frame
	vshr.u64	$temp,@ACC[0]#lo,#16
	vst1.64		{q2-q3},[sp,:256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vst1.64		{q2-q3}, [sp,:256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vst1.64		{q2-q3}, [sp,:256]!
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

	mov		$inner,$num
	b		.LNEON_tail_entry

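@ Tail (.LNEON_tail/.LNEON_tail_entry): each 64-bit lane still holds a sum
@ of products of 16-bit halves, so the final carry propagation walks the
@ result in 16-bit steps and zips the halves back into 32-bit words as they
@ are stored out.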
.align	4
.LNEON_tail:
	vadd.u64	@ACC[0]#lo,@ACC[0]#lo,$temp
	vshr.u64	$temp,@ACC[0]#lo,#16
	vld1.64		{@ACC[2]-@ACC[3]}, [$tinptr, :256]!
	vadd.u64	@ACC[0]#hi,@ACC[0]#hi,$temp
	vld1.64		{@ACC[4]-@ACC[5]}, [$tinptr, :256]!
	vshr.u64	$temp,@ACC[0]#hi,#16
	vld1.64		{@ACC[6]-@ACC[7]}, [$tinptr, :256]!
	vzip.16		@ACC[0]#lo,@ACC[0]#hi

.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	vadd.u64	@ACC[1]#lo,@ACC[1]#lo,$temp
	vst1.32		{@ACC[0]#lo[0]}, [$toutptr, :32]!
	vshr.u64	$temp,@ACC[1]#lo,#16
	vadd.u64	@ACC[1]#hi,@ACC[1]#hi,$temp
	vshr.u64	$temp,@ACC[1]#hi,#16
	vzip.16		@ACC[1]#lo,@ACC[1]#hi
___
	push(@ACC,shift(@ACC));
}
	push(@ACC,shift(@ACC));
$code.=<<___;
	vld1.64		{@ACC[0]-@ACC[1]}, [$tinptr, :256]!
	subs		$inner,$inner,#8
	vst1.32		{@ACC[7]#lo[0]},   [$toutptr, :32]!
	bne	.LNEON_tail

	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
	subs	$aptr,sp,#0				@ clear carry flag
	add	$bptr,sp,$num,lsl#2

.LNEON_sub:
	ldmia	$aptr!, {r4-r7}
	ldmia	$nptr!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [$aptr]				@ load top-most bit
	mov	r11,sp
	veor	q0,q0,q0
	sub	r11,$bptr,r11				@ this is num*4
	veor	q1,q1,q1
	mov	$aptr,sp
	sub	$rptr,$rptr,r11				@ rewind $rptr
	mov	$nptr,$bptr				@ second 3/4th of frame
	sbcs	r10,r10,#0				@ result is carry flag

.LNEON_copy_n_zap:
	ldmia	$aptr!, {r4-r7}
	ldmia	$rptr,  {r8-r11}
	it	cc
	movcc	r8, r4
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	itt	cc
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	it	cc
	movcc	r11,r7
	ldmia	$aptr, {r4-r7}
	stmia	$rptr!, {r8-r11}
	sub	$aptr,$aptr,#16
	ldmia	$rptr, {r8-r11}
	it	cc
	movcc	r8, r4
	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
	itt	cc
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
	it	cc
	movcc	r11,r7
	teq	$aptr,$bptr				@ preserves carry
	stmia	$rptr!, {r8-r11}
	bne	.LNEON_copy_n_zap

	mov	sp,ip
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r11}
	ret						@ bx lr
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge	or
	s/\bret\b/bx    lr/g						or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/g;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";
