#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in
# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles
# per byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" $xlate $flavour $output";
    *STDOUT=*OUT;
} else {
    open OUT,">$output";
    *STDOUT=*OUT;
}

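# Register usage (see assignments below): r0-r2 carry the (ctx,inp,len)
# arguments and, once those are stashed on the stack, double as the
# scratch registers $t0/$t4/$t1; r3 is $T1/$t3, r4-r11 hold the working
# variables a..h, r12 is $t2 and r14 ($Ktbl) walks the K256 table.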
$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

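# The arrays above hold the rotate/shift amounts of the SHA-256 functions
# Sigma0(x)=ROTR2^ROTR13^ROTR22, Sigma1(x)=ROTR6^ROTR11^ROTR25,
# sigma0(x)=ROTR7^ROTR18^SHR3 and sigma1(x)=ROTR17^ROTR19^SHR10 (FIPS 180-4).
#
# BODY_00_15 emits one integer-only round. Rounds 0..15 also load the next
# message word (byte-swapping it on little-endian targets), and the
# Maj(a,b,c) term of each round is deferred and folded into the following
# round ("from the past" in the comments below).
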
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

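# BODY_16_XX expands the message schedule for rounds 16..63,
# X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16],
# keeping the sixteen live schedule words in a ring buffer on the stack,
# then falls through to BODY_00_15 for the round itself.
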
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
@ instructions are manually-encoded. (See unsha256.)
.arch  armv7-a

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code   32
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

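# Dlo()/Dhi() map a NEON quad register qN to its low and high double-word
# halves, d(2N) and d(2N+1).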
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

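# AUTOLOAD catches calls to undefined subs and turns them into assembly
# lines appended to $code: the package prefix is dropped, underscores
# become dots and a numeric last argument becomes an immediate.  For
# example, &vshr_u32($T2,$T0,7) emits "vshr.u32 q10,q8,#7".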
sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

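# Xupdate emits the NEON code that advances the message schedule by four
# words (sigma0 of X[1..4], sigma1 of the two newest words, plus the
# X[9..12] term), loads the next K256 quadruple, and stores W+K to the
# stack for the scalar rounds, interleaving the instruction strings
# produced by $body between the vector instructions.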
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

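# Xpreload covers the last 16 rounds of a block: instead of updating the
# schedule it loads the next K256 quadruple, byte-swaps the already loaded
# input words and stores W+K to the stack, again interleaving the scalar
# round code supplied by $body.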
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

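# body_00_15 returns one scalar round as a list of Perl strings; Xupdate
# and Xpreload eval them a few at a time so the integer-unit round code is
# interleaved with the NEON schedule computation.  W[i]+K[i] has already
# been pre-added and stored on the stack, so each round simply reloads it.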
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

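# The ARMv8 path uses the Crypto Extensions instructions sha256h/sha256h2/
# sha256su0/sha256su1 and processes the 64 rounds four at a time: twelve
# quad-rounds that also update the message schedule, followed by four
# final quad-rounds.  The mnemonics are converted to raw .byte encodings
# by unsha256() below so pre-ARMv8 assemblers can still build this file.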
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___

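# Re-emit this script's own leading comment block (license and notes) at
# the top of the generated assembly, turning '#' comments into '@' comments.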
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

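    # unsha256() assembles a sha256* mnemonic by hand: it packs the
    # q-register numbers into the base opcode above and emits the result
    # through the INST macro as raw bytes, least-significant byte first.
    # For instance (derived mechanically from the code below),
    # "sha256h q0,q1,q2" should come out as INST(0x44,0x0c,0x02,0xf3).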
    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # The bytes are emitted in this order because ARMv7 instructions
	    # are always encoded little-endian.  The correct solution would
	    # be to use the .inst directive, but older assemblers don't
	    # implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

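# Final pass over $code: evaluate the `...` arithmetic, replace sha256*
# mnemonics with their manual encodings, and rewrite "ret"/"bx lr" so the
# result still assembles with -march=armv4.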
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT"; # enforce flush