#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
39 40while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} 41open STDOUT,">$output"; 42 43$ctx="r0"; $t0="r0"; 44$inp="r1"; $t4="r1"; 45$len="r2"; $t1="r2"; 46$T1="r3"; $t3="r3"; 47$A="r4"; 48$B="r5"; 49$C="r6"; 50$D="r7"; 51$E="r8"; 52$F="r9"; 53$G="r10"; 54$H="r11"; 55@V=($A,$B,$C,$D,$E,$F,$G,$H); 56$t2="r12"; 57$Ktbl="r14"; 58 59@Sigma0=( 2,13,22); 60@Sigma1=( 6,11,25); 61@sigma0=( 7,18, 3); 62@sigma1=(17,19,10); 63 64sub BODY_00_15 { 65my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 66 67$code.=<<___ if ($i<16); 68#if __ARM_ARCH__>=7 69 @ ldr $t1,[$inp],#4 @ $i 70# if $i==15 71 str $inp,[sp,#17*4] @ make room for $t4 72# endif 73 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` 74 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past 75 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) 76# ifndef __ARMEB__ 77 rev $t1,$t1 78# endif 79#else 80 @ ldrb $t1,[$inp,#3] @ $i 81 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past 82 ldrb $t2,[$inp,#2] 83 ldrb $t0,[$inp,#1] 84 orr $t1,$t1,$t2,lsl#8 85 ldrb $t2,[$inp],#4 86 orr $t1,$t1,$t0,lsl#16 87# if $i==15 88 str $inp,[sp,#17*4] @ make room for $t4 89# endif 90 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` 91 orr $t1,$t1,$t2,lsl#24 92 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) 93#endif 94___ 95$code.=<<___; 96 ldr $t2,[$Ktbl],#4 @ *K256++ 97 add $h,$h,$t1 @ h+=X[i] 98 str $t1,[sp,#`$i%16`*4] 99 eor $t1,$f,$g 100 add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e) 101 and $t1,$t1,$e 102 add $h,$h,$t2 @ h+=K256[i] 103 eor $t1,$t1,$g @ Ch(e,f,g) 104 eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` 105 add $h,$h,$t1 @ h+=Ch(e,f,g) 106#if $i==31 107 and $t2,$t2,#0xff 108 cmp $t2,#0xf2 @ done? 
109#endif 110#if $i<15 111# if __ARM_ARCH__>=7 112 ldr $t1,[$inp],#4 @ prefetch 113# else 114 ldrb $t1,[$inp,#3] 115# endif 116 eor $t2,$a,$b @ a^b, b^c in next round 117#else 118 ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx 119 eor $t2,$a,$b @ a^b, b^c in next round 120 ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx 121#endif 122 eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a) 123 and $t3,$t3,$t2 @ (b^c)&=(a^b) 124 add $d,$d,$h @ d+=h 125 eor $t3,$t3,$b @ Maj(a,b,c) 126 add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a) 127 @ add $h,$h,$t3 @ h+=Maj(a,b,c) 128___ 129 ($t2,$t3)=($t3,$t2); 130} 131 132sub BODY_16_XX { 133my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 134 135$code.=<<___; 136 @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i 137 @ ldr $t4,[sp,#`($i+14)%16`*4] 138 mov $t0,$t1,ror#$sigma0[0] 139 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past 140 mov $t2,$t4,ror#$sigma1[0] 141 eor $t0,$t0,$t1,ror#$sigma0[1] 142 eor $t2,$t2,$t4,ror#$sigma1[1] 143 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) 144 ldr $t1,[sp,#`($i+0)%16`*4] 145 eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14]) 146 ldr $t4,[sp,#`($i+9)%16`*4] 147 148 add $t2,$t2,$t0 149 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15 150 add $t1,$t1,$t2 151 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) 152 add $t1,$t1,$t4 @ X[i] 153___ 154 &BODY_00_15(@_); 155} 156 157$code=<<___; 158#ifndef __KERNEL__ 159# include "arm_arch.h" 160#else 161# define __ARM_ARCH__ __LINUX_ARM_ARCH__ 162# define __ARM_MAX_ARCH__ 7 163#endif 164 165.text 166#if __ARM_ARCH__<7 167.code 32 168#else 169.syntax unified 170# ifdef __thumb2__ 171# define adrl adr 172.thumb 173# else 174.code 32 175# endif 176#endif 177 178.type K256,%object 179.align 5 180K256: 181.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 182.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 183.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 184.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 185.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 
186.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 187.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 188.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 189.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 190.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 191.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 192.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 193.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 194.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 195.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 196.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 197.size K256,.-K256 198.word 0 @ terminator 199#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) 200.LOPENSSL_armcap: 201.word OPENSSL_armcap_P-sha256_block_data_order 202#endif 203.align 5 204 205.global sha256_block_data_order 206.type sha256_block_data_order,%function 207sha256_block_data_order: 208.Lsha256_block_data_order: 209#if __ARM_ARCH__<7 210 sub r3,pc,#8 @ sha256_block_data_order 211#else 212 adr r3,.Lsha256_block_data_order 213#endif 214#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) 215 ldr r12,.LOPENSSL_armcap 216 ldr r12,[r3,r12] @ OPENSSL_armcap_P 217 tst r12,#ARMV8_SHA256 218 bne .LARMv8 219 tst r12,#ARMV7_NEON 220 bne .LNEON 221#endif 222 add $len,$inp,$len,lsl#6 @ len to point at the end of inp 223 stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} 224 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} 225 sub $Ktbl,r3,#256+32 @ K256 226 sub sp,sp,#16*4 @ alloca(X[16]) 227.Loop: 228# if __ARM_ARCH__>=7 229 ldr $t1,[$inp],#4 230# else 231 ldrb $t1,[$inp,#3] 232# endif 233 eor $t3,$B,$C @ magic 234 eor $t2,$t2,$t2 235___ 236for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } 237$code.=".Lrounds_16_xx:\n"; 238for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } 239$code.=<<___; 240#if __ARM_ARCH__>=7 241 ite eq @ Thumb2 thing, sanity check in ARM 242#endif 243 ldreq $t3,[sp,#16*4] @ pull ctx 244 bne .Lrounds_16_xx 245 246 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past 247 ldr 
$t0,[$t3,#0] 248 ldr $t1,[$t3,#4] 249 ldr $t2,[$t3,#8] 250 add $A,$A,$t0 251 ldr $t0,[$t3,#12] 252 add $B,$B,$t1 253 ldr $t1,[$t3,#16] 254 add $C,$C,$t2 255 ldr $t2,[$t3,#20] 256 add $D,$D,$t0 257 ldr $t0,[$t3,#24] 258 add $E,$E,$t1 259 ldr $t1,[$t3,#28] 260 add $F,$F,$t2 261 ldr $inp,[sp,#17*4] @ pull inp 262 ldr $t2,[sp,#18*4] @ pull inp+len 263 add $G,$G,$t0 264 add $H,$H,$t1 265 stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H} 266 cmp $inp,$t2 267 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl 268 bne .Loop 269 270 add sp,sp,#`16+3`*4 @ destroy frame 271#if __ARM_ARCH__>=5 272 ldmia sp!,{r4-r11,pc} 273#else 274 ldmia sp!,{r4-r11,lr} 275 tst lr,#1 276 moveq pc,lr @ be binary compatible with V4, yet 277 bx lr @ interoperable with Thumb ISA:-) 278#endif 279.size sha256_block_data_order,.-sha256_block_data_order 280___ 281###################################################################### 282# NEON stuff 283# 284{{{ 285my @X=map("q$_",(0..3)); 286my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); 287my $Xfer=$t4; 288my $j=0; 289 290sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } 291sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } 292 293sub AUTOLOAD() # thunk [simplified] x86-style perlasm 294{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; 295 my $arg = pop; 296 $arg = "#$arg" if ($arg*1 eq $arg); 297 $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; 298} 299 300sub Xupdate() 301{ use integer; 302 my $body = shift; 303 my @insns = (&$body,&$body,&$body,&$body); 304 my ($a,$b,$c,$d,$e,$f,$g,$h); 305 306 &vext_8 ($T0,@X[0],@X[1],4); # X[1..4] 307 eval(shift(@insns)); 308 eval(shift(@insns)); 309 eval(shift(@insns)); 310 &vext_8 ($T1,@X[2],@X[3],4); # X[9..12] 311 eval(shift(@insns)); 312 eval(shift(@insns)); 313 eval(shift(@insns)); 314 &vshr_u32 ($T2,$T0,$sigma0[0]); 315 eval(shift(@insns)); 316 eval(shift(@insns)); 317 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12] 318 eval(shift(@insns)); 319 eval(shift(@insns)); 320 &vshr_u32 
($T1,$T0,$sigma0[2]); 321 eval(shift(@insns)); 322 eval(shift(@insns)); 323 &vsli_32 ($T2,$T0,32-$sigma0[0]); 324 eval(shift(@insns)); 325 eval(shift(@insns)); 326 &vshr_u32 ($T3,$T0,$sigma0[1]); 327 eval(shift(@insns)); 328 eval(shift(@insns)); 329 &veor ($T1,$T1,$T2); 330 eval(shift(@insns)); 331 eval(shift(@insns)); 332 &vsli_32 ($T3,$T0,32-$sigma0[1]); 333 eval(shift(@insns)); 334 eval(shift(@insns)); 335 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]); 336 eval(shift(@insns)); 337 eval(shift(@insns)); 338 &veor ($T1,$T1,$T3); # sigma0(X[1..4]) 339 eval(shift(@insns)); 340 eval(shift(@insns)); 341 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]); 342 eval(shift(@insns)); 343 eval(shift(@insns)); 344 &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]); 345 eval(shift(@insns)); 346 eval(shift(@insns)); 347 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) 348 eval(shift(@insns)); 349 eval(shift(@insns)); 350 &veor ($T5,$T5,$T4); 351 eval(shift(@insns)); 352 eval(shift(@insns)); 353 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]); 354 eval(shift(@insns)); 355 eval(shift(@insns)); 356 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]); 357 eval(shift(@insns)); 358 eval(shift(@insns)); 359 &veor ($T5,$T5,$T4); # sigma1(X[14..15]) 360 eval(shift(@insns)); 361 eval(shift(@insns)); 362 &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) 363 eval(shift(@insns)); 364 eval(shift(@insns)); 365 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]); 366 eval(shift(@insns)); 367 eval(shift(@insns)); 368 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]); 369 eval(shift(@insns)); 370 eval(shift(@insns)); 371 &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]); 372 eval(shift(@insns)); 373 eval(shift(@insns)); 374 &veor ($T5,$T5,$T4); 375 eval(shift(@insns)); 376 eval(shift(@insns)); 377 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]); 378 eval(shift(@insns)); 379 eval(shift(@insns)); 380 &vld1_32 ("{$T0}","[$Ktbl,:128]!"); 381 eval(shift(@insns)); 382 eval(shift(@insns)); 383 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]); 384 eval(shift(@insns)); 
385 eval(shift(@insns)); 386 &veor ($T5,$T5,$T4); # sigma1(X[16..17]) 387 eval(shift(@insns)); 388 eval(shift(@insns)); 389 &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) 390 eval(shift(@insns)); 391 eval(shift(@insns)); 392 &vadd_i32 ($T0,$T0,@X[0]); 393 while($#insns>=2) { eval(shift(@insns)); } 394 &vst1_32 ("{$T0}","[$Xfer,:128]!"); 395 eval(shift(@insns)); 396 eval(shift(@insns)); 397 398 push(@X,shift(@X)); # "rotate" X[] 399} 400 401sub Xpreload() 402{ use integer; 403 my $body = shift; 404 my @insns = (&$body,&$body,&$body,&$body); 405 my ($a,$b,$c,$d,$e,$f,$g,$h); 406 407 eval(shift(@insns)); 408 eval(shift(@insns)); 409 eval(shift(@insns)); 410 eval(shift(@insns)); 411 &vld1_32 ("{$T0}","[$Ktbl,:128]!"); 412 eval(shift(@insns)); 413 eval(shift(@insns)); 414 eval(shift(@insns)); 415 eval(shift(@insns)); 416 &vrev32_8 (@X[0],@X[0]); 417 eval(shift(@insns)); 418 eval(shift(@insns)); 419 eval(shift(@insns)); 420 eval(shift(@insns)); 421 &vadd_i32 ($T0,$T0,@X[0]); 422 foreach (@insns) { eval; } # remaining instructions 423 &vst1_32 ("{$T0}","[$Xfer,:128]!"); 424 425 push(@X,shift(@X)); # "rotate" X[] 426} 427 428sub body_00_15 () { 429 ( 430 '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. 431 '&add ($h,$h,$t1)', # h+=X[i]+K[i] 432 '&eor ($t1,$f,$g)', 433 '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', 434 '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past 435 '&and ($t1,$t1,$e)', 436 '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) 437 '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', 438 '&eor ($t1,$t1,$g)', # Ch(e,f,g) 439 '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e) 440 '&eor ($t2,$a,$b)', # a^b, b^c in next round 441 '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) 442 '&add ($h,$h,$t1)', # h+=Ch(e,f,g) 443 '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. 444 '&ldr ($t1,"[$Ktbl]") if ($j==15);'. 
445 '&ldr ($t1,"[sp,#64]") if ($j==31)', 446 '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) 447 '&add ($d,$d,$h)', # d+=h 448 '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a) 449 '&eor ($t3,$t3,$b)', # Maj(a,b,c) 450 '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' 451 ) 452} 453 454$code.=<<___; 455#if __ARM_MAX_ARCH__>=7 456.arch armv7-a 457.fpu neon 458 459.global sha256_block_data_order_neon 460.type sha256_block_data_order_neon,%function 461.align 4 462sha256_block_data_order_neon: 463.LNEON: 464 stmdb sp!,{r4-r12,lr} 465 466 sub $H,sp,#16*4+16 467 adrl $Ktbl,K256 468 bic $H,$H,#15 @ align for 128-bit stores 469 mov $t2,sp 470 mov sp,$H @ alloca 471 add $len,$inp,$len,lsl#6 @ len to point at the end of inp 472 473 vld1.8 {@X[0]},[$inp]! 474 vld1.8 {@X[1]},[$inp]! 475 vld1.8 {@X[2]},[$inp]! 476 vld1.8 {@X[3]},[$inp]! 477 vld1.32 {$T0},[$Ktbl,:128]! 478 vld1.32 {$T1},[$Ktbl,:128]! 479 vld1.32 {$T2},[$Ktbl,:128]! 480 vld1.32 {$T3},[$Ktbl,:128]! 481 vrev32.8 @X[0],@X[0] @ yes, even on 482 str $ctx,[sp,#64] 483 vrev32.8 @X[1],@X[1] @ big-endian 484 str $inp,[sp,#68] 485 mov $Xfer,sp 486 vrev32.8 @X[2],@X[2] 487 str $len,[sp,#72] 488 vrev32.8 @X[3],@X[3] 489 str $t2,[sp,#76] @ save original sp 490 vadd.i32 $T0,$T0,@X[0] 491 vadd.i32 $T1,$T1,@X[1] 492 vst1.32 {$T0},[$Xfer,:128]! 493 vadd.i32 $T2,$T2,@X[2] 494 vst1.32 {$T1},[$Xfer,:128]! 495 vadd.i32 $T3,$T3,@X[3] 496 vst1.32 {$T2},[$Xfer,:128]! 497 vst1.32 {$T3},[$Xfer,:128]! 
498 499 ldmia $ctx,{$A-$H} 500 sub $Xfer,$Xfer,#64 501 ldr $t1,[sp,#0] 502 eor $t2,$t2,$t2 503 eor $t3,$B,$C 504 b .L_00_48 505 506.align 4 507.L_00_48: 508___ 509 &Xupdate(\&body_00_15); 510 &Xupdate(\&body_00_15); 511 &Xupdate(\&body_00_15); 512 &Xupdate(\&body_00_15); 513$code.=<<___; 514 teq $t1,#0 @ check for K256 terminator 515 ldr $t1,[sp,#0] 516 sub $Xfer,$Xfer,#64 517 bne .L_00_48 518 519 ldr $inp,[sp,#68] 520 ldr $t0,[sp,#72] 521 sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl 522 teq $inp,$t0 523 it eq 524 subeq $inp,$inp,#64 @ avoid SEGV 525 vld1.8 {@X[0]},[$inp]! @ load next input block 526 vld1.8 {@X[1]},[$inp]! 527 vld1.8 {@X[2]},[$inp]! 528 vld1.8 {@X[3]},[$inp]! 529 it ne 530 strne $inp,[sp,#68] 531 mov $Xfer,sp 532___ 533 &Xpreload(\&body_00_15); 534 &Xpreload(\&body_00_15); 535 &Xpreload(\&body_00_15); 536 &Xpreload(\&body_00_15); 537$code.=<<___; 538 ldr $t0,[$t1,#0] 539 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past 540 ldr $t2,[$t1,#4] 541 ldr $t3,[$t1,#8] 542 ldr $t4,[$t1,#12] 543 add $A,$A,$t0 @ accumulate 544 ldr $t0,[$t1,#16] 545 add $B,$B,$t2 546 ldr $t2,[$t1,#20] 547 add $C,$C,$t3 548 ldr $t3,[$t1,#24] 549 add $D,$D,$t4 550 ldr $t4,[$t1,#28] 551 add $E,$E,$t0 552 str $A,[$t1],#4 553 add $F,$F,$t2 554 str $B,[$t1],#4 555 add $G,$G,$t3 556 str $C,[$t1],#4 557 add $H,$H,$t4 558 str $D,[$t1],#4 559 stmia $t1,{$E-$H} 560 561 ittte ne 562 movne $Xfer,sp 563 ldrne $t1,[sp,#0] 564 eorne $t2,$t2,$t2 565 ldreq sp,[sp,#76] @ restore original sp 566 itt ne 567 eorne $t3,$B,$C 568 bne .L_00_48 569 570 ldmia sp!,{r4-r12,pc} 571.size sha256_block_data_order_neon,.-sha256_block_data_order_neon 572#endif 573___ 574}}} 575###################################################################### 576# ARMv8 stuff 577# 578{{{ 579my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2)); 580my @MSG=map("q$_",(8..11)); 581my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15)); 582my $Ktbl="r3"; 583 584$code.=<<___; 585#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) 586 587# ifdef __thumb2__ 
588# define INST(a,b,c,d) .byte c,d|0xc,a,b 589# else 590# define INST(a,b,c,d) .byte a,b,c,d 591# endif 592 593.type sha256_block_data_order_armv8,%function 594.align 5 595sha256_block_data_order_armv8: 596.LARMv8: 597 vld1.32 {$ABCD,$EFGH},[$ctx] 598# ifdef __thumb2__ 599 adr $Ktbl,.LARMv8 600 sub $Ktbl,$Ktbl,#.LARMv8-K256 601# else 602 adrl $Ktbl,K256 603# endif 604 add $len,$inp,$len,lsl#6 @ len to point at the end of inp 605 606.Loop_v8: 607 vld1.8 {@MSG[0]-@MSG[1]},[$inp]! 608 vld1.8 {@MSG[2]-@MSG[3]},[$inp]! 609 vld1.32 {$W0},[$Ktbl]! 610 vrev32.8 @MSG[0],@MSG[0] 611 vrev32.8 @MSG[1],@MSG[1] 612 vrev32.8 @MSG[2],@MSG[2] 613 vrev32.8 @MSG[3],@MSG[3] 614 vmov $ABCD_SAVE,$ABCD @ offload 615 vmov $EFGH_SAVE,$EFGH 616 teq $inp,$len 617___ 618for($i=0;$i<12;$i++) { 619$code.=<<___; 620 vld1.32 {$W1},[$Ktbl]! 621 vadd.i32 $W0,$W0,@MSG[0] 622 sha256su0 @MSG[0],@MSG[1] 623 vmov $abcd,$ABCD 624 sha256h $ABCD,$EFGH,$W0 625 sha256h2 $EFGH,$abcd,$W0 626 sha256su1 @MSG[0],@MSG[2],@MSG[3] 627___ 628 ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); 629} 630$code.=<<___; 631 vld1.32 {$W1},[$Ktbl]! 632 vadd.i32 $W0,$W0,@MSG[0] 633 vmov $abcd,$ABCD 634 sha256h $ABCD,$EFGH,$W0 635 sha256h2 $EFGH,$abcd,$W0 636 637 vld1.32 {$W0},[$Ktbl]! 
638 vadd.i32 $W1,$W1,@MSG[1] 639 vmov $abcd,$ABCD 640 sha256h $ABCD,$EFGH,$W1 641 sha256h2 $EFGH,$abcd,$W1 642 643 vld1.32 {$W1},[$Ktbl] 644 vadd.i32 $W0,$W0,@MSG[2] 645 sub $Ktbl,$Ktbl,#256-16 @ rewind 646 vmov $abcd,$ABCD 647 sha256h $ABCD,$EFGH,$W0 648 sha256h2 $EFGH,$abcd,$W0 649 650 vadd.i32 $W1,$W1,@MSG[3] 651 vmov $abcd,$ABCD 652 sha256h $ABCD,$EFGH,$W1 653 sha256h2 $EFGH,$abcd,$W1 654 655 vadd.i32 $ABCD,$ABCD,$ABCD_SAVE 656 vadd.i32 $EFGH,$EFGH,$EFGH_SAVE 657 it ne 658 bne .Loop_v8 659 660 vst1.32 {$ABCD,$EFGH},[$ctx] 661 662 ret @ bx lr 663.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 664#endif 665___ 666}}} 667$code.=<<___; 668.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" 669.align 2 670#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) 671.comm OPENSSL_armcap_P,4,4 672#endif 673___ 674 675open SELF,$0; 676while(<SELF>) { 677 next if (/^#!/); 678 last if (!s/^#/@/ and !/^$/); 679 print; 680} 681close SELF; 682 683{ my %opcode = ( 684 "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40, 685 "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 ); 686 687 sub unsha256 { 688 my ($mnemonic,$arg)=@_; 689 690 if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) { 691 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) 692 |(($2&7)<<17)|(($2&8)<<4) 693 |(($3&7)<<1) |(($3&8)<<2); 694 # since ARMv7 instructions are always encoded little-endian. 
695 # correct solution is to use .inst directive, but older 696 # assemblers don't implement it:-( 697 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", 698 $word&0xff,($word>>8)&0xff, 699 ($word>>16)&0xff,($word>>24)&0xff, 700 $mnemonic,$arg; 701 } 702 } 703} 704 705foreach (split($/,$code)) { 706 707 s/\`([^\`]*)\`/eval $1/geo; 708 709 s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo; 710 711 s/\bret\b/bx lr/go or 712 s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 713 714 print $_,"\n"; 715} 716 717close STDOUT; # enforce flush 718