#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
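
# The module generates three entry points: sha256_block_data_order (the
# integer-only ARMv4 path, which also dispatches at run time to .LNEON or
# .LARMv8 depending on OPENSSL_armcap_P), sha256_block_data_order_neon and
# sha256_block_data_order_armv8.  As a rough sketch of the calling
# convention assumed throughout (the authoritative C-level prototype lives
# in the C sources, not here):
#
#	void sha256_block_data_order(void *ctx, const void *inp, size_t num);
#
# r0 ($ctx) points at the eight 32-bit state words, r1 ($inp) at the
# input, and r2 ($len) holds the number of 64-byte blocks to process.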

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
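	@ The Perl loops below fully unroll the 64 rounds: BODY_00_15
	@ covers rounds 0-15, then the block at .Lrounds_16_xx (BODY_16_XX)
	@ is re-entered until the last K256 constant (low byte 0xf2) has
	@ been consumed.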
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#ifdef	__thumb2__
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr	$inp,[sp,#68]
	ldr	$t0,[sp,#72]
	sub	$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq	$inp,$t0
	it	eq
	subeq	$inp,$inp,#64		@ avoid SEGV
	vld1.8	{@X[0]},[$inp]!		@ load next input block
	vld1.8	{@X[1]},[$inp]!
	vld1.8	{@X[2]},[$inp]!
	vld1.8	{@X[3]},[$inp]!
	it	ne
	strne	$inp,[sp,#68]
	mov	$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";	# enforce flush
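
# A minimal sketch of how this generator is typically invoked (the flavour
# and output file names below are illustrative, not taken from any build
# system):
#
#	perl sha256-armv4.pl linux32 sha256-armv4.S   # piped through arm-xlate.pl
#	perl sha256-armv4.pl void sha256-armv4.S      # raw perlasm output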