#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte
# [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
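
# Usage sketch (following the usual perlasm convention, as implemented by
# the argument handling below): perl sha256-armv4.pl [flavour] <output>,
# where a flavour such as "linux32" or "ios32" pipes the result through
# arm-xlate.pl, while "void" (or a bare output file) writes the output
# without translation, e.g.
#
#	perl sha256-armv4.pl linux32 sha256-armv4.S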

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" $xlate $flavour $output";
    *STDOUT=*OUT;
} else {
    open OUT,">$output";
    *STDOUT=*OUT;
}

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
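
# The code appended below runs for every round. In the standard SHA-256
# notation it computes
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + X[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	(a,b,c,d,e,f,g,h) = (T1+T2,a,b,c,d+T1,e,f,g)
#
# except that Maj is added one round late (the "h+=Maj(a,b,c) from the
# past" instructions above). Maj itself is computed incrementally as
# ((b^c)&(a^b))^b, which is why $t2 and $t3 swap roles at the end of
# every round.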
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
@ instructions are manually-encoded. (See unsha256.)
.arch	armv7-a

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
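@ Integer-only path. Stack layout after the prologue below: sp..sp+63
@ hold the circular X[0..15] buffer, sp+64 the ctx pointer, sp+68 the
@ input pointer, sp+72 the end-of-input pointer (the sp,#16*4..#18*4
@ references in the round code).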
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
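# Xupdate emits one NEON message-schedule step for four X[] words. The
# sigma0/sigma1 rotates are synthesized from vshr (shift right) plus
# vsli (shift left and insert), since NEON has no vector rotate. The
# scalar round code supplied via $body is interleaved between the vector
# instructions, two or three scalar instructions at a time, to keep both
# pipelines busy.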
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}
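
# body_00_15 returns one scalar SHA-256 round as a list of Perl snippets,
# roughly one instruction each, for Xupdate/Xpreload to interleave with
# the vector code above. $j counts generated rounds: normally the next
# X[] word is preloaded from the stack, at $j==15 (the last round of the
# .L_00_48 loop body) the next K256 word is loaded for the terminator
# check, and at $j==31 (the last preload round) the ctx pointer is pulled
# from sp+64. As in the integer code, Maj(a,b,c) is deferred to the next
# round and $t2/$t3 are swapped on the way out.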
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
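	# Four Xupdate calls generate the schedule for 16 message words and
	# interleave the 16 scalar rounds that make up one .L_00_48 loop body.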
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
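# The twelve iterations above cover rounds 0..47, each consuming one
# K256 quad and extending the message schedule with sha256su0/sha256su1.
# The final 16 rounds below need no further schedule updates, so the four
# remaining round quads are emitted straight-line.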
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif
___

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40   );

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, hence the
	    # byte-by-byte emission. The correct solution is to use the .inst
	    # directive, but older assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;	# expand backticked arithmetic

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;	# manually encode SHA-256 ops

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT"; # enforce flush