#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
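
# For reference (FIPS 180-4), the SHA-256 functions whose rotation
# amounts are tabulated in @Sigma0/@Sigma1/@sigma0/@sigma1 below:
#
#	Sigma0(x) = (x ror 2)  ^ (x ror 13) ^ (x ror 22)
#	Sigma1(x) = (x ror 6)  ^ (x ror 11) ^ (x ror 25)
#	sigma0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
#	sigma1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
#
# The integer code computes e.g. Sigma1(e) as
# ((e ^ (e ror 5) ^ (e ror 19)) ror 6), i.e. with rotations relative
# to the smallest amount, so the leading rotate folds into the final
# "add ...,ror#6" and no separate mov is needed.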

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
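	@ 0xf2 is the low byte of K256[63]==0xc67178f2 and of no other
	@ K256 entry, so a match means all 64 rounds have been processed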
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include <openssl/arm_arch.h>
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
@ instructions are manually-encoded. (See unsha256.)
.arch	armv7-a

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
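	@ frame layout once sp is dropped by #16*4 below: X[0..15] at
	@ [sp,#0..15*4], then the saved ctx/inp/end-of-input words at
	@ [sp,#16*4], [sp,#17*4] and [sp,#18*4], pulled again at the
	@ bottom of .Loop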
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
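	# Note: sigma1() runs in two 2-lane passes (Dhi(@X[3]) here,
	# Dlo(@X[0]) below) because the last two new schedule words
	# depend on the first two computed in this very call.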
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")			if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")		if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
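	# $t3 holds (b^c), which is last round's (a^b) after the register
	# rotation; Maj(a,b,c)=((a^b)&(b^c))^b, hence the $t2/$t3 swap in
	# the closing element below and the "from the past" adds above.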
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
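	@ if this was the last block, $inp was rewound above to avoid
	@ reading past the end; the data (re)loaded here is never used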
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
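	@ K256 words keep ping-ponging between $W0 and $W1, so each
	@ vld1.32 is issued well ahead of the rounds that consume it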
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif
___

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, hence
	    # the byte order below; the correct solution would be the
	    # .inst directive, but older assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush
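
# Usage sketch (an assumption for illustration; valid flavour names are
# whatever arm-xlate.pl accepts, and the output file name is the caller's
# choice):
#
#	perl sha256-armv4.pl linux32 sha256-armv4.S
#	perl sha256-armv4.pl ios32   sha256-armv4.S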