#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for ARMv8.
#
# June 2017.
#
# This is straightforward KECCAK_1X_ALT implementation. It makes no
# sense to attempt SIMD/NEON implementation for following reason.
# 64-bit lanes of vector registers can't be addressed as easily as in
# 32-bit mode. This means that 64-bit NEON is bound to be slower than
# 32-bit NEON, and this implementation is faster than 32-bit NEON on
# same processor. Even though it takes more scalar xor's and andn's,
# it gets compensated by availability of rotate. Not to forget that
# most processors achieve higher issue rate with scalar instructions.
#
# February 2018.
#
# Add hardware-assisted ARMv8.2 implementation. It's KECCAK_1X_ALT
# variant with register permutation/rotation twist that allows to
# eliminate copies to temporary registers. If you look closely you'll
# notice that it uses only one lane of vector registers. The new
# instructions effectively facilitate parallel hashing, which we don't
# support [yet?]. But lowest-level core procedure is prepared for it.
# The inner round is 67 [vector] instructions, so it's not actually
# obvious that it will provide performance improvement [in serial
# hash] as long as vector instructions issue rate is limited to 1 per
# cycle...
#
######################################################################
# Numbers are cycles per processed byte.
#
#		r=1088(*)
#
# Cortex-A53	13
# Cortex-A57	12
# X-Gene	14
# Mongoose	10
# Kryo		12
# Denver	7.8
# Apple A7	7.2
#
# (*)	Corresponds to SHA3-256. No improvement coefficients are listed
#	because they vary too much from compiler to compiler. Newer
#	compiler does much better and improvement varies from 5% on
#	Cortex-A57 to 25% on Cortex-A53. While in comparison to older
#	compiler this code is at least 2x faster...

# Command line: <flavour> <output>. Both values are handed unchanged to
# arm-xlate.pl, which receives everything this script prints.
$flavour = shift;
$output  = shift;

# Look for arm-xlate.pl next to this script, then in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# From here on STDOUT is a pipe into arm-xlate.pl. Stop immediately if the
# pipe cannot be set up instead of printing into a dead handle.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Rotation amounts for the Rho step, indexed the same way as @A. The
# generated code rotates right by 64-n.
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

# Round-constant table. It is aligned to 256 bytes and preceded by 64 bytes
# of padding, so the 24 constants end on a 256-byte boundary: the pointer
# walking the table has its low 8 bits clear right after the last constant
# was fetched, which is what KeccakF1600_int tests to leave its loop.
$code.=<<___;
.text

.align 8	// strategic alignment and padding that allows to use
		// address value as loop termination condition...
	.quad	0,0,0,0,0,0,0,0
.type	iotas,%object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
___
								{{{
# Scalar implementation. The 25 state lanes are kept in x0-x24, row by row,
# except that A[3][3] lives in x25 because x18 is reserved. @C names the
# scratch registers.
my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
            (0, 5, 10, 15, 20));
   $A[3][3] = "x25"; # x18 is reserved

my @C = map("x$_", (26,27,28,30));

$code.=<<___;
.type	KeccakF1600_int,%function
.align	5
KeccakF1600_int:
	adr	$C[2],iotas
	.inst	0xd503233f			// paciasp
	stp	$C[2],x30,[sp,#16]		// 32 bytes on top are mine
	b	.Loop
.align	4
.Loop:
	////////////////////////////////////////// Theta
	eor	$C[0],$A[0][0],$A[1][0]
	stp	$A[0][4],$A[1][4],[sp,#0]	// offload pair...
	eor	$C[1],$A[0][1],$A[1][1]
	eor	$C[2],$A[0][2],$A[1][2]
	eor	$C[3],$A[0][3],$A[1][3]
___
	# A[0][4] and A[1][4] were just stored to the stack, so their
	# registers are borrowed as two extra temporaries until the pair
	# is loaded back.
	$C[4]=$A[0][4];
	$C[5]=$A[1][4];
$code.=<<___;
	eor	$C[4],$A[0][4],$A[1][4]
	eor	$C[0],$C[0],$A[2][0]
	eor	$C[1],$C[1],$A[2][1]
	eor	$C[2],$C[2],$A[2][2]
	eor	$C[3],$C[3],$A[2][3]
	eor	$C[4],$C[4],$A[2][4]
	eor	$C[0],$C[0],$A[3][0]
	eor	$C[1],$C[1],$A[3][1]
	eor	$C[2],$C[2],$A[3][2]
	eor	$C[3],$C[3],$A[3][3]
	eor	$C[4],$C[4],$A[3][4]
	eor	$C[0],$C[0],$A[4][0]
	eor	$C[2],$C[2],$A[4][2]
	eor	$C[1],$C[1],$A[4][1]
	eor	$C[3],$C[3],$A[4][3]
	eor	$C[4],$C[4],$A[4][4]

	eor	$C[5],$C[0],$C[2],ror#63

	eor	$A[0][1],$A[0][1],$C[5]
	eor	$A[1][1],$A[1][1],$C[5]
	eor	$A[2][1],$A[2][1],$C[5]
	eor	$A[3][1],$A[3][1],$C[5]
	eor	$A[4][1],$A[4][1],$C[5]

	eor	$C[5],$C[1],$C[3],ror#63
	eor	$C[2],$C[2],$C[4],ror#63
	eor	$C[3],$C[3],$C[0],ror#63
	eor	$C[4],$C[4],$C[1],ror#63

	eor	$C[1],   $A[0][2],$C[5]		// mov	$C[1],$A[0][2]
	eor	$A[1][2],$A[1][2],$C[5]
	eor	$A[2][2],$A[2][2],$C[5]
	eor	$A[3][2],$A[3][2],$C[5]
	eor	$A[4][2],$A[4][2],$C[5]

	eor	$A[0][0],$A[0][0],$C[4]
	eor	$A[1][0],$A[1][0],$C[4]
	eor	$A[2][0],$A[2][0],$C[4]
	eor	$A[3][0],$A[3][0],$C[4]
	eor	$A[4][0],$A[4][0],$C[4]
___
	# The borrowed registers get their state lanes back next; drop the
	# temporary names.
	$C[4]=undef;
	$C[5]=undef;
$code.=<<___;
	ldp	$A[0][4],$A[1][4],[sp,#0]	// re-load offloaded data
	eor	$C[0],   $A[0][3],$C[2]		// mov	$C[0],$A[0][3]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[4][3],$A[4][3],$C[2]

	eor	$C[2],   $A[0][4],$C[3]		// mov	$C[2],$A[0][4]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[4][4],$A[4][4],$C[3]

	////////////////////////////////////////// Rho+Pi
	mov	$C[3],$A[0][1]
	ror	$A[0][1],$A[1][1],#64-$rhotates[1][1]
	//mov	$C[1],$A[0][2]
	ror	$A[0][2],$A[2][2],#64-$rhotates[2][2]
	//mov	$C[0],$A[0][3]
	ror	$A[0][3],$A[3][3],#64-$rhotates[3][3]
	//mov	$C[2],$A[0][4]
	ror	$A[0][4],$A[4][4],#64-$rhotates[4][4]

	ror	$A[1][1],$A[1][4],#64-$rhotates[1][4]
	ror	$A[2][2],$A[2][3],#64-$rhotates[2][3]
	ror	$A[3][3],$A[3][2],#64-$rhotates[3][2]
	ror	$A[4][4],$A[4][1],#64-$rhotates[4][1]

	ror	$A[1][4],$A[4][2],#64-$rhotates[4][2]
	ror	$A[2][3],$A[3][4],#64-$rhotates[3][4]
	ror	$A[3][2],$A[2][1],#64-$rhotates[2][1]
	ror	$A[4][1],$A[1][3],#64-$rhotates[1][3]

	ror	$A[4][2],$A[2][4],#64-$rhotates[2][4]
	ror	$A[3][4],$A[4][3],#64-$rhotates[4][3]
	ror	$A[2][1],$A[1][2],#64-$rhotates[1][2]
	ror	$A[1][3],$A[3][1],#64-$rhotates[3][1]

	ror	$A[2][4],$A[4][0],#64-$rhotates[4][0]
	ror	$A[4][3],$A[3][0],#64-$rhotates[3][0]
	ror	$A[1][2],$A[2][0],#64-$rhotates[2][0]
	ror	$A[3][1],$A[1][0],#64-$rhotates[1][0]

	ror	$A[1][0],$C[0],#64-$rhotates[0][3]
	ror	$A[2][0],$C[3],#64-$rhotates[0][1]
	ror	$A[3][0],$C[2],#64-$rhotates[0][4]
	ror	$A[4][0],$C[1],#64-$rhotates[0][2]

	////////////////////////////////////////// Chi+Iota
	bic	$C[0],$A[0][2],$A[0][1]
	bic	$C[1],$A[0][3],$A[0][2]
	bic	$C[2],$A[0][0],$A[0][4]
	bic	$C[3],$A[0][1],$A[0][0]
	eor	$A[0][0],$A[0][0],$C[0]
	bic	$C[0],$A[0][4],$A[0][3]
	eor	$A[0][1],$A[0][1],$C[1]
	 ldr	$C[1],[sp,#16]
	eor	$A[0][3],$A[0][3],$C[2]
	eor	$A[0][4],$A[0][4],$C[3]
	eor	$A[0][2],$A[0][2],$C[0]
	 ldr	$C[3],[$C[1]],#8		// Iota[i++]

	bic	$C[0],$A[1][2],$A[1][1]
	 tst	$C[1],#255			// are we done?
	 str	$C[1],[sp,#16]
	bic	$C[1],$A[1][3],$A[1][2]
	bic	$C[2],$A[1][0],$A[1][4]
	 eor	$A[0][0],$A[0][0],$C[3]		// A[0][0] ^= Iota
	bic	$C[3],$A[1][1],$A[1][0]
	eor	$A[1][0],$A[1][0],$C[0]
	bic	$C[0],$A[1][4],$A[1][3]
	eor	$A[1][1],$A[1][1],$C[1]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[1][2],$A[1][2],$C[0]

	bic	$C[0],$A[2][2],$A[2][1]
	bic	$C[1],$A[2][3],$A[2][2]
	bic	$C[2],$A[2][0],$A[2][4]
	bic	$C[3],$A[2][1],$A[2][0]
	eor	$A[2][0],$A[2][0],$C[0]
	bic	$C[0],$A[2][4],$A[2][3]
	eor	$A[2][1],$A[2][1],$C[1]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[2][2],$A[2][2],$C[0]

	bic	$C[0],$A[3][2],$A[3][1]
	bic	$C[1],$A[3][3],$A[3][2]
	bic	$C[2],$A[3][0],$A[3][4]
	bic	$C[3],$A[3][1],$A[3][0]
	eor	$A[3][0],$A[3][0],$C[0]
	bic	$C[0],$A[3][4],$A[3][3]
	eor	$A[3][1],$A[3][1],$C[1]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[3][2],$A[3][2],$C[0]

	bic	$C[0],$A[4][2],$A[4][1]
	bic	$C[1],$A[4][3],$A[4][2]
	bic	$C[2],$A[4][0],$A[4][4]
	bic	$C[3],$A[4][1],$A[4][0]
	eor	$A[4][0],$A[4][0],$C[0]
	bic	$C[0],$A[4][4],$A[4][3]
	eor	$A[4][1],$A[4][1],$C[1]
	eor	$A[4][3],$A[4][3],$C[2]
	eor	$A[4][4],$A[4][4],$C[3]
	eor	$A[4][2],$A[4][2],$C[0]

	bne	.Loop

	ldr	x30,[sp,#24]
	.inst	0xd50323bf			// autiasp
	ret
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600,%function
.align	5
KeccakF1600:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#48

	str	x0,[sp,#32]			// offload argument
	mov	$C[0],x0
	ldp	$A[0][0],$A[0][1],[x0,#16*0]
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
	ldr	$A[4][4],[$C[0],#16*12]

	bl	KeccakF1600_int

	ldr	$C[0],[sp,#32]
	stp	$A[0][0],$A[0][1],[$C[0],#16*0]
	stp	$A[0][2],$A[0][3],[$C[0],#16*1]
	stp	$A[0][4],$A[1][0],[$C[0],#16*2]
	stp	$A[1][1],$A[1][2],[$C[0],#16*3]
	stp	$A[1][3],$A[1][4],[$C[0],#16*4]
	stp	$A[2][0],$A[2][1],[$C[0],#16*5]
	stp	$A[2][2],$A[2][3],[$C[0],#16*6]
	stp	$A[2][4],$A[3][0],[$C[0],#16*7]
	stp	$A[3][1],$A[3][2],[$C[0],#16*8]
	stp	$A[3][3],$A[3][4],[$C[0],#16*9]
	stp	$A[4][0],$A[4][1],[$C[0],#16*10]
	stp	$A[4][2],$A[4][3],[$C[0],#16*11]
	str	$A[4][4],[$C[0],#16*12]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#48
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	.inst	0xd50323bf			// autiasp
	ret
.size	KeccakF1600,.-KeccakF1600

.globl	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	stp	x0,x1,[sp,#32]			// offload arguments
	stp	x2,x3,[sp,#48]

	mov	$C[0],x0			// uint64_t A[5][5]
	mov	$C[1],x1			// const void *inp
	mov	$C[2],x2			// size_t len
	mov	$C[3],x3			// size_t bsz
	ldp	$A[0][0],$A[0][1],[$C[0],#16*0]
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
	ldr	$A[4][4],[$C[0],#16*12]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	$C[0],$C[2],$C[3]		// len - bsz
	blo	.Labsorbed

	str	$C[0],[sp,#48]			// save len - bsz
___
# Unrolled absorb: each pass XORs two input lanes into the state and
# branches to .Lprocess_block as soon as bsz bytes have been consumed.
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
	cmp	$C[3],#8*($i+2)
	blo	.Lprocess_block
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
	beq	.Lprocess_block
___
}
$code.=<<___;
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[4][4],$A[4][4],$C[0]

.Lprocess_block:
	str	$C[1],[sp,#40]			// save inp

	bl	KeccakF1600_int

	ldr	$C[1],[sp,#40]			// restore arguments
	ldp	$C[2],$C[3],[sp,#48]
	b	.Loop_absorb

.align	4
.Labsorbed:
	ldr	$C[1],[sp,#32]
	stp	$A[0][0],$A[0][1],[$C[1],#16*0]
	stp	$A[0][2],$A[0][3],[$C[1],#16*1]
	stp	$A[0][4],$A[1][0],[$C[1],#16*2]
	stp	$A[1][1],$A[1][2],[$C[1],#16*3]
	stp	$A[1][3],$A[1][4],[$C[1],#16*4]
	stp	$A[2][0],$A[2][1],[$C[1],#16*5]
	stp	$A[2][2],$A[2][3],[$C[1],#16*6]
	stp	$A[2][4],$A[3][0],[$C[1],#16*7]
	stp	$A[3][1],$A[3][2],[$C[1],#16*8]
	stp	$A[3][3],$A[3][4],[$C[1],#16*9]
	stp	$A[4][0],$A[4][1],[$C[1],#16*10]
	stp	$A[4][2],$A[4][3],[$C[1],#16*11]
	str	$A[4][4],[$C[1],#16*12]

	mov	x0,$C[2]			// return value
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_absorb,.-SHA3_absorb
___
{
# SHA3_squeeze keeps copies of its four arguments in x19-x22, which it
# saves on entry and restores on exit.
my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	mov	$A_flat,x0			// put aside arguments
	mov	$out,x1
	mov	$len,x2
	mov	$bsz,x3

.Loop_squeeze:
	ldr	x4,[x0],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8
	bhi	.Loop_squeeze

	mov	x0,$A_flat
	bl	KeccakF1600
	mov	x0,$A_flat
	mov	x3,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
	.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
___
}								}}}
								{{{
# Implementation based on the ARMv8.2 SHA3 instructions (eor3, rax1, xar,
# bcax). State lanes are kept in v0-v24; v25-v31 are temporaries.
my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
                             "v".($_+3).".16b", "v".($_+4).".16b" ],
            (0, 5, 10, 15, 20));

my @C = map("v$_.16b", (25..31));

$code.=<<___;
.type	KeccakF1600_ce,%function
.align	5
KeccakF1600_ce:
	mov	x9,#12
	adr	x10,iotas
	b	.Loop_ce
.align	4
.Loop_ce:
___
# Two rounds are emitted per loop iteration and x9 counts 12 iterations.
# After each emitted round some @A and @C names are exchanged on the Perl
# side, so the second copy is generated with the permuted register
# assignment and the second exchange restores the initial one.
for($i=0; $i<2; $i++) {
$code.=<<___;
	////////////////////////////////////////////////// Theta
	eor3	$C[0],$A[0][0],$A[1][0],$A[2][0]
	eor3	$C[1],$A[0][1],$A[1][1],$A[2][1]
	eor3	$C[2],$A[0][2],$A[1][2],$A[2][2]
	eor3	$C[3],$A[0][3],$A[1][3],$A[2][3]
	eor3	$C[4],$A[0][4],$A[1][4],$A[2][4]
	eor3	$C[0],$C[0],   $A[3][0],$A[4][0]
	eor3	$C[1],$C[1],   $A[3][1],$A[4][1]
	eor3	$C[2],$C[2],   $A[3][2],$A[4][2]
	eor3	$C[3],$C[3],   $A[3][3],$A[4][3]
	eor3	$C[4],$C[4],   $A[3][4],$A[4][4]

	rax1	$C[5],$C[0],$C[2]			// D[1]
	rax1	$C[6],$C[1],$C[3]			// D[2]
	rax1	$C[2],$C[2],$C[4]			// D[3]
	rax1	$C[3],$C[3],$C[0]			// D[4]
	rax1	$C[4],$C[4],$C[1]			// D[0]

	////////////////////////////////////////////////// Theta+Rho+Pi
	xar	$C[0],   $A[1][1],$C[5],#64-$rhotates[1][1]	// C[0]=A[0][1]
	xar	$A[1][1],$A[1][4],$C[3],#64-$rhotates[1][4]
	xar	$A[1][4],$A[4][2],$C[6],#64-$rhotates[4][2]
	xar	$A[4][2],$A[2][4],$C[3],#64-$rhotates[2][4]
	xar	$A[2][4],$A[4][0],$C[4],#64-$rhotates[4][0]

	xar	$A[4][0],$A[0][2],$C[6],#64-$rhotates[0][2]

	xar	$A[0][2],$A[2][2],$C[6],#64-$rhotates[2][2]
	xar	$A[2][2],$A[2][3],$C[2],#64-$rhotates[2][3]
	xar	$A[2][3],$A[3][4],$C[3],#64-$rhotates[3][4]
	xar	$A[3][4],$A[4][3],$C[2],#64-$rhotates[4][3]
	xar	$A[4][3],$A[3][0],$C[4],#64-$rhotates[3][0]

	xar	$A[3][0],$A[0][4],$C[3],#64-$rhotates[0][4]

	eor	$A[0][0],$A[0][0],$C[4]
	ldr	x11,[x10],#8

	xar	$C[1],   $A[3][3],$C[2],#64-$rhotates[3][3]	// C[1]=A[0][3]
	xar	$A[3][3],$A[3][2],$C[6],#64-$rhotates[3][2]
	xar	$A[3][2],$A[2][1],$C[5],#64-$rhotates[2][1]
	xar	$A[2][1],$A[1][2],$C[6],#64-$rhotates[1][2]
	xar	$A[1][2],$A[2][0],$C[4],#64-$rhotates[2][0]

	xar	$A[2][0],$A[0][1],$C[5],#64-$rhotates[0][1]	// *

	xar	$A[0][4],$A[4][4],$C[3],#64-$rhotates[4][4]
	xar	$A[4][4],$A[4][1],$C[5],#64-$rhotates[4][1]
	xar	$A[4][1],$A[1][3],$C[2],#64-$rhotates[1][3]
	xar	$A[1][3],$A[3][1],$C[5],#64-$rhotates[3][1]
	xar	$A[3][1],$A[1][0],$C[4],#64-$rhotates[1][0]

	xar	$C[2],   $A[0][3],$C[2],#64-$rhotates[0][3]	// C[2]=A[1][0]

	////////////////////////////////////////////////// Chi+Iota
	dup	$C[6],x11				// borrow C[6]
	bcax	$C[3],   $A[0][0],$A[0][2],$C[0]	// *
	bcax	$A[0][1],$C[0],   $C[1],   $A[0][2]	// *
	bcax	$A[0][2],$A[0][2],$A[0][4],$C[1]
	bcax	$A[0][3],$C[1],   $A[0][0],$A[0][4]
	bcax	$A[0][4],$A[0][4],$C[0],   $A[0][0]

	bcax	$A[1][0],$C[2],   $A[1][2],$A[1][1]	// *
	bcax	$C[0],   $A[1][1],$A[1][3],$A[1][2]	// *
	bcax	$A[1][2],$A[1][2],$A[1][4],$A[1][3]
	bcax	$A[1][3],$A[1][3],$C[2],   $A[1][4]
	bcax	$A[1][4],$A[1][4],$A[1][1],$C[2]

	eor	$A[0][0],$C[3],$C[6]			// Iota

	bcax	$C[1],   $A[2][0],$A[2][2],$A[2][1]	// *
	bcax	$C[2],   $A[2][1],$A[2][3],$A[2][2]	// *
	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
	bcax	$A[2][3],$A[2][3],$A[2][0],$A[2][4]
	bcax	$A[2][4],$A[2][4],$A[2][1],$A[2][0]

	bcax	$C[3],   $A[3][0],$A[3][2],$A[3][1]	// *
	bcax	$C[4],   $A[3][1],$A[3][3],$A[3][2]	// *
	bcax	$A[3][2],$A[3][2],$A[3][4],$A[3][3]
	bcax	$A[3][3],$A[3][3],$A[3][0],$A[3][4]
	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]

	bcax	$C[5],   $A[4][0],$A[4][2],$A[4][1]	// *
	bcax	$C[6],   $A[4][1],$A[4][3],$A[4][2]	// *
	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
	bcax	$A[4][3],$A[4][3],$A[4][0],$A[4][4]
	bcax	$A[4][4],$A[4][4],$A[4][1],$A[4][0]
___
	# Results marked "*" above were written to temporaries; rename the
	# registers instead of copying the values back.
	(         $A[1][1],       $C[0]) = (      $C[0],          $A[1][1]);
	($A[2][0],$A[2][1], $C[1],$C[2]) = ($C[1],$C[2], $A[2][0],$A[2][1]);
	($A[3][0],$A[3][1], $C[3],$C[4]) = ($C[3],$C[4], $A[3][0],$A[3][1]);
	($A[4][0],$A[4][1], $C[5],$C[6]) = ($C[5],$C[6], $A[4][0],$A[4][1]);
}
$code.=<<___;
	subs	x9,x9,#1
	bne	.Loop_ce

	ret
.size	KeccakF1600_ce,.-KeccakF1600_ce

.type	KeccakF1600_cext,%function
.align	5
KeccakF1600_cext:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
# $i is deliberately not lexical in the load/store loops below: its final
# value (24) is interpolated into the single ldr/str that follows each loop.
for($i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	bl	KeccakF1600_ce
	ldr	x30,[sp,#8]
___
for($i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	str	d24,[x0,#8*$i]

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldr	x29,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	KeccakF1600_cext,.-KeccakF1600_cext
___

{
# SHA3_absorb_cext works directly on its argument registers x0-x3.
my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));

$code.=<<___;
.globl	SHA3_absorb_cext
.type	SHA3_absorb_cext,%function
.align	5
SHA3_absorb_cext:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
for($i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	b	.Loop_absorb_ce

.align	4
.Loop_absorb_ce:
	subs	$len,$len,$bsz		// len - bsz
	blo	.Labsorbed_ce
___
# Same unrolled absorb as in the scalar code, with v31 as the input temporary.
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
	cmp	$bsz,#8*($i+2)
	blo	.Lprocess_block_ce
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
	beq	.Lprocess_block_ce
___
}
$code.=<<___;
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[4][4],$A[4][4],v31.16b

.Lprocess_block_ce:

	bl	KeccakF1600_ce

	b	.Loop_absorb_ce

.align	4
.Labsorbed_ce:
___
for($i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	str	d24,[x0,#8*$i]
	add	x0,$len,$bsz		// return value

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldp	x29,x30,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	SHA3_absorb_cext,.-SHA3_absorb_cext
___
}
{
# SHA3_squeeze_cext also works on x0-x3; x9 and x10 are its running state
# pointer and remaining-block counter.
my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
$code.=<<___;
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x9,$ctx
	mov	x10,$bsz

.Loop_squeeze_ce:
	ldr	x4,[x9],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail_ce
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	beq	.Lsqueeze_done_ce

	sub	$len,$len,#8
	subs	x10,x10,#8
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext
	ldr	x30,[sp,#8]
	mov	x9,$ctx
	mov	x10,$bsz
	b	.Loop_squeeze_ce

.align	4
.Lsqueeze_tail_ce:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1

.Lsqueeze_done_ce:
	ldr	x29,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
___
}								}}}
$code.=<<___;
.asciz	"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
___

# The SHA3 instructions are emitted as raw .inst words rather than as
# mnemonics. unsha3() takes a mnemonic and its operand string and returns
# the ".inst" line: the base opcode OR-ed with the first register number,
# the second shifted left by 5, the third by 16, and the evaluated fourth
# operand (a register number, or an immediate expression such as 64-44)
# by 10. The original text is kept as a trailing comment. When the operand
# string does not match, the failed match result is returned instead.
{   my  %opcode = (
	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000,
	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	);

    sub unsha3 {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
			$mnemonic,$arg;
    }
}

# Post-process the generated text line by line and print it to the pipe.
foreach(split("\n",$code)) {

	# Evaluate any `...` expression embedded in the line.
	s/\`([^\`]*)\`/eval($1)/ge;

	# "dup" lines get their .16b arrangement rewritten to .2d; any other
	# line carrying a SHA3 mnemonic with vector operands is encoded by
	# unsha3().
	m/\bdup\b/	and s/\.16b/.2d/g	or
	s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";