#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for ARMv8.
#
# June 2017.
#
# This is straightforward KECCAK_1X_ALT implementation. It makes no
# sense to attempt SIMD/NEON implementation for following reason.
# 64-bit lanes of vector registers can't be addressed as easily as in
# 32-bit mode. This means that 64-bit NEON is bound to be slower than
# 32-bit NEON, and this implementation is faster than 32-bit NEON on
# same processor. Even though it takes more scalar xor's and andn's,
# it gets compensated by availability of rotate. Not to forget that
# most processors achieve higher issue rate with scalar instructions.
#
# February 2018.
#
# Add hardware-assisted ARMv8.2 implementation. It's KECCAK_1X_ALT
# variant with register permutation/rotation twist that allows to
# eliminate copies to temporary registers. If you look closely you'll
# notice that it uses only one lane of vector registers. The new
# instructions effectively facilitate parallel hashing, which we don't
# support [yet?]. But lowest-level core procedure is prepared for it.
# The inner round is 67 [vector] instructions, so it's not actually
# obvious that it will provide performance improvement [in serial
# hash] as long as vector instructions issue rate is limited to 1 per
# cycle...
#
######################################################################
# Numbers are cycles per processed byte.
#
#               r=1088(*)
#
# Cortex-A53    13
# Cortex-A57    12
# X-Gene        14
# Mongoose      10
# Kryo          12
# Denver        7.8
# Apple A7      7.2
# ThunderX2     9.7
#
# (*)   Corresponds to SHA3-256. No improvement coefficients are listed
#       because they vary too much from compiler to compiler. Newer
#       compiler does much better and improvement varies from 5% on
#       Cortex-A57 to 25% on Cortex-A53. While in comparison to older
#       compiler this code is at least 2x faster...

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator, first next to this script and then
# in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through arm-xlate.pl.
# The translator path is quoted like the interpreter and output paths so
# that a source tree located under a directory with spaces still works.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Rho rotation amounts, indexed the same way as the state, [row][column].
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

# Round constants. The table is preceded by 8 zero quads (64 bytes) and
# holds 24 quads (192 bytes), 256 bytes in total, and the whole thing is
# placed with ".align 8". The scalar round loop below post-increments a
# pointer through the table and stops when the pointer's low 8 bits are
# zero ("tst ...,#255"), i.e. right after the last constant was consumed.
$code.=<<___;
.rodata

.align 8        // strategic alignment and padding that allows to use
                // address value as loop termination condition...
    .quad   0,0,0,0,0,0,0,0
.type iotas,%object
iotas:
    .quad   0x0000000000000001
    .quad   0x0000000000008082
    .quad   0x800000000000808a
    .quad   0x8000000080008000
    .quad   0x000000000000808b
    .quad   0x0000000080000001
    .quad   0x8000000080008081
    .quad   0x8000000000008009
    .quad   0x000000000000008a
    .quad   0x0000000000000088
    .quad   0x0000000080008009
    .quad   0x000000008000000a
    .quad   0x000000008000808b
    .quad   0x800000000000008b
    .quad   0x8000000000008089
    .quad   0x8000000000008003
    .quad   0x8000000000008002
    .quad   0x8000000000000080
    .quad   0x000000000000800a
    .quad   0x800000008000000a
    .quad   0x8000000080008081
    .quad   0x8000000000008080
    .quad   0x0000000080000001
    .quad   0x8000000080008008
.size iotas,.-iotas
___
                                                                {{{
######################################################################
# Scalar implementation.
#
# The 25 state lanes live in general-purpose registers: row N occupies
# x(5*N) .. x(5*N+4), except that A[3][3] is moved from x18 to x25.
# @C holds the temporaries x26, x27, x28 and x30; since x30 is the link
# register, KeccakF1600_int parks it on the stack and reloads it before
# returning.
my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
            (0, 5, 10, 15, 20));
   $A[3][3] = "x25";    # x18 is reserved

my @C = map("x$_", (26,27,28,30));

# KeccakF1600_int: the round loop, operating on the state held in @A.
#
# It uses the 32 bytes at [sp,#0]..[sp,#31] of the CALLER's frame:
#   [sp,#0]   A[0][4] and A[1][4], offloaded during Theta
#   [sp,#16]  current pointer into iotas
#   [sp,#24]  saved x30
# Both callers in this file reserve that space with "sub sp,sp,#48/#64".
$code.=<<___;
.text

.type KeccakF1600_int,%function
.align 5
KeccakF1600_int:
    adrp    $C[2],iotas
    add     $C[2],$C[2],:lo12:iotas
    .inst   0xd503233f                      // paciasp
    stp     $C[2],x30,[sp,#16]              // 32 bytes on top are mine
    b       .Loop
.align 4
.Loop:
    ////////////////////////////////////////// Theta
    eor     $C[0],$A[0][0],$A[1][0]
    stp     $A[0][4],$A[1][4],[sp,#0]       // offload pair...
    eor     $C[1],$A[0][1],$A[1][1]
    eor     $C[2],$A[0][2],$A[1][2]
    eor     $C[3],$A[0][3],$A[1][3]
___
        # With A[0][4] and A[1][4] saved on the stack, their registers are
        # borrowed as two additional temporaries for the rest of Theta.
        $C[4]=$A[0][4];
        $C[5]=$A[1][4];
$code.=<<___;
    eor     $C[4],$A[0][4],$A[1][4]
    eor     $C[0],$C[0],$A[2][0]
    eor     $C[1],$C[1],$A[2][1]
    eor     $C[2],$C[2],$A[2][2]
    eor     $C[3],$C[3],$A[2][3]
    eor     $C[4],$C[4],$A[2][4]
    eor     $C[0],$C[0],$A[3][0]
    eor     $C[1],$C[1],$A[3][1]
    eor     $C[2],$C[2],$A[3][2]
    eor     $C[3],$C[3],$A[3][3]
    eor     $C[4],$C[4],$A[3][4]
    eor     $C[0],$C[0],$A[4][0]
    eor     $C[2],$C[2],$A[4][2]
    eor     $C[1],$C[1],$A[4][1]
    eor     $C[3],$C[3],$A[4][3]
    eor     $C[4],$C[4],$A[4][4]

    eor     $C[5],$C[0],$C[2],ror#63

    eor     $A[0][1],$A[0][1],$C[5]
    eor     $A[1][1],$A[1][1],$C[5]
    eor     $A[2][1],$A[2][1],$C[5]
    eor     $A[3][1],$A[3][1],$C[5]
    eor     $A[4][1],$A[4][1],$C[5]

    eor     $C[5],$C[1],$C[3],ror#63
    eor     $C[2],$C[2],$C[4],ror#63
    eor     $C[3],$C[3],$C[0],ror#63
    eor     $C[4],$C[4],$C[1],ror#63

    eor     $C[1],   $A[0][2],$C[5]         // mov $C[1],$A[0][2]
    eor     $A[1][2],$A[1][2],$C[5]
    eor     $A[2][2],$A[2][2],$C[5]
    eor     $A[3][2],$A[3][2],$C[5]
    eor     $A[4][2],$A[4][2],$C[5]

    eor     $A[0][0],$A[0][0],$C[4]
    eor     $A[1][0],$A[1][0],$C[4]
    eor     $A[2][0],$A[2][0],$C[4]
    eor     $A[3][0],$A[3][0],$C[4]
    eor     $A[4][0],$A[4][0],$C[4]
___
        # The borrowed registers go back to being A[0][4] and A[1][4]; the
        # very next instruction reloads them from the stack.
        $C[4]=undef;
        $C[5]=undef;
$code.=<<___;
    ldp     $A[0][4],$A[1][4],[sp,#0]       // re-load offloaded data
    eor     $C[0],   $A[0][3],$C[2]         // mov $C[0],$A[0][3]
    eor     $A[1][3],$A[1][3],$C[2]
    eor     $A[2][3],$A[2][3],$C[2]
    eor     $A[3][3],$A[3][3],$C[2]
    eor     $A[4][3],$A[4][3],$C[2]

    eor     $C[2],   $A[0][4],$C[3]         // mov $C[2],$A[0][4]
    eor     $A[1][4],$A[1][4],$C[3]
    eor     $A[2][4],$A[2][4],$C[3]
    eor     $A[3][4],$A[3][4],$C[3]
    eor     $A[4][4],$A[4][4],$C[3]

    ////////////////////////////////////////// Rho+Pi
    mov     $C[3],$A[0][1]
    ror     $A[0][1],$A[1][1],#64-$rhotates[1][1]
    //mov   $C[1],$A[0][2]
    ror     $A[0][2],$A[2][2],#64-$rhotates[2][2]
    //mov   $C[0],$A[0][3]
    ror     $A[0][3],$A[3][3],#64-$rhotates[3][3]
    //mov   $C[2],$A[0][4]
    ror     $A[0][4],$A[4][4],#64-$rhotates[4][4]

    ror     $A[1][1],$A[1][4],#64-$rhotates[1][4]
    ror     $A[2][2],$A[2][3],#64-$rhotates[2][3]
    ror     $A[3][3],$A[3][2],#64-$rhotates[3][2]
    ror     $A[4][4],$A[4][1],#64-$rhotates[4][1]

    ror     $A[1][4],$A[4][2],#64-$rhotates[4][2]
    ror     $A[2][3],$A[3][4],#64-$rhotates[3][4]
    ror     $A[3][2],$A[2][1],#64-$rhotates[2][1]
    ror     $A[4][1],$A[1][3],#64-$rhotates[1][3]

    ror     $A[4][2],$A[2][4],#64-$rhotates[2][4]
    ror     $A[3][4],$A[4][3],#64-$rhotates[4][3]
    ror     $A[2][1],$A[1][2],#64-$rhotates[1][2]
    ror     $A[1][3],$A[3][1],#64-$rhotates[3][1]

    ror     $A[2][4],$A[4][0],#64-$rhotates[4][0]
    ror     $A[4][3],$A[3][0],#64-$rhotates[3][0]
    ror     $A[1][2],$A[2][0],#64-$rhotates[2][0]
    ror     $A[3][1],$A[1][0],#64-$rhotates[1][0]

    ror     $A[1][0],$C[0],#64-$rhotates[0][3]
    ror     $A[2][0],$C[3],#64-$rhotates[0][1]
    ror     $A[3][0],$C[2],#64-$rhotates[0][4]
    ror     $A[4][0],$C[1],#64-$rhotates[0][2]

    ////////////////////////////////////////// Chi+Iota
    bic     $C[0],$A[0][2],$A[0][1]
    bic     $C[1],$A[0][3],$A[0][2]
    bic     $C[2],$A[0][0],$A[0][4]
    bic     $C[3],$A[0][1],$A[0][0]
    eor     $A[0][0],$A[0][0],$C[0]
    bic     $C[0],$A[0][4],$A[0][3]
    eor     $A[0][1],$A[0][1],$C[1]
     ldr    $C[1],[sp,#16]
    eor     $A[0][3],$A[0][3],$C[2]
    eor     $A[0][4],$A[0][4],$C[3]
    eor     $A[0][2],$A[0][2],$C[0]
     ldr    $C[3],[$C[1]],#8                // Iota[i++]

    bic     $C[0],$A[1][2],$A[1][1]
     tst    $C[1],#255                      // are we done?
     str    $C[1],[sp,#16]
    bic     $C[1],$A[1][3],$A[1][2]
    bic     $C[2],$A[1][0],$A[1][4]
     eor    $A[0][0],$A[0][0],$C[3]         // A[0][0] ^= Iota
    bic     $C[3],$A[1][1],$A[1][0]
    eor     $A[1][0],$A[1][0],$C[0]
    bic     $C[0],$A[1][4],$A[1][3]
    eor     $A[1][1],$A[1][1],$C[1]
    eor     $A[1][3],$A[1][3],$C[2]
    eor     $A[1][4],$A[1][4],$C[3]
    eor     $A[1][2],$A[1][2],$C[0]

    bic     $C[0],$A[2][2],$A[2][1]
    bic     $C[1],$A[2][3],$A[2][2]
    bic     $C[2],$A[2][0],$A[2][4]
    bic     $C[3],$A[2][1],$A[2][0]
    eor     $A[2][0],$A[2][0],$C[0]
    bic     $C[0],$A[2][4],$A[2][3]
    eor     $A[2][1],$A[2][1],$C[1]
    eor     $A[2][3],$A[2][3],$C[2]
    eor     $A[2][4],$A[2][4],$C[3]
    eor     $A[2][2],$A[2][2],$C[0]

    bic     $C[0],$A[3][2],$A[3][1]
    bic     $C[1],$A[3][3],$A[3][2]
    bic     $C[2],$A[3][0],$A[3][4]
    bic     $C[3],$A[3][1],$A[3][0]
    eor     $A[3][0],$A[3][0],$C[0]
    bic     $C[0],$A[3][4],$A[3][3]
    eor     $A[3][1],$A[3][1],$C[1]
    eor     $A[3][3],$A[3][3],$C[2]
    eor     $A[3][4],$A[3][4],$C[3]
    eor     $A[3][2],$A[3][2],$C[0]

    bic     $C[0],$A[4][2],$A[4][1]
    bic     $C[1],$A[4][3],$A[4][2]
    bic     $C[2],$A[4][0],$A[4][4]
    bic     $C[3],$A[4][1],$A[4][0]
    eor     $A[4][0],$A[4][0],$C[0]
    bic     $C[0],$A[4][4],$A[4][3]
    eor     $A[4][1],$A[4][1],$C[1]
    eor     $A[4][3],$A[4][3],$C[2]
    eor     $A[4][4],$A[4][4],$C[3]
    eor     $A[4][2],$A[4][2],$C[0]

    bne     .Loop

    ldr     x30,[sp,#24]
    .inst   0xd50323bf                      // autiasp
    ret
.size KeccakF1600_int,.-KeccakF1600_int
___

# KeccakF1600: file-local wrapper that applies the permutation to a
# 25-lane state in memory.
#   x0  pointer to the state (25 consecutive 64-bit lanes)
# Saves x19-x28 in a 128-byte frame, then reserves 48 more bytes: 32 for
# KeccakF1600_int plus [sp,#32] for the state pointer.
$code.=<<___;

.type KeccakF1600,%function
.align 5
KeccakF1600:
    .inst   0xd503233f                      // paciasp
    stp     x29,x30,[sp,#-128]!
    add     x29,sp,#0
    stp     x19,x20,[sp,#16]
    stp     x21,x22,[sp,#32]
    stp     x23,x24,[sp,#48]
    stp     x25,x26,[sp,#64]
    stp     x27,x28,[sp,#80]
    sub     sp,sp,#48

    str     x0,[sp,#32]                     // offload argument
    mov     $C[0],x0
    ldp     $A[0][0],$A[0][1],[x0,#16*0]
    ldp     $A[0][2],$A[0][3],[$C[0],#16*1]
    ldp     $A[0][4],$A[1][0],[$C[0],#16*2]
    ldp     $A[1][1],$A[1][2],[$C[0],#16*3]
    ldp     $A[1][3],$A[1][4],[$C[0],#16*4]
    ldp     $A[2][0],$A[2][1],[$C[0],#16*5]
    ldp     $A[2][2],$A[2][3],[$C[0],#16*6]
    ldp     $A[2][4],$A[3][0],[$C[0],#16*7]
    ldp     $A[3][1],$A[3][2],[$C[0],#16*8]
    ldp     $A[3][3],$A[3][4],[$C[0],#16*9]
    ldp     $A[4][0],$A[4][1],[$C[0],#16*10]
    ldp     $A[4][2],$A[4][3],[$C[0],#16*11]
    ldr     $A[4][4],[$C[0],#16*12]

    bl      KeccakF1600_int

    ldr     $C[0],[sp,#32]
    stp     $A[0][0],$A[0][1],[$C[0],#16*0]
    stp     $A[0][2],$A[0][3],[$C[0],#16*1]
    stp     $A[0][4],$A[1][0],[$C[0],#16*2]
    stp     $A[1][1],$A[1][2],[$C[0],#16*3]
    stp     $A[1][3],$A[1][4],[$C[0],#16*4]
    stp     $A[2][0],$A[2][1],[$C[0],#16*5]
    stp     $A[2][2],$A[2][3],[$C[0],#16*6]
    stp     $A[2][4],$A[3][0],[$C[0],#16*7]
    stp     $A[3][1],$A[3][2],[$C[0],#16*8]
    stp     $A[3][3],$A[3][4],[$C[0],#16*9]
    stp     $A[4][0],$A[4][1],[$C[0],#16*10]
    stp     $A[4][2],$A[4][3],[$C[0],#16*11]
    str     $A[4][4],[$C[0],#16*12]

    ldp     x19,x20,[x29,#16]
    add     sp,sp,#48
    ldp     x21,x22,[x29,#32]
    ldp     x23,x24,[x29,#48]
    ldp     x25,x26,[x29,#64]
    ldp     x27,x28,[x29,#80]
    ldp     x29,x30,[sp],#128
    .inst   0xd50323bf                      // autiasp
    ret
.size KeccakF1600,.-KeccakF1600
___

# SHA3_absorb: XOR whole input blocks into the state and permute.
#   x0  uint64_t A[5][5]
#   x1  const void *inp
#   x2  size_t len
#   x3  size_t bsz
# Returns in x0 the number of input bytes left over (the loop stops as
# soon as len < bsz).
#
# Stack below the 128-byte register save area (64 bytes):
#   [sp,#0]..[sp,#31]  scratch owned by KeccakF1600_int
#   [sp,#32]           A
#   [sp,#40]           inp
#   [sp,#48]           len
#   [sp,#56]           bsz
$code.=<<___;

.globl SHA3_absorb
.type SHA3_absorb,%function
.align 5
SHA3_absorb:
    .inst   0xd503233f                      // paciasp
    stp     x29,x30,[sp,#-128]!
    add     x29,sp,#0
    stp     x19,x20,[sp,#16]
    stp     x21,x22,[sp,#32]
    stp     x23,x24,[sp,#48]
    stp     x25,x26,[sp,#64]
    stp     x27,x28,[sp,#80]
    sub     sp,sp,#64

    stp     x0,x1,[sp,#32]                  // offload arguments
    stp     x2,x3,[sp,#48]

    mov     $C[0],x0                        // uint64_t A[5][5]
    mov     $C[1],x1                        // const void *inp
    mov     $C[2],x2                        // size_t len
    mov     $C[3],x3                        // size_t bsz
    ldp     $A[0][0],$A[0][1],[$C[0],#16*0]
    ldp     $A[0][2],$A[0][3],[$C[0],#16*1]
    ldp     $A[0][4],$A[1][0],[$C[0],#16*2]
    ldp     $A[1][1],$A[1][2],[$C[0],#16*3]
    ldp     $A[1][3],$A[1][4],[$C[0],#16*4]
    ldp     $A[2][0],$A[2][1],[$C[0],#16*5]
    ldp     $A[2][2],$A[2][3],[$C[0],#16*6]
    ldp     $A[2][4],$A[3][0],[$C[0],#16*7]
    ldp     $A[3][1],$A[3][2],[$C[0],#16*8]
    ldp     $A[3][3],$A[3][4],[$C[0],#16*9]
    ldp     $A[4][0],$A[4][1],[$C[0],#16*10]
    ldp     $A[4][2],$A[4][3],[$C[0],#16*11]
    ldr     $A[4][4],[$C[0],#16*12]
    b       .Loop_absorb

.align 4
.Loop_absorb:
    subs    $C[0],$C[2],$C[3]               // len - bsz
    blo     .Labsorbed

    str     $C[0],[sp,#48]                  // save len - bsz
___
# Unrolled absorption of one block, two lanes per iteration. After the
# even lane the single "cmp" decides both exits: "blo" leaves when bsz
# is below 8*($i+2), and "beq" leaves after the odd lane when bsz equals
# 8*($i+2). Neither the load nor the eor in between changes the flags.
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
    ldr     $C[0],[$C[1]],#8                // *inp++
#ifdef __AARCH64EB__
    rev     $C[0],$C[0]
#endif
    eor     $A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
    cmp     $C[3],#8*($i+2)
    blo     .Lprocess_block
    ldr     $C[0],[$C[1]],#8                // *inp++
#ifdef __AARCH64EB__
    rev     $C[0],$C[0]
#endif
    eor     $A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
    beq     .Lprocess_block
___
}
$code.=<<___;
    ldr     $C[0],[$C[1]],#8                // *inp++
#ifdef __AARCH64EB__
    rev     $C[0],$C[0]
#endif
    eor     $A[4][4],$A[4][4],$C[0]

.Lprocess_block:
    str     $C[1],[sp,#40]                  // save inp

    bl      KeccakF1600_int

    ldr     $C[1],[sp,#40]                  // restore arguments
    ldp     $C[2],$C[3],[sp,#48]
    b       .Loop_absorb

.align 4
.Labsorbed:
    ldr     $C[1],[sp,#32]
    stp     $A[0][0],$A[0][1],[$C[1],#16*0]
    stp     $A[0][2],$A[0][3],[$C[1],#16*1]
    stp     $A[0][4],$A[1][0],[$C[1],#16*2]
    stp     $A[1][1],$A[1][2],[$C[1],#16*3]
    stp     $A[1][3],$A[1][4],[$C[1],#16*4]
    stp     $A[2][0],$A[2][1],[$C[1],#16*5]
    stp     $A[2][2],$A[2][3],[$C[1],#16*6]
    stp     $A[2][4],$A[3][0],[$C[1],#16*7]
    stp     $A[3][1],$A[3][2],[$C[1],#16*8]
    stp     $A[3][3],$A[3][4],[$C[1],#16*9]
    stp     $A[4][0],$A[4][1],[$C[1],#16*10]
    stp     $A[4][2],$A[4][3],[$C[1],#16*11]
    str     $A[4][4],[$C[1],#16*12]

    mov     x0,$C[2]                        // return value
    ldp     x19,x20,[x29,#16]
    add     sp,sp,#64
    ldp     x21,x22,[x29,#32]
    ldp     x23,x24,[x29,#48]
    ldp     x25,x26,[x29,#64]
    ldp     x27,x28,[x29,#80]
    ldp     x29,x30,[sp],#128
    .inst   0xd50323bf                      // autiasp
    ret
.size SHA3_absorb,.-SHA3_absorb
___
{
# SHA3_squeeze: copy len bytes out of the state, calling KeccakF1600
# each time bsz bytes have been produced and more output is wanted.
#   x0  state, read as a flat array of 64-bit lanes
#   x1  output pointer
#   x2  len
#   x3  bsz
# The arguments are kept in x19-x22 across the call; x0 and x3 serve as
# the running lane pointer and the bytes-left-in-block counter.
my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
$code.=<<___;
.globl SHA3_squeeze
.type SHA3_squeeze,%function
.align 5
SHA3_squeeze:
    .inst   0xd503233f                      // paciasp
    stp     x29,x30,[sp,#-48]!
    add     x29,sp,#0
    stp     x19,x20,[sp,#16]
    stp     x21,x22,[sp,#32]

    mov     $A_flat,x0                      // put aside arguments
    mov     $out,x1
    mov     $len,x2
    mov     $bsz,x3

.Loop_squeeze:
    ldr     x4,[x0],#8
    cmp     $len,#8
    blo     .Lsqueeze_tail
#ifdef __AARCH64EB__
    rev     x4,x4
#endif
    str     x4,[$out],#8
    subs    $len,$len,#8
    beq     .Lsqueeze_done

    subs    x3,x3,#8
    bhi     .Loop_squeeze

    mov     x0,$A_flat
    bl      KeccakF1600
    mov     x0,$A_flat
    mov     x3,$bsz
    b       .Loop_squeeze

.align 4
.Lsqueeze_tail:
    strb    w4,[$out],#1
    lsr     x4,x4,#8
    subs    $len,$len,#1
    beq     .Lsqueeze_done
    strb    w4,[$out],#1
    lsr     x4,x4,#8
    subs    $len,$len,#1
    beq     .Lsqueeze_done
    strb    w4,[$out],#1
    lsr     x4,x4,#8
    subs    $len,$len,#1
    beq     .Lsqueeze_done
    strb    w4,[$out],#1
    lsr     x4,x4,#8
    subs    $len,$len,#1
    beq     .Lsqueeze_done
    strb    w4,[$out],#1
    lsr     x4,x4,#8
    subs    $len,$len,#1
    beq     .Lsqueeze_done
    strb    w4,[$out],#1
    lsr     x4,x4,#8
    subs    $len,$len,#1
    beq     .Lsqueeze_done
    strb    w4,[$out],#1

.Lsqueeze_done:
    ldp     x19,x20,[sp,#16]
    ldp     x21,x22,[sp,#32]
    ldp     x29,x30,[sp],#48
    .inst   0xd50323bf                      // autiasp
    ret
.size SHA3_squeeze,.-SHA3_squeeze
___
}                                                               }}}
                                                                {{{
######################################################################
# ARMv8.2 SHA3-extension implementation.
#
# State lane A[r][c] lives in vector register v(5*r+c); v25-v31 are the
# temporaries @C, and @D names the subset of @C that holds the Theta
# D values after the rax1 sequence below.
my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
                             "v".($_+3).".16b", "v".($_+4).".16b" ],
            (0, 5, 10, 15, 20));

my @C = map("v$_.16b", (25..31));
my @D = @C[4,5,6,2,3];

# KeccakF1600_ce: 24 rounds on the state held in v0-v24.
# Uses x9 as the round counter and x10 as the pointer into iotas.
$code.=<<___;
.type KeccakF1600_ce,%function
.align 5
KeccakF1600_ce:
    mov     x9,#24
    adrp    x10,iotas
    add     x10,x10,:lo12:iotas
    b       .Loop_ce
.align 4
.Loop_ce:
    ////////////////////////////////////////////////// Theta
    eor3    $C[0],$A[4][0],$A[3][0],$A[2][0]
    eor3    $C[1],$A[4][1],$A[3][1],$A[2][1]
    eor3    $C[2],$A[4][2],$A[3][2],$A[2][2]
    eor3    $C[3],$A[4][3],$A[3][3],$A[2][3]
    eor3    $C[4],$A[4][4],$A[3][4],$A[2][4]
    eor3    $C[0],$C[0],   $A[1][0],$A[0][0]
    eor3    $C[1],$C[1],   $A[1][1],$A[0][1]
    eor3    $C[2],$C[2],   $A[1][2],$A[0][2]
    eor3    $C[3],$C[3],   $A[1][3],$A[0][3]
    eor3    $C[4],$C[4],   $A[1][4],$A[0][4]

    rax1    $C[5],$C[0],$C[2]                       // D[1]
    rax1    $C[6],$C[1],$C[3]                       // D[2]
    rax1    $C[2],$C[2],$C[4]                       // D[3]
    rax1    $C[3],$C[3],$C[0]                       // D[4]
    rax1    $C[4],$C[4],$C[1]                       // D[0]

    ////////////////////////////////////////////////// Theta+Rho+Pi
    xar     $C[0],   $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0]

    xar     $A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1]
    xar     $A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4]
    xar     $A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2]
    xar     $A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4]
    xar     $A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0]

    xar     $C[1],   $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0]

    xar     $A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2]
    xar     $A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3]
    xar     $A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4]
    xar     $A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3]
    xar     $A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0]

    xar     $A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4]

    xar     $D[4],   $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4]
    xar     $A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1]
    xar     $A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1]
    xar     $A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3]
    xar     $A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0]

    xar     $A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3]

    eor     $A[0][0],$A[0][0],$D[0]

    xar     $D[3],   $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3]
    xar     $A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3]
    xar     $D[1],   $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2]
    xar     $D[2],   $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1]
    xar     $D[0],   $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2]

    ////////////////////////////////////////////////// Chi+Iota
    bcax    $A[4][0],$C[1],   $A[4][2],$A[1][3]     // A[1][3]=A[4][1]
    bcax    $A[4][1],$A[1][3],$A[4][3],$A[4][2]     // A[1][3]=A[4][1]
    bcax    $A[4][2],$A[4][2],$A[4][4],$A[4][3]
    bcax    $A[4][3],$A[4][3],$C[1],   $A[4][4]
    bcax    $A[4][4],$A[4][4],$A[1][3],$C[1]        // A[1][3]=A[4][1]

    ld1r    {$C[1]},[x10],#8

    bcax    $A[3][2],$D[1],   $A[3][4],$A[0][3]     // A[0][3]=A[3][3]
    bcax    $A[3][3],$A[0][3],$A[3][0],$A[3][4]     // A[0][3]=A[3][3]
    bcax    $A[3][4],$A[3][4],$A[3][1],$A[3][0]
    bcax    $A[3][0],$A[3][0],$D[1],   $A[3][1]
    bcax    $A[3][1],$A[3][1],$A[0][3],$D[1]        // A[0][3]=A[3][3]

    bcax    $A[2][0],$C[0],   $A[2][2],$D[2]
    bcax    $A[2][1],$D[2],   $A[2][3],$A[2][2]
    bcax    $A[2][2],$A[2][2],$A[2][4],$A[2][3]
    bcax    $A[2][3],$A[2][3],$C[0],   $A[2][4]
    bcax    $A[2][4],$A[2][4],$D[2],   $C[0]

    bcax    $A[1][2],$D[0],   $A[1][4],$A[0][4]     // A[0][4]=A[1][3]
    bcax    $A[1][3],$A[0][4],$A[1][0],$A[1][4]     // A[0][4]=A[1][3]
    bcax    $A[1][4],$A[1][4],$A[1][1],$A[1][0]
    bcax    $A[1][0],$A[1][0],$D[0],   $A[1][1]
    bcax    $A[1][1],$A[1][1],$A[0][4],$D[0]        // A[0][4]=A[1][3]

    bcax    $A[0][3],$D[3],   $A[0][0],$D[4]
    bcax    $A[0][4],$D[4],   $A[0][1],$A[0][0]
    bcax    $A[0][0],$A[0][0],$A[0][2],$A[0][1]
    bcax    $A[0][1],$A[0][1],$D[3],   $A[0][2]
    bcax    $A[0][2],$A[0][2],$D[4],   $D[3]

    eor     $A[0][0],$A[0][0],$C[1]

    subs    x9,x9,#1
    bne     .Loop_ce

    ret
.size KeccakF1600_ce,.-KeccakF1600_ce
___

# KeccakF1600_cext: file-local wrapper that permutes a state in memory
# using KeccakF1600_ce.
#   x0  pointer to the state (25 consecutive 64-bit lanes)
# d8-d15 are saved and restored around the call. x30 is reloaded from
# the frame right after the "bl", which is why the epilogue pops only
# x29.
#
# The load/store loops use the package variable $i on purpose: it is
# left at 24 when the loop ends and the heredoc that follows uses it to
# address the last lane.
$code.=<<___;

.type KeccakF1600_cext,%function
.align 5
KeccakF1600_cext:
    .inst   0xd503233f              // paciasp
    stp     x29,x30,[sp,#-80]!
    add     x29,sp,#0
    stp     d8,d9,[sp,#16]          // per ABI requirement
    stp     d10,d11,[sp,#32]
    stp     d12,d13,[sp,#48]
    stp     d14,d15,[sp,#64]
___
for($i=0; $i<24; $i+=2) {               # load A[5][5]
my $j=$i+1;
$code.=<<___;
    ldp     d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
    ldr     d24,[x0,#8*$i]
    bl      KeccakF1600_ce
    ldr     x30,[sp,#8]
___
for($i=0; $i<24; $i+=2) {               # store A[5][5]
my $j=$i+1;
$code.=<<___;
    stp     d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
    str     d24,[x0,#8*$i]

    ldp     d8,d9,[sp,#16]
    ldp     d10,d11,[sp,#32]
    ldp     d12,d13,[sp,#48]
    ldp     d14,d15,[sp,#64]
    ldr     x29,[sp],#80
    .inst   0xd50323bf              // autiasp
    ret
.size KeccakF1600_cext,.-KeccakF1600_cext
___

{
# SHA3_absorb_cext: same register arguments as SHA3_absorb (x0 state,
# x1 input, x2 len, x3 bsz), implemented with the vector core. len is
# decremented by bsz at the top of the loop, so bsz is added back when
# the return value is formed.
my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));

$code.=<<___;
.globl SHA3_absorb_cext
.type SHA3_absorb_cext,%function
.align 5
SHA3_absorb_cext:
    .inst   0xd503233f              // paciasp
    stp     x29,x30,[sp,#-80]!
    add     x29,sp,#0
    stp     d8,d9,[sp,#16]          // per ABI requirement
    stp     d10,d11,[sp,#32]
    stp     d12,d13,[sp,#48]
    stp     d14,d15,[sp,#64]
___
for($i=0; $i<24; $i+=2) {               # load A[5][5]
my $j=$i+1;
$code.=<<___;
    ldp     d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
    ldr     d24,[x0,#8*$i]
    b       .Loop_absorb_ce

.align 4
.Loop_absorb_ce:
    subs    $len,$len,$bsz          // len - bsz
    blo     .Labsorbed_ce
___
# Same two-lanes-per-iteration unrolling and flag reuse as in the
# scalar SHA3_absorb.
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
    ldr     d31,[$inp],#8           // *inp++
#ifdef __AARCH64EB__
    rev64   v31.16b,v31.16b
#endif
    eor     $A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
    cmp     $bsz,#8*($i+2)
    blo     .Lprocess_block_ce
    ldr     d31,[$inp],#8           // *inp++
#ifdef __AARCH64EB__
    rev64   v31.16b,v31.16b
#endif
    eor     $A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
    beq     .Lprocess_block_ce
___
}
$code.=<<___;
    ldr     d31,[$inp],#8           // *inp++
#ifdef __AARCH64EB__
    rev64   v31.16b,v31.16b
#endif
    eor     $A[4][4],$A[4][4],v31.16b

.Lprocess_block_ce:

    bl      KeccakF1600_ce

    b       .Loop_absorb_ce

.align 4
.Labsorbed_ce:
___
for($i=0; $i<24; $i+=2) {               # store A[5][5]
my $j=$i+1;
$code.=<<___;
    stp     d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
    str     d24,[x0,#8*$i]
    add     x0,$len,$bsz            // return value

    ldp     d8,d9,[sp,#16]
    ldp     d10,d11,[sp,#32]
    ldp     d12,d13,[sp,#48]
    ldp     d14,d15,[sp,#64]
    ldp     x29,x30,[sp],#80
    .inst   0xd50323bf              // autiasp
    ret
.size SHA3_absorb_cext,.-SHA3_absorb_cext
___
}
{
# SHA3_squeeze_cext: same register arguments as SHA3_squeeze (x0 state,
# x1 output, x2 len, x3 bsz). x9 walks the state and x10 counts the
# bytes left in the current block; both are re-initialised after each
# call to KeccakF1600_cext, and x30 is reloaded from the frame after the
# "bl", so the epilogue pops only x29.
my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
$code.=<<___;
.globl SHA3_squeeze_cext
.type SHA3_squeeze_cext,%function
.align 5
SHA3_squeeze_cext:
    .inst   0xd503233f              // paciasp
    stp     x29,x30,[sp,#-16]!
    add     x29,sp,#0
    mov     x9,$ctx
    mov     x10,$bsz

.Loop_squeeze_ce:
    ldr     x4,[x9],#8
    cmp     $len,#8
    blo     .Lsqueeze_tail_ce
#ifdef __AARCH64EB__
    rev     x4,x4
#endif
    str     x4,[$out],#8
    beq     .Lsqueeze_done_ce

    sub     $len,$len,#8
    subs    x10,x10,#8
    bhi     .Loop_squeeze_ce

    bl      KeccakF1600_cext
    ldr     x30,[sp,#8]
    mov     x9,$ctx
    mov     x10,$bsz
    b       .Loop_squeeze_ce

.align 4
.Lsqueeze_tail_ce:
    strb    w4,[$out],#1
    lsr     x4,x4,#8
    subs    $len,$len,#1
    beq     .Lsqueeze_done_ce
    strb    w4,[$out],#1
    lsr     x4,x4,#8
    subs    $len,$len,#1
    beq     .Lsqueeze_done_ce
    strb    w4,[$out],#1
    lsr     x4,x4,#8
    subs    $len,$len,#1
    beq     .Lsqueeze_done_ce
    strb    w4,[$out],#1
    lsr     x4,x4,#8
    subs    $len,$len,#1
    beq     .Lsqueeze_done_ce
    strb    w4,[$out],#1
    lsr     x4,x4,#8
    subs    $len,$len,#1
    beq     .Lsqueeze_done_ce
    strb    w4,[$out],#1
    lsr     x4,x4,#8
    subs    $len,$len,#1
    beq     .Lsqueeze_done_ce
    strb    w4,[$out],#1

.Lsqueeze_done_ce:
    ldr     x29,[sp],#16
    .inst   0xd50323bf              // autiasp
    ret
.size SHA3_squeeze_cext,.-SHA3_squeeze_cext
___
}                                                               }}}
$code.=<<___;
.asciz "Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
___

{   # Base opcodes of the four SHA3-extension instructions used above.
    my  %opcode = (
        "rax1"  => 0xce608c00,  "eor3"  => 0xce000000,
        "bcax"  => 0xce200000,  "xar"   => 0xce800000   );

    # unsha3($mnemonic, $arg)
    #
    # Turn one SHA3-extension instruction into a raw ".inst" word, keeping
    # the original text as a trailing comment. $arg is the operand text:
    # two or three vector registers, optionally followed by a fourth
    # register (eor3/bcax) or a "#expr" immediate (xar). The fields are
    # placed as: 1st operand at bit 0, 2nd at bit 5, 3rd at bit 16 and the
    # 4th register or evaluated immediate at bit 10.
    #
    # If the operands cannot be parsed, the instruction is returned as
    # written instead of a false value, because the caller's s///e would
    # otherwise replace the instruction with an empty string.
    sub unsha3 {
        my ($mnemonic,$arg)=@_;

        $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
            or return "$mnemonic\t$arg";

        # eval() folds the "64-N" rotate expressions written for xar.
        return sprintf ".inst\t0x%08x\t//%s %s",
                        $opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
                        $mnemonic,$arg;
    }
}

# Post-process the accumulated assembly line by line and print it to the
# arm-xlate.pl pipe.
foreach(split("\n",$code)) {

        # Evaluate any `...` expression embedded in the line.
        s/\`([^\`]*)\`/eval($1)/ge;

        # ld1r lines have their .16b arrangement rewritten to .2d; any
        # other line carrying a SHA3-extension mnemonic is encoded by
        # unsha3().
        m/\bld1r\b/ and s/\.16b/.2d/g   or
        s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;

        print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";