#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for ARMv8.
#
# June 2017.
#
# This is straightforward KECCAK_1X_ALT implementation. It makes no
# sense to attempt SIMD/NEON implementation for following reason.
# 64-bit lanes of vector registers can't be addressed as easily as in
# 32-bit mode. This means that 64-bit NEON is bound to be slower than
# 32-bit NEON, and this implementation is faster than 32-bit NEON on
# same processor. Even though it takes more scalar xor's and andn's,
# it gets compensated by availability of rotate. Not to forget that
# most processors achieve higher issue rate with scalar instructions.
#
# February 2018.
#
# Add hardware-assisted ARMv8.2 implementation. It's KECCAK_1X_ALT
# variant with register permutation/rotation twist that allows to
# eliminate copies to temporary registers. If you look closely you'll
# notice that it uses only one lane of vector registers. The new
# instructions effectively facilitate parallel hashing, which we don't
# support [yet?]. But lowest-level core procedure is prepared for it.
# The inner round is 67 [vector] instructions, so it's not actually
# obvious that it will provide performance improvement [in serial
# hash] as long as vector instructions issue rate is limited to 1 per
# cycle...
#
######################################################################
# Numbers are cycles per processed byte.
#
#		r=1088(*)
#
# Cortex-A53	13
# Cortex-A57	12
# X-Gene	14
# Mongoose	10
# Kryo		12
# Denver	7.8
# Apple A7	7.2
# ThunderX2	9.7
#
# (*)	Corresponds to SHA3-256. No improvement coefficients are listed
#	because they vary too much from compiler to compiler. Newer
#	compiler does much better and improvement varies from 5% on
#	Cortex-A57 to 25% on Cortex-A53. While in comparison to older
#	compiler this code is at least 2x faster...

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl post-processor: first next to this script, then
# in ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through arm-xlate.pl.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Rotation amounts, indexed the same way as the state lanes A[i][j].
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

# Round-constant table. It is preceded by 64 bytes of padding after a
# 256-byte alignment, so the 24 entries end on a 256-byte boundary and the
# scalar loop can use "tst pointer,#255" as its termination test.
$code.=<<___;
.text

.align 8	// strategic alignment and padding that allows to use
		// address value as loop termination condition...
	.quad	0,0,0,0,0,0,0,0
.type	iotas,%object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
___
								{{{
######################################################################
# Scalar implementation.
#
# The 25 state lanes A[i][j] are kept in general-purpose registers
# x0..x24, except A[3][3] which is moved to x25 because x18 is reserved.
# @C holds the scratch registers x26, x27, x28 and x30.
my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
            (0, 5, 10, 15, 20));
   $A[3][3] = "x25"; # x18 is reserved

my @C = map("x$_", (26,27,28,30));

# KeccakF1600_int: permutation on the register-resident state. It uses the
# 32 bytes at [sp,#0..31] of the caller's frame: [sp,#0] temporarily holds
# the A[0][4]/A[1][4] pair, [sp,#16] the current iotas pointer and
# [sp,#24] the saved x30 (x30 doubles as scratch register $C[3]).
$code.=<<___;
.type	KeccakF1600_int,%function
.align	5
KeccakF1600_int:
	adr	$C[2],iotas
	.inst	0xd503233f			// paciasp
	stp	$C[2],x30,[sp,#16]		// 32 bytes on top are mine
	b	.Loop
.align	4
.Loop:
	////////////////////////////////////////// Theta
	eor	$C[0],$A[0][0],$A[1][0]
	stp	$A[0][4],$A[1][4],[sp,#0]	// offload pair...
	eor	$C[1],$A[0][1],$A[1][1]
	eor	$C[2],$A[0][2],$A[1][2]
	eor	$C[3],$A[0][3],$A[1][3]
___
	# With A[0][4] and A[1][4] offloaded to the stack above, their
	# registers serve as two extra temporaries until they are re-loaded.
	$C[4]=$A[0][4];
	$C[5]=$A[1][4];
$code.=<<___;
	eor	$C[4],$A[0][4],$A[1][4]
	eor	$C[0],$C[0],$A[2][0]
	eor	$C[1],$C[1],$A[2][1]
	eor	$C[2],$C[2],$A[2][2]
	eor	$C[3],$C[3],$A[2][3]
	eor	$C[4],$C[4],$A[2][4]
	eor	$C[0],$C[0],$A[3][0]
	eor	$C[1],$C[1],$A[3][1]
	eor	$C[2],$C[2],$A[3][2]
	eor	$C[3],$C[3],$A[3][3]
	eor	$C[4],$C[4],$A[3][4]
	eor	$C[0],$C[0],$A[4][0]
	eor	$C[2],$C[2],$A[4][2]
	eor	$C[1],$C[1],$A[4][1]
	eor	$C[3],$C[3],$A[4][3]
	eor	$C[4],$C[4],$A[4][4]

	eor	$C[5],$C[0],$C[2],ror#63

	eor	$A[0][1],$A[0][1],$C[5]
	eor	$A[1][1],$A[1][1],$C[5]
	eor	$A[2][1],$A[2][1],$C[5]
	eor	$A[3][1],$A[3][1],$C[5]
	eor	$A[4][1],$A[4][1],$C[5]

	eor	$C[5],$C[1],$C[3],ror#63
	eor	$C[2],$C[2],$C[4],ror#63
	eor	$C[3],$C[3],$C[0],ror#63
	eor	$C[4],$C[4],$C[1],ror#63

	eor	$C[1],   $A[0][2],$C[5]		// mov	$C[1],$A[0][2]
	eor	$A[1][2],$A[1][2],$C[5]
	eor	$A[2][2],$A[2][2],$C[5]
	eor	$A[3][2],$A[3][2],$C[5]
	eor	$A[4][2],$A[4][2],$C[5]

	eor	$A[0][0],$A[0][0],$C[4]
	eor	$A[1][0],$A[1][0],$C[4]
	eor	$A[2][0],$A[2][0],$C[4]
	eor	$A[3][0],$A[3][0],$C[4]
	eor	$A[4][0],$A[4][0],$C[4]
___
	# The borrowed registers go back to being state lanes.
	$C[4]=undef;
	$C[5]=undef;
$code.=<<___;
	ldp	$A[0][4],$A[1][4],[sp,#0]	// re-load offloaded data
	eor	$C[0],   $A[0][3],$C[2]		// mov	$C[0],$A[0][3]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[4][3],$A[4][3],$C[2]

	eor	$C[2],   $A[0][4],$C[3]		// mov	$C[2],$A[0][4]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[4][4],$A[4][4],$C[3]

	////////////////////////////////////////// Rho+Pi
	mov	$C[3],$A[0][1]
	ror	$A[0][1],$A[1][1],#64-$rhotates[1][1]
	//mov	$C[1],$A[0][2]
	ror	$A[0][2],$A[2][2],#64-$rhotates[2][2]
	//mov	$C[0],$A[0][3]
	ror	$A[0][3],$A[3][3],#64-$rhotates[3][3]
	//mov	$C[2],$A[0][4]
	ror	$A[0][4],$A[4][4],#64-$rhotates[4][4]

	ror	$A[1][1],$A[1][4],#64-$rhotates[1][4]
	ror	$A[2][2],$A[2][3],#64-$rhotates[2][3]
	ror	$A[3][3],$A[3][2],#64-$rhotates[3][2]
	ror	$A[4][4],$A[4][1],#64-$rhotates[4][1]

	ror	$A[1][4],$A[4][2],#64-$rhotates[4][2]
	ror	$A[2][3],$A[3][4],#64-$rhotates[3][4]
	ror	$A[3][2],$A[2][1],#64-$rhotates[2][1]
	ror	$A[4][1],$A[1][3],#64-$rhotates[1][3]

	ror	$A[4][2],$A[2][4],#64-$rhotates[2][4]
	ror	$A[3][4],$A[4][3],#64-$rhotates[4][3]
	ror	$A[2][1],$A[1][2],#64-$rhotates[1][2]
	ror	$A[1][3],$A[3][1],#64-$rhotates[3][1]

	ror	$A[2][4],$A[4][0],#64-$rhotates[4][0]
	ror	$A[4][3],$A[3][0],#64-$rhotates[3][0]
	ror	$A[1][2],$A[2][0],#64-$rhotates[2][0]
	ror	$A[3][1],$A[1][0],#64-$rhotates[1][0]

	ror	$A[1][0],$C[0],#64-$rhotates[0][3]
	ror	$A[2][0],$C[3],#64-$rhotates[0][1]
	ror	$A[3][0],$C[2],#64-$rhotates[0][4]
	ror	$A[4][0],$C[1],#64-$rhotates[0][2]

	////////////////////////////////////////// Chi+Iota
	bic	$C[0],$A[0][2],$A[0][1]
	bic	$C[1],$A[0][3],$A[0][2]
	bic	$C[2],$A[0][0],$A[0][4]
	bic	$C[3],$A[0][1],$A[0][0]
	eor	$A[0][0],$A[0][0],$C[0]
	bic	$C[0],$A[0][4],$A[0][3]
	eor	$A[0][1],$A[0][1],$C[1]
	 ldr	$C[1],[sp,#16]
	eor	$A[0][3],$A[0][3],$C[2]
	eor	$A[0][4],$A[0][4],$C[3]
	eor	$A[0][2],$A[0][2],$C[0]
	 ldr	$C[3],[$C[1]],#8		// Iota[i++]

	bic	$C[0],$A[1][2],$A[1][1]
	 tst	$C[1],#255			// are we done?
	 str	$C[1],[sp,#16]
	bic	$C[1],$A[1][3],$A[1][2]
	bic	$C[2],$A[1][0],$A[1][4]
	 eor	$A[0][0],$A[0][0],$C[3]		// A[0][0] ^= Iota
	bic	$C[3],$A[1][1],$A[1][0]
	eor	$A[1][0],$A[1][0],$C[0]
	bic	$C[0],$A[1][4],$A[1][3]
	eor	$A[1][1],$A[1][1],$C[1]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[1][2],$A[1][2],$C[0]

	bic	$C[0],$A[2][2],$A[2][1]
	bic	$C[1],$A[2][3],$A[2][2]
	bic	$C[2],$A[2][0],$A[2][4]
	bic	$C[3],$A[2][1],$A[2][0]
	eor	$A[2][0],$A[2][0],$C[0]
	bic	$C[0],$A[2][4],$A[2][3]
	eor	$A[2][1],$A[2][1],$C[1]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[2][2],$A[2][2],$C[0]

	bic	$C[0],$A[3][2],$A[3][1]
	bic	$C[1],$A[3][3],$A[3][2]
	bic	$C[2],$A[3][0],$A[3][4]
	bic	$C[3],$A[3][1],$A[3][0]
	eor	$A[3][0],$A[3][0],$C[0]
	bic	$C[0],$A[3][4],$A[3][3]
	eor	$A[3][1],$A[3][1],$C[1]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[3][2],$A[3][2],$C[0]

	bic	$C[0],$A[4][2],$A[4][1]
	bic	$C[1],$A[4][3],$A[4][2]
	bic	$C[2],$A[4][0],$A[4][4]
	bic	$C[3],$A[4][1],$A[4][0]
	eor	$A[4][0],$A[4][0],$C[0]
	bic	$C[0],$A[4][4],$A[4][3]
	eor	$A[4][1],$A[4][1],$C[1]
	eor	$A[4][3],$A[4][3],$C[2]
	eor	$A[4][4],$A[4][4],$C[3]
	eor	$A[4][2],$A[4][2],$C[0]

	bne	.Loop

	ldr	x30,[sp,#24]
	.inst	0xd50323bf			// autiasp
	ret
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600,%function
.align	5
KeccakF1600:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#48

	str	x0,[sp,#32]			// offload argument
	mov	$C[0],x0
	ldp	$A[0][0],$A[0][1],[x0,#16*0]
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
	ldr	$A[4][4],[$C[0],#16*12]

	bl	KeccakF1600_int

	ldr	$C[0],[sp,#32]
	stp	$A[0][0],$A[0][1],[$C[0],#16*0]
	stp	$A[0][2],$A[0][3],[$C[0],#16*1]
	stp	$A[0][4],$A[1][0],[$C[0],#16*2]
	stp	$A[1][1],$A[1][2],[$C[0],#16*3]
	stp	$A[1][3],$A[1][4],[$C[0],#16*4]
	stp	$A[2][0],$A[2][1],[$C[0],#16*5]
	stp	$A[2][2],$A[2][3],[$C[0],#16*6]
	stp	$A[2][4],$A[3][0],[$C[0],#16*7]
	stp	$A[3][1],$A[3][2],[$C[0],#16*8]
	stp	$A[3][3],$A[3][4],[$C[0],#16*9]
	stp	$A[4][0],$A[4][1],[$C[0],#16*10]
	stp	$A[4][2],$A[4][3],[$C[0],#16*11]
	str	$A[4][4],[$C[0],#16*12]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#48
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	.inst	0xd50323bf			// autiasp
	ret
.size	KeccakF1600,.-KeccakF1600

.globl	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	stp	x0,x1,[sp,#32]			// offload arguments
	stp	x2,x3,[sp,#48]

	mov	$C[0],x0			// uint64_t A[5][5]
	mov	$C[1],x1			// const void *inp
	mov	$C[2],x2			// size_t len
	mov	$C[3],x3			// size_t bsz
	ldp	$A[0][0],$A[0][1],[$C[0],#16*0]
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
	ldr	$A[4][4],[$C[0],#16*12]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	$C[0],$C[2],$C[3]		// len - bsz
	blo	.Labsorbed

	str	$C[0],[sp,#48]			// save len - bsz
___
# Unrolled absorption: each pass XORs two consecutive 64-bit input words
# into lanes $i and $i+1. After the first word the block size is compared
# with 8*($i+2): "blo" leaves when the block ended with lane $i, and "beq"
# (using the same flags) leaves when it ended with lane $i+1.
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
	cmp	$C[3],#8*($i+2)
	blo	.Lprocess_block
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
	beq	.Lprocess_block
___
}
$code.=<<___;
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[4][4],$A[4][4],$C[0]

.Lprocess_block:
	str	$C[1],[sp,#40]			// save inp

	bl	KeccakF1600_int

	ldr	$C[1],[sp,#40]			// restore arguments
	ldp	$C[2],$C[3],[sp,#48]
	b	.Loop_absorb

.align	4
.Labsorbed:
	ldr	$C[1],[sp,#32]
	stp	$A[0][0],$A[0][1],[$C[1],#16*0]
	stp	$A[0][2],$A[0][3],[$C[1],#16*1]
	stp	$A[0][4],$A[1][0],[$C[1],#16*2]
	stp	$A[1][1],$A[1][2],[$C[1],#16*3]
	stp	$A[1][3],$A[1][4],[$C[1],#16*4]
	stp	$A[2][0],$A[2][1],[$C[1],#16*5]
	stp	$A[2][2],$A[2][3],[$C[1],#16*6]
	stp	$A[2][4],$A[3][0],[$C[1],#16*7]
	stp	$A[3][1],$A[3][2],[$C[1],#16*8]
	stp	$A[3][3],$A[3][4],[$C[1],#16*9]
	stp	$A[4][0],$A[4][1],[$C[1],#16*10]
	stp	$A[4][2],$A[4][3],[$C[1],#16*11]
	str	$A[4][4],[$C[1],#16*12]

	mov	x0,$C[2]			// return value
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_absorb,.-SHA3_absorb
___
{
# SHA3_squeeze keeps copies of its four arguments in x19..x22; x0 and x3
# are used as the running state pointer and the per-block byte counter.
my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	mov	$A_flat,x0			// put aside arguments
	mov	$out,x1
	mov	$len,x2
	mov	$bsz,x3

.Loop_squeeze:
	ldr	x4,[x0],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8
	bhi	.Loop_squeeze

	mov	x0,$A_flat
	bl	KeccakF1600
	mov	x0,$A_flat
	mov	x3,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
	.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
___
}								}}}
								{{{
######################################################################
# ARMv8.2 implementation using eor3/rax1/xar/bcax.
#
# The state lanes A[i][j] are v0..v24 and the temporaries @C are v25..v31;
# @D is a re-ordered view of five of those temporaries.
my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
                             "v".($_+3).".16b", "v".($_+4).".16b" ],
            (0, 5, 10, 15, 20));

my @C = map("v$_.16b", (25..31));
my @D = @C[4,5,6,2,3];

# KeccakF1600_ce: 24 rounds counted down in x9, with x10 walking the iotas
# table. The operands are written as .16b here; the output loop at the end
# of this script rewrites .16b to .2d on the ld1r line.
$code.=<<___;
.type	KeccakF1600_ce,%function
.align	5
KeccakF1600_ce:
	mov	x9,#24
	adr	x10,iotas
	b	.Loop_ce
.align	4
.Loop_ce:
	////////////////////////////////////////////////// Theta
	eor3	$C[0],$A[4][0],$A[3][0],$A[2][0]
	eor3	$C[1],$A[4][1],$A[3][1],$A[2][1]
	eor3	$C[2],$A[4][2],$A[3][2],$A[2][2]
	eor3	$C[3],$A[4][3],$A[3][3],$A[2][3]
	eor3	$C[4],$A[4][4],$A[3][4],$A[2][4]
	eor3	$C[0],$C[0],   $A[1][0],$A[0][0]
	eor3	$C[1],$C[1],   $A[1][1],$A[0][1]
	eor3	$C[2],$C[2],   $A[1][2],$A[0][2]
	eor3	$C[3],$C[3],   $A[1][3],$A[0][3]
	eor3	$C[4],$C[4],   $A[1][4],$A[0][4]

	rax1	$C[5],$C[0],$C[2]			// D[1]
	rax1	$C[6],$C[1],$C[3]			// D[2]
	rax1	$C[2],$C[2],$C[4]			// D[3]
	rax1	$C[3],$C[3],$C[0]			// D[4]
	rax1	$C[4],$C[4],$C[1]			// D[0]

	////////////////////////////////////////////////// Theta+Rho+Pi
	xar	$C[0],   $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0]

	xar	$A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1]
	xar	$A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4]
	xar	$A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2]
	xar	$A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4]
	xar	$A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0]

	xar	$C[1],   $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0]

	xar	$A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2]
	xar	$A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3]
	xar	$A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4]
	xar	$A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3]
	xar	$A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0]

	xar	$A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4]

	xar	$D[4],   $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4]
	xar	$A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1]
	xar	$A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1]
	xar	$A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3]
	xar	$A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0]

	xar	$A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3]

	eor	$A[0][0],$A[0][0],$D[0]

	xar	$D[3],   $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3]
	xar	$A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3]
	xar	$D[1],   $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2]
	xar	$D[2],   $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1]
	xar	$D[0],   $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2]

	////////////////////////////////////////////////// Chi+Iota
	bcax	$A[4][0],$C[1],   $A[4][2],$A[1][3]	// A[1][3]=A[4][1]
	bcax	$A[4][1],$A[1][3],$A[4][3],$A[4][2]	// A[1][3]=A[4][1]
	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
	bcax	$A[4][3],$A[4][3],$C[1],   $A[4][4]
	bcax	$A[4][4],$A[4][4],$A[1][3],$C[1]	// A[1][3]=A[4][1]

	ld1r	{$C[1]},[x10],#8

	bcax	$A[3][2],$D[1],   $A[3][4],$A[0][3]	// A[0][3]=A[3][3]
	bcax	$A[3][3],$A[0][3],$A[3][0],$A[3][4]	// A[0][3]=A[3][3]
	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]
	bcax	$A[3][0],$A[3][0],$D[1],   $A[3][1]
	bcax	$A[3][1],$A[3][1],$A[0][3],$D[1]	// A[0][3]=A[3][3]

	bcax	$A[2][0],$C[0],   $A[2][2],$D[2]
	bcax	$A[2][1],$D[2],   $A[2][3],$A[2][2]
	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
	bcax	$A[2][3],$A[2][3],$C[0],   $A[2][4]
	bcax	$A[2][4],$A[2][4],$D[2],   $C[0]

	bcax	$A[1][2],$D[0],   $A[1][4],$A[0][4]	// A[0][4]=A[1][3]
	bcax	$A[1][3],$A[0][4],$A[1][0],$A[1][4]	// A[0][4]=A[1][3]
	bcax	$A[1][4],$A[1][4],$A[1][1],$A[1][0]
	bcax	$A[1][0],$A[1][0],$D[0],   $A[1][1]
	bcax	$A[1][1],$A[1][1],$A[0][4],$D[0]	// A[0][4]=A[1][3]

	bcax	$A[0][3],$D[3],   $A[0][0],$D[4]
	bcax	$A[0][4],$D[4],   $A[0][1],$A[0][0]
	bcax	$A[0][0],$A[0][0],$A[0][2],$A[0][1]
	bcax	$A[0][1],$A[0][1],$D[3],   $A[0][2]
	bcax	$A[0][2],$A[0][2],$D[4],   $D[3]

	eor	$A[0][0],$A[0][0],$C[1]

	subs	x9,x9,#1
	bne	.Loop_ce

	ret
.size	KeccakF1600_ce,.-KeccakF1600_ce

.type	KeccakF1600_cext,%function
.align	5
KeccakF1600_cext:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
# NOTE: $i is deliberately not lexical in the load/store loops below; its
# final value (24) is used by the statement that follows each loop to
# address the 25th lane.
for($i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	bl	KeccakF1600_ce
	ldr	x30,[sp,#8]
___
for($i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	str	d24,[x0,#8*$i]

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldr	x29,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	KeccakF1600_cext,.-KeccakF1600_cext
___

{
my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));

$code.=<<___;
.globl	SHA3_absorb_cext
.type	SHA3_absorb_cext,%function
.align	5
SHA3_absorb_cext:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
for($i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	b	.Loop_absorb_ce

.align	4
.Loop_absorb_ce:
	subs	$len,$len,$bsz		// len - bsz
	blo	.Labsorbed_ce
___
# Same unrolled two-lanes-per-pass absorption as in SHA3_absorb, with the
# input word staged in v31.
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
	cmp	$bsz,#8*($i+2)
	blo	.Lprocess_block_ce
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
	beq	.Lprocess_block_ce
___
}
$code.=<<___;
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[4][4],$A[4][4],v31.16b

.Lprocess_block_ce:

	bl	KeccakF1600_ce

	b	.Loop_absorb_ce

.align	4
.Labsorbed_ce:
___
for($i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	str	d24,[x0,#8*$i]
	add	x0,$len,$bsz		// return value

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldp	x29,x30,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	SHA3_absorb_cext,.-SHA3_absorb_cext
___
}
{
my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
$code.=<<___;
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x9,$ctx
	mov	x10,$bsz

.Loop_squeeze_ce:
	ldr	x4,[x9],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail_ce
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	beq	.Lsqueeze_done_ce

	sub	$len,$len,#8
	subs	x10,x10,#8
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext
	ldr	x30,[sp,#8]
	mov	x9,$ctx
	mov	x10,$bsz
	b	.Loop_squeeze_ce

.align	4
.Lsqueeze_tail_ce:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1

.Lsqueeze_done_ce:
	ldr	x29,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
___
}								}}}
$code.=<<___;
.asciz	"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
___

{   my	%opcode = (
	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000,
	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	);

    # unsha3($mnemonic, $arg)
    #
    # Turn one eor3/rax1/xar/bcax instruction into a raw ".inst" word,
    # keeping the original text as a trailing comment (presumably so that
    # assemblers lacking these mnemonics can still build the output).
    #
    #   $mnemonic - one of the keys of %opcode
    #   $arg      - operand text following the mnemonic; may carry a
    #               trailing comment
    #
    # Returns the ".inst" line. If the operands cannot be parsed, the
    # instruction is returned as "$mnemonic\t$arg" so that the caller's
    # s///e never replaces it with an empty string.
    sub unsha3 {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
	    or return "$mnemonic\t$arg";

	# Copy the captures before anything else can disturb them.
	my ($rd,$rn,$rm,$ra) = ($1,$2,$3,$4);

	# The optional operands contribute nothing when absent. The 4th one
	# is either a register number or an immediate written as an
	# expression such as "64-44", hence the eval.
	$rm = 0 unless defined $rm;
	$ra = defined $ra ? eval($ra) : 0;

	# 1st operand goes to bit 0, 2nd to bit 5, 3rd to bit 16 and the
	# 4th register/immediate to bit 10.
	return sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16)|($ra<<10),
			$mnemonic,$arg;
    }
}

# Emit the accumulated code line by line: evaluate `...` expressions, use
# the .2d arrangement on ld1r lines, and hand-encode the SHA3 instructions
# on all other lines.
foreach(split("\n",$code)) {

	s/\`([^\`]*)\`/eval($1)/ge;

	m/\bld1r\b/ and s/\.16b/.2d/g	or
	s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";