#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for x86_64.
#
# June 2017.
#
# The code below is a [lane-complementing] KECCAK_2X implementation
# (see sha/keccak1600.c) with C[5] and D[5] held in the register bank.
# Instead of actually unrolling the loop pair-wise, I simply flip the
# pointers to T[][] and A[][] at the end of each round. Since the
# number of rounds is even, the last round writes to A[][] and
# everything works out. How does it compare to the x86_64 assembly
# module in the Keccak Code Package? Depending on the processor it's
# either as fast, or faster by up to 15%...
#
########################################################################
# Numbers are cycles per processed byte on a large message.
#
#                       r=1088(*)
#
# P4                    25.8
# Core 2                12.9
# Westmere              13.7
# Sandy Bridge          12.9(**)
# Haswell               9.6
# Skylake               9.4
# Silvermont            22.8
# Goldmont              15.8
# VIA Nano              17.3
# Sledgehammer          13.3
# Bulldozer             16.5
# Ryzen                 8.8
#
# (*)   Corresponds to SHA3-256. Improvement over compiler-generated
#       code varies a lot; the most common coefficient is 15% in
#       comparison to gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
# (**)  Sandy Bridge has a broken rotate instruction. Performance can
#       be improved by 14% by replacing rotates with double-precision
#       shifts with the same register as source and destination.
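#
# To make the pointer-flipping remark concrete, here is an illustrative
# C-style sketch of the KECCAK_2X shape (names follow sha/keccak1600.c
# loosely, not this module's code):
#
#	for (i = 0; i < 24; i += 2) {
#		Round(T, A, i);		/* T[][] = round(A[][], i)   */
#		Round(A, T, i + 1);	/* A[][] = round(T[][], i+1) */
#	}
#
# This module takes one trip through .Loop per round instead: each trip
# reads from (%rdi) and writes to (%rsi), and the 'xchg %rsi,%rdi' near
# the loop bottom swaps the two roles, so after an even number of
# rounds the result lands back in A[][].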

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
              8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));

my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
my @D = map("%r$_",(8..12));
my @T = map("%r$_",(13..14));
my $iotas = "%r15";

my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

$code.=<<___;
.text

.type	__KeccakF1600,\@abi-omnipotent
.align	32
__KeccakF1600:
.cfi_startproc
	mov	$A[4][0](%rdi),@C[0]
	mov	$A[4][1](%rdi),@C[1]
	mov	$A[4][2](%rdi),@C[2]
	mov	$A[4][3](%rdi),@C[3]
	mov	$A[4][4](%rdi),@C[4]
	jmp	.Loop

.align	32
.Loop:
	mov	$A[0][0](%rdi),@D[0]
	mov	$A[1][1](%rdi),@D[1]
	mov	$A[2][2](%rdi),@D[2]
	mov	$A[3][3](%rdi),@D[3]

	xor	$A[0][2](%rdi),@C[2]
	xor	$A[0][3](%rdi),@C[3]
	xor	@D[0],@C[0]
	xor	$A[0][1](%rdi),@C[1]
	xor	$A[1][2](%rdi),@C[2]
	xor	$A[1][0](%rdi),@C[0]
	mov	@C[4],@D[4]
	xor	$A[0][4](%rdi),@C[4]

	xor	@D[2],@C[2]
	xor	$A[2][0](%rdi),@C[0]
	xor	$A[1][3](%rdi),@C[3]
	xor	@D[1],@C[1]
	xor	$A[1][4](%rdi),@C[4]

	xor	$A[3][2](%rdi),@C[2]
	xor	$A[3][0](%rdi),@C[0]
	xor	$A[2][3](%rdi),@C[3]
	xor	$A[2][1](%rdi),@C[1]
	xor	$A[2][4](%rdi),@C[4]

	mov	@C[2],@T[0]
	rol	\$1,@C[2]
	xor	@C[0],@C[2]		# D[1] = ROL64(C[2], 1) ^ C[0]
	xor	@D[3],@C[3]

	rol	\$1,@C[0]
	xor	@C[3],@C[0]		# D[4] = ROL64(C[0], 1) ^ C[3]
	xor	$A[3][1](%rdi),@C[1]

	rol	\$1,@C[3]
	xor	@C[1],@C[3]		# D[2] = ROL64(C[3], 1) ^ C[1]
	xor	$A[3][4](%rdi),@C[4]

	rol	\$1,@C[1]
	xor	@C[4],@C[1]		# D[0] = ROL64(C[1], 1) ^ C[4]

	rol	\$1,@C[4]
	xor	@T[0],@C[4]		# D[3] = ROL64(C[4], 1) ^ C[2]
___
	(@D[0..4], @C) = (@C[1..4,0], @D);
$code.=<<___;
	xor	@D[1],@C[1]
	xor	@D[2],@C[2]
	rol	\$$rhotates[1][1],@C[1]
	xor	@D[3],@C[3]
	xor	@D[4],@C[4]
	rol	\$$rhotates[2][2],@C[2]
	xor	@D[0],@C[0]
	mov	@C[1],@T[0]
	rol	\$$rhotates[3][3],@C[3]
	or	@C[2],@C[1]
	xor	@C[0],@C[1]		#           C[0] ^ ( C[1] | C[2])
	rol	\$$rhotates[4][4],@C[4]

	xor	($iotas),@C[1]
	lea	8($iotas),$iotas

	mov	@C[4],@T[1]
	and	@C[3],@C[4]
	mov	@C[1],$A[0][0](%rsi)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
	xor	@C[2],@C[4]		#           C[2] ^ ( C[4] & C[3])
	not	@C[2]
	mov	@C[4],$A[0][2](%rsi)	# R[0][2] = C[2] ^ ( C[4] & C[3])

	or	@C[3],@C[2]
	mov	$A[4][2](%rdi),@C[4]
	xor	@T[0],@C[2]		#           C[1] ^ (~C[2] | C[3])
	mov	@C[2],$A[0][1](%rsi)	# R[0][1] = C[1] ^ (~C[2] | C[3])

	and	@C[0],@T[0]
	mov	$A[1][4](%rdi),@C[1]
	xor	@T[1],@T[0]		#           C[4] ^ ( C[1] & C[0])
	mov	$A[2][0](%rdi),@C[2]
	mov	@T[0],$A[0][4](%rsi)	# R[0][4] = C[4] ^ ( C[1] & C[0])

	or	@C[0],@T[1]
	mov	$A[0][3](%rdi),@C[0]
	xor	@C[3],@T[1]		#           C[3] ^ ( C[4] | C[0])
	mov	$A[3][1](%rdi),@C[3]
	mov	@T[1],$A[0][3](%rsi)	# R[0][3] = C[3] ^ ( C[4] | C[0])

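	# Informational note on the or/and/not mix in these 'chi' rows:
	# six lanes (A[0][1], A[0][2], A[1][3], A[2][2], A[3][2] and
	# A[4][0], see the notq instructions in the callers) are kept
	# complemented, so most of chi's ~x & y terms reduce to a plain
	# AND or OR, leaving a single 'not' per output row. This is the
	# lane-complementing transform mentioned at the top of the file.
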
	xor	@D[3],@C[0]
	xor	@D[2],@C[4]
	rol	\$$rhotates[0][3],@C[0]
	xor	@D[1],@C[3]
	xor	@D[4],@C[1]
	rol	\$$rhotates[4][2],@C[4]
	rol	\$$rhotates[3][1],@C[3]
	xor	@D[0],@C[2]
	rol	\$$rhotates[1][4],@C[1]
	mov	@C[0],@T[0]
	or	@C[4],@C[0]
	rol	\$$rhotates[2][0],@C[2]

	xor	@C[3],@C[0]		#           C[3] ^ (C[0] |  C[4])
	mov	@C[0],$A[1][3](%rsi)	# R[1][3] = C[3] ^ (C[0] |  C[4])

	mov	@C[1],@T[1]
	and	@T[0],@C[1]
	mov	$A[0][1](%rdi),@C[0]
	xor	@C[4],@C[1]		#           C[4] ^ (C[1] &  C[0])
	not	@C[4]
	mov	@C[1],$A[1][4](%rsi)	# R[1][4] = C[4] ^ (C[1] &  C[0])

	or	@C[3],@C[4]
	mov	$A[1][2](%rdi),@C[1]
	xor	@C[2],@C[4]		#           C[2] ^ (~C[4] | C[3])
	mov	@C[4],$A[1][2](%rsi)	# R[1][2] = C[2] ^ (~C[4] | C[3])

	and	@C[2],@C[3]
	mov	$A[4][0](%rdi),@C[4]
	xor	@T[1],@C[3]		#           C[1] ^ (C[3] &  C[2])
	mov	@C[3],$A[1][1](%rsi)	# R[1][1] = C[1] ^ (C[3] &  C[2])

	or	@C[2],@T[1]
	mov	$A[2][3](%rdi),@C[2]
	xor	@T[0],@T[1]		#           C[0] ^ (C[1] |  C[2])
	mov	$A[3][4](%rdi),@C[3]
	mov	@T[1],$A[1][0](%rsi)	# R[1][0] = C[0] ^ (C[1] |  C[2])


	xor	@D[3],@C[2]
	xor	@D[4],@C[3]
	rol	\$$rhotates[2][3],@C[2]
	xor	@D[2],@C[1]
	rol	\$$rhotates[3][4],@C[3]
	xor	@D[0],@C[4]
	rol	\$$rhotates[1][2],@C[1]
	xor	@D[1],@C[0]
	rol	\$$rhotates[4][0],@C[4]
	mov	@C[2],@T[0]
	and	@C[3],@C[2]
	rol	\$$rhotates[0][1],@C[0]

	not	@C[3]
	xor	@C[1],@C[2]		#            C[1] ^ ( C[2] & C[3])
	mov	@C[2],$A[2][1](%rsi)	# R[2][1] =  C[1] ^ ( C[2] & C[3])

	mov	@C[4],@T[1]
	and	@C[3],@C[4]
	mov	$A[2][1](%rdi),@C[2]
	xor	@T[0],@C[4]		#            C[2] ^ ( C[4] & ~C[3])
	mov	@C[4],$A[2][2](%rsi)	# R[2][2] =  C[2] ^ ( C[4] & ~C[3])

	or	@C[1],@T[0]
	mov	$A[4][3](%rdi),@C[4]
	xor	@C[0],@T[0]		#            C[0] ^ ( C[2] | C[1])
	mov	@T[0],$A[2][0](%rsi)	# R[2][0] =  C[0] ^ ( C[2] | C[1])

	and	@C[0],@C[1]
	xor	@T[1],@C[1]		#            C[4] ^ ( C[1] & C[0])
	mov	@C[1],$A[2][4](%rsi)	# R[2][4] =  C[4] ^ ( C[1] & C[0])

	or	@C[0],@T[1]
	mov	$A[1][0](%rdi),@C[1]
	xor	@C[3],@T[1]		#           ~C[3] ^ ( C[0] | C[4])
	mov	$A[3][2](%rdi),@C[3]
	mov	@T[1],$A[2][3](%rsi)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])


	mov	$A[0][4](%rdi),@C[0]

	xor	@D[1],@C[2]
	xor	@D[2],@C[3]
	rol	\$$rhotates[2][1],@C[2]
	xor	@D[0],@C[1]
	rol	\$$rhotates[3][2],@C[3]
	xor	@D[3],@C[4]
	rol	\$$rhotates[1][0],@C[1]
	xor	@D[4],@C[0]
	rol	\$$rhotates[4][3],@C[4]
	mov	@C[2],@T[0]
	or	@C[3],@C[2]
	rol	\$$rhotates[0][4],@C[0]

	not	@C[3]
	xor	@C[1],@C[2]		#            C[1] ^ ( C[2] | C[3])
	mov	@C[2],$A[3][1](%rsi)	# R[3][1] =  C[1] ^ ( C[2] | C[3])

	mov	@C[4],@T[1]
	or	@C[3],@C[4]
	xor	@T[0],@C[4]		#            C[2] ^ ( C[4] | ~C[3])
	mov	@C[4],$A[3][2](%rsi)	# R[3][2] =  C[2] ^ ( C[4] | ~C[3])

	and	@C[1],@T[0]
	xor	@C[0],@T[0]		#            C[0] ^ ( C[2] & C[1])
	mov	@T[0],$A[3][0](%rsi)	# R[3][0] =  C[0] ^ ( C[2] & C[1])

	or	@C[0],@C[1]
	xor	@T[1],@C[1]		#            C[4] ^ ( C[1] | C[0])
	mov	@C[1],$A[3][4](%rsi)	# R[3][4] =  C[4] ^ ( C[1] | C[0])

	and	@T[1],@C[0]
	xor	@C[3],@C[0]		#           ~C[3] ^ ( C[0] & C[4])
	mov	@C[0],$A[3][3](%rsi)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])


	xor	$A[0][2](%rdi),@D[2]
	xor	$A[1][3](%rdi),@D[3]
	rol	\$$rhotates[0][2],@D[2]
	xor	$A[4][1](%rdi),@D[1]
	rol	\$$rhotates[1][3],@D[3]
	xor	$A[2][4](%rdi),@D[4]
	rol	\$$rhotates[4][1],@D[1]
	xor	$A[3][0](%rdi),@D[0]
	xchg	%rsi,%rdi
	rol	\$$rhotates[2][4],@D[4]
	rol	\$$rhotates[3][0],@D[0]
___
	@C = @D[2..4,0,1];
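# Informational note: Perl-level assignments at the heredoc breaks,
# (@D[0..4], @C) = (@C[1..4,0], @D) earlier and @C = @D[2..4,0,1] just
# above, permute which physical register each logical C[]/D[] name
# expands to. The renaming is resolved while this script generates the
# code, so shuffling lanes between rows costs no run-time instructions.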
$code.=<<___;
	mov	@C[0],@T[0]
	and	@C[1],@C[0]
	not	@C[1]
	xor	@C[4],@C[0]		#            C[4] ^ ( C[0] & C[1])
	mov	@C[0],$A[4][4](%rdi)	# R[4][4] =  C[4] ^ ( C[0] & C[1])

	mov	@C[2],@T[1]
	and	@C[1],@C[2]
	xor	@T[0],@C[2]		#            C[0] ^ ( C[2] & ~C[1])
	mov	@C[2],$A[4][0](%rdi)	# R[4][0] =  C[0] ^ ( C[2] & ~C[1])

	or	@C[4],@T[0]
	xor	@C[3],@T[0]		#            C[3] ^ ( C[0] | C[4])
	mov	@T[0],$A[4][3](%rdi)	# R[4][3] =  C[3] ^ ( C[0] | C[4])

	and	@C[3],@C[4]
	xor	@T[1],@C[4]		#            C[2] ^ ( C[4] & C[3])
	mov	@C[4],$A[4][2](%rdi)	# R[4][2] =  C[2] ^ ( C[4] & C[3])

	or	@T[1],@C[3]
	xor	@C[1],@C[3]		#           ~C[1] ^ ( C[2] | C[3])
	mov	@C[3],$A[4][1](%rdi)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])

	mov	@C[0],@C[1]		# harmonize with the loop top
	mov	@T[0],@C[0]

	test	\$255,$iotas
	jnz	.Loop

	lea	-192($iotas),$iotas	# rewind iotas
	ret
.cfi_endproc
.size	__KeccakF1600,.-__KeccakF1600

.type	KeccakF1600,\@abi-omnipotent
.align	32
KeccakF1600:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	lea	100(%rdi),%rdi		# size optimization
	sub	\$200,%rsp
.cfi_adjust_cfa_offset	200

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)

	lea	iotas(%rip),$iotas
	lea	100(%rsp),%rsi		# size optimization

	call	__KeccakF1600

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)
	lea	-100(%rdi),%rdi		# preserve A[][]

	add	\$200,%rsp
.cfi_adjust_cfa_offset	-200

	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbp
.cfi_pop	%rbp
	pop	%rbx
.cfi_pop	%rbx
	ret
.cfi_endproc
.size	KeccakF1600,.-KeccakF1600
___

{ my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($A_flat,$inp) = ("%r8","%r9");
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function,4
.align	32
SHA3_absorb:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	lea	100(%rdi),%rdi		# size optimization
	sub	\$232,%rsp
.cfi_adjust_cfa_offset	232

	mov	%rsi,$inp
	lea	100(%rsp),%rsi		# size optimization

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)
	lea	iotas(%rip),$iotas

	mov	$bsz,216-100(%rsi)	# save bsz

.Loop_absorb:
	cmp	$bsz,$len
	jc	.Ldone_absorb

	shr	\$3,$bsz
	lea	-100(%rdi),$A_flat

.Lblock_absorb:
	mov	($inp),%rax
	lea	8($inp),$inp
	xor	($A_flat),%rax
	lea	8($A_flat),$A_flat
	sub	\$8,$len
	mov	%rax,-8($A_flat)
	sub	\$1,$bsz
	jnz	.Lblock_absorb

	mov	$inp,200-100(%rsi)	# save inp
	mov	$len,208-100(%rsi)	# save len
	call	__KeccakF1600
	mov	200-100(%rsi),$inp	# pull inp
	mov	208-100(%rsi),$len	# pull len
	mov	216-100(%rsi),$bsz	# pull bsz
	jmp	.Loop_absorb

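	# Informational note: the .Lblock_absorb loop above XORs one
	# block of input into the state, one 8-byte lane at a time. Once
	# fewer than bsz bytes remain, the jc at .Loop_absorb lands at
	# .Ldone_absorb below, which returns the residual byte count in
	# %rax so the caller can deal with the final partial block.
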
.align	32
.Ldone_absorb:
	mov	$len,%rax		# return value

	notq	$A[0][1](%rdi)
	notq	$A[0][2](%rdi)
	notq	$A[1][3](%rdi)
	notq	$A[2][2](%rdi)
	notq	$A[3][2](%rdi)
	notq	$A[4][0](%rdi)

	add	\$232,%rsp
.cfi_adjust_cfa_offset	-232

	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbp
.cfi_pop	%rbp
	pop	%rbx
.cfi_pop	%rbx
	ret
.cfi_endproc
.size	SHA3_absorb,.-SHA3_absorb
___
}
{ my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($out,$len,$bsz) = ("%r12","%r13","%r14");

$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function,4
.align	32
SHA3_squeeze:
.cfi_startproc
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14

	shr	\$3,%rcx		# bsz in lanes
	mov	$A_flat,%r8
	mov	%rsi,$out
	mov	%rdx,$len
	mov	%rcx,$bsz
	jmp	.Loop_squeeze

.align	32
.Loop_squeeze:
	cmp	\$8,$len
	jb	.Ltail_squeeze

	mov	(%r8),%rax
	lea	8(%r8),%r8
	mov	%rax,($out)
	lea	8($out),$out
	sub	\$8,$len		# len -= 8
	jz	.Ldone_squeeze

	sub	\$1,%rcx		# bsz--
	jnz	.Loop_squeeze

	call	KeccakF1600
	mov	$A_flat,%r8
	mov	$bsz,%rcx
	jmp	.Loop_squeeze

.Ltail_squeeze:
	mov	%r8,%rsi
	mov	$out,%rdi
	mov	$len,%rcx
	.byte	0xf3,0xa4		# rep movsb

.Ldone_squeeze:
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	ret
.cfi_endproc
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
$code.=<<___;
.align	256
	.quad	0,0,0,0,0,0,0,0		# places iotas 192 bytes below a
					# 256-byte boundary, so that the
					# 'test \$255' in __KeccakF1600
					# hits zero after exactly 24 rounds
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

foreach (split("\n",$code)) {
	# The replacement below results in 11.2 on Sandy Bridge and 9.4
	# on Haswell, but it hurts other processors by up to 2-4x...
	#s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
	# The replacement below results in 9.3 on Haswell, but also 9.3
	# on Ryzen, i.e. it *hurts* Ryzen...
	#s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;

	print $_, "\n";
}

close STDOUT or die "error closing STDOUT: $!";