1#!/usr/bin/env perl 2# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8# 9# ==================================================================== 10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 11# project. The module is, however, dual licensed under OpenSSL and 12# CRYPTOGAMS licenses depending on where you obtain it. For further 13# details see http://www.openssl.org/~appro/cryptogams/. 14# ==================================================================== 15# 16# Keccak-1600 for s390x. 17# 18# June 2017. 19# 20# Below code is [lane complementing] KECCAK_2X implementation (see 21# sha/keccak1600.c) with C[5] and D[5] held in register bank. Though 22# instead of actually unrolling the loop pair-wise I simply flip 23# pointers to T[][] and A[][] at the end of round. Since number of 24# rounds is even, last round writes to A[][] and everything works out. 25# In the nutshell it's transliteration of x86_64 module, because both 26# architectures have similar capabilities/limitations. Performance 27# measurement is problematic as I don't have access to an idle system. 28# It looks like z13 processes one byte [out of long message] in ~14 29# cycles. At least the result is consistent with estimate based on 30# amount of instruction and assumed instruction issue rate. It's ~2.5x 31# faster than compiler-generated code. 32 33# $output is the last argument if it looks like a file (it has an extension) 34# $flavour is the first argument if it doesn't look like a file 35$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 36$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 37 38if ($flavour =~ /3[12]/) { 39 $SIZE_T=4; 40 $g=""; 41} else { 42 $SIZE_T=8; 43 $g="g"; 44} 45 46$output and open STDOUT,">$output"; 47 48my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20)); 49 50my @C = map("%r$_",(0,1,5..7)); 51my @D = map("%r$_",(8..12)); 52my @T = map("%r$_",(13..14)); 53my ($src,$dst,$iotas) = map("%r$_",(2..4)); 54my $sp = "%r15"; 55 56$stdframe=16*$SIZE_T+4*8; 57$frame=$stdframe+25*8; 58 59my @rhotates = ([ 0, 1, 62, 28, 27 ], 60 [ 36, 44, 6, 55, 20 ], 61 [ 3, 10, 43, 25, 39 ], 62 [ 41, 45, 15, 21, 8 ], 63 [ 18, 2, 61, 56, 14 ]); 64 65{ my @C = @C; # copy, because we mess them up... 66 my @D = @D; 67 68$code.=<<___; 69.text 70 71.type __KeccakF1600,\@function 72.align 32 73__KeccakF1600: 74 st${g} %r14,$SIZE_T*14($sp) 75 lg @C[0],$A[4][0]($src) 76 lg @C[1],$A[4][1]($src) 77 lg @C[2],$A[4][2]($src) 78 lg @C[3],$A[4][3]($src) 79 lg @C[4],$A[4][4]($src) 80 larl $iotas,iotas 81 j .Loop 82 83.align 16 84.Loop: 85 lg @D[0],$A[0][0]($src) 86 lg @D[1],$A[1][1]($src) 87 lg @D[2],$A[2][2]($src) 88 lg @D[3],$A[3][3]($src) 89 90 xgr @C[0],@D[0] 91 xg @C[1],$A[0][1]($src) 92 xg @C[2],$A[0][2]($src) 93 xg @C[3],$A[0][3]($src) 94 lgr @D[4],@C[4] 95 xg @C[4],$A[0][4]($src) 96 97 xg @C[0],$A[1][0]($src) 98 xgr @C[1],@D[1] 99 xg @C[2],$A[1][2]($src) 100 xg @C[3],$A[1][3]($src) 101 xg @C[4],$A[1][4]($src) 102 103 xg @C[0],$A[2][0]($src) 104 xg @C[1],$A[2][1]($src) 105 xgr @C[2],@D[2] 106 xg @C[3],$A[2][3]($src) 107 xg @C[4],$A[2][4]($src) 108 109 xg @C[0],$A[3][0]($src) 110 xg @C[1],$A[3][1]($src) 111 xg @C[2],$A[3][2]($src) 112 xgr @C[3],@D[3] 113 xg @C[4],$A[3][4]($src) 114 115 lgr @T[0],@C[2] 116 rllg @C[2],@C[2],1 117 xgr @C[2],@C[0] # D[1] = ROL64(C[2], 1) ^ C[0] 118 119 rllg @C[0],@C[0],1 120 xgr @C[0],@C[3] # D[4] = ROL64(C[0], 1) ^ C[3] 121 122 rllg @C[3],@C[3],1 123 xgr @C[3],@C[1] # D[2] = ROL64(C[3], 1) ^ C[1] 124 125 rllg @C[1],@C[1],1 126 xgr @C[1],@C[4] # D[0] = ROL64(C[1], 1) ^ C[4] 127 128 rllg @C[4],@C[4],1 129 xgr @C[4],@T[0] # D[3] = ROL64(C[4], 1) ^ C[2] 130___ 131 (@D[0..4], @C) = (@C[1..4,0], @D); 132$code.=<<___; 133 xgr @C[1],@D[1] 134 xgr @C[2],@D[2] 135 xgr @C[3],@D[3] 136 rllg @C[1],@C[1],$rhotates[1][1] 137 xgr @C[4],@D[4] 138 rllg @C[2],@C[2],$rhotates[2][2] 139 xgr @C[0],@D[0] 140 141 lgr @T[0],@C[1] 142 ogr @C[1],@C[2] 143 rllg @C[3],@C[3],$rhotates[3][3] 144 xgr @C[1],@C[0] # C[0] ^ ( C[1] | C[2]) 145 rllg @C[4],@C[4],$rhotates[4][4] 146 xg @C[1],0($iotas) 147 la $iotas,8($iotas) 148 stg @C[1],$A[0][0]($dst) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i] 149 150 lgr @T[1],@C[4] 151 ngr @C[4],@C[3] 152 lghi @C[1],-1 # no 'not' instruction :-( 153 xgr @C[4],@C[2] # C[2] ^ ( C[4] & C[3]) 154 xgr @C[2],@C[1] # not @C[2] 155 stg @C[4],$A[0][2]($dst) # R[0][2] = C[2] ^ ( C[4] & C[3]) 156 ogr @C[2],@C[3] 157 xgr @C[2],@T[0] # C[1] ^ (~C[2] | C[3]) 158 159 ngr @T[0],@C[0] 160 stg @C[2],$A[0][1]($dst) # R[0][1] = C[1] ^ (~C[2] | C[3]) 161 xgr @T[0],@T[1] # C[4] ^ ( C[1] & C[0]) 162 ogr @T[1],@C[0] 163 stg @T[0],$A[0][4]($dst) # R[0][4] = C[4] ^ ( C[1] & C[0]) 164 xgr @T[1],@C[3] # C[3] ^ ( C[4] | C[0]) 165 stg @T[1],$A[0][3]($dst) # R[0][3] = C[3] ^ ( C[4] | C[0]) 166 167 168 lg @C[0],$A[0][3]($src) 169 lg @C[4],$A[4][2]($src) 170 lg @C[3],$A[3][1]($src) 171 lg @C[1],$A[1][4]($src) 172 lg @C[2],$A[2][0]($src) 173 174 xgr @C[0],@D[3] 175 xgr @C[4],@D[2] 176 rllg @C[0],@C[0],$rhotates[0][3] 177 xgr @C[3],@D[1] 178 rllg @C[4],@C[4],$rhotates[4][2] 179 xgr @C[1],@D[4] 180 rllg @C[3],@C[3],$rhotates[3][1] 181 xgr @C[2],@D[0] 182 183 lgr @T[0],@C[0] 184 ogr @C[0],@C[4] 185 rllg @C[1],@C[1],$rhotates[1][4] 186 xgr @C[0],@C[3] # C[3] ^ (C[0] | C[4]) 187 rllg @C[2],@C[2],$rhotates[2][0] 188 stg @C[0],$A[1][3]($dst) # R[1][3] = C[3] ^ (C[0] | C[4]) 189 190 lgr @T[1],@C[1] 191 ngr @C[1],@T[0] 192 lghi @C[0],-1 # no 'not' instruction :-( 193 xgr @C[1],@C[4] # C[4] ^ (C[1] & C[0]) 194 xgr @C[4],@C[0] # not @C[4] 195 stg @C[1],$A[1][4]($dst) # R[1][4] = C[4] ^ (C[1] & C[0]) 196 197 ogr @C[4],@C[3] 198 xgr @C[4],@C[2] # C[2] ^ (~C[4] | C[3]) 199 200 ngr @C[3],@C[2] 201 stg @C[4],$A[1][2]($dst) # R[1][2] = C[2] ^ (~C[4] | C[3]) 202 xgr @C[3],@T[1] # C[1] ^ (C[3] & C[2]) 203 ogr @T[1],@C[2] 204 stg @C[3],$A[1][1]($dst) # R[1][1] = C[1] ^ (C[3] & C[2]) 205 xgr @T[1],@T[0] # C[0] ^ (C[1] | C[2]) 206 stg @T[1],$A[1][0]($dst) # R[1][0] = C[0] ^ (C[1] | C[2]) 207 208 209 lg @C[2],$A[2][3]($src) 210 lg @C[3],$A[3][4]($src) 211 lg @C[1],$A[1][2]($src) 212 lg @C[4],$A[4][0]($src) 213 lg @C[0],$A[0][1]($src) 214 215 xgr @C[2],@D[3] 216 xgr @C[3],@D[4] 217 rllg @C[2],@C[2],$rhotates[2][3] 218 xgr @C[1],@D[2] 219 rllg @C[3],@C[3],$rhotates[3][4] 220 xgr @C[4],@D[0] 221 rllg @C[1],@C[1],$rhotates[1][2] 222 xgr @C[0],@D[1] 223 224 lgr @T[0],@C[2] 225 ngr @C[2],@C[3] 226 rllg @C[4],@C[4],$rhotates[4][0] 227 xgr @C[2],@C[1] # C[1] ^ ( C[2] & C[3]) 228 lghi @T[1],-1 # no 'not' instruction :-( 229 stg @C[2],$A[2][1]($dst) # R[2][1] = C[1] ^ ( C[2] & C[3]) 230 231 xgr @C[3],@T[1] # not @C[3] 232 lgr @T[1],@C[4] 233 ngr @C[4],@C[3] 234 rllg @C[0],@C[0],$rhotates[0][1] 235 xgr @C[4],@T[0] # C[2] ^ ( C[4] & ~C[3]) 236 ogr @T[0],@C[1] 237 stg @C[4],$A[2][2]($dst) # R[2][2] = C[2] ^ ( C[4] & ~C[3]) 238 xgr @T[0],@C[0] # C[0] ^ ( C[2] | C[1]) 239 240 ngr @C[1],@C[0] 241 stg @T[0],$A[2][0]($dst) # R[2][0] = C[0] ^ ( C[2] | C[1]) 242 xgr @C[1],@T[1] # C[4] ^ ( C[1] & C[0]) 243 ogr @C[0],@T[1] 244 stg @C[1],$A[2][4]($dst) # R[2][4] = C[4] ^ ( C[1] & C[0]) 245 xgr @C[0],@C[3] # ~C[3] ^ ( C[0] | C[4]) 246 stg @C[0],$A[2][3]($dst) # R[2][3] = ~C[3] ^ ( C[0] | C[4]) 247 248 249 lg @C[2],$A[2][1]($src) 250 lg @C[3],$A[3][2]($src) 251 lg @C[1],$A[1][0]($src) 252 lg @C[4],$A[4][3]($src) 253 lg @C[0],$A[0][4]($src) 254 255 xgr @C[2],@D[1] 256 xgr @C[3],@D[2] 257 rllg @C[2],@C[2],$rhotates[2][1] 258 xgr @C[1],@D[0] 259 rllg @C[3],@C[3],$rhotates[3][2] 260 xgr @C[4],@D[3] 261 rllg @C[1],@C[1],$rhotates[1][0] 262 xgr @C[0],@D[4] 263 rllg @C[4],@C[4],$rhotates[4][3] 264 265 lgr @T[0],@C[2] 266 ogr @C[2],@C[3] 267 lghi @T[1],-1 # no 'not' instruction :-( 268 xgr @C[2],@C[1] # C[1] ^ ( C[2] | C[3]) 269 xgr @C[3],@T[1] # not @C[3] 270 stg @C[2],$A[3][1]($dst) # R[3][1] = C[1] ^ ( C[2] | C[3]) 271 272 lgr @T[1],@C[4] 273 ogr @C[4],@C[3] 274 rllg @C[0],@C[0],$rhotates[0][4] 275 xgr @C[4],@T[0] # C[2] ^ ( C[4] | ~C[3]) 276 ngr @T[0],@C[1] 277 stg @C[4],$A[3][2]($dst) # R[3][2] = C[2] ^ ( C[4] | ~C[3]) 278 xgr @T[0],@C[0] # C[0] ^ ( C[2] & C[1]) 279 280 ogr @C[1],@C[0] 281 stg @T[0],$A[3][0]($dst) # R[3][0] = C[0] ^ ( C[2] & C[1]) 282 xgr @C[1],@T[1] # C[4] ^ ( C[1] | C[0]) 283 ngr @C[0],@T[1] 284 stg @C[1],$A[3][4]($dst) # R[3][4] = C[4] ^ ( C[1] | C[0]) 285 xgr @C[0],@C[3] # ~C[3] ^ ( C[0] & C[4]) 286 stg @C[0],$A[3][3]($dst) # R[3][3] = ~C[3] ^ ( C[0] & C[4]) 287 288 289 xg @D[2],$A[0][2]($src) 290 xg @D[3],$A[1][3]($src) 291 xg @D[1],$A[4][1]($src) 292 xg @D[4],$A[2][4]($src) 293 xgr $dst,$src # xchg $dst,$src 294 rllg @D[2],@D[2],$rhotates[0][2] 295 xg @D[0],$A[3][0]($src) 296 rllg @D[3],@D[3],$rhotates[1][3] 297 xgr $src,$dst 298 rllg @D[1],@D[1],$rhotates[4][1] 299 xgr $dst,$src 300 rllg @D[4],@D[4],$rhotates[2][4] 301___ 302 @C = @D[2..4,0,1]; 303$code.=<<___; 304 lgr @T[0],@C[0] 305 ngr @C[0],@C[1] 306 lghi @T[1],-1 # no 'not' instruction :-( 307 xgr @C[0],@C[4] # C[4] ^ ( C[0] & C[1]) 308 xgr @C[1],@T[1] # not @C[1] 309 stg @C[0],$A[4][4]($src) # R[4][4] = C[4] ^ ( C[0] & C[1]) 310 311 lgr @T[1],@C[2] 312 ngr @C[2],@C[1] 313 rllg @D[0],@D[0],$rhotates[3][0] 314 xgr @C[2],@T[0] # C[0] ^ ( C[2] & ~C[1]) 315 ogr @T[0],@C[4] 316 stg @C[2],$A[4][0]($src) # R[4][0] = C[0] ^ ( C[2] & ~C[1]) 317 xgr @T[0],@C[3] # C[3] ^ ( C[0] | C[4]) 318 319 ngr @C[4],@C[3] 320 stg @T[0],$A[4][3]($src) # R[4][3] = C[3] ^ ( C[0] | C[4]) 321 xgr @C[4],@T[1] # C[2] ^ ( C[4] & C[3]) 322 ogr @C[3],@T[1] 323 stg @C[4],$A[4][2]($src) # R[4][2] = C[2] ^ ( C[4] & C[3]) 324 xgr @C[3],@C[1] # ~C[1] ^ ( C[2] | C[3]) 325 326 lgr @C[1],@C[0] # harmonize with the loop top 327 lgr @C[0],@T[0] 328 stg @C[3],$A[4][1]($src) # R[4][1] = ~C[1] ^ ( C[2] | C[3]) 329 330 tmll $iotas,255 331 jnz .Loop 332 333 l${g} %r14,$SIZE_T*14($sp) 334 br %r14 335.size __KeccakF1600,.-__KeccakF1600 336___ 337} 338{ 339$code.=<<___; 340.type KeccakF1600,\@function 341.align 32 342KeccakF1600: 343.LKeccakF1600: 344 lghi %r1,-$frame 345 stm${g} %r6,%r15,$SIZE_T*6($sp) 346 lgr %r0,$sp 347 la $sp,0(%r1,$sp) 348 st${g} %r0,0($sp) 349 350 lghi @D[0],-1 # no 'not' instruction :-( 351 lghi @D[1],-1 352 lghi @D[2],-1 353 lghi @D[3],-1 354 lghi @D[4],-1 355 lghi @T[0],-1 356 xg @D[0],$A[0][1]($src) 357 xg @D[1],$A[0][2]($src) 358 xg @D[2],$A[1][3]($src) 359 xg @D[3],$A[2][2]($src) 360 xg @D[4],$A[3][2]($src) 361 xg @T[0],$A[4][0]($src) 362 stmg @D[0],@D[1],$A[0][1]($src) 363 stg @D[2],$A[1][3]($src) 364 stg @D[3],$A[2][2]($src) 365 stg @D[4],$A[3][2]($src) 366 stg @T[0],$A[4][0]($src) 367 368 la $dst,$stdframe($sp) 369 370 bras %r14,__KeccakF1600 371 372 lghi @D[0],-1 # no 'not' instruction :-( 373 lghi @D[1],-1 374 lghi @D[2],-1 375 lghi @D[3],-1 376 lghi @D[4],-1 377 lghi @T[0],-1 378 xg @D[0],$A[0][1]($src) 379 xg @D[1],$A[0][2]($src) 380 xg @D[2],$A[1][3]($src) 381 xg @D[3],$A[2][2]($src) 382 xg @D[4],$A[3][2]($src) 383 xg @T[0],$A[4][0]($src) 384 stmg @D[0],@D[1],$A[0][1]($src) 385 stg @D[2],$A[1][3]($src) 386 stg @D[3],$A[2][2]($src) 387 stg @D[4],$A[3][2]($src) 388 stg @T[0],$A[4][0]($src) 389 390 lm${g} %r6,%r15,$frame+6*$SIZE_T($sp) 391 br %r14 392.size KeccakF1600,.-KeccakF1600 393___ 394} 395{ my ($A_flat,$inp,$len,$bsz) = map("%r$_",(2..5)); 396 397$code.=<<___; 398.globl SHA3_absorb 399.type SHA3_absorb,\@function 400.align 32 401SHA3_absorb: 402 lghi %r1,-$frame 403 stm${g} %r5,%r15,$SIZE_T*5($sp) 404 lgr %r0,$sp 405 la $sp,0(%r1,$sp) 406 st${g} %r0,0($sp) 407 408 lghi @D[0],-1 # no 'not' instruction :-( 409 lghi @D[1],-1 410 lghi @D[2],-1 411 lghi @D[3],-1 412 lghi @D[4],-1 413 lghi @T[0],-1 414 xg @D[0],$A[0][1]($src) 415 xg @D[1],$A[0][2]($src) 416 xg @D[2],$A[1][3]($src) 417 xg @D[3],$A[2][2]($src) 418 xg @D[4],$A[3][2]($src) 419 xg @T[0],$A[4][0]($src) 420 stmg @D[0],@D[1],$A[0][1]($src) 421 stg @D[2],$A[1][3]($src) 422 stg @D[3],$A[2][2]($src) 423 stg @D[4],$A[3][2]($src) 424 stg @T[0],$A[4][0]($src) 425 426.Loop_absorb: 427 cl${g}r $len,$bsz 428 jl .Ldone_absorb 429 430 srl${g} $bsz,3 431 la %r1,0($A_flat) 432 433.Lblock_absorb: 434 lrvg %r0,0($inp) 435 la $inp,8($inp) 436 xg %r0,0(%r1) 437 a${g}hi $len,-8 438 stg %r0,0(%r1) 439 la %r1,8(%r1) 440 brct $bsz,.Lblock_absorb 441 442 stm${g} $inp,$len,$frame+3*$SIZE_T($sp) 443 la $dst,$stdframe($sp) 444 bras %r14,__KeccakF1600 445 lm${g} $inp,$bsz,$frame+3*$SIZE_T($sp) 446 j .Loop_absorb 447 448.align 16 449.Ldone_absorb: 450 lghi @D[0],-1 # no 'not' instruction :-( 451 lghi @D[1],-1 452 lghi @D[2],-1 453 lghi @D[3],-1 454 lghi @D[4],-1 455 lghi @T[0],-1 456 xg @D[0],$A[0][1]($src) 457 xg @D[1],$A[0][2]($src) 458 xg @D[2],$A[1][3]($src) 459 xg @D[3],$A[2][2]($src) 460 xg @D[4],$A[3][2]($src) 461 xg @T[0],$A[4][0]($src) 462 stmg @D[0],@D[1],$A[0][1]($src) 463 stg @D[2],$A[1][3]($src) 464 stg @D[3],$A[2][2]($src) 465 stg @D[4],$A[3][2]($src) 466 stg @T[0],$A[4][0]($src) 467 468 lgr %r2,$len # return value 469 470 lm${g} %r6,%r15,$frame+6*$SIZE_T($sp) 471 br %r14 472.size SHA3_absorb,.-SHA3_absorb 473___ 474} 475{ my ($A_flat,$out,$len,$bsz) = map("%r$_",(2..5)); 476 477$code.=<<___; 478.globl SHA3_squeeze 479.type SHA3_squeeze,\@function 480.align 32 481SHA3_squeeze: 482 srl${g} $bsz,3 483 st${g} %r14,2*$SIZE_T($sp) 484 lghi %r14,8 485 st${g} $bsz,5*$SIZE_T($sp) 486 la %r1,0($A_flat) 487 488 j .Loop_squeeze 489 490.align 16 491.Loop_squeeze: 492 cl${g}r $len,%r14 493 jl .Ltail_squeeze 494 495 lrvg %r0,0(%r1) 496 la %r1,8(%r1) 497 stg %r0,0($out) 498 la $out,8($out) 499 a${g}hi $len,-8 # len -= 8 500 jz .Ldone_squeeze 501 502 brct $bsz,.Loop_squeeze # bsz-- 503 504 stm${g} $out,$len,3*$SIZE_T($sp) 505 bras %r14,.LKeccakF1600 506 lm${g} $out,$bsz,3*$SIZE_T($sp) 507 lghi %r14,8 508 la %r1,0($A_flat) 509 j .Loop_squeeze 510 511.Ltail_squeeze: 512 lg %r0,0(%r1) 513.Loop_tail_squeeze: 514 stc %r0,0($out) 515 la $out,1($out) 516 srlg %r0,8 517 brct $len,.Loop_tail_squeeze 518 519.Ldone_squeeze: 520 l${g} %r14,2*$SIZE_T($sp) 521 br %r14 522.size SHA3_squeeze,.-SHA3_squeeze 523___ 524} 525$code.=<<___; 526.align 256 527 .quad 0,0,0,0,0,0,0,0 528.type iotas,\@object 529iotas: 530 .quad 0x0000000000000001 531 .quad 0x0000000000008082 532 .quad 0x800000000000808a 533 .quad 0x8000000080008000 534 .quad 0x000000000000808b 535 .quad 0x0000000080000001 536 .quad 0x8000000080008081 537 .quad 0x8000000000008009 538 .quad 0x000000000000008a 539 .quad 0x0000000000000088 540 .quad 0x0000000080008009 541 .quad 0x000000008000000a 542 .quad 0x000000008000808b 543 .quad 0x800000000000008b 544 .quad 0x8000000000008089 545 .quad 0x8000000000008003 546 .quad 0x8000000000008002 547 .quad 0x8000000000000080 548 .quad 0x000000000000800a 549 .quad 0x800000008000000a 550 .quad 0x8000000080008081 551 .quad 0x8000000000008080 552 .quad 0x0000000080000001 553 .quad 0x8000000080008008 554.size iotas,.-iotas 555.asciz "Keccak-1600 absorb and squeeze for s390x, CRYPTOGAMS by <appro\@openssl.org>" 556___ 557 558# unlike 32-bit shift 64-bit one takes three arguments 559$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm; 560 561print $code; 562close STDOUT or die "error closing STDOUT: $!"; 563