1#!/usr/bin/env perl 2# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8# 9# ==================================================================== 10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 11# project. The module is, however, dual licensed under OpenSSL and 12# CRYPTOGAMS licenses depending on where you obtain it. For further 13# details see http://www.openssl.org/~appro/cryptogams/. 14# ==================================================================== 15# 16# Keccak-1600 for s390x. 17# 18# June 2017. 19# 20# Below code is [lane complementing] KECCAK_2X implementation (see 21# sha/keccak1600.c) with C[5] and D[5] held in register bank. Though 22# instead of actually unrolling the loop pair-wise I simply flip 23# pointers to T[][] and A[][] at the end of round. Since number of 24# rounds is even, last round writes to A[][] and everything works out. 25# In the nutshell it's transliteration of x86_64 module, because both 26# architectures have similar capabilities/limitations. Performance 27# measurement is problematic as I don't have access to an idle system. 28# It looks like z13 processes one byte [out of long message] in ~14 29# cycles. At least the result is consistent with estimate based on 30# amount of instruction and assumed instruction issue rate. It's ~2.5x 31# faster than compiler-generated code. 32 33$flavour = shift; 34 35if ($flavour =~ /3[12]/) { 36 $SIZE_T=4; 37 $g=""; 38} else { 39 $SIZE_T=8; 40 $g="g"; 41} 42 43while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} 44open STDOUT,">$output"; 45 46my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20)); 47 48my @C = map("%r$_",(0,1,5..7)); 49my @D = map("%r$_",(8..12)); 50my @T = map("%r$_",(13..14)); 51my ($src,$dst,$iotas) = map("%r$_",(2..4)); 52my $sp = "%r15"; 53 54$stdframe=16*$SIZE_T+4*8; 55$frame=$stdframe+25*8; 56 57my @rhotates = ([ 0, 1, 62, 28, 27 ], 58 [ 36, 44, 6, 55, 20 ], 59 [ 3, 10, 43, 25, 39 ], 60 [ 41, 45, 15, 21, 8 ], 61 [ 18, 2, 61, 56, 14 ]); 62 63{ my @C = @C; # copy, because we mess them up... 64 my @D = @D; 65 66$code.=<<___; 67.text 68 69.type __KeccakF1600,\@function 70.align 32 71__KeccakF1600: 72 st${g} %r14,$SIZE_T*14($sp) 73 lg @C[0],$A[4][0]($src) 74 lg @C[1],$A[4][1]($src) 75 lg @C[2],$A[4][2]($src) 76 lg @C[3],$A[4][3]($src) 77 lg @C[4],$A[4][4]($src) 78 larl $iotas,iotas 79 j .Loop 80 81.align 16 82.Loop: 83 lg @D[0],$A[0][0]($src) 84 lg @D[1],$A[1][1]($src) 85 lg @D[2],$A[2][2]($src) 86 lg @D[3],$A[3][3]($src) 87 88 xgr @C[0],@D[0] 89 xg @C[1],$A[0][1]($src) 90 xg @C[2],$A[0][2]($src) 91 xg @C[3],$A[0][3]($src) 92 lgr @D[4],@C[4] 93 xg @C[4],$A[0][4]($src) 94 95 xg @C[0],$A[1][0]($src) 96 xgr @C[1],@D[1] 97 xg @C[2],$A[1][2]($src) 98 xg @C[3],$A[1][3]($src) 99 xg @C[4],$A[1][4]($src) 100 101 xg @C[0],$A[2][0]($src) 102 xg @C[1],$A[2][1]($src) 103 xgr @C[2],@D[2] 104 xg @C[3],$A[2][3]($src) 105 xg @C[4],$A[2][4]($src) 106 107 xg @C[0],$A[3][0]($src) 108 xg @C[1],$A[3][1]($src) 109 xg @C[2],$A[3][2]($src) 110 xgr @C[3],@D[3] 111 xg @C[4],$A[3][4]($src) 112 113 lgr @T[0],@C[2] 114 rllg @C[2],@C[2],1 115 xgr @C[2],@C[0] # D[1] = ROL64(C[2], 1) ^ C[0] 116 117 rllg @C[0],@C[0],1 118 xgr @C[0],@C[3] # D[4] = ROL64(C[0], 1) ^ C[3] 119 120 rllg @C[3],@C[3],1 121 xgr @C[3],@C[1] # D[2] = ROL64(C[3], 1) ^ C[1] 122 123 rllg @C[1],@C[1],1 124 xgr @C[1],@C[4] # D[0] = ROL64(C[1], 1) ^ C[4] 125 126 rllg @C[4],@C[4],1 127 xgr @C[4],@T[0] # D[3] = ROL64(C[4], 1) ^ C[2] 128___ 129 (@D[0..4], @C) = (@C[1..4,0], @D); 130$code.=<<___; 131 xgr @C[1],@D[1] 132 xgr @C[2],@D[2] 133 xgr @C[3],@D[3] 134 rllg @C[1],@C[1],$rhotates[1][1] 135 xgr @C[4],@D[4] 136 rllg @C[2],@C[2],$rhotates[2][2] 137 xgr @C[0],@D[0] 138 139 lgr @T[0],@C[1] 140 ogr @C[1],@C[2] 141 rllg @C[3],@C[3],$rhotates[3][3] 142 xgr @C[1],@C[0] # C[0] ^ ( C[1] | C[2]) 143 rllg @C[4],@C[4],$rhotates[4][4] 144 xg @C[1],0($iotas) 145 la $iotas,8($iotas) 146 stg @C[1],$A[0][0]($dst) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i] 147 148 lgr @T[1],@C[4] 149 ngr @C[4],@C[3] 150 lghi @C[1],-1 # no 'not' instruction :-( 151 xgr @C[4],@C[2] # C[2] ^ ( C[4] & C[3]) 152 xgr @C[2],@C[1] # not @C[2] 153 stg @C[4],$A[0][2]($dst) # R[0][2] = C[2] ^ ( C[4] & C[3]) 154 ogr @C[2],@C[3] 155 xgr @C[2],@T[0] # C[1] ^ (~C[2] | C[3]) 156 157 ngr @T[0],@C[0] 158 stg @C[2],$A[0][1]($dst) # R[0][1] = C[1] ^ (~C[2] | C[3]) 159 xgr @T[0],@T[1] # C[4] ^ ( C[1] & C[0]) 160 ogr @T[1],@C[0] 161 stg @T[0],$A[0][4]($dst) # R[0][4] = C[4] ^ ( C[1] & C[0]) 162 xgr @T[1],@C[3] # C[3] ^ ( C[4] | C[0]) 163 stg @T[1],$A[0][3]($dst) # R[0][3] = C[3] ^ ( C[4] | C[0]) 164 165 166 lg @C[0],$A[0][3]($src) 167 lg @C[4],$A[4][2]($src) 168 lg @C[3],$A[3][1]($src) 169 lg @C[1],$A[1][4]($src) 170 lg @C[2],$A[2][0]($src) 171 172 xgr @C[0],@D[3] 173 xgr @C[4],@D[2] 174 rllg @C[0],@C[0],$rhotates[0][3] 175 xgr @C[3],@D[1] 176 rllg @C[4],@C[4],$rhotates[4][2] 177 xgr @C[1],@D[4] 178 rllg @C[3],@C[3],$rhotates[3][1] 179 xgr @C[2],@D[0] 180 181 lgr @T[0],@C[0] 182 ogr @C[0],@C[4] 183 rllg @C[1],@C[1],$rhotates[1][4] 184 xgr @C[0],@C[3] # C[3] ^ (C[0] | C[4]) 185 rllg @C[2],@C[2],$rhotates[2][0] 186 stg @C[0],$A[1][3]($dst) # R[1][3] = C[3] ^ (C[0] | C[4]) 187 188 lgr @T[1],@C[1] 189 ngr @C[1],@T[0] 190 lghi @C[0],-1 # no 'not' instruction :-( 191 xgr @C[1],@C[4] # C[4] ^ (C[1] & C[0]) 192 xgr @C[4],@C[0] # not @C[4] 193 stg @C[1],$A[1][4]($dst) # R[1][4] = C[4] ^ (C[1] & C[0]) 194 195 ogr @C[4],@C[3] 196 xgr @C[4],@C[2] # C[2] ^ (~C[4] | C[3]) 197 198 ngr @C[3],@C[2] 199 stg @C[4],$A[1][2]($dst) # R[1][2] = C[2] ^ (~C[4] | C[3]) 200 xgr @C[3],@T[1] # C[1] ^ (C[3] & C[2]) 201 ogr @T[1],@C[2] 202 stg @C[3],$A[1][1]($dst) # R[1][1] = C[1] ^ (C[3] & C[2]) 203 xgr @T[1],@T[0] # C[0] ^ (C[1] | C[2]) 204 stg @T[1],$A[1][0]($dst) # R[1][0] = C[0] ^ (C[1] | C[2]) 205 206 207 lg @C[2],$A[2][3]($src) 208 lg @C[3],$A[3][4]($src) 209 lg @C[1],$A[1][2]($src) 210 lg @C[4],$A[4][0]($src) 211 lg @C[0],$A[0][1]($src) 212 213 xgr @C[2],@D[3] 214 xgr @C[3],@D[4] 215 rllg @C[2],@C[2],$rhotates[2][3] 216 xgr @C[1],@D[2] 217 rllg @C[3],@C[3],$rhotates[3][4] 218 xgr @C[4],@D[0] 219 rllg @C[1],@C[1],$rhotates[1][2] 220 xgr @C[0],@D[1] 221 222 lgr @T[0],@C[2] 223 ngr @C[2],@C[3] 224 rllg @C[4],@C[4],$rhotates[4][0] 225 xgr @C[2],@C[1] # C[1] ^ ( C[2] & C[3]) 226 lghi @T[1],-1 # no 'not' instruction :-( 227 stg @C[2],$A[2][1]($dst) # R[2][1] = C[1] ^ ( C[2] & C[3]) 228 229 xgr @C[3],@T[1] # not @C[3] 230 lgr @T[1],@C[4] 231 ngr @C[4],@C[3] 232 rllg @C[0],@C[0],$rhotates[0][1] 233 xgr @C[4],@T[0] # C[2] ^ ( C[4] & ~C[3]) 234 ogr @T[0],@C[1] 235 stg @C[4],$A[2][2]($dst) # R[2][2] = C[2] ^ ( C[4] & ~C[3]) 236 xgr @T[0],@C[0] # C[0] ^ ( C[2] | C[1]) 237 238 ngr @C[1],@C[0] 239 stg @T[0],$A[2][0]($dst) # R[2][0] = C[0] ^ ( C[2] | C[1]) 240 xgr @C[1],@T[1] # C[4] ^ ( C[1] & C[0]) 241 ogr @C[0],@T[1] 242 stg @C[1],$A[2][4]($dst) # R[2][4] = C[4] ^ ( C[1] & C[0]) 243 xgr @C[0],@C[3] # ~C[3] ^ ( C[0] | C[4]) 244 stg @C[0],$A[2][3]($dst) # R[2][3] = ~C[3] ^ ( C[0] | C[4]) 245 246 247 lg @C[2],$A[2][1]($src) 248 lg @C[3],$A[3][2]($src) 249 lg @C[1],$A[1][0]($src) 250 lg @C[4],$A[4][3]($src) 251 lg @C[0],$A[0][4]($src) 252 253 xgr @C[2],@D[1] 254 xgr @C[3],@D[2] 255 rllg @C[2],@C[2],$rhotates[2][1] 256 xgr @C[1],@D[0] 257 rllg @C[3],@C[3],$rhotates[3][2] 258 xgr @C[4],@D[3] 259 rllg @C[1],@C[1],$rhotates[1][0] 260 xgr @C[0],@D[4] 261 rllg @C[4],@C[4],$rhotates[4][3] 262 263 lgr @T[0],@C[2] 264 ogr @C[2],@C[3] 265 lghi @T[1],-1 # no 'not' instruction :-( 266 xgr @C[2],@C[1] # C[1] ^ ( C[2] | C[3]) 267 xgr @C[3],@T[1] # not @C[3] 268 stg @C[2],$A[3][1]($dst) # R[3][1] = C[1] ^ ( C[2] | C[3]) 269 270 lgr @T[1],@C[4] 271 ogr @C[4],@C[3] 272 rllg @C[0],@C[0],$rhotates[0][4] 273 xgr @C[4],@T[0] # C[2] ^ ( C[4] | ~C[3]) 274 ngr @T[0],@C[1] 275 stg @C[4],$A[3][2]($dst) # R[3][2] = C[2] ^ ( C[4] | ~C[3]) 276 xgr @T[0],@C[0] # C[0] ^ ( C[2] & C[1]) 277 278 ogr @C[1],@C[0] 279 stg @T[0],$A[3][0]($dst) # R[3][0] = C[0] ^ ( C[2] & C[1]) 280 xgr @C[1],@T[1] # C[4] ^ ( C[1] | C[0]) 281 ngr @C[0],@T[1] 282 stg @C[1],$A[3][4]($dst) # R[3][4] = C[4] ^ ( C[1] | C[0]) 283 xgr @C[0],@C[3] # ~C[3] ^ ( C[0] & C[4]) 284 stg @C[0],$A[3][3]($dst) # R[3][3] = ~C[3] ^ ( C[0] & C[4]) 285 286 287 xg @D[2],$A[0][2]($src) 288 xg @D[3],$A[1][3]($src) 289 xg @D[1],$A[4][1]($src) 290 xg @D[4],$A[2][4]($src) 291 xgr $dst,$src # xchg $dst,$src 292 rllg @D[2],@D[2],$rhotates[0][2] 293 xg @D[0],$A[3][0]($src) 294 rllg @D[3],@D[3],$rhotates[1][3] 295 xgr $src,$dst 296 rllg @D[1],@D[1],$rhotates[4][1] 297 xgr $dst,$src 298 rllg @D[4],@D[4],$rhotates[2][4] 299___ 300 @C = @D[2..4,0,1]; 301$code.=<<___; 302 lgr @T[0],@C[0] 303 ngr @C[0],@C[1] 304 lghi @T[1],-1 # no 'not' instruction :-( 305 xgr @C[0],@C[4] # C[4] ^ ( C[0] & C[1]) 306 xgr @C[1],@T[1] # not @C[1] 307 stg @C[0],$A[4][4]($src) # R[4][4] = C[4] ^ ( C[0] & C[1]) 308 309 lgr @T[1],@C[2] 310 ngr @C[2],@C[1] 311 rllg @D[0],@D[0],$rhotates[3][0] 312 xgr @C[2],@T[0] # C[0] ^ ( C[2] & ~C[1]) 313 ogr @T[0],@C[4] 314 stg @C[2],$A[4][0]($src) # R[4][0] = C[0] ^ ( C[2] & ~C[1]) 315 xgr @T[0],@C[3] # C[3] ^ ( C[0] | C[4]) 316 317 ngr @C[4],@C[3] 318 stg @T[0],$A[4][3]($src) # R[4][3] = C[3] ^ ( C[0] | C[4]) 319 xgr @C[4],@T[1] # C[2] ^ ( C[4] & C[3]) 320 ogr @C[3],@T[1] 321 stg @C[4],$A[4][2]($src) # R[4][2] = C[2] ^ ( C[4] & C[3]) 322 xgr @C[3],@C[1] # ~C[1] ^ ( C[2] | C[3]) 323 324 lgr @C[1],@C[0] # harmonize with the loop top 325 lgr @C[0],@T[0] 326 stg @C[3],$A[4][1]($src) # R[4][1] = ~C[1] ^ ( C[2] | C[3]) 327 328 tmll $iotas,255 329 jnz .Loop 330 331 l${g} %r14,$SIZE_T*14($sp) 332 br %r14 333.size __KeccakF1600,.-__KeccakF1600 334___ 335} 336{ 337$code.=<<___; 338.type KeccakF1600,\@function 339.align 32 340KeccakF1600: 341.LKeccakF1600: 342 lghi %r1,-$frame 343 stm${g} %r6,%r15,$SIZE_T*6($sp) 344 lgr %r0,$sp 345 la $sp,0(%r1,$sp) 346 st${g} %r0,0($sp) 347 348 lghi @D[0],-1 # no 'not' instruction :-( 349 lghi @D[1],-1 350 lghi @D[2],-1 351 lghi @D[3],-1 352 lghi @D[4],-1 353 lghi @T[0],-1 354 xg @D[0],$A[0][1]($src) 355 xg @D[1],$A[0][2]($src) 356 xg @D[2],$A[1][3]($src) 357 xg @D[3],$A[2][2]($src) 358 xg @D[4],$A[3][2]($src) 359 xg @T[0],$A[4][0]($src) 360 stmg @D[0],@D[1],$A[0][1]($src) 361 stg @D[2],$A[1][3]($src) 362 stg @D[3],$A[2][2]($src) 363 stg @D[4],$A[3][2]($src) 364 stg @T[0],$A[4][0]($src) 365 366 la $dst,$stdframe($sp) 367 368 bras %r14,__KeccakF1600 369 370 lghi @D[0],-1 # no 'not' instruction :-( 371 lghi @D[1],-1 372 lghi @D[2],-1 373 lghi @D[3],-1 374 lghi @D[4],-1 375 lghi @T[0],-1 376 xg @D[0],$A[0][1]($src) 377 xg @D[1],$A[0][2]($src) 378 xg @D[2],$A[1][3]($src) 379 xg @D[3],$A[2][2]($src) 380 xg @D[4],$A[3][2]($src) 381 xg @T[0],$A[4][0]($src) 382 stmg @D[0],@D[1],$A[0][1]($src) 383 stg @D[2],$A[1][3]($src) 384 stg @D[3],$A[2][2]($src) 385 stg @D[4],$A[3][2]($src) 386 stg @T[0],$A[4][0]($src) 387 388 lm${g} %r6,%r15,$frame+6*$SIZE_T($sp) 389 br %r14 390.size KeccakF1600,.-KeccakF1600 391___ 392} 393{ my ($A_flat,$inp,$len,$bsz) = map("%r$_",(2..5)); 394 395$code.=<<___; 396.globl SHA3_absorb 397.type SHA3_absorb,\@function 398.align 32 399SHA3_absorb: 400 lghi %r1,-$frame 401 stm${g} %r5,%r15,$SIZE_T*5($sp) 402 lgr %r0,$sp 403 la $sp,0(%r1,$sp) 404 st${g} %r0,0($sp) 405 406 lghi @D[0],-1 # no 'not' instruction :-( 407 lghi @D[1],-1 408 lghi @D[2],-1 409 lghi @D[3],-1 410 lghi @D[4],-1 411 lghi @T[0],-1 412 xg @D[0],$A[0][1]($src) 413 xg @D[1],$A[0][2]($src) 414 xg @D[2],$A[1][3]($src) 415 xg @D[3],$A[2][2]($src) 416 xg @D[4],$A[3][2]($src) 417 xg @T[0],$A[4][0]($src) 418 stmg @D[0],@D[1],$A[0][1]($src) 419 stg @D[2],$A[1][3]($src) 420 stg @D[3],$A[2][2]($src) 421 stg @D[4],$A[3][2]($src) 422 stg @T[0],$A[4][0]($src) 423 424.Loop_absorb: 425 cl${g}r $len,$bsz 426 jl .Ldone_absorb 427 428 srl${g} $bsz,3 429 la %r1,0($A_flat) 430 431.Lblock_absorb: 432 lrvg %r0,0($inp) 433 la $inp,8($inp) 434 xg %r0,0(%r1) 435 a${g}hi $len,-8 436 stg %r0,0(%r1) 437 la %r1,8(%r1) 438 brct $bsz,.Lblock_absorb 439 440 stm${g} $inp,$len,$frame+3*$SIZE_T($sp) 441 la $dst,$stdframe($sp) 442 bras %r14,__KeccakF1600 443 lm${g} $inp,$bsz,$frame+3*$SIZE_T($sp) 444 j .Loop_absorb 445 446.align 16 447.Ldone_absorb: 448 lghi @D[0],-1 # no 'not' instruction :-( 449 lghi @D[1],-1 450 lghi @D[2],-1 451 lghi @D[3],-1 452 lghi @D[4],-1 453 lghi @T[0],-1 454 xg @D[0],$A[0][1]($src) 455 xg @D[1],$A[0][2]($src) 456 xg @D[2],$A[1][3]($src) 457 xg @D[3],$A[2][2]($src) 458 xg @D[4],$A[3][2]($src) 459 xg @T[0],$A[4][0]($src) 460 stmg @D[0],@D[1],$A[0][1]($src) 461 stg @D[2],$A[1][3]($src) 462 stg @D[3],$A[2][2]($src) 463 stg @D[4],$A[3][2]($src) 464 stg @T[0],$A[4][0]($src) 465 466 lgr %r2,$len # return value 467 468 lm${g} %r6,%r15,$frame+6*$SIZE_T($sp) 469 br %r14 470.size SHA3_absorb,.-SHA3_absorb 471___ 472} 473{ my ($A_flat,$out,$len,$bsz) = map("%r$_",(2..5)); 474 475$code.=<<___; 476.globl SHA3_squeeze 477.type SHA3_squeeze,\@function 478.align 32 479SHA3_squeeze: 480 srl${g} $bsz,3 481 st${g} %r14,2*$SIZE_T($sp) 482 lghi %r14,8 483 st${g} $bsz,5*$SIZE_T($sp) 484 la %r1,0($A_flat) 485 486 j .Loop_squeeze 487 488.align 16 489.Loop_squeeze: 490 cl${g}r $len,%r14 491 jl .Ltail_squeeze 492 493 lrvg %r0,0(%r1) 494 la %r1,8(%r1) 495 stg %r0,0($out) 496 la $out,8($out) 497 a${g}hi $len,-8 # len -= 8 498 jz .Ldone_squeeze 499 500 brct $bsz,.Loop_squeeze # bsz-- 501 502 stm${g} $out,$len,3*$SIZE_T($sp) 503 bras %r14,.LKeccakF1600 504 lm${g} $out,$bsz,3*$SIZE_T($sp) 505 lghi %r14,8 506 la %r1,0($A_flat) 507 j .Loop_squeeze 508 509.Ltail_squeeze: 510 lg %r0,0(%r1) 511.Loop_tail_squeeze: 512 stc %r0,0($out) 513 la $out,1($out) 514 srlg %r0,8 515 brct $len,.Loop_tail_squeeze 516 517.Ldone_squeeze: 518 l${g} %r14,2*$SIZE_T($sp) 519 br %r14 520.size SHA3_squeeze,.-SHA3_squeeze 521___ 522} 523$code.=<<___; 524.align 256 525 .quad 0,0,0,0,0,0,0,0 526.type iotas,\@object 527iotas: 528 .quad 0x0000000000000001 529 .quad 0x0000000000008082 530 .quad 0x800000000000808a 531 .quad 0x8000000080008000 532 .quad 0x000000000000808b 533 .quad 0x0000000080000001 534 .quad 0x8000000080008081 535 .quad 0x8000000000008009 536 .quad 0x000000000000008a 537 .quad 0x0000000000000088 538 .quad 0x0000000080008009 539 .quad 0x000000008000000a 540 .quad 0x000000008000808b 541 .quad 0x800000000000008b 542 .quad 0x8000000000008089 543 .quad 0x8000000000008003 544 .quad 0x8000000000008002 545 .quad 0x8000000000000080 546 .quad 0x000000000000800a 547 .quad 0x800000008000000a 548 .quad 0x8000000080008081 549 .quad 0x8000000000008080 550 .quad 0x0000000080000001 551 .quad 0x8000000080008008 552.size iotas,.-iotas 553.asciz "Keccak-1600 absorb and squeeze for s390x, CRYPTOGAMS by <appro\@openssl.org>" 554___ 555 556# unlike 32-bit shift 64-bit one takes three arguments 557$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm; 558 559print $code; 560close STDOUT or die "error closing STDOUT: $!"; 561