#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses a 256-byte per-key table [+128-byte shared table]. Performance
# results are for the streamed GHASH subroutine on UltraSPARC pre-Tx CPU
# and are expressed in cycles per processed byte, less is better:
#
#		gcc 3.3.x	cc 5.2		this assembler
#
# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
#
# Here is data collected on an UltraSPARC T1 system running Linux:
#
#		gcc 4.4.1	this assembler
#
# 32-bit build	566		50	(+1000%)
# 64-bit build	56		50	(+12%)
#
# I don't quite understand why the difference between 32-bit and 64-bit
# compiler-generated code is so big. Compilers *were* instructed to
# generate code for UltraSPARC and should have used 64-bit registers
# for the Z vector (see C code) even in the 32-bit build... Oh well, it
# only means more impressive improvement coefficients for this assembler
# module;-) Loops are aggressively modulo-scheduled with respect to
# references to input data and Z.hi updates to achieve the 12-cycle
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
#
# October 2012
#
# Add VIS3 lookup-table-free implementation using polynomial
# multiplication xmulx[hi] and extended addition addxc[cc]
# instructions. 4.52/7.63x improvement on T3/T4 or in absolute
# terms 7.90/2.14 cycles per byte. On T4 the multi-process benchmark
# saturates at ~15.5x the single-process result on an 8-core processor,
# or ~20.5GBps per 2.85GHz socket.
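
# As a rough orientation aid (an informal sketch, not part of the build):
# in the "4-bit" method, Htable[] caches nibble·H products and the shared
# rem_4bit[] table absorbs the four bits shifted out of Z during each
# reduction step. The rem_4bit entries emitted below can be regenerated
# with plain Perl, each entry being the XOR of 0xE1<<(5+j) over the set
# bits j of the index:
#
#	my @rem = map {
#		my ($i,$r) = ($_,0);
#		$r ^= 0xE1<<(5+$_) for grep { ($i>>$_)&1 } (0..3);
#		$r;
#	} (0..15);	# 0x0000,0x1C20,0x3840,0x2460,0x7080,...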

$output=pop;
open STDOUT,">$output";

$frame="STACK_FRAME";
$bias="STACK_BIAS";

$Zhi="%o0";	# 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";

$nhi="%l0";	# small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";

$Xi="%i0";	# input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";

$code.=<<___;
#include "sparc_arch.h"

#ifdef __arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.section	".text",#alloc,#execinstr

.align	64
rem_4bit:
	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type	rem_4bit,#object
.size	rem_4bit,(.-rem_4bit)

.globl	gcm_ghash_4bit
.align	32
gcm_ghash_4bit:
	save	%sp,-$frame,%sp
	ldub	[$inp+15],$nlo
	ldub	[$Xi+15],$xi0
	ldub	[$Xi+14],$xi1
	add	$len,$inp,$len
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

.Louter:
	xor	$xi0,$nlo,$nlo
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$inp+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	xor	$xi1,$nlo,$nlo
	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lghash_inner
	sll	$nlo,4,$nlo
.align	32
.Lghash_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	ldub	[$Xi+$cnt],$xi1
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$xi1,$nlo,$nlo
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lghash_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi

	add	$inp,16,$inp
	cmp	$inp,$len
	be,pn	SIZE_T_CC,.Ldone
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$inp+15],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]
	srl	$Zlo,8,$xi1
	and	$Zlo,0xff,$xi0
	ba	.Louter
	and	$xi1,0xff,$xi1
.align	32
.Ldone:
	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_ghash_4bit,#function
.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

undef $inp;
undef $len;
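
# For reference, gcm_ghash_4bit above and gcm_gmult_4bit below implement
# the usual OpenSSL GHASH contract (prototypes as declared in
# crypto/modes/gcm128.c):
#
#	void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
#	void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
#	                    const u8 *inp, size_t len);
#
# gcm_gmult_4bit performs a single in-place Xi = Xi·H multiplication,
# i.e. one iteration of gcm_ghash_4bit without the data XOR.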

$code.=<<___;
.globl	gcm_gmult_4bit
.align	32
gcm_gmult_4bit:
	save	%sp,-$frame,%sp
	ldub	[$Xi+15],$nlo
	add	$Htbl,8,$Htblo

1:	call	.+8
	add	%o7,rem_4bit-1b,$rem_4bit

	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	sll	$nlo,4,$nlo
	ldx	[$Htblo+$nlo],$Zlo
	ldx	[$Htbl+$nlo],$Zhi

	ldub	[$Xi+14],$nlo

	ldx	[$Htblo+$nhi],$Tlo
	and	$Zlo,0xf,$remi
	ldx	[$Htbl+$nhi],$Thi
	sll	$remi,3,$remi
	ldx	[$rem_4bit+$remi],$rem
	srlx	$Zlo,4,$Zlo
	mov	13,$cnt
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo

	and	$Zlo,0xf,$remi
	and	$nlo,0xf0,$nhi
	and	$nlo,0x0f,$nlo
	ba	.Lgmult_inner
	sll	$nlo,4,$nlo
.align	32
.Lgmult_inner:
	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	ldub	[$Xi+$cnt],$nlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	srlx	$Zhi,4,$Zhi
	and	$nlo,0xf0,$nhi
	addcc	$cnt,-1,$cnt
	xor	$Zlo,$tmp,$Zlo
	and	$nlo,0x0f,$nlo
	xor	$Tlo,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	blu	.Lgmult_inner
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nlo],$Tlo
	sll	$remi,3,$remi
	xor	$Thi,$Zhi,$Zhi
	ldx	[$Htbl+$nlo],$Thi
	srlx	$Zlo,4,$Zlo
	xor	$rem,$Zhi,$Zhi
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	and	$Zlo,0xf,$remi

	ldx	[$Htblo+$nhi],$Tlo
	sll	$remi,3,$remi
	xor	$rem,$Zhi,$Zhi
	ldx	[$Htbl+$nhi],$Thi
	srlx	$Zlo,4,$Zlo
	ldx	[$rem_4bit+$remi],$rem
	sllx	$Zhi,60,$tmp
	xor	$Tlo,$Zlo,$Zlo
	srlx	$Zhi,4,$Zhi
	xor	$Zlo,$tmp,$Zlo
	xor	$Thi,$Zhi,$Zhi
	stx	$Zlo,[$Xi+8]
	xor	$rem,$Zhi,$Zhi
	stx	$Zhi,[$Xi]

	ret
	restore
.type	gcm_gmult_4bit,#function
.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
___

{{{
# Straightforward 128x128-bit multiplication using the Karatsuba algorithm
# followed by a pair of 64-bit reductions [with a shortcut in the first
# one, which breaks the dependency between the reductions and removes one
# multiplication from the critical path]. While it might be suboptimal
# with regard to the sheer number of multiplications, other methods [such
# as aggregate reduction] would require more 64-bit registers, which
# we don't have in a 32-bit application context.
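
# In other words (informal notation over GF(2), where "·" is carry-less
# multiplication), the three 64x64-bit product pairs computed below
# follow the Karatsuba identity
#
#	X·H = Xhi·Hhi·2^128 ^ Xlo·Hlo
#	      ^ ((Xhi^Xlo)·(Hhi^Hlo) ^ Xhi·Hhi ^ Xlo·Hlo)·2^64
#
# trading the fourth multiplication for a handful of XORs in the pre-
# and post-processing steps.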

($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));

($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));

($shl,$shr)=map("%l$_",(0..7));

# For details regarding "twisted H" see ghash-x86.pl.
$code.=<<___;
.globl	gcm_init_vis3
.align	32
gcm_init_vis3:
	save	%sp,-$frame,%sp

	ldx	[%i1+0],$Hhi
	ldx	[%i1+8],$Hlo
	mov	0xE1,$Xhi
	mov	1,$Xlo
	sllx	$Xhi,57,$Xhi
	srax	$Hhi,63,$C0		! broadcast carry
	addcc	$Hlo,$Hlo,$Hlo		! H<<=1
	addxc	$Hhi,$Hhi,$Hhi
	and	$C0,$Xlo,$Xlo
	and	$C0,$Xhi,$Xhi
	xor	$Xlo,$Hlo,$Hlo
	xor	$Xhi,$Hhi,$Hhi
	stx	$Hlo,[%i0+8]		! save twisted H
	stx	$Hhi,[%i0+0]

	sethi	%hi(0xA0406080),$V
	sethi	%hi(0x20C0E000),%l0
	or	$V,%lo(0xA0406080),$V
	or	%l0,%lo(0x20C0E000),%l0
	sllx	$V,32,$V
	or	%l0,$V,$V		! (0xE0·i)&0xff=0xA040608020C0E000
	stx	$V,[%i0+16]

	ret
	restore
.type	gcm_init_vis3,#function
.size	gcm_init_vis3,.-gcm_init_vis3

.globl	gcm_gmult_vis3
.align	32
gcm_gmult_vis3:
	save	%sp,-$frame,%sp

	ldx	[$Xip+8],$Xlo		! load Xi
	ldx	[$Xip+0],$Xhi
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_gmult_vis3,#function
.size	gcm_gmult_vis3,.-gcm_gmult_vis3

.globl	gcm_ghash_vis3
.align	32
gcm_ghash_vis3:
	save	%sp,-$frame,%sp
	nop
	srln	$len,0,$len		! needed on v8+, "nop" on v9

	ldx	[$Xip+8],$C2		! load Xi
	ldx	[$Xip+0],$C3
	ldx	[$Htable+8],$Hlo	! load twisted H
	ldx	[$Htable+0],$Hhi

	mov	0xE1,%l7
	sllx	%l7,57,$xE1		! 57 is not a typo
	ldx	[$Htable+16],$V		! (0xE0·i)&0xff=0xA040608020C0E000

	and	$inp,7,$shl
	andn	$inp,7,$inp
	sll	$shl,3,$shl
	prefetch [$inp+63], 20
	sub	%g0,$shl,$shr

	xor	$Hhi,$Hlo,$Hhl		! Karatsuba pre-processing
.Loop:
	ldx	[$inp+8],$Xlo
	brz,pt	$shl,1f
	ldx	[$inp+0],$Xhi

	ldx	[$inp+16],$C1		! align data
	srlx	$Xlo,$shr,$C0
	sllx	$Xlo,$shl,$Xlo
	sllx	$Xhi,$shl,$Xhi
	srlx	$C1,$shr,$C1
	or	$C0,$Xhi,$Xhi
	or	$C1,$Xlo,$Xlo
1:
	add	$inp,16,$inp
	sub	$len,16,$len
	xor	$C2,$Xlo,$Xlo
	xor	$C3,$Xhi,$Xhi
	prefetch [$inp+63], 20

	xmulx	$Xlo,$Hlo,$C0
	xor	$Xlo,$Xhi,$C2		! Karatsuba pre-processing
	xmulx	$C2,$Hhl,$C1
	xmulxhi	$Xlo,$Hlo,$Xlo
	xmulxhi	$C2,$Hhl,$C2
	xmulxhi	$Xhi,$Hhi,$C3
	xmulx	$Xhi,$Hhi,$Xhi

	sll	$C0,3,$sqr
	srlx	$V,$sqr,$sqr		! ·0xE0 [implicit &(7<<3)]
	xor	$C0,$sqr,$sqr
	sllx	$sqr,57,$sqr		! ($C0·0xE1)<<1<<56 [implicit &0x7f]

	xor	$C0,$C1,$C1		! Karatsuba post-processing
	xor	$Xlo,$C2,$C2
	xor	$sqr,$Xlo,$Xlo		! real destination is $C1
	xor	$C3,$C2,$C2
	xor	$Xlo,$C1,$C1
	xor	$Xhi,$C2,$C2
	xor	$Xhi,$C1,$C1

	xmulxhi	$C0,$xE1,$Xlo		! ·0xE1<<1<<56
	xor	$C0,$C2,$C2
	xmulx	$C1,$xE1,$C0
	xor	$C1,$C3,$C3
	xmulxhi	$C1,$xE1,$C1

	xor	$Xlo,$C2,$C2
	xor	$C0,$C2,$C2
	brnz,pt	$len,.Loop
	xor	$C1,$C3,$C3

	stx	$C2,[$Xip+8]		! save Xi
	stx	$C3,[$Xip+0]

	ret
	restore
.type	gcm_ghash_vis3,#function
.size	gcm_ghash_vis3,.-gcm_ghash_vis3
___
}}}
$code.=<<___;
.asciz	"GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___


# The purpose of this subroutine is to explicitly encode VIS instructions,
# so that the module can be compiled without specifying VIS extensions on
# the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. The idea is to
# reserve the option to produce a "universal" binary and let the
# programmer detect at run-time whether the current CPU is VIS-capable.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"xmulx"		=> 0x115,
		"xmulxhi"	=> 0x116	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
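
# A worked example of the encoding above (operand choice is illustrative):
# "xmulx %o1,%o4,%g1" maps to rs1=9, rs2=12, rd=1 and opf=0x115, so
#
#	0x81b00000 | 1<<25 | 9<<14 | 0x115<<5 | 12  ==  0x83b262ac
#
# and the line is emitted as ".word\t0x83b262ac !xmulx\t%o1,%o4,%g1".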

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /ge;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";
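
# Usage sketch (assuming the conventional perlasm invocation, with this
# file saved as ghash-sparcv9.pl): the generated assembler is written to
# the file named by the last command-line argument, e.g.
#
#	perl ghash-sparcv9.pl ghash-sparcv9.s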