#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
# it processes one byte in 19.6 cycles, which is more than twice as
# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
# processed byte. This is ~2.2x faster than 64-bit code generated by
# vendor compiler (which used to be very hard to beat:-).
#
# Special thanks to polarhome.com for providing HP-UX account.
# Command line: <flavour> <output file>.
# A flavour matching /64/ selects the 2.0W (64-bit) settings below, anything
# else selects the 32-bit settings.
$flavour = shift;
$output  = shift;

# Redirect generated assembly to the requested file. Fail loudly instead of
# silently losing the output when the file cannot be created; when no output
# path is given the inherited STDOUT is used as is.
if (defined $output) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}

# Per-flavour ABI parameters: assembler level, pointer size, frame marker
# size, return pointer save offset, the store/load mnemonics used by the
# prologue/epilogue, and the value emitted as ENTRY_GR in .CALLINFO.
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
	$NREGS		=6;
} else {
	$LEVEL		="1.0";	#"\n\t.ALLOW\t2.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
	$NREGS		=11;
}

$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
				#                 [+ argument transfer]

################# volatile registers
$Xi="%r26";	# argument block
$Htbl="%r25";
$inp="%r24";
$len="%r23";
$Hhh=$Htbl;	# variables
$Hll="%r22";
$Zhh="%r21";
$Zll="%r20";
$cnt="%r19";
$rem_4bit="%r28";
$rem="%r29";
$mask0xf0="%r31";

################# preserved registers
$Thh="%r1";
$Tll="%r2";
$nlo="%r3";
$nhi="%r4";
$byte="%r5";
if ($SIZE_T==4) {
	# The 32-bit code paths keep the 128-bit values in four 32-bit
	# registers each, hence the extra "hl"/"lh" names.
	$Zhl="%r6";
	$Zlh="%r7";
	$Hhl="%r8";
	$Hlh="%r9";
	$Thl="%r10";
	$Tlh="%r11";
}
$rem2="%r6";	# used in PA-RISC 2.0 code

# Everything below appends assembly templates to $code. Expressions in
# back-ticks are evaluated, and (for 32-bit flavours) PA-RISC 2.0
# instructions are hand-encoded, by the post-processing loop at the end of
# the file. Templates guarded by "if ($SIZE_T==4)" are emitted for 32-bit
# flavours only.

# gcm_gmult_4bit: prologue.
$code.=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
	.ALIGN	64
gcm_gmult_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
$code.=<<___ if ($SIZE_T==4);
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
___
# Position-independent computation of the L$rem_4bit table address.
$code.=<<___;
	blr	%r0,$rem_4bit
	ldi	3,$rem
L\$pic_gmult
	andcm	$rem_4bit,$rem,$rem_4bit
	addl	$inp,$len,$len
	ldo	L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
	ldi	0xf0,$mask0xf0
___
# 32-bit flavours carry both a PA-RISC 2.0 and a PA-RISC 1.0 code path; this
# sequence selects between them at run time (see the in-line remark).
$code.=<<___ if ($SIZE_T==4);
	ldi	31,$rem
	mtctl	$rem,%cr11
	extrd,u,*=	$rem,%sar,1,$rem	; executes on PA-RISC 1.0
	b	L\$parisc1_gmult
	nop
___

# gcm_gmult_4bit: PA-RISC 2.0 code path.
$code.=<<___;
	ldb	15($Xi),$nlo
	ldo	8($Htbl),$Hll

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	ldd	$nlo($Hll),$Zll
	ldd	$nlo($Hhh),$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldb	14($Xi),$nlo

	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem
	b	L\$oop_gmult_pa2
	ldi	13,$cnt

	.ALIGN	8
L\$oop_gmult_pa2
	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem
	ldbx	$cnt($Xi),$nlo

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo
	ldd	$rem($rem_4bit),$rem

	xor	$Tll,$Zll,$Zll
	addib,uv	-1,$cnt,L\$oop_gmult_pa2
	xor	$Thh,$Zhh,$Zhh

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	std	$Zll,8($Xi)
	std	$Zhh,0($Xi)
___

# gcm_gmult_4bit: PA-RISC 1.0 code path (32-bit flavours only).
$code.=<<___ if ($SIZE_T==4);
	b	L\$done_gmult
	nop

L\$parisc1_gmult
	ldb	15($Xi),$nlo
	ldo	12($Htbl),$Hll
	ldo	8($Htbl),$Hlh
	ldo	4($Htbl),$Hhl

	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	ldwx	$nlo($Hll),$Zll
	ldwx	$nlo($Hlh),$Zlh
	ldwx	$nlo($Hhl),$Zhl
	ldwx	$nlo($Hhh),$Zhh
	zdep	$Zll,28,4,$rem
	ldb	14($Xi),$nlo
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldwx	$nhi($Hhl),$Thl
	extru	$Zhh,27,28,$Zhh
	ldwx	$nhi($Hhh),$Thh
	xor	$rem,$Zhh,$Zhh
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$Thl,$Zhl,$Zhl
	b	L\$oop_gmult_pa1
	ldi	13,$cnt

	.ALIGN	8
L\$oop_gmult_pa1
	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldbx	$cnt($Xi),$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$rem,$Zhh,$Zhh
	zdep	$Zll,28,4,$rem
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	and	$mask0xf0,$nlo,$nhi
	extru	$Zhh,27,28,$Zhh
	zdep	$nlo,27,4,$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$rem,$Zhh,$Zhh
	addib,uv	-1,$cnt,L\$oop_gmult_pa1
	xor	$Thl,$Zhl,$Zhl

	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$rem,$Zhh,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	zdep	$Zll,28,4,$rem
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	extru	$Zhh,27,28,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Tlh,$Zlh,$Zlh
	xor	$rem,$Zhh,$Zhh
	stw	$Zll,12($Xi)
	xor	$Thl,$Zhl,$Zhl
	stw	$Zlh,8($Xi)
	xor	$Thh,$Zhh,$Zhh
	stw	$Zhl,4($Xi)
	stw	$Zhh,0($Xi)
___
# gcm_gmult_4bit: epilogue.
$code.=<<___;
L\$done_gmult
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
___
# gcm_ghash_4bit: prologue. ENTRY_GR uses $NREGS, like gcm_gmult_4bit, so
# that .CALLINFO matches the registers this prologue actually saves in
# either flavour (%r3-%r6 for 64-bit, %r3-%r11 for 32-bit).
$code.=<<___;
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND

	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
gcm_ghash_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
$code.=<<___ if ($SIZE_T==4);
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
___
# Table address computation; $len becomes the end-of-input pointer.
$code.=<<___;
	blr	%r0,$rem_4bit
	ldi	3,$rem
L\$pic_ghash
	andcm	$rem_4bit,$rem,$rem_4bit
	addl	$inp,$len,$len
	ldo	L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
	ldi	0xf0,$mask0xf0
___
$code.=<<___ if ($SIZE_T==4);
	ldi	31,$rem
	mtctl	$rem,%cr11
	extrd,u,*=	$rem,%sar,1,$rem	; executes on PA-RISC 1.0
	b	L\$parisc1_ghash
	nop
___

# gcm_ghash_4bit: PA-RISC 2.0 code path; the outer loop advances $inp by 16
# bytes per iteration until it reaches $len.
$code.=<<___;
	ldb	15($Xi),$nlo
	ldo	8($Htbl),$Hll

L\$outer_ghash_pa2
	ldb	15($inp),$nhi
	xor	$nhi,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	ldd	$nlo($Hll),$Zll
	ldd	$nlo($Hhh),$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldb	14($Xi),$nlo
	ldb	14($inp),$byte

	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh
	xor	$byte,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem
	b	L\$oop_ghash_pa2
	ldi	13,$cnt

	.ALIGN	8
L\$oop_ghash_pa2
	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
	depd,z	$Zll,60,4,$rem2

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldbx	$cnt($Xi),$nlo
	ldbx	$cnt($inp),$byte

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	ldd	$rem2($rem_4bit),$rem2

	xor	$rem2,$Zhh,$Zhh
	xor	$byte,$nlo,$nlo
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	extrd,u	$Zhh,59,60,$Zhh
	xor	$Tll,$Zll,$Zll

	ldd	$rem($rem_4bit),$rem
	addib,uv	-1,$cnt,L\$oop_ghash_pa2
	xor	$Thh,$Zhh,$Zhh

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem2

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	ldd	$rem2($rem_4bit),$rem2

	xor	$rem2,$Zhh,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	extrd,u	$Zhh,59,60,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	std	$Zll,8($Xi)
	ldo	16($inp),$inp
	std	$Zhh,0($Xi)
	cmpb,*<>	$inp,$len,L\$outer_ghash_pa2
	copy	$Zll,$nlo
___

# gcm_ghash_4bit: PA-RISC 1.0 code path (32-bit flavours only).
$code.=<<___ if ($SIZE_T==4);
	b	L\$done_ghash
	nop

L\$parisc1_ghash
	ldb	15($Xi),$nlo
	ldo	12($Htbl),$Hll
	ldo	8($Htbl),$Hlh
	ldo	4($Htbl),$Hhl

L\$outer_ghash_pa1
	ldb	15($inp),$byte
	xor	$byte,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	ldwx	$nlo($Hll),$Zll
	ldwx	$nlo($Hlh),$Zlh
	ldwx	$nlo($Hhl),$Zhl
	ldwx	$nlo($Hhh),$Zhh
	zdep	$Zll,28,4,$rem
	ldb	14($Xi),$nlo
	ldb	14($inp),$byte
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldwx	$nhi($Hhl),$Thl
	extru	$Zhh,27,28,$Zhh
	ldwx	$nhi($Hhh),$Thh
	xor	$byte,$nlo,$nlo
	xor	$rem,$Zhh,$Zhh
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$Thl,$Zhl,$Zhl
	b	L\$oop_ghash_pa1
	ldi	13,$cnt

	.ALIGN	8
L\$oop_ghash_pa1
	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldbx	$cnt($Xi),$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldbx	$cnt($inp),$byte
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$rem,$Zhh,$Zhh
	zdep	$Zll,28,4,$rem
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$byte,$nlo,$nlo
	shrpw	$Zhh,$Zhl,4,$Zhl
	and	$mask0xf0,$nlo,$nhi
	extru	$Zhh,27,28,$Zhh
	zdep	$nlo,27,4,$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$rem,$Zhh,$Zhh
	addib,uv	-1,$cnt,L\$oop_ghash_pa1
	xor	$Thl,$Zhl,$Zhl

	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$rem,$Zhh,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	zdep	$Zll,28,4,$rem
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	extru	$Zhh,27,28,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Tlh,$Zlh,$Zlh
	xor	$rem,$Zhh,$Zhh
	stw	$Zll,12($Xi)
	xor	$Thl,$Zhl,$Zhl
	stw	$Zlh,8($Xi)
	xor	$Thh,$Zhh,$Zhh
	stw	$Zhl,4($Xi)
	ldo	16($inp),$inp
	stw	$Zhh,0($Xi)
	comb,<>	$inp,$len,L\$outer_ghash_pa1
	copy	$Zll,$nlo
___
# gcm_ghash_4bit: epilogue.
$code.=<<___;
L\$done_ghash
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
___
# Return sequence followed by the shared L$rem_4bit table: 16 entries, each
# emitted as a pair of words (constant<<16, 0).
$code.=<<___;
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND

	.ALIGN	64
L\$rem_4bit
	.WORD	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.WORD	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.WORD	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.WORD	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
	.STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
	.ALIGN	64
___

# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0.
# It should be noted that I wouldn't have to do this, if GNU assembler
# understood .ALLOW 2.0 directive...
#
# Each encoder below is called as $encoder->($mod,$args), where $mod is
# whatever followed the mnemonic up to the first white space (completers
# such as ",u", possibly empty) and $args is the white-space-free operand
# string. When the operands match a supported instruction format the
# encoder returns a "\t.WORD\t0x........\t; <original instruction>" line,
# otherwise it returns the instruction text unchanged.

# ldd: indexed form "%rX(%rB),%rT" and short-displacement form
# "disp(%rB),%rT" (the latter honouring ",m..."/",mb" completers).
my $ldd = sub {
  my ($mod,$args) = @_;
  my $orig = "ldd$mod\t$args";

    if (my ($index,$base,$target) =
		$args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)		# format 4
    {	my $opcode=(0x03<<26)|($base<<21)|($index<<16)|(3<<6)|$target;
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    elsif (my ($disp,$base5,$target5) =
		$args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 5
    {	my $opcode=(0x03<<26)|($base5<<21)|(1<<12)|(3<<6)|$target5;
	$opcode|=(($disp&0xF)<<17)|(($disp&0x10)<<12);		# encode offset
	$opcode|=(1<<5)  if ($mod =~ /^,m/);
	$opcode|=(1<<13) if ($mod =~ /^,mb/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

# std: "%rS,disp(%rB)".
my $std = sub {
  my ($mod,$args) = @_;
  my $orig = "std$mod\t$args";

    if (my ($source,$disp,$base) =
		$args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/)	# format 3 suffices
    {	my $opcode=(0x1c<<26)|($base<<21)|($source<<16)|(($disp&0x1FF8)<<1)|(($disp>>13)&1);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

# extrd: fixed-position form "%rS,pos,len,%rT" and variable-position form
# "%rS,%sar,len,%rT".
my $extrd = sub {
  my ($mod,$args) = @_;
  my $orig = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...
    if (my ($source,$pos,$width,$target) =
		$args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
    {	my $opcode=(0x36<<26)|($source<<21)|($target<<16);
	my $len=32-$width;
	$opcode |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);		# encode pos
	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    elsif (my ($source12,$width12,$target12) =
		$args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
    {	my $opcode=(0x34<<26)|($source12<<21)|($target12<<16)|(2<<11)|(1<<9);
	my $len=32-$width12;
	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
	$opcode |= (1<<13) if ($mod =~ /,\**=/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

# shrpd: fixed shift amount "%rA,%rB,sa,%rT" and "%rA,%rB,%sar,%rT".
my $shrpd = sub {
  my ($mod,$args) = @_;
  my $orig = "shrpd$mod\t$args";

    if (my ($high,$low,$shift,$target) =
		$args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
    {	my $opcode=(0x34<<26)|($low<<21)|($high<<16)|(1<<10)|$target;
	my $cpos=63-$shift;
	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    elsif (my ($high11,$low11,$target11) =
		$args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11
    {	sprintf "\t.WORD\t0x%08x\t; %s",
		(0x34<<26)|($low11<<21)|($high11<<16)|(1<<9)|$target11,$orig;
    }
    else { "\t".$orig; }
};

# depd: "%rS,pos,len,%rT".
my $depd = sub {
  my ($mod,$args) = @_;
  my $orig = "depd$mod\t$args";

    # I only have ",z" completer, it's implicitly encoded...
    if (my ($source,$pos,$width,$target) =
		$args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 16
    {	my $opcode=(0x3c<<26)|($target<<21)|($source<<16);
	my $cpos=63-$pos;
	my $len=32-$width;
	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode pos
	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

# Mnemonic => encoder. An explicit table replaces looking the encoder up by
# variable name with a string eval for every instruction line.
my %encoder_for = (
    ldd   => $ldd,
    std   => $std,
    extrd => $extrd,
    shrpd => $shrpd,
    depd  => $depd,
);

# Return the output line for one instruction: the hand-encoded .WORD form
# when $mnemonic has an encoder, otherwise "\t<mnemonic><mod>\t<args>".
sub assemble {
  my ($mnemonic,$mod,$args)=@_;
  my $encoder = $encoder_for{$mnemonic};

    return defined($encoder) ? $encoder->($mod,$args) : "\t$mnemonic$mod\t$args";
}

# Post-process the accumulated templates line by line and print the result.
foreach (split("\n",$code)) {
	# Evaluate `expression` fragments (frame offsets, table constants).
	s/\`([^\`]*)\`/eval $1/ge;
	if ($SIZE_T==4) {
		# 32-bit flavours are assembled at .LEVEL 1.0: hand-encode the
		# 2.0 instructions and rewrite/strip the remaining 2.0-only
		# "cmpb,*" and ",*" spellings.
		s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e;
		s/cmpb,\*/comb,/;
		s/,\*/,/;
	}
	print $_,"\n";
}

# Buffered write errors only surface at close, so do not ignore them.
close STDOUT or die "error closing STDOUT: $!";