#!/usr/bin/env perl

# Mode-specific implementations for SPARC Architecture 2011. There is
# a T4 dependency, though: an ASI value that is not specified in the
# Architecture Manual. But as the SPARC universe is rather
# monocultural, we assume that any processor capable of executing the
# crypto instructions can handle the ASI in question as well. This
# means that we ought to keep our eyes open when new processors
# emerge...
#
# As for the above-mentioned ASI: it is the so-called "block
# initializing store", which cancels the "read" in "read-update-write"
# on cache lines. This is a "cooperative" optimization, as it reduces
# overall pressure on the memory interface. The benefit can't be
# observed or quantified with the usual benchmarks; on the contrary,
# you may notice that single-thread performance for parallelizable
# modes is ~1.5% worse for the largest block sizes [though a few
# percent better for shorter ones]. All this is based on suggestions
# from David Miller.

sub asm_init {		# to be called with @ARGV as argument
	for (@_)		{ $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
	if ($::abibits==64)	{ $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
	else			{ $::bias=0;    $::frame=112; $::size_t_cc="%icc"; }
}

# unified interface
my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
# local variables
my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));

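# A rough usage sketch (illustrative only; the actual driver code lives
# in the consumer scripts such as aest4-sparcv9.pl and may differ in
# detail). A consumer is expected to set $::evp, provide the per-key-size
# primitives referenced by the generated glue (_${alg}${bits}_load_enckey,
# _${alg}${bits}_encrypt_1x, _${alg}${bits}_encrypt_2x, ...) and then
# instantiate the modes it wants, e.g.:
#
#	&asm_init(@ARGV);
#	$::evp=1;
#	for my $bits (128, 192, 256) {
#		&alg_cbc_encrypt_implement("aes", $bits);
#		&alg_cbc_decrypt_implement("aes", $bits);
#		&alg_ctr32_implement("aes", $bits);
#	}
#	&emit_assembler();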
sub alg_cbc_encrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_encrypt
.align	32
${alg}${bits}_t4_cbc_encrypt:
	save		%sp, -$::frame, %sp
	sub		$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f0	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f2
	ldd		[$ivec + 16], %f4
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
1:
___
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f0
	ld		[$ivec + 4], %f1
	ld		[$ivec + 8], %f2
	ld		[$ivec + 12], %f3
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 127
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<128 ||
	brnz,pn		$blk_init, .L${bits}cbc_enc_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	alignaddrl	$out, %g0, $out
	srlx		$len, 4, $len
	prefetch	[$out], 22

.L${bits}_cbc_enc_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_enc_loop
	add		$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_enc_loop+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3f
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
	ret
	restore

.align	16
3:	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$ivec + $omask]0xc0
	std		%f6, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f8, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_enc_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init

.L${bits}_cbc_enc_blk_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 5f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
5:
	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_encrypt_1x
	add		$inp, 16, $inp
	sub		$len, 1, $len

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	brnz,pt		$len, .L${bits}_cbc_enc_blk_loop
	add		$out, 8, $out

	membar		#StoreLoad|#StoreStore
	brnz,pt		$blk_init, .L${bits}_cbc_enc_loop
	mov		$blk_init, $len
___
$::code.=<<___ if ($::evp);
	st		%f0, [$ivec + 0]
	st		%f1, [$ivec + 4]
	st		%f2, [$ivec + 8]
	st		%f3, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3b
	nop

	std		%f0, [$ivec + 0]	! write out ivec
	std		%f2, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_encrypt,#function
.size	${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
___
}

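# Note on the CBC encrypt glue above: CBC encryption is inherently
# serial (each block's input depends on the previous ciphertext), so
# only a 1x loop is generated. The .L${bits}cbc_enc_blk path is taken
# for long, output-aligned, non-overlapping buffers and uses the
# T4-specific block-initializing stores (ASI 0xe2), followed by a
# membar before the remaining tail blocks are stored the ordinary way.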
sub alg_cbc_decrypt_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_cbc_decrypt
.align	32
${alg}${bits}_t4_cbc_decrypt:
	save		%sp, -$::frame, %sp
	sub		$inp, $out, $blk_init	! $inp!=$out
___
$::code.=<<___ if (!$::evp);
	andcc		$ivec, 7, $ivoff
	alignaddr	$ivec, %g0, $ivec

	ldd		[$ivec + 0], %f12	! load ivec
	bz,pt		%icc, 1f
	ldd		[$ivec + 8], %f14
	ldd		[$ivec + 16], %f0
	faligndata	%f12, %f14, %f12
	faligndata	%f14, %f0, %f14
1:
___
$::code.=<<___ if ($::evp);
	ld		[$ivec + 0], %f12	! load ivec
	ld		[$ivec + 4], %f13
	ld		[$ivec + 8], %f14
	ld		[$ivec + 12], %f15
___
$::code.=<<___;
	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_deckey
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}cbc_dec_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	srlx		$len, 4, $len
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_cbc_dec_loop2x
	prefetch	[$out], 22
.L${bits}_cbc_dec_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g4, %o0, %o2		! ^= rk[0]
	xor		%g5, %o1, %o3
	movxtod		%o2, %f0
	movxtod		%o3, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_decrypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o0, %f12
	movxtod		%o1, %f14

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 16, $out
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_cbc_dec_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_cbc_dec_loop2x
	add		$out, 32, $out
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6
	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_cbc_dec_loop2x+4
	orn		%g0, $omask, $omask
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, .L${bits}_cbc_dec_unaligned_ivec
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
	ret
	restore

.align	16
.L${bits}_cbc_dec_unaligned_ivec:
	alignaddrl	$ivec, $ivoff, %g0	! handle unaligned ivec
	mov		0xff, $omask
	srl		$omask, $ivoff, $omask
	faligndata	%f12, %f12, %f0
	faligndata	%f12, %f14, %f2
	faligndata	%f14, %f14, %f4
	stda		%f0, [$ivec + $omask]0xc0
	std		%f2, [$ivec + 8]
	add		$ivec, 16, $ivec
	orn		%g0, $omask, $omask
	stda		%f4, [$ivec + $omask]0xc0
___
$::code.=<<___;
	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}cbc_dec_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_cbc_dec_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g4, %o0, %o4		! ^= rk[0]
	xor		%g5, %o1, %o5
	movxtod		%o4, %f0
	movxtod		%o5, %f2
	xor		%g4, %o2, %o4
	xor		%g5, %o3, %o5
	movxtod		%o4, %f4
	movxtod		%o5, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_decrypt_2x
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	fxor		%f12, %f0, %f0		! ^= ivec
	fxor		%f14, %f2, %f2
	movxtod		%o2, %f12
	movxtod		%o3, %f14
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_cbc_dec_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_cbc_dec_loop2x
	nop
___
$::code.=<<___ if ($::evp);
	st		%f12, [$ivec + 0]	! write out ivec
	st		%f13, [$ivec + 4]
	st		%f14, [$ivec + 8]
	st		%f15, [$ivec + 12]
___
$::code.=<<___ if (!$::evp);
	brnz,pn		$ivoff, 3b
	nop

	std		%f12, [$ivec + 0]	! write out ivec
	std		%f14, [$ivec + 8]
___
$::code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_cbc_decrypt,#function
.size	${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
___
}

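# Note on the CBC decrypt glue above: unlike encryption, decryption is
# parallelizable, so the main path processes two blocks per iteration
# via _${alg}${bits}_decrypt_2x; an odd leading block is peeled off in
# .L${bits}_cbc_dec_loop before the code joins the 2x loop. The same
# even/odd split recurs in the ctr32 and xts glue below.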
sub alg_ctr32_implement {
my ($alg,$bits) = @_;

$::code.=<<___;
.globl	${alg}${bits}_t4_ctr32_encrypt
.align	32
${alg}${bits}_t4_ctr32_encrypt:
	save		%sp, -$::frame, %sp

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_enckey
	sllx		$len, 4, $len

	ld		[$ivec + 0], %l4	! counter
	ld		[$ivec + 4], %l5
	ld		[$ivec + 8], %l6
	ld		[$ivec + 12], %l7

	sllx		%l4, 32, %o5
	or		%l5, %o5, %o5
	sllx		%l6, 32, %g1
	xor		%o5, %g4, %g4		! ^= rk[0]
	xor		%g1, %g5, %g5
	movxtod		%g4, %f14		! most significant 64 bits

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}_ctr32_blk	!	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_ctr32_loop2x
	srlx		$len, 4, $len
.L${bits}_ctr32_loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f4
	aes_eround23	%f18, %f14, %f2, %f2
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f18, %f14, %f2, %f0
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_1x+8
	add		$inp, 16, $inp

	movxtod		%o0, %f10
	movxtod		%o1, %f12
	fxor		%f10, %f0, %f0		! ^= inp
	fxor		%f12, %f2, %f2

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 16, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_ctr32_loop2x
	add		$out, 32, $out

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f6, %f6

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f0, [$out + 8]
	std		%f2, [$out + 16]
	std		%f4, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f6, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_ctr32_loop2x+4
	orn		%g0, $omask, $omask

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_ctr32_blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_ctr32_blk_loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	xor		%g5, %l7, %g1		! ^= rk[0]
	add		%l7, 1, %l7
	movxtod		%g1, %f2
	srl		%l7, 0, %l7		! clruw
	xor		%g5, %l7, %g1
	add		%l7, 1, %l7
	movxtod		%g1, %f6
	srl		%l7, 0, %l7		! clruw
	prefetch	[$inp + 32+63], 20
___
$::code.=<<___ if ($alg eq "aes");
	aes_eround01	%f16, %f14, %f2, %f8
	aes_eround23	%f18, %f14, %f2, %f2
	aes_eround01	%f16, %f14, %f6, %f10
	aes_eround23	%f18, %f14, %f6, %f6
___
$::code.=<<___ if ($alg eq "cmll");
	camellia_f	%f16, %f2, %f14, %f2
	camellia_f	%f16, %f6, %f14, %f6
	camellia_f	%f18, %f14, %f2, %f0
	camellia_f	%f18, %f14, %f6, %f4
___
$::code.=<<___;
	call		_${alg}${bits}_encrypt_2x+16
	add		$inp, 32, $inp
	subcc		$len, 2, $len

	movxtod		%o0, %f8
	movxtod		%o1, %f10
	movxtod		%o2, %f12
	fxor		%f8, %f0, %f0		! ^= inp
	movxtod		%o3, %f8
	fxor		%f10, %f2, %f2
	fxor		%f12, %f4, %f4
	fxor		%f8, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_ctr32_blk_loop2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_ctr32_loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_ctr32_loop2x
	nop

	ret
	restore
.type	${alg}${bits}_t4_ctr32_encrypt,#function
.size	${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
___
}
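# Note on the ctr32 glue above: as the name implies, only the least
# significant 32 bits of the counter (%l7) are incremented, and
# "srl %l7, 0, %l7" clears the upper half so the counter wraps modulo
# 2^32. The upper 96 bits of the counter block are loop invariant, so
# they are pre-XORed with rk[0] once (%g4/%g5, with %f14 holding the
# most significant half), and the first round(s) for each block are
# issued in the glue before entering _${alg}${bits}_encrypt_1x+8 /
# _${alg}${bits}_encrypt_2x+16 past their own leading instructions.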

sub alg_xts_implement {
my ($alg,$bits,$dir) = @_;
my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
my $rem=$ivec;

$::code.=<<___;
.globl	${alg}${bits}_t4_xts_${dir}crypt
.align	32
${alg}${bits}_t4_xts_${dir}crypt:
	save		%sp, -$::frame-16, %sp

	mov		$ivec, %o0
	add		%fp, $::bias-16, %o1
	call		${alg}_t4_encrypt
	mov		$key2, %o2

	add		%fp, $::bias-16, %l7
	ldxa		[%l7]0x88, %g2
	add		%fp, $::bias-8, %l7
	ldxa		[%l7]0x88, %g3		! %g3:%g2 is tweak

	sethi		%hi(0x76543210), %l7
	or		%l7, %lo(0x76543210), %l7
	bmask		%l7, %g0, %g0		! byte swap mask

	prefetch	[$inp], 20
	prefetch	[$inp + 63], 20
	call		_${alg}${bits}_load_${dir}ckey
	and		$len, 15, $rem
	and		$len, -16, $len
___
$code.=<<___ if ($dir eq "de");
	mov		0, %l7
	movrnz		$rem, 16, %l7
	sub		$len, %l7, $len
___
$code.=<<___;

	sub		$inp, $out, $blk_init	! $inp!=$out
	and		$inp, 7, $ileft
	andn		$inp, 7, $inp
	sll		$ileft, 3, $ileft
	mov		64, $iright
	mov		0xff, $omask
	sub		$iright, $ileft, $iright
	and		$out, 7, $ooff
	cmp		$len, 255
	movrnz		$ooff, 0, $blk_init		! if (	$out&7 ||
	movleu		$::size_t_cc, 0, $blk_init	!	$len<256 ||
	brnz,pn		$blk_init, .L${bits}_xts_${dir}blk !	$inp==$out)
	srl		$omask, $ooff, $omask

	andcc		$len, 16, %g0		! is number of blocks even?
___
$code.=<<___ if ($dir eq "de");
	brz,pn		$len, .L${bits}_xts_${dir}steal
___
$code.=<<___;
	alignaddrl	$out, %g0, $out
	bz		%icc, .L${bits}_xts_${dir}loop2x
	srlx		$len, 4, $len
.L${bits}_xts_${dir}loop:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 4f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
4:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f0
	movxtod		%o1, %f2

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	prefetch	[$out + 63], 22
	prefetch	[$inp + 16+63], 20
	call		_${alg}${bits}_${dir}crypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	brnz,pn		$ooff, 2f
	sub		$len, 1, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
	add		$out, 16, $out

	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f4		! handle unaligned output
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8
	stda		%f4, [$out + $omask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $omask, $omask
	stda		%f8, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
	orn		%g0, $omask, $omask

	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}loop2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 4f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
4:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	movxtod		%g2, %f8
	movxtod		%g3, %f10
	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	movxtod		%o0, %f0
	movxtod		%o1, %f2
	movxtod		%o2, %f4
	movxtod		%o3, %f6

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8, %f4, %f4		! ^= tweak[0]
	fxor		%f10, %f6, %f6

	prefetch	[$out + 63], 22
	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_${dir}crypt_2x
	add		$inp, 32, $inp

	movxtod		%g2, %f8
	movxtod		%g3, %f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	brnz,pn		$ooff, 2f
	sub		$len, 2, $len

	std		%f0, [$out + 0]
	std		%f2, [$out + 8]
	std		%f4, [$out + 16]
	std		%f6, [$out + 24]
	brnz,pt		$len, .L${bits}_xts_${dir}loop2x
	add		$out, 32, $out

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

.align	16
2:	ldxa		[$inp]0x82, %o0		! avoid read-after-write hazard
						! and ~3x deterioration
						! in inp==out case
	faligndata	%f0, %f0, %f8		! handle unaligned output
	faligndata	%f0, %f2, %f10
	faligndata	%f2, %f4, %f12
	faligndata	%f4, %f6, %f14
	faligndata	%f6, %f6, %f0

	stda		%f8, [$out + $omask]0xc0	! partial store
	std		%f10, [$out + 8]
	std		%f12, [$out + 16]
	std		%f14, [$out + 24]
	add		$out, 32, $out
	orn		%g0, $omask, $omask
	stda		%f0, [$out + $omask]0xc0	! partial store

	brnz,pt		$len, .L${bits}_xts_${dir}loop2x+4
	orn		%g0, $omask, $omask

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
.align	32
.L${bits}_xts_${dir}blk:
	add	$out, $len, $blk_init
	and	$blk_init, 63, $blk_init	! tail
	sub	$len, $blk_init, $len
	add	$blk_init, 15, $blk_init	! round up to 16n
	srlx	$len, 4, $len
	srl	$blk_init, 4, $blk_init
	sub	$len, 1, $len
	add	$blk_init, 1, $blk_init

.L${bits}_xts_${dir}blk2x:
	ldx		[$inp + 0], %o0
	ldx		[$inp + 8], %o1
	ldx		[$inp + 16], %o2
	brz,pt		$ileft, 5f
	ldx		[$inp + 24], %o3

	ldx		[$inp + 32], %o4
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	or		%g1, %o0, %o0
	sllx		%o1, $ileft, %o1
	srlx		%o2, $iright, %g1
	or		%g1, %o1, %o1
	sllx		%o2, $ileft, %o2
	srlx		%o3, $iright, %g1
	or		%g1, %o2, %o2
	sllx		%o3, $ileft, %o3
	srlx		%o4, $iright, %o4
	or		%o4, %o3, %o3
5:
	movxtod		%g2, %f12
	movxtod		%g3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	movxtod		%g2, %f8
	movxtod		%g3, %f10
	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	xor		%g4, %o2, %o2		! ^= rk[0]
	xor		%g5, %o3, %o3
	movxtod		%o0, %f0
	movxtod		%o1, %f2
	movxtod		%o2, %f4
	movxtod		%o3, %f6

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8, %f4, %f4		! ^= tweak[0]
	fxor		%f10, %f6, %f6

	prefetch	[$inp + 32+63], 20
	call		_${alg}${bits}_${dir}crypt_2x
	add		$inp, 32, $inp

	movxtod		%g2, %f8
	movxtod		%g3, %f10

	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %g2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %g3
	xor		%l7, %g2, %g2

	bshuffle	%f8, %f8, %f8
	bshuffle	%f10, %f10, %f10

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2
	fxor		%f8, %f4, %f4
	fxor		%f10, %f6, %f6

	stda		%f0, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f2, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f4, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	add		$out, 8, $out
	stda		%f6, [$out]0xe2		! ASI_BLK_INIT, T4-specific
	bgu,pt		$::size_t_cc, .L${bits}_xts_${dir}blk2x
	add		$out, 8, $out

	add		$blk_init, $len, $len
	andcc		$len, 1, %g0		! is number of blocks even?
	membar		#StoreLoad|#StoreStore
	bnz,pt		%icc, .L${bits}_xts_${dir}loop
	srl		$len, 0, $len
	brnz,pn		$len, .L${bits}_xts_${dir}loop2x
	nop

	fsrc2		%f4, %f0
	fsrc2		%f6, %f2
	brnz,pn		$rem, .L${bits}_xts_${dir}steal
	nop

	ret
	restore
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
___
$code.=<<___ if ($dir eq "en");
.align	32
.L${bits}_xts_${dir}steal:
	std		%f0, [%fp + $::bias-16]	! copy of output
	std		%f2, [%fp + $::bias-8]

	srl		$ileft, 3, $ileft
	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-15
	add		$out, $ooff, $out	! original $out+$len&-15
	mov		0, $ileft
	nop					! align

.L${bits}_xts_${dir}stealing:
	ldub		[$inp + $ileft], %o0
	ldub		[%l7 + $ileft], %o1
	dec		$rem
	stb		%o0, [%l7 + $ileft]
	stb		%o1, [$out + $ileft]
	brnz		$rem, .L${bits}_xts_${dir}stealing
	inc		$ileft

	mov		%l7, $inp
	sub		$out, 16, $out
	mov		0, $ileft
	sub		$out, $ooff, $out
	ba		.L${bits}_xts_${dir}loop	! one more time
	mov		1, $len				! $rem is 0
___
$code.=<<___ if ($dir eq "de");
.align	32
.L${bits}_xts_${dir}steal:
	ldx		[$inp + 0], %o0
	brz,pt		$ileft, 8f
	ldx		[$inp + 8], %o1

	ldx		[$inp + 16], %o2
	sllx		%o0, $ileft, %o0
	srlx		%o1, $iright, %g1
	sllx		%o1, $ileft, %o1
	or		%g1, %o0, %o0
	srlx		%o2, $iright, %o2
	or		%o2, %o1, %o1
8:
	srax		%g3, 63, %l7		! next tweak value
	addcc		%g2, %g2, %o2
	and		%l7, 0x87, %l7
	addxc		%g3, %g3, %o3
	xor		%l7, %o2, %o2

	movxtod		%o2, %f12
	movxtod		%o3, %f14
	bshuffle	%f12, %f12, %f12
	bshuffle	%f14, %f14, %f14

	xor		%g4, %o0, %o0		! ^= rk[0]
	xor		%g5, %o1, %o1
	movxtod		%o0, %f0
	movxtod		%o1, %f2

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	call		_${alg}${bits}_${dir}crypt_1x
	add		$inp, 16, $inp

	fxor		%f12, %f0, %f0		! ^= tweak[0]
	fxor		%f14, %f2, %f2

	std		%f0, [%fp + $::bias-16]
	std		%f2, [%fp + $::bias-8]

	srl		$ileft, 3, $ileft
	add		%fp, $::bias-16, %l7
	add		$inp, $ileft, $inp	! original $inp+$len&-15
	add		$out, $ooff, $out	! original $out+$len&-15
	mov		0, $ileft
	add		$out, 16, $out
	nop					! align

.L${bits}_xts_${dir}stealing:
	ldub		[$inp + $ileft], %o0
	ldub		[%l7 + $ileft], %o1
	dec		$rem
	stb		%o0, [%l7 + $ileft]
	stb		%o1, [$out + $ileft]
	brnz		$rem, .L${bits}_xts_${dir}stealing
	inc		$ileft

	mov		%l7, $inp
	sub		$out, 16, $out
	mov		0, $ileft
	sub		$out, $ooff, $out
	ba		.L${bits}_xts_${dir}loop	! one more time
	mov		1, $len				! $rem is 0
___
$code.=<<___;
	ret
	restore
.type	${alg}${bits}_t4_xts_${dir}crypt,#function
.size	${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
___
}

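# Note on the xts glue above: the tweak lives in the integer registers
# %g3:%g2 and is multiplied by x in GF(2^128) with plain integer
# arithmetic (srax extracts the carry as 0/-1 and masks it with the
# 0x87 reduction constant, while addcc/addxc shift the 128-bit value
# left by one); bshuffle with the 0x76543210 bmask then byte-swaps it
# into the FP registers used for the actual xor. Partial final blocks
# are handled by the ciphertext-stealing paths (.L*_xts_*steal).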
# The purpose of these subroutines is to explicitly encode VIS
# instructions, so that one can compile the module without having to
# specify VIS extensions on the compiler command line, e.g. -xarch=v9
# vs. -xarch=v9a. The idea is to reserve the option of producing a
# "universal" binary and letting the programmer detect at run-time
# whether the current CPU is VIS capable.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"bshuffle"	=> 0x04c,
		"fnot2"		=> 0x066,
		"fxor"		=> 0x06c,
		"fsrc2"		=> 0x078	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"addxc"		=> 0x011,
		"addxccc"	=> 0x013,
		"umulxhi"	=> 0x016,
		"alignaddr"	=> 0x018,
		"bmask"		=> 0x019,
		"alignaddrl"	=> 0x01a	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
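# A worked example of the encoding above (illustrative only): in the
# SPARC V9 instruction format the double-precision registers %f32-%f62
# are encoded in a 5-bit field with the low bit set, which is what
# "($1|$1>>5)&31" computes (e.g. %f34 -> 0b00011). Thus
# unvis("faligndata","%f0","%f2","%f4") should come out as
# ".word 0x89b00902 !faligndata %f0,%f2,%f4".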

sub unaes_round {	# 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"aes_eround01"	=> 0,
		"aes_eround23"	=> 1,
		"aes_dround01"	=> 2,
		"aes_dround23"	=> 3,
		"aes_eround01_l"=> 4,
		"aes_eround23_l"=> 5,
		"aes_dround01_l"=> 6,
		"aes_dround23_l"=> 7,
		"aes_kexpand1"	=> 8	);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unaes_kexpand {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"aes_kexpand0"	=> 0x130,
		"aes_kexpand2"	=> 0x131	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub uncamellia_f {	# 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (1) {
	$rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub uncamellia3 {	# 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %cmllopf = (	"camellia_fl"	=> 0x13c,
		"camellia_fli"	=> 0x13d	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$cmllopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unmovxtox {		# 2-argument instructions
my ($mnemonic,$rs,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
my ($ref,$opf);
my %movxopf = (	"movdtox"	=> 0x110,
		"movstouw"	=> 0x111,
		"movstosw"	=> 0x113,
		"movxtod"	=> 0x118,
		"movwtos"	=> 0x119	);

    $ref = "$mnemonic\t$rs,$rd";

    if (defined($opf=$movxopf{$mnemonic})) {
	foreach ($rs,$rd) {
	    return $ref if (!/%([fgoli])([0-9]{1,2})/);
	    $_=$bias{$1}+$2;
	    if ($2>=32) {
		return $ref if ($2&1);
		# re-encode for upper double register addressing
		$_=($2|$2>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
			$ref;
    } else {
	return $ref;
    }
}

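# The undes() helper below follows the same pattern but has to handle
# three operand counts: des_round (4 arguments, encoded with
# op3=0b011001 like the AES/Camellia 4-argument forms), des_kexpand
# (3 arguments) and des_ip/des_iip (2 arguments), the latter two using
# op3=0b110110 like the 3-argument helpers above.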
sub undes {
my ($mnemonic)=shift;
my @args=@_;
my ($ref,$opf);
my %desopf = (	"des_round"	=> 0b1001,
		"des_ip"	=> 0b100110100,
		"des_iip"	=> 0b100110101,
		"des_kexpand"	=> 0b100110110	);

    $ref = "$mnemonic\t".join(",",@_);

    if (defined($opf=$desopf{$mnemonic})) {	# 4-arg
	if ($mnemonic eq "des_round") {
	    foreach (@args[0..3]) {
		return $ref if (!/%f([0-9]{1,2})/);
		$_=$1;
		if ($1>=32) {
		    return $ref if ($1&1);
		    # re-encode for upper double register addressing
		    $_=($1|$1>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
			    $ref;
	} elsif ($mnemonic eq "des_kexpand") {	# 3-arg
	    foreach (@args[0..2]) {
		return $ref if (!/(%f)?([0-9]{1,2})/);
		$_=$2;
		if ($2>=32) {
		    return $ref if ($2&1);
		    # re-encode for upper double register addressing
		    $_=($2|$2>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
			    $ref;
	} else {				# 2-arg
	    foreach (@args[0..1]) {
		return $ref if (!/%f([0-9]{1,2})/);
		$_=$1;
		if ($1>=32) {
		    return $ref if ($1&1);
		    # re-encode for upper double register addressing
		    $_=($1|$1>>5)&31;
		}
	    }
	    return  sprintf ".word\t0x%08x !%s",
			    2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
			    $ref;
	}
    } else {
	return $ref;
    }
}

sub emit_assembler {
    foreach (split("\n",$::code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;

	s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&unaes_round($1,$2,$3,$4,$5)
	 /geo or
	s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unaes_kexpand($1,$2,$3,$4)
	 /geo or
	s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&uncamellia_f($1,$2,$3,$4,$5)
	 /geo or
	s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&uncamellia3($1,$2,$3,$4)
	 /geo or
	s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
		&undes($1,$2,$3,$4,$5)
	 /geo or
	s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
		&unmovxtox($1,$2,$3)
	 /geo or
	s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
		&unmovxtox($1,$2,$3)
	 /geo or
	s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
	 /geo or
	s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
	 /geo;

	print $_,"\n";
    }
}

1;