#! /usr/bin/env perl
# Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2016
#
# Initial support for Fujitsu SPARC64 X/X+ comprises the minimally
# required key setup and single-block procedures.
#
# April 2016
#
# Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means
# that the parallelizable nature of CBC decrypt and CTR is not yet
# utilized. CBC encrypt, on the other hand, is as good as it can
# possibly get, processing one byte in 4.1 cycles with a 128-bit key
# on SPARC64 X. This is ~6x faster than the pure software
# implementation.
#
# July 2016
#
# Switch from faligndata to fshiftorx, which makes it possible to omit
# alignaddr instructions and improves single-block and short-input
# performance with misaligned data.

$output = pop and open STDOUT,">$output";

{
my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5));

$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#define LOCALS (STACK_BIAS+STACK_FRAME)

.text

.globl	aes_fx_encrypt
.align	32
aes_fx_encrypt:
	and	$inp, 7, $tmp		! is input aligned?
	andn	$inp, 7, $inp
	ldd	[$key + 0], %f6		! round[0]
	ldd	[$key + 8], %f8
	mov	%o7, %g1
	ld	[$key + 240], $rounds

1:	call	.+8
	add	%o7, .Linp_align-1b, %o7

	sll	$tmp, 3, $tmp
	ldd	[$inp + 0], %f0		! load input
	brz,pt	$tmp, .Lenc_inp_aligned
	ldd	[$inp + 8], %f2

	ldd	[%o7 + $tmp], %f14	! shift left params
	ldd	[$inp + 16], %f4
	fshiftorx	%f0, %f2, %f14, %f0
	fshiftorx	%f2, %f4, %f14, %f2

.Lenc_inp_aligned:
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fxor	%f0, %f6, %f0		! ^=round[0]
	fxor	%f2, %f8, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8
	add	$key, 32, $key
	sub	$rounds, 4, $rounds

.Loop_enc:
	fmovd	%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd	[$key + 16], %f10
	ldd	[$key + 24], %f12
	add	$key, 32, $key

	fmovd	%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd	[$key + 0], %f6
	ldd	[$key + 8], %f8

	brnz,a	$rounds, .Loop_enc
	sub	$rounds, 2, $rounds

	andcc	$out, 7, $tmp		! is output aligned?
	andn	$out, 7, $out
	mov	0xff, $mask
	srl	$mask, $tmp, $mask
	add	%o7, 64, %o7
	sll	$tmp, 3, $tmp

	fmovd	%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd	[%o7 + $tmp], %f14	! shift right params

	fmovd	%f0, %f4
	faesenclx	%f2, %f6, %f0
	faesenclx	%f4, %f8, %f2

	bnz,pn	%icc, .Lenc_out_unaligned
	mov	%g1, %o7

	std	%f0, [$out + 0]
	retl
	std	%f2, [$out + 8]

.align	16
.Lenc_out_unaligned:
	add	$out, 16, $inp
	orn	%g0, $mask, $tmp
	fshiftorx	%f0, %f0, %f14, %f4
	fshiftorx	%f0, %f2, %f14, %f6
	fshiftorx	%f2, %f2, %f14, %f8

	stda	%f4, [$out + $mask]0xc0	! partial store
	std	%f6, [$out + 8]
	stda	%f8, [$inp + $tmp]0xc0	! partial store
	retl
	nop
.type	aes_fx_encrypt,#function
.size	aes_fx_encrypt,.-aes_fx_encrypt

.globl	aes_fx_decrypt
.align	32
aes_fx_decrypt:
	and	$inp, 7, $tmp		! is input aligned?
	andn	$inp, 7, $inp
	ldd	[$key + 0], %f6		! round[0]
	ldd	[$key + 8], %f8
	mov	%o7, %g1
	ld	[$key + 240], $rounds

1:	call	.+8
	add	%o7, .Linp_align-1b, %o7

	sll	$tmp, 3, $tmp
	ldd	[$inp + 0], %f0		! load input
	brz,pt	$tmp, .Ldec_inp_aligned
	ldd	[$inp + 8], %f2

	ldd	[%o7 + $tmp], %f14	! shift left params
	ldd	[$inp + 16], %f4
	fshiftorx	%f0, %f2, %f14, %f0
	fshiftorx	%f2, %f4, %f14, %f2

.Ldec_inp_aligned:
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fxor	%f0, %f6, %f0		! ^=round[0]
	fxor	%f2, %f8, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8
	add	$key, 32, $key
	sub	$rounds, 4, $rounds

.Loop_dec:
	fmovd	%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd	[$key + 16], %f10
	ldd	[$key + 24], %f12
	add	$key, 32, $key

	fmovd	%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2
	ldd	[$key + 0], %f6
	ldd	[$key + 8], %f8

	brnz,a	$rounds, .Loop_dec
	sub	$rounds, 2, $rounds

	andcc	$out, 7, $tmp		! is output aligned?
	andn	$out, 7, $out
	mov	0xff, $mask
	srl	$mask, $tmp, $mask
	add	%o7, 64, %o7
	sll	$tmp, 3, $tmp

	fmovd	%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd	[%o7 + $tmp], %f14	! shift right params

	fmovd	%f0, %f4
	faesdeclx	%f2, %f6, %f0
	faesdeclx	%f4, %f8, %f2

	bnz,pn	%icc, .Ldec_out_unaligned
	mov	%g1, %o7

	std	%f0, [$out + 0]
	retl
	std	%f2, [$out + 8]

.align	16
.Ldec_out_unaligned:
	add	$out, 16, $inp
	orn	%g0, $mask, $tmp
	fshiftorx	%f0, %f0, %f14, %f4
	fshiftorx	%f0, %f2, %f14, %f6
	fshiftorx	%f2, %f2, %f14, %f8

	stda	%f4, [$out + $mask]0xc0	! partial store
	std	%f6, [$out + 8]
	stda	%f8, [$inp + $tmp]0xc0	! partial store
	retl
	nop
.type	aes_fx_decrypt,#function
.size	aes_fx_decrypt,.-aes_fx_decrypt
___
}
{
my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5));
$code.=<<___;
.globl	aes_fx_set_decrypt_key
.align	32
aes_fx_set_decrypt_key:
	b	.Lset_encrypt_key
	mov	-1, $inc
	retl
	nop
.type	aes_fx_set_decrypt_key,#function
.size	aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key

.globl	aes_fx_set_encrypt_key
.align	32
aes_fx_set_encrypt_key:
	mov	1, $inc
	nop
.Lset_encrypt_key:
	and	$inp, 7, $tmp
	andn	$inp, 7, $inp
	sll	$tmp, 3, $tmp
	mov	%o7, %g1

1:	call	.+8
	add	%o7, .Linp_align-1b, %o7

	ldd	[%o7 + $tmp], %f10	! shift left params
	mov	%g1, %o7

	cmp	$bits, 192
	ldd	[$inp + 0], %f0
	bl,pt	%icc, .L128
	ldd	[$inp + 8], %f2

	be,pt	%icc, .L192
	ldd	[$inp + 16], %f4
	brz,pt	$tmp, .L256aligned
	ldd	[$inp + 24], %f6

	ldd	[$inp + 32], %f8
	fshiftorx	%f0, %f2, %f10, %f0
	fshiftorx	%f2, %f4, %f10, %f2
	fshiftorx	%f4, %f6, %f10, %f4
	fshiftorx	%f6, %f8, %f10, %f6

.L256aligned:
	mov	14, $bits
	and	$inc, `14*16`, $tmp
	st	$bits, [$out + 240]	! store rounds
	add	$out, $tmp, $out	! start or end of key schedule
	sllx	$inc, 4, $inc		! 16 or -16
___
for ($i=0; $i<6; $i++) {
    $code.=<<___;
	std	%f0, [$out + 0]
	faeskeyx	%f6, `0x10+$i`, %f0
	std	%f2, [$out + 8]
	add	$out, $inc, $out
	faeskeyx	%f0, 0x00, %f2
	std	%f4, [$out + 0]
	faeskeyx	%f2, 0x01, %f4
	std	%f6, [$out + 8]
	add	$out, $inc, $out
	faeskeyx	%f4, 0x00, %f6
___
}
$code.=<<___;
	std	%f0, [$out + 0]
	faeskeyx	%f6, `0x10+$i`, %f0
	std	%f2, [$out + 8]
	add	$out, $inc, $out
	faeskeyx	%f0, 0x00, %f2
	std	%f4,[$out + 0]
	std	%f6,[$out + 8]
	add	$out, $inc, $out
	std	%f0,[$out + 0]
	std	%f2,[$out + 8]
	retl
	xor	%o0, %o0, %o0		! return 0

.align	16
.L192:
	brz,pt	$tmp, .L192aligned
	nop

	ldd	[$inp + 24], %f6
	fshiftorx	%f0, %f2, %f10, %f0
	fshiftorx	%f2, %f4, %f10, %f2
	fshiftorx	%f4, %f6, %f10, %f4

.L192aligned:
	mov	12, $bits
	and	$inc, `12*16`, $tmp
	st	$bits, [$out + 240]	! store rounds
	add	$out, $tmp, $out	! start or end of key schedule
	sllx	$inc, 4, $inc		! 16 or -16
___
for ($i=0; $i<8; $i+=2) {
    $code.=<<___;
	std	%f0, [$out + 0]
	faeskeyx	%f4, `0x10+$i`, %f0
	std	%f2, [$out + 8]
	add	$out, $inc, $out
	faeskeyx	%f0, 0x00, %f2
	std	%f4, [$out + 0]
	faeskeyx	%f2, 0x00, %f4
	std	%f0, [$out + 8]
	add	$out, $inc, $out
	faeskeyx	%f4, `0x10+$i+1`, %f0
	std	%f2, [$out + 0]
	faeskeyx	%f0, 0x00, %f2
	std	%f4, [$out + 8]
	add	$out, $inc, $out
___
$code.=<<___	if ($i<6);
	faeskeyx	%f2, 0x00, %f4
___
}
$code.=<<___;
	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	retl
	xor	%o0, %o0, %o0		! return 0

.align	16
.L128:
	brz,pt	$tmp, .L128aligned
	nop

	ldd	[$inp + 16], %f4
	fshiftorx	%f0, %f2, %f10, %f0
	fshiftorx	%f2, %f4, %f10, %f2

.L128aligned:
	mov	10, $bits
	and	$inc, `10*16`, $tmp
	st	$bits, [$out + 240]	! store rounds
	add	$out, $tmp, $out	! start or end of key schedule
	sllx	$inc, 4, $inc		! 16 or -16
___
for ($i=0; $i<10; $i++) {
    $code.=<<___;
	std	%f0, [$out + 0]
	faeskeyx	%f2, `0x10+$i`, %f0
	std	%f2, [$out + 8]
	add	$out, $inc, $out
	faeskeyx	%f0, 0x00, %f2
___
}
$code.=<<___;
	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	retl
	xor	%o0, %o0, %o0		! return 0
.type	aes_fx_set_encrypt_key,#function
.size	aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key
___
}
{
my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5));
my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
my ($iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
    = map("%f$_",grep { !($_ & 1) } (16 .. 62));
my ($ileft,$iright) = ($ialign,$oalign);

$code.=<<___;
.globl	aes_fx_cbc_encrypt
.align	32
aes_fx_cbc_encrypt:
	save	%sp, -STACK_FRAME-16, %sp
	srln	$len, 4, $len
	and	$inp, 7, $ialign
	andn	$inp, 7, $inp
	brz,pn	$len, .Lcbc_no_data
	sll	$ialign, 3, $ileft

1:	call	.+8
	add	%o7, .Linp_align-1b, %o7

	ld	[$key + 240], $rounds
	and	$out, 7, $oalign
	ld	[$ivp + 0], %f0		! load ivec
	andn	$out, 7, $out
	ld	[$ivp + 4], %f1
	sll	$oalign, 3, $mask
	ld	[$ivp + 8], %f2
	ld	[$ivp + 12], %f3

	sll	$rounds, 4, $rounds
	add	$rounds, $key, $end
	ldd	[$key + 0], $r0hi	! round[0]
	ldd	[$key + 8], $r0lo

	add	$inp, 16, $inp
	sub	$len, 1, $len
	ldd	[$end + 0], $rlhi	! round[last]
	ldd	[$end + 8], $rllo

	mov	16, $inc
	movrz	$len, 0, $inc
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	ldd	[%o7 + $ileft], $fshift	! shift left params
	add	%o7, 64, %o7
	ldd	[$inp - 16], $in0	! load input
	ldd	[$inp - 8], $in1
	ldda	[$inp]0x82, $intail	! non-faulting load
	brz	$dir, .Lcbc_decrypt
	add	$inp, $inc, $inp	! inp+=16

	fxor	$r0hi, %f0, %f0		! ivec^=round[0]
	fxor	$r0lo, %f2, %f2
	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1
	nop

.Loop_cbc_enc:
	fxor	$in0, %f0, %f0		! inp^ivec^round[0]
	fxor	$in1, %f2, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8
	add	$key, 32, $end
	sub	$rounds, 16*6, $inner

.Lcbc_enc:
	fmovd	%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8

	brnz,a	$inner, .Lcbc_enc
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12

	movrz	$len, 0, $inc
	fmovd	$intail, $in0
	ldd	[$inp - 8], $in1	! load next input block
	ldda	[$inp]0x82, $intail	! non-faulting load
	add	$inp, $inc, $inp	! inp+=16

	fmovd	%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2

	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1

	fmovd	%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fxor	$r0hi, $in0, $in0	! inp^=round[0]
	fxor	$r0lo, $in1, $in1

	fmovd	%f0, %f4
	faesenclx	%f2, $rlhi, %f0
	faesenclx	%f4, $rllo, %f2

	brnz,pn	$oalign, .Lcbc_enc_unaligned_out
	nop

	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	add	$out, 16, $out

	brnz,a	$len, .Loop_cbc_enc
	sub	$len, 1, $len

	st	%f0, [$ivp + 0]		! output ivec
	st	%f1, [$ivp + 4]
	st	%f2, [$ivp + 8]
	st	%f3, [$ivp + 12]

.Lcbc_no_data:
	ret
	restore

.align	32
.Lcbc_enc_unaligned_out:
	ldd	[%o7 + $mask], $fshift	! shift right params
	mov	0xff, $mask
	srl	$mask, $oalign, $mask
	sub	%g0, $ileft, $iright

	fshiftorx	%f0, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8

	stda	%f6, [$out + $mask]0xc0	! partial store
	orn	%g0, $mask, $mask
	std	%f8, [$out + 8]
	add	$out, 16, $out
	brz	$len, .Lcbc_enc_unaligned_out_done
	sub	$len, 1, $len
	b	.Loop_cbc_enc_unaligned_out
	nop

.align	32
.Loop_cbc_enc_unaligned_out:
	fmovd	%f2, $outhead
	fxor	$in0, %f0, %f0		! inp^ivec^round[0]
	fxor	$in1, %f2, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8

	fmovd	%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd	[$key + 48], %f10	! round[3]
	ldd	[$key + 56], %f12

	ldx	[$inp - 16], %o0
	ldx	[$inp - 8], %o1
	brz	$ileft, .Lcbc_enc_aligned_inp
	movrz	$len, 0, $inc

	ldx	[$inp], %o2
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	sllx	%o1, $ileft, %o1
	or	%g1, %o0, %o0
	srlx	%o2, $iright, %o2
	or	%o2, %o1, %o1

.Lcbc_enc_aligned_inp:
	fmovd	%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd	[$key + 64], %f6	! round[4]
	ldd	[$key + 72], %f8
	add	$key, 64, $end
	sub	$rounds, 16*8, $inner

	stx	%o0, [%sp + LOCALS + 0]
	stx	%o1, [%sp + LOCALS + 8]
	add	$inp, $inc, $inp	! inp+=16
	nop

.Lcbc_enc_unaligned:
	fmovd	%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8

	brnz,a	$inner, .Lcbc_enc_unaligned
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12

	fmovd	%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2

	ldd	[%sp + LOCALS + 0], $in0
	ldd	[%sp + LOCALS + 8], $in1

	fmovd	%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fxor	$r0hi, $in0, $in0	! inp^=round[0]
	fxor	$r0lo, $in1, $in1

	fmovd	%f0, %f4
	faesenclx	%f2, $rlhi, %f0
	faesenclx	%f4, $rllo, %f2

	fshiftorx	$outhead, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8
	std	%f6, [$out + 0]
	std	%f8, [$out + 8]
	add	$out, 16, $out

	brnz,a	$len, .Loop_cbc_enc_unaligned_out
	sub	$len, 1, $len

.Lcbc_enc_unaligned_out_done:
	fshiftorx	%f2, %f2, $fshift, %f8
	stda	%f8, [$out + $mask]0xc0	! partial store

	st	%f0, [$ivp + 0]		! output ivec
	st	%f1, [$ivp + 4]
	st	%f2, [$ivp + 8]
	st	%f3, [$ivp + 12]

	ret
	restore

.align	32
.Lcbc_decrypt:
	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1
	fmovd	%f0, $iv0
	fmovd	%f2, $iv1

.Loop_cbc_dec:
	fxor	$in0, $r0hi, %f0	! inp^round[0]
	fxor	$in1, $r0lo, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8
	add	$key, 32, $end
	sub	$rounds, 16*6, $inner

.Lcbc_dec:
	fmovd	%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8

	brnz,a	$inner, .Lcbc_dec
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12

	fmovd	%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2
	fxor	$iv0, $rlhi, %f6	! ivec^round[last]
	fxor	$iv1, $rllo, %f8
	fmovd	$in0, $iv0
	fmovd	$in1, $iv1

	movrz	$len, 0, $inc
	fmovd	$intail, $in0
	ldd	[$inp - 8], $in1	! load next input block
	ldda	[$inp]0x82, $intail	! non-faulting load
	add	$inp, $inc, $inp	! inp+=16

	fmovd	%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1

	fmovd	%f0, %f4
	faesdeclx	%f2, %f6, %f0
	faesdeclx	%f4, %f8, %f2

	brnz,pn	$oalign, .Lcbc_dec_unaligned_out
	nop

	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	add	$out, 16, $out

	brnz,a	$len, .Loop_cbc_dec
	sub	$len, 1, $len

	st	$iv0, [$ivp + 0]	! output ivec
	st	$iv0#lo, [$ivp + 4]
	st	$iv1, [$ivp + 8]
	st	$iv1#lo, [$ivp + 12]

	ret
	restore

.align	32
.Lcbc_dec_unaligned_out:
	ldd	[%o7 + $mask], $fshift	! shift right params
	mov	0xff, $mask
	srl	$mask, $oalign, $mask
	sub	%g0, $ileft, $iright

	fshiftorx	%f0, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8

	stda	%f6, [$out + $mask]0xc0	! partial store
	orn	%g0, $mask, $mask
	std	%f8, [$out + 8]
	add	$out, 16, $out
	brz	$len, .Lcbc_dec_unaligned_out_done
	sub	$len, 1, $len
	b	.Loop_cbc_dec_unaligned_out
	nop

.align	32
.Loop_cbc_dec_unaligned_out:
	fmovd	%f2, $outhead
	fxor	$in0, $r0hi, %f0	! inp^round[0]
	fxor	$in1, $r0lo, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8

	fmovd	%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd	[$key + 48], %f10	! round[3]
	ldd	[$key + 56], %f12

	ldx	[$inp - 16], %o0
	ldx	[$inp - 8], %o1
	brz	$ileft, .Lcbc_dec_aligned_inp
	movrz	$len, 0, $inc

	ldx	[$inp], %o2
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	sllx	%o1, $ileft, %o1
	or	%g1, %o0, %o0
	srlx	%o2, $iright, %o2
	or	%o2, %o1, %o1

.Lcbc_dec_aligned_inp:
	fmovd	%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2
	ldd	[$key + 64], %f6	! round[4]
	ldd	[$key + 72], %f8
	add	$key, 64, $end
	sub	$rounds, 16*8, $inner

	stx	%o0, [%sp + LOCALS + 0]
	stx	%o1, [%sp + LOCALS + 8]
	add	$inp, $inc, $inp	! inp+=16
	nop

.Lcbc_dec_unaligned:
	fmovd	%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8

	brnz,a	$inner, .Lcbc_dec_unaligned
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12

	fmovd	%f0, %f4
	faesdecx	%f2, %f6, %f0
	faesdecx	%f4, %f8, %f2

	fxor	$iv0, $rlhi, %f6	! ivec^round[last]
	fxor	$iv1, $rllo, %f8
	fmovd	$in0, $iv0
	fmovd	$in1, $iv1
	ldd	[%sp + LOCALS + 0], $in0
	ldd	[%sp + LOCALS + 8], $in1

	fmovd	%f0, %f4
	faesdecx	%f2, %f10, %f0
	faesdecx	%f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fmovd	%f0, %f4
	faesdeclx	%f2, %f6, %f0
	faesdeclx	%f4, %f8, %f2

	fshiftorx	$outhead, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8
	std	%f6, [$out + 0]
	std	%f8, [$out + 8]
	add	$out, 16, $out

	brnz,a	$len, .Loop_cbc_dec_unaligned_out
	sub	$len, 1, $len

.Lcbc_dec_unaligned_out_done:
	fshiftorx	%f2, %f2, $fshift, %f8
	stda	%f8, [$out + $mask]0xc0	! partial store

	st	$iv0, [$ivp + 0]	! output ivec
	st	$iv0#lo, [$ivp + 4]
	st	$iv1, [$ivp + 8]
	st	$iv1#lo, [$ivp + 12]

	ret
	restore
.type	aes_fx_cbc_encrypt,#function
.size	aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt
___
}
{
my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5));
my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
my ($ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
    = map("%f$_",grep { !($_ & 1) } (16 .. 62));
my ($ileft,$iright) = ($ialign, $oalign);
my $one = "%f14";

$code.=<<___;
.globl	aes_fx_ctr32_encrypt_blocks
.align	32
aes_fx_ctr32_encrypt_blocks:
	save	%sp, -STACK_FRAME-16, %sp
	srln	$len, 0, $len
	and	$inp, 7, $ialign
	andn	$inp, 7, $inp
	brz,pn	$len, .Lctr32_no_data
	sll	$ialign, 3, $ileft

.Lpic:	call	.+8
	add	%o7, .Linp_align - .Lpic, %o7

	ld	[$key + 240], $rounds
	and	$out, 7, $oalign
	ld	[$ivp + 0], $ctr0	! load counter
	andn	$out, 7, $out
	ld	[$ivp + 4], $ctr0#lo
	sll	$oalign, 3, $mask
	ld	[$ivp + 8], $ctr1
	ld	[$ivp + 12], $ctr1#lo
	ldd	[%o7 + 128], $one

	sll	$rounds, 4, $rounds
	add	$rounds, $key, $end
	ldd	[$key + 0], $r0hi	! round[0]
	ldd	[$key + 8], $r0lo

	add	$inp, 16, $inp
	sub	$len, 1, $len
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	mov	16, $inc
	movrz	$len, 0, $inc
	ldd	[$end + 0], $rlhi	! round[last]
	ldd	[$end + 8], $rllo

	ldd	[%o7 + $ileft], $fshift	! shiftleft params
	add	%o7, 64, %o7
	ldd	[$inp - 16], $in0	! load input
	ldd	[$inp - 8], $in1
	ldda	[$inp]0x82, $intail	! non-faulting load
	add	$inp, $inc, $inp	! inp+=16

	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1

.Loop_ctr32:
	fxor	$ctr0, $r0hi, %f0	! counter^round[0]
	fxor	$ctr1, $r0lo, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8
	add	$key, 32, $end
	sub	$rounds, 16*6, $inner

.Lctr32_enc:
	fmovd	%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8

	brnz,a	$inner, .Lctr32_enc
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12

	fmovd	%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	fxor	$in0, $rlhi, %f6	! inp^round[last]
	fxor	$in1, $rllo, %f8

	movrz	$len, 0, $inc
	fmovd	$intail, $in0
	ldd	[$inp - 8], $in1	! load next input block
	ldda	[$inp]0x82, $intail	! non-faulting load
	add	$inp, $inc, $inp	! inp+=16

	fmovd	%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fshiftorx	$in0, $in1, $fshift, $in0
	fshiftorx	$in1, $intail, $fshift, $in1
	fpadd32	$ctr1, $one, $ctr1	! increment counter

	fmovd	%f0, %f4
	faesenclx	%f2, %f6, %f0
	faesenclx	%f4, %f8, %f2

	brnz,pn	$oalign, .Lctr32_unaligned_out
	nop

	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	add	$out, 16, $out

	brnz,a	$len, .Loop_ctr32
	sub	$len, 1, $len

.Lctr32_no_data:
	ret
	restore

.align	32
.Lctr32_unaligned_out:
	ldd	[%o7 + $mask], $fshift	! shift right params
	mov	0xff, $mask
	srl	$mask, $oalign, $mask
	sub	%g0, $ileft, $iright

	fshiftorx	%f0, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8

	stda	%f6, [$out + $mask]0xc0	! partial store
	orn	%g0, $mask, $mask
	std	%f8, [$out + 8]
	add	$out, 16, $out
	brz	$len, .Lctr32_unaligned_out_done
	sub	$len, 1, $len
	b	.Loop_ctr32_unaligned_out
	nop

.align	32
.Loop_ctr32_unaligned_out:
	fmovd	%f2, $outhead
	fxor	$ctr0, $r0hi, %f0	! counter^round[0]
	fxor	$ctr1, $r0lo, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8

	fmovd	%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd	[$key + 48], %f10	! round[3]
	ldd	[$key + 56], %f12

	ldx	[$inp - 16], %o0
	ldx	[$inp - 8], %o1
	brz	$ileft, .Lctr32_aligned_inp
	movrz	$len, 0, $inc

	ldx	[$inp], %o2
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	sllx	%o1, $ileft, %o1
	or	%g1, %o0, %o0
	srlx	%o2, $iright, %o2
	or	%o2, %o1, %o1

.Lctr32_aligned_inp:
	fmovd	%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd	[$key + 64], %f6	! round[4]
	ldd	[$key + 72], %f8
	add	$key, 64, $end
	sub	$rounds, 16*8, $inner

	stx	%o0, [%sp + LOCALS + 0]
	stx	%o1, [%sp + LOCALS + 8]
	add	$inp, $inc, $inp	! inp+=16
	nop

.Lctr32_enc_unaligned:
	fmovd	%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8

	brnz,a	$inner, .Lctr32_enc_unaligned
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12
	fpadd32	$ctr1, $one, $ctr1	! increment counter

	fmovd	%f0, %f4
	faesencx	%f2, %f6, %f0
	faesencx	%f4, %f8, %f2
	fxor	$in0, $rlhi, %f6	! inp^round[last]
	fxor	$in1, $rllo, %f8
	ldd	[%sp + LOCALS + 0], $in0
	ldd	[%sp + LOCALS + 8], $in1

	fmovd	%f0, %f4
	faesencx	%f2, %f10, %f0
	faesencx	%f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fmovd	%f0, %f4
	faesenclx	%f2, %f6, %f0
	faesenclx	%f4, %f8, %f2

	fshiftorx	$outhead, %f0, $fshift, %f6
	fshiftorx	%f0, %f2, $fshift, %f8
	std	%f6, [$out + 0]
	std	%f8, [$out + 8]
	add	$out, 16, $out

	brnz,a	$len, .Loop_ctr32_unaligned_out
	sub	$len, 1, $len

.Lctr32_unaligned_out_done:
	fshiftorx	%f2, %f2, $fshift, %f8
	stda	%f8, [$out + $mask]0xc0	! partial store

	ret
	restore
.type	aes_fx_ctr32_encrypt_blocks,#function
.size	aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks

.align	32
.Linp_align:		! fshiftorx parameters for left shift toward %rs1
	.byte	0, 0, 64, 0, 0, 64, 0, -64
	.byte	0, 0, 56, 8, 0, 56, 8, -56
	.byte	0, 0, 48, 16, 0, 48, 16, -48
	.byte	0, 0, 40, 24, 0, 40, 24, -40
	.byte	0, 0, 32, 32, 0, 32, 32, -32
	.byte	0, 0, 24, 40, 0, 24, 40, -24
	.byte	0, 0, 16, 48, 0, 16, 48, -16
	.byte	0, 0, 8, 56, 0, 8, 56, -8
.Lout_align:		! fshiftorx parameters for right shift toward %rs2
	.byte	0, 0, 0, 64, 0, 0, 64, 0
	.byte	0, 0, 8, 56, 0, 8, 56, -8
	.byte	0, 0, 16, 48, 0, 16, 48, -16
	.byte	0, 0, 24, 40, 0, 24, 40, -24
	.byte	0, 0, 32, 32, 0, 32, 32, -32
	.byte	0, 0, 40, 24, 0, 40, 24, -40
	.byte	0, 0, 48, 16, 0, 48, 16, -48
	.byte	0, 0, 56, 8, 0, 56, 8, -56
.Lone:
	.word	0, 1
.asciz	"AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
}
# The purpose of these subroutines is to explicitly encode VIS
# instructions, so that the module can be compiled without specifying
# VIS extensions on the compiler command line, e.g. -xarch=v9 vs.
# -xarch=v9a. The idea is to keep the option of producing a "universal"
# binary and to let the programmer detect at run-time whether the
# current CPU is VIS capable.
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"bshuffle"	=> 0x04c,
		"fpadd32"	=> 0x052,
		"fxor"		=> 0x06c,
		"fsrc2"		=> 0x078 );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"alignaddr"	=> 0x018,
		"bmask"		=> 0x019,
		"alignaddrl"	=> 0x01a );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
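
# For illustration, a worked example of the rewriting performed below
# (comments only, not part of the generated code): going by the opcode
# table and encoding formula in unfx(), the post-processing loop at the
# bottom of this file should turn "faesencx %f2, %f10, %f0" (rs1=2,
# rs2=10, rd=0, opf=0x90) into
#
#	.word	0x81b0920a	! faesencx %f2,%f10,%f0
#
# i.e. 2<<30|0<<25|0x36<<19|2<<14|0x90<<5|10, so the module assembles
# even when the assembler does not recognize the Fujitsu AES mnemonics.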

sub unfx {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"faesencx"	=> 0x90,
		"faesdecx"	=> 0x91,
		"faesenclx"	=> 0x92,
		"faesdeclx"	=> 0x93,
		"faeskeyx"	=> 0x94 );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	$rs2 = ($rs2 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs2;
	$rs2 = oct($rs2) if ($rs2 =~ /^0/);

	foreach ($rs1,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unfx3src {
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %aesopf = ( "fshiftorx" => 0x0b );

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rs3,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x37<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/ge;

    s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge;

    s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&unfx($1,$2,$3,$4)
     /ge or
    s/\b([f][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unfx3src($1,$2,$3,$4,$5)
     /ge or
    s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
     /ge or
    s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
     /ge;
    print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";