; AesOpt.asm -- AES optimized code for x86 AES hardware instructions
; 2021-12-25 : Igor Pavlov : Public domain
;
; NOTE(review): this file was recovered from a numbered listing whose line
; breaks had been lost; the interleaved line numbers were removed and the
; original line structure restored. No instructions were changed.

include 7zAsm.asm

; use_vaes_256 enables the 256-bit (ymm) VAES code paths.
ifdef __ASMC__
  use_vaes_256 equ 1
else
ifdef ymm0
  use_vaes_256 equ 1
endif
endif


ifdef use_vaes_256
  ECHO "++ VAES 256"
else
  ECHO "-- NO VAES 256"
endif

ifdef x64
  ECHO "x86-64"
else
  ECHO "x86"
if (IS_CDECL gt 0)
  ECHO "ABI : CDECL"
else
  ECHO "ABI : no CDECL : FASTCALL"
endif
endif

if (IS_LINUX gt 0)
  ECHO "ABI : LINUX"
else
  ECHO "ABI : WINDOWS"
endif

MY_ASM_START

ifndef x64
    .686
    .xmm
endif


; MY_ALIGN EQU ALIGN(64)
MY_ALIGN EQU

SEG_ALIGN EQU MY_ALIGN

; MY_SEG_PROC / MY_SEG_ENDP: procedure wrappers; the per-function segment
; variant is kept as comments.
MY_SEG_PROC macro name:req, numParams:req
    ; seg_name equ @CatStr(_TEXT$, name)
    ; seg_name SEGMENT SEG_ALIGN 'CODE'
    MY_PROC name, numParams
endm

MY_SEG_ENDP macro
    ; seg_name ENDS
endm


NUM_AES_KEYS_MAX equ 15

; the number of push operators in function PROLOG
if (IS_LINUX eq 0) or (IS_X64 eq 0)
num_regs_push       equ 2
stack_param_offset  equ (REG_SIZE * (1 + num_regs_push))
endif

; num_param: the 3rd parameter (block count), in a register or on the stack
ifdef x64
  num_param equ REG_ABI_PARAM_2
else
  if (IS_CDECL gt 0)
    ; size_t size
    ; void * data
    ; UInt32 * aes
    ; ret-ip <- (r4)
    aes_OFFS  equ (stack_param_offset)
    data_OFFS equ (REG_SIZE + aes_OFFS)
    size_OFFS equ (REG_SIZE + data_OFFS)
    num_param equ [r4 + size_OFFS]
  else
    num_param equ [r4 + stack_param_offset]
  endif
endif

keys    equ REG_PARAM_0     ; r1
rD      equ REG_PARAM_1     ; r2
rN      equ r0

koffs_x equ x7
koffs_r equ r7

ksize_x equ x6
ksize_r equ r6

keys2   equ r3

state     equ xmm0
key       equ xmm0
key_ymm   equ ymm0
key_ymm_n equ 0

; ways: the number of blocks processed in parallel by the wide loops
ifdef x64
  ways = 11
else
  ways = 4
endif

ways_start_reg equ 1

iv     equ @CatStr(xmm, %(ways_start_reg + ways))
iv_ymm equ @CatStr(ymm, %(ways_start_reg + ways))

; WOP: apply (op reg, op2) to each of the `ways` working xmm registers
WOP macro op, op2
  i = 0
  rept ways
    op @CatStr(xmm, %(ways_start_reg + i)), op2
    i = i + 1
  endm
endm


ifndef ABI_LINUX
ifdef x64

; we use 32 bytes of home space in stack in WIN64-x64
NUM_HOME_MM_REGS  equ (32 / 16)
; we preserve xmm registers starting from xmm6 in WIN64-x64
MM_START_SAVE_REG equ 6

; SAVE_XMM: save callee-saved xmm registers (xmm6..) to the WIN64 home
; space first, then to extra stack space (r4 is lowered by stack_offset).
SAVE_XMM macro num_used_mm_regs:req
    num_save_mm_regs = num_used_mm_regs - MM_START_SAVE_REG
    if num_save_mm_regs GT 0
        num_save_mm_regs2 = num_save_mm_regs - NUM_HOME_MM_REGS
        ; RSP is (16*x + 8) after entering the function in WIN64-x64
        stack_offset = 16 * num_save_mm_regs2 + (stack_param_offset mod 16)

        i = 0
        rept num_save_mm_regs

        if i eq NUM_HOME_MM_REGS
            sub     r4, stack_offset
        endif

        if i lt NUM_HOME_MM_REGS
            movdqa  [r4 + stack_param_offset + i * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
        else
            movdqa  [r4 + (i - NUM_HOME_MM_REGS) * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i))
        endif

        i = i + 1
        endm
    endif
endm

; RESTORE_XMM: restore the registers saved by SAVE_XMM and undo the
; stack adjustment made there.
RESTORE_XMM macro num_used_mm_regs:req
    if num_save_mm_regs GT 0
        i = 0
        if num_save_mm_regs2 GT 0
        rept num_save_mm_regs2
            movdqa  @CatStr(xmm, %(MM_START_SAVE_REG + NUM_HOME_MM_REGS + i)), [r4 + i * 16]
            i = i + 1
        endm
        add     r4, stack_offset
        endif

        num_low_regs = num_save_mm_regs - i
        i = 0
        rept num_low_regs
            movdqa  @CatStr(xmm, %(MM_START_SAVE_REG + i)), [r4 + stack_param_offset + i * 16]
            i = i + 1
        endm
    endif
endm

endif ; x64
endif ; ABI_LINUX


; MY_PROLOG: common entry for all procedures in this file.
; On exit: rN = block count, keys = aes pointer, rD = data pointer,
; ksize_x = 32 * (UInt32 value at [keys + 16]).
MY_PROLOG macro num_used_mm_regs:req
    ; num_regs_push: must be equal to the number of push operators
    ; push    r3
    ; push    r5
    if (IS_LINUX eq 0) or (IS_X64 eq 0)
        push    r6
        push    r7
    endif

        mov     rN, num_param   ; don't move it; num_param can use stack pointer (r4)

    if (IS_X64 eq 0)
    if (IS_CDECL gt 0)
        mov     rD, [r4 + data_OFFS]
        mov     keys, [r4 + aes_OFFS]
    endif
    elseif (IS_LINUX gt 0)
        MY_ABI_LINUX_TO_WIN_2
    endif


    ifndef ABI_LINUX
    ifdef x64
        SAVE_XMM num_used_mm_regs
    endif
    endif

        mov     ksize_x, [keys + 16]
        shl     ksize_x, 5
endm


; MY_EPILOG: common exit; restores xmm registers (WIN64) and r6/r7.
MY_EPILOG macro
    ifndef ABI_LINUX
    ifdef x64
        RESTORE_XMM num_save_mm_regs
    endif
    endif

    if (IS_LINUX eq 0) or (IS_X64 eq 0)
        pop     r7
        pop     r6
    endif
    ; pop     r5
    ; pop     r3
    MY_ENDP
endm


; OP_KEY: apply one AES round instruction with the key at [keys + offs]
; to the single-block state register.
OP_KEY macro op:req, offs:req
        op      state, [keys + offs]
endm

; WOP_KEY: load the round key at [keys + offs] once and apply op to all
; `ways` working registers.
WOP_KEY macro op:req, offs:req
        movdqa  key, [keys + offs]
        WOP     op, key
endm


; ---------- AES-CBC Decode ----------
; AesCbc_Decode_HW(UInt32 *aes, Byte *data, size_t numBlocks)
;   [aes] holds the iv (16 bytes); round keys start at [aes + 32];
;   [aes + 16] holds the value from which ksize is derived.
; Decodes `ways` blocks per main-loop iteration, then remaining blocks
; one at a time. The updated iv is stored back to [aes] on exit.

XOR_WITH_DATA macro reg, _ppp_
        pxor    reg, [rD + i * 16]
endm

WRITE_TO_DATA macro reg, _ppp_
        movdqa  [rD + i * 16], reg
endm


; state0 equ @CatStr(xmm, %(ways_start_reg))

key0     equ @CatStr(xmm, %(ways_start_reg + ways + 1))
key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1))

key_last       equ @CatStr(xmm, %(ways_start_reg + ways + 2))
key_last_ymm   equ @CatStr(ymm, %(ways_start_reg + ways + 2))
key_last_ymm_n equ (ways_start_reg + ways + 2)

NUM_CBC_REGS equ (ways_start_reg + ways + 3)


MY_SEG_PROC AesCbc_Decode_HW, 3

  AesCbc_Decode_HW_start::
        MY_PROLOG NUM_CBC_REGS

  AesCbc_Decode_HW_start_2::
        movdqa  iv, [keys]
        add     keys, 32

        movdqa  key0, [keys + 1 * ksize_r]
        movdqa  key_last, [keys]
        sub     ksize_x, 16

        jmp     check2
        align 16
  nextBlocks2:
        WOP     movdqa, [rD + i * 16]
        mov     koffs_x, ksize_x
        ; WOP_KEY pxor, ksize_r + 16
        WOP     pxor, key0
        ; align 16
  @@:
        WOP_KEY aesdec, 1 * koffs_r
        sub     koffs_r, 16
        jnz     @B
        ; WOP_KEY aesdeclast, 0
        WOP     aesdeclast, key_last

        ; CBC chaining: xor each decrypted block with the previous ciphertext
        pxor    @CatStr(xmm, %(ways_start_reg)), iv
        i = 1
        rept ways - 1
          pxor  @CatStr(xmm, %(ways_start_reg + i)), [rD + i * 16 - 16]
          i = i + 1
        endm
        movdqa  iv, [rD + ways * 16 - 16]
        WOP     WRITE_TO_DATA

        add     rD, ways * 16
  AesCbc_Decode_HW_start_3::
  check2:
        sub     rN, ways
        jnc     nextBlocks2
        add     rN, ways

        sub     ksize_x, 16

        jmp     check
  nextBlock:
        movdqa  state, [rD]
        mov     koffs_x, ksize_x
        ; OP_KEY pxor, 1 * ksize_r + 32
        pxor    state, key0
        ; movdqa  state0, [rD]
        ; movdqa  state, key0
        ; pxor    state, state0
  @@:
        OP_KEY  aesdec, 1 * koffs_r + 16
        OP_KEY  aesdec, 1 * koffs_r
        sub     koffs_r, 32
        jnz     @B
        OP_KEY  aesdec, 16
        ; OP_KEY  aesdeclast, 0
        aesdeclast state, key_last

        pxor    state, iv
        movdqa  iv, [rD]
        ; movdqa  iv, state0
        movdqa  [rD], state

        add     rD, 16
  check:
        sub     rN, 1
        jnc     nextBlock

        movdqa  [keys - 32], iv
MY_EPILOG




; ---------- AVX ----------
; VAES (256-bit) variants: each 128-bit round key is broadcast into both
; lanes of a ymm register, so one ymm register processes two blocks.

AVX__WOP_n macro op
  i = 0
  rept ways
    op (ways_start_reg + i)
    i = i + 1
  endm
endm

AVX__WOP macro op
  i = 0
  rept ways
    op @CatStr(ymm, %(ways_start_reg + i))
    i = i + 1
  endm
endm


AVX__WOP_KEY macro op:req, offs:req
        vmovdqa key_ymm, ymmword ptr [keys2 + offs]
        AVX__WOP_n op
endm


AVX__CBC_START macro reg
        ; vpxor   reg, key_ymm, ymmword ptr [rD + 32 * i]
        vpxor   reg, key0_ymm, ymmword ptr [rD + 32 * i]
endm

AVX__CBC_END macro reg
    if i eq 0
        vpxor   reg, reg, iv_ymm
    else
        vpxor   reg, reg, ymmword ptr [rD + i * 32 - 16]
    endif
endm


AVX__WRITE_TO_DATA macro reg
        vmovdqu ymmword ptr [rD + 32 * i], reg
endm

AVX__XOR_WITH_DATA macro reg
        vpxor   reg, reg, ymmword ptr [rD + 32 * i]
endm

AVX__CTR_START macro reg
        vpaddq  iv_ymm, iv_ymm, one_ymm
        ; vpxor   reg, iv_ymm, key_ymm
        vpxor   reg, iv_ymm, key0_ymm
endm


; MY_VAES_INSTR_2: hand-emit a 256-bit VAES instruction as raw VEX bytes,
; for assemblers that lack the vaes* ymm mnemonics.
;   cmd  : opcode byte (0dcH..0dfH = vaesenc/enclast/dec/declast)
;   dest : destination ymm register number
;   a1   : first source register number (encoded in VEX.vvvv)
;   a2   : second source register number (encoded in modrm.rm)
MY_VAES_INSTR_2 macro cmd, dest, a1, a2
        db 0c4H
        db 2 + 040H + 020h * (1 - (a2) / 8) + 080h * (1 - (dest) / 8)
        db 5 + 8 * ((not (a1)) and 15)
        db cmd
        db 0c0H + 8 * ((dest) and 7) + ((a2) and 7)
endm

MY_VAES_INSTR macro cmd, dest, a
        MY_VAES_INSTR_2 cmd, dest, dest, a
endm

MY_vaesenc macro dest, a
        MY_VAES_INSTR 0dcH, dest, a
endm
MY_vaesenclast macro dest, a
        MY_VAES_INSTR 0ddH, dest, a
endm
MY_vaesdec macro dest, a
        MY_VAES_INSTR 0deH, dest, a
endm
MY_vaesdeclast macro dest, a
        MY_VAES_INSTR 0dfH, dest, a
endm


AVX__VAES_DEC macro reg
        MY_vaesdec reg, key_ymm_n
endm

AVX__VAES_DEC_LAST_key_last macro reg
        ; MY_vaesdeclast reg, key_ymm_n
        MY_vaesdeclast reg, key_last_ymm_n
endm

AVX__VAES_ENC macro reg
        MY_vaesenc reg, key_ymm_n
endm

AVX__VAES_ENC_LAST macro reg
        MY_vaesenclast reg, key_ymm_n
endm

AVX__vinserti128_TO_HIGH macro dest, src
        vinserti128 dest, dest, src, 1
endm


; AesCbc_Decode_HW_256: VAES-256 version of AesCbc_Decode_HW.
; Falls back to the xmm code for small buffers (rN < ways * 2) or when
; use_vaes_256 is not defined. Broadcast round keys are built in a
; 32-byte-aligned temporary area on the stack addressed via keys2.
MY_PROC AesCbc_Decode_HW_256, 3
  ifdef use_vaes_256
        MY_PROLOG NUM_CBC_REGS

        cmp     rN, ways * 2
        jb      AesCbc_Decode_HW_start_2

        vmovdqa iv, xmmword ptr [keys]
        add     keys, 32

        vbroadcasti128  key0_ymm, xmmword ptr [keys + 1 * ksize_r]
        vbroadcasti128  key_last_ymm, xmmword ptr [keys]
        sub     ksize_x, 16
        mov     koffs_x, ksize_x
        add     ksize_x, ksize_x

        AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 2) * 32)
        push    keys2
        sub     r4, AVX_STACK_SUB
        ; sub     r4, 32
        ; sub     r4, ksize_r
        ; lea     keys2, [r4 + 32]
        mov     keys2, r4
        and     keys2, -32
  broad:
        vbroadcasti128  key_ymm, xmmword ptr [keys + 1 * koffs_r]
        vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
        sub     koffs_r, 16
        ; jnc     broad
        jnz     broad

        sub     rN, ways * 2

        align 16
  avx_cbcdec_nextBlock2:
        mov     koffs_x, ksize_x
        ; AVX__WOP_KEY AVX__CBC_START, 1 * koffs_r + 32
        AVX__WOP AVX__CBC_START
  @@:
        AVX__WOP_KEY AVX__VAES_DEC, 1 * koffs_r
        sub     koffs_r, 32
        jnz     @B
        ; AVX__WOP_KEY AVX__VAES_DEC_LAST, 0
        AVX__WOP_n AVX__VAES_DEC_LAST_key_last

        AVX__vinserti128_TO_HIGH iv_ymm, xmmword ptr [rD]
        AVX__WOP AVX__CBC_END

        vmovdqa iv, xmmword ptr [rD + ways * 32 - 16]
        AVX__WOP AVX__WRITE_TO_DATA

        add     rD, ways * 32
        sub     rN, ways * 2
        jnc     avx_cbcdec_nextBlock2
        add     rN, ways * 2

        shr     ksize_x, 1

        ; lea     r4, [r4 + 1 * ksize_r + 32]
        add     r4, AVX_STACK_SUB
        pop     keys2

        vzeroupper
        jmp     AesCbc_Decode_HW_start_3
  else
        jmp     AesCbc_Decode_HW_start
  endif
MY_ENDP
MY_SEG_ENDP




; ---------- AES-CBC Encode ----------
; AesCbc_Encode_HW(UInt32 *aes, Byte *data, size_t numBlocks)
; CBC encoding is chained, so blocks are processed one at a time; the
; first CENC_NUM_REG_KEYS round keys are cached in xmm registers.

e0 equ xmm1

CENC_START_KEY    equ 2
CENC_NUM_REG_KEYS equ (3 * 2)
; last_key equ @CatStr(xmm, %(CENC_START_KEY + CENC_NUM_REG_KEYS))

MY_SEG_PROC AesCbc_Encode_HW, 3
        MY_PROLOG (CENC_START_KEY + CENC_NUM_REG_KEYS + 0)

        movdqa  state, [keys]
        add     keys, 32

        i = 0
        rept CENC_NUM_REG_KEYS
          movdqa  @CatStr(xmm, %(CENC_START_KEY + i)), [keys + i * 16]
          i = i + 1
        endm

        add     keys, ksize_r
        neg     ksize_r
        add     ksize_r, (16 * CENC_NUM_REG_KEYS)
        ; movdqa  last_key, [keys]
        jmp     check_e

        align 16
  nextBlock_e:
        movdqa  e0, [rD]
        mov     koffs_r, ksize_r
        pxor    e0, @CatStr(xmm, %(CENC_START_KEY))
        pxor    state, e0

        i = 1
        rept (CENC_NUM_REG_KEYS - 1)
          aesenc  state, @CatStr(xmm, %(CENC_START_KEY + i))
          i = i + 1
        endm

  @@:
        OP_KEY  aesenc, 1 * koffs_r
        OP_KEY  aesenc, 1 * koffs_r + 16
        add     koffs_r, 32
        jnz     @B
        OP_KEY  aesenclast, 0
        ; aesenclast state, last_key

        movdqa  [rD], state
        add     rD, 16
  check_e:
        sub     rN, 1
        jnc     nextBlock_e

        ; movdqa  [keys - 32], state
        movdqa  [keys + 1 * ksize_r - (16 * CENC_NUM_REG_KEYS) - 32], state
MY_EPILOG
MY_SEG_ENDP



; ---------- AES-CTR ----------

ifdef x64
        ; ways = 11
endif


one      equ @CatStr(xmm, %(ways_start_reg + ways + 1))
one_ymm  equ @CatStr(ymm, %(ways_start_reg + ways + 1))
key0     equ @CatStr(xmm, %(ways_start_reg + ways + 2))
key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2))
NUM_CTR_REGS equ (ways_start_reg + ways + 3)

; INIT_CTR: increment the 64-bit counter in iv and copy it into a working
; register (invoked via WOP for each of the `ways` registers).
INIT_CTR macro reg, _ppp_
        paddq   iv, one
        movdqa  reg, iv
endm


; AesCtr_Code_HW(UInt32 *aes, Byte *data, size_t numBlocks)
;   [aes] holds the counter (16 bytes); round keys start at [aes + 32].
; Encrypts the incremented counter and xors it with the data in place.
; Processes `ways` blocks per main-loop iteration, then single blocks.
; The updated counter is stored back to [aes] on exit.
MY_SEG_PROC AesCtr_Code_HW, 3
  Ctr_start::
        MY_PROLOG NUM_CTR_REGS

  Ctr_start_2::
        movdqa  iv, [keys]
        add     keys, 32
        movdqa  key0, [keys]

        add     keys, ksize_r
        neg     ksize_r
        add     ksize_r, 16

  Ctr_start_3::
        mov     koffs_x, 1
        movd    one, koffs_x
        jmp     check2_c

        align 16
  nextBlocks2_c:
        WOP     INIT_CTR, 0
        mov     koffs_r, ksize_r
        ; WOP_KEY pxor, 1 * koffs_r -16
        WOP     pxor, key0
  @@:
        WOP_KEY aesenc, 1 * koffs_r
        add     koffs_r, 16
        jnz     @B
        WOP_KEY aesenclast, 0

        WOP     XOR_WITH_DATA
        WOP     WRITE_TO_DATA
        add     rD, ways * 16
  check2_c:
        sub     rN, ways
        jnc     nextBlocks2_c
        add     rN, ways

        sub     keys, 16
        add     ksize_r, 16

        jmp     check_c

        ; align 16
  nextBlock_c:
        paddq   iv, one
        ; movdqa  state, [keys + 1 * koffs_r - 16]
        movdqa  state, key0
        mov     koffs_r, ksize_r
        pxor    state, iv

  @@:
        OP_KEY  aesenc, 1 * koffs_r
        OP_KEY  aesenc, 1 * koffs_r + 16
        add     koffs_r, 32
        jnz     @B
        OP_KEY  aesenc, 0
        OP_KEY  aesenclast, 16

        pxor    state, [rD]
        movdqa  [rD], state
        add     rD, 16
  check_c:
        sub     rN, 1
        jnc     nextBlock_c

        ; movdqa  [keys - 32], iv
        movdqa  [keys + 1 * ksize_r - 16 - 32], iv
MY_EPILOG


; AesCtr_Code_HW_256: VAES-256 version of AesCtr_Code_HW.
; Falls back to the xmm code for small buffers (rN < ways * 2) or when
; use_vaes_256 is not defined. `one` is doubled (each ymm register holds
; two consecutive counters, so the step is 2). Broadcast round keys are
; built in a 32-byte-aligned temporary stack area addressed via keys2.
MY_PROC AesCtr_Code_HW_256, 3
  ifdef use_vaes_256
        MY_PROLOG NUM_CTR_REGS

        cmp     rN, ways * 2
        jb      Ctr_start_2

        vbroadcasti128  iv_ymm, xmmword ptr [keys]
        add     keys, 32
        vbroadcasti128  key0_ymm, xmmword ptr [keys]
        mov     koffs_x, 1
        vmovd   one, koffs_x
        vpsubq  iv_ymm, iv_ymm, one_ymm
        vpaddq  one, one, one
        AVX__vinserti128_TO_HIGH one_ymm, one

        add     keys, ksize_r
        sub     ksize_x, 16
        neg     ksize_r
        mov     koffs_r, ksize_r
        add     ksize_r, ksize_r

        AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 1) * 32)
        push    keys2
        lea     keys2, [r4 - 32]
        sub     r4, AVX_STACK_SUB
        and     keys2, -32
        vbroadcasti128  key_ymm, xmmword ptr [keys]
        vmovdqa ymmword ptr [keys2], key_ymm
  @@:
        vbroadcasti128  key_ymm, xmmword ptr [keys + 1 * koffs_r]
        vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm
        add     koffs_r, 16
        jnz     @B

        sub     rN, ways * 2

        align 16
  avx_ctr_nextBlock2:
        mov     koffs_r, ksize_r
        AVX__WOP AVX__CTR_START
        ; AVX__WOP_KEY AVX__CTR_START, 1 * koffs_r - 32
  @@:
        AVX__WOP_KEY AVX__VAES_ENC, 1 * koffs_r
        add     koffs_r, 32
        jnz     @B
        AVX__WOP_KEY AVX__VAES_ENC_LAST, 0

        AVX__WOP AVX__XOR_WITH_DATA
        AVX__WOP AVX__WRITE_TO_DATA

        add     rD, ways * 32
        sub     rN, ways * 2
        jnc     avx_ctr_nextBlock2
        add     rN, ways * 2

        ; the high lane holds the larger counter value; write it back as iv
        vextracti128 iv, iv_ymm, 1
        sar     ksize_r, 1

        add     r4, AVX_STACK_SUB
        pop     keys2

        vzeroupper
        jmp     Ctr_start_3
  else
        jmp     Ctr_start
  endif
MY_ENDP
MY_SEG_ENDP

end