;-----------------------------------------------------------------------
; x86-64 NASM source (Intel operand order) targeting the Windows x64 ABI.
;
; Every public routine except ossl_rsaz_avx512ifma_eligible starts with a
; "WIN64 prologue" that parks rdi/rsi in the caller's shadow space and
; copies the Win64 argument registers (rcx, rdx, r8, r9, [rsp+40]) into
; rdi, rsi, rdx, rcx, r8.  The function bodies are written against that
; second register layout.
;
; NOTE(review): the $L$ labels and the prologue/epilogue shape look like
; machine-generated output; if a generator script exists, mirror every
; change made here in that script.
;-----------------------------------------------------------------------
default rel
%define XMMWORD
%define YMMWORD
%define ZMMWORD

; Bits 16, 17, 21 and 31 of the capability word tested below.
; NOTE(review): presumably AVX512F, AVX512DQ, AVX512IFMA and AVX512VL of
; CPUID.(EAX=7,ECX=0):EBX -- confirm against the OPENSSL_ia32cap_P layout.
%define RSAZ_AVX512IFMA_CAPS 0x80230000

EXTERN OPENSSL_ia32cap_P
global ossl_rsaz_avx512ifma_eligible

;-----------------------------------------------------------------------
; ossl_rsaz_avx512ifma_eligible
; In:    nothing
; Out:   eax = RSAZ_AVX512IFMA_CAPS (non-zero) when every bit of that mask
;        is set in the dword at OPENSSL_ia32cap_P+8, otherwise 0
; Clobb: ecx, flags
;-----------------------------------------------------------------------
ALIGN 32
ossl_rsaz_avx512ifma_eligible:
    mov ecx,DWORD[((OPENSSL_ia32cap_P+8))]
    xor eax,eax                         ; default answer: not eligible
    and ecx,RSAZ_AVX512IFMA_CAPS
    cmp ecx,RSAZ_AVX512IFMA_CAPS
    cmove eax,ecx                       ; all required bits present
    DB 0F3h,0C3h                        ;repret

section .text code align=64


global ossl_rsaz_amm52x20_x1_256

;-----------------------------------------------------------------------
; ossl_rsaz_amm52x20_x1_256(res, a, b, m, k0)
; NOTE(review): argument names are descriptive only; the C prototype is
; not visible in this file.
;
; Register roles after the WIN64 prologue:
;   rdi = res   five 32-byte vectors (20 qwords) are stored here
;   rsi = a     20 qwords, read as vectors and a[0] as a scalar
;   rdx = b     20 qwords, consumed one per step (copied to r11)
;   rcx = m     20 qwords, read as vectors and m[0] as a scalar
;   r8  = k0    scalar value (5th argument, taken from [rsp+40])
;   r9  = scalar accumulator "acc"
;   rax = 52-bit mask 0xfffffffffffff
;   ebx = loop counter (5 iterations x 4 unrolled steps = 20 digits)
;   ymm0 = zero; ymm1,ymm16..ymm19 = 20-lane accumulator
;   ymm3 = broadcast b[i]; ymm4 = broadcast y
;
; Each step does:
;   (t2:t0) = a[0]*b[i];  acc += t0;  t2 += carry
;   y       = (acc * k0) & mask52
;   (t1:t0) = y*m[0];     acc += t0;  t2 += t1 + carry
;   acc     = (acc >> 52) | (t2 << 12)
;   R      += lo52(a*b[i]) + lo52(m*y)        (vpmadd52luq)
;   R     >>= one qword lane (zero fill);  acc += R[0]
;   R      += hi52(a*b[i]) + hi52(m*y)        (vpmadd52huq)
; After the loop the lanes are normalised back to 52 bits with carry
; propagation and written to res.
;
; Saves rbx, rbp, r12-r15 on the stack and ends with vzeroupper.
;-----------------------------------------------------------------------
ALIGN 32
ossl_rsaz_amm52x20_x1_256:
    mov QWORD[8+rsp],rdi                ;WIN64 prologue
    mov QWORD[16+rsp],rsi
    mov rax,rsp
$L$SEH_begin_ossl_rsaz_amm52x20_x1_256:
    mov rdi,rcx
    mov rsi,rdx
    mov rdx,r8
    mov rcx,r9
    mov r8,QWORD[40+rsp]                ; 5th argument (k0)



DB 243,15,30,250                        ; endbr64
    push rbx

    push rbp

    push r12

    push r13

    push r14

    push r15

$L$rsaz_amm52x20_x1_256_body:

    ; zero the vector accumulators
    vpxord ymm0,ymm0,ymm0
    vmovdqa64 ymm1,ymm0
    vmovdqa64 ymm16,ymm0
    vmovdqa64 ymm17,ymm0
    vmovdqa64 ymm18,ymm0
    vmovdqa64 ymm19,ymm0

    xor r9d,r9d                         ; acc = 0

    mov r11,rdx                         ; b cursor (rdx is mulx's implicit source)
    mov rax,0xfffffffffffff             ; 52-bit mask


    mov ebx,5

ALIGN 32
$L$loop5:
    ; ---- step for b[4*i + 0] ----
    mov r13,QWORD[r11]

    vpbroadcastq ymm3,r13               ; ymm3 = b[i] in every lane
    mov rdx,QWORD[rsi]
    mulx r12,r13,r13                    ; r12:r13 = a[0]*b[i]
    add r9,r13
    mov r10,r12
    adc r10,0

    mov r13,r8
    imul r13,r9
    and r13,rax                         ; y = (acc*k0) & mask52

    vpbroadcastq ymm4,r13               ; ymm4 = y in every lane
    mov rdx,QWORD[rcx]
    mulx r12,r13,r13                    ; r12:r13 = y*m[0]
    add r9,r13
    adc r10,r12

    shr r9,52
    sal r10,12
    or r9,r10                           ; acc = (acc >> 52) | (t2 << 12)

    vpmadd52luq ymm1,ymm3,YMMWORD[rsi]
    vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi]
    vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi]
    vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi]
    vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi]

    vpmadd52luq ymm1,ymm4,YMMWORD[rcx]
    vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx]
    vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx]
    vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx]
    vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx]

    ; shift the 20-lane accumulator right by one qword, zero-filling the top
    valignq ymm1,ymm16,ymm1,1
    valignq ymm16,ymm17,ymm16,1
    valignq ymm17,ymm18,ymm17,1
    valignq ymm18,ymm19,ymm18,1
    valignq ymm19,ymm0,ymm19,1

    vmovq r13,xmm1
    add r9,r13                          ; acc += R[0]

    vpmadd52huq ymm1,ymm3,YMMWORD[rsi]
    vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi]
    vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi]
    vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi]
    vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi]

    vpmadd52huq ymm1,ymm4,YMMWORD[rcx]
    vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx]
    vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx]
    vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx]
    vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx]

    ; ---- step for b[4*i + 1] ----
    mov r13,QWORD[8+r11]

    vpbroadcastq ymm3,r13
    mov rdx,QWORD[rsi]
    mulx r12,r13,r13
    add r9,r13
    mov r10,r12
    adc r10,0

    mov r13,r8
    imul r13,r9
    and r13,rax

    vpbroadcastq ymm4,r13
    mov rdx,QWORD[rcx]
    mulx r12,r13,r13
    add r9,r13
    adc r10,r12

    shr r9,52
    sal r10,12
    or r9,r10

    vpmadd52luq ymm1,ymm3,YMMWORD[rsi]
    vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi]
    vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi]
    vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi]
    vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi]

    vpmadd52luq ymm1,ymm4,YMMWORD[rcx]
    vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx]
    vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx]
    vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx]
    vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx]


    valignq ymm1,ymm16,ymm1,1
    valignq ymm16,ymm17,ymm16,1
    valignq ymm17,ymm18,ymm17,1
    valignq ymm18,ymm19,ymm18,1
    valignq ymm19,ymm0,ymm19,1

    vmovq r13,xmm1
    add r9,r13

    vpmadd52huq ymm1,ymm3,YMMWORD[rsi]
    vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi]
    vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi]
    vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi]
    vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi]

    vpmadd52huq ymm1,ymm4,YMMWORD[rcx]
    vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx]
    vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx]
    vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx]
    vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx]

    ; ---- step for b[4*i + 2] ----
    mov r13,QWORD[16+r11]

    vpbroadcastq ymm3,r13
    mov rdx,QWORD[rsi]
    mulx r12,r13,r13
    add r9,r13
    mov r10,r12
    adc r10,0

    mov r13,r8
    imul r13,r9
    and r13,rax

    vpbroadcastq ymm4,r13
    mov rdx,QWORD[rcx]
    mulx r12,r13,r13
    add r9,r13
    adc r10,r12

    shr r9,52
    sal r10,12
    or r9,r10

    vpmadd52luq ymm1,ymm3,YMMWORD[rsi]
    vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi]
    vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi]
    vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi]
    vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi]

    vpmadd52luq ymm1,ymm4,YMMWORD[rcx]
    vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx]
    vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx]
    vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx]
    vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx]


    valignq ymm1,ymm16,ymm1,1
    valignq ymm16,ymm17,ymm16,1
    valignq ymm17,ymm18,ymm17,1
    valignq ymm18,ymm19,ymm18,1
    valignq ymm19,ymm0,ymm19,1

    vmovq r13,xmm1
    add r9,r13

    vpmadd52huq ymm1,ymm3,YMMWORD[rsi]
    vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi]
    vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi]
    vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi]
    vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi]

    vpmadd52huq ymm1,ymm4,YMMWORD[rcx]
    vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx]
    vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx]
    vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx]
    vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx]

    ; ---- step for b[4*i + 3] ----
    mov r13,QWORD[24+r11]

    vpbroadcastq ymm3,r13
    mov rdx,QWORD[rsi]
    mulx r12,r13,r13
    add r9,r13
    mov r10,r12
    adc r10,0

    mov r13,r8
    imul r13,r9
    and r13,rax

    vpbroadcastq ymm4,r13
    mov rdx,QWORD[rcx]
    mulx r12,r13,r13
    add r9,r13
    adc r10,r12

    shr r9,52
    sal r10,12
    or r9,r10

    vpmadd52luq ymm1,ymm3,YMMWORD[rsi]
    vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi]
    vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi]
    vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi]
    vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi]

    vpmadd52luq ymm1,ymm4,YMMWORD[rcx]
    vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx]
    vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx]
    vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx]
    vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx]


    valignq ymm1,ymm16,ymm1,1
    valignq ymm16,ymm17,ymm16,1
    valignq ymm17,ymm18,ymm17,1
    valignq ymm18,ymm19,ymm18,1
    valignq ymm19,ymm0,ymm19,1

    vmovq r13,xmm1
    add r9,r13

    vpmadd52huq ymm1,ymm3,YMMWORD[rsi]
    vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi]
    vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi]
    vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi]
    vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi]

    vpmadd52huq ymm1,ymm4,YMMWORD[rcx]
    vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx]
    vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx]
    vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx]
    vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx]
    lea r11,[32+r11]                    ; advance b by four digits
    dec ebx
    jne NEAR $L$loop5

    ; ---- normalisation: bring every lane back to 52 bits ----
    vmovdqa64 ymm4,YMMWORD[$L$mask52x4]

    ; place the scalar accumulator into the lowest qword of R0
    vpbroadcastq ymm3,r9
    vpblendd ymm1,ymm1,ymm3,3


    ; carries = bits above bit 51 of every lane
    vpsrlq ymm24,ymm1,52
    vpsrlq ymm25,ymm16,52
    vpsrlq ymm26,ymm17,52
    vpsrlq ymm27,ymm18,52
    vpsrlq ymm28,ymm19,52

    ; move every carry one lane up (lane 0 receives zero)
    valignq ymm28,ymm28,ymm27,3
    valignq ymm27,ymm27,ymm26,3
    valignq ymm26,ymm26,ymm25,3
    valignq ymm25,ymm25,ymm24,3
    valignq ymm24,ymm24,ymm0,3

    ; drop the carry bits from the lanes ...
    vpandq ymm1,ymm1,ymm4
    vpandq ymm16,ymm16,ymm4
    vpandq ymm17,ymm17,ymm4
    vpandq ymm18,ymm18,ymm4
    vpandq ymm19,ymm19,ymm4

    ; ... and add the carries coming from the lane below
    vpaddq ymm1,ymm1,ymm24
    vpaddq ymm16,ymm16,ymm25
    vpaddq ymm17,ymm17,ymm26
    vpaddq ymm18,ymm18,ymm27
    vpaddq ymm19,ymm19,ymm28


    ; per-lane "mask52 < lane" (lane overflowed 52 bits)
    vpcmpuq k1,ymm4,ymm1,1
    vpcmpuq k2,ymm4,ymm16,1
    vpcmpuq k3,ymm4,ymm17,1
    vpcmpuq k4,ymm4,ymm18,1
    vpcmpuq k5,ymm4,ymm19,1
    kmovb r14d,k1
    kmovb r13d,k2
    kmovb r12d,k3
    kmovb r11d,k4
    kmovb r10d,k5

    ; per-lane "lane == mask52" (a carry-in would ripple through)
    vpcmpuq k1,ymm4,ymm1,0
    vpcmpuq k2,ymm4,ymm16,0
    vpcmpuq k3,ymm4,ymm17,0
    vpcmpuq k4,ymm4,ymm18,0
    vpcmpuq k5,ymm4,ymm19,0
    kmovb r9d,k1
    kmovb r8d,k2
    kmovb ebx,k3
    kmovb ecx,k4
    kmovb edx,k5


    ; pack pairs of 4-bit lane masks into bytes so that add/adc can
    ; propagate the carries across all 20 lanes
    shl r13b,4
    or r14b,r13b
    shl r11b,4
    or r12b,r11b

    add r14b,r14b                       ; overflow mask << 1 (carry goes one lane up)
    adc r12b,r12b
    adc r10b,r10b

    shl r8b,4
    or r9b,r8b
    shl cl,4
    or bl,cl

    add r14b,r9b                        ; ripple through lanes equal to mask52
    adc r12b,bl
    adc r10b,dl

    xor r14b,r9b                        ; lanes that actually receive a carry
    xor r12b,bl
    xor r10b,dl

    kmovb k1,r14d
    shr r14b,4
    kmovb k2,r14d
    kmovb k3,r12d
    shr r12b,4
    kmovb k4,r12d
    kmovb k5,r10d

    ; lane - mask52 == lane + 1 (mod 2^52 after the final masking)
    vpsubq ymm1{k1},ymm1,ymm4
    vpsubq ymm16{k2},ymm16,ymm4
    vpsubq ymm17{k3},ymm17,ymm4
    vpsubq ymm18{k4},ymm18,ymm4
    vpsubq ymm19{k5},ymm19,ymm4

    vpandq ymm1,ymm1,ymm4
    vpandq ymm16,ymm16,ymm4
    vpandq ymm17,ymm17,ymm4
    vpandq ymm18,ymm18,ymm4
    vpandq ymm19,ymm19,ymm4

    vmovdqu64 YMMWORD[rdi],ymm1
    vmovdqu64 YMMWORD[32+rdi],ymm16
    vmovdqu64 YMMWORD[64+rdi],ymm17
    vmovdqu64 YMMWORD[96+rdi],ymm18
    vmovdqu64 YMMWORD[128+rdi],ymm19

    vzeroupper
    ; reload the six registers pushed in the prologue, then drop the frame
    mov r15,QWORD[rsp]

    mov r14,QWORD[8+rsp]

    mov r13,QWORD[16+rsp]

    mov r12,QWORD[24+rsp]

    mov rbp,QWORD[32+rsp]

    mov rbx,QWORD[40+rsp]

    lea rsp,[48+rsp]

$L$rsaz_amm52x20_x1_256_epilogue:
    mov rdi,QWORD[8+rsp]                ;WIN64 epilogue
    mov rsi,QWORD[16+rsp]
    DB 0F3h,0C3h                        ;repret

$L$SEH_end_ossl_rsaz_amm52x20_x1_256:

; Four lanes of the 52-bit mask.  It is read with vmovdqa64, which needs a
; 32-byte aligned address, so the section alignment is requested
; explicitly instead of depending on ALIGN to raise it.
section .data data align=32

ALIGN 32
$L$mask52x4:
    DQ 0xfffffffffffff
    DQ 0xfffffffffffff
    DQ 0xfffffffffffff
    DQ 0xfffffffffffff
section .text code align=64


global ossl_rsaz_amm52x20_x2_256

;-----------------------------------------------------------------------
; ossl_rsaz_amm52x20_x2_256(res, a, b, m, k0)
; NOTE(review): argument names are descriptive only; the C prototype is
; not visible in this file.
;
; Runs two independent instances of the x1 step in one loop.  The second
; instance uses the data 160 bytes (20 qwords) after the first in each of
; res, a, b and m.  Here k0 is a POINTER: [r8] is used for the first
; instance and [r8+8] for the second.
;
; Register roles after the WIN64 prologue:
;   rdi = res (2 x 160 bytes written)   rsi = a     rcx = m
;   r11 = b cursor                      r8  = pointer to the two k0 values
;   r9  = scalar accumulator, instance 0;  r15 = same for instance 1
;   rax = 52-bit mask;  ebx = loop counter (20, one digit per iteration)
;   ymm1,ymm16..ymm19 = accumulator 0;  ymm2,ymm20..ymm23 = accumulator 1
;
; Saves rbx, rbp, r12-r15 on the stack and ends with vzeroupper.
;-----------------------------------------------------------------------
ALIGN 32
ossl_rsaz_amm52x20_x2_256:
    mov QWORD[8+rsp],rdi                ;WIN64 prologue
    mov QWORD[16+rsp],rsi
    mov rax,rsp
$L$SEH_begin_ossl_rsaz_amm52x20_x2_256:
    mov rdi,rcx
    mov rsi,rdx
    mov rdx,r8
    mov rcx,r9
    mov r8,QWORD[40+rsp]                ; 5th argument (pointer to k0 pair)



DB 243,15,30,250                        ; endbr64
    push rbx

    push rbp

    push r12

    push r13

    push r14

    push r15

$L$rsaz_amm52x20_x2_256_body:

    ; zero both accumulator sets
    vpxord ymm0,ymm0,ymm0
    vmovdqa64 ymm1,ymm0
    vmovdqa64 ymm16,ymm0
    vmovdqa64 ymm17,ymm0
    vmovdqa64 ymm18,ymm0
    vmovdqa64 ymm19,ymm0
    vmovdqa64 ymm2,ymm0
    vmovdqa64 ymm20,ymm0
    vmovdqa64 ymm21,ymm0
    vmovdqa64 ymm22,ymm0
    vmovdqa64 ymm23,ymm0

    xor r9d,r9d
    xor r15d,r15d

    mov r11,rdx                         ; b cursor
    mov rax,0xfffffffffffff             ; 52-bit mask

    mov ebx,20

ALIGN 32
$L$loop20:
    ; ---- instance 0 ----
    mov r13,QWORD[r11]

    vpbroadcastq ymm3,r13
    mov rdx,QWORD[rsi]
    mulx r12,r13,r13
    add r9,r13
    mov r10,r12
    adc r10,0

    mov r13,QWORD[r8]                   ; k0 for instance 0
    imul r13,r9
    and r13,rax

    vpbroadcastq ymm4,r13
    mov rdx,QWORD[rcx]
    mulx r12,r13,r13
    add r9,r13
    adc r10,r12

    shr r9,52
    sal r10,12
    or r9,r10

    vpmadd52luq ymm1,ymm3,YMMWORD[rsi]
    vpmadd52luq ymm16,ymm3,YMMWORD[32+rsi]
    vpmadd52luq ymm17,ymm3,YMMWORD[64+rsi]
    vpmadd52luq ymm18,ymm3,YMMWORD[96+rsi]
    vpmadd52luq ymm19,ymm3,YMMWORD[128+rsi]

    vpmadd52luq ymm1,ymm4,YMMWORD[rcx]
    vpmadd52luq ymm16,ymm4,YMMWORD[32+rcx]
    vpmadd52luq ymm17,ymm4,YMMWORD[64+rcx]
    vpmadd52luq ymm18,ymm4,YMMWORD[96+rcx]
    vpmadd52luq ymm19,ymm4,YMMWORD[128+rcx]


    valignq ymm1,ymm16,ymm1,1
    valignq ymm16,ymm17,ymm16,1
    valignq ymm17,ymm18,ymm17,1
    valignq ymm18,ymm19,ymm18,1
    valignq ymm19,ymm0,ymm19,1

    vmovq r13,xmm1
    add r9,r13

    vpmadd52huq ymm1,ymm3,YMMWORD[rsi]
    vpmadd52huq ymm16,ymm3,YMMWORD[32+rsi]
    vpmadd52huq ymm17,ymm3,YMMWORD[64+rsi]
    vpmadd52huq ymm18,ymm3,YMMWORD[96+rsi]
    vpmadd52huq ymm19,ymm3,YMMWORD[128+rsi]

    vpmadd52huq ymm1,ymm4,YMMWORD[rcx]
    vpmadd52huq ymm16,ymm4,YMMWORD[32+rcx]
    vpmadd52huq ymm17,ymm4,YMMWORD[64+rcx]
    vpmadd52huq ymm18,ymm4,YMMWORD[96+rcx]
    vpmadd52huq ymm19,ymm4,YMMWORD[128+rcx]

    ; ---- instance 1 (all operands 160 bytes further on) ----
    mov r13,QWORD[160+r11]

    vpbroadcastq ymm3,r13
    mov rdx,QWORD[160+rsi]
    mulx r12,r13,r13
    add r15,r13
    mov r10,r12
    adc r10,0

    mov r13,QWORD[8+r8]                 ; k0 for instance 1
    imul r13,r15
    and r13,rax

    vpbroadcastq ymm4,r13
    mov rdx,QWORD[160+rcx]
    mulx r12,r13,r13
    add r15,r13
    adc r10,r12

    shr r15,52
    sal r10,12
    or r15,r10

    vpmadd52luq ymm2,ymm3,YMMWORD[160+rsi]
    vpmadd52luq ymm20,ymm3,YMMWORD[192+rsi]
    vpmadd52luq ymm21,ymm3,YMMWORD[224+rsi]
    vpmadd52luq ymm22,ymm3,YMMWORD[256+rsi]
    vpmadd52luq ymm23,ymm3,YMMWORD[288+rsi]

    vpmadd52luq ymm2,ymm4,YMMWORD[160+rcx]
    vpmadd52luq ymm20,ymm4,YMMWORD[192+rcx]
    vpmadd52luq ymm21,ymm4,YMMWORD[224+rcx]
    vpmadd52luq ymm22,ymm4,YMMWORD[256+rcx]
    vpmadd52luq ymm23,ymm4,YMMWORD[288+rcx]


    valignq ymm2,ymm20,ymm2,1
    valignq ymm20,ymm21,ymm20,1
    valignq ymm21,ymm22,ymm21,1
    valignq ymm22,ymm23,ymm22,1
    valignq ymm23,ymm0,ymm23,1

    vmovq r13,xmm2
    add r15,r13

    vpmadd52huq ymm2,ymm3,YMMWORD[160+rsi]
    vpmadd52huq ymm20,ymm3,YMMWORD[192+rsi]
    vpmadd52huq ymm21,ymm3,YMMWORD[224+rsi]
    vpmadd52huq ymm22,ymm3,YMMWORD[256+rsi]
    vpmadd52huq ymm23,ymm3,YMMWORD[288+rsi]

    vpmadd52huq ymm2,ymm4,YMMWORD[160+rcx]
    vpmadd52huq ymm20,ymm4,YMMWORD[192+rcx]
    vpmadd52huq ymm21,ymm4,YMMWORD[224+rcx]
    vpmadd52huq ymm22,ymm4,YMMWORD[256+rcx]
    vpmadd52huq ymm23,ymm4,YMMWORD[288+rcx]
    lea r11,[8+r11]                     ; next digit of b
    dec ebx
    jne NEAR $L$loop20

    vmovdqa64 ymm4,YMMWORD[$L$mask52x4]

    ; ---- normalise accumulator 0 (same sequence as in the x1 routine) ----
    vpbroadcastq ymm3,r9
    vpblendd ymm1,ymm1,ymm3,3



    vpsrlq ymm24,ymm1,52
    vpsrlq ymm25,ymm16,52
    vpsrlq ymm26,ymm17,52
    vpsrlq ymm27,ymm18,52
    vpsrlq ymm28,ymm19,52


    valignq ymm28,ymm28,ymm27,3
    valignq ymm27,ymm27,ymm26,3
    valignq ymm26,ymm26,ymm25,3
    valignq ymm25,ymm25,ymm24,3
    valignq ymm24,ymm24,ymm0,3


    vpandq ymm1,ymm1,ymm4
    vpandq ymm16,ymm16,ymm4
    vpandq ymm17,ymm17,ymm4
    vpandq ymm18,ymm18,ymm4
    vpandq ymm19,ymm19,ymm4


    vpaddq ymm1,ymm1,ymm24
    vpaddq ymm16,ymm16,ymm25
    vpaddq ymm17,ymm17,ymm26
    vpaddq ymm18,ymm18,ymm27
    vpaddq ymm19,ymm19,ymm28



    vpcmpuq k1,ymm4,ymm1,1
    vpcmpuq k2,ymm4,ymm16,1
    vpcmpuq k3,ymm4,ymm17,1
    vpcmpuq k4,ymm4,ymm18,1
    vpcmpuq k5,ymm4,ymm19,1
    kmovb r14d,k1
    kmovb r13d,k2
    kmovb r12d,k3
    kmovb r11d,k4
    kmovb r10d,k5


    vpcmpuq k1,ymm4,ymm1,0
    vpcmpuq k2,ymm4,ymm16,0
    vpcmpuq k3,ymm4,ymm17,0
    vpcmpuq k4,ymm4,ymm18,0
    vpcmpuq k5,ymm4,ymm19,0
    kmovb r9d,k1
    kmovb r8d,k2
    kmovb ebx,k3
    kmovb ecx,k4
    kmovb edx,k5



    shl r13b,4
    or r14b,r13b
    shl r11b,4
    or r12b,r11b

    add r14b,r14b
    adc r12b,r12b
    adc r10b,r10b

    shl r8b,4
    or r9b,r8b
    shl cl,4
    or bl,cl

    add r14b,r9b
    adc r12b,bl
    adc r10b,dl

    xor r14b,r9b
    xor r12b,bl
    xor r10b,dl

    kmovb k1,r14d
    shr r14b,4
    kmovb k2,r14d
    kmovb k3,r12d
    shr r12b,4
    kmovb k4,r12d
    kmovb k5,r10d


    vpsubq ymm1{k1},ymm1,ymm4
    vpsubq ymm16{k2},ymm16,ymm4
    vpsubq ymm17{k3},ymm17,ymm4
    vpsubq ymm18{k4},ymm18,ymm4
    vpsubq ymm19{k5},ymm19,ymm4

    vpandq ymm1,ymm1,ymm4
    vpandq ymm16,ymm16,ymm4
    vpandq ymm17,ymm17,ymm4
    vpandq ymm18,ymm18,ymm4
    vpandq ymm19,ymm19,ymm4

    ; ---- normalise accumulator 1 (r15 was not touched above) ----
    vpbroadcastq ymm3,r15
    vpblendd ymm2,ymm2,ymm3,3



    vpsrlq ymm24,ymm2,52
    vpsrlq ymm25,ymm20,52
    vpsrlq ymm26,ymm21,52
    vpsrlq ymm27,ymm22,52
    vpsrlq ymm28,ymm23,52


    valignq ymm28,ymm28,ymm27,3
    valignq ymm27,ymm27,ymm26,3
    valignq ymm26,ymm26,ymm25,3
    valignq ymm25,ymm25,ymm24,3
    valignq ymm24,ymm24,ymm0,3


    vpandq ymm2,ymm2,ymm4
    vpandq ymm20,ymm20,ymm4
    vpandq ymm21,ymm21,ymm4
    vpandq ymm22,ymm22,ymm4
    vpandq ymm23,ymm23,ymm4


    vpaddq ymm2,ymm2,ymm24
    vpaddq ymm20,ymm20,ymm25
    vpaddq ymm21,ymm21,ymm26
    vpaddq ymm22,ymm22,ymm27
    vpaddq ymm23,ymm23,ymm28



    vpcmpuq k1,ymm4,ymm2,1
    vpcmpuq k2,ymm4,ymm20,1
    vpcmpuq k3,ymm4,ymm21,1
    vpcmpuq k4,ymm4,ymm22,1
    vpcmpuq k5,ymm4,ymm23,1
    kmovb r14d,k1
    kmovb r13d,k2
    kmovb r12d,k3
    kmovb r11d,k4
    kmovb r10d,k5


    vpcmpuq k1,ymm4,ymm2,0
    vpcmpuq k2,ymm4,ymm20,0
    vpcmpuq k3,ymm4,ymm21,0
    vpcmpuq k4,ymm4,ymm22,0
    vpcmpuq k5,ymm4,ymm23,0
    kmovb r9d,k1
    kmovb r8d,k2
    kmovb ebx,k3
    kmovb ecx,k4
    kmovb edx,k5



    shl r13b,4
    or r14b,r13b
    shl r11b,4
    or r12b,r11b

    add r14b,r14b
    adc r12b,r12b
    adc r10b,r10b

    shl r8b,4
    or r9b,r8b
    shl cl,4
    or bl,cl

    add r14b,r9b
    adc r12b,bl
    adc r10b,dl

    xor r14b,r9b
    xor r12b,bl
    xor r10b,dl

    kmovb k1,r14d
    shr r14b,4
    kmovb k2,r14d
    kmovb k3,r12d
    shr r12b,4
    kmovb k4,r12d
    kmovb k5,r10d


    vpsubq ymm2{k1},ymm2,ymm4
    vpsubq ymm20{k2},ymm20,ymm4
    vpsubq ymm21{k3},ymm21,ymm4
    vpsubq ymm22{k4},ymm22,ymm4
    vpsubq ymm23{k5},ymm23,ymm4

    vpandq ymm2,ymm2,ymm4
    vpandq ymm20,ymm20,ymm4
    vpandq ymm21,ymm21,ymm4
    vpandq ymm22,ymm22,ymm4
    vpandq ymm23,ymm23,ymm4

    ; store result 0 ...
    vmovdqu64 YMMWORD[rdi],ymm1
    vmovdqu64 YMMWORD[32+rdi],ymm16
    vmovdqu64 YMMWORD[64+rdi],ymm17
    vmovdqu64 YMMWORD[96+rdi],ymm18
    vmovdqu64 YMMWORD[128+rdi],ymm19

    ; ... and result 1 right behind it
    vmovdqu64 YMMWORD[160+rdi],ymm2
    vmovdqu64 YMMWORD[192+rdi],ymm20
    vmovdqu64 YMMWORD[224+rdi],ymm21
    vmovdqu64 YMMWORD[256+rdi],ymm22
    vmovdqu64 YMMWORD[288+rdi],ymm23

    vzeroupper
    mov r15,QWORD[rsp]

    mov r14,QWORD[8+rsp]

    mov r13,QWORD[16+rsp]

    mov r12,QWORD[24+rsp]

    mov rbp,QWORD[32+rsp]

    mov rbx,QWORD[40+rsp]

    lea rsp,[48+rsp]

$L$rsaz_amm52x20_x2_256_epilogue:
    mov rdi,QWORD[8+rsp]                ;WIN64 epilogue
    mov rsi,QWORD[16+rsp]
    DB 0F3h,0C3h                        ;repret

$L$SEH_end_ossl_rsaz_amm52x20_x2_256:
section .text code align=64


ALIGN 32
global ossl_extract_multiplier_2x20_win5

;-----------------------------------------------------------------------
; ossl_extract_multiplier_2x20_win5(out, table, idx, tbl_idx)
; NOTE(review): argument names are descriptive only; the C prototype is
; not visible in this file.
;
; Register roles after the WIN64 prologue:
;   rdi = out      160 bytes (five 32-byte vectors) are stored here
;   rsi = table    base address; advanced by tbl_idx*160 before the scan
;   rdx = idx      entry to select (compared as a full 64-bit value)
;   rcx = tbl_idx  selects which 160-byte half of each 320-byte entry
;
; Scans 32 entries of 320 bytes each (10240 bytes).  Every entry is
; loaded; the one whose running counter equals idx is blended into
; ymm0..ymm4.  The loop count and the addresses read do not depend on idx.
; If idx matches no counter value (0..31) the output is all zero.
;
; NOTE(review): rdx and rcx are used as 64-bit quantities.  If the C
; prototype declares them as 32-bit ints, the upper halves are not
; guaranteed to be zero by the ABI -- confirm against the caller.
;
; Leaf routine: no stack frame; uses ymm0..ymm4 and ymm16..ymm23 only.
;-----------------------------------------------------------------------
ossl_extract_multiplier_2x20_win5:
    mov QWORD[8+rsp],rdi                ;WIN64 prologue
    mov QWORD[16+rsp],rsi
    mov rax,rsp
$L$SEH_begin_ossl_extract_multiplier_2x20_win5:
    mov rdi,rcx
    mov rsi,rdx
    mov rdx,r8
    mov rcx,r9



DB 243,15,30,250                        ; endbr64
    lea rax,[rcx*4+rcx]
    sal rax,5                           ; rax = tbl_idx * 160
    add rsi,rax

    vmovdqa64 ymm23,YMMWORD[$L$ones]
    vpbroadcastq ymm22,rdx              ; ymm22 = idx in every lane
    lea rax,[10240+rsi]                 ; end of scan: 32 entries * 320 bytes

    vpxor xmm4,xmm4,xmm4
    vmovdqa64 ymm3,ymm4
    vmovdqa64 ymm2,ymm4
    vmovdqa64 ymm1,ymm4
    vmovdqa64 ymm0,ymm4
    vmovdqa64 ymm21,ymm4                ; running entry counter = 0

ALIGN 32
$L$loop:
    vpcmpq k1,ymm22,ymm21,0             ; k1 = (idx == counter)
    add rsi,320
    vpaddq ymm21,ymm21,ymm23            ; counter += 1
    vmovdqu64 ymm16,YMMWORD[((-320))+rsi]
    vmovdqu64 ymm17,YMMWORD[((-288))+rsi]
    vmovdqu64 ymm18,YMMWORD[((-256))+rsi]
    vmovdqu64 ymm19,YMMWORD[((-224))+rsi]
    vmovdqu64 ymm20,YMMWORD[((-192))+rsi]
    vpblendmq ymm0{k1},ymm0,ymm16       ; keep the entry only where k1 is set
    vpblendmq ymm1{k1},ymm1,ymm17
    vpblendmq ymm2{k1},ymm2,ymm18
    vpblendmq ymm3{k1},ymm3,ymm19
    vpblendmq ymm4{k1},ymm4,ymm20
    cmp rax,rsi
    jne NEAR $L$loop

    vmovdqu64 YMMWORD[rdi],ymm0
    vmovdqu64 YMMWORD[32+rdi],ymm1
    vmovdqu64 YMMWORD[64+rdi],ymm2
    vmovdqu64 YMMWORD[96+rdi],ymm3
    vmovdqu64 YMMWORD[128+rdi],ymm4

    mov rdi,QWORD[8+rsp]                ;WIN64 epilogue
    mov rsi,QWORD[16+rsp]
    DB 0F3h,0C3h                        ;repret

$L$SEH_end_ossl_extract_multiplier_2x20_win5:

; Counter increment for the scan above; read with vmovdqa64, so the
; 32-byte alignment is requested explicitly.
section .data data align=32

ALIGN 32
$L$ones:
    DQ 1,1,1,1

; The exception handler is code and must be assembled into an executable
; section.  Without this directive it would land in .data, right behind
; $L$ones, and fault if the exception dispatcher ever called it.
section .text code align=64
EXTERN __imp_RtlVirtualUnwind

;-----------------------------------------------------------------------
; rsaz_def_handler -- Win64 language-specific exception handler shared by
; the routines above (referenced from the .xdata records below).
;
; In (Win64 handler convention): r8 = CONTEXT*, r9 = DISPATCHER_CONTEXT*
; HandlerData (at [r9+56]) holds two image-relative labels: body start and
; epilogue start of the interrupted routine.
;
;   Rip <  body      : nothing pushed yet; the entry rsp was copied to rax
;                      in the WIN64 prologue, so context->Rax is the frame.
;   body <= Rip < epilogue : six registers are on the stack; recover them
;                      from [Rsp .. Rsp+48) and use Rsp+48 as the frame.
;   Rip >= epilogue  : frame already popped; context->Rsp is the frame.
;
; In every case rdi/rsi are recovered from the shadow space at frame+8 and
; frame+16, the fixed-up CONTEXT is copied to disp->ContextRecord and
; RtlVirtualUnwind is called.
; Out: eax = 1 (ExceptionContinueSearch)
;
; Numeric offsets are CONTEXT / DISPATCHER_CONTEXT field offsets.
;-----------------------------------------------------------------------
ALIGN 16
rsaz_def_handler:
    push rsi
    push rdi
    push rbx
    push rbp
    push r12
    push r13
    push r14
    push r15
    pushfq
    sub rsp,64                          ; 32-byte shadow space + 4 stack args

    mov rax,QWORD[120+r8]               ; context->Rax
    mov rbx,QWORD[248+r8]               ; context->Rip

    mov rsi,QWORD[8+r9]                 ; disp->ImageBase
    mov r11,QWORD[56+r9]                ; disp->HandlerData

    mov r10d,DWORD[r11]                 ; HandlerData[0]: body label (RVA)
    lea r10,[r10*1+rsi]
    cmp rbx,r10
    jb NEAR $L$common_seh_tail          ; fault before the pushes

    mov rax,QWORD[152+r8]               ; context->Rsp

    mov r10d,DWORD[4+r11]               ; HandlerData[1]: epilogue label (RVA)
    lea r10,[r10*1+rsi]
    cmp rbx,r10
    jae NEAR $L$common_seh_tail         ; fault after the frame was dropped

    lea rax,[48+rax]                    ; skip the six pushed registers

    mov rbx,QWORD[((-8))+rax]
    mov rbp,QWORD[((-16))+rax]
    mov r12,QWORD[((-24))+rax]
    mov r13,QWORD[((-32))+rax]
    mov r14,QWORD[((-40))+rax]
    mov r15,QWORD[((-48))+rax]
    mov QWORD[144+r8],rbx               ; context->Rbx
    mov QWORD[160+r8],rbp               ; context->Rbp
    mov QWORD[216+r8],r12               ; context->R12
    mov QWORD[224+r8],r13               ; context->R13
    mov QWORD[232+r8],r14               ; context->R14
    mov QWORD[240+r8],r15               ; context->R15

$L$common_seh_tail:
    mov rdi,QWORD[8+rax]                ; rdi/rsi saved by the WIN64 prologue
    mov rsi,QWORD[16+rax]
    mov QWORD[152+r8],rax               ; context->Rsp
    mov QWORD[168+r8],rsi               ; context->Rsi
    mov QWORD[176+r8],rdi               ; context->Rdi

    mov rdi,QWORD[40+r9]                ; disp->ContextRecord (copy destination)
    mov rsi,r8                          ; context (copy source)
    mov ecx,154                         ; 154 qwords = 1232 bytes
    DD 0xa548f3fc                       ; cld; rep movsq

    mov rsi,r9
    xor rcx,rcx                         ; arg1: handler type 0
    mov rdx,QWORD[8+rsi]                ; arg2: disp->ImageBase
    mov r8,QWORD[rsi]                   ; arg3: disp->ControlPc
    mov r9,QWORD[16+rsi]                ; arg4: disp->FunctionEntry
    mov r10,QWORD[40+rsi]               ; disp->ContextRecord
    lea r11,[56+rsi]                    ; &disp->HandlerData
    lea r12,[24+rsi]                    ; &disp->EstablisherFrame
    mov QWORD[32+rsp],r10               ; arg5
    mov QWORD[40+rsp],r11               ; arg6
    mov QWORD[48+rsp],r12               ; arg7
    mov QWORD[56+rsp],rcx               ; arg8 = NULL
    call QWORD[__imp_RtlVirtualUnwind]

    mov eax,1                           ; ExceptionContinueSearch
    add rsp,64
    popfq
    pop r15
    pop r14
    pop r13
    pop r12
    pop rbp
    pop rbx
    pop rdi
    pop rsi
    DB 0F3h,0C3h                        ;repret


; Function table: one (begin, end, unwind-info) triple of image-relative
; addresses per routine covered by the handler.
section .pdata rdata align=4
ALIGN 4
    DD $L$SEH_begin_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase
    DD $L$SEH_end_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase
    DD $L$SEH_info_ossl_rsaz_amm52x20_x1_256 wrt ..imagebase

    DD $L$SEH_begin_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase
    DD $L$SEH_end_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase
    DD $L$SEH_info_ossl_rsaz_amm52x20_x2_256 wrt ..imagebase

    DD $L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
    DD $L$SEH_end_ossl_extract_multiplier_2x20_win5 wrt ..imagebase
    DD $L$SEH_info_ossl_extract_multiplier_2x20_win5 wrt ..imagebase

; Unwind info records.  Header byte 9 = version 1 with the exception-handler
; flag; no unwind codes.  Each record is followed by the handler address and
; the two HandlerData labels (body, epilogue) that rsaz_def_handler reads.
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_ossl_rsaz_amm52x20_x1_256:
DB 9,0,0,0
    DD rsaz_def_handler wrt ..imagebase
    DD $L$rsaz_amm52x20_x1_256_body wrt ..imagebase,$L$rsaz_amm52x20_x1_256_epilogue wrt ..imagebase
$L$SEH_info_ossl_rsaz_amm52x20_x2_256:
DB 9,0,0,0
    DD rsaz_def_handler wrt ..imagebase
    DD $L$rsaz_amm52x20_x2_256_body wrt ..imagebase,$L$rsaz_amm52x20_x2_256_epilogue wrt ..imagebase
; The extract routine pushes nothing, so both labels are its SEH begin:
; Rip is never below it and the handler always takes the "Rip >= epilogue"
; path.
$L$SEH_info_ossl_extract_multiplier_2x20_win5:
DB 9,0,0,0
    DD rsaz_def_handler wrt ..imagebase
    DD $L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase,$L$SEH_begin_ossl_extract_multiplier_2x20_win5 wrt ..imagebase