#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2012, Intel Corporation. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
#
# References:
# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular
#     Exponentiation, Using Advanced Vector Instructions Architectures",
#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,
#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012
# [2] S. Gueron: "Efficient Software Implementations of Modular
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE
#     Proceedings of 9th International Conference on Information Technology:
#     New Generations (ITNG 2012), pp. 821-823 (2012)
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
#     resistant 1024-bit modular exponentiation, for optimizing RSA2048
#     on AVX2 capable x86_64 platforms",
#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest
#
# +13% improvement over original submission by <appro@openssl.org>
#
# rsa2048 sign/sec	OpenSSL 1.0.1	scalar(*)	this
# 2.3GHz Haswell	621		765/+23%	1113/+79%
# 2.3GHz Broadwell(**)	688		1200(***)/+74%	1120/+63%
#
# (*)	if system doesn't support AVX2, for reference purposes;
# (**)	scaled to 2.3GHz to simplify comparison;
# (***)	scalar AD*X code is faster than AVX2 and is preferred code
#	path for Broadwell;

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
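#
# For reference, this is a standard perlasm generator and is normally driven
# by the build system roughly as follows (illustrative invocation only; the
# flavour and output path depend on the target):
#
#	perl rsaz-avx2.pl elf rsaz-avx2.S
#
# i.e. the first argument selects the perlasm "flavour" passed to
# x86_64-xlate.pl and the second names the assembly file to generate.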
$avx = 2;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT = *OUT;

if ($avx>1) {{{
{ # void AMS_WW(
my $rp="%rdi";	# BN_ULONG *rp,
my $ap="%rsi";	# const BN_ULONG *ap,
my $np="%rdx";	# const BN_ULONG *np,
my $n0="%ecx";	# const BN_ULONG n0,
my $rep="%r8d";	# int repeat);

# The registers that hold the accumulated redundant result
# The AMM works on 1024 bit operands, and redundant word size is 29
# Therefore: ceil(1024/29)/4 = 9
my $ACC0="%ymm0";
my $ACC1="%ymm1";
my $ACC2="%ymm2";
my $ACC3="%ymm3";
my $ACC4="%ymm4";
my $ACC5="%ymm5";
my $ACC6="%ymm6";
my $ACC7="%ymm7";
my $ACC8="%ymm8";
my $ACC9="%ymm9";
# Registers that hold the broadcasted words of bp, currently used
my $B1="%ymm10";
my $B2="%ymm11";
# Registers that hold the broadcasted words of Y, currently used
my $Y1="%ymm12";
my $Y2="%ymm13";
# Helper registers
my $TEMP1="%ymm14";
my $AND_MASK="%ymm15";
# alu registers that hold the first words of the ACC
my $r0="%r9";
my $r1="%r10";
my $r2="%r11";
my $r3="%r12";

my $i="%r14d";			# loop counter
my $tmp = "%r15";

my $FrameSize=32*18+32*8;	# place for A^2 and 2*A

my $aap=$r0;
my $tp0="%rbx";
my $tp1=$r3;
my $tpa=$tmp;

$np="%r13";			# reassigned argument

$code.=<<___;
.text

.globl	rsaz_1024_sqr_avx2
.type	rsaz_1024_sqr_avx2,\@function,5
.align	64
rsaz_1024_sqr_avx2:		# 702 cycles, 14% faster than rsaz_1024_mul_avx2
.cfi_startproc
	lea	(%rsp), %rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	vmovaps	%xmm6,-0xd8(%rax)
	vmovaps	%xmm7,-0xc8(%rax)
	vmovaps	%xmm8,-0xb8(%rax)
	vmovaps	%xmm9,-0xa8(%rax)
	vmovaps	%xmm10,-0x98(%rax)
	vmovaps	%xmm11,-0x88(%rax)
	vmovaps	%xmm12,-0x78(%rax)
	vmovaps	%xmm13,-0x68(%rax)
	vmovaps	%xmm14,-0x58(%rax)
	vmovaps	%xmm15,-0x48(%rax)
.Lsqr_1024_body:
___
$code.=<<___;
	mov	%rax,%rbp
.cfi_def_cfa_register	%rbp
	mov	%rdx, $np		# reassigned argument
	sub	\$$FrameSize, %rsp
	mov	$np, $tmp
	sub	\$-128, $rp		# size optimization
	sub	\$-128, $ap
	sub	\$-128, $np

	and	\$4095, $tmp		# see if $np crosses page
	add	\$32*10, $tmp
	shr	\$12, $tmp
	vpxor	$ACC9,$ACC9,$ACC9
	jz	.Lsqr_1024_no_n_copy

	# unaligned 256-bit load that crosses page boundary can
	# cause >2x performance degradation here, so if $np does
	# cross page boundary, copy it to stack and make sure stack
	# frame doesn't...
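	# (Informal recap of the number format used throughout, per the
	# register comments above: a 1024-bit operand is split into
	# ceil(1024/29) = 36 digits of 29 bits, i.e. digit j holds bits
	# 29*j .. 29*j+28 of the value.  Each digit lives in its own
	# 64-bit lane, so the 36 digits occupy 9 ymm registers, and the
	# spare upper bits of each lane absorb carries between the
	# periodic normalization steps that mask with .Land_mask.)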
165 sub \$32*10,%rsp 166 vmovdqu 32*0-128($np), $ACC0 167 and \$-2048, %rsp 168 vmovdqu 32*1-128($np), $ACC1 169 vmovdqu 32*2-128($np), $ACC2 170 vmovdqu 32*3-128($np), $ACC3 171 vmovdqu 32*4-128($np), $ACC4 172 vmovdqu 32*5-128($np), $ACC5 173 vmovdqu 32*6-128($np), $ACC6 174 vmovdqu 32*7-128($np), $ACC7 175 vmovdqu 32*8-128($np), $ACC8 176 lea $FrameSize+128(%rsp),$np 177 vmovdqu $ACC0, 32*0-128($np) 178 vmovdqu $ACC1, 32*1-128($np) 179 vmovdqu $ACC2, 32*2-128($np) 180 vmovdqu $ACC3, 32*3-128($np) 181 vmovdqu $ACC4, 32*4-128($np) 182 vmovdqu $ACC5, 32*5-128($np) 183 vmovdqu $ACC6, 32*6-128($np) 184 vmovdqu $ACC7, 32*7-128($np) 185 vmovdqu $ACC8, 32*8-128($np) 186 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero 187 188.Lsqr_1024_no_n_copy: 189 and \$-1024, %rsp 190 191 vmovdqu 32*1-128($ap), $ACC1 192 vmovdqu 32*2-128($ap), $ACC2 193 vmovdqu 32*3-128($ap), $ACC3 194 vmovdqu 32*4-128($ap), $ACC4 195 vmovdqu 32*5-128($ap), $ACC5 196 vmovdqu 32*6-128($ap), $ACC6 197 vmovdqu 32*7-128($ap), $ACC7 198 vmovdqu 32*8-128($ap), $ACC8 199 200 lea 192(%rsp), $tp0 # 64+128=192 201 vmovdqu .Land_mask(%rip), $AND_MASK 202 jmp .LOOP_GRANDE_SQR_1024 203 204.align 32 205.LOOP_GRANDE_SQR_1024: 206 lea 32*18+128(%rsp), $aap # size optimization 207 lea 448(%rsp), $tp1 # 64+128+256=448 208 209 # the squaring is performed as described in Variant B of 210 # "Speeding up Big-Number Squaring", so start by calculating 211 # the A*2=A+A vector 212 vpaddq $ACC1, $ACC1, $ACC1 213 vpbroadcastq 32*0-128($ap), $B1 214 vpaddq $ACC2, $ACC2, $ACC2 215 vmovdqa $ACC1, 32*0-128($aap) 216 vpaddq $ACC3, $ACC3, $ACC3 217 vmovdqa $ACC2, 32*1-128($aap) 218 vpaddq $ACC4, $ACC4, $ACC4 219 vmovdqa $ACC3, 32*2-128($aap) 220 vpaddq $ACC5, $ACC5, $ACC5 221 vmovdqa $ACC4, 32*3-128($aap) 222 vpaddq $ACC6, $ACC6, $ACC6 223 vmovdqa $ACC5, 32*4-128($aap) 224 vpaddq $ACC7, $ACC7, $ACC7 225 vmovdqa $ACC6, 32*5-128($aap) 226 vpaddq $ACC8, $ACC8, $ACC8 227 vmovdqa $ACC7, 32*6-128($aap) 228 vpxor $ACC9, $ACC9, $ACC9 229 vmovdqa $ACC8, 32*7-128($aap) 230 231 vpmuludq 32*0-128($ap), $B1, $ACC0 232 vpbroadcastq 32*1-128($ap), $B2 233 vmovdqu $ACC9, 32*9-192($tp0) # zero upper half 234 vpmuludq $B1, $ACC1, $ACC1 235 vmovdqu $ACC9, 32*10-448($tp1) 236 vpmuludq $B1, $ACC2, $ACC2 237 vmovdqu $ACC9, 32*11-448($tp1) 238 vpmuludq $B1, $ACC3, $ACC3 239 vmovdqu $ACC9, 32*12-448($tp1) 240 vpmuludq $B1, $ACC4, $ACC4 241 vmovdqu $ACC9, 32*13-448($tp1) 242 vpmuludq $B1, $ACC5, $ACC5 243 vmovdqu $ACC9, 32*14-448($tp1) 244 vpmuludq $B1, $ACC6, $ACC6 245 vmovdqu $ACC9, 32*15-448($tp1) 246 vpmuludq $B1, $ACC7, $ACC7 247 vmovdqu $ACC9, 32*16-448($tp1) 248 vpmuludq $B1, $ACC8, $ACC8 249 vpbroadcastq 32*2-128($ap), $B1 250 vmovdqu $ACC9, 32*17-448($tp1) 251 252 mov $ap, $tpa 253 mov \$4, $i 254 jmp .Lsqr_entry_1024 255___ 256$TEMP0=$Y1; 257$TEMP2=$Y2; 258$code.=<<___; 259.align 32 260.LOOP_SQR_1024: 261 vpbroadcastq 32*1-128($tpa), $B2 262 vpmuludq 32*0-128($ap), $B1, $ACC0 263 vpaddq 32*0-192($tp0), $ACC0, $ACC0 264 vpmuludq 32*0-128($aap), $B1, $ACC1 265 vpaddq 32*1-192($tp0), $ACC1, $ACC1 266 vpmuludq 32*1-128($aap), $B1, $ACC2 267 vpaddq 32*2-192($tp0), $ACC2, $ACC2 268 vpmuludq 32*2-128($aap), $B1, $ACC3 269 vpaddq 32*3-192($tp0), $ACC3, $ACC3 270 vpmuludq 32*3-128($aap), $B1, $ACC4 271 vpaddq 32*4-192($tp0), $ACC4, $ACC4 272 vpmuludq 32*4-128($aap), $B1, $ACC5 273 vpaddq 32*5-192($tp0), $ACC5, $ACC5 274 vpmuludq 32*5-128($aap), $B1, $ACC6 275 vpaddq 32*6-192($tp0), $ACC6, $ACC6 276 vpmuludq 32*6-128($aap), $B1, $ACC7 277 vpaddq 32*7-192($tp0), $ACC7, $ACC7 278 
vpmuludq 32*7-128($aap), $B1, $ACC8 279 vpbroadcastq 32*2-128($tpa), $B1 280 vpaddq 32*8-192($tp0), $ACC8, $ACC8 281.Lsqr_entry_1024: 282 vmovdqu $ACC0, 32*0-192($tp0) 283 vmovdqu $ACC1, 32*1-192($tp0) 284 285 vpmuludq 32*1-128($ap), $B2, $TEMP0 286 vpaddq $TEMP0, $ACC2, $ACC2 287 vpmuludq 32*1-128($aap), $B2, $TEMP1 288 vpaddq $TEMP1, $ACC3, $ACC3 289 vpmuludq 32*2-128($aap), $B2, $TEMP2 290 vpaddq $TEMP2, $ACC4, $ACC4 291 vpmuludq 32*3-128($aap), $B2, $TEMP0 292 vpaddq $TEMP0, $ACC5, $ACC5 293 vpmuludq 32*4-128($aap), $B2, $TEMP1 294 vpaddq $TEMP1, $ACC6, $ACC6 295 vpmuludq 32*5-128($aap), $B2, $TEMP2 296 vpaddq $TEMP2, $ACC7, $ACC7 297 vpmuludq 32*6-128($aap), $B2, $TEMP0 298 vpaddq $TEMP0, $ACC8, $ACC8 299 vpmuludq 32*7-128($aap), $B2, $ACC0 300 vpbroadcastq 32*3-128($tpa), $B2 301 vpaddq 32*9-192($tp0), $ACC0, $ACC0 302 303 vmovdqu $ACC2, 32*2-192($tp0) 304 vmovdqu $ACC3, 32*3-192($tp0) 305 306 vpmuludq 32*2-128($ap), $B1, $TEMP2 307 vpaddq $TEMP2, $ACC4, $ACC4 308 vpmuludq 32*2-128($aap), $B1, $TEMP0 309 vpaddq $TEMP0, $ACC5, $ACC5 310 vpmuludq 32*3-128($aap), $B1, $TEMP1 311 vpaddq $TEMP1, $ACC6, $ACC6 312 vpmuludq 32*4-128($aap), $B1, $TEMP2 313 vpaddq $TEMP2, $ACC7, $ACC7 314 vpmuludq 32*5-128($aap), $B1, $TEMP0 315 vpaddq $TEMP0, $ACC8, $ACC8 316 vpmuludq 32*6-128($aap), $B1, $TEMP1 317 vpaddq $TEMP1, $ACC0, $ACC0 318 vpmuludq 32*7-128($aap), $B1, $ACC1 319 vpbroadcastq 32*4-128($tpa), $B1 320 vpaddq 32*10-448($tp1), $ACC1, $ACC1 321 322 vmovdqu $ACC4, 32*4-192($tp0) 323 vmovdqu $ACC5, 32*5-192($tp0) 324 325 vpmuludq 32*3-128($ap), $B2, $TEMP0 326 vpaddq $TEMP0, $ACC6, $ACC6 327 vpmuludq 32*3-128($aap), $B2, $TEMP1 328 vpaddq $TEMP1, $ACC7, $ACC7 329 vpmuludq 32*4-128($aap), $B2, $TEMP2 330 vpaddq $TEMP2, $ACC8, $ACC8 331 vpmuludq 32*5-128($aap), $B2, $TEMP0 332 vpaddq $TEMP0, $ACC0, $ACC0 333 vpmuludq 32*6-128($aap), $B2, $TEMP1 334 vpaddq $TEMP1, $ACC1, $ACC1 335 vpmuludq 32*7-128($aap), $B2, $ACC2 336 vpbroadcastq 32*5-128($tpa), $B2 337 vpaddq 32*11-448($tp1), $ACC2, $ACC2 338 339 vmovdqu $ACC6, 32*6-192($tp0) 340 vmovdqu $ACC7, 32*7-192($tp0) 341 342 vpmuludq 32*4-128($ap), $B1, $TEMP0 343 vpaddq $TEMP0, $ACC8, $ACC8 344 vpmuludq 32*4-128($aap), $B1, $TEMP1 345 vpaddq $TEMP1, $ACC0, $ACC0 346 vpmuludq 32*5-128($aap), $B1, $TEMP2 347 vpaddq $TEMP2, $ACC1, $ACC1 348 vpmuludq 32*6-128($aap), $B1, $TEMP0 349 vpaddq $TEMP0, $ACC2, $ACC2 350 vpmuludq 32*7-128($aap), $B1, $ACC3 351 vpbroadcastq 32*6-128($tpa), $B1 352 vpaddq 32*12-448($tp1), $ACC3, $ACC3 353 354 vmovdqu $ACC8, 32*8-192($tp0) 355 vmovdqu $ACC0, 32*9-192($tp0) 356 lea 8($tp0), $tp0 357 358 vpmuludq 32*5-128($ap), $B2, $TEMP2 359 vpaddq $TEMP2, $ACC1, $ACC1 360 vpmuludq 32*5-128($aap), $B2, $TEMP0 361 vpaddq $TEMP0, $ACC2, $ACC2 362 vpmuludq 32*6-128($aap), $B2, $TEMP1 363 vpaddq $TEMP1, $ACC3, $ACC3 364 vpmuludq 32*7-128($aap), $B2, $ACC4 365 vpbroadcastq 32*7-128($tpa), $B2 366 vpaddq 32*13-448($tp1), $ACC4, $ACC4 367 368 vmovdqu $ACC1, 32*10-448($tp1) 369 vmovdqu $ACC2, 32*11-448($tp1) 370 371 vpmuludq 32*6-128($ap), $B1, $TEMP0 372 vpaddq $TEMP0, $ACC3, $ACC3 373 vpmuludq 32*6-128($aap), $B1, $TEMP1 374 vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1 375 vpaddq $TEMP1, $ACC4, $ACC4 376 vpmuludq 32*7-128($aap), $B1, $ACC5 377 vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration 378 vpaddq 32*14-448($tp1), $ACC5, $ACC5 379 380 vmovdqu $ACC3, 32*12-448($tp1) 381 vmovdqu $ACC4, 32*13-448($tp1) 382 lea 8($tpa), $tpa 383 384 vpmuludq 32*7-128($ap), $B2, $TEMP0 385 vpaddq $TEMP0, $ACC5, $ACC5 386 vpmuludq 
32*7-128($aap), $B2, $ACC6 387 vpaddq 32*15-448($tp1), $ACC6, $ACC6 388 389 vpmuludq 32*8-128($ap), $ACC0, $ACC7 390 vmovdqu $ACC5, 32*14-448($tp1) 391 vpaddq 32*16-448($tp1), $ACC7, $ACC7 392 vmovdqu $ACC6, 32*15-448($tp1) 393 vmovdqu $ACC7, 32*16-448($tp1) 394 lea 8($tp1), $tp1 395 396 dec $i 397 jnz .LOOP_SQR_1024 398___ 399$ZERO = $ACC9; 400$TEMP0 = $B1; 401$TEMP2 = $B2; 402$TEMP3 = $Y1; 403$TEMP4 = $Y2; 404$code.=<<___; 405 # we need to fix indices 32-39 to avoid overflow 406 vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0), 407 vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0) 408 vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0) 409 lea 192(%rsp), $tp0 # 64+128=192 410 411 vpsrlq \$29, $ACC8, $TEMP1 412 vpand $AND_MASK, $ACC8, $ACC8 413 vpsrlq \$29, $ACC1, $TEMP2 414 vpand $AND_MASK, $ACC1, $ACC1 415 416 vpermq \$0x93, $TEMP1, $TEMP1 417 vpxor $ZERO, $ZERO, $ZERO 418 vpermq \$0x93, $TEMP2, $TEMP2 419 420 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 421 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 422 vpaddq $TEMP0, $ACC8, $ACC8 423 vpblendd \$3, $TEMP2, $ZERO, $TEMP2 424 vpaddq $TEMP1, $ACC1, $ACC1 425 vpaddq $TEMP2, $ACC2, $ACC2 426 vmovdqu $ACC1, 32*9-192($tp0) 427 vmovdqu $ACC2, 32*10-192($tp0) 428 429 mov (%rsp), %rax 430 mov 8(%rsp), $r1 431 mov 16(%rsp), $r2 432 mov 24(%rsp), $r3 433 vmovdqu 32*1(%rsp), $ACC1 434 vmovdqu 32*2-192($tp0), $ACC2 435 vmovdqu 32*3-192($tp0), $ACC3 436 vmovdqu 32*4-192($tp0), $ACC4 437 vmovdqu 32*5-192($tp0), $ACC5 438 vmovdqu 32*6-192($tp0), $ACC6 439 vmovdqu 32*7-192($tp0), $ACC7 440 441 mov %rax, $r0 442 imull $n0, %eax 443 and \$0x1fffffff, %eax 444 vmovd %eax, $Y1 445 446 mov %rax, %rdx 447 imulq -128($np), %rax 448 vpbroadcastq $Y1, $Y1 449 add %rax, $r0 450 mov %rdx, %rax 451 imulq 8-128($np), %rax 452 shr \$29, $r0 453 add %rax, $r1 454 mov %rdx, %rax 455 imulq 16-128($np), %rax 456 add $r0, $r1 457 add %rax, $r2 458 imulq 24-128($np), %rdx 459 add %rdx, $r3 460 461 mov $r1, %rax 462 imull $n0, %eax 463 and \$0x1fffffff, %eax 464 465 mov \$9, $i 466 jmp .LOOP_REDUCE_1024 467 468.align 32 469.LOOP_REDUCE_1024: 470 vmovd %eax, $Y2 471 vpbroadcastq $Y2, $Y2 472 473 vpmuludq 32*1-128($np), $Y1, $TEMP0 474 mov %rax, %rdx 475 imulq -128($np), %rax 476 vpaddq $TEMP0, $ACC1, $ACC1 477 add %rax, $r1 478 vpmuludq 32*2-128($np), $Y1, $TEMP1 479 mov %rdx, %rax 480 imulq 8-128($np), %rax 481 vpaddq $TEMP1, $ACC2, $ACC2 482 vpmuludq 32*3-128($np), $Y1, $TEMP2 483 .byte 0x67 484 add %rax, $r2 485 .byte 0x67 486 mov %rdx, %rax 487 imulq 16-128($np), %rax 488 shr \$29, $r1 489 vpaddq $TEMP2, $ACC3, $ACC3 490 vpmuludq 32*4-128($np), $Y1, $TEMP0 491 add %rax, $r3 492 add $r1, $r2 493 vpaddq $TEMP0, $ACC4, $ACC4 494 vpmuludq 32*5-128($np), $Y1, $TEMP1 495 mov $r2, %rax 496 imull $n0, %eax 497 vpaddq $TEMP1, $ACC5, $ACC5 498 vpmuludq 32*6-128($np), $Y1, $TEMP2 499 and \$0x1fffffff, %eax 500 vpaddq $TEMP2, $ACC6, $ACC6 501 vpmuludq 32*7-128($np), $Y1, $TEMP0 502 vpaddq $TEMP0, $ACC7, $ACC7 503 vpmuludq 32*8-128($np), $Y1, $TEMP1 504 vmovd %eax, $Y1 505 #vmovdqu 32*1-8-128($np), $TEMP2 # moved below 506 vpaddq $TEMP1, $ACC8, $ACC8 507 #vmovdqu 32*2-8-128($np), $TEMP0 # moved below 508 vpbroadcastq $Y1, $Y1 509 510 vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above 511 vmovdqu 32*3-8-128($np), $TEMP1 512 mov %rax, %rdx 513 imulq -128($np), %rax 514 vpaddq $TEMP2, $ACC1, $ACC1 515 vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above 516 vmovdqu 32*4-8-128($np), $TEMP2 517 add %rax, $r2 518 mov %rdx, %rax 519 imulq 8-128($np), %rax 520 vpaddq $TEMP0, $ACC2, $ACC2 521 add $r3, %rax 522 shr \$29, $r2 523 
vpmuludq $Y2, $TEMP1, $TEMP1 524 vmovdqu 32*5-8-128($np), $TEMP0 525 add $r2, %rax 526 vpaddq $TEMP1, $ACC3, $ACC3 527 vpmuludq $Y2, $TEMP2, $TEMP2 528 vmovdqu 32*6-8-128($np), $TEMP1 529 .byte 0x67 530 mov %rax, $r3 531 imull $n0, %eax 532 vpaddq $TEMP2, $ACC4, $ACC4 533 vpmuludq $Y2, $TEMP0, $TEMP0 534 .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2 535 and \$0x1fffffff, %eax 536 vpaddq $TEMP0, $ACC5, $ACC5 537 vpmuludq $Y2, $TEMP1, $TEMP1 538 vmovdqu 32*8-8-128($np), $TEMP0 539 vpaddq $TEMP1, $ACC6, $ACC6 540 vpmuludq $Y2, $TEMP2, $TEMP2 541 vmovdqu 32*9-8-128($np), $ACC9 542 vmovd %eax, $ACC0 # borrow ACC0 for Y2 543 imulq -128($np), %rax 544 vpaddq $TEMP2, $ACC7, $ACC7 545 vpmuludq $Y2, $TEMP0, $TEMP0 546 vmovdqu 32*1-16-128($np), $TEMP1 547 vpbroadcastq $ACC0, $ACC0 548 vpaddq $TEMP0, $ACC8, $ACC8 549 vpmuludq $Y2, $ACC9, $ACC9 550 vmovdqu 32*2-16-128($np), $TEMP2 551 add %rax, $r3 552 553___ 554($ACC0,$Y2)=($Y2,$ACC0); 555$code.=<<___; 556 vmovdqu 32*1-24-128($np), $ACC0 557 vpmuludq $Y1, $TEMP1, $TEMP1 558 vmovdqu 32*3-16-128($np), $TEMP0 559 vpaddq $TEMP1, $ACC1, $ACC1 560 vpmuludq $Y2, $ACC0, $ACC0 561 vpmuludq $Y1, $TEMP2, $TEMP2 562 .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1 563 vpaddq $ACC1, $ACC0, $ACC0 564 vpaddq $TEMP2, $ACC2, $ACC2 565 vpmuludq $Y1, $TEMP0, $TEMP0 566 vmovdqu 32*5-16-128($np), $TEMP2 567 .byte 0x67 568 vmovq $ACC0, %rax 569 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 570 vpaddq $TEMP0, $ACC3, $ACC3 571 vpmuludq $Y1, $TEMP1, $TEMP1 572 vmovdqu 32*6-16-128($np), $TEMP0 573 vpaddq $TEMP1, $ACC4, $ACC4 574 vpmuludq $Y1, $TEMP2, $TEMP2 575 vmovdqu 32*7-16-128($np), $TEMP1 576 vpaddq $TEMP2, $ACC5, $ACC5 577 vpmuludq $Y1, $TEMP0, $TEMP0 578 vmovdqu 32*8-16-128($np), $TEMP2 579 vpaddq $TEMP0, $ACC6, $ACC6 580 vpmuludq $Y1, $TEMP1, $TEMP1 581 shr \$29, $r3 582 vmovdqu 32*9-16-128($np), $TEMP0 583 add $r3, %rax 584 vpaddq $TEMP1, $ACC7, $ACC7 585 vpmuludq $Y1, $TEMP2, $TEMP2 586 #vmovdqu 32*2-24-128($np), $TEMP1 # moved below 587 mov %rax, $r0 588 imull $n0, %eax 589 vpaddq $TEMP2, $ACC8, $ACC8 590 vpmuludq $Y1, $TEMP0, $TEMP0 591 and \$0x1fffffff, %eax 592 vmovd %eax, $Y1 593 vmovdqu 32*3-24-128($np), $TEMP2 594 .byte 0x67 595 vpaddq $TEMP0, $ACC9, $ACC9 596 vpbroadcastq $Y1, $Y1 597 598 vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above 599 vmovdqu 32*4-24-128($np), $TEMP0 600 mov %rax, %rdx 601 imulq -128($np), %rax 602 mov 8(%rsp), $r1 603 vpaddq $TEMP1, $ACC2, $ACC1 604 vpmuludq $Y2, $TEMP2, $TEMP2 605 vmovdqu 32*5-24-128($np), $TEMP1 606 add %rax, $r0 607 mov %rdx, %rax 608 imulq 8-128($np), %rax 609 .byte 0x67 610 shr \$29, $r0 611 mov 16(%rsp), $r2 612 vpaddq $TEMP2, $ACC3, $ACC2 613 vpmuludq $Y2, $TEMP0, $TEMP0 614 vmovdqu 32*6-24-128($np), $TEMP2 615 add %rax, $r1 616 mov %rdx, %rax 617 imulq 16-128($np), %rax 618 vpaddq $TEMP0, $ACC4, $ACC3 619 vpmuludq $Y2, $TEMP1, $TEMP1 620 vmovdqu 32*7-24-128($np), $TEMP0 621 imulq 24-128($np), %rdx # future $r3 622 add %rax, $r2 623 lea ($r0,$r1), %rax 624 vpaddq $TEMP1, $ACC5, $ACC4 625 vpmuludq $Y2, $TEMP2, $TEMP2 626 vmovdqu 32*8-24-128($np), $TEMP1 627 mov %rax, $r1 628 imull $n0, %eax 629 vpmuludq $Y2, $TEMP0, $TEMP0 630 vpaddq $TEMP2, $ACC6, $ACC5 631 vmovdqu 32*9-24-128($np), $TEMP2 632 and \$0x1fffffff, %eax 633 vpaddq $TEMP0, $ACC7, $ACC6 634 vpmuludq $Y2, $TEMP1, $TEMP1 635 add 24(%rsp), %rdx 636 vpaddq $TEMP1, $ACC8, $ACC7 637 vpmuludq $Y2, $TEMP2, $TEMP2 638 vpaddq $TEMP2, $ACC9, $ACC8 639 vmovq $r3, $ACC9 640 mov %rdx, $r3 641 
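	# (Note: the dec/jnz pair below closes .LOOP_REDUCE_1024; it runs
	# nine times, roughly one pass per group of four 29-bit digits of
	# the Montgomery multiplier, before the carry propagation that
	# follows the loop.)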
642 dec $i 643 jnz .LOOP_REDUCE_1024 644___ 645($ACC0,$Y2)=($Y2,$ACC0); 646$code.=<<___; 647 lea 448(%rsp), $tp1 # size optimization 648 vpaddq $ACC9, $Y2, $ACC0 649 vpxor $ZERO, $ZERO, $ZERO 650 651 vpaddq 32*9-192($tp0), $ACC0, $ACC0 652 vpaddq 32*10-448($tp1), $ACC1, $ACC1 653 vpaddq 32*11-448($tp1), $ACC2, $ACC2 654 vpaddq 32*12-448($tp1), $ACC3, $ACC3 655 vpaddq 32*13-448($tp1), $ACC4, $ACC4 656 vpaddq 32*14-448($tp1), $ACC5, $ACC5 657 vpaddq 32*15-448($tp1), $ACC6, $ACC6 658 vpaddq 32*16-448($tp1), $ACC7, $ACC7 659 vpaddq 32*17-448($tp1), $ACC8, $ACC8 660 661 vpsrlq \$29, $ACC0, $TEMP1 662 vpand $AND_MASK, $ACC0, $ACC0 663 vpsrlq \$29, $ACC1, $TEMP2 664 vpand $AND_MASK, $ACC1, $ACC1 665 vpsrlq \$29, $ACC2, $TEMP3 666 vpermq \$0x93, $TEMP1, $TEMP1 667 vpand $AND_MASK, $ACC2, $ACC2 668 vpsrlq \$29, $ACC3, $TEMP4 669 vpermq \$0x93, $TEMP2, $TEMP2 670 vpand $AND_MASK, $ACC3, $ACC3 671 vpermq \$0x93, $TEMP3, $TEMP3 672 673 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 674 vpermq \$0x93, $TEMP4, $TEMP4 675 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 676 vpaddq $TEMP0, $ACC0, $ACC0 677 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 678 vpaddq $TEMP1, $ACC1, $ACC1 679 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 680 vpaddq $TEMP2, $ACC2, $ACC2 681 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 682 vpaddq $TEMP3, $ACC3, $ACC3 683 vpaddq $TEMP4, $ACC4, $ACC4 684 685 vpsrlq \$29, $ACC0, $TEMP1 686 vpand $AND_MASK, $ACC0, $ACC0 687 vpsrlq \$29, $ACC1, $TEMP2 688 vpand $AND_MASK, $ACC1, $ACC1 689 vpsrlq \$29, $ACC2, $TEMP3 690 vpermq \$0x93, $TEMP1, $TEMP1 691 vpand $AND_MASK, $ACC2, $ACC2 692 vpsrlq \$29, $ACC3, $TEMP4 693 vpermq \$0x93, $TEMP2, $TEMP2 694 vpand $AND_MASK, $ACC3, $ACC3 695 vpermq \$0x93, $TEMP3, $TEMP3 696 697 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 698 vpermq \$0x93, $TEMP4, $TEMP4 699 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 700 vpaddq $TEMP0, $ACC0, $ACC0 701 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 702 vpaddq $TEMP1, $ACC1, $ACC1 703 vmovdqu $ACC0, 32*0-128($rp) 704 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 705 vpaddq $TEMP2, $ACC2, $ACC2 706 vmovdqu $ACC1, 32*1-128($rp) 707 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 708 vpaddq $TEMP3, $ACC3, $ACC3 709 vmovdqu $ACC2, 32*2-128($rp) 710 vpaddq $TEMP4, $ACC4, $ACC4 711 vmovdqu $ACC3, 32*3-128($rp) 712___ 713$TEMP5=$ACC0; 714$code.=<<___; 715 vpsrlq \$29, $ACC4, $TEMP1 716 vpand $AND_MASK, $ACC4, $ACC4 717 vpsrlq \$29, $ACC5, $TEMP2 718 vpand $AND_MASK, $ACC5, $ACC5 719 vpsrlq \$29, $ACC6, $TEMP3 720 vpermq \$0x93, $TEMP1, $TEMP1 721 vpand $AND_MASK, $ACC6, $ACC6 722 vpsrlq \$29, $ACC7, $TEMP4 723 vpermq \$0x93, $TEMP2, $TEMP2 724 vpand $AND_MASK, $ACC7, $ACC7 725 vpsrlq \$29, $ACC8, $TEMP5 726 vpermq \$0x93, $TEMP3, $TEMP3 727 vpand $AND_MASK, $ACC8, $ACC8 728 vpermq \$0x93, $TEMP4, $TEMP4 729 730 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 731 vpermq \$0x93, $TEMP5, $TEMP5 732 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 733 vpaddq $TEMP0, $ACC4, $ACC4 734 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 735 vpaddq $TEMP1, $ACC5, $ACC5 736 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 737 vpaddq $TEMP2, $ACC6, $ACC6 738 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 739 vpaddq $TEMP3, $ACC7, $ACC7 740 vpaddq $TEMP4, $ACC8, $ACC8 741 742 vpsrlq \$29, $ACC4, $TEMP1 743 vpand $AND_MASK, $ACC4, $ACC4 744 vpsrlq \$29, $ACC5, $TEMP2 745 vpand $AND_MASK, $ACC5, $ACC5 746 vpsrlq \$29, $ACC6, $TEMP3 747 vpermq \$0x93, $TEMP1, $TEMP1 748 vpand $AND_MASK, $ACC6, $ACC6 749 vpsrlq \$29, $ACC7, $TEMP4 750 vpermq \$0x93, $TEMP2, $TEMP2 751 vpand $AND_MASK, $ACC7, $ACC7 752 vpsrlq \$29, $ACC8, $TEMP5 753 vpermq \$0x93, $TEMP3, $TEMP3 754 vpand $AND_MASK, 
$ACC8, $ACC8
	vpermq	\$0x93, $TEMP4, $TEMP4

	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
	vpermq	\$0x93, $TEMP5, $TEMP5
	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
	vpaddq	$TEMP0, $ACC4, $ACC4
	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
	vpaddq	$TEMP1, $ACC5, $ACC5
	vmovdqu	$ACC4, 32*4-128($rp)
	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
	vpaddq	$TEMP2, $ACC6, $ACC6
	vmovdqu	$ACC5, 32*5-128($rp)
	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
	vpaddq	$TEMP3, $ACC7, $ACC7
	vmovdqu	$ACC6, 32*6-128($rp)
	vpaddq	$TEMP4, $ACC8, $ACC8
	vmovdqu	$ACC7, 32*7-128($rp)
	vmovdqu	$ACC8, 32*8-128($rp)

	mov	$rp, $ap
	dec	$rep
	jne	.LOOP_GRANDE_SQR_1024

	vzeroall
	mov	%rbp, %rax
.cfi_def_cfa_register	%rax
___
$code.=<<___ if ($win64);
.Lsqr_1024_in_tail:
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lsqr_1024_epilogue:
	ret
.cfi_endproc
.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
___
}

{ # void AMM_WW(
my $rp="%rdi";	# BN_ULONG *rp,
my $ap="%rsi";	# const BN_ULONG *ap,
my $bp="%rdx";	# const BN_ULONG *bp,
my $np="%rcx";	# const BN_ULONG *np,
my $n0="%r8d";	# unsigned int n0);

# The registers that hold the accumulated redundant result
# The AMM works on 1024 bit operands, and redundant word size is 29
# Therefore: ceil(1024/29)/4 = 9
my $ACC0="%ymm0";
my $ACC1="%ymm1";
my $ACC2="%ymm2";
my $ACC3="%ymm3";
my $ACC4="%ymm4";
my $ACC5="%ymm5";
my $ACC6="%ymm6";
my $ACC7="%ymm7";
my $ACC8="%ymm8";
my $ACC9="%ymm9";

# Registers that hold the broadcasted words of multiplier, currently used
my $Bi="%ymm10";
my $Yi="%ymm11";

# Helper registers
my $TEMP0=$ACC0;
my $TEMP1="%ymm12";
my $TEMP2="%ymm13";
my $ZERO="%ymm14";
my $AND_MASK="%ymm15";

# alu registers that hold the first words of the ACC
my $r0="%r9";
my $r1="%r10";
my $r2="%r11";
my $r3="%r12";

my $i="%r14d";
my $tmp="%r15";

$bp="%r13";	# reassigned argument

$code.=<<___;
.globl	rsaz_1024_mul_avx2
.type	rsaz_1024_mul_avx2,\@function,5
.align	64
rsaz_1024_mul_avx2:
.cfi_startproc
	lea	(%rsp), %rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
$code.=<<___ if ($win64);
	vzeroupper
	lea	-0xa8(%rsp),%rsp
	vmovaps	%xmm6,-0xd8(%rax)
	vmovaps	%xmm7,-0xc8(%rax)
	vmovaps	%xmm8,-0xb8(%rax)
	vmovaps	%xmm9,-0xa8(%rax)
	vmovaps	%xmm10,-0x98(%rax)
	vmovaps	%xmm11,-0x88(%rax)
	vmovaps	%xmm12,-0x78(%rax)
	vmovaps	%xmm13,-0x68(%rax)
	vmovaps	%xmm14,-0x58(%rax)
	vmovaps	%xmm15,-0x48(%rax)
.Lmul_1024_body:
___
$code.=<<___;
	mov	%rax,%rbp
.cfi_def_cfa_register	%rbp
	vzeroall
	mov	%rdx, $bp	# reassigned argument
sub \$64,%rsp 902 903 # unaligned 256-bit load that crosses page boundary can 904 # cause severe performance degradation here, so if $ap does 905 # cross page boundary, swap it with $bp [meaning that caller 906 # is advised to lay down $ap and $bp next to each other, so 907 # that only one can cross page boundary]. 908 .byte 0x67,0x67 909 mov $ap, $tmp 910 and \$4095, $tmp 911 add \$32*10, $tmp 912 shr \$12, $tmp 913 mov $ap, $tmp 914 cmovnz $bp, $ap 915 cmovnz $tmp, $bp 916 917 mov $np, $tmp 918 sub \$-128,$ap # size optimization 919 sub \$-128,$np 920 sub \$-128,$rp 921 922 and \$4095, $tmp # see if $np crosses page 923 add \$32*10, $tmp 924 .byte 0x67,0x67 925 shr \$12, $tmp 926 jz .Lmul_1024_no_n_copy 927 928 # unaligned 256-bit load that crosses page boundary can 929 # cause severe performance degradation here, so if $np does 930 # cross page boundary, copy it to stack and make sure stack 931 # frame doesn't... 932 sub \$32*10,%rsp 933 vmovdqu 32*0-128($np), $ACC0 934 and \$-512, %rsp 935 vmovdqu 32*1-128($np), $ACC1 936 vmovdqu 32*2-128($np), $ACC2 937 vmovdqu 32*3-128($np), $ACC3 938 vmovdqu 32*4-128($np), $ACC4 939 vmovdqu 32*5-128($np), $ACC5 940 vmovdqu 32*6-128($np), $ACC6 941 vmovdqu 32*7-128($np), $ACC7 942 vmovdqu 32*8-128($np), $ACC8 943 lea 64+128(%rsp),$np 944 vmovdqu $ACC0, 32*0-128($np) 945 vpxor $ACC0, $ACC0, $ACC0 946 vmovdqu $ACC1, 32*1-128($np) 947 vpxor $ACC1, $ACC1, $ACC1 948 vmovdqu $ACC2, 32*2-128($np) 949 vpxor $ACC2, $ACC2, $ACC2 950 vmovdqu $ACC3, 32*3-128($np) 951 vpxor $ACC3, $ACC3, $ACC3 952 vmovdqu $ACC4, 32*4-128($np) 953 vpxor $ACC4, $ACC4, $ACC4 954 vmovdqu $ACC5, 32*5-128($np) 955 vpxor $ACC5, $ACC5, $ACC5 956 vmovdqu $ACC6, 32*6-128($np) 957 vpxor $ACC6, $ACC6, $ACC6 958 vmovdqu $ACC7, 32*7-128($np) 959 vpxor $ACC7, $ACC7, $ACC7 960 vmovdqu $ACC8, 32*8-128($np) 961 vmovdqa $ACC0, $ACC8 962 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall 963.Lmul_1024_no_n_copy: 964 and \$-64,%rsp 965 966 mov ($bp), %rbx 967 vpbroadcastq ($bp), $Bi 968 vmovdqu $ACC0, (%rsp) # clear top of stack 969 xor $r0, $r0 970 .byte 0x67 971 xor $r1, $r1 972 xor $r2, $r2 973 xor $r3, $r3 974 975 vmovdqu .Land_mask(%rip), $AND_MASK 976 mov \$9, $i 977 vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall 978 jmp .Loop_mul_1024 979 980.align 32 981.Loop_mul_1024: 982 vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*) 983 mov %rbx, %rax 984 imulq -128($ap), %rax 985 add $r0, %rax 986 mov %rbx, $r1 987 imulq 8-128($ap), $r1 988 add 8(%rsp), $r1 989 990 mov %rax, $r0 991 imull $n0, %eax 992 and \$0x1fffffff, %eax 993 994 mov %rbx, $r2 995 imulq 16-128($ap), $r2 996 add 16(%rsp), $r2 997 998 mov %rbx, $r3 999 imulq 24-128($ap), $r3 1000 add 24(%rsp), $r3 1001 vpmuludq 32*1-128($ap),$Bi,$TEMP0 1002 vmovd %eax, $Yi 1003 vpaddq $TEMP0,$ACC1,$ACC1 1004 vpmuludq 32*2-128($ap),$Bi,$TEMP1 1005 vpbroadcastq $Yi, $Yi 1006 vpaddq $TEMP1,$ACC2,$ACC2 1007 vpmuludq 32*3-128($ap),$Bi,$TEMP2 1008 vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3 1009 vpaddq $TEMP2,$ACC3,$ACC3 1010 vpmuludq 32*4-128($ap),$Bi,$TEMP0 1011 vpaddq $TEMP0,$ACC4,$ACC4 1012 vpmuludq 32*5-128($ap),$Bi,$TEMP1 1013 vpaddq $TEMP1,$ACC5,$ACC5 1014 vpmuludq 32*6-128($ap),$Bi,$TEMP2 1015 vpaddq $TEMP2,$ACC6,$ACC6 1016 vpmuludq 32*7-128($ap),$Bi,$TEMP0 1017 vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3 1018 vpaddq $TEMP0,$ACC7,$ACC7 1019 vpmuludq 32*8-128($ap),$Bi,$TEMP1 1020 vpbroadcastq 8($bp), $Bi 1021 vpaddq $TEMP1,$ACC8,$ACC8 1022 1023 mov %rax,%rdx 1024 imulq -128($np),%rax 1025 add %rax,$r0 1026 mov %rdx,%rax 1027 imulq 
8-128($np),%rax 1028 add %rax,$r1 1029 mov %rdx,%rax 1030 imulq 16-128($np),%rax 1031 add %rax,$r2 1032 shr \$29, $r0 1033 imulq 24-128($np),%rdx 1034 add %rdx,$r3 1035 add $r0, $r1 1036 1037 vpmuludq 32*1-128($np),$Yi,$TEMP2 1038 vmovq $Bi, %rbx 1039 vpaddq $TEMP2,$ACC1,$ACC1 1040 vpmuludq 32*2-128($np),$Yi,$TEMP0 1041 vpaddq $TEMP0,$ACC2,$ACC2 1042 vpmuludq 32*3-128($np),$Yi,$TEMP1 1043 vpaddq $TEMP1,$ACC3,$ACC3 1044 vpmuludq 32*4-128($np),$Yi,$TEMP2 1045 vpaddq $TEMP2,$ACC4,$ACC4 1046 vpmuludq 32*5-128($np),$Yi,$TEMP0 1047 vpaddq $TEMP0,$ACC5,$ACC5 1048 vpmuludq 32*6-128($np),$Yi,$TEMP1 1049 vpaddq $TEMP1,$ACC6,$ACC6 1050 vpmuludq 32*7-128($np),$Yi,$TEMP2 1051 vpblendd \$3, $ZERO, $ACC9, $TEMP1 # correct $ACC3 1052 vpaddq $TEMP2,$ACC7,$ACC7 1053 vpmuludq 32*8-128($np),$Yi,$TEMP0 1054 vpaddq $TEMP1, $ACC3, $ACC3 # correct $ACC3 1055 vpaddq $TEMP0,$ACC8,$ACC8 1056 1057 mov %rbx, %rax 1058 imulq -128($ap),%rax 1059 add %rax,$r1 1060 vmovdqu -8+32*1-128($ap),$TEMP1 1061 mov %rbx, %rax 1062 imulq 8-128($ap),%rax 1063 add %rax,$r2 1064 vmovdqu -8+32*2-128($ap),$TEMP2 1065 1066 mov $r1, %rax 1067 vpblendd \$0xfc, $ZERO, $ACC9, $ACC9 # correct $ACC3 1068 imull $n0, %eax 1069 vpaddq $ACC9,$ACC4,$ACC4 # correct $ACC3 1070 and \$0x1fffffff, %eax 1071 1072 imulq 16-128($ap),%rbx 1073 add %rbx,$r3 1074 vpmuludq $Bi,$TEMP1,$TEMP1 1075 vmovd %eax, $Yi 1076 vmovdqu -8+32*3-128($ap),$TEMP0 1077 vpaddq $TEMP1,$ACC1,$ACC1 1078 vpmuludq $Bi,$TEMP2,$TEMP2 1079 vpbroadcastq $Yi, $Yi 1080 vmovdqu -8+32*4-128($ap),$TEMP1 1081 vpaddq $TEMP2,$ACC2,$ACC2 1082 vpmuludq $Bi,$TEMP0,$TEMP0 1083 vmovdqu -8+32*5-128($ap),$TEMP2 1084 vpaddq $TEMP0,$ACC3,$ACC3 1085 vpmuludq $Bi,$TEMP1,$TEMP1 1086 vmovdqu -8+32*6-128($ap),$TEMP0 1087 vpaddq $TEMP1,$ACC4,$ACC4 1088 vpmuludq $Bi,$TEMP2,$TEMP2 1089 vmovdqu -8+32*7-128($ap),$TEMP1 1090 vpaddq $TEMP2,$ACC5,$ACC5 1091 vpmuludq $Bi,$TEMP0,$TEMP0 1092 vmovdqu -8+32*8-128($ap),$TEMP2 1093 vpaddq $TEMP0,$ACC6,$ACC6 1094 vpmuludq $Bi,$TEMP1,$TEMP1 1095 vmovdqu -8+32*9-128($ap),$ACC9 1096 vpaddq $TEMP1,$ACC7,$ACC7 1097 vpmuludq $Bi,$TEMP2,$TEMP2 1098 vpaddq $TEMP2,$ACC8,$ACC8 1099 vpmuludq $Bi,$ACC9,$ACC9 1100 vpbroadcastq 16($bp), $Bi 1101 1102 mov %rax,%rdx 1103 imulq -128($np),%rax 1104 add %rax,$r1 1105 vmovdqu -8+32*1-128($np),$TEMP0 1106 mov %rdx,%rax 1107 imulq 8-128($np),%rax 1108 add %rax,$r2 1109 vmovdqu -8+32*2-128($np),$TEMP1 1110 shr \$29, $r1 1111 imulq 16-128($np),%rdx 1112 add %rdx,$r3 1113 add $r1, $r2 1114 1115 vpmuludq $Yi,$TEMP0,$TEMP0 1116 vmovq $Bi, %rbx 1117 vmovdqu -8+32*3-128($np),$TEMP2 1118 vpaddq $TEMP0,$ACC1,$ACC1 1119 vpmuludq $Yi,$TEMP1,$TEMP1 1120 vmovdqu -8+32*4-128($np),$TEMP0 1121 vpaddq $TEMP1,$ACC2,$ACC2 1122 vpmuludq $Yi,$TEMP2,$TEMP2 1123 vmovdqu -8+32*5-128($np),$TEMP1 1124 vpaddq $TEMP2,$ACC3,$ACC3 1125 vpmuludq $Yi,$TEMP0,$TEMP0 1126 vmovdqu -8+32*6-128($np),$TEMP2 1127 vpaddq $TEMP0,$ACC4,$ACC4 1128 vpmuludq $Yi,$TEMP1,$TEMP1 1129 vmovdqu -8+32*7-128($np),$TEMP0 1130 vpaddq $TEMP1,$ACC5,$ACC5 1131 vpmuludq $Yi,$TEMP2,$TEMP2 1132 vmovdqu -8+32*8-128($np),$TEMP1 1133 vpaddq $TEMP2,$ACC6,$ACC6 1134 vpmuludq $Yi,$TEMP0,$TEMP0 1135 vmovdqu -8+32*9-128($np),$TEMP2 1136 vpaddq $TEMP0,$ACC7,$ACC7 1137 vpmuludq $Yi,$TEMP1,$TEMP1 1138 vpaddq $TEMP1,$ACC8,$ACC8 1139 vpmuludq $Yi,$TEMP2,$TEMP2 1140 vpaddq $TEMP2,$ACC9,$ACC9 1141 1142 vmovdqu -16+32*1-128($ap),$TEMP0 1143 mov %rbx,%rax 1144 imulq -128($ap),%rax 1145 add $r2,%rax 1146 1147 vmovdqu -16+32*2-128($ap),$TEMP1 1148 mov %rax,$r2 1149 imull $n0, %eax 1150 and \$0x1fffffff, %eax 1151 1152 imulq 
8-128($ap),%rbx 1153 add %rbx,$r3 1154 vpmuludq $Bi,$TEMP0,$TEMP0 1155 vmovd %eax, $Yi 1156 vmovdqu -16+32*3-128($ap),$TEMP2 1157 vpaddq $TEMP0,$ACC1,$ACC1 1158 vpmuludq $Bi,$TEMP1,$TEMP1 1159 vpbroadcastq $Yi, $Yi 1160 vmovdqu -16+32*4-128($ap),$TEMP0 1161 vpaddq $TEMP1,$ACC2,$ACC2 1162 vpmuludq $Bi,$TEMP2,$TEMP2 1163 vmovdqu -16+32*5-128($ap),$TEMP1 1164 vpaddq $TEMP2,$ACC3,$ACC3 1165 vpmuludq $Bi,$TEMP0,$TEMP0 1166 vmovdqu -16+32*6-128($ap),$TEMP2 1167 vpaddq $TEMP0,$ACC4,$ACC4 1168 vpmuludq $Bi,$TEMP1,$TEMP1 1169 vmovdqu -16+32*7-128($ap),$TEMP0 1170 vpaddq $TEMP1,$ACC5,$ACC5 1171 vpmuludq $Bi,$TEMP2,$TEMP2 1172 vmovdqu -16+32*8-128($ap),$TEMP1 1173 vpaddq $TEMP2,$ACC6,$ACC6 1174 vpmuludq $Bi,$TEMP0,$TEMP0 1175 vmovdqu -16+32*9-128($ap),$TEMP2 1176 vpaddq $TEMP0,$ACC7,$ACC7 1177 vpmuludq $Bi,$TEMP1,$TEMP1 1178 vpaddq $TEMP1,$ACC8,$ACC8 1179 vpmuludq $Bi,$TEMP2,$TEMP2 1180 vpbroadcastq 24($bp), $Bi 1181 vpaddq $TEMP2,$ACC9,$ACC9 1182 1183 vmovdqu -16+32*1-128($np),$TEMP0 1184 mov %rax,%rdx 1185 imulq -128($np),%rax 1186 add %rax,$r2 1187 vmovdqu -16+32*2-128($np),$TEMP1 1188 imulq 8-128($np),%rdx 1189 add %rdx,$r3 1190 shr \$29, $r2 1191 1192 vpmuludq $Yi,$TEMP0,$TEMP0 1193 vmovq $Bi, %rbx 1194 vmovdqu -16+32*3-128($np),$TEMP2 1195 vpaddq $TEMP0,$ACC1,$ACC1 1196 vpmuludq $Yi,$TEMP1,$TEMP1 1197 vmovdqu -16+32*4-128($np),$TEMP0 1198 vpaddq $TEMP1,$ACC2,$ACC2 1199 vpmuludq $Yi,$TEMP2,$TEMP2 1200 vmovdqu -16+32*5-128($np),$TEMP1 1201 vpaddq $TEMP2,$ACC3,$ACC3 1202 vpmuludq $Yi,$TEMP0,$TEMP0 1203 vmovdqu -16+32*6-128($np),$TEMP2 1204 vpaddq $TEMP0,$ACC4,$ACC4 1205 vpmuludq $Yi,$TEMP1,$TEMP1 1206 vmovdqu -16+32*7-128($np),$TEMP0 1207 vpaddq $TEMP1,$ACC5,$ACC5 1208 vpmuludq $Yi,$TEMP2,$TEMP2 1209 vmovdqu -16+32*8-128($np),$TEMP1 1210 vpaddq $TEMP2,$ACC6,$ACC6 1211 vpmuludq $Yi,$TEMP0,$TEMP0 1212 vmovdqu -16+32*9-128($np),$TEMP2 1213 vpaddq $TEMP0,$ACC7,$ACC7 1214 vpmuludq $Yi,$TEMP1,$TEMP1 1215 vmovdqu -24+32*1-128($ap),$TEMP0 1216 vpaddq $TEMP1,$ACC8,$ACC8 1217 vpmuludq $Yi,$TEMP2,$TEMP2 1218 vmovdqu -24+32*2-128($ap),$TEMP1 1219 vpaddq $TEMP2,$ACC9,$ACC9 1220 1221 add $r2, $r3 1222 imulq -128($ap),%rbx 1223 add %rbx,$r3 1224 1225 mov $r3, %rax 1226 imull $n0, %eax 1227 and \$0x1fffffff, %eax 1228 1229 vpmuludq $Bi,$TEMP0,$TEMP0 1230 vmovd %eax, $Yi 1231 vmovdqu -24+32*3-128($ap),$TEMP2 1232 vpaddq $TEMP0,$ACC1,$ACC1 1233 vpmuludq $Bi,$TEMP1,$TEMP1 1234 vpbroadcastq $Yi, $Yi 1235 vmovdqu -24+32*4-128($ap),$TEMP0 1236 vpaddq $TEMP1,$ACC2,$ACC2 1237 vpmuludq $Bi,$TEMP2,$TEMP2 1238 vmovdqu -24+32*5-128($ap),$TEMP1 1239 vpaddq $TEMP2,$ACC3,$ACC3 1240 vpmuludq $Bi,$TEMP0,$TEMP0 1241 vmovdqu -24+32*6-128($ap),$TEMP2 1242 vpaddq $TEMP0,$ACC4,$ACC4 1243 vpmuludq $Bi,$TEMP1,$TEMP1 1244 vmovdqu -24+32*7-128($ap),$TEMP0 1245 vpaddq $TEMP1,$ACC5,$ACC5 1246 vpmuludq $Bi,$TEMP2,$TEMP2 1247 vmovdqu -24+32*8-128($ap),$TEMP1 1248 vpaddq $TEMP2,$ACC6,$ACC6 1249 vpmuludq $Bi,$TEMP0,$TEMP0 1250 vmovdqu -24+32*9-128($ap),$TEMP2 1251 vpaddq $TEMP0,$ACC7,$ACC7 1252 vpmuludq $Bi,$TEMP1,$TEMP1 1253 vpaddq $TEMP1,$ACC8,$ACC8 1254 vpmuludq $Bi,$TEMP2,$TEMP2 1255 vpbroadcastq 32($bp), $Bi 1256 vpaddq $TEMP2,$ACC9,$ACC9 1257 add \$32, $bp # $bp++ 1258 1259 vmovdqu -24+32*1-128($np),$TEMP0 1260 imulq -128($np),%rax 1261 add %rax,$r3 1262 shr \$29, $r3 1263 1264 vmovdqu -24+32*2-128($np),$TEMP1 1265 vpmuludq $Yi,$TEMP0,$TEMP0 1266 vmovq $Bi, %rbx 1267 vmovdqu -24+32*3-128($np),$TEMP2 1268 vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0 1269 vpmuludq $Yi,$TEMP1,$TEMP1 1270 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 1271 vpaddq 
$TEMP1,$ACC2,$ACC1 1272 vmovdqu -24+32*4-128($np),$TEMP0 1273 vpmuludq $Yi,$TEMP2,$TEMP2 1274 vmovdqu -24+32*5-128($np),$TEMP1 1275 vpaddq $TEMP2,$ACC3,$ACC2 1276 vpmuludq $Yi,$TEMP0,$TEMP0 1277 vmovdqu -24+32*6-128($np),$TEMP2 1278 vpaddq $TEMP0,$ACC4,$ACC3 1279 vpmuludq $Yi,$TEMP1,$TEMP1 1280 vmovdqu -24+32*7-128($np),$TEMP0 1281 vpaddq $TEMP1,$ACC5,$ACC4 1282 vpmuludq $Yi,$TEMP2,$TEMP2 1283 vmovdqu -24+32*8-128($np),$TEMP1 1284 vpaddq $TEMP2,$ACC6,$ACC5 1285 vpmuludq $Yi,$TEMP0,$TEMP0 1286 vmovdqu -24+32*9-128($np),$TEMP2 1287 mov $r3, $r0 1288 vpaddq $TEMP0,$ACC7,$ACC6 1289 vpmuludq $Yi,$TEMP1,$TEMP1 1290 add (%rsp), $r0 1291 vpaddq $TEMP1,$ACC8,$ACC7 1292 vpmuludq $Yi,$TEMP2,$TEMP2 1293 vmovq $r3, $TEMP1 1294 vpaddq $TEMP2,$ACC9,$ACC8 1295 1296 dec $i 1297 jnz .Loop_mul_1024 1298___ 1299 1300# (*) Original implementation was correcting ACC1-ACC3 for overflow 1301# after 7 loop runs, or after 28 iterations, or 56 additions. 1302# But as we underutilize resources, it's possible to correct in 1303# each iteration with marginal performance loss. But then, as 1304# we do it in each iteration, we can correct less digits, and 1305# avoid performance penalties completely. 1306 1307$TEMP0 = $ACC9; 1308$TEMP3 = $Bi; 1309$TEMP4 = $Yi; 1310$code.=<<___; 1311 vpaddq (%rsp), $TEMP1, $ACC0 1312 1313 vpsrlq \$29, $ACC0, $TEMP1 1314 vpand $AND_MASK, $ACC0, $ACC0 1315 vpsrlq \$29, $ACC1, $TEMP2 1316 vpand $AND_MASK, $ACC1, $ACC1 1317 vpsrlq \$29, $ACC2, $TEMP3 1318 vpermq \$0x93, $TEMP1, $TEMP1 1319 vpand $AND_MASK, $ACC2, $ACC2 1320 vpsrlq \$29, $ACC3, $TEMP4 1321 vpermq \$0x93, $TEMP2, $TEMP2 1322 vpand $AND_MASK, $ACC3, $ACC3 1323 1324 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1325 vpermq \$0x93, $TEMP3, $TEMP3 1326 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1327 vpermq \$0x93, $TEMP4, $TEMP4 1328 vpaddq $TEMP0, $ACC0, $ACC0 1329 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1330 vpaddq $TEMP1, $ACC1, $ACC1 1331 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1332 vpaddq $TEMP2, $ACC2, $ACC2 1333 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 1334 vpaddq $TEMP3, $ACC3, $ACC3 1335 vpaddq $TEMP4, $ACC4, $ACC4 1336 1337 vpsrlq \$29, $ACC0, $TEMP1 1338 vpand $AND_MASK, $ACC0, $ACC0 1339 vpsrlq \$29, $ACC1, $TEMP2 1340 vpand $AND_MASK, $ACC1, $ACC1 1341 vpsrlq \$29, $ACC2, $TEMP3 1342 vpermq \$0x93, $TEMP1, $TEMP1 1343 vpand $AND_MASK, $ACC2, $ACC2 1344 vpsrlq \$29, $ACC3, $TEMP4 1345 vpermq \$0x93, $TEMP2, $TEMP2 1346 vpand $AND_MASK, $ACC3, $ACC3 1347 vpermq \$0x93, $TEMP3, $TEMP3 1348 1349 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1350 vpermq \$0x93, $TEMP4, $TEMP4 1351 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1352 vpaddq $TEMP0, $ACC0, $ACC0 1353 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1354 vpaddq $TEMP1, $ACC1, $ACC1 1355 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1356 vpaddq $TEMP2, $ACC2, $ACC2 1357 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 1358 vpaddq $TEMP3, $ACC3, $ACC3 1359 vpaddq $TEMP4, $ACC4, $ACC4 1360 1361 vmovdqu $ACC0, 0-128($rp) 1362 vmovdqu $ACC1, 32-128($rp) 1363 vmovdqu $ACC2, 64-128($rp) 1364 vmovdqu $ACC3, 96-128($rp) 1365___ 1366 1367$TEMP5=$ACC0; 1368$code.=<<___; 1369 vpsrlq \$29, $ACC4, $TEMP1 1370 vpand $AND_MASK, $ACC4, $ACC4 1371 vpsrlq \$29, $ACC5, $TEMP2 1372 vpand $AND_MASK, $ACC5, $ACC5 1373 vpsrlq \$29, $ACC6, $TEMP3 1374 vpermq \$0x93, $TEMP1, $TEMP1 1375 vpand $AND_MASK, $ACC6, $ACC6 1376 vpsrlq \$29, $ACC7, $TEMP4 1377 vpermq \$0x93, $TEMP2, $TEMP2 1378 vpand $AND_MASK, $ACC7, $ACC7 1379 vpsrlq \$29, $ACC8, $TEMP5 1380 vpermq \$0x93, $TEMP3, $TEMP3 1381 vpand $AND_MASK, $ACC8, $ACC8 1382 vpermq \$0x93, $TEMP4, $TEMP4 1383 1384 
vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1385 vpermq \$0x93, $TEMP5, $TEMP5 1386 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1387 vpaddq $TEMP0, $ACC4, $ACC4 1388 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1389 vpaddq $TEMP1, $ACC5, $ACC5 1390 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1391 vpaddq $TEMP2, $ACC6, $ACC6 1392 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 1393 vpaddq $TEMP3, $ACC7, $ACC7 1394 vpaddq $TEMP4, $ACC8, $ACC8 1395 1396 vpsrlq \$29, $ACC4, $TEMP1 1397 vpand $AND_MASK, $ACC4, $ACC4 1398 vpsrlq \$29, $ACC5, $TEMP2 1399 vpand $AND_MASK, $ACC5, $ACC5 1400 vpsrlq \$29, $ACC6, $TEMP3 1401 vpermq \$0x93, $TEMP1, $TEMP1 1402 vpand $AND_MASK, $ACC6, $ACC6 1403 vpsrlq \$29, $ACC7, $TEMP4 1404 vpermq \$0x93, $TEMP2, $TEMP2 1405 vpand $AND_MASK, $ACC7, $ACC7 1406 vpsrlq \$29, $ACC8, $TEMP5 1407 vpermq \$0x93, $TEMP3, $TEMP3 1408 vpand $AND_MASK, $ACC8, $ACC8 1409 vpermq \$0x93, $TEMP4, $TEMP4 1410 1411 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1412 vpermq \$0x93, $TEMP5, $TEMP5 1413 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1414 vpaddq $TEMP0, $ACC4, $ACC4 1415 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1416 vpaddq $TEMP1, $ACC5, $ACC5 1417 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1418 vpaddq $TEMP2, $ACC6, $ACC6 1419 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 1420 vpaddq $TEMP3, $ACC7, $ACC7 1421 vpaddq $TEMP4, $ACC8, $ACC8 1422 1423 vmovdqu $ACC4, 128-128($rp) 1424 vmovdqu $ACC5, 160-128($rp) 1425 vmovdqu $ACC6, 192-128($rp) 1426 vmovdqu $ACC7, 224-128($rp) 1427 vmovdqu $ACC8, 256-128($rp) 1428 vzeroupper 1429 1430 mov %rbp, %rax 1431.cfi_def_cfa_register %rax 1432___ 1433$code.=<<___ if ($win64); 1434.Lmul_1024_in_tail: 1435 movaps -0xd8(%rax),%xmm6 1436 movaps -0xc8(%rax),%xmm7 1437 movaps -0xb8(%rax),%xmm8 1438 movaps -0xa8(%rax),%xmm9 1439 movaps -0x98(%rax),%xmm10 1440 movaps -0x88(%rax),%xmm11 1441 movaps -0x78(%rax),%xmm12 1442 movaps -0x68(%rax),%xmm13 1443 movaps -0x58(%rax),%xmm14 1444 movaps -0x48(%rax),%xmm15 1445___ 1446$code.=<<___; 1447 mov -48(%rax),%r15 1448.cfi_restore %r15 1449 mov -40(%rax),%r14 1450.cfi_restore %r14 1451 mov -32(%rax),%r13 1452.cfi_restore %r13 1453 mov -24(%rax),%r12 1454.cfi_restore %r12 1455 mov -16(%rax),%rbp 1456.cfi_restore %rbp 1457 mov -8(%rax),%rbx 1458.cfi_restore %rbx 1459 lea (%rax),%rsp # restore %rsp 1460.cfi_def_cfa_register %rsp 1461.Lmul_1024_epilogue: 1462 ret 1463.cfi_endproc 1464.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 1465___ 1466} 1467{ 1468my ($out,$inp) = $win64 ? 
("%rcx","%rdx") : ("%rdi","%rsi"); 1469my @T = map("%r$_",(8..11)); 1470 1471$code.=<<___; 1472.globl rsaz_1024_red2norm_avx2 1473.type rsaz_1024_red2norm_avx2,\@abi-omnipotent 1474.align 32 1475rsaz_1024_red2norm_avx2: 1476.cfi_startproc 1477 sub \$-128,$inp # size optimization 1478 xor %rax,%rax 1479___ 1480 1481for ($j=0,$i=0; $i<16; $i++) { 1482 my $k=0; 1483 while (29*$j<64*($i+1)) { # load data till boundary 1484 $code.=" mov `8*$j-128`($inp), @T[0]\n"; 1485 $j++; $k++; push(@T,shift(@T)); 1486 } 1487 $l=$k; 1488 while ($k>1) { # shift loaded data but last value 1489 $code.=" shl \$`29*($j-$k)`,@T[-$k]\n"; 1490 $k--; 1491 } 1492 $code.=<<___; # shift last value 1493 mov @T[-1], @T[0] 1494 shl \$`29*($j-1)`, @T[-1] 1495 shr \$`-29*($j-1)`, @T[0] 1496___ 1497 while ($l) { # accumulate all values 1498 $code.=" add @T[-$l], %rax\n"; 1499 $l--; 1500 } 1501 $code.=<<___; 1502 adc \$0, @T[0] # consume eventual carry 1503 mov %rax, 8*$i($out) 1504 mov @T[0], %rax 1505___ 1506 push(@T,shift(@T)); 1507} 1508$code.=<<___; 1509 ret 1510.cfi_endproc 1511.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 1512 1513.globl rsaz_1024_norm2red_avx2 1514.type rsaz_1024_norm2red_avx2,\@abi-omnipotent 1515.align 32 1516rsaz_1024_norm2red_avx2: 1517.cfi_startproc 1518 sub \$-128,$out # size optimization 1519 mov ($inp),@T[0] 1520 mov \$0x1fffffff,%eax 1521___ 1522for ($j=0,$i=0; $i<16; $i++) { 1523 $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15); 1524 $code.=" xor @T[1],@T[1]\n" if ($i==15); 1525 my $k=1; 1526 while (29*($j+1)<64*($i+1)) { 1527 $code.=<<___; 1528 mov @T[0],@T[-$k] 1529 shr \$`29*$j`,@T[-$k] 1530 and %rax,@T[-$k] # &0x1fffffff 1531 mov @T[-$k],`8*$j-128`($out) 1532___ 1533 $j++; $k++; 1534 } 1535 $code.=<<___; 1536 shrd \$`29*$j`,@T[1],@T[0] 1537 and %rax,@T[0] 1538 mov @T[0],`8*$j-128`($out) 1539___ 1540 $j++; 1541 push(@T,shift(@T)); 1542} 1543$code.=<<___; 1544 mov @T[0],`8*$j-128`($out) # zero 1545 mov @T[0],`8*($j+1)-128`($out) 1546 mov @T[0],`8*($j+2)-128`($out) 1547 mov @T[0],`8*($j+3)-128`($out) 1548 ret 1549.cfi_endproc 1550.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 1551___ 1552} 1553{ 1554my ($out,$inp,$power) = $win64 ? 
("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); 1555 1556$code.=<<___; 1557.globl rsaz_1024_scatter5_avx2 1558.type rsaz_1024_scatter5_avx2,\@abi-omnipotent 1559.align 32 1560rsaz_1024_scatter5_avx2: 1561.cfi_startproc 1562 vzeroupper 1563 vmovdqu .Lscatter_permd(%rip),%ymm5 1564 shl \$4,$power 1565 lea ($out,$power),$out 1566 mov \$9,%eax 1567 jmp .Loop_scatter_1024 1568 1569.align 32 1570.Loop_scatter_1024: 1571 vmovdqu ($inp),%ymm0 1572 lea 32($inp),$inp 1573 vpermd %ymm0,%ymm5,%ymm0 1574 vmovdqu %xmm0,($out) 1575 lea 16*32($out),$out 1576 dec %eax 1577 jnz .Loop_scatter_1024 1578 1579 vzeroupper 1580 ret 1581.cfi_endproc 1582.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 1583 1584.globl rsaz_1024_gather5_avx2 1585.type rsaz_1024_gather5_avx2,\@abi-omnipotent 1586.align 32 1587rsaz_1024_gather5_avx2: 1588.cfi_startproc 1589 vzeroupper 1590 mov %rsp,%r11 1591.cfi_def_cfa_register %r11 1592___ 1593$code.=<<___ if ($win64); 1594 lea -0x88(%rsp),%rax 1595.LSEH_begin_rsaz_1024_gather5: 1596 # I can't trust assembler to use specific encoding:-( 1597 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp 1598 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax) 1599 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax) 1600 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax) 1601 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax) 1602 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax) 1603 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax) 1604 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax) 1605 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax) 1606 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax) 1607 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax) 1608___ 1609$code.=<<___; 1610 lea -0x100(%rsp),%rsp 1611 and \$-32, %rsp 1612 lea .Linc(%rip), %r10 1613 lea -128(%rsp),%rax # control u-op density 1614 1615 vmovd $power, %xmm4 1616 vmovdqa (%r10),%ymm0 1617 vmovdqa 32(%r10),%ymm1 1618 vmovdqa 64(%r10),%ymm5 1619 vpbroadcastd %xmm4,%ymm4 1620 1621 vpaddd %ymm5, %ymm0, %ymm2 1622 vpcmpeqd %ymm4, %ymm0, %ymm0 1623 vpaddd %ymm5, %ymm1, %ymm3 1624 vpcmpeqd %ymm4, %ymm1, %ymm1 1625 vmovdqa %ymm0, 32*0+128(%rax) 1626 vpaddd %ymm5, %ymm2, %ymm0 1627 vpcmpeqd %ymm4, %ymm2, %ymm2 1628 vmovdqa %ymm1, 32*1+128(%rax) 1629 vpaddd %ymm5, %ymm3, %ymm1 1630 vpcmpeqd %ymm4, %ymm3, %ymm3 1631 vmovdqa %ymm2, 32*2+128(%rax) 1632 vpaddd %ymm5, %ymm0, %ymm2 1633 vpcmpeqd %ymm4, %ymm0, %ymm0 1634 vmovdqa %ymm3, 32*3+128(%rax) 1635 vpaddd %ymm5, %ymm1, %ymm3 1636 vpcmpeqd %ymm4, %ymm1, %ymm1 1637 vmovdqa %ymm0, 32*4+128(%rax) 1638 vpaddd %ymm5, %ymm2, %ymm8 1639 vpcmpeqd %ymm4, %ymm2, %ymm2 1640 vmovdqa %ymm1, 32*5+128(%rax) 1641 vpaddd %ymm5, %ymm3, %ymm9 1642 vpcmpeqd %ymm4, %ymm3, %ymm3 1643 vmovdqa %ymm2, 32*6+128(%rax) 1644 vpaddd %ymm5, %ymm8, %ymm10 1645 vpcmpeqd %ymm4, %ymm8, %ymm8 1646 vmovdqa %ymm3, 32*7+128(%rax) 1647 vpaddd %ymm5, %ymm9, %ymm11 1648 vpcmpeqd %ymm4, %ymm9, %ymm9 1649 vpaddd %ymm5, %ymm10, %ymm12 1650 vpcmpeqd %ymm4, %ymm10, %ymm10 1651 vpaddd %ymm5, %ymm11, %ymm13 1652 vpcmpeqd %ymm4, %ymm11, %ymm11 1653 vpaddd %ymm5, %ymm12, %ymm14 1654 vpcmpeqd %ymm4, %ymm12, %ymm12 1655 vpaddd %ymm5, %ymm13, %ymm15 1656 vpcmpeqd %ymm4, %ymm13, %ymm13 1657 vpcmpeqd %ymm4, %ymm14, %ymm14 1658 vpcmpeqd %ymm4, %ymm15, %ymm15 1659 1660 vmovdqa -32(%r10),%ymm7 # .Lgather_permd 1661 lea 128($inp), $inp 1662 mov \$9,$power 1663 1664.Loop_gather_1024: 1665 vmovdqa 32*0-128($inp), %ymm0 1666 vmovdqa 32*1-128($inp), %ymm1 1667 vmovdqa 
32*2-128($inp), %ymm2 1668 vmovdqa 32*3-128($inp), %ymm3 1669 vpand 32*0+128(%rax), %ymm0, %ymm0 1670 vpand 32*1+128(%rax), %ymm1, %ymm1 1671 vpand 32*2+128(%rax), %ymm2, %ymm2 1672 vpor %ymm0, %ymm1, %ymm4 1673 vpand 32*3+128(%rax), %ymm3, %ymm3 1674 vmovdqa 32*4-128($inp), %ymm0 1675 vmovdqa 32*5-128($inp), %ymm1 1676 vpor %ymm2, %ymm3, %ymm5 1677 vmovdqa 32*6-128($inp), %ymm2 1678 vmovdqa 32*7-128($inp), %ymm3 1679 vpand 32*4+128(%rax), %ymm0, %ymm0 1680 vpand 32*5+128(%rax), %ymm1, %ymm1 1681 vpand 32*6+128(%rax), %ymm2, %ymm2 1682 vpor %ymm0, %ymm4, %ymm4 1683 vpand 32*7+128(%rax), %ymm3, %ymm3 1684 vpand 32*8-128($inp), %ymm8, %ymm0 1685 vpor %ymm1, %ymm5, %ymm5 1686 vpand 32*9-128($inp), %ymm9, %ymm1 1687 vpor %ymm2, %ymm4, %ymm4 1688 vpand 32*10-128($inp),%ymm10, %ymm2 1689 vpor %ymm3, %ymm5, %ymm5 1690 vpand 32*11-128($inp),%ymm11, %ymm3 1691 vpor %ymm0, %ymm4, %ymm4 1692 vpand 32*12-128($inp),%ymm12, %ymm0 1693 vpor %ymm1, %ymm5, %ymm5 1694 vpand 32*13-128($inp),%ymm13, %ymm1 1695 vpor %ymm2, %ymm4, %ymm4 1696 vpand 32*14-128($inp),%ymm14, %ymm2 1697 vpor %ymm3, %ymm5, %ymm5 1698 vpand 32*15-128($inp),%ymm15, %ymm3 1699 lea 32*16($inp), $inp 1700 vpor %ymm0, %ymm4, %ymm4 1701 vpor %ymm1, %ymm5, %ymm5 1702 vpor %ymm2, %ymm4, %ymm4 1703 vpor %ymm3, %ymm5, %ymm5 1704 1705 vpor %ymm5, %ymm4, %ymm4 1706 vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared 1707 vpor %xmm4, %xmm5, %xmm5 1708 vpermd %ymm5,%ymm7,%ymm5 1709 vmovdqu %ymm5,($out) 1710 lea 32($out),$out 1711 dec $power 1712 jnz .Loop_gather_1024 1713 1714 vpxor %ymm0,%ymm0,%ymm0 1715 vmovdqu %ymm0,($out) 1716 vzeroupper 1717___ 1718$code.=<<___ if ($win64); 1719 movaps -0xa8(%r11),%xmm6 1720 movaps -0x98(%r11),%xmm7 1721 movaps -0x88(%r11),%xmm8 1722 movaps -0x78(%r11),%xmm9 1723 movaps -0x68(%r11),%xmm10 1724 movaps -0x58(%r11),%xmm11 1725 movaps -0x48(%r11),%xmm12 1726 movaps -0x38(%r11),%xmm13 1727 movaps -0x28(%r11),%xmm14 1728 movaps -0x18(%r11),%xmm15 1729___ 1730$code.=<<___; 1731 lea (%r11),%rsp 1732.cfi_def_cfa_register %rsp 1733 ret 1734.cfi_endproc 1735.LSEH_end_rsaz_1024_gather5: 1736.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 1737___ 1738} 1739 1740$code.=<<___; 1741.align 64 1742.Land_mask: 1743 .quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff 1744.Lscatter_permd: 1745 .long 0,2,4,6,7,7,7,7 1746.Lgather_permd: 1747 .long 0,7,1,7,2,7,3,7 1748.Linc: 1749 .long 0,0,0,0, 1,1,1,1 1750 .long 2,2,2,2, 3,3,3,3 1751 .long 4,4,4,4, 4,4,4,4 1752.align 64 1753___ 1754 1755if ($win64) { 1756$rec="%rcx"; 1757$frame="%rdx"; 1758$context="%r8"; 1759$disp="%r9"; 1760 1761$code.=<<___ 1762.extern __imp_RtlVirtualUnwind 1763.type rsaz_se_handler,\@abi-omnipotent 1764.align 16 1765rsaz_se_handler: 1766 push %rsi 1767 push %rdi 1768 push %rbx 1769 push %rbp 1770 push %r12 1771 push %r13 1772 push %r14 1773 push %r15 1774 pushfq 1775 sub \$64,%rsp 1776 1777 mov 120($context),%rax # pull context->Rax 1778 mov 248($context),%rbx # pull context->Rip 1779 1780 mov 8($disp),%rsi # disp->ImageBase 1781 mov 56($disp),%r11 # disp->HandlerData 1782 1783 mov 0(%r11),%r10d # HandlerData[0] 1784 lea (%rsi,%r10),%r10 # prologue label 1785 cmp %r10,%rbx # context->Rip<prologue label 1786 jb .Lcommon_seh_tail 1787 1788 mov 4(%r11),%r10d # HandlerData[1] 1789 lea (%rsi,%r10),%r10 # epilogue label 1790 cmp %r10,%rbx # context->Rip>=epilogue label 1791 jae .Lcommon_seh_tail 1792 1793 mov 160($context),%rbp # pull context->Rbp 1794 1795 mov 8(%r11),%r10d # HandlerData[2] 1796 lea (%rsi,%r10),%r10 # "in tail" label 1797 cmp %r10,%rbx # 
context->Rip>="in tail" label 1798 cmovc %rbp,%rax 1799 1800 mov -48(%rax),%r15 1801 mov -40(%rax),%r14 1802 mov -32(%rax),%r13 1803 mov -24(%rax),%r12 1804 mov -16(%rax),%rbp 1805 mov -8(%rax),%rbx 1806 mov %r15,240($context) 1807 mov %r14,232($context) 1808 mov %r13,224($context) 1809 mov %r12,216($context) 1810 mov %rbp,160($context) 1811 mov %rbx,144($context) 1812 1813 lea -0xd8(%rax),%rsi # %xmm save area 1814 lea 512($context),%rdi # & context.Xmm6 1815 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 1816 .long 0xa548f3fc # cld; rep movsq 1817 1818.Lcommon_seh_tail: 1819 mov 8(%rax),%rdi 1820 mov 16(%rax),%rsi 1821 mov %rax,152($context) # restore context->Rsp 1822 mov %rsi,168($context) # restore context->Rsi 1823 mov %rdi,176($context) # restore context->Rdi 1824 1825 mov 40($disp),%rdi # disp->ContextRecord 1826 mov $context,%rsi # context 1827 mov \$154,%ecx # sizeof(CONTEXT) 1828 .long 0xa548f3fc # cld; rep movsq 1829 1830 mov $disp,%rsi 1831 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1832 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1833 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1834 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1835 mov 40(%rsi),%r10 # disp->ContextRecord 1836 lea 56(%rsi),%r11 # &disp->HandlerData 1837 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1838 mov %r10,32(%rsp) # arg5 1839 mov %r11,40(%rsp) # arg6 1840 mov %r12,48(%rsp) # arg7 1841 mov %rcx,56(%rsp) # arg8, (NULL) 1842 call *__imp_RtlVirtualUnwind(%rip) 1843 1844 mov \$1,%eax # ExceptionContinueSearch 1845 add \$64,%rsp 1846 popfq 1847 pop %r15 1848 pop %r14 1849 pop %r13 1850 pop %r12 1851 pop %rbp 1852 pop %rbx 1853 pop %rdi 1854 pop %rsi 1855 ret 1856.size rsaz_se_handler,.-rsaz_se_handler 1857 1858.section .pdata 1859.align 4 1860 .rva .LSEH_begin_rsaz_1024_sqr_avx2 1861 .rva .LSEH_end_rsaz_1024_sqr_avx2 1862 .rva .LSEH_info_rsaz_1024_sqr_avx2 1863 1864 .rva .LSEH_begin_rsaz_1024_mul_avx2 1865 .rva .LSEH_end_rsaz_1024_mul_avx2 1866 .rva .LSEH_info_rsaz_1024_mul_avx2 1867 1868 .rva .LSEH_begin_rsaz_1024_gather5 1869 .rva .LSEH_end_rsaz_1024_gather5 1870 .rva .LSEH_info_rsaz_1024_gather5 1871.section .xdata 1872.align 8 1873.LSEH_info_rsaz_1024_sqr_avx2: 1874 .byte 9,0,0,0 1875 .rva rsaz_se_handler 1876 .rva .Lsqr_1024_body,.Lsqr_1024_epilogue,.Lsqr_1024_in_tail 1877 .long 0 1878.LSEH_info_rsaz_1024_mul_avx2: 1879 .byte 9,0,0,0 1880 .rva rsaz_se_handler 1881 .rva .Lmul_1024_body,.Lmul_1024_epilogue,.Lmul_1024_in_tail 1882 .long 0 1883.LSEH_info_rsaz_1024_gather5: 1884 .byte 0x01,0x36,0x17,0x0b 1885 .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 1886 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 1887 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 1888 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 1889 .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 1890 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 1891 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 1892 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 1893 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 1894 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 1895 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 1896 .byte 0x00,0xb3,0x00,0x00 # set_frame r11 1897___ 1898} 1899 1900foreach (split("\n",$code)) { 1901 s/\`([^\`]*)\`/eval($1)/ge; 1902 1903 s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or 1904 1905 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1906 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or 1907 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1908 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1909 
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
	print $_,"\n";
}

}}} else {{{
print <<___;	# assembler is too old
.text

.globl	rsaz_avx2_eligible
.type	rsaz_avx2_eligible,\@abi-omnipotent
rsaz_avx2_eligible:
	xor	%eax,%eax
	ret
.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible

.globl	rsaz_1024_sqr_avx2
.globl	rsaz_1024_mul_avx2
.globl	rsaz_1024_norm2red_avx2
.globl	rsaz_1024_red2norm_avx2
.globl	rsaz_1024_scatter5_avx2
.globl	rsaz_1024_gather5_avx2
.type	rsaz_1024_sqr_avx2,\@abi-omnipotent
rsaz_1024_sqr_avx2:
rsaz_1024_mul_avx2:
rsaz_1024_norm2red_avx2:
rsaz_1024_red2norm_avx2:
rsaz_1024_scatter5_avx2:
rsaz_1024_gather5_avx2:
	.byte	0x0f,0x0b	# ud2
	ret
.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
___
}}}

close STDOUT;