#!/usr/bin/env perl

##############################################################################
#                                                                            #
#  Copyright (c) 2012, Intel Corporation                                     #
#                                                                            #
#  All rights reserved.                                                      #
#                                                                            #
#  Redistribution and use in source and binary forms, with or without        #
#  modification, are permitted provided that the following conditions are    #
#  met:                                                                      #
#                                                                            #
#  *  Redistributions of source code must retain the above copyright         #
#     notice, this list of conditions and the following disclaimer.          #
#                                                                            #
#  *  Redistributions in binary form must reproduce the above copyright      #
#     notice, this list of conditions and the following disclaimer in the    #
#     documentation and/or other materials provided with the                 #
#     distribution.                                                          #
#                                                                            #
#  *  Neither the name of the Intel Corporation nor the names of its         #
#     contributors may be used to endorse or promote products derived from   #
#     this software without specific prior written permission.               #
#                                                                            #
#                                                                            #
#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          #
#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
#                                                                            #
##############################################################################
# Developers and authors:                                                    #
# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
# (1) Intel Corporation, Israel Development Center, Haifa, Israel            #
# (2) University of Haifa, Israel                                            #
##############################################################################
# Reference:                                                                 #
# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular             #
#     Exponentiation, Using Advanced Vector Instructions Architectures",     #
#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,   #
#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012              #
# [2] S. Gueron: "Efficient Software Implementations of Modular              #
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).  #
# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE        #
#     Proceedings of 9th International Conference on Information Technology: #
#     New Generations (ITNG 2012), pp.821-823 (2012)                         #
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
Krasnov: "[PATCH] Efficient and side channel analysis # 55# resistant 1024-bit modular exponentiation, for optimizing RSA2048 # 56# on AVX2 capable x86_64 platforms", # 57# http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest# 58############################################################################## 59# 60# +13% improvement over original submission by <appro@openssl.org> 61# 62# rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this 63# 2.3GHz Haswell 621 765/+23% 1113/+79% 64# 2.3GHz Broadwell(**) 688 1200(***)/+74% 1120/+63% 65# 66# (*) if system doesn't support AVX2, for reference purposes; 67# (**) scaled to 2.3GHz to simplify comparison; 68# (***) scalar AD*X code is faster than AVX2 and is preferred code 69# path for Broadwell; 70 71$flavour = shift; 72$output = shift; 73if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 74 75$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 76 77$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 78( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 79( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 80die "can't locate x86_64-xlate.pl"; 81 82# In upstream, this is controlled by shelling out to the compiler to check 83# versions, but BoringSSL is intended to be used with pre-generated perlasm 84# output, so this isn't useful anyway. 85# 86# TODO(davidben): Enable these after testing. $avx goes up to 2 and $addx to 1. 87$avx = 0; 88$addx = 0; 89 90open OUT,"| \"$^X\" $xlate $flavour $output"; 91*STDOUT = *OUT; 92 93if ($avx>1) {{{ 94{ # void AMS_WW( 95my $rp="%rdi"; # BN_ULONG *rp, 96my $ap="%rsi"; # const BN_ULONG *ap, 97my $np="%rdx"; # const BN_ULONG *np, 98my $n0="%ecx"; # const BN_ULONG n0, 99my $rep="%r8d"; # int repeat); 100 101# The registers that hold the accumulated redundant result 102# The AMM works on 1024 bit operands, and redundant word size is 29 103# Therefore: ceil(1024/29)/4 = 9 104my $ACC0="%ymm0"; 105my $ACC1="%ymm1"; 106my $ACC2="%ymm2"; 107my $ACC3="%ymm3"; 108my $ACC4="%ymm4"; 109my $ACC5="%ymm5"; 110my $ACC6="%ymm6"; 111my $ACC7="%ymm7"; 112my $ACC8="%ymm8"; 113my $ACC9="%ymm9"; 114# Registers that hold the broadcasted words of bp, currently used 115my $B1="%ymm10"; 116my $B2="%ymm11"; 117# Registers that hold the broadcasted words of Y, currently used 118my $Y1="%ymm12"; 119my $Y2="%ymm13"; 120# Helper registers 121my $TEMP1="%ymm14"; 122my $AND_MASK="%ymm15"; 123# alu registers that hold the first words of the ACC 124my $r0="%r9"; 125my $r1="%r10"; 126my $r2="%r11"; 127my $r3="%r12"; 128 129my $i="%r14d"; # loop counter 130my $tmp = "%r15"; 131 132my $FrameSize=32*18+32*8; # place for A^2 and 2*A 133 134my $aap=$r0; 135my $tp0="%rbx"; 136my $tp1=$r3; 137my $tpa=$tmp; 138 139$np="%r13"; # reassigned argument 140 141$code.=<<___; 142.text 143 144.globl rsaz_1024_sqr_avx2 145.type rsaz_1024_sqr_avx2,\@function,5 146.align 64 147rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2 148 lea (%rsp), %rax 149 push %rbx 150 push %rbp 151 push %r12 152 push %r13 153 push %r14 154 push %r15 155 vzeroupper 156___ 157$code.=<<___ if ($win64); 158 lea -0xa8(%rsp),%rsp 159 vmovaps %xmm6,-0xd8(%rax) 160 vmovaps %xmm7,-0xc8(%rax) 161 vmovaps %xmm8,-0xb8(%rax) 162 vmovaps %xmm9,-0xa8(%rax) 163 vmovaps %xmm10,-0x98(%rax) 164 vmovaps %xmm11,-0x88(%rax) 165 vmovaps %xmm12,-0x78(%rax) 166 vmovaps %xmm13,-0x68(%rax) 167 vmovaps %xmm14,-0x58(%rax) 168 vmovaps %xmm15,-0x48(%rax) 169.Lsqr_1024_body: 170___ 171$code.=<<___; 172 mov %rax,%rbp 173 mov %rdx, $np # reassigned 
argument 174 sub \$$FrameSize, %rsp 175 mov $np, $tmp 176 sub \$-128, $rp # size optimization 177 sub \$-128, $ap 178 sub \$-128, $np 179 180 and \$4095, $tmp # see if $np crosses page 181 add \$32*10, $tmp 182 shr \$12, $tmp 183 vpxor $ACC9,$ACC9,$ACC9 184 jz .Lsqr_1024_no_n_copy 185 186 # unaligned 256-bit load that crosses page boundary can 187 # cause >2x performance degradation here, so if $np does 188 # cross page boundary, copy it to stack and make sure stack 189 # frame doesn't... 190 sub \$32*10,%rsp 191 vmovdqu 32*0-128($np), $ACC0 192 and \$-2048, %rsp 193 vmovdqu 32*1-128($np), $ACC1 194 vmovdqu 32*2-128($np), $ACC2 195 vmovdqu 32*3-128($np), $ACC3 196 vmovdqu 32*4-128($np), $ACC4 197 vmovdqu 32*5-128($np), $ACC5 198 vmovdqu 32*6-128($np), $ACC6 199 vmovdqu 32*7-128($np), $ACC7 200 vmovdqu 32*8-128($np), $ACC8 201 lea $FrameSize+128(%rsp),$np 202 vmovdqu $ACC0, 32*0-128($np) 203 vmovdqu $ACC1, 32*1-128($np) 204 vmovdqu $ACC2, 32*2-128($np) 205 vmovdqu $ACC3, 32*3-128($np) 206 vmovdqu $ACC4, 32*4-128($np) 207 vmovdqu $ACC5, 32*5-128($np) 208 vmovdqu $ACC6, 32*6-128($np) 209 vmovdqu $ACC7, 32*7-128($np) 210 vmovdqu $ACC8, 32*8-128($np) 211 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero 212 213.Lsqr_1024_no_n_copy: 214 and \$-1024, %rsp 215 216 vmovdqu 32*1-128($ap), $ACC1 217 vmovdqu 32*2-128($ap), $ACC2 218 vmovdqu 32*3-128($ap), $ACC3 219 vmovdqu 32*4-128($ap), $ACC4 220 vmovdqu 32*5-128($ap), $ACC5 221 vmovdqu 32*6-128($ap), $ACC6 222 vmovdqu 32*7-128($ap), $ACC7 223 vmovdqu 32*8-128($ap), $ACC8 224 225 lea 192(%rsp), $tp0 # 64+128=192 226 vpbroadcastq .Land_mask(%rip), $AND_MASK 227 jmp .LOOP_GRANDE_SQR_1024 228 229.align 32 230.LOOP_GRANDE_SQR_1024: 231 lea 32*18+128(%rsp), $aap # size optimization 232 lea 448(%rsp), $tp1 # 64+128+256=448 233 234 # the squaring is performed as described in Variant B of 235 # "Speeding up Big-Number Squaring", so start by calculating 236 # the A*2=A+A vector 237 vpaddq $ACC1, $ACC1, $ACC1 238 vpbroadcastq 32*0-128($ap), $B1 239 vpaddq $ACC2, $ACC2, $ACC2 240 vmovdqa $ACC1, 32*0-128($aap) 241 vpaddq $ACC3, $ACC3, $ACC3 242 vmovdqa $ACC2, 32*1-128($aap) 243 vpaddq $ACC4, $ACC4, $ACC4 244 vmovdqa $ACC3, 32*2-128($aap) 245 vpaddq $ACC5, $ACC5, $ACC5 246 vmovdqa $ACC4, 32*3-128($aap) 247 vpaddq $ACC6, $ACC6, $ACC6 248 vmovdqa $ACC5, 32*4-128($aap) 249 vpaddq $ACC7, $ACC7, $ACC7 250 vmovdqa $ACC6, 32*5-128($aap) 251 vpaddq $ACC8, $ACC8, $ACC8 252 vmovdqa $ACC7, 32*6-128($aap) 253 vpxor $ACC9, $ACC9, $ACC9 254 vmovdqa $ACC8, 32*7-128($aap) 255 256 vpmuludq 32*0-128($ap), $B1, $ACC0 257 vpbroadcastq 32*1-128($ap), $B2 258 vmovdqu $ACC9, 32*9-192($tp0) # zero upper half 259 vpmuludq $B1, $ACC1, $ACC1 260 vmovdqu $ACC9, 32*10-448($tp1) 261 vpmuludq $B1, $ACC2, $ACC2 262 vmovdqu $ACC9, 32*11-448($tp1) 263 vpmuludq $B1, $ACC3, $ACC3 264 vmovdqu $ACC9, 32*12-448($tp1) 265 vpmuludq $B1, $ACC4, $ACC4 266 vmovdqu $ACC9, 32*13-448($tp1) 267 vpmuludq $B1, $ACC5, $ACC5 268 vmovdqu $ACC9, 32*14-448($tp1) 269 vpmuludq $B1, $ACC6, $ACC6 270 vmovdqu $ACC9, 32*15-448($tp1) 271 vpmuludq $B1, $ACC7, $ACC7 272 vmovdqu $ACC9, 32*16-448($tp1) 273 vpmuludq $B1, $ACC8, $ACC8 274 vpbroadcastq 32*2-128($ap), $B1 275 vmovdqu $ACC9, 32*17-448($tp1) 276 277 mov $ap, $tpa 278 mov \$4, $i 279 jmp .Lsqr_entry_1024 280___ 281$TEMP0=$Y1; 282$TEMP2=$Y2; 283$code.=<<___; 284.align 32 285.LOOP_SQR_1024: 286 vpbroadcastq 32*1-128($tpa), $B2 287 vpmuludq 32*0-128($ap), $B1, $ACC0 288 vpaddq 32*0-192($tp0), $ACC0, $ACC0 289 vpmuludq 32*0-128($aap), $B1, $ACC1 290 vpaddq 32*1-192($tp0), 
$ACC1, $ACC1 291 vpmuludq 32*1-128($aap), $B1, $ACC2 292 vpaddq 32*2-192($tp0), $ACC2, $ACC2 293 vpmuludq 32*2-128($aap), $B1, $ACC3 294 vpaddq 32*3-192($tp0), $ACC3, $ACC3 295 vpmuludq 32*3-128($aap), $B1, $ACC4 296 vpaddq 32*4-192($tp0), $ACC4, $ACC4 297 vpmuludq 32*4-128($aap), $B1, $ACC5 298 vpaddq 32*5-192($tp0), $ACC5, $ACC5 299 vpmuludq 32*5-128($aap), $B1, $ACC6 300 vpaddq 32*6-192($tp0), $ACC6, $ACC6 301 vpmuludq 32*6-128($aap), $B1, $ACC7 302 vpaddq 32*7-192($tp0), $ACC7, $ACC7 303 vpmuludq 32*7-128($aap), $B1, $ACC8 304 vpbroadcastq 32*2-128($tpa), $B1 305 vpaddq 32*8-192($tp0), $ACC8, $ACC8 306.Lsqr_entry_1024: 307 vmovdqu $ACC0, 32*0-192($tp0) 308 vmovdqu $ACC1, 32*1-192($tp0) 309 310 vpmuludq 32*1-128($ap), $B2, $TEMP0 311 vpaddq $TEMP0, $ACC2, $ACC2 312 vpmuludq 32*1-128($aap), $B2, $TEMP1 313 vpaddq $TEMP1, $ACC3, $ACC3 314 vpmuludq 32*2-128($aap), $B2, $TEMP2 315 vpaddq $TEMP2, $ACC4, $ACC4 316 vpmuludq 32*3-128($aap), $B2, $TEMP0 317 vpaddq $TEMP0, $ACC5, $ACC5 318 vpmuludq 32*4-128($aap), $B2, $TEMP1 319 vpaddq $TEMP1, $ACC6, $ACC6 320 vpmuludq 32*5-128($aap), $B2, $TEMP2 321 vpaddq $TEMP2, $ACC7, $ACC7 322 vpmuludq 32*6-128($aap), $B2, $TEMP0 323 vpaddq $TEMP0, $ACC8, $ACC8 324 vpmuludq 32*7-128($aap), $B2, $ACC0 325 vpbroadcastq 32*3-128($tpa), $B2 326 vpaddq 32*9-192($tp0), $ACC0, $ACC0 327 328 vmovdqu $ACC2, 32*2-192($tp0) 329 vmovdqu $ACC3, 32*3-192($tp0) 330 331 vpmuludq 32*2-128($ap), $B1, $TEMP2 332 vpaddq $TEMP2, $ACC4, $ACC4 333 vpmuludq 32*2-128($aap), $B1, $TEMP0 334 vpaddq $TEMP0, $ACC5, $ACC5 335 vpmuludq 32*3-128($aap), $B1, $TEMP1 336 vpaddq $TEMP1, $ACC6, $ACC6 337 vpmuludq 32*4-128($aap), $B1, $TEMP2 338 vpaddq $TEMP2, $ACC7, $ACC7 339 vpmuludq 32*5-128($aap), $B1, $TEMP0 340 vpaddq $TEMP0, $ACC8, $ACC8 341 vpmuludq 32*6-128($aap), $B1, $TEMP1 342 vpaddq $TEMP1, $ACC0, $ACC0 343 vpmuludq 32*7-128($aap), $B1, $ACC1 344 vpbroadcastq 32*4-128($tpa), $B1 345 vpaddq 32*10-448($tp1), $ACC1, $ACC1 346 347 vmovdqu $ACC4, 32*4-192($tp0) 348 vmovdqu $ACC5, 32*5-192($tp0) 349 350 vpmuludq 32*3-128($ap), $B2, $TEMP0 351 vpaddq $TEMP0, $ACC6, $ACC6 352 vpmuludq 32*3-128($aap), $B2, $TEMP1 353 vpaddq $TEMP1, $ACC7, $ACC7 354 vpmuludq 32*4-128($aap), $B2, $TEMP2 355 vpaddq $TEMP2, $ACC8, $ACC8 356 vpmuludq 32*5-128($aap), $B2, $TEMP0 357 vpaddq $TEMP0, $ACC0, $ACC0 358 vpmuludq 32*6-128($aap), $B2, $TEMP1 359 vpaddq $TEMP1, $ACC1, $ACC1 360 vpmuludq 32*7-128($aap), $B2, $ACC2 361 vpbroadcastq 32*5-128($tpa), $B2 362 vpaddq 32*11-448($tp1), $ACC2, $ACC2 363 364 vmovdqu $ACC6, 32*6-192($tp0) 365 vmovdqu $ACC7, 32*7-192($tp0) 366 367 vpmuludq 32*4-128($ap), $B1, $TEMP0 368 vpaddq $TEMP0, $ACC8, $ACC8 369 vpmuludq 32*4-128($aap), $B1, $TEMP1 370 vpaddq $TEMP1, $ACC0, $ACC0 371 vpmuludq 32*5-128($aap), $B1, $TEMP2 372 vpaddq $TEMP2, $ACC1, $ACC1 373 vpmuludq 32*6-128($aap), $B1, $TEMP0 374 vpaddq $TEMP0, $ACC2, $ACC2 375 vpmuludq 32*7-128($aap), $B1, $ACC3 376 vpbroadcastq 32*6-128($tpa), $B1 377 vpaddq 32*12-448($tp1), $ACC3, $ACC3 378 379 vmovdqu $ACC8, 32*8-192($tp0) 380 vmovdqu $ACC0, 32*9-192($tp0) 381 lea 8($tp0), $tp0 382 383 vpmuludq 32*5-128($ap), $B2, $TEMP2 384 vpaddq $TEMP2, $ACC1, $ACC1 385 vpmuludq 32*5-128($aap), $B2, $TEMP0 386 vpaddq $TEMP0, $ACC2, $ACC2 387 vpmuludq 32*6-128($aap), $B2, $TEMP1 388 vpaddq $TEMP1, $ACC3, $ACC3 389 vpmuludq 32*7-128($aap), $B2, $ACC4 390 vpbroadcastq 32*7-128($tpa), $B2 391 vpaddq 32*13-448($tp1), $ACC4, $ACC4 392 393 vmovdqu $ACC1, 32*10-448($tp1) 394 vmovdqu $ACC2, 32*11-448($tp1) 395 396 vpmuludq 32*6-128($ap), $B1, 
$TEMP0 397 vpaddq $TEMP0, $ACC3, $ACC3 398 vpmuludq 32*6-128($aap), $B1, $TEMP1 399 vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1 400 vpaddq $TEMP1, $ACC4, $ACC4 401 vpmuludq 32*7-128($aap), $B1, $ACC5 402 vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration 403 vpaddq 32*14-448($tp1), $ACC5, $ACC5 404 405 vmovdqu $ACC3, 32*12-448($tp1) 406 vmovdqu $ACC4, 32*13-448($tp1) 407 lea 8($tpa), $tpa 408 409 vpmuludq 32*7-128($ap), $B2, $TEMP0 410 vpaddq $TEMP0, $ACC5, $ACC5 411 vpmuludq 32*7-128($aap), $B2, $ACC6 412 vpaddq 32*15-448($tp1), $ACC6, $ACC6 413 414 vpmuludq 32*8-128($ap), $ACC0, $ACC7 415 vmovdqu $ACC5, 32*14-448($tp1) 416 vpaddq 32*16-448($tp1), $ACC7, $ACC7 417 vmovdqu $ACC6, 32*15-448($tp1) 418 vmovdqu $ACC7, 32*16-448($tp1) 419 lea 8($tp1), $tp1 420 421 dec $i 422 jnz .LOOP_SQR_1024 423___ 424$ZERO = $ACC9; 425$TEMP0 = $B1; 426$TEMP2 = $B2; 427$TEMP3 = $Y1; 428$TEMP4 = $Y2; 429$code.=<<___; 430 #we need to fix indexes 32-39 to avoid overflow 431 vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0), 432 vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0) 433 vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0) 434 lea 192(%rsp), $tp0 # 64+128=192 435 436 vpsrlq \$29, $ACC8, $TEMP1 437 vpand $AND_MASK, $ACC8, $ACC8 438 vpsrlq \$29, $ACC1, $TEMP2 439 vpand $AND_MASK, $ACC1, $ACC1 440 441 vpermq \$0x93, $TEMP1, $TEMP1 442 vpxor $ZERO, $ZERO, $ZERO 443 vpermq \$0x93, $TEMP2, $TEMP2 444 445 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 446 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 447 vpaddq $TEMP0, $ACC8, $ACC8 448 vpblendd \$3, $TEMP2, $ZERO, $TEMP2 449 vpaddq $TEMP1, $ACC1, $ACC1 450 vpaddq $TEMP2, $ACC2, $ACC2 451 vmovdqu $ACC1, 32*9-192($tp0) 452 vmovdqu $ACC2, 32*10-192($tp0) 453 454 mov (%rsp), %rax 455 mov 8(%rsp), $r1 456 mov 16(%rsp), $r2 457 mov 24(%rsp), $r3 458 vmovdqu 32*1(%rsp), $ACC1 459 vmovdqu 32*2-192($tp0), $ACC2 460 vmovdqu 32*3-192($tp0), $ACC3 461 vmovdqu 32*4-192($tp0), $ACC4 462 vmovdqu 32*5-192($tp0), $ACC5 463 vmovdqu 32*6-192($tp0), $ACC6 464 vmovdqu 32*7-192($tp0), $ACC7 465 466 mov %rax, $r0 467 imull $n0, %eax 468 and \$0x1fffffff, %eax 469 vmovd %eax, $Y1 470 471 mov %rax, %rdx 472 imulq -128($np), %rax 473 vpbroadcastq $Y1, $Y1 474 add %rax, $r0 475 mov %rdx, %rax 476 imulq 8-128($np), %rax 477 shr \$29, $r0 478 add %rax, $r1 479 mov %rdx, %rax 480 imulq 16-128($np), %rax 481 add $r0, $r1 482 add %rax, $r2 483 imulq 24-128($np), %rdx 484 add %rdx, $r3 485 486 mov $r1, %rax 487 imull $n0, %eax 488 and \$0x1fffffff, %eax 489 490 mov \$9, $i 491 jmp .LOOP_REDUCE_1024 492 493.align 32 494.LOOP_REDUCE_1024: 495 vmovd %eax, $Y2 496 vpbroadcastq $Y2, $Y2 497 498 vpmuludq 32*1-128($np), $Y1, $TEMP0 499 mov %rax, %rdx 500 imulq -128($np), %rax 501 vpaddq $TEMP0, $ACC1, $ACC1 502 add %rax, $r1 503 vpmuludq 32*2-128($np), $Y1, $TEMP1 504 mov %rdx, %rax 505 imulq 8-128($np), %rax 506 vpaddq $TEMP1, $ACC2, $ACC2 507 vpmuludq 32*3-128($np), $Y1, $TEMP2 508 .byte 0x67 509 add %rax, $r2 510 .byte 0x67 511 mov %rdx, %rax 512 imulq 16-128($np), %rax 513 shr \$29, $r1 514 vpaddq $TEMP2, $ACC3, $ACC3 515 vpmuludq 32*4-128($np), $Y1, $TEMP0 516 add %rax, $r3 517 add $r1, $r2 518 vpaddq $TEMP0, $ACC4, $ACC4 519 vpmuludq 32*5-128($np), $Y1, $TEMP1 520 mov $r2, %rax 521 imull $n0, %eax 522 vpaddq $TEMP1, $ACC5, $ACC5 523 vpmuludq 32*6-128($np), $Y1, $TEMP2 524 and \$0x1fffffff, %eax 525 vpaddq $TEMP2, $ACC6, $ACC6 526 vpmuludq 32*7-128($np), $Y1, $TEMP0 527 vpaddq $TEMP0, $ACC7, $ACC7 528 vpmuludq 32*8-128($np), $Y1, $TEMP1 529 vmovd %eax, $Y1 530 #vmovdqu 32*1-8-128($np), $TEMP2 # moved below 531 
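	# In outline: each of the nine passes of .LOOP_REDUCE_1024 retires
	# four 29-bit digits by word-by-word Montgomery reduction in the
	# redundant domain.  The scalar imul/add chain keeps the least
	# significant accumulator words exact in r9-r12 so that
	# y = (low digit * n0) mod 2^29 can be formed, while the
	# vpmuludq/vpaddq chains add y*n across the whole vector, making the
	# lowest digit divisible by 2^29.  Two y values (Y1 and Y2) are kept
	# in flight to hide multiplier latency.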
vpaddq $TEMP1, $ACC8, $ACC8 532 #vmovdqu 32*2-8-128($np), $TEMP0 # moved below 533 vpbroadcastq $Y1, $Y1 534 535 vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above 536 vmovdqu 32*3-8-128($np), $TEMP1 537 mov %rax, %rdx 538 imulq -128($np), %rax 539 vpaddq $TEMP2, $ACC1, $ACC1 540 vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above 541 vmovdqu 32*4-8-128($np), $TEMP2 542 add %rax, $r2 543 mov %rdx, %rax 544 imulq 8-128($np), %rax 545 vpaddq $TEMP0, $ACC2, $ACC2 546 add $r3, %rax 547 shr \$29, $r2 548 vpmuludq $Y2, $TEMP1, $TEMP1 549 vmovdqu 32*5-8-128($np), $TEMP0 550 add $r2, %rax 551 vpaddq $TEMP1, $ACC3, $ACC3 552 vpmuludq $Y2, $TEMP2, $TEMP2 553 vmovdqu 32*6-8-128($np), $TEMP1 554 .byte 0x67 555 mov %rax, $r3 556 imull $n0, %eax 557 vpaddq $TEMP2, $ACC4, $ACC4 558 vpmuludq $Y2, $TEMP0, $TEMP0 559 .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2 560 and \$0x1fffffff, %eax 561 vpaddq $TEMP0, $ACC5, $ACC5 562 vpmuludq $Y2, $TEMP1, $TEMP1 563 vmovdqu 32*8-8-128($np), $TEMP0 564 vpaddq $TEMP1, $ACC6, $ACC6 565 vpmuludq $Y2, $TEMP2, $TEMP2 566 vmovdqu 32*9-8-128($np), $ACC9 567 vmovd %eax, $ACC0 # borrow ACC0 for Y2 568 imulq -128($np), %rax 569 vpaddq $TEMP2, $ACC7, $ACC7 570 vpmuludq $Y2, $TEMP0, $TEMP0 571 vmovdqu 32*1-16-128($np), $TEMP1 572 vpbroadcastq $ACC0, $ACC0 573 vpaddq $TEMP0, $ACC8, $ACC8 574 vpmuludq $Y2, $ACC9, $ACC9 575 vmovdqu 32*2-16-128($np), $TEMP2 576 add %rax, $r3 577 578___ 579($ACC0,$Y2)=($Y2,$ACC0); 580$code.=<<___; 581 vmovdqu 32*1-24-128($np), $ACC0 582 vpmuludq $Y1, $TEMP1, $TEMP1 583 vmovdqu 32*3-16-128($np), $TEMP0 584 vpaddq $TEMP1, $ACC1, $ACC1 585 vpmuludq $Y2, $ACC0, $ACC0 586 vpmuludq $Y1, $TEMP2, $TEMP2 587 .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1 588 vpaddq $ACC1, $ACC0, $ACC0 589 vpaddq $TEMP2, $ACC2, $ACC2 590 vpmuludq $Y1, $TEMP0, $TEMP0 591 vmovdqu 32*5-16-128($np), $TEMP2 592 .byte 0x67 593 vmovq $ACC0, %rax 594 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 595 vpaddq $TEMP0, $ACC3, $ACC3 596 vpmuludq $Y1, $TEMP1, $TEMP1 597 vmovdqu 32*6-16-128($np), $TEMP0 598 vpaddq $TEMP1, $ACC4, $ACC4 599 vpmuludq $Y1, $TEMP2, $TEMP2 600 vmovdqu 32*7-16-128($np), $TEMP1 601 vpaddq $TEMP2, $ACC5, $ACC5 602 vpmuludq $Y1, $TEMP0, $TEMP0 603 vmovdqu 32*8-16-128($np), $TEMP2 604 vpaddq $TEMP0, $ACC6, $ACC6 605 vpmuludq $Y1, $TEMP1, $TEMP1 606 shr \$29, $r3 607 vmovdqu 32*9-16-128($np), $TEMP0 608 add $r3, %rax 609 vpaddq $TEMP1, $ACC7, $ACC7 610 vpmuludq $Y1, $TEMP2, $TEMP2 611 #vmovdqu 32*2-24-128($np), $TEMP1 # moved below 612 mov %rax, $r0 613 imull $n0, %eax 614 vpaddq $TEMP2, $ACC8, $ACC8 615 vpmuludq $Y1, $TEMP0, $TEMP0 616 and \$0x1fffffff, %eax 617 vmovd %eax, $Y1 618 vmovdqu 32*3-24-128($np), $TEMP2 619 .byte 0x67 620 vpaddq $TEMP0, $ACC9, $ACC9 621 vpbroadcastq $Y1, $Y1 622 623 vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above 624 vmovdqu 32*4-24-128($np), $TEMP0 625 mov %rax, %rdx 626 imulq -128($np), %rax 627 mov 8(%rsp), $r1 628 vpaddq $TEMP1, $ACC2, $ACC1 629 vpmuludq $Y2, $TEMP2, $TEMP2 630 vmovdqu 32*5-24-128($np), $TEMP1 631 add %rax, $r0 632 mov %rdx, %rax 633 imulq 8-128($np), %rax 634 .byte 0x67 635 shr \$29, $r0 636 mov 16(%rsp), $r2 637 vpaddq $TEMP2, $ACC3, $ACC2 638 vpmuludq $Y2, $TEMP0, $TEMP0 639 vmovdqu 32*6-24-128($np), $TEMP2 640 add %rax, $r1 641 mov %rdx, %rax 642 imulq 16-128($np), %rax 643 vpaddq $TEMP0, $ACC4, $ACC3 644 vpmuludq $Y2, $TEMP1, $TEMP1 645 vmovdqu 32*7-24-128($np), $TEMP0 646 imulq 24-128($np), %rdx # future $r3 647 add %rax, $r2 648 lea ($r0,$r1), 
%rax 649 vpaddq $TEMP1, $ACC5, $ACC4 650 vpmuludq $Y2, $TEMP2, $TEMP2 651 vmovdqu 32*8-24-128($np), $TEMP1 652 mov %rax, $r1 653 imull $n0, %eax 654 vpmuludq $Y2, $TEMP0, $TEMP0 655 vpaddq $TEMP2, $ACC6, $ACC5 656 vmovdqu 32*9-24-128($np), $TEMP2 657 and \$0x1fffffff, %eax 658 vpaddq $TEMP0, $ACC7, $ACC6 659 vpmuludq $Y2, $TEMP1, $TEMP1 660 add 24(%rsp), %rdx 661 vpaddq $TEMP1, $ACC8, $ACC7 662 vpmuludq $Y2, $TEMP2, $TEMP2 663 vpaddq $TEMP2, $ACC9, $ACC8 664 vmovq $r3, $ACC9 665 mov %rdx, $r3 666 667 dec $i 668 jnz .LOOP_REDUCE_1024 669___ 670($ACC0,$Y2)=($Y2,$ACC0); 671$code.=<<___; 672 lea 448(%rsp), $tp1 # size optimization 673 vpaddq $ACC9, $Y2, $ACC0 674 vpxor $ZERO, $ZERO, $ZERO 675 676 vpaddq 32*9-192($tp0), $ACC0, $ACC0 677 vpaddq 32*10-448($tp1), $ACC1, $ACC1 678 vpaddq 32*11-448($tp1), $ACC2, $ACC2 679 vpaddq 32*12-448($tp1), $ACC3, $ACC3 680 vpaddq 32*13-448($tp1), $ACC4, $ACC4 681 vpaddq 32*14-448($tp1), $ACC5, $ACC5 682 vpaddq 32*15-448($tp1), $ACC6, $ACC6 683 vpaddq 32*16-448($tp1), $ACC7, $ACC7 684 vpaddq 32*17-448($tp1), $ACC8, $ACC8 685 686 vpsrlq \$29, $ACC0, $TEMP1 687 vpand $AND_MASK, $ACC0, $ACC0 688 vpsrlq \$29, $ACC1, $TEMP2 689 vpand $AND_MASK, $ACC1, $ACC1 690 vpsrlq \$29, $ACC2, $TEMP3 691 vpermq \$0x93, $TEMP1, $TEMP1 692 vpand $AND_MASK, $ACC2, $ACC2 693 vpsrlq \$29, $ACC3, $TEMP4 694 vpermq \$0x93, $TEMP2, $TEMP2 695 vpand $AND_MASK, $ACC3, $ACC3 696 vpermq \$0x93, $TEMP3, $TEMP3 697 698 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 699 vpermq \$0x93, $TEMP4, $TEMP4 700 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 701 vpaddq $TEMP0, $ACC0, $ACC0 702 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 703 vpaddq $TEMP1, $ACC1, $ACC1 704 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 705 vpaddq $TEMP2, $ACC2, $ACC2 706 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 707 vpaddq $TEMP3, $ACC3, $ACC3 708 vpaddq $TEMP4, $ACC4, $ACC4 709 710 vpsrlq \$29, $ACC0, $TEMP1 711 vpand $AND_MASK, $ACC0, $ACC0 712 vpsrlq \$29, $ACC1, $TEMP2 713 vpand $AND_MASK, $ACC1, $ACC1 714 vpsrlq \$29, $ACC2, $TEMP3 715 vpermq \$0x93, $TEMP1, $TEMP1 716 vpand $AND_MASK, $ACC2, $ACC2 717 vpsrlq \$29, $ACC3, $TEMP4 718 vpermq \$0x93, $TEMP2, $TEMP2 719 vpand $AND_MASK, $ACC3, $ACC3 720 vpermq \$0x93, $TEMP3, $TEMP3 721 722 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 723 vpermq \$0x93, $TEMP4, $TEMP4 724 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 725 vpaddq $TEMP0, $ACC0, $ACC0 726 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 727 vpaddq $TEMP1, $ACC1, $ACC1 728 vmovdqu $ACC0, 32*0-128($rp) 729 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 730 vpaddq $TEMP2, $ACC2, $ACC2 731 vmovdqu $ACC1, 32*1-128($rp) 732 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 733 vpaddq $TEMP3, $ACC3, $ACC3 734 vmovdqu $ACC2, 32*2-128($rp) 735 vpaddq $TEMP4, $ACC4, $ACC4 736 vmovdqu $ACC3, 32*3-128($rp) 737___ 738$TEMP5=$ACC0; 739$code.=<<___; 740 vpsrlq \$29, $ACC4, $TEMP1 741 vpand $AND_MASK, $ACC4, $ACC4 742 vpsrlq \$29, $ACC5, $TEMP2 743 vpand $AND_MASK, $ACC5, $ACC5 744 vpsrlq \$29, $ACC6, $TEMP3 745 vpermq \$0x93, $TEMP1, $TEMP1 746 vpand $AND_MASK, $ACC6, $ACC6 747 vpsrlq \$29, $ACC7, $TEMP4 748 vpermq \$0x93, $TEMP2, $TEMP2 749 vpand $AND_MASK, $ACC7, $ACC7 750 vpsrlq \$29, $ACC8, $TEMP5 751 vpermq \$0x93, $TEMP3, $TEMP3 752 vpand $AND_MASK, $ACC8, $ACC8 753 vpermq \$0x93, $TEMP4, $TEMP4 754 755 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 756 vpermq \$0x93, $TEMP5, $TEMP5 757 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 758 vpaddq $TEMP0, $ACC4, $ACC4 759 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 760 vpaddq $TEMP1, $ACC5, $ACC5 761 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 762 vpaddq $TEMP2, $ACC6, $ACC6 763 vpblendd \$3, 
$TEMP4, $TEMP5, $TEMP4 764 vpaddq $TEMP3, $ACC7, $ACC7 765 vpaddq $TEMP4, $ACC8, $ACC8 766 767 vpsrlq \$29, $ACC4, $TEMP1 768 vpand $AND_MASK, $ACC4, $ACC4 769 vpsrlq \$29, $ACC5, $TEMP2 770 vpand $AND_MASK, $ACC5, $ACC5 771 vpsrlq \$29, $ACC6, $TEMP3 772 vpermq \$0x93, $TEMP1, $TEMP1 773 vpand $AND_MASK, $ACC6, $ACC6 774 vpsrlq \$29, $ACC7, $TEMP4 775 vpermq \$0x93, $TEMP2, $TEMP2 776 vpand $AND_MASK, $ACC7, $ACC7 777 vpsrlq \$29, $ACC8, $TEMP5 778 vpermq \$0x93, $TEMP3, $TEMP3 779 vpand $AND_MASK, $ACC8, $ACC8 780 vpermq \$0x93, $TEMP4, $TEMP4 781 782 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 783 vpermq \$0x93, $TEMP5, $TEMP5 784 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 785 vpaddq $TEMP0, $ACC4, $ACC4 786 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 787 vpaddq $TEMP1, $ACC5, $ACC5 788 vmovdqu $ACC4, 32*4-128($rp) 789 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 790 vpaddq $TEMP2, $ACC6, $ACC6 791 vmovdqu $ACC5, 32*5-128($rp) 792 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 793 vpaddq $TEMP3, $ACC7, $ACC7 794 vmovdqu $ACC6, 32*6-128($rp) 795 vpaddq $TEMP4, $ACC8, $ACC8 796 vmovdqu $ACC7, 32*7-128($rp) 797 vmovdqu $ACC8, 32*8-128($rp) 798 799 mov $rp, $ap 800 dec $rep 801 jne .LOOP_GRANDE_SQR_1024 802 803 vzeroall 804 mov %rbp, %rax 805___ 806$code.=<<___ if ($win64); 807 movaps -0xd8(%rax),%xmm6 808 movaps -0xc8(%rax),%xmm7 809 movaps -0xb8(%rax),%xmm8 810 movaps -0xa8(%rax),%xmm9 811 movaps -0x98(%rax),%xmm10 812 movaps -0x88(%rax),%xmm11 813 movaps -0x78(%rax),%xmm12 814 movaps -0x68(%rax),%xmm13 815 movaps -0x58(%rax),%xmm14 816 movaps -0x48(%rax),%xmm15 817___ 818$code.=<<___; 819 mov -48(%rax),%r15 820 mov -40(%rax),%r14 821 mov -32(%rax),%r13 822 mov -24(%rax),%r12 823 mov -16(%rax),%rbp 824 mov -8(%rax),%rbx 825 lea (%rax),%rsp # restore %rsp 826.Lsqr_1024_epilogue: 827 ret 828.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 829___ 830} 831 832{ # void AMM_WW( 833my $rp="%rdi"; # BN_ULONG *rp, 834my $ap="%rsi"; # const BN_ULONG *ap, 835my $bp="%rdx"; # const BN_ULONG *bp, 836my $np="%rcx"; # const BN_ULONG *np, 837my $n0="%r8d"; # unsigned int n0); 838 839# The registers that hold the accumulated redundant result 840# The AMM works on 1024 bit operands, and redundant word size is 29 841# Therefore: ceil(1024/29)/4 = 9 842my $ACC0="%ymm0"; 843my $ACC1="%ymm1"; 844my $ACC2="%ymm2"; 845my $ACC3="%ymm3"; 846my $ACC4="%ymm4"; 847my $ACC5="%ymm5"; 848my $ACC6="%ymm6"; 849my $ACC7="%ymm7"; 850my $ACC8="%ymm8"; 851my $ACC9="%ymm9"; 852 853# Registers that hold the broadcasted words of multiplier, currently used 854my $Bi="%ymm10"; 855my $Yi="%ymm11"; 856 857# Helper registers 858my $TEMP0=$ACC0; 859my $TEMP1="%ymm12"; 860my $TEMP2="%ymm13"; 861my $ZERO="%ymm14"; 862my $AND_MASK="%ymm15"; 863 864# alu registers that hold the first words of the ACC 865my $r0="%r9"; 866my $r1="%r10"; 867my $r2="%r11"; 868my $r3="%r12"; 869 870my $i="%r14d"; 871my $tmp="%r15"; 872 873$bp="%r13"; # reassigned argument 874 875$code.=<<___; 876.globl rsaz_1024_mul_avx2 877.type rsaz_1024_mul_avx2,\@function,5 878.align 64 879rsaz_1024_mul_avx2: 880 lea (%rsp), %rax 881 push %rbx 882 push %rbp 883 push %r12 884 push %r13 885 push %r14 886 push %r15 887___ 888$code.=<<___ if ($win64); 889 vzeroupper 890 lea -0xa8(%rsp),%rsp 891 vmovaps %xmm6,-0xd8(%rax) 892 vmovaps %xmm7,-0xc8(%rax) 893 vmovaps %xmm8,-0xb8(%rax) 894 vmovaps %xmm9,-0xa8(%rax) 895 vmovaps %xmm10,-0x98(%rax) 896 vmovaps %xmm11,-0x88(%rax) 897 vmovaps %xmm12,-0x78(%rax) 898 vmovaps %xmm13,-0x68(%rax) 899 vmovaps %xmm14,-0x58(%rax) 900 vmovaps %xmm15,-0x48(%rax) 901.Lmul_1024_body: 902___ 
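# The multiplier below works, like the squaring above, on operands held in a
# redundant representation: a 1024-bit value is split into 36 digits of 29
# bits each, one digit per 64-bit lane, i.e. four digits per ymm register and
# nine registers in total, so that vpmuludq products and their running sums
# fit in 64 bits without immediate carry propagation.  A rough Perl model of
# that mapping (purely illustrative, not used by this script, assuming
# Math::BigInt):
#
#	use Math::BigInt;
#	sub to_redundant {
#		my $x = Math::BigInt->new(shift);
#		my @digits;
#		for (1..36) {			# ceil(1024/29) = 36 digits
#			push @digits, $x->copy()->band(0x1fffffff)->numify();
#			$x->brsft(29);
#		}
#		return @digits;
#	}
#
# rsaz_1024_norm2red_avx2 further below emits the assembly that performs this
# conversion, and rsaz_1024_red2norm_avx2 the inverse.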
903$code.=<<___; 904 mov %rax,%rbp 905 vzeroall 906 mov %rdx, $bp # reassigned argument 907 sub \$64,%rsp 908 909 # unaligned 256-bit load that crosses page boundary can 910 # cause severe performance degradation here, so if $ap does 911 # cross page boundary, swap it with $bp [meaning that caller 912 # is advised to lay down $ap and $bp next to each other, so 913 # that only one can cross page boundary]. 914 .byte 0x67,0x67 915 mov $ap, $tmp 916 and \$4095, $tmp 917 add \$32*10, $tmp 918 shr \$12, $tmp 919 mov $ap, $tmp 920 cmovnz $bp, $ap 921 cmovnz $tmp, $bp 922 923 mov $np, $tmp 924 sub \$-128,$ap # size optimization 925 sub \$-128,$np 926 sub \$-128,$rp 927 928 and \$4095, $tmp # see if $np crosses page 929 add \$32*10, $tmp 930 .byte 0x67,0x67 931 shr \$12, $tmp 932 jz .Lmul_1024_no_n_copy 933 934 # unaligned 256-bit load that crosses page boundary can 935 # cause severe performance degradation here, so if $np does 936 # cross page boundary, copy it to stack and make sure stack 937 # frame doesn't... 938 sub \$32*10,%rsp 939 vmovdqu 32*0-128($np), $ACC0 940 and \$-512, %rsp 941 vmovdqu 32*1-128($np), $ACC1 942 vmovdqu 32*2-128($np), $ACC2 943 vmovdqu 32*3-128($np), $ACC3 944 vmovdqu 32*4-128($np), $ACC4 945 vmovdqu 32*5-128($np), $ACC5 946 vmovdqu 32*6-128($np), $ACC6 947 vmovdqu 32*7-128($np), $ACC7 948 vmovdqu 32*8-128($np), $ACC8 949 lea 64+128(%rsp),$np 950 vmovdqu $ACC0, 32*0-128($np) 951 vpxor $ACC0, $ACC0, $ACC0 952 vmovdqu $ACC1, 32*1-128($np) 953 vpxor $ACC1, $ACC1, $ACC1 954 vmovdqu $ACC2, 32*2-128($np) 955 vpxor $ACC2, $ACC2, $ACC2 956 vmovdqu $ACC3, 32*3-128($np) 957 vpxor $ACC3, $ACC3, $ACC3 958 vmovdqu $ACC4, 32*4-128($np) 959 vpxor $ACC4, $ACC4, $ACC4 960 vmovdqu $ACC5, 32*5-128($np) 961 vpxor $ACC5, $ACC5, $ACC5 962 vmovdqu $ACC6, 32*6-128($np) 963 vpxor $ACC6, $ACC6, $ACC6 964 vmovdqu $ACC7, 32*7-128($np) 965 vpxor $ACC7, $ACC7, $ACC7 966 vmovdqu $ACC8, 32*8-128($np) 967 vmovdqa $ACC0, $ACC8 968 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall 969.Lmul_1024_no_n_copy: 970 and \$-64,%rsp 971 972 mov ($bp), %rbx 973 vpbroadcastq ($bp), $Bi 974 vmovdqu $ACC0, (%rsp) # clear top of stack 975 xor $r0, $r0 976 .byte 0x67 977 xor $r1, $r1 978 xor $r2, $r2 979 xor $r3, $r3 980 981 vmovdqu .Land_mask(%rip), $AND_MASK 982 mov \$9, $i 983 vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall 984 jmp .Loop_mul_1024 985 986.align 32 987.Loop_mul_1024: 988 vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*) 989 mov %rbx, %rax 990 imulq -128($ap), %rax 991 add $r0, %rax 992 mov %rbx, $r1 993 imulq 8-128($ap), $r1 994 add 8(%rsp), $r1 995 996 mov %rax, $r0 997 imull $n0, %eax 998 and \$0x1fffffff, %eax 999 1000 mov %rbx, $r2 1001 imulq 16-128($ap), $r2 1002 add 16(%rsp), $r2 1003 1004 mov %rbx, $r3 1005 imulq 24-128($ap), $r3 1006 add 24(%rsp), $r3 1007 vpmuludq 32*1-128($ap),$Bi,$TEMP0 1008 vmovd %eax, $Yi 1009 vpaddq $TEMP0,$ACC1,$ACC1 1010 vpmuludq 32*2-128($ap),$Bi,$TEMP1 1011 vpbroadcastq $Yi, $Yi 1012 vpaddq $TEMP1,$ACC2,$ACC2 1013 vpmuludq 32*3-128($ap),$Bi,$TEMP2 1014 vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3 1015 vpaddq $TEMP2,$ACC3,$ACC3 1016 vpmuludq 32*4-128($ap),$Bi,$TEMP0 1017 vpaddq $TEMP0,$ACC4,$ACC4 1018 vpmuludq 32*5-128($ap),$Bi,$TEMP1 1019 vpaddq $TEMP1,$ACC5,$ACC5 1020 vpmuludq 32*6-128($ap),$Bi,$TEMP2 1021 vpaddq $TEMP2,$ACC6,$ACC6 1022 vpmuludq 32*7-128($ap),$Bi,$TEMP0 1023 vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3 1024 vpaddq $TEMP0,$ACC7,$ACC7 1025 vpmuludq 32*8-128($ap),$Bi,$TEMP1 1026 vpbroadcastq 8($bp), $Bi 1027 vpaddq $TEMP1,$ACC8,$ACC8 
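	# In outline: each of the nine passes of .Loop_mul_1024 consumes four
	# words of bp.  For every word b, b*ap is accumulated into the ymm
	# accumulators with vpmuludq, then y = (low word * n0) mod 2^29 is
	# derived and y*np is added so that the lowest digit becomes divisible
	# by 2^29 (the Montgomery step).  The scalar imul/add code mirrors the
	# four least significant words exactly in r9-r12, since full precision
	# is needed there to compute each y.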
1028 1029 mov %rax,%rdx 1030 imulq -128($np),%rax 1031 add %rax,$r0 1032 mov %rdx,%rax 1033 imulq 8-128($np),%rax 1034 add %rax,$r1 1035 mov %rdx,%rax 1036 imulq 16-128($np),%rax 1037 add %rax,$r2 1038 shr \$29, $r0 1039 imulq 24-128($np),%rdx 1040 add %rdx,$r3 1041 add $r0, $r1 1042 1043 vpmuludq 32*1-128($np),$Yi,$TEMP2 1044 vmovq $Bi, %rbx 1045 vpaddq $TEMP2,$ACC1,$ACC1 1046 vpmuludq 32*2-128($np),$Yi,$TEMP0 1047 vpaddq $TEMP0,$ACC2,$ACC2 1048 vpmuludq 32*3-128($np),$Yi,$TEMP1 1049 vpaddq $TEMP1,$ACC3,$ACC3 1050 vpmuludq 32*4-128($np),$Yi,$TEMP2 1051 vpaddq $TEMP2,$ACC4,$ACC4 1052 vpmuludq 32*5-128($np),$Yi,$TEMP0 1053 vpaddq $TEMP0,$ACC5,$ACC5 1054 vpmuludq 32*6-128($np),$Yi,$TEMP1 1055 vpaddq $TEMP1,$ACC6,$ACC6 1056 vpmuludq 32*7-128($np),$Yi,$TEMP2 1057 vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3 1058 vpaddq $TEMP2,$ACC7,$ACC7 1059 vpmuludq 32*8-128($np),$Yi,$TEMP0 1060 vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3 1061 vpaddq $TEMP0,$ACC8,$ACC8 1062 1063 mov %rbx, %rax 1064 imulq -128($ap),%rax 1065 add %rax,$r1 1066 vmovdqu -8+32*1-128($ap),$TEMP1 1067 mov %rbx, %rax 1068 imulq 8-128($ap),%rax 1069 add %rax,$r2 1070 vmovdqu -8+32*2-128($ap),$TEMP2 1071 1072 mov $r1, %rax 1073 imull $n0, %eax 1074 and \$0x1fffffff, %eax 1075 1076 imulq 16-128($ap),%rbx 1077 add %rbx,$r3 1078 vpmuludq $Bi,$TEMP1,$TEMP1 1079 vmovd %eax, $Yi 1080 vmovdqu -8+32*3-128($ap),$TEMP0 1081 vpaddq $TEMP1,$ACC1,$ACC1 1082 vpmuludq $Bi,$TEMP2,$TEMP2 1083 vpbroadcastq $Yi, $Yi 1084 vmovdqu -8+32*4-128($ap),$TEMP1 1085 vpaddq $TEMP2,$ACC2,$ACC2 1086 vpmuludq $Bi,$TEMP0,$TEMP0 1087 vmovdqu -8+32*5-128($ap),$TEMP2 1088 vpaddq $TEMP0,$ACC3,$ACC3 1089 vpmuludq $Bi,$TEMP1,$TEMP1 1090 vmovdqu -8+32*6-128($ap),$TEMP0 1091 vpaddq $TEMP1,$ACC4,$ACC4 1092 vpmuludq $Bi,$TEMP2,$TEMP2 1093 vmovdqu -8+32*7-128($ap),$TEMP1 1094 vpaddq $TEMP2,$ACC5,$ACC5 1095 vpmuludq $Bi,$TEMP0,$TEMP0 1096 vmovdqu -8+32*8-128($ap),$TEMP2 1097 vpaddq $TEMP0,$ACC6,$ACC6 1098 vpmuludq $Bi,$TEMP1,$TEMP1 1099 vmovdqu -8+32*9-128($ap),$ACC9 1100 vpaddq $TEMP1,$ACC7,$ACC7 1101 vpmuludq $Bi,$TEMP2,$TEMP2 1102 vpaddq $TEMP2,$ACC8,$ACC8 1103 vpmuludq $Bi,$ACC9,$ACC9 1104 vpbroadcastq 16($bp), $Bi 1105 1106 mov %rax,%rdx 1107 imulq -128($np),%rax 1108 add %rax,$r1 1109 vmovdqu -8+32*1-128($np),$TEMP0 1110 mov %rdx,%rax 1111 imulq 8-128($np),%rax 1112 add %rax,$r2 1113 vmovdqu -8+32*2-128($np),$TEMP1 1114 shr \$29, $r1 1115 imulq 16-128($np),%rdx 1116 add %rdx,$r3 1117 add $r1, $r2 1118 1119 vpmuludq $Yi,$TEMP0,$TEMP0 1120 vmovq $Bi, %rbx 1121 vmovdqu -8+32*3-128($np),$TEMP2 1122 vpaddq $TEMP0,$ACC1,$ACC1 1123 vpmuludq $Yi,$TEMP1,$TEMP1 1124 vmovdqu -8+32*4-128($np),$TEMP0 1125 vpaddq $TEMP1,$ACC2,$ACC2 1126 vpmuludq $Yi,$TEMP2,$TEMP2 1127 vmovdqu -8+32*5-128($np),$TEMP1 1128 vpaddq $TEMP2,$ACC3,$ACC3 1129 vpmuludq $Yi,$TEMP0,$TEMP0 1130 vmovdqu -8+32*6-128($np),$TEMP2 1131 vpaddq $TEMP0,$ACC4,$ACC4 1132 vpmuludq $Yi,$TEMP1,$TEMP1 1133 vmovdqu -8+32*7-128($np),$TEMP0 1134 vpaddq $TEMP1,$ACC5,$ACC5 1135 vpmuludq $Yi,$TEMP2,$TEMP2 1136 vmovdqu -8+32*8-128($np),$TEMP1 1137 vpaddq $TEMP2,$ACC6,$ACC6 1138 vpmuludq $Yi,$TEMP0,$TEMP0 1139 vmovdqu -8+32*9-128($np),$TEMP2 1140 vpaddq $TEMP0,$ACC7,$ACC7 1141 vpmuludq $Yi,$TEMP1,$TEMP1 1142 vpaddq $TEMP1,$ACC8,$ACC8 1143 vpmuludq $Yi,$TEMP2,$TEMP2 1144 vpaddq $TEMP2,$ACC9,$ACC9 1145 1146 vmovdqu -16+32*1-128($ap),$TEMP0 1147 mov %rbx,%rax 1148 imulq -128($ap),%rax 1149 add $r2,%rax 1150 1151 vmovdqu -16+32*2-128($ap),$TEMP1 1152 mov %rax,$r2 1153 imull $n0, %eax 1154 and \$0x1fffffff, %eax 1155 1156 imulq 
8-128($ap),%rbx 1157 add %rbx,$r3 1158 vpmuludq $Bi,$TEMP0,$TEMP0 1159 vmovd %eax, $Yi 1160 vmovdqu -16+32*3-128($ap),$TEMP2 1161 vpaddq $TEMP0,$ACC1,$ACC1 1162 vpmuludq $Bi,$TEMP1,$TEMP1 1163 vpbroadcastq $Yi, $Yi 1164 vmovdqu -16+32*4-128($ap),$TEMP0 1165 vpaddq $TEMP1,$ACC2,$ACC2 1166 vpmuludq $Bi,$TEMP2,$TEMP2 1167 vmovdqu -16+32*5-128($ap),$TEMP1 1168 vpaddq $TEMP2,$ACC3,$ACC3 1169 vpmuludq $Bi,$TEMP0,$TEMP0 1170 vmovdqu -16+32*6-128($ap),$TEMP2 1171 vpaddq $TEMP0,$ACC4,$ACC4 1172 vpmuludq $Bi,$TEMP1,$TEMP1 1173 vmovdqu -16+32*7-128($ap),$TEMP0 1174 vpaddq $TEMP1,$ACC5,$ACC5 1175 vpmuludq $Bi,$TEMP2,$TEMP2 1176 vmovdqu -16+32*8-128($ap),$TEMP1 1177 vpaddq $TEMP2,$ACC6,$ACC6 1178 vpmuludq $Bi,$TEMP0,$TEMP0 1179 vmovdqu -16+32*9-128($ap),$TEMP2 1180 vpaddq $TEMP0,$ACC7,$ACC7 1181 vpmuludq $Bi,$TEMP1,$TEMP1 1182 vpaddq $TEMP1,$ACC8,$ACC8 1183 vpmuludq $Bi,$TEMP2,$TEMP2 1184 vpbroadcastq 24($bp), $Bi 1185 vpaddq $TEMP2,$ACC9,$ACC9 1186 1187 vmovdqu -16+32*1-128($np),$TEMP0 1188 mov %rax,%rdx 1189 imulq -128($np),%rax 1190 add %rax,$r2 1191 vmovdqu -16+32*2-128($np),$TEMP1 1192 imulq 8-128($np),%rdx 1193 add %rdx,$r3 1194 shr \$29, $r2 1195 1196 vpmuludq $Yi,$TEMP0,$TEMP0 1197 vmovq $Bi, %rbx 1198 vmovdqu -16+32*3-128($np),$TEMP2 1199 vpaddq $TEMP0,$ACC1,$ACC1 1200 vpmuludq $Yi,$TEMP1,$TEMP1 1201 vmovdqu -16+32*4-128($np),$TEMP0 1202 vpaddq $TEMP1,$ACC2,$ACC2 1203 vpmuludq $Yi,$TEMP2,$TEMP2 1204 vmovdqu -16+32*5-128($np),$TEMP1 1205 vpaddq $TEMP2,$ACC3,$ACC3 1206 vpmuludq $Yi,$TEMP0,$TEMP0 1207 vmovdqu -16+32*6-128($np),$TEMP2 1208 vpaddq $TEMP0,$ACC4,$ACC4 1209 vpmuludq $Yi,$TEMP1,$TEMP1 1210 vmovdqu -16+32*7-128($np),$TEMP0 1211 vpaddq $TEMP1,$ACC5,$ACC5 1212 vpmuludq $Yi,$TEMP2,$TEMP2 1213 vmovdqu -16+32*8-128($np),$TEMP1 1214 vpaddq $TEMP2,$ACC6,$ACC6 1215 vpmuludq $Yi,$TEMP0,$TEMP0 1216 vmovdqu -16+32*9-128($np),$TEMP2 1217 vpaddq $TEMP0,$ACC7,$ACC7 1218 vpmuludq $Yi,$TEMP1,$TEMP1 1219 vmovdqu -24+32*1-128($ap),$TEMP0 1220 vpaddq $TEMP1,$ACC8,$ACC8 1221 vpmuludq $Yi,$TEMP2,$TEMP2 1222 vmovdqu -24+32*2-128($ap),$TEMP1 1223 vpaddq $TEMP2,$ACC9,$ACC9 1224 1225 add $r2, $r3 1226 imulq -128($ap),%rbx 1227 add %rbx,$r3 1228 1229 mov $r3, %rax 1230 imull $n0, %eax 1231 and \$0x1fffffff, %eax 1232 1233 vpmuludq $Bi,$TEMP0,$TEMP0 1234 vmovd %eax, $Yi 1235 vmovdqu -24+32*3-128($ap),$TEMP2 1236 vpaddq $TEMP0,$ACC1,$ACC1 1237 vpmuludq $Bi,$TEMP1,$TEMP1 1238 vpbroadcastq $Yi, $Yi 1239 vmovdqu -24+32*4-128($ap),$TEMP0 1240 vpaddq $TEMP1,$ACC2,$ACC2 1241 vpmuludq $Bi,$TEMP2,$TEMP2 1242 vmovdqu -24+32*5-128($ap),$TEMP1 1243 vpaddq $TEMP2,$ACC3,$ACC3 1244 vpmuludq $Bi,$TEMP0,$TEMP0 1245 vmovdqu -24+32*6-128($ap),$TEMP2 1246 vpaddq $TEMP0,$ACC4,$ACC4 1247 vpmuludq $Bi,$TEMP1,$TEMP1 1248 vmovdqu -24+32*7-128($ap),$TEMP0 1249 vpaddq $TEMP1,$ACC5,$ACC5 1250 vpmuludq $Bi,$TEMP2,$TEMP2 1251 vmovdqu -24+32*8-128($ap),$TEMP1 1252 vpaddq $TEMP2,$ACC6,$ACC6 1253 vpmuludq $Bi,$TEMP0,$TEMP0 1254 vmovdqu -24+32*9-128($ap),$TEMP2 1255 vpaddq $TEMP0,$ACC7,$ACC7 1256 vpmuludq $Bi,$TEMP1,$TEMP1 1257 vpaddq $TEMP1,$ACC8,$ACC8 1258 vpmuludq $Bi,$TEMP2,$TEMP2 1259 vpbroadcastq 32($bp), $Bi 1260 vpaddq $TEMP2,$ACC9,$ACC9 1261 add \$32, $bp # $bp++ 1262 1263 vmovdqu -24+32*1-128($np),$TEMP0 1264 imulq -128($np),%rax 1265 add %rax,$r3 1266 shr \$29, $r3 1267 1268 vmovdqu -24+32*2-128($np),$TEMP1 1269 vpmuludq $Yi,$TEMP0,$TEMP0 1270 vmovq $Bi, %rbx 1271 vmovdqu -24+32*3-128($np),$TEMP2 1272 vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0 1273 vpmuludq $Yi,$TEMP1,$TEMP1 1274 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 1275 vpaddq 
$TEMP1,$ACC2,$ACC1 1276 vmovdqu -24+32*4-128($np),$TEMP0 1277 vpmuludq $Yi,$TEMP2,$TEMP2 1278 vmovdqu -24+32*5-128($np),$TEMP1 1279 vpaddq $TEMP2,$ACC3,$ACC2 1280 vpmuludq $Yi,$TEMP0,$TEMP0 1281 vmovdqu -24+32*6-128($np),$TEMP2 1282 vpaddq $TEMP0,$ACC4,$ACC3 1283 vpmuludq $Yi,$TEMP1,$TEMP1 1284 vmovdqu -24+32*7-128($np),$TEMP0 1285 vpaddq $TEMP1,$ACC5,$ACC4 1286 vpmuludq $Yi,$TEMP2,$TEMP2 1287 vmovdqu -24+32*8-128($np),$TEMP1 1288 vpaddq $TEMP2,$ACC6,$ACC5 1289 vpmuludq $Yi,$TEMP0,$TEMP0 1290 vmovdqu -24+32*9-128($np),$TEMP2 1291 mov $r3, $r0 1292 vpaddq $TEMP0,$ACC7,$ACC6 1293 vpmuludq $Yi,$TEMP1,$TEMP1 1294 add (%rsp), $r0 1295 vpaddq $TEMP1,$ACC8,$ACC7 1296 vpmuludq $Yi,$TEMP2,$TEMP2 1297 vmovq $r3, $TEMP1 1298 vpaddq $TEMP2,$ACC9,$ACC8 1299 1300 dec $i 1301 jnz .Loop_mul_1024 1302___ 1303 1304# (*) Original implementation was correcting ACC1-ACC3 for overflow 1305# after 7 loop runs, or after 28 iterations, or 56 additions. 1306# But as we underutilize resources, it's possible to correct in 1307# each iteration with marginal performance loss. But then, as 1308# we do it in each iteration, we can correct less digits, and 1309# avoid performance penalties completely. Also note that we 1310# correct only three digits out of four. This works because 1311# most significant digit is subjected to less additions. 1312 1313$TEMP0 = $ACC9; 1314$TEMP3 = $Bi; 1315$TEMP4 = $Yi; 1316$code.=<<___; 1317 vpermq \$0, $AND_MASK, $AND_MASK 1318 vpaddq (%rsp), $TEMP1, $ACC0 1319 1320 vpsrlq \$29, $ACC0, $TEMP1 1321 vpand $AND_MASK, $ACC0, $ACC0 1322 vpsrlq \$29, $ACC1, $TEMP2 1323 vpand $AND_MASK, $ACC1, $ACC1 1324 vpsrlq \$29, $ACC2, $TEMP3 1325 vpermq \$0x93, $TEMP1, $TEMP1 1326 vpand $AND_MASK, $ACC2, $ACC2 1327 vpsrlq \$29, $ACC3, $TEMP4 1328 vpermq \$0x93, $TEMP2, $TEMP2 1329 vpand $AND_MASK, $ACC3, $ACC3 1330 1331 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1332 vpermq \$0x93, $TEMP3, $TEMP3 1333 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1334 vpermq \$0x93, $TEMP4, $TEMP4 1335 vpaddq $TEMP0, $ACC0, $ACC0 1336 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1337 vpaddq $TEMP1, $ACC1, $ACC1 1338 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1339 vpaddq $TEMP2, $ACC2, $ACC2 1340 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 1341 vpaddq $TEMP3, $ACC3, $ACC3 1342 vpaddq $TEMP4, $ACC4, $ACC4 1343 1344 vpsrlq \$29, $ACC0, $TEMP1 1345 vpand $AND_MASK, $ACC0, $ACC0 1346 vpsrlq \$29, $ACC1, $TEMP2 1347 vpand $AND_MASK, $ACC1, $ACC1 1348 vpsrlq \$29, $ACC2, $TEMP3 1349 vpermq \$0x93, $TEMP1, $TEMP1 1350 vpand $AND_MASK, $ACC2, $ACC2 1351 vpsrlq \$29, $ACC3, $TEMP4 1352 vpermq \$0x93, $TEMP2, $TEMP2 1353 vpand $AND_MASK, $ACC3, $ACC3 1354 vpermq \$0x93, $TEMP3, $TEMP3 1355 1356 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1357 vpermq \$0x93, $TEMP4, $TEMP4 1358 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1359 vpaddq $TEMP0, $ACC0, $ACC0 1360 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1361 vpaddq $TEMP1, $ACC1, $ACC1 1362 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1363 vpaddq $TEMP2, $ACC2, $ACC2 1364 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 1365 vpaddq $TEMP3, $ACC3, $ACC3 1366 vpaddq $TEMP4, $ACC4, $ACC4 1367 1368 vmovdqu $ACC0, 0-128($rp) 1369 vmovdqu $ACC1, 32-128($rp) 1370 vmovdqu $ACC2, 64-128($rp) 1371 vmovdqu $ACC3, 96-128($rp) 1372___ 1373 1374$TEMP5=$ACC0; 1375$code.=<<___; 1376 vpsrlq \$29, $ACC4, $TEMP1 1377 vpand $AND_MASK, $ACC4, $ACC4 1378 vpsrlq \$29, $ACC5, $TEMP2 1379 vpand $AND_MASK, $ACC5, $ACC5 1380 vpsrlq \$29, $ACC6, $TEMP3 1381 vpermq \$0x93, $TEMP1, $TEMP1 1382 vpand $AND_MASK, $ACC6, $ACC6 1383 vpsrlq \$29, $ACC7, $TEMP4 1384 vpermq \$0x93, $TEMP2, $TEMP2 1385 
vpand $AND_MASK, $ACC7, $ACC7 1386 vpsrlq \$29, $ACC8, $TEMP5 1387 vpermq \$0x93, $TEMP3, $TEMP3 1388 vpand $AND_MASK, $ACC8, $ACC8 1389 vpermq \$0x93, $TEMP4, $TEMP4 1390 1391 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1392 vpermq \$0x93, $TEMP5, $TEMP5 1393 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1394 vpaddq $TEMP0, $ACC4, $ACC4 1395 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1396 vpaddq $TEMP1, $ACC5, $ACC5 1397 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1398 vpaddq $TEMP2, $ACC6, $ACC6 1399 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 1400 vpaddq $TEMP3, $ACC7, $ACC7 1401 vpaddq $TEMP4, $ACC8, $ACC8 1402 1403 vpsrlq \$29, $ACC4, $TEMP1 1404 vpand $AND_MASK, $ACC4, $ACC4 1405 vpsrlq \$29, $ACC5, $TEMP2 1406 vpand $AND_MASK, $ACC5, $ACC5 1407 vpsrlq \$29, $ACC6, $TEMP3 1408 vpermq \$0x93, $TEMP1, $TEMP1 1409 vpand $AND_MASK, $ACC6, $ACC6 1410 vpsrlq \$29, $ACC7, $TEMP4 1411 vpermq \$0x93, $TEMP2, $TEMP2 1412 vpand $AND_MASK, $ACC7, $ACC7 1413 vpsrlq \$29, $ACC8, $TEMP5 1414 vpermq \$0x93, $TEMP3, $TEMP3 1415 vpand $AND_MASK, $ACC8, $ACC8 1416 vpermq \$0x93, $TEMP4, $TEMP4 1417 1418 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1419 vpermq \$0x93, $TEMP5, $TEMP5 1420 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1421 vpaddq $TEMP0, $ACC4, $ACC4 1422 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1423 vpaddq $TEMP1, $ACC5, $ACC5 1424 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1425 vpaddq $TEMP2, $ACC6, $ACC6 1426 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 1427 vpaddq $TEMP3, $ACC7, $ACC7 1428 vpaddq $TEMP4, $ACC8, $ACC8 1429 1430 vmovdqu $ACC4, 128-128($rp) 1431 vmovdqu $ACC5, 160-128($rp) 1432 vmovdqu $ACC6, 192-128($rp) 1433 vmovdqu $ACC7, 224-128($rp) 1434 vmovdqu $ACC8, 256-128($rp) 1435 vzeroupper 1436 1437 mov %rbp, %rax 1438___ 1439$code.=<<___ if ($win64); 1440 movaps -0xd8(%rax),%xmm6 1441 movaps -0xc8(%rax),%xmm7 1442 movaps -0xb8(%rax),%xmm8 1443 movaps -0xa8(%rax),%xmm9 1444 movaps -0x98(%rax),%xmm10 1445 movaps -0x88(%rax),%xmm11 1446 movaps -0x78(%rax),%xmm12 1447 movaps -0x68(%rax),%xmm13 1448 movaps -0x58(%rax),%xmm14 1449 movaps -0x48(%rax),%xmm15 1450___ 1451$code.=<<___; 1452 mov -48(%rax),%r15 1453 mov -40(%rax),%r14 1454 mov -32(%rax),%r13 1455 mov -24(%rax),%r12 1456 mov -16(%rax),%rbp 1457 mov -8(%rax),%rbx 1458 lea (%rax),%rsp # restore %rsp 1459.Lmul_1024_epilogue: 1460 ret 1461.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 1462___ 1463} 1464{ 1465my ($out,$inp) = $win64 ? 
("%rcx","%rdx") : ("%rdi","%rsi"); 1466my @T = map("%r$_",(8..11)); 1467 1468$code.=<<___; 1469.globl rsaz_1024_red2norm_avx2 1470.type rsaz_1024_red2norm_avx2,\@abi-omnipotent 1471.align 32 1472rsaz_1024_red2norm_avx2: 1473 sub \$-128,$inp # size optimization 1474 xor %rax,%rax 1475___ 1476 1477for ($j=0,$i=0; $i<16; $i++) { 1478 my $k=0; 1479 while (29*$j<64*($i+1)) { # load data till boundary 1480 $code.=" mov `8*$j-128`($inp), @T[0]\n"; 1481 $j++; $k++; push(@T,shift(@T)); 1482 } 1483 $l=$k; 1484 while ($k>1) { # shift loaded data but last value 1485 $code.=" shl \$`29*($j-$k)`,@T[-$k]\n"; 1486 $k--; 1487 } 1488 $code.=<<___; # shift last value 1489 mov @T[-1], @T[0] 1490 shl \$`29*($j-1)`, @T[-1] 1491 shr \$`-29*($j-1)`, @T[0] 1492___ 1493 while ($l) { # accumulate all values 1494 $code.=" add @T[-$l], %rax\n"; 1495 $l--; 1496 } 1497 $code.=<<___; 1498 adc \$0, @T[0] # consume eventual carry 1499 mov %rax, 8*$i($out) 1500 mov @T[0], %rax 1501___ 1502 push(@T,shift(@T)); 1503} 1504$code.=<<___; 1505 ret 1506.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 1507 1508.globl rsaz_1024_norm2red_avx2 1509.type rsaz_1024_norm2red_avx2,\@abi-omnipotent 1510.align 32 1511rsaz_1024_norm2red_avx2: 1512 sub \$-128,$out # size optimization 1513 mov ($inp),@T[0] 1514 mov \$0x1fffffff,%eax 1515___ 1516for ($j=0,$i=0; $i<16; $i++) { 1517 $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15); 1518 $code.=" xor @T[1],@T[1]\n" if ($i==15); 1519 my $k=1; 1520 while (29*($j+1)<64*($i+1)) { 1521 $code.=<<___; 1522 mov @T[0],@T[-$k] 1523 shr \$`29*$j`,@T[-$k] 1524 and %rax,@T[-$k] # &0x1fffffff 1525 mov @T[-$k],`8*$j-128`($out) 1526___ 1527 $j++; $k++; 1528 } 1529 $code.=<<___; 1530 shrd \$`29*$j`,@T[1],@T[0] 1531 and %rax,@T[0] 1532 mov @T[0],`8*$j-128`($out) 1533___ 1534 $j++; 1535 push(@T,shift(@T)); 1536} 1537$code.=<<___; 1538 mov @T[0],`8*$j-128`($out) # zero 1539 mov @T[0],`8*($j+1)-128`($out) 1540 mov @T[0],`8*($j+2)-128`($out) 1541 mov @T[0],`8*($j+3)-128`($out) 1542 ret 1543.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 1544___ 1545} 1546{ 1547my ($out,$inp,$power) = $win64 ? 
("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); 1548 1549$code.=<<___; 1550.globl rsaz_1024_scatter5_avx2 1551.type rsaz_1024_scatter5_avx2,\@abi-omnipotent 1552.align 32 1553rsaz_1024_scatter5_avx2: 1554 vzeroupper 1555 vmovdqu .Lscatter_permd(%rip),%ymm5 1556 shl \$4,$power 1557 lea ($out,$power),$out 1558 mov \$9,%eax 1559 jmp .Loop_scatter_1024 1560 1561.align 32 1562.Loop_scatter_1024: 1563 vmovdqu ($inp),%ymm0 1564 lea 32($inp),$inp 1565 vpermd %ymm0,%ymm5,%ymm0 1566 vmovdqu %xmm0,($out) 1567 lea 16*32($out),$out 1568 dec %eax 1569 jnz .Loop_scatter_1024 1570 1571 vzeroupper 1572 ret 1573.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 1574 1575.globl rsaz_1024_gather5_avx2 1576.type rsaz_1024_gather5_avx2,\@abi-omnipotent 1577.align 32 1578rsaz_1024_gather5_avx2: 1579___ 1580$code.=<<___ if ($win64); 1581 lea -0x88(%rsp),%rax 1582 vzeroupper 1583.LSEH_begin_rsaz_1024_gather5: 1584 # I can't trust assembler to use specific encoding:-( 1585 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp 1586 .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax) 1587 .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax) 1588 .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax) 1589 .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax) 1590 .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax) 1591 .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax) 1592 .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax) 1593 .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax) 1594 .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax) 1595 .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax) 1596___ 1597$code.=<<___; 1598 lea .Lgather_table(%rip),%r11 1599 mov $power,%eax 1600 and \$3,$power 1601 shr \$2,%eax # cache line number 1602 shl \$4,$power # offset within cache line 1603 1604 vmovdqu -32(%r11),%ymm7 # .Lgather_permd 1605 vpbroadcastb 8(%r11,%rax), %xmm8 1606 vpbroadcastb 7(%r11,%rax), %xmm9 1607 vpbroadcastb 6(%r11,%rax), %xmm10 1608 vpbroadcastb 5(%r11,%rax), %xmm11 1609 vpbroadcastb 4(%r11,%rax), %xmm12 1610 vpbroadcastb 3(%r11,%rax), %xmm13 1611 vpbroadcastb 2(%r11,%rax), %xmm14 1612 vpbroadcastb 1(%r11,%rax), %xmm15 1613 1614 lea 64($inp,$power),$inp 1615 mov \$64,%r11 # size optimization 1616 mov \$9,%eax 1617 jmp .Loop_gather_1024 1618 1619.align 32 1620.Loop_gather_1024: 1621 vpand -64($inp), %xmm8,%xmm0 1622 vpand ($inp), %xmm9,%xmm1 1623 vpand 64($inp), %xmm10,%xmm2 1624 vpand ($inp,%r11,2), %xmm11,%xmm3 1625 vpor %xmm0,%xmm1,%xmm1 1626 vpand 64($inp,%r11,2), %xmm12,%xmm4 1627 vpor %xmm2,%xmm3,%xmm3 1628 vpand ($inp,%r11,4), %xmm13,%xmm5 1629 vpor %xmm1,%xmm3,%xmm3 1630 vpand 64($inp,%r11,4), %xmm14,%xmm6 1631 vpor %xmm4,%xmm5,%xmm5 1632 vpand -128($inp,%r11,8), %xmm15,%xmm2 1633 lea ($inp,%r11,8),$inp 1634 vpor %xmm3,%xmm5,%xmm5 1635 vpor %xmm2,%xmm6,%xmm6 1636 vpor %xmm5,%xmm6,%xmm6 1637 vpermd %ymm6,%ymm7,%ymm6 1638 vmovdqu %ymm6,($out) 1639 lea 32($out),$out 1640 dec %eax 1641 jnz .Loop_gather_1024 1642 1643 vpxor %ymm0,%ymm0,%ymm0 1644 vmovdqu %ymm0,($out) 1645 vzeroupper 1646___ 1647$code.=<<___ if ($win64); 1648 movaps (%rsp),%xmm6 1649 movaps 0x10(%rsp),%xmm7 1650 movaps 0x20(%rsp),%xmm8 1651 movaps 0x30(%rsp),%xmm9 1652 movaps 0x40(%rsp),%xmm10 1653 movaps 0x50(%rsp),%xmm11 1654 movaps 0x60(%rsp),%xmm12 1655 movaps 0x70(%rsp),%xmm13 1656 movaps 0x80(%rsp),%xmm14 1657 movaps 0x90(%rsp),%xmm15 1658 lea 0xa8(%rsp),%rsp 1659.LSEH_end_rsaz_1024_gather5: 1660___ 1661$code.=<<___; 1662 ret 1663.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 
1664___ 1665} 1666 1667$code.=<<___; 1668.extern OPENSSL_ia32cap_P 1669.globl rsaz_avx2_eligible 1670.type rsaz_avx2_eligible,\@abi-omnipotent 1671.align 32 1672rsaz_avx2_eligible: 1673 mov OPENSSL_ia32cap_P+8(%rip),%eax 1674___ 1675$code.=<<___ if ($addx); 1676 mov \$`1<<8|1<<19`,%ecx 1677 mov \$0,%edx 1678 and %eax,%ecx 1679 cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X 1680 cmove %edx,%eax 1681___ 1682$code.=<<___; 1683 and \$`1<<5`,%eax 1684 shr \$5,%eax 1685 ret 1686.size rsaz_avx2_eligible,.-rsaz_avx2_eligible 1687 1688.align 64 1689.Land_mask: 1690 .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1 1691.Lscatter_permd: 1692 .long 0,2,4,6,7,7,7,7 1693.Lgather_permd: 1694 .long 0,7,1,7,2,7,3,7 1695.Lgather_table: 1696 .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0 1697.align 64 1698___ 1699 1700if ($win64) { 1701$rec="%rcx"; 1702$frame="%rdx"; 1703$context="%r8"; 1704$disp="%r9"; 1705 1706$code.=<<___ 1707.extern __imp_RtlVirtualUnwind 1708.type rsaz_se_handler,\@abi-omnipotent 1709.align 16 1710rsaz_se_handler: 1711 push %rsi 1712 push %rdi 1713 push %rbx 1714 push %rbp 1715 push %r12 1716 push %r13 1717 push %r14 1718 push %r15 1719 pushfq 1720 sub \$64,%rsp 1721 1722 mov 120($context),%rax # pull context->Rax 1723 mov 248($context),%rbx # pull context->Rip 1724 1725 mov 8($disp),%rsi # disp->ImageBase 1726 mov 56($disp),%r11 # disp->HandlerData 1727 1728 mov 0(%r11),%r10d # HandlerData[0] 1729 lea (%rsi,%r10),%r10 # prologue label 1730 cmp %r10,%rbx # context->Rip<prologue label 1731 jb .Lcommon_seh_tail 1732 1733 mov 152($context),%rax # pull context->Rsp 1734 1735 mov 4(%r11),%r10d # HandlerData[1] 1736 lea (%rsi,%r10),%r10 # epilogue label 1737 cmp %r10,%rbx # context->Rip>=epilogue label 1738 jae .Lcommon_seh_tail 1739 1740 mov 160($context),%rax # pull context->Rbp 1741 1742 mov -48(%rax),%r15 1743 mov -40(%rax),%r14 1744 mov -32(%rax),%r13 1745 mov -24(%rax),%r12 1746 mov -16(%rax),%rbp 1747 mov -8(%rax),%rbx 1748 mov %r15,240($context) 1749 mov %r14,232($context) 1750 mov %r13,224($context) 1751 mov %r12,216($context) 1752 mov %rbp,160($context) 1753 mov %rbx,144($context) 1754 1755 lea -0xd8(%rax),%rsi # %xmm save area 1756 lea 512($context),%rdi # & context.Xmm6 1757 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 1758 .long 0xa548f3fc # cld; rep movsq 1759 1760.Lcommon_seh_tail: 1761 mov 8(%rax),%rdi 1762 mov 16(%rax),%rsi 1763 mov %rax,152($context) # restore context->Rsp 1764 mov %rsi,168($context) # restore context->Rsi 1765 mov %rdi,176($context) # restore context->Rdi 1766 1767 mov 40($disp),%rdi # disp->ContextRecord 1768 mov $context,%rsi # context 1769 mov \$154,%ecx # sizeof(CONTEXT) 1770 .long 0xa548f3fc # cld; rep movsq 1771 1772 mov $disp,%rsi 1773 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1774 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1775 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1776 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1777 mov 40(%rsi),%r10 # disp->ContextRecord 1778 lea 56(%rsi),%r11 # &disp->HandlerData 1779 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1780 mov %r10,32(%rsp) # arg5 1781 mov %r11,40(%rsp) # arg6 1782 mov %r12,48(%rsp) # arg7 1783 mov %rcx,56(%rsp) # arg8, (NULL) 1784 call *__imp_RtlVirtualUnwind(%rip) 1785 1786 mov \$1,%eax # ExceptionContinueSearch 1787 add \$64,%rsp 1788 popfq 1789 pop %r15 1790 pop %r14 1791 pop %r13 1792 pop %r12 1793 pop %rbp 1794 pop %rbx 1795 pop %rdi 1796 pop %rsi 1797 ret 1798.size rsaz_se_handler,.-rsaz_se_handler 1799 1800.section .pdata 1801.align 4 1802 .rva .LSEH_begin_rsaz_1024_sqr_avx2 1803 .rva 
.LSEH_end_rsaz_1024_sqr_avx2 1804 .rva .LSEH_info_rsaz_1024_sqr_avx2 1805 1806 .rva .LSEH_begin_rsaz_1024_mul_avx2 1807 .rva .LSEH_end_rsaz_1024_mul_avx2 1808 .rva .LSEH_info_rsaz_1024_mul_avx2 1809 1810 .rva .LSEH_begin_rsaz_1024_gather5 1811 .rva .LSEH_end_rsaz_1024_gather5 1812 .rva .LSEH_info_rsaz_1024_gather5 1813.section .xdata 1814.align 8 1815.LSEH_info_rsaz_1024_sqr_avx2: 1816 .byte 9,0,0,0 1817 .rva rsaz_se_handler 1818 .rva .Lsqr_1024_body,.Lsqr_1024_epilogue 1819.LSEH_info_rsaz_1024_mul_avx2: 1820 .byte 9,0,0,0 1821 .rva rsaz_se_handler 1822 .rva .Lmul_1024_body,.Lmul_1024_epilogue 1823.LSEH_info_rsaz_1024_gather5: 1824 .byte 0x01,0x33,0x16,0x00 1825 .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15 1826 .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14 1827 .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13 1828 .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12 1829 .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11 1830 .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10 1831 .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9 1832 .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8 1833 .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7 1834 .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6 1835 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 1836___ 1837} 1838 1839foreach (split("\n",$code)) { 1840 s/\`([^\`]*)\`/eval($1)/ge; 1841 1842 s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or 1843 1844 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1845 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or 1846 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1847 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1848 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; 1849 print $_,"\n"; 1850} 1851 1852}}} else {{{ 1853print <<___; # assembler is too old 1854.text 1855 1856.globl rsaz_avx2_eligible 1857.type rsaz_avx2_eligible,\@abi-omnipotent 1858rsaz_avx2_eligible: 1859 xor %eax,%eax 1860 ret 1861.size rsaz_avx2_eligible,.-rsaz_avx2_eligible 1862 1863.globl rsaz_1024_sqr_avx2 1864.globl rsaz_1024_mul_avx2 1865.globl rsaz_1024_norm2red_avx2 1866.globl rsaz_1024_red2norm_avx2 1867.globl rsaz_1024_scatter5_avx2 1868.globl rsaz_1024_gather5_avx2 1869.type rsaz_1024_sqr_avx2,\@abi-omnipotent 1870rsaz_1024_sqr_avx2: 1871rsaz_1024_mul_avx2: 1872rsaz_1024_norm2red_avx2: 1873rsaz_1024_red2norm_avx2: 1874rsaz_1024_scatter5_avx2: 1875rsaz_1024_gather5_avx2: 1876 .byte 0x0f,0x0b # ud2 1877 ret 1878.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 1879___ 1880}}} 1881 1882close STDOUT; 1883