1#! /usr/bin/env perl 2# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# This module implements support for Intel AES-NI extension. In 18# OpenSSL context it's used with Intel engine, but can also be used as 19# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for 20# details]. 21# 22# Performance. 23# 24# Given aes(enc|dec) instructions' latency asymptotic performance for 25# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte 26# processed with 128-bit key. And given their throughput asymptotic 27# performance for parallelizable modes is 1.25 cycles per byte. Being 28# asymptotic limit it's not something you commonly achieve in reality, 29# but how close does one get? Below are results collected for 30# different modes and block sized. Pairs of numbers are for en-/ 31# decryption. 32# 33# 16-byte 64-byte 256-byte 1-KB 8-KB 34# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 35# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 36# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 37# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 38# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 39# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 40# 41# ECB, CTR, CBC and CCM results are free from EVP overhead. 
This means 42# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni 43# [-decrypt]' will exhibit 10-15% worse results for smaller blocks. 44# The results were collected with specially crafted speed.c benchmark 45# in order to compare them with results reported in "Intel Advanced 46# Encryption Standard (AES) New Instruction Set" White Paper Revision 47# 3.0 dated May 2010. All above results are consistently better. This 48# module also provides better performance for block sizes smaller than 49# 128 bytes in points *not* represented in the above table. 50# 51# Looking at the results for 8-KB buffer. 52# 53# CFB and OFB results are far from the limit, because implementation 54# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on 55# single-block aesni_encrypt, which is not the most optimal way to go. 56# CBC encrypt result is unexpectedly high and there is no documented 57# explanation for it. Seemingly there is a small penalty for feeding 58# the result back to AES unit the way it's done in CBC mode. There is 59# nothing one can do and the result appears optimal. CCM result is 60# identical to CBC, because CBC-MAC is essentially CBC encrypt without 61# saving output. CCM CTR "stays invisible," because it's neatly 62# interleaved wih CBC-MAC. This provides ~30% improvement over 63# "straightforward" CCM implementation with CTR and CBC-MAC performed 64# disjointly. Parallelizable modes practically achieve the theoretical 65# limit. 66# 67# Looking at how results vary with buffer size. 68# 69# Curves are practically saturated at 1-KB buffer size. In most cases 70# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one. 71# CTR curve doesn't follow this pattern and is "slowest" changing one 72# with "256-byte" result being 87% of "8-KB." This is because overhead 73# in CTR mode is most computationally intensive. 
Small-block CCM 74# decrypt is slower than encrypt, because first CTR and last CBC-MAC 75# iterations can't be interleaved. 76# 77# Results for 192- and 256-bit keys. 78# 79# EVP-free results were observed to scale perfectly with number of 80# rounds for larger block sizes, i.e. 192-bit result being 10/12 times 81# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences 82# are a tad smaller, because the above mentioned penalty biases all 83# results by same constant value. In similar way function call 84# overhead affects small-block performance, as well as OFB and CFB 85# results. Differences are not large, most common coefficients are 86# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one 87# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... 88 89# January 2011 90# 91# While Westmere processor features 6 cycles latency for aes[enc|dec] 92# instructions, which can be scheduled every second cycle, Sandy 93# Bridge spends 8 cycles per instruction, but it can schedule them 94# every cycle. This means that code targeting Westmere would perform 95# suboptimally on Sandy Bridge. Therefore this update. 96# 97# In addition, non-parallelizable CBC encrypt (as well as CCM) is 98# optimized. Relative improvement might appear modest, 8% on Westmere, 99# but in absolute terms it's 3.77 cycles per byte encrypted with 100# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers 101# should be compared to asymptotic limits of 3.75 for Westmere and 102# 5.00 for Sandy Bridge. Actually, the fact that they get this close 103# to asymptotic limits is quite amazing. Indeed, the limit is 104# calculated as latency times number of rounds, 10 for 128-bit key, 105# and divided by 16, the number of bytes in block, or in other words 106# it accounts *solely* for aesenc instructions. 
But there are extra 107# instructions, and numbers so close to the asymptotic limits mean 108# that it's as if it takes as little as *one* additional cycle to 109# execute all of them. How is it possible? It is possible thanks to 110# out-of-order execution logic, which manages to overlap post- 111# processing of previous block, things like saving the output, with 112# actual encryption of current block, as well as pre-processing of 113# current block, things like fetching input and xor-ing it with 114# 0-round element of the key schedule, with actual encryption of 115# previous block. Keep this in mind... 116# 117# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher 118# performance is achieved by interleaving instructions working on 119# independent blocks. In which case asymptotic limit for such modes 120# can be obtained by dividing above mentioned numbers by AES 121# instructions' interleave factor. Westmere can execute at most 3 122# instructions at a time, meaning that optimal interleave factor is 3, 123# and that's where the "magic" number of 1.25 come from. "Optimal 124# interleave factor" means that increase of interleave factor does 125# not improve performance. The formula has proven to reflect reality 126# pretty well on Westmere... Sandy Bridge on the other hand can 127# execute up to 8 AES instructions at a time, so how does varying 128# interleave factor affect the performance? Here is table for ECB 129# (numbers are cycles per byte processed with 128-bit key): 130# 131# instruction interleave factor 3x 6x 8x 132# theoretical asymptotic limit 1.67 0.83 0.625 133# measured performance for 8KB block 1.05 0.86 0.84 134# 135# "as if" interleave factor 4.7x 5.8x 6.0x 136# 137# Further data for other parallelizable modes: 138# 139# CBC decrypt 1.16 0.93 0.74 140# CTR 1.14 0.91 0.74 141# 142# Well, given 3x column it's probably inappropriate to call the limit 143# asymptotic, if it can be surpassed, isn't it? What happens there? 
# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
# magic is responsible for this. Processor overlaps not only the
# additional instructions with AES ones, but even AES instructions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions still claim disproportionately small amount
# of additional cycles, but in 8x case number of instructions must be
# a tad too high for out-of-order logic to cope with, and AES unit
# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
# utilizes 6x interleave because of limited register bank capacity.
#
# Higher interleave factors do have negative impact on Westmere
# performance. While for ECB mode it's negligible ~1.5%, other
# parallelizables perform ~5% worse, which is outweighed by ~25%
# improvement on Sandy Bridge. To balance regression on Westmere
# CTR mode was implemented with 6x aesenc interleave factor.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB	OCB
# Westmere	3.77/1.25	1.25	1.25	1.26
# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
# Skylake	2.62/0.63	0.63	0.63	0.63
# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
# Knights L	2.54/0.77	0.78	0.85	-	1.50
# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
# Ryzen		2.71/0.35	0.35	0.44	0.38	0.49
#
# (*)	Atom Silvermont ECB result is suboptimal because of penalties
#	incurred by operations on %xmm8-15. As ECB is not considered
#	critical, nothing was done to mitigate the problem.

$PREFIX="aes_hw";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

# Command line: either "flavour output" or just "output" (in which case
# the flavour is inferred by the translator from the file extension).
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64 perlasm translator next to this script or in the
# shared perlasm directory; everything printed to STDOUT below is piped
# through it to produce flavour-specific assembly.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# Both arms of the former '$PREFIX eq "aes_hw" ? "movups" : "movups"'
# conditional were identical, so the vestigial test has been dropped.
$movkey = "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";
$code.=".extern	OPENSSL_ia32cap_P\n";

# Register roles used by the generated code.
$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

# Note that $in2/$in1/$in0/$iv alias $inout4..7 above.
$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
$in0="%xmm8";		$iv="%xmm9";

# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
#
# aesni_generate1($p,$key,$rounds,$inout,$ivec) appends to $code the
# instructions for one AES pass over a single block. $p is "enc" or
# "dec"; $key and $rounds name the registers holding the key-schedule
# pointer and round count (both are advanced/decremented by the emitted
# code); $inout defaults to $inout0; when $ivec is supplied it is
# xor-ed into the block before the first round (CBC chaining). $sn
# makes each emitted loop label unique.
{ my $sn;
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
++$sn;
$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
$code.=<<___ if (defined($ivec));
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
$code.=<<___ if (!defined($ivec));
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
$code.=<<___;
.Loop_${p}1_$sn:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block public entry point; built on aesni_generate1 above.
{ my ($inp,$out,$key) = @_4args;

$code.=<<___;
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@abi-omnipotent
.align	16
${PREFIX}_encrypt:
.cfi_startproc
	_CET_ENDBR
#ifdef BORINGSSL_DISPATCH_TEST
.extern	BORINGSSL_function_hit
	movb \$1,BORINGSSL_function_hit+1(%rip)
#endif
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	pxor	$inout0,$inout0
	ret
.cfi_endproc
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
___
}

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge and "super-optimal" for other Intel CPUs...

# Each aesni_generateN below appends to $code a leaf subroutine that
# runs the full AES round loop over N independent blocks in parallel.
sub aesni_generate2 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-1] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt2,\@abi-omnipotent
.align	16
_aesni_${dir}rypt2:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	add	\$16,%rax

.L${dir}_loop2:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop2

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	ret
.cfi_endproc
.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	add	\$16,%rax

.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop3

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.cfi_endproc
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
sub aesni_generate4 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
# The raw ".byte 0x0f,0x1f,0x00" below is a 3-byte NOP, emitted as
# padding so the loop body that follows stays aligned.
$code.=<<___;
.type	_aesni_${dir}rypt4,\@abi-omnipotent
.align	16
_aesni_${dir}rypt4:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	.byte	0x0f,0x1f,0x00
	add	\$16,%rax

.L${dir}_loop4:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop4

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	ret
.cfi_endproc
.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
sub aesni_generate6 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
# NOTE: the CTR32 code also enters this round loop directly via
# "call .Lenc_loop6", so the labels below are part of the contract.
$code.=<<___;
.type	_aesni_${dir}rypt6,\@abi-omnipotent
.align	16
_aesni_${dir}rypt6:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	pxor	$rndkey0,$inout1
	pxor	$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout0
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	aes${dir}	$rndkey1,$inout1
	pxor	$rndkey0,$inout3
	pxor	$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout2
	pxor	$rndkey0,$inout5
	$movkey	($key,%rax),$rndkey0
	add	\$16,%rax
	jmp	.L${dir}_loop6_enter
.align	16
.L${dir}_loop6:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
.L${dir}_loop6_enter:
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop6

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	ret
.cfi_endproc
.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
sub aesni_generate8 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
# NOTE: .L${dir}_loop8_enter is not jumped to from within this
# subroutine; the CTR32 tail code enters it directly via
# "call .Lenc_loop8_enter".
$code.=<<___;
.type	_aesni_${dir}rypt8,\@abi-omnipotent
.align	16
_aesni_${dir}rypt8:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	pxor	$rndkey0,$inout2
	pxor	$rndkey0,$inout3
	pxor	$rndkey0,$inout4
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	aes${dir}	$rndkey1,$inout0
	pxor	$rndkey0,$inout5
	pxor	$rndkey0,$inout6
	aes${dir}	$rndkey1,$inout1
	pxor	$rndkey0,$inout7
	$movkey	($key,%rax),$rndkey0
	add	\$16,%rax
	jmp	.L${dir}_loop8_inner
.align	16
.L${dir}_loop8:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
.L${dir}_loop8_inner:
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
.L${dir}_loop8_enter:
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	aes${dir}	$rndkey0,$inout6
	aes${dir}	$rndkey0,$inout7
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop8

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	aes${dir}last	$rndkey0,$inout6
	aes${dir}last	$rndkey0,$inout7
	ret
.cfi_endproc
.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
# Emit the encrypt-direction variants (decrypt variants are not needed
# by this module's $PREFIX configuration).
&aesni_generate2("enc") if ($PREFIX eq "aes_hw");
&aesni_generate3("enc") if ($PREFIX eq
"aes_hw"); 578&aesni_generate4("enc") if ($PREFIX eq "aes_hw"); 579&aesni_generate6("enc") if ($PREFIX eq "aes_hw"); 580&aesni_generate8("enc") if ($PREFIX eq "aes_hw"); 581 582if ($PREFIX eq "aes_hw") { 583{ 584###################################################################### 585# void aesni_ctr32_encrypt_blocks (const void *in, void *out, 586# size_t blocks, const AES_KEY *key, 587# const char *ivec); 588# 589# Handles only complete blocks, operates on 32-bit counter and 590# does not update *ivec! (see crypto/modes/ctr128.c for details) 591# 592# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, 593# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest. 594# Keywords are full unroll and modulo-schedule counter calculations 595# with zero-round key xor. 596{ 597my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); 598my ($key0,$ctr)=("%ebp","${ivp}d"); 599my $frame_size = 0x80 + ($win64?160:0); 600 601$code.=<<___; 602.globl ${PREFIX}_ctr32_encrypt_blocks 603.type ${PREFIX}_ctr32_encrypt_blocks,\@function,5 604.align 16 605${PREFIX}_ctr32_encrypt_blocks: 606.cfi_startproc 607 _CET_ENDBR 608#ifdef BORINGSSL_DISPATCH_TEST 609 movb \$1,BORINGSSL_function_hit(%rip) 610#endif 611 cmp \$1,$len 612 jne .Lctr32_bulk 613 614 # handle single block without allocating stack frame, 615 # useful when handling edges 616 movups ($ivp),$inout0 617 movups ($inp),$inout1 618 mov 240($key),%edx # key->rounds 619___ 620 &aesni_generate1("enc",$key,"%edx"); 621$code.=<<___; 622 pxor $rndkey0,$rndkey0 # clear register bank 623 pxor $rndkey1,$rndkey1 624 xorps $inout1,$inout0 625 pxor $inout1,$inout1 626 movups $inout0,($out) 627 xorps $inout0,$inout0 628 jmp .Lctr32_epilogue 629 630.align 16 631.Lctr32_bulk: 632 lea (%rsp),$key_ # use $key_ as frame pointer 633.cfi_def_cfa_register $key_ 634 push %rbp 635.cfi_push %rbp 636 sub \$$frame_size,%rsp 637 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 638___ 639$code.=<<___ if 
($win64); 640 movaps %xmm6,-0xa8($key_) # offload everything 641 movaps %xmm7,-0x98($key_) 642 movaps %xmm8,-0x88($key_) 643 movaps %xmm9,-0x78($key_) 644 movaps %xmm10,-0x68($key_) 645 movaps %xmm11,-0x58($key_) 646 movaps %xmm12,-0x48($key_) 647 movaps %xmm13,-0x38($key_) 648 movaps %xmm14,-0x28($key_) 649 movaps %xmm15,-0x18($key_) 650.Lctr32_body: 651___ 652$code.=<<___; 653 654 # 8 16-byte words on top of stack are counter values 655 # xor-ed with zero-round key 656 657 movdqu ($ivp),$inout0 658 movdqu ($key),$rndkey0 659 mov 12($ivp),$ctr # counter LSB 660 pxor $rndkey0,$inout0 661 mov 12($key),$key0 # 0-round key LSB 662 movdqa $inout0,0x00(%rsp) # populate counter block 663 bswap $ctr 664 movdqa $inout0,$inout1 665 movdqa $inout0,$inout2 666 movdqa $inout0,$inout3 667 movdqa $inout0,0x40(%rsp) 668 movdqa $inout0,0x50(%rsp) 669 movdqa $inout0,0x60(%rsp) 670 mov %rdx,%r10 # about to borrow %rdx 671 movdqa $inout0,0x70(%rsp) 672 673 lea 1($ctr),%rax 674 lea 2($ctr),%rdx 675 bswap %eax 676 bswap %edx 677 xor $key0,%eax 678 xor $key0,%edx 679 pinsrd \$3,%eax,$inout1 680 lea 3($ctr),%rax 681 movdqa $inout1,0x10(%rsp) 682 pinsrd \$3,%edx,$inout2 683 bswap %eax 684 mov %r10,%rdx # restore %rdx 685 lea 4($ctr),%r10 686 movdqa $inout2,0x20(%rsp) 687 xor $key0,%eax 688 bswap %r10d 689 pinsrd \$3,%eax,$inout3 690 xor $key0,%r10d 691 movdqa $inout3,0x30(%rsp) 692 lea 5($ctr),%r9 693 mov %r10d,0x40+12(%rsp) 694 bswap %r9d 695 lea 6($ctr),%r10 696 mov 240($key),$rounds # key->rounds 697 xor $key0,%r9d 698 bswap %r10d 699 mov %r9d,0x50+12(%rsp) 700 xor $key0,%r10d 701 lea 7($ctr),%r9 702 mov %r10d,0x60+12(%rsp) 703 bswap %r9d 704 leaq OPENSSL_ia32cap_P(%rip),%r10 705 mov 4(%r10),%r10d 706 xor $key0,%r9d 707 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE 708 mov %r9d,0x70+12(%rsp) 709 710 $movkey 0x10($key),$rndkey1 711 712 movdqa 0x40(%rsp),$inout4 713 movdqa 0x50(%rsp),$inout5 714 715 cmp \$8,$len # $len is in blocks 716 jb .Lctr32_tail # short input if ($len<8) 717 718 
sub \$6,$len # $len is biased by -6 719 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE 720 je .Lctr32_6x # [which denotes Atom Silvermont] 721 722 lea 0x80($key),$key # size optimization 723 sub \$2,$len # $len is biased by -8 724 jmp .Lctr32_loop8 725 726.align 16 727.Lctr32_6x: 728 shl \$4,$rounds 729 mov \$48,$rnds_ 730 bswap $key0 731 lea 32($key,$rounds),$key # end of key schedule 732 sub %rax,%r10 # twisted $rounds 733 jmp .Lctr32_loop6 734 735.align 16 736.Lctr32_loop6: 737 add \$6,$ctr # next counter value 738 $movkey -48($key,$rnds_),$rndkey0 739 aesenc $rndkey1,$inout0 740 mov $ctr,%eax 741 xor $key0,%eax 742 aesenc $rndkey1,$inout1 743 movbe %eax,`0x00+12`(%rsp) # store next counter value 744 lea 1($ctr),%eax 745 aesenc $rndkey1,$inout2 746 xor $key0,%eax 747 movbe %eax,`0x10+12`(%rsp) 748 aesenc $rndkey1,$inout3 749 lea 2($ctr),%eax 750 xor $key0,%eax 751 aesenc $rndkey1,$inout4 752 movbe %eax,`0x20+12`(%rsp) 753 lea 3($ctr),%eax 754 aesenc $rndkey1,$inout5 755 $movkey -32($key,$rnds_),$rndkey1 756 xor $key0,%eax 757 758 aesenc $rndkey0,$inout0 759 movbe %eax,`0x30+12`(%rsp) 760 lea 4($ctr),%eax 761 aesenc $rndkey0,$inout1 762 xor $key0,%eax 763 movbe %eax,`0x40+12`(%rsp) 764 aesenc $rndkey0,$inout2 765 lea 5($ctr),%eax 766 xor $key0,%eax 767 aesenc $rndkey0,$inout3 768 movbe %eax,`0x50+12`(%rsp) 769 mov %r10,%rax # mov $rnds_,$rounds 770 aesenc $rndkey0,$inout4 771 aesenc $rndkey0,$inout5 772 $movkey -16($key,$rnds_),$rndkey0 773 774 call .Lenc_loop6 775 776 movdqu ($inp),$inout6 # load 6 input blocks 777 movdqu 0x10($inp),$inout7 778 movdqu 0x20($inp),$in0 779 movdqu 0x30($inp),$in1 780 movdqu 0x40($inp),$in2 781 movdqu 0x50($inp),$in3 782 lea 0x60($inp),$inp # $inp+=6*16 783 $movkey -64($key,$rnds_),$rndkey1 784 pxor $inout0,$inout6 # inp^=E(ctr) 785 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round] 786 pxor $inout1,$inout7 787 movaps 0x10(%rsp),$inout1 788 pxor $inout2,$in0 789 movaps 0x20(%rsp),$inout2 790 pxor $inout3,$in1 
791 movaps 0x30(%rsp),$inout3 792 pxor $inout4,$in2 793 movaps 0x40(%rsp),$inout4 794 pxor $inout5,$in3 795 movaps 0x50(%rsp),$inout5 796 movdqu $inout6,($out) # store 6 output blocks 797 movdqu $inout7,0x10($out) 798 movdqu $in0,0x20($out) 799 movdqu $in1,0x30($out) 800 movdqu $in2,0x40($out) 801 movdqu $in3,0x50($out) 802 lea 0x60($out),$out # $out+=6*16 803 804 sub \$6,$len 805 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow 806 807 add \$6,$len # restore real remaining $len 808 jz .Lctr32_done # done if ($len==0) 809 810 lea -48($rnds_),$rounds 811 lea -80($key,$rnds_),$key # restore $key 812 neg $rounds 813 shr \$4,$rounds # restore $rounds 814 jmp .Lctr32_tail 815 816.align 32 817.Lctr32_loop8: 818 add \$8,$ctr # next counter value 819 movdqa 0x60(%rsp),$inout6 820 aesenc $rndkey1,$inout0 821 mov $ctr,%r9d 822 movdqa 0x70(%rsp),$inout7 823 aesenc $rndkey1,$inout1 824 bswap %r9d 825 $movkey 0x20-0x80($key),$rndkey0 826 aesenc $rndkey1,$inout2 827 xor $key0,%r9d 828 nop 829 aesenc $rndkey1,$inout3 830 mov %r9d,0x00+12(%rsp) # store next counter value 831 lea 1($ctr),%r9 832 aesenc $rndkey1,$inout4 833 aesenc $rndkey1,$inout5 834 aesenc $rndkey1,$inout6 835 aesenc $rndkey1,$inout7 836 $movkey 0x30-0x80($key),$rndkey1 837___ 838for($i=2;$i<8;$i++) { 839my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; 840$code.=<<___; 841 bswap %r9d 842 aesenc $rndkeyx,$inout0 843 aesenc $rndkeyx,$inout1 844 xor $key0,%r9d 845 .byte 0x66,0x90 846 aesenc $rndkeyx,$inout2 847 aesenc $rndkeyx,$inout3 848 mov %r9d,`0x10*($i-1)`+12(%rsp) 849 lea $i($ctr),%r9 850 aesenc $rndkeyx,$inout4 851 aesenc $rndkeyx,$inout5 852 aesenc $rndkeyx,$inout6 853 aesenc $rndkeyx,$inout7 854 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx 855___ 856} 857$code.=<<___; 858 bswap %r9d 859 aesenc $rndkey0,$inout0 860 aesenc $rndkey0,$inout1 861 aesenc $rndkey0,$inout2 862 xor $key0,%r9d 863 movdqu 0x00($inp),$in0 # start loading input 864 aesenc $rndkey0,$inout3 865 mov %r9d,0x70+12(%rsp) 866 cmp \$11,$rounds 867 aesenc 
$rndkey0,$inout4 868 aesenc $rndkey0,$inout5 869 aesenc $rndkey0,$inout6 870 aesenc $rndkey0,$inout7 871 $movkey 0xa0-0x80($key),$rndkey0 872 873 jb .Lctr32_enc_done 874 875 aesenc $rndkey1,$inout0 876 aesenc $rndkey1,$inout1 877 aesenc $rndkey1,$inout2 878 aesenc $rndkey1,$inout3 879 aesenc $rndkey1,$inout4 880 aesenc $rndkey1,$inout5 881 aesenc $rndkey1,$inout6 882 aesenc $rndkey1,$inout7 883 $movkey 0xb0-0x80($key),$rndkey1 884 885 aesenc $rndkey0,$inout0 886 aesenc $rndkey0,$inout1 887 aesenc $rndkey0,$inout2 888 aesenc $rndkey0,$inout3 889 aesenc $rndkey0,$inout4 890 aesenc $rndkey0,$inout5 891 aesenc $rndkey0,$inout6 892 aesenc $rndkey0,$inout7 893 $movkey 0xc0-0x80($key),$rndkey0 894 895 # 192-bit key support was removed. 896 897 aesenc $rndkey1,$inout0 898 aesenc $rndkey1,$inout1 899 aesenc $rndkey1,$inout2 900 aesenc $rndkey1,$inout3 901 aesenc $rndkey1,$inout4 902 aesenc $rndkey1,$inout5 903 aesenc $rndkey1,$inout6 904 aesenc $rndkey1,$inout7 905 $movkey 0xd0-0x80($key),$rndkey1 906 907 aesenc $rndkey0,$inout0 908 aesenc $rndkey0,$inout1 909 aesenc $rndkey0,$inout2 910 aesenc $rndkey0,$inout3 911 aesenc $rndkey0,$inout4 912 aesenc $rndkey0,$inout5 913 aesenc $rndkey0,$inout6 914 aesenc $rndkey0,$inout7 915 $movkey 0xe0-0x80($key),$rndkey0 916 jmp .Lctr32_enc_done 917 918.align 16 919.Lctr32_enc_done: 920 movdqu 0x10($inp),$in1 921 pxor $rndkey0,$in0 # input^=round[last] 922 movdqu 0x20($inp),$in2 923 pxor $rndkey0,$in1 924 movdqu 0x30($inp),$in3 925 pxor $rndkey0,$in2 926 movdqu 0x40($inp),$in4 927 pxor $rndkey0,$in3 928 movdqu 0x50($inp),$in5 929 pxor $rndkey0,$in4 930 prefetcht0 0x1c0($inp) # We process 128 bytes (8*16), so to prefetch 1 iteration 931 prefetcht0 0x200($inp) # We need to prefetch 2 64 byte lines 932 pxor $rndkey0,$in5 933 aesenc $rndkey1,$inout0 934 aesenc $rndkey1,$inout1 935 aesenc $rndkey1,$inout2 936 aesenc $rndkey1,$inout3 937 aesenc $rndkey1,$inout4 938 aesenc $rndkey1,$inout5 939 aesenc $rndkey1,$inout6 940 aesenc $rndkey1,$inout7 
941 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6] 942 lea 0x80($inp),$inp # $inp+=8*16 943 944 aesenclast $in0,$inout0 # $inN is inp[N]^round[last] 945 pxor $rndkey0,$rndkey1 # borrowed $rndkey 946 movdqu 0x70-0x80($inp),$in0 947 aesenclast $in1,$inout1 948 pxor $rndkey0,$in0 949 movdqa 0x00(%rsp),$in1 # load next counter block 950 aesenclast $in2,$inout2 951 aesenclast $in3,$inout3 952 movdqa 0x10(%rsp),$in2 953 movdqa 0x20(%rsp),$in3 954 aesenclast $in4,$inout4 955 aesenclast $in5,$inout5 956 movdqa 0x30(%rsp),$in4 957 movdqa 0x40(%rsp),$in5 958 aesenclast $rndkey1,$inout6 959 movdqa 0x50(%rsp),$rndkey0 960 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key 961 aesenclast $in0,$inout7 962 963 movups $inout0,($out) # store 8 output blocks 964 movdqa $in1,$inout0 965 movups $inout1,0x10($out) 966 movdqa $in2,$inout1 967 movups $inout2,0x20($out) 968 movdqa $in3,$inout2 969 movups $inout3,0x30($out) 970 movdqa $in4,$inout3 971 movups $inout4,0x40($out) 972 movdqa $in5,$inout4 973 movups $inout5,0x50($out) 974 movdqa $rndkey0,$inout5 975 movups $inout6,0x60($out) 976 movups $inout7,0x70($out) 977 lea 0x80($out),$out # $out+=8*16 978 979 sub \$8,$len 980 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow 981 982 add \$8,$len # restore real remaining $len 983 jz .Lctr32_done # done if ($len==0) 984 lea -0x80($key),$key 985 986.Lctr32_tail: 987 # note that at this point $inout0..5 are populated with 988 # counter values xor-ed with 0-round key 989 lea 16($key),$key 990 cmp \$4,$len 991 jb .Lctr32_loop3 992 je .Lctr32_loop4 993 994 # if ($len>4) compute 7 E(counter) 995 shl \$4,$rounds 996 movdqa 0x60(%rsp),$inout6 997 pxor $inout7,$inout7 998 999 $movkey 16($key),$rndkey0 1000 aesenc $rndkey1,$inout0 1001 aesenc $rndkey1,$inout1 1002 lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter 1003 neg %rax 1004 aesenc $rndkey1,$inout2 1005 add \$16,%rax # prepare for .Lenc_loop8_enter 1006 movups ($inp),$in0 1007 aesenc $rndkey1,$inout3 1008 aesenc 
$rndkey1,$inout4 1009 movups 0x10($inp),$in1 # pre-load input 1010 movups 0x20($inp),$in2 1011 aesenc $rndkey1,$inout5 1012 aesenc $rndkey1,$inout6 1013 1014 call .Lenc_loop8_enter 1015 1016 movdqu 0x30($inp),$in3 1017 pxor $in0,$inout0 1018 movdqu 0x40($inp),$in0 1019 pxor $in1,$inout1 1020 movdqu $inout0,($out) # store output 1021 pxor $in2,$inout2 1022 movdqu $inout1,0x10($out) 1023 pxor $in3,$inout3 1024 movdqu $inout2,0x20($out) 1025 pxor $in0,$inout4 1026 movdqu $inout3,0x30($out) 1027 movdqu $inout4,0x40($out) 1028 cmp \$6,$len 1029 jb .Lctr32_done # $len was 5, stop store 1030 1031 movups 0x50($inp),$in1 1032 xorps $in1,$inout5 1033 movups $inout5,0x50($out) 1034 je .Lctr32_done # $len was 6, stop store 1035 1036 movups 0x60($inp),$in2 1037 xorps $in2,$inout6 1038 movups $inout6,0x60($out) 1039 jmp .Lctr32_done # $len was 7, stop store 1040 1041.align 32 1042.Lctr32_loop4: 1043 aesenc $rndkey1,$inout0 1044 lea 16($key),$key 1045 dec $rounds 1046 aesenc $rndkey1,$inout1 1047 aesenc $rndkey1,$inout2 1048 aesenc $rndkey1,$inout3 1049 $movkey ($key),$rndkey1 1050 jnz .Lctr32_loop4 1051 aesenclast $rndkey1,$inout0 1052 aesenclast $rndkey1,$inout1 1053 movups ($inp),$in0 # load input 1054 movups 0x10($inp),$in1 1055 aesenclast $rndkey1,$inout2 1056 aesenclast $rndkey1,$inout3 1057 movups 0x20($inp),$in2 1058 movups 0x30($inp),$in3 1059 1060 xorps $in0,$inout0 1061 movups $inout0,($out) # store output 1062 xorps $in1,$inout1 1063 movups $inout1,0x10($out) 1064 pxor $in2,$inout2 1065 movdqu $inout2,0x20($out) 1066 pxor $in3,$inout3 1067 movdqu $inout3,0x30($out) 1068 jmp .Lctr32_done # $len was 4, stop store 1069 1070.align 32 1071.Lctr32_loop3: 1072 aesenc $rndkey1,$inout0 1073 lea 16($key),$key 1074 dec $rounds 1075 aesenc $rndkey1,$inout1 1076 aesenc $rndkey1,$inout2 1077 $movkey ($key),$rndkey1 1078 jnz .Lctr32_loop3 1079 aesenclast $rndkey1,$inout0 1080 aesenclast $rndkey1,$inout1 1081 aesenclast $rndkey1,$inout2 1082 1083 movups ($inp),$in0 # load input 1084 
xorps $in0,$inout0 1085 movups $inout0,($out) # store output 1086 cmp \$2,$len 1087 jb .Lctr32_done # $len was 1, stop store 1088 1089 movups 0x10($inp),$in1 1090 xorps $in1,$inout1 1091 movups $inout1,0x10($out) 1092 je .Lctr32_done # $len was 2, stop store 1093 1094 movups 0x20($inp),$in2 1095 xorps $in2,$inout2 1096 movups $inout2,0x20($out) # $len was 3, stop store 1097 1098.Lctr32_done: 1099 xorps %xmm0,%xmm0 # clear register bank 1100 xor $key0,$key0 1101 pxor %xmm1,%xmm1 1102 pxor %xmm2,%xmm2 1103 pxor %xmm3,%xmm3 1104 pxor %xmm4,%xmm4 1105 pxor %xmm5,%xmm5 1106___ 1107$code.=<<___ if (!$win64); 1108 pxor %xmm6,%xmm6 1109 pxor %xmm7,%xmm7 1110 movaps %xmm0,0x00(%rsp) # clear stack 1111 pxor %xmm8,%xmm8 1112 movaps %xmm0,0x10(%rsp) 1113 pxor %xmm9,%xmm9 1114 movaps %xmm0,0x20(%rsp) 1115 pxor %xmm10,%xmm10 1116 movaps %xmm0,0x30(%rsp) 1117 pxor %xmm11,%xmm11 1118 movaps %xmm0,0x40(%rsp) 1119 pxor %xmm12,%xmm12 1120 movaps %xmm0,0x50(%rsp) 1121 pxor %xmm13,%xmm13 1122 movaps %xmm0,0x60(%rsp) 1123 pxor %xmm14,%xmm14 1124 movaps %xmm0,0x70(%rsp) 1125 pxor %xmm15,%xmm15 1126___ 1127$code.=<<___ if ($win64); 1128 movaps -0xa8($key_),%xmm6 1129 movaps %xmm0,-0xa8($key_) # clear stack 1130 movaps -0x98($key_),%xmm7 1131 movaps %xmm0,-0x98($key_) 1132 movaps -0x88($key_),%xmm8 1133 movaps %xmm0,-0x88($key_) 1134 movaps -0x78($key_),%xmm9 1135 movaps %xmm0,-0x78($key_) 1136 movaps -0x68($key_),%xmm10 1137 movaps %xmm0,-0x68($key_) 1138 movaps -0x58($key_),%xmm11 1139 movaps %xmm0,-0x58($key_) 1140 movaps -0x48($key_),%xmm12 1141 movaps %xmm0,-0x48($key_) 1142 movaps -0x38($key_),%xmm13 1143 movaps %xmm0,-0x38($key_) 1144 movaps -0x28($key_),%xmm14 1145 movaps %xmm0,-0x28($key_) 1146 movaps -0x18($key_),%xmm15 1147 movaps %xmm0,-0x18($key_) 1148 movaps %xmm0,0x00(%rsp) 1149 movaps %xmm0,0x10(%rsp) 1150 movaps %xmm0,0x20(%rsp) 1151 movaps %xmm0,0x30(%rsp) 1152 movaps %xmm0,0x40(%rsp) 1153 movaps %xmm0,0x50(%rsp) 1154 movaps %xmm0,0x60(%rsp) 1155 movaps %xmm0,0x70(%rsp) 
1156___ 1157$code.=<<___; 1158 mov -8($key_),%rbp 1159.cfi_restore %rbp 1160 lea ($key_),%rsp 1161.cfi_def_cfa_register %rsp 1162.Lctr32_epilogue: 1163 ret 1164.cfi_endproc 1165.size ${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks 1166___ 1167} }} 1168 1169{ my ($inp,$bits,$key) = @_4args; 1170 $bits =~ s/%r/%e/; 1171 1172# This is based on submission by 1173# 1174# Huang Ying <ying.huang@intel.com> 1175# Vinodh Gopal <vinodh.gopal@intel.com> 1176# Kahraman Akdemir 1177# 1178# Aggressively optimized in respect to aeskeygenassist's critical path 1179# and is contained in %xmm0-5 to meet Win64 ABI requirement. 1180# 1181# int ${PREFIX}_set_encrypt_key(const unsigned char *inp, 1182# int bits, AES_KEY * const key); 1183# 1184# input: $inp user-supplied key 1185# $bits $inp length in bits 1186# $key pointer to key schedule 1187# output: %eax 0 denoting success, -1 or -2 - failure (see C) 1188# $bits rounds-1 (used in aesni_set_decrypt_key) 1189# *$key key schedule 1190# $key pointer to key schedule (used in 1191# aesni_set_decrypt_key) 1192# 1193# Subroutine is frame-less, which means that only volatile registers 1194# are used. Note that it's declared "abi-omnipotent", which means that 1195# amount of volatile registers is smaller on Windows. 
#
$code.=<<___;
.globl	${PREFIX}_set_encrypt_key
.type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
.align	16
${PREFIX}_set_encrypt_key:
__aesni_set_encrypt_key:
.cfi_startproc
	_CET_ENDBR
#ifdef BORINGSSL_DISPATCH_TEST
	movb	\$1,BORINGSSL_function_hit+3(%rip)
#endif
	# Hand-encoded "sub rsp,8" so the exact byte sequence matches the
	# SEH unwind opcodes emitted in .LSEH_info_key below.
	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
.cfi_adjust_cfa_offset	8
	mov	\$-1,%rax		# default return value: -1 (NULL argument)
	test	$inp,$inp
	jz	.Lenc_key_ret
	test	$key,$key
	jz	.Lenc_key_ret

	movups	($inp),%xmm0		# pull first 128 bits of *userKey
	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
	leaq	OPENSSL_ia32cap_P(%rip),%r10
	movl	4(%r10),%r10d
	and	\$`1<<28|1<<11`,%r10d	# AVX and XOP bits
	lea	16($key),%rax		# %rax is used as modifiable copy of $key
	cmp	\$256,$bits
	je	.L14rounds
	# 192-bit key support was removed.
	cmp	\$128,$bits
	jne	.Lbad_keybits

.L10rounds:
	mov	\$9,$bits		# 10 rounds for 128-bit key
	cmp	\$`1<<28`,%r10d		# AVX, but no XOP
	je	.L10rounds_alt

	# Classic schedule: one aeskeygenassist per round (immediate is the
	# AES round constant), expanded by the .Lkey_expansion_128* helpers.
	$movkey	%xmm0,($key)			# round 0
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 1
	call		.Lkey_expansion_128_cold
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 2
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 3
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 4
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 5
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 6
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x40,%xmm0,%xmm1	# round 7
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x80,%xmm0,%xmm1	# round 8
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x1b,%xmm0,%xmm1	# round 9
	call		.Lkey_expansion_128
	aeskeygenassist	\$0x36,%xmm0,%xmm1	# round 10
	call		.Lkey_expansion_128
	$movkey	%xmm0,(%rax)
	mov	$bits,80(%rax)	# 240(%rdx), i.e. key->rounds slot
	xor	%eax,%eax	# return 0 (success)
	jmp	.Lenc_key_ret

.align	16
.L10rounds_alt:
	# AVX-capable (but no XOP) path: build the 128-bit schedule without
	# aeskeygenassist, using pshufb + aesenclast to perform
	# SubWord/RotWord.  %xmm4 carries the round constant broadcast to all
	# dwords; pslld \$1,%xmm4 doubles it for the next round.
	movdqa	.Lkey_rotate(%rip),%xmm5
	mov	\$8,%r10d		# 8 loop rounds + 2 unrolled below
	movdqa	.Lkey_rcon1(%rip),%xmm4
	movdqa	%xmm0,%xmm2
	movdqu	%xmm0,($key)		# round 0 = raw user key
	jmp	.Loop_key128

.align	16
.Loop_key128:
	pshufb	%xmm5,%xmm0		# RotWord of each dword
	aesenclast	%xmm4,%xmm0	# SubBytes, then xor round constant
	pslld	\$1,%xmm4		# rcon <<= 1 for next round
	lea	16(%rax),%rax

	# Prefix-xor of the previous round key's four dwords
	# (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3) via three pslldq/pxor steps.
	movdqa	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm3,%xmm2

	pxor	%xmm2,%xmm0		# combine with transformed word
	movdqu	%xmm0,-16(%rax)		# store this round key
	movdqa	%xmm0,%xmm2		# becomes "previous" for next round

	dec	%r10d
	jnz	.Loop_key128

	# Round 9: rcon wraps to 0x1b (cannot be reached by pslld doubling).
	movdqa	.Lkey_rcon1b(%rip),%xmm4

	pshufb	%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0
	pslld	\$1,%xmm4		# 0x1b -> 0x36 for the final round

	movdqa	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm3,%xmm2

	pxor	%xmm2,%xmm0
	movdqu	%xmm0,(%rax)

	# Round 10 (rcon 0x36), same recurrence.
	movdqa	%xmm0,%xmm2
	pshufb	%xmm5,%xmm0
	aesenclast	%xmm4,%xmm0

	movdqa	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm2,%xmm3
	pslldq	\$4,%xmm2
	pxor	%xmm3,%xmm2

	pxor	%xmm2,%xmm0
	movdqu	%xmm0,16(%rax)

	mov	$bits,96(%rax)	# 240($key), i.e. key->rounds slot
	xor	%eax,%eax	# return 0 (success)
	jmp	.Lenc_key_ret

# 192-bit key support was removed.

.align	16
.L14rounds:
	movups	16($inp),%xmm2		# remaining half of *userKey
	mov	\$13,$bits		# 14 rounds for 256
	lea	16(%rax),%rax
	cmp	\$`1<<28`,%r10d		# AVX, but no XOP
	je	.L14rounds_alt

	# Classic 256-bit schedule: rounds alternate between the "a" helper
	# (aeskeygenassist on the odd half, with round constant) and the "b"
	# helper (no round constant, broadcast of SubWord).
	$movkey	%xmm0,($key)			# round 0
	$movkey	%xmm2,16($key)			# round 1
	aeskeygenassist	\$0x1,%xmm2,%xmm1	# round 2
	call		.Lkey_expansion_256a_cold
	aeskeygenassist	\$0x1,%xmm0,%xmm1	# round 3
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x2,%xmm2,%xmm1	# round 4
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x2,%xmm0,%xmm1	# round 5
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x4,%xmm2,%xmm1	# round 6
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x4,%xmm0,%xmm1	# round 7
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x8,%xmm2,%xmm1	# round 8
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x8,%xmm0,%xmm1	# round 9
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x10,%xmm2,%xmm1	# round 10
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x10,%xmm0,%xmm1	# round 11
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x20,%xmm2,%xmm1	# round 12
	call		.Lkey_expansion_256a
	aeskeygenassist	\$0x20,%xmm0,%xmm1	# round 13
	call		.Lkey_expansion_256b
	aeskeygenassist	\$0x40,%xmm2,%xmm1	# round 14
	call		.Lkey_expansion_256a
	$movkey	%xmm0,(%rax)
	mov	$bits,16(%rax)	# 240(%rdx), i.e. key->rounds slot
	# NOTE(review): other success exits use "xor %eax,%eax"; the 64-bit
	# form below is equivalent (zeroes all of %rax), just inconsistent.
	xor	%rax,%rax	# return 0 (success)
	jmp	.Lenc_key_ret

.align	16
.L14rounds_alt:
	# AVX (no XOP) path for 256-bit keys, same pshufb+aesenclast trick as
	# .L10rounds_alt.  Each iteration emits one even round key; the odd
	# round key is derived with aesenclast against zero (SubWord only,
	# no rotate, no round constant).
	movdqa	.Lkey_rotate(%rip),%xmm5
	movdqa	.Lkey_rcon1(%rip),%xmm4
	mov	\$7,%r10d
	movdqu	%xmm0,0($key)		# round 0 = low half of user key
	movdqa	%xmm2,%xmm1
	movdqu	%xmm2,16($key)		# round 1 = high half of user key
	jmp	.Loop_key256

.align	16
.Loop_key256:
	pshufb	%xmm5,%xmm2		# RotWord
	aesenclast	%xmm4,%xmm2	# SubBytes + round constant

	# Prefix-xor of previous even round key's dwords.
	movdqa	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm0,%xmm3
	pslldq	\$4,%xmm0
	pxor	%xmm3,%xmm0
	pslld	\$1,%xmm4		# rcon <<= 1 for next iteration

	pxor	%xmm2,%xmm0
	movdqu	%xmm0,(%rax)		# store even round key

	dec	%r10d
	jz	.Ldone_key256

	# Odd round key: SubWord(last dword) with no rotate and no rcon.
	pshufd	\$0xff,%xmm0,%xmm2	# broadcast last dword
	pxor	%xmm3,%xmm3
	aesenclast	%xmm3,%xmm2	# aesenclast vs 0 == SubBytes only

	# Prefix-xor of previous odd round key's dwords.
	movdqa	%xmm1,%xmm3
	pslldq	\$4,%xmm1
	pxor	%xmm1,%xmm3
	pslldq	\$4,%xmm1
	pxor	%xmm1,%xmm3
	pslldq	\$4,%xmm1
	pxor	%xmm3,%xmm1

	pxor	%xmm1,%xmm2
	movdqu	%xmm2,16(%rax)		# store odd round key
	lea	32(%rax),%rax
	movdqa	%xmm2,%xmm1

	jmp	.Loop_key256

.Ldone_key256:
	mov	$bits,16(%rax)	# 240($key), i.e. key->rounds slot
	xor	%eax,%eax	# return 0 (success)
	jmp	.Lenc_key_ret

.align	16
.Lbad_keybits:
	mov	\$-2,%rax	# return -2: unsupported key length
.Lenc_key_ret:
	# Wipe registers that held key material before returning.
	pxor	%xmm0,%xmm0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	add	\$8,%rsp
.cfi_adjust_cfa_offset	-8
	ret
.cfi_endproc
.LSEH_end_set_encrypt_key:

# Key-expansion helpers.  %xmm1 holds aeskeygenassist output, %xmm4 is
# assumed to have a zero low dword on entry (set by the caller).
.align	16
.Lkey_expansion_128:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_128_cold:
	# shufps pair computes the running xor of the previous key's dwords.
	shufps	\$0b00010000,%xmm0,%xmm4
	xorps	%xmm4, %xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	xorps	%xmm4, %xmm0
	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm0
	ret

# NOTE(review): the .Lkey_expansion_192* helpers below appear unreferenced
# now that 192-bit key support was removed — confirm before deleting.
.align	16
.Lkey_expansion_192a:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_192a_cold:
	movaps	%xmm2, %xmm5
.Lkey_expansion_192b_warm:
	shufps	\$0b00010000,%xmm0,%xmm4
	movdqa	%xmm2,%xmm3
	xorps	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	pslldq	\$4,%xmm3
	xorps	%xmm4,%xmm0
	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
	pxor	%xmm3,%xmm2
	pxor	%xmm1,%xmm0
	pshufd	\$0b11111111,%xmm0,%xmm3
	pxor	%xmm3,%xmm2
	ret

.align	16
.Lkey_expansion_192b:
	movaps	%xmm0,%xmm3
	shufps	\$0b01000100,%xmm0,%xmm5
	$movkey	%xmm5,(%rax)
	shufps	\$0b01001110,%xmm2,%xmm3
	$movkey	%xmm3,16(%rax)
	lea	32(%rax),%rax
	jmp	.Lkey_expansion_192b_warm

.align	16
.Lkey_expansion_256a:
	$movkey	%xmm2,(%rax)
	lea	16(%rax),%rax
.Lkey_expansion_256a_cold:
	shufps	\$0b00010000,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	\$0b10001100,%xmm0,%xmm4
	xorps	%xmm4,%xmm0
	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm0
	ret

.align	16
.Lkey_expansion_256b:
	$movkey	%xmm0,(%rax)
	lea	16(%rax),%rax

	shufps	\$0b00010000,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	\$0b10001100,%xmm2,%xmm4
	xorps	%xmm4,%xmm2
	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
	xorps	%xmm1,%xmm2
	ret
.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
___
}

# Read-only constant tables used by the code above (byte-swap mask, counter
# increments for CTR mode, and the pshufb/rcon tables for the alt key paths).
$code.=<<___;
.section .rodata
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lincrement32:
	.long	6,6,6,0
.Lincrement64:
	.long	1,0,0,0
.Lincrement1:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Lkey_rotate:
	.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
.Lkey_rotate192:
	.long	0x04070605,0x04070605,0x04070605,0x04070605
.Lkey_rcon1:
	.long	1,1,1,1
.Lkey_rcon1b:
	.long	0x1b,0x1b,0x1b,0x1b

.asciz	"AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
.text
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
___
$code.=<<___ if ($PREFIX eq "aes_hw");
.type	ctr_xts_se_handler,\@abi-omnipotent
.align	16
ctr_xts_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	# Inside the function body: restore the non-volatile %xmm registers
	# from the stack frame and the saved %rbp.
	mov	208($context),%rax	# pull context->R11

	lea	-0xa8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbp		# restore saved %rbp
	mov	%rbp,160($context)	# restore context->Rbp


.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in qwords
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ctr_xts_se_handler,.-ctr_xts_se_handler

.section .pdata
.align	4
___
# SEH function tables: .pdata maps code ranges to unwind info, .xdata holds
# the unwind info itself (handler reference or raw unwind opcodes).
$code.=<<___ if ($PREFIX eq "aes_hw");
	.rva	.LSEH_begin_${PREFIX}_ctr32_encrypt_blocks
	.rva	.LSEH_end_${PREFIX}_ctr32_encrypt_blocks
	.rva	.LSEH_info_ctr32
___
$code.=<<___;
	.rva	${PREFIX}_set_encrypt_key
	.rva	.LSEH_end_set_encrypt_key
	.rva	.LSEH_info_key
.section .xdata
.align	8
___
$code.=<<___ if ($PREFIX eq "aes_hw");
.LSEH_info_ctr32:
	.byte	9,0,0,0			# flags: UNW_FLAG_EHANDLER
	.rva	ctr_xts_se_handler
	.rva	.Lctr32_body,.Lctr32_epilogue	# HandlerData[]
___
$code.=<<___;
.LSEH_info_key:
	.byte	0x01,0x04,0x01,0x00	# version 1, prologue size 4
	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
___
}

# Prepend a REX prefix byte to @opcode when either operand register is
# xmm8-xmm15 (0x44 = REX.R for $dst, 0x41... composed from 0x40|bits).
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04 if($dst>=8);		# REX.R: ModR/M reg field extension
    $rex|=0x01 if($src>=8);		# REX.B: ModR/M r/m field extension
    push @opcode,$rex|0x40 if($rex);	# emit prefix only when needed
}

# Translate an AES-NI mnemonic line into a raw ".byte" encoding so the
# module assembles with toolchains that predate AES-NI support.  Returns
# the ".byte" string, undef for unknown mnemonics, or the line unchanged
# if it doesn't match any handled form.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);			# mandatory operand-size prefix

    # aeskeygenassist $imm,%xmmN,%xmmM — 66 0F 3A DF /r ib
    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	rex(\@opcode,$4,$3);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($3&7)|(($4&7)<<3);	# ModR/M
	my $c=$2;
	push @opcode,$c=~/^0/?oct($c):$c;	# immediate (hex/octal or decimal)
	return ".byte\t".join(',',@opcode);
    }
    # aes{imc,enc,enclast,dec,declast} %xmmN,%xmmM — 66 0F 38 xx /r
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc, "aesenclast" => 0xdd,
		"aesdec" => 0xde, "aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	rex(\@opcode,$3,$2);
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    # aes* disp8(%rsp),%xmmN — memory-source form, %rsp base needs SIB 0x24
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesenc" => 0xdc, "aesenclast" => 0xdd,
		"aesdec" => 0xde, "aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	my $off = $2;
	push @opcode,0x44 if ($3>=8);		# REX.R for xmm8-15
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0x44|(($3&7)<<3),0x24;	# ModR/M + SIB (disp8 off %rsp)
	push @opcode,($off=~/^0/?oct($off):$off)&0xff;	# disp8
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}

# Raw encoding of "movbe %eax,disp8(%rsp)" for assemblers without MOVBE.
sub movbe {
	".byte	0x0f,0x38,0xf1,0x44,0x24,".shift;
}

# Post-process the generated text: evaluate `...` arithmetic, then rewrite
# AES-NI and MOVBE mnemonics via the byte-encoders above.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";