#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
# details].
#
# Performance.
#
# Given aes(enc|dec) instructions' latency, asymptotic performance for
# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
# processed with a 128-bit key. And given their throughput, asymptotic
# performance for parallelizable modes is 1.25 cycles per byte. Being
# an asymptotic limit, it's not something you commonly achieve in
# reality, but how close does one get? Below are results collected for
# different modes and block sizes. Pairs of numbers are for en-/
# decryption.
#
#		16-byte     64-byte     256-byte    1-KB        8-KB
# ECB		4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26   1.26/1.26
# CTR		5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
# CBC		4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
# CCM		5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
# OFB		5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
# CFB		5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
#
# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
# The results were collected with a specially crafted speed.c benchmark
# in order to compare them with results reported in "Intel Advanced
# Encryption Standard (AES) New Instruction Set" White Paper Revision
# 3.0 dated May 2010. All above results are consistently better. This
# module also provides better performance for block sizes smaller than
# 128 bytes at points *not* represented in the above table.
#
# Looking at the results for 8-KB buffer.
#
# CFB and OFB results are far from the limit, because the implementation
# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
# single-block aesni_encrypt, which is not the most optimal way to go.
# CBC encrypt result is unexpectedly high and there is no documented
# explanation for it. Seemingly there is a small penalty for feeding
# the result back to the AES unit the way it's done in CBC mode. There
# is nothing one can do and the result appears optimal. CCM result is
# identical to CBC, because CBC-MAC is essentially CBC encrypt without
# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over a
# "straightforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# Looking at how results vary with buffer size.
#
# Curves are practically saturated at 1-KB buffer size. In most cases
# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
# CTR curve doesn't follow this pattern and is the "slowest"-changing
# one, with the "256-byte" result being 87% of "8-KB." This is because
# overhead in CTR mode is most computationally intensive. Small-block
# CCM decrypt is slower than encrypt, because the first CTR and last
# CBC-MAC iterations can't be interleaved.
#
# Results for 192- and 256-bit keys.
#
# EVP-free results were observed to scale perfectly with number of
# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
# are a tad smaller, because the above mentioned penalty biases all
# results by the same constant value. In a similar way function call
# overhead affects small-block performance, as well as OFB and CFB
# results. Differences are not large, most common coefficients are
# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
# observes even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...

# January 2011
#
# While the Westmere processor features 6-cycle latency for aes[enc|dec]
# instructions, which can be scheduled every second cycle, Sandy
# Bridge spends 8 cycles per instruction, but it can schedule them
# every cycle. This means that code targeting Westmere would perform
# suboptimally on Sandy Bridge. Therefore this update.
#
# In addition, non-parallelizable CBC encrypt (as well as CCM) is
# optimized. Relative improvement might appear modest, 8% on Westmere,
# but in absolute terms it's 3.77 cycles per byte encrypted with a
# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
# should be compared to asymptotic limits of 3.75 for Westmere and
# 5.00 for Sandy Bridge. Actually, the fact that they get this close
# to asymptotic limits is quite amazing. Indeed, the limit is
# calculated as latency times number of rounds, 10 for 128-bit key,
# and divided by 16, the number of bytes in block, or in other words
# it accounts *solely* for aesenc instructions. But there are extra
# instructions, and numbers so close to the asymptotic limits mean
# that it's as if it takes as little as *one* additional cycle to
# execute all of them. How is it possible? It is possible thanks to
# out-of-order execution logic, which manages to overlap post-
# processing of previous block, things like saving the output, with
# actual encryption of current block, as well as pre-processing of
# current block, things like fetching input and xor-ing it with
# 0-round element of the key schedule, with actual encryption of
# previous block. Keep this in mind...
#
# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
# performance is achieved by interleaving instructions working on
# independent blocks. In which case the asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by the AES
# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that the optimal interleave factor is
# 3, and that's where the "magic" number of 1.25 comes from. "Optimal
# interleave factor" means that increasing the interleave factor
# further does not improve performance. The formula has proven to
# reflect reality pretty well on Westmere...
# Sandy Bridge on the other hand can
# execute up to 8 AES instructions at a time, so how does varying the
# interleave factor affect the performance? Here is a table for ECB
# (numbers are cycles per byte processed with 128-bit key):
#
# instruction interleave factor		3x	6x	8x
# theoretical asymptotic limit		1.67	0.83	0.625
# measured performance for 8KB block	1.05	0.86	0.84
#
# "as if" interleave factor		4.7x	5.8x	6.0x
#
# Further data for other parallelizable modes:
#
# CBC decrypt				1.16	0.93	0.74
# CTR					1.14	0.91	0.74
#
# Well, given the 3x column, it's probably inappropriate to call the
# limit asymptotic if it can be surpassed, isn't it? What happens there?
# Rewind to the CBC paragraph for the answer. Yes, out-of-order
# execution magic is responsible for this. The processor overlaps not
# only the additional instructions with AES ones, but even AES
# instructions processing adjacent triplets of independent blocks. In
# the 6x case additional instructions still claim a disproportionately
# small amount of additional cycles, but in the 8x case the number of
# instructions must be a tad too high for out-of-order logic to cope
# with, and the AES unit remains underutilized... As you can see 8x
# interleave is hardly justifiable, so there's no need to feel bad that
# 32-bit aesni-x86.pl utilizes 6x interleave because of limited
# register bank capacity.
#
# Higher interleave factors do have a negative impact on Westmere
# performance. While for ECB mode it's negligible at ~1.5%, other
# parallelizable modes perform ~5% worse, which is outweighed by ~25%
# improvement on Sandy Bridge. To balance the regression on Westmere,
# CTR mode was implemented with a 6x aesenc interleave factor.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
# one byte out of 8KB with a 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode, the AES instruction interleave factor was chosen to be 6x.

# November 2015
#
# Add aesni_ocb_[en|de]crypt. The AES instruction interleave factor was
# chosen to be 6x.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB	OCB
# Westmere	3.77/1.25	1.25	1.25	1.26
# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
# Skylake	2.62/0.63	0.63	0.63	0.63
# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
# Knights L	2.54/0.77	0.78	0.85	-	1.50
# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
# Ryzen		2.71/0.35	0.35	0.44	0.38	0.49
#
# (*) Atom Silvermont ECB result is suboptimal because of penalties
#     incurred by operations on %xmm8-15. As ECB is not considered
#     critical, nothing was done to mitigate the problem.
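#
# The arithmetic behind these asymptotic limits can be reproduced with a
# few lines of Perl. A minimal sketch follows, assuming the latencies and
# issue rates quoted in the commentary above (the values are taken from
# that discussion, not measured here), kept in comments so that the
# script itself is unaffected:
#
#	my %cpu = (
#	    "Westmere"     => { latency => 6, issue => 2 },  # one aes[enc|dec] every 2nd cycle
#	    "Sandy Bridge" => { latency => 8, issue => 1 },  # one every cycle
#	);
#	my $rounds = 10;                                     # AES-128
#	for my $name (sort keys %cpu) {
#	    my ($lat, $iss) = @{$cpu{$name}}{qw(latency issue)};
#	    printf "%-12s serial %.2f cpb, interleaved %.2f cpb, 3x %.2f cpb\n",
#	           $name,
#	           $lat * $rounds / 16,       # latency-bound: CBC encrypt, CCM
#	           $iss * $rounds / 16,       # throughput-bound: ECB, CTR, CBC decrypt
#	           $lat * $rounds / 16 / 3;   # latency limit divided by 3x interleave
#	}
#
# This prints 3.75/1.25 for Westmere and 5.00/0.625 (1.67 for the 3x
# case) for Sandy Bridge, matching the figures in the tables above.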

$PREFIX="aes_hw";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$movkey = $PREFIX eq "aes_hw" ? "movups" : "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";
$code.=".extern	OPENSSL_ia32cap_P\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
$in0="%xmm8";		$iv="%xmm9";

# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
{ my $sn;
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
++$sn;
$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
$code.=<<___ if (defined($ivec));
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
$code.=<<___ if (!defined($ivec));
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
$code.=<<___;
.Loop_${p}1_$sn:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
{ my ($inp,$out,$key) = @_4args;

$code.=<<___;
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@abi-omnipotent
.align	16
${PREFIX}_encrypt:
.cfi_startproc
	_CET_ENDBR
#ifdef BORINGSSL_DISPATCH_TEST
.extern	BORINGSSL_function_hit
	movb \$1,BORINGSSL_function_hit+1(%rip)
#endif
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	pxor	$inout0,$inout0
	ret
.cfi_endproc
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@abi-omnipotent
.align	16
${PREFIX}_decrypt:
.cfi_startproc
	_CET_ENDBR
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	pxor	$inout0,$inout0
	ret
.cfi_endproc
.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. the subroutine's throughput is virtually the same
# as that of the non-interleaved subroutine [for number of input blocks
# up to 3]. This is why it originally made no sense to implement a 2x
# subroutine. But times change and it became appropriate to spend an
# extra 192 bytes on a 2x subroutine on Atom Silvermont's account. For
# processors that can schedule aes[enc|dec] every cycle, the optimal
# interleave factor equals the corresponding instruction's latency.
# 8x is optimal for * Bridge and "super-optimal" for other Intel CPUs...

sub aesni_generate2 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-1] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt2,\@abi-omnipotent
.align	16
_aesni_${dir}rypt2:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	add	\$16,%rax

.L${dir}_loop2:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop2

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	ret
.cfi_endproc
.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	add	\$16,%rax

.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop3

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.cfi_endproc
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4-block performance, by ~30%. One can
# argue that one should have implemented 5x as well, but improvement
# would be <20%, so it's not worth it...
sub aesni_generate4 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt4,\@abi-omnipotent
.align	16
_aesni_${dir}rypt4:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	.byte	0x0f,0x1f,0x00
	add	\$16,%rax

.L${dir}_loop4:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop4

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	ret
.cfi_endproc
.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
sub aesni_generate6 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt6,\@abi-omnipotent
.align	16
_aesni_${dir}rypt6:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	pxor	$rndkey0,$inout1
	pxor	$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout0
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	aes${dir}	$rndkey1,$inout1
	pxor	$rndkey0,$inout3
	pxor	$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout2
	pxor	$rndkey0,$inout5
	$movkey	($key,%rax),$rndkey0
	add	\$16,%rax
	jmp	.L${dir}_loop6_enter
.align	16
.L${dir}_loop6:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
.L${dir}_loop6_enter:
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop6

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	ret
.cfi_endproc
.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
sub aesni_generate8 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt8,\@abi-omnipotent
.align	16
_aesni_${dir}rypt8:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	pxor	$rndkey0,$inout2
	pxor	$rndkey0,$inout3
	pxor	$rndkey0,$inout4
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	aes${dir}	$rndkey1,$inout0
	pxor	$rndkey0,$inout5
	pxor	$rndkey0,$inout6
	aes${dir}	$rndkey1,$inout1
	pxor	$rndkey0,$inout7
	$movkey	($key,%rax),$rndkey0
	add	\$16,%rax
	jmp	.L${dir}_loop8_inner
.align	16
.L${dir}_loop8:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
.L${dir}_loop8_inner:
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
.L${dir}_loop8_enter:
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	aes${dir}	$rndkey0,$inout6
	aes${dir}	$rndkey0,$inout7
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop8

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	aes${dir}last	$rndkey0,$inout6
	aes${dir}last	$rndkey0,$inout7
	ret
.cfi_endproc
.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
&aesni_generate2("enc") if ($PREFIX eq "aes_hw");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aes_hw");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aes_hw");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aes_hw");
&aesni_generate6("dec");
&aesni_generate8("enc") if ($PREFIX eq "aes_hw");
&aesni_generate8("dec");

if ($PREFIX eq "aes_hw") {
########################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#			  size_t length, const AES_KEY *key,
#			  int enc);
$code.=<<___;
.globl	${PREFIX}_ecb_encrypt
.type	${PREFIX}_ecb_encrypt,\@function,5
.align	16
${PREFIX}_ecb_encrypt:
.cfi_startproc
	_CET_ENDBR
___
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)		# offload $inout4..7
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	and	\$-16,$len		# if ($len<16)
	jz	.Lecb_ret		# return

	mov	240($key),$rounds	# key->rounds
	$movkey	($key),$rndkey0
	mov	$key,$key_		# backup $key
	mov	$rounds,$rnds_		# backup $rounds
	test	%r8d,%r8d		# 5th argument
	jz	.Lecb_decrypt
#--------------------------- ECB ENCRYPT ------------------------------#
	cmp	\$0x80,$len		# if ($len<8*16)
	jb	.Lecb_enc_tail		# short input

	movdqu	($inp),$inout0		# load 8 input blocks
	movdqu	0x10($inp),$inout1
	movdqu	0x20($inp),$inout2
	movdqu	0x30($inp),$inout3
	movdqu	0x40($inp),$inout4
	movdqu	0x50($inp),$inout5
	movdqu	0x60($inp),$inout6
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp		# $inp+=8*16
	sub	\$0x80,$len		# $len-=8*16 (can be zero)
	jmp	.Lecb_enc_loop8_enter
.align 16
.Lecb_enc_loop8:
	movups	$inout0,($out)		# store 8 output blocks
	mov	$key_,$key		# restore $key
	movdqu	($inp),$inout0		# load 8 input blocks
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout1,0x10($out)
	movdqu	0x10($inp),$inout1
	movups	$inout2,0x20($out)
	movdqu	0x20($inp),$inout2
	movups	$inout3,0x30($out)
	movdqu	0x30($inp),$inout3
	movups	$inout4,0x40($out)
	movdqu	0x40($inp),$inout4
	movups	$inout5,0x50($out)
	movdqu	0x50($inp),$inout5
	movups	$inout6,0x60($out)
	movdqu	0x60($inp),$inout6
	movups	$inout7,0x70($out)
	lea	0x80($out),$out		# $out+=8*16
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp		# $inp+=8*16
.Lecb_enc_loop8_enter:

	call	_aesni_encrypt8

	sub	\$0x80,$len
	jnc	.Lecb_enc_loop8		# loop if $len-=8*16 didn't borrow

	movups	$inout0,($out)		# store 8 output blocks
	mov	$key_,$key		# restore $key
	movups	$inout1,0x10($out)
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	movups	$inout6,0x60($out)
	movups	$inout7,0x70($out)
	lea	0x80($out),$out		# $out+=8*16
	add	\$0x80,$len		# restore real remaining $len
	jz	.Lecb_ret		# done if ($len==0)

.Lecb_enc_tail:				# $len is less than 8*16
	movups	($inp),$inout0
	cmp	\$0x20,$len
	jb	.Lecb_enc_one
	movups	0x10($inp),$inout1
	je	.Lecb_enc_two
	movups	0x20($inp),$inout2
	cmp	\$0x40,$len
	jb	.Lecb_enc_three
	movups	0x30($inp),$inout3
	je	.Lecb_enc_four
	movups	0x40($inp),$inout4
	cmp	\$0x60,$len
	jb	.Lecb_enc_five
	movups	0x50($inp),$inout5
	je	.Lecb_enc_six
	movdqu	0x60($inp),$inout6
	xorps	$inout7,$inout7
	call	_aesni_encrypt8
	movups	$inout0,($out)		# store 7 output blocks
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	movups	$inout6,0x60($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_one:
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)		# store one output block
	jmp	.Lecb_ret
.align	16
.Lecb_enc_two:
	call	_aesni_encrypt2
	movups	$inout0,($out)		# store 2 output blocks
	movups	$inout1,0x10($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_three:
	call	_aesni_encrypt3
	movups	$inout0,($out)		# store 3 output blocks
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_four:
	call	_aesni_encrypt4
	movups	$inout0,($out)		# store 4 output blocks
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_five:
	xorps	$inout5,$inout5
	call	_aesni_encrypt6
	movups	$inout0,($out)		# store 5 output blocks
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_six:
	call	_aesni_encrypt6
	movups	$inout0,($out)		# store 6 output blocks
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	jmp	.Lecb_ret
#--------------------------- ECB DECRYPT ------------------------------#
.align	16
.Lecb_decrypt:
	cmp	\$0x80,$len		# if ($len<8*16)
	jb	.Lecb_dec_tail		# short input

	movdqu	($inp),$inout0		# load 8 input blocks
	movdqu	0x10($inp),$inout1
	movdqu	0x20($inp),$inout2
	movdqu	0x30($inp),$inout3
	movdqu	0x40($inp),$inout4
	movdqu	0x50($inp),$inout5
	movdqu	0x60($inp),$inout6
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp		# $inp+=8*16
	sub	\$0x80,$len		# $len-=8*16 (can be zero)
	jmp	.Lecb_dec_loop8_enter
.align	16
.Lecb_dec_loop8:
	movups	$inout0,($out)		# store 8 output blocks
	mov	$key_,$key		# restore $key
	movdqu	($inp),$inout0		# load 8 input blocks
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout1,0x10($out)
	movdqu	0x10($inp),$inout1
	movups	$inout2,0x20($out)
	movdqu	0x20($inp),$inout2
	movups	$inout3,0x30($out)
	movdqu	0x30($inp),$inout3
	movups	$inout4,0x40($out)
	movdqu	0x40($inp),$inout4
	movups	$inout5,0x50($out)
	movdqu	0x50($inp),$inout5
	movups	$inout6,0x60($out)
	movdqu	0x60($inp),$inout6
	movups	$inout7,0x70($out)
	lea	0x80($out),$out		# $out+=8*16
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp		# $inp+=8*16
.Lecb_dec_loop8_enter:

	call	_aesni_decrypt8

	$movkey	($key_),$rndkey0
	sub	\$0x80,$len
	jnc	.Lecb_dec_loop8		# loop if $len-=8*16 didn't borrow

	movups	$inout0,($out)		# store 8 output blocks
	pxor	$inout0,$inout0		# clear register bank
	mov	$key_,$key		# restore $key
	movups	$inout1,0x10($out)
	pxor	$inout1,$inout1
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout2,0x20($out)
	pxor	$inout2,$inout2
	movups	$inout3,0x30($out)
	pxor	$inout3,$inout3
	movups	$inout4,0x40($out)
	pxor	$inout4,$inout4
	movups	$inout5,0x50($out)
	pxor	$inout5,$inout5
	movups	$inout6,0x60($out)
	pxor	$inout6,$inout6
	movups	$inout7,0x70($out)
	pxor	$inout7,$inout7
	lea	0x80($out),$out		# $out+=8*16
	add	\$0x80,$len		# restore real remaining $len
	jz	.Lecb_ret		# done if ($len==0)

.Lecb_dec_tail:
	movups	($inp),$inout0
	cmp	\$0x20,$len
	jb	.Lecb_dec_one
	movups	0x10($inp),$inout1
	je	.Lecb_dec_two
	movups	0x20($inp),$inout2
	cmp	\$0x40,$len
	jb	.Lecb_dec_three
	movups	0x30($inp),$inout3
	je	.Lecb_dec_four
	movups	0x40($inp),$inout4
	cmp	\$0x60,$len
	jb	.Lecb_dec_five
	movups	0x50($inp),$inout5
	je	.Lecb_dec_six
	movups	0x60($inp),$inout6
	$movkey	($key),$rndkey0
	xorps	$inout7,$inout7
	call	_aesni_decrypt8
	movups	$inout0,($out)		# store 7 output blocks
	pxor	$inout0,$inout0		# clear register bank
	movups	$inout1,0x10($out)
	pxor	$inout1,$inout1
	movups	$inout2,0x20($out)
	pxor	$inout2,$inout2
	movups	$inout3,0x30($out)
	pxor	$inout3,$inout3
	movups	$inout4,0x40($out)
	pxor	$inout4,$inout4
	movups	$inout5,0x50($out)
	pxor	$inout5,$inout5
	movups	$inout6,0x60($out)
	pxor	$inout6,$inout6
	pxor	$inout7,$inout7
	jmp	.Lecb_ret
.align	16
.Lecb_dec_one:
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)		# store one output block
	pxor	$inout0,$inout0		# clear register bank
	jmp	.Lecb_ret
.align	16
.Lecb_dec_two:
	call	_aesni_decrypt2
	movups	$inout0,($out)		# store 2 output blocks
	pxor	$inout0,$inout0		# clear register bank
	movups	$inout1,0x10($out)
	pxor	$inout1,$inout1
	jmp	.Lecb_ret
.align	16
.Lecb_dec_three:
	call	_aesni_decrypt3
	movups	$inout0,($out)		# store 3 output blocks
	pxor	$inout0,$inout0		# clear register bank
	movups	$inout1,0x10($out)
	pxor	$inout1,$inout1
	movups	$inout2,0x20($out)
	pxor	$inout2,$inout2
	jmp	.Lecb_ret
.align	16
.Lecb_dec_four:
	call	_aesni_decrypt4
	movups	$inout0,($out)		# store 4 output blocks
	pxor	$inout0,$inout0		# clear register bank
	movups	$inout1,0x10($out)
	pxor	$inout1,$inout1
	movups	$inout2,0x20($out)
	pxor	$inout2,$inout2
	movups	$inout3,0x30($out)
	pxor	$inout3,$inout3
	jmp	.Lecb_ret
.align	16
.Lecb_dec_five:
	xorps	$inout5,$inout5
	call	_aesni_decrypt6
	movups	$inout0,($out)		# store 5 output blocks
	pxor	$inout0,$inout0		# clear register bank
	movups	$inout1,0x10($out)
	pxor	$inout1,$inout1
	movups	$inout2,0x20($out)
	pxor	$inout2,$inout2
	movups	$inout3,0x30($out)
	pxor	$inout3,$inout3
	movups	$inout4,0x40($out)
	pxor	$inout4,$inout4
	pxor	$inout5,$inout5
	jmp	.Lecb_ret
.align	16
.Lecb_dec_six:
	call	_aesni_decrypt6
	movups	$inout0,($out)		# store 6 output blocks
	pxor	$inout0,$inout0		# clear register bank
	movups	$inout1,0x10($out)
	pxor	$inout1,$inout1
	movups	$inout2,0x20($out)
	pxor	$inout2,$inout2
	movups	$inout3,0x30($out)
	pxor	$inout3,$inout3
	movups	$inout4,0x40($out)
	pxor	$inout4,$inout4
	movups	$inout5,0x50($out)
	pxor	$inout5,$inout5

.Lecb_ret:
	xorps	$rndkey0,$rndkey0	# %xmm0
	pxor	$rndkey1,$rndkey1
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)		# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lecb_enc_ret:
___
$code.=<<___;
	ret
.cfi_endproc
.size	${PREFIX}_ecb_encrypt,.-${PREFIX}_ecb_encrypt
___

{
######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
if (0) {	# Omit these functions in BoringSSL
my $cmac="%r9";			# 6th argument

my $increment="%xmm9";
my $iv="%xmm6";
my $bswap_mask="%xmm7";

$code.=<<___;
.globl	${PREFIX}_ccm64_encrypt_blocks
.type	${PREFIX}_ccm64_encrypt_blocks,\@function,6
.align	16
${PREFIX}_ccm64_encrypt_blocks:
___
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)		# $iv
	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
	movaps	%xmm8,0x20(%rsp)	# $in0
	movaps	%xmm9,0x30(%rsp)	# $increment
.Lccm64_enc_body:
___
$code.=<<___;
	mov	240($key),$rounds	# key->rounds
	movdqu	($ivp),$iv
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	shl	\$4,$rounds
	mov	\$16,$rnds_
	lea	0($key),$key_
	movdqu	($cmac),$inout1
	movdqa	$iv,$inout0
	lea	32($key,$rounds),$key	# end of key schedule
	pshufb	$bswap_mask,$iv
	sub	%rax,%r10		# twisted $rounds
	jmp	.Lccm64_enc_outer
.align	16
.Lccm64_enc_outer:
	$movkey	($key_),$rndkey0
	mov	%r10,%rax
	movups	($inp),$in0		# load inp

	xorps	$rndkey0,$inout0	# counter
	$movkey	16($key_),$rndkey1
	xorps	$in0,$rndkey0
	xorps	$rndkey0,$inout1	# cmac^=inp
	$movkey	32($key_),$rndkey0

.Lccm64_enc2_loop:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Lccm64_enc2_loop
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	paddq	$increment,$iv
	dec	$len			# $len-- ($len is in blocks)
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1

	lea	16($inp),$inp
	xorps	$inout0,$in0		# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)		# save output
	pshufb	$bswap_mask,$inout0
	lea	16($out),$out		# $out+=16
	jnz	.Lccm64_enc_outer	# loop if ($len!=0)

	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	pxor	$inout0,$inout0
	movups	$inout1,($cmac)		# store resulting mac
	pxor	$inout1,$inout1
	pxor	$in0,$in0
	pxor	$iv,$iv
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)		# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lccm64_enc_ret:
___
$code.=<<___;
	ret
.size	${PREFIX}_ccm64_encrypt_blocks,.-${PREFIX}_ccm64_encrypt_blocks
___
######################################################################
$code.=<<___;
.globl	${PREFIX}_ccm64_decrypt_blocks
.type	${PREFIX}_ccm64_decrypt_blocks,\@function,6
.align	16
${PREFIX}_ccm64_decrypt_blocks:
___
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)		# $iv
	movaps	%xmm7,0x10(%rsp)	# $bswap_mask
	movaps	%xmm8,0x20(%rsp)	# $in8
	movaps	%xmm9,0x30(%rsp)	# $increment
.Lccm64_dec_body:
___
$code.=<<___;
	mov	240($key),$rounds	# key->rounds
	movups	($ivp),$iv
	movdqu	($cmac),$inout1
	movdqa	.Lincrement64(%rip),$increment
	movdqa	.Lbswap_mask(%rip),$bswap_mask

	movaps	$iv,$inout0
	mov	$rounds,$rnds_
	mov	$key,$key_
	pshufb	$bswap_mask,$iv
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	shl	\$4,$rnds_
	mov	\$16,$rounds
	movups	($inp),$in0		# load inp
	paddq	$increment,$iv
	lea	16($inp),$inp		# $inp+=16
	sub	%r10,%rax		# twisted $rounds
	lea	32($key_,$rnds_),$key	# end of key schedule
	mov	%rax,%r10
	jmp	.Lccm64_dec_outer
.align	16
.Lccm64_dec_outer:
	xorps	$inout0,$in0		# inp ^= E(iv)
	movdqa	$iv,$inout0
	movups	$in0,($out)		# save output
	lea	16($out),$out		# $out+=16
	pshufb	$bswap_mask,$inout0

	sub	\$1,$len		# $len-- ($len is in blocks)
	jz	.Lccm64_dec_break	# if ($len==0) break

	$movkey	($key_),$rndkey0
	mov	%r10,%rax
	$movkey	16($key_),$rndkey1
	xorps	$rndkey0,$in0
	xorps	$rndkey0,$inout0
	xorps	$in0,$inout1		# cmac^=out
	$movkey	32($key_),$rndkey0
	jmp	.Lccm64_dec2_loop
.align	16
.Lccm64_dec2_loop:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.Lccm64_dec2_loop
	movups	($inp),$in0		# load input
	paddq	$increment,$iv
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenclast	$rndkey0,$inout0
	aesenclast	$rndkey0,$inout1
	lea	16($inp),$inp		# $inp+=16
	jmp	.Lccm64_dec_outer

.align	16
.Lccm64_dec_break:
	#xorps	$in0,$inout1		# cmac^=out
	mov	240($key_),$rounds
___
	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	pxor	$inout0,$inout0
	movups	$inout1,($cmac)		# store resulting mac
	pxor	$inout1,$inout1
	pxor	$in0,$in0
	pxor	$iv,$iv
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)		# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lccm64_dec_ret:
___
$code.=<<___;
	ret
.size	${PREFIX}_ccm64_decrypt_blocks,.-${PREFIX}_ccm64_decrypt_blocks
___
}
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
# Keywords are full unroll and modulo-schedule counter calculations
# with zero-round key xor.
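#
# As a reference model for the "zero-round key xor" trick: the code below
# keeps eight counter blocks on the stack, each already xor-ed with
# round[0], so per-block work inside the loop can start directly with the
# first aesenc. A minimal byte-level sketch in Perl, kept in comments;
# the helper name and its arguments are illustrative only, not part of
# this module:
#
#	sub ctr32_counter_blocks {
#	    my ($ivec, $rndkey0, $n) = @_;	# 16-byte IV||counter, 16-byte round-0 key
#	    my $ctr = unpack("N", substr($ivec, 12, 4));	# 32-bit big-endian counter
#	    my @blk;
#	    for my $i (0 .. $n-1) {
#	        my $b = substr($ivec, 0, 12) . pack("N", ($ctr + $i) & 0xffffffff);
#	        push @blk, $b ^ $rndkey0;	# pre-xor with round[0], as stored at 0x00..0x70(%rsp)
#	    }
#	    return @blk;
#	}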
{
my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
my ($key0,$ctr)=("%ebp","${ivp}d");
my $frame_size = 0x80 + ($win64?160:0);

$code.=<<___;
.globl	${PREFIX}_ctr32_encrypt_blocks
.type	${PREFIX}_ctr32_encrypt_blocks,\@function,5
.align	16
${PREFIX}_ctr32_encrypt_blocks:
.cfi_startproc
	_CET_ENDBR
#ifdef BORINGSSL_DISPATCH_TEST
	movb \$1,BORINGSSL_function_hit(%rip)
#endif
	cmp	\$1,$len
	jne	.Lctr32_bulk

	# handle single block without allocating stack frame,
	# useful when handling edges
	movups	($ivp),$inout0
	movups	($inp),$inout1
	mov	240($key),%edx		# key->rounds
___
	&aesni_generate1("enc",$key,"%edx");
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	xorps	$inout1,$inout0
	pxor	$inout1,$inout1
	movups	$inout0,($out)
	xorps	$inout0,$inout0
	jmp	.Lctr32_epilogue

.align	16
.Lctr32_bulk:
	lea	(%rsp),$key_		# use $key_ as frame pointer
.cfi_def_cfa_register	$key_
	push	%rbp
.cfi_push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8($key_)	# offload everything
	movaps	%xmm7,-0x98($key_)
	movaps	%xmm8,-0x88($key_)
	movaps	%xmm9,-0x78($key_)
	movaps	%xmm10,-0x68($key_)
	movaps	%xmm11,-0x58($key_)
	movaps	%xmm12,-0x48($key_)
	movaps	%xmm13,-0x38($key_)
	movaps	%xmm14,-0x28($key_)
	movaps	%xmm15,-0x18($key_)
.Lctr32_body:
___
$code.=<<___;

	# 8 16-byte words on top of stack are counter values
	# xor-ed with zero-round key

	movdqu	($ivp),$inout0
	movdqu	($key),$rndkey0
	mov	12($ivp),$ctr		# counter LSB
	pxor	$rndkey0,$inout0
	mov	12($key),$key0		# 0-round key LSB
	movdqa	$inout0,0x00(%rsp)	# populate counter block
	bswap	$ctr
	movdqa	$inout0,$inout1
	movdqa	$inout0,$inout2
	movdqa	$inout0,$inout3
	movdqa	$inout0,0x40(%rsp)
	movdqa	$inout0,0x50(%rsp)
	movdqa	$inout0,0x60(%rsp)
	mov	%rdx,%r10		# about to borrow %rdx
	movdqa	$inout0,0x70(%rsp)

	lea	1($ctr),%rax
	lea	2($ctr),%rdx
	bswap	%eax
	bswap	%edx
	xor	$key0,%eax
	xor	$key0,%edx
	pinsrd	\$3,%eax,$inout1
	lea	3($ctr),%rax
	movdqa	$inout1,0x10(%rsp)
	pinsrd	\$3,%edx,$inout2
	bswap	%eax
	mov	%r10,%rdx		# restore %rdx
	lea	4($ctr),%r10
	movdqa	$inout2,0x20(%rsp)
	xor	$key0,%eax
	bswap	%r10d
	pinsrd	\$3,%eax,$inout3
	xor	$key0,%r10d
	movdqa	$inout3,0x30(%rsp)
	lea	5($ctr),%r9
	mov	%r10d,0x40+12(%rsp)
	bswap	%r9d
	lea	6($ctr),%r10
	mov	240($key),$rounds	# key->rounds
	xor	$key0,%r9d
	bswap	%r10d
	mov	%r9d,0x50+12(%rsp)
	xor	$key0,%r10d
	lea	7($ctr),%r9
	mov	%r10d,0x60+12(%rsp)
	bswap	%r9d
	xor	$key0,%r9d
	mov	%r9d,0x70+12(%rsp)

	$movkey	0x10($key),$rndkey1

	movdqa	0x40(%rsp),$inout4
	movdqa	0x50(%rsp),$inout5

	cmp	\$8,$len		# $len is in blocks
	jb	.Lctr32_tail		# short input if ($len<8)

	lea	0x80($key),$key		# size optimization
	sub	\$8,$len		# $len is biased by -8
	jmp	.Lctr32_loop8

.align	32
.Lctr32_loop8:
	add	\$8,$ctr		# next counter value
	movdqa	0x60(%rsp),$inout6
	aesenc	$rndkey1,$inout0
	mov	$ctr,%r9d
	movdqa	0x70(%rsp),$inout7
	aesenc	$rndkey1,$inout1
	bswap	%r9d
	$movkey	0x20-0x80($key),$rndkey0
	aesenc	$rndkey1,$inout2
	xor	$key0,%r9d
	nop
	aesenc	$rndkey1,$inout3
	mov	%r9d,0x00+12(%rsp)	# store next counter value
	lea	1($ctr),%r9
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0x30-0x80($key),$rndkey1
___
for($i=2;$i<8;$i++) {
my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
$code.=<<___;
	bswap	%r9d
	aesenc	$rndkeyx,$inout0
	aesenc	$rndkeyx,$inout1
	xor	$key0,%r9d
	.byte	0x66,0x90
	aesenc	$rndkeyx,$inout2
	aesenc	$rndkeyx,$inout3
	mov	%r9d,`0x10*($i-1)`+12(%rsp)
	lea	$i($ctr),%r9
	aesenc	$rndkeyx,$inout4
	aesenc	$rndkeyx,$inout5
	aesenc	$rndkeyx,$inout6
	aesenc	$rndkeyx,$inout7
	$movkey	`0x20+0x10*$i`-0x80($key),$rndkeyx
___
}
$code.=<<___;
	bswap	%r9d
	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	xor	$key0,%r9d
	movdqu	0x00($inp),$in0		# start loading input
	aesenc	$rndkey0,$inout3
	mov	%r9d,0x70+12(%rsp)
	cmp	\$11,$rounds
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xa0-0x80($key),$rndkey0

	jb	.Lctr32_enc_done

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0xb0-0x80($key),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xc0-0x80($key),$rndkey0
	je	.Lctr32_enc_done

	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	$movkey	0xd0-0x80($key),$rndkey1

	aesenc	$rndkey0,$inout0
	aesenc	$rndkey0,$inout1
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	aesenc	$rndkey0,$inout6
	aesenc	$rndkey0,$inout7
	$movkey	0xe0-0x80($key),$rndkey0
	jmp	.Lctr32_enc_done

.align	16
.Lctr32_enc_done:
	movdqu	0x10($inp),$in1
	pxor	$rndkey0,$in0		# input^=round[last]
	movdqu	0x20($inp),$in2
	pxor	$rndkey0,$in1
	movdqu	0x30($inp),$in3
	pxor	$rndkey0,$in2
	movdqu	0x40($inp),$in4
	pxor	$rndkey0,$in3
	movdqu	0x50($inp),$in5
	pxor	$rndkey0,$in4
	prefetcht0	0x1c0($inp)	# We process 128 bytes (8*16), so to prefetch 1 iteration ahead
	prefetcht0	0x200($inp)	# we need to prefetch 2 64-byte cache lines
	pxor	$rndkey0,$in5
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6
	aesenc	$rndkey1,$inout7
	movdqu	0x60($inp),$rndkey1	# borrow $rndkey1 for inp[6]
	lea	0x80($inp),$inp		# $inp+=8*16

	aesenclast	$in0,$inout0	# $inN is inp[N]^round[last]
	pxor	$rndkey0,$rndkey1	# borrowed $rndkey
	movdqu	0x70-0x80($inp),$in0
	aesenclast	$in1,$inout1
	pxor	$rndkey0,$in0
	movdqa	0x00(%rsp),$in1		# load next counter block
	aesenclast	$in2,$inout2
	aesenclast	$in3,$inout3
	movdqa	0x10(%rsp),$in2
	movdqa	0x20(%rsp),$in3
	aesenclast	$in4,$inout4
	aesenclast	$in5,$inout5
	movdqa	0x30(%rsp),$in4
	movdqa	0x40(%rsp),$in5
	aesenclast	$rndkey1,$inout6
	movdqa	0x50(%rsp),$rndkey0
	$movkey	0x10-0x80($key),$rndkey1	# real 1st-round key
	aesenclast	$in0,$inout7

	movups	$inout0,($out)		# store 8 output blocks
	movdqa	$in1,$inout0
	movups	$inout1,0x10($out)
	movdqa	$in2,$inout1
	movups	$inout2,0x20($out)
	movdqa	$in3,$inout2
	movups	$inout3,0x30($out)
	movdqa	$in4,$inout3
	movups	$inout4,0x40($out)
	movdqa	$in5,$inout4
	movups	$inout5,0x50($out)
	movdqa	$rndkey0,$inout5
	movups	$inout6,0x60($out)
	movups	$inout7,0x70($out)
	lea	0x80($out),$out		# $out+=8*16

	sub	\$8,$len
	jnc	.Lctr32_loop8		# loop if $len-=8 didn't borrow

	add	\$8,$len		# restore real remaining $len
	jz	.Lctr32_done		# done if ($len==0)
	lea	-0x80($key),$key

.Lctr32_tail:
	# note that at this point $inout0..5 are populated with
	# counter values xor-ed with 0-round key
	lea	16($key),$key
	cmp	\$4,$len
	jb	.Lctr32_loop3
	je	.Lctr32_loop4

	# if ($len>4) compute 7 E(counter)
	shl	\$4,$rounds
	movdqa	0x60(%rsp),$inout6
	pxor	$inout7,$inout7

	$movkey	16($key),$rndkey0
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	lea	32-16($key,$rounds),$key	# prepare for .Lenc_loop8_enter
	neg	%rax
	aesenc	$rndkey1,$inout2
	add	\$16,%rax		# prepare for .Lenc_loop8_enter
	movups	($inp),$in0
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
	movups	0x10($inp),$in1		# pre-load input
	movups	0x20($inp),$in2
	aesenc	$rndkey1,$inout5
	aesenc	$rndkey1,$inout6

	call	.Lenc_loop8_enter

	movdqu	0x30($inp),$in3
	pxor	$in0,$inout0
	movdqu	0x40($inp),$in0
	pxor	$in1,$inout1
	movdqu	$inout0,($out)		# store output
	pxor	$in2,$inout2
	movdqu	$inout1,0x10($out)
	pxor	$in3,$inout3
	movdqu	$inout2,0x20($out)
	pxor	$in0,$inout4
	movdqu	$inout3,0x30($out)
	movdqu	$inout4,0x40($out)
	cmp	\$6,$len
	jb	.Lctr32_done		# $len was 5, stop store

	movups	0x50($inp),$in1
	xorps	$in1,$inout5
	movups	$inout5,0x50($out)
	je	.Lctr32_done		# $len was 6, stop store

	movups	0x60($inp),$in2
	xorps	$in2,$inout6
	movups	$inout6,0x60($out)
	jmp	.Lctr32_done		# $len was 7, stop store

.align	32
.Lctr32_loop4:
	aesenc	$rndkey1,$inout0
	lea	16($key),$key
	dec	$rounds
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	$movkey	($key),$rndkey1
	jnz	.Lctr32_loop4
	aesenclast	$rndkey1,$inout0
	aesenclast	$rndkey1,$inout1
	movups	($inp),$in0		# load input
	movups	0x10($inp),$in1
	aesenclast	$rndkey1,$inout2
	aesenclast	$rndkey1,$inout3
	movups	0x20($inp),$in2
	movups	0x30($inp),$in3

	xorps	$in0,$inout0
	movups	$inout0,($out)		# store output
	xorps	$in1,$inout1
	movups	$inout1,0x10($out)
	pxor	$in2,$inout2
	movdqu	$inout2,0x20($out)
	pxor	$in3,$inout3
	movdqu	$inout3,0x30($out)
	jmp	.Lctr32_done		# $len was 4, stop store

.align	32
.Lctr32_loop3:
	aesenc	$rndkey1,$inout0
	lea	16($key),$key
	dec	$rounds
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	$movkey	($key),$rndkey1
	jnz	.Lctr32_loop3
	aesenclast	$rndkey1,$inout0
	aesenclast	$rndkey1,$inout1
	aesenclast	$rndkey1,$inout2

	movups	($inp),$in0		# load input
	xorps	$in0,$inout0
	movups	$inout0,($out)		# store output
	cmp	\$2,$len
	jb	.Lctr32_done		# $len was 1, stop store

	movups	0x10($inp),$in1
	xorps	$in1,$inout1
	movups	$inout1,0x10($out)
	je	.Lctr32_done		# $len was 2, stop store

	movups	0x20($inp),$in2
	xorps	$in2,$inout2
	movups	$inout2,0x20($out)	# $len was 3, stop store

.Lctr32_done:
	xorps	%xmm0,%xmm0		# clear register bank
	xor	$key0,$key0
	pxor	%xmm1,%xmm1
	pxor	%xmm2,%xmm2
	pxor	%xmm3,%xmm3
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
$code.=<<___ if (!$win64);
	pxor	%xmm6,%xmm6
	pxor	%xmm7,%xmm7
	movaps	%xmm0,0x00(%rsp)	# clear stack
	pxor	%xmm8,%xmm8
	movaps	%xmm0,0x10(%rsp)
	pxor	%xmm9,%xmm9
	movaps	%xmm0,0x20(%rsp)
	pxor	%xmm10,%xmm10
	movaps	%xmm0,0x30(%rsp)
	pxor	%xmm11,%xmm11
	movaps	%xmm0,0x40(%rsp)
	pxor	%xmm12,%xmm12
	movaps	%xmm0,0x50(%rsp)
	pxor	%xmm13,%xmm13
	movaps	%xmm0,0x60(%rsp)
	pxor	%xmm14,%xmm14
	movaps	%xmm0,0x70(%rsp)
	pxor	%xmm15,%xmm15
___
$code.=<<___ if ($win64);
	movaps	-0xa8($key_),%xmm6
	movaps	%xmm0,-0xa8($key_)	# clear stack
	movaps	-0x98($key_),%xmm7
	movaps	%xmm0,-0x98($key_)
	movaps	-0x88($key_),%xmm8
	movaps	%xmm0,-0x88($key_)
	movaps	-0x78($key_),%xmm9
	movaps	%xmm0,-0x78($key_)
	movaps	-0x68($key_),%xmm10
	movaps	%xmm0,-0x68($key_)
	movaps	-0x58($key_),%xmm11
	movaps	%xmm0,-0x58($key_)
	movaps	-0x48($key_),%xmm12
	movaps	%xmm0,-0x48($key_)
	movaps	-0x38($key_),%xmm13
	movaps	%xmm0,-0x38($key_)
	movaps	-0x28($key_),%xmm14
	movaps	%xmm0,-0x28($key_)
	movaps	-0x18($key_),%xmm15
	movaps	%xmm0,-0x18($key_)
	movaps	%xmm0,0x00(%rsp)
	movaps	%xmm0,0x10(%rsp)
	movaps	%xmm0,0x20(%rsp)
	movaps	%xmm0,0x30(%rsp)
	movaps	%xmm0,0x40(%rsp)
	movaps	%xmm0,0x50(%rsp)
	movaps	%xmm0,0x60(%rsp)
	movaps	%xmm0,0x70(%rsp)
___
$code.=<<___;
	mov	-8($key_),%rbp
.cfi_restore	%rbp
	lea	($key_),%rsp
.cfi_def_cfa_register	%rsp
.Lctr32_epilogue:
	ret
.cfi_endproc
.size	${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks
___
}

######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2
#	const unsigned char iv[16]);
#
if (0) {	# Omit these functions in BoringSSL
my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
my $frame_size = 0x70 + ($win64?160:0);
my $key_ = "%rbp";	# override so that we can use %r11 as FP

$code.=<<___;
.globl	${PREFIX}_xts_encrypt
.type	${PREFIX}_xts_encrypt,\@function,6
.align	16
${PREFIX}_xts_encrypt:
.cfi_startproc
	_CET_ENDBR
	lea	(%rsp),%r11		# frame pointer
.cfi_def_cfa_register	%r11
	push	%rbp
.cfi_push	%rbp
	sub	\$$frame_size,%rsp
	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-0xa8(%r11)	# offload everything
	movaps	%xmm7,-0x98(%r11)
	movaps	%xmm8,-0x88(%r11)
	movaps	%xmm9,-0x78(%r11)
	movaps	%xmm10,-0x68(%r11)
	movaps	%xmm11,-0x58(%r11)
	movaps	%xmm12,-0x48(%r11)
	movaps	%xmm13,-0x38(%r11)
	movaps	%xmm14,-0x28(%r11)
	movaps	%xmm15,-0x18(%r11)
.Lxts_enc_body:
___
$code.=<<___;
	movups	($ivp),$inout0		# load clear-text tweak
	mov	240(%r8),$rounds	# key2->rounds
	mov	240($key),$rnds_	# key1->rounds
___
	# generate the tweak
	&aesni_generate1("enc",$key2,$rounds,$inout0);
$code.=<<___;
	$movkey	($key),$rndkey0		# zero round key
	mov	$key,$key_		# backup $key
	mov	$rnds_,$rounds		# backup $rounds
	shl	\$4,$rnds_
	mov	$len,$len_		# backup $len
	and	\$-16,$len

	$movkey	16($key,$rnds_),$rndkey1	# last round key

	movdqa	.Lxts_magic(%rip),$twmask
	movdqa	$inout0,@tweak[5]
	pshufd	\$0x5f,$inout0,$twres
	pxor	$rndkey0,$rndkey1
___
	# alternative tweak calculation algorithm is based on suggestions
	# by Shay Gueron. psrad doesn't conflict with AES-NI instructions
	# and should help in the future...
	for ($i=0;$i<4;$i++) {
	$code.=<<___;
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	movdqa	@tweak[5],@tweak[$i]
	psrad	\$31,$twtmp		# broadcast upper bits
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	pxor	$rndkey0,@tweak[$i]
	pxor	$twtmp,@tweak[5]
___
	}
$code.=<<___;
	movdqa	@tweak[5],@tweak[4]
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twres
	pxor	$rndkey0,@tweak[4]
	pxor	$twres,@tweak[5]
	movaps	$rndkey1,0x60(%rsp)	# save round[0]^round[last]

	sub	\$16*6,$len
	jc	.Lxts_enc_short		# if $len-=6*16 borrowed

	mov	\$16+96,$rounds
	lea	32($key_,$rnds_),$key	# end of key schedule
	sub	%r10,%rax		# twisted $rounds
	$movkey	16($key_),$rndkey1
	mov	%rax,%r10		# backup twisted $rounds
	lea	.Lxts_magic(%rip),%r8
	jmp	.Lxts_enc_grandloop

.align	32
.Lxts_enc_grandloop:
	movdqu	`16*0`($inp),$inout0	# load input
	movdqa	$rndkey0,$twmask
	movdqu	`16*1`($inp),$inout1
	pxor	@tweak[0],$inout0	# input^=tweak^round[0]
	movdqu	`16*2`($inp),$inout2
	pxor	@tweak[1],$inout1
	aesenc	$rndkey1,$inout0
	movdqu	`16*3`($inp),$inout3
	pxor	@tweak[2],$inout2
	aesenc	$rndkey1,$inout1
	movdqu	`16*4`($inp),$inout4
	pxor	@tweak[3],$inout3
	aesenc	$rndkey1,$inout2
	movdqu	`16*5`($inp),$inout5
	pxor	@tweak[5],$twmask	# round[0]^=tweak[5]
	movdqa	0x60(%rsp),$twres	# load round[0]^round[last]
	pxor	@tweak[4],$inout4
	aesenc	$rndkey1,$inout3
	$movkey	32($key_),$rndkey0
	lea	`16*6`($inp),$inp
	pxor	$twmask,$inout5

	pxor	$twres,@tweak[0]	# calculate tweaks^round[last]
	aesenc	$rndkey1,$inout4
	pxor	$twres,@tweak[1]
	movdqa	@tweak[0],`16*0`(%rsp)	# put aside tweaks^round[last]
	aesenc	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	pxor	$twres,@tweak[2]

	aesenc	$rndkey0,$inout0
	pxor	$twres,@tweak[3]
	movdqa	@tweak[1],`16*1`(%rsp)
	aesenc	$rndkey0,$inout1
	pxor	$twres,@tweak[4]
	movdqa	@tweak[2],`16*2`(%rsp)
	aesenc	$rndkey0,$inout2
	aesenc	$rndkey0,$inout3
	pxor	$twres,$twmask
	movdqa	@tweak[4],`16*4`(%rsp)
	aesenc	$rndkey0,$inout4
	aesenc	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	movdqa	$twmask,`16*5`(%rsp)
	pshufd	\$0x5f,@tweak[5],$twres
	jmp	.Lxts_enc_loop6
.align	32
.Lxts_enc_loop6:
	aesenc	$rndkey1,$inout0
	aesenc	$rndkey1,$inout1
	aesenc	$rndkey1,$inout2
	aesenc	$rndkey1,$inout3
	aesenc	$rndkey1,$inout4
$rndkey1,$inout4 1824 aesenc $rndkey1,$inout5 1825 $movkey -64($key,%rax),$rndkey1 1826 add \$32,%rax 1827 1828 aesenc $rndkey0,$inout0 1829 aesenc $rndkey0,$inout1 1830 aesenc $rndkey0,$inout2 1831 aesenc $rndkey0,$inout3 1832 aesenc $rndkey0,$inout4 1833 aesenc $rndkey0,$inout5 1834 $movkey -80($key,%rax),$rndkey0 1835 jnz .Lxts_enc_loop6 1836 1837 movdqa (%r8),$twmask # start calculating next tweak 1838 movdqa $twres,$twtmp 1839 paddd $twres,$twres 1840 aesenc $rndkey1,$inout0 1841 paddq @tweak[5],@tweak[5] 1842 psrad \$31,$twtmp 1843 aesenc $rndkey1,$inout1 1844 pand $twmask,$twtmp 1845 $movkey ($key_),@tweak[0] # load round[0] 1846 aesenc $rndkey1,$inout2 1847 aesenc $rndkey1,$inout3 1848 aesenc $rndkey1,$inout4 1849 pxor $twtmp,@tweak[5] 1850 movaps @tweak[0],@tweak[1] # copy round[0] 1851 aesenc $rndkey1,$inout5 1852 $movkey -64($key),$rndkey1 1853 1854 movdqa $twres,$twtmp 1855 aesenc $rndkey0,$inout0 1856 paddd $twres,$twres 1857 pxor @tweak[5],@tweak[0] 1858 aesenc $rndkey0,$inout1 1859 psrad \$31,$twtmp 1860 paddq @tweak[5],@tweak[5] 1861 aesenc $rndkey0,$inout2 1862 aesenc $rndkey0,$inout3 1863 pand $twmask,$twtmp 1864 movaps @tweak[1],@tweak[2] 1865 aesenc $rndkey0,$inout4 1866 pxor $twtmp,@tweak[5] 1867 movdqa $twres,$twtmp 1868 aesenc $rndkey0,$inout5 1869 $movkey -48($key),$rndkey0 1870 1871 paddd $twres,$twres 1872 aesenc $rndkey1,$inout0 1873 pxor @tweak[5],@tweak[1] 1874 psrad \$31,$twtmp 1875 aesenc $rndkey1,$inout1 1876 paddq @tweak[5],@tweak[5] 1877 pand $twmask,$twtmp 1878 aesenc $rndkey1,$inout2 1879 aesenc $rndkey1,$inout3 1880 movdqa @tweak[3],`16*3`(%rsp) 1881 pxor $twtmp,@tweak[5] 1882 aesenc $rndkey1,$inout4 1883 movaps @tweak[2],@tweak[3] 1884 movdqa $twres,$twtmp 1885 aesenc $rndkey1,$inout5 1886 $movkey -32($key),$rndkey1 1887 1888 paddd $twres,$twres 1889 aesenc $rndkey0,$inout0 1890 pxor @tweak[5],@tweak[2] 1891 psrad \$31,$twtmp 1892 aesenc $rndkey0,$inout1 1893 paddq @tweak[5],@tweak[5] 1894 pand $twmask,$twtmp 1895 aesenc $rndkey0,$inout2 1896 aesenc $rndkey0,$inout3 1897 aesenc $rndkey0,$inout4 1898 pxor $twtmp,@tweak[5] 1899 movaps @tweak[3],@tweak[4] 1900 aesenc $rndkey0,$inout5 1901 1902 movdqa $twres,$rndkey0 1903 paddd $twres,$twres 1904 aesenc $rndkey1,$inout0 1905 pxor @tweak[5],@tweak[3] 1906 psrad \$31,$rndkey0 1907 aesenc $rndkey1,$inout1 1908 paddq @tweak[5],@tweak[5] 1909 pand $twmask,$rndkey0 1910 aesenc $rndkey1,$inout2 1911 aesenc $rndkey1,$inout3 1912 pxor $rndkey0,@tweak[5] 1913 $movkey ($key_),$rndkey0 1914 aesenc $rndkey1,$inout4 1915 aesenc $rndkey1,$inout5 1916 $movkey 16($key_),$rndkey1 1917 1918 pxor @tweak[5],@tweak[4] 1919 aesenclast `16*0`(%rsp),$inout0 1920 psrad \$31,$twres 1921 paddq @tweak[5],@tweak[5] 1922 aesenclast `16*1`(%rsp),$inout1 1923 aesenclast `16*2`(%rsp),$inout2 1924 pand $twmask,$twres 1925 mov %r10,%rax # restore $rounds 1926 aesenclast `16*3`(%rsp),$inout3 1927 aesenclast `16*4`(%rsp),$inout4 1928 aesenclast `16*5`(%rsp),$inout5 1929 pxor $twres,@tweak[5] 1930 1931 lea `16*6`($out),$out # $out+=6*16 1932 movups $inout0,`-16*6`($out) # store 6 output blocks 1933 movups $inout1,`-16*5`($out) 1934 movups $inout2,`-16*4`($out) 1935 movups $inout3,`-16*3`($out) 1936 movups $inout4,`-16*2`($out) 1937 movups $inout5,`-16*1`($out) 1938 sub \$16*6,$len 1939 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow 1940 1941 mov \$16+96,$rounds 1942 sub $rnds_,$rounds 1943 mov $key_,$key # restore $key 1944 shr \$4,$rounds # restore original value 1945 1946.Lxts_enc_short: 1947 # at the point @tweak[0..5] are 
populated with tweak values 1948 mov $rounds,$rnds_ # backup $rounds 1949 pxor $rndkey0,@tweak[0] 1950 add \$16*6,$len # restore real remaining $len 1951 jz .Lxts_enc_done # done if ($len==0) 1952 1953 pxor $rndkey0,@tweak[1] 1954 cmp \$0x20,$len 1955 jb .Lxts_enc_one # $len is 1*16 1956 pxor $rndkey0,@tweak[2] 1957 je .Lxts_enc_two # $len is 2*16 1958 1959 pxor $rndkey0,@tweak[3] 1960 cmp \$0x40,$len 1961 jb .Lxts_enc_three # $len is 3*16 1962 pxor $rndkey0,@tweak[4] 1963 je .Lxts_enc_four # $len is 4*16 1964 1965 movdqu ($inp),$inout0 # $len is 5*16 1966 movdqu 16*1($inp),$inout1 1967 movdqu 16*2($inp),$inout2 1968 pxor @tweak[0],$inout0 1969 movdqu 16*3($inp),$inout3 1970 pxor @tweak[1],$inout1 1971 movdqu 16*4($inp),$inout4 1972 lea 16*5($inp),$inp # $inp+=5*16 1973 pxor @tweak[2],$inout2 1974 pxor @tweak[3],$inout3 1975 pxor @tweak[4],$inout4 1976 pxor $inout5,$inout5 1977 1978 call _aesni_encrypt6 1979 1980 xorps @tweak[0],$inout0 1981 movdqa @tweak[5],@tweak[0] 1982 xorps @tweak[1],$inout1 1983 xorps @tweak[2],$inout2 1984 movdqu $inout0,($out) # store 5 output blocks 1985 xorps @tweak[3],$inout3 1986 movdqu $inout1,16*1($out) 1987 xorps @tweak[4],$inout4 1988 movdqu $inout2,16*2($out) 1989 movdqu $inout3,16*3($out) 1990 movdqu $inout4,16*4($out) 1991 lea 16*5($out),$out # $out+=5*16 1992 jmp .Lxts_enc_done 1993 1994.align 16 1995.Lxts_enc_one: 1996 movups ($inp),$inout0 1997 lea 16*1($inp),$inp # inp+=1*16 1998 xorps @tweak[0],$inout0 1999___ 2000 &aesni_generate1("enc",$key,$rounds); 2001$code.=<<___; 2002 xorps @tweak[0],$inout0 2003 movdqa @tweak[1],@tweak[0] 2004 movups $inout0,($out) # store one output block 2005 lea 16*1($out),$out # $out+=1*16 2006 jmp .Lxts_enc_done 2007 2008.align 16 2009.Lxts_enc_two: 2010 movups ($inp),$inout0 2011 movups 16($inp),$inout1 2012 lea 32($inp),$inp # $inp+=2*16 2013 xorps @tweak[0],$inout0 2014 xorps @tweak[1],$inout1 2015 2016 call _aesni_encrypt2 2017 2018 xorps @tweak[0],$inout0 2019 movdqa @tweak[2],@tweak[0] 2020 xorps @tweak[1],$inout1 2021 movups $inout0,($out) # store 2 output blocks 2022 movups $inout1,16*1($out) 2023 lea 16*2($out),$out # $out+=2*16 2024 jmp .Lxts_enc_done 2025 2026.align 16 2027.Lxts_enc_three: 2028 movups ($inp),$inout0 2029 movups 16*1($inp),$inout1 2030 movups 16*2($inp),$inout2 2031 lea 16*3($inp),$inp # $inp+=3*16 2032 xorps @tweak[0],$inout0 2033 xorps @tweak[1],$inout1 2034 xorps @tweak[2],$inout2 2035 2036 call _aesni_encrypt3 2037 2038 xorps @tweak[0],$inout0 2039 movdqa @tweak[3],@tweak[0] 2040 xorps @tweak[1],$inout1 2041 xorps @tweak[2],$inout2 2042 movups $inout0,($out) # store 3 output blocks 2043 movups $inout1,16*1($out) 2044 movups $inout2,16*2($out) 2045 lea 16*3($out),$out # $out+=3*16 2046 jmp .Lxts_enc_done 2047 2048.align 16 2049.Lxts_enc_four: 2050 movups ($inp),$inout0 2051 movups 16*1($inp),$inout1 2052 movups 16*2($inp),$inout2 2053 xorps @tweak[0],$inout0 2054 movups 16*3($inp),$inout3 2055 lea 16*4($inp),$inp # $inp+=4*16 2056 xorps @tweak[1],$inout1 2057 xorps @tweak[2],$inout2 2058 xorps @tweak[3],$inout3 2059 2060 call _aesni_encrypt4 2061 2062 pxor @tweak[0],$inout0 2063 movdqa @tweak[4],@tweak[0] 2064 pxor @tweak[1],$inout1 2065 pxor @tweak[2],$inout2 2066 movdqu $inout0,($out) # store 4 output blocks 2067 pxor @tweak[3],$inout3 2068 movdqu $inout1,16*1($out) 2069 movdqu $inout2,16*2($out) 2070 movdqu $inout3,16*3($out) 2071 lea 16*4($out),$out # $out+=4*16 2072 jmp .Lxts_enc_done 2073 2074.align 16 2075.Lxts_enc_done: 2076 and \$15,$len_ # see if $len%16 is 0 2077 jz .Lxts_enc_ret 
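#
# The byte loop below performs standard XTS ciphertext stealing for the
# trailing len%16 bytes. A minimal C sketch of what it computes, using the
# plain names inp, out and r instead of the register aliases used here:
#
#	for (i = 0; i < r; i++) {	// r = len % 16
#		c = out[i - 16];	// steal a byte of the last full ciphertext block
#		out[i - 16] = inp[i];	// splice trailing plaintext into that block
#		out[i] = c;		// stolen byte becomes part of the short final block
#	}
#	// the patched block at out-16 is then encrypted once more with the
#	// remaining tweak and stored back in place
#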
2078 mov $len_,$len 2079 2080.Lxts_enc_steal: 2081 movzb ($inp),%eax # borrow $rounds ... 2082 movzb -16($out),%ecx # ... and $key 2083 lea 1($inp),$inp 2084 mov %al,-16($out) 2085 mov %cl,0($out) 2086 lea 1($out),$out 2087 sub \$1,$len 2088 jnz .Lxts_enc_steal 2089 2090 sub $len_,$out # rewind $out 2091 mov $key_,$key # restore $key 2092 mov $rnds_,$rounds # restore $rounds 2093 2094 movups -16($out),$inout0 2095 xorps @tweak[0],$inout0 2096___ 2097 &aesni_generate1("enc",$key,$rounds); 2098$code.=<<___; 2099 xorps @tweak[0],$inout0 2100 movups $inout0,-16($out) 2101 2102.Lxts_enc_ret: 2103 xorps %xmm0,%xmm0 # clear register bank 2104 pxor %xmm1,%xmm1 2105 pxor %xmm2,%xmm2 2106 pxor %xmm3,%xmm3 2107 pxor %xmm4,%xmm4 2108 pxor %xmm5,%xmm5 2109___ 2110$code.=<<___ if (!$win64); 2111 pxor %xmm6,%xmm6 2112 pxor %xmm7,%xmm7 2113 movaps %xmm0,0x00(%rsp) # clear stack 2114 pxor %xmm8,%xmm8 2115 movaps %xmm0,0x10(%rsp) 2116 pxor %xmm9,%xmm9 2117 movaps %xmm0,0x20(%rsp) 2118 pxor %xmm10,%xmm10 2119 movaps %xmm0,0x30(%rsp) 2120 pxor %xmm11,%xmm11 2121 movaps %xmm0,0x40(%rsp) 2122 pxor %xmm12,%xmm12 2123 movaps %xmm0,0x50(%rsp) 2124 pxor %xmm13,%xmm13 2125 movaps %xmm0,0x60(%rsp) 2126 pxor %xmm14,%xmm14 2127 pxor %xmm15,%xmm15 2128___ 2129$code.=<<___ if ($win64); 2130 movaps -0xa8(%r11),%xmm6 2131 movaps %xmm0,-0xa8(%r11) # clear stack 2132 movaps -0x98(%r11),%xmm7 2133 movaps %xmm0,-0x98(%r11) 2134 movaps -0x88(%r11),%xmm8 2135 movaps %xmm0,-0x88(%r11) 2136 movaps -0x78(%r11),%xmm9 2137 movaps %xmm0,-0x78(%r11) 2138 movaps -0x68(%r11),%xmm10 2139 movaps %xmm0,-0x68(%r11) 2140 movaps -0x58(%r11),%xmm11 2141 movaps %xmm0,-0x58(%r11) 2142 movaps -0x48(%r11),%xmm12 2143 movaps %xmm0,-0x48(%r11) 2144 movaps -0x38(%r11),%xmm13 2145 movaps %xmm0,-0x38(%r11) 2146 movaps -0x28(%r11),%xmm14 2147 movaps %xmm0,-0x28(%r11) 2148 movaps -0x18(%r11),%xmm15 2149 movaps %xmm0,-0x18(%r11) 2150 movaps %xmm0,0x00(%rsp) 2151 movaps %xmm0,0x10(%rsp) 2152 movaps %xmm0,0x20(%rsp) 2153 movaps %xmm0,0x30(%rsp) 2154 movaps %xmm0,0x40(%rsp) 2155 movaps %xmm0,0x50(%rsp) 2156 movaps %xmm0,0x60(%rsp) 2157___ 2158$code.=<<___; 2159 mov -8(%r11),%rbp 2160.cfi_restore %rbp 2161 lea (%r11),%rsp 2162.cfi_def_cfa_register %rsp 2163.Lxts_enc_epilogue: 2164 ret 2165.cfi_endproc 2166.size ${PREFIX}_xts_encrypt,.-${PREFIX}_xts_encrypt 2167___ 2168 2169$code.=<<___; 2170.globl ${PREFIX}_xts_decrypt 2171.type ${PREFIX}_xts_decrypt,\@function,6 2172.align 16 2173${PREFIX}_xts_decrypt: 2174.cfi_startproc 2175 _CET_ENDBR 2176 lea (%rsp),%r11 # frame pointer 2177.cfi_def_cfa_register %r11 2178 push %rbp 2179.cfi_push %rbp 2180 sub \$$frame_size,%rsp 2181 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 2182___ 2183$code.=<<___ if ($win64); 2184 movaps %xmm6,-0xa8(%r11) # offload everything 2185 movaps %xmm7,-0x98(%r11) 2186 movaps %xmm8,-0x88(%r11) 2187 movaps %xmm9,-0x78(%r11) 2188 movaps %xmm10,-0x68(%r11) 2189 movaps %xmm11,-0x58(%r11) 2190 movaps %xmm12,-0x48(%r11) 2191 movaps %xmm13,-0x38(%r11) 2192 movaps %xmm14,-0x28(%r11) 2193 movaps %xmm15,-0x18(%r11) 2194.Lxts_dec_body: 2195___ 2196$code.=<<___; 2197 movups ($ivp),$inout0 # load clear-text tweak 2198 mov 240($key2),$rounds # key2->rounds 2199 mov 240($key),$rnds_ # key1->rounds 2200___ 2201 # generate the tweak 2202 &aesni_generate1("enc",$key2,$rounds,$inout0); 2203$code.=<<___; 2204 xor %eax,%eax # if ($len%16) len-=16; 2205 test \$15,$len 2206 setnz %al 2207 shl \$4,%rax 2208 sub %rax,$len 2209 2210 $movkey ($key),$rndkey0 # zero round key 2211 mov $key,$key_ # backup 
$key 2212 mov $rnds_,$rounds # backup $rounds 2213 shl \$4,$rnds_ 2214 mov $len,$len_ # backup $len 2215 and \$-16,$len 2216 2217 $movkey 16($key,$rnds_),$rndkey1 # last round key 2218 2219 movdqa .Lxts_magic(%rip),$twmask 2220 movdqa $inout0,@tweak[5] 2221 pshufd \$0x5f,$inout0,$twres 2222 pxor $rndkey0,$rndkey1 2223___ 2224 for ($i=0;$i<4;$i++) { 2225 $code.=<<___; 2226 movdqa $twres,$twtmp 2227 paddd $twres,$twres 2228 movdqa @tweak[5],@tweak[$i] 2229 psrad \$31,$twtmp # broadcast upper bits 2230 paddq @tweak[5],@tweak[5] 2231 pand $twmask,$twtmp 2232 pxor $rndkey0,@tweak[$i] 2233 pxor $twtmp,@tweak[5] 2234___ 2235 } 2236$code.=<<___; 2237 movdqa @tweak[5],@tweak[4] 2238 psrad \$31,$twres 2239 paddq @tweak[5],@tweak[5] 2240 pand $twmask,$twres 2241 pxor $rndkey0,@tweak[4] 2242 pxor $twres,@tweak[5] 2243 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 2244 2245 sub \$16*6,$len 2246 jc .Lxts_dec_short # if $len-=6*16 borrowed 2247 2248 mov \$16+96,$rounds 2249 lea 32($key_,$rnds_),$key # end of key schedule 2250 sub %r10,%rax # twisted $rounds 2251 $movkey 16($key_),$rndkey1 2252 mov %rax,%r10 # backup twisted $rounds 2253 lea .Lxts_magic(%rip),%r8 2254 jmp .Lxts_dec_grandloop 2255 2256.align 32 2257.Lxts_dec_grandloop: 2258 movdqu `16*0`($inp),$inout0 # load input 2259 movdqa $rndkey0,$twmask 2260 movdqu `16*1`($inp),$inout1 2261 pxor @tweak[0],$inout0 # intput^=tweak^round[0] 2262 movdqu `16*2`($inp),$inout2 2263 pxor @tweak[1],$inout1 2264 aesdec $rndkey1,$inout0 2265 movdqu `16*3`($inp),$inout3 2266 pxor @tweak[2],$inout2 2267 aesdec $rndkey1,$inout1 2268 movdqu `16*4`($inp),$inout4 2269 pxor @tweak[3],$inout3 2270 aesdec $rndkey1,$inout2 2271 movdqu `16*5`($inp),$inout5 2272 pxor @tweak[5],$twmask # round[0]^=tweak[5] 2273 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 2274 pxor @tweak[4],$inout4 2275 aesdec $rndkey1,$inout3 2276 $movkey 32($key_),$rndkey0 2277 lea `16*6`($inp),$inp 2278 pxor $twmask,$inout5 2279 2280 pxor $twres,@tweak[0] # calculate tweaks^round[last] 2281 aesdec $rndkey1,$inout4 2282 pxor $twres,@tweak[1] 2283 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key 2284 aesdec $rndkey1,$inout5 2285 $movkey 48($key_),$rndkey1 2286 pxor $twres,@tweak[2] 2287 2288 aesdec $rndkey0,$inout0 2289 pxor $twres,@tweak[3] 2290 movdqa @tweak[1],`16*1`(%rsp) 2291 aesdec $rndkey0,$inout1 2292 pxor $twres,@tweak[4] 2293 movdqa @tweak[2],`16*2`(%rsp) 2294 aesdec $rndkey0,$inout2 2295 aesdec $rndkey0,$inout3 2296 pxor $twres,$twmask 2297 movdqa @tweak[4],`16*4`(%rsp) 2298 aesdec $rndkey0,$inout4 2299 aesdec $rndkey0,$inout5 2300 $movkey 64($key_),$rndkey0 2301 movdqa $twmask,`16*5`(%rsp) 2302 pshufd \$0x5f,@tweak[5],$twres 2303 jmp .Lxts_dec_loop6 2304.align 32 2305.Lxts_dec_loop6: 2306 aesdec $rndkey1,$inout0 2307 aesdec $rndkey1,$inout1 2308 aesdec $rndkey1,$inout2 2309 aesdec $rndkey1,$inout3 2310 aesdec $rndkey1,$inout4 2311 aesdec $rndkey1,$inout5 2312 $movkey -64($key,%rax),$rndkey1 2313 add \$32,%rax 2314 2315 aesdec $rndkey0,$inout0 2316 aesdec $rndkey0,$inout1 2317 aesdec $rndkey0,$inout2 2318 aesdec $rndkey0,$inout3 2319 aesdec $rndkey0,$inout4 2320 aesdec $rndkey0,$inout5 2321 $movkey -80($key,%rax),$rndkey0 2322 jnz .Lxts_dec_loop6 2323 2324 movdqa (%r8),$twmask # start calculating next tweak 2325 movdqa $twres,$twtmp 2326 paddd $twres,$twres 2327 aesdec $rndkey1,$inout0 2328 paddq @tweak[5],@tweak[5] 2329 psrad \$31,$twtmp 2330 aesdec $rndkey1,$inout1 2331 pand $twmask,$twtmp 2332 $movkey ($key_),@tweak[0] # load round[0] 2333 aesdec 
$rndkey1,$inout2 2334 aesdec $rndkey1,$inout3 2335 aesdec $rndkey1,$inout4 2336 pxor $twtmp,@tweak[5] 2337 movaps @tweak[0],@tweak[1] # copy round[0] 2338 aesdec $rndkey1,$inout5 2339 $movkey -64($key),$rndkey1 2340 2341 movdqa $twres,$twtmp 2342 aesdec $rndkey0,$inout0 2343 paddd $twres,$twres 2344 pxor @tweak[5],@tweak[0] 2345 aesdec $rndkey0,$inout1 2346 psrad \$31,$twtmp 2347 paddq @tweak[5],@tweak[5] 2348 aesdec $rndkey0,$inout2 2349 aesdec $rndkey0,$inout3 2350 pand $twmask,$twtmp 2351 movaps @tweak[1],@tweak[2] 2352 aesdec $rndkey0,$inout4 2353 pxor $twtmp,@tweak[5] 2354 movdqa $twres,$twtmp 2355 aesdec $rndkey0,$inout5 2356 $movkey -48($key),$rndkey0 2357 2358 paddd $twres,$twres 2359 aesdec $rndkey1,$inout0 2360 pxor @tweak[5],@tweak[1] 2361 psrad \$31,$twtmp 2362 aesdec $rndkey1,$inout1 2363 paddq @tweak[5],@tweak[5] 2364 pand $twmask,$twtmp 2365 aesdec $rndkey1,$inout2 2366 aesdec $rndkey1,$inout3 2367 movdqa @tweak[3],`16*3`(%rsp) 2368 pxor $twtmp,@tweak[5] 2369 aesdec $rndkey1,$inout4 2370 movaps @tweak[2],@tweak[3] 2371 movdqa $twres,$twtmp 2372 aesdec $rndkey1,$inout5 2373 $movkey -32($key),$rndkey1 2374 2375 paddd $twres,$twres 2376 aesdec $rndkey0,$inout0 2377 pxor @tweak[5],@tweak[2] 2378 psrad \$31,$twtmp 2379 aesdec $rndkey0,$inout1 2380 paddq @tweak[5],@tweak[5] 2381 pand $twmask,$twtmp 2382 aesdec $rndkey0,$inout2 2383 aesdec $rndkey0,$inout3 2384 aesdec $rndkey0,$inout4 2385 pxor $twtmp,@tweak[5] 2386 movaps @tweak[3],@tweak[4] 2387 aesdec $rndkey0,$inout5 2388 2389 movdqa $twres,$rndkey0 2390 paddd $twres,$twres 2391 aesdec $rndkey1,$inout0 2392 pxor @tweak[5],@tweak[3] 2393 psrad \$31,$rndkey0 2394 aesdec $rndkey1,$inout1 2395 paddq @tweak[5],@tweak[5] 2396 pand $twmask,$rndkey0 2397 aesdec $rndkey1,$inout2 2398 aesdec $rndkey1,$inout3 2399 pxor $rndkey0,@tweak[5] 2400 $movkey ($key_),$rndkey0 2401 aesdec $rndkey1,$inout4 2402 aesdec $rndkey1,$inout5 2403 $movkey 16($key_),$rndkey1 2404 2405 pxor @tweak[5],@tweak[4] 2406 aesdeclast `16*0`(%rsp),$inout0 2407 psrad \$31,$twres 2408 paddq @tweak[5],@tweak[5] 2409 aesdeclast `16*1`(%rsp),$inout1 2410 aesdeclast `16*2`(%rsp),$inout2 2411 pand $twmask,$twres 2412 mov %r10,%rax # restore $rounds 2413 aesdeclast `16*3`(%rsp),$inout3 2414 aesdeclast `16*4`(%rsp),$inout4 2415 aesdeclast `16*5`(%rsp),$inout5 2416 pxor $twres,@tweak[5] 2417 2418 lea `16*6`($out),$out # $out+=6*16 2419 movups $inout0,`-16*6`($out) # store 6 output blocks 2420 movups $inout1,`-16*5`($out) 2421 movups $inout2,`-16*4`($out) 2422 movups $inout3,`-16*3`($out) 2423 movups $inout4,`-16*2`($out) 2424 movups $inout5,`-16*1`($out) 2425 sub \$16*6,$len 2426 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow 2427 2428 mov \$16+96,$rounds 2429 sub $rnds_,$rounds 2430 mov $key_,$key # restore $key 2431 shr \$4,$rounds # restore original value 2432 2433.Lxts_dec_short: 2434 # at the point @tweak[0..5] are populated with tweak values 2435 mov $rounds,$rnds_ # backup $rounds 2436 pxor $rndkey0,@tweak[0] 2437 pxor $rndkey0,@tweak[1] 2438 add \$16*6,$len # restore real remaining $len 2439 jz .Lxts_dec_done # done if ($len==0) 2440 2441 pxor $rndkey0,@tweak[2] 2442 cmp \$0x20,$len 2443 jb .Lxts_dec_one # $len is 1*16 2444 pxor $rndkey0,@tweak[3] 2445 je .Lxts_dec_two # $len is 2*16 2446 2447 pxor $rndkey0,@tweak[4] 2448 cmp \$0x40,$len 2449 jb .Lxts_dec_three # $len is 3*16 2450 je .Lxts_dec_four # $len is 4*16 2451 2452 movdqu ($inp),$inout0 # $len is 5*16 2453 movdqu 16*1($inp),$inout1 2454 movdqu 16*2($inp),$inout2 2455 pxor @tweak[0],$inout0 2456 
movdqu 16*3($inp),$inout3 2457 pxor @tweak[1],$inout1 2458 movdqu 16*4($inp),$inout4 2459 lea 16*5($inp),$inp # $inp+=5*16 2460 pxor @tweak[2],$inout2 2461 pxor @tweak[3],$inout3 2462 pxor @tweak[4],$inout4 2463 2464 call _aesni_decrypt6 2465 2466 xorps @tweak[0],$inout0 2467 xorps @tweak[1],$inout1 2468 xorps @tweak[2],$inout2 2469 movdqu $inout0,($out) # store 5 output blocks 2470 xorps @tweak[3],$inout3 2471 movdqu $inout1,16*1($out) 2472 xorps @tweak[4],$inout4 2473 movdqu $inout2,16*2($out) 2474 pxor $twtmp,$twtmp 2475 movdqu $inout3,16*3($out) 2476 pcmpgtd @tweak[5],$twtmp 2477 movdqu $inout4,16*4($out) 2478 lea 16*5($out),$out # $out+=5*16 2479 pshufd \$0x13,$twtmp,@tweak[1] # $twres 2480 and \$15,$len_ 2481 jz .Lxts_dec_ret 2482 2483 movdqa @tweak[5],@tweak[0] 2484 paddq @tweak[5],@tweak[5] # psllq 1,$tweak 2485 pand $twmask,@tweak[1] # isolate carry and residue 2486 pxor @tweak[5],@tweak[1] 2487 jmp .Lxts_dec_done2 2488 2489.align 16 2490.Lxts_dec_one: 2491 movups ($inp),$inout0 2492 lea 16*1($inp),$inp # $inp+=1*16 2493 xorps @tweak[0],$inout0 2494___ 2495 &aesni_generate1("dec",$key,$rounds); 2496$code.=<<___; 2497 xorps @tweak[0],$inout0 2498 movdqa @tweak[1],@tweak[0] 2499 movups $inout0,($out) # store one output block 2500 movdqa @tweak[2],@tweak[1] 2501 lea 16*1($out),$out # $out+=1*16 2502 jmp .Lxts_dec_done 2503 2504.align 16 2505.Lxts_dec_two: 2506 movups ($inp),$inout0 2507 movups 16($inp),$inout1 2508 lea 32($inp),$inp # $inp+=2*16 2509 xorps @tweak[0],$inout0 2510 xorps @tweak[1],$inout1 2511 2512 call _aesni_decrypt2 2513 2514 xorps @tweak[0],$inout0 2515 movdqa @tweak[2],@tweak[0] 2516 xorps @tweak[1],$inout1 2517 movdqa @tweak[3],@tweak[1] 2518 movups $inout0,($out) # store 2 output blocks 2519 movups $inout1,16*1($out) 2520 lea 16*2($out),$out # $out+=2*16 2521 jmp .Lxts_dec_done 2522 2523.align 16 2524.Lxts_dec_three: 2525 movups ($inp),$inout0 2526 movups 16*1($inp),$inout1 2527 movups 16*2($inp),$inout2 2528 lea 16*3($inp),$inp # $inp+=3*16 2529 xorps @tweak[0],$inout0 2530 xorps @tweak[1],$inout1 2531 xorps @tweak[2],$inout2 2532 2533 call _aesni_decrypt3 2534 2535 xorps @tweak[0],$inout0 2536 movdqa @tweak[3],@tweak[0] 2537 xorps @tweak[1],$inout1 2538 movdqa @tweak[4],@tweak[1] 2539 xorps @tweak[2],$inout2 2540 movups $inout0,($out) # store 3 output blocks 2541 movups $inout1,16*1($out) 2542 movups $inout2,16*2($out) 2543 lea 16*3($out),$out # $out+=3*16 2544 jmp .Lxts_dec_done 2545 2546.align 16 2547.Lxts_dec_four: 2548 movups ($inp),$inout0 2549 movups 16*1($inp),$inout1 2550 movups 16*2($inp),$inout2 2551 xorps @tweak[0],$inout0 2552 movups 16*3($inp),$inout3 2553 lea 16*4($inp),$inp # $inp+=4*16 2554 xorps @tweak[1],$inout1 2555 xorps @tweak[2],$inout2 2556 xorps @tweak[3],$inout3 2557 2558 call _aesni_decrypt4 2559 2560 pxor @tweak[0],$inout0 2561 movdqa @tweak[4],@tweak[0] 2562 pxor @tweak[1],$inout1 2563 movdqa @tweak[5],@tweak[1] 2564 pxor @tweak[2],$inout2 2565 movdqu $inout0,($out) # store 4 output blocks 2566 pxor @tweak[3],$inout3 2567 movdqu $inout1,16*1($out) 2568 movdqu $inout2,16*2($out) 2569 movdqu $inout3,16*3($out) 2570 lea 16*4($out),$out # $out+=4*16 2571 jmp .Lxts_dec_done 2572 2573.align 16 2574.Lxts_dec_done: 2575 and \$15,$len_ # see if $len%16 is 0 2576 jz .Lxts_dec_ret 2577.Lxts_dec_done2: 2578 mov $len_,$len 2579 mov $key_,$key # restore $key 2580 mov $rnds_,$rounds # restore $rounds 2581 2582 movups ($inp),$inout0 2583 xorps @tweak[1],$inout0 2584___ 2585 &aesni_generate1("dec",$key,$rounds); 2586$code.=<<___; 2587 xorps 
@tweak[1],$inout0 2588 movups $inout0,($out) 2589 2590.Lxts_dec_steal: 2591 movzb 16($inp),%eax # borrow $rounds ... 2592 movzb ($out),%ecx # ... and $key 2593 lea 1($inp),$inp 2594 mov %al,($out) 2595 mov %cl,16($out) 2596 lea 1($out),$out 2597 sub \$1,$len 2598 jnz .Lxts_dec_steal 2599 2600 sub $len_,$out # rewind $out 2601 mov $key_,$key # restore $key 2602 mov $rnds_,$rounds # restore $rounds 2603 2604 movups ($out),$inout0 2605 xorps @tweak[0],$inout0 2606___ 2607 &aesni_generate1("dec",$key,$rounds); 2608$code.=<<___; 2609 xorps @tweak[0],$inout0 2610 movups $inout0,($out) 2611 2612.Lxts_dec_ret: 2613 xorps %xmm0,%xmm0 # clear register bank 2614 pxor %xmm1,%xmm1 2615 pxor %xmm2,%xmm2 2616 pxor %xmm3,%xmm3 2617 pxor %xmm4,%xmm4 2618 pxor %xmm5,%xmm5 2619___ 2620$code.=<<___ if (!$win64); 2621 pxor %xmm6,%xmm6 2622 pxor %xmm7,%xmm7 2623 movaps %xmm0,0x00(%rsp) # clear stack 2624 pxor %xmm8,%xmm8 2625 movaps %xmm0,0x10(%rsp) 2626 pxor %xmm9,%xmm9 2627 movaps %xmm0,0x20(%rsp) 2628 pxor %xmm10,%xmm10 2629 movaps %xmm0,0x30(%rsp) 2630 pxor %xmm11,%xmm11 2631 movaps %xmm0,0x40(%rsp) 2632 pxor %xmm12,%xmm12 2633 movaps %xmm0,0x50(%rsp) 2634 pxor %xmm13,%xmm13 2635 movaps %xmm0,0x60(%rsp) 2636 pxor %xmm14,%xmm14 2637 pxor %xmm15,%xmm15 2638___ 2639$code.=<<___ if ($win64); 2640 movaps -0xa8(%r11),%xmm6 2641 movaps %xmm0,-0xa8(%r11) # clear stack 2642 movaps -0x98(%r11),%xmm7 2643 movaps %xmm0,-0x98(%r11) 2644 movaps -0x88(%r11),%xmm8 2645 movaps %xmm0,-0x88(%r11) 2646 movaps -0x78(%r11),%xmm9 2647 movaps %xmm0,-0x78(%r11) 2648 movaps -0x68(%r11),%xmm10 2649 movaps %xmm0,-0x68(%r11) 2650 movaps -0x58(%r11),%xmm11 2651 movaps %xmm0,-0x58(%r11) 2652 movaps -0x48(%r11),%xmm12 2653 movaps %xmm0,-0x48(%r11) 2654 movaps -0x38(%r11),%xmm13 2655 movaps %xmm0,-0x38(%r11) 2656 movaps -0x28(%r11),%xmm14 2657 movaps %xmm0,-0x28(%r11) 2658 movaps -0x18(%r11),%xmm15 2659 movaps %xmm0,-0x18(%r11) 2660 movaps %xmm0,0x00(%rsp) 2661 movaps %xmm0,0x10(%rsp) 2662 movaps %xmm0,0x20(%rsp) 2663 movaps %xmm0,0x30(%rsp) 2664 movaps %xmm0,0x40(%rsp) 2665 movaps %xmm0,0x50(%rsp) 2666 movaps %xmm0,0x60(%rsp) 2667___ 2668$code.=<<___; 2669 mov -8(%r11),%rbp 2670.cfi_restore %rbp 2671 lea (%r11),%rsp 2672.cfi_def_cfa_register %rsp 2673.Lxts_dec_epilogue: 2674 ret 2675.cfi_endproc 2676.size ${PREFIX}_xts_decrypt,.-${PREFIX}_xts_decrypt 2677___ 2678} }} 2679 2680######################################################################## 2681# void $PREFIX_cbc_encrypt (const void *inp, void *out, 2682# size_t length, const AES_KEY *key, 2683# unsigned char *ivp,const int enc); 2684{ 2685my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt 2686my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); 2687 2688$code.=<<___; 2689.globl ${PREFIX}_cbc_encrypt 2690.type ${PREFIX}_cbc_encrypt,\@function,6 2691.align 16 2692${PREFIX}_cbc_encrypt: 2693.cfi_startproc 2694 _CET_ENDBR 2695 test $len,$len # check length 2696 jz .Lcbc_ret 2697 2698 mov 240($key),$rnds_ # key->rounds 2699 mov $key,$key_ # backup $key 2700 test %r9d,%r9d # 6th argument 2701 jz .Lcbc_decrypt 2702#--------------------------- CBC ENCRYPT ------------------------------# 2703 movups ($ivp),$inout0 # load iv as initial state 2704 mov $rnds_,$rounds 2705 cmp \$16,$len 2706 jb .Lcbc_enc_tail 2707 sub \$16,$len 2708 jmp .Lcbc_enc_loop 2709.align 16 2710.Lcbc_enc_loop: 2711 movups ($inp),$inout1 # load input 2712 lea 16($inp),$inp 2713 #xorps $inout1,$inout0 2714___ 2715 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); 2716$code.=<<___; 2717 mov $rnds_,$rounds 
# restore $rounds 2718 mov $key_,$key # restore $key 2719 movups $inout0,0($out) # store output 2720 lea 16($out),$out 2721 sub \$16,$len 2722 jnc .Lcbc_enc_loop 2723 add \$16,$len 2724 jnz .Lcbc_enc_tail 2725 pxor $rndkey0,$rndkey0 # clear register bank 2726 pxor $rndkey1,$rndkey1 2727 movups $inout0,($ivp) 2728 pxor $inout0,$inout0 2729 pxor $inout1,$inout1 2730 jmp .Lcbc_ret 2731 2732.Lcbc_enc_tail: 2733 mov $len,%rcx # zaps $key 2734 xchg $inp,$out # $inp is %rsi and $out is %rdi now 2735 .long 0x9066A4F3 # rep movsb 2736 mov \$16,%ecx # zero tail 2737 sub $len,%rcx 2738 xor %eax,%eax 2739 .long 0x9066AAF3 # rep stosb 2740 lea -16(%rdi),%rdi # rewind $out by 1 block 2741 mov $rnds_,$rounds # restore $rounds 2742 mov %rdi,%rsi # $inp and $out are the same 2743 mov $key_,$key # restore $key 2744 xor $len,$len # len=16 2745 jmp .Lcbc_enc_loop # one more spin 2746#--------------------------- CBC DECRYPT ------------------------------# 2747.align 16 2748.Lcbc_decrypt: 2749 cmp \$16,$len 2750 jne .Lcbc_decrypt_bulk 2751 2752 # handle single block without allocating stack frame, 2753 # useful in ciphertext stealing mode 2754 movdqu ($inp),$inout0 # load input 2755 movdqu ($ivp),$inout1 # load iv 2756 movdqa $inout0,$inout2 # future iv 2757___ 2758 &aesni_generate1("dec",$key,$rnds_); 2759$code.=<<___; 2760 pxor $rndkey0,$rndkey0 # clear register bank 2761 pxor $rndkey1,$rndkey1 2762 movdqu $inout2,($ivp) # store iv 2763 xorps $inout1,$inout0 # ^=iv 2764 pxor $inout1,$inout1 2765 movups $inout0,($out) # store output 2766 pxor $inout0,$inout0 2767 jmp .Lcbc_ret 2768.align 16 2769.Lcbc_decrypt_bulk: 2770 lea (%rsp),%r11 # frame pointer 2771.cfi_def_cfa_register %r11 2772 push %rbp 2773.cfi_push %rbp 2774 sub \$$frame_size,%rsp 2775 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 2776___ 2777$code.=<<___ if ($win64); 2778 movaps %xmm6,0x10(%rsp) 2779 movaps %xmm7,0x20(%rsp) 2780 movaps %xmm8,0x30(%rsp) 2781 movaps %xmm9,0x40(%rsp) 2782 movaps %xmm10,0x50(%rsp) 2783 movaps %xmm11,0x60(%rsp) 2784 movaps %xmm12,0x70(%rsp) 2785 movaps %xmm13,0x80(%rsp) 2786 movaps %xmm14,0x90(%rsp) 2787 movaps %xmm15,0xa0(%rsp) 2788.Lcbc_decrypt_body: 2789___ 2790 2791my $inp_=$key_="%rbp"; # reassign $key_ 2792 2793$code.=<<___; 2794 mov $key,$key_ # [re-]backup $key [after reassignment] 2795 movups ($ivp),$iv 2796 mov $rnds_,$rounds 2797 cmp \$0x50,$len 2798 jbe .Lcbc_dec_tail 2799 2800 $movkey ($key),$rndkey0 2801 movdqu 0x00($inp),$inout0 # load input 2802 movdqu 0x10($inp),$inout1 2803 movdqa $inout0,$in0 2804 movdqu 0x20($inp),$inout2 2805 movdqa $inout1,$in1 2806 movdqu 0x30($inp),$inout3 2807 movdqa $inout2,$in2 2808 movdqu 0x40($inp),$inout4 2809 movdqa $inout3,$in3 2810 movdqu 0x50($inp),$inout5 2811 movdqa $inout4,$in4 2812 cmp \$0x70,$len 2813 jbe .Lcbc_dec_six_or_seven 2814 2815 sub \$0x70,$len # $len is biased by -7*16 2816 lea 0x70($key),$key # size optimization 2817 jmp .Lcbc_dec_loop8_enter 2818.align 16 2819.Lcbc_dec_loop8: 2820 movups $inout7,($out) 2821 lea 0x10($out),$out 2822.Lcbc_dec_loop8_enter: 2823 movdqu 0x60($inp),$inout6 2824 pxor $rndkey0,$inout0 2825 movdqu 0x70($inp),$inout7 2826 pxor $rndkey0,$inout1 2827 $movkey 0x10-0x70($key),$rndkey1 2828 pxor $rndkey0,$inout2 2829 mov \$-1,$inp_ 2830 cmp \$0x70,$len # is there at least 0x60 bytes ahead? 
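#
# The mov/cmp/adc/and/add sequence around this point selects, without a
# branch, where the next iteration's input will be loaded from; roughly,
# in C terms (inp, inp_ and len standing for the registers aliased above):
#
#	inp_ = inp + (len < 0x70 ? 0 : 0x80);
#
# so when fewer than 0x70 bytes remain, the look-ahead loads further down
# simply re-read the current group instead of touching memory past the
# end of the input buffer.
#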
2831 pxor $rndkey0,$inout3 2832 pxor $rndkey0,$inout4 2833 pxor $rndkey0,$inout5 2834 pxor $rndkey0,$inout6 2835 2836 aesdec $rndkey1,$inout0 2837 pxor $rndkey0,$inout7 2838 $movkey 0x20-0x70($key),$rndkey0 2839 aesdec $rndkey1,$inout1 2840 aesdec $rndkey1,$inout2 2841 aesdec $rndkey1,$inout3 2842 aesdec $rndkey1,$inout4 2843 aesdec $rndkey1,$inout5 2844 aesdec $rndkey1,$inout6 2845 adc \$0,$inp_ 2846 and \$128,$inp_ 2847 aesdec $rndkey1,$inout7 2848 add $inp,$inp_ 2849 $movkey 0x30-0x70($key),$rndkey1 2850___ 2851for($i=1;$i<12;$i++) { 2852my $rndkeyx = ($i&1)?$rndkey0:$rndkey1; 2853$code.=<<___ if ($i==7); 2854 cmp \$11,$rounds 2855___ 2856$code.=<<___; 2857 aesdec $rndkeyx,$inout0 2858 aesdec $rndkeyx,$inout1 2859 aesdec $rndkeyx,$inout2 2860 aesdec $rndkeyx,$inout3 2861 aesdec $rndkeyx,$inout4 2862 aesdec $rndkeyx,$inout5 2863 aesdec $rndkeyx,$inout6 2864 aesdec $rndkeyx,$inout7 2865 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx 2866___ 2867$code.=<<___ if ($i<6 || (!($i&1) && $i>7)); 2868 nop 2869___ 2870$code.=<<___ if ($i==7); 2871 jb .Lcbc_dec_done 2872___ 2873$code.=<<___ if ($i==9); 2874 je .Lcbc_dec_done 2875___ 2876$code.=<<___ if ($i==11); 2877 jmp .Lcbc_dec_done 2878___ 2879} 2880$code.=<<___; 2881.align 16 2882.Lcbc_dec_done: 2883 aesdec $rndkey1,$inout0 2884 aesdec $rndkey1,$inout1 2885 pxor $rndkey0,$iv 2886 pxor $rndkey0,$in0 2887 aesdec $rndkey1,$inout2 2888 aesdec $rndkey1,$inout3 2889 pxor $rndkey0,$in1 2890 pxor $rndkey0,$in2 2891 aesdec $rndkey1,$inout4 2892 aesdec $rndkey1,$inout5 2893 pxor $rndkey0,$in3 2894 pxor $rndkey0,$in4 2895 aesdec $rndkey1,$inout6 2896 aesdec $rndkey1,$inout7 2897 movdqu 0x50($inp),$rndkey1 2898 2899 aesdeclast $iv,$inout0 2900 movdqu 0x60($inp),$iv # borrow $iv 2901 pxor $rndkey0,$rndkey1 2902 aesdeclast $in0,$inout1 2903 pxor $rndkey0,$iv 2904 movdqu 0x70($inp),$rndkey0 # next IV 2905 aesdeclast $in1,$inout2 2906 lea 0x80($inp),$inp 2907 movdqu 0x00($inp_),$in0 2908 aesdeclast $in2,$inout3 2909 aesdeclast $in3,$inout4 2910 movdqu 0x10($inp_),$in1 2911 movdqu 0x20($inp_),$in2 2912 aesdeclast $in4,$inout5 2913 aesdeclast $rndkey1,$inout6 2914 movdqu 0x30($inp_),$in3 2915 movdqu 0x40($inp_),$in4 2916 aesdeclast $iv,$inout7 2917 movdqa $rndkey0,$iv # return $iv 2918 movdqu 0x50($inp_),$rndkey1 2919 $movkey -0x70($key),$rndkey0 2920 2921 movups $inout0,($out) # store output 2922 movdqa $in0,$inout0 2923 movups $inout1,0x10($out) 2924 movdqa $in1,$inout1 2925 movups $inout2,0x20($out) 2926 movdqa $in2,$inout2 2927 movups $inout3,0x30($out) 2928 movdqa $in3,$inout3 2929 movups $inout4,0x40($out) 2930 movdqa $in4,$inout4 2931 movups $inout5,0x50($out) 2932 movdqa $rndkey1,$inout5 2933 movups $inout6,0x60($out) 2934 lea 0x70($out),$out 2935 2936 sub \$0x80,$len 2937 ja .Lcbc_dec_loop8 2938 2939 movaps $inout7,$inout0 2940 lea -0x70($key),$key 2941 add \$0x70,$len 2942 jle .Lcbc_dec_clear_tail_collected 2943 movups $inout7,($out) 2944 lea 0x10($out),$out 2945 cmp \$0x50,$len 2946 jbe .Lcbc_dec_tail 2947 2948 movaps $in0,$inout0 2949.Lcbc_dec_six_or_seven: 2950 cmp \$0x60,$len 2951 ja .Lcbc_dec_seven 2952 2953 movaps $inout5,$inout6 2954 call _aesni_decrypt6 2955 pxor $iv,$inout0 # ^= IV 2956 movaps $inout6,$iv 2957 pxor $in0,$inout1 2958 movdqu $inout0,($out) 2959 pxor $in1,$inout2 2960 movdqu $inout1,0x10($out) 2961 pxor $inout1,$inout1 # clear register bank 2962 pxor $in2,$inout3 2963 movdqu $inout2,0x20($out) 2964 pxor $inout2,$inout2 2965 pxor $in3,$inout4 2966 movdqu $inout3,0x30($out) 2967 pxor $inout3,$inout3 2968 pxor $in4,$inout5 2969 movdqu 
$inout4,0x40($out) 2970 pxor $inout4,$inout4 2971 lea 0x50($out),$out 2972 movdqa $inout5,$inout0 2973 pxor $inout5,$inout5 2974 jmp .Lcbc_dec_tail_collected 2975 2976.align 16 2977.Lcbc_dec_seven: 2978 movups 0x60($inp),$inout6 2979 xorps $inout7,$inout7 2980 call _aesni_decrypt8 2981 movups 0x50($inp),$inout7 2982 pxor $iv,$inout0 # ^= IV 2983 movups 0x60($inp),$iv 2984 pxor $in0,$inout1 2985 movdqu $inout0,($out) 2986 pxor $in1,$inout2 2987 movdqu $inout1,0x10($out) 2988 pxor $inout1,$inout1 # clear register bank 2989 pxor $in2,$inout3 2990 movdqu $inout2,0x20($out) 2991 pxor $inout2,$inout2 2992 pxor $in3,$inout4 2993 movdqu $inout3,0x30($out) 2994 pxor $inout3,$inout3 2995 pxor $in4,$inout5 2996 movdqu $inout4,0x40($out) 2997 pxor $inout4,$inout4 2998 pxor $inout7,$inout6 2999 movdqu $inout5,0x50($out) 3000 pxor $inout5,$inout5 3001 lea 0x60($out),$out 3002 movdqa $inout6,$inout0 3003 pxor $inout6,$inout6 3004 pxor $inout7,$inout7 3005 jmp .Lcbc_dec_tail_collected 3006 3007.Lcbc_dec_tail: 3008 movups ($inp),$inout0 3009 sub \$0x10,$len 3010 jbe .Lcbc_dec_one # $len is 1*16 or less 3011 3012 movups 0x10($inp),$inout1 3013 movaps $inout0,$in0 3014 sub \$0x10,$len 3015 jbe .Lcbc_dec_two # $len is 2*16 or less 3016 3017 movups 0x20($inp),$inout2 3018 movaps $inout1,$in1 3019 sub \$0x10,$len 3020 jbe .Lcbc_dec_three # $len is 3*16 or less 3021 3022 movups 0x30($inp),$inout3 3023 movaps $inout2,$in2 3024 sub \$0x10,$len 3025 jbe .Lcbc_dec_four # $len is 4*16 or less 3026 3027 movups 0x40($inp),$inout4 # $len is 5*16 or less 3028 movaps $inout3,$in3 3029 movaps $inout4,$in4 3030 xorps $inout5,$inout5 3031 call _aesni_decrypt6 3032 pxor $iv,$inout0 3033 movaps $in4,$iv 3034 pxor $in0,$inout1 3035 movdqu $inout0,($out) 3036 pxor $in1,$inout2 3037 movdqu $inout1,0x10($out) 3038 pxor $inout1,$inout1 # clear register bank 3039 pxor $in2,$inout3 3040 movdqu $inout2,0x20($out) 3041 pxor $inout2,$inout2 3042 pxor $in3,$inout4 3043 movdqu $inout3,0x30($out) 3044 pxor $inout3,$inout3 3045 lea 0x40($out),$out 3046 movdqa $inout4,$inout0 3047 pxor $inout4,$inout4 3048 pxor $inout5,$inout5 3049 sub \$0x10,$len 3050 jmp .Lcbc_dec_tail_collected 3051 3052.align 16 3053.Lcbc_dec_one: 3054 movaps $inout0,$in0 3055___ 3056 &aesni_generate1("dec",$key,$rounds); 3057$code.=<<___; 3058 xorps $iv,$inout0 3059 movaps $in0,$iv 3060 jmp .Lcbc_dec_tail_collected 3061.align 16 3062.Lcbc_dec_two: 3063 movaps $inout1,$in1 3064 call _aesni_decrypt2 3065 pxor $iv,$inout0 3066 movaps $in1,$iv 3067 pxor $in0,$inout1 3068 movdqu $inout0,($out) 3069 movdqa $inout1,$inout0 3070 pxor $inout1,$inout1 # clear register bank 3071 lea 0x10($out),$out 3072 jmp .Lcbc_dec_tail_collected 3073.align 16 3074.Lcbc_dec_three: 3075 movaps $inout2,$in2 3076 call _aesni_decrypt3 3077 pxor $iv,$inout0 3078 movaps $in2,$iv 3079 pxor $in0,$inout1 3080 movdqu $inout0,($out) 3081 pxor $in1,$inout2 3082 movdqu $inout1,0x10($out) 3083 pxor $inout1,$inout1 # clear register bank 3084 movdqa $inout2,$inout0 3085 pxor $inout2,$inout2 3086 lea 0x20($out),$out 3087 jmp .Lcbc_dec_tail_collected 3088.align 16 3089.Lcbc_dec_four: 3090 movaps $inout3,$in3 3091 call _aesni_decrypt4 3092 pxor $iv,$inout0 3093 movaps $in3,$iv 3094 pxor $in0,$inout1 3095 movdqu $inout0,($out) 3096 pxor $in1,$inout2 3097 movdqu $inout1,0x10($out) 3098 pxor $inout1,$inout1 # clear register bank 3099 pxor $in2,$inout3 3100 movdqu $inout2,0x20($out) 3101 pxor $inout2,$inout2 3102 movdqa $inout3,$inout0 3103 pxor $inout3,$inout3 3104 lea 0x30($out),$out 3105 jmp 
.Lcbc_dec_tail_collected 3106 3107.align 16 3108.Lcbc_dec_clear_tail_collected: 3109 pxor $inout1,$inout1 # clear register bank 3110 pxor $inout2,$inout2 3111 pxor $inout3,$inout3 3112___ 3113$code.=<<___ if (!$win64); 3114 pxor $inout4,$inout4 # %xmm6..9 3115 pxor $inout5,$inout5 3116 pxor $inout6,$inout6 3117 pxor $inout7,$inout7 3118___ 3119$code.=<<___; 3120.Lcbc_dec_tail_collected: 3121 movups $iv,($ivp) 3122 and \$15,$len 3123 jnz .Lcbc_dec_tail_partial 3124 movups $inout0,($out) 3125 pxor $inout0,$inout0 3126 jmp .Lcbc_dec_ret 3127.align 16 3128.Lcbc_dec_tail_partial: 3129 movaps $inout0,(%rsp) 3130 pxor $inout0,$inout0 3131 mov \$16,%rcx 3132 mov $out,%rdi 3133 sub $len,%rcx 3134 lea (%rsp),%rsi 3135 .long 0x9066A4F3 # rep movsb 3136 movdqa $inout0,(%rsp) 3137 3138.Lcbc_dec_ret: 3139 xorps $rndkey0,$rndkey0 # %xmm0 3140 pxor $rndkey1,$rndkey1 3141___ 3142$code.=<<___ if ($win64); 3143 movaps 0x10(%rsp),%xmm6 3144 movaps %xmm0,0x10(%rsp) # clear stack 3145 movaps 0x20(%rsp),%xmm7 3146 movaps %xmm0,0x20(%rsp) 3147 movaps 0x30(%rsp),%xmm8 3148 movaps %xmm0,0x30(%rsp) 3149 movaps 0x40(%rsp),%xmm9 3150 movaps %xmm0,0x40(%rsp) 3151 movaps 0x50(%rsp),%xmm10 3152 movaps %xmm0,0x50(%rsp) 3153 movaps 0x60(%rsp),%xmm11 3154 movaps %xmm0,0x60(%rsp) 3155 movaps 0x70(%rsp),%xmm12 3156 movaps %xmm0,0x70(%rsp) 3157 movaps 0x80(%rsp),%xmm13 3158 movaps %xmm0,0x80(%rsp) 3159 movaps 0x90(%rsp),%xmm14 3160 movaps %xmm0,0x90(%rsp) 3161 movaps 0xa0(%rsp),%xmm15 3162 movaps %xmm0,0xa0(%rsp) 3163___ 3164$code.=<<___; 3165 mov -8(%r11),%rbp 3166.cfi_restore %rbp 3167 lea (%r11),%rsp 3168.cfi_def_cfa_register %rsp 3169.Lcbc_ret: 3170 ret 3171.cfi_endproc 3172.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt 3173___ 3174} 3175# int ${PREFIX}_set_decrypt_key(const unsigned char *inp, 3176# int bits, AES_KEY *key) 3177# 3178# input: $inp user-supplied key 3179# $bits $inp length in bits 3180# $key pointer to key schedule 3181# output: %eax 0 denoting success, -1 or -2 - failure (see C) 3182# *$key key schedule 3183# 3184{ my ($inp,$bits,$key) = @_4args; 3185 $bits =~ s/%r/%e/; 3186 3187$code.=<<___; 3188.globl ${PREFIX}_set_decrypt_key 3189.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent 3190.align 16 3191${PREFIX}_set_decrypt_key: 3192.cfi_startproc 3193 _CET_ENDBR 3194 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 3195.cfi_adjust_cfa_offset 8 3196 call __aesni_set_encrypt_key 3197 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key 3198 test %eax,%eax 3199 jnz .Ldec_key_ret 3200 lea 16($key,$bits),$inp # points at the end of key schedule 3201 3202 $movkey ($key),%xmm0 # just swap 3203 $movkey ($inp),%xmm1 3204 $movkey %xmm0,($inp) 3205 $movkey %xmm1,($key) 3206 lea 16($key),$key 3207 lea -16($inp),$inp 3208 3209.Ldec_key_inverse: 3210 $movkey ($key),%xmm0 # swap and inverse 3211 $movkey ($inp),%xmm1 3212 aesimc %xmm0,%xmm0 3213 aesimc %xmm1,%xmm1 3214 lea 16($key),$key 3215 lea -16($inp),$inp 3216 $movkey %xmm0,16($inp) 3217 $movkey %xmm1,-16($key) 3218 cmp $key,$inp 3219 ja .Ldec_key_inverse 3220 3221 $movkey ($key),%xmm0 # inverse middle 3222 aesimc %xmm0,%xmm0 3223 pxor %xmm1,%xmm1 3224 $movkey %xmm0,($inp) 3225 pxor %xmm0,%xmm0 3226.Ldec_key_ret: 3227 add \$8,%rsp 3228.cfi_adjust_cfa_offset -8 3229 ret 3230.cfi_endproc 3231.LSEH_end_set_decrypt_key: 3232.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key 3233___ 3234 3235# This is based on submission from Intel by 3236# Huang Ying 3237# Vinodh Gopal 3238# Kahraman Akdemir 3239# 3240# Aggressively optimized in respect to aeskeygenassist's critical 
path 3241# and is contained in %xmm0-5 to meet Win64 ABI requirement. 3242# 3243# int ${PREFIX}_set_encrypt_key(const unsigned char *inp, 3244# int bits, AES_KEY * const key); 3245# 3246# input: $inp user-supplied key 3247# $bits $inp length in bits 3248# $key pointer to key schedule 3249# output: %eax 0 denoting success, -1 or -2 - failure (see C) 3250# $bits rounds-1 (used in aesni_set_decrypt_key) 3251# *$key key schedule 3252# $key pointer to key schedule (used in 3253# aesni_set_decrypt_key) 3254# 3255# Subroutine is frame-less, which means that only volatile registers 3256# are used. Note that it's declared "abi-omnipotent", which means that 3257# amount of volatile registers is smaller on Windows. 3258# 3259$code.=<<___; 3260.globl ${PREFIX}_set_encrypt_key 3261.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent 3262.align 16 3263${PREFIX}_set_encrypt_key: 3264__aesni_set_encrypt_key: 3265.cfi_startproc 3266 _CET_ENDBR 3267#ifdef BORINGSSL_DISPATCH_TEST 3268 movb \$1,BORINGSSL_function_hit+3(%rip) 3269#endif 3270 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 3271.cfi_adjust_cfa_offset 8 3272 mov \$-1,%rax 3273 test $inp,$inp 3274 jz .Lenc_key_ret 3275 test $key,$key 3276 jz .Lenc_key_ret 3277 3278 movups ($inp),%xmm0 # pull first 128 bits of *userKey 3279 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 3280 leaq OPENSSL_ia32cap_P(%rip),%r10 3281 movl 4(%r10),%r10d 3282 and \$`1<<28|1<<11`,%r10d # AVX and XOP bits 3283 lea 16($key),%rax # %rax is used as modifiable copy of $key 3284 cmp \$256,$bits 3285 je .L14rounds 3286 cmp \$192,$bits 3287 je .L12rounds 3288 cmp \$128,$bits 3289 jne .Lbad_keybits 3290 3291.L10rounds: 3292 mov \$9,$bits # 10 rounds for 128-bit key 3293 cmp \$`1<<28`,%r10d # AVX, bit no XOP 3294 je .L10rounds_alt 3295 3296 $movkey %xmm0,($key) # round 0 3297 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 3298 call .Lkey_expansion_128_cold 3299 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 3300 call .Lkey_expansion_128 3301 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 3302 call .Lkey_expansion_128 3303 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 3304 call .Lkey_expansion_128 3305 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 3306 call .Lkey_expansion_128 3307 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 3308 call .Lkey_expansion_128 3309 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 3310 call .Lkey_expansion_128 3311 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 3312 call .Lkey_expansion_128 3313 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 3314 call .Lkey_expansion_128 3315 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 3316 call .Lkey_expansion_128 3317 $movkey %xmm0,(%rax) 3318 mov $bits,80(%rax) # 240(%rdx) 3319 xor %eax,%eax 3320 jmp .Lenc_key_ret 3321 3322.align 16 3323.L10rounds_alt: 3324 movdqa .Lkey_rotate(%rip),%xmm5 3325 mov \$8,%r10d 3326 movdqa .Lkey_rcon1(%rip),%xmm4 3327 movdqa %xmm0,%xmm2 3328 movdqu %xmm0,($key) 3329 jmp .Loop_key128 3330 3331.align 16 3332.Loop_key128: 3333 pshufb %xmm5,%xmm0 3334 aesenclast %xmm4,%xmm0 3335 pslld \$1,%xmm4 3336 lea 16(%rax),%rax 3337 3338 movdqa %xmm2,%xmm3 3339 pslldq \$4,%xmm2 3340 pxor %xmm2,%xmm3 3341 pslldq \$4,%xmm2 3342 pxor %xmm2,%xmm3 3343 pslldq \$4,%xmm2 3344 pxor %xmm3,%xmm2 3345 3346 pxor %xmm2,%xmm0 3347 movdqu %xmm0,-16(%rax) 3348 movdqa %xmm0,%xmm2 3349 3350 dec %r10d 3351 jnz .Loop_key128 3352 3353 movdqa .Lkey_rcon1b(%rip),%xmm4 3354 3355 pshufb %xmm5,%xmm0 3356 aesenclast %xmm4,%xmm0 3357 pslld \$1,%xmm4 3358 3359 movdqa %xmm2,%xmm3 3360 pslldq \$4,%xmm2 3361 pxor %xmm2,%xmm3 3362 pslldq \$4,%xmm2 3363 pxor 
%xmm2,%xmm3 3364 pslldq \$4,%xmm2 3365 pxor %xmm3,%xmm2 3366 3367 pxor %xmm2,%xmm0 3368 movdqu %xmm0,(%rax) 3369 3370 movdqa %xmm0,%xmm2 3371 pshufb %xmm5,%xmm0 3372 aesenclast %xmm4,%xmm0 3373 3374 movdqa %xmm2,%xmm3 3375 pslldq \$4,%xmm2 3376 pxor %xmm2,%xmm3 3377 pslldq \$4,%xmm2 3378 pxor %xmm2,%xmm3 3379 pslldq \$4,%xmm2 3380 pxor %xmm3,%xmm2 3381 3382 pxor %xmm2,%xmm0 3383 movdqu %xmm0,16(%rax) 3384 3385 mov $bits,96(%rax) # 240($key) 3386 xor %eax,%eax 3387 jmp .Lenc_key_ret 3388 3389.align 16 3390.L12rounds: 3391 movq 16($inp),%xmm2 # remaining 1/3 of *userKey 3392 mov \$11,$bits # 12 rounds for 192 3393 cmp \$`1<<28`,%r10d # AVX, but no XOP 3394 je .L12rounds_alt 3395 3396 $movkey %xmm0,($key) # round 0 3397 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 3398 call .Lkey_expansion_192a_cold 3399 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 3400 call .Lkey_expansion_192b 3401 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 3402 call .Lkey_expansion_192a 3403 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 3404 call .Lkey_expansion_192b 3405 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 3406 call .Lkey_expansion_192a 3407 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 3408 call .Lkey_expansion_192b 3409 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 3410 call .Lkey_expansion_192a 3411 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 3412 call .Lkey_expansion_192b 3413 $movkey %xmm0,(%rax) 3414 mov $bits,48(%rax) # 240(%rdx) 3415 xor %rax, %rax 3416 jmp .Lenc_key_ret 3417 3418.align 16 3419.L12rounds_alt: 3420 movdqa .Lkey_rotate192(%rip),%xmm5 3421 movdqa .Lkey_rcon1(%rip),%xmm4 3422 mov \$8,%r10d 3423 movdqu %xmm0,($key) 3424 jmp .Loop_key192 3425 3426.align 16 3427.Loop_key192: 3428 movq %xmm2,0(%rax) 3429 movdqa %xmm2,%xmm1 3430 pshufb %xmm5,%xmm2 3431 aesenclast %xmm4,%xmm2 3432 pslld \$1, %xmm4 3433 lea 24(%rax),%rax 3434 3435 movdqa %xmm0,%xmm3 3436 pslldq \$4,%xmm0 3437 pxor %xmm0,%xmm3 3438 pslldq \$4,%xmm0 3439 pxor %xmm0,%xmm3 3440 pslldq \$4,%xmm0 3441 pxor %xmm3,%xmm0 3442 3443 pshufd \$0xff,%xmm0,%xmm3 3444 pxor %xmm1,%xmm3 3445 pslldq \$4,%xmm1 3446 pxor %xmm1,%xmm3 3447 3448 pxor %xmm2,%xmm0 3449 pxor %xmm3,%xmm2 3450 movdqu %xmm0,-16(%rax) 3451 3452 dec %r10d 3453 jnz .Loop_key192 3454 3455 mov $bits,32(%rax) # 240($key) 3456 xor %eax,%eax 3457 jmp .Lenc_key_ret 3458 3459.align 16 3460.L14rounds: 3461 movups 16($inp),%xmm2 # remaining half of *userKey 3462 mov \$13,$bits # 14 rounds for 256 3463 lea 16(%rax),%rax 3464 cmp \$`1<<28`,%r10d # AVX, but no XOP 3465 je .L14rounds_alt 3466 3467 $movkey %xmm0,($key) # round 0 3468 $movkey %xmm2,16($key) # round 1 3469 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 3470 call .Lkey_expansion_256a_cold 3471 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 3472 call .Lkey_expansion_256b 3473 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 3474 call .Lkey_expansion_256a 3475 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 3476 call .Lkey_expansion_256b 3477 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 3478 call .Lkey_expansion_256a 3479 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 3480 call .Lkey_expansion_256b 3481 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 3482 call .Lkey_expansion_256a 3483 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 3484 call .Lkey_expansion_256b 3485 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 3486 call .Lkey_expansion_256a 3487 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 3488 call .Lkey_expansion_256b 3489 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 3490 call .Lkey_expansion_256a 3491 aeskeygenassist \$0x20,%xmm0,%xmm1 
# round 13 3492 call .Lkey_expansion_256b 3493 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 3494 call .Lkey_expansion_256a 3495 $movkey %xmm0,(%rax) 3496 mov $bits,16(%rax) # 240(%rdx) 3497 xor %rax,%rax 3498 jmp .Lenc_key_ret 3499 3500.align 16 3501.L14rounds_alt: 3502 movdqa .Lkey_rotate(%rip),%xmm5 3503 movdqa .Lkey_rcon1(%rip),%xmm4 3504 mov \$7,%r10d 3505 movdqu %xmm0,0($key) 3506 movdqa %xmm2,%xmm1 3507 movdqu %xmm2,16($key) 3508 jmp .Loop_key256 3509 3510.align 16 3511.Loop_key256: 3512 pshufb %xmm5,%xmm2 3513 aesenclast %xmm4,%xmm2 3514 3515 movdqa %xmm0,%xmm3 3516 pslldq \$4,%xmm0 3517 pxor %xmm0,%xmm3 3518 pslldq \$4,%xmm0 3519 pxor %xmm0,%xmm3 3520 pslldq \$4,%xmm0 3521 pxor %xmm3,%xmm0 3522 pslld \$1,%xmm4 3523 3524 pxor %xmm2,%xmm0 3525 movdqu %xmm0,(%rax) 3526 3527 dec %r10d 3528 jz .Ldone_key256 3529 3530 pshufd \$0xff,%xmm0,%xmm2 3531 pxor %xmm3,%xmm3 3532 aesenclast %xmm3,%xmm2 3533 3534 movdqa %xmm1,%xmm3 3535 pslldq \$4,%xmm1 3536 pxor %xmm1,%xmm3 3537 pslldq \$4,%xmm1 3538 pxor %xmm1,%xmm3 3539 pslldq \$4,%xmm1 3540 pxor %xmm3,%xmm1 3541 3542 pxor %xmm1,%xmm2 3543 movdqu %xmm2,16(%rax) 3544 lea 32(%rax),%rax 3545 movdqa %xmm2,%xmm1 3546 3547 jmp .Loop_key256 3548 3549.Ldone_key256: 3550 mov $bits,16(%rax) # 240($key) 3551 xor %eax,%eax 3552 jmp .Lenc_key_ret 3553 3554.align 16 3555.Lbad_keybits: 3556 mov \$-2,%rax 3557.Lenc_key_ret: 3558 pxor %xmm0,%xmm0 3559 pxor %xmm1,%xmm1 3560 pxor %xmm2,%xmm2 3561 pxor %xmm3,%xmm3 3562 pxor %xmm4,%xmm4 3563 pxor %xmm5,%xmm5 3564 add \$8,%rsp 3565.cfi_adjust_cfa_offset -8 3566 ret 3567.cfi_endproc 3568.LSEH_end_set_encrypt_key: 3569 3570.align 16 3571.Lkey_expansion_128: 3572 $movkey %xmm0,(%rax) 3573 lea 16(%rax),%rax 3574.Lkey_expansion_128_cold: 3575 shufps \$0b00010000,%xmm0,%xmm4 3576 xorps %xmm4, %xmm0 3577 shufps \$0b10001100,%xmm0,%xmm4 3578 xorps %xmm4, %xmm0 3579 shufps \$0b11111111,%xmm1,%xmm1 # critical path 3580 xorps %xmm1,%xmm0 3581 ret 3582 3583.align 16 3584.Lkey_expansion_192a: 3585 $movkey %xmm0,(%rax) 3586 lea 16(%rax),%rax 3587.Lkey_expansion_192a_cold: 3588 movaps %xmm2, %xmm5 3589.Lkey_expansion_192b_warm: 3590 shufps \$0b00010000,%xmm0,%xmm4 3591 movdqa %xmm2,%xmm3 3592 xorps %xmm4,%xmm0 3593 shufps \$0b10001100,%xmm0,%xmm4 3594 pslldq \$4,%xmm3 3595 xorps %xmm4,%xmm0 3596 pshufd \$0b01010101,%xmm1,%xmm1 # critical path 3597 pxor %xmm3,%xmm2 3598 pxor %xmm1,%xmm0 3599 pshufd \$0b11111111,%xmm0,%xmm3 3600 pxor %xmm3,%xmm2 3601 ret 3602 3603.align 16 3604.Lkey_expansion_192b: 3605 movaps %xmm0,%xmm3 3606 shufps \$0b01000100,%xmm0,%xmm5 3607 $movkey %xmm5,(%rax) 3608 shufps \$0b01001110,%xmm2,%xmm3 3609 $movkey %xmm3,16(%rax) 3610 lea 32(%rax),%rax 3611 jmp .Lkey_expansion_192b_warm 3612 3613.align 16 3614.Lkey_expansion_256a: 3615 $movkey %xmm2,(%rax) 3616 lea 16(%rax),%rax 3617.Lkey_expansion_256a_cold: 3618 shufps \$0b00010000,%xmm0,%xmm4 3619 xorps %xmm4,%xmm0 3620 shufps \$0b10001100,%xmm0,%xmm4 3621 xorps %xmm4,%xmm0 3622 shufps \$0b11111111,%xmm1,%xmm1 # critical path 3623 xorps %xmm1,%xmm0 3624 ret 3625 3626.align 16 3627.Lkey_expansion_256b: 3628 $movkey %xmm0,(%rax) 3629 lea 16(%rax),%rax 3630 3631 shufps \$0b00010000,%xmm2,%xmm4 3632 xorps %xmm4,%xmm2 3633 shufps \$0b10001100,%xmm2,%xmm4 3634 xorps %xmm4,%xmm2 3635 shufps \$0b10101010,%xmm1,%xmm1 # critical path 3636 xorps %xmm1,%xmm2 3637 ret 3638.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key 3639.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key 3640___ 3641} 3642 3643$code.=<<___; 3644.section .rodata 3645.align 64 
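#
# A short, partly inferred gloss on the constants below:
# .Lbswap_mask	 - pshufb mask reversing the byte order of a 128-bit lane,
#		   used to byte-swap the CTR counter block
# .Lincrement32	 - counter increment of 6 in three dword lanes (multi-block CTR)
# .Lincrement64	 - quadword increment of one
# .Lxts_magic	 - the GF(2^128) reduction constant 0x87 used when doubling the
#		   XTS tweak, laid out for the vectorized carry propagation
# .Lincrement1	 - the value one stored big-endian
# .Lkey_rotate*	 - pshufb masks that broadcast the rotated last word of the
#		   previous round key into every lane, so that aesenclast with
#		   the .Lkey_rcon1/.Lkey_rcon1b round constants can perform the
#		   SubWord step in the aeskeygenassist-free key schedule
#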
3646.Lbswap_mask: 3647 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 3648.Lincrement32: 3649 .long 6,6,6,0 3650.Lincrement64: 3651 .long 1,0,0,0 3652.Lxts_magic: 3653 .long 0x87,0,1,0 3654.Lincrement1: 3655 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 3656.Lkey_rotate: 3657 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d 3658.Lkey_rotate192: 3659 .long 0x04070605,0x04070605,0x04070605,0x04070605 3660.Lkey_rcon1: 3661 .long 1,1,1,1 3662.Lkey_rcon1b: 3663 .long 0x1b,0x1b,0x1b,0x1b 3664 3665.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" 3666.align 64 3667.text 3668___ 3669 3670# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3671# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3672if ($win64) { 3673$rec="%rcx"; 3674$frame="%rdx"; 3675$context="%r8"; 3676$disp="%r9"; 3677 3678$code.=<<___; 3679.extern __imp_RtlVirtualUnwind 3680___ 3681$code.=<<___ if ($PREFIX eq "aes_hw"); 3682.type ecb_ccm64_se_handler,\@abi-omnipotent 3683.align 16 3684ecb_ccm64_se_handler: 3685 push %rsi 3686 push %rdi 3687 push %rbx 3688 push %rbp 3689 push %r12 3690 push %r13 3691 push %r14 3692 push %r15 3693 pushfq 3694 sub \$64,%rsp 3695 3696 mov 120($context),%rax # pull context->Rax 3697 mov 248($context),%rbx # pull context->Rip 3698 3699 mov 8($disp),%rsi # disp->ImageBase 3700 mov 56($disp),%r11 # disp->HandlerData 3701 3702 mov 0(%r11),%r10d # HandlerData[0] 3703 lea (%rsi,%r10),%r10 # prologue label 3704 cmp %r10,%rbx # context->Rip<prologue label 3705 jb .Lcommon_seh_tail 3706 3707 mov 152($context),%rax # pull context->Rsp 3708 3709 mov 4(%r11),%r10d # HandlerData[1] 3710 lea (%rsi,%r10),%r10 # epilogue label 3711 cmp %r10,%rbx # context->Rip>=epilogue label 3712 jae .Lcommon_seh_tail 3713 3714 lea 0(%rax),%rsi # %xmm save area 3715 lea 512($context),%rdi # &context.Xmm6 3716 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) 3717 .long 0xa548f3fc # cld; rep movsq 3718 lea 0x58(%rax),%rax # adjust stack pointer 3719 3720 jmp .Lcommon_seh_tail 3721.size ${PREFIX}_ccm64_se_handler,.-${PREFIX}_ccm64_se_handler 3722 3723.type ctr_xts_se_handler,\@abi-omnipotent 3724.align 16 3725ctr_xts_se_handler: 3726 push %rsi 3727 push %rdi 3728 push %rbx 3729 push %rbp 3730 push %r12 3731 push %r13 3732 push %r14 3733 push %r15 3734 pushfq 3735 sub \$64,%rsp 3736 3737 mov 120($context),%rax # pull context->Rax 3738 mov 248($context),%rbx # pull context->Rip 3739 3740 mov 8($disp),%rsi # disp->ImageBase 3741 mov 56($disp),%r11 # disp->HandlerData 3742 3743 mov 0(%r11),%r10d # HandlerData[0] 3744 lea (%rsi,%r10),%r10 # prologue lable 3745 cmp %r10,%rbx # context->Rip<prologue label 3746 jb .Lcommon_seh_tail 3747 3748 mov 152($context),%rax # pull context->Rsp 3749 3750 mov 4(%r11),%r10d # HandlerData[1] 3751 lea (%rsi,%r10),%r10 # epilogue label 3752 cmp %r10,%rbx # context->Rip>=epilogue label 3753 jae .Lcommon_seh_tail 3754 3755 mov 208($context),%rax # pull context->R11 3756 3757 lea -0xa8(%rax),%rsi # %xmm save area 3758 lea 512($context),%rdi # & context.Xmm6 3759 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 3760 .long 0xa548f3fc # cld; rep movsq 3761 3762 mov -8(%rax),%rbp # restore saved %rbp 3763 mov %rbp,160($context) # restore context->Rbp 3764 jmp .Lcommon_seh_tail 3765.size ctr_xts_se_handler,.-ctr_xts_se_handler 3766 3767___ 3768# BoringSSL omits the OCB functions. 
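# The SEH handlers here all follow roughly the same pattern: during Win64
# stack unwinding they compare context->Rip against the routine's prologue
# and epilogue labels (taken from HandlerData or loaded directly) to decide
# whether the frame is live, copy the offloaded %xmm registers back into the
# CONTEXT structure, restore %rbp where applicable, and then fall through to
# .Lcommon_seh_tail, which fixes up %rsp, %rsi and %rdi and hands the rest
# of the unwind to RtlVirtualUnwind.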
3769$code.=<<___ if (0); 3770.type ocb_se_handler,\@abi-omnipotent 3771.align 16 3772ocb_se_handler: 3773 push %rsi 3774 push %rdi 3775 push %rbx 3776 push %rbp 3777 push %r12 3778 push %r13 3779 push %r14 3780 push %r15 3781 pushfq 3782 sub \$64,%rsp 3783 3784 mov 120($context),%rax # pull context->Rax 3785 mov 248($context),%rbx # pull context->Rip 3786 3787 mov 8($disp),%rsi # disp->ImageBase 3788 mov 56($disp),%r11 # disp->HandlerData 3789 3790 mov 0(%r11),%r10d # HandlerData[0] 3791 lea (%rsi,%r10),%r10 # prologue lable 3792 cmp %r10,%rbx # context->Rip<prologue label 3793 jb .Lcommon_seh_tail 3794 3795 mov 4(%r11),%r10d # HandlerData[1] 3796 lea (%rsi,%r10),%r10 # epilogue label 3797 cmp %r10,%rbx # context->Rip>=epilogue label 3798 jae .Lcommon_seh_tail 3799 3800 mov 8(%r11),%r10d # HandlerData[2] 3801 lea (%rsi,%r10),%r10 3802 cmp %r10,%rbx # context->Rip>=pop label 3803 jae .Locb_no_xmm 3804 3805 mov 152($context),%rax # pull context->Rsp 3806 3807 lea (%rax),%rsi # %xmm save area 3808 lea 512($context),%rdi # & context.Xmm6 3809 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 3810 .long 0xa548f3fc # cld; rep movsq 3811 lea 0xa0+0x28(%rax),%rax 3812 3813.Locb_no_xmm: 3814 mov -8(%rax),%rbx 3815 mov -16(%rax),%rbp 3816 mov -24(%rax),%r12 3817 mov -32(%rax),%r13 3818 mov -40(%rax),%r14 3819 3820 mov %rbx,144($context) # restore context->Rbx 3821 mov %rbp,160($context) # restore context->Rbp 3822 mov %r12,216($context) # restore context->R12 3823 mov %r13,224($context) # restore context->R13 3824 mov %r14,232($context) # restore context->R14 3825 3826 jmp .Lcommon_seh_tail 3827.size ocb_se_handler,.-ocb_se_handler 3828___ 3829$code.=<<___; 3830.type cbc_se_handler,\@abi-omnipotent 3831.align 16 3832cbc_se_handler: 3833 push %rsi 3834 push %rdi 3835 push %rbx 3836 push %rbp 3837 push %r12 3838 push %r13 3839 push %r14 3840 push %r15 3841 pushfq 3842 sub \$64,%rsp 3843 3844 mov 152($context),%rax # pull context->Rsp 3845 mov 248($context),%rbx # pull context->Rip 3846 3847 lea .Lcbc_decrypt_bulk(%rip),%r10 3848 cmp %r10,%rbx # context->Rip<"prologue" label 3849 jb .Lcommon_seh_tail 3850 3851 mov 120($context),%rax # pull context->Rax 3852 3853 lea .Lcbc_decrypt_body(%rip),%r10 3854 cmp %r10,%rbx # context->Rip<cbc_decrypt_body 3855 jb .Lcommon_seh_tail 3856 3857 mov 152($context),%rax # pull context->Rsp 3858 3859 lea .Lcbc_ret(%rip),%r10 3860 cmp %r10,%rbx # context->Rip>="epilogue" label 3861 jae .Lcommon_seh_tail 3862 3863 lea 16(%rax),%rsi # %xmm save area 3864 lea 512($context),%rdi # &context.Xmm6 3865 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 3866 .long 0xa548f3fc # cld; rep movsq 3867 3868 mov 208($context),%rax # pull context->R11 3869 3870 mov -8(%rax),%rbp # restore saved %rbp 3871 mov %rbp,160($context) # restore context->Rbp 3872 3873.Lcommon_seh_tail: 3874 mov 8(%rax),%rdi 3875 mov 16(%rax),%rsi 3876 mov %rax,152($context) # restore context->Rsp 3877 mov %rsi,168($context) # restore context->Rsi 3878 mov %rdi,176($context) # restore context->Rdi 3879 3880 mov 40($disp),%rdi # disp->ContextRecord 3881 mov $context,%rsi # context 3882 mov \$154,%ecx # sizeof(CONTEXT) 3883 .long 0xa548f3fc # cld; rep movsq 3884 3885 mov $disp,%rsi 3886 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3887 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3888 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3889 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3890 mov 40(%rsi),%r10 # disp->ContextRecord 3891 lea 56(%rsi),%r11 # &disp->HandlerData 3892 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3893 mov %r10,32(%rsp) # 
arg5 3894 mov %r11,40(%rsp) # arg6 3895 mov %r12,48(%rsp) # arg7 3896 mov %rcx,56(%rsp) # arg8, (NULL) 3897 call *__imp_RtlVirtualUnwind(%rip) 3898 3899 mov \$1,%eax # ExceptionContinueSearch 3900 add \$64,%rsp 3901 popfq 3902 pop %r15 3903 pop %r14 3904 pop %r13 3905 pop %r12 3906 pop %rbp 3907 pop %rbx 3908 pop %rdi 3909 pop %rsi 3910 ret 3911.size cbc_se_handler,.-cbc_se_handler 3912 3913.section .pdata 3914.align 4 3915___ 3916$code.=<<___ if ($PREFIX eq "aes_hw"); 3917 .rva .LSEH_begin_${PREFIX}_ecb_encrypt 3918 .rva .LSEH_end_${PREFIX}_ecb_encrypt 3919 .rva .LSEH_info_ecb 3920 3921 .rva .LSEH_begin_${PREFIX}_ctr32_encrypt_blocks 3922 .rva .LSEH_end_${PREFIX}_ctr32_encrypt_blocks 3923 .rva .LSEH_info_ctr32 3924___ 3925$code.=<<___; 3926 .rva .LSEH_begin_${PREFIX}_cbc_encrypt 3927 .rva .LSEH_end_${PREFIX}_cbc_encrypt 3928 .rva .LSEH_info_cbc 3929 3930 .rva ${PREFIX}_set_decrypt_key 3931 .rva .LSEH_end_set_decrypt_key 3932 .rva .LSEH_info_key 3933 3934 .rva ${PREFIX}_set_encrypt_key 3935 .rva .LSEH_end_set_encrypt_key 3936 .rva .LSEH_info_key 3937.section .xdata 3938.align 8 3939___ 3940$code.=<<___ if ($PREFIX eq "aes_hw"); 3941.LSEH_info_ecb: 3942 .byte 9,0,0,0 3943 .rva ecb_ccm64_se_handler 3944 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[] 3945.LSEH_info_ctr32: 3946 .byte 9,0,0,0 3947 .rva ctr_xts_se_handler 3948 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] 3949___ 3950$code.=<<___; 3951.LSEH_info_cbc: 3952 .byte 9,0,0,0 3953 .rva cbc_se_handler 3954.LSEH_info_key: 3955 .byte 0x01,0x04,0x01,0x00 3956 .byte 0x04,0x02,0x00,0x00 # sub rsp,8 3957___ 3958} 3959 3960sub rex { 3961 local *opcode=shift; 3962 my ($dst,$src)=@_; 3963 my $rex=0; 3964 3965 $rex|=0x04 if($dst>=8); 3966 $rex|=0x01 if($src>=8); 3967 push @opcode,$rex|0x40 if($rex); 3968} 3969 3970sub aesni { 3971 my $line=shift; 3972 my @opcode=(0x66); 3973 3974 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { 3975 rex(\@opcode,$4,$3); 3976 push @opcode,0x0f,0x3a,0xdf; 3977 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M 3978 my $c=$2; 3979 push @opcode,$c=~/^0/?oct($c):$c; 3980 return ".byte\t".join(',',@opcode); 3981 } 3982 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { 3983 my %opcodelet = ( 3984 "aesimc" => 0xdb, 3985 "aesenc" => 0xdc, "aesenclast" => 0xdd, 3986 "aesdec" => 0xde, "aesdeclast" => 0xdf 3987 ); 3988 return undef if (!defined($opcodelet{$1})); 3989 rex(\@opcode,$3,$2); 3990 push @opcode,0x0f,0x38,$opcodelet{$1}; 3991 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M 3992 return ".byte\t".join(',',@opcode); 3993 } 3994 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { 3995 my %opcodelet = ( 3996 "aesenc" => 0xdc, "aesenclast" => 0xdd, 3997 "aesdec" => 0xde, "aesdeclast" => 0xdf 3998 ); 3999 return undef if (!defined($opcodelet{$1})); 4000 my $off = $2; 4001 push @opcode,0x44 if ($3>=8); 4002 push @opcode,0x0f,0x38,$opcodelet{$1}; 4003 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M 4004 push @opcode,($off=~/^0/?oct($off):$off)&0xff; 4005 return ".byte\t".join(',',@opcode); 4006 } 4007 return $line; 4008} 4009 4010sub movbe { 4011 ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift; 4012} 4013 4014$code =~ s/\`([^\`]*)\`/eval($1)/gem; 4015$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; 4016#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact 4017$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem; 4018 4019print $code; 4020 4021close STDOUT or die "error closing STDOUT: $!"; 4022