1#! /usr/bin/env perl 2# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# This module implements support for Intel AES-NI extension. In 18# OpenSSL context it's used with Intel engine, but can also be used as 19# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for 20# details]. 21# 22# Performance. 23# 24# Given aes(enc|dec) instructions' latency asymptotic performance for 25# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte 26# processed with 128-bit key. And given their throughput asymptotic 27# performance for parallelizable modes is 1.25 cycles per byte. Being 28# asymptotic limit it's not something you commonly achieve in reality, 29# but how close does one get? Below are results collected for 30# different modes and block sized. Pairs of numbers are for en-/ 31# decryption. 32# 33# 16-byte 64-byte 256-byte 1-KB 8-KB 34# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 35# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 36# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 37# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 38# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 39# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 40# 41# ECB, CTR, CBC and CCM results are free from EVP overhead. This means 42# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni 43# [-decrypt]' will exhibit 10-15% worse results for smaller blocks. 44# The results were collected with specially crafted speed.c benchmark 45# in order to compare them with results reported in "Intel Advanced 46# Encryption Standard (AES) New Instruction Set" White Paper Revision 47# 3.0 dated May 2010. All above results are consistently better. This 48# module also provides better performance for block sizes smaller than 49# 128 bytes in points *not* represented in the above table. 50# 51# Looking at the results for 8-KB buffer. 52# 53# CFB and OFB results are far from the limit, because implementation 54# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on 55# single-block aesni_encrypt, which is not the most optimal way to go. 56# CBC encrypt result is unexpectedly high and there is no documented 57# explanation for it. Seemingly there is a small penalty for feeding 58# the result back to AES unit the way it's done in CBC mode. There is 59# nothing one can do and the result appears optimal. CCM result is 60# identical to CBC, because CBC-MAC is essentially CBC encrypt without 61# saving output. CCM CTR "stays invisible," because it's neatly 62# interleaved wih CBC-MAC. This provides ~30% improvement over 63# "straightforward" CCM implementation with CTR and CBC-MAC performed 64# disjointly. Parallelizable modes practically achieve the theoretical 65# limit. 66# 67# Looking at how results vary with buffer size. 68# 69# Curves are practically saturated at 1-KB buffer size. In most cases 70# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one. 71# CTR curve doesn't follow this pattern and is "slowest" changing one 72# with "256-byte" result being 87% of "8-KB." This is because overhead 73# in CTR mode is most computationally intensive. Small-block CCM 74# decrypt is slower than encrypt, because first CTR and last CBC-MAC 75# iterations can't be interleaved. 76# 77# Results for 192- and 256-bit keys. 78# 79# EVP-free results were observed to scale perfectly with number of 80# rounds for larger block sizes, i.e. 192-bit result being 10/12 times 81# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences 82# are a tad smaller, because the above mentioned penalty biases all 83# results by same constant value. In similar way function call 84# overhead affects small-block performance, as well as OFB and CFB 85# results. Differences are not large, most common coefficients are 86# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one 87# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... 88 89# January 2011 90# 91# While Westmere processor features 6 cycles latency for aes[enc|dec] 92# instructions, which can be scheduled every second cycle, Sandy 93# Bridge spends 8 cycles per instruction, but it can schedule them 94# every cycle. This means that code targeting Westmere would perform 95# suboptimally on Sandy Bridge. Therefore this update. 96# 97# In addition, non-parallelizable CBC encrypt (as well as CCM) is 98# optimized. Relative improvement might appear modest, 8% on Westmere, 99# but in absolute terms it's 3.77 cycles per byte encrypted with 100# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers 101# should be compared to asymptotic limits of 3.75 for Westmere and 102# 5.00 for Sandy Bridge. Actually, the fact that they get this close 103# to asymptotic limits is quite amazing. Indeed, the limit is 104# calculated as latency times number of rounds, 10 for 128-bit key, 105# and divided by 16, the number of bytes in block, or in other words 106# it accounts *solely* for aesenc instructions. But there are extra 107# instructions, and numbers so close to the asymptotic limits mean 108# that it's as if it takes as little as *one* additional cycle to 109# execute all of them. How is it possible? It is possible thanks to 110# out-of-order execution logic, which manages to overlap post- 111# processing of previous block, things like saving the output, with 112# actual encryption of current block, as well as pre-processing of 113# current block, things like fetching input and xor-ing it with 114# 0-round element of the key schedule, with actual encryption of 115# previous block. Keep this in mind... 116# 117# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher 118# performance is achieved by interleaving instructions working on 119# independent blocks. In which case asymptotic limit for such modes 120# can be obtained by dividing above mentioned numbers by AES 121# instructions' interleave factor. Westmere can execute at most 3 122# instructions at a time, meaning that optimal interleave factor is 3, 123# and that's where the "magic" number of 1.25 come from. "Optimal 124# interleave factor" means that increase of interleave factor does 125# not improve performance. The formula has proven to reflect reality 126# pretty well on Westmere... Sandy Bridge on the other hand can 127# execute up to 8 AES instructions at a time, so how does varying 128# interleave factor affect the performance? Here is table for ECB 129# (numbers are cycles per byte processed with 128-bit key): 130# 131# instruction interleave factor 3x 6x 8x 132# theoretical asymptotic limit 1.67 0.83 0.625 133# measured performance for 8KB block 1.05 0.86 0.84 134# 135# "as if" interleave factor 4.7x 5.8x 6.0x 136# 137# Further data for other parallelizable modes: 138# 139# CBC decrypt 1.16 0.93 0.74 140# CTR 1.14 0.91 0.74 141# 142# Well, given 3x column it's probably inappropriate to call the limit 143# asymptotic, if it can be surpassed, isn't it? What happens there? 144# Rewind to CBC paragraph for the answer. Yes, out-of-order execution 145# magic is responsible for this. Processor overlaps not only the 146# additional instructions with AES ones, but even AES instructions 147# processing adjacent triplets of independent blocks. In the 6x case 148# additional instructions still claim disproportionally small amount 149# of additional cycles, but in 8x case number of instructions must be 150# a tad too high for out-of-order logic to cope with, and AES unit 151# remains underutilized... As you can see 8x interleave is hardly 152# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl 153# utilizes 6x interleave because of limited register bank capacity. 154# 155# Higher interleave factors do have negative impact on Westmere 156# performance. While for ECB mode it's negligible ~1.5%, other 157# parallelizables perform ~5% worse, which is outweighed by ~25% 158# improvement on Sandy Bridge. To balance regression on Westmere 159# CTR mode was implemented with 6x aesenc interleave factor. 160 161# April 2011 162# 163# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing 164# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like 165# in CTR mode AES instruction interleave factor was chosen to be 6x. 166 167# November 2015 168# 169# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was 170# chosen to be 6x. 171 172###################################################################### 173# Current large-block performance in cycles per byte processed with 174# 128-bit key (less is better). 175# 176# CBC en-/decrypt CTR XTS ECB OCB 177# Westmere 3.77/1.25 1.25 1.25 1.26 178# * Bridge 5.07/0.74 0.75 0.90 0.85 0.98 179# Haswell 4.44/0.63 0.63 0.73 0.63 0.70 180# Skylake 2.62/0.63 0.63 0.63 0.63 181# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11 182# Knights L 2.54/0.77 0.78 0.85 - 1.50 183# Goldmont 3.82/1.26 1.26 1.29 1.29 1.50 184# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95 185# Ryzen 2.71/0.35 0.35 0.44 0.38 0.49 186# 187# (*) Atom Silvermont ECB result is suboptimal because of penalties 188# incurred by operations on %xmm8-15. As ECB is not considered 189# critical, nothing was done to mitigate the problem. 190 191$PREFIX="aes_hw"; # if $PREFIX is set to "AES", the script 192 # generates drop-in replacement for 193 # crypto/aes/asm/aes-x86_64.pl:-) 194 195$flavour = shift; 196$output = shift; 197if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 198 199$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 200 201$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 202( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 203( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or 204die "can't locate x86_64-xlate.pl"; 205 206open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 207*STDOUT=*OUT; 208 209$movkey = $PREFIX eq "aes_hw" ? "movups" : "movups"; 210@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order 211 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 212 213$code=".text\n"; 214$code.=".extern OPENSSL_ia32cap_P\n"; 215 216$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!! 217# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ... 218$inp="%rdi"; 219$out="%rsi"; 220$len="%rdx"; 221$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! 222$ivp="%r8"; # cbc, ctr, ... 223 224$rnds_="%r10d"; # backup copy for $rounds 225$key_="%r11"; # backup copy for $key 226 227# %xmm register layout 228$rndkey0="%xmm0"; $rndkey1="%xmm1"; 229$inout0="%xmm2"; $inout1="%xmm3"; 230$inout2="%xmm4"; $inout3="%xmm5"; 231$inout4="%xmm6"; $inout5="%xmm7"; 232$inout6="%xmm8"; $inout7="%xmm9"; 233 234$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ... 235$in0="%xmm8"; $iv="%xmm9"; 236 237# Inline version of internal aesni_[en|de]crypt1. 238# 239# Why folded loop? Because aes[enc|dec] is slow enough to accommodate 240# cycles which take care of loop variables... 241{ my $sn; 242sub aesni_generate1 { 243my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); 244++$sn; 245$code.=<<___; 246 $movkey ($key),$rndkey0 247 $movkey 16($key),$rndkey1 248___ 249$code.=<<___ if (defined($ivec)); 250 xorps $rndkey0,$ivec 251 lea 32($key),$key 252 xorps $ivec,$inout 253___ 254$code.=<<___ if (!defined($ivec)); 255 lea 32($key),$key 256 xorps $rndkey0,$inout 257___ 258$code.=<<___; 259.Loop_${p}1_$sn: 260 aes${p} $rndkey1,$inout 261 dec $rounds 262 $movkey ($key),$rndkey1 263 lea 16($key),$key 264 jnz .Loop_${p}1_$sn # loop body is 16 bytes 265 aes${p}last $rndkey1,$inout 266___ 267}} 268# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key); 269# 270{ my ($inp,$out,$key) = @_4args; 271 272$code.=<<___; 273.globl ${PREFIX}_encrypt 274.type ${PREFIX}_encrypt,\@abi-omnipotent 275.align 16 276${PREFIX}_encrypt: 277.cfi_startproc 278#ifdef BORINGSSL_DISPATCH_TEST 279.extern BORINGSSL_function_hit 280 movb \$1,BORINGSSL_function_hit+1(%rip) 281#endif 282 movups ($inp),$inout0 # load input 283 mov 240($key),$rounds # key->rounds 284___ 285 &aesni_generate1("enc",$key,$rounds); 286$code.=<<___; 287 pxor $rndkey0,$rndkey0 # clear register bank 288 pxor $rndkey1,$rndkey1 289 movups $inout0,($out) # output 290 pxor $inout0,$inout0 291 ret 292.cfi_endproc 293.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt 294 295.globl ${PREFIX}_decrypt 296.type ${PREFIX}_decrypt,\@abi-omnipotent 297.align 16 298${PREFIX}_decrypt: 299.cfi_startproc 300 movups ($inp),$inout0 # load input 301 mov 240($key),$rounds # key->rounds 302___ 303 &aesni_generate1("dec",$key,$rounds); 304$code.=<<___; 305 pxor $rndkey0,$rndkey0 # clear register bank 306 pxor $rndkey1,$rndkey1 307 movups $inout0,($out) # output 308 pxor $inout0,$inout0 309 ret 310.cfi_endproc 311.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt 312___ 313} 314 315# _aesni_[en|de]cryptN are private interfaces, N denotes interleave 316# factor. Why 3x subroutine were originally used in loops? Even though 317# aes[enc|dec] latency was originally 6, it could be scheduled only 318# every *2nd* cycle. Thus 3x interleave was the one providing optimal 319# utilization, i.e. when subroutine's throughput is virtually same as 320# of non-interleaved subroutine [for number of input blocks up to 3]. 321# This is why it originally made no sense to implement 2x subroutine. 322# But times change and it became appropriate to spend extra 192 bytes 323# on 2x subroutine on Atom Silvermont account. For processors that 324# can schedule aes[enc|dec] every cycle optimal interleave factor 325# equals to corresponding instructions latency. 8x is optimal for 326# * Bridge and "super-optimal" for other Intel CPUs... 327 328sub aesni_generate2 { 329my $dir=shift; 330# As already mentioned it takes in $key and $rounds, which are *not* 331# preserved. $inout[0-1] is cipher/clear text... 332$code.=<<___; 333.type _aesni_${dir}rypt2,\@abi-omnipotent 334.align 16 335_aesni_${dir}rypt2: 336.cfi_startproc 337 $movkey ($key),$rndkey0 338 shl \$4,$rounds 339 $movkey 16($key),$rndkey1 340 xorps $rndkey0,$inout0 341 xorps $rndkey0,$inout1 342 $movkey 32($key),$rndkey0 343 lea 32($key,$rounds),$key 344 neg %rax # $rounds 345 add \$16,%rax 346 347.L${dir}_loop2: 348 aes${dir} $rndkey1,$inout0 349 aes${dir} $rndkey1,$inout1 350 $movkey ($key,%rax),$rndkey1 351 add \$32,%rax 352 aes${dir} $rndkey0,$inout0 353 aes${dir} $rndkey0,$inout1 354 $movkey -16($key,%rax),$rndkey0 355 jnz .L${dir}_loop2 356 357 aes${dir} $rndkey1,$inout0 358 aes${dir} $rndkey1,$inout1 359 aes${dir}last $rndkey0,$inout0 360 aes${dir}last $rndkey0,$inout1 361 ret 362.cfi_endproc 363.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2 364___ 365} 366sub aesni_generate3 { 367my $dir=shift; 368# As already mentioned it takes in $key and $rounds, which are *not* 369# preserved. $inout[0-2] is cipher/clear text... 370$code.=<<___; 371.type _aesni_${dir}rypt3,\@abi-omnipotent 372.align 16 373_aesni_${dir}rypt3: 374.cfi_startproc 375 $movkey ($key),$rndkey0 376 shl \$4,$rounds 377 $movkey 16($key),$rndkey1 378 xorps $rndkey0,$inout0 379 xorps $rndkey0,$inout1 380 xorps $rndkey0,$inout2 381 $movkey 32($key),$rndkey0 382 lea 32($key,$rounds),$key 383 neg %rax # $rounds 384 add \$16,%rax 385 386.L${dir}_loop3: 387 aes${dir} $rndkey1,$inout0 388 aes${dir} $rndkey1,$inout1 389 aes${dir} $rndkey1,$inout2 390 $movkey ($key,%rax),$rndkey1 391 add \$32,%rax 392 aes${dir} $rndkey0,$inout0 393 aes${dir} $rndkey0,$inout1 394 aes${dir} $rndkey0,$inout2 395 $movkey -16($key,%rax),$rndkey0 396 jnz .L${dir}_loop3 397 398 aes${dir} $rndkey1,$inout0 399 aes${dir} $rndkey1,$inout1 400 aes${dir} $rndkey1,$inout2 401 aes${dir}last $rndkey0,$inout0 402 aes${dir}last $rndkey0,$inout1 403 aes${dir}last $rndkey0,$inout2 404 ret 405.cfi_endproc 406.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3 407___ 408} 409# 4x interleave is implemented to improve small block performance, 410# most notably [and naturally] 4 block by ~30%. One can argue that one 411# should have implemented 5x as well, but improvement would be <20%, 412# so it's not worth it... 413sub aesni_generate4 { 414my $dir=shift; 415# As already mentioned it takes in $key and $rounds, which are *not* 416# preserved. $inout[0-3] is cipher/clear text... 417$code.=<<___; 418.type _aesni_${dir}rypt4,\@abi-omnipotent 419.align 16 420_aesni_${dir}rypt4: 421.cfi_startproc 422 $movkey ($key),$rndkey0 423 shl \$4,$rounds 424 $movkey 16($key),$rndkey1 425 xorps $rndkey0,$inout0 426 xorps $rndkey0,$inout1 427 xorps $rndkey0,$inout2 428 xorps $rndkey0,$inout3 429 $movkey 32($key),$rndkey0 430 lea 32($key,$rounds),$key 431 neg %rax # $rounds 432 .byte 0x0f,0x1f,0x00 433 add \$16,%rax 434 435.L${dir}_loop4: 436 aes${dir} $rndkey1,$inout0 437 aes${dir} $rndkey1,$inout1 438 aes${dir} $rndkey1,$inout2 439 aes${dir} $rndkey1,$inout3 440 $movkey ($key,%rax),$rndkey1 441 add \$32,%rax 442 aes${dir} $rndkey0,$inout0 443 aes${dir} $rndkey0,$inout1 444 aes${dir} $rndkey0,$inout2 445 aes${dir} $rndkey0,$inout3 446 $movkey -16($key,%rax),$rndkey0 447 jnz .L${dir}_loop4 448 449 aes${dir} $rndkey1,$inout0 450 aes${dir} $rndkey1,$inout1 451 aes${dir} $rndkey1,$inout2 452 aes${dir} $rndkey1,$inout3 453 aes${dir}last $rndkey0,$inout0 454 aes${dir}last $rndkey0,$inout1 455 aes${dir}last $rndkey0,$inout2 456 aes${dir}last $rndkey0,$inout3 457 ret 458.cfi_endproc 459.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 460___ 461} 462sub aesni_generate6 { 463my $dir=shift; 464# As already mentioned it takes in $key and $rounds, which are *not* 465# preserved. $inout[0-5] is cipher/clear text... 466$code.=<<___; 467.type _aesni_${dir}rypt6,\@abi-omnipotent 468.align 16 469_aesni_${dir}rypt6: 470.cfi_startproc 471 $movkey ($key),$rndkey0 472 shl \$4,$rounds 473 $movkey 16($key),$rndkey1 474 xorps $rndkey0,$inout0 475 pxor $rndkey0,$inout1 476 pxor $rndkey0,$inout2 477 aes${dir} $rndkey1,$inout0 478 lea 32($key,$rounds),$key 479 neg %rax # $rounds 480 aes${dir} $rndkey1,$inout1 481 pxor $rndkey0,$inout3 482 pxor $rndkey0,$inout4 483 aes${dir} $rndkey1,$inout2 484 pxor $rndkey0,$inout5 485 $movkey ($key,%rax),$rndkey0 486 add \$16,%rax 487 jmp .L${dir}_loop6_enter 488.align 16 489.L${dir}_loop6: 490 aes${dir} $rndkey1,$inout0 491 aes${dir} $rndkey1,$inout1 492 aes${dir} $rndkey1,$inout2 493.L${dir}_loop6_enter: 494 aes${dir} $rndkey1,$inout3 495 aes${dir} $rndkey1,$inout4 496 aes${dir} $rndkey1,$inout5 497 $movkey ($key,%rax),$rndkey1 498 add \$32,%rax 499 aes${dir} $rndkey0,$inout0 500 aes${dir} $rndkey0,$inout1 501 aes${dir} $rndkey0,$inout2 502 aes${dir} $rndkey0,$inout3 503 aes${dir} $rndkey0,$inout4 504 aes${dir} $rndkey0,$inout5 505 $movkey -16($key,%rax),$rndkey0 506 jnz .L${dir}_loop6 507 508 aes${dir} $rndkey1,$inout0 509 aes${dir} $rndkey1,$inout1 510 aes${dir} $rndkey1,$inout2 511 aes${dir} $rndkey1,$inout3 512 aes${dir} $rndkey1,$inout4 513 aes${dir} $rndkey1,$inout5 514 aes${dir}last $rndkey0,$inout0 515 aes${dir}last $rndkey0,$inout1 516 aes${dir}last $rndkey0,$inout2 517 aes${dir}last $rndkey0,$inout3 518 aes${dir}last $rndkey0,$inout4 519 aes${dir}last $rndkey0,$inout5 520 ret 521.cfi_endproc 522.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 523___ 524} 525sub aesni_generate8 { 526my $dir=shift; 527# As already mentioned it takes in $key and $rounds, which are *not* 528# preserved. $inout[0-7] is cipher/clear text... 529$code.=<<___; 530.type _aesni_${dir}rypt8,\@abi-omnipotent 531.align 16 532_aesni_${dir}rypt8: 533.cfi_startproc 534 $movkey ($key),$rndkey0 535 shl \$4,$rounds 536 $movkey 16($key),$rndkey1 537 xorps $rndkey0,$inout0 538 xorps $rndkey0,$inout1 539 pxor $rndkey0,$inout2 540 pxor $rndkey0,$inout3 541 pxor $rndkey0,$inout4 542 lea 32($key,$rounds),$key 543 neg %rax # $rounds 544 aes${dir} $rndkey1,$inout0 545 pxor $rndkey0,$inout5 546 pxor $rndkey0,$inout6 547 aes${dir} $rndkey1,$inout1 548 pxor $rndkey0,$inout7 549 $movkey ($key,%rax),$rndkey0 550 add \$16,%rax 551 jmp .L${dir}_loop8_inner 552.align 16 553.L${dir}_loop8: 554 aes${dir} $rndkey1,$inout0 555 aes${dir} $rndkey1,$inout1 556.L${dir}_loop8_inner: 557 aes${dir} $rndkey1,$inout2 558 aes${dir} $rndkey1,$inout3 559 aes${dir} $rndkey1,$inout4 560 aes${dir} $rndkey1,$inout5 561 aes${dir} $rndkey1,$inout6 562 aes${dir} $rndkey1,$inout7 563.L${dir}_loop8_enter: 564 $movkey ($key,%rax),$rndkey1 565 add \$32,%rax 566 aes${dir} $rndkey0,$inout0 567 aes${dir} $rndkey0,$inout1 568 aes${dir} $rndkey0,$inout2 569 aes${dir} $rndkey0,$inout3 570 aes${dir} $rndkey0,$inout4 571 aes${dir} $rndkey0,$inout5 572 aes${dir} $rndkey0,$inout6 573 aes${dir} $rndkey0,$inout7 574 $movkey -16($key,%rax),$rndkey0 575 jnz .L${dir}_loop8 576 577 aes${dir} $rndkey1,$inout0 578 aes${dir} $rndkey1,$inout1 579 aes${dir} $rndkey1,$inout2 580 aes${dir} $rndkey1,$inout3 581 aes${dir} $rndkey1,$inout4 582 aes${dir} $rndkey1,$inout5 583 aes${dir} $rndkey1,$inout6 584 aes${dir} $rndkey1,$inout7 585 aes${dir}last $rndkey0,$inout0 586 aes${dir}last $rndkey0,$inout1 587 aes${dir}last $rndkey0,$inout2 588 aes${dir}last $rndkey0,$inout3 589 aes${dir}last $rndkey0,$inout4 590 aes${dir}last $rndkey0,$inout5 591 aes${dir}last $rndkey0,$inout6 592 aes${dir}last $rndkey0,$inout7 593 ret 594.cfi_endproc 595.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 596___ 597} 598&aesni_generate2("enc") if ($PREFIX eq "aes_hw"); 599&aesni_generate2("dec"); 600&aesni_generate3("enc") if ($PREFIX eq "aes_hw"); 601&aesni_generate3("dec"); 602&aesni_generate4("enc") if ($PREFIX eq "aes_hw"); 603&aesni_generate4("dec"); 604&aesni_generate6("enc") if ($PREFIX eq "aes_hw"); 605&aesni_generate6("dec"); 606&aesni_generate8("enc") if ($PREFIX eq "aes_hw"); 607&aesni_generate8("dec"); 608 609if ($PREFIX eq "aes_hw") { 610######################################################################## 611# void aesni_ecb_encrypt (const void *in, void *out, 612# size_t length, const AES_KEY *key, 613# int enc); 614$code.=<<___; 615.globl ${PREFIX}_ecb_encrypt 616.type ${PREFIX}_ecb_encrypt,\@function,5 617.align 16 618${PREFIX}_ecb_encrypt: 619.cfi_startproc 620___ 621$code.=<<___ if ($win64); 622 lea -0x58(%rsp),%rsp 623 movaps %xmm6,(%rsp) # offload $inout4..7 624 movaps %xmm7,0x10(%rsp) 625 movaps %xmm8,0x20(%rsp) 626 movaps %xmm9,0x30(%rsp) 627.Lecb_enc_body: 628___ 629$code.=<<___; 630 and \$-16,$len # if ($len<16) 631 jz .Lecb_ret # return 632 633 mov 240($key),$rounds # key->rounds 634 $movkey ($key),$rndkey0 635 mov $key,$key_ # backup $key 636 mov $rounds,$rnds_ # backup $rounds 637 test %r8d,%r8d # 5th argument 638 jz .Lecb_decrypt 639#--------------------------- ECB ENCRYPT ------------------------------# 640 cmp \$0x80,$len # if ($len<8*16) 641 jb .Lecb_enc_tail # short input 642 643 movdqu ($inp),$inout0 # load 8 input blocks 644 movdqu 0x10($inp),$inout1 645 movdqu 0x20($inp),$inout2 646 movdqu 0x30($inp),$inout3 647 movdqu 0x40($inp),$inout4 648 movdqu 0x50($inp),$inout5 649 movdqu 0x60($inp),$inout6 650 movdqu 0x70($inp),$inout7 651 lea 0x80($inp),$inp # $inp+=8*16 652 sub \$0x80,$len # $len-=8*16 (can be zero) 653 jmp .Lecb_enc_loop8_enter 654.align 16 655.Lecb_enc_loop8: 656 movups $inout0,($out) # store 8 output blocks 657 mov $key_,$key # restore $key 658 movdqu ($inp),$inout0 # load 8 input blocks 659 mov $rnds_,$rounds # restore $rounds 660 movups $inout1,0x10($out) 661 movdqu 0x10($inp),$inout1 662 movups $inout2,0x20($out) 663 movdqu 0x20($inp),$inout2 664 movups $inout3,0x30($out) 665 movdqu 0x30($inp),$inout3 666 movups $inout4,0x40($out) 667 movdqu 0x40($inp),$inout4 668 movups $inout5,0x50($out) 669 movdqu 0x50($inp),$inout5 670 movups $inout6,0x60($out) 671 movdqu 0x60($inp),$inout6 672 movups $inout7,0x70($out) 673 lea 0x80($out),$out # $out+=8*16 674 movdqu 0x70($inp),$inout7 675 lea 0x80($inp),$inp # $inp+=8*16 676.Lecb_enc_loop8_enter: 677 678 call _aesni_encrypt8 679 680 sub \$0x80,$len 681 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow 682 683 movups $inout0,($out) # store 8 output blocks 684 mov $key_,$key # restore $key 685 movups $inout1,0x10($out) 686 mov $rnds_,$rounds # restore $rounds 687 movups $inout2,0x20($out) 688 movups $inout3,0x30($out) 689 movups $inout4,0x40($out) 690 movups $inout5,0x50($out) 691 movups $inout6,0x60($out) 692 movups $inout7,0x70($out) 693 lea 0x80($out),$out # $out+=8*16 694 add \$0x80,$len # restore real remaining $len 695 jz .Lecb_ret # done if ($len==0) 696 697.Lecb_enc_tail: # $len is less than 8*16 698 movups ($inp),$inout0 699 cmp \$0x20,$len 700 jb .Lecb_enc_one 701 movups 0x10($inp),$inout1 702 je .Lecb_enc_two 703 movups 0x20($inp),$inout2 704 cmp \$0x40,$len 705 jb .Lecb_enc_three 706 movups 0x30($inp),$inout3 707 je .Lecb_enc_four 708 movups 0x40($inp),$inout4 709 cmp \$0x60,$len 710 jb .Lecb_enc_five 711 movups 0x50($inp),$inout5 712 je .Lecb_enc_six 713 movdqu 0x60($inp),$inout6 714 xorps $inout7,$inout7 715 call _aesni_encrypt8 716 movups $inout0,($out) # store 7 output blocks 717 movups $inout1,0x10($out) 718 movups $inout2,0x20($out) 719 movups $inout3,0x30($out) 720 movups $inout4,0x40($out) 721 movups $inout5,0x50($out) 722 movups $inout6,0x60($out) 723 jmp .Lecb_ret 724.align 16 725.Lecb_enc_one: 726___ 727 &aesni_generate1("enc",$key,$rounds); 728$code.=<<___; 729 movups $inout0,($out) # store one output block 730 jmp .Lecb_ret 731.align 16 732.Lecb_enc_two: 733 call _aesni_encrypt2 734 movups $inout0,($out) # store 2 output blocks 735 movups $inout1,0x10($out) 736 jmp .Lecb_ret 737.align 16 738.Lecb_enc_three: 739 call _aesni_encrypt3 740 movups $inout0,($out) # store 3 output blocks 741 movups $inout1,0x10($out) 742 movups $inout2,0x20($out) 743 jmp .Lecb_ret 744.align 16 745.Lecb_enc_four: 746 call _aesni_encrypt4 747 movups $inout0,($out) # store 4 output blocks 748 movups $inout1,0x10($out) 749 movups $inout2,0x20($out) 750 movups $inout3,0x30($out) 751 jmp .Lecb_ret 752.align 16 753.Lecb_enc_five: 754 xorps $inout5,$inout5 755 call _aesni_encrypt6 756 movups $inout0,($out) # store 5 output blocks 757 movups $inout1,0x10($out) 758 movups $inout2,0x20($out) 759 movups $inout3,0x30($out) 760 movups $inout4,0x40($out) 761 jmp .Lecb_ret 762.align 16 763.Lecb_enc_six: 764 call _aesni_encrypt6 765 movups $inout0,($out) # store 6 output blocks 766 movups $inout1,0x10($out) 767 movups $inout2,0x20($out) 768 movups $inout3,0x30($out) 769 movups $inout4,0x40($out) 770 movups $inout5,0x50($out) 771 jmp .Lecb_ret 772#--------------------------- ECB DECRYPT ------------------------------# 773.align 16 774.Lecb_decrypt: 775 cmp \$0x80,$len # if ($len<8*16) 776 jb .Lecb_dec_tail # short input 777 778 movdqu ($inp),$inout0 # load 8 input blocks 779 movdqu 0x10($inp),$inout1 780 movdqu 0x20($inp),$inout2 781 movdqu 0x30($inp),$inout3 782 movdqu 0x40($inp),$inout4 783 movdqu 0x50($inp),$inout5 784 movdqu 0x60($inp),$inout6 785 movdqu 0x70($inp),$inout7 786 lea 0x80($inp),$inp # $inp+=8*16 787 sub \$0x80,$len # $len-=8*16 (can be zero) 788 jmp .Lecb_dec_loop8_enter 789.align 16 790.Lecb_dec_loop8: 791 movups $inout0,($out) # store 8 output blocks 792 mov $key_,$key # restore $key 793 movdqu ($inp),$inout0 # load 8 input blocks 794 mov $rnds_,$rounds # restore $rounds 795 movups $inout1,0x10($out) 796 movdqu 0x10($inp),$inout1 797 movups $inout2,0x20($out) 798 movdqu 0x20($inp),$inout2 799 movups $inout3,0x30($out) 800 movdqu 0x30($inp),$inout3 801 movups $inout4,0x40($out) 802 movdqu 0x40($inp),$inout4 803 movups $inout5,0x50($out) 804 movdqu 0x50($inp),$inout5 805 movups $inout6,0x60($out) 806 movdqu 0x60($inp),$inout6 807 movups $inout7,0x70($out) 808 lea 0x80($out),$out # $out+=8*16 809 movdqu 0x70($inp),$inout7 810 lea 0x80($inp),$inp # $inp+=8*16 811.Lecb_dec_loop8_enter: 812 813 call _aesni_decrypt8 814 815 $movkey ($key_),$rndkey0 816 sub \$0x80,$len 817 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow 818 819 movups $inout0,($out) # store 8 output blocks 820 pxor $inout0,$inout0 # clear register bank 821 mov $key_,$key # restore $key 822 movups $inout1,0x10($out) 823 pxor $inout1,$inout1 824 mov $rnds_,$rounds # restore $rounds 825 movups $inout2,0x20($out) 826 pxor $inout2,$inout2 827 movups $inout3,0x30($out) 828 pxor $inout3,$inout3 829 movups $inout4,0x40($out) 830 pxor $inout4,$inout4 831 movups $inout5,0x50($out) 832 pxor $inout5,$inout5 833 movups $inout6,0x60($out) 834 pxor $inout6,$inout6 835 movups $inout7,0x70($out) 836 pxor $inout7,$inout7 837 lea 0x80($out),$out # $out+=8*16 838 add \$0x80,$len # restore real remaining $len 839 jz .Lecb_ret # done if ($len==0) 840 841.Lecb_dec_tail: 842 movups ($inp),$inout0 843 cmp \$0x20,$len 844 jb .Lecb_dec_one 845 movups 0x10($inp),$inout1 846 je .Lecb_dec_two 847 movups 0x20($inp),$inout2 848 cmp \$0x40,$len 849 jb .Lecb_dec_three 850 movups 0x30($inp),$inout3 851 je .Lecb_dec_four 852 movups 0x40($inp),$inout4 853 cmp \$0x60,$len 854 jb .Lecb_dec_five 855 movups 0x50($inp),$inout5 856 je .Lecb_dec_six 857 movups 0x60($inp),$inout6 858 $movkey ($key),$rndkey0 859 xorps $inout7,$inout7 860 call _aesni_decrypt8 861 movups $inout0,($out) # store 7 output blocks 862 pxor $inout0,$inout0 # clear register bank 863 movups $inout1,0x10($out) 864 pxor $inout1,$inout1 865 movups $inout2,0x20($out) 866 pxor $inout2,$inout2 867 movups $inout3,0x30($out) 868 pxor $inout3,$inout3 869 movups $inout4,0x40($out) 870 pxor $inout4,$inout4 871 movups $inout5,0x50($out) 872 pxor $inout5,$inout5 873 movups $inout6,0x60($out) 874 pxor $inout6,$inout6 875 pxor $inout7,$inout7 876 jmp .Lecb_ret 877.align 16 878.Lecb_dec_one: 879___ 880 &aesni_generate1("dec",$key,$rounds); 881$code.=<<___; 882 movups $inout0,($out) # store one output block 883 pxor $inout0,$inout0 # clear register bank 884 jmp .Lecb_ret 885.align 16 886.Lecb_dec_two: 887 call _aesni_decrypt2 888 movups $inout0,($out) # store 2 output blocks 889 pxor $inout0,$inout0 # clear register bank 890 movups $inout1,0x10($out) 891 pxor $inout1,$inout1 892 jmp .Lecb_ret 893.align 16 894.Lecb_dec_three: 895 call _aesni_decrypt3 896 movups $inout0,($out) # store 3 output blocks 897 pxor $inout0,$inout0 # clear register bank 898 movups $inout1,0x10($out) 899 pxor $inout1,$inout1 900 movups $inout2,0x20($out) 901 pxor $inout2,$inout2 902 jmp .Lecb_ret 903.align 16 904.Lecb_dec_four: 905 call _aesni_decrypt4 906 movups $inout0,($out) # store 4 output blocks 907 pxor $inout0,$inout0 # clear register bank 908 movups $inout1,0x10($out) 909 pxor $inout1,$inout1 910 movups $inout2,0x20($out) 911 pxor $inout2,$inout2 912 movups $inout3,0x30($out) 913 pxor $inout3,$inout3 914 jmp .Lecb_ret 915.align 16 916.Lecb_dec_five: 917 xorps $inout5,$inout5 918 call _aesni_decrypt6 919 movups $inout0,($out) # store 5 output blocks 920 pxor $inout0,$inout0 # clear register bank 921 movups $inout1,0x10($out) 922 pxor $inout1,$inout1 923 movups $inout2,0x20($out) 924 pxor $inout2,$inout2 925 movups $inout3,0x30($out) 926 pxor $inout3,$inout3 927 movups $inout4,0x40($out) 928 pxor $inout4,$inout4 929 pxor $inout5,$inout5 930 jmp .Lecb_ret 931.align 16 932.Lecb_dec_six: 933 call _aesni_decrypt6 934 movups $inout0,($out) # store 6 output blocks 935 pxor $inout0,$inout0 # clear register bank 936 movups $inout1,0x10($out) 937 pxor $inout1,$inout1 938 movups $inout2,0x20($out) 939 pxor $inout2,$inout2 940 movups $inout3,0x30($out) 941 pxor $inout3,$inout3 942 movups $inout4,0x40($out) 943 pxor $inout4,$inout4 944 movups $inout5,0x50($out) 945 pxor $inout5,$inout5 946 947.Lecb_ret: 948 xorps $rndkey0,$rndkey0 # %xmm0 949 pxor $rndkey1,$rndkey1 950___ 951$code.=<<___ if ($win64); 952 movaps (%rsp),%xmm6 953 movaps %xmm0,(%rsp) # clear stack 954 movaps 0x10(%rsp),%xmm7 955 movaps %xmm0,0x10(%rsp) 956 movaps 0x20(%rsp),%xmm8 957 movaps %xmm0,0x20(%rsp) 958 movaps 0x30(%rsp),%xmm9 959 movaps %xmm0,0x30(%rsp) 960 lea 0x58(%rsp),%rsp 961.Lecb_enc_ret: 962___ 963$code.=<<___; 964 ret 965.cfi_endproc 966.size ${PREFIX}_ecb_encrypt,.-${PREFIX}_ecb_encrypt 967___ 968 969{ 970###################################################################### 971# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, 972# size_t blocks, const AES_KEY *key, 973# const char *ivec,char *cmac); 974# 975# Handles only complete blocks, operates on 64-bit counter and 976# does not update *ivec! Nor does it finalize CMAC value 977# (see engine/eng_aesni.c for details) 978# 979if (0) { # Omit these functions in BoringSSL 980my $cmac="%r9"; # 6th argument 981 982my $increment="%xmm9"; 983my $iv="%xmm6"; 984my $bswap_mask="%xmm7"; 985 986$code.=<<___; 987.globl ${PREFIX}_ccm64_encrypt_blocks 988.type ${PREFIX}_ccm64_encrypt_blocks,\@function,6 989.align 16 990${PREFIX}_ccm64_encrypt_blocks: 991___ 992$code.=<<___ if ($win64); 993 lea -0x58(%rsp),%rsp 994 movaps %xmm6,(%rsp) # $iv 995 movaps %xmm7,0x10(%rsp) # $bswap_mask 996 movaps %xmm8,0x20(%rsp) # $in0 997 movaps %xmm9,0x30(%rsp) # $increment 998.Lccm64_enc_body: 999___ 1000$code.=<<___; 1001 mov 240($key),$rounds # key->rounds 1002 movdqu ($ivp),$iv 1003 movdqa .Lincrement64(%rip),$increment 1004 movdqa .Lbswap_mask(%rip),$bswap_mask 1005 1006 shl \$4,$rounds 1007 mov \$16,$rnds_ 1008 lea 0($key),$key_ 1009 movdqu ($cmac),$inout1 1010 movdqa $iv,$inout0 1011 lea 32($key,$rounds),$key # end of key schedule 1012 pshufb $bswap_mask,$iv 1013 sub %rax,%r10 # twisted $rounds 1014 jmp .Lccm64_enc_outer 1015.align 16 1016.Lccm64_enc_outer: 1017 $movkey ($key_),$rndkey0 1018 mov %r10,%rax 1019 movups ($inp),$in0 # load inp 1020 1021 xorps $rndkey0,$inout0 # counter 1022 $movkey 16($key_),$rndkey1 1023 xorps $in0,$rndkey0 1024 xorps $rndkey0,$inout1 # cmac^=inp 1025 $movkey 32($key_),$rndkey0 1026 1027.Lccm64_enc2_loop: 1028 aesenc $rndkey1,$inout0 1029 aesenc $rndkey1,$inout1 1030 $movkey ($key,%rax),$rndkey1 1031 add \$32,%rax 1032 aesenc $rndkey0,$inout0 1033 aesenc $rndkey0,$inout1 1034 $movkey -16($key,%rax),$rndkey0 1035 jnz .Lccm64_enc2_loop 1036 aesenc $rndkey1,$inout0 1037 aesenc $rndkey1,$inout1 1038 paddq $increment,$iv 1039 dec $len # $len-- ($len is in blocks) 1040 aesenclast $rndkey0,$inout0 1041 aesenclast $rndkey0,$inout1 1042 1043 lea 16($inp),$inp 1044 xorps $inout0,$in0 # inp ^= E(iv) 1045 movdqa $iv,$inout0 1046 movups $in0,($out) # save output 1047 pshufb $bswap_mask,$inout0 1048 lea 16($out),$out # $out+=16 1049 jnz .Lccm64_enc_outer # loop if ($len!=0) 1050 1051 pxor $rndkey0,$rndkey0 # clear register bank 1052 pxor $rndkey1,$rndkey1 1053 pxor $inout0,$inout0 1054 movups $inout1,($cmac) # store resulting mac 1055 pxor $inout1,$inout1 1056 pxor $in0,$in0 1057 pxor $iv,$iv 1058___ 1059$code.=<<___ if ($win64); 1060 movaps (%rsp),%xmm6 1061 movaps %xmm0,(%rsp) # clear stack 1062 movaps 0x10(%rsp),%xmm7 1063 movaps %xmm0,0x10(%rsp) 1064 movaps 0x20(%rsp),%xmm8 1065 movaps %xmm0,0x20(%rsp) 1066 movaps 0x30(%rsp),%xmm9 1067 movaps %xmm0,0x30(%rsp) 1068 lea 0x58(%rsp),%rsp 1069.Lccm64_enc_ret: 1070___ 1071$code.=<<___; 1072 ret 1073.size ${PREFIX}_ccm64_encrypt_blocks,.-${PREFIX}_ccm64_encrypt_blocks 1074___ 1075###################################################################### 1076$code.=<<___; 1077.globl ${PREFIX}_ccm64_decrypt_blocks 1078.type ${PREFIX}_ccm64_decrypt_blocks,\@function,6 1079.align 16 1080${PREFIX}_ccm64_decrypt_blocks: 1081___ 1082$code.=<<___ if ($win64); 1083 lea -0x58(%rsp),%rsp 1084 movaps %xmm6,(%rsp) # $iv 1085 movaps %xmm7,0x10(%rsp) # $bswap_mask 1086 movaps %xmm8,0x20(%rsp) # $in8 1087 movaps %xmm9,0x30(%rsp) # $increment 1088.Lccm64_dec_body: 1089___ 1090$code.=<<___; 1091 mov 240($key),$rounds # key->rounds 1092 movups ($ivp),$iv 1093 movdqu ($cmac),$inout1 1094 movdqa .Lincrement64(%rip),$increment 1095 movdqa .Lbswap_mask(%rip),$bswap_mask 1096 1097 movaps $iv,$inout0 1098 mov $rounds,$rnds_ 1099 mov $key,$key_ 1100 pshufb $bswap_mask,$iv 1101___ 1102 &aesni_generate1("enc",$key,$rounds); 1103$code.=<<___; 1104 shl \$4,$rnds_ 1105 mov \$16,$rounds 1106 movups ($inp),$in0 # load inp 1107 paddq $increment,$iv 1108 lea 16($inp),$inp # $inp+=16 1109 sub %r10,%rax # twisted $rounds 1110 lea 32($key_,$rnds_),$key # end of key schedule 1111 mov %rax,%r10 1112 jmp .Lccm64_dec_outer 1113.align 16 1114.Lccm64_dec_outer: 1115 xorps $inout0,$in0 # inp ^= E(iv) 1116 movdqa $iv,$inout0 1117 movups $in0,($out) # save output 1118 lea 16($out),$out # $out+=16 1119 pshufb $bswap_mask,$inout0 1120 1121 sub \$1,$len # $len-- ($len is in blocks) 1122 jz .Lccm64_dec_break # if ($len==0) break 1123 1124 $movkey ($key_),$rndkey0 1125 mov %r10,%rax 1126 $movkey 16($key_),$rndkey1 1127 xorps $rndkey0,$in0 1128 xorps $rndkey0,$inout0 1129 xorps $in0,$inout1 # cmac^=out 1130 $movkey 32($key_),$rndkey0 1131 jmp .Lccm64_dec2_loop 1132.align 16 1133.Lccm64_dec2_loop: 1134 aesenc $rndkey1,$inout0 1135 aesenc $rndkey1,$inout1 1136 $movkey ($key,%rax),$rndkey1 1137 add \$32,%rax 1138 aesenc $rndkey0,$inout0 1139 aesenc $rndkey0,$inout1 1140 $movkey -16($key,%rax),$rndkey0 1141 jnz .Lccm64_dec2_loop 1142 movups ($inp),$in0 # load input 1143 paddq $increment,$iv 1144 aesenc $rndkey1,$inout0 1145 aesenc $rndkey1,$inout1 1146 aesenclast $rndkey0,$inout0 1147 aesenclast $rndkey0,$inout1 1148 lea 16($inp),$inp # $inp+=16 1149 jmp .Lccm64_dec_outer 1150 1151.align 16 1152.Lccm64_dec_break: 1153 #xorps $in0,$inout1 # cmac^=out 1154 mov 240($key_),$rounds 1155___ 1156 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); 1157$code.=<<___; 1158 pxor $rndkey0,$rndkey0 # clear register bank 1159 pxor $rndkey1,$rndkey1 1160 pxor $inout0,$inout0 1161 movups $inout1,($cmac) # store resulting mac 1162 pxor $inout1,$inout1 1163 pxor $in0,$in0 1164 pxor $iv,$iv 1165___ 1166$code.=<<___ if ($win64); 1167 movaps (%rsp),%xmm6 1168 movaps %xmm0,(%rsp) # clear stack 1169 movaps 0x10(%rsp),%xmm7 1170 movaps %xmm0,0x10(%rsp) 1171 movaps 0x20(%rsp),%xmm8 1172 movaps %xmm0,0x20(%rsp) 1173 movaps 0x30(%rsp),%xmm9 1174 movaps %xmm0,0x30(%rsp) 1175 lea 0x58(%rsp),%rsp 1176.Lccm64_dec_ret: 1177___ 1178$code.=<<___; 1179 ret 1180.size ${PREFIX}_ccm64_decrypt_blocks,.-${PREFIX}_ccm64_decrypt_blocks 1181___ 1182} 1183###################################################################### 1184# void aesni_ctr32_encrypt_blocks (const void *in, void *out, 1185# size_t blocks, const AES_KEY *key, 1186# const char *ivec); 1187# 1188# Handles only complete blocks, operates on 32-bit counter and 1189# does not update *ivec! (see crypto/modes/ctr128.c for details) 1190# 1191# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, 1192# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest. 1193# Keywords are full unroll and modulo-schedule counter calculations 1194# with zero-round key xor. 1195{ 1196my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); 1197my ($key0,$ctr)=("%ebp","${ivp}d"); 1198my $frame_size = 0x80 + ($win64?160:0); 1199 1200$code.=<<___; 1201.globl ${PREFIX}_ctr32_encrypt_blocks 1202.type ${PREFIX}_ctr32_encrypt_blocks,\@function,5 1203.align 16 1204${PREFIX}_ctr32_encrypt_blocks: 1205.cfi_startproc 1206#ifdef BORINGSSL_DISPATCH_TEST 1207 movb \$1,BORINGSSL_function_hit(%rip) 1208#endif 1209 cmp \$1,$len 1210 jne .Lctr32_bulk 1211 1212 # handle single block without allocating stack frame, 1213 # useful when handling edges 1214 movups ($ivp),$inout0 1215 movups ($inp),$inout1 1216 mov 240($key),%edx # key->rounds 1217___ 1218 &aesni_generate1("enc",$key,"%edx"); 1219$code.=<<___; 1220 pxor $rndkey0,$rndkey0 # clear register bank 1221 pxor $rndkey1,$rndkey1 1222 xorps $inout1,$inout0 1223 pxor $inout1,$inout1 1224 movups $inout0,($out) 1225 xorps $inout0,$inout0 1226 jmp .Lctr32_epilogue 1227 1228.align 16 1229.Lctr32_bulk: 1230 lea (%rsp),$key_ # use $key_ as frame pointer 1231.cfi_def_cfa_register $key_ 1232 push %rbp 1233.cfi_push %rbp 1234 sub \$$frame_size,%rsp 1235 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1236___ 1237$code.=<<___ if ($win64); 1238 movaps %xmm6,-0xa8($key_) # offload everything 1239 movaps %xmm7,-0x98($key_) 1240 movaps %xmm8,-0x88($key_) 1241 movaps %xmm9,-0x78($key_) 1242 movaps %xmm10,-0x68($key_) 1243 movaps %xmm11,-0x58($key_) 1244 movaps %xmm12,-0x48($key_) 1245 movaps %xmm13,-0x38($key_) 1246 movaps %xmm14,-0x28($key_) 1247 movaps %xmm15,-0x18($key_) 1248.Lctr32_body: 1249___ 1250$code.=<<___; 1251 1252 # 8 16-byte words on top of stack are counter values 1253 # xor-ed with zero-round key 1254 1255 movdqu ($ivp),$inout0 1256 movdqu ($key),$rndkey0 1257 mov 12($ivp),$ctr # counter LSB 1258 pxor $rndkey0,$inout0 1259 mov 12($key),$key0 # 0-round key LSB 1260 movdqa $inout0,0x00(%rsp) # populate counter block 1261 bswap $ctr 1262 movdqa $inout0,$inout1 1263 movdqa $inout0,$inout2 1264 movdqa $inout0,$inout3 1265 movdqa $inout0,0x40(%rsp) 1266 movdqa $inout0,0x50(%rsp) 1267 movdqa $inout0,0x60(%rsp) 1268 mov %rdx,%r10 # about to borrow %rdx 1269 movdqa $inout0,0x70(%rsp) 1270 1271 lea 1($ctr),%rax 1272 lea 2($ctr),%rdx 1273 bswap %eax 1274 bswap %edx 1275 xor $key0,%eax 1276 xor $key0,%edx 1277 pinsrd \$3,%eax,$inout1 1278 lea 3($ctr),%rax 1279 movdqa $inout1,0x10(%rsp) 1280 pinsrd \$3,%edx,$inout2 1281 bswap %eax 1282 mov %r10,%rdx # restore %rdx 1283 lea 4($ctr),%r10 1284 movdqa $inout2,0x20(%rsp) 1285 xor $key0,%eax 1286 bswap %r10d 1287 pinsrd \$3,%eax,$inout3 1288 xor $key0,%r10d 1289 movdqa $inout3,0x30(%rsp) 1290 lea 5($ctr),%r9 1291 mov %r10d,0x40+12(%rsp) 1292 bswap %r9d 1293 lea 6($ctr),%r10 1294 mov 240($key),$rounds # key->rounds 1295 xor $key0,%r9d 1296 bswap %r10d 1297 mov %r9d,0x50+12(%rsp) 1298 xor $key0,%r10d 1299 lea 7($ctr),%r9 1300 mov %r10d,0x60+12(%rsp) 1301 bswap %r9d 1302 leaq OPENSSL_ia32cap_P(%rip),%r10 1303 mov 4(%r10),%r10d 1304 xor $key0,%r9d 1305 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE 1306 mov %r9d,0x70+12(%rsp) 1307 1308 $movkey 0x10($key),$rndkey1 1309 1310 movdqa 0x40(%rsp),$inout4 1311 movdqa 0x50(%rsp),$inout5 1312 1313 cmp \$8,$len # $len is in blocks 1314 jb .Lctr32_tail # short input if ($len<8) 1315 1316 sub \$6,$len # $len is biased by -6 1317 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE 1318 je .Lctr32_6x # [which denotes Atom Silvermont] 1319 1320 lea 0x80($key),$key # size optimization 1321 sub \$2,$len # $len is biased by -8 1322 jmp .Lctr32_loop8 1323 1324.align 16 1325.Lctr32_6x: 1326 shl \$4,$rounds 1327 mov \$48,$rnds_ 1328 bswap $key0 1329 lea 32($key,$rounds),$key # end of key schedule 1330 sub %rax,%r10 # twisted $rounds 1331 jmp .Lctr32_loop6 1332 1333.align 16 1334.Lctr32_loop6: 1335 add \$6,$ctr # next counter value 1336 $movkey -48($key,$rnds_),$rndkey0 1337 aesenc $rndkey1,$inout0 1338 mov $ctr,%eax 1339 xor $key0,%eax 1340 aesenc $rndkey1,$inout1 1341 movbe %eax,`0x00+12`(%rsp) # store next counter value 1342 lea 1($ctr),%eax 1343 aesenc $rndkey1,$inout2 1344 xor $key0,%eax 1345 movbe %eax,`0x10+12`(%rsp) 1346 aesenc $rndkey1,$inout3 1347 lea 2($ctr),%eax 1348 xor $key0,%eax 1349 aesenc $rndkey1,$inout4 1350 movbe %eax,`0x20+12`(%rsp) 1351 lea 3($ctr),%eax 1352 aesenc $rndkey1,$inout5 1353 $movkey -32($key,$rnds_),$rndkey1 1354 xor $key0,%eax 1355 1356 aesenc $rndkey0,$inout0 1357 movbe %eax,`0x30+12`(%rsp) 1358 lea 4($ctr),%eax 1359 aesenc $rndkey0,$inout1 1360 xor $key0,%eax 1361 movbe %eax,`0x40+12`(%rsp) 1362 aesenc $rndkey0,$inout2 1363 lea 5($ctr),%eax 1364 xor $key0,%eax 1365 aesenc $rndkey0,$inout3 1366 movbe %eax,`0x50+12`(%rsp) 1367 mov %r10,%rax # mov $rnds_,$rounds 1368 aesenc $rndkey0,$inout4 1369 aesenc $rndkey0,$inout5 1370 $movkey -16($key,$rnds_),$rndkey0 1371 1372 call .Lenc_loop6 1373 1374 movdqu ($inp),$inout6 # load 6 input blocks 1375 movdqu 0x10($inp),$inout7 1376 movdqu 0x20($inp),$in0 1377 movdqu 0x30($inp),$in1 1378 movdqu 0x40($inp),$in2 1379 movdqu 0x50($inp),$in3 1380 lea 0x60($inp),$inp # $inp+=6*16 1381 $movkey -64($key,$rnds_),$rndkey1 1382 pxor $inout0,$inout6 # inp^=E(ctr) 1383 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round] 1384 pxor $inout1,$inout7 1385 movaps 0x10(%rsp),$inout1 1386 pxor $inout2,$in0 1387 movaps 0x20(%rsp),$inout2 1388 pxor $inout3,$in1 1389 movaps 0x30(%rsp),$inout3 1390 pxor $inout4,$in2 1391 movaps 0x40(%rsp),$inout4 1392 pxor $inout5,$in3 1393 movaps 0x50(%rsp),$inout5 1394 movdqu $inout6,($out) # store 6 output blocks 1395 movdqu $inout7,0x10($out) 1396 movdqu $in0,0x20($out) 1397 movdqu $in1,0x30($out) 1398 movdqu $in2,0x40($out) 1399 movdqu $in3,0x50($out) 1400 lea 0x60($out),$out # $out+=6*16 1401 1402 sub \$6,$len 1403 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow 1404 1405 add \$6,$len # restore real remaining $len 1406 jz .Lctr32_done # done if ($len==0) 1407 1408 lea -48($rnds_),$rounds 1409 lea -80($key,$rnds_),$key # restore $key 1410 neg $rounds 1411 shr \$4,$rounds # restore $rounds 1412 jmp .Lctr32_tail 1413 1414.align 32 1415.Lctr32_loop8: 1416 add \$8,$ctr # next counter value 1417 movdqa 0x60(%rsp),$inout6 1418 aesenc $rndkey1,$inout0 1419 mov $ctr,%r9d 1420 movdqa 0x70(%rsp),$inout7 1421 aesenc $rndkey1,$inout1 1422 bswap %r9d 1423 $movkey 0x20-0x80($key),$rndkey0 1424 aesenc $rndkey1,$inout2 1425 xor $key0,%r9d 1426 nop 1427 aesenc $rndkey1,$inout3 1428 mov %r9d,0x00+12(%rsp) # store next counter value 1429 lea 1($ctr),%r9 1430 aesenc $rndkey1,$inout4 1431 aesenc $rndkey1,$inout5 1432 aesenc $rndkey1,$inout6 1433 aesenc $rndkey1,$inout7 1434 $movkey 0x30-0x80($key),$rndkey1 1435___ 1436for($i=2;$i<8;$i++) { 1437my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; 1438$code.=<<___; 1439 bswap %r9d 1440 aesenc $rndkeyx,$inout0 1441 aesenc $rndkeyx,$inout1 1442 xor $key0,%r9d 1443 .byte 0x66,0x90 1444 aesenc $rndkeyx,$inout2 1445 aesenc $rndkeyx,$inout3 1446 mov %r9d,`0x10*($i-1)`+12(%rsp) 1447 lea $i($ctr),%r9 1448 aesenc $rndkeyx,$inout4 1449 aesenc $rndkeyx,$inout5 1450 aesenc $rndkeyx,$inout6 1451 aesenc $rndkeyx,$inout7 1452 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx 1453___ 1454} 1455$code.=<<___; 1456 bswap %r9d 1457 aesenc $rndkey0,$inout0 1458 aesenc $rndkey0,$inout1 1459 aesenc $rndkey0,$inout2 1460 xor $key0,%r9d 1461 movdqu 0x00($inp),$in0 # start loading input 1462 aesenc $rndkey0,$inout3 1463 mov %r9d,0x70+12(%rsp) 1464 cmp \$11,$rounds 1465 aesenc $rndkey0,$inout4 1466 aesenc $rndkey0,$inout5 1467 aesenc $rndkey0,$inout6 1468 aesenc $rndkey0,$inout7 1469 $movkey 0xa0-0x80($key),$rndkey0 1470 1471 jb .Lctr32_enc_done 1472 1473 aesenc $rndkey1,$inout0 1474 aesenc $rndkey1,$inout1 1475 aesenc $rndkey1,$inout2 1476 aesenc $rndkey1,$inout3 1477 aesenc $rndkey1,$inout4 1478 aesenc $rndkey1,$inout5 1479 aesenc $rndkey1,$inout6 1480 aesenc $rndkey1,$inout7 1481 $movkey 0xb0-0x80($key),$rndkey1 1482 1483 aesenc $rndkey0,$inout0 1484 aesenc $rndkey0,$inout1 1485 aesenc $rndkey0,$inout2 1486 aesenc $rndkey0,$inout3 1487 aesenc $rndkey0,$inout4 1488 aesenc $rndkey0,$inout5 1489 aesenc $rndkey0,$inout6 1490 aesenc $rndkey0,$inout7 1491 $movkey 0xc0-0x80($key),$rndkey0 1492 je .Lctr32_enc_done 1493 1494 aesenc $rndkey1,$inout0 1495 aesenc $rndkey1,$inout1 1496 aesenc $rndkey1,$inout2 1497 aesenc $rndkey1,$inout3 1498 aesenc $rndkey1,$inout4 1499 aesenc $rndkey1,$inout5 1500 aesenc $rndkey1,$inout6 1501 aesenc $rndkey1,$inout7 1502 $movkey 0xd0-0x80($key),$rndkey1 1503 1504 aesenc $rndkey0,$inout0 1505 aesenc $rndkey0,$inout1 1506 aesenc $rndkey0,$inout2 1507 aesenc $rndkey0,$inout3 1508 aesenc $rndkey0,$inout4 1509 aesenc $rndkey0,$inout5 1510 aesenc $rndkey0,$inout6 1511 aesenc $rndkey0,$inout7 1512 $movkey 0xe0-0x80($key),$rndkey0 1513 jmp .Lctr32_enc_done 1514 1515.align 16 1516.Lctr32_enc_done: 1517 movdqu 0x10($inp),$in1 1518 pxor $rndkey0,$in0 # input^=round[last] 1519 movdqu 0x20($inp),$in2 1520 pxor $rndkey0,$in1 1521 movdqu 0x30($inp),$in3 1522 pxor $rndkey0,$in2 1523 movdqu 0x40($inp),$in4 1524 pxor $rndkey0,$in3 1525 movdqu 0x50($inp),$in5 1526 pxor $rndkey0,$in4 1527 pxor $rndkey0,$in5 1528 aesenc $rndkey1,$inout0 1529 aesenc $rndkey1,$inout1 1530 aesenc $rndkey1,$inout2 1531 aesenc $rndkey1,$inout3 1532 aesenc $rndkey1,$inout4 1533 aesenc $rndkey1,$inout5 1534 aesenc $rndkey1,$inout6 1535 aesenc $rndkey1,$inout7 1536 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6] 1537 lea 0x80($inp),$inp # $inp+=8*16 1538 1539 aesenclast $in0,$inout0 # $inN is inp[N]^round[last] 1540 pxor $rndkey0,$rndkey1 # borrowed $rndkey 1541 movdqu 0x70-0x80($inp),$in0 1542 aesenclast $in1,$inout1 1543 pxor $rndkey0,$in0 1544 movdqa 0x00(%rsp),$in1 # load next counter block 1545 aesenclast $in2,$inout2 1546 aesenclast $in3,$inout3 1547 movdqa 0x10(%rsp),$in2 1548 movdqa 0x20(%rsp),$in3 1549 aesenclast $in4,$inout4 1550 aesenclast $in5,$inout5 1551 movdqa 0x30(%rsp),$in4 1552 movdqa 0x40(%rsp),$in5 1553 aesenclast $rndkey1,$inout6 1554 movdqa 0x50(%rsp),$rndkey0 1555 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key 1556 aesenclast $in0,$inout7 1557 1558 movups $inout0,($out) # store 8 output blocks 1559 movdqa $in1,$inout0 1560 movups $inout1,0x10($out) 1561 movdqa $in2,$inout1 1562 movups $inout2,0x20($out) 1563 movdqa $in3,$inout2 1564 movups $inout3,0x30($out) 1565 movdqa $in4,$inout3 1566 movups $inout4,0x40($out) 1567 movdqa $in5,$inout4 1568 movups $inout5,0x50($out) 1569 movdqa $rndkey0,$inout5 1570 movups $inout6,0x60($out) 1571 movups $inout7,0x70($out) 1572 lea 0x80($out),$out # $out+=8*16 1573 1574 sub \$8,$len 1575 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow 1576 1577 add \$8,$len # restore real remaining $len 1578 jz .Lctr32_done # done if ($len==0) 1579 lea -0x80($key),$key 1580 1581.Lctr32_tail: 1582 # note that at this point $inout0..5 are populated with 1583 # counter values xor-ed with 0-round key 1584 lea 16($key),$key 1585 cmp \$4,$len 1586 jb .Lctr32_loop3 1587 je .Lctr32_loop4 1588 1589 # if ($len>4) compute 7 E(counter) 1590 shl \$4,$rounds 1591 movdqa 0x60(%rsp),$inout6 1592 pxor $inout7,$inout7 1593 1594 $movkey 16($key),$rndkey0 1595 aesenc $rndkey1,$inout0 1596 aesenc $rndkey1,$inout1 1597 lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter 1598 neg %rax 1599 aesenc $rndkey1,$inout2 1600 add \$16,%rax # prepare for .Lenc_loop8_enter 1601 movups ($inp),$in0 1602 aesenc $rndkey1,$inout3 1603 aesenc $rndkey1,$inout4 1604 movups 0x10($inp),$in1 # pre-load input 1605 movups 0x20($inp),$in2 1606 aesenc $rndkey1,$inout5 1607 aesenc $rndkey1,$inout6 1608 1609 call .Lenc_loop8_enter 1610 1611 movdqu 0x30($inp),$in3 1612 pxor $in0,$inout0 1613 movdqu 0x40($inp),$in0 1614 pxor $in1,$inout1 1615 movdqu $inout0,($out) # store output 1616 pxor $in2,$inout2 1617 movdqu $inout1,0x10($out) 1618 pxor $in3,$inout3 1619 movdqu $inout2,0x20($out) 1620 pxor $in0,$inout4 1621 movdqu $inout3,0x30($out) 1622 movdqu $inout4,0x40($out) 1623 cmp \$6,$len 1624 jb .Lctr32_done # $len was 5, stop store 1625 1626 movups 0x50($inp),$in1 1627 xorps $in1,$inout5 1628 movups $inout5,0x50($out) 1629 je .Lctr32_done # $len was 6, stop store 1630 1631 movups 0x60($inp),$in2 1632 xorps $in2,$inout6 1633 movups $inout6,0x60($out) 1634 jmp .Lctr32_done # $len was 7, stop store 1635 1636.align 32 1637.Lctr32_loop4: 1638 aesenc $rndkey1,$inout0 1639 lea 16($key),$key 1640 dec $rounds 1641 aesenc $rndkey1,$inout1 1642 aesenc $rndkey1,$inout2 1643 aesenc $rndkey1,$inout3 1644 $movkey ($key),$rndkey1 1645 jnz .Lctr32_loop4 1646 aesenclast $rndkey1,$inout0 1647 aesenclast $rndkey1,$inout1 1648 movups ($inp),$in0 # load input 1649 movups 0x10($inp),$in1 1650 aesenclast $rndkey1,$inout2 1651 aesenclast $rndkey1,$inout3 1652 movups 0x20($inp),$in2 1653 movups 0x30($inp),$in3 1654 1655 xorps $in0,$inout0 1656 movups $inout0,($out) # store output 1657 xorps $in1,$inout1 1658 movups $inout1,0x10($out) 1659 pxor $in2,$inout2 1660 movdqu $inout2,0x20($out) 1661 pxor $in3,$inout3 1662 movdqu $inout3,0x30($out) 1663 jmp .Lctr32_done # $len was 4, stop store 1664 1665.align 32 1666.Lctr32_loop3: 1667 aesenc $rndkey1,$inout0 1668 lea 16($key),$key 1669 dec $rounds 1670 aesenc $rndkey1,$inout1 1671 aesenc $rndkey1,$inout2 1672 $movkey ($key),$rndkey1 1673 jnz .Lctr32_loop3 1674 aesenclast $rndkey1,$inout0 1675 aesenclast $rndkey1,$inout1 1676 aesenclast $rndkey1,$inout2 1677 1678 movups ($inp),$in0 # load input 1679 xorps $in0,$inout0 1680 movups $inout0,($out) # store output 1681 cmp \$2,$len 1682 jb .Lctr32_done # $len was 1, stop store 1683 1684 movups 0x10($inp),$in1 1685 xorps $in1,$inout1 1686 movups $inout1,0x10($out) 1687 je .Lctr32_done # $len was 2, stop store 1688 1689 movups 0x20($inp),$in2 1690 xorps $in2,$inout2 1691 movups $inout2,0x20($out) # $len was 3, stop store 1692 1693.Lctr32_done: 1694 xorps %xmm0,%xmm0 # clear register bank 1695 xor $key0,$key0 1696 pxor %xmm1,%xmm1 1697 pxor %xmm2,%xmm2 1698 pxor %xmm3,%xmm3 1699 pxor %xmm4,%xmm4 1700 pxor %xmm5,%xmm5 1701___ 1702$code.=<<___ if (!$win64); 1703 pxor %xmm6,%xmm6 1704 pxor %xmm7,%xmm7 1705 movaps %xmm0,0x00(%rsp) # clear stack 1706 pxor %xmm8,%xmm8 1707 movaps %xmm0,0x10(%rsp) 1708 pxor %xmm9,%xmm9 1709 movaps %xmm0,0x20(%rsp) 1710 pxor %xmm10,%xmm10 1711 movaps %xmm0,0x30(%rsp) 1712 pxor %xmm11,%xmm11 1713 movaps %xmm0,0x40(%rsp) 1714 pxor %xmm12,%xmm12 1715 movaps %xmm0,0x50(%rsp) 1716 pxor %xmm13,%xmm13 1717 movaps %xmm0,0x60(%rsp) 1718 pxor %xmm14,%xmm14 1719 movaps %xmm0,0x70(%rsp) 1720 pxor %xmm15,%xmm15 1721___ 1722$code.=<<___ if ($win64); 1723 movaps -0xa8($key_),%xmm6 1724 movaps %xmm0,-0xa8($key_) # clear stack 1725 movaps -0x98($key_),%xmm7 1726 movaps %xmm0,-0x98($key_) 1727 movaps -0x88($key_),%xmm8 1728 movaps %xmm0,-0x88($key_) 1729 movaps -0x78($key_),%xmm9 1730 movaps %xmm0,-0x78($key_) 1731 movaps -0x68($key_),%xmm10 1732 movaps %xmm0,-0x68($key_) 1733 movaps -0x58($key_),%xmm11 1734 movaps %xmm0,-0x58($key_) 1735 movaps -0x48($key_),%xmm12 1736 movaps %xmm0,-0x48($key_) 1737 movaps -0x38($key_),%xmm13 1738 movaps %xmm0,-0x38($key_) 1739 movaps -0x28($key_),%xmm14 1740 movaps %xmm0,-0x28($key_) 1741 movaps -0x18($key_),%xmm15 1742 movaps %xmm0,-0x18($key_) 1743 movaps %xmm0,0x00(%rsp) 1744 movaps %xmm0,0x10(%rsp) 1745 movaps %xmm0,0x20(%rsp) 1746 movaps %xmm0,0x30(%rsp) 1747 movaps %xmm0,0x40(%rsp) 1748 movaps %xmm0,0x50(%rsp) 1749 movaps %xmm0,0x60(%rsp) 1750 movaps %xmm0,0x70(%rsp) 1751___ 1752$code.=<<___; 1753 mov -8($key_),%rbp 1754.cfi_restore %rbp 1755 lea ($key_),%rsp 1756.cfi_def_cfa_register %rsp 1757.Lctr32_epilogue: 1758 ret 1759.cfi_endproc 1760.size ${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks 1761___ 1762} 1763 1764###################################################################### 1765# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, 1766# const AES_KEY *key1, const AES_KEY *key2 1767# const unsigned char iv[16]); 1768# 1769if (0) { # Omit these functions in BoringSSL 1770my @tweak=map("%xmm$_",(10..15)); 1771my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); 1772my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); 1773my $frame_size = 0x70 + ($win64?160:0); 1774my $key_ = "%rbp"; # override so that we can use %r11 as FP 1775 1776$code.=<<___; 1777.globl ${PREFIX}_xts_encrypt 1778.type ${PREFIX}_xts_encrypt,\@function,6 1779.align 16 1780${PREFIX}_xts_encrypt: 1781.cfi_startproc 1782 lea (%rsp),%r11 # frame pointer 1783.cfi_def_cfa_register %r11 1784 push %rbp 1785.cfi_push %rbp 1786 sub \$$frame_size,%rsp 1787 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1788___ 1789$code.=<<___ if ($win64); 1790 movaps %xmm6,-0xa8(%r11) # offload everything 1791 movaps %xmm7,-0x98(%r11) 1792 movaps %xmm8,-0x88(%r11) 1793 movaps %xmm9,-0x78(%r11) 1794 movaps %xmm10,-0x68(%r11) 1795 movaps %xmm11,-0x58(%r11) 1796 movaps %xmm12,-0x48(%r11) 1797 movaps %xmm13,-0x38(%r11) 1798 movaps %xmm14,-0x28(%r11) 1799 movaps %xmm15,-0x18(%r11) 1800.Lxts_enc_body: 1801___ 1802$code.=<<___; 1803 movups ($ivp),$inout0 # load clear-text tweak 1804 mov 240(%r8),$rounds # key2->rounds 1805 mov 240($key),$rnds_ # key1->rounds 1806___ 1807 # generate the tweak 1808 &aesni_generate1("enc",$key2,$rounds,$inout0); 1809$code.=<<___; 1810 $movkey ($key),$rndkey0 # zero round key 1811 mov $key,$key_ # backup $key 1812 mov $rnds_,$rounds # backup $rounds 1813 shl \$4,$rnds_ 1814 mov $len,$len_ # backup $len 1815 and \$-16,$len 1816 1817 $movkey 16($key,$rnds_),$rndkey1 # last round key 1818 1819 movdqa .Lxts_magic(%rip),$twmask 1820 movdqa $inout0,@tweak[5] 1821 pshufd \$0x5f,$inout0,$twres 1822 pxor $rndkey0,$rndkey1 1823___ 1824 # alternative tweak calculation algorithm is based on suggestions 1825 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions 1826 # and should help in the future... 1827 for ($i=0;$i<4;$i++) { 1828 $code.=<<___; 1829 movdqa $twres,$twtmp 1830 paddd $twres,$twres 1831 movdqa @tweak[5],@tweak[$i] 1832 psrad \$31,$twtmp # broadcast upper bits 1833 paddq @tweak[5],@tweak[5] 1834 pand $twmask,$twtmp 1835 pxor $rndkey0,@tweak[$i] 1836 pxor $twtmp,@tweak[5] 1837___ 1838 } 1839$code.=<<___; 1840 movdqa @tweak[5],@tweak[4] 1841 psrad \$31,$twres 1842 paddq @tweak[5],@tweak[5] 1843 pand $twmask,$twres 1844 pxor $rndkey0,@tweak[4] 1845 pxor $twres,@tweak[5] 1846 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 1847 1848 sub \$16*6,$len 1849 jc .Lxts_enc_short # if $len-=6*16 borrowed 1850 1851 mov \$16+96,$rounds 1852 lea 32($key_,$rnds_),$key # end of key schedule 1853 sub %r10,%rax # twisted $rounds 1854 $movkey 16($key_),$rndkey1 1855 mov %rax,%r10 # backup twisted $rounds 1856 lea .Lxts_magic(%rip),%r8 1857 jmp .Lxts_enc_grandloop 1858 1859.align 32 1860.Lxts_enc_grandloop: 1861 movdqu `16*0`($inp),$inout0 # load input 1862 movdqa $rndkey0,$twmask 1863 movdqu `16*1`($inp),$inout1 1864 pxor @tweak[0],$inout0 # input^=tweak^round[0] 1865 movdqu `16*2`($inp),$inout2 1866 pxor @tweak[1],$inout1 1867 aesenc $rndkey1,$inout0 1868 movdqu `16*3`($inp),$inout3 1869 pxor @tweak[2],$inout2 1870 aesenc $rndkey1,$inout1 1871 movdqu `16*4`($inp),$inout4 1872 pxor @tweak[3],$inout3 1873 aesenc $rndkey1,$inout2 1874 movdqu `16*5`($inp),$inout5 1875 pxor @tweak[5],$twmask # round[0]^=tweak[5] 1876 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 1877 pxor @tweak[4],$inout4 1878 aesenc $rndkey1,$inout3 1879 $movkey 32($key_),$rndkey0 1880 lea `16*6`($inp),$inp 1881 pxor $twmask,$inout5 1882 1883 pxor $twres,@tweak[0] # calculate tweaks^round[last] 1884 aesenc $rndkey1,$inout4 1885 pxor $twres,@tweak[1] 1886 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last] 1887 aesenc $rndkey1,$inout5 1888 $movkey 48($key_),$rndkey1 1889 pxor $twres,@tweak[2] 1890 1891 aesenc $rndkey0,$inout0 1892 pxor $twres,@tweak[3] 1893 movdqa @tweak[1],`16*1`(%rsp) 1894 aesenc $rndkey0,$inout1 1895 pxor $twres,@tweak[4] 1896 movdqa @tweak[2],`16*2`(%rsp) 1897 aesenc $rndkey0,$inout2 1898 aesenc $rndkey0,$inout3 1899 pxor $twres,$twmask 1900 movdqa @tweak[4],`16*4`(%rsp) 1901 aesenc $rndkey0,$inout4 1902 aesenc $rndkey0,$inout5 1903 $movkey 64($key_),$rndkey0 1904 movdqa $twmask,`16*5`(%rsp) 1905 pshufd \$0x5f,@tweak[5],$twres 1906 jmp .Lxts_enc_loop6 1907.align 32 1908.Lxts_enc_loop6: 1909 aesenc $rndkey1,$inout0 1910 aesenc $rndkey1,$inout1 1911 aesenc $rndkey1,$inout2 1912 aesenc $rndkey1,$inout3 1913 aesenc $rndkey1,$inout4 1914 aesenc $rndkey1,$inout5 1915 $movkey -64($key,%rax),$rndkey1 1916 add \$32,%rax 1917 1918 aesenc $rndkey0,$inout0 1919 aesenc $rndkey0,$inout1 1920 aesenc $rndkey0,$inout2 1921 aesenc $rndkey0,$inout3 1922 aesenc $rndkey0,$inout4 1923 aesenc $rndkey0,$inout5 1924 $movkey -80($key,%rax),$rndkey0 1925 jnz .Lxts_enc_loop6 1926 1927 movdqa (%r8),$twmask # start calculating next tweak 1928 movdqa $twres,$twtmp 1929 paddd $twres,$twres 1930 aesenc $rndkey1,$inout0 1931 paddq @tweak[5],@tweak[5] 1932 psrad \$31,$twtmp 1933 aesenc $rndkey1,$inout1 1934 pand $twmask,$twtmp 1935 $movkey ($key_),@tweak[0] # load round[0] 1936 aesenc $rndkey1,$inout2 1937 aesenc $rndkey1,$inout3 1938 aesenc $rndkey1,$inout4 1939 pxor $twtmp,@tweak[5] 1940 movaps @tweak[0],@tweak[1] # copy round[0] 1941 aesenc $rndkey1,$inout5 1942 $movkey -64($key),$rndkey1 1943 1944 movdqa $twres,$twtmp 1945 aesenc $rndkey0,$inout0 1946 paddd $twres,$twres 1947 pxor @tweak[5],@tweak[0] 1948 aesenc $rndkey0,$inout1 1949 psrad \$31,$twtmp 1950 paddq @tweak[5],@tweak[5] 1951 aesenc $rndkey0,$inout2 1952 aesenc $rndkey0,$inout3 1953 pand $twmask,$twtmp 1954 movaps @tweak[1],@tweak[2] 1955 aesenc $rndkey0,$inout4 1956 pxor $twtmp,@tweak[5] 1957 movdqa $twres,$twtmp 1958 aesenc $rndkey0,$inout5 1959 $movkey -48($key),$rndkey0 1960 1961 paddd $twres,$twres 1962 aesenc $rndkey1,$inout0 1963 pxor @tweak[5],@tweak[1] 1964 psrad \$31,$twtmp 1965 aesenc $rndkey1,$inout1 1966 paddq @tweak[5],@tweak[5] 1967 pand $twmask,$twtmp 1968 aesenc $rndkey1,$inout2 1969 aesenc $rndkey1,$inout3 1970 movdqa @tweak[3],`16*3`(%rsp) 1971 pxor $twtmp,@tweak[5] 1972 aesenc $rndkey1,$inout4 1973 movaps @tweak[2],@tweak[3] 1974 movdqa $twres,$twtmp 1975 aesenc $rndkey1,$inout5 1976 $movkey -32($key),$rndkey1 1977 1978 paddd $twres,$twres 1979 aesenc $rndkey0,$inout0 1980 pxor @tweak[5],@tweak[2] 1981 psrad \$31,$twtmp 1982 aesenc $rndkey0,$inout1 1983 paddq @tweak[5],@tweak[5] 1984 pand $twmask,$twtmp 1985 aesenc $rndkey0,$inout2 1986 aesenc $rndkey0,$inout3 1987 aesenc $rndkey0,$inout4 1988 pxor $twtmp,@tweak[5] 1989 movaps @tweak[3],@tweak[4] 1990 aesenc $rndkey0,$inout5 1991 1992 movdqa $twres,$rndkey0 1993 paddd $twres,$twres 1994 aesenc $rndkey1,$inout0 1995 pxor @tweak[5],@tweak[3] 1996 psrad \$31,$rndkey0 1997 aesenc $rndkey1,$inout1 1998 paddq @tweak[5],@tweak[5] 1999 pand $twmask,$rndkey0 2000 aesenc $rndkey1,$inout2 2001 aesenc $rndkey1,$inout3 2002 pxor $rndkey0,@tweak[5] 2003 $movkey ($key_),$rndkey0 2004 aesenc $rndkey1,$inout4 2005 aesenc $rndkey1,$inout5 2006 $movkey 16($key_),$rndkey1 2007 2008 pxor @tweak[5],@tweak[4] 2009 aesenclast `16*0`(%rsp),$inout0 2010 psrad \$31,$twres 2011 paddq @tweak[5],@tweak[5] 2012 aesenclast `16*1`(%rsp),$inout1 2013 aesenclast `16*2`(%rsp),$inout2 2014 pand $twmask,$twres 2015 mov %r10,%rax # restore $rounds 2016 aesenclast `16*3`(%rsp),$inout3 2017 aesenclast `16*4`(%rsp),$inout4 2018 aesenclast `16*5`(%rsp),$inout5 2019 pxor $twres,@tweak[5] 2020 2021 lea `16*6`($out),$out # $out+=6*16 2022 movups $inout0,`-16*6`($out) # store 6 output blocks 2023 movups $inout1,`-16*5`($out) 2024 movups $inout2,`-16*4`($out) 2025 movups $inout3,`-16*3`($out) 2026 movups $inout4,`-16*2`($out) 2027 movups $inout5,`-16*1`($out) 2028 sub \$16*6,$len 2029 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow 2030 2031 mov \$16+96,$rounds 2032 sub $rnds_,$rounds 2033 mov $key_,$key # restore $key 2034 shr \$4,$rounds # restore original value 2035 2036.Lxts_enc_short: 2037 # at the point @tweak[0..5] are populated with tweak values 2038 mov $rounds,$rnds_ # backup $rounds 2039 pxor $rndkey0,@tweak[0] 2040 add \$16*6,$len # restore real remaining $len 2041 jz .Lxts_enc_done # done if ($len==0) 2042 2043 pxor $rndkey0,@tweak[1] 2044 cmp \$0x20,$len 2045 jb .Lxts_enc_one # $len is 1*16 2046 pxor $rndkey0,@tweak[2] 2047 je .Lxts_enc_two # $len is 2*16 2048 2049 pxor $rndkey0,@tweak[3] 2050 cmp \$0x40,$len 2051 jb .Lxts_enc_three # $len is 3*16 2052 pxor $rndkey0,@tweak[4] 2053 je .Lxts_enc_four # $len is 4*16 2054 2055 movdqu ($inp),$inout0 # $len is 5*16 2056 movdqu 16*1($inp),$inout1 2057 movdqu 16*2($inp),$inout2 2058 pxor @tweak[0],$inout0 2059 movdqu 16*3($inp),$inout3 2060 pxor @tweak[1],$inout1 2061 movdqu 16*4($inp),$inout4 2062 lea 16*5($inp),$inp # $inp+=5*16 2063 pxor @tweak[2],$inout2 2064 pxor @tweak[3],$inout3 2065 pxor @tweak[4],$inout4 2066 pxor $inout5,$inout5 2067 2068 call _aesni_encrypt6 2069 2070 xorps @tweak[0],$inout0 2071 movdqa @tweak[5],@tweak[0] 2072 xorps @tweak[1],$inout1 2073 xorps @tweak[2],$inout2 2074 movdqu $inout0,($out) # store 5 output blocks 2075 xorps @tweak[3],$inout3 2076 movdqu $inout1,16*1($out) 2077 xorps @tweak[4],$inout4 2078 movdqu $inout2,16*2($out) 2079 movdqu $inout3,16*3($out) 2080 movdqu $inout4,16*4($out) 2081 lea 16*5($out),$out # $out+=5*16 2082 jmp .Lxts_enc_done 2083 2084.align 16 2085.Lxts_enc_one: 2086 movups ($inp),$inout0 2087 lea 16*1($inp),$inp # inp+=1*16 2088 xorps @tweak[0],$inout0 2089___ 2090 &aesni_generate1("enc",$key,$rounds); 2091$code.=<<___; 2092 xorps @tweak[0],$inout0 2093 movdqa @tweak[1],@tweak[0] 2094 movups $inout0,($out) # store one output block 2095 lea 16*1($out),$out # $out+=1*16 2096 jmp .Lxts_enc_done 2097 2098.align 16 2099.Lxts_enc_two: 2100 movups ($inp),$inout0 2101 movups 16($inp),$inout1 2102 lea 32($inp),$inp # $inp+=2*16 2103 xorps @tweak[0],$inout0 2104 xorps @tweak[1],$inout1 2105 2106 call _aesni_encrypt2 2107 2108 xorps @tweak[0],$inout0 2109 movdqa @tweak[2],@tweak[0] 2110 xorps @tweak[1],$inout1 2111 movups $inout0,($out) # store 2 output blocks 2112 movups $inout1,16*1($out) 2113 lea 16*2($out),$out # $out+=2*16 2114 jmp .Lxts_enc_done 2115 2116.align 16 2117.Lxts_enc_three: 2118 movups ($inp),$inout0 2119 movups 16*1($inp),$inout1 2120 movups 16*2($inp),$inout2 2121 lea 16*3($inp),$inp # $inp+=3*16 2122 xorps @tweak[0],$inout0 2123 xorps @tweak[1],$inout1 2124 xorps @tweak[2],$inout2 2125 2126 call _aesni_encrypt3 2127 2128 xorps @tweak[0],$inout0 2129 movdqa @tweak[3],@tweak[0] 2130 xorps @tweak[1],$inout1 2131 xorps @tweak[2],$inout2 2132 movups $inout0,($out) # store 3 output blocks 2133 movups $inout1,16*1($out) 2134 movups $inout2,16*2($out) 2135 lea 16*3($out),$out # $out+=3*16 2136 jmp .Lxts_enc_done 2137 2138.align 16 2139.Lxts_enc_four: 2140 movups ($inp),$inout0 2141 movups 16*1($inp),$inout1 2142 movups 16*2($inp),$inout2 2143 xorps @tweak[0],$inout0 2144 movups 16*3($inp),$inout3 2145 lea 16*4($inp),$inp # $inp+=4*16 2146 xorps @tweak[1],$inout1 2147 xorps @tweak[2],$inout2 2148 xorps @tweak[3],$inout3 2149 2150 call _aesni_encrypt4 2151 2152 pxor @tweak[0],$inout0 2153 movdqa @tweak[4],@tweak[0] 2154 pxor @tweak[1],$inout1 2155 pxor @tweak[2],$inout2 2156 movdqu $inout0,($out) # store 4 output blocks 2157 pxor @tweak[3],$inout3 2158 movdqu $inout1,16*1($out) 2159 movdqu $inout2,16*2($out) 2160 movdqu $inout3,16*3($out) 2161 lea 16*4($out),$out # $out+=4*16 2162 jmp .Lxts_enc_done 2163 2164.align 16 2165.Lxts_enc_done: 2166 and \$15,$len_ # see if $len%16 is 0 2167 jz .Lxts_enc_ret 2168 mov $len_,$len 2169 2170.Lxts_enc_steal: 2171 movzb ($inp),%eax # borrow $rounds ... 2172 movzb -16($out),%ecx # ... and $key 2173 lea 1($inp),$inp 2174 mov %al,-16($out) 2175 mov %cl,0($out) 2176 lea 1($out),$out 2177 sub \$1,$len 2178 jnz .Lxts_enc_steal 2179 2180 sub $len_,$out # rewind $out 2181 mov $key_,$key # restore $key 2182 mov $rnds_,$rounds # restore $rounds 2183 2184 movups -16($out),$inout0 2185 xorps @tweak[0],$inout0 2186___ 2187 &aesni_generate1("enc",$key,$rounds); 2188$code.=<<___; 2189 xorps @tweak[0],$inout0 2190 movups $inout0,-16($out) 2191 2192.Lxts_enc_ret: 2193 xorps %xmm0,%xmm0 # clear register bank 2194 pxor %xmm1,%xmm1 2195 pxor %xmm2,%xmm2 2196 pxor %xmm3,%xmm3 2197 pxor %xmm4,%xmm4 2198 pxor %xmm5,%xmm5 2199___ 2200$code.=<<___ if (!$win64); 2201 pxor %xmm6,%xmm6 2202 pxor %xmm7,%xmm7 2203 movaps %xmm0,0x00(%rsp) # clear stack 2204 pxor %xmm8,%xmm8 2205 movaps %xmm0,0x10(%rsp) 2206 pxor %xmm9,%xmm9 2207 movaps %xmm0,0x20(%rsp) 2208 pxor %xmm10,%xmm10 2209 movaps %xmm0,0x30(%rsp) 2210 pxor %xmm11,%xmm11 2211 movaps %xmm0,0x40(%rsp) 2212 pxor %xmm12,%xmm12 2213 movaps %xmm0,0x50(%rsp) 2214 pxor %xmm13,%xmm13 2215 movaps %xmm0,0x60(%rsp) 2216 pxor %xmm14,%xmm14 2217 pxor %xmm15,%xmm15 2218___ 2219$code.=<<___ if ($win64); 2220 movaps -0xa8(%r11),%xmm6 2221 movaps %xmm0,-0xa8(%r11) # clear stack 2222 movaps -0x98(%r11),%xmm7 2223 movaps %xmm0,-0x98(%r11) 2224 movaps -0x88(%r11),%xmm8 2225 movaps %xmm0,-0x88(%r11) 2226 movaps -0x78(%r11),%xmm9 2227 movaps %xmm0,-0x78(%r11) 2228 movaps -0x68(%r11),%xmm10 2229 movaps %xmm0,-0x68(%r11) 2230 movaps -0x58(%r11),%xmm11 2231 movaps %xmm0,-0x58(%r11) 2232 movaps -0x48(%r11),%xmm12 2233 movaps %xmm0,-0x48(%r11) 2234 movaps -0x38(%r11),%xmm13 2235 movaps %xmm0,-0x38(%r11) 2236 movaps -0x28(%r11),%xmm14 2237 movaps %xmm0,-0x28(%r11) 2238 movaps -0x18(%r11),%xmm15 2239 movaps %xmm0,-0x18(%r11) 2240 movaps %xmm0,0x00(%rsp) 2241 movaps %xmm0,0x10(%rsp) 2242 movaps %xmm0,0x20(%rsp) 2243 movaps %xmm0,0x30(%rsp) 2244 movaps %xmm0,0x40(%rsp) 2245 movaps %xmm0,0x50(%rsp) 2246 movaps %xmm0,0x60(%rsp) 2247___ 2248$code.=<<___; 2249 mov -8(%r11),%rbp 2250.cfi_restore %rbp 2251 lea (%r11),%rsp 2252.cfi_def_cfa_register %rsp 2253.Lxts_enc_epilogue: 2254 ret 2255.cfi_endproc 2256.size ${PREFIX}_xts_encrypt,.-${PREFIX}_xts_encrypt 2257___ 2258 2259$code.=<<___; 2260.globl ${PREFIX}_xts_decrypt 2261.type ${PREFIX}_xts_decrypt,\@function,6 2262.align 16 2263${PREFIX}_xts_decrypt: 2264.cfi_startproc 2265 lea (%rsp),%r11 # frame pointer 2266.cfi_def_cfa_register %r11 2267 push %rbp 2268.cfi_push %rbp 2269 sub \$$frame_size,%rsp 2270 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 2271___ 2272$code.=<<___ if ($win64); 2273 movaps %xmm6,-0xa8(%r11) # offload everything 2274 movaps %xmm7,-0x98(%r11) 2275 movaps %xmm8,-0x88(%r11) 2276 movaps %xmm9,-0x78(%r11) 2277 movaps %xmm10,-0x68(%r11) 2278 movaps %xmm11,-0x58(%r11) 2279 movaps %xmm12,-0x48(%r11) 2280 movaps %xmm13,-0x38(%r11) 2281 movaps %xmm14,-0x28(%r11) 2282 movaps %xmm15,-0x18(%r11) 2283.Lxts_dec_body: 2284___ 2285$code.=<<___; 2286 movups ($ivp),$inout0 # load clear-text tweak 2287 mov 240($key2),$rounds # key2->rounds 2288 mov 240($key),$rnds_ # key1->rounds 2289___ 2290 # generate the tweak 2291 &aesni_generate1("enc",$key2,$rounds,$inout0); 2292$code.=<<___; 2293 xor %eax,%eax # if ($len%16) len-=16; 2294 test \$15,$len 2295 setnz %al 2296 shl \$4,%rax 2297 sub %rax,$len 2298 2299 $movkey ($key),$rndkey0 # zero round key 2300 mov $key,$key_ # backup $key 2301 mov $rnds_,$rounds # backup $rounds 2302 shl \$4,$rnds_ 2303 mov $len,$len_ # backup $len 2304 and \$-16,$len 2305 2306 $movkey 16($key,$rnds_),$rndkey1 # last round key 2307 2308 movdqa .Lxts_magic(%rip),$twmask 2309 movdqa $inout0,@tweak[5] 2310 pshufd \$0x5f,$inout0,$twres 2311 pxor $rndkey0,$rndkey1 2312___ 2313 for ($i=0;$i<4;$i++) { 2314 $code.=<<___; 2315 movdqa $twres,$twtmp 2316 paddd $twres,$twres 2317 movdqa @tweak[5],@tweak[$i] 2318 psrad \$31,$twtmp # broadcast upper bits 2319 paddq @tweak[5],@tweak[5] 2320 pand $twmask,$twtmp 2321 pxor $rndkey0,@tweak[$i] 2322 pxor $twtmp,@tweak[5] 2323___ 2324 } 2325$code.=<<___; 2326 movdqa @tweak[5],@tweak[4] 2327 psrad \$31,$twres 2328 paddq @tweak[5],@tweak[5] 2329 pand $twmask,$twres 2330 pxor $rndkey0,@tweak[4] 2331 pxor $twres,@tweak[5] 2332 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 2333 2334 sub \$16*6,$len 2335 jc .Lxts_dec_short # if $len-=6*16 borrowed 2336 2337 mov \$16+96,$rounds 2338 lea 32($key_,$rnds_),$key # end of key schedule 2339 sub %r10,%rax # twisted $rounds 2340 $movkey 16($key_),$rndkey1 2341 mov %rax,%r10 # backup twisted $rounds 2342 lea .Lxts_magic(%rip),%r8 2343 jmp .Lxts_dec_grandloop 2344 2345.align 32 2346.Lxts_dec_grandloop: 2347 movdqu `16*0`($inp),$inout0 # load input 2348 movdqa $rndkey0,$twmask 2349 movdqu `16*1`($inp),$inout1 2350 pxor @tweak[0],$inout0 # intput^=tweak^round[0] 2351 movdqu `16*2`($inp),$inout2 2352 pxor @tweak[1],$inout1 2353 aesdec $rndkey1,$inout0 2354 movdqu `16*3`($inp),$inout3 2355 pxor @tweak[2],$inout2 2356 aesdec $rndkey1,$inout1 2357 movdqu `16*4`($inp),$inout4 2358 pxor @tweak[3],$inout3 2359 aesdec $rndkey1,$inout2 2360 movdqu `16*5`($inp),$inout5 2361 pxor @tweak[5],$twmask # round[0]^=tweak[5] 2362 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 2363 pxor @tweak[4],$inout4 2364 aesdec $rndkey1,$inout3 2365 $movkey 32($key_),$rndkey0 2366 lea `16*6`($inp),$inp 2367 pxor $twmask,$inout5 2368 2369 pxor $twres,@tweak[0] # calculate tweaks^round[last] 2370 aesdec $rndkey1,$inout4 2371 pxor $twres,@tweak[1] 2372 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key 2373 aesdec $rndkey1,$inout5 2374 $movkey 48($key_),$rndkey1 2375 pxor $twres,@tweak[2] 2376 2377 aesdec $rndkey0,$inout0 2378 pxor $twres,@tweak[3] 2379 movdqa @tweak[1],`16*1`(%rsp) 2380 aesdec $rndkey0,$inout1 2381 pxor $twres,@tweak[4] 2382 movdqa @tweak[2],`16*2`(%rsp) 2383 aesdec $rndkey0,$inout2 2384 aesdec $rndkey0,$inout3 2385 pxor $twres,$twmask 2386 movdqa @tweak[4],`16*4`(%rsp) 2387 aesdec $rndkey0,$inout4 2388 aesdec $rndkey0,$inout5 2389 $movkey 64($key_),$rndkey0 2390 movdqa $twmask,`16*5`(%rsp) 2391 pshufd \$0x5f,@tweak[5],$twres 2392 jmp .Lxts_dec_loop6 2393.align 32 2394.Lxts_dec_loop6: 2395 aesdec $rndkey1,$inout0 2396 aesdec $rndkey1,$inout1 2397 aesdec $rndkey1,$inout2 2398 aesdec $rndkey1,$inout3 2399 aesdec $rndkey1,$inout4 2400 aesdec $rndkey1,$inout5 2401 $movkey -64($key,%rax),$rndkey1 2402 add \$32,%rax 2403 2404 aesdec $rndkey0,$inout0 2405 aesdec $rndkey0,$inout1 2406 aesdec $rndkey0,$inout2 2407 aesdec $rndkey0,$inout3 2408 aesdec $rndkey0,$inout4 2409 aesdec $rndkey0,$inout5 2410 $movkey -80($key,%rax),$rndkey0 2411 jnz .Lxts_dec_loop6 2412 2413 movdqa (%r8),$twmask # start calculating next tweak 2414 movdqa $twres,$twtmp 2415 paddd $twres,$twres 2416 aesdec $rndkey1,$inout0 2417 paddq @tweak[5],@tweak[5] 2418 psrad \$31,$twtmp 2419 aesdec $rndkey1,$inout1 2420 pand $twmask,$twtmp 2421 $movkey ($key_),@tweak[0] # load round[0] 2422 aesdec $rndkey1,$inout2 2423 aesdec $rndkey1,$inout3 2424 aesdec $rndkey1,$inout4 2425 pxor $twtmp,@tweak[5] 2426 movaps @tweak[0],@tweak[1] # copy round[0] 2427 aesdec $rndkey1,$inout5 2428 $movkey -64($key),$rndkey1 2429 2430 movdqa $twres,$twtmp 2431 aesdec $rndkey0,$inout0 2432 paddd $twres,$twres 2433 pxor @tweak[5],@tweak[0] 2434 aesdec $rndkey0,$inout1 2435 psrad \$31,$twtmp 2436 paddq @tweak[5],@tweak[5] 2437 aesdec $rndkey0,$inout2 2438 aesdec $rndkey0,$inout3 2439 pand $twmask,$twtmp 2440 movaps @tweak[1],@tweak[2] 2441 aesdec $rndkey0,$inout4 2442 pxor $twtmp,@tweak[5] 2443 movdqa $twres,$twtmp 2444 aesdec $rndkey0,$inout5 2445 $movkey -48($key),$rndkey0 2446 2447 paddd $twres,$twres 2448 aesdec $rndkey1,$inout0 2449 pxor @tweak[5],@tweak[1] 2450 psrad \$31,$twtmp 2451 aesdec $rndkey1,$inout1 2452 paddq @tweak[5],@tweak[5] 2453 pand $twmask,$twtmp 2454 aesdec $rndkey1,$inout2 2455 aesdec $rndkey1,$inout3 2456 movdqa @tweak[3],`16*3`(%rsp) 2457 pxor $twtmp,@tweak[5] 2458 aesdec $rndkey1,$inout4 2459 movaps @tweak[2],@tweak[3] 2460 movdqa $twres,$twtmp 2461 aesdec $rndkey1,$inout5 2462 $movkey -32($key),$rndkey1 2463 2464 paddd $twres,$twres 2465 aesdec $rndkey0,$inout0 2466 pxor @tweak[5],@tweak[2] 2467 psrad \$31,$twtmp 2468 aesdec $rndkey0,$inout1 2469 paddq @tweak[5],@tweak[5] 2470 pand $twmask,$twtmp 2471 aesdec $rndkey0,$inout2 2472 aesdec $rndkey0,$inout3 2473 aesdec $rndkey0,$inout4 2474 pxor $twtmp,@tweak[5] 2475 movaps @tweak[3],@tweak[4] 2476 aesdec $rndkey0,$inout5 2477 2478 movdqa $twres,$rndkey0 2479 paddd $twres,$twres 2480 aesdec $rndkey1,$inout0 2481 pxor @tweak[5],@tweak[3] 2482 psrad \$31,$rndkey0 2483 aesdec $rndkey1,$inout1 2484 paddq @tweak[5],@tweak[5] 2485 pand $twmask,$rndkey0 2486 aesdec $rndkey1,$inout2 2487 aesdec $rndkey1,$inout3 2488 pxor $rndkey0,@tweak[5] 2489 $movkey ($key_),$rndkey0 2490 aesdec $rndkey1,$inout4 2491 aesdec $rndkey1,$inout5 2492 $movkey 16($key_),$rndkey1 2493 2494 pxor @tweak[5],@tweak[4] 2495 aesdeclast `16*0`(%rsp),$inout0 2496 psrad \$31,$twres 2497 paddq @tweak[5],@tweak[5] 2498 aesdeclast `16*1`(%rsp),$inout1 2499 aesdeclast `16*2`(%rsp),$inout2 2500 pand $twmask,$twres 2501 mov %r10,%rax # restore $rounds 2502 aesdeclast `16*3`(%rsp),$inout3 2503 aesdeclast `16*4`(%rsp),$inout4 2504 aesdeclast `16*5`(%rsp),$inout5 2505 pxor $twres,@tweak[5] 2506 2507 lea `16*6`($out),$out # $out+=6*16 2508 movups $inout0,`-16*6`($out) # store 6 output blocks 2509 movups $inout1,`-16*5`($out) 2510 movups $inout2,`-16*4`($out) 2511 movups $inout3,`-16*3`($out) 2512 movups $inout4,`-16*2`($out) 2513 movups $inout5,`-16*1`($out) 2514 sub \$16*6,$len 2515 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow 2516 2517 mov \$16+96,$rounds 2518 sub $rnds_,$rounds 2519 mov $key_,$key # restore $key 2520 shr \$4,$rounds # restore original value 2521 2522.Lxts_dec_short: 2523 # at the point @tweak[0..5] are populated with tweak values 2524 mov $rounds,$rnds_ # backup $rounds 2525 pxor $rndkey0,@tweak[0] 2526 pxor $rndkey0,@tweak[1] 2527 add \$16*6,$len # restore real remaining $len 2528 jz .Lxts_dec_done # done if ($len==0) 2529 2530 pxor $rndkey0,@tweak[2] 2531 cmp \$0x20,$len 2532 jb .Lxts_dec_one # $len is 1*16 2533 pxor $rndkey0,@tweak[3] 2534 je .Lxts_dec_two # $len is 2*16 2535 2536 pxor $rndkey0,@tweak[4] 2537 cmp \$0x40,$len 2538 jb .Lxts_dec_three # $len is 3*16 2539 je .Lxts_dec_four # $len is 4*16 2540 2541 movdqu ($inp),$inout0 # $len is 5*16 2542 movdqu 16*1($inp),$inout1 2543 movdqu 16*2($inp),$inout2 2544 pxor @tweak[0],$inout0 2545 movdqu 16*3($inp),$inout3 2546 pxor @tweak[1],$inout1 2547 movdqu 16*4($inp),$inout4 2548 lea 16*5($inp),$inp # $inp+=5*16 2549 pxor @tweak[2],$inout2 2550 pxor @tweak[3],$inout3 2551 pxor @tweak[4],$inout4 2552 2553 call _aesni_decrypt6 2554 2555 xorps @tweak[0],$inout0 2556 xorps @tweak[1],$inout1 2557 xorps @tweak[2],$inout2 2558 movdqu $inout0,($out) # store 5 output blocks 2559 xorps @tweak[3],$inout3 2560 movdqu $inout1,16*1($out) 2561 xorps @tweak[4],$inout4 2562 movdqu $inout2,16*2($out) 2563 pxor $twtmp,$twtmp 2564 movdqu $inout3,16*3($out) 2565 pcmpgtd @tweak[5],$twtmp 2566 movdqu $inout4,16*4($out) 2567 lea 16*5($out),$out # $out+=5*16 2568 pshufd \$0x13,$twtmp,@tweak[1] # $twres 2569 and \$15,$len_ 2570 jz .Lxts_dec_ret 2571 2572 movdqa @tweak[5],@tweak[0] 2573 paddq @tweak[5],@tweak[5] # psllq 1,$tweak 2574 pand $twmask,@tweak[1] # isolate carry and residue 2575 pxor @tweak[5],@tweak[1] 2576 jmp .Lxts_dec_done2 2577 2578.align 16 2579.Lxts_dec_one: 2580 movups ($inp),$inout0 2581 lea 16*1($inp),$inp # $inp+=1*16 2582 xorps @tweak[0],$inout0 2583___ 2584 &aesni_generate1("dec",$key,$rounds); 2585$code.=<<___; 2586 xorps @tweak[0],$inout0 2587 movdqa @tweak[1],@tweak[0] 2588 movups $inout0,($out) # store one output block 2589 movdqa @tweak[2],@tweak[1] 2590 lea 16*1($out),$out # $out+=1*16 2591 jmp .Lxts_dec_done 2592 2593.align 16 2594.Lxts_dec_two: 2595 movups ($inp),$inout0 2596 movups 16($inp),$inout1 2597 lea 32($inp),$inp # $inp+=2*16 2598 xorps @tweak[0],$inout0 2599 xorps @tweak[1],$inout1 2600 2601 call _aesni_decrypt2 2602 2603 xorps @tweak[0],$inout0 2604 movdqa @tweak[2],@tweak[0] 2605 xorps @tweak[1],$inout1 2606 movdqa @tweak[3],@tweak[1] 2607 movups $inout0,($out) # store 2 output blocks 2608 movups $inout1,16*1($out) 2609 lea 16*2($out),$out # $out+=2*16 2610 jmp .Lxts_dec_done 2611 2612.align 16 2613.Lxts_dec_three: 2614 movups ($inp),$inout0 2615 movups 16*1($inp),$inout1 2616 movups 16*2($inp),$inout2 2617 lea 16*3($inp),$inp # $inp+=3*16 2618 xorps @tweak[0],$inout0 2619 xorps @tweak[1],$inout1 2620 xorps @tweak[2],$inout2 2621 2622 call _aesni_decrypt3 2623 2624 xorps @tweak[0],$inout0 2625 movdqa @tweak[3],@tweak[0] 2626 xorps @tweak[1],$inout1 2627 movdqa @tweak[4],@tweak[1] 2628 xorps @tweak[2],$inout2 2629 movups $inout0,($out) # store 3 output blocks 2630 movups $inout1,16*1($out) 2631 movups $inout2,16*2($out) 2632 lea 16*3($out),$out # $out+=3*16 2633 jmp .Lxts_dec_done 2634 2635.align 16 2636.Lxts_dec_four: 2637 movups ($inp),$inout0 2638 movups 16*1($inp),$inout1 2639 movups 16*2($inp),$inout2 2640 xorps @tweak[0],$inout0 2641 movups 16*3($inp),$inout3 2642 lea 16*4($inp),$inp # $inp+=4*16 2643 xorps @tweak[1],$inout1 2644 xorps @tweak[2],$inout2 2645 xorps @tweak[3],$inout3 2646 2647 call _aesni_decrypt4 2648 2649 pxor @tweak[0],$inout0 2650 movdqa @tweak[4],@tweak[0] 2651 pxor @tweak[1],$inout1 2652 movdqa @tweak[5],@tweak[1] 2653 pxor @tweak[2],$inout2 2654 movdqu $inout0,($out) # store 4 output blocks 2655 pxor @tweak[3],$inout3 2656 movdqu $inout1,16*1($out) 2657 movdqu $inout2,16*2($out) 2658 movdqu $inout3,16*3($out) 2659 lea 16*4($out),$out # $out+=4*16 2660 jmp .Lxts_dec_done 2661 2662.align 16 2663.Lxts_dec_done: 2664 and \$15,$len_ # see if $len%16 is 0 2665 jz .Lxts_dec_ret 2666.Lxts_dec_done2: 2667 mov $len_,$len 2668 mov $key_,$key # restore $key 2669 mov $rnds_,$rounds # restore $rounds 2670 2671 movups ($inp),$inout0 2672 xorps @tweak[1],$inout0 2673___ 2674 &aesni_generate1("dec",$key,$rounds); 2675$code.=<<___; 2676 xorps @tweak[1],$inout0 2677 movups $inout0,($out) 2678 2679.Lxts_dec_steal: 2680 movzb 16($inp),%eax # borrow $rounds ... 2681 movzb ($out),%ecx # ... and $key 2682 lea 1($inp),$inp 2683 mov %al,($out) 2684 mov %cl,16($out) 2685 lea 1($out),$out 2686 sub \$1,$len 2687 jnz .Lxts_dec_steal 2688 2689 sub $len_,$out # rewind $out 2690 mov $key_,$key # restore $key 2691 mov $rnds_,$rounds # restore $rounds 2692 2693 movups ($out),$inout0 2694 xorps @tweak[0],$inout0 2695___ 2696 &aesni_generate1("dec",$key,$rounds); 2697$code.=<<___; 2698 xorps @tweak[0],$inout0 2699 movups $inout0,($out) 2700 2701.Lxts_dec_ret: 2702 xorps %xmm0,%xmm0 # clear register bank 2703 pxor %xmm1,%xmm1 2704 pxor %xmm2,%xmm2 2705 pxor %xmm3,%xmm3 2706 pxor %xmm4,%xmm4 2707 pxor %xmm5,%xmm5 2708___ 2709$code.=<<___ if (!$win64); 2710 pxor %xmm6,%xmm6 2711 pxor %xmm7,%xmm7 2712 movaps %xmm0,0x00(%rsp) # clear stack 2713 pxor %xmm8,%xmm8 2714 movaps %xmm0,0x10(%rsp) 2715 pxor %xmm9,%xmm9 2716 movaps %xmm0,0x20(%rsp) 2717 pxor %xmm10,%xmm10 2718 movaps %xmm0,0x30(%rsp) 2719 pxor %xmm11,%xmm11 2720 movaps %xmm0,0x40(%rsp) 2721 pxor %xmm12,%xmm12 2722 movaps %xmm0,0x50(%rsp) 2723 pxor %xmm13,%xmm13 2724 movaps %xmm0,0x60(%rsp) 2725 pxor %xmm14,%xmm14 2726 pxor %xmm15,%xmm15 2727___ 2728$code.=<<___ if ($win64); 2729 movaps -0xa8(%r11),%xmm6 2730 movaps %xmm0,-0xa8(%r11) # clear stack 2731 movaps -0x98(%r11),%xmm7 2732 movaps %xmm0,-0x98(%r11) 2733 movaps -0x88(%r11),%xmm8 2734 movaps %xmm0,-0x88(%r11) 2735 movaps -0x78(%r11),%xmm9 2736 movaps %xmm0,-0x78(%r11) 2737 movaps -0x68(%r11),%xmm10 2738 movaps %xmm0,-0x68(%r11) 2739 movaps -0x58(%r11),%xmm11 2740 movaps %xmm0,-0x58(%r11) 2741 movaps -0x48(%r11),%xmm12 2742 movaps %xmm0,-0x48(%r11) 2743 movaps -0x38(%r11),%xmm13 2744 movaps %xmm0,-0x38(%r11) 2745 movaps -0x28(%r11),%xmm14 2746 movaps %xmm0,-0x28(%r11) 2747 movaps -0x18(%r11),%xmm15 2748 movaps %xmm0,-0x18(%r11) 2749 movaps %xmm0,0x00(%rsp) 2750 movaps %xmm0,0x10(%rsp) 2751 movaps %xmm0,0x20(%rsp) 2752 movaps %xmm0,0x30(%rsp) 2753 movaps %xmm0,0x40(%rsp) 2754 movaps %xmm0,0x50(%rsp) 2755 movaps %xmm0,0x60(%rsp) 2756___ 2757$code.=<<___; 2758 mov -8(%r11),%rbp 2759.cfi_restore %rbp 2760 lea (%r11),%rsp 2761.cfi_def_cfa_register %rsp 2762.Lxts_dec_epilogue: 2763 ret 2764.cfi_endproc 2765.size ${PREFIX}_xts_decrypt,.-${PREFIX}_xts_decrypt 2766___ 2767} 2768 2769###################################################################### 2770# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks, 2771# const AES_KEY *key, unsigned int start_block_num, 2772# unsigned char offset_i[16], const unsigned char L_[][16], 2773# unsigned char checksum[16]); 2774# 2775if (0) { # Omit these functions in BoringSSL 2776my @offset=map("%xmm$_",(10..15)); 2777my ($checksum,$rndkey0l)=("%xmm8","%xmm9"); 2778my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments 2779my ($L_p,$checksum_p) = ("%rbx","%rbp"); 2780my ($i1,$i3,$i5) = ("%r12","%r13","%r14"); 2781my $seventh_arg = $win64 ? 56 : 8; 2782my $blocks = $len; 2783 2784$code.=<<___; 2785.globl ${PREFIX}_ocb_encrypt 2786.type ${PREFIX}_ocb_encrypt,\@function,6 2787.align 32 2788${PREFIX}_ocb_encrypt: 2789.cfi_startproc 2790 lea (%rsp),%rax 2791 push %rbx 2792.cfi_push %rbx 2793 push %rbp 2794.cfi_push %rbp 2795 push %r12 2796.cfi_push %r12 2797 push %r13 2798.cfi_push %r13 2799 push %r14 2800.cfi_push %r14 2801___ 2802$code.=<<___ if ($win64); 2803 lea -0xa0(%rsp),%rsp 2804 movaps %xmm6,0x00(%rsp) # offload everything 2805 movaps %xmm7,0x10(%rsp) 2806 movaps %xmm8,0x20(%rsp) 2807 movaps %xmm9,0x30(%rsp) 2808 movaps %xmm10,0x40(%rsp) 2809 movaps %xmm11,0x50(%rsp) 2810 movaps %xmm12,0x60(%rsp) 2811 movaps %xmm13,0x70(%rsp) 2812 movaps %xmm14,0x80(%rsp) 2813 movaps %xmm15,0x90(%rsp) 2814.Locb_enc_body: 2815___ 2816$code.=<<___; 2817 mov $seventh_arg(%rax),$L_p # 7th argument 2818 mov $seventh_arg+8(%rax),$checksum_p# 8th argument 2819 2820 mov 240($key),$rnds_ 2821 mov $key,$key_ 2822 shl \$4,$rnds_ 2823 $movkey ($key),$rndkey0l # round[0] 2824 $movkey 16($key,$rnds_),$rndkey1 # round[last] 2825 2826 movdqu ($offset_p),@offset[5] # load last offset_i 2827 pxor $rndkey1,$rndkey0l # round[0] ^ round[last] 2828 pxor $rndkey1,@offset[5] # offset_i ^ round[last] 2829 2830 mov \$16+32,$rounds 2831 lea 32($key_,$rnds_),$key 2832 $movkey 16($key_),$rndkey1 # round[1] 2833 sub %r10,%rax # twisted $rounds 2834 mov %rax,%r10 # backup twisted $rounds 2835 2836 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 2837 movdqu ($checksum_p),$checksum # load checksum 2838 2839 test \$1,$block_num # is first block number odd? 2840 jnz .Locb_enc_odd 2841 2842 bsf $block_num,$i1 2843 add \$1,$block_num 2844 shl \$4,$i1 2845 movdqu ($L_p,$i1),$inout5 # borrow 2846 movdqu ($inp),$inout0 2847 lea 16($inp),$inp 2848 2849 call __ocb_encrypt1 2850 2851 movdqa $inout5,@offset[5] 2852 movups $inout0,($out) 2853 lea 16($out),$out 2854 sub \$1,$blocks 2855 jz .Locb_enc_done 2856 2857.Locb_enc_odd: 2858 lea 1($block_num),$i1 # even-numbered blocks 2859 lea 3($block_num),$i3 2860 lea 5($block_num),$i5 2861 lea 6($block_num),$block_num 2862 bsf $i1,$i1 # ntz(block) 2863 bsf $i3,$i3 2864 bsf $i5,$i5 2865 shl \$4,$i1 # ntz(block) -> table offset 2866 shl \$4,$i3 2867 shl \$4,$i5 2868 2869 sub \$6,$blocks 2870 jc .Locb_enc_short 2871 jmp .Locb_enc_grandloop 2872 2873.align 32 2874.Locb_enc_grandloop: 2875 movdqu `16*0`($inp),$inout0 # load input 2876 movdqu `16*1`($inp),$inout1 2877 movdqu `16*2`($inp),$inout2 2878 movdqu `16*3`($inp),$inout3 2879 movdqu `16*4`($inp),$inout4 2880 movdqu `16*5`($inp),$inout5 2881 lea `16*6`($inp),$inp 2882 2883 call __ocb_encrypt6 2884 2885 movups $inout0,`16*0`($out) # store output 2886 movups $inout1,`16*1`($out) 2887 movups $inout2,`16*2`($out) 2888 movups $inout3,`16*3`($out) 2889 movups $inout4,`16*4`($out) 2890 movups $inout5,`16*5`($out) 2891 lea `16*6`($out),$out 2892 sub \$6,$blocks 2893 jnc .Locb_enc_grandloop 2894 2895.Locb_enc_short: 2896 add \$6,$blocks 2897 jz .Locb_enc_done 2898 2899 movdqu `16*0`($inp),$inout0 2900 cmp \$2,$blocks 2901 jb .Locb_enc_one 2902 movdqu `16*1`($inp),$inout1 2903 je .Locb_enc_two 2904 2905 movdqu `16*2`($inp),$inout2 2906 cmp \$4,$blocks 2907 jb .Locb_enc_three 2908 movdqu `16*3`($inp),$inout3 2909 je .Locb_enc_four 2910 2911 movdqu `16*4`($inp),$inout4 2912 pxor $inout5,$inout5 2913 2914 call __ocb_encrypt6 2915 2916 movdqa @offset[4],@offset[5] 2917 movups $inout0,`16*0`($out) 2918 movups $inout1,`16*1`($out) 2919 movups $inout2,`16*2`($out) 2920 movups $inout3,`16*3`($out) 2921 movups $inout4,`16*4`($out) 2922 2923 jmp .Locb_enc_done 2924 2925.align 16 2926.Locb_enc_one: 2927 movdqa @offset[0],$inout5 # borrow 2928 2929 call __ocb_encrypt1 2930 2931 movdqa $inout5,@offset[5] 2932 movups $inout0,`16*0`($out) 2933 jmp .Locb_enc_done 2934 2935.align 16 2936.Locb_enc_two: 2937 pxor $inout2,$inout2 2938 pxor $inout3,$inout3 2939 2940 call __ocb_encrypt4 2941 2942 movdqa @offset[1],@offset[5] 2943 movups $inout0,`16*0`($out) 2944 movups $inout1,`16*1`($out) 2945 2946 jmp .Locb_enc_done 2947 2948.align 16 2949.Locb_enc_three: 2950 pxor $inout3,$inout3 2951 2952 call __ocb_encrypt4 2953 2954 movdqa @offset[2],@offset[5] 2955 movups $inout0,`16*0`($out) 2956 movups $inout1,`16*1`($out) 2957 movups $inout2,`16*2`($out) 2958 2959 jmp .Locb_enc_done 2960 2961.align 16 2962.Locb_enc_four: 2963 call __ocb_encrypt4 2964 2965 movdqa @offset[3],@offset[5] 2966 movups $inout0,`16*0`($out) 2967 movups $inout1,`16*1`($out) 2968 movups $inout2,`16*2`($out) 2969 movups $inout3,`16*3`($out) 2970 2971.Locb_enc_done: 2972 pxor $rndkey0,@offset[5] # "remove" round[last] 2973 movdqu $checksum,($checksum_p) # store checksum 2974 movdqu @offset[5],($offset_p) # store last offset_i 2975 2976 xorps %xmm0,%xmm0 # clear register bank 2977 pxor %xmm1,%xmm1 2978 pxor %xmm2,%xmm2 2979 pxor %xmm3,%xmm3 2980 pxor %xmm4,%xmm4 2981 pxor %xmm5,%xmm5 2982___ 2983$code.=<<___ if (!$win64); 2984 pxor %xmm6,%xmm6 2985 pxor %xmm7,%xmm7 2986 pxor %xmm8,%xmm8 2987 pxor %xmm9,%xmm9 2988 pxor %xmm10,%xmm10 2989 pxor %xmm11,%xmm11 2990 pxor %xmm12,%xmm12 2991 pxor %xmm13,%xmm13 2992 pxor %xmm14,%xmm14 2993 pxor %xmm15,%xmm15 2994 lea 0x28(%rsp),%rax 2995.cfi_def_cfa %rax,8 2996___ 2997$code.=<<___ if ($win64); 2998 movaps 0x00(%rsp),%xmm6 2999 movaps %xmm0,0x00(%rsp) # clear stack 3000 movaps 0x10(%rsp),%xmm7 3001 movaps %xmm0,0x10(%rsp) 3002 movaps 0x20(%rsp),%xmm8 3003 movaps %xmm0,0x20(%rsp) 3004 movaps 0x30(%rsp),%xmm9 3005 movaps %xmm0,0x30(%rsp) 3006 movaps 0x40(%rsp),%xmm10 3007 movaps %xmm0,0x40(%rsp) 3008 movaps 0x50(%rsp),%xmm11 3009 movaps %xmm0,0x50(%rsp) 3010 movaps 0x60(%rsp),%xmm12 3011 movaps %xmm0,0x60(%rsp) 3012 movaps 0x70(%rsp),%xmm13 3013 movaps %xmm0,0x70(%rsp) 3014 movaps 0x80(%rsp),%xmm14 3015 movaps %xmm0,0x80(%rsp) 3016 movaps 0x90(%rsp),%xmm15 3017 movaps %xmm0,0x90(%rsp) 3018 lea 0xa0+0x28(%rsp),%rax 3019.Locb_enc_pop: 3020___ 3021$code.=<<___; 3022 mov -40(%rax),%r14 3023.cfi_restore %r14 3024 mov -32(%rax),%r13 3025.cfi_restore %r13 3026 mov -24(%rax),%r12 3027.cfi_restore %r12 3028 mov -16(%rax),%rbp 3029.cfi_restore %rbp 3030 mov -8(%rax),%rbx 3031.cfi_restore %rbx 3032 lea (%rax),%rsp 3033.cfi_def_cfa_register %rsp 3034.Locb_enc_epilogue: 3035 ret 3036.cfi_endproc 3037.size ${PREFIX}_ocb_encrypt,.-${PREFIX}_ocb_encrypt 3038 3039.type __ocb_encrypt6,\@abi-omnipotent 3040.align 32 3041__ocb_encrypt6: 3042 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3043 movdqu ($L_p,$i1),@offset[1] 3044 movdqa @offset[0],@offset[2] 3045 movdqu ($L_p,$i3),@offset[3] 3046 movdqa @offset[0],@offset[4] 3047 pxor @offset[5],@offset[0] 3048 movdqu ($L_p,$i5),@offset[5] 3049 pxor @offset[0],@offset[1] 3050 pxor $inout0,$checksum # accumulate checksum 3051 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3052 pxor @offset[1],@offset[2] 3053 pxor $inout1,$checksum 3054 pxor @offset[1],$inout1 3055 pxor @offset[2],@offset[3] 3056 pxor $inout2,$checksum 3057 pxor @offset[2],$inout2 3058 pxor @offset[3],@offset[4] 3059 pxor $inout3,$checksum 3060 pxor @offset[3],$inout3 3061 pxor @offset[4],@offset[5] 3062 pxor $inout4,$checksum 3063 pxor @offset[4],$inout4 3064 pxor $inout5,$checksum 3065 pxor @offset[5],$inout5 3066 $movkey 32($key_),$rndkey0 3067 3068 lea 1($block_num),$i1 # even-numbered blocks 3069 lea 3($block_num),$i3 3070 lea 5($block_num),$i5 3071 add \$6,$block_num 3072 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3073 bsf $i1,$i1 # ntz(block) 3074 bsf $i3,$i3 3075 bsf $i5,$i5 3076 3077 aesenc $rndkey1,$inout0 3078 aesenc $rndkey1,$inout1 3079 aesenc $rndkey1,$inout2 3080 aesenc $rndkey1,$inout3 3081 pxor $rndkey0l,@offset[1] 3082 pxor $rndkey0l,@offset[2] 3083 aesenc $rndkey1,$inout4 3084 pxor $rndkey0l,@offset[3] 3085 pxor $rndkey0l,@offset[4] 3086 aesenc $rndkey1,$inout5 3087 $movkey 48($key_),$rndkey1 3088 pxor $rndkey0l,@offset[5] 3089 3090 aesenc $rndkey0,$inout0 3091 aesenc $rndkey0,$inout1 3092 aesenc $rndkey0,$inout2 3093 aesenc $rndkey0,$inout3 3094 aesenc $rndkey0,$inout4 3095 aesenc $rndkey0,$inout5 3096 $movkey 64($key_),$rndkey0 3097 shl \$4,$i1 # ntz(block) -> table offset 3098 shl \$4,$i3 3099 jmp .Locb_enc_loop6 3100 3101.align 32 3102.Locb_enc_loop6: 3103 aesenc $rndkey1,$inout0 3104 aesenc $rndkey1,$inout1 3105 aesenc $rndkey1,$inout2 3106 aesenc $rndkey1,$inout3 3107 aesenc $rndkey1,$inout4 3108 aesenc $rndkey1,$inout5 3109 $movkey ($key,%rax),$rndkey1 3110 add \$32,%rax 3111 3112 aesenc $rndkey0,$inout0 3113 aesenc $rndkey0,$inout1 3114 aesenc $rndkey0,$inout2 3115 aesenc $rndkey0,$inout3 3116 aesenc $rndkey0,$inout4 3117 aesenc $rndkey0,$inout5 3118 $movkey -16($key,%rax),$rndkey0 3119 jnz .Locb_enc_loop6 3120 3121 aesenc $rndkey1,$inout0 3122 aesenc $rndkey1,$inout1 3123 aesenc $rndkey1,$inout2 3124 aesenc $rndkey1,$inout3 3125 aesenc $rndkey1,$inout4 3126 aesenc $rndkey1,$inout5 3127 $movkey 16($key_),$rndkey1 3128 shl \$4,$i5 3129 3130 aesenclast @offset[0],$inout0 3131 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3132 mov %r10,%rax # restore twisted rounds 3133 aesenclast @offset[1],$inout1 3134 aesenclast @offset[2],$inout2 3135 aesenclast @offset[3],$inout3 3136 aesenclast @offset[4],$inout4 3137 aesenclast @offset[5],$inout5 3138 ret 3139.size __ocb_encrypt6,.-__ocb_encrypt6 3140 3141.type __ocb_encrypt4,\@abi-omnipotent 3142.align 32 3143__ocb_encrypt4: 3144 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3145 movdqu ($L_p,$i1),@offset[1] 3146 movdqa @offset[0],@offset[2] 3147 movdqu ($L_p,$i3),@offset[3] 3148 pxor @offset[5],@offset[0] 3149 pxor @offset[0],@offset[1] 3150 pxor $inout0,$checksum # accumulate checksum 3151 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3152 pxor @offset[1],@offset[2] 3153 pxor $inout1,$checksum 3154 pxor @offset[1],$inout1 3155 pxor @offset[2],@offset[3] 3156 pxor $inout2,$checksum 3157 pxor @offset[2],$inout2 3158 pxor $inout3,$checksum 3159 pxor @offset[3],$inout3 3160 $movkey 32($key_),$rndkey0 3161 3162 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3163 pxor $rndkey0l,@offset[1] 3164 pxor $rndkey0l,@offset[2] 3165 pxor $rndkey0l,@offset[3] 3166 3167 aesenc $rndkey1,$inout0 3168 aesenc $rndkey1,$inout1 3169 aesenc $rndkey1,$inout2 3170 aesenc $rndkey1,$inout3 3171 $movkey 48($key_),$rndkey1 3172 3173 aesenc $rndkey0,$inout0 3174 aesenc $rndkey0,$inout1 3175 aesenc $rndkey0,$inout2 3176 aesenc $rndkey0,$inout3 3177 $movkey 64($key_),$rndkey0 3178 jmp .Locb_enc_loop4 3179 3180.align 32 3181.Locb_enc_loop4: 3182 aesenc $rndkey1,$inout0 3183 aesenc $rndkey1,$inout1 3184 aesenc $rndkey1,$inout2 3185 aesenc $rndkey1,$inout3 3186 $movkey ($key,%rax),$rndkey1 3187 add \$32,%rax 3188 3189 aesenc $rndkey0,$inout0 3190 aesenc $rndkey0,$inout1 3191 aesenc $rndkey0,$inout2 3192 aesenc $rndkey0,$inout3 3193 $movkey -16($key,%rax),$rndkey0 3194 jnz .Locb_enc_loop4 3195 3196 aesenc $rndkey1,$inout0 3197 aesenc $rndkey1,$inout1 3198 aesenc $rndkey1,$inout2 3199 aesenc $rndkey1,$inout3 3200 $movkey 16($key_),$rndkey1 3201 mov %r10,%rax # restore twisted rounds 3202 3203 aesenclast @offset[0],$inout0 3204 aesenclast @offset[1],$inout1 3205 aesenclast @offset[2],$inout2 3206 aesenclast @offset[3],$inout3 3207 ret 3208.size __ocb_encrypt4,.-__ocb_encrypt4 3209 3210.type __ocb_encrypt1,\@abi-omnipotent 3211.align 32 3212__ocb_encrypt1: 3213 pxor @offset[5],$inout5 # offset_i 3214 pxor $rndkey0l,$inout5 # offset_i ^ round[0] 3215 pxor $inout0,$checksum # accumulate checksum 3216 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i 3217 $movkey 32($key_),$rndkey0 3218 3219 aesenc $rndkey1,$inout0 3220 $movkey 48($key_),$rndkey1 3221 pxor $rndkey0l,$inout5 # offset_i ^ round[last] 3222 3223 aesenc $rndkey0,$inout0 3224 $movkey 64($key_),$rndkey0 3225 jmp .Locb_enc_loop1 3226 3227.align 32 3228.Locb_enc_loop1: 3229 aesenc $rndkey1,$inout0 3230 $movkey ($key,%rax),$rndkey1 3231 add \$32,%rax 3232 3233 aesenc $rndkey0,$inout0 3234 $movkey -16($key,%rax),$rndkey0 3235 jnz .Locb_enc_loop1 3236 3237 aesenc $rndkey1,$inout0 3238 $movkey 16($key_),$rndkey1 # redundant in tail 3239 mov %r10,%rax # restore twisted rounds 3240 3241 aesenclast $inout5,$inout0 3242 ret 3243.size __ocb_encrypt1,.-__ocb_encrypt1 3244 3245.globl ${PREFIX}_ocb_decrypt 3246.type ${PREFIX}_ocb_decrypt,\@function,6 3247.align 32 3248${PREFIX}_ocb_decrypt: 3249.cfi_startproc 3250 lea (%rsp),%rax 3251 push %rbx 3252.cfi_push %rbx 3253 push %rbp 3254.cfi_push %rbp 3255 push %r12 3256.cfi_push %r12 3257 push %r13 3258.cfi_push %r13 3259 push %r14 3260.cfi_push %r14 3261___ 3262$code.=<<___ if ($win64); 3263 lea -0xa0(%rsp),%rsp 3264 movaps %xmm6,0x00(%rsp) # offload everything 3265 movaps %xmm7,0x10(%rsp) 3266 movaps %xmm8,0x20(%rsp) 3267 movaps %xmm9,0x30(%rsp) 3268 movaps %xmm10,0x40(%rsp) 3269 movaps %xmm11,0x50(%rsp) 3270 movaps %xmm12,0x60(%rsp) 3271 movaps %xmm13,0x70(%rsp) 3272 movaps %xmm14,0x80(%rsp) 3273 movaps %xmm15,0x90(%rsp) 3274.Locb_dec_body: 3275___ 3276$code.=<<___; 3277 mov $seventh_arg(%rax),$L_p # 7th argument 3278 mov $seventh_arg+8(%rax),$checksum_p# 8th argument 3279 3280 mov 240($key),$rnds_ 3281 mov $key,$key_ 3282 shl \$4,$rnds_ 3283 $movkey ($key),$rndkey0l # round[0] 3284 $movkey 16($key,$rnds_),$rndkey1 # round[last] 3285 3286 movdqu ($offset_p),@offset[5] # load last offset_i 3287 pxor $rndkey1,$rndkey0l # round[0] ^ round[last] 3288 pxor $rndkey1,@offset[5] # offset_i ^ round[last] 3289 3290 mov \$16+32,$rounds 3291 lea 32($key_,$rnds_),$key 3292 $movkey 16($key_),$rndkey1 # round[1] 3293 sub %r10,%rax # twisted $rounds 3294 mov %rax,%r10 # backup twisted $rounds 3295 3296 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3297 movdqu ($checksum_p),$checksum # load checksum 3298 3299 test \$1,$block_num # is first block number odd? 3300 jnz .Locb_dec_odd 3301 3302 bsf $block_num,$i1 3303 add \$1,$block_num 3304 shl \$4,$i1 3305 movdqu ($L_p,$i1),$inout5 # borrow 3306 movdqu ($inp),$inout0 3307 lea 16($inp),$inp 3308 3309 call __ocb_decrypt1 3310 3311 movdqa $inout5,@offset[5] 3312 movups $inout0,($out) 3313 xorps $inout0,$checksum # accumulate checksum 3314 lea 16($out),$out 3315 sub \$1,$blocks 3316 jz .Locb_dec_done 3317 3318.Locb_dec_odd: 3319 lea 1($block_num),$i1 # even-numbered blocks 3320 lea 3($block_num),$i3 3321 lea 5($block_num),$i5 3322 lea 6($block_num),$block_num 3323 bsf $i1,$i1 # ntz(block) 3324 bsf $i3,$i3 3325 bsf $i5,$i5 3326 shl \$4,$i1 # ntz(block) -> table offset 3327 shl \$4,$i3 3328 shl \$4,$i5 3329 3330 sub \$6,$blocks 3331 jc .Locb_dec_short 3332 jmp .Locb_dec_grandloop 3333 3334.align 32 3335.Locb_dec_grandloop: 3336 movdqu `16*0`($inp),$inout0 # load input 3337 movdqu `16*1`($inp),$inout1 3338 movdqu `16*2`($inp),$inout2 3339 movdqu `16*3`($inp),$inout3 3340 movdqu `16*4`($inp),$inout4 3341 movdqu `16*5`($inp),$inout5 3342 lea `16*6`($inp),$inp 3343 3344 call __ocb_decrypt6 3345 3346 movups $inout0,`16*0`($out) # store output 3347 pxor $inout0,$checksum # accumulate checksum 3348 movups $inout1,`16*1`($out) 3349 pxor $inout1,$checksum 3350 movups $inout2,`16*2`($out) 3351 pxor $inout2,$checksum 3352 movups $inout3,`16*3`($out) 3353 pxor $inout3,$checksum 3354 movups $inout4,`16*4`($out) 3355 pxor $inout4,$checksum 3356 movups $inout5,`16*5`($out) 3357 pxor $inout5,$checksum 3358 lea `16*6`($out),$out 3359 sub \$6,$blocks 3360 jnc .Locb_dec_grandloop 3361 3362.Locb_dec_short: 3363 add \$6,$blocks 3364 jz .Locb_dec_done 3365 3366 movdqu `16*0`($inp),$inout0 3367 cmp \$2,$blocks 3368 jb .Locb_dec_one 3369 movdqu `16*1`($inp),$inout1 3370 je .Locb_dec_two 3371 3372 movdqu `16*2`($inp),$inout2 3373 cmp \$4,$blocks 3374 jb .Locb_dec_three 3375 movdqu `16*3`($inp),$inout3 3376 je .Locb_dec_four 3377 3378 movdqu `16*4`($inp),$inout4 3379 pxor $inout5,$inout5 3380 3381 call __ocb_decrypt6 3382 3383 movdqa @offset[4],@offset[5] 3384 movups $inout0,`16*0`($out) # store output 3385 pxor $inout0,$checksum # accumulate checksum 3386 movups $inout1,`16*1`($out) 3387 pxor $inout1,$checksum 3388 movups $inout2,`16*2`($out) 3389 pxor $inout2,$checksum 3390 movups $inout3,`16*3`($out) 3391 pxor $inout3,$checksum 3392 movups $inout4,`16*4`($out) 3393 pxor $inout4,$checksum 3394 3395 jmp .Locb_dec_done 3396 3397.align 16 3398.Locb_dec_one: 3399 movdqa @offset[0],$inout5 # borrow 3400 3401 call __ocb_decrypt1 3402 3403 movdqa $inout5,@offset[5] 3404 movups $inout0,`16*0`($out) # store output 3405 xorps $inout0,$checksum # accumulate checksum 3406 jmp .Locb_dec_done 3407 3408.align 16 3409.Locb_dec_two: 3410 pxor $inout2,$inout2 3411 pxor $inout3,$inout3 3412 3413 call __ocb_decrypt4 3414 3415 movdqa @offset[1],@offset[5] 3416 movups $inout0,`16*0`($out) # store output 3417 xorps $inout0,$checksum # accumulate checksum 3418 movups $inout1,`16*1`($out) 3419 xorps $inout1,$checksum 3420 3421 jmp .Locb_dec_done 3422 3423.align 16 3424.Locb_dec_three: 3425 pxor $inout3,$inout3 3426 3427 call __ocb_decrypt4 3428 3429 movdqa @offset[2],@offset[5] 3430 movups $inout0,`16*0`($out) # store output 3431 xorps $inout0,$checksum # accumulate checksum 3432 movups $inout1,`16*1`($out) 3433 xorps $inout1,$checksum 3434 movups $inout2,`16*2`($out) 3435 xorps $inout2,$checksum 3436 3437 jmp .Locb_dec_done 3438 3439.align 16 3440.Locb_dec_four: 3441 call __ocb_decrypt4 3442 3443 movdqa @offset[3],@offset[5] 3444 movups $inout0,`16*0`($out) # store output 3445 pxor $inout0,$checksum # accumulate checksum 3446 movups $inout1,`16*1`($out) 3447 pxor $inout1,$checksum 3448 movups $inout2,`16*2`($out) 3449 pxor $inout2,$checksum 3450 movups $inout3,`16*3`($out) 3451 pxor $inout3,$checksum 3452 3453.Locb_dec_done: 3454 pxor $rndkey0,@offset[5] # "remove" round[last] 3455 movdqu $checksum,($checksum_p) # store checksum 3456 movdqu @offset[5],($offset_p) # store last offset_i 3457 3458 xorps %xmm0,%xmm0 # clear register bank 3459 pxor %xmm1,%xmm1 3460 pxor %xmm2,%xmm2 3461 pxor %xmm3,%xmm3 3462 pxor %xmm4,%xmm4 3463 pxor %xmm5,%xmm5 3464___ 3465$code.=<<___ if (!$win64); 3466 pxor %xmm6,%xmm6 3467 pxor %xmm7,%xmm7 3468 pxor %xmm8,%xmm8 3469 pxor %xmm9,%xmm9 3470 pxor %xmm10,%xmm10 3471 pxor %xmm11,%xmm11 3472 pxor %xmm12,%xmm12 3473 pxor %xmm13,%xmm13 3474 pxor %xmm14,%xmm14 3475 pxor %xmm15,%xmm15 3476 lea 0x28(%rsp),%rax 3477.cfi_def_cfa %rax,8 3478___ 3479$code.=<<___ if ($win64); 3480 movaps 0x00(%rsp),%xmm6 3481 movaps %xmm0,0x00(%rsp) # clear stack 3482 movaps 0x10(%rsp),%xmm7 3483 movaps %xmm0,0x10(%rsp) 3484 movaps 0x20(%rsp),%xmm8 3485 movaps %xmm0,0x20(%rsp) 3486 movaps 0x30(%rsp),%xmm9 3487 movaps %xmm0,0x30(%rsp) 3488 movaps 0x40(%rsp),%xmm10 3489 movaps %xmm0,0x40(%rsp) 3490 movaps 0x50(%rsp),%xmm11 3491 movaps %xmm0,0x50(%rsp) 3492 movaps 0x60(%rsp),%xmm12 3493 movaps %xmm0,0x60(%rsp) 3494 movaps 0x70(%rsp),%xmm13 3495 movaps %xmm0,0x70(%rsp) 3496 movaps 0x80(%rsp),%xmm14 3497 movaps %xmm0,0x80(%rsp) 3498 movaps 0x90(%rsp),%xmm15 3499 movaps %xmm0,0x90(%rsp) 3500 lea 0xa0+0x28(%rsp),%rax 3501.Locb_dec_pop: 3502___ 3503$code.=<<___; 3504 mov -40(%rax),%r14 3505.cfi_restore %r14 3506 mov -32(%rax),%r13 3507.cfi_restore %r13 3508 mov -24(%rax),%r12 3509.cfi_restore %r12 3510 mov -16(%rax),%rbp 3511.cfi_restore %rbp 3512 mov -8(%rax),%rbx 3513.cfi_restore %rbx 3514 lea (%rax),%rsp 3515.cfi_def_cfa_register %rsp 3516.Locb_dec_epilogue: 3517 ret 3518.cfi_endproc 3519.size ${PREFIX}_ocb_decrypt,.-${PREFIX}_ocb_decrypt 3520 3521.type __ocb_decrypt6,\@abi-omnipotent 3522.align 32 3523__ocb_decrypt6: 3524 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3525 movdqu ($L_p,$i1),@offset[1] 3526 movdqa @offset[0],@offset[2] 3527 movdqu ($L_p,$i3),@offset[3] 3528 movdqa @offset[0],@offset[4] 3529 pxor @offset[5],@offset[0] 3530 movdqu ($L_p,$i5),@offset[5] 3531 pxor @offset[0],@offset[1] 3532 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3533 pxor @offset[1],@offset[2] 3534 pxor @offset[1],$inout1 3535 pxor @offset[2],@offset[3] 3536 pxor @offset[2],$inout2 3537 pxor @offset[3],@offset[4] 3538 pxor @offset[3],$inout3 3539 pxor @offset[4],@offset[5] 3540 pxor @offset[4],$inout4 3541 pxor @offset[5],$inout5 3542 $movkey 32($key_),$rndkey0 3543 3544 lea 1($block_num),$i1 # even-numbered blocks 3545 lea 3($block_num),$i3 3546 lea 5($block_num),$i5 3547 add \$6,$block_num 3548 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3549 bsf $i1,$i1 # ntz(block) 3550 bsf $i3,$i3 3551 bsf $i5,$i5 3552 3553 aesdec $rndkey1,$inout0 3554 aesdec $rndkey1,$inout1 3555 aesdec $rndkey1,$inout2 3556 aesdec $rndkey1,$inout3 3557 pxor $rndkey0l,@offset[1] 3558 pxor $rndkey0l,@offset[2] 3559 aesdec $rndkey1,$inout4 3560 pxor $rndkey0l,@offset[3] 3561 pxor $rndkey0l,@offset[4] 3562 aesdec $rndkey1,$inout5 3563 $movkey 48($key_),$rndkey1 3564 pxor $rndkey0l,@offset[5] 3565 3566 aesdec $rndkey0,$inout0 3567 aesdec $rndkey0,$inout1 3568 aesdec $rndkey0,$inout2 3569 aesdec $rndkey0,$inout3 3570 aesdec $rndkey0,$inout4 3571 aesdec $rndkey0,$inout5 3572 $movkey 64($key_),$rndkey0 3573 shl \$4,$i1 # ntz(block) -> table offset 3574 shl \$4,$i3 3575 jmp .Locb_dec_loop6 3576 3577.align 32 3578.Locb_dec_loop6: 3579 aesdec $rndkey1,$inout0 3580 aesdec $rndkey1,$inout1 3581 aesdec $rndkey1,$inout2 3582 aesdec $rndkey1,$inout3 3583 aesdec $rndkey1,$inout4 3584 aesdec $rndkey1,$inout5 3585 $movkey ($key,%rax),$rndkey1 3586 add \$32,%rax 3587 3588 aesdec $rndkey0,$inout0 3589 aesdec $rndkey0,$inout1 3590 aesdec $rndkey0,$inout2 3591 aesdec $rndkey0,$inout3 3592 aesdec $rndkey0,$inout4 3593 aesdec $rndkey0,$inout5 3594 $movkey -16($key,%rax),$rndkey0 3595 jnz .Locb_dec_loop6 3596 3597 aesdec $rndkey1,$inout0 3598 aesdec $rndkey1,$inout1 3599 aesdec $rndkey1,$inout2 3600 aesdec $rndkey1,$inout3 3601 aesdec $rndkey1,$inout4 3602 aesdec $rndkey1,$inout5 3603 $movkey 16($key_),$rndkey1 3604 shl \$4,$i5 3605 3606 aesdeclast @offset[0],$inout0 3607 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3608 mov %r10,%rax # restore twisted rounds 3609 aesdeclast @offset[1],$inout1 3610 aesdeclast @offset[2],$inout2 3611 aesdeclast @offset[3],$inout3 3612 aesdeclast @offset[4],$inout4 3613 aesdeclast @offset[5],$inout5 3614 ret 3615.size __ocb_decrypt6,.-__ocb_decrypt6 3616 3617.type __ocb_decrypt4,\@abi-omnipotent 3618.align 32 3619__ocb_decrypt4: 3620 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3621 movdqu ($L_p,$i1),@offset[1] 3622 movdqa @offset[0],@offset[2] 3623 movdqu ($L_p,$i3),@offset[3] 3624 pxor @offset[5],@offset[0] 3625 pxor @offset[0],@offset[1] 3626 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3627 pxor @offset[1],@offset[2] 3628 pxor @offset[1],$inout1 3629 pxor @offset[2],@offset[3] 3630 pxor @offset[2],$inout2 3631 pxor @offset[3],$inout3 3632 $movkey 32($key_),$rndkey0 3633 3634 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3635 pxor $rndkey0l,@offset[1] 3636 pxor $rndkey0l,@offset[2] 3637 pxor $rndkey0l,@offset[3] 3638 3639 aesdec $rndkey1,$inout0 3640 aesdec $rndkey1,$inout1 3641 aesdec $rndkey1,$inout2 3642 aesdec $rndkey1,$inout3 3643 $movkey 48($key_),$rndkey1 3644 3645 aesdec $rndkey0,$inout0 3646 aesdec $rndkey0,$inout1 3647 aesdec $rndkey0,$inout2 3648 aesdec $rndkey0,$inout3 3649 $movkey 64($key_),$rndkey0 3650 jmp .Locb_dec_loop4 3651 3652.align 32 3653.Locb_dec_loop4: 3654 aesdec $rndkey1,$inout0 3655 aesdec $rndkey1,$inout1 3656 aesdec $rndkey1,$inout2 3657 aesdec $rndkey1,$inout3 3658 $movkey ($key,%rax),$rndkey1 3659 add \$32,%rax 3660 3661 aesdec $rndkey0,$inout0 3662 aesdec $rndkey0,$inout1 3663 aesdec $rndkey0,$inout2 3664 aesdec $rndkey0,$inout3 3665 $movkey -16($key,%rax),$rndkey0 3666 jnz .Locb_dec_loop4 3667 3668 aesdec $rndkey1,$inout0 3669 aesdec $rndkey1,$inout1 3670 aesdec $rndkey1,$inout2 3671 aesdec $rndkey1,$inout3 3672 $movkey 16($key_),$rndkey1 3673 mov %r10,%rax # restore twisted rounds 3674 3675 aesdeclast @offset[0],$inout0 3676 aesdeclast @offset[1],$inout1 3677 aesdeclast @offset[2],$inout2 3678 aesdeclast @offset[3],$inout3 3679 ret 3680.size __ocb_decrypt4,.-__ocb_decrypt4 3681 3682.type __ocb_decrypt1,\@abi-omnipotent 3683.align 32 3684__ocb_decrypt1: 3685 pxor @offset[5],$inout5 # offset_i 3686 pxor $rndkey0l,$inout5 # offset_i ^ round[0] 3687 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i 3688 $movkey 32($key_),$rndkey0 3689 3690 aesdec $rndkey1,$inout0 3691 $movkey 48($key_),$rndkey1 3692 pxor $rndkey0l,$inout5 # offset_i ^ round[last] 3693 3694 aesdec $rndkey0,$inout0 3695 $movkey 64($key_),$rndkey0 3696 jmp .Locb_dec_loop1 3697 3698.align 32 3699.Locb_dec_loop1: 3700 aesdec $rndkey1,$inout0 3701 $movkey ($key,%rax),$rndkey1 3702 add \$32,%rax 3703 3704 aesdec $rndkey0,$inout0 3705 $movkey -16($key,%rax),$rndkey0 3706 jnz .Locb_dec_loop1 3707 3708 aesdec $rndkey1,$inout0 3709 $movkey 16($key_),$rndkey1 # redundant in tail 3710 mov %r10,%rax # restore twisted rounds 3711 3712 aesdeclast $inout5,$inout0 3713 ret 3714.size __ocb_decrypt1,.-__ocb_decrypt1 3715___ 3716} }} 3717 3718######################################################################## 3719# void $PREFIX_cbc_encrypt (const void *inp, void *out, 3720# size_t length, const AES_KEY *key, 3721# unsigned char *ivp,const int enc); 3722{ 3723my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt 3724my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); 3725 3726$code.=<<___; 3727.globl ${PREFIX}_cbc_encrypt 3728.type ${PREFIX}_cbc_encrypt,\@function,6 3729.align 16 3730${PREFIX}_cbc_encrypt: 3731.cfi_startproc 3732 test $len,$len # check length 3733 jz .Lcbc_ret 3734 3735 mov 240($key),$rnds_ # key->rounds 3736 mov $key,$key_ # backup $key 3737 test %r9d,%r9d # 6th argument 3738 jz .Lcbc_decrypt 3739#--------------------------- CBC ENCRYPT ------------------------------# 3740 movups ($ivp),$inout0 # load iv as initial state 3741 mov $rnds_,$rounds 3742 cmp \$16,$len 3743 jb .Lcbc_enc_tail 3744 sub \$16,$len 3745 jmp .Lcbc_enc_loop 3746.align 16 3747.Lcbc_enc_loop: 3748 movups ($inp),$inout1 # load input 3749 lea 16($inp),$inp 3750 #xorps $inout1,$inout0 3751___ 3752 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); 3753$code.=<<___; 3754 mov $rnds_,$rounds # restore $rounds 3755 mov $key_,$key # restore $key 3756 movups $inout0,0($out) # store output 3757 lea 16($out),$out 3758 sub \$16,$len 3759 jnc .Lcbc_enc_loop 3760 add \$16,$len 3761 jnz .Lcbc_enc_tail 3762 pxor $rndkey0,$rndkey0 # clear register bank 3763 pxor $rndkey1,$rndkey1 3764 movups $inout0,($ivp) 3765 pxor $inout0,$inout0 3766 pxor $inout1,$inout1 3767 jmp .Lcbc_ret 3768 3769.Lcbc_enc_tail: 3770 mov $len,%rcx # zaps $key 3771 xchg $inp,$out # $inp is %rsi and $out is %rdi now 3772 .long 0x9066A4F3 # rep movsb 3773 mov \$16,%ecx # zero tail 3774 sub $len,%rcx 3775 xor %eax,%eax 3776 .long 0x9066AAF3 # rep stosb 3777 lea -16(%rdi),%rdi # rewind $out by 1 block 3778 mov $rnds_,$rounds # restore $rounds 3779 mov %rdi,%rsi # $inp and $out are the same 3780 mov $key_,$key # restore $key 3781 xor $len,$len # len=16 3782 jmp .Lcbc_enc_loop # one more spin 3783#--------------------------- CBC DECRYPT ------------------------------# 3784.align 16 3785.Lcbc_decrypt: 3786 cmp \$16,$len 3787 jne .Lcbc_decrypt_bulk 3788 3789 # handle single block without allocating stack frame, 3790 # useful in ciphertext stealing mode 3791 movdqu ($inp),$inout0 # load input 3792 movdqu ($ivp),$inout1 # load iv 3793 movdqa $inout0,$inout2 # future iv 3794___ 3795 &aesni_generate1("dec",$key,$rnds_); 3796$code.=<<___; 3797 pxor $rndkey0,$rndkey0 # clear register bank 3798 pxor $rndkey1,$rndkey1 3799 movdqu $inout2,($ivp) # store iv 3800 xorps $inout1,$inout0 # ^=iv 3801 pxor $inout1,$inout1 3802 movups $inout0,($out) # store output 3803 pxor $inout0,$inout0 3804 jmp .Lcbc_ret 3805.align 16 3806.Lcbc_decrypt_bulk: 3807 lea (%rsp),%r11 # frame pointer 3808.cfi_def_cfa_register %r11 3809 push %rbp 3810.cfi_push %rbp 3811 sub \$$frame_size,%rsp 3812 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 3813___ 3814$code.=<<___ if ($win64); 3815 movaps %xmm6,0x10(%rsp) 3816 movaps %xmm7,0x20(%rsp) 3817 movaps %xmm8,0x30(%rsp) 3818 movaps %xmm9,0x40(%rsp) 3819 movaps %xmm10,0x50(%rsp) 3820 movaps %xmm11,0x60(%rsp) 3821 movaps %xmm12,0x70(%rsp) 3822 movaps %xmm13,0x80(%rsp) 3823 movaps %xmm14,0x90(%rsp) 3824 movaps %xmm15,0xa0(%rsp) 3825.Lcbc_decrypt_body: 3826___ 3827 3828my $inp_=$key_="%rbp"; # reassign $key_ 3829 3830$code.=<<___; 3831 mov $key,$key_ # [re-]backup $key [after reassignment] 3832 movups ($ivp),$iv 3833 mov $rnds_,$rounds 3834 cmp \$0x50,$len 3835 jbe .Lcbc_dec_tail 3836 3837 $movkey ($key),$rndkey0 3838 movdqu 0x00($inp),$inout0 # load input 3839 movdqu 0x10($inp),$inout1 3840 movdqa $inout0,$in0 3841 movdqu 0x20($inp),$inout2 3842 movdqa $inout1,$in1 3843 movdqu 0x30($inp),$inout3 3844 movdqa $inout2,$in2 3845 movdqu 0x40($inp),$inout4 3846 movdqa $inout3,$in3 3847 movdqu 0x50($inp),$inout5 3848 movdqa $inout4,$in4 3849 leaq OPENSSL_ia32cap_P(%rip),%r9 3850 mov 4(%r9),%r9d 3851 cmp \$0x70,$len 3852 jbe .Lcbc_dec_six_or_seven 3853 3854 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE 3855 sub \$0x50,$len # $len is biased by -5*16 3856 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE 3857 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont] 3858 sub \$0x20,$len # $len is biased by -7*16 3859 lea 0x70($key),$key # size optimization 3860 jmp .Lcbc_dec_loop8_enter 3861.align 16 3862.Lcbc_dec_loop8: 3863 movups $inout7,($out) 3864 lea 0x10($out),$out 3865.Lcbc_dec_loop8_enter: 3866 movdqu 0x60($inp),$inout6 3867 pxor $rndkey0,$inout0 3868 movdqu 0x70($inp),$inout7 3869 pxor $rndkey0,$inout1 3870 $movkey 0x10-0x70($key),$rndkey1 3871 pxor $rndkey0,$inout2 3872 mov \$-1,$inp_ 3873 cmp \$0x70,$len # is there at least 0x60 bytes ahead? 3874 pxor $rndkey0,$inout3 3875 pxor $rndkey0,$inout4 3876 pxor $rndkey0,$inout5 3877 pxor $rndkey0,$inout6 3878 3879 aesdec $rndkey1,$inout0 3880 pxor $rndkey0,$inout7 3881 $movkey 0x20-0x70($key),$rndkey0 3882 aesdec $rndkey1,$inout1 3883 aesdec $rndkey1,$inout2 3884 aesdec $rndkey1,$inout3 3885 aesdec $rndkey1,$inout4 3886 aesdec $rndkey1,$inout5 3887 aesdec $rndkey1,$inout6 3888 adc \$0,$inp_ 3889 and \$128,$inp_ 3890 aesdec $rndkey1,$inout7 3891 add $inp,$inp_ 3892 $movkey 0x30-0x70($key),$rndkey1 3893___ 3894for($i=1;$i<12;$i++) { 3895my $rndkeyx = ($i&1)?$rndkey0:$rndkey1; 3896$code.=<<___ if ($i==7); 3897 cmp \$11,$rounds 3898___ 3899$code.=<<___; 3900 aesdec $rndkeyx,$inout0 3901 aesdec $rndkeyx,$inout1 3902 aesdec $rndkeyx,$inout2 3903 aesdec $rndkeyx,$inout3 3904 aesdec $rndkeyx,$inout4 3905 aesdec $rndkeyx,$inout5 3906 aesdec $rndkeyx,$inout6 3907 aesdec $rndkeyx,$inout7 3908 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx 3909___ 3910$code.=<<___ if ($i<6 || (!($i&1) && $i>7)); 3911 nop 3912___ 3913$code.=<<___ if ($i==7); 3914 jb .Lcbc_dec_done 3915___ 3916$code.=<<___ if ($i==9); 3917 je .Lcbc_dec_done 3918___ 3919$code.=<<___ if ($i==11); 3920 jmp .Lcbc_dec_done 3921___ 3922} 3923$code.=<<___; 3924.align 16 3925.Lcbc_dec_done: 3926 aesdec $rndkey1,$inout0 3927 aesdec $rndkey1,$inout1 3928 pxor $rndkey0,$iv 3929 pxor $rndkey0,$in0 3930 aesdec $rndkey1,$inout2 3931 aesdec $rndkey1,$inout3 3932 pxor $rndkey0,$in1 3933 pxor $rndkey0,$in2 3934 aesdec $rndkey1,$inout4 3935 aesdec $rndkey1,$inout5 3936 pxor $rndkey0,$in3 3937 pxor $rndkey0,$in4 3938 aesdec $rndkey1,$inout6 3939 aesdec $rndkey1,$inout7 3940 movdqu 0x50($inp),$rndkey1 3941 3942 aesdeclast $iv,$inout0 3943 movdqu 0x60($inp),$iv # borrow $iv 3944 pxor $rndkey0,$rndkey1 3945 aesdeclast $in0,$inout1 3946 pxor $rndkey0,$iv 3947 movdqu 0x70($inp),$rndkey0 # next IV 3948 aesdeclast $in1,$inout2 3949 lea 0x80($inp),$inp 3950 movdqu 0x00($inp_),$in0 3951 aesdeclast $in2,$inout3 3952 aesdeclast $in3,$inout4 3953 movdqu 0x10($inp_),$in1 3954 movdqu 0x20($inp_),$in2 3955 aesdeclast $in4,$inout5 3956 aesdeclast $rndkey1,$inout6 3957 movdqu 0x30($inp_),$in3 3958 movdqu 0x40($inp_),$in4 3959 aesdeclast $iv,$inout7 3960 movdqa $rndkey0,$iv # return $iv 3961 movdqu 0x50($inp_),$rndkey1 3962 $movkey -0x70($key),$rndkey0 3963 3964 movups $inout0,($out) # store output 3965 movdqa $in0,$inout0 3966 movups $inout1,0x10($out) 3967 movdqa $in1,$inout1 3968 movups $inout2,0x20($out) 3969 movdqa $in2,$inout2 3970 movups $inout3,0x30($out) 3971 movdqa $in3,$inout3 3972 movups $inout4,0x40($out) 3973 movdqa $in4,$inout4 3974 movups $inout5,0x50($out) 3975 movdqa $rndkey1,$inout5 3976 movups $inout6,0x60($out) 3977 lea 0x70($out),$out 3978 3979 sub \$0x80,$len 3980 ja .Lcbc_dec_loop8 3981 3982 movaps $inout7,$inout0 3983 lea -0x70($key),$key 3984 add \$0x70,$len 3985 jle .Lcbc_dec_clear_tail_collected 3986 movups $inout7,($out) 3987 lea 0x10($out),$out 3988 cmp \$0x50,$len 3989 jbe .Lcbc_dec_tail 3990 3991 movaps $in0,$inout0 3992.Lcbc_dec_six_or_seven: 3993 cmp \$0x60,$len 3994 ja .Lcbc_dec_seven 3995 3996 movaps $inout5,$inout6 3997 call _aesni_decrypt6 3998 pxor $iv,$inout0 # ^= IV 3999 movaps $inout6,$iv 4000 pxor $in0,$inout1 4001 movdqu $inout0,($out) 4002 pxor $in1,$inout2 4003 movdqu $inout1,0x10($out) 4004 pxor $inout1,$inout1 # clear register bank 4005 pxor $in2,$inout3 4006 movdqu $inout2,0x20($out) 4007 pxor $inout2,$inout2 4008 pxor $in3,$inout4 4009 movdqu $inout3,0x30($out) 4010 pxor $inout3,$inout3 4011 pxor $in4,$inout5 4012 movdqu $inout4,0x40($out) 4013 pxor $inout4,$inout4 4014 lea 0x50($out),$out 4015 movdqa $inout5,$inout0 4016 pxor $inout5,$inout5 4017 jmp .Lcbc_dec_tail_collected 4018 4019.align 16 4020.Lcbc_dec_seven: 4021 movups 0x60($inp),$inout6 4022 xorps $inout7,$inout7 4023 call _aesni_decrypt8 4024 movups 0x50($inp),$inout7 4025 pxor $iv,$inout0 # ^= IV 4026 movups 0x60($inp),$iv 4027 pxor $in0,$inout1 4028 movdqu $inout0,($out) 4029 pxor $in1,$inout2 4030 movdqu $inout1,0x10($out) 4031 pxor $inout1,$inout1 # clear register bank 4032 pxor $in2,$inout3 4033 movdqu $inout2,0x20($out) 4034 pxor $inout2,$inout2 4035 pxor $in3,$inout4 4036 movdqu $inout3,0x30($out) 4037 pxor $inout3,$inout3 4038 pxor $in4,$inout5 4039 movdqu $inout4,0x40($out) 4040 pxor $inout4,$inout4 4041 pxor $inout7,$inout6 4042 movdqu $inout5,0x50($out) 4043 pxor $inout5,$inout5 4044 lea 0x60($out),$out 4045 movdqa $inout6,$inout0 4046 pxor $inout6,$inout6 4047 pxor $inout7,$inout7 4048 jmp .Lcbc_dec_tail_collected 4049 4050.align 16 4051.Lcbc_dec_loop6: 4052 movups $inout5,($out) 4053 lea 0x10($out),$out 4054 movdqu 0x00($inp),$inout0 # load input 4055 movdqu 0x10($inp),$inout1 4056 movdqa $inout0,$in0 4057 movdqu 0x20($inp),$inout2 4058 movdqa $inout1,$in1 4059 movdqu 0x30($inp),$inout3 4060 movdqa $inout2,$in2 4061 movdqu 0x40($inp),$inout4 4062 movdqa $inout3,$in3 4063 movdqu 0x50($inp),$inout5 4064 movdqa $inout4,$in4 4065.Lcbc_dec_loop6_enter: 4066 lea 0x60($inp),$inp 4067 movdqa $inout5,$inout6 4068 4069 call _aesni_decrypt6 4070 4071 pxor $iv,$inout0 # ^= IV 4072 movdqa $inout6,$iv 4073 pxor $in0,$inout1 4074 movdqu $inout0,($out) 4075 pxor $in1,$inout2 4076 movdqu $inout1,0x10($out) 4077 pxor $in2,$inout3 4078 movdqu $inout2,0x20($out) 4079 pxor $in3,$inout4 4080 mov $key_,$key 4081 movdqu $inout3,0x30($out) 4082 pxor $in4,$inout5 4083 mov $rnds_,$rounds 4084 movdqu $inout4,0x40($out) 4085 lea 0x50($out),$out 4086 sub \$0x60,$len 4087 ja .Lcbc_dec_loop6 4088 4089 movdqa $inout5,$inout0 4090 add \$0x50,$len 4091 jle .Lcbc_dec_clear_tail_collected 4092 movups $inout5,($out) 4093 lea 0x10($out),$out 4094 4095.Lcbc_dec_tail: 4096 movups ($inp),$inout0 4097 sub \$0x10,$len 4098 jbe .Lcbc_dec_one # $len is 1*16 or less 4099 4100 movups 0x10($inp),$inout1 4101 movaps $inout0,$in0 4102 sub \$0x10,$len 4103 jbe .Lcbc_dec_two # $len is 2*16 or less 4104 4105 movups 0x20($inp),$inout2 4106 movaps $inout1,$in1 4107 sub \$0x10,$len 4108 jbe .Lcbc_dec_three # $len is 3*16 or less 4109 4110 movups 0x30($inp),$inout3 4111 movaps $inout2,$in2 4112 sub \$0x10,$len 4113 jbe .Lcbc_dec_four # $len is 4*16 or less 4114 4115 movups 0x40($inp),$inout4 # $len is 5*16 or less 4116 movaps $inout3,$in3 4117 movaps $inout4,$in4 4118 xorps $inout5,$inout5 4119 call _aesni_decrypt6 4120 pxor $iv,$inout0 4121 movaps $in4,$iv 4122 pxor $in0,$inout1 4123 movdqu $inout0,($out) 4124 pxor $in1,$inout2 4125 movdqu $inout1,0x10($out) 4126 pxor $inout1,$inout1 # clear register bank 4127 pxor $in2,$inout3 4128 movdqu $inout2,0x20($out) 4129 pxor $inout2,$inout2 4130 pxor $in3,$inout4 4131 movdqu $inout3,0x30($out) 4132 pxor $inout3,$inout3 4133 lea 0x40($out),$out 4134 movdqa $inout4,$inout0 4135 pxor $inout4,$inout4 4136 pxor $inout5,$inout5 4137 sub \$0x10,$len 4138 jmp .Lcbc_dec_tail_collected 4139 4140.align 16 4141.Lcbc_dec_one: 4142 movaps $inout0,$in0 4143___ 4144 &aesni_generate1("dec",$key,$rounds); 4145$code.=<<___; 4146 xorps $iv,$inout0 4147 movaps $in0,$iv 4148 jmp .Lcbc_dec_tail_collected 4149.align 16 4150.Lcbc_dec_two: 4151 movaps $inout1,$in1 4152 call _aesni_decrypt2 4153 pxor $iv,$inout0 4154 movaps $in1,$iv 4155 pxor $in0,$inout1 4156 movdqu $inout0,($out) 4157 movdqa $inout1,$inout0 4158 pxor $inout1,$inout1 # clear register bank 4159 lea 0x10($out),$out 4160 jmp .Lcbc_dec_tail_collected 4161.align 16 4162.Lcbc_dec_three: 4163 movaps $inout2,$in2 4164 call _aesni_decrypt3 4165 pxor $iv,$inout0 4166 movaps $in2,$iv 4167 pxor $in0,$inout1 4168 movdqu $inout0,($out) 4169 pxor $in1,$inout2 4170 movdqu $inout1,0x10($out) 4171 pxor $inout1,$inout1 # clear register bank 4172 movdqa $inout2,$inout0 4173 pxor $inout2,$inout2 4174 lea 0x20($out),$out 4175 jmp .Lcbc_dec_tail_collected 4176.align 16 4177.Lcbc_dec_four: 4178 movaps $inout3,$in3 4179 call _aesni_decrypt4 4180 pxor $iv,$inout0 4181 movaps $in3,$iv 4182 pxor $in0,$inout1 4183 movdqu $inout0,($out) 4184 pxor $in1,$inout2 4185 movdqu $inout1,0x10($out) 4186 pxor $inout1,$inout1 # clear register bank 4187 pxor $in2,$inout3 4188 movdqu $inout2,0x20($out) 4189 pxor $inout2,$inout2 4190 movdqa $inout3,$inout0 4191 pxor $inout3,$inout3 4192 lea 0x30($out),$out 4193 jmp .Lcbc_dec_tail_collected 4194 4195.align 16 4196.Lcbc_dec_clear_tail_collected: 4197 pxor $inout1,$inout1 # clear register bank 4198 pxor $inout2,$inout2 4199 pxor $inout3,$inout3 4200___ 4201$code.=<<___ if (!$win64); 4202 pxor $inout4,$inout4 # %xmm6..9 4203 pxor $inout5,$inout5 4204 pxor $inout6,$inout6 4205 pxor $inout7,$inout7 4206___ 4207$code.=<<___; 4208.Lcbc_dec_tail_collected: 4209 movups $iv,($ivp) 4210 and \$15,$len 4211 jnz .Lcbc_dec_tail_partial 4212 movups $inout0,($out) 4213 pxor $inout0,$inout0 4214 jmp .Lcbc_dec_ret 4215.align 16 4216.Lcbc_dec_tail_partial: 4217 movaps $inout0,(%rsp) 4218 pxor $inout0,$inout0 4219 mov \$16,%rcx 4220 mov $out,%rdi 4221 sub $len,%rcx 4222 lea (%rsp),%rsi 4223 .long 0x9066A4F3 # rep movsb 4224 movdqa $inout0,(%rsp) 4225 4226.Lcbc_dec_ret: 4227 xorps $rndkey0,$rndkey0 # %xmm0 4228 pxor $rndkey1,$rndkey1 4229___ 4230$code.=<<___ if ($win64); 4231 movaps 0x10(%rsp),%xmm6 4232 movaps %xmm0,0x10(%rsp) # clear stack 4233 movaps 0x20(%rsp),%xmm7 4234 movaps %xmm0,0x20(%rsp) 4235 movaps 0x30(%rsp),%xmm8 4236 movaps %xmm0,0x30(%rsp) 4237 movaps 0x40(%rsp),%xmm9 4238 movaps %xmm0,0x40(%rsp) 4239 movaps 0x50(%rsp),%xmm10 4240 movaps %xmm0,0x50(%rsp) 4241 movaps 0x60(%rsp),%xmm11 4242 movaps %xmm0,0x60(%rsp) 4243 movaps 0x70(%rsp),%xmm12 4244 movaps %xmm0,0x70(%rsp) 4245 movaps 0x80(%rsp),%xmm13 4246 movaps %xmm0,0x80(%rsp) 4247 movaps 0x90(%rsp),%xmm14 4248 movaps %xmm0,0x90(%rsp) 4249 movaps 0xa0(%rsp),%xmm15 4250 movaps %xmm0,0xa0(%rsp) 4251___ 4252$code.=<<___; 4253 mov -8(%r11),%rbp 4254.cfi_restore %rbp 4255 lea (%r11),%rsp 4256.cfi_def_cfa_register %rsp 4257.Lcbc_ret: 4258 ret 4259.cfi_endproc 4260.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt 4261___ 4262} 4263# int ${PREFIX}_set_decrypt_key(const unsigned char *inp, 4264# int bits, AES_KEY *key) 4265# 4266# input: $inp user-supplied key 4267# $bits $inp length in bits 4268# $key pointer to key schedule 4269# output: %eax 0 denoting success, -1 or -2 - failure (see C) 4270# *$key key schedule 4271# 4272{ my ($inp,$bits,$key) = @_4args; 4273 $bits =~ s/%r/%e/; 4274 4275$code.=<<___; 4276.globl ${PREFIX}_set_decrypt_key 4277.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent 4278.align 16 4279${PREFIX}_set_decrypt_key: 4280.cfi_startproc 4281 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 4282.cfi_adjust_cfa_offset 8 4283 call __aesni_set_encrypt_key 4284 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key 4285 test %eax,%eax 4286 jnz .Ldec_key_ret 4287 lea 16($key,$bits),$inp # points at the end of key schedule 4288 4289 $movkey ($key),%xmm0 # just swap 4290 $movkey ($inp),%xmm1 4291 $movkey %xmm0,($inp) 4292 $movkey %xmm1,($key) 4293 lea 16($key),$key 4294 lea -16($inp),$inp 4295 4296.Ldec_key_inverse: 4297 $movkey ($key),%xmm0 # swap and inverse 4298 $movkey ($inp),%xmm1 4299 aesimc %xmm0,%xmm0 4300 aesimc %xmm1,%xmm1 4301 lea 16($key),$key 4302 lea -16($inp),$inp 4303 $movkey %xmm0,16($inp) 4304 $movkey %xmm1,-16($key) 4305 cmp $key,$inp 4306 ja .Ldec_key_inverse 4307 4308 $movkey ($key),%xmm0 # inverse middle 4309 aesimc %xmm0,%xmm0 4310 pxor %xmm1,%xmm1 4311 $movkey %xmm0,($inp) 4312 pxor %xmm0,%xmm0 4313.Ldec_key_ret: 4314 add \$8,%rsp 4315.cfi_adjust_cfa_offset -8 4316 ret 4317.cfi_endproc 4318.LSEH_end_set_decrypt_key: 4319.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key 4320___ 4321 4322# This is based on submission from Intel by 4323# Huang Ying 4324# Vinodh Gopal 4325# Kahraman Akdemir 4326# 4327# Aggressively optimized in respect to aeskeygenassist's critical path 4328# and is contained in %xmm0-5 to meet Win64 ABI requirement. 4329# 4330# int ${PREFIX}_set_encrypt_key(const unsigned char *inp, 4331# int bits, AES_KEY * const key); 4332# 4333# input: $inp user-supplied key 4334# $bits $inp length in bits 4335# $key pointer to key schedule 4336# output: %eax 0 denoting success, -1 or -2 - failure (see C) 4337# $bits rounds-1 (used in aesni_set_decrypt_key) 4338# *$key key schedule 4339# $key pointer to key schedule (used in 4340# aesni_set_decrypt_key) 4341# 4342# Subroutine is frame-less, which means that only volatile registers 4343# are used. Note that it's declared "abi-omnipotent", which means that 4344# amount of volatile registers is smaller on Windows. 4345# 4346$code.=<<___; 4347.globl ${PREFIX}_set_encrypt_key 4348.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent 4349.align 16 4350${PREFIX}_set_encrypt_key: 4351__aesni_set_encrypt_key: 4352.cfi_startproc 4353#ifdef BORINGSSL_DISPATCH_TEST 4354 movb \$1,BORINGSSL_function_hit+3(%rip) 4355#endif 4356 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 4357.cfi_adjust_cfa_offset 8 4358 mov \$-1,%rax 4359 test $inp,$inp 4360 jz .Lenc_key_ret 4361 test $key,$key 4362 jz .Lenc_key_ret 4363 4364 movups ($inp),%xmm0 # pull first 128 bits of *userKey 4365 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 4366 leaq OPENSSL_ia32cap_P(%rip),%r10 4367 movl 4(%r10),%r10d 4368 and \$`1<<28|1<<11`,%r10d # AVX and XOP bits 4369 lea 16($key),%rax # %rax is used as modifiable copy of $key 4370 cmp \$256,$bits 4371 je .L14rounds 4372 cmp \$192,$bits 4373 je .L12rounds 4374 cmp \$128,$bits 4375 jne .Lbad_keybits 4376 4377.L10rounds: 4378 mov \$9,$bits # 10 rounds for 128-bit key 4379 cmp \$`1<<28`,%r10d # AVX, bit no XOP 4380 je .L10rounds_alt 4381 4382 $movkey %xmm0,($key) # round 0 4383 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 4384 call .Lkey_expansion_128_cold 4385 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 4386 call .Lkey_expansion_128 4387 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 4388 call .Lkey_expansion_128 4389 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 4390 call .Lkey_expansion_128 4391 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 4392 call .Lkey_expansion_128 4393 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 4394 call .Lkey_expansion_128 4395 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 4396 call .Lkey_expansion_128 4397 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 4398 call .Lkey_expansion_128 4399 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 4400 call .Lkey_expansion_128 4401 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 4402 call .Lkey_expansion_128 4403 $movkey %xmm0,(%rax) 4404 mov $bits,80(%rax) # 240(%rdx) 4405 xor %eax,%eax 4406 jmp .Lenc_key_ret 4407 4408.align 16 4409.L10rounds_alt: 4410 movdqa .Lkey_rotate(%rip),%xmm5 4411 mov \$8,%r10d 4412 movdqa .Lkey_rcon1(%rip),%xmm4 4413 movdqa %xmm0,%xmm2 4414 movdqu %xmm0,($key) 4415 jmp .Loop_key128 4416 4417.align 16 4418.Loop_key128: 4419 pshufb %xmm5,%xmm0 4420 aesenclast %xmm4,%xmm0 4421 pslld \$1,%xmm4 4422 lea 16(%rax),%rax 4423 4424 movdqa %xmm2,%xmm3 4425 pslldq \$4,%xmm2 4426 pxor %xmm2,%xmm3 4427 pslldq \$4,%xmm2 4428 pxor %xmm2,%xmm3 4429 pslldq \$4,%xmm2 4430 pxor %xmm3,%xmm2 4431 4432 pxor %xmm2,%xmm0 4433 movdqu %xmm0,-16(%rax) 4434 movdqa %xmm0,%xmm2 4435 4436 dec %r10d 4437 jnz .Loop_key128 4438 4439 movdqa .Lkey_rcon1b(%rip),%xmm4 4440 4441 pshufb %xmm5,%xmm0 4442 aesenclast %xmm4,%xmm0 4443 pslld \$1,%xmm4 4444 4445 movdqa %xmm2,%xmm3 4446 pslldq \$4,%xmm2 4447 pxor %xmm2,%xmm3 4448 pslldq \$4,%xmm2 4449 pxor %xmm2,%xmm3 4450 pslldq \$4,%xmm2 4451 pxor %xmm3,%xmm2 4452 4453 pxor %xmm2,%xmm0 4454 movdqu %xmm0,(%rax) 4455 4456 movdqa %xmm0,%xmm2 4457 pshufb %xmm5,%xmm0 4458 aesenclast %xmm4,%xmm0 4459 4460 movdqa %xmm2,%xmm3 4461 pslldq \$4,%xmm2 4462 pxor %xmm2,%xmm3 4463 pslldq \$4,%xmm2 4464 pxor %xmm2,%xmm3 4465 pslldq \$4,%xmm2 4466 pxor %xmm3,%xmm2 4467 4468 pxor %xmm2,%xmm0 4469 movdqu %xmm0,16(%rax) 4470 4471 mov $bits,96(%rax) # 240($key) 4472 xor %eax,%eax 4473 jmp .Lenc_key_ret 4474 4475.align 16 4476.L12rounds: 4477 movq 16($inp),%xmm2 # remaining 1/3 of *userKey 4478 mov \$11,$bits # 12 rounds for 192 4479 cmp \$`1<<28`,%r10d # AVX, but no XOP 4480 je .L12rounds_alt 4481 4482 $movkey %xmm0,($key) # round 0 4483 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 4484 call .Lkey_expansion_192a_cold 4485 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 4486 call .Lkey_expansion_192b 4487 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 4488 call .Lkey_expansion_192a 4489 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 4490 call .Lkey_expansion_192b 4491 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 4492 call .Lkey_expansion_192a 4493 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 4494 call .Lkey_expansion_192b 4495 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 4496 call .Lkey_expansion_192a 4497 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 4498 call .Lkey_expansion_192b 4499 $movkey %xmm0,(%rax) 4500 mov $bits,48(%rax) # 240(%rdx) 4501 xor %rax, %rax 4502 jmp .Lenc_key_ret 4503 4504.align 16 4505.L12rounds_alt: 4506 movdqa .Lkey_rotate192(%rip),%xmm5 4507 movdqa .Lkey_rcon1(%rip),%xmm4 4508 mov \$8,%r10d 4509 movdqu %xmm0,($key) 4510 jmp .Loop_key192 4511 4512.align 16 4513.Loop_key192: 4514 movq %xmm2,0(%rax) 4515 movdqa %xmm2,%xmm1 4516 pshufb %xmm5,%xmm2 4517 aesenclast %xmm4,%xmm2 4518 pslld \$1, %xmm4 4519 lea 24(%rax),%rax 4520 4521 movdqa %xmm0,%xmm3 4522 pslldq \$4,%xmm0 4523 pxor %xmm0,%xmm3 4524 pslldq \$4,%xmm0 4525 pxor %xmm0,%xmm3 4526 pslldq \$4,%xmm0 4527 pxor %xmm3,%xmm0 4528 4529 pshufd \$0xff,%xmm0,%xmm3 4530 pxor %xmm1,%xmm3 4531 pslldq \$4,%xmm1 4532 pxor %xmm1,%xmm3 4533 4534 pxor %xmm2,%xmm0 4535 pxor %xmm3,%xmm2 4536 movdqu %xmm0,-16(%rax) 4537 4538 dec %r10d 4539 jnz .Loop_key192 4540 4541 mov $bits,32(%rax) # 240($key) 4542 xor %eax,%eax 4543 jmp .Lenc_key_ret 4544 4545.align 16 4546.L14rounds: 4547 movups 16($inp),%xmm2 # remaining half of *userKey 4548 mov \$13,$bits # 14 rounds for 256 4549 lea 16(%rax),%rax 4550 cmp \$`1<<28`,%r10d # AVX, but no XOP 4551 je .L14rounds_alt 4552 4553 $movkey %xmm0,($key) # round 0 4554 $movkey %xmm2,16($key) # round 1 4555 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 4556 call .Lkey_expansion_256a_cold 4557 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 4558 call .Lkey_expansion_256b 4559 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 4560 call .Lkey_expansion_256a 4561 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 4562 call .Lkey_expansion_256b 4563 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 4564 call .Lkey_expansion_256a 4565 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 4566 call .Lkey_expansion_256b 4567 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 4568 call .Lkey_expansion_256a 4569 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 4570 call .Lkey_expansion_256b 4571 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 4572 call .Lkey_expansion_256a 4573 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 4574 call .Lkey_expansion_256b 4575 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 4576 call .Lkey_expansion_256a 4577 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 4578 call .Lkey_expansion_256b 4579 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 4580 call .Lkey_expansion_256a 4581 $movkey %xmm0,(%rax) 4582 mov $bits,16(%rax) # 240(%rdx) 4583 xor %rax,%rax 4584 jmp .Lenc_key_ret 4585 4586.align 16 4587.L14rounds_alt: 4588 movdqa .Lkey_rotate(%rip),%xmm5 4589 movdqa .Lkey_rcon1(%rip),%xmm4 4590 mov \$7,%r10d 4591 movdqu %xmm0,0($key) 4592 movdqa %xmm2,%xmm1 4593 movdqu %xmm2,16($key) 4594 jmp .Loop_key256 4595 4596.align 16 4597.Loop_key256: 4598 pshufb %xmm5,%xmm2 4599 aesenclast %xmm4,%xmm2 4600 4601 movdqa %xmm0,%xmm3 4602 pslldq \$4,%xmm0 4603 pxor %xmm0,%xmm3 4604 pslldq \$4,%xmm0 4605 pxor %xmm0,%xmm3 4606 pslldq \$4,%xmm0 4607 pxor %xmm3,%xmm0 4608 pslld \$1,%xmm4 4609 4610 pxor %xmm2,%xmm0 4611 movdqu %xmm0,(%rax) 4612 4613 dec %r10d 4614 jz .Ldone_key256 4615 4616 pshufd \$0xff,%xmm0,%xmm2 4617 pxor %xmm3,%xmm3 4618 aesenclast %xmm3,%xmm2 4619 4620 movdqa %xmm1,%xmm3 4621 pslldq \$4,%xmm1 4622 pxor %xmm1,%xmm3 4623 pslldq \$4,%xmm1 4624 pxor %xmm1,%xmm3 4625 pslldq \$4,%xmm1 4626 pxor %xmm3,%xmm1 4627 4628 pxor %xmm1,%xmm2 4629 movdqu %xmm2,16(%rax) 4630 lea 32(%rax),%rax 4631 movdqa %xmm2,%xmm1 4632 4633 jmp .Loop_key256 4634 4635.Ldone_key256: 4636 mov $bits,16(%rax) # 240($key) 4637 xor %eax,%eax 4638 jmp .Lenc_key_ret 4639 4640.align 16 4641.Lbad_keybits: 4642 mov \$-2,%rax 4643.Lenc_key_ret: 4644 pxor %xmm0,%xmm0 4645 pxor %xmm1,%xmm1 4646 pxor %xmm2,%xmm2 4647 pxor %xmm3,%xmm3 4648 pxor %xmm4,%xmm4 4649 pxor %xmm5,%xmm5 4650 add \$8,%rsp 4651.cfi_adjust_cfa_offset -8 4652 ret 4653.cfi_endproc 4654.LSEH_end_set_encrypt_key: 4655 4656.align 16 4657.Lkey_expansion_128: 4658 $movkey %xmm0,(%rax) 4659 lea 16(%rax),%rax 4660.Lkey_expansion_128_cold: 4661 shufps \$0b00010000,%xmm0,%xmm4 4662 xorps %xmm4, %xmm0 4663 shufps \$0b10001100,%xmm0,%xmm4 4664 xorps %xmm4, %xmm0 4665 shufps \$0b11111111,%xmm1,%xmm1 # critical path 4666 xorps %xmm1,%xmm0 4667 ret 4668 4669.align 16 4670.Lkey_expansion_192a: 4671 $movkey %xmm0,(%rax) 4672 lea 16(%rax),%rax 4673.Lkey_expansion_192a_cold: 4674 movaps %xmm2, %xmm5 4675.Lkey_expansion_192b_warm: 4676 shufps \$0b00010000,%xmm0,%xmm4 4677 movdqa %xmm2,%xmm3 4678 xorps %xmm4,%xmm0 4679 shufps \$0b10001100,%xmm0,%xmm4 4680 pslldq \$4,%xmm3 4681 xorps %xmm4,%xmm0 4682 pshufd \$0b01010101,%xmm1,%xmm1 # critical path 4683 pxor %xmm3,%xmm2 4684 pxor %xmm1,%xmm0 4685 pshufd \$0b11111111,%xmm0,%xmm3 4686 pxor %xmm3,%xmm2 4687 ret 4688 4689.align 16 4690.Lkey_expansion_192b: 4691 movaps %xmm0,%xmm3 4692 shufps \$0b01000100,%xmm0,%xmm5 4693 $movkey %xmm5,(%rax) 4694 shufps \$0b01001110,%xmm2,%xmm3 4695 $movkey %xmm3,16(%rax) 4696 lea 32(%rax),%rax 4697 jmp .Lkey_expansion_192b_warm 4698 4699.align 16 4700.Lkey_expansion_256a: 4701 $movkey %xmm2,(%rax) 4702 lea 16(%rax),%rax 4703.Lkey_expansion_256a_cold: 4704 shufps \$0b00010000,%xmm0,%xmm4 4705 xorps %xmm4,%xmm0 4706 shufps \$0b10001100,%xmm0,%xmm4 4707 xorps %xmm4,%xmm0 4708 shufps \$0b11111111,%xmm1,%xmm1 # critical path 4709 xorps %xmm1,%xmm0 4710 ret 4711 4712.align 16 4713.Lkey_expansion_256b: 4714 $movkey %xmm0,(%rax) 4715 lea 16(%rax),%rax 4716 4717 shufps \$0b00010000,%xmm2,%xmm4 4718 xorps %xmm4,%xmm2 4719 shufps \$0b10001100,%xmm2,%xmm4 4720 xorps %xmm4,%xmm2 4721 shufps \$0b10101010,%xmm1,%xmm1 # critical path 4722 xorps %xmm1,%xmm2 4723 ret 4724.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key 4725.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key 4726___ 4727} 4728 4729$code.=<<___; 4730.align 64 4731.Lbswap_mask: 4732 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 4733.Lincrement32: 4734 .long 6,6,6,0 4735.Lincrement64: 4736 .long 1,0,0,0 4737.Lxts_magic: 4738 .long 0x87,0,1,0 4739.Lincrement1: 4740 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 4741.Lkey_rotate: 4742 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d 4743.Lkey_rotate192: 4744 .long 0x04070605,0x04070605,0x04070605,0x04070605 4745.Lkey_rcon1: 4746 .long 1,1,1,1 4747.Lkey_rcon1b: 4748 .long 0x1b,0x1b,0x1b,0x1b 4749 4750.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" 4751.align 64 4752___ 4753 4754# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 4755# CONTEXT *context,DISPATCHER_CONTEXT *disp) 4756if ($win64) { 4757$rec="%rcx"; 4758$frame="%rdx"; 4759$context="%r8"; 4760$disp="%r9"; 4761 4762$code.=<<___; 4763.extern __imp_RtlVirtualUnwind 4764___ 4765$code.=<<___ if ($PREFIX eq "aes_hw"); 4766.type ecb_ccm64_se_handler,\@abi-omnipotent 4767.align 16 4768ecb_ccm64_se_handler: 4769 push %rsi 4770 push %rdi 4771 push %rbx 4772 push %rbp 4773 push %r12 4774 push %r13 4775 push %r14 4776 push %r15 4777 pushfq 4778 sub \$64,%rsp 4779 4780 mov 120($context),%rax # pull context->Rax 4781 mov 248($context),%rbx # pull context->Rip 4782 4783 mov 8($disp),%rsi # disp->ImageBase 4784 mov 56($disp),%r11 # disp->HandlerData 4785 4786 mov 0(%r11),%r10d # HandlerData[0] 4787 lea (%rsi,%r10),%r10 # prologue label 4788 cmp %r10,%rbx # context->Rip<prologue label 4789 jb .Lcommon_seh_tail 4790 4791 mov 152($context),%rax # pull context->Rsp 4792 4793 mov 4(%r11),%r10d # HandlerData[1] 4794 lea (%rsi,%r10),%r10 # epilogue label 4795 cmp %r10,%rbx # context->Rip>=epilogue label 4796 jae .Lcommon_seh_tail 4797 4798 lea 0(%rax),%rsi # %xmm save area 4799 lea 512($context),%rdi # &context.Xmm6 4800 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) 4801 .long 0xa548f3fc # cld; rep movsq 4802 lea 0x58(%rax),%rax # adjust stack pointer 4803 4804 jmp .Lcommon_seh_tail 4805.size ${PREFIX}_ccm64_se_handler,.-${PREFIX}_ccm64_se_handler 4806 4807.type ctr_xts_se_handler,\@abi-omnipotent 4808.align 16 4809ctr_xts_se_handler: 4810 push %rsi 4811 push %rdi 4812 push %rbx 4813 push %rbp 4814 push %r12 4815 push %r13 4816 push %r14 4817 push %r15 4818 pushfq 4819 sub \$64,%rsp 4820 4821 mov 120($context),%rax # pull context->Rax 4822 mov 248($context),%rbx # pull context->Rip 4823 4824 mov 8($disp),%rsi # disp->ImageBase 4825 mov 56($disp),%r11 # disp->HandlerData 4826 4827 mov 0(%r11),%r10d # HandlerData[0] 4828 lea (%rsi,%r10),%r10 # prologue lable 4829 cmp %r10,%rbx # context->Rip<prologue label 4830 jb .Lcommon_seh_tail 4831 4832 mov 152($context),%rax # pull context->Rsp 4833 4834 mov 4(%r11),%r10d # HandlerData[1] 4835 lea (%rsi,%r10),%r10 # epilogue label 4836 cmp %r10,%rbx # context->Rip>=epilogue label 4837 jae .Lcommon_seh_tail 4838 4839 mov 208($context),%rax # pull context->R11 4840 4841 lea -0xa8(%rax),%rsi # %xmm save area 4842 lea 512($context),%rdi # & context.Xmm6 4843 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4844 .long 0xa548f3fc # cld; rep movsq 4845 4846 mov -8(%rax),%rbp # restore saved %rbp 4847 mov %rbp,160($context) # restore context->Rbp 4848 jmp .Lcommon_seh_tail 4849.size ctr_xts_se_handler,.-ctr_xts_se_handler 4850 4851___ 4852# BoringSSL omits the OCB functions. 4853$code.=<<___ if (0); 4854.type ocb_se_handler,\@abi-omnipotent 4855.align 16 4856ocb_se_handler: 4857 push %rsi 4858 push %rdi 4859 push %rbx 4860 push %rbp 4861 push %r12 4862 push %r13 4863 push %r14 4864 push %r15 4865 pushfq 4866 sub \$64,%rsp 4867 4868 mov 120($context),%rax # pull context->Rax 4869 mov 248($context),%rbx # pull context->Rip 4870 4871 mov 8($disp),%rsi # disp->ImageBase 4872 mov 56($disp),%r11 # disp->HandlerData 4873 4874 mov 0(%r11),%r10d # HandlerData[0] 4875 lea (%rsi,%r10),%r10 # prologue lable 4876 cmp %r10,%rbx # context->Rip<prologue label 4877 jb .Lcommon_seh_tail 4878 4879 mov 4(%r11),%r10d # HandlerData[1] 4880 lea (%rsi,%r10),%r10 # epilogue label 4881 cmp %r10,%rbx # context->Rip>=epilogue label 4882 jae .Lcommon_seh_tail 4883 4884 mov 8(%r11),%r10d # HandlerData[2] 4885 lea (%rsi,%r10),%r10 4886 cmp %r10,%rbx # context->Rip>=pop label 4887 jae .Locb_no_xmm 4888 4889 mov 152($context),%rax # pull context->Rsp 4890 4891 lea (%rax),%rsi # %xmm save area 4892 lea 512($context),%rdi # & context.Xmm6 4893 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4894 .long 0xa548f3fc # cld; rep movsq 4895 lea 0xa0+0x28(%rax),%rax 4896 4897.Locb_no_xmm: 4898 mov -8(%rax),%rbx 4899 mov -16(%rax),%rbp 4900 mov -24(%rax),%r12 4901 mov -32(%rax),%r13 4902 mov -40(%rax),%r14 4903 4904 mov %rbx,144($context) # restore context->Rbx 4905 mov %rbp,160($context) # restore context->Rbp 4906 mov %r12,216($context) # restore context->R12 4907 mov %r13,224($context) # restore context->R13 4908 mov %r14,232($context) # restore context->R14 4909 4910 jmp .Lcommon_seh_tail 4911.size ocb_se_handler,.-ocb_se_handler 4912___ 4913$code.=<<___; 4914.type cbc_se_handler,\@abi-omnipotent 4915.align 16 4916cbc_se_handler: 4917 push %rsi 4918 push %rdi 4919 push %rbx 4920 push %rbp 4921 push %r12 4922 push %r13 4923 push %r14 4924 push %r15 4925 pushfq 4926 sub \$64,%rsp 4927 4928 mov 152($context),%rax # pull context->Rsp 4929 mov 248($context),%rbx # pull context->Rip 4930 4931 lea .Lcbc_decrypt_bulk(%rip),%r10 4932 cmp %r10,%rbx # context->Rip<"prologue" label 4933 jb .Lcommon_seh_tail 4934 4935 mov 120($context),%rax # pull context->Rax 4936 4937 lea .Lcbc_decrypt_body(%rip),%r10 4938 cmp %r10,%rbx # context->Rip<cbc_decrypt_body 4939 jb .Lcommon_seh_tail 4940 4941 mov 152($context),%rax # pull context->Rsp 4942 4943 lea .Lcbc_ret(%rip),%r10 4944 cmp %r10,%rbx # context->Rip>="epilogue" label 4945 jae .Lcommon_seh_tail 4946 4947 lea 16(%rax),%rsi # %xmm save area 4948 lea 512($context),%rdi # &context.Xmm6 4949 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4950 .long 0xa548f3fc # cld; rep movsq 4951 4952 mov 208($context),%rax # pull context->R11 4953 4954 mov -8(%rax),%rbp # restore saved %rbp 4955 mov %rbp,160($context) # restore context->Rbp 4956 4957.Lcommon_seh_tail: 4958 mov 8(%rax),%rdi 4959 mov 16(%rax),%rsi 4960 mov %rax,152($context) # restore context->Rsp 4961 mov %rsi,168($context) # restore context->Rsi 4962 mov %rdi,176($context) # restore context->Rdi 4963 4964 mov 40($disp),%rdi # disp->ContextRecord 4965 mov $context,%rsi # context 4966 mov \$154,%ecx # sizeof(CONTEXT) 4967 .long 0xa548f3fc # cld; rep movsq 4968 4969 mov $disp,%rsi 4970 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 4971 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4972 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4973 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4974 mov 40(%rsi),%r10 # disp->ContextRecord 4975 lea 56(%rsi),%r11 # &disp->HandlerData 4976 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4977 mov %r10,32(%rsp) # arg5 4978 mov %r11,40(%rsp) # arg6 4979 mov %r12,48(%rsp) # arg7 4980 mov %rcx,56(%rsp) # arg8, (NULL) 4981 call *__imp_RtlVirtualUnwind(%rip) 4982 4983 mov \$1,%eax # ExceptionContinueSearch 4984 add \$64,%rsp 4985 popfq 4986 pop %r15 4987 pop %r14 4988 pop %r13 4989 pop %r12 4990 pop %rbp 4991 pop %rbx 4992 pop %rdi 4993 pop %rsi 4994 ret 4995.size cbc_se_handler,.-cbc_se_handler 4996 4997.section .pdata 4998.align 4 4999___ 5000$code.=<<___ if ($PREFIX eq "aes_hw"); 5001 .rva .LSEH_begin_${PREFIX}_ecb_encrypt 5002 .rva .LSEH_end_${PREFIX}_ecb_encrypt 5003 .rva .LSEH_info_ecb 5004 5005 .rva .LSEH_begin_${PREFIX}_ctr32_encrypt_blocks 5006 .rva .LSEH_end_${PREFIX}_ctr32_encrypt_blocks 5007 .rva .LSEH_info_ctr32 5008___ 5009$code.=<<___; 5010 .rva .LSEH_begin_${PREFIX}_cbc_encrypt 5011 .rva .LSEH_end_${PREFIX}_cbc_encrypt 5012 .rva .LSEH_info_cbc 5013 5014 .rva ${PREFIX}_set_decrypt_key 5015 .rva .LSEH_end_set_decrypt_key 5016 .rva .LSEH_info_key 5017 5018 .rva ${PREFIX}_set_encrypt_key 5019 .rva .LSEH_end_set_encrypt_key 5020 .rva .LSEH_info_key 5021.section .xdata 5022.align 8 5023___ 5024$code.=<<___ if ($PREFIX eq "aes_hw"); 5025.LSEH_info_ecb: 5026 .byte 9,0,0,0 5027 .rva ecb_ccm64_se_handler 5028 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[] 5029.LSEH_info_ctr32: 5030 .byte 9,0,0,0 5031 .rva ctr_xts_se_handler 5032 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] 5033___ 5034$code.=<<___; 5035.LSEH_info_cbc: 5036 .byte 9,0,0,0 5037 .rva cbc_se_handler 5038.LSEH_info_key: 5039 .byte 0x01,0x04,0x01,0x00 5040 .byte 0x04,0x02,0x00,0x00 # sub rsp,8 5041___ 5042} 5043 5044sub rex { 5045 local *opcode=shift; 5046 my ($dst,$src)=@_; 5047 my $rex=0; 5048 5049 $rex|=0x04 if($dst>=8); 5050 $rex|=0x01 if($src>=8); 5051 push @opcode,$rex|0x40 if($rex); 5052} 5053 5054sub aesni { 5055 my $line=shift; 5056 my @opcode=(0x66); 5057 5058 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { 5059 rex(\@opcode,$4,$3); 5060 push @opcode,0x0f,0x3a,0xdf; 5061 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M 5062 my $c=$2; 5063 push @opcode,$c=~/^0/?oct($c):$c; 5064 return ".byte\t".join(',',@opcode); 5065 } 5066 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { 5067 my %opcodelet = ( 5068 "aesimc" => 0xdb, 5069 "aesenc" => 0xdc, "aesenclast" => 0xdd, 5070 "aesdec" => 0xde, "aesdeclast" => 0xdf 5071 ); 5072 return undef if (!defined($opcodelet{$1})); 5073 rex(\@opcode,$3,$2); 5074 push @opcode,0x0f,0x38,$opcodelet{$1}; 5075 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M 5076 return ".byte\t".join(',',@opcode); 5077 } 5078 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { 5079 my %opcodelet = ( 5080 "aesenc" => 0xdc, "aesenclast" => 0xdd, 5081 "aesdec" => 0xde, "aesdeclast" => 0xdf 5082 ); 5083 return undef if (!defined($opcodelet{$1})); 5084 my $off = $2; 5085 push @opcode,0x44 if ($3>=8); 5086 push @opcode,0x0f,0x38,$opcodelet{$1}; 5087 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M 5088 push @opcode,($off=~/^0/?oct($off):$off)&0xff; 5089 return ".byte\t".join(',',@opcode); 5090 } 5091 return $line; 5092} 5093 5094sub movbe { 5095 ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift; 5096} 5097 5098$code =~ s/\`([^\`]*)\`/eval($1)/gem; 5099$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; 5100#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact 5101$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem; 5102 5103print $code; 5104 5105close STDOUT or die "error closing STDOUT"; 5106