#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for the Intel AES-NI extension. In
# OpenSSL context it's used with the Intel engine, but can also be used
# as a drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below
# for details].
#
# Performance.
#
# Given the aes(enc|dec) instructions' latency, asymptotic performance
# for non-parallelizable modes such as CBC encrypt is 3.75 cycles per
# byte processed with a 128-bit key. And given their throughput,
# asymptotic performance for parallelizable modes is 1.25 cycles per
# byte. Being asymptotic limits, these are not something you commonly
# achieve in reality, but how close does one get? Below are results
# collected for different modes and block sizes. Pairs of numbers are
# for en-/decryption.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26   1.26/1.26
# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
#
# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
# that the otherwise used 'openssl speed -evp aes-128-??? -engine aesni
# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
# The results were collected with a specially crafted speed.c benchmark
# in order to compare them with results reported in the "Intel Advanced
# Encryption Standard (AES) New Instruction Set" White Paper Revision
# 3.0 dated May 2010. All above results are consistently better. This
# module also provides better performance for block sizes smaller than
# 128 bytes at points *not* represented in the above table.
#
# Looking at the results for the 8-KB buffer.
#
# CFB and OFB results are far from the limit, because the implementation
# uses the "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
# single-block aesni_encrypt, which is not the most optimal way to go.
# The CBC encrypt result is unexpectedly high and there is no documented
# explanation for it. Seemingly there is a small penalty for feeding
# the result back to the AES unit the way it's done in CBC mode. There
# is nothing one can do and the result appears optimal. The CCM result
# is identical to CBC, because CBC-MAC is essentially CBC encrypt
# without saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over a
# "straightforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# Looking at how results vary with buffer size.
#
# Curves are practically saturated at 1-KB buffer size. In most cases
# "256-byte" performance is >95%, and "64-byte" is ~90% of the "8-KB"
# one. The CTR curve doesn't follow this pattern and is the slowest-
# changing one, with the "256-byte" result being 87% of "8-KB." This is
# because the per-block overhead in CTR mode is the most computationally
# intensive. Small-block CCM decrypt is slower than encrypt, because the
# first CTR and last CBC-MAC iterations can't be interleaved.
#
# Results for 192- and 256-bit keys.
#
# EVP-free results were observed to scale perfectly with the number of
# rounds for larger block sizes, i.e. the 192-bit result being 10/12
# times lower and the 256-bit one - 10/14. Well, in the CBC encrypt case
# the differences are a tad smaller, because the above mentioned penalty
# biases all results by the same constant value. In a similar way
# function call overhead affects small-block performance, as well as OFB
# and CFB results. Differences are not large, the most common
# coefficients are 10/11.7 and 10/13.4 (as opposed to 10/12.0 and
# 10/14.0), but one can observe even 10/11.2 and 10/12.4 (CTR, OFB,
# CFB)...

# January 2011
#
# While the Westmere processor features 6 cycles latency for
# aes[enc|dec] instructions, which can be scheduled every second cycle,
# Sandy Bridge spends 8 cycles per instruction, but it can schedule them
# every cycle. This means that code targeting Westmere would perform
# suboptimally on Sandy Bridge. Therefore this update.
#
# In addition, non-parallelizable CBC encrypt (as well as CCM) is
# optimized. The relative improvement might appear modest, 8% on
# Westmere, but in absolute terms it's 3.77 cycles per byte encrypted
# with a 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These
# numbers should be compared to the asymptotic limits of 3.75 for
# Westmere and 5.00 for Sandy Bridge. Actually, the fact that they get
# this close to the asymptotic limits is quite amazing. Indeed, the
# limit is calculated as latency times number of rounds, 10 for a
# 128-bit key, divided by 16, the number of bytes in a block; in other
# words it accounts *solely* for aesenc instructions. But there are
# extra instructions, and numbers so close to the asymptotic limits mean
# that it's as if it takes as little as *one* additional cycle to
# execute all of them. How is that possible? It is possible thanks to
# out-of-order execution logic, which manages to overlap post-processing
# of the previous block, things like saving the output, with actual
# encryption of the current block, as well as pre-processing of the
# current block, things like fetching input and xor-ing it with the
# 0-round element of the key schedule, with actual encryption of the
# previous block. Keep this in mind...
#
# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
# performance is achieved by interleaving instructions working on
# independent blocks. In that case the asymptotic limit for such modes
# can be obtained by dividing the above mentioned numbers by the AES
# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that the optimal interleave factor is
# 3, and that's where the "magic" number of 1.25 comes from. "Optimal
# interleave factor" means that increasing the interleave factor does
# not improve performance. The formula has proven to reflect reality
# pretty well on Westmere...
# Sandy Bridge, on the other hand, can execute up to 8 AES instructions
# at a time, so how does varying the interleave factor affect the
# performance? Here is a table for ECB (numbers are cycles per byte
# processed with a 128-bit key):
#
# instruction interleave factor		3x	6x	8x
# theoretical asymptotic limit		1.67	0.83	0.625
# measured performance for 8KB block	1.05	0.86	0.84
#
# "as if" interleave factor		4.7x	5.8x	6.0x
#
# Further data for other parallelizable modes:
#
# CBC decrypt				1.16	0.93	0.74
# CTR					1.14	0.91	0.74
#
# Well, given the 3x column it's probably inappropriate to call the
# limit asymptotic if it can be surpassed, isn't it? What happens there?
# Rewind to the CBC paragraph for the answer. Yes, out-of-order
# execution magic is responsible for this. The processor overlaps not
# only the additional instructions with the AES ones, but even AES
# instructions processing adjacent triplets of independent blocks. In
# the 6x case the additional instructions still claim a
# disproportionately small amount of additional cycles, but in the 8x
# case the number of instructions must be a tad too high for the
# out-of-order logic to cope with, and the AES unit remains
# underutilized... As you can see, 8x interleave is hardly justifiable,
# so there is no need to feel bad that 32-bit aesni-x86.pl utilizes 6x
# interleave because of limited register bank capacity.
#
# Higher interleave factors do have a negative impact on Westmere
# performance. While for ECB mode it's negligible, ~1.5%, other
# parallelizable modes perform ~5% worse, which is outweighed by a ~25%
# improvement on Sandy Bridge. To balance the regression on Westmere,
# CTR mode was implemented with a 6x aesenc interleave factor.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing one
# byte out of 8KB with a 128-bit key, Sandy Bridge - 0.90. Just like in
# CTR mode, the AES instruction interleave factor was chosen to be 6x.

# November 2015
#
# Add aesni_ocb_[en|de]crypt. The AES instruction interleave factor was
# chosen to be 6x.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB	OCB
# Westmere	3.77/1.25	1.25	1.25	1.26
# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
# Skylake	2.62/0.63	0.63	0.63	0.63
# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
# Knights L	2.54/0.77	0.78	0.85	-	1.50
# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
# Ryzen		2.71/0.35	0.35	0.44	0.38	0.49
#
# (*)	Atom Silvermont ECB result is suboptimal because of penalties
#	incurred by operations on %xmm8-15. As ECB is not considered
#	critical, nothing was done to mitigate the problem.
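
# A quick sanity check of the asymptotic limits quoted above, using
# only the latency/throughput figures already given in this header (a
# back-of-the-envelope sketch, not an additional measurement):
#
#	serial CBC encrypt, 128-bit key (10 rounds):
#	    Westmere:     6 cycles * 10 / 16 bytes = 3.75 cpb
#	    Sandy Bridge: 8 cycles * 10 / 16 bytes = 5.00 cpb
#	parallelizable modes, interleave factor N:
#	    Westmere,  N=3: 6*10/(16*3) = 1.25  cpb
#	    S. Bridge, N=3: 8*10/(16*3) = 1.67  cpb
#	    S. Bridge, N=6: 8*10/(16*6) = 0.83  cpb
#	    S. Bridge, N=8: 8*10/(16*8) = 0.625 cpb
#
# which is where the 3.75/5.00, 1.25 and 1.67/0.83/0.625 figures in the
# tables above come from.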

$PREFIX="aes_hw";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$movkey = $PREFIX eq "aes_hw" ? "movups" : "movups";
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code=".text\n";
$code.=".extern	OPENSSL_ia32cap_P\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
$in0="%xmm8";		$iv="%xmm9";

# Inline version of internal aesni_[en|de]crypt1.
#
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
# cycles which take care of loop variables...
{ my $sn;
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
++$sn;
$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
$code.=<<___ if (defined($ivec));
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
$code.=<<___ if (!defined($ivec));
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
$code.=<<___;
.Loop_${p}1_$sn:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
{ my ($inp,$out,$key) = @_4args;

$code.=<<___;
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@abi-omnipotent
.align	16
${PREFIX}_encrypt:
.cfi_startproc
#ifndef NDEBUG
#ifndef BORINGSSL_FIPS
.extern	BORINGSSL_function_hit
	movb	\$1,BORINGSSL_function_hit+1(%rip)
#endif
#endif
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	pxor	$inout0,$inout0
	ret
.cfi_endproc
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@abi-omnipotent
.align	16
${PREFIX}_decrypt:
.cfi_startproc
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	pxor	$rndkey0,$rndkey0	# clear register bank
	pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	pxor	$inout0,$inout0
	ret
.cfi_endproc
.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}

# _aesni_[en|de]cryptN are private interfaces, N denotes the interleave
# factor. Why were 3x subroutines originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when the subroutine's throughput is virtually the
# same as that of the non-interleaved subroutine [for up to 3 input
# blocks]. This is why it originally made no sense to implement a 2x
# subroutine. But times change and it became appropriate to spend an
# extra 192 bytes on a 2x subroutine on account of Atom Silvermont. For
# processors that can schedule aes[enc|dec] every cycle the optimal
# interleave factor equals the corresponding instruction latency. 8x is
# optimal for * Bridge and "super-optimal" for other Intel CPUs...

sub aesni_generate2 {
my $dir=shift;
# As already mentioned, it takes in $key and $rounds, which are *not*
# preserved. $inout[0-1] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt2,\@abi-omnipotent
.align	16
_aesni_${dir}rypt2:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	add	\$16,%rax

.L${dir}_loop2:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop2

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	ret
.cfi_endproc
.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
sub aesni_generate3 {
my $dir=shift;
# As already mentioned, it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
.cfi_startproc
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax			# $rounds
	add	\$16,%rax

.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	$movkey	($key,%rax),$rndkey1
	add	\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	$movkey	-16($key,%rax),$rndkey0
	jnz	.L${dir}_loop3

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.cfi_endproc
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
# 4x interleave is implemented to improve small block performance, most
# notably [and naturally] the 4-block case, by ~30%. One can argue that
# 5x should have been implemented as well, but the improvement would be
# <20%, so it's not worth it...
sub aesni_generate4 {
my $dir=shift;
# As already mentioned, it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
419$code.=<<___; 420.type _aesni_${dir}rypt4,\@abi-omnipotent 421.align 16 422_aesni_${dir}rypt4: 423.cfi_startproc 424 $movkey ($key),$rndkey0 425 shl \$4,$rounds 426 $movkey 16($key),$rndkey1 427 xorps $rndkey0,$inout0 428 xorps $rndkey0,$inout1 429 xorps $rndkey0,$inout2 430 xorps $rndkey0,$inout3 431 $movkey 32($key),$rndkey0 432 lea 32($key,$rounds),$key 433 neg %rax # $rounds 434 .byte 0x0f,0x1f,0x00 435 add \$16,%rax 436 437.L${dir}_loop4: 438 aes${dir} $rndkey1,$inout0 439 aes${dir} $rndkey1,$inout1 440 aes${dir} $rndkey1,$inout2 441 aes${dir} $rndkey1,$inout3 442 $movkey ($key,%rax),$rndkey1 443 add \$32,%rax 444 aes${dir} $rndkey0,$inout0 445 aes${dir} $rndkey0,$inout1 446 aes${dir} $rndkey0,$inout2 447 aes${dir} $rndkey0,$inout3 448 $movkey -16($key,%rax),$rndkey0 449 jnz .L${dir}_loop4 450 451 aes${dir} $rndkey1,$inout0 452 aes${dir} $rndkey1,$inout1 453 aes${dir} $rndkey1,$inout2 454 aes${dir} $rndkey1,$inout3 455 aes${dir}last $rndkey0,$inout0 456 aes${dir}last $rndkey0,$inout1 457 aes${dir}last $rndkey0,$inout2 458 aes${dir}last $rndkey0,$inout3 459 ret 460.cfi_endproc 461.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 462___ 463} 464sub aesni_generate6 { 465my $dir=shift; 466# As already mentioned it takes in $key and $rounds, which are *not* 467# preserved. $inout[0-5] is cipher/clear text... 468$code.=<<___; 469.type _aesni_${dir}rypt6,\@abi-omnipotent 470.align 16 471_aesni_${dir}rypt6: 472.cfi_startproc 473 $movkey ($key),$rndkey0 474 shl \$4,$rounds 475 $movkey 16($key),$rndkey1 476 xorps $rndkey0,$inout0 477 pxor $rndkey0,$inout1 478 pxor $rndkey0,$inout2 479 aes${dir} $rndkey1,$inout0 480 lea 32($key,$rounds),$key 481 neg %rax # $rounds 482 aes${dir} $rndkey1,$inout1 483 pxor $rndkey0,$inout3 484 pxor $rndkey0,$inout4 485 aes${dir} $rndkey1,$inout2 486 pxor $rndkey0,$inout5 487 $movkey ($key,%rax),$rndkey0 488 add \$16,%rax 489 jmp .L${dir}_loop6_enter 490.align 16 491.L${dir}_loop6: 492 aes${dir} $rndkey1,$inout0 493 aes${dir} $rndkey1,$inout1 494 aes${dir} $rndkey1,$inout2 495.L${dir}_loop6_enter: 496 aes${dir} $rndkey1,$inout3 497 aes${dir} $rndkey1,$inout4 498 aes${dir} $rndkey1,$inout5 499 $movkey ($key,%rax),$rndkey1 500 add \$32,%rax 501 aes${dir} $rndkey0,$inout0 502 aes${dir} $rndkey0,$inout1 503 aes${dir} $rndkey0,$inout2 504 aes${dir} $rndkey0,$inout3 505 aes${dir} $rndkey0,$inout4 506 aes${dir} $rndkey0,$inout5 507 $movkey -16($key,%rax),$rndkey0 508 jnz .L${dir}_loop6 509 510 aes${dir} $rndkey1,$inout0 511 aes${dir} $rndkey1,$inout1 512 aes${dir} $rndkey1,$inout2 513 aes${dir} $rndkey1,$inout3 514 aes${dir} $rndkey1,$inout4 515 aes${dir} $rndkey1,$inout5 516 aes${dir}last $rndkey0,$inout0 517 aes${dir}last $rndkey0,$inout1 518 aes${dir}last $rndkey0,$inout2 519 aes${dir}last $rndkey0,$inout3 520 aes${dir}last $rndkey0,$inout4 521 aes${dir}last $rndkey0,$inout5 522 ret 523.cfi_endproc 524.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 525___ 526} 527sub aesni_generate8 { 528my $dir=shift; 529# As already mentioned it takes in $key and $rounds, which are *not* 530# preserved. $inout[0-7] is cipher/clear text... 
531$code.=<<___; 532.type _aesni_${dir}rypt8,\@abi-omnipotent 533.align 16 534_aesni_${dir}rypt8: 535.cfi_startproc 536 $movkey ($key),$rndkey0 537 shl \$4,$rounds 538 $movkey 16($key),$rndkey1 539 xorps $rndkey0,$inout0 540 xorps $rndkey0,$inout1 541 pxor $rndkey0,$inout2 542 pxor $rndkey0,$inout3 543 pxor $rndkey0,$inout4 544 lea 32($key,$rounds),$key 545 neg %rax # $rounds 546 aes${dir} $rndkey1,$inout0 547 pxor $rndkey0,$inout5 548 pxor $rndkey0,$inout6 549 aes${dir} $rndkey1,$inout1 550 pxor $rndkey0,$inout7 551 $movkey ($key,%rax),$rndkey0 552 add \$16,%rax 553 jmp .L${dir}_loop8_inner 554.align 16 555.L${dir}_loop8: 556 aes${dir} $rndkey1,$inout0 557 aes${dir} $rndkey1,$inout1 558.L${dir}_loop8_inner: 559 aes${dir} $rndkey1,$inout2 560 aes${dir} $rndkey1,$inout3 561 aes${dir} $rndkey1,$inout4 562 aes${dir} $rndkey1,$inout5 563 aes${dir} $rndkey1,$inout6 564 aes${dir} $rndkey1,$inout7 565.L${dir}_loop8_enter: 566 $movkey ($key,%rax),$rndkey1 567 add \$32,%rax 568 aes${dir} $rndkey0,$inout0 569 aes${dir} $rndkey0,$inout1 570 aes${dir} $rndkey0,$inout2 571 aes${dir} $rndkey0,$inout3 572 aes${dir} $rndkey0,$inout4 573 aes${dir} $rndkey0,$inout5 574 aes${dir} $rndkey0,$inout6 575 aes${dir} $rndkey0,$inout7 576 $movkey -16($key,%rax),$rndkey0 577 jnz .L${dir}_loop8 578 579 aes${dir} $rndkey1,$inout0 580 aes${dir} $rndkey1,$inout1 581 aes${dir} $rndkey1,$inout2 582 aes${dir} $rndkey1,$inout3 583 aes${dir} $rndkey1,$inout4 584 aes${dir} $rndkey1,$inout5 585 aes${dir} $rndkey1,$inout6 586 aes${dir} $rndkey1,$inout7 587 aes${dir}last $rndkey0,$inout0 588 aes${dir}last $rndkey0,$inout1 589 aes${dir}last $rndkey0,$inout2 590 aes${dir}last $rndkey0,$inout3 591 aes${dir}last $rndkey0,$inout4 592 aes${dir}last $rndkey0,$inout5 593 aes${dir}last $rndkey0,$inout6 594 aes${dir}last $rndkey0,$inout7 595 ret 596.cfi_endproc 597.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 598___ 599} 600&aesni_generate2("enc") if ($PREFIX eq "aes_hw"); 601&aesni_generate2("dec"); 602&aesni_generate3("enc") if ($PREFIX eq "aes_hw"); 603&aesni_generate3("dec"); 604&aesni_generate4("enc") if ($PREFIX eq "aes_hw"); 605&aesni_generate4("dec"); 606&aesni_generate6("enc") if ($PREFIX eq "aes_hw"); 607&aesni_generate6("dec"); 608&aesni_generate8("enc") if ($PREFIX eq "aes_hw"); 609&aesni_generate8("dec"); 610 611if ($PREFIX eq "aes_hw") { 612######################################################################## 613# void aesni_ecb_encrypt (const void *in, void *out, 614# size_t length, const AES_KEY *key, 615# int enc); 616$code.=<<___; 617.globl ${PREFIX}_ecb_encrypt 618.type ${PREFIX}_ecb_encrypt,\@function,5 619.align 16 620${PREFIX}_ecb_encrypt: 621.cfi_startproc 622___ 623$code.=<<___ if ($win64); 624 lea -0x58(%rsp),%rsp 625 movaps %xmm6,(%rsp) # offload $inout4..7 626 movaps %xmm7,0x10(%rsp) 627 movaps %xmm8,0x20(%rsp) 628 movaps %xmm9,0x30(%rsp) 629.Lecb_enc_body: 630___ 631$code.=<<___; 632 and \$-16,$len # if ($len<16) 633 jz .Lecb_ret # return 634 635 mov 240($key),$rounds # key->rounds 636 $movkey ($key),$rndkey0 637 mov $key,$key_ # backup $key 638 mov $rounds,$rnds_ # backup $rounds 639 test %r8d,%r8d # 5th argument 640 jz .Lecb_decrypt 641#--------------------------- ECB ENCRYPT ------------------------------# 642 cmp \$0x80,$len # if ($len<8*16) 643 jb .Lecb_enc_tail # short input 644 645 movdqu ($inp),$inout0 # load 8 input blocks 646 movdqu 0x10($inp),$inout1 647 movdqu 0x20($inp),$inout2 648 movdqu 0x30($inp),$inout3 649 movdqu 0x40($inp),$inout4 650 movdqu 0x50($inp),$inout5 651 movdqu 0x60($inp),$inout6 
652 movdqu 0x70($inp),$inout7 653 lea 0x80($inp),$inp # $inp+=8*16 654 sub \$0x80,$len # $len-=8*16 (can be zero) 655 jmp .Lecb_enc_loop8_enter 656.align 16 657.Lecb_enc_loop8: 658 movups $inout0,($out) # store 8 output blocks 659 mov $key_,$key # restore $key 660 movdqu ($inp),$inout0 # load 8 input blocks 661 mov $rnds_,$rounds # restore $rounds 662 movups $inout1,0x10($out) 663 movdqu 0x10($inp),$inout1 664 movups $inout2,0x20($out) 665 movdqu 0x20($inp),$inout2 666 movups $inout3,0x30($out) 667 movdqu 0x30($inp),$inout3 668 movups $inout4,0x40($out) 669 movdqu 0x40($inp),$inout4 670 movups $inout5,0x50($out) 671 movdqu 0x50($inp),$inout5 672 movups $inout6,0x60($out) 673 movdqu 0x60($inp),$inout6 674 movups $inout7,0x70($out) 675 lea 0x80($out),$out # $out+=8*16 676 movdqu 0x70($inp),$inout7 677 lea 0x80($inp),$inp # $inp+=8*16 678.Lecb_enc_loop8_enter: 679 680 call _aesni_encrypt8 681 682 sub \$0x80,$len 683 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow 684 685 movups $inout0,($out) # store 8 output blocks 686 mov $key_,$key # restore $key 687 movups $inout1,0x10($out) 688 mov $rnds_,$rounds # restore $rounds 689 movups $inout2,0x20($out) 690 movups $inout3,0x30($out) 691 movups $inout4,0x40($out) 692 movups $inout5,0x50($out) 693 movups $inout6,0x60($out) 694 movups $inout7,0x70($out) 695 lea 0x80($out),$out # $out+=8*16 696 add \$0x80,$len # restore real remaining $len 697 jz .Lecb_ret # done if ($len==0) 698 699.Lecb_enc_tail: # $len is less than 8*16 700 movups ($inp),$inout0 701 cmp \$0x20,$len 702 jb .Lecb_enc_one 703 movups 0x10($inp),$inout1 704 je .Lecb_enc_two 705 movups 0x20($inp),$inout2 706 cmp \$0x40,$len 707 jb .Lecb_enc_three 708 movups 0x30($inp),$inout3 709 je .Lecb_enc_four 710 movups 0x40($inp),$inout4 711 cmp \$0x60,$len 712 jb .Lecb_enc_five 713 movups 0x50($inp),$inout5 714 je .Lecb_enc_six 715 movdqu 0x60($inp),$inout6 716 xorps $inout7,$inout7 717 call _aesni_encrypt8 718 movups $inout0,($out) # store 7 output blocks 719 movups $inout1,0x10($out) 720 movups $inout2,0x20($out) 721 movups $inout3,0x30($out) 722 movups $inout4,0x40($out) 723 movups $inout5,0x50($out) 724 movups $inout6,0x60($out) 725 jmp .Lecb_ret 726.align 16 727.Lecb_enc_one: 728___ 729 &aesni_generate1("enc",$key,$rounds); 730$code.=<<___; 731 movups $inout0,($out) # store one output block 732 jmp .Lecb_ret 733.align 16 734.Lecb_enc_two: 735 call _aesni_encrypt2 736 movups $inout0,($out) # store 2 output blocks 737 movups $inout1,0x10($out) 738 jmp .Lecb_ret 739.align 16 740.Lecb_enc_three: 741 call _aesni_encrypt3 742 movups $inout0,($out) # store 3 output blocks 743 movups $inout1,0x10($out) 744 movups $inout2,0x20($out) 745 jmp .Lecb_ret 746.align 16 747.Lecb_enc_four: 748 call _aesni_encrypt4 749 movups $inout0,($out) # store 4 output blocks 750 movups $inout1,0x10($out) 751 movups $inout2,0x20($out) 752 movups $inout3,0x30($out) 753 jmp .Lecb_ret 754.align 16 755.Lecb_enc_five: 756 xorps $inout5,$inout5 757 call _aesni_encrypt6 758 movups $inout0,($out) # store 5 output blocks 759 movups $inout1,0x10($out) 760 movups $inout2,0x20($out) 761 movups $inout3,0x30($out) 762 movups $inout4,0x40($out) 763 jmp .Lecb_ret 764.align 16 765.Lecb_enc_six: 766 call _aesni_encrypt6 767 movups $inout0,($out) # store 6 output blocks 768 movups $inout1,0x10($out) 769 movups $inout2,0x20($out) 770 movups $inout3,0x30($out) 771 movups $inout4,0x40($out) 772 movups $inout5,0x50($out) 773 jmp .Lecb_ret 774#--------------------------- ECB DECRYPT ------------------------------# 775.align 16 
776.Lecb_decrypt: 777 cmp \$0x80,$len # if ($len<8*16) 778 jb .Lecb_dec_tail # short input 779 780 movdqu ($inp),$inout0 # load 8 input blocks 781 movdqu 0x10($inp),$inout1 782 movdqu 0x20($inp),$inout2 783 movdqu 0x30($inp),$inout3 784 movdqu 0x40($inp),$inout4 785 movdqu 0x50($inp),$inout5 786 movdqu 0x60($inp),$inout6 787 movdqu 0x70($inp),$inout7 788 lea 0x80($inp),$inp # $inp+=8*16 789 sub \$0x80,$len # $len-=8*16 (can be zero) 790 jmp .Lecb_dec_loop8_enter 791.align 16 792.Lecb_dec_loop8: 793 movups $inout0,($out) # store 8 output blocks 794 mov $key_,$key # restore $key 795 movdqu ($inp),$inout0 # load 8 input blocks 796 mov $rnds_,$rounds # restore $rounds 797 movups $inout1,0x10($out) 798 movdqu 0x10($inp),$inout1 799 movups $inout2,0x20($out) 800 movdqu 0x20($inp),$inout2 801 movups $inout3,0x30($out) 802 movdqu 0x30($inp),$inout3 803 movups $inout4,0x40($out) 804 movdqu 0x40($inp),$inout4 805 movups $inout5,0x50($out) 806 movdqu 0x50($inp),$inout5 807 movups $inout6,0x60($out) 808 movdqu 0x60($inp),$inout6 809 movups $inout7,0x70($out) 810 lea 0x80($out),$out # $out+=8*16 811 movdqu 0x70($inp),$inout7 812 lea 0x80($inp),$inp # $inp+=8*16 813.Lecb_dec_loop8_enter: 814 815 call _aesni_decrypt8 816 817 $movkey ($key_),$rndkey0 818 sub \$0x80,$len 819 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow 820 821 movups $inout0,($out) # store 8 output blocks 822 pxor $inout0,$inout0 # clear register bank 823 mov $key_,$key # restore $key 824 movups $inout1,0x10($out) 825 pxor $inout1,$inout1 826 mov $rnds_,$rounds # restore $rounds 827 movups $inout2,0x20($out) 828 pxor $inout2,$inout2 829 movups $inout3,0x30($out) 830 pxor $inout3,$inout3 831 movups $inout4,0x40($out) 832 pxor $inout4,$inout4 833 movups $inout5,0x50($out) 834 pxor $inout5,$inout5 835 movups $inout6,0x60($out) 836 pxor $inout6,$inout6 837 movups $inout7,0x70($out) 838 pxor $inout7,$inout7 839 lea 0x80($out),$out # $out+=8*16 840 add \$0x80,$len # restore real remaining $len 841 jz .Lecb_ret # done if ($len==0) 842 843.Lecb_dec_tail: 844 movups ($inp),$inout0 845 cmp \$0x20,$len 846 jb .Lecb_dec_one 847 movups 0x10($inp),$inout1 848 je .Lecb_dec_two 849 movups 0x20($inp),$inout2 850 cmp \$0x40,$len 851 jb .Lecb_dec_three 852 movups 0x30($inp),$inout3 853 je .Lecb_dec_four 854 movups 0x40($inp),$inout4 855 cmp \$0x60,$len 856 jb .Lecb_dec_five 857 movups 0x50($inp),$inout5 858 je .Lecb_dec_six 859 movups 0x60($inp),$inout6 860 $movkey ($key),$rndkey0 861 xorps $inout7,$inout7 862 call _aesni_decrypt8 863 movups $inout0,($out) # store 7 output blocks 864 pxor $inout0,$inout0 # clear register bank 865 movups $inout1,0x10($out) 866 pxor $inout1,$inout1 867 movups $inout2,0x20($out) 868 pxor $inout2,$inout2 869 movups $inout3,0x30($out) 870 pxor $inout3,$inout3 871 movups $inout4,0x40($out) 872 pxor $inout4,$inout4 873 movups $inout5,0x50($out) 874 pxor $inout5,$inout5 875 movups $inout6,0x60($out) 876 pxor $inout6,$inout6 877 pxor $inout7,$inout7 878 jmp .Lecb_ret 879.align 16 880.Lecb_dec_one: 881___ 882 &aesni_generate1("dec",$key,$rounds); 883$code.=<<___; 884 movups $inout0,($out) # store one output block 885 pxor $inout0,$inout0 # clear register bank 886 jmp .Lecb_ret 887.align 16 888.Lecb_dec_two: 889 call _aesni_decrypt2 890 movups $inout0,($out) # store 2 output blocks 891 pxor $inout0,$inout0 # clear register bank 892 movups $inout1,0x10($out) 893 pxor $inout1,$inout1 894 jmp .Lecb_ret 895.align 16 896.Lecb_dec_three: 897 call _aesni_decrypt3 898 movups $inout0,($out) # store 3 output blocks 899 pxor 
$inout0,$inout0 # clear register bank 900 movups $inout1,0x10($out) 901 pxor $inout1,$inout1 902 movups $inout2,0x20($out) 903 pxor $inout2,$inout2 904 jmp .Lecb_ret 905.align 16 906.Lecb_dec_four: 907 call _aesni_decrypt4 908 movups $inout0,($out) # store 4 output blocks 909 pxor $inout0,$inout0 # clear register bank 910 movups $inout1,0x10($out) 911 pxor $inout1,$inout1 912 movups $inout2,0x20($out) 913 pxor $inout2,$inout2 914 movups $inout3,0x30($out) 915 pxor $inout3,$inout3 916 jmp .Lecb_ret 917.align 16 918.Lecb_dec_five: 919 xorps $inout5,$inout5 920 call _aesni_decrypt6 921 movups $inout0,($out) # store 5 output blocks 922 pxor $inout0,$inout0 # clear register bank 923 movups $inout1,0x10($out) 924 pxor $inout1,$inout1 925 movups $inout2,0x20($out) 926 pxor $inout2,$inout2 927 movups $inout3,0x30($out) 928 pxor $inout3,$inout3 929 movups $inout4,0x40($out) 930 pxor $inout4,$inout4 931 pxor $inout5,$inout5 932 jmp .Lecb_ret 933.align 16 934.Lecb_dec_six: 935 call _aesni_decrypt6 936 movups $inout0,($out) # store 6 output blocks 937 pxor $inout0,$inout0 # clear register bank 938 movups $inout1,0x10($out) 939 pxor $inout1,$inout1 940 movups $inout2,0x20($out) 941 pxor $inout2,$inout2 942 movups $inout3,0x30($out) 943 pxor $inout3,$inout3 944 movups $inout4,0x40($out) 945 pxor $inout4,$inout4 946 movups $inout5,0x50($out) 947 pxor $inout5,$inout5 948 949.Lecb_ret: 950 xorps $rndkey0,$rndkey0 # %xmm0 951 pxor $rndkey1,$rndkey1 952___ 953$code.=<<___ if ($win64); 954 movaps (%rsp),%xmm6 955 movaps %xmm0,(%rsp) # clear stack 956 movaps 0x10(%rsp),%xmm7 957 movaps %xmm0,0x10(%rsp) 958 movaps 0x20(%rsp),%xmm8 959 movaps %xmm0,0x20(%rsp) 960 movaps 0x30(%rsp),%xmm9 961 movaps %xmm0,0x30(%rsp) 962 lea 0x58(%rsp),%rsp 963.Lecb_enc_ret: 964___ 965$code.=<<___; 966 ret 967.cfi_endproc 968.size ${PREFIX}_ecb_encrypt,.-${PREFIX}_ecb_encrypt 969___ 970 971{ 972###################################################################### 973# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, 974# size_t blocks, const AES_KEY *key, 975# const char *ivec,char *cmac); 976# 977# Handles only complete blocks, operates on 64-bit counter and 978# does not update *ivec! 
Nor does it finalize CMAC value 979# (see engine/eng_aesni.c for details) 980# 981if (0) { # Omit these functions in BoringSSL 982my $cmac="%r9"; # 6th argument 983 984my $increment="%xmm9"; 985my $iv="%xmm6"; 986my $bswap_mask="%xmm7"; 987 988$code.=<<___; 989.globl ${PREFIX}_ccm64_encrypt_blocks 990.type ${PREFIX}_ccm64_encrypt_blocks,\@function,6 991.align 16 992${PREFIX}_ccm64_encrypt_blocks: 993___ 994$code.=<<___ if ($win64); 995 lea -0x58(%rsp),%rsp 996 movaps %xmm6,(%rsp) # $iv 997 movaps %xmm7,0x10(%rsp) # $bswap_mask 998 movaps %xmm8,0x20(%rsp) # $in0 999 movaps %xmm9,0x30(%rsp) # $increment 1000.Lccm64_enc_body: 1001___ 1002$code.=<<___; 1003 mov 240($key),$rounds # key->rounds 1004 movdqu ($ivp),$iv 1005 movdqa .Lincrement64(%rip),$increment 1006 movdqa .Lbswap_mask(%rip),$bswap_mask 1007 1008 shl \$4,$rounds 1009 mov \$16,$rnds_ 1010 lea 0($key),$key_ 1011 movdqu ($cmac),$inout1 1012 movdqa $iv,$inout0 1013 lea 32($key,$rounds),$key # end of key schedule 1014 pshufb $bswap_mask,$iv 1015 sub %rax,%r10 # twisted $rounds 1016 jmp .Lccm64_enc_outer 1017.align 16 1018.Lccm64_enc_outer: 1019 $movkey ($key_),$rndkey0 1020 mov %r10,%rax 1021 movups ($inp),$in0 # load inp 1022 1023 xorps $rndkey0,$inout0 # counter 1024 $movkey 16($key_),$rndkey1 1025 xorps $in0,$rndkey0 1026 xorps $rndkey0,$inout1 # cmac^=inp 1027 $movkey 32($key_),$rndkey0 1028 1029.Lccm64_enc2_loop: 1030 aesenc $rndkey1,$inout0 1031 aesenc $rndkey1,$inout1 1032 $movkey ($key,%rax),$rndkey1 1033 add \$32,%rax 1034 aesenc $rndkey0,$inout0 1035 aesenc $rndkey0,$inout1 1036 $movkey -16($key,%rax),$rndkey0 1037 jnz .Lccm64_enc2_loop 1038 aesenc $rndkey1,$inout0 1039 aesenc $rndkey1,$inout1 1040 paddq $increment,$iv 1041 dec $len # $len-- ($len is in blocks) 1042 aesenclast $rndkey0,$inout0 1043 aesenclast $rndkey0,$inout1 1044 1045 lea 16($inp),$inp 1046 xorps $inout0,$in0 # inp ^= E(iv) 1047 movdqa $iv,$inout0 1048 movups $in0,($out) # save output 1049 pshufb $bswap_mask,$inout0 1050 lea 16($out),$out # $out+=16 1051 jnz .Lccm64_enc_outer # loop if ($len!=0) 1052 1053 pxor $rndkey0,$rndkey0 # clear register bank 1054 pxor $rndkey1,$rndkey1 1055 pxor $inout0,$inout0 1056 movups $inout1,($cmac) # store resulting mac 1057 pxor $inout1,$inout1 1058 pxor $in0,$in0 1059 pxor $iv,$iv 1060___ 1061$code.=<<___ if ($win64); 1062 movaps (%rsp),%xmm6 1063 movaps %xmm0,(%rsp) # clear stack 1064 movaps 0x10(%rsp),%xmm7 1065 movaps %xmm0,0x10(%rsp) 1066 movaps 0x20(%rsp),%xmm8 1067 movaps %xmm0,0x20(%rsp) 1068 movaps 0x30(%rsp),%xmm9 1069 movaps %xmm0,0x30(%rsp) 1070 lea 0x58(%rsp),%rsp 1071.Lccm64_enc_ret: 1072___ 1073$code.=<<___; 1074 ret 1075.size ${PREFIX}_ccm64_encrypt_blocks,.-${PREFIX}_ccm64_encrypt_blocks 1076___ 1077###################################################################### 1078$code.=<<___; 1079.globl ${PREFIX}_ccm64_decrypt_blocks 1080.type ${PREFIX}_ccm64_decrypt_blocks,\@function,6 1081.align 16 1082${PREFIX}_ccm64_decrypt_blocks: 1083___ 1084$code.=<<___ if ($win64); 1085 lea -0x58(%rsp),%rsp 1086 movaps %xmm6,(%rsp) # $iv 1087 movaps %xmm7,0x10(%rsp) # $bswap_mask 1088 movaps %xmm8,0x20(%rsp) # $in8 1089 movaps %xmm9,0x30(%rsp) # $increment 1090.Lccm64_dec_body: 1091___ 1092$code.=<<___; 1093 mov 240($key),$rounds # key->rounds 1094 movups ($ivp),$iv 1095 movdqu ($cmac),$inout1 1096 movdqa .Lincrement64(%rip),$increment 1097 movdqa .Lbswap_mask(%rip),$bswap_mask 1098 1099 movaps $iv,$inout0 1100 mov $rounds,$rnds_ 1101 mov $key,$key_ 1102 pshufb $bswap_mask,$iv 1103___ 1104 
&aesni_generate1("enc",$key,$rounds); 1105$code.=<<___; 1106 shl \$4,$rnds_ 1107 mov \$16,$rounds 1108 movups ($inp),$in0 # load inp 1109 paddq $increment,$iv 1110 lea 16($inp),$inp # $inp+=16 1111 sub %r10,%rax # twisted $rounds 1112 lea 32($key_,$rnds_),$key # end of key schedule 1113 mov %rax,%r10 1114 jmp .Lccm64_dec_outer 1115.align 16 1116.Lccm64_dec_outer: 1117 xorps $inout0,$in0 # inp ^= E(iv) 1118 movdqa $iv,$inout0 1119 movups $in0,($out) # save output 1120 lea 16($out),$out # $out+=16 1121 pshufb $bswap_mask,$inout0 1122 1123 sub \$1,$len # $len-- ($len is in blocks) 1124 jz .Lccm64_dec_break # if ($len==0) break 1125 1126 $movkey ($key_),$rndkey0 1127 mov %r10,%rax 1128 $movkey 16($key_),$rndkey1 1129 xorps $rndkey0,$in0 1130 xorps $rndkey0,$inout0 1131 xorps $in0,$inout1 # cmac^=out 1132 $movkey 32($key_),$rndkey0 1133 jmp .Lccm64_dec2_loop 1134.align 16 1135.Lccm64_dec2_loop: 1136 aesenc $rndkey1,$inout0 1137 aesenc $rndkey1,$inout1 1138 $movkey ($key,%rax),$rndkey1 1139 add \$32,%rax 1140 aesenc $rndkey0,$inout0 1141 aesenc $rndkey0,$inout1 1142 $movkey -16($key,%rax),$rndkey0 1143 jnz .Lccm64_dec2_loop 1144 movups ($inp),$in0 # load input 1145 paddq $increment,$iv 1146 aesenc $rndkey1,$inout0 1147 aesenc $rndkey1,$inout1 1148 aesenclast $rndkey0,$inout0 1149 aesenclast $rndkey0,$inout1 1150 lea 16($inp),$inp # $inp+=16 1151 jmp .Lccm64_dec_outer 1152 1153.align 16 1154.Lccm64_dec_break: 1155 #xorps $in0,$inout1 # cmac^=out 1156 mov 240($key_),$rounds 1157___ 1158 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); 1159$code.=<<___; 1160 pxor $rndkey0,$rndkey0 # clear register bank 1161 pxor $rndkey1,$rndkey1 1162 pxor $inout0,$inout0 1163 movups $inout1,($cmac) # store resulting mac 1164 pxor $inout1,$inout1 1165 pxor $in0,$in0 1166 pxor $iv,$iv 1167___ 1168$code.=<<___ if ($win64); 1169 movaps (%rsp),%xmm6 1170 movaps %xmm0,(%rsp) # clear stack 1171 movaps 0x10(%rsp),%xmm7 1172 movaps %xmm0,0x10(%rsp) 1173 movaps 0x20(%rsp),%xmm8 1174 movaps %xmm0,0x20(%rsp) 1175 movaps 0x30(%rsp),%xmm9 1176 movaps %xmm0,0x30(%rsp) 1177 lea 0x58(%rsp),%rsp 1178.Lccm64_dec_ret: 1179___ 1180$code.=<<___; 1181 ret 1182.size ${PREFIX}_ccm64_decrypt_blocks,.-${PREFIX}_ccm64_decrypt_blocks 1183___ 1184} 1185###################################################################### 1186# void aesni_ctr32_encrypt_blocks (const void *in, void *out, 1187# size_t blocks, const AES_KEY *key, 1188# const char *ivec); 1189# 1190# Handles only complete blocks, operates on 32-bit counter and 1191# does not update *ivec! (see crypto/modes/ctr128.c for details) 1192# 1193# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov, 1194# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest. 1195# Keywords are full unroll and modulo-schedule counter calculations 1196# with zero-round key xor. 
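#
# The "zero-round key xor" trick, in rough pseudocode (an illustrative
# sketch only, not part of the generated code; names are ad hoc): the
# eight counter blocks kept on the stack are stored already xor-ed with
# round[0], so advancing a counter in the main loop only has to touch
# its last 4 bytes:
#
#	ctr_block[i]         = iv ^ round[0]
#	ctr_block[i][12..15] = bswap32(ctr + i) ^ round[0][12..15]
#	state = ctr_block[i]		# 0-round AddRoundKey already done
#	for r in 1 .. rounds-1:  state = aesenc(state, round[r])
#	out[i] = aesenclast(state, in[i] ^ round[rounds])
#
# i.e. the last-round key is folded into the input block instead, so
# the final aesenclast both applies round[rounds] and xors in the
# plaintext, which is what the .Lctr32_enc_done path below relies on.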
1197{ 1198my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15)); 1199my ($key0,$ctr)=("%ebp","${ivp}d"); 1200my $frame_size = 0x80 + ($win64?160:0); 1201 1202$code.=<<___; 1203.globl ${PREFIX}_ctr32_encrypt_blocks 1204.type ${PREFIX}_ctr32_encrypt_blocks,\@function,5 1205.align 16 1206${PREFIX}_ctr32_encrypt_blocks: 1207.cfi_startproc 1208#ifndef NDEBUG 1209#ifndef BORINGSSL_FIPS 1210 movb \$1,BORINGSSL_function_hit(%rip) 1211#endif 1212#endif 1213 cmp \$1,$len 1214 jne .Lctr32_bulk 1215 1216 # handle single block without allocating stack frame, 1217 # useful when handling edges 1218 movups ($ivp),$inout0 1219 movups ($inp),$inout1 1220 mov 240($key),%edx # key->rounds 1221___ 1222 &aesni_generate1("enc",$key,"%edx"); 1223$code.=<<___; 1224 pxor $rndkey0,$rndkey0 # clear register bank 1225 pxor $rndkey1,$rndkey1 1226 xorps $inout1,$inout0 1227 pxor $inout1,$inout1 1228 movups $inout0,($out) 1229 xorps $inout0,$inout0 1230 jmp .Lctr32_epilogue 1231 1232.align 16 1233.Lctr32_bulk: 1234 lea (%rsp),$key_ # use $key_ as frame pointer 1235.cfi_def_cfa_register $key_ 1236 push %rbp 1237.cfi_push %rbp 1238 sub \$$frame_size,%rsp 1239 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1240___ 1241$code.=<<___ if ($win64); 1242 movaps %xmm6,-0xa8($key_) # offload everything 1243 movaps %xmm7,-0x98($key_) 1244 movaps %xmm8,-0x88($key_) 1245 movaps %xmm9,-0x78($key_) 1246 movaps %xmm10,-0x68($key_) 1247 movaps %xmm11,-0x58($key_) 1248 movaps %xmm12,-0x48($key_) 1249 movaps %xmm13,-0x38($key_) 1250 movaps %xmm14,-0x28($key_) 1251 movaps %xmm15,-0x18($key_) 1252.Lctr32_body: 1253___ 1254$code.=<<___; 1255 1256 # 8 16-byte words on top of stack are counter values 1257 # xor-ed with zero-round key 1258 1259 movdqu ($ivp),$inout0 1260 movdqu ($key),$rndkey0 1261 mov 12($ivp),$ctr # counter LSB 1262 pxor $rndkey0,$inout0 1263 mov 12($key),$key0 # 0-round key LSB 1264 movdqa $inout0,0x00(%rsp) # populate counter block 1265 bswap $ctr 1266 movdqa $inout0,$inout1 1267 movdqa $inout0,$inout2 1268 movdqa $inout0,$inout3 1269 movdqa $inout0,0x40(%rsp) 1270 movdqa $inout0,0x50(%rsp) 1271 movdqa $inout0,0x60(%rsp) 1272 mov %rdx,%r10 # about to borrow %rdx 1273 movdqa $inout0,0x70(%rsp) 1274 1275 lea 1($ctr),%rax 1276 lea 2($ctr),%rdx 1277 bswap %eax 1278 bswap %edx 1279 xor $key0,%eax 1280 xor $key0,%edx 1281 pinsrd \$3,%eax,$inout1 1282 lea 3($ctr),%rax 1283 movdqa $inout1,0x10(%rsp) 1284 pinsrd \$3,%edx,$inout2 1285 bswap %eax 1286 mov %r10,%rdx # restore %rdx 1287 lea 4($ctr),%r10 1288 movdqa $inout2,0x20(%rsp) 1289 xor $key0,%eax 1290 bswap %r10d 1291 pinsrd \$3,%eax,$inout3 1292 xor $key0,%r10d 1293 movdqa $inout3,0x30(%rsp) 1294 lea 5($ctr),%r9 1295 mov %r10d,0x40+12(%rsp) 1296 bswap %r9d 1297 lea 6($ctr),%r10 1298 mov 240($key),$rounds # key->rounds 1299 xor $key0,%r9d 1300 bswap %r10d 1301 mov %r9d,0x50+12(%rsp) 1302 xor $key0,%r10d 1303 lea 7($ctr),%r9 1304 mov %r10d,0x60+12(%rsp) 1305 bswap %r9d 1306 leaq OPENSSL_ia32cap_P(%rip),%r10 1307 mov 4(%r10),%r10d 1308 xor $key0,%r9d 1309 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE 1310 mov %r9d,0x70+12(%rsp) 1311 1312 $movkey 0x10($key),$rndkey1 1313 1314 movdqa 0x40(%rsp),$inout4 1315 movdqa 0x50(%rsp),$inout5 1316 1317 cmp \$8,$len # $len is in blocks 1318 jb .Lctr32_tail # short input if ($len<8) 1319 1320 sub \$6,$len # $len is biased by -6 1321 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE 1322 je .Lctr32_6x # [which denotes Atom Silvermont] 1323 1324 lea 0x80($key),$key # size optimization 1325 sub \$2,$len # $len is biased by -8 
1326 jmp .Lctr32_loop8 1327 1328.align 16 1329.Lctr32_6x: 1330 shl \$4,$rounds 1331 mov \$48,$rnds_ 1332 bswap $key0 1333 lea 32($key,$rounds),$key # end of key schedule 1334 sub %rax,%r10 # twisted $rounds 1335 jmp .Lctr32_loop6 1336 1337.align 16 1338.Lctr32_loop6: 1339 add \$6,$ctr # next counter value 1340 $movkey -48($key,$rnds_),$rndkey0 1341 aesenc $rndkey1,$inout0 1342 mov $ctr,%eax 1343 xor $key0,%eax 1344 aesenc $rndkey1,$inout1 1345 movbe %eax,`0x00+12`(%rsp) # store next counter value 1346 lea 1($ctr),%eax 1347 aesenc $rndkey1,$inout2 1348 xor $key0,%eax 1349 movbe %eax,`0x10+12`(%rsp) 1350 aesenc $rndkey1,$inout3 1351 lea 2($ctr),%eax 1352 xor $key0,%eax 1353 aesenc $rndkey1,$inout4 1354 movbe %eax,`0x20+12`(%rsp) 1355 lea 3($ctr),%eax 1356 aesenc $rndkey1,$inout5 1357 $movkey -32($key,$rnds_),$rndkey1 1358 xor $key0,%eax 1359 1360 aesenc $rndkey0,$inout0 1361 movbe %eax,`0x30+12`(%rsp) 1362 lea 4($ctr),%eax 1363 aesenc $rndkey0,$inout1 1364 xor $key0,%eax 1365 movbe %eax,`0x40+12`(%rsp) 1366 aesenc $rndkey0,$inout2 1367 lea 5($ctr),%eax 1368 xor $key0,%eax 1369 aesenc $rndkey0,$inout3 1370 movbe %eax,`0x50+12`(%rsp) 1371 mov %r10,%rax # mov $rnds_,$rounds 1372 aesenc $rndkey0,$inout4 1373 aesenc $rndkey0,$inout5 1374 $movkey -16($key,$rnds_),$rndkey0 1375 1376 call .Lenc_loop6 1377 1378 movdqu ($inp),$inout6 # load 6 input blocks 1379 movdqu 0x10($inp),$inout7 1380 movdqu 0x20($inp),$in0 1381 movdqu 0x30($inp),$in1 1382 movdqu 0x40($inp),$in2 1383 movdqu 0x50($inp),$in3 1384 lea 0x60($inp),$inp # $inp+=6*16 1385 $movkey -64($key,$rnds_),$rndkey1 1386 pxor $inout0,$inout6 # inp^=E(ctr) 1387 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round] 1388 pxor $inout1,$inout7 1389 movaps 0x10(%rsp),$inout1 1390 pxor $inout2,$in0 1391 movaps 0x20(%rsp),$inout2 1392 pxor $inout3,$in1 1393 movaps 0x30(%rsp),$inout3 1394 pxor $inout4,$in2 1395 movaps 0x40(%rsp),$inout4 1396 pxor $inout5,$in3 1397 movaps 0x50(%rsp),$inout5 1398 movdqu $inout6,($out) # store 6 output blocks 1399 movdqu $inout7,0x10($out) 1400 movdqu $in0,0x20($out) 1401 movdqu $in1,0x30($out) 1402 movdqu $in2,0x40($out) 1403 movdqu $in3,0x50($out) 1404 lea 0x60($out),$out # $out+=6*16 1405 1406 sub \$6,$len 1407 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow 1408 1409 add \$6,$len # restore real remaining $len 1410 jz .Lctr32_done # done if ($len==0) 1411 1412 lea -48($rnds_),$rounds 1413 lea -80($key,$rnds_),$key # restore $key 1414 neg $rounds 1415 shr \$4,$rounds # restore $rounds 1416 jmp .Lctr32_tail 1417 1418.align 32 1419.Lctr32_loop8: 1420 add \$8,$ctr # next counter value 1421 movdqa 0x60(%rsp),$inout6 1422 aesenc $rndkey1,$inout0 1423 mov $ctr,%r9d 1424 movdqa 0x70(%rsp),$inout7 1425 aesenc $rndkey1,$inout1 1426 bswap %r9d 1427 $movkey 0x20-0x80($key),$rndkey0 1428 aesenc $rndkey1,$inout2 1429 xor $key0,%r9d 1430 nop 1431 aesenc $rndkey1,$inout3 1432 mov %r9d,0x00+12(%rsp) # store next counter value 1433 lea 1($ctr),%r9 1434 aesenc $rndkey1,$inout4 1435 aesenc $rndkey1,$inout5 1436 aesenc $rndkey1,$inout6 1437 aesenc $rndkey1,$inout7 1438 $movkey 0x30-0x80($key),$rndkey1 1439___ 1440for($i=2;$i<8;$i++) { 1441my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; 1442$code.=<<___; 1443 bswap %r9d 1444 aesenc $rndkeyx,$inout0 1445 aesenc $rndkeyx,$inout1 1446 xor $key0,%r9d 1447 .byte 0x66,0x90 1448 aesenc $rndkeyx,$inout2 1449 aesenc $rndkeyx,$inout3 1450 mov %r9d,`0x10*($i-1)`+12(%rsp) 1451 lea $i($ctr),%r9 1452 aesenc $rndkeyx,$inout4 1453 aesenc $rndkeyx,$inout5 1454 aesenc $rndkeyx,$inout6 1455 aesenc 
$rndkeyx,$inout7 1456 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx 1457___ 1458} 1459$code.=<<___; 1460 bswap %r9d 1461 aesenc $rndkey0,$inout0 1462 aesenc $rndkey0,$inout1 1463 aesenc $rndkey0,$inout2 1464 xor $key0,%r9d 1465 movdqu 0x00($inp),$in0 # start loading input 1466 aesenc $rndkey0,$inout3 1467 mov %r9d,0x70+12(%rsp) 1468 cmp \$11,$rounds 1469 aesenc $rndkey0,$inout4 1470 aesenc $rndkey0,$inout5 1471 aesenc $rndkey0,$inout6 1472 aesenc $rndkey0,$inout7 1473 $movkey 0xa0-0x80($key),$rndkey0 1474 1475 jb .Lctr32_enc_done 1476 1477 aesenc $rndkey1,$inout0 1478 aesenc $rndkey1,$inout1 1479 aesenc $rndkey1,$inout2 1480 aesenc $rndkey1,$inout3 1481 aesenc $rndkey1,$inout4 1482 aesenc $rndkey1,$inout5 1483 aesenc $rndkey1,$inout6 1484 aesenc $rndkey1,$inout7 1485 $movkey 0xb0-0x80($key),$rndkey1 1486 1487 aesenc $rndkey0,$inout0 1488 aesenc $rndkey0,$inout1 1489 aesenc $rndkey0,$inout2 1490 aesenc $rndkey0,$inout3 1491 aesenc $rndkey0,$inout4 1492 aesenc $rndkey0,$inout5 1493 aesenc $rndkey0,$inout6 1494 aesenc $rndkey0,$inout7 1495 $movkey 0xc0-0x80($key),$rndkey0 1496 je .Lctr32_enc_done 1497 1498 aesenc $rndkey1,$inout0 1499 aesenc $rndkey1,$inout1 1500 aesenc $rndkey1,$inout2 1501 aesenc $rndkey1,$inout3 1502 aesenc $rndkey1,$inout4 1503 aesenc $rndkey1,$inout5 1504 aesenc $rndkey1,$inout6 1505 aesenc $rndkey1,$inout7 1506 $movkey 0xd0-0x80($key),$rndkey1 1507 1508 aesenc $rndkey0,$inout0 1509 aesenc $rndkey0,$inout1 1510 aesenc $rndkey0,$inout2 1511 aesenc $rndkey0,$inout3 1512 aesenc $rndkey0,$inout4 1513 aesenc $rndkey0,$inout5 1514 aesenc $rndkey0,$inout6 1515 aesenc $rndkey0,$inout7 1516 $movkey 0xe0-0x80($key),$rndkey0 1517 jmp .Lctr32_enc_done 1518 1519.align 16 1520.Lctr32_enc_done: 1521 movdqu 0x10($inp),$in1 1522 pxor $rndkey0,$in0 # input^=round[last] 1523 movdqu 0x20($inp),$in2 1524 pxor $rndkey0,$in1 1525 movdqu 0x30($inp),$in3 1526 pxor $rndkey0,$in2 1527 movdqu 0x40($inp),$in4 1528 pxor $rndkey0,$in3 1529 movdqu 0x50($inp),$in5 1530 pxor $rndkey0,$in4 1531 pxor $rndkey0,$in5 1532 aesenc $rndkey1,$inout0 1533 aesenc $rndkey1,$inout1 1534 aesenc $rndkey1,$inout2 1535 aesenc $rndkey1,$inout3 1536 aesenc $rndkey1,$inout4 1537 aesenc $rndkey1,$inout5 1538 aesenc $rndkey1,$inout6 1539 aesenc $rndkey1,$inout7 1540 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6] 1541 lea 0x80($inp),$inp # $inp+=8*16 1542 1543 aesenclast $in0,$inout0 # $inN is inp[N]^round[last] 1544 pxor $rndkey0,$rndkey1 # borrowed $rndkey 1545 movdqu 0x70-0x80($inp),$in0 1546 aesenclast $in1,$inout1 1547 pxor $rndkey0,$in0 1548 movdqa 0x00(%rsp),$in1 # load next counter block 1549 aesenclast $in2,$inout2 1550 aesenclast $in3,$inout3 1551 movdqa 0x10(%rsp),$in2 1552 movdqa 0x20(%rsp),$in3 1553 aesenclast $in4,$inout4 1554 aesenclast $in5,$inout5 1555 movdqa 0x30(%rsp),$in4 1556 movdqa 0x40(%rsp),$in5 1557 aesenclast $rndkey1,$inout6 1558 movdqa 0x50(%rsp),$rndkey0 1559 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key 1560 aesenclast $in0,$inout7 1561 1562 movups $inout0,($out) # store 8 output blocks 1563 movdqa $in1,$inout0 1564 movups $inout1,0x10($out) 1565 movdqa $in2,$inout1 1566 movups $inout2,0x20($out) 1567 movdqa $in3,$inout2 1568 movups $inout3,0x30($out) 1569 movdqa $in4,$inout3 1570 movups $inout4,0x40($out) 1571 movdqa $in5,$inout4 1572 movups $inout5,0x50($out) 1573 movdqa $rndkey0,$inout5 1574 movups $inout6,0x60($out) 1575 movups $inout7,0x70($out) 1576 lea 0x80($out),$out # $out+=8*16 1577 1578 sub \$8,$len 1579 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow 1580 1581 add 
\$8,$len # restore real remaining $len 1582 jz .Lctr32_done # done if ($len==0) 1583 lea -0x80($key),$key 1584 1585.Lctr32_tail: 1586 # note that at this point $inout0..5 are populated with 1587 # counter values xor-ed with 0-round key 1588 lea 16($key),$key 1589 cmp \$4,$len 1590 jb .Lctr32_loop3 1591 je .Lctr32_loop4 1592 1593 # if ($len>4) compute 7 E(counter) 1594 shl \$4,$rounds 1595 movdqa 0x60(%rsp),$inout6 1596 pxor $inout7,$inout7 1597 1598 $movkey 16($key),$rndkey0 1599 aesenc $rndkey1,$inout0 1600 aesenc $rndkey1,$inout1 1601 lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter 1602 neg %rax 1603 aesenc $rndkey1,$inout2 1604 add \$16,%rax # prepare for .Lenc_loop8_enter 1605 movups ($inp),$in0 1606 aesenc $rndkey1,$inout3 1607 aesenc $rndkey1,$inout4 1608 movups 0x10($inp),$in1 # pre-load input 1609 movups 0x20($inp),$in2 1610 aesenc $rndkey1,$inout5 1611 aesenc $rndkey1,$inout6 1612 1613 call .Lenc_loop8_enter 1614 1615 movdqu 0x30($inp),$in3 1616 pxor $in0,$inout0 1617 movdqu 0x40($inp),$in0 1618 pxor $in1,$inout1 1619 movdqu $inout0,($out) # store output 1620 pxor $in2,$inout2 1621 movdqu $inout1,0x10($out) 1622 pxor $in3,$inout3 1623 movdqu $inout2,0x20($out) 1624 pxor $in0,$inout4 1625 movdqu $inout3,0x30($out) 1626 movdqu $inout4,0x40($out) 1627 cmp \$6,$len 1628 jb .Lctr32_done # $len was 5, stop store 1629 1630 movups 0x50($inp),$in1 1631 xorps $in1,$inout5 1632 movups $inout5,0x50($out) 1633 je .Lctr32_done # $len was 6, stop store 1634 1635 movups 0x60($inp),$in2 1636 xorps $in2,$inout6 1637 movups $inout6,0x60($out) 1638 jmp .Lctr32_done # $len was 7, stop store 1639 1640.align 32 1641.Lctr32_loop4: 1642 aesenc $rndkey1,$inout0 1643 lea 16($key),$key 1644 dec $rounds 1645 aesenc $rndkey1,$inout1 1646 aesenc $rndkey1,$inout2 1647 aesenc $rndkey1,$inout3 1648 $movkey ($key),$rndkey1 1649 jnz .Lctr32_loop4 1650 aesenclast $rndkey1,$inout0 1651 aesenclast $rndkey1,$inout1 1652 movups ($inp),$in0 # load input 1653 movups 0x10($inp),$in1 1654 aesenclast $rndkey1,$inout2 1655 aesenclast $rndkey1,$inout3 1656 movups 0x20($inp),$in2 1657 movups 0x30($inp),$in3 1658 1659 xorps $in0,$inout0 1660 movups $inout0,($out) # store output 1661 xorps $in1,$inout1 1662 movups $inout1,0x10($out) 1663 pxor $in2,$inout2 1664 movdqu $inout2,0x20($out) 1665 pxor $in3,$inout3 1666 movdqu $inout3,0x30($out) 1667 jmp .Lctr32_done # $len was 4, stop store 1668 1669.align 32 1670.Lctr32_loop3: 1671 aesenc $rndkey1,$inout0 1672 lea 16($key),$key 1673 dec $rounds 1674 aesenc $rndkey1,$inout1 1675 aesenc $rndkey1,$inout2 1676 $movkey ($key),$rndkey1 1677 jnz .Lctr32_loop3 1678 aesenclast $rndkey1,$inout0 1679 aesenclast $rndkey1,$inout1 1680 aesenclast $rndkey1,$inout2 1681 1682 movups ($inp),$in0 # load input 1683 xorps $in0,$inout0 1684 movups $inout0,($out) # store output 1685 cmp \$2,$len 1686 jb .Lctr32_done # $len was 1, stop store 1687 1688 movups 0x10($inp),$in1 1689 xorps $in1,$inout1 1690 movups $inout1,0x10($out) 1691 je .Lctr32_done # $len was 2, stop store 1692 1693 movups 0x20($inp),$in2 1694 xorps $in2,$inout2 1695 movups $inout2,0x20($out) # $len was 3, stop store 1696 1697.Lctr32_done: 1698 xorps %xmm0,%xmm0 # clear register bank 1699 xor $key0,$key0 1700 pxor %xmm1,%xmm1 1701 pxor %xmm2,%xmm2 1702 pxor %xmm3,%xmm3 1703 pxor %xmm4,%xmm4 1704 pxor %xmm5,%xmm5 1705___ 1706$code.=<<___ if (!$win64); 1707 pxor %xmm6,%xmm6 1708 pxor %xmm7,%xmm7 1709 movaps %xmm0,0x00(%rsp) # clear stack 1710 pxor %xmm8,%xmm8 1711 movaps %xmm0,0x10(%rsp) 1712 pxor %xmm9,%xmm9 1713 movaps 
%xmm0,0x20(%rsp) 1714 pxor %xmm10,%xmm10 1715 movaps %xmm0,0x30(%rsp) 1716 pxor %xmm11,%xmm11 1717 movaps %xmm0,0x40(%rsp) 1718 pxor %xmm12,%xmm12 1719 movaps %xmm0,0x50(%rsp) 1720 pxor %xmm13,%xmm13 1721 movaps %xmm0,0x60(%rsp) 1722 pxor %xmm14,%xmm14 1723 movaps %xmm0,0x70(%rsp) 1724 pxor %xmm15,%xmm15 1725___ 1726$code.=<<___ if ($win64); 1727 movaps -0xa8($key_),%xmm6 1728 movaps %xmm0,-0xa8($key_) # clear stack 1729 movaps -0x98($key_),%xmm7 1730 movaps %xmm0,-0x98($key_) 1731 movaps -0x88($key_),%xmm8 1732 movaps %xmm0,-0x88($key_) 1733 movaps -0x78($key_),%xmm9 1734 movaps %xmm0,-0x78($key_) 1735 movaps -0x68($key_),%xmm10 1736 movaps %xmm0,-0x68($key_) 1737 movaps -0x58($key_),%xmm11 1738 movaps %xmm0,-0x58($key_) 1739 movaps -0x48($key_),%xmm12 1740 movaps %xmm0,-0x48($key_) 1741 movaps -0x38($key_),%xmm13 1742 movaps %xmm0,-0x38($key_) 1743 movaps -0x28($key_),%xmm14 1744 movaps %xmm0,-0x28($key_) 1745 movaps -0x18($key_),%xmm15 1746 movaps %xmm0,-0x18($key_) 1747 movaps %xmm0,0x00(%rsp) 1748 movaps %xmm0,0x10(%rsp) 1749 movaps %xmm0,0x20(%rsp) 1750 movaps %xmm0,0x30(%rsp) 1751 movaps %xmm0,0x40(%rsp) 1752 movaps %xmm0,0x50(%rsp) 1753 movaps %xmm0,0x60(%rsp) 1754 movaps %xmm0,0x70(%rsp) 1755___ 1756$code.=<<___; 1757 mov -8($key_),%rbp 1758.cfi_restore %rbp 1759 lea ($key_),%rsp 1760.cfi_def_cfa_register %rsp 1761.Lctr32_epilogue: 1762 ret 1763.cfi_endproc 1764.size ${PREFIX}_ctr32_encrypt_blocks,.-${PREFIX}_ctr32_encrypt_blocks 1765___ 1766} 1767 1768###################################################################### 1769# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, 1770# const AES_KEY *key1, const AES_KEY *key2 1771# const unsigned char iv[16]); 1772# 1773if (0) { # Omit these functions in BoringSSL 1774my @tweak=map("%xmm$_",(10..15)); 1775my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); 1776my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); 1777my $frame_size = 0x70 + ($win64?160:0); 1778my $key_ = "%rbp"; # override so that we can use %r11 as FP 1779 1780$code.=<<___; 1781.globl ${PREFIX}_xts_encrypt 1782.type ${PREFIX}_xts_encrypt,\@function,6 1783.align 16 1784${PREFIX}_xts_encrypt: 1785.cfi_startproc 1786 lea (%rsp),%r11 # frame pointer 1787.cfi_def_cfa_register %r11 1788 push %rbp 1789.cfi_push %rbp 1790 sub \$$frame_size,%rsp 1791 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 1792___ 1793$code.=<<___ if ($win64); 1794 movaps %xmm6,-0xa8(%r11) # offload everything 1795 movaps %xmm7,-0x98(%r11) 1796 movaps %xmm8,-0x88(%r11) 1797 movaps %xmm9,-0x78(%r11) 1798 movaps %xmm10,-0x68(%r11) 1799 movaps %xmm11,-0x58(%r11) 1800 movaps %xmm12,-0x48(%r11) 1801 movaps %xmm13,-0x38(%r11) 1802 movaps %xmm14,-0x28(%r11) 1803 movaps %xmm15,-0x18(%r11) 1804.Lxts_enc_body: 1805___ 1806$code.=<<___; 1807 movups ($ivp),$inout0 # load clear-text tweak 1808 mov 240(%r8),$rounds # key2->rounds 1809 mov 240($key),$rnds_ # key1->rounds 1810___ 1811 # generate the tweak 1812 &aesni_generate1("enc",$key2,$rounds,$inout0); 1813$code.=<<___; 1814 $movkey ($key),$rndkey0 # zero round key 1815 mov $key,$key_ # backup $key 1816 mov $rnds_,$rounds # backup $rounds 1817 shl \$4,$rnds_ 1818 mov $len,$len_ # backup $len 1819 and \$-16,$len 1820 1821 $movkey 16($key,$rnds_),$rndkey1 # last round key 1822 1823 movdqa .Lxts_magic(%rip),$twmask 1824 movdqa $inout0,@tweak[5] 1825 pshufd \$0x5f,$inout0,$twres 1826 pxor $rndkey0,$rndkey1 1827___ 1828 # alternative tweak calculation algorithm is based on suggestions 1829 # by Shay Gueron. 
psrad doesn't conflict with AES-NI instructions 1830 # and should help in the future... 1831 for ($i=0;$i<4;$i++) { 1832 $code.=<<___; 1833 movdqa $twres,$twtmp 1834 paddd $twres,$twres 1835 movdqa @tweak[5],@tweak[$i] 1836 psrad \$31,$twtmp # broadcast upper bits 1837 paddq @tweak[5],@tweak[5] 1838 pand $twmask,$twtmp 1839 pxor $rndkey0,@tweak[$i] 1840 pxor $twtmp,@tweak[5] 1841___ 1842 } 1843$code.=<<___; 1844 movdqa @tweak[5],@tweak[4] 1845 psrad \$31,$twres 1846 paddq @tweak[5],@tweak[5] 1847 pand $twmask,$twres 1848 pxor $rndkey0,@tweak[4] 1849 pxor $twres,@tweak[5] 1850 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last] 1851 1852 sub \$16*6,$len 1853 jc .Lxts_enc_short # if $len-=6*16 borrowed 1854 1855 mov \$16+96,$rounds 1856 lea 32($key_,$rnds_),$key # end of key schedule 1857 sub %r10,%rax # twisted $rounds 1858 $movkey 16($key_),$rndkey1 1859 mov %rax,%r10 # backup twisted $rounds 1860 lea .Lxts_magic(%rip),%r8 1861 jmp .Lxts_enc_grandloop 1862 1863.align 32 1864.Lxts_enc_grandloop: 1865 movdqu `16*0`($inp),$inout0 # load input 1866 movdqa $rndkey0,$twmask 1867 movdqu `16*1`($inp),$inout1 1868 pxor @tweak[0],$inout0 # input^=tweak^round[0] 1869 movdqu `16*2`($inp),$inout2 1870 pxor @tweak[1],$inout1 1871 aesenc $rndkey1,$inout0 1872 movdqu `16*3`($inp),$inout3 1873 pxor @tweak[2],$inout2 1874 aesenc $rndkey1,$inout1 1875 movdqu `16*4`($inp),$inout4 1876 pxor @tweak[3],$inout3 1877 aesenc $rndkey1,$inout2 1878 movdqu `16*5`($inp),$inout5 1879 pxor @tweak[5],$twmask # round[0]^=tweak[5] 1880 movdqa 0x60(%rsp),$twres # load round[0]^round[last] 1881 pxor @tweak[4],$inout4 1882 aesenc $rndkey1,$inout3 1883 $movkey 32($key_),$rndkey0 1884 lea `16*6`($inp),$inp 1885 pxor $twmask,$inout5 1886 1887 pxor $twres,@tweak[0] # calculate tweaks^round[last] 1888 aesenc $rndkey1,$inout4 1889 pxor $twres,@tweak[1] 1890 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last] 1891 aesenc $rndkey1,$inout5 1892 $movkey 48($key_),$rndkey1 1893 pxor $twres,@tweak[2] 1894 1895 aesenc $rndkey0,$inout0 1896 pxor $twres,@tweak[3] 1897 movdqa @tweak[1],`16*1`(%rsp) 1898 aesenc $rndkey0,$inout1 1899 pxor $twres,@tweak[4] 1900 movdqa @tweak[2],`16*2`(%rsp) 1901 aesenc $rndkey0,$inout2 1902 aesenc $rndkey0,$inout3 1903 pxor $twres,$twmask 1904 movdqa @tweak[4],`16*4`(%rsp) 1905 aesenc $rndkey0,$inout4 1906 aesenc $rndkey0,$inout5 1907 $movkey 64($key_),$rndkey0 1908 movdqa $twmask,`16*5`(%rsp) 1909 pshufd \$0x5f,@tweak[5],$twres 1910 jmp .Lxts_enc_loop6 1911.align 32 1912.Lxts_enc_loop6: 1913 aesenc $rndkey1,$inout0 1914 aesenc $rndkey1,$inout1 1915 aesenc $rndkey1,$inout2 1916 aesenc $rndkey1,$inout3 1917 aesenc $rndkey1,$inout4 1918 aesenc $rndkey1,$inout5 1919 $movkey -64($key,%rax),$rndkey1 1920 add \$32,%rax 1921 1922 aesenc $rndkey0,$inout0 1923 aesenc $rndkey0,$inout1 1924 aesenc $rndkey0,$inout2 1925 aesenc $rndkey0,$inout3 1926 aesenc $rndkey0,$inout4 1927 aesenc $rndkey0,$inout5 1928 $movkey -80($key,%rax),$rndkey0 1929 jnz .Lxts_enc_loop6 1930 1931 movdqa (%r8),$twmask # start calculating next tweak 1932 movdqa $twres,$twtmp 1933 paddd $twres,$twres 1934 aesenc $rndkey1,$inout0 1935 paddq @tweak[5],@tweak[5] 1936 psrad \$31,$twtmp 1937 aesenc $rndkey1,$inout1 1938 pand $twmask,$twtmp 1939 $movkey ($key_),@tweak[0] # load round[0] 1940 aesenc $rndkey1,$inout2 1941 aesenc $rndkey1,$inout3 1942 aesenc $rndkey1,$inout4 1943 pxor $twtmp,@tweak[5] 1944 movaps @tweak[0],@tweak[1] # copy round[0] 1945 aesenc $rndkey1,$inout5 1946 $movkey -64($key),$rndkey1 1947 1948 movdqa $twres,$twtmp 1949 
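	# Note on the tweak arithmetic interleaved above and below (added
	# explanation): XTS advances the tweak by multiplying it by x in
	# GF(2^128), i.e. the 128-bit tweak is shifted left by one bit and,
	# when a bit falls out of the top, the reduction constant 0x87 is
	# folded back into the low byte.  Here paddq performs the per-qword
	# doubling, psrad of the pshufd-broadcast upper dwords turns the
	# outgoing bits into masks, and pand with .Lxts_magic selects the
	# carry/0x87 terms that pxor folds back in.  An illustrative scalar
	# sketch (hypothetical helper, not part of this module), with the
	# tweak held as two little-endian 64-bit words t0 (low), t1 (high):
	#
	#	carry = t1 >> 63;
	#	t1 = (t1 << 1) | (t0 >> 63);
	#	t0 = (t0 << 1) ^ (0x87 & (0 - carry));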
aesenc $rndkey0,$inout0 1950 paddd $twres,$twres 1951 pxor @tweak[5],@tweak[0] 1952 aesenc $rndkey0,$inout1 1953 psrad \$31,$twtmp 1954 paddq @tweak[5],@tweak[5] 1955 aesenc $rndkey0,$inout2 1956 aesenc $rndkey0,$inout3 1957 pand $twmask,$twtmp 1958 movaps @tweak[1],@tweak[2] 1959 aesenc $rndkey0,$inout4 1960 pxor $twtmp,@tweak[5] 1961 movdqa $twres,$twtmp 1962 aesenc $rndkey0,$inout5 1963 $movkey -48($key),$rndkey0 1964 1965 paddd $twres,$twres 1966 aesenc $rndkey1,$inout0 1967 pxor @tweak[5],@tweak[1] 1968 psrad \$31,$twtmp 1969 aesenc $rndkey1,$inout1 1970 paddq @tweak[5],@tweak[5] 1971 pand $twmask,$twtmp 1972 aesenc $rndkey1,$inout2 1973 aesenc $rndkey1,$inout3 1974 movdqa @tweak[3],`16*3`(%rsp) 1975 pxor $twtmp,@tweak[5] 1976 aesenc $rndkey1,$inout4 1977 movaps @tweak[2],@tweak[3] 1978 movdqa $twres,$twtmp 1979 aesenc $rndkey1,$inout5 1980 $movkey -32($key),$rndkey1 1981 1982 paddd $twres,$twres 1983 aesenc $rndkey0,$inout0 1984 pxor @tweak[5],@tweak[2] 1985 psrad \$31,$twtmp 1986 aesenc $rndkey0,$inout1 1987 paddq @tweak[5],@tweak[5] 1988 pand $twmask,$twtmp 1989 aesenc $rndkey0,$inout2 1990 aesenc $rndkey0,$inout3 1991 aesenc $rndkey0,$inout4 1992 pxor $twtmp,@tweak[5] 1993 movaps @tweak[3],@tweak[4] 1994 aesenc $rndkey0,$inout5 1995 1996 movdqa $twres,$rndkey0 1997 paddd $twres,$twres 1998 aesenc $rndkey1,$inout0 1999 pxor @tweak[5],@tweak[3] 2000 psrad \$31,$rndkey0 2001 aesenc $rndkey1,$inout1 2002 paddq @tweak[5],@tweak[5] 2003 pand $twmask,$rndkey0 2004 aesenc $rndkey1,$inout2 2005 aesenc $rndkey1,$inout3 2006 pxor $rndkey0,@tweak[5] 2007 $movkey ($key_),$rndkey0 2008 aesenc $rndkey1,$inout4 2009 aesenc $rndkey1,$inout5 2010 $movkey 16($key_),$rndkey1 2011 2012 pxor @tweak[5],@tweak[4] 2013 aesenclast `16*0`(%rsp),$inout0 2014 psrad \$31,$twres 2015 paddq @tweak[5],@tweak[5] 2016 aesenclast `16*1`(%rsp),$inout1 2017 aesenclast `16*2`(%rsp),$inout2 2018 pand $twmask,$twres 2019 mov %r10,%rax # restore $rounds 2020 aesenclast `16*3`(%rsp),$inout3 2021 aesenclast `16*4`(%rsp),$inout4 2022 aesenclast `16*5`(%rsp),$inout5 2023 pxor $twres,@tweak[5] 2024 2025 lea `16*6`($out),$out # $out+=6*16 2026 movups $inout0,`-16*6`($out) # store 6 output blocks 2027 movups $inout1,`-16*5`($out) 2028 movups $inout2,`-16*4`($out) 2029 movups $inout3,`-16*3`($out) 2030 movups $inout4,`-16*2`($out) 2031 movups $inout5,`-16*1`($out) 2032 sub \$16*6,$len 2033 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow 2034 2035 mov \$16+96,$rounds 2036 sub $rnds_,$rounds 2037 mov $key_,$key # restore $key 2038 shr \$4,$rounds # restore original value 2039 2040.Lxts_enc_short: 2041 # at the point @tweak[0..5] are populated with tweak values 2042 mov $rounds,$rnds_ # backup $rounds 2043 pxor $rndkey0,@tweak[0] 2044 add \$16*6,$len # restore real remaining $len 2045 jz .Lxts_enc_done # done if ($len==0) 2046 2047 pxor $rndkey0,@tweak[1] 2048 cmp \$0x20,$len 2049 jb .Lxts_enc_one # $len is 1*16 2050 pxor $rndkey0,@tweak[2] 2051 je .Lxts_enc_two # $len is 2*16 2052 2053 pxor $rndkey0,@tweak[3] 2054 cmp \$0x40,$len 2055 jb .Lxts_enc_three # $len is 3*16 2056 pxor $rndkey0,@tweak[4] 2057 je .Lxts_enc_four # $len is 4*16 2058 2059 movdqu ($inp),$inout0 # $len is 5*16 2060 movdqu 16*1($inp),$inout1 2061 movdqu 16*2($inp),$inout2 2062 pxor @tweak[0],$inout0 2063 movdqu 16*3($inp),$inout3 2064 pxor @tweak[1],$inout1 2065 movdqu 16*4($inp),$inout4 2066 lea 16*5($inp),$inp # $inp+=5*16 2067 pxor @tweak[2],$inout2 2068 pxor @tweak[3],$inout3 2069 pxor @tweak[4],$inout4 2070 pxor $inout5,$inout5 2071 2072 call 
_aesni_encrypt6 2073 2074 xorps @tweak[0],$inout0 2075 movdqa @tweak[5],@tweak[0] 2076 xorps @tweak[1],$inout1 2077 xorps @tweak[2],$inout2 2078 movdqu $inout0,($out) # store 5 output blocks 2079 xorps @tweak[3],$inout3 2080 movdqu $inout1,16*1($out) 2081 xorps @tweak[4],$inout4 2082 movdqu $inout2,16*2($out) 2083 movdqu $inout3,16*3($out) 2084 movdqu $inout4,16*4($out) 2085 lea 16*5($out),$out # $out+=5*16 2086 jmp .Lxts_enc_done 2087 2088.align 16 2089.Lxts_enc_one: 2090 movups ($inp),$inout0 2091 lea 16*1($inp),$inp # inp+=1*16 2092 xorps @tweak[0],$inout0 2093___ 2094 &aesni_generate1("enc",$key,$rounds); 2095$code.=<<___; 2096 xorps @tweak[0],$inout0 2097 movdqa @tweak[1],@tweak[0] 2098 movups $inout0,($out) # store one output block 2099 lea 16*1($out),$out # $out+=1*16 2100 jmp .Lxts_enc_done 2101 2102.align 16 2103.Lxts_enc_two: 2104 movups ($inp),$inout0 2105 movups 16($inp),$inout1 2106 lea 32($inp),$inp # $inp+=2*16 2107 xorps @tweak[0],$inout0 2108 xorps @tweak[1],$inout1 2109 2110 call _aesni_encrypt2 2111 2112 xorps @tweak[0],$inout0 2113 movdqa @tweak[2],@tweak[0] 2114 xorps @tweak[1],$inout1 2115 movups $inout0,($out) # store 2 output blocks 2116 movups $inout1,16*1($out) 2117 lea 16*2($out),$out # $out+=2*16 2118 jmp .Lxts_enc_done 2119 2120.align 16 2121.Lxts_enc_three: 2122 movups ($inp),$inout0 2123 movups 16*1($inp),$inout1 2124 movups 16*2($inp),$inout2 2125 lea 16*3($inp),$inp # $inp+=3*16 2126 xorps @tweak[0],$inout0 2127 xorps @tweak[1],$inout1 2128 xorps @tweak[2],$inout2 2129 2130 call _aesni_encrypt3 2131 2132 xorps @tweak[0],$inout0 2133 movdqa @tweak[3],@tweak[0] 2134 xorps @tweak[1],$inout1 2135 xorps @tweak[2],$inout2 2136 movups $inout0,($out) # store 3 output blocks 2137 movups $inout1,16*1($out) 2138 movups $inout2,16*2($out) 2139 lea 16*3($out),$out # $out+=3*16 2140 jmp .Lxts_enc_done 2141 2142.align 16 2143.Lxts_enc_four: 2144 movups ($inp),$inout0 2145 movups 16*1($inp),$inout1 2146 movups 16*2($inp),$inout2 2147 xorps @tweak[0],$inout0 2148 movups 16*3($inp),$inout3 2149 lea 16*4($inp),$inp # $inp+=4*16 2150 xorps @tweak[1],$inout1 2151 xorps @tweak[2],$inout2 2152 xorps @tweak[3],$inout3 2153 2154 call _aesni_encrypt4 2155 2156 pxor @tweak[0],$inout0 2157 movdqa @tweak[4],@tweak[0] 2158 pxor @tweak[1],$inout1 2159 pxor @tweak[2],$inout2 2160 movdqu $inout0,($out) # store 4 output blocks 2161 pxor @tweak[3],$inout3 2162 movdqu $inout1,16*1($out) 2163 movdqu $inout2,16*2($out) 2164 movdqu $inout3,16*3($out) 2165 lea 16*4($out),$out # $out+=4*16 2166 jmp .Lxts_enc_done 2167 2168.align 16 2169.Lxts_enc_done: 2170 and \$15,$len_ # see if $len%16 is 0 2171 jz .Lxts_enc_ret 2172 mov $len_,$len 2173 2174.Lxts_enc_steal: 2175 movzb ($inp),%eax # borrow $rounds ... 2176 movzb -16($out),%ecx # ... 
and $key 2177 lea 1($inp),$inp 2178 mov %al,-16($out) 2179 mov %cl,0($out) 2180 lea 1($out),$out 2181 sub \$1,$len 2182 jnz .Lxts_enc_steal 2183 2184 sub $len_,$out # rewind $out 2185 mov $key_,$key # restore $key 2186 mov $rnds_,$rounds # restore $rounds 2187 2188 movups -16($out),$inout0 2189 xorps @tweak[0],$inout0 2190___ 2191 &aesni_generate1("enc",$key,$rounds); 2192$code.=<<___; 2193 xorps @tweak[0],$inout0 2194 movups $inout0,-16($out) 2195 2196.Lxts_enc_ret: 2197 xorps %xmm0,%xmm0 # clear register bank 2198 pxor %xmm1,%xmm1 2199 pxor %xmm2,%xmm2 2200 pxor %xmm3,%xmm3 2201 pxor %xmm4,%xmm4 2202 pxor %xmm5,%xmm5 2203___ 2204$code.=<<___ if (!$win64); 2205 pxor %xmm6,%xmm6 2206 pxor %xmm7,%xmm7 2207 movaps %xmm0,0x00(%rsp) # clear stack 2208 pxor %xmm8,%xmm8 2209 movaps %xmm0,0x10(%rsp) 2210 pxor %xmm9,%xmm9 2211 movaps %xmm0,0x20(%rsp) 2212 pxor %xmm10,%xmm10 2213 movaps %xmm0,0x30(%rsp) 2214 pxor %xmm11,%xmm11 2215 movaps %xmm0,0x40(%rsp) 2216 pxor %xmm12,%xmm12 2217 movaps %xmm0,0x50(%rsp) 2218 pxor %xmm13,%xmm13 2219 movaps %xmm0,0x60(%rsp) 2220 pxor %xmm14,%xmm14 2221 pxor %xmm15,%xmm15 2222___ 2223$code.=<<___ if ($win64); 2224 movaps -0xa8(%r11),%xmm6 2225 movaps %xmm0,-0xa8(%r11) # clear stack 2226 movaps -0x98(%r11),%xmm7 2227 movaps %xmm0,-0x98(%r11) 2228 movaps -0x88(%r11),%xmm8 2229 movaps %xmm0,-0x88(%r11) 2230 movaps -0x78(%r11),%xmm9 2231 movaps %xmm0,-0x78(%r11) 2232 movaps -0x68(%r11),%xmm10 2233 movaps %xmm0,-0x68(%r11) 2234 movaps -0x58(%r11),%xmm11 2235 movaps %xmm0,-0x58(%r11) 2236 movaps -0x48(%r11),%xmm12 2237 movaps %xmm0,-0x48(%r11) 2238 movaps -0x38(%r11),%xmm13 2239 movaps %xmm0,-0x38(%r11) 2240 movaps -0x28(%r11),%xmm14 2241 movaps %xmm0,-0x28(%r11) 2242 movaps -0x18(%r11),%xmm15 2243 movaps %xmm0,-0x18(%r11) 2244 movaps %xmm0,0x00(%rsp) 2245 movaps %xmm0,0x10(%rsp) 2246 movaps %xmm0,0x20(%rsp) 2247 movaps %xmm0,0x30(%rsp) 2248 movaps %xmm0,0x40(%rsp) 2249 movaps %xmm0,0x50(%rsp) 2250 movaps %xmm0,0x60(%rsp) 2251___ 2252$code.=<<___; 2253 mov -8(%r11),%rbp 2254.cfi_restore %rbp 2255 lea (%r11),%rsp 2256.cfi_def_cfa_register %rsp 2257.Lxts_enc_epilogue: 2258 ret 2259.cfi_endproc 2260.size ${PREFIX}_xts_encrypt,.-${PREFIX}_xts_encrypt 2261___ 2262 2263$code.=<<___; 2264.globl ${PREFIX}_xts_decrypt 2265.type ${PREFIX}_xts_decrypt,\@function,6 2266.align 16 2267${PREFIX}_xts_decrypt: 2268.cfi_startproc 2269 lea (%rsp),%r11 # frame pointer 2270.cfi_def_cfa_register %r11 2271 push %rbp 2272.cfi_push %rbp 2273 sub \$$frame_size,%rsp 2274 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 2275___ 2276$code.=<<___ if ($win64); 2277 movaps %xmm6,-0xa8(%r11) # offload everything 2278 movaps %xmm7,-0x98(%r11) 2279 movaps %xmm8,-0x88(%r11) 2280 movaps %xmm9,-0x78(%r11) 2281 movaps %xmm10,-0x68(%r11) 2282 movaps %xmm11,-0x58(%r11) 2283 movaps %xmm12,-0x48(%r11) 2284 movaps %xmm13,-0x38(%r11) 2285 movaps %xmm14,-0x28(%r11) 2286 movaps %xmm15,-0x18(%r11) 2287.Lxts_dec_body: 2288___ 2289$code.=<<___; 2290 movups ($ivp),$inout0 # load clear-text tweak 2291 mov 240($key2),$rounds # key2->rounds 2292 mov 240($key),$rnds_ # key1->rounds 2293___ 2294 # generate the tweak 2295 &aesni_generate1("enc",$key2,$rounds,$inout0); 2296$code.=<<___; 2297 xor %eax,%eax # if ($len%16) len-=16; 2298 test \$15,$len 2299 setnz %al 2300 shl \$4,%rax 2301 sub %rax,$len 2302 2303 $movkey ($key),$rndkey0 # zero round key 2304 mov $key,$key_ # backup $key 2305 mov $rnds_,$rounds # backup $rounds 2306 shl \$4,$rnds_ 2307 mov $len,$len_ # backup $len 2308 and \$-16,$len 2309 2310 $movkey 
	16($key,$rnds_),$rndkey1	# last round key

	movdqa	.Lxts_magic(%rip),$twmask
	movdqa	$inout0,@tweak[5]
	pshufd	\$0x5f,$inout0,$twres
	pxor	$rndkey0,$rndkey1
___
    for ($i=0;$i<4;$i++) {
    $code.=<<___;
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	movdqa	@tweak[5],@tweak[$i]
	psrad	\$31,$twtmp			# broadcast upper bits
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twtmp
	pxor	$rndkey0,@tweak[$i]
	pxor	$twtmp,@tweak[5]
___
    }
$code.=<<___;
	movdqa	@tweak[5],@tweak[4]
	psrad	\$31,$twres
	paddq	@tweak[5],@tweak[5]
	pand	$twmask,$twres
	pxor	$rndkey0,@tweak[4]
	pxor	$twres,@tweak[5]
	movaps	$rndkey1,0x60(%rsp)		# save round[0]^round[last]

	sub	\$16*6,$len
	jc	.Lxts_dec_short			# if $len-=6*16 borrowed

	mov	\$16+96,$rounds
	lea	32($key_,$rnds_),$key		# end of key schedule
	sub	%r10,%rax			# twisted $rounds
	$movkey	16($key_),$rndkey1
	mov	%rax,%r10			# backup twisted $rounds
	lea	.Lxts_magic(%rip),%r8
	jmp	.Lxts_dec_grandloop

.align	32
.Lxts_dec_grandloop:
	movdqu	`16*0`($inp),$inout0		# load input
	movdqa	$rndkey0,$twmask
	movdqu	`16*1`($inp),$inout1
	pxor	@tweak[0],$inout0		# input^=tweak^round[0]
	movdqu	`16*2`($inp),$inout2
	pxor	@tweak[1],$inout1
	aesdec	$rndkey1,$inout0
	movdqu	`16*3`($inp),$inout3
	pxor	@tweak[2],$inout2
	aesdec	$rndkey1,$inout1
	movdqu	`16*4`($inp),$inout4
	pxor	@tweak[3],$inout3
	aesdec	$rndkey1,$inout2
	movdqu	`16*5`($inp),$inout5
	pxor	@tweak[5],$twmask		# round[0]^=tweak[5]
	movdqa	0x60(%rsp),$twres		# load round[0]^round[last]
	pxor	@tweak[4],$inout4
	aesdec	$rndkey1,$inout3
	$movkey	32($key_),$rndkey0
	lea	`16*6`($inp),$inp
	pxor	$twmask,$inout5

	pxor	$twres,@tweak[0]		# calculate tweaks^round[last]
	aesdec	$rndkey1,$inout4
	pxor	$twres,@tweak[1]
	movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks^last round key
	aesdec	$rndkey1,$inout5
	$movkey	48($key_),$rndkey1
	pxor	$twres,@tweak[2]

	aesdec	$rndkey0,$inout0
	pxor	$twres,@tweak[3]
	movdqa	@tweak[1],`16*1`(%rsp)
	aesdec	$rndkey0,$inout1
	pxor	$twres,@tweak[4]
	movdqa	@tweak[2],`16*2`(%rsp)
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	pxor	$twres,$twmask
	movdqa	@tweak[4],`16*4`(%rsp)
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	64($key_),$rndkey0
	movdqa	$twmask,`16*5`(%rsp)
	pshufd	\$0x5f,@tweak[5],$twres
	jmp	.Lxts_dec_loop6
.align	32
.Lxts_dec_loop6:
	aesdec	$rndkey1,$inout0
	aesdec	$rndkey1,$inout1
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	aesdec	$rndkey1,$inout5
	$movkey	-64($key,%rax),$rndkey1
	add	\$32,%rax

	aesdec	$rndkey0,$inout0
	aesdec	$rndkey0,$inout1
	aesdec	$rndkey0,$inout2
	aesdec	$rndkey0,$inout3
	aesdec	$rndkey0,$inout4
	aesdec	$rndkey0,$inout5
	$movkey	-80($key,%rax),$rndkey0
	jnz	.Lxts_dec_loop6

	movdqa	(%r8),$twmask			# start calculating next tweak
	movdqa	$twres,$twtmp
	paddd	$twres,$twres
	aesdec	$rndkey1,$inout0
	paddq	@tweak[5],@tweak[5]
	psrad	\$31,$twtmp
	aesdec	$rndkey1,$inout1
	pand	$twmask,$twtmp
	$movkey	($key_),@tweak[0]		# load round[0]
	aesdec	$rndkey1,$inout2
	aesdec	$rndkey1,$inout3
	aesdec	$rndkey1,$inout4
	pxor	$twtmp,@tweak[5]
	movaps	@tweak[0],@tweak[1]		# copy round[0]
2431 aesdec $rndkey1,$inout5 2432 $movkey -64($key),$rndkey1 2433 2434 movdqa $twres,$twtmp 2435 aesdec $rndkey0,$inout0 2436 paddd $twres,$twres 2437 pxor @tweak[5],@tweak[0] 2438 aesdec $rndkey0,$inout1 2439 psrad \$31,$twtmp 2440 paddq @tweak[5],@tweak[5] 2441 aesdec $rndkey0,$inout2 2442 aesdec $rndkey0,$inout3 2443 pand $twmask,$twtmp 2444 movaps @tweak[1],@tweak[2] 2445 aesdec $rndkey0,$inout4 2446 pxor $twtmp,@tweak[5] 2447 movdqa $twres,$twtmp 2448 aesdec $rndkey0,$inout5 2449 $movkey -48($key),$rndkey0 2450 2451 paddd $twres,$twres 2452 aesdec $rndkey1,$inout0 2453 pxor @tweak[5],@tweak[1] 2454 psrad \$31,$twtmp 2455 aesdec $rndkey1,$inout1 2456 paddq @tweak[5],@tweak[5] 2457 pand $twmask,$twtmp 2458 aesdec $rndkey1,$inout2 2459 aesdec $rndkey1,$inout3 2460 movdqa @tweak[3],`16*3`(%rsp) 2461 pxor $twtmp,@tweak[5] 2462 aesdec $rndkey1,$inout4 2463 movaps @tweak[2],@tweak[3] 2464 movdqa $twres,$twtmp 2465 aesdec $rndkey1,$inout5 2466 $movkey -32($key),$rndkey1 2467 2468 paddd $twres,$twres 2469 aesdec $rndkey0,$inout0 2470 pxor @tweak[5],@tweak[2] 2471 psrad \$31,$twtmp 2472 aesdec $rndkey0,$inout1 2473 paddq @tweak[5],@tweak[5] 2474 pand $twmask,$twtmp 2475 aesdec $rndkey0,$inout2 2476 aesdec $rndkey0,$inout3 2477 aesdec $rndkey0,$inout4 2478 pxor $twtmp,@tweak[5] 2479 movaps @tweak[3],@tweak[4] 2480 aesdec $rndkey0,$inout5 2481 2482 movdqa $twres,$rndkey0 2483 paddd $twres,$twres 2484 aesdec $rndkey1,$inout0 2485 pxor @tweak[5],@tweak[3] 2486 psrad \$31,$rndkey0 2487 aesdec $rndkey1,$inout1 2488 paddq @tweak[5],@tweak[5] 2489 pand $twmask,$rndkey0 2490 aesdec $rndkey1,$inout2 2491 aesdec $rndkey1,$inout3 2492 pxor $rndkey0,@tweak[5] 2493 $movkey ($key_),$rndkey0 2494 aesdec $rndkey1,$inout4 2495 aesdec $rndkey1,$inout5 2496 $movkey 16($key_),$rndkey1 2497 2498 pxor @tweak[5],@tweak[4] 2499 aesdeclast `16*0`(%rsp),$inout0 2500 psrad \$31,$twres 2501 paddq @tweak[5],@tweak[5] 2502 aesdeclast `16*1`(%rsp),$inout1 2503 aesdeclast `16*2`(%rsp),$inout2 2504 pand $twmask,$twres 2505 mov %r10,%rax # restore $rounds 2506 aesdeclast `16*3`(%rsp),$inout3 2507 aesdeclast `16*4`(%rsp),$inout4 2508 aesdeclast `16*5`(%rsp),$inout5 2509 pxor $twres,@tweak[5] 2510 2511 lea `16*6`($out),$out # $out+=6*16 2512 movups $inout0,`-16*6`($out) # store 6 output blocks 2513 movups $inout1,`-16*5`($out) 2514 movups $inout2,`-16*4`($out) 2515 movups $inout3,`-16*3`($out) 2516 movups $inout4,`-16*2`($out) 2517 movups $inout5,`-16*1`($out) 2518 sub \$16*6,$len 2519 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow 2520 2521 mov \$16+96,$rounds 2522 sub $rnds_,$rounds 2523 mov $key_,$key # restore $key 2524 shr \$4,$rounds # restore original value 2525 2526.Lxts_dec_short: 2527 # at the point @tweak[0..5] are populated with tweak values 2528 mov $rounds,$rnds_ # backup $rounds 2529 pxor $rndkey0,@tweak[0] 2530 pxor $rndkey0,@tweak[1] 2531 add \$16*6,$len # restore real remaining $len 2532 jz .Lxts_dec_done # done if ($len==0) 2533 2534 pxor $rndkey0,@tweak[2] 2535 cmp \$0x20,$len 2536 jb .Lxts_dec_one # $len is 1*16 2537 pxor $rndkey0,@tweak[3] 2538 je .Lxts_dec_two # $len is 2*16 2539 2540 pxor $rndkey0,@tweak[4] 2541 cmp \$0x40,$len 2542 jb .Lxts_dec_three # $len is 3*16 2543 je .Lxts_dec_four # $len is 4*16 2544 2545 movdqu ($inp),$inout0 # $len is 5*16 2546 movdqu 16*1($inp),$inout1 2547 movdqu 16*2($inp),$inout2 2548 pxor @tweak[0],$inout0 2549 movdqu 16*3($inp),$inout3 2550 pxor @tweak[1],$inout1 2551 movdqu 16*4($inp),$inout4 2552 lea 16*5($inp),$inp # $inp+=5*16 2553 pxor @tweak[2],$inout2 2554 
pxor @tweak[3],$inout3 2555 pxor @tweak[4],$inout4 2556 2557 call _aesni_decrypt6 2558 2559 xorps @tweak[0],$inout0 2560 xorps @tweak[1],$inout1 2561 xorps @tweak[2],$inout2 2562 movdqu $inout0,($out) # store 5 output blocks 2563 xorps @tweak[3],$inout3 2564 movdqu $inout1,16*1($out) 2565 xorps @tweak[4],$inout4 2566 movdqu $inout2,16*2($out) 2567 pxor $twtmp,$twtmp 2568 movdqu $inout3,16*3($out) 2569 pcmpgtd @tweak[5],$twtmp 2570 movdqu $inout4,16*4($out) 2571 lea 16*5($out),$out # $out+=5*16 2572 pshufd \$0x13,$twtmp,@tweak[1] # $twres 2573 and \$15,$len_ 2574 jz .Lxts_dec_ret 2575 2576 movdqa @tweak[5],@tweak[0] 2577 paddq @tweak[5],@tweak[5] # psllq 1,$tweak 2578 pand $twmask,@tweak[1] # isolate carry and residue 2579 pxor @tweak[5],@tweak[1] 2580 jmp .Lxts_dec_done2 2581 2582.align 16 2583.Lxts_dec_one: 2584 movups ($inp),$inout0 2585 lea 16*1($inp),$inp # $inp+=1*16 2586 xorps @tweak[0],$inout0 2587___ 2588 &aesni_generate1("dec",$key,$rounds); 2589$code.=<<___; 2590 xorps @tweak[0],$inout0 2591 movdqa @tweak[1],@tweak[0] 2592 movups $inout0,($out) # store one output block 2593 movdqa @tweak[2],@tweak[1] 2594 lea 16*1($out),$out # $out+=1*16 2595 jmp .Lxts_dec_done 2596 2597.align 16 2598.Lxts_dec_two: 2599 movups ($inp),$inout0 2600 movups 16($inp),$inout1 2601 lea 32($inp),$inp # $inp+=2*16 2602 xorps @tweak[0],$inout0 2603 xorps @tweak[1],$inout1 2604 2605 call _aesni_decrypt2 2606 2607 xorps @tweak[0],$inout0 2608 movdqa @tweak[2],@tweak[0] 2609 xorps @tweak[1],$inout1 2610 movdqa @tweak[3],@tweak[1] 2611 movups $inout0,($out) # store 2 output blocks 2612 movups $inout1,16*1($out) 2613 lea 16*2($out),$out # $out+=2*16 2614 jmp .Lxts_dec_done 2615 2616.align 16 2617.Lxts_dec_three: 2618 movups ($inp),$inout0 2619 movups 16*1($inp),$inout1 2620 movups 16*2($inp),$inout2 2621 lea 16*3($inp),$inp # $inp+=3*16 2622 xorps @tweak[0],$inout0 2623 xorps @tweak[1],$inout1 2624 xorps @tweak[2],$inout2 2625 2626 call _aesni_decrypt3 2627 2628 xorps @tweak[0],$inout0 2629 movdqa @tweak[3],@tweak[0] 2630 xorps @tweak[1],$inout1 2631 movdqa @tweak[4],@tweak[1] 2632 xorps @tweak[2],$inout2 2633 movups $inout0,($out) # store 3 output blocks 2634 movups $inout1,16*1($out) 2635 movups $inout2,16*2($out) 2636 lea 16*3($out),$out # $out+=3*16 2637 jmp .Lxts_dec_done 2638 2639.align 16 2640.Lxts_dec_four: 2641 movups ($inp),$inout0 2642 movups 16*1($inp),$inout1 2643 movups 16*2($inp),$inout2 2644 xorps @tweak[0],$inout0 2645 movups 16*3($inp),$inout3 2646 lea 16*4($inp),$inp # $inp+=4*16 2647 xorps @tweak[1],$inout1 2648 xorps @tweak[2],$inout2 2649 xorps @tweak[3],$inout3 2650 2651 call _aesni_decrypt4 2652 2653 pxor @tweak[0],$inout0 2654 movdqa @tweak[4],@tweak[0] 2655 pxor @tweak[1],$inout1 2656 movdqa @tweak[5],@tweak[1] 2657 pxor @tweak[2],$inout2 2658 movdqu $inout0,($out) # store 4 output blocks 2659 pxor @tweak[3],$inout3 2660 movdqu $inout1,16*1($out) 2661 movdqu $inout2,16*2($out) 2662 movdqu $inout3,16*3($out) 2663 lea 16*4($out),$out # $out+=4*16 2664 jmp .Lxts_dec_done 2665 2666.align 16 2667.Lxts_dec_done: 2668 and \$15,$len_ # see if $len%16 is 0 2669 jz .Lxts_dec_ret 2670.Lxts_dec_done2: 2671 mov $len_,$len 2672 mov $key_,$key # restore $key 2673 mov $rnds_,$rounds # restore $rounds 2674 2675 movups ($inp),$inout0 2676 xorps @tweak[1],$inout0 2677___ 2678 &aesni_generate1("dec",$key,$rounds); 2679$code.=<<___; 2680 xorps @tweak[1],$inout0 2681 movups $inout0,($out) 2682 2683.Lxts_dec_steal: 2684 movzb 16($inp),%eax # borrow $rounds ... 2685 movzb ($out),%ecx # ... 
and $key 2686 lea 1($inp),$inp 2687 mov %al,($out) 2688 mov %cl,16($out) 2689 lea 1($out),$out 2690 sub \$1,$len 2691 jnz .Lxts_dec_steal 2692 2693 sub $len_,$out # rewind $out 2694 mov $key_,$key # restore $key 2695 mov $rnds_,$rounds # restore $rounds 2696 2697 movups ($out),$inout0 2698 xorps @tweak[0],$inout0 2699___ 2700 &aesni_generate1("dec",$key,$rounds); 2701$code.=<<___; 2702 xorps @tweak[0],$inout0 2703 movups $inout0,($out) 2704 2705.Lxts_dec_ret: 2706 xorps %xmm0,%xmm0 # clear register bank 2707 pxor %xmm1,%xmm1 2708 pxor %xmm2,%xmm2 2709 pxor %xmm3,%xmm3 2710 pxor %xmm4,%xmm4 2711 pxor %xmm5,%xmm5 2712___ 2713$code.=<<___ if (!$win64); 2714 pxor %xmm6,%xmm6 2715 pxor %xmm7,%xmm7 2716 movaps %xmm0,0x00(%rsp) # clear stack 2717 pxor %xmm8,%xmm8 2718 movaps %xmm0,0x10(%rsp) 2719 pxor %xmm9,%xmm9 2720 movaps %xmm0,0x20(%rsp) 2721 pxor %xmm10,%xmm10 2722 movaps %xmm0,0x30(%rsp) 2723 pxor %xmm11,%xmm11 2724 movaps %xmm0,0x40(%rsp) 2725 pxor %xmm12,%xmm12 2726 movaps %xmm0,0x50(%rsp) 2727 pxor %xmm13,%xmm13 2728 movaps %xmm0,0x60(%rsp) 2729 pxor %xmm14,%xmm14 2730 pxor %xmm15,%xmm15 2731___ 2732$code.=<<___ if ($win64); 2733 movaps -0xa8(%r11),%xmm6 2734 movaps %xmm0,-0xa8(%r11) # clear stack 2735 movaps -0x98(%r11),%xmm7 2736 movaps %xmm0,-0x98(%r11) 2737 movaps -0x88(%r11),%xmm8 2738 movaps %xmm0,-0x88(%r11) 2739 movaps -0x78(%r11),%xmm9 2740 movaps %xmm0,-0x78(%r11) 2741 movaps -0x68(%r11),%xmm10 2742 movaps %xmm0,-0x68(%r11) 2743 movaps -0x58(%r11),%xmm11 2744 movaps %xmm0,-0x58(%r11) 2745 movaps -0x48(%r11),%xmm12 2746 movaps %xmm0,-0x48(%r11) 2747 movaps -0x38(%r11),%xmm13 2748 movaps %xmm0,-0x38(%r11) 2749 movaps -0x28(%r11),%xmm14 2750 movaps %xmm0,-0x28(%r11) 2751 movaps -0x18(%r11),%xmm15 2752 movaps %xmm0,-0x18(%r11) 2753 movaps %xmm0,0x00(%rsp) 2754 movaps %xmm0,0x10(%rsp) 2755 movaps %xmm0,0x20(%rsp) 2756 movaps %xmm0,0x30(%rsp) 2757 movaps %xmm0,0x40(%rsp) 2758 movaps %xmm0,0x50(%rsp) 2759 movaps %xmm0,0x60(%rsp) 2760___ 2761$code.=<<___; 2762 mov -8(%r11),%rbp 2763.cfi_restore %rbp 2764 lea (%r11),%rsp 2765.cfi_def_cfa_register %rsp 2766.Lxts_dec_epilogue: 2767 ret 2768.cfi_endproc 2769.size ${PREFIX}_xts_decrypt,.-${PREFIX}_xts_decrypt 2770___ 2771} 2772 2773###################################################################### 2774# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks, 2775# const AES_KEY *key, unsigned int start_block_num, 2776# unsigned char offset_i[16], const unsigned char L_[][16], 2777# unsigned char checksum[16]); 2778# 2779if (0) { # Omit these functions in BoringSSL 2780my @offset=map("%xmm$_",(10..15)); 2781my ($checksum,$rndkey0l)=("%xmm8","%xmm9"); 2782my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments 2783my ($L_p,$checksum_p) = ("%rbx","%rbp"); 2784my ($i1,$i3,$i5) = ("%r12","%r13","%r14"); 2785my $seventh_arg = $win64 ? 
56 : 8; 2786my $blocks = $len; 2787 2788$code.=<<___; 2789.globl ${PREFIX}_ocb_encrypt 2790.type ${PREFIX}_ocb_encrypt,\@function,6 2791.align 32 2792${PREFIX}_ocb_encrypt: 2793.cfi_startproc 2794 lea (%rsp),%rax 2795 push %rbx 2796.cfi_push %rbx 2797 push %rbp 2798.cfi_push %rbp 2799 push %r12 2800.cfi_push %r12 2801 push %r13 2802.cfi_push %r13 2803 push %r14 2804.cfi_push %r14 2805___ 2806$code.=<<___ if ($win64); 2807 lea -0xa0(%rsp),%rsp 2808 movaps %xmm6,0x00(%rsp) # offload everything 2809 movaps %xmm7,0x10(%rsp) 2810 movaps %xmm8,0x20(%rsp) 2811 movaps %xmm9,0x30(%rsp) 2812 movaps %xmm10,0x40(%rsp) 2813 movaps %xmm11,0x50(%rsp) 2814 movaps %xmm12,0x60(%rsp) 2815 movaps %xmm13,0x70(%rsp) 2816 movaps %xmm14,0x80(%rsp) 2817 movaps %xmm15,0x90(%rsp) 2818.Locb_enc_body: 2819___ 2820$code.=<<___; 2821 mov $seventh_arg(%rax),$L_p # 7th argument 2822 mov $seventh_arg+8(%rax),$checksum_p# 8th argument 2823 2824 mov 240($key),$rnds_ 2825 mov $key,$key_ 2826 shl \$4,$rnds_ 2827 $movkey ($key),$rndkey0l # round[0] 2828 $movkey 16($key,$rnds_),$rndkey1 # round[last] 2829 2830 movdqu ($offset_p),@offset[5] # load last offset_i 2831 pxor $rndkey1,$rndkey0l # round[0] ^ round[last] 2832 pxor $rndkey1,@offset[5] # offset_i ^ round[last] 2833 2834 mov \$16+32,$rounds 2835 lea 32($key_,$rnds_),$key 2836 $movkey 16($key_),$rndkey1 # round[1] 2837 sub %r10,%rax # twisted $rounds 2838 mov %rax,%r10 # backup twisted $rounds 2839 2840 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 2841 movdqu ($checksum_p),$checksum # load checksum 2842 2843 test \$1,$block_num # is first block number odd? 2844 jnz .Locb_enc_odd 2845 2846 bsf $block_num,$i1 2847 add \$1,$block_num 2848 shl \$4,$i1 2849 movdqu ($L_p,$i1),$inout5 # borrow 2850 movdqu ($inp),$inout0 2851 lea 16($inp),$inp 2852 2853 call __ocb_encrypt1 2854 2855 movdqa $inout5,@offset[5] 2856 movups $inout0,($out) 2857 lea 16($out),$out 2858 sub \$1,$blocks 2859 jz .Locb_enc_done 2860 2861.Locb_enc_odd: 2862 lea 1($block_num),$i1 # even-numbered blocks 2863 lea 3($block_num),$i3 2864 lea 5($block_num),$i5 2865 lea 6($block_num),$block_num 2866 bsf $i1,$i1 # ntz(block) 2867 bsf $i3,$i3 2868 bsf $i5,$i5 2869 shl \$4,$i1 # ntz(block) -> table offset 2870 shl \$4,$i3 2871 shl \$4,$i5 2872 2873 sub \$6,$blocks 2874 jc .Locb_enc_short 2875 jmp .Locb_enc_grandloop 2876 2877.align 32 2878.Locb_enc_grandloop: 2879 movdqu `16*0`($inp),$inout0 # load input 2880 movdqu `16*1`($inp),$inout1 2881 movdqu `16*2`($inp),$inout2 2882 movdqu `16*3`($inp),$inout3 2883 movdqu `16*4`($inp),$inout4 2884 movdqu `16*5`($inp),$inout5 2885 lea `16*6`($inp),$inp 2886 2887 call __ocb_encrypt6 2888 2889 movups $inout0,`16*0`($out) # store output 2890 movups $inout1,`16*1`($out) 2891 movups $inout2,`16*2`($out) 2892 movups $inout3,`16*3`($out) 2893 movups $inout4,`16*4`($out) 2894 movups $inout5,`16*5`($out) 2895 lea `16*6`($out),$out 2896 sub \$6,$blocks 2897 jnc .Locb_enc_grandloop 2898 2899.Locb_enc_short: 2900 add \$6,$blocks 2901 jz .Locb_enc_done 2902 2903 movdqu `16*0`($inp),$inout0 2904 cmp \$2,$blocks 2905 jb .Locb_enc_one 2906 movdqu `16*1`($inp),$inout1 2907 je .Locb_enc_two 2908 2909 movdqu `16*2`($inp),$inout2 2910 cmp \$4,$blocks 2911 jb .Locb_enc_three 2912 movdqu `16*3`($inp),$inout3 2913 je .Locb_enc_four 2914 2915 movdqu `16*4`($inp),$inout4 2916 pxor $inout5,$inout5 2917 2918 call __ocb_encrypt6 2919 2920 movdqa @offset[4],@offset[5] 2921 movups $inout0,`16*0`($out) 2922 movups $inout1,`16*1`($out) 2923 movups $inout2,`16*2`($out) 2924 movups 
$inout3,`16*3`($out) 2925 movups $inout4,`16*4`($out) 2926 2927 jmp .Locb_enc_done 2928 2929.align 16 2930.Locb_enc_one: 2931 movdqa @offset[0],$inout5 # borrow 2932 2933 call __ocb_encrypt1 2934 2935 movdqa $inout5,@offset[5] 2936 movups $inout0,`16*0`($out) 2937 jmp .Locb_enc_done 2938 2939.align 16 2940.Locb_enc_two: 2941 pxor $inout2,$inout2 2942 pxor $inout3,$inout3 2943 2944 call __ocb_encrypt4 2945 2946 movdqa @offset[1],@offset[5] 2947 movups $inout0,`16*0`($out) 2948 movups $inout1,`16*1`($out) 2949 2950 jmp .Locb_enc_done 2951 2952.align 16 2953.Locb_enc_three: 2954 pxor $inout3,$inout3 2955 2956 call __ocb_encrypt4 2957 2958 movdqa @offset[2],@offset[5] 2959 movups $inout0,`16*0`($out) 2960 movups $inout1,`16*1`($out) 2961 movups $inout2,`16*2`($out) 2962 2963 jmp .Locb_enc_done 2964 2965.align 16 2966.Locb_enc_four: 2967 call __ocb_encrypt4 2968 2969 movdqa @offset[3],@offset[5] 2970 movups $inout0,`16*0`($out) 2971 movups $inout1,`16*1`($out) 2972 movups $inout2,`16*2`($out) 2973 movups $inout3,`16*3`($out) 2974 2975.Locb_enc_done: 2976 pxor $rndkey0,@offset[5] # "remove" round[last] 2977 movdqu $checksum,($checksum_p) # store checksum 2978 movdqu @offset[5],($offset_p) # store last offset_i 2979 2980 xorps %xmm0,%xmm0 # clear register bank 2981 pxor %xmm1,%xmm1 2982 pxor %xmm2,%xmm2 2983 pxor %xmm3,%xmm3 2984 pxor %xmm4,%xmm4 2985 pxor %xmm5,%xmm5 2986___ 2987$code.=<<___ if (!$win64); 2988 pxor %xmm6,%xmm6 2989 pxor %xmm7,%xmm7 2990 pxor %xmm8,%xmm8 2991 pxor %xmm9,%xmm9 2992 pxor %xmm10,%xmm10 2993 pxor %xmm11,%xmm11 2994 pxor %xmm12,%xmm12 2995 pxor %xmm13,%xmm13 2996 pxor %xmm14,%xmm14 2997 pxor %xmm15,%xmm15 2998 lea 0x28(%rsp),%rax 2999.cfi_def_cfa %rax,8 3000___ 3001$code.=<<___ if ($win64); 3002 movaps 0x00(%rsp),%xmm6 3003 movaps %xmm0,0x00(%rsp) # clear stack 3004 movaps 0x10(%rsp),%xmm7 3005 movaps %xmm0,0x10(%rsp) 3006 movaps 0x20(%rsp),%xmm8 3007 movaps %xmm0,0x20(%rsp) 3008 movaps 0x30(%rsp),%xmm9 3009 movaps %xmm0,0x30(%rsp) 3010 movaps 0x40(%rsp),%xmm10 3011 movaps %xmm0,0x40(%rsp) 3012 movaps 0x50(%rsp),%xmm11 3013 movaps %xmm0,0x50(%rsp) 3014 movaps 0x60(%rsp),%xmm12 3015 movaps %xmm0,0x60(%rsp) 3016 movaps 0x70(%rsp),%xmm13 3017 movaps %xmm0,0x70(%rsp) 3018 movaps 0x80(%rsp),%xmm14 3019 movaps %xmm0,0x80(%rsp) 3020 movaps 0x90(%rsp),%xmm15 3021 movaps %xmm0,0x90(%rsp) 3022 lea 0xa0+0x28(%rsp),%rax 3023.Locb_enc_pop: 3024___ 3025$code.=<<___; 3026 mov -40(%rax),%r14 3027.cfi_restore %r14 3028 mov -32(%rax),%r13 3029.cfi_restore %r13 3030 mov -24(%rax),%r12 3031.cfi_restore %r12 3032 mov -16(%rax),%rbp 3033.cfi_restore %rbp 3034 mov -8(%rax),%rbx 3035.cfi_restore %rbx 3036 lea (%rax),%rsp 3037.cfi_def_cfa_register %rsp 3038.Locb_enc_epilogue: 3039 ret 3040.cfi_endproc 3041.size ${PREFIX}_ocb_encrypt,.-${PREFIX}_ocb_encrypt 3042 3043.type __ocb_encrypt6,\@abi-omnipotent 3044.align 32 3045__ocb_encrypt6: 3046 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3047 movdqu ($L_p,$i1),@offset[1] 3048 movdqa @offset[0],@offset[2] 3049 movdqu ($L_p,$i3),@offset[3] 3050 movdqa @offset[0],@offset[4] 3051 pxor @offset[5],@offset[0] 3052 movdqu ($L_p,$i5),@offset[5] 3053 pxor @offset[0],@offset[1] 3054 pxor $inout0,$checksum # accumulate checksum 3055 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3056 pxor @offset[1],@offset[2] 3057 pxor $inout1,$checksum 3058 pxor @offset[1],$inout1 3059 pxor @offset[2],@offset[3] 3060 pxor $inout2,$checksum 3061 pxor @offset[2],$inout2 3062 pxor @offset[3],@offset[4] 3063 pxor $inout3,$checksum 3064 pxor @offset[3],$inout3 3065 
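	# Added note on the offset chaining in this helper: OCB gives block i
	# the offset Offset_i = Offset_{i-1} xor L[ntz(i)], where ntz(i) is
	# computed by the bsf instructions below and scaled into a byte index
	# into the caller-supplied L_ table; round[0] and round[last] are
	# folded into the offsets here so they need not be xor-ed separately.
	# A rough per-block sketch (hypothetical names, not this module's
	# interface):
	#
	#	offset  ^= L[ntz(block_number)];
	#	cipher   = AES_encrypt(plain ^ offset) ^ offset;
	#	checksum ^= plain;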
pxor @offset[4],@offset[5] 3066 pxor $inout4,$checksum 3067 pxor @offset[4],$inout4 3068 pxor $inout5,$checksum 3069 pxor @offset[5],$inout5 3070 $movkey 32($key_),$rndkey0 3071 3072 lea 1($block_num),$i1 # even-numbered blocks 3073 lea 3($block_num),$i3 3074 lea 5($block_num),$i5 3075 add \$6,$block_num 3076 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3077 bsf $i1,$i1 # ntz(block) 3078 bsf $i3,$i3 3079 bsf $i5,$i5 3080 3081 aesenc $rndkey1,$inout0 3082 aesenc $rndkey1,$inout1 3083 aesenc $rndkey1,$inout2 3084 aesenc $rndkey1,$inout3 3085 pxor $rndkey0l,@offset[1] 3086 pxor $rndkey0l,@offset[2] 3087 aesenc $rndkey1,$inout4 3088 pxor $rndkey0l,@offset[3] 3089 pxor $rndkey0l,@offset[4] 3090 aesenc $rndkey1,$inout5 3091 $movkey 48($key_),$rndkey1 3092 pxor $rndkey0l,@offset[5] 3093 3094 aesenc $rndkey0,$inout0 3095 aesenc $rndkey0,$inout1 3096 aesenc $rndkey0,$inout2 3097 aesenc $rndkey0,$inout3 3098 aesenc $rndkey0,$inout4 3099 aesenc $rndkey0,$inout5 3100 $movkey 64($key_),$rndkey0 3101 shl \$4,$i1 # ntz(block) -> table offset 3102 shl \$4,$i3 3103 jmp .Locb_enc_loop6 3104 3105.align 32 3106.Locb_enc_loop6: 3107 aesenc $rndkey1,$inout0 3108 aesenc $rndkey1,$inout1 3109 aesenc $rndkey1,$inout2 3110 aesenc $rndkey1,$inout3 3111 aesenc $rndkey1,$inout4 3112 aesenc $rndkey1,$inout5 3113 $movkey ($key,%rax),$rndkey1 3114 add \$32,%rax 3115 3116 aesenc $rndkey0,$inout0 3117 aesenc $rndkey0,$inout1 3118 aesenc $rndkey0,$inout2 3119 aesenc $rndkey0,$inout3 3120 aesenc $rndkey0,$inout4 3121 aesenc $rndkey0,$inout5 3122 $movkey -16($key,%rax),$rndkey0 3123 jnz .Locb_enc_loop6 3124 3125 aesenc $rndkey1,$inout0 3126 aesenc $rndkey1,$inout1 3127 aesenc $rndkey1,$inout2 3128 aesenc $rndkey1,$inout3 3129 aesenc $rndkey1,$inout4 3130 aesenc $rndkey1,$inout5 3131 $movkey 16($key_),$rndkey1 3132 shl \$4,$i5 3133 3134 aesenclast @offset[0],$inout0 3135 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3136 mov %r10,%rax # restore twisted rounds 3137 aesenclast @offset[1],$inout1 3138 aesenclast @offset[2],$inout2 3139 aesenclast @offset[3],$inout3 3140 aesenclast @offset[4],$inout4 3141 aesenclast @offset[5],$inout5 3142 ret 3143.size __ocb_encrypt6,.-__ocb_encrypt6 3144 3145.type __ocb_encrypt4,\@abi-omnipotent 3146.align 32 3147__ocb_encrypt4: 3148 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3149 movdqu ($L_p,$i1),@offset[1] 3150 movdqa @offset[0],@offset[2] 3151 movdqu ($L_p,$i3),@offset[3] 3152 pxor @offset[5],@offset[0] 3153 pxor @offset[0],@offset[1] 3154 pxor $inout0,$checksum # accumulate checksum 3155 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3156 pxor @offset[1],@offset[2] 3157 pxor $inout1,$checksum 3158 pxor @offset[1],$inout1 3159 pxor @offset[2],@offset[3] 3160 pxor $inout2,$checksum 3161 pxor @offset[2],$inout2 3162 pxor $inout3,$checksum 3163 pxor @offset[3],$inout3 3164 $movkey 32($key_),$rndkey0 3165 3166 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3167 pxor $rndkey0l,@offset[1] 3168 pxor $rndkey0l,@offset[2] 3169 pxor $rndkey0l,@offset[3] 3170 3171 aesenc $rndkey1,$inout0 3172 aesenc $rndkey1,$inout1 3173 aesenc $rndkey1,$inout2 3174 aesenc $rndkey1,$inout3 3175 $movkey 48($key_),$rndkey1 3176 3177 aesenc $rndkey0,$inout0 3178 aesenc $rndkey0,$inout1 3179 aesenc $rndkey0,$inout2 3180 aesenc $rndkey0,$inout3 3181 $movkey 64($key_),$rndkey0 3182 jmp .Locb_enc_loop4 3183 3184.align 32 3185.Locb_enc_loop4: 3186 aesenc $rndkey1,$inout0 3187 aesenc $rndkey1,$inout1 3188 aesenc $rndkey1,$inout2 3189 aesenc $rndkey1,$inout3 3190 $movkey ($key,%rax),$rndkey1 
3191 add \$32,%rax 3192 3193 aesenc $rndkey0,$inout0 3194 aesenc $rndkey0,$inout1 3195 aesenc $rndkey0,$inout2 3196 aesenc $rndkey0,$inout3 3197 $movkey -16($key,%rax),$rndkey0 3198 jnz .Locb_enc_loop4 3199 3200 aesenc $rndkey1,$inout0 3201 aesenc $rndkey1,$inout1 3202 aesenc $rndkey1,$inout2 3203 aesenc $rndkey1,$inout3 3204 $movkey 16($key_),$rndkey1 3205 mov %r10,%rax # restore twisted rounds 3206 3207 aesenclast @offset[0],$inout0 3208 aesenclast @offset[1],$inout1 3209 aesenclast @offset[2],$inout2 3210 aesenclast @offset[3],$inout3 3211 ret 3212.size __ocb_encrypt4,.-__ocb_encrypt4 3213 3214.type __ocb_encrypt1,\@abi-omnipotent 3215.align 32 3216__ocb_encrypt1: 3217 pxor @offset[5],$inout5 # offset_i 3218 pxor $rndkey0l,$inout5 # offset_i ^ round[0] 3219 pxor $inout0,$checksum # accumulate checksum 3220 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i 3221 $movkey 32($key_),$rndkey0 3222 3223 aesenc $rndkey1,$inout0 3224 $movkey 48($key_),$rndkey1 3225 pxor $rndkey0l,$inout5 # offset_i ^ round[last] 3226 3227 aesenc $rndkey0,$inout0 3228 $movkey 64($key_),$rndkey0 3229 jmp .Locb_enc_loop1 3230 3231.align 32 3232.Locb_enc_loop1: 3233 aesenc $rndkey1,$inout0 3234 $movkey ($key,%rax),$rndkey1 3235 add \$32,%rax 3236 3237 aesenc $rndkey0,$inout0 3238 $movkey -16($key,%rax),$rndkey0 3239 jnz .Locb_enc_loop1 3240 3241 aesenc $rndkey1,$inout0 3242 $movkey 16($key_),$rndkey1 # redundant in tail 3243 mov %r10,%rax # restore twisted rounds 3244 3245 aesenclast $inout5,$inout0 3246 ret 3247.size __ocb_encrypt1,.-__ocb_encrypt1 3248 3249.globl ${PREFIX}_ocb_decrypt 3250.type ${PREFIX}_ocb_decrypt,\@function,6 3251.align 32 3252${PREFIX}_ocb_decrypt: 3253.cfi_startproc 3254 lea (%rsp),%rax 3255 push %rbx 3256.cfi_push %rbx 3257 push %rbp 3258.cfi_push %rbp 3259 push %r12 3260.cfi_push %r12 3261 push %r13 3262.cfi_push %r13 3263 push %r14 3264.cfi_push %r14 3265___ 3266$code.=<<___ if ($win64); 3267 lea -0xa0(%rsp),%rsp 3268 movaps %xmm6,0x00(%rsp) # offload everything 3269 movaps %xmm7,0x10(%rsp) 3270 movaps %xmm8,0x20(%rsp) 3271 movaps %xmm9,0x30(%rsp) 3272 movaps %xmm10,0x40(%rsp) 3273 movaps %xmm11,0x50(%rsp) 3274 movaps %xmm12,0x60(%rsp) 3275 movaps %xmm13,0x70(%rsp) 3276 movaps %xmm14,0x80(%rsp) 3277 movaps %xmm15,0x90(%rsp) 3278.Locb_dec_body: 3279___ 3280$code.=<<___; 3281 mov $seventh_arg(%rax),$L_p # 7th argument 3282 mov $seventh_arg+8(%rax),$checksum_p# 8th argument 3283 3284 mov 240($key),$rnds_ 3285 mov $key,$key_ 3286 shl \$4,$rnds_ 3287 $movkey ($key),$rndkey0l # round[0] 3288 $movkey 16($key,$rnds_),$rndkey1 # round[last] 3289 3290 movdqu ($offset_p),@offset[5] # load last offset_i 3291 pxor $rndkey1,$rndkey0l # round[0] ^ round[last] 3292 pxor $rndkey1,@offset[5] # offset_i ^ round[last] 3293 3294 mov \$16+32,$rounds 3295 lea 32($key_,$rnds_),$key 3296 $movkey 16($key_),$rndkey1 # round[1] 3297 sub %r10,%rax # twisted $rounds 3298 mov %rax,%r10 # backup twisted $rounds 3299 3300 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3301 movdqu ($checksum_p),$checksum # load checksum 3302 3303 test \$1,$block_num # is first block number odd? 
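	# Added explanation: if the starting block number is even, a single
	# block is peeled off below before the 6-block loop.  Odd-numbered
	# blocks always use L_0 (their ntz is 0), so the bulk path only
	# computes ntz for the even-numbered blocks of each group and relies
	# on every group starting at an odd block number.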
3304 jnz .Locb_dec_odd 3305 3306 bsf $block_num,$i1 3307 add \$1,$block_num 3308 shl \$4,$i1 3309 movdqu ($L_p,$i1),$inout5 # borrow 3310 movdqu ($inp),$inout0 3311 lea 16($inp),$inp 3312 3313 call __ocb_decrypt1 3314 3315 movdqa $inout5,@offset[5] 3316 movups $inout0,($out) 3317 xorps $inout0,$checksum # accumulate checksum 3318 lea 16($out),$out 3319 sub \$1,$blocks 3320 jz .Locb_dec_done 3321 3322.Locb_dec_odd: 3323 lea 1($block_num),$i1 # even-numbered blocks 3324 lea 3($block_num),$i3 3325 lea 5($block_num),$i5 3326 lea 6($block_num),$block_num 3327 bsf $i1,$i1 # ntz(block) 3328 bsf $i3,$i3 3329 bsf $i5,$i5 3330 shl \$4,$i1 # ntz(block) -> table offset 3331 shl \$4,$i3 3332 shl \$4,$i5 3333 3334 sub \$6,$blocks 3335 jc .Locb_dec_short 3336 jmp .Locb_dec_grandloop 3337 3338.align 32 3339.Locb_dec_grandloop: 3340 movdqu `16*0`($inp),$inout0 # load input 3341 movdqu `16*1`($inp),$inout1 3342 movdqu `16*2`($inp),$inout2 3343 movdqu `16*3`($inp),$inout3 3344 movdqu `16*4`($inp),$inout4 3345 movdqu `16*5`($inp),$inout5 3346 lea `16*6`($inp),$inp 3347 3348 call __ocb_decrypt6 3349 3350 movups $inout0,`16*0`($out) # store output 3351 pxor $inout0,$checksum # accumulate checksum 3352 movups $inout1,`16*1`($out) 3353 pxor $inout1,$checksum 3354 movups $inout2,`16*2`($out) 3355 pxor $inout2,$checksum 3356 movups $inout3,`16*3`($out) 3357 pxor $inout3,$checksum 3358 movups $inout4,`16*4`($out) 3359 pxor $inout4,$checksum 3360 movups $inout5,`16*5`($out) 3361 pxor $inout5,$checksum 3362 lea `16*6`($out),$out 3363 sub \$6,$blocks 3364 jnc .Locb_dec_grandloop 3365 3366.Locb_dec_short: 3367 add \$6,$blocks 3368 jz .Locb_dec_done 3369 3370 movdqu `16*0`($inp),$inout0 3371 cmp \$2,$blocks 3372 jb .Locb_dec_one 3373 movdqu `16*1`($inp),$inout1 3374 je .Locb_dec_two 3375 3376 movdqu `16*2`($inp),$inout2 3377 cmp \$4,$blocks 3378 jb .Locb_dec_three 3379 movdqu `16*3`($inp),$inout3 3380 je .Locb_dec_four 3381 3382 movdqu `16*4`($inp),$inout4 3383 pxor $inout5,$inout5 3384 3385 call __ocb_decrypt6 3386 3387 movdqa @offset[4],@offset[5] 3388 movups $inout0,`16*0`($out) # store output 3389 pxor $inout0,$checksum # accumulate checksum 3390 movups $inout1,`16*1`($out) 3391 pxor $inout1,$checksum 3392 movups $inout2,`16*2`($out) 3393 pxor $inout2,$checksum 3394 movups $inout3,`16*3`($out) 3395 pxor $inout3,$checksum 3396 movups $inout4,`16*4`($out) 3397 pxor $inout4,$checksum 3398 3399 jmp .Locb_dec_done 3400 3401.align 16 3402.Locb_dec_one: 3403 movdqa @offset[0],$inout5 # borrow 3404 3405 call __ocb_decrypt1 3406 3407 movdqa $inout5,@offset[5] 3408 movups $inout0,`16*0`($out) # store output 3409 xorps $inout0,$checksum # accumulate checksum 3410 jmp .Locb_dec_done 3411 3412.align 16 3413.Locb_dec_two: 3414 pxor $inout2,$inout2 3415 pxor $inout3,$inout3 3416 3417 call __ocb_decrypt4 3418 3419 movdqa @offset[1],@offset[5] 3420 movups $inout0,`16*0`($out) # store output 3421 xorps $inout0,$checksum # accumulate checksum 3422 movups $inout1,`16*1`($out) 3423 xorps $inout1,$checksum 3424 3425 jmp .Locb_dec_done 3426 3427.align 16 3428.Locb_dec_three: 3429 pxor $inout3,$inout3 3430 3431 call __ocb_decrypt4 3432 3433 movdqa @offset[2],@offset[5] 3434 movups $inout0,`16*0`($out) # store output 3435 xorps $inout0,$checksum # accumulate checksum 3436 movups $inout1,`16*1`($out) 3437 xorps $inout1,$checksum 3438 movups $inout2,`16*2`($out) 3439 xorps $inout2,$checksum 3440 3441 jmp .Locb_dec_done 3442 3443.align 16 3444.Locb_dec_four: 3445 call __ocb_decrypt4 3446 3447 movdqa @offset[3],@offset[5] 3448 movups 
$inout0,`16*0`($out) # store output 3449 pxor $inout0,$checksum # accumulate checksum 3450 movups $inout1,`16*1`($out) 3451 pxor $inout1,$checksum 3452 movups $inout2,`16*2`($out) 3453 pxor $inout2,$checksum 3454 movups $inout3,`16*3`($out) 3455 pxor $inout3,$checksum 3456 3457.Locb_dec_done: 3458 pxor $rndkey0,@offset[5] # "remove" round[last] 3459 movdqu $checksum,($checksum_p) # store checksum 3460 movdqu @offset[5],($offset_p) # store last offset_i 3461 3462 xorps %xmm0,%xmm0 # clear register bank 3463 pxor %xmm1,%xmm1 3464 pxor %xmm2,%xmm2 3465 pxor %xmm3,%xmm3 3466 pxor %xmm4,%xmm4 3467 pxor %xmm5,%xmm5 3468___ 3469$code.=<<___ if (!$win64); 3470 pxor %xmm6,%xmm6 3471 pxor %xmm7,%xmm7 3472 pxor %xmm8,%xmm8 3473 pxor %xmm9,%xmm9 3474 pxor %xmm10,%xmm10 3475 pxor %xmm11,%xmm11 3476 pxor %xmm12,%xmm12 3477 pxor %xmm13,%xmm13 3478 pxor %xmm14,%xmm14 3479 pxor %xmm15,%xmm15 3480 lea 0x28(%rsp),%rax 3481.cfi_def_cfa %rax,8 3482___ 3483$code.=<<___ if ($win64); 3484 movaps 0x00(%rsp),%xmm6 3485 movaps %xmm0,0x00(%rsp) # clear stack 3486 movaps 0x10(%rsp),%xmm7 3487 movaps %xmm0,0x10(%rsp) 3488 movaps 0x20(%rsp),%xmm8 3489 movaps %xmm0,0x20(%rsp) 3490 movaps 0x30(%rsp),%xmm9 3491 movaps %xmm0,0x30(%rsp) 3492 movaps 0x40(%rsp),%xmm10 3493 movaps %xmm0,0x40(%rsp) 3494 movaps 0x50(%rsp),%xmm11 3495 movaps %xmm0,0x50(%rsp) 3496 movaps 0x60(%rsp),%xmm12 3497 movaps %xmm0,0x60(%rsp) 3498 movaps 0x70(%rsp),%xmm13 3499 movaps %xmm0,0x70(%rsp) 3500 movaps 0x80(%rsp),%xmm14 3501 movaps %xmm0,0x80(%rsp) 3502 movaps 0x90(%rsp),%xmm15 3503 movaps %xmm0,0x90(%rsp) 3504 lea 0xa0+0x28(%rsp),%rax 3505.Locb_dec_pop: 3506___ 3507$code.=<<___; 3508 mov -40(%rax),%r14 3509.cfi_restore %r14 3510 mov -32(%rax),%r13 3511.cfi_restore %r13 3512 mov -24(%rax),%r12 3513.cfi_restore %r12 3514 mov -16(%rax),%rbp 3515.cfi_restore %rbp 3516 mov -8(%rax),%rbx 3517.cfi_restore %rbx 3518 lea (%rax),%rsp 3519.cfi_def_cfa_register %rsp 3520.Locb_dec_epilogue: 3521 ret 3522.cfi_endproc 3523.size ${PREFIX}_ocb_decrypt,.-${PREFIX}_ocb_decrypt 3524 3525.type __ocb_decrypt6,\@abi-omnipotent 3526.align 32 3527__ocb_decrypt6: 3528 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3529 movdqu ($L_p,$i1),@offset[1] 3530 movdqa @offset[0],@offset[2] 3531 movdqu ($L_p,$i3),@offset[3] 3532 movdqa @offset[0],@offset[4] 3533 pxor @offset[5],@offset[0] 3534 movdqu ($L_p,$i5),@offset[5] 3535 pxor @offset[0],@offset[1] 3536 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3537 pxor @offset[1],@offset[2] 3538 pxor @offset[1],$inout1 3539 pxor @offset[2],@offset[3] 3540 pxor @offset[2],$inout2 3541 pxor @offset[3],@offset[4] 3542 pxor @offset[3],$inout3 3543 pxor @offset[4],@offset[5] 3544 pxor @offset[4],$inout4 3545 pxor @offset[5],$inout5 3546 $movkey 32($key_),$rndkey0 3547 3548 lea 1($block_num),$i1 # even-numbered blocks 3549 lea 3($block_num),$i3 3550 lea 5($block_num),$i5 3551 add \$6,$block_num 3552 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3553 bsf $i1,$i1 # ntz(block) 3554 bsf $i3,$i3 3555 bsf $i5,$i5 3556 3557 aesdec $rndkey1,$inout0 3558 aesdec $rndkey1,$inout1 3559 aesdec $rndkey1,$inout2 3560 aesdec $rndkey1,$inout3 3561 pxor $rndkey0l,@offset[1] 3562 pxor $rndkey0l,@offset[2] 3563 aesdec $rndkey1,$inout4 3564 pxor $rndkey0l,@offset[3] 3565 pxor $rndkey0l,@offset[4] 3566 aesdec $rndkey1,$inout5 3567 $movkey 48($key_),$rndkey1 3568 pxor $rndkey0l,@offset[5] 3569 3570 aesdec $rndkey0,$inout0 3571 aesdec $rndkey0,$inout1 3572 aesdec $rndkey0,$inout2 3573 aesdec $rndkey0,$inout3 3574 aesdec $rndkey0,$inout4 3575 aesdec 
$rndkey0,$inout5 3576 $movkey 64($key_),$rndkey0 3577 shl \$4,$i1 # ntz(block) -> table offset 3578 shl \$4,$i3 3579 jmp .Locb_dec_loop6 3580 3581.align 32 3582.Locb_dec_loop6: 3583 aesdec $rndkey1,$inout0 3584 aesdec $rndkey1,$inout1 3585 aesdec $rndkey1,$inout2 3586 aesdec $rndkey1,$inout3 3587 aesdec $rndkey1,$inout4 3588 aesdec $rndkey1,$inout5 3589 $movkey ($key,%rax),$rndkey1 3590 add \$32,%rax 3591 3592 aesdec $rndkey0,$inout0 3593 aesdec $rndkey0,$inout1 3594 aesdec $rndkey0,$inout2 3595 aesdec $rndkey0,$inout3 3596 aesdec $rndkey0,$inout4 3597 aesdec $rndkey0,$inout5 3598 $movkey -16($key,%rax),$rndkey0 3599 jnz .Locb_dec_loop6 3600 3601 aesdec $rndkey1,$inout0 3602 aesdec $rndkey1,$inout1 3603 aesdec $rndkey1,$inout2 3604 aesdec $rndkey1,$inout3 3605 aesdec $rndkey1,$inout4 3606 aesdec $rndkey1,$inout5 3607 $movkey 16($key_),$rndkey1 3608 shl \$4,$i5 3609 3610 aesdeclast @offset[0],$inout0 3611 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks 3612 mov %r10,%rax # restore twisted rounds 3613 aesdeclast @offset[1],$inout1 3614 aesdeclast @offset[2],$inout2 3615 aesdeclast @offset[3],$inout3 3616 aesdeclast @offset[4],$inout4 3617 aesdeclast @offset[5],$inout5 3618 ret 3619.size __ocb_decrypt6,.-__ocb_decrypt6 3620 3621.type __ocb_decrypt4,\@abi-omnipotent 3622.align 32 3623__ocb_decrypt4: 3624 pxor $rndkey0l,@offset[5] # offset_i ^ round[0] 3625 movdqu ($L_p,$i1),@offset[1] 3626 movdqa @offset[0],@offset[2] 3627 movdqu ($L_p,$i3),@offset[3] 3628 pxor @offset[5],@offset[0] 3629 pxor @offset[0],@offset[1] 3630 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i 3631 pxor @offset[1],@offset[2] 3632 pxor @offset[1],$inout1 3633 pxor @offset[2],@offset[3] 3634 pxor @offset[2],$inout2 3635 pxor @offset[3],$inout3 3636 $movkey 32($key_),$rndkey0 3637 3638 pxor $rndkey0l,@offset[0] # offset_i ^ round[last] 3639 pxor $rndkey0l,@offset[1] 3640 pxor $rndkey0l,@offset[2] 3641 pxor $rndkey0l,@offset[3] 3642 3643 aesdec $rndkey1,$inout0 3644 aesdec $rndkey1,$inout1 3645 aesdec $rndkey1,$inout2 3646 aesdec $rndkey1,$inout3 3647 $movkey 48($key_),$rndkey1 3648 3649 aesdec $rndkey0,$inout0 3650 aesdec $rndkey0,$inout1 3651 aesdec $rndkey0,$inout2 3652 aesdec $rndkey0,$inout3 3653 $movkey 64($key_),$rndkey0 3654 jmp .Locb_dec_loop4 3655 3656.align 32 3657.Locb_dec_loop4: 3658 aesdec $rndkey1,$inout0 3659 aesdec $rndkey1,$inout1 3660 aesdec $rndkey1,$inout2 3661 aesdec $rndkey1,$inout3 3662 $movkey ($key,%rax),$rndkey1 3663 add \$32,%rax 3664 3665 aesdec $rndkey0,$inout0 3666 aesdec $rndkey0,$inout1 3667 aesdec $rndkey0,$inout2 3668 aesdec $rndkey0,$inout3 3669 $movkey -16($key,%rax),$rndkey0 3670 jnz .Locb_dec_loop4 3671 3672 aesdec $rndkey1,$inout0 3673 aesdec $rndkey1,$inout1 3674 aesdec $rndkey1,$inout2 3675 aesdec $rndkey1,$inout3 3676 $movkey 16($key_),$rndkey1 3677 mov %r10,%rax # restore twisted rounds 3678 3679 aesdeclast @offset[0],$inout0 3680 aesdeclast @offset[1],$inout1 3681 aesdeclast @offset[2],$inout2 3682 aesdeclast @offset[3],$inout3 3683 ret 3684.size __ocb_decrypt4,.-__ocb_decrypt4 3685 3686.type __ocb_decrypt1,\@abi-omnipotent 3687.align 32 3688__ocb_decrypt1: 3689 pxor @offset[5],$inout5 # offset_i 3690 pxor $rndkey0l,$inout5 # offset_i ^ round[0] 3691 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i 3692 $movkey 32($key_),$rndkey0 3693 3694 aesdec $rndkey1,$inout0 3695 $movkey 48($key_),$rndkey1 3696 pxor $rndkey0l,$inout5 # offset_i ^ round[last] 3697 3698 aesdec $rndkey0,$inout0 3699 $movkey 64($key_),$rndkey0 3700 jmp .Locb_dec_loop1 3701 3702.align 32 
3703.Locb_dec_loop1: 3704 aesdec $rndkey1,$inout0 3705 $movkey ($key,%rax),$rndkey1 3706 add \$32,%rax 3707 3708 aesdec $rndkey0,$inout0 3709 $movkey -16($key,%rax),$rndkey0 3710 jnz .Locb_dec_loop1 3711 3712 aesdec $rndkey1,$inout0 3713 $movkey 16($key_),$rndkey1 # redundant in tail 3714 mov %r10,%rax # restore twisted rounds 3715 3716 aesdeclast $inout5,$inout0 3717 ret 3718.size __ocb_decrypt1,.-__ocb_decrypt1 3719___ 3720} }} 3721 3722######################################################################## 3723# void $PREFIX_cbc_encrypt (const void *inp, void *out, 3724# size_t length, const AES_KEY *key, 3725# unsigned char *ivp,const int enc); 3726{ 3727my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt 3728my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15)); 3729 3730$code.=<<___; 3731.globl ${PREFIX}_cbc_encrypt 3732.type ${PREFIX}_cbc_encrypt,\@function,6 3733.align 16 3734${PREFIX}_cbc_encrypt: 3735.cfi_startproc 3736 test $len,$len # check length 3737 jz .Lcbc_ret 3738 3739 mov 240($key),$rnds_ # key->rounds 3740 mov $key,$key_ # backup $key 3741 test %r9d,%r9d # 6th argument 3742 jz .Lcbc_decrypt 3743#--------------------------- CBC ENCRYPT ------------------------------# 3744 movups ($ivp),$inout0 # load iv as initial state 3745 mov $rnds_,$rounds 3746 cmp \$16,$len 3747 jb .Lcbc_enc_tail 3748 sub \$16,$len 3749 jmp .Lcbc_enc_loop 3750.align 16 3751.Lcbc_enc_loop: 3752 movups ($inp),$inout1 # load input 3753 lea 16($inp),$inp 3754 #xorps $inout1,$inout0 3755___ 3756 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); 3757$code.=<<___; 3758 mov $rnds_,$rounds # restore $rounds 3759 mov $key_,$key # restore $key 3760 movups $inout0,0($out) # store output 3761 lea 16($out),$out 3762 sub \$16,$len 3763 jnc .Lcbc_enc_loop 3764 add \$16,$len 3765 jnz .Lcbc_enc_tail 3766 pxor $rndkey0,$rndkey0 # clear register bank 3767 pxor $rndkey1,$rndkey1 3768 movups $inout0,($ivp) 3769 pxor $inout0,$inout0 3770 pxor $inout1,$inout1 3771 jmp .Lcbc_ret 3772 3773.Lcbc_enc_tail: 3774 mov $len,%rcx # zaps $key 3775 xchg $inp,$out # $inp is %rsi and $out is %rdi now 3776 .long 0x9066A4F3 # rep movsb 3777 mov \$16,%ecx # zero tail 3778 sub $len,%rcx 3779 xor %eax,%eax 3780 .long 0x9066AAF3 # rep stosb 3781 lea -16(%rdi),%rdi # rewind $out by 1 block 3782 mov $rnds_,$rounds # restore $rounds 3783 mov %rdi,%rsi # $inp and $out are the same 3784 mov $key_,$key # restore $key 3785 xor $len,$len # len=16 3786 jmp .Lcbc_enc_loop # one more spin 3787#--------------------------- CBC DECRYPT ------------------------------# 3788.align 16 3789.Lcbc_decrypt: 3790 cmp \$16,$len 3791 jne .Lcbc_decrypt_bulk 3792 3793 # handle single block without allocating stack frame, 3794 # useful in ciphertext stealing mode 3795 movdqu ($inp),$inout0 # load input 3796 movdqu ($ivp),$inout1 # load iv 3797 movdqa $inout0,$inout2 # future iv 3798___ 3799 &aesni_generate1("dec",$key,$rnds_); 3800$code.=<<___; 3801 pxor $rndkey0,$rndkey0 # clear register bank 3802 pxor $rndkey1,$rndkey1 3803 movdqu $inout2,($ivp) # store iv 3804 xorps $inout1,$inout0 # ^=iv 3805 pxor $inout1,$inout1 3806 movups $inout0,($out) # store output 3807 pxor $inout0,$inout0 3808 jmp .Lcbc_ret 3809.align 16 3810.Lcbc_decrypt_bulk: 3811 lea (%rsp),%r11 # frame pointer 3812.cfi_def_cfa_register %r11 3813 push %rbp 3814.cfi_push %rbp 3815 sub \$$frame_size,%rsp 3816 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded 3817___ 3818$code.=<<___ if ($win64); 3819 movaps %xmm6,0x10(%rsp) 3820 movaps %xmm7,0x20(%rsp) 3821 movaps 
%xmm8,0x30(%rsp) 3822 movaps %xmm9,0x40(%rsp) 3823 movaps %xmm10,0x50(%rsp) 3824 movaps %xmm11,0x60(%rsp) 3825 movaps %xmm12,0x70(%rsp) 3826 movaps %xmm13,0x80(%rsp) 3827 movaps %xmm14,0x90(%rsp) 3828 movaps %xmm15,0xa0(%rsp) 3829.Lcbc_decrypt_body: 3830___ 3831 3832my $inp_=$key_="%rbp"; # reassign $key_ 3833 3834$code.=<<___; 3835 mov $key,$key_ # [re-]backup $key [after reassignment] 3836 movups ($ivp),$iv 3837 mov $rnds_,$rounds 3838 cmp \$0x50,$len 3839 jbe .Lcbc_dec_tail 3840 3841 $movkey ($key),$rndkey0 3842 movdqu 0x00($inp),$inout0 # load input 3843 movdqu 0x10($inp),$inout1 3844 movdqa $inout0,$in0 3845 movdqu 0x20($inp),$inout2 3846 movdqa $inout1,$in1 3847 movdqu 0x30($inp),$inout3 3848 movdqa $inout2,$in2 3849 movdqu 0x40($inp),$inout4 3850 movdqa $inout3,$in3 3851 movdqu 0x50($inp),$inout5 3852 movdqa $inout4,$in4 3853 leaq OPENSSL_ia32cap_P(%rip),%r9 3854 mov 4(%r9),%r9d 3855 cmp \$0x70,$len 3856 jbe .Lcbc_dec_six_or_seven 3857 3858 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE 3859 sub \$0x50,$len # $len is biased by -5*16 3860 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE 3861 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont] 3862 sub \$0x20,$len # $len is biased by -7*16 3863 lea 0x70($key),$key # size optimization 3864 jmp .Lcbc_dec_loop8_enter 3865.align 16 3866.Lcbc_dec_loop8: 3867 movups $inout7,($out) 3868 lea 0x10($out),$out 3869.Lcbc_dec_loop8_enter: 3870 movdqu 0x60($inp),$inout6 3871 pxor $rndkey0,$inout0 3872 movdqu 0x70($inp),$inout7 3873 pxor $rndkey0,$inout1 3874 $movkey 0x10-0x70($key),$rndkey1 3875 pxor $rndkey0,$inout2 3876 mov \$-1,$inp_ 3877 cmp \$0x70,$len # is there at least 0x60 bytes ahead? 3878 pxor $rndkey0,$inout3 3879 pxor $rndkey0,$inout4 3880 pxor $rndkey0,$inout5 3881 pxor $rndkey0,$inout6 3882 3883 aesdec $rndkey1,$inout0 3884 pxor $rndkey0,$inout7 3885 $movkey 0x20-0x70($key),$rndkey0 3886 aesdec $rndkey1,$inout1 3887 aesdec $rndkey1,$inout2 3888 aesdec $rndkey1,$inout3 3889 aesdec $rndkey1,$inout4 3890 aesdec $rndkey1,$inout5 3891 aesdec $rndkey1,$inout6 3892 adc \$0,$inp_ 3893 and \$128,$inp_ 3894 aesdec $rndkey1,$inout7 3895 add $inp,$inp_ 3896 $movkey 0x30-0x70($key),$rndkey1 3897___ 3898for($i=1;$i<12;$i++) { 3899my $rndkeyx = ($i&1)?$rndkey0:$rndkey1; 3900$code.=<<___ if ($i==7); 3901 cmp \$11,$rounds 3902___ 3903$code.=<<___; 3904 aesdec $rndkeyx,$inout0 3905 aesdec $rndkeyx,$inout1 3906 aesdec $rndkeyx,$inout2 3907 aesdec $rndkeyx,$inout3 3908 aesdec $rndkeyx,$inout4 3909 aesdec $rndkeyx,$inout5 3910 aesdec $rndkeyx,$inout6 3911 aesdec $rndkeyx,$inout7 3912 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx 3913___ 3914$code.=<<___ if ($i<6 || (!($i&1) && $i>7)); 3915 nop 3916___ 3917$code.=<<___ if ($i==7); 3918 jb .Lcbc_dec_done 3919___ 3920$code.=<<___ if ($i==9); 3921 je .Lcbc_dec_done 3922___ 3923$code.=<<___ if ($i==11); 3924 jmp .Lcbc_dec_done 3925___ 3926} 3927$code.=<<___; 3928.align 16 3929.Lcbc_dec_done: 3930 aesdec $rndkey1,$inout0 3931 aesdec $rndkey1,$inout1 3932 pxor $rndkey0,$iv 3933 pxor $rndkey0,$in0 3934 aesdec $rndkey1,$inout2 3935 aesdec $rndkey1,$inout3 3936 pxor $rndkey0,$in1 3937 pxor $rndkey0,$in2 3938 aesdec $rndkey1,$inout4 3939 aesdec $rndkey1,$inout5 3940 pxor $rndkey0,$in3 3941 pxor $rndkey0,$in4 3942 aesdec $rndkey1,$inout6 3943 aesdec $rndkey1,$inout7 3944 movdqu 0x50($inp),$rndkey1 3945 3946 aesdeclast $iv,$inout0 3947 movdqu 0x60($inp),$iv # borrow $iv 3948 pxor $rndkey0,$rndkey1 3949 aesdeclast $in0,$inout1 3950 pxor $rndkey0,$iv 3951 movdqu 0x70($inp),$rndkey0 # next IV 3952 aesdeclast 
$in1,$inout2 3953 lea 0x80($inp),$inp 3954 movdqu 0x00($inp_),$in0 3955 aesdeclast $in2,$inout3 3956 aesdeclast $in3,$inout4 3957 movdqu 0x10($inp_),$in1 3958 movdqu 0x20($inp_),$in2 3959 aesdeclast $in4,$inout5 3960 aesdeclast $rndkey1,$inout6 3961 movdqu 0x30($inp_),$in3 3962 movdqu 0x40($inp_),$in4 3963 aesdeclast $iv,$inout7 3964 movdqa $rndkey0,$iv # return $iv 3965 movdqu 0x50($inp_),$rndkey1 3966 $movkey -0x70($key),$rndkey0 3967 3968 movups $inout0,($out) # store output 3969 movdqa $in0,$inout0 3970 movups $inout1,0x10($out) 3971 movdqa $in1,$inout1 3972 movups $inout2,0x20($out) 3973 movdqa $in2,$inout2 3974 movups $inout3,0x30($out) 3975 movdqa $in3,$inout3 3976 movups $inout4,0x40($out) 3977 movdqa $in4,$inout4 3978 movups $inout5,0x50($out) 3979 movdqa $rndkey1,$inout5 3980 movups $inout6,0x60($out) 3981 lea 0x70($out),$out 3982 3983 sub \$0x80,$len 3984 ja .Lcbc_dec_loop8 3985 3986 movaps $inout7,$inout0 3987 lea -0x70($key),$key 3988 add \$0x70,$len 3989 jle .Lcbc_dec_clear_tail_collected 3990 movups $inout7,($out) 3991 lea 0x10($out),$out 3992 cmp \$0x50,$len 3993 jbe .Lcbc_dec_tail 3994 3995 movaps $in0,$inout0 3996.Lcbc_dec_six_or_seven: 3997 cmp \$0x60,$len 3998 ja .Lcbc_dec_seven 3999 4000 movaps $inout5,$inout6 4001 call _aesni_decrypt6 4002 pxor $iv,$inout0 # ^= IV 4003 movaps $inout6,$iv 4004 pxor $in0,$inout1 4005 movdqu $inout0,($out) 4006 pxor $in1,$inout2 4007 movdqu $inout1,0x10($out) 4008 pxor $inout1,$inout1 # clear register bank 4009 pxor $in2,$inout3 4010 movdqu $inout2,0x20($out) 4011 pxor $inout2,$inout2 4012 pxor $in3,$inout4 4013 movdqu $inout3,0x30($out) 4014 pxor $inout3,$inout3 4015 pxor $in4,$inout5 4016 movdqu $inout4,0x40($out) 4017 pxor $inout4,$inout4 4018 lea 0x50($out),$out 4019 movdqa $inout5,$inout0 4020 pxor $inout5,$inout5 4021 jmp .Lcbc_dec_tail_collected 4022 4023.align 16 4024.Lcbc_dec_seven: 4025 movups 0x60($inp),$inout6 4026 xorps $inout7,$inout7 4027 call _aesni_decrypt8 4028 movups 0x50($inp),$inout7 4029 pxor $iv,$inout0 # ^= IV 4030 movups 0x60($inp),$iv 4031 pxor $in0,$inout1 4032 movdqu $inout0,($out) 4033 pxor $in1,$inout2 4034 movdqu $inout1,0x10($out) 4035 pxor $inout1,$inout1 # clear register bank 4036 pxor $in2,$inout3 4037 movdqu $inout2,0x20($out) 4038 pxor $inout2,$inout2 4039 pxor $in3,$inout4 4040 movdqu $inout3,0x30($out) 4041 pxor $inout3,$inout3 4042 pxor $in4,$inout5 4043 movdqu $inout4,0x40($out) 4044 pxor $inout4,$inout4 4045 pxor $inout7,$inout6 4046 movdqu $inout5,0x50($out) 4047 pxor $inout5,$inout5 4048 lea 0x60($out),$out 4049 movdqa $inout6,$inout0 4050 pxor $inout6,$inout6 4051 pxor $inout7,$inout7 4052 jmp .Lcbc_dec_tail_collected 4053 4054.align 16 4055.Lcbc_dec_loop6: 4056 movups $inout5,($out) 4057 lea 0x10($out),$out 4058 movdqu 0x00($inp),$inout0 # load input 4059 movdqu 0x10($inp),$inout1 4060 movdqa $inout0,$in0 4061 movdqu 0x20($inp),$inout2 4062 movdqa $inout1,$in1 4063 movdqu 0x30($inp),$inout3 4064 movdqa $inout2,$in2 4065 movdqu 0x40($inp),$inout4 4066 movdqa $inout3,$in3 4067 movdqu 0x50($inp),$inout5 4068 movdqa $inout4,$in4 4069.Lcbc_dec_loop6_enter: 4070 lea 0x60($inp),$inp 4071 movdqa $inout5,$inout6 4072 4073 call _aesni_decrypt6 4074 4075 pxor $iv,$inout0 # ^= IV 4076 movdqa $inout6,$iv 4077 pxor $in0,$inout1 4078 movdqu $inout0,($out) 4079 pxor $in1,$inout2 4080 movdqu $inout1,0x10($out) 4081 pxor $in2,$inout3 4082 movdqu $inout2,0x20($out) 4083 pxor $in3,$inout4 4084 mov $key_,$key 4085 movdqu $inout3,0x30($out) 4086 pxor $in4,$inout5 4087 mov $rnds_,$rounds 4088 movdqu 
$inout4,0x40($out) 4089 lea 0x50($out),$out 4090 sub \$0x60,$len 4091 ja .Lcbc_dec_loop6 4092 4093 movdqa $inout5,$inout0 4094 add \$0x50,$len 4095 jle .Lcbc_dec_clear_tail_collected 4096 movups $inout5,($out) 4097 lea 0x10($out),$out 4098 4099.Lcbc_dec_tail: 4100 movups ($inp),$inout0 4101 sub \$0x10,$len 4102 jbe .Lcbc_dec_one # $len is 1*16 or less 4103 4104 movups 0x10($inp),$inout1 4105 movaps $inout0,$in0 4106 sub \$0x10,$len 4107 jbe .Lcbc_dec_two # $len is 2*16 or less 4108 4109 movups 0x20($inp),$inout2 4110 movaps $inout1,$in1 4111 sub \$0x10,$len 4112 jbe .Lcbc_dec_three # $len is 3*16 or less 4113 4114 movups 0x30($inp),$inout3 4115 movaps $inout2,$in2 4116 sub \$0x10,$len 4117 jbe .Lcbc_dec_four # $len is 4*16 or less 4118 4119 movups 0x40($inp),$inout4 # $len is 5*16 or less 4120 movaps $inout3,$in3 4121 movaps $inout4,$in4 4122 xorps $inout5,$inout5 4123 call _aesni_decrypt6 4124 pxor $iv,$inout0 4125 movaps $in4,$iv 4126 pxor $in0,$inout1 4127 movdqu $inout0,($out) 4128 pxor $in1,$inout2 4129 movdqu $inout1,0x10($out) 4130 pxor $inout1,$inout1 # clear register bank 4131 pxor $in2,$inout3 4132 movdqu $inout2,0x20($out) 4133 pxor $inout2,$inout2 4134 pxor $in3,$inout4 4135 movdqu $inout3,0x30($out) 4136 pxor $inout3,$inout3 4137 lea 0x40($out),$out 4138 movdqa $inout4,$inout0 4139 pxor $inout4,$inout4 4140 pxor $inout5,$inout5 4141 sub \$0x10,$len 4142 jmp .Lcbc_dec_tail_collected 4143 4144.align 16 4145.Lcbc_dec_one: 4146 movaps $inout0,$in0 4147___ 4148 &aesni_generate1("dec",$key,$rounds); 4149$code.=<<___; 4150 xorps $iv,$inout0 4151 movaps $in0,$iv 4152 jmp .Lcbc_dec_tail_collected 4153.align 16 4154.Lcbc_dec_two: 4155 movaps $inout1,$in1 4156 call _aesni_decrypt2 4157 pxor $iv,$inout0 4158 movaps $in1,$iv 4159 pxor $in0,$inout1 4160 movdqu $inout0,($out) 4161 movdqa $inout1,$inout0 4162 pxor $inout1,$inout1 # clear register bank 4163 lea 0x10($out),$out 4164 jmp .Lcbc_dec_tail_collected 4165.align 16 4166.Lcbc_dec_three: 4167 movaps $inout2,$in2 4168 call _aesni_decrypt3 4169 pxor $iv,$inout0 4170 movaps $in2,$iv 4171 pxor $in0,$inout1 4172 movdqu $inout0,($out) 4173 pxor $in1,$inout2 4174 movdqu $inout1,0x10($out) 4175 pxor $inout1,$inout1 # clear register bank 4176 movdqa $inout2,$inout0 4177 pxor $inout2,$inout2 4178 lea 0x20($out),$out 4179 jmp .Lcbc_dec_tail_collected 4180.align 16 4181.Lcbc_dec_four: 4182 movaps $inout3,$in3 4183 call _aesni_decrypt4 4184 pxor $iv,$inout0 4185 movaps $in3,$iv 4186 pxor $in0,$inout1 4187 movdqu $inout0,($out) 4188 pxor $in1,$inout2 4189 movdqu $inout1,0x10($out) 4190 pxor $inout1,$inout1 # clear register bank 4191 pxor $in2,$inout3 4192 movdqu $inout2,0x20($out) 4193 pxor $inout2,$inout2 4194 movdqa $inout3,$inout0 4195 pxor $inout3,$inout3 4196 lea 0x30($out),$out 4197 jmp .Lcbc_dec_tail_collected 4198 4199.align 16 4200.Lcbc_dec_clear_tail_collected: 4201 pxor $inout1,$inout1 # clear register bank 4202 pxor $inout2,$inout2 4203 pxor $inout3,$inout3 4204___ 4205$code.=<<___ if (!$win64); 4206 pxor $inout4,$inout4 # %xmm6..9 4207 pxor $inout5,$inout5 4208 pxor $inout6,$inout6 4209 pxor $inout7,$inout7 4210___ 4211$code.=<<___; 4212.Lcbc_dec_tail_collected: 4213 movups $iv,($ivp) 4214 and \$15,$len 4215 jnz .Lcbc_dec_tail_partial 4216 movups $inout0,($out) 4217 pxor $inout0,$inout0 4218 jmp .Lcbc_dec_ret 4219.align 16 4220.Lcbc_dec_tail_partial: 4221 movaps $inout0,(%rsp) 4222 pxor $inout0,$inout0 4223 mov \$16,%rcx 4224 mov $out,%rdi 4225 sub $len,%rcx 4226 lea (%rsp),%rsi 4227 .long 0x9066A4F3 # rep movsb 4228 movdqa 
$inout0,(%rsp) 4229 4230.Lcbc_dec_ret: 4231 xorps $rndkey0,$rndkey0 # %xmm0 4232 pxor $rndkey1,$rndkey1 4233___ 4234$code.=<<___ if ($win64); 4235 movaps 0x10(%rsp),%xmm6 4236 movaps %xmm0,0x10(%rsp) # clear stack 4237 movaps 0x20(%rsp),%xmm7 4238 movaps %xmm0,0x20(%rsp) 4239 movaps 0x30(%rsp),%xmm8 4240 movaps %xmm0,0x30(%rsp) 4241 movaps 0x40(%rsp),%xmm9 4242 movaps %xmm0,0x40(%rsp) 4243 movaps 0x50(%rsp),%xmm10 4244 movaps %xmm0,0x50(%rsp) 4245 movaps 0x60(%rsp),%xmm11 4246 movaps %xmm0,0x60(%rsp) 4247 movaps 0x70(%rsp),%xmm12 4248 movaps %xmm0,0x70(%rsp) 4249 movaps 0x80(%rsp),%xmm13 4250 movaps %xmm0,0x80(%rsp) 4251 movaps 0x90(%rsp),%xmm14 4252 movaps %xmm0,0x90(%rsp) 4253 movaps 0xa0(%rsp),%xmm15 4254 movaps %xmm0,0xa0(%rsp) 4255___ 4256$code.=<<___; 4257 mov -8(%r11),%rbp 4258.cfi_restore %rbp 4259 lea (%r11),%rsp 4260.cfi_def_cfa_register %rsp 4261.Lcbc_ret: 4262 ret 4263.cfi_endproc 4264.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt 4265___ 4266} 4267# int ${PREFIX}_set_decrypt_key(const unsigned char *inp, 4268# int bits, AES_KEY *key) 4269# 4270# input: $inp user-supplied key 4271# $bits $inp length in bits 4272# $key pointer to key schedule 4273# output: %eax 0 denoting success, -1 or -2 - failure (see C) 4274# *$key key schedule 4275# 4276{ my ($inp,$bits,$key) = @_4args; 4277 $bits =~ s/%r/%e/; 4278 4279$code.=<<___; 4280.globl ${PREFIX}_set_decrypt_key 4281.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent 4282.align 16 4283${PREFIX}_set_decrypt_key: 4284.cfi_startproc 4285 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 4286.cfi_adjust_cfa_offset 8 4287 call __aesni_set_encrypt_key 4288 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key 4289 test %eax,%eax 4290 jnz .Ldec_key_ret 4291 lea 16($key,$bits),$inp # points at the end of key schedule 4292 4293 $movkey ($key),%xmm0 # just swap 4294 $movkey ($inp),%xmm1 4295 $movkey %xmm0,($inp) 4296 $movkey %xmm1,($key) 4297 lea 16($key),$key 4298 lea -16($inp),$inp 4299 4300.Ldec_key_inverse: 4301 $movkey ($key),%xmm0 # swap and inverse 4302 $movkey ($inp),%xmm1 4303 aesimc %xmm0,%xmm0 4304 aesimc %xmm1,%xmm1 4305 lea 16($key),$key 4306 lea -16($inp),$inp 4307 $movkey %xmm0,16($inp) 4308 $movkey %xmm1,-16($key) 4309 cmp $key,$inp 4310 ja .Ldec_key_inverse 4311 4312 $movkey ($key),%xmm0 # inverse middle 4313 aesimc %xmm0,%xmm0 4314 pxor %xmm1,%xmm1 4315 $movkey %xmm0,($inp) 4316 pxor %xmm0,%xmm0 4317.Ldec_key_ret: 4318 add \$8,%rsp 4319.cfi_adjust_cfa_offset -8 4320 ret 4321.cfi_endproc 4322.LSEH_end_set_decrypt_key: 4323.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key 4324___ 4325 4326# This is based on submission from Intel by 4327# Huang Ying 4328# Vinodh Gopal 4329# Kahraman Akdemir 4330# 4331# Aggressively optimized in respect to aeskeygenassist's critical path 4332# and is contained in %xmm0-5 to meet Win64 ABI requirement. 4333# 4334# int ${PREFIX}_set_encrypt_key(const unsigned char *inp, 4335# int bits, AES_KEY * const key); 4336# 4337# input: $inp user-supplied key 4338# $bits $inp length in bits 4339# $key pointer to key schedule 4340# output: %eax 0 denoting success, -1 or -2 - failure (see C) 4341# $bits rounds-1 (used in aesni_set_decrypt_key) 4342# *$key key schedule 4343# $key pointer to key schedule (used in 4344# aesni_set_decrypt_key) 4345# 4346# Subroutine is frame-less, which means that only volatile registers 4347# are used. Note that it's declared "abi-omnipotent", which means that 4348# amount of volatile registers is smaller on Windows. 
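#
# A minimal caller-side sketch (illustrative only, not part of this module;
# it assumes $PREFIX expands to "aes_hw" as tested elsewhere in this file,
# and that user_key, in, out, len and iv are caller-supplied). It ties the
# key-setup routines documented above to the CBC entry point whose prototype
# appears earlier in this file:
#
#	AES_KEY ks;
#	if (aes_hw_set_encrypt_key(user_key, 128, &ks) != 0)
#		abort();	/* -1: NULL argument, -2: unsupported key bits */
#	aes_hw_cbc_encrypt(in, out, len, &ks, iv, 1);	/* enc!=0 selects encrypt */
#
# For CBC decryption the schedule would instead come from
# aes_hw_set_decrypt_key and the last argument would be 0.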
4349# 4350$code.=<<___; 4351.globl ${PREFIX}_set_encrypt_key 4352.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent 4353.align 16 4354${PREFIX}_set_encrypt_key: 4355__aesni_set_encrypt_key: 4356.cfi_startproc 4357#ifndef NDEBUG 4358#ifndef BORINGSSL_FIPS 4359 movb \$1,BORINGSSL_function_hit+3(%rip) 4360#endif 4361#endif 4362 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 4363.cfi_adjust_cfa_offset 8 4364 mov \$-1,%rax 4365 test $inp,$inp 4366 jz .Lenc_key_ret 4367 test $key,$key 4368 jz .Lenc_key_ret 4369 4370 movups ($inp),%xmm0 # pull first 128 bits of *userKey 4371 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 4372 leaq OPENSSL_ia32cap_P(%rip),%r10 4373 movl 4(%r10),%r10d 4374 and \$`1<<28|1<<11`,%r10d # AVX and XOP bits 4375 lea 16($key),%rax # %rax is used as modifiable copy of $key 4376 cmp \$256,$bits 4377 je .L14rounds 4378 cmp \$192,$bits 4379 je .L12rounds 4380 cmp \$128,$bits 4381 jne .Lbad_keybits 4382 4383.L10rounds: 4384 mov \$9,$bits # 10 rounds for 128-bit key 4385 cmp \$`1<<28`,%r10d # AVX, bit no XOP 4386 je .L10rounds_alt 4387 4388 $movkey %xmm0,($key) # round 0 4389 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1 4390 call .Lkey_expansion_128_cold 4391 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2 4392 call .Lkey_expansion_128 4393 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3 4394 call .Lkey_expansion_128 4395 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4 4396 call .Lkey_expansion_128 4397 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5 4398 call .Lkey_expansion_128 4399 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6 4400 call .Lkey_expansion_128 4401 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7 4402 call .Lkey_expansion_128 4403 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8 4404 call .Lkey_expansion_128 4405 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9 4406 call .Lkey_expansion_128 4407 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10 4408 call .Lkey_expansion_128 4409 $movkey %xmm0,(%rax) 4410 mov $bits,80(%rax) # 240(%rdx) 4411 xor %eax,%eax 4412 jmp .Lenc_key_ret 4413 4414.align 16 4415.L10rounds_alt: 4416 movdqa .Lkey_rotate(%rip),%xmm5 4417 mov \$8,%r10d 4418 movdqa .Lkey_rcon1(%rip),%xmm4 4419 movdqa %xmm0,%xmm2 4420 movdqu %xmm0,($key) 4421 jmp .Loop_key128 4422 4423.align 16 4424.Loop_key128: 4425 pshufb %xmm5,%xmm0 4426 aesenclast %xmm4,%xmm0 4427 pslld \$1,%xmm4 4428 lea 16(%rax),%rax 4429 4430 movdqa %xmm2,%xmm3 4431 pslldq \$4,%xmm2 4432 pxor %xmm2,%xmm3 4433 pslldq \$4,%xmm2 4434 pxor %xmm2,%xmm3 4435 pslldq \$4,%xmm2 4436 pxor %xmm3,%xmm2 4437 4438 pxor %xmm2,%xmm0 4439 movdqu %xmm0,-16(%rax) 4440 movdqa %xmm0,%xmm2 4441 4442 dec %r10d 4443 jnz .Loop_key128 4444 4445 movdqa .Lkey_rcon1b(%rip),%xmm4 4446 4447 pshufb %xmm5,%xmm0 4448 aesenclast %xmm4,%xmm0 4449 pslld \$1,%xmm4 4450 4451 movdqa %xmm2,%xmm3 4452 pslldq \$4,%xmm2 4453 pxor %xmm2,%xmm3 4454 pslldq \$4,%xmm2 4455 pxor %xmm2,%xmm3 4456 pslldq \$4,%xmm2 4457 pxor %xmm3,%xmm2 4458 4459 pxor %xmm2,%xmm0 4460 movdqu %xmm0,(%rax) 4461 4462 movdqa %xmm0,%xmm2 4463 pshufb %xmm5,%xmm0 4464 aesenclast %xmm4,%xmm0 4465 4466 movdqa %xmm2,%xmm3 4467 pslldq \$4,%xmm2 4468 pxor %xmm2,%xmm3 4469 pslldq \$4,%xmm2 4470 pxor %xmm2,%xmm3 4471 pslldq \$4,%xmm2 4472 pxor %xmm3,%xmm2 4473 4474 pxor %xmm2,%xmm0 4475 movdqu %xmm0,16(%rax) 4476 4477 mov $bits,96(%rax) # 240($key) 4478 xor %eax,%eax 4479 jmp .Lenc_key_ret 4480 4481.align 16 4482.L12rounds: 4483 movq 16($inp),%xmm2 # remaining 1/3 of *userKey 4484 mov \$11,$bits # 12 rounds for 192 4485 cmp \$`1<<28`,%r10d # AVX, but no XOP 4486 je .L12rounds_alt 4487 4488 $movkey %xmm0,($key) # 
round 0 4489 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2 4490 call .Lkey_expansion_192a_cold 4491 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3 4492 call .Lkey_expansion_192b 4493 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5 4494 call .Lkey_expansion_192a 4495 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6 4496 call .Lkey_expansion_192b 4497 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8 4498 call .Lkey_expansion_192a 4499 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9 4500 call .Lkey_expansion_192b 4501 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11 4502 call .Lkey_expansion_192a 4503 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12 4504 call .Lkey_expansion_192b 4505 $movkey %xmm0,(%rax) 4506 mov $bits,48(%rax) # 240(%rdx) 4507 xor %rax, %rax 4508 jmp .Lenc_key_ret 4509 4510.align 16 4511.L12rounds_alt: 4512 movdqa .Lkey_rotate192(%rip),%xmm5 4513 movdqa .Lkey_rcon1(%rip),%xmm4 4514 mov \$8,%r10d 4515 movdqu %xmm0,($key) 4516 jmp .Loop_key192 4517 4518.align 16 4519.Loop_key192: 4520 movq %xmm2,0(%rax) 4521 movdqa %xmm2,%xmm1 4522 pshufb %xmm5,%xmm2 4523 aesenclast %xmm4,%xmm2 4524 pslld \$1, %xmm4 4525 lea 24(%rax),%rax 4526 4527 movdqa %xmm0,%xmm3 4528 pslldq \$4,%xmm0 4529 pxor %xmm0,%xmm3 4530 pslldq \$4,%xmm0 4531 pxor %xmm0,%xmm3 4532 pslldq \$4,%xmm0 4533 pxor %xmm3,%xmm0 4534 4535 pshufd \$0xff,%xmm0,%xmm3 4536 pxor %xmm1,%xmm3 4537 pslldq \$4,%xmm1 4538 pxor %xmm1,%xmm3 4539 4540 pxor %xmm2,%xmm0 4541 pxor %xmm3,%xmm2 4542 movdqu %xmm0,-16(%rax) 4543 4544 dec %r10d 4545 jnz .Loop_key192 4546 4547 mov $bits,32(%rax) # 240($key) 4548 xor %eax,%eax 4549 jmp .Lenc_key_ret 4550 4551.align 16 4552.L14rounds: 4553 movups 16($inp),%xmm2 # remaining half of *userKey 4554 mov \$13,$bits # 14 rounds for 256 4555 lea 16(%rax),%rax 4556 cmp \$`1<<28`,%r10d # AVX, but no XOP 4557 je .L14rounds_alt 4558 4559 $movkey %xmm0,($key) # round 0 4560 $movkey %xmm2,16($key) # round 1 4561 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2 4562 call .Lkey_expansion_256a_cold 4563 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3 4564 call .Lkey_expansion_256b 4565 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4 4566 call .Lkey_expansion_256a 4567 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5 4568 call .Lkey_expansion_256b 4569 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6 4570 call .Lkey_expansion_256a 4571 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7 4572 call .Lkey_expansion_256b 4573 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8 4574 call .Lkey_expansion_256a 4575 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9 4576 call .Lkey_expansion_256b 4577 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10 4578 call .Lkey_expansion_256a 4579 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11 4580 call .Lkey_expansion_256b 4581 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12 4582 call .Lkey_expansion_256a 4583 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13 4584 call .Lkey_expansion_256b 4585 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14 4586 call .Lkey_expansion_256a 4587 $movkey %xmm0,(%rax) 4588 mov $bits,16(%rax) # 240(%rdx) 4589 xor %rax,%rax 4590 jmp .Lenc_key_ret 4591 4592.align 16 4593.L14rounds_alt: 4594 movdqa .Lkey_rotate(%rip),%xmm5 4595 movdqa .Lkey_rcon1(%rip),%xmm4 4596 mov \$7,%r10d 4597 movdqu %xmm0,0($key) 4598 movdqa %xmm2,%xmm1 4599 movdqu %xmm2,16($key) 4600 jmp .Loop_key256 4601 4602.align 16 4603.Loop_key256: 4604 pshufb %xmm5,%xmm2 4605 aesenclast %xmm4,%xmm2 4606 4607 movdqa %xmm0,%xmm3 4608 pslldq \$4,%xmm0 4609 pxor %xmm0,%xmm3 4610 pslldq \$4,%xmm0 4611 pxor %xmm0,%xmm3 4612 pslldq \$4,%xmm0 4613 pxor %xmm3,%xmm0 4614 
pslld \$1,%xmm4 4615 4616 pxor %xmm2,%xmm0 4617 movdqu %xmm0,(%rax) 4618 4619 dec %r10d 4620 jz .Ldone_key256 4621 4622 pshufd \$0xff,%xmm0,%xmm2 4623 pxor %xmm3,%xmm3 4624 aesenclast %xmm3,%xmm2 4625 4626 movdqa %xmm1,%xmm3 4627 pslldq \$4,%xmm1 4628 pxor %xmm1,%xmm3 4629 pslldq \$4,%xmm1 4630 pxor %xmm1,%xmm3 4631 pslldq \$4,%xmm1 4632 pxor %xmm3,%xmm1 4633 4634 pxor %xmm1,%xmm2 4635 movdqu %xmm2,16(%rax) 4636 lea 32(%rax),%rax 4637 movdqa %xmm2,%xmm1 4638 4639 jmp .Loop_key256 4640 4641.Ldone_key256: 4642 mov $bits,16(%rax) # 240($key) 4643 xor %eax,%eax 4644 jmp .Lenc_key_ret 4645 4646.align 16 4647.Lbad_keybits: 4648 mov \$-2,%rax 4649.Lenc_key_ret: 4650 pxor %xmm0,%xmm0 4651 pxor %xmm1,%xmm1 4652 pxor %xmm2,%xmm2 4653 pxor %xmm3,%xmm3 4654 pxor %xmm4,%xmm4 4655 pxor %xmm5,%xmm5 4656 add \$8,%rsp 4657.cfi_adjust_cfa_offset -8 4658 ret 4659.cfi_endproc 4660.LSEH_end_set_encrypt_key: 4661 4662.align 16 4663.Lkey_expansion_128: 4664 $movkey %xmm0,(%rax) 4665 lea 16(%rax),%rax 4666.Lkey_expansion_128_cold: 4667 shufps \$0b00010000,%xmm0,%xmm4 4668 xorps %xmm4, %xmm0 4669 shufps \$0b10001100,%xmm0,%xmm4 4670 xorps %xmm4, %xmm0 4671 shufps \$0b11111111,%xmm1,%xmm1 # critical path 4672 xorps %xmm1,%xmm0 4673 ret 4674 4675.align 16 4676.Lkey_expansion_192a: 4677 $movkey %xmm0,(%rax) 4678 lea 16(%rax),%rax 4679.Lkey_expansion_192a_cold: 4680 movaps %xmm2, %xmm5 4681.Lkey_expansion_192b_warm: 4682 shufps \$0b00010000,%xmm0,%xmm4 4683 movdqa %xmm2,%xmm3 4684 xorps %xmm4,%xmm0 4685 shufps \$0b10001100,%xmm0,%xmm4 4686 pslldq \$4,%xmm3 4687 xorps %xmm4,%xmm0 4688 pshufd \$0b01010101,%xmm1,%xmm1 # critical path 4689 pxor %xmm3,%xmm2 4690 pxor %xmm1,%xmm0 4691 pshufd \$0b11111111,%xmm0,%xmm3 4692 pxor %xmm3,%xmm2 4693 ret 4694 4695.align 16 4696.Lkey_expansion_192b: 4697 movaps %xmm0,%xmm3 4698 shufps \$0b01000100,%xmm0,%xmm5 4699 $movkey %xmm5,(%rax) 4700 shufps \$0b01001110,%xmm2,%xmm3 4701 $movkey %xmm3,16(%rax) 4702 lea 32(%rax),%rax 4703 jmp .Lkey_expansion_192b_warm 4704 4705.align 16 4706.Lkey_expansion_256a: 4707 $movkey %xmm2,(%rax) 4708 lea 16(%rax),%rax 4709.Lkey_expansion_256a_cold: 4710 shufps \$0b00010000,%xmm0,%xmm4 4711 xorps %xmm4,%xmm0 4712 shufps \$0b10001100,%xmm0,%xmm4 4713 xorps %xmm4,%xmm0 4714 shufps \$0b11111111,%xmm1,%xmm1 # critical path 4715 xorps %xmm1,%xmm0 4716 ret 4717 4718.align 16 4719.Lkey_expansion_256b: 4720 $movkey %xmm0,(%rax) 4721 lea 16(%rax),%rax 4722 4723 shufps \$0b00010000,%xmm2,%xmm4 4724 xorps %xmm4,%xmm2 4725 shufps \$0b10001100,%xmm2,%xmm4 4726 xorps %xmm4,%xmm2 4727 shufps \$0b10101010,%xmm1,%xmm1 # critical path 4728 xorps %xmm1,%xmm2 4729 ret 4730.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key 4731.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key 4732___ 4733} 4734 4735$code.=<<___; 4736.align 64 4737.Lbswap_mask: 4738 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 4739.Lincrement32: 4740 .long 6,6,6,0 4741.Lincrement64: 4742 .long 1,0,0,0 4743.Lxts_magic: 4744 .long 0x87,0,1,0 4745.Lincrement1: 4746 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 4747.Lkey_rotate: 4748 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d 4749.Lkey_rotate192: 4750 .long 0x04070605,0x04070605,0x04070605,0x04070605 4751.Lkey_rcon1: 4752 .long 1,1,1,1 4753.Lkey_rcon1b: 4754 .long 0x1b,0x1b,0x1b,0x1b 4755 4756.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" 4757.align 64 4758___ 4759 4760# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 4761# CONTEXT *context,DISPATCHER_CONTEXT *disp) 4762if ($win64) { 4763$rec="%rcx"; 
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
___
$code.=<<___ if ($PREFIX eq "aes_hw");
.type	ecb_ccm64_se_handler,\@abi-omnipotent
.align	16
ecb_ccm64_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	0(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0x58(%rax),%rax		# adjust stack pointer

	jmp	.Lcommon_seh_tail
.size	ecb_ccm64_se_handler,.-ecb_ccm64_se_handler

.type	ctr_xts_se_handler,\@abi-omnipotent
.align	16
ctr_xts_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	lea	-0xa8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbp		# restore saved %rbp
	mov	%rbp,160($context)	# restore context->Rbp
	jmp	.Lcommon_seh_tail
.size	ctr_xts_se_handler,.-ctr_xts_se_handler

___
# BoringSSL omits the OCB functions.
4859$code.=<<___ if (0); 4860.type ocb_se_handler,\@abi-omnipotent 4861.align 16 4862ocb_se_handler: 4863 push %rsi 4864 push %rdi 4865 push %rbx 4866 push %rbp 4867 push %r12 4868 push %r13 4869 push %r14 4870 push %r15 4871 pushfq 4872 sub \$64,%rsp 4873 4874 mov 120($context),%rax # pull context->Rax 4875 mov 248($context),%rbx # pull context->Rip 4876 4877 mov 8($disp),%rsi # disp->ImageBase 4878 mov 56($disp),%r11 # disp->HandlerData 4879 4880 mov 0(%r11),%r10d # HandlerData[0] 4881 lea (%rsi,%r10),%r10 # prologue lable 4882 cmp %r10,%rbx # context->Rip<prologue label 4883 jb .Lcommon_seh_tail 4884 4885 mov 4(%r11),%r10d # HandlerData[1] 4886 lea (%rsi,%r10),%r10 # epilogue label 4887 cmp %r10,%rbx # context->Rip>=epilogue label 4888 jae .Lcommon_seh_tail 4889 4890 mov 8(%r11),%r10d # HandlerData[2] 4891 lea (%rsi,%r10),%r10 4892 cmp %r10,%rbx # context->Rip>=pop label 4893 jae .Locb_no_xmm 4894 4895 mov 152($context),%rax # pull context->Rsp 4896 4897 lea (%rax),%rsi # %xmm save area 4898 lea 512($context),%rdi # & context.Xmm6 4899 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4900 .long 0xa548f3fc # cld; rep movsq 4901 lea 0xa0+0x28(%rax),%rax 4902 4903.Locb_no_xmm: 4904 mov -8(%rax),%rbx 4905 mov -16(%rax),%rbp 4906 mov -24(%rax),%r12 4907 mov -32(%rax),%r13 4908 mov -40(%rax),%r14 4909 4910 mov %rbx,144($context) # restore context->Rbx 4911 mov %rbp,160($context) # restore context->Rbp 4912 mov %r12,216($context) # restore context->R12 4913 mov %r13,224($context) # restore context->R13 4914 mov %r14,232($context) # restore context->R14 4915 4916 jmp .Lcommon_seh_tail 4917.size ocb_se_handler,.-ocb_se_handler 4918___ 4919$code.=<<___; 4920.type cbc_se_handler,\@abi-omnipotent 4921.align 16 4922cbc_se_handler: 4923 push %rsi 4924 push %rdi 4925 push %rbx 4926 push %rbp 4927 push %r12 4928 push %r13 4929 push %r14 4930 push %r15 4931 pushfq 4932 sub \$64,%rsp 4933 4934 mov 152($context),%rax # pull context->Rsp 4935 mov 248($context),%rbx # pull context->Rip 4936 4937 lea .Lcbc_decrypt_bulk(%rip),%r10 4938 cmp %r10,%rbx # context->Rip<"prologue" label 4939 jb .Lcommon_seh_tail 4940 4941 mov 120($context),%rax # pull context->Rax 4942 4943 lea .Lcbc_decrypt_body(%rip),%r10 4944 cmp %r10,%rbx # context->Rip<cbc_decrypt_body 4945 jb .Lcommon_seh_tail 4946 4947 mov 152($context),%rax # pull context->Rsp 4948 4949 lea .Lcbc_ret(%rip),%r10 4950 cmp %r10,%rbx # context->Rip>="epilogue" label 4951 jae .Lcommon_seh_tail 4952 4953 lea 16(%rax),%rsi # %xmm save area 4954 lea 512($context),%rdi # &context.Xmm6 4955 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 4956 .long 0xa548f3fc # cld; rep movsq 4957 4958 mov 208($context),%rax # pull context->R11 4959 4960 mov -8(%rax),%rbp # restore saved %rbp 4961 mov %rbp,160($context) # restore context->Rbp 4962 4963.Lcommon_seh_tail: 4964 mov 8(%rax),%rdi 4965 mov 16(%rax),%rsi 4966 mov %rax,152($context) # restore context->Rsp 4967 mov %rsi,168($context) # restore context->Rsi 4968 mov %rdi,176($context) # restore context->Rdi 4969 4970 mov 40($disp),%rdi # disp->ContextRecord 4971 mov $context,%rsi # context 4972 mov \$154,%ecx # sizeof(CONTEXT) 4973 .long 0xa548f3fc # cld; rep movsq 4974 4975 mov $disp,%rsi 4976 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 4977 mov 8(%rsi),%rdx # arg2, disp->ImageBase 4978 mov 0(%rsi),%r8 # arg3, disp->ControlPc 4979 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 4980 mov 40(%rsi),%r10 # disp->ContextRecord 4981 lea 56(%rsi),%r11 # &disp->HandlerData 4982 lea 24(%rsi),%r12 # &disp->EstablisherFrame 4983 mov %r10,32(%rsp) # 
arg5 4984 mov %r11,40(%rsp) # arg6 4985 mov %r12,48(%rsp) # arg7 4986 mov %rcx,56(%rsp) # arg8, (NULL) 4987 call *__imp_RtlVirtualUnwind(%rip) 4988 4989 mov \$1,%eax # ExceptionContinueSearch 4990 add \$64,%rsp 4991 popfq 4992 pop %r15 4993 pop %r14 4994 pop %r13 4995 pop %r12 4996 pop %rbp 4997 pop %rbx 4998 pop %rdi 4999 pop %rsi 5000 ret 5001.size cbc_se_handler,.-cbc_se_handler 5002 5003.section .pdata 5004.align 4 5005___ 5006$code.=<<___ if ($PREFIX eq "aes_hw"); 5007 .rva .LSEH_begin_${PREFIX}_ecb_encrypt 5008 .rva .LSEH_end_${PREFIX}_ecb_encrypt 5009 .rva .LSEH_info_ecb 5010 5011 .rva .LSEH_begin_${PREFIX}_ctr32_encrypt_blocks 5012 .rva .LSEH_end_${PREFIX}_ctr32_encrypt_blocks 5013 .rva .LSEH_info_ctr32 5014___ 5015$code.=<<___; 5016 .rva .LSEH_begin_${PREFIX}_cbc_encrypt 5017 .rva .LSEH_end_${PREFIX}_cbc_encrypt 5018 .rva .LSEH_info_cbc 5019 5020 .rva ${PREFIX}_set_decrypt_key 5021 .rva .LSEH_end_set_decrypt_key 5022 .rva .LSEH_info_key 5023 5024 .rva ${PREFIX}_set_encrypt_key 5025 .rva .LSEH_end_set_encrypt_key 5026 .rva .LSEH_info_key 5027.section .xdata 5028.align 8 5029___ 5030$code.=<<___ if ($PREFIX eq "aes_hw"); 5031.LSEH_info_ecb: 5032 .byte 9,0,0,0 5033 .rva ecb_ccm64_se_handler 5034 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[] 5035.LSEH_info_ctr32: 5036 .byte 9,0,0,0 5037 .rva ctr_xts_se_handler 5038 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[] 5039___ 5040$code.=<<___; 5041.LSEH_info_cbc: 5042 .byte 9,0,0,0 5043 .rva cbc_se_handler 5044.LSEH_info_key: 5045 .byte 0x01,0x04,0x01,0x00 5046 .byte 0x04,0x02,0x00,0x00 # sub rsp,8 5047___ 5048} 5049 5050sub rex { 5051 local *opcode=shift; 5052 my ($dst,$src)=@_; 5053 my $rex=0; 5054 5055 $rex|=0x04 if($dst>=8); 5056 $rex|=0x01 if($src>=8); 5057 push @opcode,$rex|0x40 if($rex); 5058} 5059 5060sub aesni { 5061 my $line=shift; 5062 my @opcode=(0x66); 5063 5064 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { 5065 rex(\@opcode,$4,$3); 5066 push @opcode,0x0f,0x3a,0xdf; 5067 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M 5068 my $c=$2; 5069 push @opcode,$c=~/^0/?oct($c):$c; 5070 return ".byte\t".join(',',@opcode); 5071 } 5072 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { 5073 my %opcodelet = ( 5074 "aesimc" => 0xdb, 5075 "aesenc" => 0xdc, "aesenclast" => 0xdd, 5076 "aesdec" => 0xde, "aesdeclast" => 0xdf 5077 ); 5078 return undef if (!defined($opcodelet{$1})); 5079 rex(\@opcode,$3,$2); 5080 push @opcode,0x0f,0x38,$opcodelet{$1}; 5081 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M 5082 return ".byte\t".join(',',@opcode); 5083 } 5084 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { 5085 my %opcodelet = ( 5086 "aesenc" => 0xdc, "aesenclast" => 0xdd, 5087 "aesdec" => 0xde, "aesdeclast" => 0xdf 5088 ); 5089 return undef if (!defined($opcodelet{$1})); 5090 my $off = $2; 5091 push @opcode,0x44 if ($3>=8); 5092 push @opcode,0x0f,0x38,$opcodelet{$1}; 5093 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M 5094 push @opcode,($off=~/^0/?oct($off):$off)&0xff; 5095 return ".byte\t".join(',',@opcode); 5096 } 5097 return $line; 5098} 5099 5100sub movbe { 5101 ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift; 5102} 5103 5104$code =~ s/\`([^\`]*)\`/eval($1)/gem; 5105$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; 5106#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact 5107$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem; 5108 5109print $code; 5110 5111close STDOUT; 5112
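
# Worked example (a hedged note, not executed code): under the substitutions
# above, a register-to-register instruction such as "aesenc %xmm1,%xmm0" is
# rewritten by aesni() into its raw encoding ".byte 0x66,0x0f,0x38,0xdc,0xc1",
# so the generated file still assembles on toolchains whose assembler lacks
# the AES-NI mnemonics.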