1#! /usr/bin/env perl 2# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# This module implements support for Intel AES-NI extension. In 18# OpenSSL context it's used with Intel engine, but can also be used as 19# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for 20# details]. 21# 22# Performance. 23# 24# To start with see corresponding paragraph in aesni-x86_64.pl... 25# Instead of filling table similar to one found there I've chosen to 26# summarize *comparison* results for raw ECB, CTR and CBC benchmarks. 27# The simplified table below represents 32-bit performance relative 28# to 64-bit one in every given point. Ratios vary for different 29# encryption modes, therefore interval values. 30# 31# 16-byte 64-byte 256-byte 1-KB 8-KB 32# 53-67% 67-84% 91-94% 95-98% 97-99.5% 33# 34# Lower ratios for smaller block sizes are perfectly understandable, 35# because function call overhead is higher in 32-bit mode. Largest 36# 8-KB block performance is virtually same: 32-bit code is less than 37# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. 38 39# January 2011 40# 41# See aesni-x86_64.pl for details. Unlike x86_64 version this module 42# interleaves at most 6 aes[enc|dec] instructions, because there are 43# not enough registers for 8x interleave [which should be optimal for 44# Sandy Bridge]. Actually, performance results for 6x interleave 45# factor presented in aesni-x86_64.pl (except for CTR) are for this 46# module. 47 48# April 2011 49# 50# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing 51# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. 52 53# November 2015 54# 55# Add aesni_ocb_[en|de]crypt. [Removed in BoringSSL] 56 57###################################################################### 58# Current large-block performance in cycles per byte processed with 59# 128-bit key (less is better). 60# 61# CBC en-/decrypt CTR XTS ECB OCB 62# Westmere 3.77/1.37 1.37 1.52 1.27 63# * Bridge 5.07/0.98 0.99 1.09 0.91 1.10 64# Haswell 4.44/0.80 0.97 1.03 0.72 0.76 65# Skylake 2.68/0.65 0.65 0.66 0.64 0.66 66# Silvermont 5.77/3.56 3.67 4.03 3.46 4.03 67# Goldmont 3.84/1.39 1.39 1.63 1.31 1.70 68# Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23 69 70$PREFIX="aes_hw"; # if $PREFIX is set to "AES", the script 71 # generates drop-in replacement for 72 # crypto/aes/asm/aes-586.pl:-) 73$AESNI_PREFIX="aes_hw"; 74$inline=1; # inline _aesni_[en|de]crypt 75 76$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 77push(@INC,"${dir}","${dir}../../../perlasm"); 78require "x86asm.pl"; 79 80$output = pop; 81open OUT,">$output"; 82*STDOUT=*OUT; 83 84&asm_init($ARGV[0]); 85 86&external_label("OPENSSL_ia32cap_P"); 87&preprocessor_ifndef("NDEBUG") 88&external_label("BORINGSSL_function_hit"); 89&preprocessor_endif(); 90&static_label("key_const"); 91 92if ($PREFIX eq $AESNI_PREFIX) { $movekey=\&movups; } 93else { $movekey=\&movups; } 94 95$len="eax"; 96$rounds="ecx"; 97$key="edx"; 98$inp="esi"; 99$out="edi"; 100$rounds_="ebx"; # backup copy for $rounds 101$key_="ebp"; # backup copy for $key 102 103$rndkey0="xmm0"; 104$rndkey1="xmm1"; 105$inout0="xmm2"; 106$inout1="xmm3"; 107$inout2="xmm4"; 108$inout3="xmm5"; $in1="xmm5"; 109$inout4="xmm6"; $in0="xmm6"; 110$inout5="xmm7"; $ivec="xmm7"; 111 112# AESNI extension 113sub aeskeygenassist 114{ my($dst,$src,$imm)=@_; 115 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) 116 { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); } 117} 118sub aescommon 119{ my($opcodelet,$dst,$src)=@_; 120 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) 121 { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);} 122} 123sub aesimc { aescommon(0xdb,@_); } 124sub aesenc { aescommon(0xdc,@_); } 125sub aesenclast { aescommon(0xdd,@_); } 126sub aesdec { aescommon(0xde,@_); } 127sub aesdeclast { aescommon(0xdf,@_); } 128 129# Inline version of internal aesni_[en|de]crypt1 130{ my $sn; 131sub aesni_inline_generate1 132{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); 133 $sn++; 134 135 &$movekey ($rndkey0,&QWP(0,$key)); 136 &$movekey ($rndkey1,&QWP(16,$key)); 137 &xorps ($ivec,$rndkey0) if (defined($ivec)); 138 &lea ($key,&DWP(32,$key)); 139 &xorps ($inout,$ivec) if (defined($ivec)); 140 &xorps ($inout,$rndkey0) if (!defined($ivec)); 141 &set_label("${p}1_loop_$sn"); 142 eval"&aes${p} ($inout,$rndkey1)"; 143 &dec ($rounds); 144 &$movekey ($rndkey1,&QWP(0,$key)); 145 &lea ($key,&DWP(16,$key)); 146 &jnz (&label("${p}1_loop_$sn")); 147 eval"&aes${p}last ($inout,$rndkey1)"; 148}} 149 150sub aesni_generate1 # fully unrolled loop 151{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); 152 153 &function_begin_B("_aesni_${p}rypt1"); 154 &movups ($rndkey0,&QWP(0,$key)); 155 &$movekey ($rndkey1,&QWP(0x10,$key)); 156 &xorps ($inout,$rndkey0); 157 &$movekey ($rndkey0,&QWP(0x20,$key)); 158 &lea ($key,&DWP(0x30,$key)); 159 &cmp ($rounds,11); 160 &jb (&label("${p}128")); 161 &lea ($key,&DWP(0x20,$key)); 162 &je (&label("${p}192")); 163 &lea ($key,&DWP(0x20,$key)); 164 eval"&aes${p} ($inout,$rndkey1)"; 165 &$movekey ($rndkey1,&QWP(-0x40,$key)); 166 eval"&aes${p} ($inout,$rndkey0)"; 167 &$movekey ($rndkey0,&QWP(-0x30,$key)); 168 &set_label("${p}192"); 169 eval"&aes${p} ($inout,$rndkey1)"; 170 &$movekey ($rndkey1,&QWP(-0x20,$key)); 171 eval"&aes${p} ($inout,$rndkey0)"; 172 &$movekey ($rndkey0,&QWP(-0x10,$key)); 173 &set_label("${p}128"); 174 eval"&aes${p} ($inout,$rndkey1)"; 175 &$movekey ($rndkey1,&QWP(0,$key)); 176 eval"&aes${p} ($inout,$rndkey0)"; 177 &$movekey ($rndkey0,&QWP(0x10,$key)); 178 eval"&aes${p} ($inout,$rndkey1)"; 179 &$movekey ($rndkey1,&QWP(0x20,$key)); 180 eval"&aes${p} ($inout,$rndkey0)"; 181 &$movekey ($rndkey0,&QWP(0x30,$key)); 182 eval"&aes${p} ($inout,$rndkey1)"; 183 &$movekey ($rndkey1,&QWP(0x40,$key)); 184 eval"&aes${p} ($inout,$rndkey0)"; 185 &$movekey ($rndkey0,&QWP(0x50,$key)); 186 eval"&aes${p} ($inout,$rndkey1)"; 187 &$movekey ($rndkey1,&QWP(0x60,$key)); 188 eval"&aes${p} ($inout,$rndkey0)"; 189 &$movekey ($rndkey0,&QWP(0x70,$key)); 190 eval"&aes${p} ($inout,$rndkey1)"; 191 eval"&aes${p}last ($inout,$rndkey0)"; 192 &ret(); 193 &function_end_B("_aesni_${p}rypt1"); 194} 195 196# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); 197&aesni_generate1("enc") if (!$inline); 198&function_begin_B("${PREFIX}_encrypt"); 199 &record_function_hit(1); 200 201 &mov ("eax",&wparam(0)); 202 &mov ($key,&wparam(2)); 203 &movups ($inout0,&QWP(0,"eax")); 204 &mov ($rounds,&DWP(240,$key)); 205 &mov ("eax",&wparam(1)); 206 if ($inline) 207 { &aesni_inline_generate1("enc"); } 208 else 209 { &call ("_aesni_encrypt1"); } 210 &pxor ($rndkey0,$rndkey0); # clear register bank 211 &pxor ($rndkey1,$rndkey1); 212 &movups (&QWP(0,"eax"),$inout0); 213 &pxor ($inout0,$inout0); 214 &ret (); 215&function_end_B("${PREFIX}_encrypt"); 216 217# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); 218&aesni_generate1("dec") if(!$inline); 219&function_begin_B("${PREFIX}_decrypt"); 220 &mov ("eax",&wparam(0)); 221 &mov ($key,&wparam(2)); 222 &movups ($inout0,&QWP(0,"eax")); 223 &mov ($rounds,&DWP(240,$key)); 224 &mov ("eax",&wparam(1)); 225 if ($inline) 226 { &aesni_inline_generate1("dec"); } 227 else 228 { &call ("_aesni_decrypt1"); } 229 &pxor ($rndkey0,$rndkey0); # clear register bank 230 &pxor ($rndkey1,$rndkey1); 231 &movups (&QWP(0,"eax"),$inout0); 232 &pxor ($inout0,$inout0); 233 &ret (); 234&function_end_B("${PREFIX}_decrypt"); 235 236# _aesni_[en|de]cryptN are private interfaces, N denotes interleave 237# factor. Why 3x subroutine were originally used in loops? Even though 238# aes[enc|dec] latency was originally 6, it could be scheduled only 239# every *2nd* cycle. Thus 3x interleave was the one providing optimal 240# utilization, i.e. when subroutine's throughput is virtually same as 241# of non-interleaved subroutine [for number of input blocks up to 3]. 242# This is why it originally made no sense to implement 2x subroutine. 243# But times change and it became appropriate to spend extra 192 bytes 244# on 2x subroutine on Atom Silvermont account. For processors that 245# can schedule aes[enc|dec] every cycle optimal interleave factor 246# equals to corresponding instructions latency. 8x is optimal for 247# * Bridge, but it's unfeasible to accommodate such implementation 248# in XMM registers addressable in 32-bit mode and therefore maximum 249# of 6x is used instead... 250 251sub aesni_generate2 252{ my $p=shift; 253 254 &function_begin_B("_aesni_${p}rypt2"); 255 &$movekey ($rndkey0,&QWP(0,$key)); 256 &shl ($rounds,4); 257 &$movekey ($rndkey1,&QWP(16,$key)); 258 &xorps ($inout0,$rndkey0); 259 &pxor ($inout1,$rndkey0); 260 &$movekey ($rndkey0,&QWP(32,$key)); 261 &lea ($key,&DWP(32,$key,$rounds)); 262 &neg ($rounds); 263 &add ($rounds,16); 264 265 &set_label("${p}2_loop"); 266 eval"&aes${p} ($inout0,$rndkey1)"; 267 eval"&aes${p} ($inout1,$rndkey1)"; 268 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 269 &add ($rounds,32); 270 eval"&aes${p} ($inout0,$rndkey0)"; 271 eval"&aes${p} ($inout1,$rndkey0)"; 272 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 273 &jnz (&label("${p}2_loop")); 274 eval"&aes${p} ($inout0,$rndkey1)"; 275 eval"&aes${p} ($inout1,$rndkey1)"; 276 eval"&aes${p}last ($inout0,$rndkey0)"; 277 eval"&aes${p}last ($inout1,$rndkey0)"; 278 &ret(); 279 &function_end_B("_aesni_${p}rypt2"); 280} 281 282sub aesni_generate3 283{ my $p=shift; 284 285 &function_begin_B("_aesni_${p}rypt3"); 286 &$movekey ($rndkey0,&QWP(0,$key)); 287 &shl ($rounds,4); 288 &$movekey ($rndkey1,&QWP(16,$key)); 289 &xorps ($inout0,$rndkey0); 290 &pxor ($inout1,$rndkey0); 291 &pxor ($inout2,$rndkey0); 292 &$movekey ($rndkey0,&QWP(32,$key)); 293 &lea ($key,&DWP(32,$key,$rounds)); 294 &neg ($rounds); 295 &add ($rounds,16); 296 297 &set_label("${p}3_loop"); 298 eval"&aes${p} ($inout0,$rndkey1)"; 299 eval"&aes${p} ($inout1,$rndkey1)"; 300 eval"&aes${p} ($inout2,$rndkey1)"; 301 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 302 &add ($rounds,32); 303 eval"&aes${p} ($inout0,$rndkey0)"; 304 eval"&aes${p} ($inout1,$rndkey0)"; 305 eval"&aes${p} ($inout2,$rndkey0)"; 306 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 307 &jnz (&label("${p}3_loop")); 308 eval"&aes${p} ($inout0,$rndkey1)"; 309 eval"&aes${p} ($inout1,$rndkey1)"; 310 eval"&aes${p} ($inout2,$rndkey1)"; 311 eval"&aes${p}last ($inout0,$rndkey0)"; 312 eval"&aes${p}last ($inout1,$rndkey0)"; 313 eval"&aes${p}last ($inout2,$rndkey0)"; 314 &ret(); 315 &function_end_B("_aesni_${p}rypt3"); 316} 317 318# 4x interleave is implemented to improve small block performance, 319# most notably [and naturally] 4 block by ~30%. One can argue that one 320# should have implemented 5x as well, but improvement would be <20%, 321# so it's not worth it... 322sub aesni_generate4 323{ my $p=shift; 324 325 &function_begin_B("_aesni_${p}rypt4"); 326 &$movekey ($rndkey0,&QWP(0,$key)); 327 &$movekey ($rndkey1,&QWP(16,$key)); 328 &shl ($rounds,4); 329 &xorps ($inout0,$rndkey0); 330 &pxor ($inout1,$rndkey0); 331 &pxor ($inout2,$rndkey0); 332 &pxor ($inout3,$rndkey0); 333 &$movekey ($rndkey0,&QWP(32,$key)); 334 &lea ($key,&DWP(32,$key,$rounds)); 335 &neg ($rounds); 336 &data_byte (0x0f,0x1f,0x40,0x00); 337 &add ($rounds,16); 338 339 &set_label("${p}4_loop"); 340 eval"&aes${p} ($inout0,$rndkey1)"; 341 eval"&aes${p} ($inout1,$rndkey1)"; 342 eval"&aes${p} ($inout2,$rndkey1)"; 343 eval"&aes${p} ($inout3,$rndkey1)"; 344 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 345 &add ($rounds,32); 346 eval"&aes${p} ($inout0,$rndkey0)"; 347 eval"&aes${p} ($inout1,$rndkey0)"; 348 eval"&aes${p} ($inout2,$rndkey0)"; 349 eval"&aes${p} ($inout3,$rndkey0)"; 350 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 351 &jnz (&label("${p}4_loop")); 352 353 eval"&aes${p} ($inout0,$rndkey1)"; 354 eval"&aes${p} ($inout1,$rndkey1)"; 355 eval"&aes${p} ($inout2,$rndkey1)"; 356 eval"&aes${p} ($inout3,$rndkey1)"; 357 eval"&aes${p}last ($inout0,$rndkey0)"; 358 eval"&aes${p}last ($inout1,$rndkey0)"; 359 eval"&aes${p}last ($inout2,$rndkey0)"; 360 eval"&aes${p}last ($inout3,$rndkey0)"; 361 &ret(); 362 &function_end_B("_aesni_${p}rypt4"); 363} 364 365sub aesni_generate6 366{ my $p=shift; 367 368 &function_begin_B("_aesni_${p}rypt6"); 369 &static_label("_aesni_${p}rypt6_enter"); 370 &$movekey ($rndkey0,&QWP(0,$key)); 371 &shl ($rounds,4); 372 &$movekey ($rndkey1,&QWP(16,$key)); 373 &xorps ($inout0,$rndkey0); 374 &pxor ($inout1,$rndkey0); # pxor does better here 375 &pxor ($inout2,$rndkey0); 376 eval"&aes${p} ($inout0,$rndkey1)"; 377 &pxor ($inout3,$rndkey0); 378 &pxor ($inout4,$rndkey0); 379 eval"&aes${p} ($inout1,$rndkey1)"; 380 &lea ($key,&DWP(32,$key,$rounds)); 381 &neg ($rounds); 382 eval"&aes${p} ($inout2,$rndkey1)"; 383 &pxor ($inout5,$rndkey0); 384 &$movekey ($rndkey0,&QWP(0,$key,$rounds)); 385 &add ($rounds,16); 386 &jmp (&label("_aesni_${p}rypt6_inner")); 387 388 &set_label("${p}6_loop",16); 389 eval"&aes${p} ($inout0,$rndkey1)"; 390 eval"&aes${p} ($inout1,$rndkey1)"; 391 eval"&aes${p} ($inout2,$rndkey1)"; 392 &set_label("_aesni_${p}rypt6_inner"); 393 eval"&aes${p} ($inout3,$rndkey1)"; 394 eval"&aes${p} ($inout4,$rndkey1)"; 395 eval"&aes${p} ($inout5,$rndkey1)"; 396 &set_label("_aesni_${p}rypt6_enter"); 397 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 398 &add ($rounds,32); 399 eval"&aes${p} ($inout0,$rndkey0)"; 400 eval"&aes${p} ($inout1,$rndkey0)"; 401 eval"&aes${p} ($inout2,$rndkey0)"; 402 eval"&aes${p} ($inout3,$rndkey0)"; 403 eval"&aes${p} ($inout4,$rndkey0)"; 404 eval"&aes${p} ($inout5,$rndkey0)"; 405 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 406 &jnz (&label("${p}6_loop")); 407 408 eval"&aes${p} ($inout0,$rndkey1)"; 409 eval"&aes${p} ($inout1,$rndkey1)"; 410 eval"&aes${p} ($inout2,$rndkey1)"; 411 eval"&aes${p} ($inout3,$rndkey1)"; 412 eval"&aes${p} ($inout4,$rndkey1)"; 413 eval"&aes${p} ($inout5,$rndkey1)"; 414 eval"&aes${p}last ($inout0,$rndkey0)"; 415 eval"&aes${p}last ($inout1,$rndkey0)"; 416 eval"&aes${p}last ($inout2,$rndkey0)"; 417 eval"&aes${p}last ($inout3,$rndkey0)"; 418 eval"&aes${p}last ($inout4,$rndkey0)"; 419 eval"&aes${p}last ($inout5,$rndkey0)"; 420 &ret(); 421 &function_end_B("_aesni_${p}rypt6"); 422} 423&aesni_generate2("enc") if ($PREFIX eq $AESNI_PREFIX); 424&aesni_generate2("dec"); 425&aesni_generate3("enc") if ($PREFIX eq $AESNI_PREFIX); 426&aesni_generate3("dec"); 427&aesni_generate4("enc") if ($PREFIX eq $AESNI_PREFIX); 428&aesni_generate4("dec"); 429&aesni_generate6("enc") if ($PREFIX eq $AESNI_PREFIX); 430&aesni_generate6("dec"); 431 432if ($PREFIX eq $AESNI_PREFIX) { 433###################################################################### 434# void aes_hw_ecb_encrypt (const void *in, void *out, 435# size_t length, const AES_KEY *key, 436# int enc); 437&function_begin("${PREFIX}_ecb_encrypt"); 438 &mov ($inp,&wparam(0)); 439 &mov ($out,&wparam(1)); 440 &mov ($len,&wparam(2)); 441 &mov ($key,&wparam(3)); 442 &mov ($rounds_,&wparam(4)); 443 &and ($len,-16); 444 &jz (&label("ecb_ret")); 445 &mov ($rounds,&DWP(240,$key)); 446 &test ($rounds_,$rounds_); 447 &jz (&label("ecb_decrypt")); 448 449 &mov ($key_,$key); # backup $key 450 &mov ($rounds_,$rounds); # backup $rounds 451 &cmp ($len,0x60); 452 &jb (&label("ecb_enc_tail")); 453 454 &movdqu ($inout0,&QWP(0,$inp)); 455 &movdqu ($inout1,&QWP(0x10,$inp)); 456 &movdqu ($inout2,&QWP(0x20,$inp)); 457 &movdqu ($inout3,&QWP(0x30,$inp)); 458 &movdqu ($inout4,&QWP(0x40,$inp)); 459 &movdqu ($inout5,&QWP(0x50,$inp)); 460 &lea ($inp,&DWP(0x60,$inp)); 461 &sub ($len,0x60); 462 &jmp (&label("ecb_enc_loop6_enter")); 463 464&set_label("ecb_enc_loop6",16); 465 &movups (&QWP(0,$out),$inout0); 466 &movdqu ($inout0,&QWP(0,$inp)); 467 &movups (&QWP(0x10,$out),$inout1); 468 &movdqu ($inout1,&QWP(0x10,$inp)); 469 &movups (&QWP(0x20,$out),$inout2); 470 &movdqu ($inout2,&QWP(0x20,$inp)); 471 &movups (&QWP(0x30,$out),$inout3); 472 &movdqu ($inout3,&QWP(0x30,$inp)); 473 &movups (&QWP(0x40,$out),$inout4); 474 &movdqu ($inout4,&QWP(0x40,$inp)); 475 &movups (&QWP(0x50,$out),$inout5); 476 &lea ($out,&DWP(0x60,$out)); 477 &movdqu ($inout5,&QWP(0x50,$inp)); 478 &lea ($inp,&DWP(0x60,$inp)); 479&set_label("ecb_enc_loop6_enter"); 480 481 &call ("_aesni_encrypt6"); 482 483 &mov ($key,$key_); # restore $key 484 &mov ($rounds,$rounds_); # restore $rounds 485 &sub ($len,0x60); 486 &jnc (&label("ecb_enc_loop6")); 487 488 &movups (&QWP(0,$out),$inout0); 489 &movups (&QWP(0x10,$out),$inout1); 490 &movups (&QWP(0x20,$out),$inout2); 491 &movups (&QWP(0x30,$out),$inout3); 492 &movups (&QWP(0x40,$out),$inout4); 493 &movups (&QWP(0x50,$out),$inout5); 494 &lea ($out,&DWP(0x60,$out)); 495 &add ($len,0x60); 496 &jz (&label("ecb_ret")); 497 498&set_label("ecb_enc_tail"); 499 &movups ($inout0,&QWP(0,$inp)); 500 &cmp ($len,0x20); 501 &jb (&label("ecb_enc_one")); 502 &movups ($inout1,&QWP(0x10,$inp)); 503 &je (&label("ecb_enc_two")); 504 &movups ($inout2,&QWP(0x20,$inp)); 505 &cmp ($len,0x40); 506 &jb (&label("ecb_enc_three")); 507 &movups ($inout3,&QWP(0x30,$inp)); 508 &je (&label("ecb_enc_four")); 509 &movups ($inout4,&QWP(0x40,$inp)); 510 &xorps ($inout5,$inout5); 511 &call ("_aesni_encrypt6"); 512 &movups (&QWP(0,$out),$inout0); 513 &movups (&QWP(0x10,$out),$inout1); 514 &movups (&QWP(0x20,$out),$inout2); 515 &movups (&QWP(0x30,$out),$inout3); 516 &movups (&QWP(0x40,$out),$inout4); 517 jmp (&label("ecb_ret")); 518 519&set_label("ecb_enc_one",16); 520 if ($inline) 521 { &aesni_inline_generate1("enc"); } 522 else 523 { &call ("_aesni_encrypt1"); } 524 &movups (&QWP(0,$out),$inout0); 525 &jmp (&label("ecb_ret")); 526 527&set_label("ecb_enc_two",16); 528 &call ("_aesni_encrypt2"); 529 &movups (&QWP(0,$out),$inout0); 530 &movups (&QWP(0x10,$out),$inout1); 531 &jmp (&label("ecb_ret")); 532 533&set_label("ecb_enc_three",16); 534 &call ("_aesni_encrypt3"); 535 &movups (&QWP(0,$out),$inout0); 536 &movups (&QWP(0x10,$out),$inout1); 537 &movups (&QWP(0x20,$out),$inout2); 538 &jmp (&label("ecb_ret")); 539 540&set_label("ecb_enc_four",16); 541 &call ("_aesni_encrypt4"); 542 &movups (&QWP(0,$out),$inout0); 543 &movups (&QWP(0x10,$out),$inout1); 544 &movups (&QWP(0x20,$out),$inout2); 545 &movups (&QWP(0x30,$out),$inout3); 546 &jmp (&label("ecb_ret")); 547###################################################################### 548&set_label("ecb_decrypt",16); 549 &mov ($key_,$key); # backup $key 550 &mov ($rounds_,$rounds); # backup $rounds 551 &cmp ($len,0x60); 552 &jb (&label("ecb_dec_tail")); 553 554 &movdqu ($inout0,&QWP(0,$inp)); 555 &movdqu ($inout1,&QWP(0x10,$inp)); 556 &movdqu ($inout2,&QWP(0x20,$inp)); 557 &movdqu ($inout3,&QWP(0x30,$inp)); 558 &movdqu ($inout4,&QWP(0x40,$inp)); 559 &movdqu ($inout5,&QWP(0x50,$inp)); 560 &lea ($inp,&DWP(0x60,$inp)); 561 &sub ($len,0x60); 562 &jmp (&label("ecb_dec_loop6_enter")); 563 564&set_label("ecb_dec_loop6",16); 565 &movups (&QWP(0,$out),$inout0); 566 &movdqu ($inout0,&QWP(0,$inp)); 567 &movups (&QWP(0x10,$out),$inout1); 568 &movdqu ($inout1,&QWP(0x10,$inp)); 569 &movups (&QWP(0x20,$out),$inout2); 570 &movdqu ($inout2,&QWP(0x20,$inp)); 571 &movups (&QWP(0x30,$out),$inout3); 572 &movdqu ($inout3,&QWP(0x30,$inp)); 573 &movups (&QWP(0x40,$out),$inout4); 574 &movdqu ($inout4,&QWP(0x40,$inp)); 575 &movups (&QWP(0x50,$out),$inout5); 576 &lea ($out,&DWP(0x60,$out)); 577 &movdqu ($inout5,&QWP(0x50,$inp)); 578 &lea ($inp,&DWP(0x60,$inp)); 579&set_label("ecb_dec_loop6_enter"); 580 581 &call ("_aesni_decrypt6"); 582 583 &mov ($key,$key_); # restore $key 584 &mov ($rounds,$rounds_); # restore $rounds 585 &sub ($len,0x60); 586 &jnc (&label("ecb_dec_loop6")); 587 588 &movups (&QWP(0,$out),$inout0); 589 &movups (&QWP(0x10,$out),$inout1); 590 &movups (&QWP(0x20,$out),$inout2); 591 &movups (&QWP(0x30,$out),$inout3); 592 &movups (&QWP(0x40,$out),$inout4); 593 &movups (&QWP(0x50,$out),$inout5); 594 &lea ($out,&DWP(0x60,$out)); 595 &add ($len,0x60); 596 &jz (&label("ecb_ret")); 597 598&set_label("ecb_dec_tail"); 599 &movups ($inout0,&QWP(0,$inp)); 600 &cmp ($len,0x20); 601 &jb (&label("ecb_dec_one")); 602 &movups ($inout1,&QWP(0x10,$inp)); 603 &je (&label("ecb_dec_two")); 604 &movups ($inout2,&QWP(0x20,$inp)); 605 &cmp ($len,0x40); 606 &jb (&label("ecb_dec_three")); 607 &movups ($inout3,&QWP(0x30,$inp)); 608 &je (&label("ecb_dec_four")); 609 &movups ($inout4,&QWP(0x40,$inp)); 610 &xorps ($inout5,$inout5); 611 &call ("_aesni_decrypt6"); 612 &movups (&QWP(0,$out),$inout0); 613 &movups (&QWP(0x10,$out),$inout1); 614 &movups (&QWP(0x20,$out),$inout2); 615 &movups (&QWP(0x30,$out),$inout3); 616 &movups (&QWP(0x40,$out),$inout4); 617 &jmp (&label("ecb_ret")); 618 619&set_label("ecb_dec_one",16); 620 if ($inline) 621 { &aesni_inline_generate1("dec"); } 622 else 623 { &call ("_aesni_decrypt1"); } 624 &movups (&QWP(0,$out),$inout0); 625 &jmp (&label("ecb_ret")); 626 627&set_label("ecb_dec_two",16); 628 &call ("_aesni_decrypt2"); 629 &movups (&QWP(0,$out),$inout0); 630 &movups (&QWP(0x10,$out),$inout1); 631 &jmp (&label("ecb_ret")); 632 633&set_label("ecb_dec_three",16); 634 &call ("_aesni_decrypt3"); 635 &movups (&QWP(0,$out),$inout0); 636 &movups (&QWP(0x10,$out),$inout1); 637 &movups (&QWP(0x20,$out),$inout2); 638 &jmp (&label("ecb_ret")); 639 640&set_label("ecb_dec_four",16); 641 &call ("_aesni_decrypt4"); 642 &movups (&QWP(0,$out),$inout0); 643 &movups (&QWP(0x10,$out),$inout1); 644 &movups (&QWP(0x20,$out),$inout2); 645 &movups (&QWP(0x30,$out),$inout3); 646 647&set_label("ecb_ret"); 648 &pxor ("xmm0","xmm0"); # clear register bank 649 &pxor ("xmm1","xmm1"); 650 &pxor ("xmm2","xmm2"); 651 &pxor ("xmm3","xmm3"); 652 &pxor ("xmm4","xmm4"); 653 &pxor ("xmm5","xmm5"); 654 &pxor ("xmm6","xmm6"); 655 &pxor ("xmm7","xmm7"); 656&function_end("${PREFIX}_ecb_encrypt"); 657 658###################################################################### 659# void aes_hw_ccm64_[en|de]crypt_blocks (const void *in, void *out, 660# size_t blocks, const AES_KEY *key, 661# const char *ivec,char *cmac); 662# 663# Handles only complete blocks, operates on 64-bit counter and 664# does not update *ivec! Nor does it finalize CMAC value 665# (see engine/eng_aesni.c for details) 666# 667{ my $cmac=$inout1; 668&function_begin("${PREFIX}_ccm64_encrypt_blocks"); 669 &mov ($inp,&wparam(0)); 670 &mov ($out,&wparam(1)); 671 &mov ($len,&wparam(2)); 672 &mov ($key,&wparam(3)); 673 &mov ($rounds_,&wparam(4)); 674 &mov ($rounds,&wparam(5)); 675 &mov ($key_,"esp"); 676 &sub ("esp",60); 677 &and ("esp",-16); # align stack 678 &mov (&DWP(48,"esp"),$key_); 679 680 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec 681 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac 682 &mov ($rounds,&DWP(240,$key)); 683 684 # compose byte-swap control mask for pshufb on stack 685 &mov (&DWP(0,"esp"),0x0c0d0e0f); 686 &mov (&DWP(4,"esp"),0x08090a0b); 687 &mov (&DWP(8,"esp"),0x04050607); 688 &mov (&DWP(12,"esp"),0x00010203); 689 690 # compose counter increment vector on stack 691 &mov ($rounds_,1); 692 &xor ($key_,$key_); 693 &mov (&DWP(16,"esp"),$rounds_); 694 &mov (&DWP(20,"esp"),$key_); 695 &mov (&DWP(24,"esp"),$key_); 696 &mov (&DWP(28,"esp"),$key_); 697 698 &shl ($rounds,4); 699 &mov ($rounds_,16); 700 &lea ($key_,&DWP(0,$key)); 701 &movdqa ($inout3,&QWP(0,"esp")); 702 &movdqa ($inout0,$ivec); 703 &lea ($key,&DWP(32,$key,$rounds)); 704 &sub ($rounds_,$rounds); 705 &pshufb ($ivec,$inout3); 706 707&set_label("ccm64_enc_outer"); 708 &$movekey ($rndkey0,&QWP(0,$key_)); 709 &mov ($rounds,$rounds_); 710 &movups ($in0,&QWP(0,$inp)); 711 712 &xorps ($inout0,$rndkey0); 713 &$movekey ($rndkey1,&QWP(16,$key_)); 714 &xorps ($rndkey0,$in0); 715 &xorps ($cmac,$rndkey0); # cmac^=inp 716 &$movekey ($rndkey0,&QWP(32,$key_)); 717 718&set_label("ccm64_enc2_loop"); 719 &aesenc ($inout0,$rndkey1); 720 &aesenc ($cmac,$rndkey1); 721 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 722 &add ($rounds,32); 723 &aesenc ($inout0,$rndkey0); 724 &aesenc ($cmac,$rndkey0); 725 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 726 &jnz (&label("ccm64_enc2_loop")); 727 &aesenc ($inout0,$rndkey1); 728 &aesenc ($cmac,$rndkey1); 729 &paddq ($ivec,&QWP(16,"esp")); 730 &dec ($len); 731 &aesenclast ($inout0,$rndkey0); 732 &aesenclast ($cmac,$rndkey0); 733 734 &lea ($inp,&DWP(16,$inp)); 735 &xorps ($in0,$inout0); # inp^=E(ivec) 736 &movdqa ($inout0,$ivec); 737 &movups (&QWP(0,$out),$in0); # save output 738 &pshufb ($inout0,$inout3); 739 &lea ($out,&DWP(16,$out)); 740 &jnz (&label("ccm64_enc_outer")); 741 742 &mov ("esp",&DWP(48,"esp")); 743 &mov ($out,&wparam(5)); 744 &movups (&QWP(0,$out),$cmac); 745 746 &pxor ("xmm0","xmm0"); # clear register bank 747 &pxor ("xmm1","xmm1"); 748 &pxor ("xmm2","xmm2"); 749 &pxor ("xmm3","xmm3"); 750 &pxor ("xmm4","xmm4"); 751 &pxor ("xmm5","xmm5"); 752 &pxor ("xmm6","xmm6"); 753 &pxor ("xmm7","xmm7"); 754&function_end("${PREFIX}_ccm64_encrypt_blocks"); 755 756&function_begin("${PREFIX}_ccm64_decrypt_blocks"); 757 &mov ($inp,&wparam(0)); 758 &mov ($out,&wparam(1)); 759 &mov ($len,&wparam(2)); 760 &mov ($key,&wparam(3)); 761 &mov ($rounds_,&wparam(4)); 762 &mov ($rounds,&wparam(5)); 763 &mov ($key_,"esp"); 764 &sub ("esp",60); 765 &and ("esp",-16); # align stack 766 &mov (&DWP(48,"esp"),$key_); 767 768 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec 769 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac 770 &mov ($rounds,&DWP(240,$key)); 771 772 # compose byte-swap control mask for pshufb on stack 773 &mov (&DWP(0,"esp"),0x0c0d0e0f); 774 &mov (&DWP(4,"esp"),0x08090a0b); 775 &mov (&DWP(8,"esp"),0x04050607); 776 &mov (&DWP(12,"esp"),0x00010203); 777 778 # compose counter increment vector on stack 779 &mov ($rounds_,1); 780 &xor ($key_,$key_); 781 &mov (&DWP(16,"esp"),$rounds_); 782 &mov (&DWP(20,"esp"),$key_); 783 &mov (&DWP(24,"esp"),$key_); 784 &mov (&DWP(28,"esp"),$key_); 785 786 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask 787 &movdqa ($inout0,$ivec); 788 789 &mov ($key_,$key); 790 &mov ($rounds_,$rounds); 791 792 &pshufb ($ivec,$inout3); 793 if ($inline) 794 { &aesni_inline_generate1("enc"); } 795 else 796 { &call ("_aesni_encrypt1"); } 797 &shl ($rounds_,4); 798 &mov ($rounds,16); 799 &movups ($in0,&QWP(0,$inp)); # load inp 800 &paddq ($ivec,&QWP(16,"esp")); 801 &lea ($inp,&QWP(16,$inp)); 802 &sub ($rounds,$rounds_); 803 &lea ($key,&DWP(32,$key_,$rounds_)); 804 &mov ($rounds_,$rounds); 805 &jmp (&label("ccm64_dec_outer")); 806 807&set_label("ccm64_dec_outer",16); 808 &xorps ($in0,$inout0); # inp ^= E(ivec) 809 &movdqa ($inout0,$ivec); 810 &movups (&QWP(0,$out),$in0); # save output 811 &lea ($out,&DWP(16,$out)); 812 &pshufb ($inout0,$inout3); 813 814 &sub ($len,1); 815 &jz (&label("ccm64_dec_break")); 816 817 &$movekey ($rndkey0,&QWP(0,$key_)); 818 &mov ($rounds,$rounds_); 819 &$movekey ($rndkey1,&QWP(16,$key_)); 820 &xorps ($in0,$rndkey0); 821 &xorps ($inout0,$rndkey0); 822 &xorps ($cmac,$in0); # cmac^=out 823 &$movekey ($rndkey0,&QWP(32,$key_)); 824 825&set_label("ccm64_dec2_loop"); 826 &aesenc ($inout0,$rndkey1); 827 &aesenc ($cmac,$rndkey1); 828 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 829 &add ($rounds,32); 830 &aesenc ($inout0,$rndkey0); 831 &aesenc ($cmac,$rndkey0); 832 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 833 &jnz (&label("ccm64_dec2_loop")); 834 &movups ($in0,&QWP(0,$inp)); # load inp 835 &paddq ($ivec,&QWP(16,"esp")); 836 &aesenc ($inout0,$rndkey1); 837 &aesenc ($cmac,$rndkey1); 838 &aesenclast ($inout0,$rndkey0); 839 &aesenclast ($cmac,$rndkey0); 840 &lea ($inp,&QWP(16,$inp)); 841 &jmp (&label("ccm64_dec_outer")); 842 843&set_label("ccm64_dec_break",16); 844 &mov ($rounds,&DWP(240,$key_)); 845 &mov ($key,$key_); 846 if ($inline) 847 { &aesni_inline_generate1("enc",$cmac,$in0); } 848 else 849 { &call ("_aesni_encrypt1",$cmac); } 850 851 &mov ("esp",&DWP(48,"esp")); 852 &mov ($out,&wparam(5)); 853 &movups (&QWP(0,$out),$cmac); 854 855 &pxor ("xmm0","xmm0"); # clear register bank 856 &pxor ("xmm1","xmm1"); 857 &pxor ("xmm2","xmm2"); 858 &pxor ("xmm3","xmm3"); 859 &pxor ("xmm4","xmm4"); 860 &pxor ("xmm5","xmm5"); 861 &pxor ("xmm6","xmm6"); 862 &pxor ("xmm7","xmm7"); 863&function_end("${PREFIX}_ccm64_decrypt_blocks"); 864} 865 866###################################################################### 867# void aes_hw_ctr32_encrypt_blocks (const void *in, void *out, 868# size_t blocks, const AES_KEY *key, 869# const char *ivec); 870# 871# Handles only complete blocks, operates on 32-bit counter and 872# does not update *ivec! (see crypto/modes/ctr128.c for details) 873# 874# stack layout: 875# 0 pshufb mask 876# 16 vector addend: 0,6,6,6 877# 32 counter-less ivec 878# 48 1st triplet of counter vector 879# 64 2nd triplet of counter vector 880# 80 saved %esp 881 882&function_begin("${PREFIX}_ctr32_encrypt_blocks"); 883 &record_function_hit(0); 884 885 &mov ($inp,&wparam(0)); 886 &mov ($out,&wparam(1)); 887 &mov ($len,&wparam(2)); 888 &mov ($key,&wparam(3)); 889 &mov ($rounds_,&wparam(4)); 890 &mov ($key_,"esp"); 891 &sub ("esp",88); 892 &and ("esp",-16); # align stack 893 &mov (&DWP(80,"esp"),$key_); 894 895 &cmp ($len,1); 896 &je (&label("ctr32_one_shortcut")); 897 898 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec 899 900 # compose byte-swap control mask for pshufb on stack 901 &mov (&DWP(0,"esp"),0x0c0d0e0f); 902 &mov (&DWP(4,"esp"),0x08090a0b); 903 &mov (&DWP(8,"esp"),0x04050607); 904 &mov (&DWP(12,"esp"),0x00010203); 905 906 # compose counter increment vector on stack 907 &mov ($rounds,6); 908 &xor ($key_,$key_); 909 &mov (&DWP(16,"esp"),$rounds); 910 &mov (&DWP(20,"esp"),$rounds); 911 &mov (&DWP(24,"esp"),$rounds); 912 &mov (&DWP(28,"esp"),$key_); 913 914 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter 915 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter 916 917 &mov ($rounds,&DWP(240,$key)); # key->rounds 918 919 # compose 2 vectors of 3x32-bit counters 920 &bswap ($rounds_); 921 &pxor ($rndkey0,$rndkey0); 922 &pxor ($rndkey1,$rndkey1); 923 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask 924 &pinsrd ($rndkey0,$rounds_,0); 925 &lea ($key_,&DWP(3,$rounds_)); 926 &pinsrd ($rndkey1,$key_,0); 927 &inc ($rounds_); 928 &pinsrd ($rndkey0,$rounds_,1); 929 &inc ($key_); 930 &pinsrd ($rndkey1,$key_,1); 931 &inc ($rounds_); 932 &pinsrd ($rndkey0,$rounds_,2); 933 &inc ($key_); 934 &pinsrd ($rndkey1,$key_,2); 935 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet 936 &pshufb ($rndkey0,$inout0); # byte swap 937 &movdqu ($inout4,&QWP(0,$key)); # key[0] 938 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet 939 &pshufb ($rndkey1,$inout0); # byte swap 940 941 &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword 942 &pshufd ($inout1,$rndkey0,2<<6); 943 &cmp ($len,6); 944 &jb (&label("ctr32_tail")); 945 &pxor ($inout5,$inout4); # counter-less ivec^key[0] 946 &shl ($rounds,4); 947 &mov ($rounds_,16); 948 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0] 949 &mov ($key_,$key); # backup $key 950 &sub ($rounds_,$rounds); # backup twisted $rounds 951 &lea ($key,&DWP(32,$key,$rounds)); 952 &sub ($len,6); 953 &jmp (&label("ctr32_loop6")); 954 955&set_label("ctr32_loop6",16); 956 # inlining _aesni_encrypt6's prologue gives ~6% improvement... 957 &pshufd ($inout2,$rndkey0,1<<6); 958 &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec 959 &pshufd ($inout3,$rndkey1,3<<6); 960 &pxor ($inout0,$rndkey0); # merge counter-less ivec 961 &pshufd ($inout4,$rndkey1,2<<6); 962 &pxor ($inout1,$rndkey0); 963 &pshufd ($inout5,$rndkey1,1<<6); 964 &$movekey ($rndkey1,&QWP(16,$key_)); 965 &pxor ($inout2,$rndkey0); 966 &pxor ($inout3,$rndkey0); 967 &aesenc ($inout0,$rndkey1); 968 &pxor ($inout4,$rndkey0); 969 &pxor ($inout5,$rndkey0); 970 &aesenc ($inout1,$rndkey1); 971 &$movekey ($rndkey0,&QWP(32,$key_)); 972 &mov ($rounds,$rounds_); 973 &aesenc ($inout2,$rndkey1); 974 &aesenc ($inout3,$rndkey1); 975 &aesenc ($inout4,$rndkey1); 976 &aesenc ($inout5,$rndkey1); 977 978 &call (&label("_aesni_encrypt6_enter")); 979 980 &movups ($rndkey1,&QWP(0,$inp)); 981 &movups ($rndkey0,&QWP(0x10,$inp)); 982 &xorps ($inout0,$rndkey1); 983 &movups ($rndkey1,&QWP(0x20,$inp)); 984 &xorps ($inout1,$rndkey0); 985 &movups (&QWP(0,$out),$inout0); 986 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment 987 &xorps ($inout2,$rndkey1); 988 &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet 989 &movups (&QWP(0x10,$out),$inout1); 990 &movups (&QWP(0x20,$out),$inout2); 991 992 &paddd ($rndkey1,$rndkey0); # 2nd triplet increment 993 &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment 994 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask 995 996 &movups ($inout1,&QWP(0x30,$inp)); 997 &movups ($inout2,&QWP(0x40,$inp)); 998 &xorps ($inout3,$inout1); 999 &movups ($inout1,&QWP(0x50,$inp)); 1000 &lea ($inp,&DWP(0x60,$inp)); 1001 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet 1002 &pshufb ($rndkey0,$inout0); # byte swap 1003 &xorps ($inout4,$inout2); 1004 &movups (&QWP(0x30,$out),$inout3); 1005 &xorps ($inout5,$inout1); 1006 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet 1007 &pshufb ($rndkey1,$inout0); # byte swap 1008 &movups (&QWP(0x40,$out),$inout4); 1009 &pshufd ($inout0,$rndkey0,3<<6); 1010 &movups (&QWP(0x50,$out),$inout5); 1011 &lea ($out,&DWP(0x60,$out)); 1012 1013 &pshufd ($inout1,$rndkey0,2<<6); 1014 &sub ($len,6); 1015 &jnc (&label("ctr32_loop6")); 1016 1017 &add ($len,6); 1018 &jz (&label("ctr32_ret")); 1019 &movdqu ($inout5,&QWP(0,$key_)); 1020 &mov ($key,$key_); 1021 &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec 1022 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1023 1024&set_label("ctr32_tail"); 1025 &por ($inout0,$inout5); 1026 &cmp ($len,2); 1027 &jb (&label("ctr32_one")); 1028 1029 &pshufd ($inout2,$rndkey0,1<<6); 1030 &por ($inout1,$inout5); 1031 &je (&label("ctr32_two")); 1032 1033 &pshufd ($inout3,$rndkey1,3<<6); 1034 &por ($inout2,$inout5); 1035 &cmp ($len,4); 1036 &jb (&label("ctr32_three")); 1037 1038 &pshufd ($inout4,$rndkey1,2<<6); 1039 &por ($inout3,$inout5); 1040 &je (&label("ctr32_four")); 1041 1042 &por ($inout4,$inout5); 1043 &call ("_aesni_encrypt6"); 1044 &movups ($rndkey1,&QWP(0,$inp)); 1045 &movups ($rndkey0,&QWP(0x10,$inp)); 1046 &xorps ($inout0,$rndkey1); 1047 &movups ($rndkey1,&QWP(0x20,$inp)); 1048 &xorps ($inout1,$rndkey0); 1049 &movups ($rndkey0,&QWP(0x30,$inp)); 1050 &xorps ($inout2,$rndkey1); 1051 &movups ($rndkey1,&QWP(0x40,$inp)); 1052 &xorps ($inout3,$rndkey0); 1053 &movups (&QWP(0,$out),$inout0); 1054 &xorps ($inout4,$rndkey1); 1055 &movups (&QWP(0x10,$out),$inout1); 1056 &movups (&QWP(0x20,$out),$inout2); 1057 &movups (&QWP(0x30,$out),$inout3); 1058 &movups (&QWP(0x40,$out),$inout4); 1059 &jmp (&label("ctr32_ret")); 1060 1061&set_label("ctr32_one_shortcut",16); 1062 &movups ($inout0,&QWP(0,$rounds_)); # load ivec 1063 &mov ($rounds,&DWP(240,$key)); 1064 1065&set_label("ctr32_one"); 1066 if ($inline) 1067 { &aesni_inline_generate1("enc"); } 1068 else 1069 { &call ("_aesni_encrypt1"); } 1070 &movups ($in0,&QWP(0,$inp)); 1071 &xorps ($in0,$inout0); 1072 &movups (&QWP(0,$out),$in0); 1073 &jmp (&label("ctr32_ret")); 1074 1075&set_label("ctr32_two",16); 1076 &call ("_aesni_encrypt2"); 1077 &movups ($inout3,&QWP(0,$inp)); 1078 &movups ($inout4,&QWP(0x10,$inp)); 1079 &xorps ($inout0,$inout3); 1080 &xorps ($inout1,$inout4); 1081 &movups (&QWP(0,$out),$inout0); 1082 &movups (&QWP(0x10,$out),$inout1); 1083 &jmp (&label("ctr32_ret")); 1084 1085&set_label("ctr32_three",16); 1086 &call ("_aesni_encrypt3"); 1087 &movups ($inout3,&QWP(0,$inp)); 1088 &movups ($inout4,&QWP(0x10,$inp)); 1089 &xorps ($inout0,$inout3); 1090 &movups ($inout5,&QWP(0x20,$inp)); 1091 &xorps ($inout1,$inout4); 1092 &movups (&QWP(0,$out),$inout0); 1093 &xorps ($inout2,$inout5); 1094 &movups (&QWP(0x10,$out),$inout1); 1095 &movups (&QWP(0x20,$out),$inout2); 1096 &jmp (&label("ctr32_ret")); 1097 1098&set_label("ctr32_four",16); 1099 &call ("_aesni_encrypt4"); 1100 &movups ($inout4,&QWP(0,$inp)); 1101 &movups ($inout5,&QWP(0x10,$inp)); 1102 &movups ($rndkey1,&QWP(0x20,$inp)); 1103 &xorps ($inout0,$inout4); 1104 &movups ($rndkey0,&QWP(0x30,$inp)); 1105 &xorps ($inout1,$inout5); 1106 &movups (&QWP(0,$out),$inout0); 1107 &xorps ($inout2,$rndkey1); 1108 &movups (&QWP(0x10,$out),$inout1); 1109 &xorps ($inout3,$rndkey0); 1110 &movups (&QWP(0x20,$out),$inout2); 1111 &movups (&QWP(0x30,$out),$inout3); 1112 1113&set_label("ctr32_ret"); 1114 &pxor ("xmm0","xmm0"); # clear register bank 1115 &pxor ("xmm1","xmm1"); 1116 &pxor ("xmm2","xmm2"); 1117 &pxor ("xmm3","xmm3"); 1118 &pxor ("xmm4","xmm4"); 1119 &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack 1120 &pxor ("xmm5","xmm5"); 1121 &movdqa (&QWP(48,"esp"),"xmm0"); 1122 &pxor ("xmm6","xmm6"); 1123 &movdqa (&QWP(64,"esp"),"xmm0"); 1124 &pxor ("xmm7","xmm7"); 1125 &mov ("esp",&DWP(80,"esp")); 1126&function_end("${PREFIX}_ctr32_encrypt_blocks"); 1127 1128###################################################################### 1129# void aes_hw_xts_[en|de]crypt(const char *inp,char *out,size_t len, 1130# const AES_KEY *key1, const AES_KEY *key2 1131# const unsigned char iv[16]); 1132# 1133{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); 1134 1135&function_begin("${PREFIX}_xts_encrypt"); 1136 &mov ($key,&wparam(4)); # key2 1137 &mov ($inp,&wparam(5)); # clear-text tweak 1138 1139 &mov ($rounds,&DWP(240,$key)); # key2->rounds 1140 &movups ($inout0,&QWP(0,$inp)); 1141 if ($inline) 1142 { &aesni_inline_generate1("enc"); } 1143 else 1144 { &call ("_aesni_encrypt1"); } 1145 1146 &mov ($inp,&wparam(0)); 1147 &mov ($out,&wparam(1)); 1148 &mov ($len,&wparam(2)); 1149 &mov ($key,&wparam(3)); # key1 1150 1151 &mov ($key_,"esp"); 1152 &sub ("esp",16*7+8); 1153 &mov ($rounds,&DWP(240,$key)); # key1->rounds 1154 &and ("esp",-16); # align stack 1155 1156 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1157 &mov (&DWP(16*6+4,"esp"),0); 1158 &mov (&DWP(16*6+8,"esp"),1); 1159 &mov (&DWP(16*6+12,"esp"),0); 1160 &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1161 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1162 1163 &movdqa ($tweak,$inout0); 1164 &pxor ($twtmp,$twtmp); 1165 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1166 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1167 1168 &and ($len,-16); 1169 &mov ($key_,$key); # backup $key 1170 &mov ($rounds_,$rounds); # backup $rounds 1171 &sub ($len,16*6); 1172 &jc (&label("xts_enc_short")); 1173 1174 &shl ($rounds,4); 1175 &mov ($rounds_,16); 1176 &sub ($rounds_,$rounds); 1177 &lea ($key,&DWP(32,$key,$rounds)); 1178 &jmp (&label("xts_enc_loop6")); 1179 1180&set_label("xts_enc_loop6",16); 1181 for ($i=0;$i<4;$i++) { 1182 &pshufd ($twres,$twtmp,0x13); 1183 &pxor ($twtmp,$twtmp); 1184 &movdqa (&QWP(16*$i,"esp"),$tweak); 1185 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1186 &pand ($twres,$twmask); # isolate carry and residue 1187 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1188 &pxor ($tweak,$twres); 1189 } 1190 &pshufd ($inout5,$twtmp,0x13); 1191 &movdqa (&QWP(16*$i++,"esp"),$tweak); 1192 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1193 &$movekey ($rndkey0,&QWP(0,$key_)); 1194 &pand ($inout5,$twmask); # isolate carry and residue 1195 &movups ($inout0,&QWP(0,$inp)); # load input 1196 &pxor ($inout5,$tweak); 1197 1198 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1199 &mov ($rounds,$rounds_); # restore $rounds 1200 &movdqu ($inout1,&QWP(16*1,$inp)); 1201 &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1202 &movdqu ($inout2,&QWP(16*2,$inp)); 1203 &pxor ($inout1,$rndkey0); 1204 &movdqu ($inout3,&QWP(16*3,$inp)); 1205 &pxor ($inout2,$rndkey0); 1206 &movdqu ($inout4,&QWP(16*4,$inp)); 1207 &pxor ($inout3,$rndkey0); 1208 &movdqu ($rndkey1,&QWP(16*5,$inp)); 1209 &pxor ($inout4,$rndkey0); 1210 &lea ($inp,&DWP(16*6,$inp)); 1211 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1212 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1213 &pxor ($inout5,$rndkey1); 1214 1215 &$movekey ($rndkey1,&QWP(16,$key_)); 1216 &pxor ($inout1,&QWP(16*1,"esp")); 1217 &pxor ($inout2,&QWP(16*2,"esp")); 1218 &aesenc ($inout0,$rndkey1); 1219 &pxor ($inout3,&QWP(16*3,"esp")); 1220 &pxor ($inout4,&QWP(16*4,"esp")); 1221 &aesenc ($inout1,$rndkey1); 1222 &pxor ($inout5,$rndkey0); 1223 &$movekey ($rndkey0,&QWP(32,$key_)); 1224 &aesenc ($inout2,$rndkey1); 1225 &aesenc ($inout3,$rndkey1); 1226 &aesenc ($inout4,$rndkey1); 1227 &aesenc ($inout5,$rndkey1); 1228 &call (&label("_aesni_encrypt6_enter")); 1229 1230 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1231 &pxor ($twtmp,$twtmp); 1232 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1233 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1234 &xorps ($inout1,&QWP(16*1,"esp")); 1235 &movups (&QWP(16*0,$out),$inout0); # write output 1236 &xorps ($inout2,&QWP(16*2,"esp")); 1237 &movups (&QWP(16*1,$out),$inout1); 1238 &xorps ($inout3,&QWP(16*3,"esp")); 1239 &movups (&QWP(16*2,$out),$inout2); 1240 &xorps ($inout4,&QWP(16*4,"esp")); 1241 &movups (&QWP(16*3,$out),$inout3); 1242 &xorps ($inout5,$tweak); 1243 &movups (&QWP(16*4,$out),$inout4); 1244 &pshufd ($twres,$twtmp,0x13); 1245 &movups (&QWP(16*5,$out),$inout5); 1246 &lea ($out,&DWP(16*6,$out)); 1247 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1248 1249 &pxor ($twtmp,$twtmp); 1250 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1251 &pand ($twres,$twmask); # isolate carry and residue 1252 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1253 &pxor ($tweak,$twres); 1254 1255 &sub ($len,16*6); 1256 &jnc (&label("xts_enc_loop6")); 1257 1258 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1259 &mov ($key,$key_); # restore $key 1260 &mov ($rounds_,$rounds); 1261 1262&set_label("xts_enc_short"); 1263 &add ($len,16*6); 1264 &jz (&label("xts_enc_done6x")); 1265 1266 &movdqa ($inout3,$tweak); # put aside previous tweak 1267 &cmp ($len,0x20); 1268 &jb (&label("xts_enc_one")); 1269 1270 &pshufd ($twres,$twtmp,0x13); 1271 &pxor ($twtmp,$twtmp); 1272 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1273 &pand ($twres,$twmask); # isolate carry and residue 1274 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1275 &pxor ($tweak,$twres); 1276 &je (&label("xts_enc_two")); 1277 1278 &pshufd ($twres,$twtmp,0x13); 1279 &pxor ($twtmp,$twtmp); 1280 &movdqa ($inout4,$tweak); # put aside previous tweak 1281 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1282 &pand ($twres,$twmask); # isolate carry and residue 1283 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1284 &pxor ($tweak,$twres); 1285 &cmp ($len,0x40); 1286 &jb (&label("xts_enc_three")); 1287 1288 &pshufd ($twres,$twtmp,0x13); 1289 &pxor ($twtmp,$twtmp); 1290 &movdqa ($inout5,$tweak); # put aside previous tweak 1291 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1292 &pand ($twres,$twmask); # isolate carry and residue 1293 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1294 &pxor ($tweak,$twres); 1295 &movdqa (&QWP(16*0,"esp"),$inout3); 1296 &movdqa (&QWP(16*1,"esp"),$inout4); 1297 &je (&label("xts_enc_four")); 1298 1299 &movdqa (&QWP(16*2,"esp"),$inout5); 1300 &pshufd ($inout5,$twtmp,0x13); 1301 &movdqa (&QWP(16*3,"esp"),$tweak); 1302 &paddq ($tweak,$tweak); # &psllq($inout0,1); 1303 &pand ($inout5,$twmask); # isolate carry and residue 1304 &pxor ($inout5,$tweak); 1305 1306 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1307 &movdqu ($inout1,&QWP(16*1,$inp)); 1308 &movdqu ($inout2,&QWP(16*2,$inp)); 1309 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1310 &movdqu ($inout3,&QWP(16*3,$inp)); 1311 &pxor ($inout1,&QWP(16*1,"esp")); 1312 &movdqu ($inout4,&QWP(16*4,$inp)); 1313 &pxor ($inout2,&QWP(16*2,"esp")); 1314 &lea ($inp,&DWP(16*5,$inp)); 1315 &pxor ($inout3,&QWP(16*3,"esp")); 1316 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1317 &pxor ($inout4,$inout5); 1318 1319 &call ("_aesni_encrypt6"); 1320 1321 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1322 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1323 &xorps ($inout1,&QWP(16*1,"esp")); 1324 &xorps ($inout2,&QWP(16*2,"esp")); 1325 &movups (&QWP(16*0,$out),$inout0); # write output 1326 &xorps ($inout3,&QWP(16*3,"esp")); 1327 &movups (&QWP(16*1,$out),$inout1); 1328 &xorps ($inout4,$tweak); 1329 &movups (&QWP(16*2,$out),$inout2); 1330 &movups (&QWP(16*3,$out),$inout3); 1331 &movups (&QWP(16*4,$out),$inout4); 1332 &lea ($out,&DWP(16*5,$out)); 1333 &jmp (&label("xts_enc_done")); 1334 1335&set_label("xts_enc_one",16); 1336 &movups ($inout0,&QWP(16*0,$inp)); # load input 1337 &lea ($inp,&DWP(16*1,$inp)); 1338 &xorps ($inout0,$inout3); # input^=tweak 1339 if ($inline) 1340 { &aesni_inline_generate1("enc"); } 1341 else 1342 { &call ("_aesni_encrypt1"); } 1343 &xorps ($inout0,$inout3); # output^=tweak 1344 &movups (&QWP(16*0,$out),$inout0); # write output 1345 &lea ($out,&DWP(16*1,$out)); 1346 1347 &movdqa ($tweak,$inout3); # last tweak 1348 &jmp (&label("xts_enc_done")); 1349 1350&set_label("xts_enc_two",16); 1351 &movaps ($inout4,$tweak); # put aside last tweak 1352 1353 &movups ($inout0,&QWP(16*0,$inp)); # load input 1354 &movups ($inout1,&QWP(16*1,$inp)); 1355 &lea ($inp,&DWP(16*2,$inp)); 1356 &xorps ($inout0,$inout3); # input^=tweak 1357 &xorps ($inout1,$inout4); 1358 1359 &call ("_aesni_encrypt2"); 1360 1361 &xorps ($inout0,$inout3); # output^=tweak 1362 &xorps ($inout1,$inout4); 1363 &movups (&QWP(16*0,$out),$inout0); # write output 1364 &movups (&QWP(16*1,$out),$inout1); 1365 &lea ($out,&DWP(16*2,$out)); 1366 1367 &movdqa ($tweak,$inout4); # last tweak 1368 &jmp (&label("xts_enc_done")); 1369 1370&set_label("xts_enc_three",16); 1371 &movaps ($inout5,$tweak); # put aside last tweak 1372 &movups ($inout0,&QWP(16*0,$inp)); # load input 1373 &movups ($inout1,&QWP(16*1,$inp)); 1374 &movups ($inout2,&QWP(16*2,$inp)); 1375 &lea ($inp,&DWP(16*3,$inp)); 1376 &xorps ($inout0,$inout3); # input^=tweak 1377 &xorps ($inout1,$inout4); 1378 &xorps ($inout2,$inout5); 1379 1380 &call ("_aesni_encrypt3"); 1381 1382 &xorps ($inout0,$inout3); # output^=tweak 1383 &xorps ($inout1,$inout4); 1384 &xorps ($inout2,$inout5); 1385 &movups (&QWP(16*0,$out),$inout0); # write output 1386 &movups (&QWP(16*1,$out),$inout1); 1387 &movups (&QWP(16*2,$out),$inout2); 1388 &lea ($out,&DWP(16*3,$out)); 1389 1390 &movdqa ($tweak,$inout5); # last tweak 1391 &jmp (&label("xts_enc_done")); 1392 1393&set_label("xts_enc_four",16); 1394 &movaps ($inout4,$tweak); # put aside last tweak 1395 1396 &movups ($inout0,&QWP(16*0,$inp)); # load input 1397 &movups ($inout1,&QWP(16*1,$inp)); 1398 &movups ($inout2,&QWP(16*2,$inp)); 1399 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1400 &movups ($inout3,&QWP(16*3,$inp)); 1401 &lea ($inp,&DWP(16*4,$inp)); 1402 &xorps ($inout1,&QWP(16*1,"esp")); 1403 &xorps ($inout2,$inout5); 1404 &xorps ($inout3,$inout4); 1405 1406 &call ("_aesni_encrypt4"); 1407 1408 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1409 &xorps ($inout1,&QWP(16*1,"esp")); 1410 &xorps ($inout2,$inout5); 1411 &movups (&QWP(16*0,$out),$inout0); # write output 1412 &xorps ($inout3,$inout4); 1413 &movups (&QWP(16*1,$out),$inout1); 1414 &movups (&QWP(16*2,$out),$inout2); 1415 &movups (&QWP(16*3,$out),$inout3); 1416 &lea ($out,&DWP(16*4,$out)); 1417 1418 &movdqa ($tweak,$inout4); # last tweak 1419 &jmp (&label("xts_enc_done")); 1420 1421&set_label("xts_enc_done6x",16); # $tweak is pre-calculated 1422 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1423 &and ($len,15); 1424 &jz (&label("xts_enc_ret")); 1425 &movdqa ($inout3,$tweak); 1426 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1427 &jmp (&label("xts_enc_steal")); 1428 1429&set_label("xts_enc_done",16); 1430 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1431 &pxor ($twtmp,$twtmp); 1432 &and ($len,15); 1433 &jz (&label("xts_enc_ret")); 1434 1435 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1436 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1437 &pshufd ($inout3,$twtmp,0x13); 1438 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1439 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue 1440 &pxor ($inout3,$tweak); 1441 1442&set_label("xts_enc_steal"); 1443 &movz ($rounds,&BP(0,$inp)); 1444 &movz ($key,&BP(-16,$out)); 1445 &lea ($inp,&DWP(1,$inp)); 1446 &mov (&BP(-16,$out),&LB($rounds)); 1447 &mov (&BP(0,$out),&LB($key)); 1448 &lea ($out,&DWP(1,$out)); 1449 &sub ($len,1); 1450 &jnz (&label("xts_enc_steal")); 1451 1452 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1453 &mov ($key,$key_); # restore $key 1454 &mov ($rounds,$rounds_); # restore $rounds 1455 1456 &movups ($inout0,&QWP(-16,$out)); # load input 1457 &xorps ($inout0,$inout3); # input^=tweak 1458 if ($inline) 1459 { &aesni_inline_generate1("enc"); } 1460 else 1461 { &call ("_aesni_encrypt1"); } 1462 &xorps ($inout0,$inout3); # output^=tweak 1463 &movups (&QWP(-16,$out),$inout0); # write output 1464 1465&set_label("xts_enc_ret"); 1466 &pxor ("xmm0","xmm0"); # clear register bank 1467 &pxor ("xmm1","xmm1"); 1468 &pxor ("xmm2","xmm2"); 1469 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack 1470 &pxor ("xmm3","xmm3"); 1471 &movdqa (&QWP(16*1,"esp"),"xmm0"); 1472 &pxor ("xmm4","xmm4"); 1473 &movdqa (&QWP(16*2,"esp"),"xmm0"); 1474 &pxor ("xmm5","xmm5"); 1475 &movdqa (&QWP(16*3,"esp"),"xmm0"); 1476 &pxor ("xmm6","xmm6"); 1477 &movdqa (&QWP(16*4,"esp"),"xmm0"); 1478 &pxor ("xmm7","xmm7"); 1479 &movdqa (&QWP(16*5,"esp"),"xmm0"); 1480 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1481&function_end("${PREFIX}_xts_encrypt"); 1482 1483&function_begin("${PREFIX}_xts_decrypt"); 1484 &mov ($key,&wparam(4)); # key2 1485 &mov ($inp,&wparam(5)); # clear-text tweak 1486 1487 &mov ($rounds,&DWP(240,$key)); # key2->rounds 1488 &movups ($inout0,&QWP(0,$inp)); 1489 if ($inline) 1490 { &aesni_inline_generate1("enc"); } 1491 else 1492 { &call ("_aesni_encrypt1"); } 1493 1494 &mov ($inp,&wparam(0)); 1495 &mov ($out,&wparam(1)); 1496 &mov ($len,&wparam(2)); 1497 &mov ($key,&wparam(3)); # key1 1498 1499 &mov ($key_,"esp"); 1500 &sub ("esp",16*7+8); 1501 &and ("esp",-16); # align stack 1502 1503 &xor ($rounds_,$rounds_); # if(len%16) len-=16; 1504 &test ($len,15); 1505 &setnz (&LB($rounds_)); 1506 &shl ($rounds_,4); 1507 &sub ($len,$rounds_); 1508 1509 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1510 &mov (&DWP(16*6+4,"esp"),0); 1511 &mov (&DWP(16*6+8,"esp"),1); 1512 &mov (&DWP(16*6+12,"esp"),0); 1513 &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1514 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1515 1516 &mov ($rounds,&DWP(240,$key)); # key1->rounds 1517 &mov ($key_,$key); # backup $key 1518 &mov ($rounds_,$rounds); # backup $rounds 1519 1520 &movdqa ($tweak,$inout0); 1521 &pxor ($twtmp,$twtmp); 1522 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1523 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1524 1525 &and ($len,-16); 1526 &sub ($len,16*6); 1527 &jc (&label("xts_dec_short")); 1528 1529 &shl ($rounds,4); 1530 &mov ($rounds_,16); 1531 &sub ($rounds_,$rounds); 1532 &lea ($key,&DWP(32,$key,$rounds)); 1533 &jmp (&label("xts_dec_loop6")); 1534 1535&set_label("xts_dec_loop6",16); 1536 for ($i=0;$i<4;$i++) { 1537 &pshufd ($twres,$twtmp,0x13); 1538 &pxor ($twtmp,$twtmp); 1539 &movdqa (&QWP(16*$i,"esp"),$tweak); 1540 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1541 &pand ($twres,$twmask); # isolate carry and residue 1542 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1543 &pxor ($tweak,$twres); 1544 } 1545 &pshufd ($inout5,$twtmp,0x13); 1546 &movdqa (&QWP(16*$i++,"esp"),$tweak); 1547 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1548 &$movekey ($rndkey0,&QWP(0,$key_)); 1549 &pand ($inout5,$twmask); # isolate carry and residue 1550 &movups ($inout0,&QWP(0,$inp)); # load input 1551 &pxor ($inout5,$tweak); 1552 1553 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1554 &mov ($rounds,$rounds_); 1555 &movdqu ($inout1,&QWP(16*1,$inp)); 1556 &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1557 &movdqu ($inout2,&QWP(16*2,$inp)); 1558 &pxor ($inout1,$rndkey0); 1559 &movdqu ($inout3,&QWP(16*3,$inp)); 1560 &pxor ($inout2,$rndkey0); 1561 &movdqu ($inout4,&QWP(16*4,$inp)); 1562 &pxor ($inout3,$rndkey0); 1563 &movdqu ($rndkey1,&QWP(16*5,$inp)); 1564 &pxor ($inout4,$rndkey0); 1565 &lea ($inp,&DWP(16*6,$inp)); 1566 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1567 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1568 &pxor ($inout5,$rndkey1); 1569 1570 &$movekey ($rndkey1,&QWP(16,$key_)); 1571 &pxor ($inout1,&QWP(16*1,"esp")); 1572 &pxor ($inout2,&QWP(16*2,"esp")); 1573 &aesdec ($inout0,$rndkey1); 1574 &pxor ($inout3,&QWP(16*3,"esp")); 1575 &pxor ($inout4,&QWP(16*4,"esp")); 1576 &aesdec ($inout1,$rndkey1); 1577 &pxor ($inout5,$rndkey0); 1578 &$movekey ($rndkey0,&QWP(32,$key_)); 1579 &aesdec ($inout2,$rndkey1); 1580 &aesdec ($inout3,$rndkey1); 1581 &aesdec ($inout4,$rndkey1); 1582 &aesdec ($inout5,$rndkey1); 1583 &call (&label("_aesni_decrypt6_enter")); 1584 1585 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1586 &pxor ($twtmp,$twtmp); 1587 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1588 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1589 &xorps ($inout1,&QWP(16*1,"esp")); 1590 &movups (&QWP(16*0,$out),$inout0); # write output 1591 &xorps ($inout2,&QWP(16*2,"esp")); 1592 &movups (&QWP(16*1,$out),$inout1); 1593 &xorps ($inout3,&QWP(16*3,"esp")); 1594 &movups (&QWP(16*2,$out),$inout2); 1595 &xorps ($inout4,&QWP(16*4,"esp")); 1596 &movups (&QWP(16*3,$out),$inout3); 1597 &xorps ($inout5,$tweak); 1598 &movups (&QWP(16*4,$out),$inout4); 1599 &pshufd ($twres,$twtmp,0x13); 1600 &movups (&QWP(16*5,$out),$inout5); 1601 &lea ($out,&DWP(16*6,$out)); 1602 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1603 1604 &pxor ($twtmp,$twtmp); 1605 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1606 &pand ($twres,$twmask); # isolate carry and residue 1607 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1608 &pxor ($tweak,$twres); 1609 1610 &sub ($len,16*6); 1611 &jnc (&label("xts_dec_loop6")); 1612 1613 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1614 &mov ($key,$key_); # restore $key 1615 &mov ($rounds_,$rounds); 1616 1617&set_label("xts_dec_short"); 1618 &add ($len,16*6); 1619 &jz (&label("xts_dec_done6x")); 1620 1621 &movdqa ($inout3,$tweak); # put aside previous tweak 1622 &cmp ($len,0x20); 1623 &jb (&label("xts_dec_one")); 1624 1625 &pshufd ($twres,$twtmp,0x13); 1626 &pxor ($twtmp,$twtmp); 1627 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1628 &pand ($twres,$twmask); # isolate carry and residue 1629 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1630 &pxor ($tweak,$twres); 1631 &je (&label("xts_dec_two")); 1632 1633 &pshufd ($twres,$twtmp,0x13); 1634 &pxor ($twtmp,$twtmp); 1635 &movdqa ($inout4,$tweak); # put aside previous tweak 1636 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1637 &pand ($twres,$twmask); # isolate carry and residue 1638 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1639 &pxor ($tweak,$twres); 1640 &cmp ($len,0x40); 1641 &jb (&label("xts_dec_three")); 1642 1643 &pshufd ($twres,$twtmp,0x13); 1644 &pxor ($twtmp,$twtmp); 1645 &movdqa ($inout5,$tweak); # put aside previous tweak 1646 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1647 &pand ($twres,$twmask); # isolate carry and residue 1648 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1649 &pxor ($tweak,$twres); 1650 &movdqa (&QWP(16*0,"esp"),$inout3); 1651 &movdqa (&QWP(16*1,"esp"),$inout4); 1652 &je (&label("xts_dec_four")); 1653 1654 &movdqa (&QWP(16*2,"esp"),$inout5); 1655 &pshufd ($inout5,$twtmp,0x13); 1656 &movdqa (&QWP(16*3,"esp"),$tweak); 1657 &paddq ($tweak,$tweak); # &psllq($inout0,1); 1658 &pand ($inout5,$twmask); # isolate carry and residue 1659 &pxor ($inout5,$tweak); 1660 1661 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1662 &movdqu ($inout1,&QWP(16*1,$inp)); 1663 &movdqu ($inout2,&QWP(16*2,$inp)); 1664 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1665 &movdqu ($inout3,&QWP(16*3,$inp)); 1666 &pxor ($inout1,&QWP(16*1,"esp")); 1667 &movdqu ($inout4,&QWP(16*4,$inp)); 1668 &pxor ($inout2,&QWP(16*2,"esp")); 1669 &lea ($inp,&DWP(16*5,$inp)); 1670 &pxor ($inout3,&QWP(16*3,"esp")); 1671 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1672 &pxor ($inout4,$inout5); 1673 1674 &call ("_aesni_decrypt6"); 1675 1676 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1677 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1678 &xorps ($inout1,&QWP(16*1,"esp")); 1679 &xorps ($inout2,&QWP(16*2,"esp")); 1680 &movups (&QWP(16*0,$out),$inout0); # write output 1681 &xorps ($inout3,&QWP(16*3,"esp")); 1682 &movups (&QWP(16*1,$out),$inout1); 1683 &xorps ($inout4,$tweak); 1684 &movups (&QWP(16*2,$out),$inout2); 1685 &movups (&QWP(16*3,$out),$inout3); 1686 &movups (&QWP(16*4,$out),$inout4); 1687 &lea ($out,&DWP(16*5,$out)); 1688 &jmp (&label("xts_dec_done")); 1689 1690&set_label("xts_dec_one",16); 1691 &movups ($inout0,&QWP(16*0,$inp)); # load input 1692 &lea ($inp,&DWP(16*1,$inp)); 1693 &xorps ($inout0,$inout3); # input^=tweak 1694 if ($inline) 1695 { &aesni_inline_generate1("dec"); } 1696 else 1697 { &call ("_aesni_decrypt1"); } 1698 &xorps ($inout0,$inout3); # output^=tweak 1699 &movups (&QWP(16*0,$out),$inout0); # write output 1700 &lea ($out,&DWP(16*1,$out)); 1701 1702 &movdqa ($tweak,$inout3); # last tweak 1703 &jmp (&label("xts_dec_done")); 1704 1705&set_label("xts_dec_two",16); 1706 &movaps ($inout4,$tweak); # put aside last tweak 1707 1708 &movups ($inout0,&QWP(16*0,$inp)); # load input 1709 &movups ($inout1,&QWP(16*1,$inp)); 1710 &lea ($inp,&DWP(16*2,$inp)); 1711 &xorps ($inout0,$inout3); # input^=tweak 1712 &xorps ($inout1,$inout4); 1713 1714 &call ("_aesni_decrypt2"); 1715 1716 &xorps ($inout0,$inout3); # output^=tweak 1717 &xorps ($inout1,$inout4); 1718 &movups (&QWP(16*0,$out),$inout0); # write output 1719 &movups (&QWP(16*1,$out),$inout1); 1720 &lea ($out,&DWP(16*2,$out)); 1721 1722 &movdqa ($tweak,$inout4); # last tweak 1723 &jmp (&label("xts_dec_done")); 1724 1725&set_label("xts_dec_three",16); 1726 &movaps ($inout5,$tweak); # put aside last tweak 1727 &movups ($inout0,&QWP(16*0,$inp)); # load input 1728 &movups ($inout1,&QWP(16*1,$inp)); 1729 &movups ($inout2,&QWP(16*2,$inp)); 1730 &lea ($inp,&DWP(16*3,$inp)); 1731 &xorps ($inout0,$inout3); # input^=tweak 1732 &xorps ($inout1,$inout4); 1733 &xorps ($inout2,$inout5); 1734 1735 &call ("_aesni_decrypt3"); 1736 1737 &xorps ($inout0,$inout3); # output^=tweak 1738 &xorps ($inout1,$inout4); 1739 &xorps ($inout2,$inout5); 1740 &movups (&QWP(16*0,$out),$inout0); # write output 1741 &movups (&QWP(16*1,$out),$inout1); 1742 &movups (&QWP(16*2,$out),$inout2); 1743 &lea ($out,&DWP(16*3,$out)); 1744 1745 &movdqa ($tweak,$inout5); # last tweak 1746 &jmp (&label("xts_dec_done")); 1747 1748&set_label("xts_dec_four",16); 1749 &movaps ($inout4,$tweak); # put aside last tweak 1750 1751 &movups ($inout0,&QWP(16*0,$inp)); # load input 1752 &movups ($inout1,&QWP(16*1,$inp)); 1753 &movups ($inout2,&QWP(16*2,$inp)); 1754 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1755 &movups ($inout3,&QWP(16*3,$inp)); 1756 &lea ($inp,&DWP(16*4,$inp)); 1757 &xorps ($inout1,&QWP(16*1,"esp")); 1758 &xorps ($inout2,$inout5); 1759 &xorps ($inout3,$inout4); 1760 1761 &call ("_aesni_decrypt4"); 1762 1763 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1764 &xorps ($inout1,&QWP(16*1,"esp")); 1765 &xorps ($inout2,$inout5); 1766 &movups (&QWP(16*0,$out),$inout0); # write output 1767 &xorps ($inout3,$inout4); 1768 &movups (&QWP(16*1,$out),$inout1); 1769 &movups (&QWP(16*2,$out),$inout2); 1770 &movups (&QWP(16*3,$out),$inout3); 1771 &lea ($out,&DWP(16*4,$out)); 1772 1773 &movdqa ($tweak,$inout4); # last tweak 1774 &jmp (&label("xts_dec_done")); 1775 1776&set_label("xts_dec_done6x",16); # $tweak is pre-calculated 1777 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1778 &and ($len,15); 1779 &jz (&label("xts_dec_ret")); 1780 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1781 &jmp (&label("xts_dec_only_one_more")); 1782 1783&set_label("xts_dec_done",16); 1784 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1785 &pxor ($twtmp,$twtmp); 1786 &and ($len,15); 1787 &jz (&label("xts_dec_ret")); 1788 1789 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1790 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1791 &pshufd ($twres,$twtmp,0x13); 1792 &pxor ($twtmp,$twtmp); 1793 &movdqa ($twmask,&QWP(16*6,"esp")); 1794 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1795 &pand ($twres,$twmask); # isolate carry and residue 1796 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1797 &pxor ($tweak,$twres); 1798 1799&set_label("xts_dec_only_one_more"); 1800 &pshufd ($inout3,$twtmp,0x13); 1801 &movdqa ($inout4,$tweak); # put aside previous tweak 1802 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1803 &pand ($inout3,$twmask); # isolate carry and residue 1804 &pxor ($inout3,$tweak); 1805 1806 &mov ($key,$key_); # restore $key 1807 &mov ($rounds,$rounds_); # restore $rounds 1808 1809 &movups ($inout0,&QWP(0,$inp)); # load input 1810 &xorps ($inout0,$inout3); # input^=tweak 1811 if ($inline) 1812 { &aesni_inline_generate1("dec"); } 1813 else 1814 { &call ("_aesni_decrypt1"); } 1815 &xorps ($inout0,$inout3); # output^=tweak 1816 &movups (&QWP(0,$out),$inout0); # write output 1817 1818&set_label("xts_dec_steal"); 1819 &movz ($rounds,&BP(16,$inp)); 1820 &movz ($key,&BP(0,$out)); 1821 &lea ($inp,&DWP(1,$inp)); 1822 &mov (&BP(0,$out),&LB($rounds)); 1823 &mov (&BP(16,$out),&LB($key)); 1824 &lea ($out,&DWP(1,$out)); 1825 &sub ($len,1); 1826 &jnz (&label("xts_dec_steal")); 1827 1828 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1829 &mov ($key,$key_); # restore $key 1830 &mov ($rounds,$rounds_); # restore $rounds 1831 1832 &movups ($inout0,&QWP(0,$out)); # load input 1833 &xorps ($inout0,$inout4); # input^=tweak 1834 if ($inline) 1835 { &aesni_inline_generate1("dec"); } 1836 else 1837 { &call ("_aesni_decrypt1"); } 1838 &xorps ($inout0,$inout4); # output^=tweak 1839 &movups (&QWP(0,$out),$inout0); # write output 1840 1841&set_label("xts_dec_ret"); 1842 &pxor ("xmm0","xmm0"); # clear register bank 1843 &pxor ("xmm1","xmm1"); 1844 &pxor ("xmm2","xmm2"); 1845 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack 1846 &pxor ("xmm3","xmm3"); 1847 &movdqa (&QWP(16*1,"esp"),"xmm0"); 1848 &pxor ("xmm4","xmm4"); 1849 &movdqa (&QWP(16*2,"esp"),"xmm0"); 1850 &pxor ("xmm5","xmm5"); 1851 &movdqa (&QWP(16*3,"esp"),"xmm0"); 1852 &pxor ("xmm6","xmm6"); 1853 &movdqa (&QWP(16*4,"esp"),"xmm0"); 1854 &pxor ("xmm7","xmm7"); 1855 &movdqa (&QWP(16*5,"esp"),"xmm0"); 1856 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1857&function_end("${PREFIX}_xts_decrypt"); 1858} 1859} 1860 1861###################################################################### 1862# void $PREFIX_cbc_encrypt (const void *inp, void *out, 1863# size_t length, const AES_KEY *key, 1864# unsigned char *ivp,const int enc); 1865&function_begin("${PREFIX}_cbc_encrypt"); 1866 &mov ($inp,&wparam(0)); 1867 &mov ($rounds_,"esp"); 1868 &mov ($out,&wparam(1)); 1869 &sub ($rounds_,24); 1870 &mov ($len,&wparam(2)); 1871 &and ($rounds_,-16); 1872 &mov ($key,&wparam(3)); 1873 &mov ($key_,&wparam(4)); 1874 &test ($len,$len); 1875 &jz (&label("cbc_abort")); 1876 1877 &cmp (&wparam(5),0); 1878 &xchg ($rounds_,"esp"); # alloca 1879 &movups ($ivec,&QWP(0,$key_)); # load IV 1880 &mov ($rounds,&DWP(240,$key)); 1881 &mov ($key_,$key); # backup $key 1882 &mov (&DWP(16,"esp"),$rounds_); # save original %esp 1883 &mov ($rounds_,$rounds); # backup $rounds 1884 &je (&label("cbc_decrypt")); 1885 1886 &movaps ($inout0,$ivec); 1887 &cmp ($len,16); 1888 &jb (&label("cbc_enc_tail")); 1889 &sub ($len,16); 1890 &jmp (&label("cbc_enc_loop")); 1891 1892&set_label("cbc_enc_loop",16); 1893 &movups ($ivec,&QWP(0,$inp)); # input actually 1894 &lea ($inp,&DWP(16,$inp)); 1895 if ($inline) 1896 { &aesni_inline_generate1("enc",$inout0,$ivec); } 1897 else 1898 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); } 1899 &mov ($rounds,$rounds_); # restore $rounds 1900 &mov ($key,$key_); # restore $key 1901 &movups (&QWP(0,$out),$inout0); # store output 1902 &lea ($out,&DWP(16,$out)); 1903 &sub ($len,16); 1904 &jnc (&label("cbc_enc_loop")); 1905 &add ($len,16); 1906 &jnz (&label("cbc_enc_tail")); 1907 &movaps ($ivec,$inout0); 1908 &pxor ($inout0,$inout0); 1909 &jmp (&label("cbc_ret")); 1910 1911&set_label("cbc_enc_tail"); 1912 &mov ("ecx",$len); # zaps $rounds 1913 &data_word(0xA4F3F689); # rep movsb 1914 &mov ("ecx",16); # zero tail 1915 &sub ("ecx",$len); 1916 &xor ("eax","eax"); # zaps $len 1917 &data_word(0xAAF3F689); # rep stosb 1918 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block 1919 &mov ($rounds,$rounds_); # restore $rounds 1920 &mov ($inp,$out); # $inp and $out are the same 1921 &mov ($key,$key_); # restore $key 1922 &jmp (&label("cbc_enc_loop")); 1923###################################################################### 1924&set_label("cbc_decrypt",16); 1925 &cmp ($len,0x50); 1926 &jbe (&label("cbc_dec_tail")); 1927 &movaps (&QWP(0,"esp"),$ivec); # save IV 1928 &sub ($len,0x50); 1929 &jmp (&label("cbc_dec_loop6_enter")); 1930 1931&set_label("cbc_dec_loop6",16); 1932 &movaps (&QWP(0,"esp"),$rndkey0); # save IV 1933 &movups (&QWP(0,$out),$inout5); 1934 &lea ($out,&DWP(0x10,$out)); 1935&set_label("cbc_dec_loop6_enter"); 1936 &movdqu ($inout0,&QWP(0,$inp)); 1937 &movdqu ($inout1,&QWP(0x10,$inp)); 1938 &movdqu ($inout2,&QWP(0x20,$inp)); 1939 &movdqu ($inout3,&QWP(0x30,$inp)); 1940 &movdqu ($inout4,&QWP(0x40,$inp)); 1941 &movdqu ($inout5,&QWP(0x50,$inp)); 1942 1943 &call ("_aesni_decrypt6"); 1944 1945 &movups ($rndkey1,&QWP(0,$inp)); 1946 &movups ($rndkey0,&QWP(0x10,$inp)); 1947 &xorps ($inout0,&QWP(0,"esp")); # ^=IV 1948 &xorps ($inout1,$rndkey1); 1949 &movups ($rndkey1,&QWP(0x20,$inp)); 1950 &xorps ($inout2,$rndkey0); 1951 &movups ($rndkey0,&QWP(0x30,$inp)); 1952 &xorps ($inout3,$rndkey1); 1953 &movups ($rndkey1,&QWP(0x40,$inp)); 1954 &xorps ($inout4,$rndkey0); 1955 &movups ($rndkey0,&QWP(0x50,$inp)); # IV 1956 &xorps ($inout5,$rndkey1); 1957 &movups (&QWP(0,$out),$inout0); 1958 &movups (&QWP(0x10,$out),$inout1); 1959 &lea ($inp,&DWP(0x60,$inp)); 1960 &movups (&QWP(0x20,$out),$inout2); 1961 &mov ($rounds,$rounds_); # restore $rounds 1962 &movups (&QWP(0x30,$out),$inout3); 1963 &mov ($key,$key_); # restore $key 1964 &movups (&QWP(0x40,$out),$inout4); 1965 &lea ($out,&DWP(0x50,$out)); 1966 &sub ($len,0x60); 1967 &ja (&label("cbc_dec_loop6")); 1968 1969 &movaps ($inout0,$inout5); 1970 &movaps ($ivec,$rndkey0); 1971 &add ($len,0x50); 1972 &jle (&label("cbc_dec_clear_tail_collected")); 1973 &movups (&QWP(0,$out),$inout0); 1974 &lea ($out,&DWP(0x10,$out)); 1975&set_label("cbc_dec_tail"); 1976 &movups ($inout0,&QWP(0,$inp)); 1977 &movaps ($in0,$inout0); 1978 &cmp ($len,0x10); 1979 &jbe (&label("cbc_dec_one")); 1980 1981 &movups ($inout1,&QWP(0x10,$inp)); 1982 &movaps ($in1,$inout1); 1983 &cmp ($len,0x20); 1984 &jbe (&label("cbc_dec_two")); 1985 1986 &movups ($inout2,&QWP(0x20,$inp)); 1987 &cmp ($len,0x30); 1988 &jbe (&label("cbc_dec_three")); 1989 1990 &movups ($inout3,&QWP(0x30,$inp)); 1991 &cmp ($len,0x40); 1992 &jbe (&label("cbc_dec_four")); 1993 1994 &movups ($inout4,&QWP(0x40,$inp)); 1995 &movaps (&QWP(0,"esp"),$ivec); # save IV 1996 &movups ($inout0,&QWP(0,$inp)); 1997 &xorps ($inout5,$inout5); 1998 &call ("_aesni_decrypt6"); 1999 &movups ($rndkey1,&QWP(0,$inp)); 2000 &movups ($rndkey0,&QWP(0x10,$inp)); 2001 &xorps ($inout0,&QWP(0,"esp")); # ^= IV 2002 &xorps ($inout1,$rndkey1); 2003 &movups ($rndkey1,&QWP(0x20,$inp)); 2004 &xorps ($inout2,$rndkey0); 2005 &movups ($rndkey0,&QWP(0x30,$inp)); 2006 &xorps ($inout3,$rndkey1); 2007 &movups ($ivec,&QWP(0x40,$inp)); # IV 2008 &xorps ($inout4,$rndkey0); 2009 &movups (&QWP(0,$out),$inout0); 2010 &movups (&QWP(0x10,$out),$inout1); 2011 &pxor ($inout1,$inout1); 2012 &movups (&QWP(0x20,$out),$inout2); 2013 &pxor ($inout2,$inout2); 2014 &movups (&QWP(0x30,$out),$inout3); 2015 &pxor ($inout3,$inout3); 2016 &lea ($out,&DWP(0x40,$out)); 2017 &movaps ($inout0,$inout4); 2018 &pxor ($inout4,$inout4); 2019 &sub ($len,0x50); 2020 &jmp (&label("cbc_dec_tail_collected")); 2021 2022&set_label("cbc_dec_one",16); 2023 if ($inline) 2024 { &aesni_inline_generate1("dec"); } 2025 else 2026 { &call ("_aesni_decrypt1"); } 2027 &xorps ($inout0,$ivec); 2028 &movaps ($ivec,$in0); 2029 &sub ($len,0x10); 2030 &jmp (&label("cbc_dec_tail_collected")); 2031 2032&set_label("cbc_dec_two",16); 2033 &call ("_aesni_decrypt2"); 2034 &xorps ($inout0,$ivec); 2035 &xorps ($inout1,$in0); 2036 &movups (&QWP(0,$out),$inout0); 2037 &movaps ($inout0,$inout1); 2038 &pxor ($inout1,$inout1); 2039 &lea ($out,&DWP(0x10,$out)); 2040 &movaps ($ivec,$in1); 2041 &sub ($len,0x20); 2042 &jmp (&label("cbc_dec_tail_collected")); 2043 2044&set_label("cbc_dec_three",16); 2045 &call ("_aesni_decrypt3"); 2046 &xorps ($inout0,$ivec); 2047 &xorps ($inout1,$in0); 2048 &xorps ($inout2,$in1); 2049 &movups (&QWP(0,$out),$inout0); 2050 &movaps ($inout0,$inout2); 2051 &pxor ($inout2,$inout2); 2052 &movups (&QWP(0x10,$out),$inout1); 2053 &pxor ($inout1,$inout1); 2054 &lea ($out,&DWP(0x20,$out)); 2055 &movups ($ivec,&QWP(0x20,$inp)); 2056 &sub ($len,0x30); 2057 &jmp (&label("cbc_dec_tail_collected")); 2058 2059&set_label("cbc_dec_four",16); 2060 &call ("_aesni_decrypt4"); 2061 &movups ($rndkey1,&QWP(0x10,$inp)); 2062 &movups ($rndkey0,&QWP(0x20,$inp)); 2063 &xorps ($inout0,$ivec); 2064 &movups ($ivec,&QWP(0x30,$inp)); 2065 &xorps ($inout1,$in0); 2066 &movups (&QWP(0,$out),$inout0); 2067 &xorps ($inout2,$rndkey1); 2068 &movups (&QWP(0x10,$out),$inout1); 2069 &pxor ($inout1,$inout1); 2070 &xorps ($inout3,$rndkey0); 2071 &movups (&QWP(0x20,$out),$inout2); 2072 &pxor ($inout2,$inout2); 2073 &lea ($out,&DWP(0x30,$out)); 2074 &movaps ($inout0,$inout3); 2075 &pxor ($inout3,$inout3); 2076 &sub ($len,0x40); 2077 &jmp (&label("cbc_dec_tail_collected")); 2078 2079&set_label("cbc_dec_clear_tail_collected",16); 2080 &pxor ($inout1,$inout1); 2081 &pxor ($inout2,$inout2); 2082 &pxor ($inout3,$inout3); 2083 &pxor ($inout4,$inout4); 2084&set_label("cbc_dec_tail_collected"); 2085 &and ($len,15); 2086 &jnz (&label("cbc_dec_tail_partial")); 2087 &movups (&QWP(0,$out),$inout0); 2088 &pxor ($rndkey0,$rndkey0); 2089 &jmp (&label("cbc_ret")); 2090 2091&set_label("cbc_dec_tail_partial",16); 2092 &movaps (&QWP(0,"esp"),$inout0); 2093 &pxor ($rndkey0,$rndkey0); 2094 &mov ("ecx",16); 2095 &mov ($inp,"esp"); 2096 &sub ("ecx",$len); 2097 &data_word(0xA4F3F689); # rep movsb 2098 &movdqa (&QWP(0,"esp"),$inout0); 2099 2100&set_label("cbc_ret"); 2101 &mov ("esp",&DWP(16,"esp")); # pull original %esp 2102 &mov ($key_,&wparam(4)); 2103 &pxor ($inout0,$inout0); 2104 &pxor ($rndkey1,$rndkey1); 2105 &movups (&QWP(0,$key_),$ivec); # output IV 2106 &pxor ($ivec,$ivec); 2107&set_label("cbc_abort"); 2108&function_end("${PREFIX}_cbc_encrypt"); 2109 2110###################################################################### 2111# Mechanical port from aesni-x86_64.pl. 2112# 2113# _aesni_set_encrypt_key is private interface, 2114# input: 2115# "eax" const unsigned char *userKey 2116# $rounds int bits 2117# $key AES_KEY *key 2118# output: 2119# "eax" return code 2120# $round rounds 2121 2122&function_begin_B("_aesni_set_encrypt_key"); 2123 &push ("ebp"); 2124 &push ("ebx"); 2125 &test ("eax","eax"); 2126 &jz (&label("bad_pointer")); 2127 &test ($key,$key); 2128 &jz (&label("bad_pointer")); 2129 2130 &call (&label("pic")); 2131&set_label("pic"); 2132 &blindpop("ebx"); 2133 &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx")); 2134 2135 &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const")); 2136 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey 2137 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 2138 &mov ("ebp",&DWP(4,"ebp")); 2139 &lea ($key,&DWP(16,$key)); 2140 &and ("ebp",1<<28|1<<11); # AVX and XOP bits 2141 &cmp ($rounds,256); 2142 &je (&label("14rounds")); 2143 &cmp ($rounds,192); 2144 &je (&label("12rounds")); 2145 &cmp ($rounds,128); 2146 &jne (&label("bad_keybits")); 2147 2148&set_label("10rounds",16); 2149 &cmp ("ebp",1<<28); 2150 &je (&label("10rounds_alt")); 2151 2152 &mov ($rounds,9); 2153 &$movekey (&QWP(-16,$key),"xmm0"); # round 0 2154 &aeskeygenassist("xmm1","xmm0",0x01); # round 1 2155 &call (&label("key_128_cold")); 2156 &aeskeygenassist("xmm1","xmm0",0x2); # round 2 2157 &call (&label("key_128")); 2158 &aeskeygenassist("xmm1","xmm0",0x04); # round 3 2159 &call (&label("key_128")); 2160 &aeskeygenassist("xmm1","xmm0",0x08); # round 4 2161 &call (&label("key_128")); 2162 &aeskeygenassist("xmm1","xmm0",0x10); # round 5 2163 &call (&label("key_128")); 2164 &aeskeygenassist("xmm1","xmm0",0x20); # round 6 2165 &call (&label("key_128")); 2166 &aeskeygenassist("xmm1","xmm0",0x40); # round 7 2167 &call (&label("key_128")); 2168 &aeskeygenassist("xmm1","xmm0",0x80); # round 8 2169 &call (&label("key_128")); 2170 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 2171 &call (&label("key_128")); 2172 &aeskeygenassist("xmm1","xmm0",0x36); # round 10 2173 &call (&label("key_128")); 2174 &$movekey (&QWP(0,$key),"xmm0"); 2175 &mov (&DWP(80,$key),$rounds); 2176 2177 &jmp (&label("good_key")); 2178 2179&set_label("key_128",16); 2180 &$movekey (&QWP(0,$key),"xmm0"); 2181 &lea ($key,&DWP(16,$key)); 2182&set_label("key_128_cold"); 2183 &shufps ("xmm4","xmm0",0b00010000); 2184 &xorps ("xmm0","xmm4"); 2185 &shufps ("xmm4","xmm0",0b10001100); 2186 &xorps ("xmm0","xmm4"); 2187 &shufps ("xmm1","xmm1",0b11111111); # critical path 2188 &xorps ("xmm0","xmm1"); 2189 &ret(); 2190 2191&set_label("10rounds_alt",16); 2192 &movdqa ("xmm5",&QWP(0x00,"ebx")); 2193 &mov ($rounds,8); 2194 &movdqa ("xmm4",&QWP(0x20,"ebx")); 2195 &movdqa ("xmm2","xmm0"); 2196 &movdqu (&QWP(-16,$key),"xmm0"); 2197 2198&set_label("loop_key128"); 2199 &pshufb ("xmm0","xmm5"); 2200 &aesenclast ("xmm0","xmm4"); 2201 &pslld ("xmm4",1); 2202 &lea ($key,&DWP(16,$key)); 2203 2204 &movdqa ("xmm3","xmm2"); 2205 &pslldq ("xmm2",4); 2206 &pxor ("xmm3","xmm2"); 2207 &pslldq ("xmm2",4); 2208 &pxor ("xmm3","xmm2"); 2209 &pslldq ("xmm2",4); 2210 &pxor ("xmm2","xmm3"); 2211 2212 &pxor ("xmm0","xmm2"); 2213 &movdqu (&QWP(-16,$key),"xmm0"); 2214 &movdqa ("xmm2","xmm0"); 2215 2216 &dec ($rounds); 2217 &jnz (&label("loop_key128")); 2218 2219 &movdqa ("xmm4",&QWP(0x30,"ebx")); 2220 2221 &pshufb ("xmm0","xmm5"); 2222 &aesenclast ("xmm0","xmm4"); 2223 &pslld ("xmm4",1); 2224 2225 &movdqa ("xmm3","xmm2"); 2226 &pslldq ("xmm2",4); 2227 &pxor ("xmm3","xmm2"); 2228 &pslldq ("xmm2",4); 2229 &pxor ("xmm3","xmm2"); 2230 &pslldq ("xmm2",4); 2231 &pxor ("xmm2","xmm3"); 2232 2233 &pxor ("xmm0","xmm2"); 2234 &movdqu (&QWP(0,$key),"xmm0"); 2235 2236 &movdqa ("xmm2","xmm0"); 2237 &pshufb ("xmm0","xmm5"); 2238 &aesenclast ("xmm0","xmm4"); 2239 2240 &movdqa ("xmm3","xmm2"); 2241 &pslldq ("xmm2",4); 2242 &pxor ("xmm3","xmm2"); 2243 &pslldq ("xmm2",4); 2244 &pxor ("xmm3","xmm2"); 2245 &pslldq ("xmm2",4); 2246 &pxor ("xmm2","xmm3"); 2247 2248 &pxor ("xmm0","xmm2"); 2249 &movdqu (&QWP(16,$key),"xmm0"); 2250 2251 &mov ($rounds,9); 2252 &mov (&DWP(96,$key),$rounds); 2253 2254 &jmp (&label("good_key")); 2255 2256&set_label("12rounds",16); 2257 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey 2258 &cmp ("ebp",1<<28); 2259 &je (&label("12rounds_alt")); 2260 2261 &mov ($rounds,11); 2262 &$movekey (&QWP(-16,$key),"xmm0"); # round 0 2263 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 2264 &call (&label("key_192a_cold")); 2265 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3 2266 &call (&label("key_192b")); 2267 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5 2268 &call (&label("key_192a")); 2269 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6 2270 &call (&label("key_192b")); 2271 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8 2272 &call (&label("key_192a")); 2273 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9 2274 &call (&label("key_192b")); 2275 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11 2276 &call (&label("key_192a")); 2277 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12 2278 &call (&label("key_192b")); 2279 &$movekey (&QWP(0,$key),"xmm0"); 2280 &mov (&DWP(48,$key),$rounds); 2281 2282 &jmp (&label("good_key")); 2283 2284&set_label("key_192a",16); 2285 &$movekey (&QWP(0,$key),"xmm0"); 2286 &lea ($key,&DWP(16,$key)); 2287&set_label("key_192a_cold",16); 2288 &movaps ("xmm5","xmm2"); 2289&set_label("key_192b_warm"); 2290 &shufps ("xmm4","xmm0",0b00010000); 2291 &movdqa ("xmm3","xmm2"); 2292 &xorps ("xmm0","xmm4"); 2293 &shufps ("xmm4","xmm0",0b10001100); 2294 &pslldq ("xmm3",4); 2295 &xorps ("xmm0","xmm4"); 2296 &pshufd ("xmm1","xmm1",0b01010101); # critical path 2297 &pxor ("xmm2","xmm3"); 2298 &pxor ("xmm0","xmm1"); 2299 &pshufd ("xmm3","xmm0",0b11111111); 2300 &pxor ("xmm2","xmm3"); 2301 &ret(); 2302 2303&set_label("key_192b",16); 2304 &movaps ("xmm3","xmm0"); 2305 &shufps ("xmm5","xmm0",0b01000100); 2306 &$movekey (&QWP(0,$key),"xmm5"); 2307 &shufps ("xmm3","xmm2",0b01001110); 2308 &$movekey (&QWP(16,$key),"xmm3"); 2309 &lea ($key,&DWP(32,$key)); 2310 &jmp (&label("key_192b_warm")); 2311 2312&set_label("12rounds_alt",16); 2313 &movdqa ("xmm5",&QWP(0x10,"ebx")); 2314 &movdqa ("xmm4",&QWP(0x20,"ebx")); 2315 &mov ($rounds,8); 2316 &movdqu (&QWP(-16,$key),"xmm0"); 2317 2318&set_label("loop_key192"); 2319 &movq (&QWP(0,$key),"xmm2"); 2320 &movdqa ("xmm1","xmm2"); 2321 &pshufb ("xmm2","xmm5"); 2322 &aesenclast ("xmm2","xmm4"); 2323 &pslld ("xmm4",1); 2324 &lea ($key,&DWP(24,$key)); 2325 2326 &movdqa ("xmm3","xmm0"); 2327 &pslldq ("xmm0",4); 2328 &pxor ("xmm3","xmm0"); 2329 &pslldq ("xmm0",4); 2330 &pxor ("xmm3","xmm0"); 2331 &pslldq ("xmm0",4); 2332 &pxor ("xmm0","xmm3"); 2333 2334 &pshufd ("xmm3","xmm0",0xff); 2335 &pxor ("xmm3","xmm1"); 2336 &pslldq ("xmm1",4); 2337 &pxor ("xmm3","xmm1"); 2338 2339 &pxor ("xmm0","xmm2"); 2340 &pxor ("xmm2","xmm3"); 2341 &movdqu (&QWP(-16,$key),"xmm0"); 2342 2343 &dec ($rounds); 2344 &jnz (&label("loop_key192")); 2345 2346 &mov ($rounds,11); 2347 &mov (&DWP(32,$key),$rounds); 2348 2349 &jmp (&label("good_key")); 2350 2351&set_label("14rounds",16); 2352 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey 2353 &lea ($key,&DWP(16,$key)); 2354 &cmp ("ebp",1<<28); 2355 &je (&label("14rounds_alt")); 2356 2357 &mov ($rounds,13); 2358 &$movekey (&QWP(-32,$key),"xmm0"); # round 0 2359 &$movekey (&QWP(-16,$key),"xmm2"); # round 1 2360 &aeskeygenassist("xmm1","xmm2",0x01); # round 2 2361 &call (&label("key_256a_cold")); 2362 &aeskeygenassist("xmm1","xmm0",0x01); # round 3 2363 &call (&label("key_256b")); 2364 &aeskeygenassist("xmm1","xmm2",0x02); # round 4 2365 &call (&label("key_256a")); 2366 &aeskeygenassist("xmm1","xmm0",0x02); # round 5 2367 &call (&label("key_256b")); 2368 &aeskeygenassist("xmm1","xmm2",0x04); # round 6 2369 &call (&label("key_256a")); 2370 &aeskeygenassist("xmm1","xmm0",0x04); # round 7 2371 &call (&label("key_256b")); 2372 &aeskeygenassist("xmm1","xmm2",0x08); # round 8 2373 &call (&label("key_256a")); 2374 &aeskeygenassist("xmm1","xmm0",0x08); # round 9 2375 &call (&label("key_256b")); 2376 &aeskeygenassist("xmm1","xmm2",0x10); # round 10 2377 &call (&label("key_256a")); 2378 &aeskeygenassist("xmm1","xmm0",0x10); # round 11 2379 &call (&label("key_256b")); 2380 &aeskeygenassist("xmm1","xmm2",0x20); # round 12 2381 &call (&label("key_256a")); 2382 &aeskeygenassist("xmm1","xmm0",0x20); # round 13 2383 &call (&label("key_256b")); 2384 &aeskeygenassist("xmm1","xmm2",0x40); # round 14 2385 &call (&label("key_256a")); 2386 &$movekey (&QWP(0,$key),"xmm0"); 2387 &mov (&DWP(16,$key),$rounds); 2388 &xor ("eax","eax"); 2389 2390 &jmp (&label("good_key")); 2391 2392&set_label("key_256a",16); 2393 &$movekey (&QWP(0,$key),"xmm2"); 2394 &lea ($key,&DWP(16,$key)); 2395&set_label("key_256a_cold"); 2396 &shufps ("xmm4","xmm0",0b00010000); 2397 &xorps ("xmm0","xmm4"); 2398 &shufps ("xmm4","xmm0",0b10001100); 2399 &xorps ("xmm0","xmm4"); 2400 &shufps ("xmm1","xmm1",0b11111111); # critical path 2401 &xorps ("xmm0","xmm1"); 2402 &ret(); 2403 2404&set_label("key_256b",16); 2405 &$movekey (&QWP(0,$key),"xmm0"); 2406 &lea ($key,&DWP(16,$key)); 2407 2408 &shufps ("xmm4","xmm2",0b00010000); 2409 &xorps ("xmm2","xmm4"); 2410 &shufps ("xmm4","xmm2",0b10001100); 2411 &xorps ("xmm2","xmm4"); 2412 &shufps ("xmm1","xmm1",0b10101010); # critical path 2413 &xorps ("xmm2","xmm1"); 2414 &ret(); 2415 2416&set_label("14rounds_alt",16); 2417 &movdqa ("xmm5",&QWP(0x00,"ebx")); 2418 &movdqa ("xmm4",&QWP(0x20,"ebx")); 2419 &mov ($rounds,7); 2420 &movdqu (&QWP(-32,$key),"xmm0"); 2421 &movdqa ("xmm1","xmm2"); 2422 &movdqu (&QWP(-16,$key),"xmm2"); 2423 2424&set_label("loop_key256"); 2425 &pshufb ("xmm2","xmm5"); 2426 &aesenclast ("xmm2","xmm4"); 2427 2428 &movdqa ("xmm3","xmm0"); 2429 &pslldq ("xmm0",4); 2430 &pxor ("xmm3","xmm0"); 2431 &pslldq ("xmm0",4); 2432 &pxor ("xmm3","xmm0"); 2433 &pslldq ("xmm0",4); 2434 &pxor ("xmm0","xmm3"); 2435 &pslld ("xmm4",1); 2436 2437 &pxor ("xmm0","xmm2"); 2438 &movdqu (&QWP(0,$key),"xmm0"); 2439 2440 &dec ($rounds); 2441 &jz (&label("done_key256")); 2442 2443 &pshufd ("xmm2","xmm0",0xff); 2444 &pxor ("xmm3","xmm3"); 2445 &aesenclast ("xmm2","xmm3"); 2446 2447 &movdqa ("xmm3","xmm1"); 2448 &pslldq ("xmm1",4); 2449 &pxor ("xmm3","xmm1"); 2450 &pslldq ("xmm1",4); 2451 &pxor ("xmm3","xmm1"); 2452 &pslldq ("xmm1",4); 2453 &pxor ("xmm1","xmm3"); 2454 2455 &pxor ("xmm2","xmm1"); 2456 &movdqu (&QWP(16,$key),"xmm2"); 2457 &lea ($key,&DWP(32,$key)); 2458 &movdqa ("xmm1","xmm2"); 2459 &jmp (&label("loop_key256")); 2460 2461&set_label("done_key256"); 2462 &mov ($rounds,13); 2463 &mov (&DWP(16,$key),$rounds); 2464 2465&set_label("good_key"); 2466 &pxor ("xmm0","xmm0"); 2467 &pxor ("xmm1","xmm1"); 2468 &pxor ("xmm2","xmm2"); 2469 &pxor ("xmm3","xmm3"); 2470 &pxor ("xmm4","xmm4"); 2471 &pxor ("xmm5","xmm5"); 2472 &xor ("eax","eax"); 2473 &pop ("ebx"); 2474 &pop ("ebp"); 2475 &ret (); 2476 2477&set_label("bad_pointer",4); 2478 &mov ("eax",-1); 2479 &pop ("ebx"); 2480 &pop ("ebp"); 2481 &ret (); 2482&set_label("bad_keybits",4); 2483 &pxor ("xmm0","xmm0"); 2484 &mov ("eax",-2); 2485 &pop ("ebx"); 2486 &pop ("ebp"); 2487 &ret (); 2488&function_end_B("_aesni_set_encrypt_key"); 2489 2490# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, 2491# AES_KEY *key) 2492&function_begin_B("${PREFIX}_set_encrypt_key"); 2493 &record_function_hit(3); 2494 2495 &mov ("eax",&wparam(0)); 2496 &mov ($rounds,&wparam(1)); 2497 &mov ($key,&wparam(2)); 2498 &call ("_aesni_set_encrypt_key"); 2499 &ret (); 2500&function_end_B("${PREFIX}_set_encrypt_key"); 2501 2502# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, 2503# AES_KEY *key) 2504&function_begin_B("${PREFIX}_set_decrypt_key"); 2505 &mov ("eax",&wparam(0)); 2506 &mov ($rounds,&wparam(1)); 2507 &mov ($key,&wparam(2)); 2508 &call ("_aesni_set_encrypt_key"); 2509 &mov ($key,&wparam(2)); 2510 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key 2511 &test ("eax","eax"); 2512 &jnz (&label("dec_key_ret")); 2513 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule 2514 2515 &$movekey ("xmm0",&QWP(0,$key)); # just swap 2516 &$movekey ("xmm1",&QWP(0,"eax")); 2517 &$movekey (&QWP(0,"eax"),"xmm0"); 2518 &$movekey (&QWP(0,$key),"xmm1"); 2519 &lea ($key,&DWP(16,$key)); 2520 &lea ("eax",&DWP(-16,"eax")); 2521 2522&set_label("dec_key_inverse"); 2523 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse 2524 &$movekey ("xmm1",&QWP(0,"eax")); 2525 &aesimc ("xmm0","xmm0"); 2526 &aesimc ("xmm1","xmm1"); 2527 &lea ($key,&DWP(16,$key)); 2528 &lea ("eax",&DWP(-16,"eax")); 2529 &$movekey (&QWP(16,"eax"),"xmm0"); 2530 &$movekey (&QWP(-16,$key),"xmm1"); 2531 &cmp ("eax",$key); 2532 &ja (&label("dec_key_inverse")); 2533 2534 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle 2535 &aesimc ("xmm0","xmm0"); 2536 &$movekey (&QWP(0,$key),"xmm0"); 2537 2538 &pxor ("xmm0","xmm0"); 2539 &pxor ("xmm1","xmm1"); 2540 &xor ("eax","eax"); # return success 2541&set_label("dec_key_ret"); 2542 &ret (); 2543&function_end_B("${PREFIX}_set_decrypt_key"); 2544 2545&set_label("key_const",64); 2546&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d); 2547&data_word(0x04070605,0x04070605,0x04070605,0x04070605); 2548&data_word(1,1,1,1); 2549&data_word(0x1b,0x1b,0x1b,0x1b); 2550&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"); 2551 2552&asm_finish(); 2553 2554close STDOUT; 2555