#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
#
# Performance.
#
# To start with see corresponding paragraph in aesni-x86_64.pl...
# Instead of filling table similar to one found there I've chosen to
# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
# The simplified table below represents 32-bit performance relative
# to 64-bit one in every given point. Ratios vary for different
# encryption modes, therefore interval values.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
#	53-67%      67-84%      91-94%      95-98%      97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.

# January 2011
#
# See aesni-x86_64.pl for details.
# Unlike x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, performance results for 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.

# November 2015
#
# Add aesni_ocb_[en|de]crypt. [Removed in BoringSSL]

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB	OCB
# Westmere	3.77/1.37	1.37	1.52	1.27
# * Bridge	5.07/0.98	0.99	1.09	0.91	1.10
# Haswell	4.44/0.80	0.97	1.03	0.72	0.76
# Skylake	2.68/0.65	0.65	0.66	0.64	0.66
# Silvermont	5.77/3.56	3.67	4.03	3.46	4.03
# Goldmont	3.84/1.39	1.39	1.63	1.31	1.70
# Bulldozer	5.80/0.98	1.05	1.24	0.93	1.23

$PREFIX="aes_hw";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$AESNI_PREFIX="aes_hw";
$inline=1;		# inline _aesni_[en|de]crypt

# Make x86asm.pl loadable from this script's directory or from
# ../../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file. STDOUT is
# aliased to it, so a failed open must be fatal: otherwise everything
# printed afterwards would be lost without any diagnostic.
$output = pop;
open OUT,">$output" or die "can't open $output: $!";
*STDOUT=*OUT;

&asm_init($ARGV[0]);

&external_label("OPENSSL_ia32cap_P");
# Each directive is its own statement; without the terminating ';' the
# following '&' would be parsed as a bitwise AND of the two calls.
&preprocessor_ifdef("BORINGSSL_DISPATCH_TEST");
&external_label("BORINGSSL_function_hit");
&preprocessor_endif();
&static_label("key_const");

# Round keys are always moved with movups; both branches select the
# same instruction.
if ($PREFIX eq $AESNI_PREFIX)	{ $movekey=\&movups; }
else				{ $movekey=\&movups; }

# General-purpose register roles.
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx";	# backup copy for $rounds
$key_="ebp";	# backup copy for $key

# XMM register roles.
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5";	$in1="xmm5";
$inout4="xmm6";	$in0="xmm6";
$inout5="xmm7";	$ivec="xmm7";

# AESNI extension
#
# The AES-NI instructions are emitted as raw bytes. Only register-to-
# register forms on xmm0..xmm7 can be encoded here; anything else is a
# generator bug and is reported instead of silently emitting nothing.

# aeskeygenassist($dst,$src,$imm): 66 0F 3A DF /r ib
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
    else
    {	die "aeskeygenassist: unsupported operands '$dst','$src'";	}
}

# aescommon($opcodelet,$dst,$src): 66 0F 38 <opcodelet> /r
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);	}
    else
    {	die sprintf("aes opcode 0x%02x: unsupported operands '%s','%s'",
		    $opcodelet,$dst,$src);	}
}
sub aesimc	{ aescommon(0xdb,@_); }
sub aesenc	{ aescommon(0xdc,@_); }
sub aesenclast	{ aescommon(0xdd,@_); }

# Inline version of internal aesni_[en|de]crypt1
#
# aesni_inline_generate1($p,$inout,$ivec) emits a one-block round loop
# in place. $p selects the instruction family ("enc" gives aesenc and
# aesenclast), $inout defaults to $inout0, and when $ivec is given it is
# xor-ed with round key 0 and then into $inout instead of the plain
# round-key-0 xor. The emitted code advances $key and counts $rounds
# down to zero. $sn makes the loop label unique per expansion.
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  # Resolve the instruction emitters once, as code refs, the same way
  # $movekey is used; unlike a string eval this does not hide errors.
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};
  $sn++;

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    &xorps		($ivec,$rndkey0)	if (defined($ivec));
    &lea		($key,&DWP(32,$key));
    &xorps		($inout,$ivec)		if (defined($ivec));
    &xorps		($inout,$rndkey0)	if (!defined($ivec));
    &set_label("${p}1_loop_$sn");
	&$aes		($inout,$rndkey1);
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label("${p}1_loop_$sn"));
    &$aeslast		($inout,$rndkey1);
}}

# aesni_generate1($p,$inout) emits the out-of-line _aesni_${p}rypt1
# subroutine as a fully unrolled sequence. $rounds below 11 takes the
# short path through the "${p}128" label; otherwise four additional
# rounds are executed first. $inout defaults to $inout0.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x40,$key));
	# 192-bit key support was removed.

	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(-0x40,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(-0x30,$key));

	# 192-bit key support was removed.
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(-0x20,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(-0x10,$key));
    &set_label("${p}128");
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x10,$key));
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(0x20,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x30,$key));
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(0x40,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x50,$key));
	&$aes		($inout,$rndkey1);
	&$movekey	($rndkey1,&QWP(0x60,$key));
	&$aes		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x70,$key));
	&$aes		($inout,$rndkey1);
	&$aeslast	($inout,$rndkey0);
	&ret();
    &function_end_B("_aesni_${p}rypt1");
}

# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
	&record_function_hit(1);

	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));	# load the input block
	&mov	($rounds,&DWP(240,$key));	# round count stored at key+240
	&mov	("eax",&wparam(1));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&pxor	($rndkey0,$rndkey0);		# clear register bank
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,"eax"),$inout0);	# store the result
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_encrypt");

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e.
# when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge, but it's unfeasible to accommodate such implementation
# in XMM registers addressable in 32-bit mode and therefore maximum
# of 6x is used instead...
#
# All generators below take $p ("enc" selects aesenc/aesenclast) and
# emit _aesni_${p}rypt<N>, which processes $inout0..$inout<N-1> in
# parallel. The emitted code shifts $rounds left by 4, points $key past
# the end of the schedule and then uses the negated $rounds as an index
# that counts up to zero, so both registers are clobbered.
# The instruction emitters are resolved once as code refs (like
# $movekey) instead of string evals, so generator errors are not hidden.

sub aesni_generate2
{ my $p=shift;
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}2_loop");
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}2_loop"));
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aeslast	($inout0,$rndkey0);
	&$aeslast	($inout1,$rndkey0);
	&ret();
    &function_end_B("_aesni_${p}rypt2");
}

sub aesni_generate3
{ my $p=shift;
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}3_loop");
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&$aes		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}3_loop"));
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
	&$aeslast	($inout0,$rndkey0);
	&$aeslast	($inout1,$rndkey0);
	&$aeslast	($inout2,$rndkey0);
	&ret();
    &function_end_B("_aesni_${p}rypt3");
}

# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
sub aesni_generate4
{ my $p=shift;
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&data_byte	(0x0f,0x1f,0x40,0x00);
	&add		($rounds,16);

    &set_label("${p}4_loop");
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
	&$aes		($inout3,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&$aes		($inout2,$rndkey0);
	&$aes		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}4_loop"));

	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
	&$aes		($inout3,$rndkey1);
	&$aeslast	($inout0,$rndkey0);
	&$aeslast	($inout1,$rndkey0);
	&$aeslast	($inout2,$rndkey0);
	&$aeslast	($inout3,$rndkey0);
	&ret();
    &function_end_B("_aesni_${p}rypt4");
}

# 6x variant. Besides the normal entry it exposes the static label
# _aesni_${p}rypt6_enter, so a caller that has already issued the first
# round itself can join at the point where the next round key is loaded.
sub aesni_generate6
{ my $p=shift;
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	&$aes		($inout0,$rndkey1);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	&$aes		($inout1,$rndkey1);
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&$aes		($inout2,$rndkey1);
	&pxor		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
    &set_label("_aesni_${p}rypt6_inner");
	&$aes		($inout3,$rndkey1);
	&$aes		($inout4,$rndkey1);
	&$aes		($inout5,$rndkey1);
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&$aes		($inout0,$rndkey0);
	&$aes		($inout1,$rndkey0);
	&$aes		($inout2,$rndkey0);
	&$aes		($inout3,$rndkey0);
	&$aes		($inout4,$rndkey0);
	&$aes		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}6_loop"));

	&$aes		($inout0,$rndkey1);
	&$aes		($inout1,$rndkey1);
	&$aes		($inout2,$rndkey1);
	&$aes		($inout3,$rndkey1);
	&$aes		($inout4,$rndkey1);
	&$aes		($inout5,$rndkey1);
	&$aeslast	($inout0,$rndkey0);
	&$aeslast	($inout1,$rndkey0);
	&$aeslast	($inout2,$rndkey0);
	&$aeslast	($inout3,$rndkey0);
	&$aeslast	($inout4,$rndkey0);
	&$aeslast	($inout5,$rndkey0);
	&ret();
    &function_end_B("_aesni_${p}rypt6");
}
&aesni_generate2("enc") if ($PREFIX eq $AESNI_PREFIX);
&aesni_generate3("enc") if ($PREFIX eq $AESNI_PREFIX);
&aesni_generate4("enc") if ($PREFIX eq $AESNI_PREFIX);
&aesni_generate6("enc") if ($PREFIX eq $AESNI_PREFIX);

if ($PREFIX eq $AESNI_PREFIX) {

######################################################################
# void aes_hw_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# stack layout:
#	0	pshufb mask
#	16	vector addend: 0,6,6,6
#	32	counter-less ivec
#	48	1st triplet of counter vector
#	64	2nd triplet of counter vector
#	80	saved %esp

&function_begin("${PREFIX}_ctr32_encrypt_blocks");
	&record_function_hit(0);

	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($key_,"esp");
	&sub	("esp",88);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(80,"esp"),$key_);

	# a single block needs no counter vectors at all
	&cmp	($len,1);
	&je	(&label("ctr32_one_shortcut"));

	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds,6);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds);
	&mov	(&DWP(20,"esp"),$rounds);
	&mov	(&DWP(24,"esp"),$rounds);
	&mov	(&DWP(28,"esp"),$key_);

	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter

	&mov	($rounds,&DWP(240,$key));	# key->rounds

	# compose 2 vectors of 3x32-bit counters
	&bswap	($rounds_);
	&pxor	($rndkey0,$rndkey0);
	&pxor	($rndkey1,$rndkey1);
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
	&pinsrd	($rndkey0,$rounds_,0);
	&lea	($key_,&DWP(3,$rounds_));
	&pinsrd	($rndkey1,$key_,0);
	&inc	($rounds_);
	&pinsrd	($rndkey0,$rounds_,1);
	&inc	($key_);
	&pinsrd	($rndkey1,$key_,1);
	&inc	($rounds_);
	&pinsrd	($rndkey0,$rounds_,2);
	&inc	($key_);
	&pinsrd	($rndkey1,$key_,2);
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&movdqu	($inout4,&QWP(0,$key));		# key[0]
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap

	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
	&pshufd	($inout1,$rndkey0,2<<6);
	&cmp	($len,6);
	&jb	(&label("ctr32_tail"));
	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
	&mov	($key_,$key);			# backup $key
	&sub	($rounds_,$rounds);		# backup twisted $rounds
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($len,6);
	&jmp	(&label("ctr32_loop6"));

&set_label("ctr32_loop6",16);
	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
	&pshufd	($inout2,$rndkey0,1<<6);
	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
	&pshufd	($inout3,$rndkey1,3<<6);
	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
	&pshufd	($inout4,$rndkey1,2<<6);
	&pxor		($inout1,$rndkey0);
	&pshufd	($inout5,$rndkey1,1<<6);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&aesenc		($inout0,$rndkey1);
	&pxor		($inout4,$rndkey0);
	&pxor		($inout5,$rndkey0);
	&aesenc		($inout1,$rndkey1);
	&$movekey	($rndkey0,&QWP(32,$key_));
	&mov		($rounds,$rounds_);
	&aesenc		($inout2,$rndkey1);
	&aesenc		($inout3,$rndkey1);
	&aesenc		($inout4,$rndkey1);
	&aesenc		($inout5,$rndkey1);

	&call		(&label("_aesni_encrypt6_enter"));

	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
	&xorps	($inout2,$rndkey1);
	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);

	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask

	&movups	($inout1,&QWP(0x30,$inp));
	&movups	($inout2,&QWP(0x40,$inp));
	&xorps	($inout3,$inout1);
	&movups	($inout1,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&xorps	($inout4,$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&xorps	($inout5,$inout1);
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap
	&movups	(&QWP(0x40,$out),$inout4);
	&pshufd	($inout0,$rndkey0,3<<6);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));

	&pshufd	($inout1,$rndkey0,2<<6);
	&sub	($len,6);
	&jnc	(&label("ctr32_loop6"));

	&add	($len,6);
	&jz	(&label("ctr32_ret"));
	&movdqu	($inout5,&QWP(0,$key_));
	&mov	($key,$key_);
	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
	&mov	($rounds,&DWP(240,$key_));	# restore $rounds

&set_label("ctr32_tail");
	# 1..5 blocks remain; dispatch on $len
	&por	($inout0,$inout5);
	&cmp	($len,2);
	&jb	(&label("ctr32_one"));

	&pshufd	($inout2,$rndkey0,1<<6);
	&por	($inout1,$inout5);
	&je	(&label("ctr32_two"));

	&pshufd	($inout3,$rndkey1,3<<6);
	&por	($inout2,$inout5);
	&cmp	($len,4);
	&jb	(&label("ctr32_three"));

	&pshufd	($inout4,$rndkey1,2<<6);
	&por	($inout3,$inout5);
	&je	(&label("ctr32_four"));

	&por	($inout4,$inout5);
	&call	("_aesni_encrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout2,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout4,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_one_shortcut",16);
	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
	&mov	($rounds,&DWP(240,$key));

&set_label("ctr32_one");
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	($in0,&QWP(0,$inp));
	&xorps	($in0,$inout0);
	&movups	(&QWP(0,$out),$in0);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_two",16);
	&call	("_aesni_encrypt2");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_three",16);
	&call	("_aesni_encrypt3");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&movups	($inout5,&QWP(0x20,$inp));
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_four",16);
	&call	("_aesni_encrypt4");
	&movups	($inout4,&QWP(0,$inp));
	&movups	($inout5,&QWP(0x10,$inp));
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout0,$inout4);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout1,$inout5);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);

&set_label("ctr32_ret");
	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
	&pxor	("xmm5","xmm5");
	&movdqa	(&QWP(48,"esp"),"xmm0");
	&pxor	("xmm6","xmm6");
	&movdqa	(&QWP(64,"esp"),"xmm0");
	&pxor	("xmm7","xmm7");
	&mov	("esp",&DWP(80,"esp"));		# restore saved %esp
&function_end("${PREFIX}_ctr32_encrypt_blocks");
}

######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is a private interface.
# input:
#	"eax"	const unsigned char *userKey
#	$rounds	int bits
#	$key	AES_KEY *key
# output:
#	"eax"	return code
#	$rounds	rounds

# Emits the shift/xor cascade used by the pshufb/aesenclast code paths:
# on exit $acc holds acc ^ acc<<32 ^ acc<<64 ^ acc<<96 (pslldq shifts
# by bytes) and $tmp is clobbered.
my $key128_cascade = sub {
	my ($acc,$tmp)=@_;
	&movdqa		($tmp,$acc);
	foreach my $pass (1..2) {
		&pslldq	($acc,4);
		&pxor	($tmp,$acc);
	}
	&pslldq		($acc,4);
	&pxor		($acc,$tmp);
};

&function_begin_B("_aesni_set_encrypt_key");
	&push	("ebp");
	&push	("ebx");
	# reject NULL userKey or NULL key
	foreach my $ptr ("eax",$key) {
		&test	($ptr,$ptr);
		&jz	(&label("bad_pointer"));
	}

	# position-independent address of key_const in ebx
	&call	(&label("pic"));
&set_label("pic");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));

	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	&mov	("ebp",&DWP(4,"ebp"));
	&lea	($key,&DWP(16,$key));
	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
	&cmp	($rounds,256);
	&je	(&label("14rounds"));
	# 192-bit key support was removed.
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

&set_label("10rounds",16);
	# AVX set with XOP clear selects the alternative schedule
	&cmp		("ebp",1<<28);
	&je		(&label("10rounds_alt"));

	&mov		($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
	&call		(&label("key_128_cold"));
	# rounds 2..10, one round constant each
	foreach my $rcon (0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,0x36) {
		&aeskeygenassist("xmm1","xmm0",$rcon);
		&call		(&label("key_128"));
	}
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(80,$key),$rounds);

	&jmp	(&label("good_key"));

	# key_128 stores the finished round key and advances $key, then
	# falls into key_128_cold, which derives the next one from xmm0
	# and the aeskeygenassist result in xmm1.
&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_128_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("10rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x00,"ebx"));
	&mov		($rounds,8);
	&movdqa		("xmm4",&QWP(0x20,"ebx"));
	&movdqa		("xmm2","xmm0");
	&movdqu		(&QWP(-16,$key),"xmm0");

&set_label("loop_key128");
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld		("xmm4",1);
	&lea		($key,&DWP(16,$key));

	&$key128_cascade("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(-16,$key),"xmm0");
	&movdqa		("xmm2","xmm0");

	&dec		($rounds);
	&jnz		(&label("loop_key128"));

	# the last two round keys take their constant from key_const+0x30
	&movdqa		("xmm4",&QWP(0x30,"ebx"));

	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld		("xmm4",1);

	&$key128_cascade("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(0,$key),"xmm0");

	&movdqa		("xmm2","xmm0");
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");

	&$key128_cascade("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(16,$key),"xmm0");

	&mov		($rounds,9);
	&mov		(&DWP(96,$key),$rounds);

	&jmp	(&label("good_key"));

# 192-bit key support was removed.

&set_label("14rounds",16);
	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&lea		($key,&DWP(16,$key));
	&cmp		("ebp",1<<28);
	&je		(&label("14rounds_alt"));

	&mov		($rounds,13);
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
	&call		(&label("key_256a_cold"));
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
	&call		(&label("key_256b"));
	# rounds 4..13: each round constant serves an a/b pair
	foreach my $rcon (0x02,0x04,0x08,0x10,0x20) {
		&aeskeygenassist("xmm1","xmm2",$rcon);
		&call		(&label("key_256a"));
		&aeskeygenassist("xmm1","xmm0",$rcon);
		&call		(&label("key_256b"));
	}
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
	&call		(&label("key_256a"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(16,$key),$rounds);
	&xor		("eax","eax");

	&jmp	(&label("good_key"));

	# key_256a stores xmm2 and advances $key, then falls into
	# key_256a_cold, which updates xmm0.
&set_label("key_256a",16);
	&$movekey	(&QWP(0,$key),"xmm2");
	&lea		($key,&DWP(16,$key));
&set_label("key_256a_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

# Emits the shift/xor cascade of the 14rounds_alt path: on exit $acc
# holds acc ^ acc<<32 ^ acc<<64 ^ acc<<96 (pslldq shifts by bytes) and
# $tmp is clobbered.
my $key256_cascade = sub {
	my ($acc,$tmp)=@_;
	&movdqa		($tmp,$acc);
	foreach my $pass (1..2) {
		&pslldq	($acc,4);
		&pxor	($tmp,$acc);
	}
	&pslldq		($acc,4);
	&pxor		($acc,$tmp);
};

# Common tail of every exit path: restore the two saved registers.
my $key_epilogue = sub {
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
};

	# key_256b stores xmm0, advances $key and updates xmm2.
&set_label("key_256b",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));

	&shufps		("xmm4","xmm2",0b00010000);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm4","xmm2",0b10001100);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm1","xmm1",0b10101010);	# critical path
	&xorps		("xmm2","xmm1");
	&ret();

&set_label("14rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x00,"ebx"));
	&movdqa		("xmm4",&QWP(0x20,"ebx"));
	&mov		($rounds,7);
	&movdqu		(&QWP(-32,$key),"xmm0");
	&movdqa		("xmm1","xmm2");
	&movdqu		(&QWP(-16,$key),"xmm2");

&set_label("loop_key256");
	# first round key of the pair, kept in xmm0
	&pshufb		("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");

	&$key256_cascade("xmm0","xmm3");
	&pslld		("xmm4",1);

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(0,$key),"xmm0");

	&dec		($rounds);
	&jz		(&label("done_key256"));

	# second round key of the pair, kept in xmm1/xmm2
	&pshufd		("xmm2","xmm0",0xff);
	&pxor		("xmm3","xmm3");
	&aesenclast	("xmm2","xmm3");

	&$key256_cascade("xmm1","xmm3");

	&pxor		("xmm2","xmm1");
	&movdqu		(&QWP(16,$key),"xmm2");
	&lea		($key,&DWP(32,$key));
	&movdqa		("xmm1","xmm2");
	&jmp		(&label("loop_key256"));

&set_label("done_key256");
	&mov		($rounds,13);
	&mov		(&DWP(16,$key),$rounds);

&set_label("good_key");
	# scrub xmm0..xmm5, then return 0
	foreach my $n (0..5) {
		&pxor	("xmm$n","xmm$n");
	}
	&xor	("eax","eax");
	&$key_epilogue();

&set_label("bad_pointer",4);
	&mov	("eax",-1);
	&$key_epilogue();
&set_label("bad_keybits",4);
	&pxor	("xmm0","xmm0");
	&mov	("eax",-2);
	&$key_epilogue();
&function_end_B("_aesni_set_encrypt_key");

# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public wrapper: loads the three stack arguments into the registers
# the private routine expects and returns its result in eax.
&function_begin_B("${PREFIX}_set_encrypt_key");
	&record_function_hit(3);

	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");

# Constant rows read via ebx by the *_alt key schedules:
# +0x00 pshufb mask, +0x10 second mask, +0x20 and +0x30 constants
# loaded into xmm4 for aesenclast.
&set_label("key_const",64);
&data_word((0x0c0f0e0d) x 4);
&data_word((0x04070605) x 4);
&data_word((1) x 4);
&data_word((0x1b) x 4);
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

close STDOUT or die "error closing STDOUT: $!";