#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
#
# Performance.
#
# To start with see corresponding paragraph in aesni-x86_64.pl...
# Instead of filling table similar to one found there I've chosen to
# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
# The simplified table below represents 32-bit performance relative
# to 64-bit one in every given point. Ratios vary for different
# encryption modes, therefore interval values.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
#	53-67%      67-84%      91-94%      95-98%      97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.

# January 2011
#
# See aesni-x86_64.pl for details.
# Unlike x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, performance results for 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.

# November 2015
#
# Add aesni_ocb_[en|de]crypt. [Removed in BoringSSL]

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB	OCB
# Westmere	3.77/1.37	1.37	1.52	1.27
# * Bridge	5.07/0.98	0.99	1.09	0.91	1.10
# Haswell	4.44/0.80	0.97	1.03	0.72	0.76
# Skylake	2.68/0.65	0.65	0.66	0.64	0.66
# Silvermont	5.77/3.56	3.67	4.03	3.46	4.03
# Goldmont	3.84/1.39	1.39	1.63	1.31	1.70
# Bulldozer	5.80/0.98	1.05	1.24	0.93	1.23

$PREFIX="GFp_aes_hw";		# if $PREFIX is set to "AES", the script
				# generates drop-in replacement for
				# crypto/aes/asm/aes-586.pl:-)
$AESNI_PREFIX="GFp_aes_hw";
$inline=1;			# inline _aesni_[en|de]crypt

# Make x86asm.pl loadable both from the script's own directory and from
# ../../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  Everything the
# generator prints goes to STDOUT, so STDOUT is aliased to that file.
# Fail immediately, and with the real reason, when the file cannot be
# created; printing into an unopened handle would otherwise only be
# noticed by the final "close STDOUT".  The explicit defined() test is
# needed because three-argument open with an undef file name would
# quietly create an anonymous temporary file.
$output = pop;
defined($output) or die "$0: missing output file argument\n";
open OUT,">",$output or die "can't open $output: $!";
*STDOUT=*OUT;

&asm_init($ARGV[0]);

&external_label("GFp_ia32cap_P");
&static_label("key_const");

# Round keys are moved with movups whatever $PREFIX is (both arms of the
# former if/else selected the same instruction).
$movekey=\&movups;

# general purpose register assignment
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx";	# backup copy for $rounds
$key_="ebp";	# backup copy for $key

# XMM register assignment; $in1, $in0 (and $ivec below) are second names
# for $inout3, $inout4 (and $inout5)
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5";	$in1="xmm5";
$inout4="xmm6";	$in0="xmm6";
$inout5="xmm7";
$ivec="xmm7";

# AESNI extension
#
# The helpers below emit AES-NI instructions as raw opcode bytes through
# data_byte.  Only the register-to-register form with xmm0-xmm7 operands
# can be encoded.  Any other operand pair is a bug in the generator and
# is reported with die; previously such a call emitted nothing at all,
# silently dropping the instruction from the output.

# aeskeygenassist $dst,$src,$imm  ->  66 0F 3A DF ModR/M imm8
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    ("$dst:$src" =~ /\Axmm([0-7]):xmm([0-7])\z/)
	or die "aeskeygenassist: unsupported operands '$dst,$src'";
    &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);
}

# Shared encoder for the 66 0F 38 xx family; $opcodelet is the final
# opcode byte, ModR/M is 0xc0|dst<<3|src.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    ("$dst:$src" =~ /\Axmm([0-7]):xmm([0-7])\z/)
	or die sprintf("aescommon(0x%02x): unsupported operands '%s,%s'",
		       $opcodelet,$dst,$src);
    &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);
}
sub aesimc	{ aescommon(0xdb,@_); }
sub aesenc	{ aescommon(0xdc,@_); }
sub aesenclast	{ aescommon(0xdd,@_); }

# Inline version of internal aesni_[en|de]crypt1
#
#	$p	selects the instruction pair aes${p}/aes${p}last; every
#		caller in this file passes "enc"
#	$inout	register holding the block, $inout0 when omitted
#	$ivec	optional register; when given, round key 0 is xor-ed into
#		it and the result is xor-ed into $inout (so $ivec itself
#		is modified)
#
# The emitted code advances $key and counts $rounds down to zero.  $sn
# numbers the expansions so that each one gets its own loop label.
#
# aes${p} is called by name instead of through a string eval, so that an
# error raised by the encoder propagates rather than being discarded
# together with $@.
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  $sn++;

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    &xorps		($ivec,$rndkey0)	if (defined($ivec));
    &lea		($key,&DWP(32,$key));
    &xorps		($inout,$ivec)		if (defined($ivec));
    &xorps		($inout,$rndkey0)	if (!defined($ivec));
    &set_label("${p}1_loop_$sn");
	&{"aes${p}"}	($inout,$rndkey1);
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label("${p}1_loop_$sn"));	# tests the result of dec
    &{"aes${p}last"}	($inout,$rndkey1);
}}

# Generates the out-of-line _aesni_${p}rypt1 subroutine (only requested
# below when $inline is false).  $inout defaults to $inout0.
sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));

  # One unrolled step: apply the round key held in $rk, then, when an
  # offset is given, reload $rk from $off($key).
  my $step = sub {
	my ($rk,$off)=@_;
	&{"aes${p}"}	($inout,$rk);
	&$movekey	($rk,&QWP($off,$key))	if (defined($off));
  };

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x40,$key));
	# 192-bit key support was removed.

	# four additional steps, executed only when $rounds >= 11
	$step->($rndkey1,-0x40);
	$step->($rndkey0,-0x30);
	# 192-bit key support was removed.
	$step->($rndkey1,-0x20);
	$step->($rndkey0,-0x10);
    &set_label("${p}128");
	$step->($rndkey1,0);
	$step->($rndkey0,0x10);
	$step->($rndkey1,0x20);
	$step->($rndkey0,0x30);
	$step->($rndkey1,0x40);
	$step->($rndkey0,0x50);
	$step->($rndkey1,0x60);
	$step->($rndkey0,0x70);
	$step->($rndkey1);			# nothing left to reload
    &{"aes${p}last"}	($inout,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}

# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Encrypts the single 16-byte block at inp and stores the result at out.
# The round count is read from offset 240 of *key.
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&pxor	($rndkey0,$rndkey0);		# clear register bank
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_encrypt");

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e.
# when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge, but it's unfeasible to accommodate such implementation
# in XMM registers addressable in 32-bit mode and therefore maximum
# of 6x is used instead...

# Emit "$insn reg,$rk" once for every register in @regs, in list order.
# The call is assembled as a string and eval'ed, which is how the
# instruction name is selected at generation time.
sub _aesni_on_each
{ my ($insn,$rk,@regs)=@_;
    foreach my $reg (@regs)
    {	eval"&${insn}	($reg,$rk)";	}
}

# Shared body of the 2x and 3x subroutines.  $n (2 or 3) is the number
# of blocks, which live in the first $n registers of $inout0..$inout2.
#
# In all interleaved subroutines $rounds is scaled by 16, $key is moved
# to 32 bytes past (key + scaled rounds) and $rounds is negated, so the
# round keys are addressed as ($key,$rounds) while $rounds climbs in
# steps of 32 until the add leaves it at zero.
sub _aesni_generate_2or3
{ my ($p,$n)=@_;
  my @blk=($inout0,$inout1,$inout2)[0..$n-1];
  my ($head,@tail)=@blk;
  my $fn="_aesni_${p}rypt${n}";
  my $loop="${p}${n}_loop";

    &function_begin_B($fn);
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($head,$rndkey0);
	foreach my $reg (@tail)
	{   &pxor	($reg,$rndkey0);	}
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label($loop);
	&_aesni_on_each	("aes${p}",$rndkey1,@blk);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&_aesni_on_each	("aes${p}",$rndkey0,@blk);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label($loop));
    &_aesni_on_each	("aes${p}",$rndkey1,@blk);
    &_aesni_on_each	("aes${p}last",$rndkey0,@blk);
    &ret();
    &function_end_B($fn);
}

# _aesni_${p}rypt2: two blocks in $inout0,$inout1
sub aesni_generate2
{ my $p=shift;
    &_aesni_generate_2or3($p,2);
}

# _aesni_${p}rypt3: three blocks in $inout0..$inout2
sub aesni_generate3
{ my $p=shift;
    &_aesni_generate_2or3($p,3);
}

# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
sub aesni_generate4
{ my $p=shift;
  my @blk=($inout0,$inout1,$inout2,$inout3);
  my ($head,@tail)=@blk;

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($head,$rndkey0);
	foreach my $reg (@tail)
	{   &pxor	($reg,$rndkey0);	}
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&data_byte	(0x0f,0x1f,0x40,0x00);	# 4-byte nop
	&add		($rounds,16);

    &set_label("${p}4_loop");
	&_aesni_on_each	("aes${p}",$rndkey1,@blk);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&_aesni_on_each	("aes${p}",$rndkey0,@blk);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}4_loop"));

    &_aesni_on_each	("aes${p}",$rndkey1,@blk);
    &_aesni_on_each	("aes${p}last",$rndkey0,@blk);
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}

# _aesni_${p}rypt6: six blocks in $inout0..$inout5.  Besides the normal
# entry there is a second one, _aesni_${p}rypt6_enter, for callers that
# have already performed the round-0 xor and the first aes${p} round on
# all six blocks and have loaded $rndkey0 themselves (the ctr32 loop
# below does this).
sub aesni_generate6
{ my $p=shift;
  my @lo=($inout0,$inout1,$inout2);
  my @hi=($inout3,$inout4,$inout5);
  my $op="aes${p}";

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	&_aesni_on_each	($op,$rndkey1,$inout0);
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	&_aesni_on_each	($op,$rndkey1,$inout1);
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&_aesni_on_each	($op,$rndkey1,$inout2);
	&pxor		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
	&_aesni_on_each	($op,$rndkey1,@lo);
    &set_label("_aesni_${p}rypt6_inner");
	&_aesni_on_each	($op,$rndkey1,@hi);
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&_aesni_on_each	($op,$rndkey0,@lo,@hi);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}6_loop"));

    &_aesni_on_each	($op,$rndkey1,@lo,@hi);
    &_aesni_on_each	("${op}last",$rndkey0,@lo,@hi);
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}

# The interleaved subroutines are only generated for the AES-NI prefix.
if ($PREFIX eq $AESNI_PREFIX) {
    &aesni_generate2("enc");
    &aesni_generate3("enc");
    &aesni_generate4("enc");
    &aesni_generate6("enc");
}

if ($PREFIX eq $AESNI_PREFIX) {

######################################################################
# void aes_hw_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# stack layout:
#	0	pshufb mask
#	16	vector addend: 0,6,6,6
#	32	counter-less ivec
#	48	1st triplet of counter vector
#	64	2nd triplet of counter vector
#	80	saved %esp

&function_begin("${PREFIX}_ctr32_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));		# ivec pointer for now
	&mov	($key_,"esp");
	&sub	("esp",88);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(80,"esp"),$key_);		# keep caller's %esp

	# a single block needs none of the counter machinery
	&cmp	($len,1);
	&je	(&label("ctr32_one_shortcut"));

	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec

	# compose byte-swap control mask for pshufb on stack
	{ my @mask=(0x0c0d0e0f,0x08090a0b,0x04050607,0x00010203);
	  foreach my $i (0..$#mask)
	  {	&mov	(&DWP(4*$i,"esp"),$mask[$i]);	}
	}

	# compose counter increment vector on stack
	&mov	($rounds,6);
	&xor	($key_,$key_);
	foreach my $off (16,20,24)
	{   &mov	(&DWP($off,"esp"),$rounds);	}
	&mov	(&DWP(28,"esp"),$key_);

	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter

	&mov	($rounds,&DWP(240,$key));	# key->rounds

	# compose 2 vectors of 3x32-bit counters: $rndkey0 receives
	# counter+0..2 and $rndkey1 counter+3..5, one dword lane each
	&bswap	($rounds_);
	&pxor	($rndkey0,$rndkey0);
	&pxor	($rndkey1,$rndkey1);
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
	&pinsrd	($rndkey0,$rounds_,0);
	&lea	($key_,&DWP(3,$rounds_));
	&pinsrd	($rndkey1,$key_,0);
	foreach my $lane (1,2)
	{   &inc	($rounds_);
	    &pinsrd	($rndkey0,$rounds_,$lane);
	    &inc	($key_);
	    &pinsrd	($rndkey1,$key_,$lane);
	}
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&movdqu	($inout4,&QWP(0,$key));		# key[0]
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap

	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
	&pshufd	($inout1,$rndkey0,2<<6);
	&cmp	($len,6);
	&jb	(&label("ctr32_tail"));
	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
	&mov	($key_,$key);			# backup $key
	&sub	($rounds_,$rounds);		# backup twisted $rounds
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($len,6);
	&jmp	(&label("ctr32_loop6"));

# main loop, six blocks per iteration
&set_label("ctr32_loop6",16);
	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
	&pshufd	($inout2,$rndkey0,1<<6);
	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
	&pshufd	($inout3,$rndkey1,3<<6);
	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
	&pshufd	($inout4,$rndkey1,2<<6);
	&pxor		($inout1,$rndkey0);
	&pshufd	($inout5,$rndkey1,1<<6);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&aesenc		($inout0,$rndkey1);
	&pxor		($inout4,$rndkey0);
	&pxor		($inout5,$rndkey0);
	&aesenc		($inout1,$rndkey1);
	&$movekey	($rndkey0,&QWP(32,$key_));
	&mov		($rounds,$rounds_);
	&aesenc		($inout2,$rndkey1);
	&aesenc		($inout3,$rndkey1);
	&aesenc		($inout4,$rndkey1);
	&aesenc		($inout5,$rndkey1);

	&call		(&label("_aesni_encrypt6_enter"));

	# xor the key stream into blocks 0..2 and store them, while
	# starting on the counters for the next iteration
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
	&xorps	($inout2,$rndkey1);
	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);

	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask

	# blocks 3..5
	&movups	($inout1,&QWP(0x30,$inp));
	&movups	($inout2,&QWP(0x40,$inp));
	&xorps	($inout3,$inout1);
	&movups	($inout1,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&xorps	($inout4,$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&xorps	($inout5,$inout1);
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap
	&movups	(&QWP(0x40,$out),$inout4);
	&pshufd	($inout0,$rndkey0,3<<6);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));

	&pshufd	($inout1,$rndkey0,2<<6);
	&sub	($len,6);
	&jnc	(&label("ctr32_loop6"));

	# fewer than six blocks left: undo the last subtraction and
	# restore what the tail code expects
	&add	($len,6);
	&jz	(&label("ctr32_ret"));
	&movdqu	($inout5,&QWP(0,$key_));
	&mov	($key,$key_);
	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
	&mov	($rounds,&DWP(240,$key_));	# restore $rounds

# 1..5 remaining blocks, dispatched on $len
&set_label("ctr32_tail");
	&por	($inout0,$inout5);
	&cmp	($len,2);
	&jb	(&label("ctr32_one"));

	&pshufd	($inout2,$rndkey0,1<<6);
	&por	($inout1,$inout5);
	&je	(&label("ctr32_two"));

	&pshufd	($inout3,$rndkey1,3<<6);
	&por	($inout2,$inout5);
	&cmp	($len,4);
	&jb	(&label("ctr32_three"));

	&pshufd	($inout4,$rndkey1,2<<6);
	&por	($inout3,$inout5);
	&je	(&label("ctr32_four"));

	# five blocks: run the 6x subroutine and store five results
	&por	($inout4,$inout5);
	&call	("_aesni_encrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout2,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout4,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ctr32_ret"));

# len==1 on entry: $rounds_ still points at ivec
&set_label("ctr32_one_shortcut",16);
	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
	&mov	($rounds,&DWP(240,$key));

&set_label("ctr32_one");
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	($in0,&QWP(0,$inp));
	&xorps	($in0,$inout0);
	&movups	(&QWP(0,$out),$in0);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_two",16);
	&call	("_aesni_encrypt2");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_three",16);
	&call	("_aesni_encrypt3");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&movups	($inout5,&QWP(0x20,$inp));
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_four",16);
	&call	("_aesni_encrypt4");
	&movups	($inout4,&QWP(0,$inp));
	&movups	($inout5,&QWP(0x10,$inp));
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout0,$inout4);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout1,$inout5);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);

# common exit: wipe the XMM registers and the three stack slots that
# held the ivec and the counters, then restore the caller's %esp
&set_label("ctr32_ret");
	foreach my $reg (map { "xmm$_" } (0..4))	# clear register bank
	{   &pxor	($reg,$reg);	}
	foreach my $slot ([32,"xmm5"],[48,"xmm6"],[64,"xmm7"])
	{   my ($off,$reg)=@$slot;
	    &movdqa	(&QWP($off,"esp"),"xmm0");	# clear stack
	    &pxor	($reg,$reg);
	}
	&mov	("esp",&DWP(80,"esp"));
&function_end("${PREFIX}_ctr32_encrypt_blocks");
}

######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
#	"eax"	const unsigned char *userKey
#	$rounds	int bits
#	$key	AES_KEY *key
# output:
#	"eax"	return code: 0 on success, -1 if either pointer is
#		NULL, -2 if bits is neither 128 nor 256
#	$rounds	rounds

# Generation-time helpers for instruction groups that recur below.

# One expansion step of the aeskeygenassist-based schedule: fold the
# previous round key in $acc with shifted copies of itself (via xmm4)
# and with the dword of xmm1 selected by $bcast.
my $mix = sub {
	my ($acc,$bcast)=@_;
	&shufps		("xmm4",$acc,0b00010000);
	&xorps		($acc,"xmm4");
	&shufps		("xmm4",$acc,0b10001100);
	&xorps		($acc,"xmm4");
	&shufps		("xmm1","xmm1",$bcast);		# critical path
	&xorps		($acc,"xmm1");
};

# Used by the "_alt" schedules: xor $reg with itself shifted left by 4,
# 8 and 12 bytes; xmm3 is used as scratch.
my $spread = sub {
	my ($reg)=@_;
	&movdqa		("xmm3",$reg);
	foreach my $pass (1,2)
	{   &pslldq	($reg,4);
	    &pxor	("xmm3",$reg);
	}
	&pslldq		($reg,4);
	&pxor		($reg,"xmm3");
};

&function_begin_B("_aesni_set_encrypt_key");
	&push	("ebp");
	&push	("ebx");
	&test	("eax","eax");
	&jz	(&label("bad_pointer"));
	&test	($key,$key);
	&jz	(&label("bad_pointer"));

	# point ebx at key_const in a position-independent way
	&call	(&label("pic"));
&set_label("pic");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));

	&picmeup("ebp","GFp_ia32cap_P","ebx",&label("key_const"));
	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	&mov	("ebp",&DWP(4,"ebp"));
	&lea	($key,&DWP(16,$key));
	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
	&cmp	($rounds,256);
	&je	(&label("14rounds"));
	# 192-bit key support was removed.
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

# 128-bit key
&set_label("10rounds",16);
	&cmp		("ebp",1<<28);		# AVX bit set, XOP bit clear?
	&je		(&label("10rounds_alt"));

	&mov		($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	# Rounds 1..10, one round constant each.  The first call enters
	# at key_128_cold, i.e. past the store in key_128, because round
	# 0 has just been written above.
	{ my $entry="key_128_cold";
	  foreach my $rcon (0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,0x36)
	  {	&aeskeygenassist("xmm1","xmm0",$rcon);
		&call		(&label($entry));
		$entry="key_128";
	  }
	}
	&$movekey	(&QWP(0,$key),"xmm0");		# round 10
	&mov		(&DWP(80,$key),$rounds);

	&jmp	(&label("good_key"));

# store the key computed so far, advance $key, compute the next one
&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_128_cold");
	$mix->("xmm0",0b11111111);
	&ret();

# 128-bit schedule built from pshufb+aesenclast instead of
# aeskeygenassist; xmm5 holds the mask at key_const+0x00 and xmm4 the
# constant vector at key_const+0x20, shifted left by one each round
&set_label("10rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x00,"ebx"));
	&mov		($rounds,8);
	&movdqa		("xmm4",&QWP(0x20,"ebx"));
	&movdqa		("xmm2","xmm0");
	&movdqu		(&QWP(-16,$key),"xmm0");

&set_label("loop_key128");
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld		("xmm4",1);
	&lea		($key,&DWP(16,$key));

	$spread->("xmm2");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(-16,$key),"xmm0");
	&movdqa		("xmm2","xmm0");

	&dec		($rounds);
	&jnz		(&label("loop_key128"));

	# last two rounds take their constants from key_const+0x30
	&movdqa		("xmm4",&QWP(0x30,"ebx"));

	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld		("xmm4",1);

	$spread->("xmm2");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(0,$key),"xmm0");

	&movdqa		("xmm2","xmm0");
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");

	$spread->("xmm2");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(16,$key),"xmm0");

	&mov		($rounds,9);
	&mov		(&DWP(96,$key),$rounds);

	&jmp	(&label("good_key"));

# 192-bit key support was removed.

# 256-bit key
&set_label("14rounds",16);
	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&lea		($key,&DWP(16,$key));
	&cmp		("ebp",1<<28);
	&je		(&label("14rounds_alt"));

	&mov		($rounds,13);
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	# Rounds 2..14: every round constant produces an "a" round from
	# xmm2 and, except for the last constant, a "b" round from xmm0.
	{ my @rcon=(0x01,0x02,0x04,0x08,0x10,0x20,0x40);
	  my $entry="key_256a_cold";
	  foreach my $i (0..$#rcon)
	  {	&aeskeygenassist("xmm1","xmm2",$rcon[$i]);
		&call		(&label($entry));
		$entry="key_256a";
		next if ($i==$#rcon);
		&aeskeygenassist("xmm1","xmm0",$rcon[$i]);
		&call		(&label("key_256b"));
	  }
	}
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(16,$key),$rounds);
	&xor		("eax","eax");

	&jmp	(&label("good_key"));

&set_label("key_256a",16);
	&$movekey	(&QWP(0,$key),"xmm2");
	&lea		($key,&DWP(16,$key));
&set_label("key_256a_cold");
	$mix->("xmm0",0b11111111);
	&ret();

&set_label("key_256b",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));

	$mix->("xmm2",0b10101010);
	&ret();

# 256-bit schedule without aeskeygenassist
&set_label("14rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x00,"ebx"));
	&movdqa		("xmm4",&QWP(0x20,"ebx"));
	&mov		($rounds,7);
	&movdqu		(&QWP(-32,$key),"xmm0");
	&movdqa		("xmm1","xmm2");
	&movdqu		(&QWP(-16,$key),"xmm2");

&set_label("loop_key256");
	&pshufb		("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");

	$spread->("xmm0");
	&pslld		("xmm4",1);

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(0,$key),"xmm0");

	&dec		($rounds);
	&jz		(&label("done_key256"));

	&pshufd		("xmm2","xmm0",0xff);
	&pxor		("xmm3","xmm3");
	&aesenclast	("xmm2","xmm3");

	$spread->("xmm1");

	&pxor		("xmm2","xmm1");
	&movdqu		(&QWP(16,$key),"xmm2");
	&lea		($key,&DWP(32,$key));
	&movdqa		("xmm1","xmm2");
	&jmp		(&label("loop_key256"));

&set_label("done_key256");
	&mov		($rounds,13);
	&mov		(&DWP(16,$key),$rounds);

# success: wipe the XMM registers used above and return 0
&set_label("good_key");
	foreach my $reg (map { "xmm$_" } (0..5))
	{   &pxor	($reg,$reg);	}
	&xor	("eax","eax");
	&pop	("ebx");
	&pop	("ebp");
	&ret	();

&set_label("bad_pointer",4);
	&mov	("eax",-1);
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
&set_label("bad_keybits",4);
	&pxor	("xmm0","xmm0");	# wipe the loaded key bytes
	&mov	("eax",-2);
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
&function_end_B("_aesni_set_encrypt_key");

# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public entry point: loads the three stack arguments into the
# registers _aesni_set_encrypt_key expects and returns its result.
&function_begin_B("${PREFIX}_set_encrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");

# Constants for the "_alt" key schedules, 16 bytes per row.  The rows at
# +0x00, +0x20 and +0x30 are the ones loaded above; the row at +0x10 is
# not referenced in this file.
&set_label("key_const",64);
&data_word((0x0c0f0e0d) x 4);
&data_word((0x04070605) x 4);
&data_word((1) x 4);
&data_word((0x1b) x 4);
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

# buffered write errors only surface when the handle is closed
close STDOUT or die "error closing STDOUT";