1#! /usr/bin/env perl 2# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# This module implements support for Intel AES-NI extension. In 18# OpenSSL context it's used with Intel engine, but can also be used as 19# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for 20# details]. 21# 22# Performance. 23# 24# To start with see corresponding paragraph in aesni-x86_64.pl... 25# Instead of filling table similar to one found there I've chosen to 26# summarize *comparison* results for raw ECB, CTR and CBC benchmarks. 27# The simplified table below represents 32-bit performance relative 28# to 64-bit one in every given point. Ratios vary for different 29# encryption modes, therefore interval values. 30# 31# 16-byte 64-byte 256-byte 1-KB 8-KB 32# 53-67% 67-84% 91-94% 95-98% 97-99.5% 33# 34# Lower ratios for smaller block sizes are perfectly understandable, 35# because function call overhead is higher in 32-bit mode. Largest 36# 8-KB block performance is virtually same: 32-bit code is less than 37# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. 38 39# January 2011 40# 41# See aesni-x86_64.pl for details. Unlike x86_64 version this module 42# interleaves at most 6 aes[enc|dec] instructions, because there are 43# not enough registers for 8x interleave [which should be optimal for 44# Sandy Bridge]. Actually, performance results for 6x interleave 45# factor presented in aesni-x86_64.pl (except for CTR) are for this 46# module. 47 48# April 2011 49# 50# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing 51# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. 52 53# November 2015 54# 55# Add aesni_ocb_[en|de]crypt. 56 57###################################################################### 58# Current large-block performance in cycles per byte processed with 59# 128-bit key (less is better). 60# 61# CBC en-/decrypt CTR XTS ECB OCB 62# Westmere 3.77/1.37 1.37 1.52 1.27 63# * Bridge 5.07/0.98 0.99 1.09 0.91 1.10 64# Haswell 4.44/0.80 0.97 1.03 0.72 0.76 65# Skylake 2.68/0.65 0.65 0.66 0.64 0.66 66# Silvermont 5.77/3.56 3.67 4.03 3.46 4.03 67# Goldmont 3.84/1.39 1.39 1.63 1.31 1.70 68# Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23 69 70$PREFIX="aesni"; # if $PREFIX is set to "AES", the script 71 # generates drop-in replacement for 72 # crypto/aes/asm/aes-586.pl:-) 73$inline=1; # inline _aesni_[en|de]crypt 74 75$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 76push(@INC,"${dir}","${dir}../../perlasm"); 77require "x86asm.pl"; 78 79$output = pop; 80open OUT,">$output"; 81*STDOUT=*OUT; 82 83&asm_init($ARGV[0]); 84 85&external_label("OPENSSL_ia32cap_P"); 86&static_label("key_const"); 87 88if ($PREFIX eq "aesni") { $movekey=\&movups; } 89else { $movekey=\&movups; } 90 91$len="eax"; 92$rounds="ecx"; 93$key="edx"; 94$inp="esi"; 95$out="edi"; 96$rounds_="ebx"; # backup copy for $rounds 97$key_="ebp"; # backup copy for $key 98 99$rndkey0="xmm0"; 100$rndkey1="xmm1"; 101$inout0="xmm2"; 102$inout1="xmm3"; 103$inout2="xmm4"; 104$inout3="xmm5"; $in1="xmm5"; 105$inout4="xmm6"; $in0="xmm6"; 106$inout5="xmm7"; $ivec="xmm7"; 107 108# AESNI extension 109sub aeskeygenassist 110{ my($dst,$src,$imm)=@_; 111 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) 112 { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); } 113} 114sub aescommon 115{ my($opcodelet,$dst,$src)=@_; 116 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) 117 { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);} 118} 119sub aesimc { aescommon(0xdb,@_); } 120sub aesenc { aescommon(0xdc,@_); } 121sub aesenclast { aescommon(0xdd,@_); } 122sub aesdec { aescommon(0xde,@_); } 123sub aesdeclast { aescommon(0xdf,@_); } 124 125# Inline version of internal aesni_[en|de]crypt1 126{ my $sn; 127sub aesni_inline_generate1 128{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); 129 $sn++; 130 131 &$movekey ($rndkey0,&QWP(0,$key)); 132 &$movekey ($rndkey1,&QWP(16,$key)); 133 &xorps ($ivec,$rndkey0) if (defined($ivec)); 134 &lea ($key,&DWP(32,$key)); 135 &xorps ($inout,$ivec) if (defined($ivec)); 136 &xorps ($inout,$rndkey0) if (!defined($ivec)); 137 &set_label("${p}1_loop_$sn"); 138 eval"&aes${p} ($inout,$rndkey1)"; 139 &dec ($rounds); 140 &$movekey ($rndkey1,&QWP(0,$key)); 141 &lea ($key,&DWP(16,$key)); 142 &jnz (&label("${p}1_loop_$sn")); 143 eval"&aes${p}last ($inout,$rndkey1)"; 144}} 145 146sub aesni_generate1 # fully unrolled loop 147{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); 148 149 &function_begin_B("_aesni_${p}rypt1"); 150 &movups ($rndkey0,&QWP(0,$key)); 151 &$movekey ($rndkey1,&QWP(0x10,$key)); 152 &xorps ($inout,$rndkey0); 153 &$movekey ($rndkey0,&QWP(0x20,$key)); 154 &lea ($key,&DWP(0x30,$key)); 155 &cmp ($rounds,11); 156 &jb (&label("${p}128")); 157 &lea ($key,&DWP(0x20,$key)); 158 &je (&label("${p}192")); 159 &lea ($key,&DWP(0x20,$key)); 160 eval"&aes${p} ($inout,$rndkey1)"; 161 &$movekey ($rndkey1,&QWP(-0x40,$key)); 162 eval"&aes${p} ($inout,$rndkey0)"; 163 &$movekey ($rndkey0,&QWP(-0x30,$key)); 164 &set_label("${p}192"); 165 eval"&aes${p} ($inout,$rndkey1)"; 166 &$movekey ($rndkey1,&QWP(-0x20,$key)); 167 eval"&aes${p} ($inout,$rndkey0)"; 168 &$movekey ($rndkey0,&QWP(-0x10,$key)); 169 &set_label("${p}128"); 170 eval"&aes${p} ($inout,$rndkey1)"; 171 &$movekey ($rndkey1,&QWP(0,$key)); 172 eval"&aes${p} ($inout,$rndkey0)"; 173 &$movekey ($rndkey0,&QWP(0x10,$key)); 174 eval"&aes${p} ($inout,$rndkey1)"; 175 &$movekey ($rndkey1,&QWP(0x20,$key)); 176 eval"&aes${p} ($inout,$rndkey0)"; 177 &$movekey ($rndkey0,&QWP(0x30,$key)); 178 eval"&aes${p} ($inout,$rndkey1)"; 179 &$movekey ($rndkey1,&QWP(0x40,$key)); 180 eval"&aes${p} ($inout,$rndkey0)"; 181 &$movekey ($rndkey0,&QWP(0x50,$key)); 182 eval"&aes${p} ($inout,$rndkey1)"; 183 &$movekey ($rndkey1,&QWP(0x60,$key)); 184 eval"&aes${p} ($inout,$rndkey0)"; 185 &$movekey ($rndkey0,&QWP(0x70,$key)); 186 eval"&aes${p} ($inout,$rndkey1)"; 187 eval"&aes${p}last ($inout,$rndkey0)"; 188 &ret(); 189 &function_end_B("_aesni_${p}rypt1"); 190} 191 192# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); 193&aesni_generate1("enc") if (!$inline); 194&function_begin_B("${PREFIX}_encrypt"); 195 &mov ("eax",&wparam(0)); 196 &mov ($key,&wparam(2)); 197 &movups ($inout0,&QWP(0,"eax")); 198 &mov ($rounds,&DWP(240,$key)); 199 &mov ("eax",&wparam(1)); 200 if ($inline) 201 { &aesni_inline_generate1("enc"); } 202 else 203 { &call ("_aesni_encrypt1"); } 204 &pxor ($rndkey0,$rndkey0); # clear register bank 205 &pxor ($rndkey1,$rndkey1); 206 &movups (&QWP(0,"eax"),$inout0); 207 &pxor ($inout0,$inout0); 208 &ret (); 209&function_end_B("${PREFIX}_encrypt"); 210 211# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); 212&aesni_generate1("dec") if(!$inline); 213&function_begin_B("${PREFIX}_decrypt"); 214 &mov ("eax",&wparam(0)); 215 &mov ($key,&wparam(2)); 216 &movups ($inout0,&QWP(0,"eax")); 217 &mov ($rounds,&DWP(240,$key)); 218 &mov ("eax",&wparam(1)); 219 if ($inline) 220 { &aesni_inline_generate1("dec"); } 221 else 222 { &call ("_aesni_decrypt1"); } 223 &pxor ($rndkey0,$rndkey0); # clear register bank 224 &pxor ($rndkey1,$rndkey1); 225 &movups (&QWP(0,"eax"),$inout0); 226 &pxor ($inout0,$inout0); 227 &ret (); 228&function_end_B("${PREFIX}_decrypt"); 229 230# _aesni_[en|de]cryptN are private interfaces, N denotes interleave 231# factor. Why 3x subroutine were originally used in loops? Even though 232# aes[enc|dec] latency was originally 6, it could be scheduled only 233# every *2nd* cycle. Thus 3x interleave was the one providing optimal 234# utilization, i.e. when subroutine's throughput is virtually same as 235# of non-interleaved subroutine [for number of input blocks up to 3]. 236# This is why it originally made no sense to implement 2x subroutine. 237# But times change and it became appropriate to spend extra 192 bytes 238# on 2x subroutine on Atom Silvermont account. For processors that 239# can schedule aes[enc|dec] every cycle optimal interleave factor 240# equals to corresponding instructions latency. 8x is optimal for 241# * Bridge, but it's unfeasible to accommodate such implementation 242# in XMM registers addressable in 32-bit mode and therefore maximum 243# of 6x is used instead... 244 245sub aesni_generate2 246{ my $p=shift; 247 248 &function_begin_B("_aesni_${p}rypt2"); 249 &$movekey ($rndkey0,&QWP(0,$key)); 250 &shl ($rounds,4); 251 &$movekey ($rndkey1,&QWP(16,$key)); 252 &xorps ($inout0,$rndkey0); 253 &pxor ($inout1,$rndkey0); 254 &$movekey ($rndkey0,&QWP(32,$key)); 255 &lea ($key,&DWP(32,$key,$rounds)); 256 &neg ($rounds); 257 &add ($rounds,16); 258 259 &set_label("${p}2_loop"); 260 eval"&aes${p} ($inout0,$rndkey1)"; 261 eval"&aes${p} ($inout1,$rndkey1)"; 262 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 263 &add ($rounds,32); 264 eval"&aes${p} ($inout0,$rndkey0)"; 265 eval"&aes${p} ($inout1,$rndkey0)"; 266 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 267 &jnz (&label("${p}2_loop")); 268 eval"&aes${p} ($inout0,$rndkey1)"; 269 eval"&aes${p} ($inout1,$rndkey1)"; 270 eval"&aes${p}last ($inout0,$rndkey0)"; 271 eval"&aes${p}last ($inout1,$rndkey0)"; 272 &ret(); 273 &function_end_B("_aesni_${p}rypt2"); 274} 275 276sub aesni_generate3 277{ my $p=shift; 278 279 &function_begin_B("_aesni_${p}rypt3"); 280 &$movekey ($rndkey0,&QWP(0,$key)); 281 &shl ($rounds,4); 282 &$movekey ($rndkey1,&QWP(16,$key)); 283 &xorps ($inout0,$rndkey0); 284 &pxor ($inout1,$rndkey0); 285 &pxor ($inout2,$rndkey0); 286 &$movekey ($rndkey0,&QWP(32,$key)); 287 &lea ($key,&DWP(32,$key,$rounds)); 288 &neg ($rounds); 289 &add ($rounds,16); 290 291 &set_label("${p}3_loop"); 292 eval"&aes${p} ($inout0,$rndkey1)"; 293 eval"&aes${p} ($inout1,$rndkey1)"; 294 eval"&aes${p} ($inout2,$rndkey1)"; 295 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 296 &add ($rounds,32); 297 eval"&aes${p} ($inout0,$rndkey0)"; 298 eval"&aes${p} ($inout1,$rndkey0)"; 299 eval"&aes${p} ($inout2,$rndkey0)"; 300 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 301 &jnz (&label("${p}3_loop")); 302 eval"&aes${p} ($inout0,$rndkey1)"; 303 eval"&aes${p} ($inout1,$rndkey1)"; 304 eval"&aes${p} ($inout2,$rndkey1)"; 305 eval"&aes${p}last ($inout0,$rndkey0)"; 306 eval"&aes${p}last ($inout1,$rndkey0)"; 307 eval"&aes${p}last ($inout2,$rndkey0)"; 308 &ret(); 309 &function_end_B("_aesni_${p}rypt3"); 310} 311 312# 4x interleave is implemented to improve small block performance, 313# most notably [and naturally] 4 block by ~30%. One can argue that one 314# should have implemented 5x as well, but improvement would be <20%, 315# so it's not worth it... 316sub aesni_generate4 317{ my $p=shift; 318 319 &function_begin_B("_aesni_${p}rypt4"); 320 &$movekey ($rndkey0,&QWP(0,$key)); 321 &$movekey ($rndkey1,&QWP(16,$key)); 322 &shl ($rounds,4); 323 &xorps ($inout0,$rndkey0); 324 &pxor ($inout1,$rndkey0); 325 &pxor ($inout2,$rndkey0); 326 &pxor ($inout3,$rndkey0); 327 &$movekey ($rndkey0,&QWP(32,$key)); 328 &lea ($key,&DWP(32,$key,$rounds)); 329 &neg ($rounds); 330 &data_byte (0x0f,0x1f,0x40,0x00); 331 &add ($rounds,16); 332 333 &set_label("${p}4_loop"); 334 eval"&aes${p} ($inout0,$rndkey1)"; 335 eval"&aes${p} ($inout1,$rndkey1)"; 336 eval"&aes${p} ($inout2,$rndkey1)"; 337 eval"&aes${p} ($inout3,$rndkey1)"; 338 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 339 &add ($rounds,32); 340 eval"&aes${p} ($inout0,$rndkey0)"; 341 eval"&aes${p} ($inout1,$rndkey0)"; 342 eval"&aes${p} ($inout2,$rndkey0)"; 343 eval"&aes${p} ($inout3,$rndkey0)"; 344 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 345 &jnz (&label("${p}4_loop")); 346 347 eval"&aes${p} ($inout0,$rndkey1)"; 348 eval"&aes${p} ($inout1,$rndkey1)"; 349 eval"&aes${p} ($inout2,$rndkey1)"; 350 eval"&aes${p} ($inout3,$rndkey1)"; 351 eval"&aes${p}last ($inout0,$rndkey0)"; 352 eval"&aes${p}last ($inout1,$rndkey0)"; 353 eval"&aes${p}last ($inout2,$rndkey0)"; 354 eval"&aes${p}last ($inout3,$rndkey0)"; 355 &ret(); 356 &function_end_B("_aesni_${p}rypt4"); 357} 358 359sub aesni_generate6 360{ my $p=shift; 361 362 &function_begin_B("_aesni_${p}rypt6"); 363 &static_label("_aesni_${p}rypt6_enter"); 364 &$movekey ($rndkey0,&QWP(0,$key)); 365 &shl ($rounds,4); 366 &$movekey ($rndkey1,&QWP(16,$key)); 367 &xorps ($inout0,$rndkey0); 368 &pxor ($inout1,$rndkey0); # pxor does better here 369 &pxor ($inout2,$rndkey0); 370 eval"&aes${p} ($inout0,$rndkey1)"; 371 &pxor ($inout3,$rndkey0); 372 &pxor ($inout4,$rndkey0); 373 eval"&aes${p} ($inout1,$rndkey1)"; 374 &lea ($key,&DWP(32,$key,$rounds)); 375 &neg ($rounds); 376 eval"&aes${p} ($inout2,$rndkey1)"; 377 &pxor ($inout5,$rndkey0); 378 &$movekey ($rndkey0,&QWP(0,$key,$rounds)); 379 &add ($rounds,16); 380 &jmp (&label("_aesni_${p}rypt6_inner")); 381 382 &set_label("${p}6_loop",16); 383 eval"&aes${p} ($inout0,$rndkey1)"; 384 eval"&aes${p} ($inout1,$rndkey1)"; 385 eval"&aes${p} ($inout2,$rndkey1)"; 386 &set_label("_aesni_${p}rypt6_inner"); 387 eval"&aes${p} ($inout3,$rndkey1)"; 388 eval"&aes${p} ($inout4,$rndkey1)"; 389 eval"&aes${p} ($inout5,$rndkey1)"; 390 &set_label("_aesni_${p}rypt6_enter"); 391 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 392 &add ($rounds,32); 393 eval"&aes${p} ($inout0,$rndkey0)"; 394 eval"&aes${p} ($inout1,$rndkey0)"; 395 eval"&aes${p} ($inout2,$rndkey0)"; 396 eval"&aes${p} ($inout3,$rndkey0)"; 397 eval"&aes${p} ($inout4,$rndkey0)"; 398 eval"&aes${p} ($inout5,$rndkey0)"; 399 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 400 &jnz (&label("${p}6_loop")); 401 402 eval"&aes${p} ($inout0,$rndkey1)"; 403 eval"&aes${p} ($inout1,$rndkey1)"; 404 eval"&aes${p} ($inout2,$rndkey1)"; 405 eval"&aes${p} ($inout3,$rndkey1)"; 406 eval"&aes${p} ($inout4,$rndkey1)"; 407 eval"&aes${p} ($inout5,$rndkey1)"; 408 eval"&aes${p}last ($inout0,$rndkey0)"; 409 eval"&aes${p}last ($inout1,$rndkey0)"; 410 eval"&aes${p}last ($inout2,$rndkey0)"; 411 eval"&aes${p}last ($inout3,$rndkey0)"; 412 eval"&aes${p}last ($inout4,$rndkey0)"; 413 eval"&aes${p}last ($inout5,$rndkey0)"; 414 &ret(); 415 &function_end_B("_aesni_${p}rypt6"); 416} 417&aesni_generate2("enc") if ($PREFIX eq "aesni"); 418&aesni_generate2("dec"); 419&aesni_generate3("enc") if ($PREFIX eq "aesni"); 420&aesni_generate3("dec"); 421&aesni_generate4("enc") if ($PREFIX eq "aesni"); 422&aesni_generate4("dec"); 423&aesni_generate6("enc") if ($PREFIX eq "aesni"); 424&aesni_generate6("dec"); 425 426if ($PREFIX eq "aesni") { 427###################################################################### 428# void aesni_ecb_encrypt (const void *in, void *out, 429# size_t length, const AES_KEY *key, 430# int enc); 431&function_begin("aesni_ecb_encrypt"); 432 &mov ($inp,&wparam(0)); 433 &mov ($out,&wparam(1)); 434 &mov ($len,&wparam(2)); 435 &mov ($key,&wparam(3)); 436 &mov ($rounds_,&wparam(4)); 437 &and ($len,-16); 438 &jz (&label("ecb_ret")); 439 &mov ($rounds,&DWP(240,$key)); 440 &test ($rounds_,$rounds_); 441 &jz (&label("ecb_decrypt")); 442 443 &mov ($key_,$key); # backup $key 444 &mov ($rounds_,$rounds); # backup $rounds 445 &cmp ($len,0x60); 446 &jb (&label("ecb_enc_tail")); 447 448 &movdqu ($inout0,&QWP(0,$inp)); 449 &movdqu ($inout1,&QWP(0x10,$inp)); 450 &movdqu ($inout2,&QWP(0x20,$inp)); 451 &movdqu ($inout3,&QWP(0x30,$inp)); 452 &movdqu ($inout4,&QWP(0x40,$inp)); 453 &movdqu ($inout5,&QWP(0x50,$inp)); 454 &lea ($inp,&DWP(0x60,$inp)); 455 &sub ($len,0x60); 456 &jmp (&label("ecb_enc_loop6_enter")); 457 458&set_label("ecb_enc_loop6",16); 459 &movups (&QWP(0,$out),$inout0); 460 &movdqu ($inout0,&QWP(0,$inp)); 461 &movups (&QWP(0x10,$out),$inout1); 462 &movdqu ($inout1,&QWP(0x10,$inp)); 463 &movups (&QWP(0x20,$out),$inout2); 464 &movdqu ($inout2,&QWP(0x20,$inp)); 465 &movups (&QWP(0x30,$out),$inout3); 466 &movdqu ($inout3,&QWP(0x30,$inp)); 467 &movups (&QWP(0x40,$out),$inout4); 468 &movdqu ($inout4,&QWP(0x40,$inp)); 469 &movups (&QWP(0x50,$out),$inout5); 470 &lea ($out,&DWP(0x60,$out)); 471 &movdqu ($inout5,&QWP(0x50,$inp)); 472 &lea ($inp,&DWP(0x60,$inp)); 473&set_label("ecb_enc_loop6_enter"); 474 475 &call ("_aesni_encrypt6"); 476 477 &mov ($key,$key_); # restore $key 478 &mov ($rounds,$rounds_); # restore $rounds 479 &sub ($len,0x60); 480 &jnc (&label("ecb_enc_loop6")); 481 482 &movups (&QWP(0,$out),$inout0); 483 &movups (&QWP(0x10,$out),$inout1); 484 &movups (&QWP(0x20,$out),$inout2); 485 &movups (&QWP(0x30,$out),$inout3); 486 &movups (&QWP(0x40,$out),$inout4); 487 &movups (&QWP(0x50,$out),$inout5); 488 &lea ($out,&DWP(0x60,$out)); 489 &add ($len,0x60); 490 &jz (&label("ecb_ret")); 491 492&set_label("ecb_enc_tail"); 493 &movups ($inout0,&QWP(0,$inp)); 494 &cmp ($len,0x20); 495 &jb (&label("ecb_enc_one")); 496 &movups ($inout1,&QWP(0x10,$inp)); 497 &je (&label("ecb_enc_two")); 498 &movups ($inout2,&QWP(0x20,$inp)); 499 &cmp ($len,0x40); 500 &jb (&label("ecb_enc_three")); 501 &movups ($inout3,&QWP(0x30,$inp)); 502 &je (&label("ecb_enc_four")); 503 &movups ($inout4,&QWP(0x40,$inp)); 504 &xorps ($inout5,$inout5); 505 &call ("_aesni_encrypt6"); 506 &movups (&QWP(0,$out),$inout0); 507 &movups (&QWP(0x10,$out),$inout1); 508 &movups (&QWP(0x20,$out),$inout2); 509 &movups (&QWP(0x30,$out),$inout3); 510 &movups (&QWP(0x40,$out),$inout4); 511 jmp (&label("ecb_ret")); 512 513&set_label("ecb_enc_one",16); 514 if ($inline) 515 { &aesni_inline_generate1("enc"); } 516 else 517 { &call ("_aesni_encrypt1"); } 518 &movups (&QWP(0,$out),$inout0); 519 &jmp (&label("ecb_ret")); 520 521&set_label("ecb_enc_two",16); 522 &call ("_aesni_encrypt2"); 523 &movups (&QWP(0,$out),$inout0); 524 &movups (&QWP(0x10,$out),$inout1); 525 &jmp (&label("ecb_ret")); 526 527&set_label("ecb_enc_three",16); 528 &call ("_aesni_encrypt3"); 529 &movups (&QWP(0,$out),$inout0); 530 &movups (&QWP(0x10,$out),$inout1); 531 &movups (&QWP(0x20,$out),$inout2); 532 &jmp (&label("ecb_ret")); 533 534&set_label("ecb_enc_four",16); 535 &call ("_aesni_encrypt4"); 536 &movups (&QWP(0,$out),$inout0); 537 &movups (&QWP(0x10,$out),$inout1); 538 &movups (&QWP(0x20,$out),$inout2); 539 &movups (&QWP(0x30,$out),$inout3); 540 &jmp (&label("ecb_ret")); 541###################################################################### 542&set_label("ecb_decrypt",16); 543 &mov ($key_,$key); # backup $key 544 &mov ($rounds_,$rounds); # backup $rounds 545 &cmp ($len,0x60); 546 &jb (&label("ecb_dec_tail")); 547 548 &movdqu ($inout0,&QWP(0,$inp)); 549 &movdqu ($inout1,&QWP(0x10,$inp)); 550 &movdqu ($inout2,&QWP(0x20,$inp)); 551 &movdqu ($inout3,&QWP(0x30,$inp)); 552 &movdqu ($inout4,&QWP(0x40,$inp)); 553 &movdqu ($inout5,&QWP(0x50,$inp)); 554 &lea ($inp,&DWP(0x60,$inp)); 555 &sub ($len,0x60); 556 &jmp (&label("ecb_dec_loop6_enter")); 557 558&set_label("ecb_dec_loop6",16); 559 &movups (&QWP(0,$out),$inout0); 560 &movdqu ($inout0,&QWP(0,$inp)); 561 &movups (&QWP(0x10,$out),$inout1); 562 &movdqu ($inout1,&QWP(0x10,$inp)); 563 &movups (&QWP(0x20,$out),$inout2); 564 &movdqu ($inout2,&QWP(0x20,$inp)); 565 &movups (&QWP(0x30,$out),$inout3); 566 &movdqu ($inout3,&QWP(0x30,$inp)); 567 &movups (&QWP(0x40,$out),$inout4); 568 &movdqu ($inout4,&QWP(0x40,$inp)); 569 &movups (&QWP(0x50,$out),$inout5); 570 &lea ($out,&DWP(0x60,$out)); 571 &movdqu ($inout5,&QWP(0x50,$inp)); 572 &lea ($inp,&DWP(0x60,$inp)); 573&set_label("ecb_dec_loop6_enter"); 574 575 &call ("_aesni_decrypt6"); 576 577 &mov ($key,$key_); # restore $key 578 &mov ($rounds,$rounds_); # restore $rounds 579 &sub ($len,0x60); 580 &jnc (&label("ecb_dec_loop6")); 581 582 &movups (&QWP(0,$out),$inout0); 583 &movups (&QWP(0x10,$out),$inout1); 584 &movups (&QWP(0x20,$out),$inout2); 585 &movups (&QWP(0x30,$out),$inout3); 586 &movups (&QWP(0x40,$out),$inout4); 587 &movups (&QWP(0x50,$out),$inout5); 588 &lea ($out,&DWP(0x60,$out)); 589 &add ($len,0x60); 590 &jz (&label("ecb_ret")); 591 592&set_label("ecb_dec_tail"); 593 &movups ($inout0,&QWP(0,$inp)); 594 &cmp ($len,0x20); 595 &jb (&label("ecb_dec_one")); 596 &movups ($inout1,&QWP(0x10,$inp)); 597 &je (&label("ecb_dec_two")); 598 &movups ($inout2,&QWP(0x20,$inp)); 599 &cmp ($len,0x40); 600 &jb (&label("ecb_dec_three")); 601 &movups ($inout3,&QWP(0x30,$inp)); 602 &je (&label("ecb_dec_four")); 603 &movups ($inout4,&QWP(0x40,$inp)); 604 &xorps ($inout5,$inout5); 605 &call ("_aesni_decrypt6"); 606 &movups (&QWP(0,$out),$inout0); 607 &movups (&QWP(0x10,$out),$inout1); 608 &movups (&QWP(0x20,$out),$inout2); 609 &movups (&QWP(0x30,$out),$inout3); 610 &movups (&QWP(0x40,$out),$inout4); 611 &jmp (&label("ecb_ret")); 612 613&set_label("ecb_dec_one",16); 614 if ($inline) 615 { &aesni_inline_generate1("dec"); } 616 else 617 { &call ("_aesni_decrypt1"); } 618 &movups (&QWP(0,$out),$inout0); 619 &jmp (&label("ecb_ret")); 620 621&set_label("ecb_dec_two",16); 622 &call ("_aesni_decrypt2"); 623 &movups (&QWP(0,$out),$inout0); 624 &movups (&QWP(0x10,$out),$inout1); 625 &jmp (&label("ecb_ret")); 626 627&set_label("ecb_dec_three",16); 628 &call ("_aesni_decrypt3"); 629 &movups (&QWP(0,$out),$inout0); 630 &movups (&QWP(0x10,$out),$inout1); 631 &movups (&QWP(0x20,$out),$inout2); 632 &jmp (&label("ecb_ret")); 633 634&set_label("ecb_dec_four",16); 635 &call ("_aesni_decrypt4"); 636 &movups (&QWP(0,$out),$inout0); 637 &movups (&QWP(0x10,$out),$inout1); 638 &movups (&QWP(0x20,$out),$inout2); 639 &movups (&QWP(0x30,$out),$inout3); 640 641&set_label("ecb_ret"); 642 &pxor ("xmm0","xmm0"); # clear register bank 643 &pxor ("xmm1","xmm1"); 644 &pxor ("xmm2","xmm2"); 645 &pxor ("xmm3","xmm3"); 646 &pxor ("xmm4","xmm4"); 647 &pxor ("xmm5","xmm5"); 648 &pxor ("xmm6","xmm6"); 649 &pxor ("xmm7","xmm7"); 650&function_end("aesni_ecb_encrypt"); 651 652###################################################################### 653# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, 654# size_t blocks, const AES_KEY *key, 655# const char *ivec,char *cmac); 656# 657# Handles only complete blocks, operates on 64-bit counter and 658# does not update *ivec! Nor does it finalize CMAC value 659# (see engine/eng_aesni.c for details) 660# 661{ my $cmac=$inout1; 662&function_begin("aesni_ccm64_encrypt_blocks"); 663 &mov ($inp,&wparam(0)); 664 &mov ($out,&wparam(1)); 665 &mov ($len,&wparam(2)); 666 &mov ($key,&wparam(3)); 667 &mov ($rounds_,&wparam(4)); 668 &mov ($rounds,&wparam(5)); 669 &mov ($key_,"esp"); 670 &sub ("esp",60); 671 &and ("esp",-16); # align stack 672 &mov (&DWP(48,"esp"),$key_); 673 674 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec 675 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac 676 &mov ($rounds,&DWP(240,$key)); 677 678 # compose byte-swap control mask for pshufb on stack 679 &mov (&DWP(0,"esp"),0x0c0d0e0f); 680 &mov (&DWP(4,"esp"),0x08090a0b); 681 &mov (&DWP(8,"esp"),0x04050607); 682 &mov (&DWP(12,"esp"),0x00010203); 683 684 # compose counter increment vector on stack 685 &mov ($rounds_,1); 686 &xor ($key_,$key_); 687 &mov (&DWP(16,"esp"),$rounds_); 688 &mov (&DWP(20,"esp"),$key_); 689 &mov (&DWP(24,"esp"),$key_); 690 &mov (&DWP(28,"esp"),$key_); 691 692 &shl ($rounds,4); 693 &mov ($rounds_,16); 694 &lea ($key_,&DWP(0,$key)); 695 &movdqa ($inout3,&QWP(0,"esp")); 696 &movdqa ($inout0,$ivec); 697 &lea ($key,&DWP(32,$key,$rounds)); 698 &sub ($rounds_,$rounds); 699 &pshufb ($ivec,$inout3); 700 701&set_label("ccm64_enc_outer"); 702 &$movekey ($rndkey0,&QWP(0,$key_)); 703 &mov ($rounds,$rounds_); 704 &movups ($in0,&QWP(0,$inp)); 705 706 &xorps ($inout0,$rndkey0); 707 &$movekey ($rndkey1,&QWP(16,$key_)); 708 &xorps ($rndkey0,$in0); 709 &xorps ($cmac,$rndkey0); # cmac^=inp 710 &$movekey ($rndkey0,&QWP(32,$key_)); 711 712&set_label("ccm64_enc2_loop"); 713 &aesenc ($inout0,$rndkey1); 714 &aesenc ($cmac,$rndkey1); 715 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 716 &add ($rounds,32); 717 &aesenc ($inout0,$rndkey0); 718 &aesenc ($cmac,$rndkey0); 719 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 720 &jnz (&label("ccm64_enc2_loop")); 721 &aesenc ($inout0,$rndkey1); 722 &aesenc ($cmac,$rndkey1); 723 &paddq ($ivec,&QWP(16,"esp")); 724 &dec ($len); 725 &aesenclast ($inout0,$rndkey0); 726 &aesenclast ($cmac,$rndkey0); 727 728 &lea ($inp,&DWP(16,$inp)); 729 &xorps ($in0,$inout0); # inp^=E(ivec) 730 &movdqa ($inout0,$ivec); 731 &movups (&QWP(0,$out),$in0); # save output 732 &pshufb ($inout0,$inout3); 733 &lea ($out,&DWP(16,$out)); 734 &jnz (&label("ccm64_enc_outer")); 735 736 &mov ("esp",&DWP(48,"esp")); 737 &mov ($out,&wparam(5)); 738 &movups (&QWP(0,$out),$cmac); 739 740 &pxor ("xmm0","xmm0"); # clear register bank 741 &pxor ("xmm1","xmm1"); 742 &pxor ("xmm2","xmm2"); 743 &pxor ("xmm3","xmm3"); 744 &pxor ("xmm4","xmm4"); 745 &pxor ("xmm5","xmm5"); 746 &pxor ("xmm6","xmm6"); 747 &pxor ("xmm7","xmm7"); 748&function_end("aesni_ccm64_encrypt_blocks"); 749 750&function_begin("aesni_ccm64_decrypt_blocks"); 751 &mov ($inp,&wparam(0)); 752 &mov ($out,&wparam(1)); 753 &mov ($len,&wparam(2)); 754 &mov ($key,&wparam(3)); 755 &mov ($rounds_,&wparam(4)); 756 &mov ($rounds,&wparam(5)); 757 &mov ($key_,"esp"); 758 &sub ("esp",60); 759 &and ("esp",-16); # align stack 760 &mov (&DWP(48,"esp"),$key_); 761 762 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec 763 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac 764 &mov ($rounds,&DWP(240,$key)); 765 766 # compose byte-swap control mask for pshufb on stack 767 &mov (&DWP(0,"esp"),0x0c0d0e0f); 768 &mov (&DWP(4,"esp"),0x08090a0b); 769 &mov (&DWP(8,"esp"),0x04050607); 770 &mov (&DWP(12,"esp"),0x00010203); 771 772 # compose counter increment vector on stack 773 &mov ($rounds_,1); 774 &xor ($key_,$key_); 775 &mov (&DWP(16,"esp"),$rounds_); 776 &mov (&DWP(20,"esp"),$key_); 777 &mov (&DWP(24,"esp"),$key_); 778 &mov (&DWP(28,"esp"),$key_); 779 780 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask 781 &movdqa ($inout0,$ivec); 782 783 &mov ($key_,$key); 784 &mov ($rounds_,$rounds); 785 786 &pshufb ($ivec,$inout3); 787 if ($inline) 788 { &aesni_inline_generate1("enc"); } 789 else 790 { &call ("_aesni_encrypt1"); } 791 &shl ($rounds_,4); 792 &mov ($rounds,16); 793 &movups ($in0,&QWP(0,$inp)); # load inp 794 &paddq ($ivec,&QWP(16,"esp")); 795 &lea ($inp,&QWP(16,$inp)); 796 &sub ($rounds,$rounds_); 797 &lea ($key,&DWP(32,$key_,$rounds_)); 798 &mov ($rounds_,$rounds); 799 &jmp (&label("ccm64_dec_outer")); 800 801&set_label("ccm64_dec_outer",16); 802 &xorps ($in0,$inout0); # inp ^= E(ivec) 803 &movdqa ($inout0,$ivec); 804 &movups (&QWP(0,$out),$in0); # save output 805 &lea ($out,&DWP(16,$out)); 806 &pshufb ($inout0,$inout3); 807 808 &sub ($len,1); 809 &jz (&label("ccm64_dec_break")); 810 811 &$movekey ($rndkey0,&QWP(0,$key_)); 812 &mov ($rounds,$rounds_); 813 &$movekey ($rndkey1,&QWP(16,$key_)); 814 &xorps ($in0,$rndkey0); 815 &xorps ($inout0,$rndkey0); 816 &xorps ($cmac,$in0); # cmac^=out 817 &$movekey ($rndkey0,&QWP(32,$key_)); 818 819&set_label("ccm64_dec2_loop"); 820 &aesenc ($inout0,$rndkey1); 821 &aesenc ($cmac,$rndkey1); 822 &$movekey ($rndkey1,&QWP(0,$key,$rounds)); 823 &add ($rounds,32); 824 &aesenc ($inout0,$rndkey0); 825 &aesenc ($cmac,$rndkey0); 826 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 827 &jnz (&label("ccm64_dec2_loop")); 828 &movups ($in0,&QWP(0,$inp)); # load inp 829 &paddq ($ivec,&QWP(16,"esp")); 830 &aesenc ($inout0,$rndkey1); 831 &aesenc ($cmac,$rndkey1); 832 &aesenclast ($inout0,$rndkey0); 833 &aesenclast ($cmac,$rndkey0); 834 &lea ($inp,&QWP(16,$inp)); 835 &jmp (&label("ccm64_dec_outer")); 836 837&set_label("ccm64_dec_break",16); 838 &mov ($rounds,&DWP(240,$key_)); 839 &mov ($key,$key_); 840 if ($inline) 841 { &aesni_inline_generate1("enc",$cmac,$in0); } 842 else 843 { &call ("_aesni_encrypt1",$cmac); } 844 845 &mov ("esp",&DWP(48,"esp")); 846 &mov ($out,&wparam(5)); 847 &movups (&QWP(0,$out),$cmac); 848 849 &pxor ("xmm0","xmm0"); # clear register bank 850 &pxor ("xmm1","xmm1"); 851 &pxor ("xmm2","xmm2"); 852 &pxor ("xmm3","xmm3"); 853 &pxor ("xmm4","xmm4"); 854 &pxor ("xmm5","xmm5"); 855 &pxor ("xmm6","xmm6"); 856 &pxor ("xmm7","xmm7"); 857&function_end("aesni_ccm64_decrypt_blocks"); 858} 859 860###################################################################### 861# void aesni_ctr32_encrypt_blocks (const void *in, void *out, 862# size_t blocks, const AES_KEY *key, 863# const char *ivec); 864# 865# Handles only complete blocks, operates on 32-bit counter and 866# does not update *ivec! (see crypto/modes/ctr128.c for details) 867# 868# stack layout: 869# 0 pshufb mask 870# 16 vector addend: 0,6,6,6 871# 32 counter-less ivec 872# 48 1st triplet of counter vector 873# 64 2nd triplet of counter vector 874# 80 saved %esp 875 876&function_begin("aesni_ctr32_encrypt_blocks"); 877 &mov ($inp,&wparam(0)); 878 &mov ($out,&wparam(1)); 879 &mov ($len,&wparam(2)); 880 &mov ($key,&wparam(3)); 881 &mov ($rounds_,&wparam(4)); 882 &mov ($key_,"esp"); 883 &sub ("esp",88); 884 &and ("esp",-16); # align stack 885 &mov (&DWP(80,"esp"),$key_); 886 887 &cmp ($len,1); 888 &je (&label("ctr32_one_shortcut")); 889 890 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec 891 892 # compose byte-swap control mask for pshufb on stack 893 &mov (&DWP(0,"esp"),0x0c0d0e0f); 894 &mov (&DWP(4,"esp"),0x08090a0b); 895 &mov (&DWP(8,"esp"),0x04050607); 896 &mov (&DWP(12,"esp"),0x00010203); 897 898 # compose counter increment vector on stack 899 &mov ($rounds,6); 900 &xor ($key_,$key_); 901 &mov (&DWP(16,"esp"),$rounds); 902 &mov (&DWP(20,"esp"),$rounds); 903 &mov (&DWP(24,"esp"),$rounds); 904 &mov (&DWP(28,"esp"),$key_); 905 906 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter 907 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter 908 909 &mov ($rounds,&DWP(240,$key)); # key->rounds 910 911 # compose 2 vectors of 3x32-bit counters 912 &bswap ($rounds_); 913 &pxor ($rndkey0,$rndkey0); 914 &pxor ($rndkey1,$rndkey1); 915 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask 916 &pinsrd ($rndkey0,$rounds_,0); 917 &lea ($key_,&DWP(3,$rounds_)); 918 &pinsrd ($rndkey1,$key_,0); 919 &inc ($rounds_); 920 &pinsrd ($rndkey0,$rounds_,1); 921 &inc ($key_); 922 &pinsrd ($rndkey1,$key_,1); 923 &inc ($rounds_); 924 &pinsrd ($rndkey0,$rounds_,2); 925 &inc ($key_); 926 &pinsrd ($rndkey1,$key_,2); 927 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet 928 &pshufb ($rndkey0,$inout0); # byte swap 929 &movdqu ($inout4,&QWP(0,$key)); # key[0] 930 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet 931 &pshufb ($rndkey1,$inout0); # byte swap 932 933 &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword 934 &pshufd ($inout1,$rndkey0,2<<6); 935 &cmp ($len,6); 936 &jb (&label("ctr32_tail")); 937 &pxor ($inout5,$inout4); # counter-less ivec^key[0] 938 &shl ($rounds,4); 939 &mov ($rounds_,16); 940 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0] 941 &mov ($key_,$key); # backup $key 942 &sub ($rounds_,$rounds); # backup twisted $rounds 943 &lea ($key,&DWP(32,$key,$rounds)); 944 &sub ($len,6); 945 &jmp (&label("ctr32_loop6")); 946 947&set_label("ctr32_loop6",16); 948 # inlining _aesni_encrypt6's prologue gives ~6% improvement... 949 &pshufd ($inout2,$rndkey0,1<<6); 950 &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec 951 &pshufd ($inout3,$rndkey1,3<<6); 952 &pxor ($inout0,$rndkey0); # merge counter-less ivec 953 &pshufd ($inout4,$rndkey1,2<<6); 954 &pxor ($inout1,$rndkey0); 955 &pshufd ($inout5,$rndkey1,1<<6); 956 &$movekey ($rndkey1,&QWP(16,$key_)); 957 &pxor ($inout2,$rndkey0); 958 &pxor ($inout3,$rndkey0); 959 &aesenc ($inout0,$rndkey1); 960 &pxor ($inout4,$rndkey0); 961 &pxor ($inout5,$rndkey0); 962 &aesenc ($inout1,$rndkey1); 963 &$movekey ($rndkey0,&QWP(32,$key_)); 964 &mov ($rounds,$rounds_); 965 &aesenc ($inout2,$rndkey1); 966 &aesenc ($inout3,$rndkey1); 967 &aesenc ($inout4,$rndkey1); 968 &aesenc ($inout5,$rndkey1); 969 970 &call (&label("_aesni_encrypt6_enter")); 971 972 &movups ($rndkey1,&QWP(0,$inp)); 973 &movups ($rndkey0,&QWP(0x10,$inp)); 974 &xorps ($inout0,$rndkey1); 975 &movups ($rndkey1,&QWP(0x20,$inp)); 976 &xorps ($inout1,$rndkey0); 977 &movups (&QWP(0,$out),$inout0); 978 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment 979 &xorps ($inout2,$rndkey1); 980 &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet 981 &movups (&QWP(0x10,$out),$inout1); 982 &movups (&QWP(0x20,$out),$inout2); 983 984 &paddd ($rndkey1,$rndkey0); # 2nd triplet increment 985 &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment 986 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask 987 988 &movups ($inout1,&QWP(0x30,$inp)); 989 &movups ($inout2,&QWP(0x40,$inp)); 990 &xorps ($inout3,$inout1); 991 &movups ($inout1,&QWP(0x50,$inp)); 992 &lea ($inp,&DWP(0x60,$inp)); 993 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet 994 &pshufb ($rndkey0,$inout0); # byte swap 995 &xorps ($inout4,$inout2); 996 &movups (&QWP(0x30,$out),$inout3); 997 &xorps ($inout5,$inout1); 998 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet 999 &pshufb ($rndkey1,$inout0); # byte swap 1000 &movups (&QWP(0x40,$out),$inout4); 1001 &pshufd ($inout0,$rndkey0,3<<6); 1002 &movups (&QWP(0x50,$out),$inout5); 1003 &lea ($out,&DWP(0x60,$out)); 1004 1005 &pshufd ($inout1,$rndkey0,2<<6); 1006 &sub ($len,6); 1007 &jnc (&label("ctr32_loop6")); 1008 1009 &add ($len,6); 1010 &jz (&label("ctr32_ret")); 1011 &movdqu ($inout5,&QWP(0,$key_)); 1012 &mov ($key,$key_); 1013 &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec 1014 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1015 1016&set_label("ctr32_tail"); 1017 &por ($inout0,$inout5); 1018 &cmp ($len,2); 1019 &jb (&label("ctr32_one")); 1020 1021 &pshufd ($inout2,$rndkey0,1<<6); 1022 &por ($inout1,$inout5); 1023 &je (&label("ctr32_two")); 1024 1025 &pshufd ($inout3,$rndkey1,3<<6); 1026 &por ($inout2,$inout5); 1027 &cmp ($len,4); 1028 &jb (&label("ctr32_three")); 1029 1030 &pshufd ($inout4,$rndkey1,2<<6); 1031 &por ($inout3,$inout5); 1032 &je (&label("ctr32_four")); 1033 1034 &por ($inout4,$inout5); 1035 &call ("_aesni_encrypt6"); 1036 &movups ($rndkey1,&QWP(0,$inp)); 1037 &movups ($rndkey0,&QWP(0x10,$inp)); 1038 &xorps ($inout0,$rndkey1); 1039 &movups ($rndkey1,&QWP(0x20,$inp)); 1040 &xorps ($inout1,$rndkey0); 1041 &movups ($rndkey0,&QWP(0x30,$inp)); 1042 &xorps ($inout2,$rndkey1); 1043 &movups ($rndkey1,&QWP(0x40,$inp)); 1044 &xorps ($inout3,$rndkey0); 1045 &movups (&QWP(0,$out),$inout0); 1046 &xorps ($inout4,$rndkey1); 1047 &movups (&QWP(0x10,$out),$inout1); 1048 &movups (&QWP(0x20,$out),$inout2); 1049 &movups (&QWP(0x30,$out),$inout3); 1050 &movups (&QWP(0x40,$out),$inout4); 1051 &jmp (&label("ctr32_ret")); 1052 1053&set_label("ctr32_one_shortcut",16); 1054 &movups ($inout0,&QWP(0,$rounds_)); # load ivec 1055 &mov ($rounds,&DWP(240,$key)); 1056 1057&set_label("ctr32_one"); 1058 if ($inline) 1059 { &aesni_inline_generate1("enc"); } 1060 else 1061 { &call ("_aesni_encrypt1"); } 1062 &movups ($in0,&QWP(0,$inp)); 1063 &xorps ($in0,$inout0); 1064 &movups (&QWP(0,$out),$in0); 1065 &jmp (&label("ctr32_ret")); 1066 1067&set_label("ctr32_two",16); 1068 &call ("_aesni_encrypt2"); 1069 &movups ($inout3,&QWP(0,$inp)); 1070 &movups ($inout4,&QWP(0x10,$inp)); 1071 &xorps ($inout0,$inout3); 1072 &xorps ($inout1,$inout4); 1073 &movups (&QWP(0,$out),$inout0); 1074 &movups (&QWP(0x10,$out),$inout1); 1075 &jmp (&label("ctr32_ret")); 1076 1077&set_label("ctr32_three",16); 1078 &call ("_aesni_encrypt3"); 1079 &movups ($inout3,&QWP(0,$inp)); 1080 &movups ($inout4,&QWP(0x10,$inp)); 1081 &xorps ($inout0,$inout3); 1082 &movups ($inout5,&QWP(0x20,$inp)); 1083 &xorps ($inout1,$inout4); 1084 &movups (&QWP(0,$out),$inout0); 1085 &xorps ($inout2,$inout5); 1086 &movups (&QWP(0x10,$out),$inout1); 1087 &movups (&QWP(0x20,$out),$inout2); 1088 &jmp (&label("ctr32_ret")); 1089 1090&set_label("ctr32_four",16); 1091 &call ("_aesni_encrypt4"); 1092 &movups ($inout4,&QWP(0,$inp)); 1093 &movups ($inout5,&QWP(0x10,$inp)); 1094 &movups ($rndkey1,&QWP(0x20,$inp)); 1095 &xorps ($inout0,$inout4); 1096 &movups ($rndkey0,&QWP(0x30,$inp)); 1097 &xorps ($inout1,$inout5); 1098 &movups (&QWP(0,$out),$inout0); 1099 &xorps ($inout2,$rndkey1); 1100 &movups (&QWP(0x10,$out),$inout1); 1101 &xorps ($inout3,$rndkey0); 1102 &movups (&QWP(0x20,$out),$inout2); 1103 &movups (&QWP(0x30,$out),$inout3); 1104 1105&set_label("ctr32_ret"); 1106 &pxor ("xmm0","xmm0"); # clear register bank 1107 &pxor ("xmm1","xmm1"); 1108 &pxor ("xmm2","xmm2"); 1109 &pxor ("xmm3","xmm3"); 1110 &pxor ("xmm4","xmm4"); 1111 &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack 1112 &pxor ("xmm5","xmm5"); 1113 &movdqa (&QWP(48,"esp"),"xmm0"); 1114 &pxor ("xmm6","xmm6"); 1115 &movdqa (&QWP(64,"esp"),"xmm0"); 1116 &pxor ("xmm7","xmm7"); 1117 &mov ("esp",&DWP(80,"esp")); 1118&function_end("aesni_ctr32_encrypt_blocks"); 1119 1120###################################################################### 1121# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, 1122# const AES_KEY *key1, const AES_KEY *key2 1123# const unsigned char iv[16]); 1124# 1125{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); 1126 1127&function_begin("aesni_xts_encrypt"); 1128 &mov ($key,&wparam(4)); # key2 1129 &mov ($inp,&wparam(5)); # clear-text tweak 1130 1131 &mov ($rounds,&DWP(240,$key)); # key2->rounds 1132 &movups ($inout0,&QWP(0,$inp)); 1133 if ($inline) 1134 { &aesni_inline_generate1("enc"); } 1135 else 1136 { &call ("_aesni_encrypt1"); } 1137 1138 &mov ($inp,&wparam(0)); 1139 &mov ($out,&wparam(1)); 1140 &mov ($len,&wparam(2)); 1141 &mov ($key,&wparam(3)); # key1 1142 1143 &mov ($key_,"esp"); 1144 &sub ("esp",16*7+8); 1145 &mov ($rounds,&DWP(240,$key)); # key1->rounds 1146 &and ("esp",-16); # align stack 1147 1148 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1149 &mov (&DWP(16*6+4,"esp"),0); 1150 &mov (&DWP(16*6+8,"esp"),1); 1151 &mov (&DWP(16*6+12,"esp"),0); 1152 &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1153 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1154 1155 &movdqa ($tweak,$inout0); 1156 &pxor ($twtmp,$twtmp); 1157 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1158 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1159 1160 &and ($len,-16); 1161 &mov ($key_,$key); # backup $key 1162 &mov ($rounds_,$rounds); # backup $rounds 1163 &sub ($len,16*6); 1164 &jc (&label("xts_enc_short")); 1165 1166 &shl ($rounds,4); 1167 &mov ($rounds_,16); 1168 &sub ($rounds_,$rounds); 1169 &lea ($key,&DWP(32,$key,$rounds)); 1170 &jmp (&label("xts_enc_loop6")); 1171 1172&set_label("xts_enc_loop6",16); 1173 for ($i=0;$i<4;$i++) { 1174 &pshufd ($twres,$twtmp,0x13); 1175 &pxor ($twtmp,$twtmp); 1176 &movdqa (&QWP(16*$i,"esp"),$tweak); 1177 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1178 &pand ($twres,$twmask); # isolate carry and residue 1179 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1180 &pxor ($tweak,$twres); 1181 } 1182 &pshufd ($inout5,$twtmp,0x13); 1183 &movdqa (&QWP(16*$i++,"esp"),$tweak); 1184 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1185 &$movekey ($rndkey0,&QWP(0,$key_)); 1186 &pand ($inout5,$twmask); # isolate carry and residue 1187 &movups ($inout0,&QWP(0,$inp)); # load input 1188 &pxor ($inout5,$tweak); 1189 1190 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1191 &mov ($rounds,$rounds_); # restore $rounds 1192 &movdqu ($inout1,&QWP(16*1,$inp)); 1193 &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1194 &movdqu ($inout2,&QWP(16*2,$inp)); 1195 &pxor ($inout1,$rndkey0); 1196 &movdqu ($inout3,&QWP(16*3,$inp)); 1197 &pxor ($inout2,$rndkey0); 1198 &movdqu ($inout4,&QWP(16*4,$inp)); 1199 &pxor ($inout3,$rndkey0); 1200 &movdqu ($rndkey1,&QWP(16*5,$inp)); 1201 &pxor ($inout4,$rndkey0); 1202 &lea ($inp,&DWP(16*6,$inp)); 1203 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1204 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1205 &pxor ($inout5,$rndkey1); 1206 1207 &$movekey ($rndkey1,&QWP(16,$key_)); 1208 &pxor ($inout1,&QWP(16*1,"esp")); 1209 &pxor ($inout2,&QWP(16*2,"esp")); 1210 &aesenc ($inout0,$rndkey1); 1211 &pxor ($inout3,&QWP(16*3,"esp")); 1212 &pxor ($inout4,&QWP(16*4,"esp")); 1213 &aesenc ($inout1,$rndkey1); 1214 &pxor ($inout5,$rndkey0); 1215 &$movekey ($rndkey0,&QWP(32,$key_)); 1216 &aesenc ($inout2,$rndkey1); 1217 &aesenc ($inout3,$rndkey1); 1218 &aesenc ($inout4,$rndkey1); 1219 &aesenc ($inout5,$rndkey1); 1220 &call (&label("_aesni_encrypt6_enter")); 1221 1222 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1223 &pxor ($twtmp,$twtmp); 1224 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1225 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1226 &xorps ($inout1,&QWP(16*1,"esp")); 1227 &movups (&QWP(16*0,$out),$inout0); # write output 1228 &xorps ($inout2,&QWP(16*2,"esp")); 1229 &movups (&QWP(16*1,$out),$inout1); 1230 &xorps ($inout3,&QWP(16*3,"esp")); 1231 &movups (&QWP(16*2,$out),$inout2); 1232 &xorps ($inout4,&QWP(16*4,"esp")); 1233 &movups (&QWP(16*3,$out),$inout3); 1234 &xorps ($inout5,$tweak); 1235 &movups (&QWP(16*4,$out),$inout4); 1236 &pshufd ($twres,$twtmp,0x13); 1237 &movups (&QWP(16*5,$out),$inout5); 1238 &lea ($out,&DWP(16*6,$out)); 1239 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1240 1241 &pxor ($twtmp,$twtmp); 1242 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1243 &pand ($twres,$twmask); # isolate carry and residue 1244 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1245 &pxor ($tweak,$twres); 1246 1247 &sub ($len,16*6); 1248 &jnc (&label("xts_enc_loop6")); 1249 1250 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1251 &mov ($key,$key_); # restore $key 1252 &mov ($rounds_,$rounds); 1253 1254&set_label("xts_enc_short"); 1255 &add ($len,16*6); 1256 &jz (&label("xts_enc_done6x")); 1257 1258 &movdqa ($inout3,$tweak); # put aside previous tweak 1259 &cmp ($len,0x20); 1260 &jb (&label("xts_enc_one")); 1261 1262 &pshufd ($twres,$twtmp,0x13); 1263 &pxor ($twtmp,$twtmp); 1264 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1265 &pand ($twres,$twmask); # isolate carry and residue 1266 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1267 &pxor ($tweak,$twres); 1268 &je (&label("xts_enc_two")); 1269 1270 &pshufd ($twres,$twtmp,0x13); 1271 &pxor ($twtmp,$twtmp); 1272 &movdqa ($inout4,$tweak); # put aside previous tweak 1273 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1274 &pand ($twres,$twmask); # isolate carry and residue 1275 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1276 &pxor ($tweak,$twres); 1277 &cmp ($len,0x40); 1278 &jb (&label("xts_enc_three")); 1279 1280 &pshufd ($twres,$twtmp,0x13); 1281 &pxor ($twtmp,$twtmp); 1282 &movdqa ($inout5,$tweak); # put aside previous tweak 1283 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1284 &pand ($twres,$twmask); # isolate carry and residue 1285 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1286 &pxor ($tweak,$twres); 1287 &movdqa (&QWP(16*0,"esp"),$inout3); 1288 &movdqa (&QWP(16*1,"esp"),$inout4); 1289 &je (&label("xts_enc_four")); 1290 1291 &movdqa (&QWP(16*2,"esp"),$inout5); 1292 &pshufd ($inout5,$twtmp,0x13); 1293 &movdqa (&QWP(16*3,"esp"),$tweak); 1294 &paddq ($tweak,$tweak); # &psllq($inout0,1); 1295 &pand ($inout5,$twmask); # isolate carry and residue 1296 &pxor ($inout5,$tweak); 1297 1298 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1299 &movdqu ($inout1,&QWP(16*1,$inp)); 1300 &movdqu ($inout2,&QWP(16*2,$inp)); 1301 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1302 &movdqu ($inout3,&QWP(16*3,$inp)); 1303 &pxor ($inout1,&QWP(16*1,"esp")); 1304 &movdqu ($inout4,&QWP(16*4,$inp)); 1305 &pxor ($inout2,&QWP(16*2,"esp")); 1306 &lea ($inp,&DWP(16*5,$inp)); 1307 &pxor ($inout3,&QWP(16*3,"esp")); 1308 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1309 &pxor ($inout4,$inout5); 1310 1311 &call ("_aesni_encrypt6"); 1312 1313 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1314 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1315 &xorps ($inout1,&QWP(16*1,"esp")); 1316 &xorps ($inout2,&QWP(16*2,"esp")); 1317 &movups (&QWP(16*0,$out),$inout0); # write output 1318 &xorps ($inout3,&QWP(16*3,"esp")); 1319 &movups (&QWP(16*1,$out),$inout1); 1320 &xorps ($inout4,$tweak); 1321 &movups (&QWP(16*2,$out),$inout2); 1322 &movups (&QWP(16*3,$out),$inout3); 1323 &movups (&QWP(16*4,$out),$inout4); 1324 &lea ($out,&DWP(16*5,$out)); 1325 &jmp (&label("xts_enc_done")); 1326 1327&set_label("xts_enc_one",16); 1328 &movups ($inout0,&QWP(16*0,$inp)); # load input 1329 &lea ($inp,&DWP(16*1,$inp)); 1330 &xorps ($inout0,$inout3); # input^=tweak 1331 if ($inline) 1332 { &aesni_inline_generate1("enc"); } 1333 else 1334 { &call ("_aesni_encrypt1"); } 1335 &xorps ($inout0,$inout3); # output^=tweak 1336 &movups (&QWP(16*0,$out),$inout0); # write output 1337 &lea ($out,&DWP(16*1,$out)); 1338 1339 &movdqa ($tweak,$inout3); # last tweak 1340 &jmp (&label("xts_enc_done")); 1341 1342&set_label("xts_enc_two",16); 1343 &movaps ($inout4,$tweak); # put aside last tweak 1344 1345 &movups ($inout0,&QWP(16*0,$inp)); # load input 1346 &movups ($inout1,&QWP(16*1,$inp)); 1347 &lea ($inp,&DWP(16*2,$inp)); 1348 &xorps ($inout0,$inout3); # input^=tweak 1349 &xorps ($inout1,$inout4); 1350 1351 &call ("_aesni_encrypt2"); 1352 1353 &xorps ($inout0,$inout3); # output^=tweak 1354 &xorps ($inout1,$inout4); 1355 &movups (&QWP(16*0,$out),$inout0); # write output 1356 &movups (&QWP(16*1,$out),$inout1); 1357 &lea ($out,&DWP(16*2,$out)); 1358 1359 &movdqa ($tweak,$inout4); # last tweak 1360 &jmp (&label("xts_enc_done")); 1361 1362&set_label("xts_enc_three",16); 1363 &movaps ($inout5,$tweak); # put aside last tweak 1364 &movups ($inout0,&QWP(16*0,$inp)); # load input 1365 &movups ($inout1,&QWP(16*1,$inp)); 1366 &movups ($inout2,&QWP(16*2,$inp)); 1367 &lea ($inp,&DWP(16*3,$inp)); 1368 &xorps ($inout0,$inout3); # input^=tweak 1369 &xorps ($inout1,$inout4); 1370 &xorps ($inout2,$inout5); 1371 1372 &call ("_aesni_encrypt3"); 1373 1374 &xorps ($inout0,$inout3); # output^=tweak 1375 &xorps ($inout1,$inout4); 1376 &xorps ($inout2,$inout5); 1377 &movups (&QWP(16*0,$out),$inout0); # write output 1378 &movups (&QWP(16*1,$out),$inout1); 1379 &movups (&QWP(16*2,$out),$inout2); 1380 &lea ($out,&DWP(16*3,$out)); 1381 1382 &movdqa ($tweak,$inout5); # last tweak 1383 &jmp (&label("xts_enc_done")); 1384 1385&set_label("xts_enc_four",16); 1386 &movaps ($inout4,$tweak); # put aside last tweak 1387 1388 &movups ($inout0,&QWP(16*0,$inp)); # load input 1389 &movups ($inout1,&QWP(16*1,$inp)); 1390 &movups ($inout2,&QWP(16*2,$inp)); 1391 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1392 &movups ($inout3,&QWP(16*3,$inp)); 1393 &lea ($inp,&DWP(16*4,$inp)); 1394 &xorps ($inout1,&QWP(16*1,"esp")); 1395 &xorps ($inout2,$inout5); 1396 &xorps ($inout3,$inout4); 1397 1398 &call ("_aesni_encrypt4"); 1399 1400 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1401 &xorps ($inout1,&QWP(16*1,"esp")); 1402 &xorps ($inout2,$inout5); 1403 &movups (&QWP(16*0,$out),$inout0); # write output 1404 &xorps ($inout3,$inout4); 1405 &movups (&QWP(16*1,$out),$inout1); 1406 &movups (&QWP(16*2,$out),$inout2); 1407 &movups (&QWP(16*3,$out),$inout3); 1408 &lea ($out,&DWP(16*4,$out)); 1409 1410 &movdqa ($tweak,$inout4); # last tweak 1411 &jmp (&label("xts_enc_done")); 1412 1413&set_label("xts_enc_done6x",16); # $tweak is pre-calculated 1414 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1415 &and ($len,15); 1416 &jz (&label("xts_enc_ret")); 1417 &movdqa ($inout3,$tweak); 1418 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1419 &jmp (&label("xts_enc_steal")); 1420 1421&set_label("xts_enc_done",16); 1422 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1423 &pxor ($twtmp,$twtmp); 1424 &and ($len,15); 1425 &jz (&label("xts_enc_ret")); 1426 1427 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1428 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1429 &pshufd ($inout3,$twtmp,0x13); 1430 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1431 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue 1432 &pxor ($inout3,$tweak); 1433 1434&set_label("xts_enc_steal"); 1435 &movz ($rounds,&BP(0,$inp)); 1436 &movz ($key,&BP(-16,$out)); 1437 &lea ($inp,&DWP(1,$inp)); 1438 &mov (&BP(-16,$out),&LB($rounds)); 1439 &mov (&BP(0,$out),&LB($key)); 1440 &lea ($out,&DWP(1,$out)); 1441 &sub ($len,1); 1442 &jnz (&label("xts_enc_steal")); 1443 1444 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1445 &mov ($key,$key_); # restore $key 1446 &mov ($rounds,$rounds_); # restore $rounds 1447 1448 &movups ($inout0,&QWP(-16,$out)); # load input 1449 &xorps ($inout0,$inout3); # input^=tweak 1450 if ($inline) 1451 { &aesni_inline_generate1("enc"); } 1452 else 1453 { &call ("_aesni_encrypt1"); } 1454 &xorps ($inout0,$inout3); # output^=tweak 1455 &movups (&QWP(-16,$out),$inout0); # write output 1456 1457&set_label("xts_enc_ret"); 1458 &pxor ("xmm0","xmm0"); # clear register bank 1459 &pxor ("xmm1","xmm1"); 1460 &pxor ("xmm2","xmm2"); 1461 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack 1462 &pxor ("xmm3","xmm3"); 1463 &movdqa (&QWP(16*1,"esp"),"xmm0"); 1464 &pxor ("xmm4","xmm4"); 1465 &movdqa (&QWP(16*2,"esp"),"xmm0"); 1466 &pxor ("xmm5","xmm5"); 1467 &movdqa (&QWP(16*3,"esp"),"xmm0"); 1468 &pxor ("xmm6","xmm6"); 1469 &movdqa (&QWP(16*4,"esp"),"xmm0"); 1470 &pxor ("xmm7","xmm7"); 1471 &movdqa (&QWP(16*5,"esp"),"xmm0"); 1472 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1473&function_end("aesni_xts_encrypt"); 1474 1475&function_begin("aesni_xts_decrypt"); 1476 &mov ($key,&wparam(4)); # key2 1477 &mov ($inp,&wparam(5)); # clear-text tweak 1478 1479 &mov ($rounds,&DWP(240,$key)); # key2->rounds 1480 &movups ($inout0,&QWP(0,$inp)); 1481 if ($inline) 1482 { &aesni_inline_generate1("enc"); } 1483 else 1484 { &call ("_aesni_encrypt1"); } 1485 1486 &mov ($inp,&wparam(0)); 1487 &mov ($out,&wparam(1)); 1488 &mov ($len,&wparam(2)); 1489 &mov ($key,&wparam(3)); # key1 1490 1491 &mov ($key_,"esp"); 1492 &sub ("esp",16*7+8); 1493 &and ("esp",-16); # align stack 1494 1495 &xor ($rounds_,$rounds_); # if(len%16) len-=16; 1496 &test ($len,15); 1497 &setnz (&LB($rounds_)); 1498 &shl ($rounds_,4); 1499 &sub ($len,$rounds_); 1500 1501 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1502 &mov (&DWP(16*6+4,"esp"),0); 1503 &mov (&DWP(16*6+8,"esp"),1); 1504 &mov (&DWP(16*6+12,"esp"),0); 1505 &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1506 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1507 1508 &mov ($rounds,&DWP(240,$key)); # key1->rounds 1509 &mov ($key_,$key); # backup $key 1510 &mov ($rounds_,$rounds); # backup $rounds 1511 1512 &movdqa ($tweak,$inout0); 1513 &pxor ($twtmp,$twtmp); 1514 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1515 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1516 1517 &and ($len,-16); 1518 &sub ($len,16*6); 1519 &jc (&label("xts_dec_short")); 1520 1521 &shl ($rounds,4); 1522 &mov ($rounds_,16); 1523 &sub ($rounds_,$rounds); 1524 &lea ($key,&DWP(32,$key,$rounds)); 1525 &jmp (&label("xts_dec_loop6")); 1526 1527&set_label("xts_dec_loop6",16); 1528 for ($i=0;$i<4;$i++) { 1529 &pshufd ($twres,$twtmp,0x13); 1530 &pxor ($twtmp,$twtmp); 1531 &movdqa (&QWP(16*$i,"esp"),$tweak); 1532 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1533 &pand ($twres,$twmask); # isolate carry and residue 1534 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1535 &pxor ($tweak,$twres); 1536 } 1537 &pshufd ($inout5,$twtmp,0x13); 1538 &movdqa (&QWP(16*$i++,"esp"),$tweak); 1539 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1540 &$movekey ($rndkey0,&QWP(0,$key_)); 1541 &pand ($inout5,$twmask); # isolate carry and residue 1542 &movups ($inout0,&QWP(0,$inp)); # load input 1543 &pxor ($inout5,$tweak); 1544 1545 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1546 &mov ($rounds,$rounds_); 1547 &movdqu ($inout1,&QWP(16*1,$inp)); 1548 &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1549 &movdqu ($inout2,&QWP(16*2,$inp)); 1550 &pxor ($inout1,$rndkey0); 1551 &movdqu ($inout3,&QWP(16*3,$inp)); 1552 &pxor ($inout2,$rndkey0); 1553 &movdqu ($inout4,&QWP(16*4,$inp)); 1554 &pxor ($inout3,$rndkey0); 1555 &movdqu ($rndkey1,&QWP(16*5,$inp)); 1556 &pxor ($inout4,$rndkey0); 1557 &lea ($inp,&DWP(16*6,$inp)); 1558 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1559 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1560 &pxor ($inout5,$rndkey1); 1561 1562 &$movekey ($rndkey1,&QWP(16,$key_)); 1563 &pxor ($inout1,&QWP(16*1,"esp")); 1564 &pxor ($inout2,&QWP(16*2,"esp")); 1565 &aesdec ($inout0,$rndkey1); 1566 &pxor ($inout3,&QWP(16*3,"esp")); 1567 &pxor ($inout4,&QWP(16*4,"esp")); 1568 &aesdec ($inout1,$rndkey1); 1569 &pxor ($inout5,$rndkey0); 1570 &$movekey ($rndkey0,&QWP(32,$key_)); 1571 &aesdec ($inout2,$rndkey1); 1572 &aesdec ($inout3,$rndkey1); 1573 &aesdec ($inout4,$rndkey1); 1574 &aesdec ($inout5,$rndkey1); 1575 &call (&label("_aesni_decrypt6_enter")); 1576 1577 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1578 &pxor ($twtmp,$twtmp); 1579 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1580 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1581 &xorps ($inout1,&QWP(16*1,"esp")); 1582 &movups (&QWP(16*0,$out),$inout0); # write output 1583 &xorps ($inout2,&QWP(16*2,"esp")); 1584 &movups (&QWP(16*1,$out),$inout1); 1585 &xorps ($inout3,&QWP(16*3,"esp")); 1586 &movups (&QWP(16*2,$out),$inout2); 1587 &xorps ($inout4,&QWP(16*4,"esp")); 1588 &movups (&QWP(16*3,$out),$inout3); 1589 &xorps ($inout5,$tweak); 1590 &movups (&QWP(16*4,$out),$inout4); 1591 &pshufd ($twres,$twtmp,0x13); 1592 &movups (&QWP(16*5,$out),$inout5); 1593 &lea ($out,&DWP(16*6,$out)); 1594 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1595 1596 &pxor ($twtmp,$twtmp); 1597 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1598 &pand ($twres,$twmask); # isolate carry and residue 1599 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1600 &pxor ($tweak,$twres); 1601 1602 &sub ($len,16*6); 1603 &jnc (&label("xts_dec_loop6")); 1604 1605 &mov ($rounds,&DWP(240,$key_)); # restore $rounds 1606 &mov ($key,$key_); # restore $key 1607 &mov ($rounds_,$rounds); 1608 1609&set_label("xts_dec_short"); 1610 &add ($len,16*6); 1611 &jz (&label("xts_dec_done6x")); 1612 1613 &movdqa ($inout3,$tweak); # put aside previous tweak 1614 &cmp ($len,0x20); 1615 &jb (&label("xts_dec_one")); 1616 1617 &pshufd ($twres,$twtmp,0x13); 1618 &pxor ($twtmp,$twtmp); 1619 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1620 &pand ($twres,$twmask); # isolate carry and residue 1621 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1622 &pxor ($tweak,$twres); 1623 &je (&label("xts_dec_two")); 1624 1625 &pshufd ($twres,$twtmp,0x13); 1626 &pxor ($twtmp,$twtmp); 1627 &movdqa ($inout4,$tweak); # put aside previous tweak 1628 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1629 &pand ($twres,$twmask); # isolate carry and residue 1630 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1631 &pxor ($tweak,$twres); 1632 &cmp ($len,0x40); 1633 &jb (&label("xts_dec_three")); 1634 1635 &pshufd ($twres,$twtmp,0x13); 1636 &pxor ($twtmp,$twtmp); 1637 &movdqa ($inout5,$tweak); # put aside previous tweak 1638 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1639 &pand ($twres,$twmask); # isolate carry and residue 1640 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1641 &pxor ($tweak,$twres); 1642 &movdqa (&QWP(16*0,"esp"),$inout3); 1643 &movdqa (&QWP(16*1,"esp"),$inout4); 1644 &je (&label("xts_dec_four")); 1645 1646 &movdqa (&QWP(16*2,"esp"),$inout5); 1647 &pshufd ($inout5,$twtmp,0x13); 1648 &movdqa (&QWP(16*3,"esp"),$tweak); 1649 &paddq ($tweak,$tweak); # &psllq($inout0,1); 1650 &pand ($inout5,$twmask); # isolate carry and residue 1651 &pxor ($inout5,$tweak); 1652 1653 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1654 &movdqu ($inout1,&QWP(16*1,$inp)); 1655 &movdqu ($inout2,&QWP(16*2,$inp)); 1656 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1657 &movdqu ($inout3,&QWP(16*3,$inp)); 1658 &pxor ($inout1,&QWP(16*1,"esp")); 1659 &movdqu ($inout4,&QWP(16*4,$inp)); 1660 &pxor ($inout2,&QWP(16*2,"esp")); 1661 &lea ($inp,&DWP(16*5,$inp)); 1662 &pxor ($inout3,&QWP(16*3,"esp")); 1663 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1664 &pxor ($inout4,$inout5); 1665 1666 &call ("_aesni_decrypt6"); 1667 1668 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1669 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1670 &xorps ($inout1,&QWP(16*1,"esp")); 1671 &xorps ($inout2,&QWP(16*2,"esp")); 1672 &movups (&QWP(16*0,$out),$inout0); # write output 1673 &xorps ($inout3,&QWP(16*3,"esp")); 1674 &movups (&QWP(16*1,$out),$inout1); 1675 &xorps ($inout4,$tweak); 1676 &movups (&QWP(16*2,$out),$inout2); 1677 &movups (&QWP(16*3,$out),$inout3); 1678 &movups (&QWP(16*4,$out),$inout4); 1679 &lea ($out,&DWP(16*5,$out)); 1680 &jmp (&label("xts_dec_done")); 1681 1682&set_label("xts_dec_one",16); 1683 &movups ($inout0,&QWP(16*0,$inp)); # load input 1684 &lea ($inp,&DWP(16*1,$inp)); 1685 &xorps ($inout0,$inout3); # input^=tweak 1686 if ($inline) 1687 { &aesni_inline_generate1("dec"); } 1688 else 1689 { &call ("_aesni_decrypt1"); } 1690 &xorps ($inout0,$inout3); # output^=tweak 1691 &movups (&QWP(16*0,$out),$inout0); # write output 1692 &lea ($out,&DWP(16*1,$out)); 1693 1694 &movdqa ($tweak,$inout3); # last tweak 1695 &jmp (&label("xts_dec_done")); 1696 1697&set_label("xts_dec_two",16); 1698 &movaps ($inout4,$tweak); # put aside last tweak 1699 1700 &movups ($inout0,&QWP(16*0,$inp)); # load input 1701 &movups ($inout1,&QWP(16*1,$inp)); 1702 &lea ($inp,&DWP(16*2,$inp)); 1703 &xorps ($inout0,$inout3); # input^=tweak 1704 &xorps ($inout1,$inout4); 1705 1706 &call ("_aesni_decrypt2"); 1707 1708 &xorps ($inout0,$inout3); # output^=tweak 1709 &xorps ($inout1,$inout4); 1710 &movups (&QWP(16*0,$out),$inout0); # write output 1711 &movups (&QWP(16*1,$out),$inout1); 1712 &lea ($out,&DWP(16*2,$out)); 1713 1714 &movdqa ($tweak,$inout4); # last tweak 1715 &jmp (&label("xts_dec_done")); 1716 1717&set_label("xts_dec_three",16); 1718 &movaps ($inout5,$tweak); # put aside last tweak 1719 &movups ($inout0,&QWP(16*0,$inp)); # load input 1720 &movups ($inout1,&QWP(16*1,$inp)); 1721 &movups ($inout2,&QWP(16*2,$inp)); 1722 &lea ($inp,&DWP(16*3,$inp)); 1723 &xorps ($inout0,$inout3); # input^=tweak 1724 &xorps ($inout1,$inout4); 1725 &xorps ($inout2,$inout5); 1726 1727 &call ("_aesni_decrypt3"); 1728 1729 &xorps ($inout0,$inout3); # output^=tweak 1730 &xorps ($inout1,$inout4); 1731 &xorps ($inout2,$inout5); 1732 &movups (&QWP(16*0,$out),$inout0); # write output 1733 &movups (&QWP(16*1,$out),$inout1); 1734 &movups (&QWP(16*2,$out),$inout2); 1735 &lea ($out,&DWP(16*3,$out)); 1736 1737 &movdqa ($tweak,$inout5); # last tweak 1738 &jmp (&label("xts_dec_done")); 1739 1740&set_label("xts_dec_four",16); 1741 &movaps ($inout4,$tweak); # put aside last tweak 1742 1743 &movups ($inout0,&QWP(16*0,$inp)); # load input 1744 &movups ($inout1,&QWP(16*1,$inp)); 1745 &movups ($inout2,&QWP(16*2,$inp)); 1746 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1747 &movups ($inout3,&QWP(16*3,$inp)); 1748 &lea ($inp,&DWP(16*4,$inp)); 1749 &xorps ($inout1,&QWP(16*1,"esp")); 1750 &xorps ($inout2,$inout5); 1751 &xorps ($inout3,$inout4); 1752 1753 &call ("_aesni_decrypt4"); 1754 1755 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1756 &xorps ($inout1,&QWP(16*1,"esp")); 1757 &xorps ($inout2,$inout5); 1758 &movups (&QWP(16*0,$out),$inout0); # write output 1759 &xorps ($inout3,$inout4); 1760 &movups (&QWP(16*1,$out),$inout1); 1761 &movups (&QWP(16*2,$out),$inout2); 1762 &movups (&QWP(16*3,$out),$inout3); 1763 &lea ($out,&DWP(16*4,$out)); 1764 1765 &movdqa ($tweak,$inout4); # last tweak 1766 &jmp (&label("xts_dec_done")); 1767 1768&set_label("xts_dec_done6x",16); # $tweak is pre-calculated 1769 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1770 &and ($len,15); 1771 &jz (&label("xts_dec_ret")); 1772 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1773 &jmp (&label("xts_dec_only_one_more")); 1774 1775&set_label("xts_dec_done",16); 1776 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1777 &pxor ($twtmp,$twtmp); 1778 &and ($len,15); 1779 &jz (&label("xts_dec_ret")); 1780 1781 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1782 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1783 &pshufd ($twres,$twtmp,0x13); 1784 &pxor ($twtmp,$twtmp); 1785 &movdqa ($twmask,&QWP(16*6,"esp")); 1786 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1787 &pand ($twres,$twmask); # isolate carry and residue 1788 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1789 &pxor ($tweak,$twres); 1790 1791&set_label("xts_dec_only_one_more"); 1792 &pshufd ($inout3,$twtmp,0x13); 1793 &movdqa ($inout4,$tweak); # put aside previous tweak 1794 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1795 &pand ($inout3,$twmask); # isolate carry and residue 1796 &pxor ($inout3,$tweak); 1797 1798 &mov ($key,$key_); # restore $key 1799 &mov ($rounds,$rounds_); # restore $rounds 1800 1801 &movups ($inout0,&QWP(0,$inp)); # load input 1802 &xorps ($inout0,$inout3); # input^=tweak 1803 if ($inline) 1804 { &aesni_inline_generate1("dec"); } 1805 else 1806 { &call ("_aesni_decrypt1"); } 1807 &xorps ($inout0,$inout3); # output^=tweak 1808 &movups (&QWP(0,$out),$inout0); # write output 1809 1810&set_label("xts_dec_steal"); 1811 &movz ($rounds,&BP(16,$inp)); 1812 &movz ($key,&BP(0,$out)); 1813 &lea ($inp,&DWP(1,$inp)); 1814 &mov (&BP(0,$out),&LB($rounds)); 1815 &mov (&BP(16,$out),&LB($key)); 1816 &lea ($out,&DWP(1,$out)); 1817 &sub ($len,1); 1818 &jnz (&label("xts_dec_steal")); 1819 1820 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1821 &mov ($key,$key_); # restore $key 1822 &mov ($rounds,$rounds_); # restore $rounds 1823 1824 &movups ($inout0,&QWP(0,$out)); # load input 1825 &xorps ($inout0,$inout4); # input^=tweak 1826 if ($inline) 1827 { &aesni_inline_generate1("dec"); } 1828 else 1829 { &call ("_aesni_decrypt1"); } 1830 &xorps ($inout0,$inout4); # output^=tweak 1831 &movups (&QWP(0,$out),$inout0); # write output 1832 1833&set_label("xts_dec_ret"); 1834 &pxor ("xmm0","xmm0"); # clear register bank 1835 &pxor ("xmm1","xmm1"); 1836 &pxor ("xmm2","xmm2"); 1837 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack 1838 &pxor ("xmm3","xmm3"); 1839 &movdqa (&QWP(16*1,"esp"),"xmm0"); 1840 &pxor ("xmm4","xmm4"); 1841 &movdqa (&QWP(16*2,"esp"),"xmm0"); 1842 &pxor ("xmm5","xmm5"); 1843 &movdqa (&QWP(16*3,"esp"),"xmm0"); 1844 &pxor ("xmm6","xmm6"); 1845 &movdqa (&QWP(16*4,"esp"),"xmm0"); 1846 &pxor ("xmm7","xmm7"); 1847 &movdqa (&QWP(16*5,"esp"),"xmm0"); 1848 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1849&function_end("aesni_xts_decrypt"); 1850} 1851 1852###################################################################### 1853# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks, 1854# const AES_KEY *key, unsigned int start_block_num, 1855# unsigned char offset_i[16], const unsigned char L_[][16], 1856# unsigned char checksum[16]); 1857# 1858{ 1859# offsets within stack frame 1860my $checksum = 16*6; 1861my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4)); 1862 1863# reassigned registers 1864my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out); 1865# $l_, $blocks, $inp, $key are permanently allocated in registers; 1866# remaining non-volatile ones are offloaded to stack, which even 1867# stay invariant after written to stack. 1868 1869&function_begin("aesni_ocb_encrypt"); 1870 &mov ($rounds,&wparam(5)); # &offset_i 1871 &mov ($rounds_,&wparam(7)); # &checksum 1872 1873 &mov ($inp,&wparam(0)); 1874 &mov ($out,&wparam(1)); 1875 &mov ($len,&wparam(2)); 1876 &mov ($key,&wparam(3)); 1877 &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i 1878 &mov ($block,&wparam(4)); # start_block_num 1879 &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum 1880 &mov ($l_,&wparam(6)); # L_ 1881 1882 &mov ($rounds,"esp"); 1883 &sub ("esp",$esp_off+4); # alloca 1884 &and ("esp",-16); # align stack 1885 1886 &sub ($out,$inp); 1887 &shl ($len,4); 1888 &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6 1889 &mov (&DWP($out_off,"esp"),$out); 1890 &mov (&DWP($end_off,"esp"),$len); 1891 &mov (&DWP($esp_off,"esp"),$rounds); 1892 1893 &mov ($rounds,&DWP(240,$key)); 1894 1895 &test ($block,1); 1896 &jnz (&label("odd")); 1897 1898 &bsf ($i3,$block); 1899 &add ($block,1); 1900 &shl ($i3,4); 1901 &movdqu ($inout5,&QWP(0,$l_,$i3)); 1902 &mov ($i3,$key); # put aside key 1903 1904 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1905 &lea ($inp,&DWP(16,$inp)); 1906 1907 &pxor ($inout5,$rndkey0); # ^ last offset_i 1908 &pxor ($rndkey1,$inout0); # checksum 1909 &pxor ($inout0,$inout5); # ^ offset_i 1910 1911 &movdqa ($inout4,$rndkey1); 1912 if ($inline) 1913 { &aesni_inline_generate1("enc"); } 1914 else 1915 { &call ("_aesni_encrypt1"); } 1916 1917 &xorps ($inout0,$inout5); # ^ offset_i 1918 &movdqa ($rndkey0,$inout5); # pass last offset_i 1919 &movdqa ($rndkey1,$inout4); # pass the checksum 1920 1921 &movups (&QWP(-16,$out,$inp),$inout0); # store output 1922 1923 &mov ($rounds,&DWP(240,$i3)); 1924 &mov ($key,$i3); # restore key 1925 &mov ($len,&DWP($end_off,"esp")); 1926 1927&set_label("odd"); 1928 &shl ($rounds,4); 1929 &mov ($out,16); 1930 &sub ($out,$rounds); # twisted rounds 1931 &mov (&DWP($key_off,"esp"),$key); 1932 &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule 1933 &mov (&DWP($rounds_off,"esp"),$out); 1934 1935 &cmp ($inp,$len); 1936 &ja (&label("short")); 1937 &jmp (&label("grandloop")); 1938 1939&set_label("grandloop",32); 1940 &lea ($i1,&DWP(1,$block)); 1941 &lea ($i3,&DWP(3,$block)); 1942 &lea ($i5,&DWP(5,$block)); 1943 &add ($block,6); 1944 &bsf ($i1,$i1); 1945 &bsf ($i3,$i3); 1946 &bsf ($i5,$i5); 1947 &shl ($i1,4); 1948 &shl ($i3,4); 1949 &shl ($i5,4); 1950 &movdqu ($inout0,&QWP(0,$l_)); 1951 &movdqu ($inout1,&QWP(0,$l_,$i1)); 1952 &mov ($rounds,&DWP($rounds_off,"esp")); 1953 &movdqa ($inout2,$inout0); 1954 &movdqu ($inout3,&QWP(0,$l_,$i3)); 1955 &movdqa ($inout4,$inout0); 1956 &movdqu ($inout5,&QWP(0,$l_,$i5)); 1957 1958 &pxor ($inout0,$rndkey0); # ^ last offset_i 1959 &pxor ($inout1,$inout0); 1960 &movdqa (&QWP(16*0,"esp"),$inout0); 1961 &pxor ($inout2,$inout1); 1962 &movdqa (&QWP(16*1,"esp"),$inout1); 1963 &pxor ($inout3,$inout2); 1964 &movdqa (&QWP(16*2,"esp"),$inout2); 1965 &pxor ($inout4,$inout3); 1966 &movdqa (&QWP(16*3,"esp"),$inout3); 1967 &pxor ($inout5,$inout4); 1968 &movdqa (&QWP(16*4,"esp"),$inout4); 1969 &movdqa (&QWP(16*5,"esp"),$inout5); 1970 1971 &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); 1972 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1973 &movdqu ($inout1,&QWP(16*1,$inp)); 1974 &movdqu ($inout2,&QWP(16*2,$inp)); 1975 &movdqu ($inout3,&QWP(16*3,$inp)); 1976 &movdqu ($inout4,&QWP(16*4,$inp)); 1977 &movdqu ($inout5,&QWP(16*5,$inp)); 1978 &lea ($inp,&DWP(16*6,$inp)); 1979 1980 &pxor ($rndkey1,$inout0); # checksum 1981 &pxor ($inout0,$rndkey0); # ^ roundkey[0] 1982 &pxor ($rndkey1,$inout1); 1983 &pxor ($inout1,$rndkey0); 1984 &pxor ($rndkey1,$inout2); 1985 &pxor ($inout2,$rndkey0); 1986 &pxor ($rndkey1,$inout3); 1987 &pxor ($inout3,$rndkey0); 1988 &pxor ($rndkey1,$inout4); 1989 &pxor ($inout4,$rndkey0); 1990 &pxor ($rndkey1,$inout5); 1991 &pxor ($inout5,$rndkey0); 1992 &movdqa (&QWP($checksum,"esp"),$rndkey1); 1993 1994 &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); 1995 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 1996 &pxor ($inout1,&QWP(16*1,"esp")); 1997 &pxor ($inout2,&QWP(16*2,"esp")); 1998 &pxor ($inout3,&QWP(16*3,"esp")); 1999 &pxor ($inout4,&QWP(16*4,"esp")); 2000 &pxor ($inout5,&QWP(16*5,"esp")); 2001 2002 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 2003 &aesenc ($inout0,$rndkey1); 2004 &aesenc ($inout1,$rndkey1); 2005 &aesenc ($inout2,$rndkey1); 2006 &aesenc ($inout3,$rndkey1); 2007 &aesenc ($inout4,$rndkey1); 2008 &aesenc ($inout5,$rndkey1); 2009 2010 &mov ($out,&DWP($out_off,"esp")); 2011 &mov ($len,&DWP($end_off,"esp")); 2012 &call ("_aesni_encrypt6_enter"); 2013 2014 &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i 2015 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2016 &pxor ($inout1,&QWP(16*1,"esp")); 2017 &pxor ($inout2,&QWP(16*2,"esp")); 2018 &pxor ($inout3,&QWP(16*3,"esp")); 2019 &pxor ($inout4,&QWP(16*4,"esp")); 2020 &pxor ($inout5,$rndkey0); 2021 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum 2022 2023 &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output 2024 &movdqu (&QWP(-16*5,$out,$inp),$inout1); 2025 &movdqu (&QWP(-16*4,$out,$inp),$inout2); 2026 &movdqu (&QWP(-16*3,$out,$inp),$inout3); 2027 &movdqu (&QWP(-16*2,$out,$inp),$inout4); 2028 &movdqu (&QWP(-16*1,$out,$inp),$inout5); 2029 &cmp ($inp,$len); # done yet? 2030 &jbe (&label("grandloop")); 2031 2032&set_label("short"); 2033 &add ($len,16*6); 2034 &sub ($len,$inp); 2035 &jz (&label("done")); 2036 2037 &cmp ($len,16*2); 2038 &jb (&label("one")); 2039 &je (&label("two")); 2040 2041 &cmp ($len,16*4); 2042 &jb (&label("three")); 2043 &je (&label("four")); 2044 2045 &lea ($i1,&DWP(1,$block)); 2046 &lea ($i3,&DWP(3,$block)); 2047 &bsf ($i1,$i1); 2048 &bsf ($i3,$i3); 2049 &shl ($i1,4); 2050 &shl ($i3,4); 2051 &movdqu ($inout0,&QWP(0,$l_)); 2052 &movdqu ($inout1,&QWP(0,$l_,$i1)); 2053 &mov ($rounds,&DWP($rounds_off,"esp")); 2054 &movdqa ($inout2,$inout0); 2055 &movdqu ($inout3,&QWP(0,$l_,$i3)); 2056 &movdqa ($inout4,$inout0); 2057 2058 &pxor ($inout0,$rndkey0); # ^ last offset_i 2059 &pxor ($inout1,$inout0); 2060 &movdqa (&QWP(16*0,"esp"),$inout0); 2061 &pxor ($inout2,$inout1); 2062 &movdqa (&QWP(16*1,"esp"),$inout1); 2063 &pxor ($inout3,$inout2); 2064 &movdqa (&QWP(16*2,"esp"),$inout2); 2065 &pxor ($inout4,$inout3); 2066 &movdqa (&QWP(16*3,"esp"),$inout3); 2067 &pxor ($inout5,$inout4); 2068 &movdqa (&QWP(16*4,"esp"),$inout4); 2069 2070 &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); 2071 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2072 &movdqu ($inout1,&QWP(16*1,$inp)); 2073 &movdqu ($inout2,&QWP(16*2,$inp)); 2074 &movdqu ($inout3,&QWP(16*3,$inp)); 2075 &movdqu ($inout4,&QWP(16*4,$inp)); 2076 &pxor ($inout5,$inout5); 2077 2078 &pxor ($rndkey1,$inout0); # checksum 2079 &pxor ($inout0,$rndkey0); # ^ roundkey[0] 2080 &pxor ($rndkey1,$inout1); 2081 &pxor ($inout1,$rndkey0); 2082 &pxor ($rndkey1,$inout2); 2083 &pxor ($inout2,$rndkey0); 2084 &pxor ($rndkey1,$inout3); 2085 &pxor ($inout3,$rndkey0); 2086 &pxor ($rndkey1,$inout4); 2087 &pxor ($inout4,$rndkey0); 2088 &movdqa (&QWP($checksum,"esp"),$rndkey1); 2089 2090 &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); 2091 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2092 &pxor ($inout1,&QWP(16*1,"esp")); 2093 &pxor ($inout2,&QWP(16*2,"esp")); 2094 &pxor ($inout3,&QWP(16*3,"esp")); 2095 &pxor ($inout4,&QWP(16*4,"esp")); 2096 2097 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 2098 &aesenc ($inout0,$rndkey1); 2099 &aesenc ($inout1,$rndkey1); 2100 &aesenc ($inout2,$rndkey1); 2101 &aesenc ($inout3,$rndkey1); 2102 &aesenc ($inout4,$rndkey1); 2103 &aesenc ($inout5,$rndkey1); 2104 2105 &mov ($out,&DWP($out_off,"esp")); 2106 &call ("_aesni_encrypt6_enter"); 2107 2108 &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i 2109 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2110 &pxor ($inout1,&QWP(16*1,"esp")); 2111 &pxor ($inout2,&QWP(16*2,"esp")); 2112 &pxor ($inout3,&QWP(16*3,"esp")); 2113 &pxor ($inout4,$rndkey0); 2114 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum 2115 2116 &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output 2117 &movdqu (&QWP(16*1,$out,$inp),$inout1); 2118 &movdqu (&QWP(16*2,$out,$inp),$inout2); 2119 &movdqu (&QWP(16*3,$out,$inp),$inout3); 2120 &movdqu (&QWP(16*4,$out,$inp),$inout4); 2121 2122 &jmp (&label("done")); 2123 2124&set_label("one",16); 2125 &movdqu ($inout5,&QWP(0,$l_)); 2126 &mov ($key,&DWP($key_off,"esp")); # restore key 2127 2128 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2129 &mov ($rounds,&DWP(240,$key)); 2130 2131 &pxor ($inout5,$rndkey0); # ^ last offset_i 2132 &pxor ($rndkey1,$inout0); # checksum 2133 &pxor ($inout0,$inout5); # ^ offset_i 2134 2135 &movdqa ($inout4,$rndkey1); 2136 &mov ($out,&DWP($out_off,"esp")); 2137 if ($inline) 2138 { &aesni_inline_generate1("enc"); } 2139 else 2140 { &call ("_aesni_encrypt1"); } 2141 2142 &xorps ($inout0,$inout5); # ^ offset_i 2143 &movdqa ($rndkey0,$inout5); # pass last offset_i 2144 &movdqa ($rndkey1,$inout4); # pass the checksum 2145 &movups (&QWP(0,$out,$inp),$inout0); 2146 2147 &jmp (&label("done")); 2148 2149&set_label("two",16); 2150 &lea ($i1,&DWP(1,$block)); 2151 &mov ($key,&DWP($key_off,"esp")); # restore key 2152 &bsf ($i1,$i1); 2153 &shl ($i1,4); 2154 &movdqu ($inout4,&QWP(0,$l_)); 2155 &movdqu ($inout5,&QWP(0,$l_,$i1)); 2156 2157 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2158 &movdqu ($inout1,&QWP(16*1,$inp)); 2159 &mov ($rounds,&DWP(240,$key)); 2160 2161 &pxor ($inout4,$rndkey0); # ^ last offset_i 2162 &pxor ($inout5,$inout4); 2163 2164 &pxor ($rndkey1,$inout0); # checksum 2165 &pxor ($inout0,$inout4); # ^ offset_i 2166 &pxor ($rndkey1,$inout1); 2167 &pxor ($inout1,$inout5); 2168 2169 &movdqa ($inout3,$rndkey1) 2170 &mov ($out,&DWP($out_off,"esp")); 2171 &call ("_aesni_encrypt2"); 2172 2173 &xorps ($inout0,$inout4); # ^ offset_i 2174 &xorps ($inout1,$inout5); 2175 &movdqa ($rndkey0,$inout5); # pass last offset_i 2176 &movdqa ($rndkey1,$inout3); # pass the checksum 2177 &movups (&QWP(16*0,$out,$inp),$inout0); # store output 2178 &movups (&QWP(16*1,$out,$inp),$inout1); 2179 2180 &jmp (&label("done")); 2181 2182&set_label("three",16); 2183 &lea ($i1,&DWP(1,$block)); 2184 &mov ($key,&DWP($key_off,"esp")); # restore key 2185 &bsf ($i1,$i1); 2186 &shl ($i1,4); 2187 &movdqu ($inout3,&QWP(0,$l_)); 2188 &movdqu ($inout4,&QWP(0,$l_,$i1)); 2189 &movdqa ($inout5,$inout3); 2190 2191 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2192 &movdqu ($inout1,&QWP(16*1,$inp)); 2193 &movdqu ($inout2,&QWP(16*2,$inp)); 2194 &mov ($rounds,&DWP(240,$key)); 2195 2196 &pxor ($inout3,$rndkey0); # ^ last offset_i 2197 &pxor ($inout4,$inout3); 2198 &pxor ($inout5,$inout4); 2199 2200 &pxor ($rndkey1,$inout0); # checksum 2201 &pxor ($inout0,$inout3); # ^ offset_i 2202 &pxor ($rndkey1,$inout1); 2203 &pxor ($inout1,$inout4); 2204 &pxor ($rndkey1,$inout2); 2205 &pxor ($inout2,$inout5); 2206 2207 &movdqa (&QWP($checksum,"esp"),$rndkey1); 2208 &mov ($out,&DWP($out_off,"esp")); 2209 &call ("_aesni_encrypt3"); 2210 2211 &xorps ($inout0,$inout3); # ^ offset_i 2212 &xorps ($inout1,$inout4); 2213 &xorps ($inout2,$inout5); 2214 &movdqa ($rndkey0,$inout5); # pass last offset_i 2215 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum 2216 &movups (&QWP(16*0,$out,$inp),$inout0); # store output 2217 &movups (&QWP(16*1,$out,$inp),$inout1); 2218 &movups (&QWP(16*2,$out,$inp),$inout2); 2219 2220 &jmp (&label("done")); 2221 2222&set_label("four",16); 2223 &lea ($i1,&DWP(1,$block)); 2224 &lea ($i3,&DWP(3,$block)); 2225 &bsf ($i1,$i1); 2226 &bsf ($i3,$i3); 2227 &mov ($key,&DWP($key_off,"esp")); # restore key 2228 &shl ($i1,4); 2229 &shl ($i3,4); 2230 &movdqu ($inout2,&QWP(0,$l_)); 2231 &movdqu ($inout3,&QWP(0,$l_,$i1)); 2232 &movdqa ($inout4,$inout2); 2233 &movdqu ($inout5,&QWP(0,$l_,$i3)); 2234 2235 &pxor ($inout2,$rndkey0); # ^ last offset_i 2236 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2237 &pxor ($inout3,$inout2); 2238 &movdqu ($inout1,&QWP(16*1,$inp)); 2239 &pxor ($inout4,$inout3); 2240 &movdqa (&QWP(16*0,"esp"),$inout2); 2241 &pxor ($inout5,$inout4); 2242 &movdqa (&QWP(16*1,"esp"),$inout3); 2243 &movdqu ($inout2,&QWP(16*2,$inp)); 2244 &movdqu ($inout3,&QWP(16*3,$inp)); 2245 &mov ($rounds,&DWP(240,$key)); 2246 2247 &pxor ($rndkey1,$inout0); # checksum 2248 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2249 &pxor ($rndkey1,$inout1); 2250 &pxor ($inout1,&QWP(16*1,"esp")); 2251 &pxor ($rndkey1,$inout2); 2252 &pxor ($inout2,$inout4); 2253 &pxor ($rndkey1,$inout3); 2254 &pxor ($inout3,$inout5); 2255 2256 &movdqa (&QWP($checksum,"esp"),$rndkey1) 2257 &mov ($out,&DWP($out_off,"esp")); 2258 &call ("_aesni_encrypt4"); 2259 2260 &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2261 &xorps ($inout1,&QWP(16*1,"esp")); 2262 &xorps ($inout2,$inout4); 2263 &movups (&QWP(16*0,$out,$inp),$inout0); # store output 2264 &xorps ($inout3,$inout5); 2265 &movups (&QWP(16*1,$out,$inp),$inout1); 2266 &movdqa ($rndkey0,$inout5); # pass last offset_i 2267 &movups (&QWP(16*2,$out,$inp),$inout2); 2268 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum 2269 &movups (&QWP(16*3,$out,$inp),$inout3); 2270 2271&set_label("done"); 2272 &mov ($key,&DWP($esp_off,"esp")); 2273 &pxor ($inout0,$inout0); # clear register bank 2274 &pxor ($inout1,$inout1); 2275 &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack 2276 &pxor ($inout2,$inout2); 2277 &movdqa (&QWP(16*1,"esp"),$inout0); 2278 &pxor ($inout3,$inout3); 2279 &movdqa (&QWP(16*2,"esp"),$inout0); 2280 &pxor ($inout4,$inout4); 2281 &movdqa (&QWP(16*3,"esp"),$inout0); 2282 &pxor ($inout5,$inout5); 2283 &movdqa (&QWP(16*4,"esp"),$inout0); 2284 &movdqa (&QWP(16*5,"esp"),$inout0); 2285 &movdqa (&QWP(16*6,"esp"),$inout0); 2286 2287 &lea ("esp",&DWP(0,$key)); 2288 &mov ($rounds,&wparam(5)); # &offset_i 2289 &mov ($rounds_,&wparam(7)); # &checksum 2290 &movdqu (&QWP(0,$rounds),$rndkey0); 2291 &pxor ($rndkey0,$rndkey0); 2292 &movdqu (&QWP(0,$rounds_),$rndkey1); 2293 &pxor ($rndkey1,$rndkey1); 2294&function_end("aesni_ocb_encrypt"); 2295 2296&function_begin("aesni_ocb_decrypt"); 2297 &mov ($rounds,&wparam(5)); # &offset_i 2298 &mov ($rounds_,&wparam(7)); # &checksum 2299 2300 &mov ($inp,&wparam(0)); 2301 &mov ($out,&wparam(1)); 2302 &mov ($len,&wparam(2)); 2303 &mov ($key,&wparam(3)); 2304 &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i 2305 &mov ($block,&wparam(4)); # start_block_num 2306 &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum 2307 &mov ($l_,&wparam(6)); # L_ 2308 2309 &mov ($rounds,"esp"); 2310 &sub ("esp",$esp_off+4); # alloca 2311 &and ("esp",-16); # align stack 2312 2313 &sub ($out,$inp); 2314 &shl ($len,4); 2315 &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6 2316 &mov (&DWP($out_off,"esp"),$out); 2317 &mov (&DWP($end_off,"esp"),$len); 2318 &mov (&DWP($esp_off,"esp"),$rounds); 2319 2320 &mov ($rounds,&DWP(240,$key)); 2321 2322 &test ($block,1); 2323 &jnz (&label("odd")); 2324 2325 &bsf ($i3,$block); 2326 &add ($block,1); 2327 &shl ($i3,4); 2328 &movdqu ($inout5,&QWP(0,$l_,$i3)); 2329 &mov ($i3,$key); # put aside key 2330 2331 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2332 &lea ($inp,&DWP(16,$inp)); 2333 2334 &pxor ($inout5,$rndkey0); # ^ last offset_i 2335 &pxor ($inout0,$inout5); # ^ offset_i 2336 2337 &movdqa ($inout4,$rndkey1); 2338 if ($inline) 2339 { &aesni_inline_generate1("dec"); } 2340 else 2341 { &call ("_aesni_decrypt1"); } 2342 2343 &xorps ($inout0,$inout5); # ^ offset_i 2344 &movaps ($rndkey1,$inout4); # pass the checksum 2345 &movdqa ($rndkey0,$inout5); # pass last offset_i 2346 &xorps ($rndkey1,$inout0); # checksum 2347 &movups (&QWP(-16,$out,$inp),$inout0); # store output 2348 2349 &mov ($rounds,&DWP(240,$i3)); 2350 &mov ($key,$i3); # restore key 2351 &mov ($len,&DWP($end_off,"esp")); 2352 2353&set_label("odd"); 2354 &shl ($rounds,4); 2355 &mov ($out,16); 2356 &sub ($out,$rounds); # twisted rounds 2357 &mov (&DWP($key_off,"esp"),$key); 2358 &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule 2359 &mov (&DWP($rounds_off,"esp"),$out); 2360 2361 &cmp ($inp,$len); 2362 &ja (&label("short")); 2363 &jmp (&label("grandloop")); 2364 2365&set_label("grandloop",32); 2366 &lea ($i1,&DWP(1,$block)); 2367 &lea ($i3,&DWP(3,$block)); 2368 &lea ($i5,&DWP(5,$block)); 2369 &add ($block,6); 2370 &bsf ($i1,$i1); 2371 &bsf ($i3,$i3); 2372 &bsf ($i5,$i5); 2373 &shl ($i1,4); 2374 &shl ($i3,4); 2375 &shl ($i5,4); 2376 &movdqu ($inout0,&QWP(0,$l_)); 2377 &movdqu ($inout1,&QWP(0,$l_,$i1)); 2378 &mov ($rounds,&DWP($rounds_off,"esp")); 2379 &movdqa ($inout2,$inout0); 2380 &movdqu ($inout3,&QWP(0,$l_,$i3)); 2381 &movdqa ($inout4,$inout0); 2382 &movdqu ($inout5,&QWP(0,$l_,$i5)); 2383 2384 &pxor ($inout0,$rndkey0); # ^ last offset_i 2385 &pxor ($inout1,$inout0); 2386 &movdqa (&QWP(16*0,"esp"),$inout0); 2387 &pxor ($inout2,$inout1); 2388 &movdqa (&QWP(16*1,"esp"),$inout1); 2389 &pxor ($inout3,$inout2); 2390 &movdqa (&QWP(16*2,"esp"),$inout2); 2391 &pxor ($inout4,$inout3); 2392 &movdqa (&QWP(16*3,"esp"),$inout3); 2393 &pxor ($inout5,$inout4); 2394 &movdqa (&QWP(16*4,"esp"),$inout4); 2395 &movdqa (&QWP(16*5,"esp"),$inout5); 2396 2397 &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); 2398 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2399 &movdqu ($inout1,&QWP(16*1,$inp)); 2400 &movdqu ($inout2,&QWP(16*2,$inp)); 2401 &movdqu ($inout3,&QWP(16*3,$inp)); 2402 &movdqu ($inout4,&QWP(16*4,$inp)); 2403 &movdqu ($inout5,&QWP(16*5,$inp)); 2404 &lea ($inp,&DWP(16*6,$inp)); 2405 2406 &movdqa (&QWP($checksum,"esp"),$rndkey1); 2407 &pxor ($inout0,$rndkey0); # ^ roundkey[0] 2408 &pxor ($inout1,$rndkey0); 2409 &pxor ($inout2,$rndkey0); 2410 &pxor ($inout3,$rndkey0); 2411 &pxor ($inout4,$rndkey0); 2412 &pxor ($inout5,$rndkey0); 2413 2414 &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); 2415 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2416 &pxor ($inout1,&QWP(16*1,"esp")); 2417 &pxor ($inout2,&QWP(16*2,"esp")); 2418 &pxor ($inout3,&QWP(16*3,"esp")); 2419 &pxor ($inout4,&QWP(16*4,"esp")); 2420 &pxor ($inout5,&QWP(16*5,"esp")); 2421 2422 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 2423 &aesdec ($inout0,$rndkey1); 2424 &aesdec ($inout1,$rndkey1); 2425 &aesdec ($inout2,$rndkey1); 2426 &aesdec ($inout3,$rndkey1); 2427 &aesdec ($inout4,$rndkey1); 2428 &aesdec ($inout5,$rndkey1); 2429 2430 &mov ($out,&DWP($out_off,"esp")); 2431 &mov ($len,&DWP($end_off,"esp")); 2432 &call ("_aesni_decrypt6_enter"); 2433 2434 &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i 2435 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2436 &movdqa ($rndkey1,&QWP($checksum,"esp")); 2437 &pxor ($inout1,&QWP(16*1,"esp")); 2438 &pxor ($inout2,&QWP(16*2,"esp")); 2439 &pxor ($inout3,&QWP(16*3,"esp")); 2440 &pxor ($inout4,&QWP(16*4,"esp")); 2441 &pxor ($inout5,$rndkey0); 2442 2443 &pxor ($rndkey1,$inout0); # checksum 2444 &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output 2445 &pxor ($rndkey1,$inout1); 2446 &movdqu (&QWP(-16*5,$out,$inp),$inout1); 2447 &pxor ($rndkey1,$inout2); 2448 &movdqu (&QWP(-16*4,$out,$inp),$inout2); 2449 &pxor ($rndkey1,$inout3); 2450 &movdqu (&QWP(-16*3,$out,$inp),$inout3); 2451 &pxor ($rndkey1,$inout4); 2452 &movdqu (&QWP(-16*2,$out,$inp),$inout4); 2453 &pxor ($rndkey1,$inout5); 2454 &movdqu (&QWP(-16*1,$out,$inp),$inout5); 2455 &cmp ($inp,$len); # done yet? 2456 &jbe (&label("grandloop")); 2457 2458&set_label("short"); 2459 &add ($len,16*6); 2460 &sub ($len,$inp); 2461 &jz (&label("done")); 2462 2463 &cmp ($len,16*2); 2464 &jb (&label("one")); 2465 &je (&label("two")); 2466 2467 &cmp ($len,16*4); 2468 &jb (&label("three")); 2469 &je (&label("four")); 2470 2471 &lea ($i1,&DWP(1,$block)); 2472 &lea ($i3,&DWP(3,$block)); 2473 &bsf ($i1,$i1); 2474 &bsf ($i3,$i3); 2475 &shl ($i1,4); 2476 &shl ($i3,4); 2477 &movdqu ($inout0,&QWP(0,$l_)); 2478 &movdqu ($inout1,&QWP(0,$l_,$i1)); 2479 &mov ($rounds,&DWP($rounds_off,"esp")); 2480 &movdqa ($inout2,$inout0); 2481 &movdqu ($inout3,&QWP(0,$l_,$i3)); 2482 &movdqa ($inout4,$inout0); 2483 2484 &pxor ($inout0,$rndkey0); # ^ last offset_i 2485 &pxor ($inout1,$inout0); 2486 &movdqa (&QWP(16*0,"esp"),$inout0); 2487 &pxor ($inout2,$inout1); 2488 &movdqa (&QWP(16*1,"esp"),$inout1); 2489 &pxor ($inout3,$inout2); 2490 &movdqa (&QWP(16*2,"esp"),$inout2); 2491 &pxor ($inout4,$inout3); 2492 &movdqa (&QWP(16*3,"esp"),$inout3); 2493 &pxor ($inout5,$inout4); 2494 &movdqa (&QWP(16*4,"esp"),$inout4); 2495 2496 &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); 2497 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2498 &movdqu ($inout1,&QWP(16*1,$inp)); 2499 &movdqu ($inout2,&QWP(16*2,$inp)); 2500 &movdqu ($inout3,&QWP(16*3,$inp)); 2501 &movdqu ($inout4,&QWP(16*4,$inp)); 2502 &pxor ($inout5,$inout5); 2503 2504 &movdqa (&QWP($checksum,"esp"),$rndkey1); 2505 &pxor ($inout0,$rndkey0); # ^ roundkey[0] 2506 &pxor ($inout1,$rndkey0); 2507 &pxor ($inout2,$rndkey0); 2508 &pxor ($inout3,$rndkey0); 2509 &pxor ($inout4,$rndkey0); 2510 2511 &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); 2512 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2513 &pxor ($inout1,&QWP(16*1,"esp")); 2514 &pxor ($inout2,&QWP(16*2,"esp")); 2515 &pxor ($inout3,&QWP(16*3,"esp")); 2516 &pxor ($inout4,&QWP(16*4,"esp")); 2517 2518 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); 2519 &aesdec ($inout0,$rndkey1); 2520 &aesdec ($inout1,$rndkey1); 2521 &aesdec ($inout2,$rndkey1); 2522 &aesdec ($inout3,$rndkey1); 2523 &aesdec ($inout4,$rndkey1); 2524 &aesdec ($inout5,$rndkey1); 2525 2526 &mov ($out,&DWP($out_off,"esp")); 2527 &call ("_aesni_decrypt6_enter"); 2528 2529 &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i 2530 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2531 &movdqa ($rndkey1,&QWP($checksum,"esp")); 2532 &pxor ($inout1,&QWP(16*1,"esp")); 2533 &pxor ($inout2,&QWP(16*2,"esp")); 2534 &pxor ($inout3,&QWP(16*3,"esp")); 2535 &pxor ($inout4,$rndkey0); 2536 2537 &pxor ($rndkey1,$inout0); # checksum 2538 &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output 2539 &pxor ($rndkey1,$inout1); 2540 &movdqu (&QWP(16*1,$out,$inp),$inout1); 2541 &pxor ($rndkey1,$inout2); 2542 &movdqu (&QWP(16*2,$out,$inp),$inout2); 2543 &pxor ($rndkey1,$inout3); 2544 &movdqu (&QWP(16*3,$out,$inp),$inout3); 2545 &pxor ($rndkey1,$inout4); 2546 &movdqu (&QWP(16*4,$out,$inp),$inout4); 2547 2548 &jmp (&label("done")); 2549 2550&set_label("one",16); 2551 &movdqu ($inout5,&QWP(0,$l_)); 2552 &mov ($key,&DWP($key_off,"esp")); # restore key 2553 2554 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2555 &mov ($rounds,&DWP(240,$key)); 2556 2557 &pxor ($inout5,$rndkey0); # ^ last offset_i 2558 &pxor ($inout0,$inout5); # ^ offset_i 2559 2560 &movdqa ($inout4,$rndkey1); 2561 &mov ($out,&DWP($out_off,"esp")); 2562 if ($inline) 2563 { &aesni_inline_generate1("dec"); } 2564 else 2565 { &call ("_aesni_decrypt1"); } 2566 2567 &xorps ($inout0,$inout5); # ^ offset_i 2568 &movaps ($rndkey1,$inout4); # pass the checksum 2569 &movdqa ($rndkey0,$inout5); # pass last offset_i 2570 &xorps ($rndkey1,$inout0); # checksum 2571 &movups (&QWP(0,$out,$inp),$inout0); 2572 2573 &jmp (&label("done")); 2574 2575&set_label("two",16); 2576 &lea ($i1,&DWP(1,$block)); 2577 &mov ($key,&DWP($key_off,"esp")); # restore key 2578 &bsf ($i1,$i1); 2579 &shl ($i1,4); 2580 &movdqu ($inout4,&QWP(0,$l_)); 2581 &movdqu ($inout5,&QWP(0,$l_,$i1)); 2582 2583 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2584 &movdqu ($inout1,&QWP(16*1,$inp)); 2585 &mov ($rounds,&DWP(240,$key)); 2586 2587 &movdqa ($inout3,$rndkey1); 2588 &pxor ($inout4,$rndkey0); # ^ last offset_i 2589 &pxor ($inout5,$inout4); 2590 2591 &pxor ($inout0,$inout4); # ^ offset_i 2592 &pxor ($inout1,$inout5); 2593 2594 &mov ($out,&DWP($out_off,"esp")); 2595 &call ("_aesni_decrypt2"); 2596 2597 &xorps ($inout0,$inout4); # ^ offset_i 2598 &xorps ($inout1,$inout5); 2599 &movdqa ($rndkey0,$inout5); # pass last offset_i 2600 &xorps ($inout3,$inout0); # checksum 2601 &movups (&QWP(16*0,$out,$inp),$inout0); # store output 2602 &xorps ($inout3,$inout1); 2603 &movups (&QWP(16*1,$out,$inp),$inout1); 2604 &movaps ($rndkey1,$inout3); # pass the checksum 2605 2606 &jmp (&label("done")); 2607 2608&set_label("three",16); 2609 &lea ($i1,&DWP(1,$block)); 2610 &mov ($key,&DWP($key_off,"esp")); # restore key 2611 &bsf ($i1,$i1); 2612 &shl ($i1,4); 2613 &movdqu ($inout3,&QWP(0,$l_)); 2614 &movdqu ($inout4,&QWP(0,$l_,$i1)); 2615 &movdqa ($inout5,$inout3); 2616 2617 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2618 &movdqu ($inout1,&QWP(16*1,$inp)); 2619 &movdqu ($inout2,&QWP(16*2,$inp)); 2620 &mov ($rounds,&DWP(240,$key)); 2621 2622 &movdqa (&QWP($checksum,"esp"),$rndkey1); 2623 &pxor ($inout3,$rndkey0); # ^ last offset_i 2624 &pxor ($inout4,$inout3); 2625 &pxor ($inout5,$inout4); 2626 2627 &pxor ($inout0,$inout3); # ^ offset_i 2628 &pxor ($inout1,$inout4); 2629 &pxor ($inout2,$inout5); 2630 2631 &mov ($out,&DWP($out_off,"esp")); 2632 &call ("_aesni_decrypt3"); 2633 2634 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum 2635 &xorps ($inout0,$inout3); # ^ offset_i 2636 &xorps ($inout1,$inout4); 2637 &xorps ($inout2,$inout5); 2638 &movups (&QWP(16*0,$out,$inp),$inout0); # store output 2639 &pxor ($rndkey1,$inout0); # checksum 2640 &movdqa ($rndkey0,$inout5); # pass last offset_i 2641 &movups (&QWP(16*1,$out,$inp),$inout1); 2642 &pxor ($rndkey1,$inout1); 2643 &movups (&QWP(16*2,$out,$inp),$inout2); 2644 &pxor ($rndkey1,$inout2); 2645 2646 &jmp (&label("done")); 2647 2648&set_label("four",16); 2649 &lea ($i1,&DWP(1,$block)); 2650 &lea ($i3,&DWP(3,$block)); 2651 &bsf ($i1,$i1); 2652 &bsf ($i3,$i3); 2653 &mov ($key,&DWP($key_off,"esp")); # restore key 2654 &shl ($i1,4); 2655 &shl ($i3,4); 2656 &movdqu ($inout2,&QWP(0,$l_)); 2657 &movdqu ($inout3,&QWP(0,$l_,$i1)); 2658 &movdqa ($inout4,$inout2); 2659 &movdqu ($inout5,&QWP(0,$l_,$i3)); 2660 2661 &pxor ($inout2,$rndkey0); # ^ last offset_i 2662 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 2663 &pxor ($inout3,$inout2); 2664 &movdqu ($inout1,&QWP(16*1,$inp)); 2665 &pxor ($inout4,$inout3); 2666 &movdqa (&QWP(16*0,"esp"),$inout2); 2667 &pxor ($inout5,$inout4); 2668 &movdqa (&QWP(16*1,"esp"),$inout3); 2669 &movdqu ($inout2,&QWP(16*2,$inp)); 2670 &movdqu ($inout3,&QWP(16*3,$inp)); 2671 &mov ($rounds,&DWP(240,$key)); 2672 2673 &movdqa (&QWP($checksum,"esp"),$rndkey1); 2674 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2675 &pxor ($inout1,&QWP(16*1,"esp")); 2676 &pxor ($inout2,$inout4); 2677 &pxor ($inout3,$inout5); 2678 2679 &mov ($out,&DWP($out_off,"esp")); 2680 &call ("_aesni_decrypt4"); 2681 2682 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum 2683 &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i 2684 &xorps ($inout1,&QWP(16*1,"esp")); 2685 &xorps ($inout2,$inout4); 2686 &movups (&QWP(16*0,$out,$inp),$inout0); # store output 2687 &pxor ($rndkey1,$inout0); # checksum 2688 &xorps ($inout3,$inout5); 2689 &movups (&QWP(16*1,$out,$inp),$inout1); 2690 &pxor ($rndkey1,$inout1); 2691 &movdqa ($rndkey0,$inout5); # pass last offset_i 2692 &movups (&QWP(16*2,$out,$inp),$inout2); 2693 &pxor ($rndkey1,$inout2); 2694 &movups (&QWP(16*3,$out,$inp),$inout3); 2695 &pxor ($rndkey1,$inout3); 2696 2697&set_label("done"); 2698 &mov ($key,&DWP($esp_off,"esp")); 2699 &pxor ($inout0,$inout0); # clear register bank 2700 &pxor ($inout1,$inout1); 2701 &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack 2702 &pxor ($inout2,$inout2); 2703 &movdqa (&QWP(16*1,"esp"),$inout0); 2704 &pxor ($inout3,$inout3); 2705 &movdqa (&QWP(16*2,"esp"),$inout0); 2706 &pxor ($inout4,$inout4); 2707 &movdqa (&QWP(16*3,"esp"),$inout0); 2708 &pxor ($inout5,$inout5); 2709 &movdqa (&QWP(16*4,"esp"),$inout0); 2710 &movdqa (&QWP(16*5,"esp"),$inout0); 2711 &movdqa (&QWP(16*6,"esp"),$inout0); 2712 2713 &lea ("esp",&DWP(0,$key)); 2714 &mov ($rounds,&wparam(5)); # &offset_i 2715 &mov ($rounds_,&wparam(7)); # &checksum 2716 &movdqu (&QWP(0,$rounds),$rndkey0); 2717 &pxor ($rndkey0,$rndkey0); 2718 &movdqu (&QWP(0,$rounds_),$rndkey1); 2719 &pxor ($rndkey1,$rndkey1); 2720&function_end("aesni_ocb_decrypt"); 2721} 2722} 2723 2724###################################################################### 2725# void $PREFIX_cbc_encrypt (const void *inp, void *out, 2726# size_t length, const AES_KEY *key, 2727# unsigned char *ivp,const int enc); 2728&function_begin("${PREFIX}_cbc_encrypt"); 2729 &mov ($inp,&wparam(0)); 2730 &mov ($rounds_,"esp"); 2731 &mov ($out,&wparam(1)); 2732 &sub ($rounds_,24); 2733 &mov ($len,&wparam(2)); 2734 &and ($rounds_,-16); 2735 &mov ($key,&wparam(3)); 2736 &mov ($key_,&wparam(4)); 2737 &test ($len,$len); 2738 &jz (&label("cbc_abort")); 2739 2740 &cmp (&wparam(5),0); 2741 &xchg ($rounds_,"esp"); # alloca 2742 &movups ($ivec,&QWP(0,$key_)); # load IV 2743 &mov ($rounds,&DWP(240,$key)); 2744 &mov ($key_,$key); # backup $key 2745 &mov (&DWP(16,"esp"),$rounds_); # save original %esp 2746 &mov ($rounds_,$rounds); # backup $rounds 2747 &je (&label("cbc_decrypt")); 2748 2749 &movaps ($inout0,$ivec); 2750 &cmp ($len,16); 2751 &jb (&label("cbc_enc_tail")); 2752 &sub ($len,16); 2753 &jmp (&label("cbc_enc_loop")); 2754 2755&set_label("cbc_enc_loop",16); 2756 &movups ($ivec,&QWP(0,$inp)); # input actually 2757 &lea ($inp,&DWP(16,$inp)); 2758 if ($inline) 2759 { &aesni_inline_generate1("enc",$inout0,$ivec); } 2760 else 2761 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); } 2762 &mov ($rounds,$rounds_); # restore $rounds 2763 &mov ($key,$key_); # restore $key 2764 &movups (&QWP(0,$out),$inout0); # store output 2765 &lea ($out,&DWP(16,$out)); 2766 &sub ($len,16); 2767 &jnc (&label("cbc_enc_loop")); 2768 &add ($len,16); 2769 &jnz (&label("cbc_enc_tail")); 2770 &movaps ($ivec,$inout0); 2771 &pxor ($inout0,$inout0); 2772 &jmp (&label("cbc_ret")); 2773 2774&set_label("cbc_enc_tail"); 2775 &mov ("ecx",$len); # zaps $rounds 2776 &data_word(0xA4F3F689); # rep movsb 2777 &mov ("ecx",16); # zero tail 2778 &sub ("ecx",$len); 2779 &xor ("eax","eax"); # zaps $len 2780 &data_word(0xAAF3F689); # rep stosb 2781 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block 2782 &mov ($rounds,$rounds_); # restore $rounds 2783 &mov ($inp,$out); # $inp and $out are the same 2784 &mov ($key,$key_); # restore $key 2785 &jmp (&label("cbc_enc_loop")); 2786###################################################################### 2787&set_label("cbc_decrypt",16); 2788 &cmp ($len,0x50); 2789 &jbe (&label("cbc_dec_tail")); 2790 &movaps (&QWP(0,"esp"),$ivec); # save IV 2791 &sub ($len,0x50); 2792 &jmp (&label("cbc_dec_loop6_enter")); 2793 2794&set_label("cbc_dec_loop6",16); 2795 &movaps (&QWP(0,"esp"),$rndkey0); # save IV 2796 &movups (&QWP(0,$out),$inout5); 2797 &lea ($out,&DWP(0x10,$out)); 2798&set_label("cbc_dec_loop6_enter"); 2799 &movdqu ($inout0,&QWP(0,$inp)); 2800 &movdqu ($inout1,&QWP(0x10,$inp)); 2801 &movdqu ($inout2,&QWP(0x20,$inp)); 2802 &movdqu ($inout3,&QWP(0x30,$inp)); 2803 &movdqu ($inout4,&QWP(0x40,$inp)); 2804 &movdqu ($inout5,&QWP(0x50,$inp)); 2805 2806 &call ("_aesni_decrypt6"); 2807 2808 &movups ($rndkey1,&QWP(0,$inp)); 2809 &movups ($rndkey0,&QWP(0x10,$inp)); 2810 &xorps ($inout0,&QWP(0,"esp")); # ^=IV 2811 &xorps ($inout1,$rndkey1); 2812 &movups ($rndkey1,&QWP(0x20,$inp)); 2813 &xorps ($inout2,$rndkey0); 2814 &movups ($rndkey0,&QWP(0x30,$inp)); 2815 &xorps ($inout3,$rndkey1); 2816 &movups ($rndkey1,&QWP(0x40,$inp)); 2817 &xorps ($inout4,$rndkey0); 2818 &movups ($rndkey0,&QWP(0x50,$inp)); # IV 2819 &xorps ($inout5,$rndkey1); 2820 &movups (&QWP(0,$out),$inout0); 2821 &movups (&QWP(0x10,$out),$inout1); 2822 &lea ($inp,&DWP(0x60,$inp)); 2823 &movups (&QWP(0x20,$out),$inout2); 2824 &mov ($rounds,$rounds_); # restore $rounds 2825 &movups (&QWP(0x30,$out),$inout3); 2826 &mov ($key,$key_); # restore $key 2827 &movups (&QWP(0x40,$out),$inout4); 2828 &lea ($out,&DWP(0x50,$out)); 2829 &sub ($len,0x60); 2830 &ja (&label("cbc_dec_loop6")); 2831 2832 &movaps ($inout0,$inout5); 2833 &movaps ($ivec,$rndkey0); 2834 &add ($len,0x50); 2835 &jle (&label("cbc_dec_clear_tail_collected")); 2836 &movups (&QWP(0,$out),$inout0); 2837 &lea ($out,&DWP(0x10,$out)); 2838&set_label("cbc_dec_tail"); 2839 &movups ($inout0,&QWP(0,$inp)); 2840 &movaps ($in0,$inout0); 2841 &cmp ($len,0x10); 2842 &jbe (&label("cbc_dec_one")); 2843 2844 &movups ($inout1,&QWP(0x10,$inp)); 2845 &movaps ($in1,$inout1); 2846 &cmp ($len,0x20); 2847 &jbe (&label("cbc_dec_two")); 2848 2849 &movups ($inout2,&QWP(0x20,$inp)); 2850 &cmp ($len,0x30); 2851 &jbe (&label("cbc_dec_three")); 2852 2853 &movups ($inout3,&QWP(0x30,$inp)); 2854 &cmp ($len,0x40); 2855 &jbe (&label("cbc_dec_four")); 2856 2857 &movups ($inout4,&QWP(0x40,$inp)); 2858 &movaps (&QWP(0,"esp"),$ivec); # save IV 2859 &movups ($inout0,&QWP(0,$inp)); 2860 &xorps ($inout5,$inout5); 2861 &call ("_aesni_decrypt6"); 2862 &movups ($rndkey1,&QWP(0,$inp)); 2863 &movups ($rndkey0,&QWP(0x10,$inp)); 2864 &xorps ($inout0,&QWP(0,"esp")); # ^= IV 2865 &xorps ($inout1,$rndkey1); 2866 &movups ($rndkey1,&QWP(0x20,$inp)); 2867 &xorps ($inout2,$rndkey0); 2868 &movups ($rndkey0,&QWP(0x30,$inp)); 2869 &xorps ($inout3,$rndkey1); 2870 &movups ($ivec,&QWP(0x40,$inp)); # IV 2871 &xorps ($inout4,$rndkey0); 2872 &movups (&QWP(0,$out),$inout0); 2873 &movups (&QWP(0x10,$out),$inout1); 2874 &pxor ($inout1,$inout1); 2875 &movups (&QWP(0x20,$out),$inout2); 2876 &pxor ($inout2,$inout2); 2877 &movups (&QWP(0x30,$out),$inout3); 2878 &pxor ($inout3,$inout3); 2879 &lea ($out,&DWP(0x40,$out)); 2880 &movaps ($inout0,$inout4); 2881 &pxor ($inout4,$inout4); 2882 &sub ($len,0x50); 2883 &jmp (&label("cbc_dec_tail_collected")); 2884 2885&set_label("cbc_dec_one",16); 2886 if ($inline) 2887 { &aesni_inline_generate1("dec"); } 2888 else 2889 { &call ("_aesni_decrypt1"); } 2890 &xorps ($inout0,$ivec); 2891 &movaps ($ivec,$in0); 2892 &sub ($len,0x10); 2893 &jmp (&label("cbc_dec_tail_collected")); 2894 2895&set_label("cbc_dec_two",16); 2896 &call ("_aesni_decrypt2"); 2897 &xorps ($inout0,$ivec); 2898 &xorps ($inout1,$in0); 2899 &movups (&QWP(0,$out),$inout0); 2900 &movaps ($inout0,$inout1); 2901 &pxor ($inout1,$inout1); 2902 &lea ($out,&DWP(0x10,$out)); 2903 &movaps ($ivec,$in1); 2904 &sub ($len,0x20); 2905 &jmp (&label("cbc_dec_tail_collected")); 2906 2907&set_label("cbc_dec_three",16); 2908 &call ("_aesni_decrypt3"); 2909 &xorps ($inout0,$ivec); 2910 &xorps ($inout1,$in0); 2911 &xorps ($inout2,$in1); 2912 &movups (&QWP(0,$out),$inout0); 2913 &movaps ($inout0,$inout2); 2914 &pxor ($inout2,$inout2); 2915 &movups (&QWP(0x10,$out),$inout1); 2916 &pxor ($inout1,$inout1); 2917 &lea ($out,&DWP(0x20,$out)); 2918 &movups ($ivec,&QWP(0x20,$inp)); 2919 &sub ($len,0x30); 2920 &jmp (&label("cbc_dec_tail_collected")); 2921 2922&set_label("cbc_dec_four",16); 2923 &call ("_aesni_decrypt4"); 2924 &movups ($rndkey1,&QWP(0x10,$inp)); 2925 &movups ($rndkey0,&QWP(0x20,$inp)); 2926 &xorps ($inout0,$ivec); 2927 &movups ($ivec,&QWP(0x30,$inp)); 2928 &xorps ($inout1,$in0); 2929 &movups (&QWP(0,$out),$inout0); 2930 &xorps ($inout2,$rndkey1); 2931 &movups (&QWP(0x10,$out),$inout1); 2932 &pxor ($inout1,$inout1); 2933 &xorps ($inout3,$rndkey0); 2934 &movups (&QWP(0x20,$out),$inout2); 2935 &pxor ($inout2,$inout2); 2936 &lea ($out,&DWP(0x30,$out)); 2937 &movaps ($inout0,$inout3); 2938 &pxor ($inout3,$inout3); 2939 &sub ($len,0x40); 2940 &jmp (&label("cbc_dec_tail_collected")); 2941 2942&set_label("cbc_dec_clear_tail_collected",16); 2943 &pxor ($inout1,$inout1); 2944 &pxor ($inout2,$inout2); 2945 &pxor ($inout3,$inout3); 2946 &pxor ($inout4,$inout4); 2947&set_label("cbc_dec_tail_collected"); 2948 &and ($len,15); 2949 &jnz (&label("cbc_dec_tail_partial")); 2950 &movups (&QWP(0,$out),$inout0); 2951 &pxor ($rndkey0,$rndkey0); 2952 &jmp (&label("cbc_ret")); 2953 2954&set_label("cbc_dec_tail_partial",16); 2955 &movaps (&QWP(0,"esp"),$inout0); 2956 &pxor ($rndkey0,$rndkey0); 2957 &mov ("ecx",16); 2958 &mov ($inp,"esp"); 2959 &sub ("ecx",$len); 2960 &data_word(0xA4F3F689); # rep movsb 2961 &movdqa (&QWP(0,"esp"),$inout0); 2962 2963&set_label("cbc_ret"); 2964 &mov ("esp",&DWP(16,"esp")); # pull original %esp 2965 &mov ($key_,&wparam(4)); 2966 &pxor ($inout0,$inout0); 2967 &pxor ($rndkey1,$rndkey1); 2968 &movups (&QWP(0,$key_),$ivec); # output IV 2969 &pxor ($ivec,$ivec); 2970&set_label("cbc_abort"); 2971&function_end("${PREFIX}_cbc_encrypt"); 2972 2973###################################################################### 2974# Mechanical port from aesni-x86_64.pl. 2975# 2976# _aesni_set_encrypt_key is private interface, 2977# input: 2978# "eax" const unsigned char *userKey 2979# $rounds int bits 2980# $key AES_KEY *key 2981# output: 2982# "eax" return code 2983# $round rounds 2984 2985&function_begin_B("_aesni_set_encrypt_key"); 2986 &push ("ebp"); 2987 &push ("ebx"); 2988 &test ("eax","eax"); 2989 &jz (&label("bad_pointer")); 2990 &test ($key,$key); 2991 &jz (&label("bad_pointer")); 2992 2993 &call (&label("pic")); 2994&set_label("pic"); 2995 &blindpop("ebx"); 2996 &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx")); 2997 2998 &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const")); 2999 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey 3000 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 3001 &mov ("ebp",&DWP(4,"ebp")); 3002 &lea ($key,&DWP(16,$key)); 3003 &and ("ebp",1<<28|1<<11); # AVX and XOP bits 3004 &cmp ($rounds,256); 3005 &je (&label("14rounds")); 3006 &cmp ($rounds,192); 3007 &je (&label("12rounds")); 3008 &cmp ($rounds,128); 3009 &jne (&label("bad_keybits")); 3010 3011&set_label("10rounds",16); 3012 &cmp ("ebp",1<<28); 3013 &je (&label("10rounds_alt")); 3014 3015 &mov ($rounds,9); 3016 &$movekey (&QWP(-16,$key),"xmm0"); # round 0 3017 &aeskeygenassist("xmm1","xmm0",0x01); # round 1 3018 &call (&label("key_128_cold")); 3019 &aeskeygenassist("xmm1","xmm0",0x2); # round 2 3020 &call (&label("key_128")); 3021 &aeskeygenassist("xmm1","xmm0",0x04); # round 3 3022 &call (&label("key_128")); 3023 &aeskeygenassist("xmm1","xmm0",0x08); # round 4 3024 &call (&label("key_128")); 3025 &aeskeygenassist("xmm1","xmm0",0x10); # round 5 3026 &call (&label("key_128")); 3027 &aeskeygenassist("xmm1","xmm0",0x20); # round 6 3028 &call (&label("key_128")); 3029 &aeskeygenassist("xmm1","xmm0",0x40); # round 7 3030 &call (&label("key_128")); 3031 &aeskeygenassist("xmm1","xmm0",0x80); # round 8 3032 &call (&label("key_128")); 3033 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 3034 &call (&label("key_128")); 3035 &aeskeygenassist("xmm1","xmm0",0x36); # round 10 3036 &call (&label("key_128")); 3037 &$movekey (&QWP(0,$key),"xmm0"); 3038 &mov (&DWP(80,$key),$rounds); 3039 3040 &jmp (&label("good_key")); 3041 3042&set_label("key_128",16); 3043 &$movekey (&QWP(0,$key),"xmm0"); 3044 &lea ($key,&DWP(16,$key)); 3045&set_label("key_128_cold"); 3046 &shufps ("xmm4","xmm0",0b00010000); 3047 &xorps ("xmm0","xmm4"); 3048 &shufps ("xmm4","xmm0",0b10001100); 3049 &xorps ("xmm0","xmm4"); 3050 &shufps ("xmm1","xmm1",0b11111111); # critical path 3051 &xorps ("xmm0","xmm1"); 3052 &ret(); 3053 3054&set_label("10rounds_alt",16); 3055 &movdqa ("xmm5",&QWP(0x00,"ebx")); 3056 &mov ($rounds,8); 3057 &movdqa ("xmm4",&QWP(0x20,"ebx")); 3058 &movdqa ("xmm2","xmm0"); 3059 &movdqu (&QWP(-16,$key),"xmm0"); 3060 3061&set_label("loop_key128"); 3062 &pshufb ("xmm0","xmm5"); 3063 &aesenclast ("xmm0","xmm4"); 3064 &pslld ("xmm4",1); 3065 &lea ($key,&DWP(16,$key)); 3066 3067 &movdqa ("xmm3","xmm2"); 3068 &pslldq ("xmm2",4); 3069 &pxor ("xmm3","xmm2"); 3070 &pslldq ("xmm2",4); 3071 &pxor ("xmm3","xmm2"); 3072 &pslldq ("xmm2",4); 3073 &pxor ("xmm2","xmm3"); 3074 3075 &pxor ("xmm0","xmm2"); 3076 &movdqu (&QWP(-16,$key),"xmm0"); 3077 &movdqa ("xmm2","xmm0"); 3078 3079 &dec ($rounds); 3080 &jnz (&label("loop_key128")); 3081 3082 &movdqa ("xmm4",&QWP(0x30,"ebx")); 3083 3084 &pshufb ("xmm0","xmm5"); 3085 &aesenclast ("xmm0","xmm4"); 3086 &pslld ("xmm4",1); 3087 3088 &movdqa ("xmm3","xmm2"); 3089 &pslldq ("xmm2",4); 3090 &pxor ("xmm3","xmm2"); 3091 &pslldq ("xmm2",4); 3092 &pxor ("xmm3","xmm2"); 3093 &pslldq ("xmm2",4); 3094 &pxor ("xmm2","xmm3"); 3095 3096 &pxor ("xmm0","xmm2"); 3097 &movdqu (&QWP(0,$key),"xmm0"); 3098 3099 &movdqa ("xmm2","xmm0"); 3100 &pshufb ("xmm0","xmm5"); 3101 &aesenclast ("xmm0","xmm4"); 3102 3103 &movdqa ("xmm3","xmm2"); 3104 &pslldq ("xmm2",4); 3105 &pxor ("xmm3","xmm2"); 3106 &pslldq ("xmm2",4); 3107 &pxor ("xmm3","xmm2"); 3108 &pslldq ("xmm2",4); 3109 &pxor ("xmm2","xmm3"); 3110 3111 &pxor ("xmm0","xmm2"); 3112 &movdqu (&QWP(16,$key),"xmm0"); 3113 3114 &mov ($rounds,9); 3115 &mov (&DWP(96,$key),$rounds); 3116 3117 &jmp (&label("good_key")); 3118 3119&set_label("12rounds",16); 3120 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey 3121 &cmp ("ebp",1<<28); 3122 &je (&label("12rounds_alt")); 3123 3124 &mov ($rounds,11); 3125 &$movekey (&QWP(-16,$key),"xmm0"); # round 0 3126 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 3127 &call (&label("key_192a_cold")); 3128 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3 3129 &call (&label("key_192b")); 3130 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5 3131 &call (&label("key_192a")); 3132 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6 3133 &call (&label("key_192b")); 3134 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8 3135 &call (&label("key_192a")); 3136 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9 3137 &call (&label("key_192b")); 3138 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11 3139 &call (&label("key_192a")); 3140 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12 3141 &call (&label("key_192b")); 3142 &$movekey (&QWP(0,$key),"xmm0"); 3143 &mov (&DWP(48,$key),$rounds); 3144 3145 &jmp (&label("good_key")); 3146 3147&set_label("key_192a",16); 3148 &$movekey (&QWP(0,$key),"xmm0"); 3149 &lea ($key,&DWP(16,$key)); 3150&set_label("key_192a_cold",16); 3151 &movaps ("xmm5","xmm2"); 3152&set_label("key_192b_warm"); 3153 &shufps ("xmm4","xmm0",0b00010000); 3154 &movdqa ("xmm3","xmm2"); 3155 &xorps ("xmm0","xmm4"); 3156 &shufps ("xmm4","xmm0",0b10001100); 3157 &pslldq ("xmm3",4); 3158 &xorps ("xmm0","xmm4"); 3159 &pshufd ("xmm1","xmm1",0b01010101); # critical path 3160 &pxor ("xmm2","xmm3"); 3161 &pxor ("xmm0","xmm1"); 3162 &pshufd ("xmm3","xmm0",0b11111111); 3163 &pxor ("xmm2","xmm3"); 3164 &ret(); 3165 3166&set_label("key_192b",16); 3167 &movaps ("xmm3","xmm0"); 3168 &shufps ("xmm5","xmm0",0b01000100); 3169 &$movekey (&QWP(0,$key),"xmm5"); 3170 &shufps ("xmm3","xmm2",0b01001110); 3171 &$movekey (&QWP(16,$key),"xmm3"); 3172 &lea ($key,&DWP(32,$key)); 3173 &jmp (&label("key_192b_warm")); 3174 3175&set_label("12rounds_alt",16); 3176 &movdqa ("xmm5",&QWP(0x10,"ebx")); 3177 &movdqa ("xmm4",&QWP(0x20,"ebx")); 3178 &mov ($rounds,8); 3179 &movdqu (&QWP(-16,$key),"xmm0"); 3180 3181&set_label("loop_key192"); 3182 &movq (&QWP(0,$key),"xmm2"); 3183 &movdqa ("xmm1","xmm2"); 3184 &pshufb ("xmm2","xmm5"); 3185 &aesenclast ("xmm2","xmm4"); 3186 &pslld ("xmm4",1); 3187 &lea ($key,&DWP(24,$key)); 3188 3189 &movdqa ("xmm3","xmm0"); 3190 &pslldq ("xmm0",4); 3191 &pxor ("xmm3","xmm0"); 3192 &pslldq ("xmm0",4); 3193 &pxor ("xmm3","xmm0"); 3194 &pslldq ("xmm0",4); 3195 &pxor ("xmm0","xmm3"); 3196 3197 &pshufd ("xmm3","xmm0",0xff); 3198 &pxor ("xmm3","xmm1"); 3199 &pslldq ("xmm1",4); 3200 &pxor ("xmm3","xmm1"); 3201 3202 &pxor ("xmm0","xmm2"); 3203 &pxor ("xmm2","xmm3"); 3204 &movdqu (&QWP(-16,$key),"xmm0"); 3205 3206 &dec ($rounds); 3207 &jnz (&label("loop_key192")); 3208 3209 &mov ($rounds,11); 3210 &mov (&DWP(32,$key),$rounds); 3211 3212 &jmp (&label("good_key")); 3213 3214&set_label("14rounds",16); 3215 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey 3216 &lea ($key,&DWP(16,$key)); 3217 &cmp ("ebp",1<<28); 3218 &je (&label("14rounds_alt")); 3219 3220 &mov ($rounds,13); 3221 &$movekey (&QWP(-32,$key),"xmm0"); # round 0 3222 &$movekey (&QWP(-16,$key),"xmm2"); # round 1 3223 &aeskeygenassist("xmm1","xmm2",0x01); # round 2 3224 &call (&label("key_256a_cold")); 3225 &aeskeygenassist("xmm1","xmm0",0x01); # round 3 3226 &call (&label("key_256b")); 3227 &aeskeygenassist("xmm1","xmm2",0x02); # round 4 3228 &call (&label("key_256a")); 3229 &aeskeygenassist("xmm1","xmm0",0x02); # round 5 3230 &call (&label("key_256b")); 3231 &aeskeygenassist("xmm1","xmm2",0x04); # round 6 3232 &call (&label("key_256a")); 3233 &aeskeygenassist("xmm1","xmm0",0x04); # round 7 3234 &call (&label("key_256b")); 3235 &aeskeygenassist("xmm1","xmm2",0x08); # round 8 3236 &call (&label("key_256a")); 3237 &aeskeygenassist("xmm1","xmm0",0x08); # round 9 3238 &call (&label("key_256b")); 3239 &aeskeygenassist("xmm1","xmm2",0x10); # round 10 3240 &call (&label("key_256a")); 3241 &aeskeygenassist("xmm1","xmm0",0x10); # round 11 3242 &call (&label("key_256b")); 3243 &aeskeygenassist("xmm1","xmm2",0x20); # round 12 3244 &call (&label("key_256a")); 3245 &aeskeygenassist("xmm1","xmm0",0x20); # round 13 3246 &call (&label("key_256b")); 3247 &aeskeygenassist("xmm1","xmm2",0x40); # round 14 3248 &call (&label("key_256a")); 3249 &$movekey (&QWP(0,$key),"xmm0"); 3250 &mov (&DWP(16,$key),$rounds); 3251 &xor ("eax","eax"); 3252 3253 &jmp (&label("good_key")); 3254 3255&set_label("key_256a",16); 3256 &$movekey (&QWP(0,$key),"xmm2"); 3257 &lea ($key,&DWP(16,$key)); 3258&set_label("key_256a_cold"); 3259 &shufps ("xmm4","xmm0",0b00010000); 3260 &xorps ("xmm0","xmm4"); 3261 &shufps ("xmm4","xmm0",0b10001100); 3262 &xorps ("xmm0","xmm4"); 3263 &shufps ("xmm1","xmm1",0b11111111); # critical path 3264 &xorps ("xmm0","xmm1"); 3265 &ret(); 3266 3267&set_label("key_256b",16); 3268 &$movekey (&QWP(0,$key),"xmm0"); 3269 &lea ($key,&DWP(16,$key)); 3270 3271 &shufps ("xmm4","xmm2",0b00010000); 3272 &xorps ("xmm2","xmm4"); 3273 &shufps ("xmm4","xmm2",0b10001100); 3274 &xorps ("xmm2","xmm4"); 3275 &shufps ("xmm1","xmm1",0b10101010); # critical path 3276 &xorps ("xmm2","xmm1"); 3277 &ret(); 3278 3279&set_label("14rounds_alt",16); 3280 &movdqa ("xmm5",&QWP(0x00,"ebx")); 3281 &movdqa ("xmm4",&QWP(0x20,"ebx")); 3282 &mov ($rounds,7); 3283 &movdqu (&QWP(-32,$key),"xmm0"); 3284 &movdqa ("xmm1","xmm2"); 3285 &movdqu (&QWP(-16,$key),"xmm2"); 3286 3287&set_label("loop_key256"); 3288 &pshufb ("xmm2","xmm5"); 3289 &aesenclast ("xmm2","xmm4"); 3290 3291 &movdqa ("xmm3","xmm0"); 3292 &pslldq ("xmm0",4); 3293 &pxor ("xmm3","xmm0"); 3294 &pslldq ("xmm0",4); 3295 &pxor ("xmm3","xmm0"); 3296 &pslldq ("xmm0",4); 3297 &pxor ("xmm0","xmm3"); 3298 &pslld ("xmm4",1); 3299 3300 &pxor ("xmm0","xmm2"); 3301 &movdqu (&QWP(0,$key),"xmm0"); 3302 3303 &dec ($rounds); 3304 &jz (&label("done_key256")); 3305 3306 &pshufd ("xmm2","xmm0",0xff); 3307 &pxor ("xmm3","xmm3"); 3308 &aesenclast ("xmm2","xmm3"); 3309 3310 &movdqa ("xmm3","xmm1"); 3311 &pslldq ("xmm1",4); 3312 &pxor ("xmm3","xmm1"); 3313 &pslldq ("xmm1",4); 3314 &pxor ("xmm3","xmm1"); 3315 &pslldq ("xmm1",4); 3316 &pxor ("xmm1","xmm3"); 3317 3318 &pxor ("xmm2","xmm1"); 3319 &movdqu (&QWP(16,$key),"xmm2"); 3320 &lea ($key,&DWP(32,$key)); 3321 &movdqa ("xmm1","xmm2"); 3322 &jmp (&label("loop_key256")); 3323 3324&set_label("done_key256"); 3325 &mov ($rounds,13); 3326 &mov (&DWP(16,$key),$rounds); 3327 3328&set_label("good_key"); 3329 &pxor ("xmm0","xmm0"); 3330 &pxor ("xmm1","xmm1"); 3331 &pxor ("xmm2","xmm2"); 3332 &pxor ("xmm3","xmm3"); 3333 &pxor ("xmm4","xmm4"); 3334 &pxor ("xmm5","xmm5"); 3335 &xor ("eax","eax"); 3336 &pop ("ebx"); 3337 &pop ("ebp"); 3338 &ret (); 3339 3340&set_label("bad_pointer",4); 3341 &mov ("eax",-1); 3342 &pop ("ebx"); 3343 &pop ("ebp"); 3344 &ret (); 3345&set_label("bad_keybits",4); 3346 &pxor ("xmm0","xmm0"); 3347 &mov ("eax",-2); 3348 &pop ("ebx"); 3349 &pop ("ebp"); 3350 &ret (); 3351&function_end_B("_aesni_set_encrypt_key"); 3352 3353# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, 3354# AES_KEY *key) 3355&function_begin_B("${PREFIX}_set_encrypt_key"); 3356 &mov ("eax",&wparam(0)); 3357 &mov ($rounds,&wparam(1)); 3358 &mov ($key,&wparam(2)); 3359 &call ("_aesni_set_encrypt_key"); 3360 &ret (); 3361&function_end_B("${PREFIX}_set_encrypt_key"); 3362 3363# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, 3364# AES_KEY *key) 3365&function_begin_B("${PREFIX}_set_decrypt_key"); 3366 &mov ("eax",&wparam(0)); 3367 &mov ($rounds,&wparam(1)); 3368 &mov ($key,&wparam(2)); 3369 &call ("_aesni_set_encrypt_key"); 3370 &mov ($key,&wparam(2)); 3371 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key 3372 &test ("eax","eax"); 3373 &jnz (&label("dec_key_ret")); 3374 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule 3375 3376 &$movekey ("xmm0",&QWP(0,$key)); # just swap 3377 &$movekey ("xmm1",&QWP(0,"eax")); 3378 &$movekey (&QWP(0,"eax"),"xmm0"); 3379 &$movekey (&QWP(0,$key),"xmm1"); 3380 &lea ($key,&DWP(16,$key)); 3381 &lea ("eax",&DWP(-16,"eax")); 3382 3383&set_label("dec_key_inverse"); 3384 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse 3385 &$movekey ("xmm1",&QWP(0,"eax")); 3386 &aesimc ("xmm0","xmm0"); 3387 &aesimc ("xmm1","xmm1"); 3388 &lea ($key,&DWP(16,$key)); 3389 &lea ("eax",&DWP(-16,"eax")); 3390 &$movekey (&QWP(16,"eax"),"xmm0"); 3391 &$movekey (&QWP(-16,$key),"xmm1"); 3392 &cmp ("eax",$key); 3393 &ja (&label("dec_key_inverse")); 3394 3395 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle 3396 &aesimc ("xmm0","xmm0"); 3397 &$movekey (&QWP(0,$key),"xmm0"); 3398 3399 &pxor ("xmm0","xmm0"); 3400 &pxor ("xmm1","xmm1"); 3401 &xor ("eax","eax"); # return success 3402&set_label("dec_key_ret"); 3403 &ret (); 3404&function_end_B("${PREFIX}_set_decrypt_key"); 3405 3406&set_label("key_const",64); 3407&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d); 3408&data_word(0x04070605,0x04070605,0x04070605,0x04070605); 3409&data_word(1,1,1,1); 3410&data_word(0x1b,0x1b,0x1b,0x1b); 3411&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"); 3412 3413&asm_finish(); 3414 3415close STDOUT or die "error closing STDOUT: $!"; 3416