#!/usr/bin/env perl

###################################################################
### AES-128 [originally in CTR mode]				###
### bitsliced implementation for Intel Core 2 processors	###
### requires support of SSE extensions up to SSSE3		###
### Author: Emilia Käsper and Peter Schwabe			###
### Date: 2009-03-19						###
### Public domain						###
###								###
### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
### further information.					###
###################################################################
#
# September 2011.
#
# Started as a transliteration to "perlasm", the original code has
# undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop resulting in >5x size reduction
#   from 12.5KB to 2.2KB;
# - above was possible thanks to a mixcolumns() modification that
#   allows feeding its output back to aesenc[last]; this was
#   achieved at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement the key setup subroutine, instead
#   it relies on conversion of the "conventional" key schedule as
#   returned by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which makes
#   it possible to skip one shiftrows(), reduce the bit-sliced key
#   schedule and speed up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of 4096-byte buffer with 128-bit key is:
#
#		Emilia's	this(*)		difference
#
# Core 2	9.30		8.69		+7%
# Nehalem(**)	7.63		6.98		+9%
# Atom		17.1		17.4		-2%(***)
#
# (*)	Comparison is not completely fair, because "this" is ECB,
#	i.e. no extra processing such as counter values calculation
#	and xor-ing input as in Emilia's CTR implementation is
#	performed. However, the CTR calculations account for no more
#	than 1% of total time, so comparison is *rather* fair.
#
# (**)	Results were collected on Westmere, which is considered to
#	be equivalent to Nehalem for this code.
#
# (***)	Slowdown on Atom is rather strange per se, because the
#	original implementation has a number of 9+-byte instructions,
#	which are bad for the Atom front-end, and which I eliminated
#	completely. In an attempt to address the deterioration sbox()
#	was tested in the FP SIMD "domain" (movaps instead of movdqa,
#	xorps instead of pxor, etc.). While it resulted in a nominal
#	4% improvement on Atom, it hurt Westmere by more than a 2x
#	factor.
#
# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This
# naturally has an impact on performance, especially for short
# inputs. Conversion time in CPU cycles and its ratio to CPU cycles
# spent in the 8x block function is:
#
#		conversion	conversion/8x block
# Core 2	240		0.22
# Nehalem	180		0.20
# Atom		430		0.19
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc. Then keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially the shortest ones, e.g.
# consecutive 144-byte blocks are processed 44% slower than one
# would expect, 272 - 29%, 400 - 22%, etc. Yet, despite all these
# "shortcomings" it's still faster than ["hyper-threading-safe" code
# path in] aes-x86_64.pl on all lengths above 64 bytes...
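#
# [Added commentary, as a sanity check on the ratio arithmetic: on
# Core 2 the 8x block function runs at roughly 8.69 cycles/byte, i.e.
# ~1112 cycles per 128-byte chunk, and 240/1112 ~= 0.22; one key
# conversion per such chunk is where the quoted 16-18% slowdown for
# 128-byte inputs comes from (0.22/1.22 ~= 18%), amortizing as chunks
# accumulate.]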
#
# October 2011.
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of 4096-byte buffer with 128-bit key is:
#
# Core 2	11.0
# Nehalem	9.16
# Atom		20.9
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
# suboptimal, but XTS is meant to be used with larger blocks...
#
#						<appro@openssl.org>

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output";

my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...

{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");

sub Sbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}

sub InBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[0], @b[5]

	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]

	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}

sub OutBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]

	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]

	pxor	@b[7], @b[4]
___
}

sub InvSbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InvInBasisChange	(@b);
	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
}

sub InvInBasisChange {		# OutBasisChange in reverse
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___
	pxor	@b[7], @b[4]

	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[3]
	pxor	@b[3], @b[5]
	pxor	@b[5], @b[1]

	pxor	@b[1], @b[6]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
___
}

sub InvOutBasisChange {		# InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
	pxor	@b[5], @b[1]
	pxor	@b[7], @b[2]

	pxor	@b[1], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[0], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[2], @b[6]
	pxor	@b[1], @b[2]
	pxor	@b[3], @b[6]

	pxor	@b[0], @b[3]
	pxor	@b[6], @b[5]
___
}

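# [Added commentary: Mul_GF4 below multiplies two GF(2^2) elements,
# each held as a bit pair (x1,x0)/(y1,y0), one bit per slice register.
# It is Karatsuba-style -- the shared term t0 = x0 & (y0^y1) is
# computed once and reused, so a multiply costs just 3 pand, 4 pxor
# and 1 movdqa. Mul_GF4_N additionally scales the result by N, and
# Mul_GF4_N_GF4 interleaves both flavours for better scheduling.]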
sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8)       *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}

sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}

sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	movdqa	$y2, $t1
	pxor	$y1, $t0
	pxor	$y3, $t1
	pand	$x0, $t0
	pand	$x2, $t1
	pxor	$x1, $x0
	pxor	$x3, $x2
	pand	$y0, $x1
	pand	$y2, $x3
	pand	$y1, $x0
	pand	$y3, $x2
	pxor	$x0, $x1
	pxor	$x3, $x2
	pxor	$t0, $x0
	pxor	$t1, $x3
___
}
sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]

	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]

	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	movdqa	@t[3], @s[3]

	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion

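	# [Added commentary: t0..t3 now hold, one bit per register,
	# the GF(2^4) value to be inverted; the block below computes
	# that inverse with ANDs and XORs only, leaving it in
	# s3, s2, s1, t1 (see the "output in ..." note further down),
	# ready for the Mul_GF16_2 back-multiplication.]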
	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]

	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]

	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]

	pxor	@t[2], @t[3]

	pand	@t[3], @s[1]

	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]

	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]

	pand	@t[0], @t[2]

	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]

	pand	@s[3], @s[2]

	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3

# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}

# AES linear components

sub ShiftRows {
my @x=@_[0..7];
my $mask=pop;
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pshufb	$mask,@x[0]
	pxor	0x20($key),@x[2]
	pshufb	$mask,@x[1]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[2]
	pxor	0x40($key),@x[4]
	pshufb	$mask,@x[3]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[4]
	pxor	0x60($key),@x[6]
	pshufb	$mask,@x[5]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[6]
	lea	0x80($key),$key
	pshufb	$mask,@x[7]
___
}

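# [Added commentary: in this bitsliced layout each xmm register holds
# one bit plane of all 16 state bytes, and the four bytes of a column
# sit 32 bits apart; hence "pshufd 0x93" below is a per-column
# rotation by one byte position and "pshufd 0x4E" by two. That turns
# MixColumns into a short sequence of XORs of rotated planes, with
# the GF(2^8) doubling absorbed into which plane each term is routed
# to.]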
sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	pxor	@t[6], @x[6]
	pxor	@t[7], @x[7]

	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	pshufd	\$0x4E, @x[0], @x[0]	# (x0 ^ (x0 <<< 32)) <<< 64)
	pxor	@x[1], @t[2]
	pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]

	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	movdqa	@t[1], @x[7]
___
}

sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];

$code.=<<___;
	# multiplication by 0x0e
	pshufd	\$0x93, @x[7], @t[7]
	movdqa	@x[2], @t[2]
	pxor	@x[5], @x[7]		# 7 5
	pxor	@x[5], @x[2]		# 2 5
	pshufd	\$0x93, @x[0], @t[0]
	movdqa	@x[5], @t[5]
	pxor	@x[0], @x[5]		# 5 0		[1]
	pxor	@x[1], @x[0]		# 0 1
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@x[2], @x[1]		# 1 25
	pxor	@x[6], @x[0]		# 01 6		[2]
	pxor	@x[3], @x[1]		# 125 3		[4]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@x[0], @x[2]		# 25 016	[3]
	pxor	@x[7], @x[3]		# 3 75
	pxor	@x[6], @x[7]		# 75 6		[0]
	pshufd	\$0x93, @x[6], @t[6]
	movdqa	@x[4], @t[4]
	pxor	@x[4], @x[6]		# 6 4
	pxor	@x[3], @x[4]		# 4 375		[6]
	pxor	@x[7], @x[3]		# 375 756=36
	pxor	@t[5], @x[6]		# 64 5		[7]
	pxor	@t[2], @x[3]		# 36 2
	pxor	@t[4], @x[3]		# 362 4		[5]
	pshufd	\$0x93, @t[5], @t[5]
___
					my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
	# multiplication by 0x0b
	pxor	@y[0], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[1], @y[1]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[5], @y[0]
	pxor	@t[6], @y[1]
	pxor	@t[7], @y[0]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @t[7]		# clobber t[7]
	pxor	@y[0], @y[1]

	pxor	@t[0], @y[3]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[1], @y[2]
	pxor	@t[1], @y[4]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[1], @t[1]
	pxor	@t[2], @y[3]
	pxor	@t[2], @y[5]
	pxor	@t[7], @y[2]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[3], @y[3]
	pxor	@t[3], @y[6]
	pxor	@t[3], @y[4]
	pshufd	\$0x93, @t[3], @t[3]
	pxor	@t[4], @y[7]
	pxor	@t[4], @y[5]
	pxor	@t[7], @y[7]
	pxor	@t[5], @y[3]
	pxor	@t[4], @y[4]
	pxor	@t[5], @t[7]		# clobber t[7] even more

	pxor	@t[7], @y[5]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[7], @y[6]
	pxor	@t[7], @y[4]

	pxor	@t[5], @t[7]
	pshufd	\$0x93, @t[5], @t[5]
	pxor	@t[6], @t[7]		# restore t[7]

	# multiplication by 0x0d
	pxor	@y[7], @y[4]
	pxor	@t[4], @y[7]
	pshufd	\$0x93, @t[6], @t[6]
	pxor	@t[0], @y[2]
	pxor	@t[5], @y[7]
	pxor	@t[2], @y[2]
	pshufd	\$0x93, @t[7], @t[7]

	pxor	@y[1], @y[3]
	pxor	@t[1], @y[1]
	pxor	@t[0], @y[0]
	pxor	@t[0], @y[3]
	pxor	@t[5], @y[1]
	pxor	@t[5], @y[0]
	pxor	@t[7], @y[1]
	pshufd	\$0x93, @t[0], @t[0]
	pxor	@t[6], @y[0]
	pxor	@y[1], @y[3]
	pxor	@t[1], @y[4]
	pshufd	\$0x93, @t[1], @t[1]

	pxor	@t[7], @y[7]
	pxor	@t[2], @y[4]
	pxor	@t[2], @y[5]
	pshufd	\$0x93, @t[2], @t[2]
	pxor	@t[6], @y[2]
	pxor	@t[3], @t[6]		# clobber t[6]
	pxor	@y[7], @y[4]
	pxor	@t[6], @y[3]

	pxor	@t[6], @y[6]
	pxor	@t[5], @y[5]
	pxor	@t[4], @y[6]
	pshufd	\$0x93, @t[4], @t[4]
	pxor	@t[6], @y[5]
	pxor	@t[7], @y[6]
	pxor	@t[3], @t[6]		# restore t[6]

	pshufd	\$0x93, @t[5], @t[5]
	pshufd	\$0x93, @t[6], @t[6]
	pshufd	\$0x93, @t[7], @t[7]
	pshufd	\$0x93, @t[3], @t[3]

	# multiplication by 0x09
	pxor	@y[1], @y[4]
	pxor	@y[1], @t[1]		# t[1]=y[1]
	pxor	@t[5], @t[0]		# clobber t[0]
	pxor	@t[5], @t[1]
	pxor	@t[0], @y[3]
	pxor	@y[0], @t[0]		# t[0]=y[0]
	pxor	@t[6], @t[1]
	pxor	@t[7], @t[6]		# clobber t[6]
	pxor	@t[1], @y[4]
	pxor	@t[4], @y[7]
	pxor	@y[4], @t[4]		# t[4]=y[4]
	pxor	@t[3], @y[6]
	pxor	@y[3], @t[3]		# t[3]=y[3]
	pxor	@t[2], @y[5]
	pxor	@y[2], @t[2]		# t[2]=y[2]
	pxor	@t[7], @t[3]
	pxor	@y[5], @t[5]		# t[5]=y[5]
	pxor	@t[6], @t[2]
	pxor	@t[6], @t[5]
	pxor	@y[6], @t[6]		# t[6]=y[6]
	pxor	@y[7], @t[7]		# t[7]=y[7]

	movdqa	@t[0],@XMM[0]
	movdqa	@t[1],@XMM[1]
	movdqa	@t[2],@XMM[2]
	movdqa	@t[3],@XMM[3]
	movdqa	@t[4],@XMM[4]
	movdqa	@t[5],@XMM[5]
	movdqa	@t[6],@XMM[6]
	movdqa	@t[7],@XMM[7]
___
}

sub aesenc {				# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
}

sub aesenclast {			# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&ShiftRows	(@b,@t[0]);
	&Sbox		(@b,@t);
$code.=<<___
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}

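# [Added commentary: swapmove is the classic bit-matrix "delta swap":
# it computes t = (($b >> $n) ^ $a) & $mask and applies t back to both
# registers, exchanging the $mask-selected bits of $a with the
# corresponding bits of $b. Three passes with n = 1, 2, 4 (masks
# .LBS0/.LBS1/.LBS2 in bitslice() below) transpose the 8x8 bit matrix
# and thus bitslice eight 128-bit blocks.]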
sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$b,$t
	psrlq	\$$n,$b
	pxor	$a,$b
	pand	$mask,$b
	pxor	$b,$a
	psllq	\$$n,$b
	pxor	$t,$b
___
}
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	movdqa	$b1,$t1
	psrlq	\$$n,$b1
	pxor	$a0,$b0
	pxor	$a1,$b1
	pand	$mask,$b0
	pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	pxor	$b1,$a1
	psllq	\$$n,$b1
	pxor	$t0,$b0
	pxor	$t1,$b1
___
}

sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}

$code.=<<___;
.text

.extern	asm_AES_encrypt
.extern	asm_AES_decrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x50($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&Sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_decrypt8,\@abi-omnipotent
.align	64
_bsaes_decrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Ldec_sbox
.align	16
.Ldec_loop:
___
	&ShiftRows	(@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
	&InvSbox	(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Ldec_done
___
	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
	movdqa	-0x10($const), @XMM[8]	# .LISR
	jnz	.Ldec_loop
	movdqa	-0x20($const), @XMM[8]	# .LISRM0
	jmp	.Ldec_loop
.align	16
.Ldec_done:
___
	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[5]
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	ret
.size	_bsaes_decrypt8,.-_bsaes_decrypt8
___
}
{
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");

sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}

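# [Added commentary: _bsaes_key_convert expands a conventional key
# schedule into bit-sliced form. Each round key is pshufb'ed by .LM0,
# then every bit plane is extracted with pand against the 0x01, 0x02,
# ... 0x80 masks and widened to all-zeros/all-ones with pcmpeqb.
# Planes 0, 1, 5 and 6 are complemented with the all-ones %xmm5,
# which folds the S-box affine constant 0x63 = 0b01100011 into the
# round keys, so the rounds themselves don't have to add it.]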
$code.=<<___;
.type	_bsaes_key_convert,\@abi-omnipotent
.align	16
_bsaes_key_convert:
	lea	.Lmasks(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	lea	0x10($inp), $inp
	movdqa	0x00($const), %xmm0	# 0x01...
	movdqa	0x10($const), %xmm1	# 0x02...
	movdqa	0x20($const), %xmm2	# 0x04...
	movdqa	0x30($const), %xmm3	# 0x08...
	movdqa	0x40($const), %xmm4	# .LM0
	pcmpeqd	%xmm5, %xmm5		# .LNOT

	movdqu	($inp), %xmm6		# load round 1 key
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm4, %xmm6		# .LM0

	movdqa	%xmm0, %xmm8
	movdqa	%xmm1, %xmm9

	pand	%xmm6, %xmm8
	pand	%xmm6, %xmm9
	movdqa	%xmm2, %xmm10
	pcmpeqb	%xmm0, %xmm8
	psllq	\$4, %xmm0		# 0x10...
	movdqa	%xmm3, %xmm11
	pcmpeqb	%xmm1, %xmm9
	psllq	\$4, %xmm1		# 0x20...

	pand	%xmm6, %xmm10
	pand	%xmm6, %xmm11
	movdqa	%xmm0, %xmm12
	pcmpeqb	%xmm2, %xmm10
	psllq	\$4, %xmm2		# 0x40...
	movdqa	%xmm1, %xmm13
	pcmpeqb	%xmm3, %xmm11
	psllq	\$4, %xmm3		# 0x80...

	movdqa	%xmm2, %xmm14
	movdqa	%xmm3, %xmm15
	pxor	%xmm5, %xmm8		# "pnot"
	pxor	%xmm5, %xmm9

	pand	%xmm6, %xmm12
	pand	%xmm6, %xmm13
	movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
	pcmpeqb	%xmm0, %xmm12
	psrlq	\$4, %xmm0		# 0x01...
	movdqa	%xmm9, 0x10($out)
	pcmpeqb	%xmm1, %xmm13
	psrlq	\$4, %xmm1		# 0x02...
	lea	0x10($inp), $inp

	pand	%xmm6, %xmm14
	pand	%xmm6, %xmm15
	movdqa	%xmm10, 0x20($out)
	pcmpeqb	%xmm2, %xmm14
	psrlq	\$4, %xmm2		# 0x04...
	movdqa	%xmm11, 0x30($out)
	pcmpeqb	%xmm3, %xmm15
	psrlq	\$4, %xmm3		# 0x08...
	movdqu	($inp), %xmm6		# load next round key

	pxor	%xmm5, %xmm13		# "pnot"
	pxor	%xmm5, %xmm14
	movdqa	%xmm12, 0x40($out)
	movdqa	%xmm13, 0x50($out)
	movdqa	%xmm14, 0x60($out)
	movdqa	%xmm15, 0x70($out)
	lea	0x80($out),$out
	dec	$rounds
	jnz	.Lkey_loop

	movdqa	0x50($const), %xmm7	# .L63
	#movdqa	%xmm6, ($out)		# don't save last round key
	ret
.size	_bsaes_key_convert,.-_bsaes_key_convert
___
}

if (0 && !$win64) {	# following four functions are unsupported interface
			# used for benchmarking...
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128

.globl	bsaes_dec_key_convert
.type	bsaes_dec_key_convert,\@function,2
.align	16
bsaes_dec_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_key_convert
	pxor	($out),%xmm7		# fix up round 0 key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,($out)
	ret
.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert

.globl	bsaes_decrypt_128
.type	bsaes_decrypt_128,\@function,4
.align	16
bsaes_decrypt_128:
.Ldec128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Ldec128_loop
	ret
.size	bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
{
######################################################################
#
# OpenSSL interface
#
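# [Added commentary: $arg1..$arg6 below paper over the two x86_64
# calling conventions: Win64 passes the first four arguments in
# %rcx/%rdx/%r8/%r9 and the rest on the stack (hence the explicit
# "pull ivp"/"pull key2" loads in the prologues), while the System V
# ABI uses %rdi/%rsi/%rdx/%rcx/%r8/%r9. Arguments are copied into the
# callee-saved %r12-%r15 right away so both conventions share a
# single function body.]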
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");

if ($ecb) {
$code.=<<___;
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
	mov	%rsp, %rax
.Lecb_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	sub	\$8,$len
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_encrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_enc_loop

	add	\$8,$len
	jz	.Lecb_enc_done

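	# [Added commentary: 1..7 leftover blocks are still run
	# through _bsaes_encrypt8 -- slots that were never loaded just
	# carry stale data which is encrypted and then simply not
	# stored. The cmp/jb/je ladder below loads exactly the right
	# number of blocks and dispatches to the matching
	# .Lecb_enc_* tail.]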
	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_enc_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_enc_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_enc_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_enc_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_enc_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_enc_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_six:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_five:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_four:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_three:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_two:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_one:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_enc_short

.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lecb_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_enc_epilogue:
	ret
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks

.globl	bsaes_ecb_decrypt_blocks
.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_decrypt_blocks:
	mov	%rsp, %rax
.Lecb_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_dec_short

	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	sub	\$8,$len
.Lecb_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp

	call	_bsaes_decrypt8

	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_dec_loop

	add	\$8,$len
	jz	.Lecb_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_dec_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_six:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_five:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_four:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_three:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_two:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_one:
	call	_bsaes_decrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_dec_done
.align	16
.Lecb_dec_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_dec_short

.Lecb_dec_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lecb_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lecb_dec_epilogue:
	ret
.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
___
}
$code.=<<___;
.extern	asm_AES_cbc_encrypt
.globl	bsaes_cbc_encrypt
.type	bsaes_cbc_encrypt,\@abi-omnipotent
.align	16
bsaes_cbc_encrypt:
___
$code.=<<___ if ($win64);
	mov	48(%rsp),$arg6		# pull direction flag
___
$code.=<<___;
	cmp	\$0,$arg6
	jne	asm_AES_cbc_encrypt
	cmp	\$128,$arg3
	jb	asm_AES_cbc_encrypt

	mov	%rsp, %rax
.Lcbc_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	mov	$arg5, %rbx
	shr	\$4, $len		# bytes to blocks

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp),%xmm7		# fix up 0 round key
	movdqa	%xmm6,(%rax)		# save last round key
	movdqa	%xmm7,(%rsp)

	movdqu	(%rbx), @XMM[15]	# load IV
	sub	\$8,$len
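	# [Added commentary: to keep eight blocks in flight, CBC
	# decryption re-loads the ciphertext from the input pointer
	# *after* _bsaes_decrypt8 returns and XORs it in as the
	# chaining values; since the re-loads precede the stores,
	# in-place operation (out == inp) remains safe.]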
.Lcbc_dec_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%edx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV

	call	_bsaes_decrypt8

	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[13], @XMM[3]
	movdqu	0x70($inp), @XMM[15]	# IV
	pxor	@XMM[14], @XMM[5]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x80($inp), $inp
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lcbc_dec_loop

	add	\$8,$len
	jz	.Lcbc_dec_done

	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%edx, %r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lcbc_dec_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lcbc_dec_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lcbc_dec_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lcbc_dec_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lcbc_dec_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lcbc_dec_six
	movdqu	0x60($inp), @XMM[6]
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[12], @XMM[7]
	movdqu	0x60($inp), @XMM[15]	# IV
	pxor	@XMM[13], @XMM[3]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_six:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[11], @XMM[2]
	movdqu	0x50($inp), @XMM[15]	# IV
	pxor	@XMM[12], @XMM[7]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_five:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[10], @XMM[4]
	movdqu	0x40($inp), @XMM[15]	# IV
	pxor	@XMM[11], @XMM[2]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_four:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[9], @XMM[6]
	movdqu	0x30($inp), @XMM[15]	# IV
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_three:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[8], @XMM[1]
	movdqu	0x20($inp), @XMM[15]	# IV
	pxor	@XMM[9], @XMM[6]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_two:
	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
	call	_bsaes_decrypt8
	pxor	0x20(%rbp), @XMM[0]	# ^= IV
	movdqu	0x00($inp), @XMM[8]	# re-load input
	movdqu	0x10($inp), @XMM[15]	# IV
	pxor	@XMM[8], @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lcbc_dec_done
.align	16
.Lcbc_dec_one:
	lea	($inp), $arg1
	lea	0x20(%rbp), $arg2	# buffer output
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[15]	# ^= IV
	movdqu	@XMM[15], ($out)	# write output
	movdqa	@XMM[0], @XMM[15]	# IV

.Lcbc_dec_done:
	movdqu	@XMM[15], (%rbx)	# return IV
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lcbc_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lcbc_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lcbc_dec_epilogue:
	ret
.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt

.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ctr32_encrypt_blocks:
	mov	%rsp, %rax
.Lctr_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	cmp	\$8, $arg3
	jb	.Lctr_enc_short

	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6,%xmm7		# fix up last round key
	movdqa	%xmm7,(%rax)		# save last round key

	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
.Lctr_enc_loop:
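	# [Added commentary: the counter block was pshufb'ed by
	# .LSWPUP above so that its 32-bit counter word is
	# little-endian; plain paddd against .LADD1..LADD7 then
	# yields the next seven counter values, and .LSWPUPM0SR in
	# the borrowed encrypt8 prologue swaps the byte order back
	# while applying the usual input shuffle.]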
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7

	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	lea	.LBS0(%rip), %r11	# constants table
	pshufb	@XMM[8], @XMM[7]
	mov	%ebx,%r10d		# pass rounds

	call	_bsaes_encrypt8_bitslice

	sub	\$8,$len
	jc	.Lctr_enc_loop_done

	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
	jnz	.Lctr_enc_loop

	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	add	\$8, $len
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done

.align	16
.Lctr_enc_short:
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short

.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
my ($twmask,$twres,$twtmp)=@XMM[13..15];
$code.=<<___;
.globl	bsaes_xts_encrypt
.type	bsaes_xts_encrypt,\@abi-omnipotent
.align	16
bsaes_xts_encrypt:
	mov	%rsp, %rax
.Lxts_enc_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	%xmm6, %xmm7		# fix up last round key
	movdqa	%xmm7, (%rax)		# save last round key

	and	\$-16, $len
	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_enc_short
	jmp	.Lxts_enc_loop

.align	16
.Lxts_enc_loop:
___
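# [Added commentary: each tweak update below is a multiplication by x
# in GF(2^128): paddq shifts both 64-bit halves left by one, pcmpgtd
# against an all-zero register broadcasts the would-be carry bits,
# and pshufd 0x13 re-routes them so that, after pand with the
# .Lxts_magic constant, the carry out of bit 127 becomes the 0x87
# reduction in the low qword and the carry out of bit 63 propagates
# into bit 64.]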
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_enc_loop

.Lxts_enc_short:
	add	\$0x80, $len
	jz	.Lxts_enc_done
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_enc_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_enc_done:
	and	\$15, %ebx
	jz	.Lxts_enc_ret
	mov	$out, %rdx

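	# Ciphertext stealing for the trailing $len%16 bytes (count in %ebx):
	# each round trip moves one plaintext tail byte into the last full
	# ciphertext block at -16(%rdx) and evicts the displaced ciphertext
	# byte to the partial-block position, after which the spliced block
	# is re-encrypted under the final tweak. A rough C sketch of the
	# byte shuffle (illustrative only, n = remaining byte count):
	#
	#	for (i = 0; i < n; i++) {
	#		out[i]    = out[i-16];	/* head of C[m-1] becomes C[m] */
	#		out[i-16] = in[i];	/* splice in plaintext tail    */
	#	}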
.Lxts_enc_steal:
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_enc_steal

	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_enc_epilogue:
	ret
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,\@abi-omnipotent
.align	16
bsaes_xts_decrypt:
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull key2
	mov	0xa8(%rsp),$arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len

	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
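# The loop below runs the same tweak-update sequence as the encrypt path:
# paddq doubles both 64-bit halves of the tweak, pcmpgtd/pshufd replicate
# the carries out of bits 63 and 127 into whole dwords, and the
# .Lxts_magic mask (0x87,0,1,0) converts them into the cross-carry at
# bit 64 and the reduction by x^128 = x^7+x^2+x+1, i.e. 0x87. A scalar
# C sketch of one update (illustrative only, not part of this module):
#
#	void xts_mul_x(uint64_t t[2])
#	{
#		uint64_t c63  = t[0] >> 63;	/* carry into high half */
#		uint64_t c127 = t[1] >> 63;	/* carry out of the top */
#		t[1] = (t[1] << 1) | c63;
#		t[0] = (t[0] << 1) ^ (c127 * 0x87);
#	}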
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]		# psllq 1,$tweak
	pand	$twmask, $twres			# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp			# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80,$len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len
	jz	.Lxts_dec_done
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]		# psllq 1,$tweak
	pand	$twmask, $twres			# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp			# broadcast upper bits
	pxor	$twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`,$len
	je	.Lxts_dec_$i
___
    $code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
    }
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
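	# Tails for six or fewer remaining blocks. _bsaes_decrypt8 still
	# processes all eight lanes; lanes beyond the block count merely
	# hold leftover tweak values and their outputs are never written
	# back. A single remaining block bypasses the bit-sliced path and
	# falls back to asm_AES_decrypt (.Lxts_dec_1).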
.align	16
.Lxts_dec_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_decrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

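	# Ciphertext stealing on the decrypt side runs "backwards": the last
	# full input block is decrypted under the *next* tweak first, its
	# leading bytes are swapped with the trailing ciphertext bytes, and
	# the reassembled block is then decrypted under the current tweak,
	# which is parked in @XMM[6] across the asm_AES_decrypt calls.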
.Lxts_dec_done:
	and	\$15, %ebx
	jz	.Lxts_dec_ret

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	movdqa	@XMM[7], @XMM[6]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	movdqu	($inp), @XMM[0]
	pxor	$twres, @XMM[7]

	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	mov	$out, %rdx
	movdqu	@XMM[7], ($out)

.Lxts_dec_steal:
	movzb	16($inp), %eax
	movzb	(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, (%rdx)
	mov	%cl, 16(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1,%ebx
	jnz	.Lxts_dec_steal

	movdqu	($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[6], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[6]
	movdqu	@XMM[6], ($out)

.Lxts_dec_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_dec_bzero

	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_dec_epilogue:
	ret
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
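# Constants are gathered in a single 64-byte-aligned object so that all
# movdqa loads above can address them %rip-relatively: shuffle masks for
# (Inv)ShiftRows, bit-slicing masks, byte-swap and counter-increment
# vectors for CTR, the XTS reduction mask and the 0x63 affine constant.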
$code.=<<___;
.type	_bsaes_const,\@object
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long	0x87,0,1,0
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align	64
.size	_bsaes_const,.-_bsaes_const
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0(%rax),%rax		# adjust stack pointer

	mov	0x70(%rax),%rbp
	mov	0x68(%rax),%rbx
	mov	0x60(%rax),%r12
	mov	0x58(%rax),%r13
	mov	0x50(%rax),%r14
	mov	0x48(%rax),%r15
	lea	0x78(%rax),%rax		# adjust stack pointer
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
___
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
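# Each .pdata triple (prologue, epilogue, unwind-info RVA) registers one
# RUNTIME_FUNCTION with the Win64 unwinder; the matching .xdata entries
# below point at se_handler, which restores the saved %xmm6-%xmm15 block
# and the pushed general-purpose registers from the frame on exception.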
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;