1#! /usr/bin/env perl 2# Copyright 2011-2021 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10################################################################### 11### AES-128 [originally in CTR mode] ### 12### bitsliced implementation for Intel Core 2 processors ### 13### requires support of SSE extensions up to SSSE3 ### 14### Author: Emilia Käsper and Peter Schwabe ### 15### Date: 2009-03-19 ### 16### Public domain ### 17### ### 18### See http://homes.esat.kuleuven.be/~ekasper/#software for ### 19### further information. ### 20################################################################### 21# 22# September 2011. 23# 24# Started as transliteration to "perlasm" the original code has 25# undergone following changes: 26# 27# - code was made position-independent; 28# - rounds were folded into a loop resulting in >5x size reduction 29# from 12.5KB to 2.2KB; 30# - above was possible thanks to mixcolumns() modification that 31# allowed to feed its output back to aesenc[last], this was 32# achieved at cost of two additional inter-registers moves; 33# - some instruction reordering and interleaving; 34# - this module doesn't implement key setup subroutine, instead it 35# relies on conversion of "conventional" key schedule as returned 36# by AES_set_encrypt_key (see discussion below); 37# - first and last round keys are treated differently, which allowed 38# to skip one shiftrows(), reduce bit-sliced key schedule and 39# speed-up conversion by 22%; 40# - support for 192- and 256-bit keys was added; 41# 42# Resulting performance in CPU cycles spent to encrypt one byte out 43# of 4096-byte buffer with 128-bit key is: 44# 45# Emilia's this(*) difference 46# 47# Core 2 9.30 8.69 +7% 48# Nehalem(**) 7.63 6.88 +11% 49# Atom 17.1 16.4 +4% 50# Silvermont - 12.9 51# Goldmont - 8.85 52# 53# (*) Comparison is not completely fair, because "this" is ECB, 54# i.e. no extra processing such as counter values calculation 55# and xor-ing input as in Emilia's CTR implementation is 56# performed. However, the CTR calculations stand for not more 57# than 1% of total time, so comparison is *rather* fair. 58# 59# (**) Results were collected on Westmere, which is considered to 60# be equivalent to Nehalem for this code. 61# 62# As for key schedule conversion subroutine. Interface to OpenSSL 63# relies on per-invocation on-the-fly conversion. This naturally 64# has impact on performance, especially for short inputs. Conversion 65# time in CPU cycles and its ratio to CPU cycles spent in 8x block 66# function is: 67# 68# conversion conversion/8x block 69# Core 2 240 0.22 70# Nehalem 180 0.20 71# Atom 430 0.20 72# 73# The ratio values mean that 128-byte blocks will be processed 74# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, 75# etc. Then keep in mind that input sizes not divisible by 128 are 76# *effectively* slower, especially shortest ones, e.g. consecutive 77# 144-byte blocks are processed 44% slower than one would expect, 78# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" 79# it's still faster than ["hyper-threading-safe" code path in] 80# aes-x86_64.pl on all lengths above 64 bytes... 81# 82# October 2011. 83# 84# Add decryption procedure. Performance in CPU cycles spent to decrypt 85# one byte out of 4096-byte buffer with 128-bit key is: 86# 87# Core 2 9.98 88# Nehalem 7.80 89# Atom 17.9 90# Silvermont 14.0 91# Goldmont 10.2 92# 93# November 2011. 94# 95# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is 96# suboptimal, but XTS is meant to be used with larger blocks... 97# 98# <appro@openssl.org> 99 100# $output is the last argument if it looks like a file (it has an extension) 101# $flavour is the first argument if it doesn't look like a file 102$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 103$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 104 105$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 106 107$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 108( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 109( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 110die "can't locate x86_64-xlate.pl"; 111 112open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" 113 or die "can't call $xlate: $!"; 114*STDOUT=*OUT; 115 116my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); 117my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) 118my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... 119 120{ 121my ($key,$rounds,$const)=("%rax","%r10d","%r11"); 122 123sub Sbox { 124# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 125# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb 126my @b=@_[0..7]; 127my @t=@_[8..11]; 128my @s=@_[12..15]; 129 &InBasisChange (@b); 130 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); 131 &OutBasisChange (@b[7,1,4,2,6,5,0,3]); 132} 133 134sub InBasisChange { 135# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 136# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 137my @b=@_[0..7]; 138$code.=<<___; 139 pxor @b[6], @b[5] 140 pxor @b[1], @b[2] 141 pxor @b[0], @b[3] 142 pxor @b[2], @b[6] 143 pxor @b[0], @b[5] 144 145 pxor @b[3], @b[6] 146 pxor @b[7], @b[3] 147 pxor @b[5], @b[7] 148 pxor @b[4], @b[3] 149 pxor @b[5], @b[4] 150 pxor @b[1], @b[3] 151 152 pxor @b[7], @b[2] 153 pxor @b[5], @b[1] 154___ 155} 156 157sub OutBasisChange { 158# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 159# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb 160my @b=@_[0..7]; 161$code.=<<___; 162 pxor @b[6], @b[0] 163 pxor @b[4], @b[1] 164 pxor @b[0], @b[2] 165 pxor @b[6], @b[4] 166 pxor @b[1], @b[6] 167 168 pxor @b[5], @b[1] 169 pxor @b[3], @b[5] 170 pxor @b[7], @b[3] 171 pxor @b[5], @b[7] 172 pxor @b[5], @b[2] 173 174 pxor @b[7], @b[4] 175___ 176} 177 178sub InvSbox { 179# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 180# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb 181my @b=@_[0..7]; 182my @t=@_[8..11]; 183my @s=@_[12..15]; 184 &InvInBasisChange (@b); 185 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); 186 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); 187} 188 189sub InvInBasisChange { # OutBasisChange in reverse 190my @b=@_[5,1,2,6,3,7,0,4]; 191$code.=<<___ 192 pxor @b[7], @b[4] 193 194 pxor @b[5], @b[7] 195 pxor @b[5], @b[2] 196 pxor @b[7], @b[3] 197 pxor @b[3], @b[5] 198 pxor @b[5], @b[1] 199 200 pxor @b[1], @b[6] 201 pxor @b[0], @b[2] 202 pxor @b[6], @b[4] 203 pxor @b[6], @b[0] 204 pxor @b[4], @b[1] 205___ 206} 207 208sub InvOutBasisChange { # InBasisChange in reverse 209my @b=@_[2,5,7,3,6,1,0,4]; 210$code.=<<___; 211 pxor @b[5], @b[1] 212 pxor @b[7], @b[2] 213 214 pxor @b[1], @b[3] 215 pxor @b[5], @b[4] 216 pxor @b[5], @b[7] 217 pxor @b[4], @b[3] 218 pxor @b[0], @b[5] 219 pxor @b[7], @b[3] 220 pxor @b[2], @b[6] 221 pxor @b[1], @b[2] 222 pxor @b[3], @b[6] 223 224 pxor @b[0], @b[3] 225 pxor @b[6], @b[5] 226___ 227} 228 229sub Mul_GF4 { 230#;************************************************************* 231#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * 232#;************************************************************* 233my ($x0,$x1,$y0,$y1,$t0)=@_; 234$code.=<<___; 235 movdqa $y0, $t0 236 pxor $y1, $t0 237 pand $x0, $t0 238 pxor $x1, $x0 239 pand $y0, $x1 240 pand $y1, $x0 241 pxor $x1, $x0 242 pxor $t0, $x1 243___ 244} 245 246sub Mul_GF4_N { # not used, see next subroutine 247# multiply and scale by N 248my ($x0,$x1,$y0,$y1,$t0)=@_; 249$code.=<<___; 250 movdqa $y0, $t0 251 pxor $y1, $t0 252 pand $x0, $t0 253 pxor $x1, $x0 254 pand $y0, $x1 255 pand $y1, $x0 256 pxor $x0, $x1 257 pxor $t0, $x0 258___ 259} 260 261sub Mul_GF4_N_GF4 { 262# interleaved Mul_GF4_N and Mul_GF4 263my ($x0,$x1,$y0,$y1,$t0, 264 $x2,$x3,$y2,$y3,$t1)=@_; 265$code.=<<___; 266 movdqa $y0, $t0 267 movdqa $y2, $t1 268 pxor $y1, $t0 269 pxor $y3, $t1 270 pand $x0, $t0 271 pand $x2, $t1 272 pxor $x1, $x0 273 pxor $x3, $x2 274 pand $y0, $x1 275 pand $y2, $x3 276 pand $y1, $x0 277 pand $y3, $x2 278 pxor $x0, $x1 279 pxor $x3, $x2 280 pxor $t0, $x0 281 pxor $t1, $x3 282___ 283} 284sub Mul_GF16_2 { 285my @x=@_[0..7]; 286my @y=@_[8..11]; 287my @t=@_[12..15]; 288$code.=<<___; 289 movdqa @x[0], @t[0] 290 movdqa @x[1], @t[1] 291___ 292 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); 293$code.=<<___; 294 pxor @x[2], @t[0] 295 pxor @x[3], @t[1] 296 pxor @y[2], @y[0] 297 pxor @y[3], @y[1] 298___ 299 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 300 @x[2], @x[3], @y[2], @y[3], @t[2]); 301$code.=<<___; 302 pxor @t[0], @x[0] 303 pxor @t[0], @x[2] 304 pxor @t[1], @x[1] 305 pxor @t[1], @x[3] 306 307 movdqa @x[4], @t[0] 308 movdqa @x[5], @t[1] 309 pxor @x[6], @t[0] 310 pxor @x[7], @t[1] 311___ 312 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 313 @x[6], @x[7], @y[2], @y[3], @t[2]); 314$code.=<<___; 315 pxor @y[2], @y[0] 316 pxor @y[3], @y[1] 317___ 318 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); 319$code.=<<___; 320 pxor @t[0], @x[4] 321 pxor @t[0], @x[6] 322 pxor @t[1], @x[5] 323 pxor @t[1], @x[7] 324___ 325} 326sub Inv_GF256 { 327#;******************************************************************** 328#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * 329#;******************************************************************** 330my @x=@_[0..7]; 331my @t=@_[8..11]; 332my @s=@_[12..15]; 333# direct optimizations from hardware 334$code.=<<___; 335 movdqa @x[4], @t[3] 336 movdqa @x[5], @t[2] 337 movdqa @x[1], @t[1] 338 movdqa @x[7], @s[1] 339 movdqa @x[0], @s[0] 340 341 pxor @x[6], @t[3] 342 pxor @x[7], @t[2] 343 pxor @x[3], @t[1] 344 movdqa @t[3], @s[2] 345 pxor @x[6], @s[1] 346 movdqa @t[2], @t[0] 347 pxor @x[2], @s[0] 348 movdqa @t[3], @s[3] 349 350 por @t[1], @t[2] 351 por @s[0], @t[3] 352 pxor @t[0], @s[3] 353 pand @s[0], @s[2] 354 pxor @t[1], @s[0] 355 pand @t[1], @t[0] 356 pand @s[0], @s[3] 357 movdqa @x[3], @s[0] 358 pxor @x[2], @s[0] 359 pand @s[0], @s[1] 360 pxor @s[1], @t[3] 361 pxor @s[1], @t[2] 362 movdqa @x[4], @s[1] 363 movdqa @x[1], @s[0] 364 pxor @x[5], @s[1] 365 pxor @x[0], @s[0] 366 movdqa @s[1], @t[1] 367 pand @s[0], @s[1] 368 por @s[0], @t[1] 369 pxor @s[1], @t[0] 370 pxor @s[3], @t[3] 371 pxor @s[2], @t[2] 372 pxor @s[3], @t[1] 373 movdqa @x[7], @s[0] 374 pxor @s[2], @t[0] 375 movdqa @x[6], @s[1] 376 pxor @s[2], @t[1] 377 movdqa @x[5], @s[2] 378 pand @x[3], @s[0] 379 movdqa @x[4], @s[3] 380 pand @x[2], @s[1] 381 pand @x[1], @s[2] 382 por @x[0], @s[3] 383 pxor @s[0], @t[3] 384 pxor @s[1], @t[2] 385 pxor @s[2], @t[1] 386 pxor @s[3], @t[0] 387 388 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 389 390 # new smaller inversion 391 392 movdqa @t[3], @s[0] 393 pand @t[1], @t[3] 394 pxor @t[2], @s[0] 395 396 movdqa @t[0], @s[2] 397 movdqa @s[0], @s[3] 398 pxor @t[3], @s[2] 399 pand @s[2], @s[3] 400 401 movdqa @t[1], @s[1] 402 pxor @t[2], @s[3] 403 pxor @t[0], @s[1] 404 405 pxor @t[2], @t[3] 406 407 pand @t[3], @s[1] 408 409 movdqa @s[2], @t[2] 410 pxor @t[0], @s[1] 411 412 pxor @s[1], @t[2] 413 pxor @s[1], @t[1] 414 415 pand @t[0], @t[2] 416 417 pxor @t[2], @s[2] 418 pxor @t[2], @t[1] 419 420 pand @s[3], @s[2] 421 422 pxor @s[0], @s[2] 423___ 424# output in s3, s2, s1, t1 425 426# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 427 428# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 429 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); 430 431### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb 432} 433 434# AES linear components 435 436sub ShiftRows { 437my @x=@_[0..7]; 438my $mask=pop; 439$code.=<<___; 440 pxor 0x00($key),@x[0] 441 pxor 0x10($key),@x[1] 442 pxor 0x20($key),@x[2] 443 pxor 0x30($key),@x[3] 444 pshufb $mask,@x[0] 445 pshufb $mask,@x[1] 446 pxor 0x40($key),@x[4] 447 pxor 0x50($key),@x[5] 448 pshufb $mask,@x[2] 449 pshufb $mask,@x[3] 450 pxor 0x60($key),@x[6] 451 pxor 0x70($key),@x[7] 452 pshufb $mask,@x[4] 453 pshufb $mask,@x[5] 454 pshufb $mask,@x[6] 455 pshufb $mask,@x[7] 456 lea 0x80($key),$key 457___ 458} 459 460sub MixColumns { 461# modified to emit output in order suitable for feeding back to aesenc[last] 462my @x=@_[0..7]; 463my @t=@_[8..15]; 464my $inv=@_[16]; # optional 465$code.=<<___; 466 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 467 pshufd \$0x93, @x[1], @t[1] 468 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) 469 pshufd \$0x93, @x[2], @t[2] 470 pxor @t[1], @x[1] 471 pshufd \$0x93, @x[3], @t[3] 472 pxor @t[2], @x[2] 473 pshufd \$0x93, @x[4], @t[4] 474 pxor @t[3], @x[3] 475 pshufd \$0x93, @x[5], @t[5] 476 pxor @t[4], @x[4] 477 pshufd \$0x93, @x[6], @t[6] 478 pxor @t[5], @x[5] 479 pshufd \$0x93, @x[7], @t[7] 480 pxor @t[6], @x[6] 481 pxor @t[7], @x[7] 482 483 pxor @x[0], @t[1] 484 pxor @x[7], @t[0] 485 pxor @x[7], @t[1] 486 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) 487 pxor @x[1], @t[2] 488 pshufd \$0x4E, @x[1], @x[1] 489 pxor @x[4], @t[5] 490 pxor @t[0], @x[0] 491 pxor @x[5], @t[6] 492 pxor @t[1], @x[1] 493 pxor @x[3], @t[4] 494 pshufd \$0x4E, @x[4], @t[0] 495 pxor @x[6], @t[7] 496 pshufd \$0x4E, @x[5], @t[1] 497 pxor @x[2], @t[3] 498 pshufd \$0x4E, @x[3], @x[4] 499 pxor @x[7], @t[3] 500 pshufd \$0x4E, @x[7], @x[5] 501 pxor @x[7], @t[4] 502 pshufd \$0x4E, @x[6], @x[3] 503 pxor @t[4], @t[0] 504 pshufd \$0x4E, @x[2], @x[6] 505 pxor @t[5], @t[1] 506___ 507$code.=<<___ if (!$inv); 508 pxor @t[3], @x[4] 509 pxor @t[7], @x[5] 510 pxor @t[6], @x[3] 511 movdqa @t[0], @x[2] 512 pxor @t[2], @x[6] 513 movdqa @t[1], @x[7] 514___ 515$code.=<<___ if ($inv); 516 pxor @x[4], @t[3] 517 pxor @t[7], @x[5] 518 pxor @x[3], @t[6] 519 movdqa @t[0], @x[3] 520 pxor @t[2], @x[6] 521 movdqa @t[6], @x[2] 522 movdqa @t[1], @x[7] 523 movdqa @x[6], @x[4] 524 movdqa @t[3], @x[6] 525___ 526} 527 528sub InvMixColumns_orig { 529my @x=@_[0..7]; 530my @t=@_[8..15]; 531 532$code.=<<___; 533 # multiplication by 0x0e 534 pshufd \$0x93, @x[7], @t[7] 535 movdqa @x[2], @t[2] 536 pxor @x[5], @x[7] # 7 5 537 pxor @x[5], @x[2] # 2 5 538 pshufd \$0x93, @x[0], @t[0] 539 movdqa @x[5], @t[5] 540 pxor @x[0], @x[5] # 5 0 [1] 541 pxor @x[1], @x[0] # 0 1 542 pshufd \$0x93, @x[1], @t[1] 543 pxor @x[2], @x[1] # 1 25 544 pxor @x[6], @x[0] # 01 6 [2] 545 pxor @x[3], @x[1] # 125 3 [4] 546 pshufd \$0x93, @x[3], @t[3] 547 pxor @x[0], @x[2] # 25 016 [3] 548 pxor @x[7], @x[3] # 3 75 549 pxor @x[6], @x[7] # 75 6 [0] 550 pshufd \$0x93, @x[6], @t[6] 551 movdqa @x[4], @t[4] 552 pxor @x[4], @x[6] # 6 4 553 pxor @x[3], @x[4] # 4 375 [6] 554 pxor @x[7], @x[3] # 375 756=36 555 pxor @t[5], @x[6] # 64 5 [7] 556 pxor @t[2], @x[3] # 36 2 557 pxor @t[4], @x[3] # 362 4 [5] 558 pshufd \$0x93, @t[5], @t[5] 559___ 560 my @y = @x[7,5,0,2,1,3,4,6]; 561$code.=<<___; 562 # multiplication by 0x0b 563 pxor @y[0], @y[1] 564 pxor @t[0], @y[0] 565 pxor @t[1], @y[1] 566 pshufd \$0x93, @t[2], @t[2] 567 pxor @t[5], @y[0] 568 pxor @t[6], @y[1] 569 pxor @t[7], @y[0] 570 pshufd \$0x93, @t[4], @t[4] 571 pxor @t[6], @t[7] # clobber t[7] 572 pxor @y[0], @y[1] 573 574 pxor @t[0], @y[3] 575 pshufd \$0x93, @t[0], @t[0] 576 pxor @t[1], @y[2] 577 pxor @t[1], @y[4] 578 pxor @t[2], @y[2] 579 pshufd \$0x93, @t[1], @t[1] 580 pxor @t[2], @y[3] 581 pxor @t[2], @y[5] 582 pxor @t[7], @y[2] 583 pshufd \$0x93, @t[2], @t[2] 584 pxor @t[3], @y[3] 585 pxor @t[3], @y[6] 586 pxor @t[3], @y[4] 587 pshufd \$0x93, @t[3], @t[3] 588 pxor @t[4], @y[7] 589 pxor @t[4], @y[5] 590 pxor @t[7], @y[7] 591 pxor @t[5], @y[3] 592 pxor @t[4], @y[4] 593 pxor @t[5], @t[7] # clobber t[7] even more 594 595 pxor @t[7], @y[5] 596 pshufd \$0x93, @t[4], @t[4] 597 pxor @t[7], @y[6] 598 pxor @t[7], @y[4] 599 600 pxor @t[5], @t[7] 601 pshufd \$0x93, @t[5], @t[5] 602 pxor @t[6], @t[7] # restore t[7] 603 604 # multiplication by 0x0d 605 pxor @y[7], @y[4] 606 pxor @t[4], @y[7] 607 pshufd \$0x93, @t[6], @t[6] 608 pxor @t[0], @y[2] 609 pxor @t[5], @y[7] 610 pxor @t[2], @y[2] 611 pshufd \$0x93, @t[7], @t[7] 612 613 pxor @y[1], @y[3] 614 pxor @t[1], @y[1] 615 pxor @t[0], @y[0] 616 pxor @t[0], @y[3] 617 pxor @t[5], @y[1] 618 pxor @t[5], @y[0] 619 pxor @t[7], @y[1] 620 pshufd \$0x93, @t[0], @t[0] 621 pxor @t[6], @y[0] 622 pxor @y[1], @y[3] 623 pxor @t[1], @y[4] 624 pshufd \$0x93, @t[1], @t[1] 625 626 pxor @t[7], @y[7] 627 pxor @t[2], @y[4] 628 pxor @t[2], @y[5] 629 pshufd \$0x93, @t[2], @t[2] 630 pxor @t[6], @y[2] 631 pxor @t[3], @t[6] # clobber t[6] 632 pxor @y[7], @y[4] 633 pxor @t[6], @y[3] 634 635 pxor @t[6], @y[6] 636 pxor @t[5], @y[5] 637 pxor @t[4], @y[6] 638 pshufd \$0x93, @t[4], @t[4] 639 pxor @t[6], @y[5] 640 pxor @t[7], @y[6] 641 pxor @t[3], @t[6] # restore t[6] 642 643 pshufd \$0x93, @t[5], @t[5] 644 pshufd \$0x93, @t[6], @t[6] 645 pshufd \$0x93, @t[7], @t[7] 646 pshufd \$0x93, @t[3], @t[3] 647 648 # multiplication by 0x09 649 pxor @y[1], @y[4] 650 pxor @y[1], @t[1] # t[1]=y[1] 651 pxor @t[5], @t[0] # clobber t[0] 652 pxor @t[5], @t[1] 653 pxor @t[0], @y[3] 654 pxor @y[0], @t[0] # t[0]=y[0] 655 pxor @t[6], @t[1] 656 pxor @t[7], @t[6] # clobber t[6] 657 pxor @t[1], @y[4] 658 pxor @t[4], @y[7] 659 pxor @y[4], @t[4] # t[4]=y[4] 660 pxor @t[3], @y[6] 661 pxor @y[3], @t[3] # t[3]=y[3] 662 pxor @t[2], @y[5] 663 pxor @y[2], @t[2] # t[2]=y[2] 664 pxor @t[7], @t[3] 665 pxor @y[5], @t[5] # t[5]=y[5] 666 pxor @t[6], @t[2] 667 pxor @t[6], @t[5] 668 pxor @y[6], @t[6] # t[6]=y[6] 669 pxor @y[7], @t[7] # t[7]=y[7] 670 671 movdqa @t[0],@XMM[0] 672 movdqa @t[1],@XMM[1] 673 movdqa @t[2],@XMM[2] 674 movdqa @t[3],@XMM[3] 675 movdqa @t[4],@XMM[4] 676 movdqa @t[5],@XMM[5] 677 movdqa @t[6],@XMM[6] 678 movdqa @t[7],@XMM[7] 679___ 680} 681 682sub InvMixColumns { 683my @x=@_[0..7]; 684my @t=@_[8..15]; 685 686# Thanks to Jussi Kivilinna for providing pointer to 687# 688# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | 689# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | 690# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | 691# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | 692 693$code.=<<___; 694 # multiplication by 0x05-0x00-0x04-0x00 695 pshufd \$0x4E, @x[0], @t[0] 696 pshufd \$0x4E, @x[6], @t[6] 697 pxor @x[0], @t[0] 698 pshufd \$0x4E, @x[7], @t[7] 699 pxor @x[6], @t[6] 700 pshufd \$0x4E, @x[1], @t[1] 701 pxor @x[7], @t[7] 702 pshufd \$0x4E, @x[2], @t[2] 703 pxor @x[1], @t[1] 704 pshufd \$0x4E, @x[3], @t[3] 705 pxor @x[2], @t[2] 706 pxor @t[6], @x[0] 707 pxor @t[6], @x[1] 708 pshufd \$0x4E, @x[4], @t[4] 709 pxor @x[3], @t[3] 710 pxor @t[0], @x[2] 711 pxor @t[1], @x[3] 712 pshufd \$0x4E, @x[5], @t[5] 713 pxor @x[4], @t[4] 714 pxor @t[7], @x[1] 715 pxor @t[2], @x[4] 716 pxor @x[5], @t[5] 717 718 pxor @t[7], @x[2] 719 pxor @t[6], @x[3] 720 pxor @t[6], @x[4] 721 pxor @t[3], @x[5] 722 pxor @t[4], @x[6] 723 pxor @t[7], @x[4] 724 pxor @t[7], @x[5] 725 pxor @t[5], @x[7] 726___ 727 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 728} 729 730sub aesenc { # not used 731my @b=@_[0..7]; 732my @t=@_[8..15]; 733$code.=<<___; 734 movdqa 0x30($const),@t[0] # .LSR 735___ 736 &ShiftRows (@b,@t[0]); 737 &Sbox (@b,@t); 738 &MixColumns (@b[0,1,4,6,3,7,2,5],@t); 739} 740 741sub aesenclast { # not used 742my @b=@_[0..7]; 743my @t=@_[8..15]; 744$code.=<<___; 745 movdqa 0x40($const),@t[0] # .LSRM0 746___ 747 &ShiftRows (@b,@t[0]); 748 &Sbox (@b,@t); 749$code.=<<___ 750 pxor 0x00($key),@b[0] 751 pxor 0x10($key),@b[1] 752 pxor 0x20($key),@b[4] 753 pxor 0x30($key),@b[6] 754 pxor 0x40($key),@b[3] 755 pxor 0x50($key),@b[7] 756 pxor 0x60($key),@b[2] 757 pxor 0x70($key),@b[5] 758___ 759} 760 761sub swapmove { 762my ($a,$b,$n,$mask,$t)=@_; 763$code.=<<___; 764 movdqa $b,$t 765 psrlq \$$n,$b 766 pxor $a,$b 767 pand $mask,$b 768 pxor $b,$a 769 psllq \$$n,$b 770 pxor $t,$b 771___ 772} 773sub swapmove2x { 774my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; 775$code.=<<___; 776 movdqa $b0,$t0 777 psrlq \$$n,$b0 778 movdqa $b1,$t1 779 psrlq \$$n,$b1 780 pxor $a0,$b0 781 pxor $a1,$b1 782 pand $mask,$b0 783 pand $mask,$b1 784 pxor $b0,$a0 785 psllq \$$n,$b0 786 pxor $b1,$a1 787 psllq \$$n,$b1 788 pxor $t0,$b0 789 pxor $t1,$b1 790___ 791} 792 793sub bitslice { 794my @x=reverse(@_[0..7]); 795my ($t0,$t1,$t2,$t3)=@_[8..11]; 796$code.=<<___; 797 movdqa 0x00($const),$t0 # .LBS0 798 movdqa 0x10($const),$t1 # .LBS1 799___ 800 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); 801 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 802$code.=<<___; 803 movdqa 0x20($const),$t0 # .LBS2 804___ 805 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); 806 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 807 808 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); 809 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); 810} 811 812$code.=<<___; 813.text 814 815.extern asm_AES_encrypt 816.extern asm_AES_decrypt 817 818.type _bsaes_encrypt8,\@abi-omnipotent 819.align 64 820_bsaes_encrypt8: 821.cfi_startproc 822 lea .LBS0(%rip), $const # constants table 823 824 movdqa ($key), @XMM[9] # round 0 key 825 lea 0x10($key), $key 826 movdqa 0x50($const), @XMM[8] # .LM0SR 827 pxor @XMM[9], @XMM[0] # xor with round0 key 828 pxor @XMM[9], @XMM[1] 829 pxor @XMM[9], @XMM[2] 830 pxor @XMM[9], @XMM[3] 831 pshufb @XMM[8], @XMM[0] 832 pshufb @XMM[8], @XMM[1] 833 pxor @XMM[9], @XMM[4] 834 pxor @XMM[9], @XMM[5] 835 pshufb @XMM[8], @XMM[2] 836 pshufb @XMM[8], @XMM[3] 837 pxor @XMM[9], @XMM[6] 838 pxor @XMM[9], @XMM[7] 839 pshufb @XMM[8], @XMM[4] 840 pshufb @XMM[8], @XMM[5] 841 pshufb @XMM[8], @XMM[6] 842 pshufb @XMM[8], @XMM[7] 843_bsaes_encrypt8_bitslice: 844___ 845 &bitslice (@XMM[0..7, 8..11]); 846$code.=<<___; 847 dec $rounds 848 jmp .Lenc_sbox 849.align 16 850.Lenc_loop: 851___ 852 &ShiftRows (@XMM[0..7, 8]); 853$code.=".Lenc_sbox:\n"; 854 &Sbox (@XMM[0..7, 8..15]); 855$code.=<<___; 856 dec $rounds 857 jl .Lenc_done 858___ 859 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); 860$code.=<<___; 861 movdqa 0x30($const), @XMM[8] # .LSR 862 jnz .Lenc_loop 863 movdqa 0x40($const), @XMM[8] # .LSRM0 864 jmp .Lenc_loop 865.align 16 866.Lenc_done: 867___ 868 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb 869 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); 870$code.=<<___; 871 movdqa ($key), @XMM[8] # last round key 872 pxor @XMM[8], @XMM[4] 873 pxor @XMM[8], @XMM[6] 874 pxor @XMM[8], @XMM[3] 875 pxor @XMM[8], @XMM[7] 876 pxor @XMM[8], @XMM[2] 877 pxor @XMM[8], @XMM[5] 878 pxor @XMM[8], @XMM[0] 879 pxor @XMM[8], @XMM[1] 880 ret 881.cfi_endproc 882.size _bsaes_encrypt8,.-_bsaes_encrypt8 883 884.type _bsaes_decrypt8,\@abi-omnipotent 885.align 64 886_bsaes_decrypt8: 887.cfi_startproc 888 lea .LBS0(%rip), $const # constants table 889 890 movdqa ($key), @XMM[9] # round 0 key 891 lea 0x10($key), $key 892 movdqa -0x30($const), @XMM[8] # .LM0ISR 893 pxor @XMM[9], @XMM[0] # xor with round0 key 894 pxor @XMM[9], @XMM[1] 895 pxor @XMM[9], @XMM[2] 896 pxor @XMM[9], @XMM[3] 897 pshufb @XMM[8], @XMM[0] 898 pshufb @XMM[8], @XMM[1] 899 pxor @XMM[9], @XMM[4] 900 pxor @XMM[9], @XMM[5] 901 pshufb @XMM[8], @XMM[2] 902 pshufb @XMM[8], @XMM[3] 903 pxor @XMM[9], @XMM[6] 904 pxor @XMM[9], @XMM[7] 905 pshufb @XMM[8], @XMM[4] 906 pshufb @XMM[8], @XMM[5] 907 pshufb @XMM[8], @XMM[6] 908 pshufb @XMM[8], @XMM[7] 909___ 910 &bitslice (@XMM[0..7, 8..11]); 911$code.=<<___; 912 dec $rounds 913 jmp .Ldec_sbox 914.align 16 915.Ldec_loop: 916___ 917 &ShiftRows (@XMM[0..7, 8]); 918$code.=".Ldec_sbox:\n"; 919 &InvSbox (@XMM[0..7, 8..15]); 920$code.=<<___; 921 dec $rounds 922 jl .Ldec_done 923___ 924 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); 925$code.=<<___; 926 movdqa -0x10($const), @XMM[8] # .LISR 927 jnz .Ldec_loop 928 movdqa -0x20($const), @XMM[8] # .LISRM0 929 jmp .Ldec_loop 930.align 16 931.Ldec_done: 932___ 933 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); 934$code.=<<___; 935 movdqa ($key), @XMM[8] # last round key 936 pxor @XMM[8], @XMM[6] 937 pxor @XMM[8], @XMM[4] 938 pxor @XMM[8], @XMM[2] 939 pxor @XMM[8], @XMM[7] 940 pxor @XMM[8], @XMM[3] 941 pxor @XMM[8], @XMM[5] 942 pxor @XMM[8], @XMM[0] 943 pxor @XMM[8], @XMM[1] 944 ret 945.cfi_endproc 946.size _bsaes_decrypt8,.-_bsaes_decrypt8 947___ 948} 949{ 950my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); 951 952sub bitslice_key { 953my @x=reverse(@_[0..7]); 954my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; 955 956 &swapmove (@x[0,1],1,$bs0,$t2,$t3); 957$code.=<<___; 958 #&swapmove(@x[2,3],1,$t0,$t2,$t3); 959 movdqa @x[0], @x[2] 960 movdqa @x[1], @x[3] 961___ 962 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 963 964 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); 965$code.=<<___; 966 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 967 movdqa @x[0], @x[4] 968 movdqa @x[2], @x[6] 969 movdqa @x[1], @x[5] 970 movdqa @x[3], @x[7] 971___ 972 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); 973 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); 974} 975 976$code.=<<___; 977.type _bsaes_key_convert,\@abi-omnipotent 978.align 16 979_bsaes_key_convert: 980.cfi_startproc 981 lea .Lmasks(%rip), $const 982 movdqu ($inp), %xmm7 # load round 0 key 983 lea 0x10($inp), $inp 984 movdqa 0x00($const), %xmm0 # 0x01... 985 movdqa 0x10($const), %xmm1 # 0x02... 986 movdqa 0x20($const), %xmm2 # 0x04... 987 movdqa 0x30($const), %xmm3 # 0x08... 988 movdqa 0x40($const), %xmm4 # .LM0 989 pcmpeqd %xmm5, %xmm5 # .LNOT 990 991 movdqu ($inp), %xmm6 # load round 1 key 992 movdqa %xmm7, ($out) # save round 0 key 993 lea 0x10($out), $out 994 dec $rounds 995 jmp .Lkey_loop 996.align 16 997.Lkey_loop: 998 pshufb %xmm4, %xmm6 # .LM0 999 1000 movdqa %xmm0, %xmm8 1001 movdqa %xmm1, %xmm9 1002 1003 pand %xmm6, %xmm8 1004 pand %xmm6, %xmm9 1005 movdqa %xmm2, %xmm10 1006 pcmpeqb %xmm0, %xmm8 1007 psllq \$4, %xmm0 # 0x10... 1008 movdqa %xmm3, %xmm11 1009 pcmpeqb %xmm1, %xmm9 1010 psllq \$4, %xmm1 # 0x20... 1011 1012 pand %xmm6, %xmm10 1013 pand %xmm6, %xmm11 1014 movdqa %xmm0, %xmm12 1015 pcmpeqb %xmm2, %xmm10 1016 psllq \$4, %xmm2 # 0x40... 1017 movdqa %xmm1, %xmm13 1018 pcmpeqb %xmm3, %xmm11 1019 psllq \$4, %xmm3 # 0x80... 1020 1021 movdqa %xmm2, %xmm14 1022 movdqa %xmm3, %xmm15 1023 pxor %xmm5, %xmm8 # "pnot" 1024 pxor %xmm5, %xmm9 1025 1026 pand %xmm6, %xmm12 1027 pand %xmm6, %xmm13 1028 movdqa %xmm8, 0x00($out) # write bit-sliced round key 1029 pcmpeqb %xmm0, %xmm12 1030 psrlq \$4, %xmm0 # 0x01... 1031 movdqa %xmm9, 0x10($out) 1032 pcmpeqb %xmm1, %xmm13 1033 psrlq \$4, %xmm1 # 0x02... 1034 lea 0x10($inp), $inp 1035 1036 pand %xmm6, %xmm14 1037 pand %xmm6, %xmm15 1038 movdqa %xmm10, 0x20($out) 1039 pcmpeqb %xmm2, %xmm14 1040 psrlq \$4, %xmm2 # 0x04... 1041 movdqa %xmm11, 0x30($out) 1042 pcmpeqb %xmm3, %xmm15 1043 psrlq \$4, %xmm3 # 0x08... 1044 movdqu ($inp), %xmm6 # load next round key 1045 1046 pxor %xmm5, %xmm13 # "pnot" 1047 pxor %xmm5, %xmm14 1048 movdqa %xmm12, 0x40($out) 1049 movdqa %xmm13, 0x50($out) 1050 movdqa %xmm14, 0x60($out) 1051 movdqa %xmm15, 0x70($out) 1052 lea 0x80($out),$out 1053 dec $rounds 1054 jnz .Lkey_loop 1055 1056 movdqa 0x50($const), %xmm7 # .L63 1057 #movdqa %xmm6, ($out) # don't save last round key 1058 ret 1059.cfi_endproc 1060.size _bsaes_key_convert,.-_bsaes_key_convert 1061___ 1062} 1063 1064if (0 && !$win64) { # following four functions are unsupported interface 1065 # used for benchmarking... 1066$code.=<<___; 1067.globl bsaes_enc_key_convert 1068.type bsaes_enc_key_convert,\@function,2 1069.align 16 1070bsaes_enc_key_convert: 1071 mov 240($inp),%r10d # pass rounds 1072 mov $inp,%rcx # pass key 1073 mov $out,%rax # pass key schedule 1074 call _bsaes_key_convert 1075 pxor %xmm6,%xmm7 # fix up last round key 1076 movdqa %xmm7,(%rax) # save last round key 1077 ret 1078.size bsaes_enc_key_convert,.-bsaes_enc_key_convert 1079 1080.globl bsaes_encrypt_128 1081.type bsaes_encrypt_128,\@function,4 1082.align 16 1083bsaes_encrypt_128: 1084.Lenc128_loop: 1085 movdqu 0x00($inp), @XMM[0] # load input 1086 movdqu 0x10($inp), @XMM[1] 1087 movdqu 0x20($inp), @XMM[2] 1088 movdqu 0x30($inp), @XMM[3] 1089 movdqu 0x40($inp), @XMM[4] 1090 movdqu 0x50($inp), @XMM[5] 1091 movdqu 0x60($inp), @XMM[6] 1092 movdqu 0x70($inp), @XMM[7] 1093 mov $key, %rax # pass the $key 1094 lea 0x80($inp), $inp 1095 mov \$10,%r10d 1096 1097 call _bsaes_encrypt8 1098 1099 movdqu @XMM[0], 0x00($out) # write output 1100 movdqu @XMM[1], 0x10($out) 1101 movdqu @XMM[4], 0x20($out) 1102 movdqu @XMM[6], 0x30($out) 1103 movdqu @XMM[3], 0x40($out) 1104 movdqu @XMM[7], 0x50($out) 1105 movdqu @XMM[2], 0x60($out) 1106 movdqu @XMM[5], 0x70($out) 1107 lea 0x80($out), $out 1108 sub \$0x80,$len 1109 ja .Lenc128_loop 1110 ret 1111.size bsaes_encrypt_128,.-bsaes_encrypt_128 1112 1113.globl bsaes_dec_key_convert 1114.type bsaes_dec_key_convert,\@function,2 1115.align 16 1116bsaes_dec_key_convert: 1117 mov 240($inp),%r10d # pass rounds 1118 mov $inp,%rcx # pass key 1119 mov $out,%rax # pass key schedule 1120 call _bsaes_key_convert 1121 pxor ($out),%xmm7 # fix up round 0 key 1122 movdqa %xmm6,(%rax) # save last round key 1123 movdqa %xmm7,($out) 1124 ret 1125.size bsaes_dec_key_convert,.-bsaes_dec_key_convert 1126 1127.globl bsaes_decrypt_128 1128.type bsaes_decrypt_128,\@function,4 1129.align 16 1130bsaes_decrypt_128: 1131.Ldec128_loop: 1132 movdqu 0x00($inp), @XMM[0] # load input 1133 movdqu 0x10($inp), @XMM[1] 1134 movdqu 0x20($inp), @XMM[2] 1135 movdqu 0x30($inp), @XMM[3] 1136 movdqu 0x40($inp), @XMM[4] 1137 movdqu 0x50($inp), @XMM[5] 1138 movdqu 0x60($inp), @XMM[6] 1139 movdqu 0x70($inp), @XMM[7] 1140 mov $key, %rax # pass the $key 1141 lea 0x80($inp), $inp 1142 mov \$10,%r10d 1143 1144 call _bsaes_decrypt8 1145 1146 movdqu @XMM[0], 0x00($out) # write output 1147 movdqu @XMM[1], 0x10($out) 1148 movdqu @XMM[6], 0x20($out) 1149 movdqu @XMM[4], 0x30($out) 1150 movdqu @XMM[2], 0x40($out) 1151 movdqu @XMM[7], 0x50($out) 1152 movdqu @XMM[3], 0x60($out) 1153 movdqu @XMM[5], 0x70($out) 1154 lea 0x80($out), $out 1155 sub \$0x80,$len 1156 ja .Ldec128_loop 1157 ret 1158.size bsaes_decrypt_128,.-bsaes_decrypt_128 1159___ 1160} 1161{ 1162###################################################################### 1163# 1164# OpenSSL interface 1165# 1166my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") 1167 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 1168my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); 1169 1170if ($ecb) { 1171$code.=<<___; 1172.globl bsaes_ecb_encrypt_blocks 1173.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent 1174.align 16 1175bsaes_ecb_encrypt_blocks: 1176.cfi_startproc 1177 mov %rsp, %rax 1178.Lecb_enc_prologue: 1179 push %rbp 1180.cfi_push %rbp 1181 push %rbx 1182.cfi_push %rbx 1183 push %r12 1184.cfi_push %r12 1185 push %r13 1186.cfi_push %r13 1187 push %r14 1188.cfi_push %r14 1189 push %r15 1190.cfi_push %r15 1191 lea -0x48(%rsp),%rsp 1192.cfi_adjust_cfa_offset 0x48 1193___ 1194$code.=<<___ if ($win64); 1195 lea -0xa0(%rsp), %rsp 1196 movaps %xmm6, 0x40(%rsp) 1197 movaps %xmm7, 0x50(%rsp) 1198 movaps %xmm8, 0x60(%rsp) 1199 movaps %xmm9, 0x70(%rsp) 1200 movaps %xmm10, 0x80(%rsp) 1201 movaps %xmm11, 0x90(%rsp) 1202 movaps %xmm12, 0xa0(%rsp) 1203 movaps %xmm13, 0xb0(%rsp) 1204 movaps %xmm14, 0xc0(%rsp) 1205 movaps %xmm15, 0xd0(%rsp) 1206.Lecb_enc_body: 1207___ 1208$code.=<<___; 1209 mov %rsp,%rbp # backup %rsp 1210.cfi_def_cfa_register %rbp 1211 mov 240($arg4),%eax # rounds 1212 mov $arg1,$inp # backup arguments 1213 mov $arg2,$out 1214 mov $arg3,$len 1215 mov $arg4,$key 1216 cmp \$8,$arg3 1217 jb .Lecb_enc_short 1218 1219 mov %eax,%ebx # backup rounds 1220 shl \$7,%rax # 128 bytes per inner round key 1221 sub \$`128-32`,%rax # size of bit-sliced key schedule 1222 sub %rax,%rsp 1223 mov %rsp,%rax # pass key schedule 1224 mov $key,%rcx # pass key 1225 mov %ebx,%r10d # pass rounds 1226 call _bsaes_key_convert 1227 pxor %xmm6,%xmm7 # fix up last round key 1228 movdqa %xmm7,(%rax) # save last round key 1229 1230 sub \$8,$len 1231.Lecb_enc_loop: 1232 movdqu 0x00($inp), @XMM[0] # load input 1233 movdqu 0x10($inp), @XMM[1] 1234 movdqu 0x20($inp), @XMM[2] 1235 movdqu 0x30($inp), @XMM[3] 1236 movdqu 0x40($inp), @XMM[4] 1237 movdqu 0x50($inp), @XMM[5] 1238 mov %rsp, %rax # pass key schedule 1239 movdqu 0x60($inp), @XMM[6] 1240 mov %ebx,%r10d # pass rounds 1241 movdqu 0x70($inp), @XMM[7] 1242 lea 0x80($inp), $inp 1243 1244 call _bsaes_encrypt8 1245 1246 movdqu @XMM[0], 0x00($out) # write output 1247 movdqu @XMM[1], 0x10($out) 1248 movdqu @XMM[4], 0x20($out) 1249 movdqu @XMM[6], 0x30($out) 1250 movdqu @XMM[3], 0x40($out) 1251 movdqu @XMM[7], 0x50($out) 1252 movdqu @XMM[2], 0x60($out) 1253 movdqu @XMM[5], 0x70($out) 1254 lea 0x80($out), $out 1255 sub \$8,$len 1256 jnc .Lecb_enc_loop 1257 1258 add \$8,$len 1259 jz .Lecb_enc_done 1260 1261 movdqu 0x00($inp), @XMM[0] # load input 1262 mov %rsp, %rax # pass key schedule 1263 mov %ebx,%r10d # pass rounds 1264 cmp \$2,$len 1265 jb .Lecb_enc_one 1266 movdqu 0x10($inp), @XMM[1] 1267 je .Lecb_enc_two 1268 movdqu 0x20($inp), @XMM[2] 1269 cmp \$4,$len 1270 jb .Lecb_enc_three 1271 movdqu 0x30($inp), @XMM[3] 1272 je .Lecb_enc_four 1273 movdqu 0x40($inp), @XMM[4] 1274 cmp \$6,$len 1275 jb .Lecb_enc_five 1276 movdqu 0x50($inp), @XMM[5] 1277 je .Lecb_enc_six 1278 movdqu 0x60($inp), @XMM[6] 1279 call _bsaes_encrypt8 1280 movdqu @XMM[0], 0x00($out) # write output 1281 movdqu @XMM[1], 0x10($out) 1282 movdqu @XMM[4], 0x20($out) 1283 movdqu @XMM[6], 0x30($out) 1284 movdqu @XMM[3], 0x40($out) 1285 movdqu @XMM[7], 0x50($out) 1286 movdqu @XMM[2], 0x60($out) 1287 jmp .Lecb_enc_done 1288.align 16 1289.Lecb_enc_six: 1290 call _bsaes_encrypt8 1291 movdqu @XMM[0], 0x00($out) # write output 1292 movdqu @XMM[1], 0x10($out) 1293 movdqu @XMM[4], 0x20($out) 1294 movdqu @XMM[6], 0x30($out) 1295 movdqu @XMM[3], 0x40($out) 1296 movdqu @XMM[7], 0x50($out) 1297 jmp .Lecb_enc_done 1298.align 16 1299.Lecb_enc_five: 1300 call _bsaes_encrypt8 1301 movdqu @XMM[0], 0x00($out) # write output 1302 movdqu @XMM[1], 0x10($out) 1303 movdqu @XMM[4], 0x20($out) 1304 movdqu @XMM[6], 0x30($out) 1305 movdqu @XMM[3], 0x40($out) 1306 jmp .Lecb_enc_done 1307.align 16 1308.Lecb_enc_four: 1309 call _bsaes_encrypt8 1310 movdqu @XMM[0], 0x00($out) # write output 1311 movdqu @XMM[1], 0x10($out) 1312 movdqu @XMM[4], 0x20($out) 1313 movdqu @XMM[6], 0x30($out) 1314 jmp .Lecb_enc_done 1315.align 16 1316.Lecb_enc_three: 1317 call _bsaes_encrypt8 1318 movdqu @XMM[0], 0x00($out) # write output 1319 movdqu @XMM[1], 0x10($out) 1320 movdqu @XMM[4], 0x20($out) 1321 jmp .Lecb_enc_done 1322.align 16 1323.Lecb_enc_two: 1324 call _bsaes_encrypt8 1325 movdqu @XMM[0], 0x00($out) # write output 1326 movdqu @XMM[1], 0x10($out) 1327 jmp .Lecb_enc_done 1328.align 16 1329.Lecb_enc_one: 1330 call _bsaes_encrypt8 1331 movdqu @XMM[0], 0x00($out) # write output 1332 jmp .Lecb_enc_done 1333.align 16 1334.Lecb_enc_short: 1335 lea ($inp), $arg1 1336 lea ($out), $arg2 1337 lea ($key), $arg3 1338 call asm_AES_encrypt 1339 lea 16($inp), $inp 1340 lea 16($out), $out 1341 dec $len 1342 jnz .Lecb_enc_short 1343 1344.Lecb_enc_done: 1345 lea (%rsp),%rax 1346 pxor %xmm0, %xmm0 1347.Lecb_enc_bzero: # wipe key schedule [if any] 1348 movdqa %xmm0, 0x00(%rax) 1349 movdqa %xmm0, 0x10(%rax) 1350 lea 0x20(%rax), %rax 1351 cmp %rax, %rbp 1352 jb .Lecb_enc_bzero 1353 1354 lea 0x78(%rbp),%rax 1355.cfi_def_cfa %rax,8 1356___ 1357$code.=<<___ if ($win64); 1358 movaps 0x40(%rbp), %xmm6 1359 movaps 0x50(%rbp), %xmm7 1360 movaps 0x60(%rbp), %xmm8 1361 movaps 0x70(%rbp), %xmm9 1362 movaps 0x80(%rbp), %xmm10 1363 movaps 0x90(%rbp), %xmm11 1364 movaps 0xa0(%rbp), %xmm12 1365 movaps 0xb0(%rbp), %xmm13 1366 movaps 0xc0(%rbp), %xmm14 1367 movaps 0xd0(%rbp), %xmm15 1368 lea 0xa0(%rax), %rax 1369.Lecb_enc_tail: 1370___ 1371$code.=<<___; 1372 mov -48(%rax), %r15 1373.cfi_restore %r15 1374 mov -40(%rax), %r14 1375.cfi_restore %r14 1376 mov -32(%rax), %r13 1377.cfi_restore %r13 1378 mov -24(%rax), %r12 1379.cfi_restore %r12 1380 mov -16(%rax), %rbx 1381.cfi_restore %rbx 1382 mov -8(%rax), %rbp 1383.cfi_restore %rbp 1384 lea (%rax), %rsp # restore %rsp 1385.cfi_def_cfa_register %rsp 1386.Lecb_enc_epilogue: 1387 ret 1388.cfi_endproc 1389.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks 1390 1391.globl bsaes_ecb_decrypt_blocks 1392.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent 1393.align 16 1394bsaes_ecb_decrypt_blocks: 1395.cfi_startproc 1396 mov %rsp, %rax 1397.Lecb_dec_prologue: 1398 push %rbp 1399.cfi_push %rbp 1400 push %rbx 1401.cfi_push %rbx 1402 push %r12 1403.cfi_push %r12 1404 push %r13 1405.cfi_push %r13 1406 push %r14 1407.cfi_push %r14 1408 push %r15 1409.cfi_push %r15 1410 lea -0x48(%rsp),%rsp 1411.cfi_adjust_cfa_offset 0x48 1412___ 1413$code.=<<___ if ($win64); 1414 lea -0xa0(%rsp), %rsp 1415 movaps %xmm6, 0x40(%rsp) 1416 movaps %xmm7, 0x50(%rsp) 1417 movaps %xmm8, 0x60(%rsp) 1418 movaps %xmm9, 0x70(%rsp) 1419 movaps %xmm10, 0x80(%rsp) 1420 movaps %xmm11, 0x90(%rsp) 1421 movaps %xmm12, 0xa0(%rsp) 1422 movaps %xmm13, 0xb0(%rsp) 1423 movaps %xmm14, 0xc0(%rsp) 1424 movaps %xmm15, 0xd0(%rsp) 1425.Lecb_dec_body: 1426___ 1427$code.=<<___; 1428 mov %rsp,%rbp # backup %rsp 1429.cfi_def_cfa_register %rbp 1430 mov 240($arg4),%eax # rounds 1431 mov $arg1,$inp # backup arguments 1432 mov $arg2,$out 1433 mov $arg3,$len 1434 mov $arg4,$key 1435 cmp \$8,$arg3 1436 jb .Lecb_dec_short 1437 1438 mov %eax,%ebx # backup rounds 1439 shl \$7,%rax # 128 bytes per inner round key 1440 sub \$`128-32`,%rax # size of bit-sliced key schedule 1441 sub %rax,%rsp 1442 mov %rsp,%rax # pass key schedule 1443 mov $key,%rcx # pass key 1444 mov %ebx,%r10d # pass rounds 1445 call _bsaes_key_convert 1446 pxor (%rsp),%xmm7 # fix up 0 round key 1447 movdqa %xmm6,(%rax) # save last round key 1448 movdqa %xmm7,(%rsp) 1449 1450 sub \$8,$len 1451.Lecb_dec_loop: 1452 movdqu 0x00($inp), @XMM[0] # load input 1453 movdqu 0x10($inp), @XMM[1] 1454 movdqu 0x20($inp), @XMM[2] 1455 movdqu 0x30($inp), @XMM[3] 1456 movdqu 0x40($inp), @XMM[4] 1457 movdqu 0x50($inp), @XMM[5] 1458 mov %rsp, %rax # pass key schedule 1459 movdqu 0x60($inp), @XMM[6] 1460 mov %ebx,%r10d # pass rounds 1461 movdqu 0x70($inp), @XMM[7] 1462 lea 0x80($inp), $inp 1463 1464 call _bsaes_decrypt8 1465 1466 movdqu @XMM[0], 0x00($out) # write output 1467 movdqu @XMM[1], 0x10($out) 1468 movdqu @XMM[6], 0x20($out) 1469 movdqu @XMM[4], 0x30($out) 1470 movdqu @XMM[2], 0x40($out) 1471 movdqu @XMM[7], 0x50($out) 1472 movdqu @XMM[3], 0x60($out) 1473 movdqu @XMM[5], 0x70($out) 1474 lea 0x80($out), $out 1475 sub \$8,$len 1476 jnc .Lecb_dec_loop 1477 1478 add \$8,$len 1479 jz .Lecb_dec_done 1480 1481 movdqu 0x00($inp), @XMM[0] # load input 1482 mov %rsp, %rax # pass key schedule 1483 mov %ebx,%r10d # pass rounds 1484 cmp \$2,$len 1485 jb .Lecb_dec_one 1486 movdqu 0x10($inp), @XMM[1] 1487 je .Lecb_dec_two 1488 movdqu 0x20($inp), @XMM[2] 1489 cmp \$4,$len 1490 jb .Lecb_dec_three 1491 movdqu 0x30($inp), @XMM[3] 1492 je .Lecb_dec_four 1493 movdqu 0x40($inp), @XMM[4] 1494 cmp \$6,$len 1495 jb .Lecb_dec_five 1496 movdqu 0x50($inp), @XMM[5] 1497 je .Lecb_dec_six 1498 movdqu 0x60($inp), @XMM[6] 1499 call _bsaes_decrypt8 1500 movdqu @XMM[0], 0x00($out) # write output 1501 movdqu @XMM[1], 0x10($out) 1502 movdqu @XMM[6], 0x20($out) 1503 movdqu @XMM[4], 0x30($out) 1504 movdqu @XMM[2], 0x40($out) 1505 movdqu @XMM[7], 0x50($out) 1506 movdqu @XMM[3], 0x60($out) 1507 jmp .Lecb_dec_done 1508.align 16 1509.Lecb_dec_six: 1510 call _bsaes_decrypt8 1511 movdqu @XMM[0], 0x00($out) # write output 1512 movdqu @XMM[1], 0x10($out) 1513 movdqu @XMM[6], 0x20($out) 1514 movdqu @XMM[4], 0x30($out) 1515 movdqu @XMM[2], 0x40($out) 1516 movdqu @XMM[7], 0x50($out) 1517 jmp .Lecb_dec_done 1518.align 16 1519.Lecb_dec_five: 1520 call _bsaes_decrypt8 1521 movdqu @XMM[0], 0x00($out) # write output 1522 movdqu @XMM[1], 0x10($out) 1523 movdqu @XMM[6], 0x20($out) 1524 movdqu @XMM[4], 0x30($out) 1525 movdqu @XMM[2], 0x40($out) 1526 jmp .Lecb_dec_done 1527.align 16 1528.Lecb_dec_four: 1529 call _bsaes_decrypt8 1530 movdqu @XMM[0], 0x00($out) # write output 1531 movdqu @XMM[1], 0x10($out) 1532 movdqu @XMM[6], 0x20($out) 1533 movdqu @XMM[4], 0x30($out) 1534 jmp .Lecb_dec_done 1535.align 16 1536.Lecb_dec_three: 1537 call _bsaes_decrypt8 1538 movdqu @XMM[0], 0x00($out) # write output 1539 movdqu @XMM[1], 0x10($out) 1540 movdqu @XMM[6], 0x20($out) 1541 jmp .Lecb_dec_done 1542.align 16 1543.Lecb_dec_two: 1544 call _bsaes_decrypt8 1545 movdqu @XMM[0], 0x00($out) # write output 1546 movdqu @XMM[1], 0x10($out) 1547 jmp .Lecb_dec_done 1548.align 16 1549.Lecb_dec_one: 1550 call _bsaes_decrypt8 1551 movdqu @XMM[0], 0x00($out) # write output 1552 jmp .Lecb_dec_done 1553.align 16 1554.Lecb_dec_short: 1555 lea ($inp), $arg1 1556 lea ($out), $arg2 1557 lea ($key), $arg3 1558 call asm_AES_decrypt 1559 lea 16($inp), $inp 1560 lea 16($out), $out 1561 dec $len 1562 jnz .Lecb_dec_short 1563 1564.Lecb_dec_done: 1565 lea (%rsp),%rax 1566 pxor %xmm0, %xmm0 1567.Lecb_dec_bzero: # wipe key schedule [if any] 1568 movdqa %xmm0, 0x00(%rax) 1569 movdqa %xmm0, 0x10(%rax) 1570 lea 0x20(%rax), %rax 1571 cmp %rax, %rbp 1572 jb .Lecb_dec_bzero 1573 1574 lea 0x78(%rbp),%rax 1575.cfi_def_cfa %rax,8 1576___ 1577$code.=<<___ if ($win64); 1578 movaps 0x40(%rbp), %xmm6 1579 movaps 0x50(%rbp), %xmm7 1580 movaps 0x60(%rbp), %xmm8 1581 movaps 0x70(%rbp), %xmm9 1582 movaps 0x80(%rbp), %xmm10 1583 movaps 0x90(%rbp), %xmm11 1584 movaps 0xa0(%rbp), %xmm12 1585 movaps 0xb0(%rbp), %xmm13 1586 movaps 0xc0(%rbp), %xmm14 1587 movaps 0xd0(%rbp), %xmm15 1588 lea 0xa0(%rax), %rax 1589.Lecb_dec_tail: 1590___ 1591$code.=<<___; 1592 mov -48(%rax), %r15 1593.cfi_restore %r15 1594 mov -40(%rax), %r14 1595.cfi_restore %r14 1596 mov -32(%rax), %r13 1597.cfi_restore %r13 1598 mov -24(%rax), %r12 1599.cfi_restore %r12 1600 mov -16(%rax), %rbx 1601.cfi_restore %rbx 1602 mov -8(%rax), %rbp 1603.cfi_restore %rbp 1604 lea (%rax), %rsp # restore %rsp 1605.cfi_def_cfa_register %rsp 1606.Lecb_dec_epilogue: 1607 ret 1608.cfi_endproc 1609.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks 1610___ 1611} 1612$code.=<<___; 1613.extern asm_AES_cbc_encrypt 1614.globl ossl_bsaes_cbc_encrypt 1615.type ossl_bsaes_cbc_encrypt,\@abi-omnipotent 1616.align 16 1617ossl_bsaes_cbc_encrypt: 1618.cfi_startproc 1619 endbranch 1620___ 1621$code.=<<___ if ($win64); 1622 mov 48(%rsp),$arg6 # pull direction flag 1623___ 1624$code.=<<___; 1625 cmp \$0,$arg6 1626 jne asm_AES_cbc_encrypt 1627 cmp \$128,$arg3 1628 jb asm_AES_cbc_encrypt 1629 1630 mov %rsp, %rax 1631.Lcbc_dec_prologue: 1632 push %rbp 1633.cfi_push %rbp 1634 push %rbx 1635.cfi_push %rbx 1636 push %r12 1637.cfi_push %r12 1638 push %r13 1639.cfi_push %r13 1640 push %r14 1641.cfi_push %r14 1642 push %r15 1643.cfi_push %r15 1644 lea -0x48(%rsp), %rsp 1645.cfi_adjust_cfa_offset 0x48 1646___ 1647$code.=<<___ if ($win64); 1648 mov 0xa0(%rsp),$arg5 # pull ivp 1649 lea -0xa0(%rsp), %rsp 1650 movaps %xmm6, 0x40(%rsp) 1651 movaps %xmm7, 0x50(%rsp) 1652 movaps %xmm8, 0x60(%rsp) 1653 movaps %xmm9, 0x70(%rsp) 1654 movaps %xmm10, 0x80(%rsp) 1655 movaps %xmm11, 0x90(%rsp) 1656 movaps %xmm12, 0xa0(%rsp) 1657 movaps %xmm13, 0xb0(%rsp) 1658 movaps %xmm14, 0xc0(%rsp) 1659 movaps %xmm15, 0xd0(%rsp) 1660.Lcbc_dec_body: 1661___ 1662$code.=<<___; 1663 mov %rsp, %rbp # backup %rsp 1664.cfi_def_cfa_register %rbp 1665 mov 240($arg4), %eax # rounds 1666 mov $arg1, $inp # backup arguments 1667 mov $arg2, $out 1668 mov $arg3, $len 1669 mov $arg4, $key 1670 mov $arg5, %rbx 1671 shr \$4, $len # bytes to blocks 1672 1673 mov %eax, %edx # rounds 1674 shl \$7, %rax # 128 bytes per inner round key 1675 sub \$`128-32`, %rax # size of bit-sliced key schedule 1676 sub %rax, %rsp 1677 1678 mov %rsp, %rax # pass key schedule 1679 mov $key, %rcx # pass key 1680 mov %edx, %r10d # pass rounds 1681 call _bsaes_key_convert 1682 pxor (%rsp),%xmm7 # fix up 0 round key 1683 movdqa %xmm6,(%rax) # save last round key 1684 movdqa %xmm7,(%rsp) 1685 1686 movdqu (%rbx), @XMM[15] # load IV 1687 sub \$8,$len 1688.Lcbc_dec_loop: 1689 movdqu 0x00($inp), @XMM[0] # load input 1690 movdqu 0x10($inp), @XMM[1] 1691 movdqu 0x20($inp), @XMM[2] 1692 movdqu 0x30($inp), @XMM[3] 1693 movdqu 0x40($inp), @XMM[4] 1694 movdqu 0x50($inp), @XMM[5] 1695 mov %rsp, %rax # pass key schedule 1696 movdqu 0x60($inp), @XMM[6] 1697 mov %edx,%r10d # pass rounds 1698 movdqu 0x70($inp), @XMM[7] 1699 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1700 1701 call _bsaes_decrypt8 1702 1703 pxor 0x20(%rbp), @XMM[0] # ^= IV 1704 movdqu 0x00($inp), @XMM[8] # re-load input 1705 movdqu 0x10($inp), @XMM[9] 1706 pxor @XMM[8], @XMM[1] 1707 movdqu 0x20($inp), @XMM[10] 1708 pxor @XMM[9], @XMM[6] 1709 movdqu 0x30($inp), @XMM[11] 1710 pxor @XMM[10], @XMM[4] 1711 movdqu 0x40($inp), @XMM[12] 1712 pxor @XMM[11], @XMM[2] 1713 movdqu 0x50($inp), @XMM[13] 1714 pxor @XMM[12], @XMM[7] 1715 movdqu 0x60($inp), @XMM[14] 1716 pxor @XMM[13], @XMM[3] 1717 movdqu 0x70($inp), @XMM[15] # IV 1718 pxor @XMM[14], @XMM[5] 1719 movdqu @XMM[0], 0x00($out) # write output 1720 lea 0x80($inp), $inp 1721 movdqu @XMM[1], 0x10($out) 1722 movdqu @XMM[6], 0x20($out) 1723 movdqu @XMM[4], 0x30($out) 1724 movdqu @XMM[2], 0x40($out) 1725 movdqu @XMM[7], 0x50($out) 1726 movdqu @XMM[3], 0x60($out) 1727 movdqu @XMM[5], 0x70($out) 1728 lea 0x80($out), $out 1729 sub \$8,$len 1730 jnc .Lcbc_dec_loop 1731 1732 add \$8,$len 1733 jz .Lcbc_dec_done 1734 1735 movdqu 0x00($inp), @XMM[0] # load input 1736 mov %rsp, %rax # pass key schedule 1737 mov %edx, %r10d # pass rounds 1738 cmp \$2,$len 1739 jb .Lcbc_dec_one 1740 movdqu 0x10($inp), @XMM[1] 1741 je .Lcbc_dec_two 1742 movdqu 0x20($inp), @XMM[2] 1743 cmp \$4,$len 1744 jb .Lcbc_dec_three 1745 movdqu 0x30($inp), @XMM[3] 1746 je .Lcbc_dec_four 1747 movdqu 0x40($inp), @XMM[4] 1748 cmp \$6,$len 1749 jb .Lcbc_dec_five 1750 movdqu 0x50($inp), @XMM[5] 1751 je .Lcbc_dec_six 1752 movdqu 0x60($inp), @XMM[6] 1753 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1754 call _bsaes_decrypt8 1755 pxor 0x20(%rbp), @XMM[0] # ^= IV 1756 movdqu 0x00($inp), @XMM[8] # re-load input 1757 movdqu 0x10($inp), @XMM[9] 1758 pxor @XMM[8], @XMM[1] 1759 movdqu 0x20($inp), @XMM[10] 1760 pxor @XMM[9], @XMM[6] 1761 movdqu 0x30($inp), @XMM[11] 1762 pxor @XMM[10], @XMM[4] 1763 movdqu 0x40($inp), @XMM[12] 1764 pxor @XMM[11], @XMM[2] 1765 movdqu 0x50($inp), @XMM[13] 1766 pxor @XMM[12], @XMM[7] 1767 movdqu 0x60($inp), @XMM[15] # IV 1768 pxor @XMM[13], @XMM[3] 1769 movdqu @XMM[0], 0x00($out) # write output 1770 movdqu @XMM[1], 0x10($out) 1771 movdqu @XMM[6], 0x20($out) 1772 movdqu @XMM[4], 0x30($out) 1773 movdqu @XMM[2], 0x40($out) 1774 movdqu @XMM[7], 0x50($out) 1775 movdqu @XMM[3], 0x60($out) 1776 jmp .Lcbc_dec_done 1777.align 16 1778.Lcbc_dec_six: 1779 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1780 call _bsaes_decrypt8 1781 pxor 0x20(%rbp), @XMM[0] # ^= IV 1782 movdqu 0x00($inp), @XMM[8] # re-load input 1783 movdqu 0x10($inp), @XMM[9] 1784 pxor @XMM[8], @XMM[1] 1785 movdqu 0x20($inp), @XMM[10] 1786 pxor @XMM[9], @XMM[6] 1787 movdqu 0x30($inp), @XMM[11] 1788 pxor @XMM[10], @XMM[4] 1789 movdqu 0x40($inp), @XMM[12] 1790 pxor @XMM[11], @XMM[2] 1791 movdqu 0x50($inp), @XMM[15] # IV 1792 pxor @XMM[12], @XMM[7] 1793 movdqu @XMM[0], 0x00($out) # write output 1794 movdqu @XMM[1], 0x10($out) 1795 movdqu @XMM[6], 0x20($out) 1796 movdqu @XMM[4], 0x30($out) 1797 movdqu @XMM[2], 0x40($out) 1798 movdqu @XMM[7], 0x50($out) 1799 jmp .Lcbc_dec_done 1800.align 16 1801.Lcbc_dec_five: 1802 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1803 call _bsaes_decrypt8 1804 pxor 0x20(%rbp), @XMM[0] # ^= IV 1805 movdqu 0x00($inp), @XMM[8] # re-load input 1806 movdqu 0x10($inp), @XMM[9] 1807 pxor @XMM[8], @XMM[1] 1808 movdqu 0x20($inp), @XMM[10] 1809 pxor @XMM[9], @XMM[6] 1810 movdqu 0x30($inp), @XMM[11] 1811 pxor @XMM[10], @XMM[4] 1812 movdqu 0x40($inp), @XMM[15] # IV 1813 pxor @XMM[11], @XMM[2] 1814 movdqu @XMM[0], 0x00($out) # write output 1815 movdqu @XMM[1], 0x10($out) 1816 movdqu @XMM[6], 0x20($out) 1817 movdqu @XMM[4], 0x30($out) 1818 movdqu @XMM[2], 0x40($out) 1819 jmp .Lcbc_dec_done 1820.align 16 1821.Lcbc_dec_four: 1822 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1823 call _bsaes_decrypt8 1824 pxor 0x20(%rbp), @XMM[0] # ^= IV 1825 movdqu 0x00($inp), @XMM[8] # re-load input 1826 movdqu 0x10($inp), @XMM[9] 1827 pxor @XMM[8], @XMM[1] 1828 movdqu 0x20($inp), @XMM[10] 1829 pxor @XMM[9], @XMM[6] 1830 movdqu 0x30($inp), @XMM[15] # IV 1831 pxor @XMM[10], @XMM[4] 1832 movdqu @XMM[0], 0x00($out) # write output 1833 movdqu @XMM[1], 0x10($out) 1834 movdqu @XMM[6], 0x20($out) 1835 movdqu @XMM[4], 0x30($out) 1836 jmp .Lcbc_dec_done 1837.align 16 1838.Lcbc_dec_three: 1839 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1840 call _bsaes_decrypt8 1841 pxor 0x20(%rbp), @XMM[0] # ^= IV 1842 movdqu 0x00($inp), @XMM[8] # re-load input 1843 movdqu 0x10($inp), @XMM[9] 1844 pxor @XMM[8], @XMM[1] 1845 movdqu 0x20($inp), @XMM[15] # IV 1846 pxor @XMM[9], @XMM[6] 1847 movdqu @XMM[0], 0x00($out) # write output 1848 movdqu @XMM[1], 0x10($out) 1849 movdqu @XMM[6], 0x20($out) 1850 jmp .Lcbc_dec_done 1851.align 16 1852.Lcbc_dec_two: 1853 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1854 call _bsaes_decrypt8 1855 pxor 0x20(%rbp), @XMM[0] # ^= IV 1856 movdqu 0x00($inp), @XMM[8] # re-load input 1857 movdqu 0x10($inp), @XMM[15] # IV 1858 pxor @XMM[8], @XMM[1] 1859 movdqu @XMM[0], 0x00($out) # write output 1860 movdqu @XMM[1], 0x10($out) 1861 jmp .Lcbc_dec_done 1862.align 16 1863.Lcbc_dec_one: 1864 lea ($inp), $arg1 1865 lea 0x20(%rbp), $arg2 # buffer output 1866 lea ($key), $arg3 1867 call asm_AES_decrypt # doesn't touch %xmm 1868 pxor 0x20(%rbp), @XMM[15] # ^= IV 1869 movdqu @XMM[15], ($out) # write output 1870 movdqa @XMM[0], @XMM[15] # IV 1871 1872.Lcbc_dec_done: 1873 movdqu @XMM[15], (%rbx) # return IV 1874 lea (%rsp), %rax 1875 pxor %xmm0, %xmm0 1876.Lcbc_dec_bzero: # wipe key schedule [if any] 1877 movdqa %xmm0, 0x00(%rax) 1878 movdqa %xmm0, 0x10(%rax) 1879 lea 0x20(%rax), %rax 1880 cmp %rax, %rbp 1881 ja .Lcbc_dec_bzero 1882 1883 lea 0x78(%rbp),%rax 1884.cfi_def_cfa %rax,8 1885___ 1886$code.=<<___ if ($win64); 1887 movaps 0x40(%rbp), %xmm6 1888 movaps 0x50(%rbp), %xmm7 1889 movaps 0x60(%rbp), %xmm8 1890 movaps 0x70(%rbp), %xmm9 1891 movaps 0x80(%rbp), %xmm10 1892 movaps 0x90(%rbp), %xmm11 1893 movaps 0xa0(%rbp), %xmm12 1894 movaps 0xb0(%rbp), %xmm13 1895 movaps 0xc0(%rbp), %xmm14 1896 movaps 0xd0(%rbp), %xmm15 1897 lea 0xa0(%rax), %rax 1898.Lcbc_dec_tail: 1899___ 1900$code.=<<___; 1901 mov -48(%rax), %r15 1902.cfi_restore %r15 1903 mov -40(%rax), %r14 1904.cfi_restore %r14 1905 mov -32(%rax), %r13 1906.cfi_restore %r13 1907 mov -24(%rax), %r12 1908.cfi_restore %r12 1909 mov -16(%rax), %rbx 1910.cfi_restore %rbx 1911 mov -8(%rax), %rbp 1912.cfi_restore %rbp 1913 lea (%rax), %rsp # restore %rsp 1914.cfi_def_cfa_register %rsp 1915.Lcbc_dec_epilogue: 1916 ret 1917.cfi_endproc 1918.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt 1919 1920.globl ossl_bsaes_ctr32_encrypt_blocks 1921.type ossl_bsaes_ctr32_encrypt_blocks,\@abi-omnipotent 1922.align 16 1923ossl_bsaes_ctr32_encrypt_blocks: 1924.cfi_startproc 1925 endbranch 1926 mov %rsp, %rax 1927.Lctr_enc_prologue: 1928 push %rbp 1929.cfi_push %rbp 1930 push %rbx 1931.cfi_push %rbx 1932 push %r12 1933.cfi_push %r12 1934 push %r13 1935.cfi_push %r13 1936 push %r14 1937.cfi_push %r14 1938 push %r15 1939.cfi_push %r15 1940 lea -0x48(%rsp), %rsp 1941.cfi_adjust_cfa_offset 0x48 1942___ 1943$code.=<<___ if ($win64); 1944 mov 0xa0(%rsp),$arg5 # pull ivp 1945 lea -0xa0(%rsp), %rsp 1946 movaps %xmm6, 0x40(%rsp) 1947 movaps %xmm7, 0x50(%rsp) 1948 movaps %xmm8, 0x60(%rsp) 1949 movaps %xmm9, 0x70(%rsp) 1950 movaps %xmm10, 0x80(%rsp) 1951 movaps %xmm11, 0x90(%rsp) 1952 movaps %xmm12, 0xa0(%rsp) 1953 movaps %xmm13, 0xb0(%rsp) 1954 movaps %xmm14, 0xc0(%rsp) 1955 movaps %xmm15, 0xd0(%rsp) 1956.Lctr_enc_body: 1957___ 1958$code.=<<___; 1959 mov %rsp, %rbp # backup %rsp 1960.cfi_def_cfa_register %rbp 1961 movdqu ($arg5), %xmm0 # load counter 1962 mov 240($arg4), %eax # rounds 1963 mov $arg1, $inp # backup arguments 1964 mov $arg2, $out 1965 mov $arg3, $len 1966 mov $arg4, $key 1967 movdqa %xmm0, 0x20(%rbp) # copy counter 1968 cmp \$8, $arg3 1969 jb .Lctr_enc_short 1970 1971 mov %eax, %ebx # rounds 1972 shl \$7, %rax # 128 bytes per inner round key 1973 sub \$`128-32`, %rax # size of bit-sliced key schedule 1974 sub %rax, %rsp 1975 1976 mov %rsp, %rax # pass key schedule 1977 mov $key, %rcx # pass key 1978 mov %ebx, %r10d # pass rounds 1979 call _bsaes_key_convert 1980 pxor %xmm6,%xmm7 # fix up last round key 1981 movdqa %xmm7,(%rax) # save last round key 1982 1983 movdqa (%rsp), @XMM[9] # load round0 key 1984 lea .LADD1(%rip), %r11 1985 movdqa 0x20(%rbp), @XMM[0] # counter copy 1986 movdqa -0x20(%r11), @XMM[8] # .LSWPUP 1987 pshufb @XMM[8], @XMM[9] # byte swap upper part 1988 pshufb @XMM[8], @XMM[0] 1989 movdqa @XMM[9], (%rsp) # save adjusted round0 key 1990 jmp .Lctr_enc_loop 1991.align 16 1992.Lctr_enc_loop: 1993 movdqa @XMM[0], 0x20(%rbp) # save counter 1994 movdqa @XMM[0], @XMM[1] # prepare 8 counter values 1995 movdqa @XMM[0], @XMM[2] 1996 paddd 0x00(%r11), @XMM[1] # .LADD1 1997 movdqa @XMM[0], @XMM[3] 1998 paddd 0x10(%r11), @XMM[2] # .LADD2 1999 movdqa @XMM[0], @XMM[4] 2000 paddd 0x20(%r11), @XMM[3] # .LADD3 2001 movdqa @XMM[0], @XMM[5] 2002 paddd 0x30(%r11), @XMM[4] # .LADD4 2003 movdqa @XMM[0], @XMM[6] 2004 paddd 0x40(%r11), @XMM[5] # .LADD5 2005 movdqa @XMM[0], @XMM[7] 2006 paddd 0x50(%r11), @XMM[6] # .LADD6 2007 paddd 0x60(%r11), @XMM[7] # .LADD7 2008 2009 # Borrow prologue from _bsaes_encrypt8 to use the opportunity 2010 # to flip byte order in 32-bit counter 2011 movdqa (%rsp), @XMM[9] # round 0 key 2012 lea 0x10(%rsp), %rax # pass key schedule 2013 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR 2014 pxor @XMM[9], @XMM[0] # xor with round0 key 2015 pxor @XMM[9], @XMM[1] 2016 pxor @XMM[9], @XMM[2] 2017 pxor @XMM[9], @XMM[3] 2018 pshufb @XMM[8], @XMM[0] 2019 pshufb @XMM[8], @XMM[1] 2020 pxor @XMM[9], @XMM[4] 2021 pxor @XMM[9], @XMM[5] 2022 pshufb @XMM[8], @XMM[2] 2023 pshufb @XMM[8], @XMM[3] 2024 pxor @XMM[9], @XMM[6] 2025 pxor @XMM[9], @XMM[7] 2026 pshufb @XMM[8], @XMM[4] 2027 pshufb @XMM[8], @XMM[5] 2028 pshufb @XMM[8], @XMM[6] 2029 pshufb @XMM[8], @XMM[7] 2030 lea .LBS0(%rip), %r11 # constants table 2031 mov %ebx,%r10d # pass rounds 2032 2033 call _bsaes_encrypt8_bitslice 2034 2035 sub \$8,$len 2036 jc .Lctr_enc_loop_done 2037 2038 movdqu 0x00($inp), @XMM[8] # load input 2039 movdqu 0x10($inp), @XMM[9] 2040 movdqu 0x20($inp), @XMM[10] 2041 movdqu 0x30($inp), @XMM[11] 2042 movdqu 0x40($inp), @XMM[12] 2043 movdqu 0x50($inp), @XMM[13] 2044 movdqu 0x60($inp), @XMM[14] 2045 movdqu 0x70($inp), @XMM[15] 2046 lea 0x80($inp),$inp 2047 pxor @XMM[0], @XMM[8] 2048 movdqa 0x20(%rbp), @XMM[0] # load counter 2049 pxor @XMM[9], @XMM[1] 2050 movdqu @XMM[8], 0x00($out) # write output 2051 pxor @XMM[10], @XMM[4] 2052 movdqu @XMM[1], 0x10($out) 2053 pxor @XMM[11], @XMM[6] 2054 movdqu @XMM[4], 0x20($out) 2055 pxor @XMM[12], @XMM[3] 2056 movdqu @XMM[6], 0x30($out) 2057 pxor @XMM[13], @XMM[7] 2058 movdqu @XMM[3], 0x40($out) 2059 pxor @XMM[14], @XMM[2] 2060 movdqu @XMM[7], 0x50($out) 2061 pxor @XMM[15], @XMM[5] 2062 movdqu @XMM[2], 0x60($out) 2063 lea .LADD1(%rip), %r11 2064 movdqu @XMM[5], 0x70($out) 2065 lea 0x80($out), $out 2066 paddd 0x70(%r11), @XMM[0] # .LADD8 2067 jnz .Lctr_enc_loop 2068 2069 jmp .Lctr_enc_done 2070.align 16 2071.Lctr_enc_loop_done: 2072 add \$8, $len 2073 movdqu 0x00($inp), @XMM[8] # load input 2074 pxor @XMM[8], @XMM[0] 2075 movdqu @XMM[0], 0x00($out) # write output 2076 cmp \$2,$len 2077 jb .Lctr_enc_done 2078 movdqu 0x10($inp), @XMM[9] 2079 pxor @XMM[9], @XMM[1] 2080 movdqu @XMM[1], 0x10($out) 2081 je .Lctr_enc_done 2082 movdqu 0x20($inp), @XMM[10] 2083 pxor @XMM[10], @XMM[4] 2084 movdqu @XMM[4], 0x20($out) 2085 cmp \$4,$len 2086 jb .Lctr_enc_done 2087 movdqu 0x30($inp), @XMM[11] 2088 pxor @XMM[11], @XMM[6] 2089 movdqu @XMM[6], 0x30($out) 2090 je .Lctr_enc_done 2091 movdqu 0x40($inp), @XMM[12] 2092 pxor @XMM[12], @XMM[3] 2093 movdqu @XMM[3], 0x40($out) 2094 cmp \$6,$len 2095 jb .Lctr_enc_done 2096 movdqu 0x50($inp), @XMM[13] 2097 pxor @XMM[13], @XMM[7] 2098 movdqu @XMM[7], 0x50($out) 2099 je .Lctr_enc_done 2100 movdqu 0x60($inp), @XMM[14] 2101 pxor @XMM[14], @XMM[2] 2102 movdqu @XMM[2], 0x60($out) 2103 jmp .Lctr_enc_done 2104 2105.align 16 2106.Lctr_enc_short: 2107 lea 0x20(%rbp), $arg1 2108 lea 0x30(%rbp), $arg2 2109 lea ($key), $arg3 2110 call asm_AES_encrypt 2111 movdqu ($inp), @XMM[1] 2112 lea 16($inp), $inp 2113 mov 0x2c(%rbp), %eax # load 32-bit counter 2114 bswap %eax 2115 pxor 0x30(%rbp), @XMM[1] 2116 inc %eax # increment 2117 movdqu @XMM[1], ($out) 2118 bswap %eax 2119 lea 16($out), $out 2120 mov %eax, 0x2c(%rsp) # save 32-bit counter 2121 dec $len 2122 jnz .Lctr_enc_short 2123 2124.Lctr_enc_done: 2125 lea (%rsp), %rax 2126 pxor %xmm0, %xmm0 2127.Lctr_enc_bzero: # wipe key schedule [if any] 2128 movdqa %xmm0, 0x00(%rax) 2129 movdqa %xmm0, 0x10(%rax) 2130 lea 0x20(%rax), %rax 2131 cmp %rax, %rbp 2132 ja .Lctr_enc_bzero 2133 2134 lea 0x78(%rbp),%rax 2135.cfi_def_cfa %rax,8 2136___ 2137$code.=<<___ if ($win64); 2138 movaps 0x40(%rbp), %xmm6 2139 movaps 0x50(%rbp), %xmm7 2140 movaps 0x60(%rbp), %xmm8 2141 movaps 0x70(%rbp), %xmm9 2142 movaps 0x80(%rbp), %xmm10 2143 movaps 0x90(%rbp), %xmm11 2144 movaps 0xa0(%rbp), %xmm12 2145 movaps 0xb0(%rbp), %xmm13 2146 movaps 0xc0(%rbp), %xmm14 2147 movaps 0xd0(%rbp), %xmm15 2148 lea 0xa0(%rax), %rax 2149.Lctr_enc_tail: 2150___ 2151$code.=<<___; 2152 mov -48(%rax), %r15 2153.cfi_restore %r15 2154 mov -40(%rax), %r14 2155.cfi_restore %r14 2156 mov -32(%rax), %r13 2157.cfi_restore %r13 2158 mov -24(%rax), %r12 2159.cfi_restore %r12 2160 mov -16(%rax), %rbx 2161.cfi_restore %rbx 2162 mov -8(%rax), %rbp 2163.cfi_restore %rbp 2164 lea (%rax), %rsp # restore %rsp 2165.cfi_def_cfa_register %rsp 2166.Lctr_enc_epilogue: 2167 ret 2168.cfi_endproc 2169.size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks 2170___ 2171###################################################################### 2172# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, 2173# const AES_KEY *key1, const AES_KEY *key2, 2174# const unsigned char iv[16]); 2175# 2176my ($twmask,$twres,$twtmp)=@XMM[13..15]; 2177$arg6=~s/d$//; 2178 2179$code.=<<___; 2180.globl ossl_bsaes_xts_encrypt 2181.type ossl_bsaes_xts_encrypt,\@abi-omnipotent 2182.align 16 2183ossl_bsaes_xts_encrypt: 2184.cfi_startproc 2185 mov %rsp, %rax 2186.Lxts_enc_prologue: 2187 push %rbp 2188.cfi_push %rbp 2189 push %rbx 2190.cfi_push %rbx 2191 push %r12 2192.cfi_push %r12 2193 push %r13 2194.cfi_push %r13 2195 push %r14 2196.cfi_push %r14 2197 push %r15 2198.cfi_push %r15 2199 lea -0x48(%rsp), %rsp 2200.cfi_adjust_cfa_offset 0x48 2201___ 2202$code.=<<___ if ($win64); 2203 mov 0xa0(%rsp),$arg5 # pull key2 2204 mov 0xa8(%rsp),$arg6 # pull ivp 2205 lea -0xa0(%rsp), %rsp 2206 movaps %xmm6, 0x40(%rsp) 2207 movaps %xmm7, 0x50(%rsp) 2208 movaps %xmm8, 0x60(%rsp) 2209 movaps %xmm9, 0x70(%rsp) 2210 movaps %xmm10, 0x80(%rsp) 2211 movaps %xmm11, 0x90(%rsp) 2212 movaps %xmm12, 0xa0(%rsp) 2213 movaps %xmm13, 0xb0(%rsp) 2214 movaps %xmm14, 0xc0(%rsp) 2215 movaps %xmm15, 0xd0(%rsp) 2216.Lxts_enc_body: 2217___ 2218$code.=<<___; 2219 mov %rsp, %rbp # backup %rsp 2220.cfi_def_cfa_register %rbp 2221 mov $arg1, $inp # backup arguments 2222 mov $arg2, $out 2223 mov $arg3, $len 2224 mov $arg4, $key 2225 2226 lea ($arg6), $arg1 2227 lea 0x20(%rbp), $arg2 2228 lea ($arg5), $arg3 2229 call asm_AES_encrypt # generate initial tweak 2230 2231 mov 240($key), %eax # rounds 2232 mov $len, %rbx # backup $len 2233 2234 mov %eax, %edx # rounds 2235 shl \$7, %rax # 128 bytes per inner round key 2236 sub \$`128-32`, %rax # size of bit-sliced key schedule 2237 sub %rax, %rsp 2238 2239 mov %rsp, %rax # pass key schedule 2240 mov $key, %rcx # pass key 2241 mov %edx, %r10d # pass rounds 2242 call _bsaes_key_convert 2243 pxor %xmm6, %xmm7 # fix up last round key 2244 movdqa %xmm7, (%rax) # save last round key 2245 2246 and \$-16, $len 2247 sub \$0x80, %rsp # place for tweak[8] 2248 movdqa 0x20(%rbp), @XMM[7] # initial tweak 2249 2250 pxor $twtmp, $twtmp 2251 movdqa .Lxts_magic(%rip), $twmask 2252 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2253 2254 sub \$0x80, $len 2255 jc .Lxts_enc_short 2256 jmp .Lxts_enc_loop 2257 2258.align 16 2259.Lxts_enc_loop: 2260___ 2261 for ($i=0;$i<7;$i++) { 2262 $code.=<<___; 2263 pshufd \$0x13, $twtmp, $twres 2264 pxor $twtmp, $twtmp 2265 movdqa @XMM[7], @XMM[$i] 2266 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2267 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2268 pand $twmask, $twres # isolate carry and residue 2269 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2270 pxor $twres, @XMM[7] 2271___ 2272 $code.=<<___ if ($i>=1); 2273 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2274___ 2275 $code.=<<___ if ($i>=2); 2276 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2277___ 2278 } 2279$code.=<<___; 2280 movdqu 0x60($inp), @XMM[8+6] 2281 pxor @XMM[8+5], @XMM[5] 2282 movdqu 0x70($inp), @XMM[8+7] 2283 lea 0x80($inp), $inp 2284 movdqa @XMM[7], 0x70(%rsp) 2285 pxor @XMM[8+6], @XMM[6] 2286 lea 0x80(%rsp), %rax # pass key schedule 2287 pxor @XMM[8+7], @XMM[7] 2288 mov %edx, %r10d # pass rounds 2289 2290 call _bsaes_encrypt8 2291 2292 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2293 pxor 0x10(%rsp), @XMM[1] 2294 movdqu @XMM[0], 0x00($out) # write output 2295 pxor 0x20(%rsp), @XMM[4] 2296 movdqu @XMM[1], 0x10($out) 2297 pxor 0x30(%rsp), @XMM[6] 2298 movdqu @XMM[4], 0x20($out) 2299 pxor 0x40(%rsp), @XMM[3] 2300 movdqu @XMM[6], 0x30($out) 2301 pxor 0x50(%rsp), @XMM[7] 2302 movdqu @XMM[3], 0x40($out) 2303 pxor 0x60(%rsp), @XMM[2] 2304 movdqu @XMM[7], 0x50($out) 2305 pxor 0x70(%rsp), @XMM[5] 2306 movdqu @XMM[2], 0x60($out) 2307 movdqu @XMM[5], 0x70($out) 2308 lea 0x80($out), $out 2309 2310 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2311 pxor $twtmp, $twtmp 2312 movdqa .Lxts_magic(%rip), $twmask 2313 pcmpgtd @XMM[7], $twtmp 2314 pshufd \$0x13, $twtmp, $twres 2315 pxor $twtmp, $twtmp 2316 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2317 pand $twmask, $twres # isolate carry and residue 2318 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2319 pxor $twres, @XMM[7] 2320 2321 sub \$0x80,$len 2322 jnc .Lxts_enc_loop 2323 2324.Lxts_enc_short: 2325 add \$0x80, $len 2326 jz .Lxts_enc_done 2327___ 2328 for ($i=0;$i<7;$i++) { 2329 $code.=<<___; 2330 pshufd \$0x13, $twtmp, $twres 2331 pxor $twtmp, $twtmp 2332 movdqa @XMM[7], @XMM[$i] 2333 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2334 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2335 pand $twmask, $twres # isolate carry and residue 2336 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2337 pxor $twres, @XMM[7] 2338___ 2339 $code.=<<___ if ($i>=1); 2340 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2341 cmp \$`0x10*$i`,$len 2342 je .Lxts_enc_$i 2343___ 2344 $code.=<<___ if ($i>=2); 2345 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2346___ 2347 } 2348$code.=<<___; 2349 movdqu 0x60($inp), @XMM[8+6] 2350 pxor @XMM[8+5], @XMM[5] 2351 movdqa @XMM[7], 0x70(%rsp) 2352 lea 0x70($inp), $inp 2353 pxor @XMM[8+6], @XMM[6] 2354 lea 0x80(%rsp), %rax # pass key schedule 2355 mov %edx, %r10d # pass rounds 2356 2357 call _bsaes_encrypt8 2358 2359 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2360 pxor 0x10(%rsp), @XMM[1] 2361 movdqu @XMM[0], 0x00($out) # write output 2362 pxor 0x20(%rsp), @XMM[4] 2363 movdqu @XMM[1], 0x10($out) 2364 pxor 0x30(%rsp), @XMM[6] 2365 movdqu @XMM[4], 0x20($out) 2366 pxor 0x40(%rsp), @XMM[3] 2367 movdqu @XMM[6], 0x30($out) 2368 pxor 0x50(%rsp), @XMM[7] 2369 movdqu @XMM[3], 0x40($out) 2370 pxor 0x60(%rsp), @XMM[2] 2371 movdqu @XMM[7], 0x50($out) 2372 movdqu @XMM[2], 0x60($out) 2373 lea 0x70($out), $out 2374 2375 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2376 jmp .Lxts_enc_done 2377.align 16 2378.Lxts_enc_6: 2379 pxor @XMM[8+4], @XMM[4] 2380 lea 0x60($inp), $inp 2381 pxor @XMM[8+5], @XMM[5] 2382 lea 0x80(%rsp), %rax # pass key schedule 2383 mov %edx, %r10d # pass rounds 2384 2385 call _bsaes_encrypt8 2386 2387 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2388 pxor 0x10(%rsp), @XMM[1] 2389 movdqu @XMM[0], 0x00($out) # write output 2390 pxor 0x20(%rsp), @XMM[4] 2391 movdqu @XMM[1], 0x10($out) 2392 pxor 0x30(%rsp), @XMM[6] 2393 movdqu @XMM[4], 0x20($out) 2394 pxor 0x40(%rsp), @XMM[3] 2395 movdqu @XMM[6], 0x30($out) 2396 pxor 0x50(%rsp), @XMM[7] 2397 movdqu @XMM[3], 0x40($out) 2398 movdqu @XMM[7], 0x50($out) 2399 lea 0x60($out), $out 2400 2401 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2402 jmp .Lxts_enc_done 2403.align 16 2404.Lxts_enc_5: 2405 pxor @XMM[8+3], @XMM[3] 2406 lea 0x50($inp), $inp 2407 pxor @XMM[8+4], @XMM[4] 2408 lea 0x80(%rsp), %rax # pass key schedule 2409 mov %edx, %r10d # pass rounds 2410 2411 call _bsaes_encrypt8 2412 2413 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2414 pxor 0x10(%rsp), @XMM[1] 2415 movdqu @XMM[0], 0x00($out) # write output 2416 pxor 0x20(%rsp), @XMM[4] 2417 movdqu @XMM[1], 0x10($out) 2418 pxor 0x30(%rsp), @XMM[6] 2419 movdqu @XMM[4], 0x20($out) 2420 pxor 0x40(%rsp), @XMM[3] 2421 movdqu @XMM[6], 0x30($out) 2422 movdqu @XMM[3], 0x40($out) 2423 lea 0x50($out), $out 2424 2425 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2426 jmp .Lxts_enc_done 2427.align 16 2428.Lxts_enc_4: 2429 pxor @XMM[8+2], @XMM[2] 2430 lea 0x40($inp), $inp 2431 pxor @XMM[8+3], @XMM[3] 2432 lea 0x80(%rsp), %rax # pass key schedule 2433 mov %edx, %r10d # pass rounds 2434 2435 call _bsaes_encrypt8 2436 2437 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2438 pxor 0x10(%rsp), @XMM[1] 2439 movdqu @XMM[0], 0x00($out) # write output 2440 pxor 0x20(%rsp), @XMM[4] 2441 movdqu @XMM[1], 0x10($out) 2442 pxor 0x30(%rsp), @XMM[6] 2443 movdqu @XMM[4], 0x20($out) 2444 movdqu @XMM[6], 0x30($out) 2445 lea 0x40($out), $out 2446 2447 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2448 jmp .Lxts_enc_done 2449.align 16 2450.Lxts_enc_3: 2451 pxor @XMM[8+1], @XMM[1] 2452 lea 0x30($inp), $inp 2453 pxor @XMM[8+2], @XMM[2] 2454 lea 0x80(%rsp), %rax # pass key schedule 2455 mov %edx, %r10d # pass rounds 2456 2457 call _bsaes_encrypt8 2458 2459 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2460 pxor 0x10(%rsp), @XMM[1] 2461 movdqu @XMM[0], 0x00($out) # write output 2462 pxor 0x20(%rsp), @XMM[4] 2463 movdqu @XMM[1], 0x10($out) 2464 movdqu @XMM[4], 0x20($out) 2465 lea 0x30($out), $out 2466 2467 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2468 jmp .Lxts_enc_done 2469.align 16 2470.Lxts_enc_2: 2471 pxor @XMM[8+0], @XMM[0] 2472 lea 0x20($inp), $inp 2473 pxor @XMM[8+1], @XMM[1] 2474 lea 0x80(%rsp), %rax # pass key schedule 2475 mov %edx, %r10d # pass rounds 2476 2477 call _bsaes_encrypt8 2478 2479 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2480 pxor 0x10(%rsp), @XMM[1] 2481 movdqu @XMM[0], 0x00($out) # write output 2482 movdqu @XMM[1], 0x10($out) 2483 lea 0x20($out), $out 2484 2485 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2486 jmp .Lxts_enc_done 2487.align 16 2488.Lxts_enc_1: 2489 pxor @XMM[0], @XMM[8] 2490 lea 0x10($inp), $inp 2491 movdqa @XMM[8], 0x20(%rbp) 2492 lea 0x20(%rbp), $arg1 2493 lea 0x20(%rbp), $arg2 2494 lea ($key), $arg3 2495 call asm_AES_encrypt # doesn't touch %xmm 2496 pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2497 #pxor @XMM[8], @XMM[0] 2498 #lea 0x80(%rsp), %rax # pass key schedule 2499 #mov %edx, %r10d # pass rounds 2500 #call _bsaes_encrypt8 2501 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2502 movdqu @XMM[0], 0x00($out) # write output 2503 lea 0x10($out), $out 2504 2505 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2506 2507.Lxts_enc_done: 2508 and \$15, %ebx 2509 jz .Lxts_enc_ret 2510 mov $out, %rdx 2511 2512.Lxts_enc_steal: 2513 movzb ($inp), %eax 2514 movzb -16(%rdx), %ecx 2515 lea 1($inp), $inp 2516 mov %al, -16(%rdx) 2517 mov %cl, 0(%rdx) 2518 lea 1(%rdx), %rdx 2519 sub \$1,%ebx 2520 jnz .Lxts_enc_steal 2521 2522 movdqu -16($out), @XMM[0] 2523 lea 0x20(%rbp), $arg1 2524 pxor @XMM[7], @XMM[0] 2525 lea 0x20(%rbp), $arg2 2526 movdqa @XMM[0], 0x20(%rbp) 2527 lea ($key), $arg3 2528 call asm_AES_encrypt # doesn't touch %xmm 2529 pxor 0x20(%rbp), @XMM[7] 2530 movdqu @XMM[7], -16($out) 2531 2532.Lxts_enc_ret: 2533 lea (%rsp), %rax 2534 pxor %xmm0, %xmm0 2535.Lxts_enc_bzero: # wipe key schedule [if any] 2536 movdqa %xmm0, 0x00(%rax) 2537 movdqa %xmm0, 0x10(%rax) 2538 lea 0x20(%rax), %rax 2539 cmp %rax, %rbp 2540 ja .Lxts_enc_bzero 2541 2542 lea 0x78(%rbp),%rax 2543.cfi_def_cfa %rax,8 2544___ 2545$code.=<<___ if ($win64); 2546 movaps 0x40(%rbp), %xmm6 2547 movaps 0x50(%rbp), %xmm7 2548 movaps 0x60(%rbp), %xmm8 2549 movaps 0x70(%rbp), %xmm9 2550 movaps 0x80(%rbp), %xmm10 2551 movaps 0x90(%rbp), %xmm11 2552 movaps 0xa0(%rbp), %xmm12 2553 movaps 0xb0(%rbp), %xmm13 2554 movaps 0xc0(%rbp), %xmm14 2555 movaps 0xd0(%rbp), %xmm15 2556 lea 0xa0(%rax), %rax 2557.Lxts_enc_tail: 2558___ 2559$code.=<<___; 2560 mov -48(%rax), %r15 2561.cfi_restore %r15 2562 mov -40(%rax), %r14 2563.cfi_restore %r14 2564 mov -32(%rax), %r13 2565.cfi_restore %r13 2566 mov -24(%rax), %r12 2567.cfi_restore %r12 2568 mov -16(%rax), %rbx 2569.cfi_restore %rbx 2570 mov -8(%rax), %rbp 2571.cfi_restore %rbp 2572 lea (%rax), %rsp # restore %rsp 2573.cfi_def_cfa_register %rsp 2574.Lxts_enc_epilogue: 2575 ret 2576.cfi_endproc 2577.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt 2578 2579.globl ossl_bsaes_xts_decrypt 2580.type ossl_bsaes_xts_decrypt,\@abi-omnipotent 2581.align 16 2582ossl_bsaes_xts_decrypt: 2583.cfi_startproc 2584 mov %rsp, %rax 2585.Lxts_dec_prologue: 2586 push %rbp 2587.cfi_push %rbp 2588 push %rbx 2589.cfi_push %rbx 2590 push %r12 2591.cfi_push %r12 2592 push %r13 2593.cfi_push %r13 2594 push %r14 2595.cfi_push %r14 2596 push %r15 2597.cfi_push %r15 2598 lea -0x48(%rsp), %rsp 2599.cfi_adjust_cfa_offset 0x48 2600___ 2601$code.=<<___ if ($win64); 2602 mov 0xa0(%rsp),$arg5 # pull key2 2603 mov 0xa8(%rsp),$arg6 # pull ivp 2604 lea -0xa0(%rsp), %rsp 2605 movaps %xmm6, 0x40(%rsp) 2606 movaps %xmm7, 0x50(%rsp) 2607 movaps %xmm8, 0x60(%rsp) 2608 movaps %xmm9, 0x70(%rsp) 2609 movaps %xmm10, 0x80(%rsp) 2610 movaps %xmm11, 0x90(%rsp) 2611 movaps %xmm12, 0xa0(%rsp) 2612 movaps %xmm13, 0xb0(%rsp) 2613 movaps %xmm14, 0xc0(%rsp) 2614 movaps %xmm15, 0xd0(%rsp) 2615.Lxts_dec_body: 2616___ 2617$code.=<<___; 2618 mov %rsp, %rbp # backup %rsp 2619 mov $arg1, $inp # backup arguments 2620 mov $arg2, $out 2621 mov $arg3, $len 2622 mov $arg4, $key 2623 2624 lea ($arg6), $arg1 2625 lea 0x20(%rbp), $arg2 2626 lea ($arg5), $arg3 2627 call asm_AES_encrypt # generate initial tweak 2628 2629 mov 240($key), %eax # rounds 2630 mov $len, %rbx # backup $len 2631 2632 mov %eax, %edx # rounds 2633 shl \$7, %rax # 128 bytes per inner round key 2634 sub \$`128-32`, %rax # size of bit-sliced key schedule 2635 sub %rax, %rsp 2636 2637 mov %rsp, %rax # pass key schedule 2638 mov $key, %rcx # pass key 2639 mov %edx, %r10d # pass rounds 2640 call _bsaes_key_convert 2641 pxor (%rsp), %xmm7 # fix up round 0 key 2642 movdqa %xmm6, (%rax) # save last round key 2643 movdqa %xmm7, (%rsp) 2644 2645 xor %eax, %eax # if ($len%16) len-=16; 2646 and \$-16, $len 2647 test \$15, %ebx 2648 setnz %al 2649 shl \$4, %rax 2650 sub %rax, $len 2651 2652 sub \$0x80, %rsp # place for tweak[8] 2653 movdqa 0x20(%rbp), @XMM[7] # initial tweak 2654 2655 pxor $twtmp, $twtmp 2656 movdqa .Lxts_magic(%rip), $twmask 2657 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2658 2659 sub \$0x80, $len 2660 jc .Lxts_dec_short 2661 jmp .Lxts_dec_loop 2662 2663.align 16 2664.Lxts_dec_loop: 2665___ 2666 for ($i=0;$i<7;$i++) { 2667 $code.=<<___; 2668 pshufd \$0x13, $twtmp, $twres 2669 pxor $twtmp, $twtmp 2670 movdqa @XMM[7], @XMM[$i] 2671 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2672 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2673 pand $twmask, $twres # isolate carry and residue 2674 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2675 pxor $twres, @XMM[7] 2676___ 2677 $code.=<<___ if ($i>=1); 2678 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2679___ 2680 $code.=<<___ if ($i>=2); 2681 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2682___ 2683 } 2684$code.=<<___; 2685 movdqu 0x60($inp), @XMM[8+6] 2686 pxor @XMM[8+5], @XMM[5] 2687 movdqu 0x70($inp), @XMM[8+7] 2688 lea 0x80($inp), $inp 2689 movdqa @XMM[7], 0x70(%rsp) 2690 pxor @XMM[8+6], @XMM[6] 2691 lea 0x80(%rsp), %rax # pass key schedule 2692 pxor @XMM[8+7], @XMM[7] 2693 mov %edx, %r10d # pass rounds 2694 2695 call _bsaes_decrypt8 2696 2697 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2698 pxor 0x10(%rsp), @XMM[1] 2699 movdqu @XMM[0], 0x00($out) # write output 2700 pxor 0x20(%rsp), @XMM[6] 2701 movdqu @XMM[1], 0x10($out) 2702 pxor 0x30(%rsp), @XMM[4] 2703 movdqu @XMM[6], 0x20($out) 2704 pxor 0x40(%rsp), @XMM[2] 2705 movdqu @XMM[4], 0x30($out) 2706 pxor 0x50(%rsp), @XMM[7] 2707 movdqu @XMM[2], 0x40($out) 2708 pxor 0x60(%rsp), @XMM[3] 2709 movdqu @XMM[7], 0x50($out) 2710 pxor 0x70(%rsp), @XMM[5] 2711 movdqu @XMM[3], 0x60($out) 2712 movdqu @XMM[5], 0x70($out) 2713 lea 0x80($out), $out 2714 2715 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2716 pxor $twtmp, $twtmp 2717 movdqa .Lxts_magic(%rip), $twmask 2718 pcmpgtd @XMM[7], $twtmp 2719 pshufd \$0x13, $twtmp, $twres 2720 pxor $twtmp, $twtmp 2721 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2722 pand $twmask, $twres # isolate carry and residue 2723 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2724 pxor $twres, @XMM[7] 2725 2726 sub \$0x80,$len 2727 jnc .Lxts_dec_loop 2728 2729.Lxts_dec_short: 2730 add \$0x80, $len 2731 jz .Lxts_dec_done 2732___ 2733 for ($i=0;$i<7;$i++) { 2734 $code.=<<___; 2735 pshufd \$0x13, $twtmp, $twres 2736 pxor $twtmp, $twtmp 2737 movdqa @XMM[7], @XMM[$i] 2738 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2739 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2740 pand $twmask, $twres # isolate carry and residue 2741 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2742 pxor $twres, @XMM[7] 2743___ 2744 $code.=<<___ if ($i>=1); 2745 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2746 cmp \$`0x10*$i`,$len 2747 je .Lxts_dec_$i 2748___ 2749 $code.=<<___ if ($i>=2); 2750 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2751___ 2752 } 2753$code.=<<___; 2754 movdqu 0x60($inp), @XMM[8+6] 2755 pxor @XMM[8+5], @XMM[5] 2756 movdqa @XMM[7], 0x70(%rsp) 2757 lea 0x70($inp), $inp 2758 pxor @XMM[8+6], @XMM[6] 2759 lea 0x80(%rsp), %rax # pass key schedule 2760 mov %edx, %r10d # pass rounds 2761 2762 call _bsaes_decrypt8 2763 2764 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2765 pxor 0x10(%rsp), @XMM[1] 2766 movdqu @XMM[0], 0x00($out) # write output 2767 pxor 0x20(%rsp), @XMM[6] 2768 movdqu @XMM[1], 0x10($out) 2769 pxor 0x30(%rsp), @XMM[4] 2770 movdqu @XMM[6], 0x20($out) 2771 pxor 0x40(%rsp), @XMM[2] 2772 movdqu @XMM[4], 0x30($out) 2773 pxor 0x50(%rsp), @XMM[7] 2774 movdqu @XMM[2], 0x40($out) 2775 pxor 0x60(%rsp), @XMM[3] 2776 movdqu @XMM[7], 0x50($out) 2777 movdqu @XMM[3], 0x60($out) 2778 lea 0x70($out), $out 2779 2780 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2781 jmp .Lxts_dec_done 2782.align 16 2783.Lxts_dec_6: 2784 pxor @XMM[8+4], @XMM[4] 2785 lea 0x60($inp), $inp 2786 pxor @XMM[8+5], @XMM[5] 2787 lea 0x80(%rsp), %rax # pass key schedule 2788 mov %edx, %r10d # pass rounds 2789 2790 call _bsaes_decrypt8 2791 2792 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2793 pxor 0x10(%rsp), @XMM[1] 2794 movdqu @XMM[0], 0x00($out) # write output 2795 pxor 0x20(%rsp), @XMM[6] 2796 movdqu @XMM[1], 0x10($out) 2797 pxor 0x30(%rsp), @XMM[4] 2798 movdqu @XMM[6], 0x20($out) 2799 pxor 0x40(%rsp), @XMM[2] 2800 movdqu @XMM[4], 0x30($out) 2801 pxor 0x50(%rsp), @XMM[7] 2802 movdqu @XMM[2], 0x40($out) 2803 movdqu @XMM[7], 0x50($out) 2804 lea 0x60($out), $out 2805 2806 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2807 jmp .Lxts_dec_done 2808.align 16 2809.Lxts_dec_5: 2810 pxor @XMM[8+3], @XMM[3] 2811 lea 0x50($inp), $inp 2812 pxor @XMM[8+4], @XMM[4] 2813 lea 0x80(%rsp), %rax # pass key schedule 2814 mov %edx, %r10d # pass rounds 2815 2816 call _bsaes_decrypt8 2817 2818 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2819 pxor 0x10(%rsp), @XMM[1] 2820 movdqu @XMM[0], 0x00($out) # write output 2821 pxor 0x20(%rsp), @XMM[6] 2822 movdqu @XMM[1], 0x10($out) 2823 pxor 0x30(%rsp), @XMM[4] 2824 movdqu @XMM[6], 0x20($out) 2825 pxor 0x40(%rsp), @XMM[2] 2826 movdqu @XMM[4], 0x30($out) 2827 movdqu @XMM[2], 0x40($out) 2828 lea 0x50($out), $out 2829 2830 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2831 jmp .Lxts_dec_done 2832.align 16 2833.Lxts_dec_4: 2834 pxor @XMM[8+2], @XMM[2] 2835 lea 0x40($inp), $inp 2836 pxor @XMM[8+3], @XMM[3] 2837 lea 0x80(%rsp), %rax # pass key schedule 2838 mov %edx, %r10d # pass rounds 2839 2840 call _bsaes_decrypt8 2841 2842 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2843 pxor 0x10(%rsp), @XMM[1] 2844 movdqu @XMM[0], 0x00($out) # write output 2845 pxor 0x20(%rsp), @XMM[6] 2846 movdqu @XMM[1], 0x10($out) 2847 pxor 0x30(%rsp), @XMM[4] 2848 movdqu @XMM[6], 0x20($out) 2849 movdqu @XMM[4], 0x30($out) 2850 lea 0x40($out), $out 2851 2852 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2853 jmp .Lxts_dec_done 2854.align 16 2855.Lxts_dec_3: 2856 pxor @XMM[8+1], @XMM[1] 2857 lea 0x30($inp), $inp 2858 pxor @XMM[8+2], @XMM[2] 2859 lea 0x80(%rsp), %rax # pass key schedule 2860 mov %edx, %r10d # pass rounds 2861 2862 call _bsaes_decrypt8 2863 2864 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2865 pxor 0x10(%rsp), @XMM[1] 2866 movdqu @XMM[0], 0x00($out) # write output 2867 pxor 0x20(%rsp), @XMM[6] 2868 movdqu @XMM[1], 0x10($out) 2869 movdqu @XMM[6], 0x20($out) 2870 lea 0x30($out), $out 2871 2872 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2873 jmp .Lxts_dec_done 2874.align 16 2875.Lxts_dec_2: 2876 pxor @XMM[8+0], @XMM[0] 2877 lea 0x20($inp), $inp 2878 pxor @XMM[8+1], @XMM[1] 2879 lea 0x80(%rsp), %rax # pass key schedule 2880 mov %edx, %r10d # pass rounds 2881 2882 call _bsaes_decrypt8 2883 2884 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2885 pxor 0x10(%rsp), @XMM[1] 2886 movdqu @XMM[0], 0x00($out) # write output 2887 movdqu @XMM[1], 0x10($out) 2888 lea 0x20($out), $out 2889 2890 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2891 jmp .Lxts_dec_done 2892.align 16 2893.Lxts_dec_1: 2894 pxor @XMM[0], @XMM[8] 2895 lea 0x10($inp), $inp 2896 movdqa @XMM[8], 0x20(%rbp) 2897 lea 0x20(%rbp), $arg1 2898 lea 0x20(%rbp), $arg2 2899 lea ($key), $arg3 2900 call asm_AES_decrypt # doesn't touch %xmm 2901 pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2902 #pxor @XMM[8], @XMM[0] 2903 #lea 0x80(%rsp), %rax # pass key schedule 2904 #mov %edx, %r10d # pass rounds 2905 #call _bsaes_decrypt8 2906 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2907 movdqu @XMM[0], 0x00($out) # write output 2908 lea 0x10($out), $out 2909 2910 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2911 2912.Lxts_dec_done: 2913 and \$15, %ebx 2914 jz .Lxts_dec_ret 2915 2916 pxor $twtmp, $twtmp 2917 movdqa .Lxts_magic(%rip), $twmask 2918 pcmpgtd @XMM[7], $twtmp 2919 pshufd \$0x13, $twtmp, $twres 2920 movdqa @XMM[7], @XMM[6] 2921 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2922 pand $twmask, $twres # isolate carry and residue 2923 movdqu ($inp), @XMM[0] 2924 pxor $twres, @XMM[7] 2925 2926 lea 0x20(%rbp), $arg1 2927 pxor @XMM[7], @XMM[0] 2928 lea 0x20(%rbp), $arg2 2929 movdqa @XMM[0], 0x20(%rbp) 2930 lea ($key), $arg3 2931 call asm_AES_decrypt # doesn't touch %xmm 2932 pxor 0x20(%rbp), @XMM[7] 2933 mov $out, %rdx 2934 movdqu @XMM[7], ($out) 2935 2936.Lxts_dec_steal: 2937 movzb 16($inp), %eax 2938 movzb (%rdx), %ecx 2939 lea 1($inp), $inp 2940 mov %al, (%rdx) 2941 mov %cl, 16(%rdx) 2942 lea 1(%rdx), %rdx 2943 sub \$1,%ebx 2944 jnz .Lxts_dec_steal 2945 2946 movdqu ($out), @XMM[0] 2947 lea 0x20(%rbp), $arg1 2948 pxor @XMM[6], @XMM[0] 2949 lea 0x20(%rbp), $arg2 2950 movdqa @XMM[0], 0x20(%rbp) 2951 lea ($key), $arg3 2952 call asm_AES_decrypt # doesn't touch %xmm 2953 pxor 0x20(%rbp), @XMM[6] 2954 movdqu @XMM[6], ($out) 2955 2956.Lxts_dec_ret: 2957 lea (%rsp), %rax 2958 pxor %xmm0, %xmm0 2959.Lxts_dec_bzero: # wipe key schedule [if any] 2960 movdqa %xmm0, 0x00(%rax) 2961 movdqa %xmm0, 0x10(%rax) 2962 lea 0x20(%rax), %rax 2963 cmp %rax, %rbp 2964 ja .Lxts_dec_bzero 2965 2966 lea 0x78(%rbp),%rax 2967.cfi_def_cfa %rax,8 2968___ 2969$code.=<<___ if ($win64); 2970 movaps 0x40(%rbp), %xmm6 2971 movaps 0x50(%rbp), %xmm7 2972 movaps 0x60(%rbp), %xmm8 2973 movaps 0x70(%rbp), %xmm9 2974 movaps 0x80(%rbp), %xmm10 2975 movaps 0x90(%rbp), %xmm11 2976 movaps 0xa0(%rbp), %xmm12 2977 movaps 0xb0(%rbp), %xmm13 2978 movaps 0xc0(%rbp), %xmm14 2979 movaps 0xd0(%rbp), %xmm15 2980 lea 0xa0(%rax), %rax 2981.Lxts_dec_tail: 2982___ 2983$code.=<<___; 2984 mov -48(%rax), %r15 2985.cfi_restore %r15 2986 mov -40(%rax), %r14 2987.cfi_restore %r14 2988 mov -32(%rax), %r13 2989.cfi_restore %r13 2990 mov -24(%rax), %r12 2991.cfi_restore %r12 2992 mov -16(%rax), %rbx 2993.cfi_restore %rbx 2994 mov -8(%rax), %rbp 2995.cfi_restore %rbp 2996 lea (%rax), %rsp # restore %rsp 2997.cfi_def_cfa_register %rsp 2998.Lxts_dec_epilogue: 2999 ret 3000.cfi_endproc 3001.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt 3002___ 3003} 3004$code.=<<___; 3005.type _bsaes_const,\@object 3006.align 64 3007_bsaes_const: 3008.LM0ISR: # InvShiftRows constants 3009 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 3010.LISRM0: 3011 .quad 0x01040b0e0205080f, 0x0306090c00070a0d 3012.LISR: 3013 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 3014.LBS0: # bit-slice constants 3015 .quad 0x5555555555555555, 0x5555555555555555 3016.LBS1: 3017 .quad 0x3333333333333333, 0x3333333333333333 3018.LBS2: 3019 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f 3020.LSR: # shiftrows constants 3021 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b 3022.LSRM0: 3023 .quad 0x0304090e00050a0f, 0x01060b0c0207080d 3024.LM0SR: 3025 .quad 0x0a0e02060f03070b, 0x0004080c05090d01 3026.LSWPUP: # byte-swap upper dword 3027 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 3028.LSWPUPM0SR: 3029 .quad 0x0a0d02060c03070b, 0x0004080f05090e01 3030.LADD1: # counter increment constants 3031 .quad 0x0000000000000000, 0x0000000100000000 3032.LADD2: 3033 .quad 0x0000000000000000, 0x0000000200000000 3034.LADD3: 3035 .quad 0x0000000000000000, 0x0000000300000000 3036.LADD4: 3037 .quad 0x0000000000000000, 0x0000000400000000 3038.LADD5: 3039 .quad 0x0000000000000000, 0x0000000500000000 3040.LADD6: 3041 .quad 0x0000000000000000, 0x0000000600000000 3042.LADD7: 3043 .quad 0x0000000000000000, 0x0000000700000000 3044.LADD8: 3045 .quad 0x0000000000000000, 0x0000000800000000 3046.Lxts_magic: 3047 .long 0x87,0,1,0 3048.Lmasks: 3049 .quad 0x0101010101010101, 0x0101010101010101 3050 .quad 0x0202020202020202, 0x0202020202020202 3051 .quad 0x0404040404040404, 0x0404040404040404 3052 .quad 0x0808080808080808, 0x0808080808080808 3053.LM0: 3054 .quad 0x02060a0e03070b0f, 0x0004080c0105090d 3055.L63: 3056 .quad 0x6363636363636363, 0x6363636363636363 3057.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov" 3058.align 64 3059.size _bsaes_const,.-_bsaes_const 3060___ 3061 3062# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3063# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3064if ($win64) { 3065$rec="%rcx"; 3066$frame="%rdx"; 3067$context="%r8"; 3068$disp="%r9"; 3069 3070$code.=<<___; 3071.extern __imp_RtlVirtualUnwind 3072.type se_handler,\@abi-omnipotent 3073.align 16 3074se_handler: 3075 push %rsi 3076 push %rdi 3077 push %rbx 3078 push %rbp 3079 push %r12 3080 push %r13 3081 push %r14 3082 push %r15 3083 pushfq 3084 sub \$64,%rsp 3085 3086 mov 120($context),%rax # pull context->Rax 3087 mov 248($context),%rbx # pull context->Rip 3088 3089 mov 8($disp),%rsi # disp->ImageBase 3090 mov 56($disp),%r11 # disp->HandlerData 3091 3092 mov 0(%r11),%r10d # HandlerData[0] 3093 lea (%rsi,%r10),%r10 # prologue label 3094 cmp %r10,%rbx # context->Rip<=prologue label 3095 jbe .Lin_prologue 3096 3097 mov 4(%r11),%r10d # HandlerData[1] 3098 lea (%rsi,%r10),%r10 # epilogue label 3099 cmp %r10,%rbx # context->Rip>=epilogue label 3100 jae .Lin_prologue 3101 3102 mov 8(%r11),%r10d # HandlerData[2] 3103 lea (%rsi,%r10),%r10 # epilogue label 3104 cmp %r10,%rbx # context->Rip>=tail label 3105 jae .Lin_tail 3106 3107 mov 160($context),%rax # pull context->Rbp 3108 3109 lea 0x40(%rax),%rsi # %xmm save area 3110 lea 512($context),%rdi # &context.Xmm6 3111 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 3112 .long 0xa548f3fc # cld; rep movsq 3113 lea 0xa0+0x78(%rax),%rax # adjust stack pointer 3114 3115.Lin_tail: 3116 mov -48(%rax),%rbp 3117 mov -40(%rax),%rbx 3118 mov -32(%rax),%r12 3119 mov -24(%rax),%r13 3120 mov -16(%rax),%r14 3121 mov -8(%rax),%r15 3122 mov %rbx,144($context) # restore context->Rbx 3123 mov %rbp,160($context) # restore context->Rbp 3124 mov %r12,216($context) # restore context->R12 3125 mov %r13,224($context) # restore context->R13 3126 mov %r14,232($context) # restore context->R14 3127 mov %r15,240($context) # restore context->R15 3128 3129.Lin_prologue: 3130 mov %rax,152($context) # restore context->Rsp 3131 3132 mov 40($disp),%rdi # disp->ContextRecord 3133 mov $context,%rsi # context 3134 mov \$`1232/8`,%ecx # sizeof(CONTEXT) 3135 .long 0xa548f3fc # cld; rep movsq 3136 3137 mov $disp,%rsi 3138 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3139 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3140 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3141 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3142 mov 40(%rsi),%r10 # disp->ContextRecord 3143 lea 56(%rsi),%r11 # &disp->HandlerData 3144 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3145 mov %r10,32(%rsp) # arg5 3146 mov %r11,40(%rsp) # arg6 3147 mov %r12,48(%rsp) # arg7 3148 mov %rcx,56(%rsp) # arg8, (NULL) 3149 call *__imp_RtlVirtualUnwind(%rip) 3150 3151 mov \$1,%eax # ExceptionContinueSearch 3152 add \$64,%rsp 3153 popfq 3154 pop %r15 3155 pop %r14 3156 pop %r13 3157 pop %r12 3158 pop %rbp 3159 pop %rbx 3160 pop %rdi 3161 pop %rsi 3162 ret 3163.size se_handler,.-se_handler 3164 3165.section .pdata 3166.align 4 3167___ 3168$code.=<<___ if ($ecb); 3169 .rva .Lecb_enc_prologue 3170 .rva .Lecb_enc_epilogue 3171 .rva .Lecb_enc_info 3172 3173 .rva .Lecb_dec_prologue 3174 .rva .Lecb_dec_epilogue 3175 .rva .Lecb_dec_info 3176___ 3177$code.=<<___; 3178 .rva .Lcbc_dec_prologue 3179 .rva .Lcbc_dec_epilogue 3180 .rva .Lcbc_dec_info 3181 3182 .rva .Lctr_enc_prologue 3183 .rva .Lctr_enc_epilogue 3184 .rva .Lctr_enc_info 3185 3186 .rva .Lxts_enc_prologue 3187 .rva .Lxts_enc_epilogue 3188 .rva .Lxts_enc_info 3189 3190 .rva .Lxts_dec_prologue 3191 .rva .Lxts_dec_epilogue 3192 .rva .Lxts_dec_info 3193 3194.section .xdata 3195.align 8 3196___ 3197$code.=<<___ if ($ecb); 3198.Lecb_enc_info: 3199 .byte 9,0,0,0 3200 .rva se_handler 3201 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] 3202 .rva .Lecb_enc_tail 3203 .long 0 3204.Lecb_dec_info: 3205 .byte 9,0,0,0 3206 .rva se_handler 3207 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] 3208 .rva .Lecb_dec_tail 3209 .long 0 3210___ 3211$code.=<<___; 3212.Lcbc_dec_info: 3213 .byte 9,0,0,0 3214 .rva se_handler 3215 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] 3216 .rva .Lcbc_dec_tail 3217 .long 0 3218.Lctr_enc_info: 3219 .byte 9,0,0,0 3220 .rva se_handler 3221 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] 3222 .rva .Lctr_enc_tail 3223 .long 0 3224.Lxts_enc_info: 3225 .byte 9,0,0,0 3226 .rva se_handler 3227 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 3228 .rva .Lxts_enc_tail 3229 .long 0 3230.Lxts_dec_info: 3231 .byte 9,0,0,0 3232 .rva se_handler 3233 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 3234 .rva .Lxts_dec_tail 3235 .long 0 3236___ 3237} 3238 3239$code =~ s/\`([^\`]*)\`/eval($1)/gem; 3240 3241print $code; 3242 3243close STDOUT or die "error closing STDOUT: $!"; 3244