#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# The OpenSSL GCM implementation is organized in such a way that its
# performance is rather close to the sum of its streamed components:
# parallelized AES-NI CTR and modulo-scheduled PCLMULQDQ-enabled GHASH.
# Unfortunately, as no stitch implementation was observed to perform
# significantly better than the sum of the components on contemporary
# CPUs, the effort was deemed impossible to justify. This module is
# based on a combination of Intel submissions, [1] and [2], with a
# MOVBE twist suggested by Ilya Albrekht and Max Locktyukhin of Intel
# Corp., who verified that it reduces shuffle pressure with notable
# relative improvement, achieving 1.0 cycle per byte processed with
# 128-bit key on Haswell, 0.74 on Broadwell, 0.63 on Skylake...
# [Mentioned results are raw profiled measurements for a favourable
# packet size, one divisible by 96. Applications using the EVP
# interface will observe a few percent worse performance.]
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# |$avx| in ghash-x86_64.pl must be set to at least 1; otherwise tags will
# be computed incorrectly.
#
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# The upstream code uses the condition |$avx>1| even though no AVX2
# instructions are used, because it assumes MOVBE is supported by the
# assembler if and only if AVX2 is also supported by the assembler; see
# https://marc.info/?l=openssl-dev&m=146567589526984&w=2.
$avx = 2;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# See the comment above regarding why the condition is ($avx>1) when there
# are no AVX2 instructions being used.
if ($avx>1) {{{

($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

($Ii,$T1,$T2,$Hkey,
 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));

($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));

($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");

$code=<<___;
.text

.type	_aesni_ctr32_ghash_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_ghash_6x:
	vmovdqu	0x20($const),$T2	# borrow $T2, .Lone_msb
	sub	\$6,$len
	vpxor	$Z0,$Z0,$Z0		# $Z0 = 0
	vmovdqu	0x00-0x80($key),$rndkey
	vpaddb	$T2,$T1,$inout1
	vpaddb	$T2,$inout1,$inout2
	vpaddb	$T2,$inout2,$inout3
	vpaddb	$T2,$inout3,$inout4
	vpaddb	$T2,$inout4,$inout5
	vpxor	$rndkey,$T1,$inout0
	vmovdqu	$Z0,16+8(%rsp)		# "$Z3" = 0
	jmp	.Loop6x

.align	32
.Loop6x:
	add	\$`6<<24`,$counter
	jc	.Lhandle_ctr32		# discard $inout[1-5]?
	vmovdqu	0x00-0x20($Xip),$Hkey	# $Hkey^1
	vpaddb	$T2,$inout5,$T1		# next counter value
	vpxor	$rndkey,$inout1,$inout1
	vpxor	$rndkey,$inout2,$inout2

.Lresume_ctr32:
	vmovdqu	$T1,($ivp)		# save next counter value
	vpclmulqdq	\$0x10,$Hkey,$Z3,$Z1
	vpxor	$rndkey,$inout3,$inout3
	vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey
	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2

	# At this point, the current block of 96 (0x60) bytes has already been
	# loaded into registers. Concurrently with processing it, we want to
	# load the next 96 bytes of input for the next round. Obviously, we can
	# only do this if there are at least 96 more bytes of input beyond the
	# input we're currently processing, or else we'd read past the end of
	# the input buffer. Here, we set |%r12| to 96 if there are at least 96
	# bytes of input beyond the 96 bytes we're already processing, and we
	# set |%r12| to 0 otherwise. In the case where we set |%r12| to 96,
	# we'll read in the next block so that it is in registers for the next
	# loop iteration. In the case where we set |%r12| to 0, we'll re-read
	# the current block and then ignore what we re-read.
	#
	# At this point, |$in0| points to the current (already read into
	# registers) block, and |$end0| points to 2*96 bytes before the end of
	# the input. Thus, |$in0| > |$end0| means that we do not have the next
	# 96-byte block to read in, and |$in0| <= |$end0| means we do.
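	#
	# The selection is branchless. In C-like pseudocode (an illustrative
	# sketch of the four instructions interleaved below):
	#
	#   r12  = (in0 <= end0) ? 1 : 0;  // cmp; setnc %r12b
	#   r12  = -r12;                   // neg: 0 or all-ones
	#   r12 &= 0x60;                   // and: 96 or 0
	#   in0 += r12;                    // lea ($in0,%r12),$in0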
	xor	%r12,%r12
	cmp	$in0,$end0

	vaesenc	$T2,$inout0,$inout0
	vmovdqu	0x30+8(%rsp),$Ii	# I[4]
	vpxor	$rndkey,$inout4,$inout4
	vpclmulqdq	\$0x00,$Hkey,$Z3,$T1
	vaesenc	$T2,$inout1,$inout1
	vpxor	$rndkey,$inout5,$inout5
	setnc	%r12b
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	vaesenc	$T2,$inout2,$inout2
	vmovdqu	0x10-0x20($Xip),$Hkey	# $Hkey^2
	neg	%r12
	vaesenc	$T2,$inout3,$inout3
	vpxor	$Z1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Z1
	vpxor	$Z0,$Xi,$Xi		# modulo-scheduled
	vaesenc	$T2,$inout4,$inout4
	vpxor	$Z1,$T1,$Z0
	and	\$0x60,%r12
	vmovups	0x20-0x80($key),$rndkey
	vpclmulqdq	\$0x10,$Hkey,$Ii,$T1
	vaesenc	$T2,$inout5,$inout5

	vpclmulqdq	\$0x01,$Hkey,$Ii,$T2
	lea	($in0,%r12),$in0
	vaesenc	$rndkey,$inout0,$inout0
	vpxor	16+8(%rsp),$Xi,$Xi	# modulo-scheduled [vpxor $Z3,$Xi,$Xi]
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Hkey
	vmovdqu	0x40+8(%rsp),$Ii	# I[3]
	vaesenc	$rndkey,$inout1,$inout1
	movbe	0x58($in0),%r13
	vaesenc	$rndkey,$inout2,$inout2
	movbe	0x50($in0),%r12
	vaesenc	$rndkey,$inout3,$inout3
	mov	%r13,0x20+8(%rsp)
	vaesenc	$rndkey,$inout4,$inout4
	mov	%r12,0x28+8(%rsp)
	vmovdqu	0x30-0x20($Xip),$Z1	# borrow $Z1 for $Hkey^3
	vaesenc	$rndkey,$inout5,$inout5

	vmovups	0x30-0x80($key),$rndkey
	vpxor	$T1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Z1,$Ii,$T1
	vaesenc	$rndkey,$inout0,$inout0
	vpxor	$T2,$Z2,$Z2
	vpclmulqdq	\$0x10,$Z1,$Ii,$T2
	vaesenc	$rndkey,$inout1,$inout1
	vpxor	$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x01,$Z1,$Ii,$Hkey
	vaesenc	$rndkey,$inout2,$inout2
	vpclmulqdq	\$0x11,$Z1,$Ii,$Z1
	vmovdqu	0x50+8(%rsp),$Ii	# I[2]
	vaesenc	$rndkey,$inout3,$inout3
	vaesenc	$rndkey,$inout4,$inout4
	vpxor	$T1,$Z0,$Z0
	vmovdqu	0x40-0x20($Xip),$T1	# borrow $T1 for $Hkey^4
	vaesenc	$rndkey,$inout5,$inout5

	vmovups	0x40-0x80($key),$rndkey
	vpxor	$T2,$Z2,$Z2
	vpclmulqdq	\$0x00,$T1,$Ii,$T2
	vaesenc	$rndkey,$inout0,$inout0
	vpxor	$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x10,$T1,$Ii,$Hkey
	vaesenc	$rndkey,$inout1,$inout1
	movbe	0x48($in0),%r13
	vpxor	$Z1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T1,$Ii,$Z1
	vaesenc	$rndkey,$inout2,$inout2
	movbe	0x40($in0),%r12
	vpclmulqdq	\$0x11,$T1,$Ii,$T1
	vmovdqu	0x60+8(%rsp),$Ii	# I[1]
	vaesenc	$rndkey,$inout3,$inout3
	mov	%r13,0x30+8(%rsp)
	vaesenc	$rndkey,$inout4,$inout4
	mov	%r12,0x38+8(%rsp)
	vpxor	$T2,$Z0,$Z0
	vmovdqu	0x60-0x20($Xip),$T2	# borrow $T2 for $Hkey^5
	vaesenc	$rndkey,$inout5,$inout5

	vmovups	0x50-0x80($key),$rndkey
	vpxor	$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x00,$T2,$Ii,$Hkey
	vaesenc	$rndkey,$inout0,$inout0
	vpxor	$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$T2,$Ii,$Z1
	vaesenc	$rndkey,$inout1,$inout1
	movbe	0x38($in0),%r13
	vpxor	$T1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T2,$Ii,$T1
	vpxor	0x70+8(%rsp),$Xi,$Xi	# accumulate I[0]
	vaesenc	$rndkey,$inout2,$inout2
	movbe	0x30($in0),%r12
	vpclmulqdq	\$0x11,$T2,$Ii,$T2
	vaesenc	$rndkey,$inout3,$inout3
	mov	%r13,0x40+8(%rsp)
	vaesenc	$rndkey,$inout4,$inout4
	mov	%r12,0x48+8(%rsp)
	vpxor	$Hkey,$Z0,$Z0
	vmovdqu	0x70-0x20($Xip),$Hkey	# $Hkey^6
	vaesenc	$rndkey,$inout5,$inout5

	vmovups	0x60-0x80($key),$rndkey
	vpxor	$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Z1
	vaesenc	$rndkey,$inout0,$inout0
	vpxor	$T1,$Z2,$Z2
	vpclmulqdq	\$0x01,$Hkey,$Xi,$T1
	vaesenc	$rndkey,$inout1,$inout1
	movbe	0x28($in0),%r13
	vpxor	$T2,$Z3,$Z3
	vpclmulqdq	\$0x00,$Hkey,$Xi,$T2
	vaesenc	$rndkey,$inout2,$inout2
	movbe	0x20($in0),%r12
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xi
	vaesenc	$rndkey,$inout3,$inout3
	mov	%r13,0x50+8(%rsp)
	vaesenc	$rndkey,$inout4,$inout4
	mov	%r12,0x58+8(%rsp)
	vpxor	$Z1,$Z2,$Z2
	vaesenc	$rndkey,$inout5,$inout5
	vpxor	$T1,$Z2,$Z2

	vmovups	0x70-0x80($key),$rndkey
	vpslldq	\$8,$Z2,$Z1
	vpxor	$T2,$Z0,$Z0
	vmovdqu	0x10($const),$Hkey	# .Lpoly

	vaesenc	$rndkey,$inout0,$inout0
	vpxor	$Xi,$Z3,$Z3
	vaesenc	$rndkey,$inout1,$inout1
	vpxor	$Z1,$Z0,$Z0
	movbe	0x18($in0),%r13
	vaesenc	$rndkey,$inout2,$inout2
	movbe	0x10($in0),%r12
	vpalignr	\$8,$Z0,$Z0,$Ii	# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	mov	%r13,0x60+8(%rsp)
	vaesenc	$rndkey,$inout3,$inout3
	mov	%r12,0x68+8(%rsp)
	vaesenc	$rndkey,$inout4,$inout4
	vmovups	0x80-0x80($key),$T1	# borrow $T1 for $rndkey
	vaesenc	$rndkey,$inout5,$inout5

	vaesenc	$T1,$inout0,$inout0
	vmovups	0x90-0x80($key),$rndkey
	vaesenc	$T1,$inout1,$inout1
	vpsrldq	\$8,$Z2,$Z2
	vaesenc	$T1,$inout2,$inout2
	vpxor	$Z2,$Z3,$Z3
	vaesenc	$T1,$inout3,$inout3
	vpxor	$Ii,$Z0,$Z0
	movbe	0x08($in0),%r13
	vaesenc	$T1,$inout4,$inout4
	movbe	0x00($in0),%r12
	vaesenc	$T1,$inout5,$inout5
	vmovups	0xa0-0x80($key),$T1
	cmp	\$11,$rounds
	jb	.Lenc_tail		# 128-bit key

	vaesenc	$rndkey,$inout0,$inout0
	vaesenc	$rndkey,$inout1,$inout1
	vaesenc	$rndkey,$inout2,$inout2
	vaesenc	$rndkey,$inout3,$inout3
	vaesenc	$rndkey,$inout4,$inout4
	vaesenc	$rndkey,$inout5,$inout5

	vaesenc	$T1,$inout0,$inout0
	vaesenc	$T1,$inout1,$inout1
	vaesenc	$T1,$inout2,$inout2
	vaesenc	$T1,$inout3,$inout3
	vaesenc	$T1,$inout4,$inout4
	vmovups	0xb0-0x80($key),$rndkey
	vaesenc	$T1,$inout5,$inout5
	vmovups	0xc0-0x80($key),$T1
	je	.Lenc_tail		# 192-bit key

	vaesenc	$rndkey,$inout0,$inout0
	vaesenc	$rndkey,$inout1,$inout1
	vaesenc	$rndkey,$inout2,$inout2
	vaesenc	$rndkey,$inout3,$inout3
	vaesenc	$rndkey,$inout4,$inout4
	vaesenc	$rndkey,$inout5,$inout5

	vaesenc	$T1,$inout0,$inout0
	vaesenc	$T1,$inout1,$inout1
	vaesenc	$T1,$inout2,$inout2
	vaesenc	$T1,$inout3,$inout3
	vaesenc	$T1,$inout4,$inout4
	vmovups	0xd0-0x80($key),$rndkey
	vaesenc	$T1,$inout5,$inout5
	vmovups	0xe0-0x80($key),$T1
	jmp	.Lenc_tail		# 256-bit key

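	# .Lhandle_ctr32 is entered from the top of .Loop6x: the 32-bit
	# big-endian counter is kept byte-reversed in $counter, so its least
	# significant byte sits in the register's top byte, and adding 6<<24
	# there sets CF exactly when one of the next six counter values would
	# wrap that byte. In that (rare) case the cheap byte-wise vpaddb
	# increments used on the fast path are invalid, so the six counters
	# are recomputed with full 32-bit additions, byte-swapping before and
	# after.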
.align	32
.Lhandle_ctr32:
	vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask
	vpshufb	$Ii,$T1,$Z2		# byte-swap counter
	vmovdqu	0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	vpaddd	0x40($const),$Z2,$inout1	# .Lone_lsb
	vpaddd	$Z1,$Z2,$inout2
	vmovdqu	0x00-0x20($Xip),$Hkey	# $Hkey^1
	vpaddd	$Z1,$inout1,$inout3
	vpshufb	$Ii,$inout1,$inout1
	vpaddd	$Z1,$inout2,$inout4
	vpshufb	$Ii,$inout2,$inout2
	vpxor	$rndkey,$inout1,$inout1
	vpaddd	$Z1,$inout3,$inout5
	vpshufb	$Ii,$inout3,$inout3
	vpxor	$rndkey,$inout2,$inout2
	vpaddd	$Z1,$inout4,$T1		# byte-swapped next counter value
	vpshufb	$Ii,$inout4,$inout4
	vpshufb	$Ii,$inout5,$inout5
	vpshufb	$Ii,$T1,$T1		# next counter value
	jmp	.Lresume_ctr32

.align	32
.Lenc_tail:
	vaesenc	$rndkey,$inout0,$inout0
	vmovdqu	$Z3,16+8(%rsp)		# postpone vpxor $Z3,$Xi,$Xi
	vpalignr	\$8,$Z0,$Z0,$Xi	# 2nd phase
	vaesenc	$rndkey,$inout1,$inout1
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	vpxor	0x00($inp),$T1,$T2
	vaesenc	$rndkey,$inout2,$inout2
	vpxor	0x10($inp),$T1,$Ii
	vaesenc	$rndkey,$inout3,$inout3
	vpxor	0x20($inp),$T1,$Z1
	vaesenc	$rndkey,$inout4,$inout4
	vpxor	0x30($inp),$T1,$Z2
	vaesenc	$rndkey,$inout5,$inout5
	vpxor	0x40($inp),$T1,$Z3
	vpxor	0x50($inp),$T1,$Hkey
	vmovdqu	($ivp),$T1		# load next counter value

	vaesenclast	$T2,$inout0,$inout0
	vmovdqu	0x20($const),$T2	# borrow $T2, .Lone_msb
	vaesenclast	$Ii,$inout1,$inout1
	vpaddb	$T2,$T1,$Ii
	mov	%r13,0x70+8(%rsp)
	lea	0x60($inp),$inp
	vaesenclast	$Z1,$inout2,$inout2
	vpaddb	$T2,$Ii,$Z1
	mov	%r12,0x78+8(%rsp)
	lea	0x60($out),$out
	vmovdqu	0x00-0x80($key),$rndkey
	vaesenclast	$Z2,$inout3,$inout3
	vpaddb	$T2,$Z1,$Z2
	vaesenclast	$Z3,$inout4,$inout4
	vpaddb	$T2,$Z2,$Z3
	vaesenclast	$Hkey,$inout5,$inout5
	vpaddb	$T2,$Z3,$Hkey

	add	\$0x60,$ret
	sub	\$0x6,$len
	jc	.L6x_done

	vmovups	$inout0,-0x60($out)	# save output
	vpxor	$rndkey,$T1,$inout0
	vmovups	$inout1,-0x50($out)
	vmovdqa	$Ii,$inout1		# 0 latency
	vmovups	$inout2,-0x40($out)
	vmovdqa	$Z1,$inout2		# 0 latency
	vmovups	$inout3,-0x30($out)
	vmovdqa	$Z2,$inout3		# 0 latency
	vmovups	$inout4,-0x20($out)
	vmovdqa	$Z3,$inout4		# 0 latency
	vmovups	$inout5,-0x10($out)
	vmovdqa	$Hkey,$inout5		# 0 latency
	vmovdqu	0x20+8(%rsp),$Z3	# I[5]
	jmp	.Loop6x

.L6x_done:
	vpxor	16+8(%rsp),$Xi,$Xi	# modulo-scheduled
	vpxor	$Z0,$Xi,$Xi		# modulo-scheduled

	ret
.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
___
######################################################################
#
# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
#		const AES_KEY *key, unsigned char iv[16],
#		struct { u128 Xi,H,Htbl[9]; } *Xip);
#
# The return value is the number of bytes processed (always a multiple of
# 96); the caller is expected to handle any remaining tail itself.
$code.=<<___;
.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@function,6
.align	32
aesni_gcm_decrypt:
	xor	$ret,$ret

	# We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60)
	# bytes of input.
	cmp	\$0x60,$len		# minimal accepted length
	jb	.Lgcm_dec_abort

	lea	(%rsp),%rax		# save stack pointer
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,-0xd8(%rax)
	movaps	%xmm7,-0xc8(%rax)
	movaps	%xmm8,-0xb8(%rax)
	movaps	%xmm9,-0xa8(%rax)
	movaps	%xmm10,-0x98(%rax)
	movaps	%xmm11,-0x88(%rax)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
.Lgcm_dec_body:
___
$code.=<<___;
	vzeroupper

	vmovdqu	($ivp),$T1		# input counter value
	add	\$-128,%rsp
	mov	12($ivp),$counter
	lea	.Lbswap_mask(%rip),$const
	lea	-0x80($key),$in0	# borrow $in0
	mov	\$0xf80,$end0		# borrow $end0
	vmovdqu	($Xip),$Xi		# load Xi
	and	\$-128,%rsp		# ensure stack alignment
	vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask
	lea	0x80($key),$key		# size optimization
	lea	0x20+0x20($Xip),$Xip	# size optimization
	mov	0xf0-0x80($key),$rounds
	vpshufb	$Ii,$Xi,$Xi

	and	$end0,$in0
	and	%rsp,$end0
	sub	$in0,$end0
	jc	.Ldec_no_key_aliasing
	cmp	\$768,$end0
	jnc	.Ldec_no_key_aliasing
	sub	$end0,%rsp		# avoid aliasing with key
.Ldec_no_key_aliasing:

	vmovdqu	0x50($inp),$Z3	# I[5]
	lea	($inp),$in0
	vmovdqu	0x40($inp),$Z0

	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
	# bytes before the end of the input. Note, in particular, that this is
	# correct even if |$len| is not an even multiple of 96 or 16. XXX: This
	# seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must
	# not be near the very beginning of the address space when |$len| < 2*96
	# (0xc0).
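	#
	# For example (illustrative): if |$len| is exactly 96 (0x60), then
	# |$end0| ends up 0x60 bytes below |$in0|, so the loop never loads a
	# second block; if |$len| is 0xc0, the first iteration loads the
	# second block and the final iteration merely re-reads it, with the
	# copies it spills to the stack left unused once the loop exits.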
	lea	-0xc0($inp,$len),$end0

	vmovdqu	0x30($inp),$Z1
	shr	\$4,$len
	xor	$ret,$ret
	vmovdqu	0x20($inp),$Z2
	vpshufb	$Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu	0x10($inp),$T2
	vpshufb	$Ii,$Z0,$Z0
	vmovdqu	($inp),$Hkey
	vpshufb	$Ii,$Z1,$Z1
	vmovdqu	$Z0,0x30(%rsp)
	vpshufb	$Ii,$Z2,$Z2
	vmovdqu	$Z1,0x40(%rsp)
	vpshufb	$Ii,$T2,$T2
	vmovdqu	$Z2,0x50(%rsp)
	vpshufb	$Ii,$Hkey,$Hkey
	vmovdqu	$T2,0x60(%rsp)
	vmovdqu	$Hkey,0x70(%rsp)

	call	_aesni_ctr32_ghash_6x

	vmovups	$inout0,-0x60($out)	# save output
	vmovups	$inout1,-0x50($out)
	vmovups	$inout2,-0x40($out)
	vmovups	$inout3,-0x30($out)
	vmovups	$inout4,-0x20($out)
	vmovups	$inout5,-0x10($out)

	vpshufb	($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu	$Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp		# restore %rsp
.Lgcm_dec_abort:
	mov	$ret,%rax		# return value
	ret
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___

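# |_aesni_ctr32_6x| below is a plain CTR body: it encrypts one group of six
# counter blocks (96 bytes) with no GHASH stitched in. |aesni_gcm_encrypt|
# calls it twice to prime the pipeline: GHASH is computed over the
# ciphertext, which only exists once encryption has produced it, so the
# first 2*96 bytes are encrypted here and the stitched
# |_aesni_ctr32_ghash_6x| loop then hashes ciphertext produced earlier
# while it encrypts the current counter blocks.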
$code.=<<___;
.type	_aesni_ctr32_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_6x:
	vmovdqu	0x00-0x80($key),$Z0	# borrow $Z0 for $rndkey
	vmovdqu	0x20($const),$T2	# borrow $T2, .Lone_msb
	lea	-1($rounds),%r13
	vmovups	0x10-0x80($key),$rndkey
	lea	0x20-0x80($key),%r12
	vpxor	$Z0,$T1,$inout0
	add	\$`6<<24`,$counter
	jc	.Lhandle_ctr32_2
	vpaddb	$T2,$T1,$inout1
	vpaddb	$T2,$inout1,$inout2
	vpxor	$Z0,$inout1,$inout1
	vpaddb	$T2,$inout2,$inout3
	vpxor	$Z0,$inout2,$inout2
	vpaddb	$T2,$inout3,$inout4
	vpxor	$Z0,$inout3,$inout3
	vpaddb	$T2,$inout4,$inout5
	vpxor	$Z0,$inout4,$inout4
	vpaddb	$T2,$inout5,$T1
	vpxor	$Z0,$inout5,$inout5
	jmp	.Loop_ctr32

.align	16
.Loop_ctr32:
	vaesenc	$rndkey,$inout0,$inout0
	vaesenc	$rndkey,$inout1,$inout1
	vaesenc	$rndkey,$inout2,$inout2
	vaesenc	$rndkey,$inout3,$inout3
	vaesenc	$rndkey,$inout4,$inout4
	vaesenc	$rndkey,$inout5,$inout5
	vmovups	(%r12),$rndkey
	lea	0x10(%r12),%r12
	dec	%r13d
	jnz	.Loop_ctr32

	vmovdqu	(%r12),$Hkey		# last round key
	vaesenc	$rndkey,$inout0,$inout0
	vpxor	0x00($inp),$Hkey,$Z0
	vaesenc	$rndkey,$inout1,$inout1
	vpxor	0x10($inp),$Hkey,$Z1
	vaesenc	$rndkey,$inout2,$inout2
	vpxor	0x20($inp),$Hkey,$Z2
	vaesenc	$rndkey,$inout3,$inout3
	vpxor	0x30($inp),$Hkey,$Xi
	vaesenc	$rndkey,$inout4,$inout4
	vpxor	0x40($inp),$Hkey,$T2
	vaesenc	$rndkey,$inout5,$inout5
	vpxor	0x50($inp),$Hkey,$Hkey
	lea	0x60($inp),$inp

	vaesenclast	$Z0,$inout0,$inout0
	vaesenclast	$Z1,$inout1,$inout1
	vaesenclast	$Z2,$inout2,$inout2
	vaesenclast	$Xi,$inout3,$inout3
	vaesenclast	$T2,$inout4,$inout4
	vaesenclast	$Hkey,$inout5,$inout5
	vmovups	$inout0,0x00($out)
	vmovups	$inout1,0x10($out)
	vmovups	$inout2,0x20($out)
	vmovups	$inout3,0x30($out)
	vmovups	$inout4,0x40($out)
	vmovups	$inout5,0x50($out)
	lea	0x60($out),$out

	ret
.align	32
.Lhandle_ctr32_2:
	vpshufb	$Ii,$T1,$Z2		# byte-swap counter
	vmovdqu	0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	vpaddd	0x40($const),$Z2,$inout1	# .Lone_lsb
	vpaddd	$Z1,$Z2,$inout2
	vpaddd	$Z1,$inout1,$inout3
	vpshufb	$Ii,$inout1,$inout1
	vpaddd	$Z1,$inout2,$inout4
	vpshufb	$Ii,$inout2,$inout2
	vpxor	$Z0,$inout1,$inout1
	vpaddd	$Z1,$inout3,$inout5
	vpshufb	$Ii,$inout3,$inout3
	vpxor	$Z0,$inout2,$inout2
	vpaddd	$Z1,$inout4,$T1		# byte-swapped next counter value
	vpshufb	$Ii,$inout4,$inout4
	vpxor	$Z0,$inout3,$inout3
	vpshufb	$Ii,$inout5,$inout5
	vpxor	$Z0,$inout4,$inout4
	vpshufb	$Ii,$T1,$T1		# next counter value
	vpxor	$Z0,$inout5,$inout5
	jmp	.Loop_ctr32
.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@function,6
.align	32
aesni_gcm_encrypt:
	xor	$ret,$ret

	# We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of
	# input. Then we call |_aesni_ctr32_ghash_6x|, which requires at
	# least 96 more bytes of input.
	cmp	\$0x60*3,$len		# minimal accepted length
	jb	.Lgcm_enc_abort

	lea	(%rsp),%rax		# save stack pointer
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,-0xd8(%rax)
	movaps	%xmm7,-0xc8(%rax)
	movaps	%xmm8,-0xb8(%rax)
	movaps	%xmm9,-0xa8(%rax)
	movaps	%xmm10,-0x98(%rax)
	movaps	%xmm11,-0x88(%rax)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
.Lgcm_enc_body:
___
$code.=<<___;
	vzeroupper

	vmovdqu	($ivp),$T1		# input counter value
	add	\$-128,%rsp
	mov	12($ivp),$counter
	lea	.Lbswap_mask(%rip),$const
	lea	-0x80($key),$in0	# borrow $in0
	mov	\$0xf80,$end0		# borrow $end0
	lea	0x80($key),$key		# size optimization
	vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask
	and	\$-128,%rsp		# ensure stack alignment
	mov	0xf0-0x80($key),$rounds

	and	$end0,$in0
	and	%rsp,$end0
	sub	$in0,$end0
	jc	.Lenc_no_key_aliasing
	cmp	\$768,$end0
	jnc	.Lenc_no_key_aliasing
	sub	$end0,%rsp		# avoid aliasing with key
.Lenc_no_key_aliasing:

	lea	($out),$in0

	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
	# bytes before the end of the input. Note, in particular, that this is
	# correct even if |$len| is not an even multiple of 96 or 16. Unlike in
	# the decryption case, there's no caveat that |$out| must not be near
	# the very beginning of the address space, because we know that
	# |$len| >= 3*96 from the check above, and so we know
	# |$out| + |$len| >= 2*96 (0xc0).
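	#
	# (|$in0| was pointed at |$out| above: the stitched loop's GHASH input
	# is the ciphertext, which it reads back from the output buffer, so
	# |$end0| is derived from |$out| rather than |$inp|.)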
	lea	-0xc0($out,$len),$end0

	shr	\$4,$len

	call	_aesni_ctr32_6x
	vpshufb	$Ii,$inout0,$Xi		# save bswapped output on stack
	vpshufb	$Ii,$inout1,$T2
	vmovdqu	$Xi,0x70(%rsp)
	vpshufb	$Ii,$inout2,$Z0
	vmovdqu	$T2,0x60(%rsp)
	vpshufb	$Ii,$inout3,$Z1
	vmovdqu	$Z0,0x50(%rsp)
	vpshufb	$Ii,$inout4,$Z2
	vmovdqu	$Z1,0x40(%rsp)
	vpshufb	$Ii,$inout5,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu	$Z2,0x30(%rsp)

	call	_aesni_ctr32_6x

	vmovdqu	($Xip),$Xi		# load Xi
	lea	0x20+0x20($Xip),$Xip	# size optimization
	sub	\$12,$len
	mov	\$0x60*2,$ret
	vpshufb	$Ii,$Xi,$Xi

	call	_aesni_ctr32_ghash_6x
	vmovdqu	0x20(%rsp),$Z3		# I[5]
	vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask
	vmovdqu	0x00-0x20($Xip),$Hkey	# $Hkey^1
	vpunpckhqdq	$Z3,$Z3,$T1
	vmovdqu	0x20-0x20($Xip),$rndkey	# borrow $rndkey for $HK
	vmovups	$inout0,-0x60($out)	# save output
	vpshufb	$Ii,$inout0,$inout0	# but keep bswapped copy
	vpxor	$Z3,$T1,$T1
	vmovups	$inout1,-0x50($out)
	vpshufb	$Ii,$inout1,$inout1
	vmovups	$inout2,-0x40($out)
	vpshufb	$Ii,$inout2,$inout2
	vmovups	$inout3,-0x30($out)
	vpshufb	$Ii,$inout3,$inout3
	vmovups	$inout4,-0x20($out)
	vpshufb	$Ii,$inout4,$inout4
	vmovups	$inout5,-0x10($out)
	vpshufb	$Ii,$inout5,$inout5
	vmovdqu	$inout0,0x10(%rsp)	# free $inout0
___
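# After the stitched loop, two groups of six ciphertext blocks remain to be
# hashed into $Xi: the group saved byte-swapped on the stack and the group
# just produced (kept byte-swapped in the $inout registers, with $inout0
# spilled to 0x10(%rsp) above). What follows is the usual aggregated GHASH:
# each block is multiplied by the matching power of H from the table at
# $Xip, the Karatsuba middle terms use the precomputed xored halves
# (borrowed into $HK), and two-phase reductions by .Lpoly fold the
# accumulated product back to 128 bits.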
{ my ($HK,$T3)=($rndkey,$inout0);

$code.=<<___;
	vmovdqu	0x30(%rsp),$Z2		# I[4]
	vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
	vpunpckhqdq	$Z2,$Z2,$T2
	vpclmulqdq	\$0x00,$Hkey,$Z3,$Z1
	vpxor	$Z2,$T2,$T2
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x00,$HK,$T1,$T1

	vmovdqu	0x40(%rsp),$T3		# I[3]
	vpclmulqdq	\$0x00,$Ii,$Z2,$Z0
	vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
	vpxor	$Z1,$Z0,$Z0
	vpunpckhqdq	$T3,$T3,$Z1
	vpclmulqdq	\$0x11,$Ii,$Z2,$Z2
	vpxor	$T3,$Z1,$Z1
	vpxor	$Z3,$Z2,$Z2
	vpclmulqdq	\$0x10,$HK,$T2,$T2
	vmovdqu	0x50-0x20($Xip),$HK
	vpxor	$T1,$T2,$T2

	vmovdqu	0x50(%rsp),$T1		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$T3,$Z3
	vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
	vpxor	$Z0,$Z3,$Z3
	vpunpckhqdq	$T1,$T1,$Z0
	vpclmulqdq	\$0x11,$Hkey,$T3,$T3
	vpxor	$T1,$Z0,$Z0
	vpxor	$Z2,$T3,$T3
	vpclmulqdq	\$0x00,$HK,$Z1,$Z1
	vpxor	$T2,$Z1,$Z1

	vmovdqu	0x60(%rsp),$T2		# I[1]
	vpclmulqdq	\$0x00,$Ii,$T1,$Z2
	vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
	vpxor	$Z3,$Z2,$Z2
	vpunpckhqdq	$T2,$T2,$Z3
	vpclmulqdq	\$0x11,$Ii,$T1,$T1
	vpxor	$T2,$Z3,$Z3
	vpxor	$T3,$T1,$T1
	vpclmulqdq	\$0x10,$HK,$Z0,$Z0
	vmovdqu	0x80-0x20($Xip),$HK
	vpxor	$Z1,$Z0,$Z0

	vpxor	0x70(%rsp),$Xi,$Xi	# accumulate I[0]
	vpclmulqdq	\$0x00,$Hkey,$T2,$Z1
	vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
	vpunpckhqdq	$Xi,$Xi,$T3
	vpxor	$Z2,$Z1,$Z1
	vpclmulqdq	\$0x11,$Hkey,$T2,$T2
	vpxor	$Xi,$T3,$T3
	vpxor	$T1,$T2,$T2
	vpclmulqdq	\$0x00,$HK,$Z3,$Z3
	vpxor	$Z0,$Z3,$Z0

	vpclmulqdq	\$0x00,$Ii,$Xi,$Z2
	vmovdqu	0x00-0x20($Xip),$Hkey	# $Hkey^1
	vpunpckhqdq	$inout5,$inout5,$T1
	vpclmulqdq	\$0x11,$Ii,$Xi,$Xi
	vpxor	$inout5,$T1,$T1
	vpxor	$Z1,$Z2,$Z1
	vpclmulqdq	\$0x10,$HK,$T3,$T3
	vmovdqu	0x20-0x20($Xip),$HK
	vpxor	$T2,$Xi,$Z3
	vpxor	$Z0,$T3,$Z2

	vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
	vpxor	$Z1,$Z3,$T3		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$inout5,$Z0
	vpxor	$T3,$Z2,$Z2
	vpunpckhqdq	$inout4,$inout4,$T2
	vpclmulqdq	\$0x11,$Hkey,$inout5,$inout5
	vpxor	$inout4,$T2,$T2
	vpslldq	\$8,$Z2,$T3
	vpclmulqdq	\$0x00,$HK,$T1,$T1
	vpxor	$T3,$Z1,$Xi
	vpsrldq	\$8,$Z2,$Z2
	vpxor	$Z2,$Z3,$Z3

	vpclmulqdq	\$0x00,$Ii,$inout4,$Z1
	vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
	vpxor	$Z0,$Z1,$Z1
	vpunpckhqdq	$inout3,$inout3,$T3
	vpclmulqdq	\$0x11,$Ii,$inout4,$inout4
	vpxor	$inout3,$T3,$T3
	vpxor	$inout5,$inout4,$inout4
	vpalignr	\$8,$Xi,$Xi,$inout5	# 1st phase
	vpclmulqdq	\$0x10,$HK,$T2,$T2
	vmovdqu	0x50-0x20($Xip),$HK
	vpxor	$T1,$T2,$T2

	vpclmulqdq	\$0x00,$Hkey,$inout3,$Z0
	vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
	vpxor	$Z1,$Z0,$Z0
	vpunpckhqdq	$inout2,$inout2,$T1
	vpclmulqdq	\$0x11,$Hkey,$inout3,$inout3
	vpxor	$inout2,$T1,$T1
	vpxor	$inout4,$inout3,$inout3
	vxorps	0x10(%rsp),$Z3,$Z3	# accumulate $inout0
	vpclmulqdq	\$0x00,$HK,$T3,$T3
	vpxor	$T2,$T3,$T3

	vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
	vxorps	$inout5,$Xi,$Xi

	vpclmulqdq	\$0x00,$Ii,$inout2,$Z1
	vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
	vpxor	$Z0,$Z1,$Z1
	vpunpckhqdq	$inout1,$inout1,$T2
	vpclmulqdq	\$0x11,$Ii,$inout2,$inout2
	vpxor	$inout1,$T2,$T2
	vpalignr	\$8,$Xi,$Xi,$inout5	# 2nd phase
	vpxor	$inout3,$inout2,$inout2
	vpclmulqdq	\$0x10,$HK,$T1,$T1
	vmovdqu	0x80-0x20($Xip),$HK
	vpxor	$T3,$T1,$T1

	vxorps	$Z3,$inout5,$inout5
	vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
	vxorps	$inout5,$Xi,$Xi

	vpclmulqdq	\$0x00,$Hkey,$inout1,$Z0
	vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
	vpxor	$Z1,$Z0,$Z0
	vpunpckhqdq	$Xi,$Xi,$T3
	vpclmulqdq	\$0x11,$Hkey,$inout1,$inout1
	vpxor	$Xi,$T3,$T3
	vpxor	$inout2,$inout1,$inout1
	vpclmulqdq	\$0x00,$HK,$T2,$T2
	vpxor	$T1,$T2,$T2

	vpclmulqdq	\$0x00,$Ii,$Xi,$Z1
	vpclmulqdq	\$0x11,$Ii,$Xi,$Z3
	vpxor	$Z0,$Z1,$Z1
	vpclmulqdq	\$0x10,$HK,$T3,$Z2
	vpxor	$inout1,$Z3,$Z3
	vpxor	$T2,$Z2,$Z2

	vpxor	$Z1,$Z3,$Z0		# aggregated Karatsuba post-processing
	vpxor	$Z0,$Z2,$Z2
	vpslldq	\$8,$Z2,$T1
	vmovdqu	0x10($const),$Hkey	# .Lpoly
	vpsrldq	\$8,$Z2,$Z2
	vpxor	$T1,$Z1,$Xi
	vpxor	$Z2,$Z3,$Z3

	vpalignr	\$8,$Xi,$Xi,$T2	# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
	vpxor	$T2,$Xi,$Xi

	vpalignr	\$8,$Xi,$Xi,$T2	# 2nd phase
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
	vpxor	$Z3,$T2,$T2
	vpxor	$T2,$Xi,$Xi
___
}
$code.=<<___;
	vpshufb	($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu	$Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp		# restore %rsp
.Lgcm_enc_abort:
	mov	$ret,%rax		# return value
	ret
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
___

$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
	.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___
.extern	__imp_RtlVirtualUnwind
.type	gcm_se_handler,\@abi-omnipotent
.align	16
gcm_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	120($context),%rax	# pull context->Rax

	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	mov	%r15,240($context)
	mov	%r14,232($context)
	mov	%r13,224($context)
	mov	%r12,216($context)
	mov	%rbp,160($context)
	mov	%rbx,144($context)

	lea	-0xd8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	gcm_se_handler,.-gcm_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_gcm_decrypt
	.rva	.LSEH_end_aesni_gcm_decrypt
	.rva	.LSEH_gcm_dec_info

	.rva	.LSEH_begin_aesni_gcm_encrypt
	.rva	.LSEH_end_aesni_gcm_encrypt
	.rva	.LSEH_gcm_enc_info
.section	.xdata
.align	8
.LSEH_gcm_dec_info:
	.byte	9,0,0,0
	.rva	gcm_se_handler
	.rva	.Lgcm_dec_body,.Lgcm_dec_abort
.LSEH_gcm_enc_info:
	.byte	9,0,0,0
	.rva	gcm_se_handler
	.rva	.Lgcm_enc_body,.Lgcm_enc_abort
___
}
}}} else {{{
$code=<<___;	# assembler is too old
.text

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@abi-omnipotent
aesni_gcm_encrypt:
	xor	%eax,%eax
	ret
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt

.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@abi-omnipotent
aesni_gcm_decrypt:
	xor	%eax,%eax
	ret
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
}}}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;