1#! /usr/bin/env perl 2# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# 18# AES-NI-CTR+GHASH stitch. 19# 20# February 2013 21# 22# OpenSSL GCM implementation is organized in such way that its 23# performance is rather close to the sum of its streamed components, 24# in the context parallelized AES-NI CTR and modulo-scheduled 25# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation 26# was observed to perform significantly better than the sum of the 27# components on contemporary CPUs, the effort was deemed impossible to 28# justify. This module is based on combination of Intel submissions, 29# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max 30# Locktyukhin of Intel Corp. who verified that it reduces shuffles 31# pressure with notable relative improvement, achieving 1.0 cycle per 32# byte processed with 128-bit key on Haswell processor, 0.74 - on 33# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled 34# measurements for favourable packet size, one divisible by 96. 35# Applications using the EVP interface will observe a few percent 36# worse performance.] 37# 38# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). 39# 40# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest 41# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf 42 43$flavour = shift; 44$output = shift; 45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 46 47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 48 49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 51( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or 52die "can't locate x86_64-xlate.pl"; 53 54# |$avx| in ghash-x86_64.pl must be set to at least 1; otherwise tags will 55# be computed incorrectly. 56# 57# In upstream, this is controlled by shelling out to the compiler to check 58# versions, but BoringSSL is intended to be used with pre-generated perlasm 59# output, so this isn't useful anyway. 60# 61# The upstream code uses the condition |$avx>1| even though no AVX2 62# instructions are used, because it assumes MOVBE is supported by the assembler 63# if and only if AVX2 is also supported by the assembler; see 64# https://marc.info/?l=openssl-dev&m=146567589526984&w=2. 65$avx = 2; 66 67open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 68*STDOUT=*OUT; 69 70# See the comment above regarding why the condition is ($avx>1) when there are 71# no AVX2 instructions being used. 72if ($avx>1) {{{ 73 74# On Windows, only four parameters are passed in registers. The last two 75# parameters will be manually loaded into %rdi and %rsi. 76my ($inp, $out, $len, $key, $ivp, $Htable) = 77 $win64 ? ("%rcx", "%rdx", "%r8", "%r9", "%rdi", "%rsi") : 78 ("%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9"); 79 80# The offset from %rbp to the Xip parameter. On Windows, all parameters have 81# corresponding stack positions, not just ones passed on the stack. 82# (0x40 = 6*8 + 0x10) 83# 84# Xip only needs to be accessed at the beginning and end of the function, and 85# this function is short on registers, so we make it the last parameter for 86# convenience. 87my $Xip_offset = $win64 ? 0x40 : 0x10; 88 89($Ii,$T1,$T2,$Hkey, 90 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8)); 91 92($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15)); 93 94($counter,$rounds,$const,$in0,$end0)=("%ebx","%r10d","%r11","%r14","%r15"); 95 96$code=<<___; 97.text 98 99.type _aesni_ctr32_ghash_6x,\@abi-omnipotent 100.align 32 101_aesni_ctr32_ghash_6x: 102.cfi_startproc 103 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb 104 sub \$6,$len 105 vpxor $Z0,$Z0,$Z0 # $Z0 = 0 106 vmovdqu 0x00-0x80($key),$rndkey 107 vpaddb $T2,$T1,$inout1 108 vpaddb $T2,$inout1,$inout2 109 vpaddb $T2,$inout2,$inout3 110 vpaddb $T2,$inout3,$inout4 111 vpaddb $T2,$inout4,$inout5 112 vpxor $rndkey,$T1,$inout0 113 vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0 114 jmp .Loop6x 115 116.align 32 117.Loop6x: 118 add \$`6<<24`,$counter 119 jc .Lhandle_ctr32 # discard $inout[1-5]? 120 vmovdqu 0x00-0x20($Htable),$Hkey # $Hkey^1 121 vpaddb $T2,$inout5,$T1 # next counter value 122 vpxor $rndkey,$inout1,$inout1 123 vpxor $rndkey,$inout2,$inout2 124 125.Lresume_ctr32: 126 vmovdqu $T1,($ivp) # save next counter value 127 vpclmulqdq \$0x10,$Hkey,$Z3,$Z1 128 vpxor $rndkey,$inout3,$inout3 129 vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey 130 vpclmulqdq \$0x01,$Hkey,$Z3,$Z2 131 132 # At this point, the current block of 96 (0x60) bytes has already been 133 # loaded into registers. Concurrently with processing it, we want to 134 # load the next 96 bytes of input for the next round. Obviously, we can 135 # only do this if there are at least 96 more bytes of input beyond the 136 # input we're currently processing, or else we'd read past the end of 137 # the input buffer. Here, we set |%r12| to 96 if there are at least 96 138 # bytes of input beyond the 96 bytes we're already processing, and we 139 # set |%r12| to 0 otherwise. In the case where we set |%r12| to 96, 140 # we'll read in the next block so that it is in registers for the next 141 # loop iteration. In the case where we set |%r12| to 0, we'll re-read 142 # the current block and then ignore what we re-read. 143 # 144 # At this point, |$in0| points to the current (already read into 145 # registers) block, and |$end0| points to 2*96 bytes before the end of 146 # the input. Thus, |$in0| > |$end0| means that we do not have the next 147 # 96-byte block to read in, and |$in0| <= |$end0| means we do. 148 xor %r12,%r12 149 cmp $in0,$end0 150 151 vaesenc $T2,$inout0,$inout0 152 vmovdqu 0x30+8(%rsp),$Ii # I[4] 153 vpxor $rndkey,$inout4,$inout4 154 vpclmulqdq \$0x00,$Hkey,$Z3,$T1 155 vaesenc $T2,$inout1,$inout1 156 vpxor $rndkey,$inout5,$inout5 157 setnc %r12b 158 vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 159 vaesenc $T2,$inout2,$inout2 160 vmovdqu 0x10-0x20($Htable),$Hkey # $Hkey^2 161 neg %r12 162 vaesenc $T2,$inout3,$inout3 163 vpxor $Z1,$Z2,$Z2 164 vpclmulqdq \$0x00,$Hkey,$Ii,$Z1 165 vpxor $Z0,$Xi,$Xi # modulo-scheduled 166 vaesenc $T2,$inout4,$inout4 167 vpxor $Z1,$T1,$Z0 168 and \$0x60,%r12 169 vmovups 0x20-0x80($key),$rndkey 170 vpclmulqdq \$0x10,$Hkey,$Ii,$T1 171 vaesenc $T2,$inout5,$inout5 172 173 vpclmulqdq \$0x01,$Hkey,$Ii,$T2 174 lea ($in0,%r12),$in0 175 vaesenc $rndkey,$inout0,$inout0 176 vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi] 177 vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey 178 vmovdqu 0x40+8(%rsp),$Ii # I[3] 179 vaesenc $rndkey,$inout1,$inout1 180 movbe 0x58($in0),%r13 181 vaesenc $rndkey,$inout2,$inout2 182 movbe 0x50($in0),%r12 183 vaesenc $rndkey,$inout3,$inout3 184 mov %r13,0x20+8(%rsp) 185 vaesenc $rndkey,$inout4,$inout4 186 mov %r12,0x28+8(%rsp) 187 vmovdqu 0x30-0x20($Htable),$Z1 # borrow $Z1 for $Hkey^3 188 vaesenc $rndkey,$inout5,$inout5 189 190 vmovups 0x30-0x80($key),$rndkey 191 vpxor $T1,$Z2,$Z2 192 vpclmulqdq \$0x00,$Z1,$Ii,$T1 193 vaesenc $rndkey,$inout0,$inout0 194 vpxor $T2,$Z2,$Z2 195 vpclmulqdq \$0x10,$Z1,$Ii,$T2 196 vaesenc $rndkey,$inout1,$inout1 197 vpxor $Hkey,$Z3,$Z3 198 vpclmulqdq \$0x01,$Z1,$Ii,$Hkey 199 vaesenc $rndkey,$inout2,$inout2 200 vpclmulqdq \$0x11,$Z1,$Ii,$Z1 201 vmovdqu 0x50+8(%rsp),$Ii # I[2] 202 vaesenc $rndkey,$inout3,$inout3 203 vaesenc $rndkey,$inout4,$inout4 204 vpxor $T1,$Z0,$Z0 205 vmovdqu 0x40-0x20($Htable),$T1 # borrow $T1 for $Hkey^4 206 vaesenc $rndkey,$inout5,$inout5 207 208 vmovups 0x40-0x80($key),$rndkey 209 vpxor $T2,$Z2,$Z2 210 vpclmulqdq \$0x00,$T1,$Ii,$T2 211 vaesenc $rndkey,$inout0,$inout0 212 vpxor $Hkey,$Z2,$Z2 213 vpclmulqdq \$0x10,$T1,$Ii,$Hkey 214 vaesenc $rndkey,$inout1,$inout1 215 movbe 0x48($in0),%r13 216 vpxor $Z1,$Z3,$Z3 217 vpclmulqdq \$0x01,$T1,$Ii,$Z1 218 vaesenc $rndkey,$inout2,$inout2 219 movbe 0x40($in0),%r12 220 vpclmulqdq \$0x11,$T1,$Ii,$T1 221 vmovdqu 0x60+8(%rsp),$Ii # I[1] 222 vaesenc $rndkey,$inout3,$inout3 223 mov %r13,0x30+8(%rsp) 224 vaesenc $rndkey,$inout4,$inout4 225 mov %r12,0x38+8(%rsp) 226 vpxor $T2,$Z0,$Z0 227 vmovdqu 0x60-0x20($Htable),$T2 # borrow $T2 for $Hkey^5 228 vaesenc $rndkey,$inout5,$inout5 229 230 vmovups 0x50-0x80($key),$rndkey 231 vpxor $Hkey,$Z2,$Z2 232 vpclmulqdq \$0x00,$T2,$Ii,$Hkey 233 vaesenc $rndkey,$inout0,$inout0 234 vpxor $Z1,$Z2,$Z2 235 vpclmulqdq \$0x10,$T2,$Ii,$Z1 236 vaesenc $rndkey,$inout1,$inout1 237 movbe 0x38($in0),%r13 238 vpxor $T1,$Z3,$Z3 239 vpclmulqdq \$0x01,$T2,$Ii,$T1 240 vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0] 241 vaesenc $rndkey,$inout2,$inout2 242 movbe 0x30($in0),%r12 243 vpclmulqdq \$0x11,$T2,$Ii,$T2 244 vaesenc $rndkey,$inout3,$inout3 245 mov %r13,0x40+8(%rsp) 246 vaesenc $rndkey,$inout4,$inout4 247 mov %r12,0x48+8(%rsp) 248 vpxor $Hkey,$Z0,$Z0 249 vmovdqu 0x70-0x20($Htable),$Hkey # $Hkey^6 250 vaesenc $rndkey,$inout5,$inout5 251 252 vmovups 0x60-0x80($key),$rndkey 253 vpxor $Z1,$Z2,$Z2 254 vpclmulqdq \$0x10,$Hkey,$Xi,$Z1 255 vaesenc $rndkey,$inout0,$inout0 256 vpxor $T1,$Z2,$Z2 257 vpclmulqdq \$0x01,$Hkey,$Xi,$T1 258 vaesenc $rndkey,$inout1,$inout1 259 movbe 0x28($in0),%r13 260 vpxor $T2,$Z3,$Z3 261 vpclmulqdq \$0x00,$Hkey,$Xi,$T2 262 vaesenc $rndkey,$inout2,$inout2 263 movbe 0x20($in0),%r12 264 vpclmulqdq \$0x11,$Hkey,$Xi,$Xi 265 vaesenc $rndkey,$inout3,$inout3 266 mov %r13,0x50+8(%rsp) 267 vaesenc $rndkey,$inout4,$inout4 268 mov %r12,0x58+8(%rsp) 269 vpxor $Z1,$Z2,$Z2 270 vaesenc $rndkey,$inout5,$inout5 271 vpxor $T1,$Z2,$Z2 272 273 vmovups 0x70-0x80($key),$rndkey 274 vpslldq \$8,$Z2,$Z1 275 vpxor $T2,$Z0,$Z0 276 vmovdqu 0x10($const),$Hkey # .Lpoly 277 278 vaesenc $rndkey,$inout0,$inout0 279 vpxor $Xi,$Z3,$Z3 280 vaesenc $rndkey,$inout1,$inout1 281 vpxor $Z1,$Z0,$Z0 282 movbe 0x18($in0),%r13 283 vaesenc $rndkey,$inout2,$inout2 284 movbe 0x10($in0),%r12 285 vpalignr \$8,$Z0,$Z0,$Ii # 1st phase 286 vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 287 mov %r13,0x60+8(%rsp) 288 vaesenc $rndkey,$inout3,$inout3 289 mov %r12,0x68+8(%rsp) 290 vaesenc $rndkey,$inout4,$inout4 291 vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey 292 vaesenc $rndkey,$inout5,$inout5 293 294 vaesenc $T1,$inout0,$inout0 295 vmovups 0x90-0x80($key),$rndkey 296 vaesenc $T1,$inout1,$inout1 297 vpsrldq \$8,$Z2,$Z2 298 vaesenc $T1,$inout2,$inout2 299 vpxor $Z2,$Z3,$Z3 300 vaesenc $T1,$inout3,$inout3 301 vpxor $Ii,$Z0,$Z0 302 movbe 0x08($in0),%r13 303 vaesenc $T1,$inout4,$inout4 304 movbe 0x00($in0),%r12 305 vaesenc $T1,$inout5,$inout5 306 vmovups 0xa0-0x80($key),$T1 307 cmp \$11,$rounds 308 jb .Lenc_tail # 128-bit key 309 310 vaesenc $rndkey,$inout0,$inout0 311 vaesenc $rndkey,$inout1,$inout1 312 vaesenc $rndkey,$inout2,$inout2 313 vaesenc $rndkey,$inout3,$inout3 314 vaesenc $rndkey,$inout4,$inout4 315 vaesenc $rndkey,$inout5,$inout5 316 317 vaesenc $T1,$inout0,$inout0 318 vaesenc $T1,$inout1,$inout1 319 vaesenc $T1,$inout2,$inout2 320 vaesenc $T1,$inout3,$inout3 321 vaesenc $T1,$inout4,$inout4 322 vmovups 0xb0-0x80($key),$rndkey 323 vaesenc $T1,$inout5,$inout5 324 vmovups 0xc0-0x80($key),$T1 325 je .Lenc_tail # 192-bit key 326 327 vaesenc $rndkey,$inout0,$inout0 328 vaesenc $rndkey,$inout1,$inout1 329 vaesenc $rndkey,$inout2,$inout2 330 vaesenc $rndkey,$inout3,$inout3 331 vaesenc $rndkey,$inout4,$inout4 332 vaesenc $rndkey,$inout5,$inout5 333 334 vaesenc $T1,$inout0,$inout0 335 vaesenc $T1,$inout1,$inout1 336 vaesenc $T1,$inout2,$inout2 337 vaesenc $T1,$inout3,$inout3 338 vaesenc $T1,$inout4,$inout4 339 vmovups 0xd0-0x80($key),$rndkey 340 vaesenc $T1,$inout5,$inout5 341 vmovups 0xe0-0x80($key),$T1 342 jmp .Lenc_tail # 256-bit key 343 344.align 32 345.Lhandle_ctr32: 346 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask 347 vpshufb $Ii,$T1,$Z2 # byte-swap counter 348 vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb 349 vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb 350 vpaddd $Z1,$Z2,$inout2 351 vmovdqu 0x00-0x20($Htable),$Hkey # $Hkey^1 352 vpaddd $Z1,$inout1,$inout3 353 vpshufb $Ii,$inout1,$inout1 354 vpaddd $Z1,$inout2,$inout4 355 vpshufb $Ii,$inout2,$inout2 356 vpxor $rndkey,$inout1,$inout1 357 vpaddd $Z1,$inout3,$inout5 358 vpshufb $Ii,$inout3,$inout3 359 vpxor $rndkey,$inout2,$inout2 360 vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value 361 vpshufb $Ii,$inout4,$inout4 362 vpshufb $Ii,$inout5,$inout5 363 vpshufb $Ii,$T1,$T1 # next counter value 364 jmp .Lresume_ctr32 365 366.align 32 367.Lenc_tail: 368 vaesenc $rndkey,$inout0,$inout0 369 vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi 370 vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase 371 vaesenc $rndkey,$inout1,$inout1 372 vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 373 vpxor 0x00($inp),$T1,$T2 374 vaesenc $rndkey,$inout2,$inout2 375 vpxor 0x10($inp),$T1,$Ii 376 vaesenc $rndkey,$inout3,$inout3 377 vpxor 0x20($inp),$T1,$Z1 378 vaesenc $rndkey,$inout4,$inout4 379 vpxor 0x30($inp),$T1,$Z2 380 vaesenc $rndkey,$inout5,$inout5 381 vpxor 0x40($inp),$T1,$Z3 382 vpxor 0x50($inp),$T1,$Hkey 383 vmovdqu ($ivp),$T1 # load next counter value 384 385 vaesenclast $T2,$inout0,$inout0 386 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb 387 vaesenclast $Ii,$inout1,$inout1 388 vpaddb $T2,$T1,$Ii 389 mov %r13,0x70+8(%rsp) 390 lea 0x60($inp),$inp 391 # These two prefetches were added in BoringSSL. See change that added them. 392 prefetcht0 512($inp) # We use 96-byte block so prefetch 2 lines (128 bytes) 393 prefetcht0 576($inp) 394 vaesenclast $Z1,$inout2,$inout2 395 vpaddb $T2,$Ii,$Z1 396 mov %r12,0x78+8(%rsp) 397 lea 0x60($out),$out 398 vmovdqu 0x00-0x80($key),$rndkey 399 vaesenclast $Z2,$inout3,$inout3 400 vpaddb $T2,$Z1,$Z2 401 vaesenclast $Z3, $inout4,$inout4 402 vpaddb $T2,$Z2,$Z3 403 vaesenclast $Hkey,$inout5,$inout5 404 vpaddb $T2,$Z3,$Hkey 405 406 add \$0x60,%rax 407 sub \$0x6,$len 408 jc .L6x_done 409 410 vmovups $inout0,-0x60($out) # save output 411 vpxor $rndkey,$T1,$inout0 412 vmovups $inout1,-0x50($out) 413 vmovdqa $Ii,$inout1 # 0 latency 414 vmovups $inout2,-0x40($out) 415 vmovdqa $Z1,$inout2 # 0 latency 416 vmovups $inout3,-0x30($out) 417 vmovdqa $Z2,$inout3 # 0 latency 418 vmovups $inout4,-0x20($out) 419 vmovdqa $Z3,$inout4 # 0 latency 420 vmovups $inout5,-0x10($out) 421 vmovdqa $Hkey,$inout5 # 0 latency 422 vmovdqu 0x20+8(%rsp),$Z3 # I[5] 423 jmp .Loop6x 424 425.L6x_done: 426 vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled 427 vpxor $Z0,$Xi,$Xi # modulo-scheduled 428 429 ret 430.cfi_endproc 431.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x 432___ 433###################################################################### 434# 435# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len, 436# const AES_KEY *key, unsigned char iv[16], const u128 *Htbl[9], 437# u128 *Xip); 438$code.=<<___; 439.globl aesni_gcm_decrypt 440.type aesni_gcm_decrypt,\@abi-omnipotent 441.align 32 442aesni_gcm_decrypt: 443.cfi_startproc 444.seh_startproc 445 _CET_ENDBR 446 xor %rax,%rax 447 448 # We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60) 449 # bytes of input. 450 cmp \$0x60,$len # minimal accepted length 451 jb .Lgcm_dec_abort 452 453 push %rbp 454.cfi_push %rbp 455.seh_pushreg %rbp 456 mov %rsp, %rbp # save stack pointer 457.cfi_def_cfa_register %rbp 458 push %rbx 459.cfi_push %rbx 460.seh_pushreg %rbx 461 push %r12 462.cfi_push %r12 463.seh_pushreg %r12 464 push %r13 465.cfi_push %r13 466.seh_pushreg %r13 467 push %r14 468.cfi_push %r14 469.seh_pushreg %r14 470 push %r15 471.cfi_push %r15 472.seh_pushreg %r15 473___ 474if ($win64) { 475$code.=<<___ 476 lea -0xa8(%rsp),%rsp # 8 extra bytes to align the stack 477.seh_stackalloc 0xa8 478.seh_setframe %rbp, 0xa8+5*8 479 # Load the last two parameters. These go into %rdi and %rsi, which are 480 # non-volatile on Windows, so stash them in the parameter stack area 481 # first. 482 mov %rdi, 0x10(%rbp) 483.seh_savereg %rdi, 0xa8+5*8+0x10 484 mov %rsi, 0x18(%rbp) 485.seh_savereg %rsi, 0xa8+5*8+0x18 486 mov 0x30(%rbp), $ivp 487 mov 0x38(%rbp), $Htable 488 # Save non-volatile XMM registers. 489 movaps %xmm6,-0xd0(%rbp) 490.seh_savexmm %xmm6, 0xa8+5*8-0xd0 491 movaps %xmm7,-0xc0(%rbp) 492.seh_savexmm %xmm7, 0xa8+5*8-0xc0 493 movaps %xmm8,-0xb0(%rbp) 494.seh_savexmm %xmm8, 0xa8+5*8-0xb0 495 movaps %xmm9,-0xa0(%rbp) 496.seh_savexmm %xmm9, 0xa8+5*8-0xa0 497 movaps %xmm10,-0x90(%rbp) 498.seh_savexmm %xmm10, 0xa8+5*8-0x90 499 movaps %xmm11,-0x80(%rbp) 500.seh_savexmm %xmm11, 0xa8+5*8-0x80 501 movaps %xmm12,-0x70(%rbp) 502.seh_savexmm %xmm12, 0xa8+5*8-0x70 503 movaps %xmm13,-0x60(%rbp) 504.seh_savexmm %xmm13, 0xa8+5*8-0x60 505 movaps %xmm14,-0x50(%rbp) 506.seh_savexmm %xmm14, 0xa8+5*8-0x50 507 movaps %xmm15,-0x40(%rbp) 508.seh_savexmm %xmm15, 0xa8+5*8-0x40 509.seh_endprologue 510___ 511} 512$code.=<<___; 513 vzeroupper 514 515 mov $Xip_offset(%rbp), %r12 516 vmovdqu ($ivp),$T1 # input counter value 517 add \$-128,%rsp 518 mov 12($ivp),$counter 519 lea .Lbswap_mask(%rip),$const 520 lea -0x80($key),$in0 # borrow $in0 521 mov \$0xf80,$end0 # borrow $end0 522 vmovdqu (%r12),$Xi # load Xi 523 and \$-128,%rsp # ensure stack alignment 524 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask 525 lea 0x80($key),$key # size optimization 526 lea 0x20($Htable),$Htable # size optimization 527 mov 0xf0-0x80($key),$rounds 528 vpshufb $Ii,$Xi,$Xi 529 530 and $end0,$in0 531 and %rsp,$end0 532 sub $in0,$end0 533 jc .Ldec_no_key_aliasing 534 cmp \$768,$end0 535 jnc .Ldec_no_key_aliasing 536 sub $end0,%rsp # avoid aliasing with key 537.Ldec_no_key_aliasing: 538 539 vmovdqu 0x50($inp),$Z3 # I[5] 540 mov $inp,$in0 541 vmovdqu 0x40($inp),$Z0 542 543 # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0) 544 # bytes before the end of the input. Note, in particular, that this is 545 # correct even if |$len| is not an even multiple of 96 or 16. XXX: This 546 # seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must 547 # not be near the very beginning of the address space when |$len| < 2*96 548 # (0xc0). 549 lea -0xc0($inp,$len),$end0 550 551 vmovdqu 0x30($inp),$Z1 552 shr \$4,$len 553 xor %rax,%rax 554 vmovdqu 0x20($inp),$Z2 555 vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x 556 vmovdqu 0x10($inp),$T2 557 vpshufb $Ii,$Z0,$Z0 558 vmovdqu ($inp),$Hkey 559 vpshufb $Ii,$Z1,$Z1 560 vmovdqu $Z0,0x30(%rsp) 561 vpshufb $Ii,$Z2,$Z2 562 vmovdqu $Z1,0x40(%rsp) 563 vpshufb $Ii,$T2,$T2 564 vmovdqu $Z2,0x50(%rsp) 565 vpshufb $Ii,$Hkey,$Hkey 566 vmovdqu $T2,0x60(%rsp) 567 vmovdqu $Hkey,0x70(%rsp) 568 569 call _aesni_ctr32_ghash_6x 570 571 mov $Xip_offset(%rbp), %r12 572 vmovups $inout0,-0x60($out) # save output 573 vmovups $inout1,-0x50($out) 574 vmovups $inout2,-0x40($out) 575 vmovups $inout3,-0x30($out) 576 vmovups $inout4,-0x20($out) 577 vmovups $inout5,-0x10($out) 578 579 vpshufb ($const),$Xi,$Xi # .Lbswap_mask 580 vmovdqu $Xi,(%r12) # output Xi 581 582 vzeroupper 583___ 584$code.=<<___ if ($win64); 585 movaps -0xd0(%rbp),%xmm6 586 movaps -0xc0(%rbp),%xmm7 587 movaps -0xb0(%rbp),%xmm8 588 movaps -0xa0(%rbp),%xmm9 589 movaps -0x90(%rbp),%xmm10 590 movaps -0x80(%rbp),%xmm11 591 movaps -0x70(%rbp),%xmm12 592 movaps -0x60(%rbp),%xmm13 593 movaps -0x50(%rbp),%xmm14 594 movaps -0x40(%rbp),%xmm15 595 mov 0x10(%rbp),%rdi 596 mov 0x18(%rbp),%rsi 597___ 598$code.=<<___; 599 lea -0x28(%rbp), %rsp # restore %rsp to fixed allocation 600.cfi_def_cfa %rsp, 0x38 601 pop %r15 602.cfi_pop %r15 603 pop %r14 604.cfi_pop %r14 605 pop %r13 606.cfi_pop %r13 607 pop %r12 608.cfi_pop %r12 609 pop %rbx 610.cfi_pop %rbx 611 pop %rbp 612.cfi_pop %rbp 613.Lgcm_dec_abort: 614 ret 615.seh_endproc 616.cfi_endproc 617.size aesni_gcm_decrypt,.-aesni_gcm_decrypt 618___ 619 620$code.=<<___; 621.type _aesni_ctr32_6x,\@abi-omnipotent 622.align 32 623_aesni_ctr32_6x: 624.cfi_startproc 625 vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey 626 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb 627 lea -1($rounds),%r13 628 vmovups 0x10-0x80($key),$rndkey 629 lea 0x20-0x80($key),%r12 630 vpxor $Z0,$T1,$inout0 631 add \$`6<<24`,$counter 632 jc .Lhandle_ctr32_2 633 vpaddb $T2,$T1,$inout1 634 vpaddb $T2,$inout1,$inout2 635 vpxor $Z0,$inout1,$inout1 636 vpaddb $T2,$inout2,$inout3 637 vpxor $Z0,$inout2,$inout2 638 vpaddb $T2,$inout3,$inout4 639 vpxor $Z0,$inout3,$inout3 640 vpaddb $T2,$inout4,$inout5 641 vpxor $Z0,$inout4,$inout4 642 vpaddb $T2,$inout5,$T1 643 vpxor $Z0,$inout5,$inout5 644 jmp .Loop_ctr32 645 646.align 16 647.Loop_ctr32: 648 vaesenc $rndkey,$inout0,$inout0 649 vaesenc $rndkey,$inout1,$inout1 650 vaesenc $rndkey,$inout2,$inout2 651 vaesenc $rndkey,$inout3,$inout3 652 vaesenc $rndkey,$inout4,$inout4 653 vaesenc $rndkey,$inout5,$inout5 654 vmovups (%r12),$rndkey 655 lea 0x10(%r12),%r12 656 dec %r13d 657 jnz .Loop_ctr32 658 659 vmovdqu (%r12),$Hkey # last round key 660 vaesenc $rndkey,$inout0,$inout0 661 vpxor 0x00($inp),$Hkey,$Z0 662 vaesenc $rndkey,$inout1,$inout1 663 vpxor 0x10($inp),$Hkey,$Z1 664 vaesenc $rndkey,$inout2,$inout2 665 vpxor 0x20($inp),$Hkey,$Z2 666 vaesenc $rndkey,$inout3,$inout3 667 vpxor 0x30($inp),$Hkey,$Xi 668 vaesenc $rndkey,$inout4,$inout4 669 vpxor 0x40($inp),$Hkey,$T2 670 vaesenc $rndkey,$inout5,$inout5 671 vpxor 0x50($inp),$Hkey,$Hkey 672 lea 0x60($inp),$inp 673 674 vaesenclast $Z0,$inout0,$inout0 675 vaesenclast $Z1,$inout1,$inout1 676 vaesenclast $Z2,$inout2,$inout2 677 vaesenclast $Xi,$inout3,$inout3 678 vaesenclast $T2,$inout4,$inout4 679 vaesenclast $Hkey,$inout5,$inout5 680 vmovups $inout0,0x00($out) 681 vmovups $inout1,0x10($out) 682 vmovups $inout2,0x20($out) 683 vmovups $inout3,0x30($out) 684 vmovups $inout4,0x40($out) 685 vmovups $inout5,0x50($out) 686 lea 0x60($out),$out 687 688 ret 689.align 32 690.Lhandle_ctr32_2: 691 vpshufb $Ii,$T1,$Z2 # byte-swap counter 692 vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb 693 vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb 694 vpaddd $Z1,$Z2,$inout2 695 vpaddd $Z1,$inout1,$inout3 696 vpshufb $Ii,$inout1,$inout1 697 vpaddd $Z1,$inout2,$inout4 698 vpshufb $Ii,$inout2,$inout2 699 vpxor $Z0,$inout1,$inout1 700 vpaddd $Z1,$inout3,$inout5 701 vpshufb $Ii,$inout3,$inout3 702 vpxor $Z0,$inout2,$inout2 703 vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value 704 vpshufb $Ii,$inout4,$inout4 705 vpxor $Z0,$inout3,$inout3 706 vpshufb $Ii,$inout5,$inout5 707 vpxor $Z0,$inout4,$inout4 708 vpshufb $Ii,$T1,$T1 # next counter value 709 vpxor $Z0,$inout5,$inout5 710 jmp .Loop_ctr32 711.cfi_endproc 712.size _aesni_ctr32_6x,.-_aesni_ctr32_6x 713 714.globl aesni_gcm_encrypt 715.type aesni_gcm_encrypt,\@abi-omnipotent 716.align 32 717aesni_gcm_encrypt: 718.cfi_startproc 719.seh_startproc 720 _CET_ENDBR 721#ifdef BORINGSSL_DISPATCH_TEST 722.extern BORINGSSL_function_hit 723 movb \$1,BORINGSSL_function_hit+2(%rip) 724#endif 725 xor %rax,%rax 726 727 # We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of 728 # input. Then we call |_aesni_ctr32_ghash_6x|, which requires at 729 # least 96 more bytes of input. 730 cmp \$0x60*3,$len # minimal accepted length 731 jb .Lgcm_enc_abort 732 733 push %rbp 734.cfi_push %rbp 735.seh_pushreg %rbp 736 mov %rsp, %rbp # save stack pointer 737.cfi_def_cfa_register %rbp 738 push %rbx 739.cfi_push %rbx 740.seh_pushreg %rbx 741 push %r12 742.cfi_push %r12 743.seh_pushreg %r12 744 push %r13 745.cfi_push %r13 746.seh_pushreg %r13 747 push %r14 748.cfi_push %r14 749.seh_pushreg %r14 750 push %r15 751.cfi_push %r15 752.seh_pushreg %r15 753___ 754if ($win64) { 755$code.=<<___ 756 lea -0xa8(%rsp),%rsp # 8 extra bytes to align the stack 757.seh_stackalloc 0xa8 758.seh_setframe %rbp, 0xa8+5*8 759 # Load the last two parameters. These go into %rdi and %rsi, which are 760 # non-volatile on Windows, so stash them in the parameter stack area 761 # first. 762 mov %rdi, 0x10(%rbp) 763.seh_savereg %rdi, 0xa8+5*8+0x10 764 mov %rsi, 0x18(%rbp) 765.seh_savereg %rsi, 0xa8+5*8+0x18 766 mov 0x30(%rbp), $ivp 767 mov 0x38(%rbp), $Htable 768 # Save non-volatile XMM registers. 769 movaps %xmm6,-0xd0(%rbp) 770.seh_savexmm %xmm6, 0xa8+5*8-0xd0 771 movaps %xmm7,-0xc0(%rbp) 772.seh_savexmm %xmm7, 0xa8+5*8-0xc0 773 movaps %xmm8,-0xb0(%rbp) 774.seh_savexmm %xmm8, 0xa8+5*8-0xb0 775 movaps %xmm9,-0xa0(%rbp) 776.seh_savexmm %xmm9, 0xa8+5*8-0xa0 777 movaps %xmm10,-0x90(%rbp) 778.seh_savexmm %xmm10, 0xa8+5*8-0x90 779 movaps %xmm11,-0x80(%rbp) 780.seh_savexmm %xmm11, 0xa8+5*8-0x80 781 movaps %xmm12,-0x70(%rbp) 782.seh_savexmm %xmm12, 0xa8+5*8-0x70 783 movaps %xmm13,-0x60(%rbp) 784.seh_savexmm %xmm13, 0xa8+5*8-0x60 785 movaps %xmm14,-0x50(%rbp) 786.seh_savexmm %xmm14, 0xa8+5*8-0x50 787 movaps %xmm15,-0x40(%rbp) 788.seh_savexmm %xmm15, 0xa8+5*8-0x40 789.seh_endprologue 790___ 791} 792$code.=<<___; 793 vzeroupper 794 795 vmovdqu ($ivp),$T1 # input counter value 796 add \$-128,%rsp 797 mov 12($ivp),$counter 798 lea .Lbswap_mask(%rip),$const 799 lea -0x80($key),$in0 # borrow $in0 800 mov \$0xf80,$end0 # borrow $end0 801 lea 0x80($key),$key # size optimization 802 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask 803 and \$-128,%rsp # ensure stack alignment 804 mov 0xf0-0x80($key),$rounds 805 806 and $end0,$in0 807 and %rsp,$end0 808 sub $in0,$end0 809 jc .Lenc_no_key_aliasing 810 cmp \$768,$end0 811 jnc .Lenc_no_key_aliasing 812 sub $end0,%rsp # avoid aliasing with key 813.Lenc_no_key_aliasing: 814 815 mov $out,$in0 816 817 # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0) 818 # bytes before the end of the input. Note, in particular, that this is 819 # correct even if |$len| is not an even multiple of 96 or 16. Unlike in 820 # the decryption case, there's no caveat that |$out| must not be near 821 # the very beginning of the address space, because we know that 822 # |$len| >= 3*96 from the check above, and so we know 823 # |$out| + |$len| >= 2*96 (0xc0). 824 lea -0xc0($out,$len),$end0 825 826 shr \$4,$len 827 828 call _aesni_ctr32_6x 829 vpshufb $Ii,$inout0,$Xi # save bswapped output on stack 830 vpshufb $Ii,$inout1,$T2 831 vmovdqu $Xi,0x70(%rsp) 832 vpshufb $Ii,$inout2,$Z0 833 vmovdqu $T2,0x60(%rsp) 834 vpshufb $Ii,$inout3,$Z1 835 vmovdqu $Z0,0x50(%rsp) 836 vpshufb $Ii,$inout4,$Z2 837 vmovdqu $Z1,0x40(%rsp) 838 vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x 839 vmovdqu $Z2,0x30(%rsp) 840 841 call _aesni_ctr32_6x 842 843 mov $Xip_offset(%rbp), %r12 844 lea 0x20($Htable),$Htable # size optimization 845 vmovdqu (%r12),$Xi # load Xi 846 sub \$12,$len 847 mov \$0x60*2,%rax 848 vpshufb $Ii,$Xi,$Xi 849 850 call _aesni_ctr32_ghash_6x 851 vmovdqu 0x20(%rsp),$Z3 # I[5] 852 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask 853 vmovdqu 0x00-0x20($Htable),$Hkey # $Hkey^1 854 vpunpckhqdq $Z3,$Z3,$T1 855 vmovdqu 0x20-0x20($Htable),$rndkey # borrow $rndkey for $HK 856 vmovups $inout0,-0x60($out) # save output 857 vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy 858 vpxor $Z3,$T1,$T1 859 vmovups $inout1,-0x50($out) 860 vpshufb $Ii,$inout1,$inout1 861 vmovups $inout2,-0x40($out) 862 vpshufb $Ii,$inout2,$inout2 863 vmovups $inout3,-0x30($out) 864 vpshufb $Ii,$inout3,$inout3 865 vmovups $inout4,-0x20($out) 866 vpshufb $Ii,$inout4,$inout4 867 vmovups $inout5,-0x10($out) 868 vpshufb $Ii,$inout5,$inout5 869 vmovdqu $inout0,0x10(%rsp) # free $inout0 870___ 871{ my ($HK,$T3)=($rndkey,$inout0); 872 873$code.=<<___; 874 vmovdqu 0x30(%rsp),$Z2 # I[4] 875 vmovdqu 0x10-0x20($Htable),$Ii # borrow $Ii for $Hkey^2 876 vpunpckhqdq $Z2,$Z2,$T2 877 vpclmulqdq \$0x00,$Hkey,$Z3,$Z1 878 vpxor $Z2,$T2,$T2 879 vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 880 vpclmulqdq \$0x00,$HK,$T1,$T1 881 882 vmovdqu 0x40(%rsp),$T3 # I[3] 883 vpclmulqdq \$0x00,$Ii,$Z2,$Z0 884 vmovdqu 0x30-0x20($Htable),$Hkey # $Hkey^3 885 vpxor $Z1,$Z0,$Z0 886 vpunpckhqdq $T3,$T3,$Z1 887 vpclmulqdq \$0x11,$Ii,$Z2,$Z2 888 vpxor $T3,$Z1,$Z1 889 vpxor $Z3,$Z2,$Z2 890 vpclmulqdq \$0x10,$HK,$T2,$T2 891 vmovdqu 0x50-0x20($Htable),$HK 892 vpxor $T1,$T2,$T2 893 894 vmovdqu 0x50(%rsp),$T1 # I[2] 895 vpclmulqdq \$0x00,$Hkey,$T3,$Z3 896 vmovdqu 0x40-0x20($Htable),$Ii # borrow $Ii for $Hkey^4 897 vpxor $Z0,$Z3,$Z3 898 vpunpckhqdq $T1,$T1,$Z0 899 vpclmulqdq \$0x11,$Hkey,$T3,$T3 900 vpxor $T1,$Z0,$Z0 901 vpxor $Z2,$T3,$T3 902 vpclmulqdq \$0x00,$HK,$Z1,$Z1 903 vpxor $T2,$Z1,$Z1 904 905 vmovdqu 0x60(%rsp),$T2 # I[1] 906 vpclmulqdq \$0x00,$Ii,$T1,$Z2 907 vmovdqu 0x60-0x20($Htable),$Hkey # $Hkey^5 908 vpxor $Z3,$Z2,$Z2 909 vpunpckhqdq $T2,$T2,$Z3 910 vpclmulqdq \$0x11,$Ii,$T1,$T1 911 vpxor $T2,$Z3,$Z3 912 vpxor $T3,$T1,$T1 913 vpclmulqdq \$0x10,$HK,$Z0,$Z0 914 vmovdqu 0x80-0x20($Htable),$HK 915 vpxor $Z1,$Z0,$Z0 916 917 vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0] 918 vpclmulqdq \$0x00,$Hkey,$T2,$Z1 919 vmovdqu 0x70-0x20($Htable),$Ii # borrow $Ii for $Hkey^6 920 vpunpckhqdq $Xi,$Xi,$T3 921 vpxor $Z2,$Z1,$Z1 922 vpclmulqdq \$0x11,$Hkey,$T2,$T2 923 vpxor $Xi,$T3,$T3 924 vpxor $T1,$T2,$T2 925 vpclmulqdq \$0x00,$HK,$Z3,$Z3 926 vpxor $Z0,$Z3,$Z0 927 928 vpclmulqdq \$0x00,$Ii,$Xi,$Z2 929 vmovdqu 0x00-0x20($Htable),$Hkey # $Hkey^1 930 vpunpckhqdq $inout5,$inout5,$T1 931 vpclmulqdq \$0x11,$Ii,$Xi,$Xi 932 vpxor $inout5,$T1,$T1 933 vpxor $Z1,$Z2,$Z1 934 vpclmulqdq \$0x10,$HK,$T3,$T3 935 vmovdqu 0x20-0x20($Htable),$HK 936 vpxor $T2,$Xi,$Z3 937 vpxor $Z0,$T3,$Z2 938 939 vmovdqu 0x10-0x20($Htable),$Ii # borrow $Ii for $Hkey^2 940 vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing 941 vpclmulqdq \$0x00,$Hkey,$inout5,$Z0 942 vpxor $T3,$Z2,$Z2 943 vpunpckhqdq $inout4,$inout4,$T2 944 vpclmulqdq \$0x11,$Hkey,$inout5,$inout5 945 vpxor $inout4,$T2,$T2 946 vpslldq \$8,$Z2,$T3 947 vpclmulqdq \$0x00,$HK,$T1,$T1 948 vpxor $T3,$Z1,$Xi 949 vpsrldq \$8,$Z2,$Z2 950 vpxor $Z2,$Z3,$Z3 951 952 vpclmulqdq \$0x00,$Ii,$inout4,$Z1 953 vmovdqu 0x30-0x20($Htable),$Hkey # $Hkey^3 954 vpxor $Z0,$Z1,$Z1 955 vpunpckhqdq $inout3,$inout3,$T3 956 vpclmulqdq \$0x11,$Ii,$inout4,$inout4 957 vpxor $inout3,$T3,$T3 958 vpxor $inout5,$inout4,$inout4 959 vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase 960 vpclmulqdq \$0x10,$HK,$T2,$T2 961 vmovdqu 0x50-0x20($Htable),$HK 962 vpxor $T1,$T2,$T2 963 964 vpclmulqdq \$0x00,$Hkey,$inout3,$Z0 965 vmovdqu 0x40-0x20($Htable),$Ii # borrow $Ii for $Hkey^4 966 vpxor $Z1,$Z0,$Z0 967 vpunpckhqdq $inout2,$inout2,$T1 968 vpclmulqdq \$0x11,$Hkey,$inout3,$inout3 969 vpxor $inout2,$T1,$T1 970 vpxor $inout4,$inout3,$inout3 971 vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0 972 vpclmulqdq \$0x00,$HK,$T3,$T3 973 vpxor $T2,$T3,$T3 974 975 vpclmulqdq \$0x10,0x10($const),$Xi,$Xi 976 vxorps $inout5,$Xi,$Xi 977 978 vpclmulqdq \$0x00,$Ii,$inout2,$Z1 979 vmovdqu 0x60-0x20($Htable),$Hkey # $Hkey^5 980 vpxor $Z0,$Z1,$Z1 981 vpunpckhqdq $inout1,$inout1,$T2 982 vpclmulqdq \$0x11,$Ii,$inout2,$inout2 983 vpxor $inout1,$T2,$T2 984 vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase 985 vpxor $inout3,$inout2,$inout2 986 vpclmulqdq \$0x10,$HK,$T1,$T1 987 vmovdqu 0x80-0x20($Htable),$HK 988 vpxor $T3,$T1,$T1 989 990 vxorps $Z3,$inout5,$inout5 991 vpclmulqdq \$0x10,0x10($const),$Xi,$Xi 992 vxorps $inout5,$Xi,$Xi 993 994 vpclmulqdq \$0x00,$Hkey,$inout1,$Z0 995 vmovdqu 0x70-0x20($Htable),$Ii # borrow $Ii for $Hkey^6 996 vpxor $Z1,$Z0,$Z0 997 vpunpckhqdq $Xi,$Xi,$T3 998 vpclmulqdq \$0x11,$Hkey,$inout1,$inout1 999 vpxor $Xi,$T3,$T3 1000 vpxor $inout2,$inout1,$inout1 1001 vpclmulqdq \$0x00,$HK,$T2,$T2 1002 vpxor $T1,$T2,$T2 1003 1004 vpclmulqdq \$0x00,$Ii,$Xi,$Z1 1005 vpclmulqdq \$0x11,$Ii,$Xi,$Z3 1006 vpxor $Z0,$Z1,$Z1 1007 vpclmulqdq \$0x10,$HK,$T3,$Z2 1008 vpxor $inout1,$Z3,$Z3 1009 vpxor $T2,$Z2,$Z2 1010 1011 vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing 1012 vpxor $Z0,$Z2,$Z2 1013 vpslldq \$8,$Z2,$T1 1014 vmovdqu 0x10($const),$Hkey # .Lpoly 1015 vpsrldq \$8,$Z2,$Z2 1016 vpxor $T1,$Z1,$Xi 1017 vpxor $Z2,$Z3,$Z3 1018 1019 vpalignr \$8,$Xi,$Xi,$T2 # 1st phase 1020 vpclmulqdq \$0x10,$Hkey,$Xi,$Xi 1021 vpxor $T2,$Xi,$Xi 1022 1023 vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase 1024 vpclmulqdq \$0x10,$Hkey,$Xi,$Xi 1025 vpxor $Z3,$T2,$T2 1026 vpxor $T2,$Xi,$Xi 1027___ 1028} 1029$code.=<<___; 1030 mov $Xip_offset(%rbp), %r12 1031 vpshufb ($const),$Xi,$Xi # .Lbswap_mask 1032 vmovdqu $Xi,(%r12) # output Xi 1033 1034 vzeroupper 1035___ 1036$code.=<<___ if ($win64); 1037 movaps -0xd0(%rbp),%xmm6 1038 movaps -0xc0(%rbp),%xmm7 1039 movaps -0xb0(%rbp),%xmm8 1040 movaps -0xa0(%rbp),%xmm9 1041 movaps -0x90(%rbp),%xmm10 1042 movaps -0x80(%rbp),%xmm11 1043 movaps -0x70(%rbp),%xmm12 1044 movaps -0x60(%rbp),%xmm13 1045 movaps -0x50(%rbp),%xmm14 1046 movaps -0x40(%rbp),%xmm15 1047 mov 0x10(%rbp),%rdi 1048 mov 0x18(%rbp),%rsi 1049___ 1050$code.=<<___; 1051 lea -0x28(%rbp), %rsp # restore %rsp to fixed allocation 1052.cfi_def_cfa %rsp, 0x38 1053 pop %r15 1054.cfi_pop %r15 1055 pop %r14 1056.cfi_pop %r14 1057 pop %r13 1058.cfi_pop %r13 1059 pop %r12 1060.cfi_pop %r12 1061 pop %rbx 1062.cfi_pop %rbx 1063 pop %rbp 1064.cfi_pop %rbp 1065.Lgcm_enc_abort: 1066 ret 1067.seh_endproc 1068.cfi_endproc 1069.size aesni_gcm_encrypt,.-aesni_gcm_encrypt 1070___ 1071 1072$code.=<<___; 1073.section .rodata 1074.align 64 1075.Lbswap_mask: 1076 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 1077.Lpoly: 1078 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 1079.Lone_msb: 1080 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 1081.Ltwo_lsb: 1082 .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 1083.Lone_lsb: 1084 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 1085.asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 1086.align 64 1087.text 1088___ 1089}}} else {{{ 1090$code=<<___; # assembler is too old 1091.text 1092 1093.globl aesni_gcm_encrypt 1094.type aesni_gcm_encrypt,\@abi-omnipotent 1095aesni_gcm_encrypt: 1096 _CET_ENDBR 1097 xor %eax,%eax 1098 ret 1099.size aesni_gcm_encrypt,.-aesni_gcm_encrypt 1100 1101.globl aesni_gcm_decrypt 1102.type aesni_gcm_decrypt,\@abi-omnipotent 1103aesni_gcm_decrypt: 1104 _CET_ENDBR 1105 xor %eax,%eax 1106 ret 1107.size aesni_gcm_decrypt,.-aesni_gcm_decrypt 1108___ 1109}}} 1110 1111$code =~ s/\`([^\`]*)\`/eval($1)/gem; 1112 1113print $code; 1114 1115close STDOUT or die "error closing STDOUT: $!"; 1116