1#! /usr/bin/env perl 2# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# ==================================================================== 16# 17# 18# AES-NI-CTR+GHASH stitch. 19# 20# February 2013 21# 22# OpenSSL GCM implementation is organized in such way that its 23# performance is rather close to the sum of its streamed components, 24# in the context parallelized AES-NI CTR and modulo-scheduled 25# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation 26# was observed to perform significantly better than the sum of the 27# components on contemporary CPUs, the effort was deemed impossible to 28# justify. This module is based on combination of Intel submissions, 29# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max 30# Locktyukhin of Intel Corp. who verified that it reduces shuffles 31# pressure with notable relative improvement, achieving 1.0 cycle per 32# byte processed with 128-bit key on Haswell processor, 0.74 - on 33# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled 34# measurements for favourable packet size, one divisible by 96. 35# Applications using the EVP interface will observe a few percent 36# worse performance.] 37# 38# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). 39# 40# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest 41# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf 42 43$flavour = shift; 44$output = shift; 45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 46 47$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 48 49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 51( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or 52die "can't locate x86_64-xlate.pl"; 53 54# |$avx| in ghash-x86_64.pl must be set to at least 1; otherwise tags will 55# be computed incorrectly. 56# 57# In upstream, this is controlled by shelling out to the compiler to check 58# versions, but BoringSSL is intended to be used with pre-generated perlasm 59# output, so this isn't useful anyway. 60# 61# The upstream code uses the condition |$avx>1| even though no AVX2 62# instructions are used, because it assumes MOVBE is supported by the assembler 63# if and only if AVX2 is also supported by the assembler; see 64# https://marc.info/?l=openssl-dev&m=146567589526984&w=2. 65$avx = 2; 66 67open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 68*STDOUT=*OUT; 69 70# See the comment above regarding why the condition is ($avx>1) when there are 71# no AVX2 instructions being used. 72if ($avx>1) {{{ 73 74# On Windows, only four parameters are passed in registers. The last two 75# parameters will be manually loaded into %rdi and %rsi. 76my ($inp, $out, $len, $key, $ivp, $Xip) = 77 $win64 ? ("%rcx", "%rdx", "%r8", "%r9", "%rdi", "%rsi") : 78 ("%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9"); 79 80($Ii,$T1,$T2,$Hkey, 81 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8)); 82 83($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15)); 84 85($counter,$rounds,$const,$in0,$end0)=("%ebx","%r10d","%r11","%r14","%r15"); 86 87$code=<<___; 88.text 89 90.type _aesni_ctr32_ghash_6x,\@abi-omnipotent 91.align 32 92_aesni_ctr32_ghash_6x: 93.cfi_startproc 94 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb 95 sub \$6,$len 96 vpxor $Z0,$Z0,$Z0 # $Z0 = 0 97 vmovdqu 0x00-0x80($key),$rndkey 98 vpaddb $T2,$T1,$inout1 99 vpaddb $T2,$inout1,$inout2 100 vpaddb $T2,$inout2,$inout3 101 vpaddb $T2,$inout3,$inout4 102 vpaddb $T2,$inout4,$inout5 103 vpxor $rndkey,$T1,$inout0 104 vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0 105 jmp .Loop6x 106 107.align 32 108.Loop6x: 109 add \$`6<<24`,$counter 110 jc .Lhandle_ctr32 # discard $inout[1-5]? 111 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 112 vpaddb $T2,$inout5,$T1 # next counter value 113 vpxor $rndkey,$inout1,$inout1 114 vpxor $rndkey,$inout2,$inout2 115 116.Lresume_ctr32: 117 vmovdqu $T1,($ivp) # save next counter value 118 vpclmulqdq \$0x10,$Hkey,$Z3,$Z1 119 vpxor $rndkey,$inout3,$inout3 120 vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey 121 vpclmulqdq \$0x01,$Hkey,$Z3,$Z2 122 123 # At this point, the current block of 96 (0x60) bytes has already been 124 # loaded into registers. Concurrently with processing it, we want to 125 # load the next 96 bytes of input for the next round. Obviously, we can 126 # only do this if there are at least 96 more bytes of input beyond the 127 # input we're currently processing, or else we'd read past the end of 128 # the input buffer. Here, we set |%r12| to 96 if there are at least 96 129 # bytes of input beyond the 96 bytes we're already processing, and we 130 # set |%r12| to 0 otherwise. In the case where we set |%r12| to 96, 131 # we'll read in the next block so that it is in registers for the next 132 # loop iteration. In the case where we set |%r12| to 0, we'll re-read 133 # the current block and then ignore what we re-read. 134 # 135 # At this point, |$in0| points to the current (already read into 136 # registers) block, and |$end0| points to 2*96 bytes before the end of 137 # the input. Thus, |$in0| > |$end0| means that we do not have the next 138 # 96-byte block to read in, and |$in0| <= |$end0| means we do. 139 xor %r12,%r12 140 cmp $in0,$end0 141 142 vaesenc $T2,$inout0,$inout0 143 vmovdqu 0x30+8(%rsp),$Ii # I[4] 144 vpxor $rndkey,$inout4,$inout4 145 vpclmulqdq \$0x00,$Hkey,$Z3,$T1 146 vaesenc $T2,$inout1,$inout1 147 vpxor $rndkey,$inout5,$inout5 148 setnc %r12b 149 vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 150 vaesenc $T2,$inout2,$inout2 151 vmovdqu 0x10-0x20($Xip),$Hkey # $Hkey^2 152 neg %r12 153 vaesenc $T2,$inout3,$inout3 154 vpxor $Z1,$Z2,$Z2 155 vpclmulqdq \$0x00,$Hkey,$Ii,$Z1 156 vpxor $Z0,$Xi,$Xi # modulo-scheduled 157 vaesenc $T2,$inout4,$inout4 158 vpxor $Z1,$T1,$Z0 159 and \$0x60,%r12 160 vmovups 0x20-0x80($key),$rndkey 161 vpclmulqdq \$0x10,$Hkey,$Ii,$T1 162 vaesenc $T2,$inout5,$inout5 163 164 vpclmulqdq \$0x01,$Hkey,$Ii,$T2 165 lea ($in0,%r12),$in0 166 vaesenc $rndkey,$inout0,$inout0 167 vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi] 168 vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey 169 vmovdqu 0x40+8(%rsp),$Ii # I[3] 170 vaesenc $rndkey,$inout1,$inout1 171 movbe 0x58($in0),%r13 172 vaesenc $rndkey,$inout2,$inout2 173 movbe 0x50($in0),%r12 174 vaesenc $rndkey,$inout3,$inout3 175 mov %r13,0x20+8(%rsp) 176 vaesenc $rndkey,$inout4,$inout4 177 mov %r12,0x28+8(%rsp) 178 vmovdqu 0x30-0x20($Xip),$Z1 # borrow $Z1 for $Hkey^3 179 vaesenc $rndkey,$inout5,$inout5 180 181 vmovups 0x30-0x80($key),$rndkey 182 vpxor $T1,$Z2,$Z2 183 vpclmulqdq \$0x00,$Z1,$Ii,$T1 184 vaesenc $rndkey,$inout0,$inout0 185 vpxor $T2,$Z2,$Z2 186 vpclmulqdq \$0x10,$Z1,$Ii,$T2 187 vaesenc $rndkey,$inout1,$inout1 188 vpxor $Hkey,$Z3,$Z3 189 vpclmulqdq \$0x01,$Z1,$Ii,$Hkey 190 vaesenc $rndkey,$inout2,$inout2 191 vpclmulqdq \$0x11,$Z1,$Ii,$Z1 192 vmovdqu 0x50+8(%rsp),$Ii # I[2] 193 vaesenc $rndkey,$inout3,$inout3 194 vaesenc $rndkey,$inout4,$inout4 195 vpxor $T1,$Z0,$Z0 196 vmovdqu 0x40-0x20($Xip),$T1 # borrow $T1 for $Hkey^4 197 vaesenc $rndkey,$inout5,$inout5 198 199 vmovups 0x40-0x80($key),$rndkey 200 vpxor $T2,$Z2,$Z2 201 vpclmulqdq \$0x00,$T1,$Ii,$T2 202 vaesenc $rndkey,$inout0,$inout0 203 vpxor $Hkey,$Z2,$Z2 204 vpclmulqdq \$0x10,$T1,$Ii,$Hkey 205 vaesenc $rndkey,$inout1,$inout1 206 movbe 0x48($in0),%r13 207 vpxor $Z1,$Z3,$Z3 208 vpclmulqdq \$0x01,$T1,$Ii,$Z1 209 vaesenc $rndkey,$inout2,$inout2 210 movbe 0x40($in0),%r12 211 vpclmulqdq \$0x11,$T1,$Ii,$T1 212 vmovdqu 0x60+8(%rsp),$Ii # I[1] 213 vaesenc $rndkey,$inout3,$inout3 214 mov %r13,0x30+8(%rsp) 215 vaesenc $rndkey,$inout4,$inout4 216 mov %r12,0x38+8(%rsp) 217 vpxor $T2,$Z0,$Z0 218 vmovdqu 0x60-0x20($Xip),$T2 # borrow $T2 for $Hkey^5 219 vaesenc $rndkey,$inout5,$inout5 220 221 vmovups 0x50-0x80($key),$rndkey 222 vpxor $Hkey,$Z2,$Z2 223 vpclmulqdq \$0x00,$T2,$Ii,$Hkey 224 vaesenc $rndkey,$inout0,$inout0 225 vpxor $Z1,$Z2,$Z2 226 vpclmulqdq \$0x10,$T2,$Ii,$Z1 227 vaesenc $rndkey,$inout1,$inout1 228 movbe 0x38($in0),%r13 229 vpxor $T1,$Z3,$Z3 230 vpclmulqdq \$0x01,$T2,$Ii,$T1 231 vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0] 232 vaesenc $rndkey,$inout2,$inout2 233 movbe 0x30($in0),%r12 234 vpclmulqdq \$0x11,$T2,$Ii,$T2 235 vaesenc $rndkey,$inout3,$inout3 236 mov %r13,0x40+8(%rsp) 237 vaesenc $rndkey,$inout4,$inout4 238 mov %r12,0x48+8(%rsp) 239 vpxor $Hkey,$Z0,$Z0 240 vmovdqu 0x70-0x20($Xip),$Hkey # $Hkey^6 241 vaesenc $rndkey,$inout5,$inout5 242 243 vmovups 0x60-0x80($key),$rndkey 244 vpxor $Z1,$Z2,$Z2 245 vpclmulqdq \$0x10,$Hkey,$Xi,$Z1 246 vaesenc $rndkey,$inout0,$inout0 247 vpxor $T1,$Z2,$Z2 248 vpclmulqdq \$0x01,$Hkey,$Xi,$T1 249 vaesenc $rndkey,$inout1,$inout1 250 movbe 0x28($in0),%r13 251 vpxor $T2,$Z3,$Z3 252 vpclmulqdq \$0x00,$Hkey,$Xi,$T2 253 vaesenc $rndkey,$inout2,$inout2 254 movbe 0x20($in0),%r12 255 vpclmulqdq \$0x11,$Hkey,$Xi,$Xi 256 vaesenc $rndkey,$inout3,$inout3 257 mov %r13,0x50+8(%rsp) 258 vaesenc $rndkey,$inout4,$inout4 259 mov %r12,0x58+8(%rsp) 260 vpxor $Z1,$Z2,$Z2 261 vaesenc $rndkey,$inout5,$inout5 262 vpxor $T1,$Z2,$Z2 263 264 vmovups 0x70-0x80($key),$rndkey 265 vpslldq \$8,$Z2,$Z1 266 vpxor $T2,$Z0,$Z0 267 vmovdqu 0x10($const),$Hkey # .Lpoly 268 269 vaesenc $rndkey,$inout0,$inout0 270 vpxor $Xi,$Z3,$Z3 271 vaesenc $rndkey,$inout1,$inout1 272 vpxor $Z1,$Z0,$Z0 273 movbe 0x18($in0),%r13 274 vaesenc $rndkey,$inout2,$inout2 275 movbe 0x10($in0),%r12 276 vpalignr \$8,$Z0,$Z0,$Ii # 1st phase 277 vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 278 mov %r13,0x60+8(%rsp) 279 vaesenc $rndkey,$inout3,$inout3 280 mov %r12,0x68+8(%rsp) 281 vaesenc $rndkey,$inout4,$inout4 282 vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey 283 vaesenc $rndkey,$inout5,$inout5 284 285 vaesenc $T1,$inout0,$inout0 286 vmovups 0x90-0x80($key),$rndkey 287 vaesenc $T1,$inout1,$inout1 288 vpsrldq \$8,$Z2,$Z2 289 vaesenc $T1,$inout2,$inout2 290 vpxor $Z2,$Z3,$Z3 291 vaesenc $T1,$inout3,$inout3 292 vpxor $Ii,$Z0,$Z0 293 movbe 0x08($in0),%r13 294 vaesenc $T1,$inout4,$inout4 295 movbe 0x00($in0),%r12 296 vaesenc $T1,$inout5,$inout5 297 vmovups 0xa0-0x80($key),$T1 298 cmp \$11,$rounds 299 jb .Lenc_tail # 128-bit key 300 301 vaesenc $rndkey,$inout0,$inout0 302 vaesenc $rndkey,$inout1,$inout1 303 vaesenc $rndkey,$inout2,$inout2 304 vaesenc $rndkey,$inout3,$inout3 305 vaesenc $rndkey,$inout4,$inout4 306 vaesenc $rndkey,$inout5,$inout5 307 308 vaesenc $T1,$inout0,$inout0 309 vaesenc $T1,$inout1,$inout1 310 vaesenc $T1,$inout2,$inout2 311 vaesenc $T1,$inout3,$inout3 312 vaesenc $T1,$inout4,$inout4 313 vmovups 0xb0-0x80($key),$rndkey 314 vaesenc $T1,$inout5,$inout5 315 vmovups 0xc0-0x80($key),$T1 316 je .Lenc_tail # 192-bit key 317 318 vaesenc $rndkey,$inout0,$inout0 319 vaesenc $rndkey,$inout1,$inout1 320 vaesenc $rndkey,$inout2,$inout2 321 vaesenc $rndkey,$inout3,$inout3 322 vaesenc $rndkey,$inout4,$inout4 323 vaesenc $rndkey,$inout5,$inout5 324 325 vaesenc $T1,$inout0,$inout0 326 vaesenc $T1,$inout1,$inout1 327 vaesenc $T1,$inout2,$inout2 328 vaesenc $T1,$inout3,$inout3 329 vaesenc $T1,$inout4,$inout4 330 vmovups 0xd0-0x80($key),$rndkey 331 vaesenc $T1,$inout5,$inout5 332 vmovups 0xe0-0x80($key),$T1 333 jmp .Lenc_tail # 256-bit key 334 335.align 32 336.Lhandle_ctr32: 337 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask 338 vpshufb $Ii,$T1,$Z2 # byte-swap counter 339 vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb 340 vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb 341 vpaddd $Z1,$Z2,$inout2 342 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 343 vpaddd $Z1,$inout1,$inout3 344 vpshufb $Ii,$inout1,$inout1 345 vpaddd $Z1,$inout2,$inout4 346 vpshufb $Ii,$inout2,$inout2 347 vpxor $rndkey,$inout1,$inout1 348 vpaddd $Z1,$inout3,$inout5 349 vpshufb $Ii,$inout3,$inout3 350 vpxor $rndkey,$inout2,$inout2 351 vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value 352 vpshufb $Ii,$inout4,$inout4 353 vpshufb $Ii,$inout5,$inout5 354 vpshufb $Ii,$T1,$T1 # next counter value 355 jmp .Lresume_ctr32 356 357.align 32 358.Lenc_tail: 359 vaesenc $rndkey,$inout0,$inout0 360 vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi 361 vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase 362 vaesenc $rndkey,$inout1,$inout1 363 vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 364 vpxor 0x00($inp),$T1,$T2 365 vaesenc $rndkey,$inout2,$inout2 366 vpxor 0x10($inp),$T1,$Ii 367 vaesenc $rndkey,$inout3,$inout3 368 vpxor 0x20($inp),$T1,$Z1 369 vaesenc $rndkey,$inout4,$inout4 370 vpxor 0x30($inp),$T1,$Z2 371 vaesenc $rndkey,$inout5,$inout5 372 vpxor 0x40($inp),$T1,$Z3 373 vpxor 0x50($inp),$T1,$Hkey 374 vmovdqu ($ivp),$T1 # load next counter value 375 376 vaesenclast $T2,$inout0,$inout0 377 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb 378 vaesenclast $Ii,$inout1,$inout1 379 vpaddb $T2,$T1,$Ii 380 mov %r13,0x70+8(%rsp) 381 lea 0x60($inp),$inp 382 # These two prefetches were added in BoringSSL. See change that added them. 383 prefetcht0 512($inp) # We use 96-byte block so prefetch 2 lines (128 bytes) 384 prefetcht0 576($inp) 385 vaesenclast $Z1,$inout2,$inout2 386 vpaddb $T2,$Ii,$Z1 387 mov %r12,0x78+8(%rsp) 388 lea 0x60($out),$out 389 vmovdqu 0x00-0x80($key),$rndkey 390 vaesenclast $Z2,$inout3,$inout3 391 vpaddb $T2,$Z1,$Z2 392 vaesenclast $Z3, $inout4,$inout4 393 vpaddb $T2,$Z2,$Z3 394 vaesenclast $Hkey,$inout5,$inout5 395 vpaddb $T2,$Z3,$Hkey 396 397 add \$0x60,%rax 398 sub \$0x6,$len 399 jc .L6x_done 400 401 vmovups $inout0,-0x60($out) # save output 402 vpxor $rndkey,$T1,$inout0 403 vmovups $inout1,-0x50($out) 404 vmovdqa $Ii,$inout1 # 0 latency 405 vmovups $inout2,-0x40($out) 406 vmovdqa $Z1,$inout2 # 0 latency 407 vmovups $inout3,-0x30($out) 408 vmovdqa $Z2,$inout3 # 0 latency 409 vmovups $inout4,-0x20($out) 410 vmovdqa $Z3,$inout4 # 0 latency 411 vmovups $inout5,-0x10($out) 412 vmovdqa $Hkey,$inout5 # 0 latency 413 vmovdqu 0x20+8(%rsp),$Z3 # I[5] 414 jmp .Loop6x 415 416.L6x_done: 417 vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled 418 vpxor $Z0,$Xi,$Xi # modulo-scheduled 419 420 ret 421.cfi_endproc 422.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x 423___ 424###################################################################### 425# 426# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len, 427# const AES_KEY *key, unsigned char iv[16], 428# struct { u128 Xi,H,Htbl[9]; } *Xip); 429$code.=<<___; 430.globl aesni_gcm_decrypt 431.type aesni_gcm_decrypt,\@abi-omnipotent 432.align 32 433aesni_gcm_decrypt: 434.cfi_startproc 435.seh_startproc 436 xor %rax,%rax 437 438 # We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60) 439 # bytes of input. 440 cmp \$0x60,$len # minimal accepted length 441 jb .Lgcm_dec_abort 442 443 push %rbp 444.cfi_push %rbp 445.seh_pushreg %rbp 446 mov %rsp, %rbp # save stack pointer 447.cfi_def_cfa_register %rbp 448 push %rbx 449.cfi_push %rbx 450.seh_pushreg %rbx 451 push %r12 452.cfi_push %r12 453.seh_pushreg %r12 454 push %r13 455.cfi_push %r13 456.seh_pushreg %r13 457 push %r14 458.cfi_push %r14 459.seh_pushreg %r14 460 push %r15 461.cfi_push %r15 462.seh_pushreg %r15 463___ 464if ($win64) { 465$code.=<<___ 466 lea -0xa8(%rsp),%rsp # 8 extra bytes to align the stack 467.seh_allocstack 0xa8 468.seh_setframe %rbp, 0xa8+5*8 469 # Load the last two parameters. These go into %rdi and %rsi, which are 470 # non-volatile on Windows, so stash them in the parameter stack area 471 # first. 472 mov %rdi, 0x10(%rbp) 473.seh_savereg %rdi, 0xa8+5*8+0x10 474 mov %rsi, 0x18(%rbp) 475.seh_savereg %rsi, 0xa8+5*8+0x18 476 mov 0x30(%rbp), $ivp 477 mov 0x38(%rbp), $Xip 478 # Save non-volatile XMM registers. 479 movaps %xmm6,-0xd0(%rbp) 480.seh_savexmm128 %xmm6, 0xa8+5*8-0xd0 481 movaps %xmm7,-0xc0(%rbp) 482.seh_savexmm128 %xmm7, 0xa8+5*8-0xc0 483 movaps %xmm8,-0xb0(%rbp) 484.seh_savexmm128 %xmm8, 0xa8+5*8-0xb0 485 movaps %xmm9,-0xa0(%rbp) 486.seh_savexmm128 %xmm9, 0xa8+5*8-0xa0 487 movaps %xmm10,-0x90(%rbp) 488.seh_savexmm128 %xmm10, 0xa8+5*8-0x90 489 movaps %xmm11,-0x80(%rbp) 490.seh_savexmm128 %xmm11, 0xa8+5*8-0x80 491 movaps %xmm12,-0x70(%rbp) 492.seh_savexmm128 %xmm12, 0xa8+5*8-0x70 493 movaps %xmm13,-0x60(%rbp) 494.seh_savexmm128 %xmm13, 0xa8+5*8-0x60 495 movaps %xmm14,-0x50(%rbp) 496.seh_savexmm128 %xmm14, 0xa8+5*8-0x50 497 movaps %xmm15,-0x40(%rbp) 498.seh_savexmm128 %xmm15, 0xa8+5*8-0x40 499___ 500} 501$code.=<<___; 502 vzeroupper 503 504 vmovdqu ($ivp),$T1 # input counter value 505 add \$-128,%rsp 506 mov 12($ivp),$counter 507 lea .Lbswap_mask(%rip),$const 508 lea -0x80($key),$in0 # borrow $in0 509 mov \$0xf80,$end0 # borrow $end0 510 vmovdqu ($Xip),$Xi # load Xi 511 and \$-128,%rsp # ensure stack alignment 512 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask 513 lea 0x80($key),$key # size optimization 514 lea 0x20+0x20($Xip),$Xip # size optimization 515 mov 0xf0-0x80($key),$rounds 516 vpshufb $Ii,$Xi,$Xi 517 518 and $end0,$in0 519 and %rsp,$end0 520 sub $in0,$end0 521 jc .Ldec_no_key_aliasing 522 cmp \$768,$end0 523 jnc .Ldec_no_key_aliasing 524 sub $end0,%rsp # avoid aliasing with key 525.Ldec_no_key_aliasing: 526 527 vmovdqu 0x50($inp),$Z3 # I[5] 528 mov $inp,$in0 529 vmovdqu 0x40($inp),$Z0 530 531 # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0) 532 # bytes before the end of the input. Note, in particular, that this is 533 # correct even if |$len| is not an even multiple of 96 or 16. XXX: This 534 # seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must 535 # not be near the very beginning of the address space when |$len| < 2*96 536 # (0xc0). 537 lea -0xc0($inp,$len),$end0 538 539 vmovdqu 0x30($inp),$Z1 540 shr \$4,$len 541 xor %rax,%rax 542 vmovdqu 0x20($inp),$Z2 543 vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x 544 vmovdqu 0x10($inp),$T2 545 vpshufb $Ii,$Z0,$Z0 546 vmovdqu ($inp),$Hkey 547 vpshufb $Ii,$Z1,$Z1 548 vmovdqu $Z0,0x30(%rsp) 549 vpshufb $Ii,$Z2,$Z2 550 vmovdqu $Z1,0x40(%rsp) 551 vpshufb $Ii,$T2,$T2 552 vmovdqu $Z2,0x50(%rsp) 553 vpshufb $Ii,$Hkey,$Hkey 554 vmovdqu $T2,0x60(%rsp) 555 vmovdqu $Hkey,0x70(%rsp) 556 557 call _aesni_ctr32_ghash_6x 558 559 vmovups $inout0,-0x60($out) # save output 560 vmovups $inout1,-0x50($out) 561 vmovups $inout2,-0x40($out) 562 vmovups $inout3,-0x30($out) 563 vmovups $inout4,-0x20($out) 564 vmovups $inout5,-0x10($out) 565 566 vpshufb ($const),$Xi,$Xi # .Lbswap_mask 567 vmovdqu $Xi,-0x40($Xip) # output Xi 568 569 vzeroupper 570___ 571$code.=<<___ if ($win64); 572 movaps -0xd0(%rbp),%xmm6 573 movaps -0xc0(%rbp),%xmm7 574 movaps -0xb0(%rbp),%xmm8 575 movaps -0xa0(%rbp),%xmm9 576 movaps -0x90(%rbp),%xmm10 577 movaps -0x80(%rbp),%xmm11 578 movaps -0x70(%rbp),%xmm12 579 movaps -0x60(%rbp),%xmm13 580 movaps -0x50(%rbp),%xmm14 581 movaps -0x40(%rbp),%xmm15 582 mov 0x10(%rbp),%rdi 583 mov 0x18(%rbp),%rsi 584___ 585$code.=<<___; 586 lea -0x28(%rbp), %rsp # restore %rsp to fixed allocation 587.cfi_def_cfa %rsp, 0x38 588 pop %r15 589.cfi_pop %r15 590 pop %r14 591.cfi_pop %r14 592 pop %r13 593.cfi_pop %r13 594 pop %r12 595.cfi_pop %r12 596 pop %rbx 597.cfi_pop %rbx 598 pop %rbp 599.cfi_pop %rbp 600.Lgcm_dec_abort: 601 ret 602.seh_endproc 603.cfi_endproc 604.size aesni_gcm_decrypt,.-aesni_gcm_decrypt 605___ 606 607$code.=<<___; 608.type _aesni_ctr32_6x,\@abi-omnipotent 609.align 32 610_aesni_ctr32_6x: 611.cfi_startproc 612 vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey 613 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb 614 lea -1($rounds),%r13 615 vmovups 0x10-0x80($key),$rndkey 616 lea 0x20-0x80($key),%r12 617 vpxor $Z0,$T1,$inout0 618 add \$`6<<24`,$counter 619 jc .Lhandle_ctr32_2 620 vpaddb $T2,$T1,$inout1 621 vpaddb $T2,$inout1,$inout2 622 vpxor $Z0,$inout1,$inout1 623 vpaddb $T2,$inout2,$inout3 624 vpxor $Z0,$inout2,$inout2 625 vpaddb $T2,$inout3,$inout4 626 vpxor $Z0,$inout3,$inout3 627 vpaddb $T2,$inout4,$inout5 628 vpxor $Z0,$inout4,$inout4 629 vpaddb $T2,$inout5,$T1 630 vpxor $Z0,$inout5,$inout5 631 jmp .Loop_ctr32 632 633.align 16 634.Loop_ctr32: 635 vaesenc $rndkey,$inout0,$inout0 636 vaesenc $rndkey,$inout1,$inout1 637 vaesenc $rndkey,$inout2,$inout2 638 vaesenc $rndkey,$inout3,$inout3 639 vaesenc $rndkey,$inout4,$inout4 640 vaesenc $rndkey,$inout5,$inout5 641 vmovups (%r12),$rndkey 642 lea 0x10(%r12),%r12 643 dec %r13d 644 jnz .Loop_ctr32 645 646 vmovdqu (%r12),$Hkey # last round key 647 vaesenc $rndkey,$inout0,$inout0 648 vpxor 0x00($inp),$Hkey,$Z0 649 vaesenc $rndkey,$inout1,$inout1 650 vpxor 0x10($inp),$Hkey,$Z1 651 vaesenc $rndkey,$inout2,$inout2 652 vpxor 0x20($inp),$Hkey,$Z2 653 vaesenc $rndkey,$inout3,$inout3 654 vpxor 0x30($inp),$Hkey,$Xi 655 vaesenc $rndkey,$inout4,$inout4 656 vpxor 0x40($inp),$Hkey,$T2 657 vaesenc $rndkey,$inout5,$inout5 658 vpxor 0x50($inp),$Hkey,$Hkey 659 lea 0x60($inp),$inp 660 661 vaesenclast $Z0,$inout0,$inout0 662 vaesenclast $Z1,$inout1,$inout1 663 vaesenclast $Z2,$inout2,$inout2 664 vaesenclast $Xi,$inout3,$inout3 665 vaesenclast $T2,$inout4,$inout4 666 vaesenclast $Hkey,$inout5,$inout5 667 vmovups $inout0,0x00($out) 668 vmovups $inout1,0x10($out) 669 vmovups $inout2,0x20($out) 670 vmovups $inout3,0x30($out) 671 vmovups $inout4,0x40($out) 672 vmovups $inout5,0x50($out) 673 lea 0x60($out),$out 674 675 ret 676.align 32 677.Lhandle_ctr32_2: 678 vpshufb $Ii,$T1,$Z2 # byte-swap counter 679 vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb 680 vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb 681 vpaddd $Z1,$Z2,$inout2 682 vpaddd $Z1,$inout1,$inout3 683 vpshufb $Ii,$inout1,$inout1 684 vpaddd $Z1,$inout2,$inout4 685 vpshufb $Ii,$inout2,$inout2 686 vpxor $Z0,$inout1,$inout1 687 vpaddd $Z1,$inout3,$inout5 688 vpshufb $Ii,$inout3,$inout3 689 vpxor $Z0,$inout2,$inout2 690 vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value 691 vpshufb $Ii,$inout4,$inout4 692 vpxor $Z0,$inout3,$inout3 693 vpshufb $Ii,$inout5,$inout5 694 vpxor $Z0,$inout4,$inout4 695 vpshufb $Ii,$T1,$T1 # next counter value 696 vpxor $Z0,$inout5,$inout5 697 jmp .Loop_ctr32 698.cfi_endproc 699.size _aesni_ctr32_6x,.-_aesni_ctr32_6x 700 701.globl aesni_gcm_encrypt 702.type aesni_gcm_encrypt,\@abi-omnipotent 703.align 32 704aesni_gcm_encrypt: 705.cfi_startproc 706.seh_startproc 707#ifdef BORINGSSL_DISPATCH_TEST 708.extern BORINGSSL_function_hit 709 movb \$1,BORINGSSL_function_hit+2(%rip) 710#endif 711 xor %rax,%rax 712 713 # We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of 714 # input. Then we call |_aesni_ctr32_ghash_6x|, which requires at 715 # least 96 more bytes of input. 716 cmp \$0x60*3,$len # minimal accepted length 717 jb .Lgcm_enc_abort 718 719 push %rbp 720.cfi_push %rbp 721.seh_pushreg %rbp 722 mov %rsp, %rbp # save stack pointer 723.cfi_def_cfa_register %rbp 724 push %rbx 725.cfi_push %rbx 726.seh_pushreg %rbx 727 push %r12 728.cfi_push %r12 729.seh_pushreg %r12 730 push %r13 731.cfi_push %r13 732.seh_pushreg %r13 733 push %r14 734.cfi_push %r14 735.seh_pushreg %r14 736 push %r15 737.cfi_push %r15 738.seh_pushreg %r15 739___ 740if ($win64) { 741$code.=<<___ 742 lea -0xa8(%rsp),%rsp # 8 extra bytes to align the stack 743.seh_allocstack 0xa8 744.seh_setframe %rbp, 0xa8+5*8 745 # Load the last two parameters. These go into %rdi and %rsi, which are 746 # non-volatile on Windows, so stash them in the parameter stack area 747 # first. 748 mov %rdi, 0x10(%rbp) 749.seh_savereg %rdi, 0xa8+5*8+0x10 750 mov %rsi, 0x18(%rbp) 751.seh_savereg %rsi, 0xa8+5*8+0x18 752 mov 0x30(%rbp), $ivp 753 mov 0x38(%rbp), $Xip 754 # Save non-volatile XMM registers. 755 movaps %xmm6,-0xd0(%rbp) 756.seh_savexmm128 %xmm6, 0xa8+5*8-0xd0 757 movaps %xmm7,-0xc0(%rbp) 758.seh_savexmm128 %xmm7, 0xa8+5*8-0xc0 759 movaps %xmm8,-0xb0(%rbp) 760.seh_savexmm128 %xmm8, 0xa8+5*8-0xb0 761 movaps %xmm9,-0xa0(%rbp) 762.seh_savexmm128 %xmm9, 0xa8+5*8-0xa0 763 movaps %xmm10,-0x90(%rbp) 764.seh_savexmm128 %xmm10, 0xa8+5*8-0x90 765 movaps %xmm11,-0x80(%rbp) 766.seh_savexmm128 %xmm11, 0xa8+5*8-0x80 767 movaps %xmm12,-0x70(%rbp) 768.seh_savexmm128 %xmm12, 0xa8+5*8-0x70 769 movaps %xmm13,-0x60(%rbp) 770.seh_savexmm128 %xmm13, 0xa8+5*8-0x60 771 movaps %xmm14,-0x50(%rbp) 772.seh_savexmm128 %xmm14, 0xa8+5*8-0x50 773 movaps %xmm15,-0x40(%rbp) 774.seh_savexmm128 %xmm15, 0xa8+5*8-0x40 775___ 776} 777$code.=<<___; 778 vzeroupper 779 780 vmovdqu ($ivp),$T1 # input counter value 781 add \$-128,%rsp 782 mov 12($ivp),$counter 783 lea .Lbswap_mask(%rip),$const 784 lea -0x80($key),$in0 # borrow $in0 785 mov \$0xf80,$end0 # borrow $end0 786 lea 0x80($key),$key # size optimization 787 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask 788 and \$-128,%rsp # ensure stack alignment 789 mov 0xf0-0x80($key),$rounds 790 791 and $end0,$in0 792 and %rsp,$end0 793 sub $in0,$end0 794 jc .Lenc_no_key_aliasing 795 cmp \$768,$end0 796 jnc .Lenc_no_key_aliasing 797 sub $end0,%rsp # avoid aliasing with key 798.Lenc_no_key_aliasing: 799 800 mov $out,$in0 801 802 # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0) 803 # bytes before the end of the input. Note, in particular, that this is 804 # correct even if |$len| is not an even multiple of 96 or 16. Unlike in 805 # the decryption case, there's no caveat that |$out| must not be near 806 # the very beginning of the address space, because we know that 807 # |$len| >= 3*96 from the check above, and so we know 808 # |$out| + |$len| >= 2*96 (0xc0). 809 lea -0xc0($out,$len),$end0 810 811 shr \$4,$len 812 813 call _aesni_ctr32_6x 814 vpshufb $Ii,$inout0,$Xi # save bswapped output on stack 815 vpshufb $Ii,$inout1,$T2 816 vmovdqu $Xi,0x70(%rsp) 817 vpshufb $Ii,$inout2,$Z0 818 vmovdqu $T2,0x60(%rsp) 819 vpshufb $Ii,$inout3,$Z1 820 vmovdqu $Z0,0x50(%rsp) 821 vpshufb $Ii,$inout4,$Z2 822 vmovdqu $Z1,0x40(%rsp) 823 vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x 824 vmovdqu $Z2,0x30(%rsp) 825 826 call _aesni_ctr32_6x 827 828 vmovdqu ($Xip),$Xi # load Xi 829 lea 0x20+0x20($Xip),$Xip # size optimization 830 sub \$12,$len 831 mov \$0x60*2,%rax 832 vpshufb $Ii,$Xi,$Xi 833 834 call _aesni_ctr32_ghash_6x 835 vmovdqu 0x20(%rsp),$Z3 # I[5] 836 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask 837 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 838 vpunpckhqdq $Z3,$Z3,$T1 839 vmovdqu 0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK 840 vmovups $inout0,-0x60($out) # save output 841 vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy 842 vpxor $Z3,$T1,$T1 843 vmovups $inout1,-0x50($out) 844 vpshufb $Ii,$inout1,$inout1 845 vmovups $inout2,-0x40($out) 846 vpshufb $Ii,$inout2,$inout2 847 vmovups $inout3,-0x30($out) 848 vpshufb $Ii,$inout3,$inout3 849 vmovups $inout4,-0x20($out) 850 vpshufb $Ii,$inout4,$inout4 851 vmovups $inout5,-0x10($out) 852 vpshufb $Ii,$inout5,$inout5 853 vmovdqu $inout0,0x10(%rsp) # free $inout0 854___ 855{ my ($HK,$T3)=($rndkey,$inout0); 856 857$code.=<<___; 858 vmovdqu 0x30(%rsp),$Z2 # I[4] 859 vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2 860 vpunpckhqdq $Z2,$Z2,$T2 861 vpclmulqdq \$0x00,$Hkey,$Z3,$Z1 862 vpxor $Z2,$T2,$T2 863 vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 864 vpclmulqdq \$0x00,$HK,$T1,$T1 865 866 vmovdqu 0x40(%rsp),$T3 # I[3] 867 vpclmulqdq \$0x00,$Ii,$Z2,$Z0 868 vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3 869 vpxor $Z1,$Z0,$Z0 870 vpunpckhqdq $T3,$T3,$Z1 871 vpclmulqdq \$0x11,$Ii,$Z2,$Z2 872 vpxor $T3,$Z1,$Z1 873 vpxor $Z3,$Z2,$Z2 874 vpclmulqdq \$0x10,$HK,$T2,$T2 875 vmovdqu 0x50-0x20($Xip),$HK 876 vpxor $T1,$T2,$T2 877 878 vmovdqu 0x50(%rsp),$T1 # I[2] 879 vpclmulqdq \$0x00,$Hkey,$T3,$Z3 880 vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4 881 vpxor $Z0,$Z3,$Z3 882 vpunpckhqdq $T1,$T1,$Z0 883 vpclmulqdq \$0x11,$Hkey,$T3,$T3 884 vpxor $T1,$Z0,$Z0 885 vpxor $Z2,$T3,$T3 886 vpclmulqdq \$0x00,$HK,$Z1,$Z1 887 vpxor $T2,$Z1,$Z1 888 889 vmovdqu 0x60(%rsp),$T2 # I[1] 890 vpclmulqdq \$0x00,$Ii,$T1,$Z2 891 vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5 892 vpxor $Z3,$Z2,$Z2 893 vpunpckhqdq $T2,$T2,$Z3 894 vpclmulqdq \$0x11,$Ii,$T1,$T1 895 vpxor $T2,$Z3,$Z3 896 vpxor $T3,$T1,$T1 897 vpclmulqdq \$0x10,$HK,$Z0,$Z0 898 vmovdqu 0x80-0x20($Xip),$HK 899 vpxor $Z1,$Z0,$Z0 900 901 vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0] 902 vpclmulqdq \$0x00,$Hkey,$T2,$Z1 903 vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6 904 vpunpckhqdq $Xi,$Xi,$T3 905 vpxor $Z2,$Z1,$Z1 906 vpclmulqdq \$0x11,$Hkey,$T2,$T2 907 vpxor $Xi,$T3,$T3 908 vpxor $T1,$T2,$T2 909 vpclmulqdq \$0x00,$HK,$Z3,$Z3 910 vpxor $Z0,$Z3,$Z0 911 912 vpclmulqdq \$0x00,$Ii,$Xi,$Z2 913 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 914 vpunpckhqdq $inout5,$inout5,$T1 915 vpclmulqdq \$0x11,$Ii,$Xi,$Xi 916 vpxor $inout5,$T1,$T1 917 vpxor $Z1,$Z2,$Z1 918 vpclmulqdq \$0x10,$HK,$T3,$T3 919 vmovdqu 0x20-0x20($Xip),$HK 920 vpxor $T2,$Xi,$Z3 921 vpxor $Z0,$T3,$Z2 922 923 vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2 924 vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing 925 vpclmulqdq \$0x00,$Hkey,$inout5,$Z0 926 vpxor $T3,$Z2,$Z2 927 vpunpckhqdq $inout4,$inout4,$T2 928 vpclmulqdq \$0x11,$Hkey,$inout5,$inout5 929 vpxor $inout4,$T2,$T2 930 vpslldq \$8,$Z2,$T3 931 vpclmulqdq \$0x00,$HK,$T1,$T1 932 vpxor $T3,$Z1,$Xi 933 vpsrldq \$8,$Z2,$Z2 934 vpxor $Z2,$Z3,$Z3 935 936 vpclmulqdq \$0x00,$Ii,$inout4,$Z1 937 vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3 938 vpxor $Z0,$Z1,$Z1 939 vpunpckhqdq $inout3,$inout3,$T3 940 vpclmulqdq \$0x11,$Ii,$inout4,$inout4 941 vpxor $inout3,$T3,$T3 942 vpxor $inout5,$inout4,$inout4 943 vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase 944 vpclmulqdq \$0x10,$HK,$T2,$T2 945 vmovdqu 0x50-0x20($Xip),$HK 946 vpxor $T1,$T2,$T2 947 948 vpclmulqdq \$0x00,$Hkey,$inout3,$Z0 949 vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4 950 vpxor $Z1,$Z0,$Z0 951 vpunpckhqdq $inout2,$inout2,$T1 952 vpclmulqdq \$0x11,$Hkey,$inout3,$inout3 953 vpxor $inout2,$T1,$T1 954 vpxor $inout4,$inout3,$inout3 955 vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0 956 vpclmulqdq \$0x00,$HK,$T3,$T3 957 vpxor $T2,$T3,$T3 958 959 vpclmulqdq \$0x10,0x10($const),$Xi,$Xi 960 vxorps $inout5,$Xi,$Xi 961 962 vpclmulqdq \$0x00,$Ii,$inout2,$Z1 963 vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5 964 vpxor $Z0,$Z1,$Z1 965 vpunpckhqdq $inout1,$inout1,$T2 966 vpclmulqdq \$0x11,$Ii,$inout2,$inout2 967 vpxor $inout1,$T2,$T2 968 vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase 969 vpxor $inout3,$inout2,$inout2 970 vpclmulqdq \$0x10,$HK,$T1,$T1 971 vmovdqu 0x80-0x20($Xip),$HK 972 vpxor $T3,$T1,$T1 973 974 vxorps $Z3,$inout5,$inout5 975 vpclmulqdq \$0x10,0x10($const),$Xi,$Xi 976 vxorps $inout5,$Xi,$Xi 977 978 vpclmulqdq \$0x00,$Hkey,$inout1,$Z0 979 vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6 980 vpxor $Z1,$Z0,$Z0 981 vpunpckhqdq $Xi,$Xi,$T3 982 vpclmulqdq \$0x11,$Hkey,$inout1,$inout1 983 vpxor $Xi,$T3,$T3 984 vpxor $inout2,$inout1,$inout1 985 vpclmulqdq \$0x00,$HK,$T2,$T2 986 vpxor $T1,$T2,$T2 987 988 vpclmulqdq \$0x00,$Ii,$Xi,$Z1 989 vpclmulqdq \$0x11,$Ii,$Xi,$Z3 990 vpxor $Z0,$Z1,$Z1 991 vpclmulqdq \$0x10,$HK,$T3,$Z2 992 vpxor $inout1,$Z3,$Z3 993 vpxor $T2,$Z2,$Z2 994 995 vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing 996 vpxor $Z0,$Z2,$Z2 997 vpslldq \$8,$Z2,$T1 998 vmovdqu 0x10($const),$Hkey # .Lpoly 999 vpsrldq \$8,$Z2,$Z2 1000 vpxor $T1,$Z1,$Xi 1001 vpxor $Z2,$Z3,$Z3 1002 1003 vpalignr \$8,$Xi,$Xi,$T2 # 1st phase 1004 vpclmulqdq \$0x10,$Hkey,$Xi,$Xi 1005 vpxor $T2,$Xi,$Xi 1006 1007 vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase 1008 vpclmulqdq \$0x10,$Hkey,$Xi,$Xi 1009 vpxor $Z3,$T2,$T2 1010 vpxor $T2,$Xi,$Xi 1011___ 1012} 1013$code.=<<___; 1014 vpshufb ($const),$Xi,$Xi # .Lbswap_mask 1015 vmovdqu $Xi,-0x40($Xip) # output Xi 1016 1017 vzeroupper 1018___ 1019$code.=<<___ if ($win64); 1020 movaps -0xd0(%rbp),%xmm6 1021 movaps -0xc0(%rbp),%xmm7 1022 movaps -0xb0(%rbp),%xmm8 1023 movaps -0xa0(%rbp),%xmm9 1024 movaps -0x90(%rbp),%xmm10 1025 movaps -0x80(%rbp),%xmm11 1026 movaps -0x70(%rbp),%xmm12 1027 movaps -0x60(%rbp),%xmm13 1028 movaps -0x50(%rbp),%xmm14 1029 movaps -0x40(%rbp),%xmm15 1030 mov 0x10(%rbp),%rdi 1031 mov 0x18(%rbp),%rsi 1032___ 1033$code.=<<___; 1034 lea -0x28(%rbp), %rsp # restore %rsp to fixed allocation 1035.cfi_def_cfa %rsp, 0x38 1036 pop %r15 1037.cfi_pop %r15 1038 pop %r14 1039.cfi_pop %r14 1040 pop %r13 1041.cfi_pop %r13 1042 pop %r12 1043.cfi_pop %r12 1044 pop %rbx 1045.cfi_pop %rbx 1046 pop %rbp 1047.cfi_pop %rbp 1048.Lgcm_enc_abort: 1049 ret 1050.seh_endproc 1051.cfi_endproc 1052.size aesni_gcm_decrypt,.-aesni_gcm_decrypt 1053___ 1054 1055$code.=<<___; 1056.section .rodata 1057.align 64 1058.Lbswap_mask: 1059 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 1060.Lpoly: 1061 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 1062.Lone_msb: 1063 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 1064.Ltwo_lsb: 1065 .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 1066.Lone_lsb: 1067 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 1068.asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 1069.align 64 1070.text 1071___ 1072}}} else {{{ 1073$code=<<___; # assembler is too old 1074.text 1075 1076.globl aesni_gcm_encrypt 1077.type aesni_gcm_encrypt,\@abi-omnipotent 1078aesni_gcm_encrypt: 1079 xor %eax,%eax 1080 ret 1081.size aesni_gcm_encrypt,.-aesni_gcm_encrypt 1082 1083.globl aesni_gcm_decrypt 1084.type aesni_gcm_decrypt,\@abi-omnipotent 1085aesni_gcm_decrypt: 1086 xor %eax,%eax 1087 ret 1088.size aesni_gcm_decrypt,.-aesni_gcm_decrypt 1089___ 1090}}} 1091 1092$code =~ s/\`([^\`]*)\`/eval($1)/gem; 1093 1094print $code; 1095 1096close STDOUT or die "error closing STDOUT: $!"; 1097