#!/usr/bin/env perl

# Copyright (c) 2017, Shay Gueron.
# Copyright 2017 The BoringSSL Authors
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

use warnings FATAL => 'all';

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$code.=<<___;
.section .rodata

.align 16
one:
.quad 1,0
two:
.quad 2,0
three:
.quad 3,0
four:
.quad 4,0
five:
.quad 5,0
six:
.quad 6,0
seven:
.quad 7,0
eight:
.quad 8,0

OR_MASK:
.long 0x00000000,0x00000000,0x00000000,0x80000000
poly:
.quad 0x1, 0xc200000000000000
mask:
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
con1:
.long 1,1,1,1
con2:
.long 0x1b,0x1b,0x1b,0x1b
con3:
.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
and_mask:
.long 0,0xffffffff, 0xffffffff, 0xffffffff
___

$code.=<<___;
.text
___

sub gfmul {
  #########################
  # a = T
  # b = TMP0 - remains unchanged
  # res = T
  # uses also TMP1,TMP2,TMP3,TMP4
  # __m128i GFMUL(__m128i A, __m128i B);

  my $T = "%xmm0";
  my $TMP0 = "%xmm1";
  my $TMP1 = "%xmm2";
  my $TMP2 = "%xmm3";
  my $TMP3 = "%xmm4";
  my $TMP4 = "%xmm5";

  $code.=<<___;
.type GFMUL,\@abi-omnipotent
.align 16
GFMUL:
.cfi_startproc
    vpclmulqdq \$0x00, $TMP0, $T, $TMP1
    vpclmulqdq \$0x11, $TMP0, $T, $TMP4
    vpclmulqdq \$0x10, $TMP0, $T, $TMP2
    vpclmulqdq \$0x01, $TMP0, $T, $TMP3
    vpxor $TMP3, $TMP2, $TMP2
    vpslldq \$8, $TMP2, $TMP3
    vpsrldq \$8, $TMP2, $TMP2
    vpxor $TMP3, $TMP1, $TMP1
    vpxor $TMP2, $TMP4, $TMP4

    vpclmulqdq \$0x10, poly(%rip), $TMP1, $TMP2
    vpshufd \$78, $TMP1, $TMP3
    vpxor $TMP3, $TMP2, $TMP1

    vpclmulqdq \$0x10, poly(%rip), $TMP1, $TMP2
    vpshufd \$78, $TMP1, $TMP3
    vpxor $TMP3, $TMP2, $TMP1

    vpxor $TMP4, $TMP1, $T
    ret
.cfi_endproc
.size GFMUL, .-GFMUL
___
}
gfmul();

sub aesgcmsiv_htable_init {
  # aesgcmsiv_htable_init writes an eight-entry table of powers of |H| to
  # |out_htable|.
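  #
  # As a rough reference only (a comment, not emitted code), the table layout
  # is Htbl[i] = H^(i+1) in POLYVAL's field, built by repeated calls to GFMUL
  # above.  In pseudo-C, with a hypothetical gfmul128 helper standing in for
  # GFMUL:
  #
  #   htbl[0] = h;                                 // H^1
  #   for (int i = 1; i < 8; i++)
  #     htbl[i] = gfmul128(htbl[i - 1], htbl[0]);  // H^(i+1) = H^i * H
  #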
124 # void aesgcmsiv_htable_init(uint8_t out_htable[16*8], uint8_t *H); 125 126 my $Htbl = "%rdi"; 127 my $H = "%rsi"; 128 my $T = "%xmm0"; 129 my $TMP0 = "%xmm1"; 130 131$code.=<<___; 132.globl aesgcmsiv_htable_init 133.type aesgcmsiv_htable_init,\@function,2 134.align 16 135aesgcmsiv_htable_init: 136.cfi_startproc 137 _CET_ENDBR 138 vmovdqa ($H), $T 139 vmovdqa $T, $TMP0 140 vmovdqa $T, ($Htbl) # H 141 call GFMUL 142 vmovdqa $T, 16($Htbl) # H^2 143 call GFMUL 144 vmovdqa $T, 32($Htbl) # H^3 145 call GFMUL 146 vmovdqa $T, 48($Htbl) # H^4 147 call GFMUL 148 vmovdqa $T, 64($Htbl) # H^5 149 call GFMUL 150 vmovdqa $T, 80($Htbl) # H^6 151 call GFMUL 152 vmovdqa $T, 96($Htbl) # H^7 153 call GFMUL 154 vmovdqa $T, 112($Htbl) # H^8 155 ret 156.cfi_endproc 157.size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init 158___ 159} 160aesgcmsiv_htable_init(); 161 162sub aesgcmsiv_htable6_init { 163 # aesgcmsiv_htable6_init writes a six-entry table of powers of |H| to 164 # |out_htable|. 165 # void aesgcmsiv_htable6_init(uint8_t out_htable[16*6], uint8_t *H); 166 # 167 my $Htbl = "%rdi"; 168 my $H = "%rsi"; 169 my $T = "%xmm0"; 170 my $TMP0 = "%xmm1"; 171 172 $code.=<<___; 173.globl aesgcmsiv_htable6_init 174.type aesgcmsiv_htable6_init,\@function,2 175.align 16 176aesgcmsiv_htable6_init: 177.cfi_startproc 178 _CET_ENDBR 179 vmovdqa ($H), $T 180 vmovdqa $T, $TMP0 181 vmovdqa $T, ($Htbl) # H 182 call GFMUL 183 vmovdqa $T, 16($Htbl) # H^2 184 call GFMUL 185 vmovdqa $T, 32($Htbl) # H^3 186 call GFMUL 187 vmovdqa $T, 48($Htbl) # H^4 188 call GFMUL 189 vmovdqa $T, 64($Htbl) # H^5 190 call GFMUL 191 vmovdqa $T, 80($Htbl) # H^6 192 ret 193.cfi_endproc 194.size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init 195___ 196} 197aesgcmsiv_htable6_init(); 198 199sub aesgcmsiv_htable_polyval { 200 # void aesgcmsiv_htable_polyval(uint8_t Htbl[16*8], uint8_t *MSG, uint64_t LEN, uint8_t *T); 201 # parameter 1: %rdi Htable - pointer to Htable 202 # parameter 2: %rsi INp - pointer to input 203 # parameter 3: %rdx LEN - length of BUFFER in bytes 204 # parameter 4: %rcx T - pointer to POLYVAL output 205 206 my $DATA = "%xmm0"; 207 my $hlp0 = "%r11"; 208 my $Htbl = "%rdi"; 209 my $inp = "%rsi"; 210 my $len = "%rdx"; 211 my $TMP0 = "%xmm3"; 212 my $TMP1 = "%xmm4"; 213 my $TMP2 = "%xmm5"; 214 my $TMP3 = "%xmm6"; 215 my $TMP4 = "%xmm7"; 216 my $Tp = "%rcx"; 217 my $T = "%xmm1"; 218 my $Xhi = "%xmm9"; 219 220 my $SCHOOLBOOK_AAD = sub { 221 my ($i)=@_; 222 return <<___; 223 vpclmulqdq \$0x01, ${\eval(16*$i)}($Htbl), $DATA, $TMP3 224 vpxor $TMP3, $TMP2, $TMP2 225 vpclmulqdq \$0x00, ${\eval(16*$i)}($Htbl), $DATA, $TMP3 226 vpxor $TMP3, $TMP0, $TMP0 227 vpclmulqdq \$0x11, ${\eval(16*$i)}($Htbl), $DATA, $TMP3 228 vpxor $TMP3, $TMP1, $TMP1 229 vpclmulqdq \$0x10, ${\eval(16*$i)}($Htbl), $DATA, $TMP3 230 vpxor $TMP3, $TMP2, $TMP2 231___ 232 }; 233 234 $code.=<<___; 235.globl aesgcmsiv_htable_polyval 236.type aesgcmsiv_htable_polyval,\@function,4 237.align 16 238aesgcmsiv_htable_polyval: 239.cfi_startproc 240 _CET_ENDBR 241 test $len, $len 242 jnz .Lhtable_polyval_start 243 ret 244 245.Lhtable_polyval_start: 246 vzeroall 247 248 # We hash 8 blocks each iteration. If the total number of blocks is not a 249 # multiple of 8, we first hash the leading n%8 blocks. 
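#
# Reference sketch (assembly comment only, pseudo-C with hypothetical
# gfmul128/xor128 helpers, where hpow[j] holds H^(j+1) as written by
# aesgcmsiv_htable_init): each main-loop iteration below folds eight blocks
# at once, computing, in POLYVAL's field,
#
#   T = (T + X[0])*H^8 + X[1]*H^7 + ... + X[6]*H^2 + X[7]*H
#
#   t = gfmul128(xor128(t, x[0]), hpow[7]);        // * H^8
#   for (int i = 1; i < 8; i++)
#     t = xor128(t, gfmul128(x[i], hpow[7 - i]));  // * H^(8-i)
#
# which is equivalent to eight Horner steps T = (T + X[i]) * H.
#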
    movq $len, $hlp0
    andq \$127, $hlp0

    jz .Lhtable_polyval_no_prefix

    vpxor $Xhi, $Xhi, $Xhi
    vmovdqa ($Tp), $T
    sub $hlp0, $len

    sub \$16, $hlp0

    # hash first prefix block
    vmovdqu ($inp), $DATA
    vpxor $T, $DATA, $DATA

    vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP2
    vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP0
    vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP1
    vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
    vpxor $TMP3, $TMP2, $TMP2

    lea 16($inp), $inp
    test $hlp0, $hlp0
    jnz .Lhtable_polyval_prefix_loop
    jmp .Lhtable_polyval_prefix_complete

    # hash remaining prefix blocks (up to 7 total prefix blocks)
.align 64
.Lhtable_polyval_prefix_loop:
    sub \$16, $hlp0

    vmovdqu ($inp), $DATA    # next data block

    vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP3
    vpxor $TMP3, $TMP0, $TMP0
    vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP3
    vpxor $TMP3, $TMP1, $TMP1
    vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP3
    vpxor $TMP3, $TMP2, $TMP2
    vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
    vpxor $TMP3, $TMP2, $TMP2

    test $hlp0, $hlp0

    lea 16($inp), $inp

    jnz .Lhtable_polyval_prefix_loop

.Lhtable_polyval_prefix_complete:
    vpsrldq \$8, $TMP2, $TMP3
    vpslldq \$8, $TMP2, $TMP2

    vpxor $TMP3, $TMP1, $Xhi
    vpxor $TMP2, $TMP0, $T

    jmp .Lhtable_polyval_main_loop

.Lhtable_polyval_no_prefix:
    # At this point we know the number of blocks is a multiple of 8. However,
    # the reduction in the main loop includes a multiplication by x^(-128). In
    # order to counter this, the existing tag needs to be multiplied by x^128.
    # In practice, this just means that it is loaded into $Xhi, not $T.
    vpxor $T, $T, $T
    vmovdqa ($Tp), $Xhi

.align 64
.Lhtable_polyval_main_loop:
    sub \$0x80, $len
    jb .Lhtable_polyval_out

    vmovdqu 16*7($inp), $DATA    # Ii

    vpclmulqdq \$0x01, ($Htbl), $DATA, $TMP2
    vpclmulqdq \$0x00, ($Htbl), $DATA, $TMP0
    vpclmulqdq \$0x11, ($Htbl), $DATA, $TMP1
    vpclmulqdq \$0x10, ($Htbl), $DATA, $TMP3
    vpxor $TMP3, $TMP2, $TMP2

    #########################################################
    vmovdqu 16*6($inp), $DATA
    ${\$SCHOOLBOOK_AAD->(1)}

    #########################################################
    vmovdqu 16*5($inp), $DATA

    vpclmulqdq \$0x10, poly(%rip), $T, $TMP4    # reduction stage 1a
    vpalignr \$8, $T, $T, $T

    ${\$SCHOOLBOOK_AAD->(2)}

    vpxor $TMP4, $T, $T                         # reduction stage 1b
    #########################################################
    vmovdqu 16*4($inp), $DATA

    ${\$SCHOOLBOOK_AAD->(3)}
    #########################################################
    vmovdqu 16*3($inp), $DATA

    vpclmulqdq \$0x10, poly(%rip), $T, $TMP4    # reduction stage 2a
    vpalignr \$8, $T, $T, $T

    ${\$SCHOOLBOOK_AAD->(4)}

    vpxor $TMP4, $T, $T                         # reduction stage 2b
    #########################################################
    vmovdqu 16*2($inp), $DATA

    ${\$SCHOOLBOOK_AAD->(5)}

    vpxor $Xhi, $T, $T                          # reduction finalize
    #########################################################
    vmovdqu 16*1($inp), $DATA

    ${\$SCHOOLBOOK_AAD->(6)}
    #########################################################
    vmovdqu 16*0($inp), $DATA
    vpxor $T, $DATA, $DATA

    ${\$SCHOOLBOOK_AAD->(7)}
    #########################################################
    vpsrldq \$8, $TMP2, $TMP3
    vpslldq \$8, $TMP2, $TMP2

    vpxor $TMP3, $TMP1, $Xhi
    vpxor $TMP2, $TMP0, $T

    lea 16*8($inp), $inp
    jmp .Lhtable_polyval_main_loop

    #########################################################

.Lhtable_polyval_out:
    vpclmulqdq \$0x10, poly(%rip), $T, $TMP3
    vpalignr \$8, $T, $T, $T
    vpxor $TMP3, $T, $T

    vpclmulqdq \$0x10, poly(%rip), $T, $TMP3
    vpalignr \$8, $T, $T, $T
    vpxor $TMP3, $T, $T
    vpxor $Xhi, $T, $T

    vmovdqu $T, ($Tp)
    vzeroupper
    ret
.cfi_endproc
.size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
___
}
aesgcmsiv_htable_polyval();

sub aesgcmsiv_polyval_horner {
  # void aesgcmsiv_polyval_horner(unsigned char T[16],     // output
  #                               const unsigned char* H,  // H
  #                               unsigned char* BUF,      // Buffer
  #                               unsigned int blocks);    // Len2
  #
  # parameter 1: %rdi T   - pointer to POLYVAL output
  # parameter 2: %rsi Hp  - pointer to H (user key)
  # parameter 3: %rdx INp - pointer to input
  # parameter 4: %rcx L   - total number of blocks in input BUFFER
  #
  my $T = "%rdi";
  my $Hp = "%rsi";
  my $INp = "%rdx";
  my $L = "%rcx";
  my $LOC = "%r10";
  my $LEN = "%eax";
  my $H = "%xmm1";
  my $RES = "%xmm0";

  $code.=<<___;
.globl aesgcmsiv_polyval_horner
.type aesgcmsiv_polyval_horner,\@function,4
.align 16
aesgcmsiv_polyval_horner:
.cfi_startproc
    _CET_ENDBR
    test $L, $L
    jnz .Lpolyval_horner_start
    ret

.Lpolyval_horner_start:
    # We will start with L GFMULS for POLYVAL(BIG_BUFFER)
    # RES = GFMUL(RES, H)

    xorq $LOC, $LOC
    shlq \$4, $L    # L contains number of bytes to process

    vmovdqa ($Hp), $H
    vmovdqa ($T), $RES

.Lpolyval_horner_loop:
    vpxor ($INp,$LOC), $RES, $RES    # RES = RES + Xi
    call GFMUL                       # RES = RES * H

    add \$16, $LOC
    cmp $LOC, $L
    jne .Lpolyval_horner_loop

    # calculation of T is complete.
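    #
    # Reference sketch (assembly comment only, pseudo-C with hypothetical
    # gfmul128/xor128 helpers): the loop above is plain Horner evaluation in
    # POLYVAL's field, one multiplication by H per 16-byte block:
    #
    #   for (unsigned i = 0; i < blocks; i++)
    #     res = gfmul128(xor128(res, x[i]), h);
    #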
RES=T 450 vmovdqa $RES, ($T) 451 ret 452.cfi_endproc 453.size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner 454___ 455} 456aesgcmsiv_polyval_horner(); 457 458# void aes128gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key); 459# parameter 1: %rdi 460# parameter 2: %rsi 461$code.=<<___; 462.globl aes128gcmsiv_aes_ks 463.type aes128gcmsiv_aes_ks,\@function,2 464.align 16 465aes128gcmsiv_aes_ks: 466.cfi_startproc 467 _CET_ENDBR 468 vmovdqu (%rdi), %xmm1 # xmm1 = user key 469 vmovdqa %xmm1, (%rsi) # rsi points to output 470 471 vmovdqa con1(%rip), %xmm0 472 vmovdqa mask(%rip), %xmm15 473 474 movq \$8, %rax 475 476.Lks128_loop: 477 addq \$16, %rsi # rsi points for next key 478 subq \$1, %rax 479 vpshufb %xmm15, %xmm1, %xmm2 # xmm2 = shuffled user key 480 vaesenclast %xmm0, %xmm2, %xmm2 481 vpslld \$1, %xmm0, %xmm0 482 vpslldq \$4, %xmm1, %xmm3 483 vpxor %xmm3, %xmm1, %xmm1 484 vpslldq \$4, %xmm3, %xmm3 485 vpxor %xmm3, %xmm1, %xmm1 486 vpslldq \$4, %xmm3, %xmm3 487 vpxor %xmm3, %xmm1, %xmm1 488 vpxor %xmm2, %xmm1, %xmm1 489 vmovdqa %xmm1, (%rsi) 490 jne .Lks128_loop 491 492 vmovdqa con2(%rip), %xmm0 493 vpshufb %xmm15, %xmm1, %xmm2 494 vaesenclast %xmm0, %xmm2, %xmm2 495 vpslld \$1, %xmm0, %xmm0 496 vpslldq \$4, %xmm1, %xmm3 497 vpxor %xmm3, %xmm1, %xmm1 498 vpslldq \$4, %xmm3, %xmm3 499 vpxor %xmm3, %xmm1, %xmm1 500 vpslldq \$4, %xmm3, %xmm3 501 vpxor %xmm3, %xmm1, %xmm1 502 vpxor %xmm2, %xmm1, %xmm1 503 vmovdqa %xmm1, 16(%rsi) 504 505 vpshufb %xmm15, %xmm1, %xmm2 506 vaesenclast %xmm0, %xmm2, %xmm2 507 vpslldq \$4, %xmm1, %xmm3 508 vpxor %xmm3, %xmm1, %xmm1 509 vpslldq \$4, %xmm3, %xmm3 510 vpxor %xmm3, %xmm1, %xmm1 511 vpslldq \$4, %xmm3, %xmm3 512 vpxor %xmm3, %xmm1, %xmm1 513 vpxor %xmm2, %xmm1, %xmm1 514 vmovdqa %xmm1, 32(%rsi) 515 ret 516.cfi_endproc 517.size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks 518___ 519 520# void aes256gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key); 521# parameter 1: %rdi 522# parameter 2: %rsi 523$code.=<<___; 524.globl aes256gcmsiv_aes_ks 525.type aes256gcmsiv_aes_ks,\@function,2 526.align 16 527aes256gcmsiv_aes_ks: 528.cfi_startproc 529 _CET_ENDBR 530 vmovdqu (%rdi), %xmm1 531 vmovdqu 16(%rdi), %xmm3 532 vmovdqa %xmm1, (%rsi) 533 vmovdqa %xmm3, 16(%rsi) 534 vmovdqa con1(%rip), %xmm0 535 vmovdqa mask(%rip), %xmm15 536 vpxor %xmm14, %xmm14, %xmm14 537 mov \$6, %rax 538 539.Lks256_loop: 540 add \$32, %rsi 541 subq \$1, %rax 542 vpshufb %xmm15, %xmm3, %xmm2 543 vaesenclast %xmm0, %xmm2, %xmm2 544 vpslld \$1, %xmm0, %xmm0 545 vpsllq \$32, %xmm1, %xmm4 546 vpxor %xmm4, %xmm1, %xmm1 547 vpshufb con3(%rip), %xmm1, %xmm4 548 vpxor %xmm4, %xmm1, %xmm1 549 vpxor %xmm2, %xmm1, %xmm1 550 vmovdqa %xmm1, (%rsi) 551 vpshufd \$0xff, %xmm1, %xmm2 552 vaesenclast %xmm14, %xmm2, %xmm2 553 vpsllq \$32, %xmm3, %xmm4 554 vpxor %xmm4, %xmm3, %xmm3 555 vpshufb con3(%rip), %xmm3, %xmm4 556 vpxor %xmm4, %xmm3, %xmm3 557 vpxor %xmm2, %xmm3, %xmm3 558 vmovdqa %xmm3, 16(%rsi) 559 jne .Lks256_loop 560 561 vpshufb %xmm15, %xmm3, %xmm2 562 vaesenclast %xmm0, %xmm2, %xmm2 563 vpsllq \$32, %xmm1, %xmm4 564 vpxor %xmm4, %xmm1, %xmm1 565 vpshufb con3(%rip), %xmm1, %xmm4 566 vpxor %xmm4, %xmm1, %xmm1 567 vpxor %xmm2, %xmm1, %xmm1 568 vmovdqa %xmm1, 32(%rsi) 569 ret 570.cfi_endproc 571___ 572 573sub aes128gcmsiv_aes_ks_enc_x1 { 574 my $KS1_REGA = "%xmm1"; 575 my $KS1_REGB = "%xmm2"; 576 my $BLOCK1 = "%xmm4"; 577 my $AUXREG = "%xmm3"; 578 579 my $KS_BLOCK = sub { 580 my ($reg, $reg2, $auxReg) = @_; 581 return <<___; 582 vpsllq \$32, $reg, $auxReg #!!saving mov 
instruction to xmm3 583 vpxor $auxReg, $reg, $reg 584 vpshufb con3(%rip), $reg, $auxReg 585 vpxor $auxReg, $reg, $reg 586 vpxor $reg2, $reg, $reg 587___ 588 }; 589 590 my $round = sub { 591 my ($i, $j) = @_; 592 return <<___; 593 vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2 594 vaesenclast %xmm0, %xmm2, %xmm2 595 vpslld \$1, %xmm0, %xmm0 596 ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)} 597 vaesenc %xmm1, $BLOCK1, $BLOCK1 598 vmovdqa %xmm1, ${\eval(16*$i)}($j) 599___ 600 }; 601 602 my $roundlast = sub { 603 my ($i, $j) = @_; 604 return <<___; 605 vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2 606 vaesenclast %xmm0, %xmm2, %xmm2 607 ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)} 608 vaesenclast %xmm1, $BLOCK1, $BLOCK1 609 vmovdqa %xmm1, ${\eval(16*$i)}($j) 610___ 611 }; 612 613# parameter 1: %rdi Pointer to PT 614# parameter 2: %rsi Pointer to CT 615# parameter 4: %rdx Pointer to keys 616# parameter 5: %rcx Pointer to initial key 617 $code.=<<___; 618.globl aes128gcmsiv_aes_ks_enc_x1 619.type aes128gcmsiv_aes_ks_enc_x1,\@function,4 620.align 16 621aes128gcmsiv_aes_ks_enc_x1: 622.cfi_startproc 623 _CET_ENDBR 624 vmovdqa (%rcx), %xmm1 # xmm1 = first 16 bytes of random key 625 vmovdqa 0*16(%rdi), $BLOCK1 626 627 vmovdqa %xmm1, (%rdx) # KEY[0] = first 16 bytes of random key 628 vpxor %xmm1, $BLOCK1, $BLOCK1 629 630 vmovdqa con1(%rip), %xmm0 # xmm0 = 1,1,1,1 631 vmovdqa mask(%rip), %xmm15 # xmm15 = mask 632 633 ${\$round->(1, "%rdx")} 634 ${\$round->(2, "%rdx")} 635 ${\$round->(3, "%rdx")} 636 ${\$round->(4, "%rdx")} 637 ${\$round->(5, "%rdx")} 638 ${\$round->(6, "%rdx")} 639 ${\$round->(7, "%rdx")} 640 ${\$round->(8, "%rdx")} 641 642 vmovdqa con2(%rip), %xmm0 643 644 ${\$round->(9, "%rdx")} 645 ${\$roundlast->(10, "%rdx")} 646 647 vmovdqa $BLOCK1, 0*16(%rsi) 648 ret 649.cfi_endproc 650.size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1 651___ 652} 653aes128gcmsiv_aes_ks_enc_x1(); 654 655sub aes128gcmsiv_kdf { 656 my $BLOCK1 = "%xmm9"; 657 my $BLOCK2 = "%xmm10"; 658 my $BLOCK3 = "%xmm11"; 659 my $BLOCK4 = "%xmm12"; 660 my $BLOCK5 = "%xmm13"; 661 my $BLOCK6 = "%xmm14"; 662 my $ONE = "%xmm13"; 663 my $KSp = "%rdx"; 664 my $STATE_1 = "%xmm1"; 665 666 my $enc_roundx4 = sub { 667 my ($i, $j) = @_; 668 return <<___; 669 vmovdqa ${\eval($i*16)}(%rdx), $j 670 vaesenc $j, $BLOCK1, $BLOCK1 671 vaesenc $j, $BLOCK2, $BLOCK2 672 vaesenc $j, $BLOCK3, $BLOCK3 673 vaesenc $j, $BLOCK4, $BLOCK4 674___ 675 }; 676 677 my $enc_roundlastx4 = sub { 678 my ($i, $j) = @_; 679 return <<___; 680 vmovdqa ${\eval($i*16)}(%rdx), $j 681 vaesenclast $j, $BLOCK1, $BLOCK1 682 vaesenclast $j, $BLOCK2, $BLOCK2 683 vaesenclast $j, $BLOCK3, $BLOCK3 684 vaesenclast $j, $BLOCK4, $BLOCK4 685___ 686 }; 687 688# void aes128gcmsiv_kdf(const uint8_t nonce[16], 689# uint8_t *out_key_material, 690# const uint8_t *key_schedule); 691 $code.=<<___; 692.globl aes128gcmsiv_kdf 693.type aes128gcmsiv_kdf,\@function,3 694.align 16 695aes128gcmsiv_kdf: 696.cfi_startproc 697 _CET_ENDBR 698# parameter 1: %rdi Pointer to NONCE 699# parameter 2: %rsi Pointer to CT 700# parameter 4: %rdx Pointer to keys 701 702 vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key 703 vmovdqa 0*16(%rdi), $BLOCK1 704 vmovdqa and_mask(%rip), $BLOCK4 705 vmovdqa one(%rip), $ONE 706 vpshufd \$0x90, $BLOCK1, $BLOCK1 707 vpand $BLOCK4, $BLOCK1, $BLOCK1 708 vpaddd $ONE, $BLOCK1, $BLOCK2 709 vpaddd $ONE, $BLOCK2, $BLOCK3 710 vpaddd $ONE, $BLOCK3, $BLOCK4 711 712 vpxor %xmm1, $BLOCK1, $BLOCK1 713 vpxor %xmm1, $BLOCK2, $BLOCK2 714 
vpxor %xmm1, $BLOCK3, $BLOCK3 715 vpxor %xmm1, $BLOCK4, $BLOCK4 716 717 ${\$enc_roundx4->(1, "%xmm1")} 718 ${\$enc_roundx4->(2, "%xmm2")} 719 ${\$enc_roundx4->(3, "%xmm1")} 720 ${\$enc_roundx4->(4, "%xmm2")} 721 ${\$enc_roundx4->(5, "%xmm1")} 722 ${\$enc_roundx4->(6, "%xmm2")} 723 ${\$enc_roundx4->(7, "%xmm1")} 724 ${\$enc_roundx4->(8, "%xmm2")} 725 ${\$enc_roundx4->(9, "%xmm1")} 726 ${\$enc_roundlastx4->(10, "%xmm2")} 727 728 vmovdqa $BLOCK1, 0*16(%rsi) 729 vmovdqa $BLOCK2, 1*16(%rsi) 730 vmovdqa $BLOCK3, 2*16(%rsi) 731 vmovdqa $BLOCK4, 3*16(%rsi) 732 ret 733.cfi_endproc 734.size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf 735___ 736} 737aes128gcmsiv_kdf(); 738 739sub aes128gcmsiv_enc_msg_x4 { 740 my $CTR1 = "%xmm0"; 741 my $CTR2 = "%xmm1"; 742 my $CTR3 = "%xmm2"; 743 my $CTR4 = "%xmm3"; 744 my $ADDER = "%xmm4"; 745 746 my $STATE1 = "%xmm5"; 747 my $STATE2 = "%xmm6"; 748 my $STATE3 = "%xmm7"; 749 my $STATE4 = "%xmm8"; 750 751 my $TMP = "%xmm12"; 752 my $TMP2 = "%xmm13"; 753 my $TMP3 = "%xmm14"; 754 my $IV = "%xmm15"; 755 756 my $PT = "%rdi"; 757 my $CT = "%rsi"; 758 my $TAG = "%rdx"; 759 my $KS = "%rcx"; 760 my $LEN = "%r8"; 761 762 my $aes_round = sub { 763 my ($i) = @_; 764 return <<___; 765 vmovdqu ${\eval($i*16)}($KS), $TMP 766 vaesenc $TMP, $STATE1, $STATE1 767 vaesenc $TMP, $STATE2, $STATE2 768 vaesenc $TMP, $STATE3, $STATE3 769 vaesenc $TMP, $STATE4, $STATE4 770___ 771 }; 772 773 my $aes_lastround = sub { 774 my ($i) = @_; 775 return <<___; 776 vmovdqu ${\eval($i*16)}($KS), $TMP 777 vaesenclast $TMP, $STATE1, $STATE1 778 vaesenclast $TMP, $STATE2, $STATE2 779 vaesenclast $TMP, $STATE3, $STATE3 780 vaesenclast $TMP, $STATE4, $STATE4 781___ 782 }; 783 784# void aes128gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT, 785# unsigned char* TAG, unsigned char* KS, 786# size_t byte_len); 787# parameter 1: %rdi #PT 788# parameter 2: %rsi #CT 789# parameter 3: %rdx #TAG [127 126 ... 
0] IV=[127...32] 790# parameter 4: %rcx #KS 791# parameter 5: %r8 #LEN MSG_length in bytes 792 $code.=<<___; 793.globl aes128gcmsiv_enc_msg_x4 794.type aes128gcmsiv_enc_msg_x4,\@function,5 795.align 16 796aes128gcmsiv_enc_msg_x4: 797.cfi_startproc 798 _CET_ENDBR 799 test $LEN, $LEN 800 jnz .L128_enc_msg_x4_start 801 ret 802 803.L128_enc_msg_x4_start: 804 pushq %r12 805.cfi_push %r12 806 pushq %r13 807.cfi_push %r13 808 809 shrq \$4, $LEN # LEN = num of blocks 810 movq $LEN, %r10 811 shlq \$62, %r10 812 shrq \$62, %r10 813 814 # make IV from TAG 815 vmovdqa ($TAG), $IV 816 vpor OR_MASK(%rip), $IV, $IV #IV = [1]TAG[126...32][00..00] 817 818 vmovdqu four(%rip), $ADDER # Register to increment counters 819 vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00] 820 vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01] 821 vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02] 822 vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03] 823 824 shrq \$2, $LEN 825 je .L128_enc_msg_x4_check_remainder 826 827 subq \$64, $CT 828 subq \$64, $PT 829 830.L128_enc_msg_x4_loop1: 831 addq \$64, $CT 832 addq \$64, $PT 833 834 vmovdqa $CTR1, $STATE1 835 vmovdqa $CTR2, $STATE2 836 vmovdqa $CTR3, $STATE3 837 vmovdqa $CTR4, $STATE4 838 839 vpxor ($KS), $STATE1, $STATE1 840 vpxor ($KS), $STATE2, $STATE2 841 vpxor ($KS), $STATE3, $STATE3 842 vpxor ($KS), $STATE4, $STATE4 843 844 ${\$aes_round->(1)} 845 vpaddd $ADDER, $CTR1, $CTR1 846 ${\$aes_round->(2)} 847 vpaddd $ADDER, $CTR2, $CTR2 848 ${\$aes_round->(3)} 849 vpaddd $ADDER, $CTR3, $CTR3 850 ${\$aes_round->(4)} 851 vpaddd $ADDER, $CTR4, $CTR4 852 853 ${\$aes_round->(5)} 854 ${\$aes_round->(6)} 855 ${\$aes_round->(7)} 856 ${\$aes_round->(8)} 857 ${\$aes_round->(9)} 858 ${\$aes_lastround->(10)} 859 860 # XOR with Plaintext 861 vpxor 0*16($PT), $STATE1, $STATE1 862 vpxor 1*16($PT), $STATE2, $STATE2 863 vpxor 2*16($PT), $STATE3, $STATE3 864 vpxor 3*16($PT), $STATE4, $STATE4 865 866 subq \$1, $LEN 867 868 vmovdqu $STATE1, 0*16($CT) 869 vmovdqu $STATE2, 1*16($CT) 870 vmovdqu $STATE3, 2*16($CT) 871 vmovdqu $STATE4, 3*16($CT) 872 873 jne .L128_enc_msg_x4_loop1 874 875 addq \$64,$CT 876 addq \$64,$PT 877 878.L128_enc_msg_x4_check_remainder: 879 cmpq \$0, %r10 880 je .L128_enc_msg_x4_out 881 882.L128_enc_msg_x4_loop2: 883 # enc each block separately 884 # CTR1 is the highest counter (even if no LOOP done) 885 vmovdqa $CTR1, $STATE1 886 vpaddd one(%rip), $CTR1, $CTR1 # inc counter 887 888 vpxor ($KS), $STATE1, $STATE1 889 vaesenc 16($KS), $STATE1, $STATE1 890 vaesenc 32($KS), $STATE1, $STATE1 891 vaesenc 48($KS), $STATE1, $STATE1 892 vaesenc 64($KS), $STATE1, $STATE1 893 vaesenc 80($KS), $STATE1, $STATE1 894 vaesenc 96($KS), $STATE1, $STATE1 895 vaesenc 112($KS), $STATE1, $STATE1 896 vaesenc 128($KS), $STATE1, $STATE1 897 vaesenc 144($KS), $STATE1, $STATE1 898 vaesenclast 160($KS), $STATE1, $STATE1 899 900 # XOR with plaintext 901 vpxor ($PT), $STATE1, $STATE1 902 vmovdqu $STATE1, ($CT) 903 904 addq \$16, $PT 905 addq \$16, $CT 906 907 subq \$1, %r10 908 jne .L128_enc_msg_x4_loop2 909 910.L128_enc_msg_x4_out: 911 popq %r13 912.cfi_pop %r13 913 popq %r12 914.cfi_pop %r12 915 ret 916.cfi_endproc 917.size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4 918___ 919} 920aes128gcmsiv_enc_msg_x4(); 921 922sub aes128gcmsiv_enc_msg_x8 { 923 my $STATE1 = "%xmm1"; 924 my $STATE2 = "%xmm2"; 925 my $STATE3 = "%xmm3"; 926 my $STATE4 = "%xmm4"; 927 my $STATE5 = "%xmm5"; 928 my $STATE6 = "%xmm6"; 929 my $STATE7 = "%xmm7"; 930 my $STATE8 = "%xmm8"; 931 932 my $CTR1 
= "%xmm0"; 933 my $CTR2 = "%xmm9"; 934 my $CTR3 = "%xmm10"; 935 my $CTR4 = "%xmm11"; 936 my $CTR5 = "%xmm12"; 937 my $CTR6 = "%xmm13"; 938 my $CTR7 = "%xmm14"; 939 my $SCHED = "%xmm15"; 940 941 my $TMP1 = "%xmm1"; 942 my $TMP2 = "%xmm2"; 943 944 my $PT = "%rdi"; 945 my $CT = "%rsi"; 946 my $TAG = "%rdx"; 947 my $KS = "%rcx"; 948 my $LEN = "%r8"; 949 950 my $aes_round8 = sub { 951 my ($i) = @_; 952 return <<___; 953 vmovdqu ${\eval($i*16)}($KS), $SCHED 954 vaesenc $SCHED, $STATE1, $STATE1 955 vaesenc $SCHED, $STATE2, $STATE2 956 vaesenc $SCHED, $STATE3, $STATE3 957 vaesenc $SCHED, $STATE4, $STATE4 958 vaesenc $SCHED, $STATE5, $STATE5 959 vaesenc $SCHED, $STATE6, $STATE6 960 vaesenc $SCHED, $STATE7, $STATE7 961 vaesenc $SCHED, $STATE8, $STATE8 962___ 963 }; 964 965 my $aes_lastround8 = sub { 966 my ($i) = @_; 967 return <<___; 968 vmovdqu ${\eval($i*16)}($KS), $SCHED 969 vaesenclast $SCHED, $STATE1, $STATE1 970 vaesenclast $SCHED, $STATE2, $STATE2 971 vaesenclast $SCHED, $STATE3, $STATE3 972 vaesenclast $SCHED, $STATE4, $STATE4 973 vaesenclast $SCHED, $STATE5, $STATE5 974 vaesenclast $SCHED, $STATE6, $STATE6 975 vaesenclast $SCHED, $STATE7, $STATE7 976 vaesenclast $SCHED, $STATE8, $STATE8 977___ 978 }; 979 980# void ENC_MSG_x8(unsigned char* PT, 981# unsigned char* CT, 982# unsigned char* TAG, 983# unsigned char* KS, 984# size_t byte_len); 985# parameter 1: %rdi #PT 986# parameter 2: %rsi #CT 987# parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32] 988# parameter 4: %rcx #KS 989# parameter 5: %r8 #LEN MSG_length in bytes 990 $code.=<<___; 991.globl aes128gcmsiv_enc_msg_x8 992.type aes128gcmsiv_enc_msg_x8,\@function,5 993.align 16 994aes128gcmsiv_enc_msg_x8: 995.cfi_startproc 996 _CET_ENDBR 997 test $LEN, $LEN 998 jnz .L128_enc_msg_x8_start 999 ret 1000 1001.L128_enc_msg_x8_start: 1002 pushq %r12 1003.cfi_push %r12 1004 pushq %r13 1005.cfi_push %r13 1006 pushq %rbp 1007.cfi_push %rbp 1008 movq %rsp, %rbp 1009.cfi_def_cfa_register rbp 1010 1011 # Place in stack 1012 subq \$128, %rsp 1013 andq \$-64, %rsp 1014 1015 shrq \$4, $LEN # LEN = num of blocks 1016 movq $LEN, %r10 1017 shlq \$61, %r10 1018 shrq \$61, %r10 1019 1020 # make IV from TAG 1021 vmovdqu ($TAG), $TMP1 1022 vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00] 1023 1024 # store counter8 in the stack 1025 vpaddd seven(%rip), $TMP1, $CTR1 1026 vmovdqu $CTR1, (%rsp) # CTR8 = TAG[127...32][00..07] 1027 vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01] 1028 vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02] 1029 vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03] 1030 vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04] 1031 vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05] 1032 vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06] 1033 vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00] 1034 1035 shrq \$3, $LEN 1036 je .L128_enc_msg_x8_check_remainder 1037 1038 subq \$128, $CT 1039 subq \$128, $PT 1040 1041.L128_enc_msg_x8_loop1: 1042 addq \$128, $CT 1043 addq \$128, $PT 1044 1045 vmovdqa $CTR1, $STATE1 1046 vmovdqa $CTR2, $STATE2 1047 vmovdqa $CTR3, $STATE3 1048 vmovdqa $CTR4, $STATE4 1049 vmovdqa $CTR5, $STATE5 1050 vmovdqa $CTR6, $STATE6 1051 vmovdqa $CTR7, $STATE7 1052 # move from stack 1053 vmovdqu (%rsp), $STATE8 1054 1055 vpxor ($KS), $STATE1, $STATE1 1056 vpxor ($KS), $STATE2, $STATE2 1057 vpxor ($KS), $STATE3, $STATE3 1058 vpxor ($KS), $STATE4, $STATE4 1059 vpxor ($KS), $STATE5, $STATE5 1060 vpxor ($KS), $STATE6, $STATE6 1061 vpxor ($KS), 
$STATE7, $STATE7 1062 vpxor ($KS), $STATE8, $STATE8 1063 1064 ${\$aes_round8->(1)} 1065 vmovdqu (%rsp), $CTR7 # deal with CTR8 1066 vpaddd eight(%rip), $CTR7, $CTR7 1067 vmovdqu $CTR7, (%rsp) 1068 ${\$aes_round8->(2)} 1069 vpsubd one(%rip), $CTR7, $CTR7 1070 ${\$aes_round8->(3)} 1071 vpaddd eight(%rip), $CTR1, $CTR1 1072 ${\$aes_round8->(4)} 1073 vpaddd eight(%rip), $CTR2, $CTR2 1074 ${\$aes_round8->(5)} 1075 vpaddd eight(%rip), $CTR3, $CTR3 1076 ${\$aes_round8->(6)} 1077 vpaddd eight(%rip), $CTR4, $CTR4 1078 ${\$aes_round8->(7)} 1079 vpaddd eight(%rip), $CTR5, $CTR5 1080 ${\$aes_round8->(8)} 1081 vpaddd eight(%rip), $CTR6, $CTR6 1082 ${\$aes_round8->(9)} 1083 ${\$aes_lastround8->(10)} 1084 1085 # XOR with Plaintext 1086 vpxor 0*16($PT), $STATE1, $STATE1 1087 vpxor 1*16($PT), $STATE2, $STATE2 1088 vpxor 2*16($PT), $STATE3, $STATE3 1089 vpxor 3*16($PT), $STATE4, $STATE4 1090 vpxor 4*16($PT), $STATE5, $STATE5 1091 vpxor 5*16($PT), $STATE6, $STATE6 1092 vpxor 6*16($PT), $STATE7, $STATE7 1093 vpxor 7*16($PT), $STATE8, $STATE8 1094 1095 dec $LEN 1096 1097 vmovdqu $STATE1, 0*16($CT) 1098 vmovdqu $STATE2, 1*16($CT) 1099 vmovdqu $STATE3, 2*16($CT) 1100 vmovdqu $STATE4, 3*16($CT) 1101 vmovdqu $STATE5, 4*16($CT) 1102 vmovdqu $STATE6, 5*16($CT) 1103 vmovdqu $STATE7, 6*16($CT) 1104 vmovdqu $STATE8, 7*16($CT) 1105 1106 jne .L128_enc_msg_x8_loop1 1107 1108 addq \$128, $CT 1109 addq \$128, $PT 1110 1111.L128_enc_msg_x8_check_remainder: 1112 cmpq \$0, %r10 1113 je .L128_enc_msg_x8_out 1114 1115.L128_enc_msg_x8_loop2: 1116 # enc each block separately 1117 # CTR1 is the highest counter (even if no LOOP done) 1118 vmovdqa $CTR1, $STATE1 1119 vpaddd one(%rip), $CTR1, $CTR1 # inc counter 1120 1121 vpxor ($KS), $STATE1, $STATE1 1122 vaesenc 16($KS), $STATE1, $STATE1 1123 vaesenc 32($KS), $STATE1, $STATE1 1124 vaesenc 48($KS), $STATE1, $STATE1 1125 vaesenc 64($KS), $STATE1, $STATE1 1126 vaesenc 80($KS), $STATE1, $STATE1 1127 vaesenc 96($KS), $STATE1, $STATE1 1128 vaesenc 112($KS), $STATE1, $STATE1 1129 vaesenc 128($KS), $STATE1, $STATE1 1130 vaesenc 144($KS), $STATE1, $STATE1 1131 vaesenclast 160($KS), $STATE1, $STATE1 1132 1133 # XOR with Plaintext 1134 vpxor ($PT), $STATE1, $STATE1 1135 1136 vmovdqu $STATE1, ($CT) 1137 1138 addq \$16, $PT 1139 addq \$16, $CT 1140 1141 decq %r10 1142 jne .L128_enc_msg_x8_loop2 1143 1144.L128_enc_msg_x8_out: 1145 movq %rbp, %rsp 1146.cfi_def_cfa_register %rsp 1147 popq %rbp 1148.cfi_pop %rbp 1149 popq %r13 1150.cfi_pop %r13 1151 popq %r12 1152.cfi_pop %r12 1153 ret 1154.cfi_endproc 1155.size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8 1156___ 1157} 1158aes128gcmsiv_enc_msg_x8(); 1159 1160sub aesgcmsiv_dec { 1161 my ($aes256) = @_; 1162 1163 my $T = "%xmm0"; 1164 my $TMP0 = "%xmm1"; 1165 my $TMP1 = "%xmm2"; 1166 my $TMP2 = "%xmm3"; 1167 my $TMP3 = "%xmm4"; 1168 my $TMP4 = "%xmm5"; 1169 my $TMP5 = "%xmm6"; 1170 my $CTR1 = "%xmm7"; 1171 my $CTR2 = "%xmm8"; 1172 my $CTR3 = "%xmm9"; 1173 my $CTR4 = "%xmm10"; 1174 my $CTR5 = "%xmm11"; 1175 my $CTR6 = "%xmm12"; 1176 my $CTR = "%xmm15"; 1177 my $CT = "%rdi"; 1178 my $PT = "%rsi"; 1179 my $POL = "%rdx"; 1180 my $Htbl = "%rcx"; 1181 my $KS = "%r8"; 1182 my $LEN = "%r9"; 1183 my $secureBuffer = "%rax"; 1184 my $HTABLE_ROUNDS = "%xmm13"; 1185 1186 my $labelPrefix = "128"; 1187 if ($aes256) { 1188 $labelPrefix = "256"; 1189 } 1190 1191 my $aes_round_dec = sub { 1192 my ($i) = @_; 1193 return <<___; 1194 vmovdqu ${\eval($i*16)}($KS), $TMP3 1195 vaesenc $TMP3, $CTR1, $CTR1 1196 vaesenc $TMP3, $CTR2, $CTR2 1197 vaesenc $TMP3, $CTR3, $CTR3 
1198 vaesenc $TMP3, $CTR4, $CTR4 1199 vaesenc $TMP3, $CTR5, $CTR5 1200 vaesenc $TMP3, $CTR6, $CTR6 1201___ 1202 }; 1203 1204 my $aes_lastround_dec = sub { 1205 my ($i) = @_; 1206 return <<___; 1207 vmovdqu ${\eval($i*16)}($KS), $TMP3 1208 vaesenclast $TMP3, $CTR1, $CTR1 1209 vaesenclast $TMP3, $CTR2, $CTR2 1210 vaesenclast $TMP3, $CTR3, $CTR3 1211 vaesenclast $TMP3, $CTR4, $CTR4 1212 vaesenclast $TMP3, $CTR5, $CTR5 1213 vaesenclast $TMP3, $CTR6, $CTR6 1214___ 1215 }; 1216 1217 my $schoolbook = sub { 1218 my ($i) = @_; 1219 return <<___; 1220 vmovdqu ${\eval($i*16-32)}($secureBuffer), $TMP5 1221 vmovdqu ${\eval($i*16-32)}($Htbl), $HTABLE_ROUNDS 1222 1223 vpclmulqdq \$0x10, $HTABLE_ROUNDS, $TMP5, $TMP3 1224 vpxor $TMP3, $TMP0, $TMP0 1225 vpclmulqdq \$0x11, $HTABLE_ROUNDS, $TMP5, $TMP3 1226 vpxor $TMP3, $TMP1, $TMP1 1227 vpclmulqdq \$0x00, $HTABLE_ROUNDS, $TMP5, $TMP3 1228 vpxor $TMP3, $TMP2, $TMP2 1229 vpclmulqdq \$0x01, $HTABLE_ROUNDS, $TMP5, $TMP3 1230 vpxor $TMP3, $TMP0, $TMP0 1231___ 1232 }; 1233 1234 if ($aes256) { 1235 $code.=<<___; 1236.globl aes256gcmsiv_dec 1237.type aes256gcmsiv_dec,\@function,6 1238.align 16 1239aes256gcmsiv_dec: 1240___ 1241 } else { 1242 $code.=<<___; 1243.globl aes128gcmsiv_dec 1244.type aes128gcmsiv_dec,\@function,6 1245.align 16 1246aes128gcmsiv_dec: 1247___ 1248 } 1249 1250 $code.=<<___; 1251.cfi_startproc 1252 _CET_ENDBR 1253 test \$~15, $LEN 1254 jnz .L${labelPrefix}_dec_start 1255 ret 1256 1257.L${labelPrefix}_dec_start: 1258 vzeroupper 1259 vmovdqa ($POL), $T 1260 # The claimed tag is provided after the current calculated tag value. 1261 # CTRBLKs is made from it. 1262 vmovdqu 16($POL), $CTR 1263 vpor OR_MASK(%rip), $CTR, $CTR # CTR = [1]TAG[126...32][00..00] 1264 movq $POL, $secureBuffer 1265 1266 leaq 32($secureBuffer), $secureBuffer 1267 leaq 32($Htbl), $Htbl 1268 1269 andq \$~15, $LEN 1270 1271 # If less then 6 blocks, make singles 1272 cmp \$96, $LEN 1273 jb .L${labelPrefix}_dec_loop2 1274 1275 # Decrypt the first six blocks 1276 sub \$96, $LEN 1277 vmovdqa $CTR, $CTR1 1278 vpaddd one(%rip), $CTR1, $CTR2 1279 vpaddd two(%rip), $CTR1, $CTR3 1280 vpaddd one(%rip), $CTR3, $CTR4 1281 vpaddd two(%rip), $CTR3, $CTR5 1282 vpaddd one(%rip), $CTR5, $CTR6 1283 vpaddd two(%rip), $CTR5, $CTR 1284 1285 vpxor ($KS), $CTR1, $CTR1 1286 vpxor ($KS), $CTR2, $CTR2 1287 vpxor ($KS), $CTR3, $CTR3 1288 vpxor ($KS), $CTR4, $CTR4 1289 vpxor ($KS), $CTR5, $CTR5 1290 vpxor ($KS), $CTR6, $CTR6 1291 1292 ${\$aes_round_dec->(1)} 1293 ${\$aes_round_dec->(2)} 1294 ${\$aes_round_dec->(3)} 1295 ${\$aes_round_dec->(4)} 1296 ${\$aes_round_dec->(5)} 1297 ${\$aes_round_dec->(6)} 1298 ${\$aes_round_dec->(7)} 1299 ${\$aes_round_dec->(8)} 1300 ${\$aes_round_dec->(9)} 1301___ 1302 1303if ($aes256) { 1304$code.=<<___; 1305 ${\$aes_round_dec->(10)} 1306 ${\$aes_round_dec->(11)} 1307 ${\$aes_round_dec->(12)} 1308 ${\$aes_round_dec->(13)} 1309 ${\$aes_lastround_dec->(14)} 1310___ 1311} else { 1312$code.=<<___; 1313 ${\$aes_lastround_dec->(10)} 1314___ 1315} 1316 1317$code.=<<___; 1318 # XOR with CT 1319 vpxor 0*16($CT), $CTR1, $CTR1 1320 vpxor 1*16($CT), $CTR2, $CTR2 1321 vpxor 2*16($CT), $CTR3, $CTR3 1322 vpxor 3*16($CT), $CTR4, $CTR4 1323 vpxor 4*16($CT), $CTR5, $CTR5 1324 vpxor 5*16($CT), $CTR6, $CTR6 1325 1326 vmovdqu $CTR1, 0*16($PT) 1327 vmovdqu $CTR2, 1*16($PT) 1328 vmovdqu $CTR3, 2*16($PT) 1329 vmovdqu $CTR4, 3*16($PT) 1330 vmovdqu $CTR5, 4*16($PT) 1331 vmovdqu $CTR6, 5*16($PT) 1332 1333 addq \$96, $CT 1334 addq \$96, $PT 1335 jmp .L${labelPrefix}_dec_loop1 1336 1337# Decrypt 6 blocks 
each time while hashing previous 6 blocks 1338.align 64 1339.L${labelPrefix}_dec_loop1: 1340 cmp \$96, $LEN 1341 jb .L${labelPrefix}_dec_finish_96 1342 sub \$96, $LEN 1343 1344 vmovdqa $CTR6, $TMP5 1345 vmovdqa $CTR5, 1*16-32($secureBuffer) 1346 vmovdqa $CTR4, 2*16-32($secureBuffer) 1347 vmovdqa $CTR3, 3*16-32($secureBuffer) 1348 vmovdqa $CTR2, 4*16-32($secureBuffer) 1349 vmovdqa $CTR1, 5*16-32($secureBuffer) 1350 1351 vmovdqa $CTR, $CTR1 1352 vpaddd one(%rip), $CTR1, $CTR2 1353 vpaddd two(%rip), $CTR1, $CTR3 1354 vpaddd one(%rip), $CTR3, $CTR4 1355 vpaddd two(%rip), $CTR3, $CTR5 1356 vpaddd one(%rip), $CTR5, $CTR6 1357 vpaddd two(%rip), $CTR5, $CTR 1358 1359 vmovdqa ($KS), $TMP3 1360 vpxor $TMP3, $CTR1, $CTR1 1361 vpxor $TMP3, $CTR2, $CTR2 1362 vpxor $TMP3, $CTR3, $CTR3 1363 vpxor $TMP3, $CTR4, $CTR4 1364 vpxor $TMP3, $CTR5, $CTR5 1365 vpxor $TMP3, $CTR6, $CTR6 1366 1367 vmovdqu 0*16-32($Htbl), $TMP3 1368 vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1 1369 vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2 1370 vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP0 1371 vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP3 1372 vpxor $TMP3, $TMP0, $TMP0 1373 1374 ${\$aes_round_dec->(1)} 1375 ${\$schoolbook->(1)} 1376 1377 ${\$aes_round_dec->(2)} 1378 ${\$schoolbook->(2)} 1379 1380 ${\$aes_round_dec->(3)} 1381 ${\$schoolbook->(3)} 1382 1383 ${\$aes_round_dec->(4)} 1384 ${\$schoolbook->(4)} 1385 1386 ${\$aes_round_dec->(5)} 1387 ${\$aes_round_dec->(6)} 1388 ${\$aes_round_dec->(7)} 1389 1390 vmovdqa 5*16-32($secureBuffer), $TMP5 1391 vpxor $T, $TMP5, $TMP5 1392 vmovdqu 5*16-32($Htbl), $TMP4 1393 1394 vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3 1395 vpxor $TMP3, $TMP0, $TMP0 1396 vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3 1397 vpxor $TMP3, $TMP1, $TMP1 1398 vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3 1399 vpxor $TMP3, $TMP2, $TMP2 1400 vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3 1401 vpxor $TMP3, $TMP0, $TMP0 1402 1403 ${\$aes_round_dec->(8)} 1404 1405 vpsrldq \$8, $TMP0, $TMP3 1406 vpxor $TMP3, $TMP1, $TMP4 1407 vpslldq \$8, $TMP0, $TMP3 1408 vpxor $TMP3, $TMP2, $T 1409 1410 vmovdqa poly(%rip), $TMP2 1411 1412 ${\$aes_round_dec->(9)} 1413___ 1414 1415if ($aes256) { 1416$code.=<<___; 1417 ${\$aes_round_dec->(10)} 1418 ${\$aes_round_dec->(11)} 1419 ${\$aes_round_dec->(12)} 1420 ${\$aes_round_dec->(13)} 1421 vmovdqu 14*16($KS), $TMP5 1422___ 1423} else { 1424$code.=<<___; 1425 vmovdqu 10*16($KS), $TMP5 1426___ 1427} 1428 1429$code.=<<___; 1430 vpalignr \$8, $T, $T, $TMP1 1431 vpclmulqdq \$0x10, $TMP2, $T, $T 1432 vpxor $T, $TMP1, $T 1433 1434 vpxor 0*16($CT), $TMP5, $TMP3 1435 vaesenclast $TMP3, $CTR1, $CTR1 1436 vpxor 1*16($CT), $TMP5, $TMP3 1437 vaesenclast $TMP3, $CTR2, $CTR2 1438 vpxor 2*16($CT), $TMP5, $TMP3 1439 vaesenclast $TMP3, $CTR3, $CTR3 1440 vpxor 3*16($CT), $TMP5, $TMP3 1441 vaesenclast $TMP3, $CTR4, $CTR4 1442 vpxor 4*16($CT), $TMP5, $TMP3 1443 vaesenclast $TMP3, $CTR5, $CTR5 1444 vpxor 5*16($CT), $TMP5, $TMP3 1445 vaesenclast $TMP3, $CTR6, $CTR6 1446 1447 vpalignr \$8, $T, $T, $TMP1 1448 vpclmulqdq \$0x10, $TMP2, $T, $T 1449 vpxor $T, $TMP1, $T 1450 1451 vmovdqu $CTR1, 0*16($PT) 1452 vmovdqu $CTR2, 1*16($PT) 1453 vmovdqu $CTR3, 2*16($PT) 1454 vmovdqu $CTR4, 3*16($PT) 1455 vmovdqu $CTR5, 4*16($PT) 1456 vmovdqu $CTR6, 5*16($PT) 1457 1458 vpxor $TMP4, $T, $T 1459 1460 lea 96($CT), $CT 1461 lea 96($PT), $PT 1462 jmp .L${labelPrefix}_dec_loop1 1463 1464.L${labelPrefix}_dec_finish_96: 1465 vmovdqa $CTR6, $TMP5 1466 vmovdqa $CTR5, 1*16-32($secureBuffer) 1467 vmovdqa $CTR4, 2*16-32($secureBuffer) 1468 vmovdqa $CTR3, 3*16-32($secureBuffer) 1469 vmovdqa 
$CTR2, 4*16-32($secureBuffer) 1470 vmovdqa $CTR1, 5*16-32($secureBuffer) 1471 1472 vmovdqu 0*16-32($Htbl), $TMP3 1473 vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP0 1474 vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1 1475 vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2 1476 vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP3 1477 vpxor $TMP3, $TMP0, $TMP0 1478 1479 ${\$schoolbook->(1)} 1480 ${\$schoolbook->(2)} 1481 ${\$schoolbook->(3)} 1482 ${\$schoolbook->(4)} 1483 1484 vmovdqu 5*16-32($secureBuffer), $TMP5 1485 vpxor $T, $TMP5, $TMP5 1486 vmovdqu 5*16-32($Htbl), $TMP4 1487 vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3 1488 vpxor $TMP3, $TMP1, $TMP1 1489 vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3 1490 vpxor $TMP3, $TMP2, $TMP2 1491 vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3 1492 vpxor $TMP3, $TMP0, $TMP0 1493 vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3 1494 vpxor $TMP3, $TMP0, $TMP0 1495 1496 vpsrldq \$8, $TMP0, $TMP3 1497 vpxor $TMP3, $TMP1, $TMP4 1498 vpslldq \$8, $TMP0, $TMP3 1499 vpxor $TMP3, $TMP2, $T 1500 1501 vmovdqa poly(%rip), $TMP2 1502 1503 vpalignr \$8, $T, $T, $TMP1 1504 vpclmulqdq \$0x10, $TMP2, $T, $T 1505 vpxor $T, $TMP1, $T 1506 1507 vpalignr \$8, $T, $T, $TMP1 1508 vpclmulqdq \$0x10, $TMP2, $T, $T 1509 vpxor $T, $TMP1, $T 1510 1511 vpxor $TMP4, $T, $T 1512 1513.L${labelPrefix}_dec_loop2: 1514 # Here we encrypt any remaining whole block 1515 1516 # if there are no whole blocks 1517 cmp \$16, $LEN 1518 jb .L${labelPrefix}_dec_out 1519 sub \$16, $LEN 1520 1521 vmovdqa $CTR, $TMP1 1522 vpaddd one(%rip), $CTR, $CTR 1523 1524 vpxor 0*16($KS), $TMP1, $TMP1 1525 vaesenc 1*16($KS), $TMP1, $TMP1 1526 vaesenc 2*16($KS), $TMP1, $TMP1 1527 vaesenc 3*16($KS), $TMP1, $TMP1 1528 vaesenc 4*16($KS), $TMP1, $TMP1 1529 vaesenc 5*16($KS), $TMP1, $TMP1 1530 vaesenc 6*16($KS), $TMP1, $TMP1 1531 vaesenc 7*16($KS), $TMP1, $TMP1 1532 vaesenc 8*16($KS), $TMP1, $TMP1 1533 vaesenc 9*16($KS), $TMP1, $TMP1 1534___ 1535if ($aes256) { 1536$code.=<<___; 1537 vaesenc 10*16($KS), $TMP1, $TMP1 1538 vaesenc 11*16($KS), $TMP1, $TMP1 1539 vaesenc 12*16($KS), $TMP1, $TMP1 1540 vaesenc 13*16($KS), $TMP1, $TMP1 1541 vaesenclast 14*16($KS), $TMP1, $TMP1 1542___ 1543} else { 1544$code.=<<___; 1545 vaesenclast 10*16($KS), $TMP1, $TMP1 1546___ 1547} 1548 1549$code.=<<___; 1550 vpxor ($CT), $TMP1, $TMP1 1551 vmovdqu $TMP1, ($PT) 1552 addq \$16, $CT 1553 addq \$16, $PT 1554 1555 vpxor $TMP1, $T, $T 1556 vmovdqa -32($Htbl), $TMP0 1557 call GFMUL 1558 1559 jmp .L${labelPrefix}_dec_loop2 1560 1561.L${labelPrefix}_dec_out: 1562 vmovdqu $T, ($POL) 1563 ret 1564.cfi_endproc 1565___ 1566 1567 if ($aes256) { 1568 $code.=<<___; 1569.size aes256gcmsiv_dec, .-aes256gcmsiv_dec 1570___ 1571 } else { 1572 $code.=<<___; 1573.size aes128gcmsiv_dec, .-aes128gcmsiv_dec 1574___ 1575 } 1576} 1577 1578aesgcmsiv_dec(0); # emit 128-bit version 1579 1580sub aes128gcmsiv_ecb_enc_block { 1581 my $STATE_1 = "%xmm1"; 1582 my $KSp = "%rdx"; 1583 1584 # parameter 1: PT %rdi (pointer to 128 bit) 1585 # parameter 2: CT %rsi (pointer to 128 bit) 1586 # parameter 3: ks %rdx (pointer to ks) 1587 $code.=<<___; 1588.globl aes128gcmsiv_ecb_enc_block 1589.type aes128gcmsiv_ecb_enc_block,\@function,3 1590.align 16 1591aes128gcmsiv_ecb_enc_block: 1592.cfi_startproc 1593 _CET_ENDBR 1594 vmovdqa (%rdi), $STATE_1 1595 1596 vpxor ($KSp), $STATE_1, $STATE_1 1597 vaesenc 1*16($KSp), $STATE_1, $STATE_1 1598 vaesenc 2*16($KSp), $STATE_1, $STATE_1 1599 vaesenc 3*16($KSp), $STATE_1, $STATE_1 1600 vaesenc 4*16($KSp), $STATE_1, $STATE_1 1601 vaesenc 5*16($KSp), $STATE_1, $STATE_1 1602 vaesenc 6*16($KSp), $STATE_1, $STATE_1 
1603 vaesenc 7*16($KSp), $STATE_1, $STATE_1 1604 vaesenc 8*16($KSp), $STATE_1, $STATE_1 1605 vaesenc 9*16($KSp), $STATE_1, $STATE_1 1606 vaesenclast 10*16($KSp), $STATE_1, $STATE_1 # STATE_1 == IV 1607 1608 vmovdqa $STATE_1, (%rsi) 1609 1610 ret 1611.cfi_endproc 1612.size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block 1613___ 1614} 1615aes128gcmsiv_ecb_enc_block(); 1616 1617sub aes256gcmsiv_aes_ks_enc_x1 { 1618 my $KS = "%rdx"; 1619 my $KEYp = "%rcx"; 1620 my $CON_MASK = "%xmm0"; 1621 my $MASK_256 = "%xmm15"; 1622 my $KEY_1 = "%xmm1"; 1623 my $KEY_2 = "%xmm3"; 1624 my $BLOCK1 = "%xmm8"; 1625 my $AUX_REG = "%xmm14"; 1626 my $PT = "%rdi"; 1627 my $CT = "%rsi"; 1628 1629 my $round_double = sub { 1630 my ($i, $j) = @_; 1631 return <<___; 1632 vpshufb %xmm15, %xmm3, %xmm2 1633 vaesenclast %xmm0, %xmm2, %xmm2 1634 vpslld \$1, %xmm0, %xmm0 1635 vpslldq \$4, %xmm1, %xmm4 1636 vpxor %xmm4, %xmm1, %xmm1 1637 vpslldq \$4, %xmm4, %xmm4 1638 vpxor %xmm4, %xmm1, %xmm1 1639 vpslldq \$4, %xmm4, %xmm4 1640 vpxor %xmm4, %xmm1, %xmm1 1641 vpxor %xmm2, %xmm1, %xmm1 1642 vaesenc %xmm1, $BLOCK1, $BLOCK1 1643 vmovdqu %xmm1, ${\eval(16*$i)}($KS) 1644 1645 vpshufd \$0xff, %xmm1, %xmm2 1646 vaesenclast %xmm14, %xmm2, %xmm2 1647 vpslldq \$4, %xmm3, %xmm4 1648 vpxor %xmm4, %xmm3, %xmm3 1649 vpslldq \$4, %xmm4, %xmm4 1650 vpxor %xmm4, %xmm3, %xmm3 1651 vpslldq \$4, %xmm4, %xmm4 1652 vpxor %xmm4, %xmm3, %xmm3 1653 vpxor %xmm2, %xmm3, %xmm3 1654 vaesenc %xmm3, $BLOCK1, $BLOCK1 1655 vmovdqu %xmm3, ${\eval(16*$j)}($KS) 1656___ 1657 }; 1658 1659 my $round_last = sub { 1660 my ($i) = @_; 1661 return <<___; 1662 vpshufb %xmm15, %xmm3, %xmm2 1663 vaesenclast %xmm0, %xmm2, %xmm2 1664 vpslldq \$4, %xmm1, %xmm4 1665 vpxor %xmm4, %xmm1, %xmm1 1666 vpslldq \$4, %xmm4, %xmm4 1667 vpxor %xmm4, %xmm1, %xmm1 1668 vpslldq \$4, %xmm4, %xmm4 1669 vpxor %xmm4, %xmm1, %xmm1 1670 vpxor %xmm2, %xmm1, %xmm1 1671 vaesenclast %xmm1, $BLOCK1, $BLOCK1 1672 vmovdqu %xmm1, ${\eval(16*$i)}($KS) 1673___ 1674 }; 1675 1676 # parameter 1: %rdi Pointer to PT1 1677 # parameter 2: %rsi Pointer to CT1 1678 # parameter 3: %rdx Pointer to KS 1679 # parameter 4: %rcx Pointer to initial key 1680 $code.=<<___; 1681.globl aes256gcmsiv_aes_ks_enc_x1 1682.type aes256gcmsiv_aes_ks_enc_x1,\@function,4 1683.align 16 1684aes256gcmsiv_aes_ks_enc_x1: 1685.cfi_startproc 1686 _CET_ENDBR 1687 vmovdqa con1(%rip), $CON_MASK # CON_MASK = 1,1,1,1 1688 vmovdqa mask(%rip), $MASK_256 # MASK_256 1689 vmovdqa ($PT), $BLOCK1 1690 vmovdqa ($KEYp), $KEY_1 # KEY_1 || KEY_2 [0..7] = user key 1691 vmovdqa 16($KEYp), $KEY_2 1692 vpxor $KEY_1, $BLOCK1, $BLOCK1 1693 vaesenc $KEY_2, $BLOCK1, $BLOCK1 1694 vmovdqu $KEY_1, ($KS) # First round key 1695 vmovdqu $KEY_2, 16($KS) 1696 vpxor $AUX_REG, $AUX_REG, $AUX_REG 1697 1698 ${\$round_double->(2, 3)} 1699 ${\$round_double->(4, 5)} 1700 ${\$round_double->(6, 7)} 1701 ${\$round_double->(8, 9)} 1702 ${\$round_double->(10, 11)} 1703 ${\$round_double->(12, 13)} 1704 ${\$round_last->(14)} 1705 vmovdqa $BLOCK1, ($CT) 1706 ret 1707.cfi_endproc 1708.size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1 1709___ 1710} 1711aes256gcmsiv_aes_ks_enc_x1(); 1712 1713sub aes256gcmsiv_ecb_enc_block { 1714 my $STATE_1 = "%xmm1"; 1715 my $PT = "%rdi"; 1716 my $CT = "%rsi"; 1717 my $KSp = "%rdx"; 1718 1719 # parameter 1: PT %rdi (pointer to 128 bit) 1720 # parameter 2: CT %rsi (pointer to 128 bit) 1721 # parameter 3: ks %rdx (pointer to ks) 1722 $code.=<<___; 1723.globl aes256gcmsiv_ecb_enc_block 1724.type aes256gcmsiv_ecb_enc_block,\@function,3 
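#
# Reference sketch (assembly comment only): the routine below is a
# single-block AES-256 ECB encryption with a fully expanded 15-round-key
# schedule, conceptually (pseudo-C, hypothetical helpers):
#
#   state = xor128(in, ks[0]);
#   for (int r = 1; r <= 13; r++)
#     state = aes_round(state, ks[r]);
#   out = aes_last_round(state, ks[14]);
#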
1725.align 16 1726aes256gcmsiv_ecb_enc_block: 1727.cfi_startproc 1728 _CET_ENDBR 1729 vmovdqa (%rdi), $STATE_1 1730 vpxor ($KSp), $STATE_1, $STATE_1 1731 vaesenc 1*16($KSp), $STATE_1, $STATE_1 1732 vaesenc 2*16($KSp), $STATE_1, $STATE_1 1733 vaesenc 3*16($KSp), $STATE_1, $STATE_1 1734 vaesenc 4*16($KSp), $STATE_1, $STATE_1 1735 vaesenc 5*16($KSp), $STATE_1, $STATE_1 1736 vaesenc 6*16($KSp), $STATE_1, $STATE_1 1737 vaesenc 7*16($KSp), $STATE_1, $STATE_1 1738 vaesenc 8*16($KSp), $STATE_1, $STATE_1 1739 vaesenc 9*16($KSp), $STATE_1, $STATE_1 1740 vaesenc 10*16($KSp), $STATE_1, $STATE_1 1741 vaesenc 11*16($KSp), $STATE_1, $STATE_1 1742 vaesenc 12*16($KSp), $STATE_1, $STATE_1 1743 vaesenc 13*16($KSp), $STATE_1, $STATE_1 1744 vaesenclast 14*16($KSp), $STATE_1, $STATE_1 # $STATE_1 == IV 1745 vmovdqa $STATE_1, (%rsi) 1746 ret 1747.cfi_endproc 1748.size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block 1749___ 1750} 1751aes256gcmsiv_ecb_enc_block(); 1752 1753sub aes256gcmsiv_enc_msg_x4 { 1754 my $CTR1 = "%xmm0"; 1755 my $CTR2 = "%xmm1"; 1756 my $CTR3 = "%xmm2"; 1757 my $CTR4 = "%xmm3"; 1758 my $ADDER = "%xmm4"; 1759 1760 my $STATE1 = "%xmm5"; 1761 my $STATE2 = "%xmm6"; 1762 my $STATE3 = "%xmm7"; 1763 my $STATE4 = "%xmm8"; 1764 1765 my $TMP = "%xmm12"; 1766 my $TMP2 = "%xmm13"; 1767 my $TMP3 = "%xmm14"; 1768 my $IV = "%xmm15"; 1769 1770 my $PT = "%rdi"; 1771 my $CT = "%rsi"; 1772 my $TAG = "%rdx"; 1773 my $KS = "%rcx"; 1774 my $LEN = "%r8"; 1775 1776 my $aes_round = sub { 1777 my ($i) = @_; 1778 return <<___; 1779 vmovdqu ${\eval($i*16)}($KS), $TMP 1780 vaesenc $TMP, $STATE1, $STATE1 1781 vaesenc $TMP, $STATE2, $STATE2 1782 vaesenc $TMP, $STATE3, $STATE3 1783 vaesenc $TMP, $STATE4, $STATE4 1784___ 1785 }; 1786 1787 my $aes_lastround = sub { 1788 my ($i) = @_; 1789 return <<___; 1790 vmovdqu ${\eval($i*16)}($KS), $TMP 1791 vaesenclast $TMP, $STATE1, $STATE1 1792 vaesenclast $TMP, $STATE2, $STATE2 1793 vaesenclast $TMP, $STATE3, $STATE3 1794 vaesenclast $TMP, $STATE4, $STATE4 1795___ 1796 }; 1797 1798 # void aes256gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT, 1799 # unsigned char* TAG, unsigned char* KS, 1800 # size_t byte_len); 1801 # parameter 1: %rdi #PT 1802 # parameter 2: %rsi #CT 1803 # parameter 3: %rdx #TAG [127 126 ... 
0] IV=[127...32] 1804 # parameter 4: %rcx #KS 1805 # parameter 5: %r8 #LEN MSG_length in bytes 1806 $code.=<<___; 1807.globl aes256gcmsiv_enc_msg_x4 1808.type aes256gcmsiv_enc_msg_x4,\@function,5 1809.align 16 1810aes256gcmsiv_enc_msg_x4: 1811.cfi_startproc 1812 _CET_ENDBR 1813 test $LEN, $LEN 1814 jnz .L256_enc_msg_x4_start 1815 ret 1816 1817.L256_enc_msg_x4_start: 1818 movq $LEN, %r10 1819 shrq \$4, $LEN # LEN = num of blocks 1820 shlq \$60, %r10 1821 jz .L256_enc_msg_x4_start2 1822 addq \$1, $LEN 1823 1824.L256_enc_msg_x4_start2: 1825 movq $LEN, %r10 1826 shlq \$62, %r10 1827 shrq \$62, %r10 1828 1829 # make IV from TAG 1830 vmovdqa ($TAG), $IV 1831 vpor OR_MASK(%rip), $IV, $IV # IV = [1]TAG[126...32][00..00] 1832 1833 vmovdqa four(%rip), $ADDER # Register to increment counters 1834 vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00] 1835 vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01] 1836 vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02] 1837 vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03] 1838 1839 shrq \$2, $LEN 1840 je .L256_enc_msg_x4_check_remainder 1841 1842 subq \$64, $CT 1843 subq \$64, $PT 1844 1845.L256_enc_msg_x4_loop1: 1846 addq \$64, $CT 1847 addq \$64, $PT 1848 1849 vmovdqa $CTR1, $STATE1 1850 vmovdqa $CTR2, $STATE2 1851 vmovdqa $CTR3, $STATE3 1852 vmovdqa $CTR4, $STATE4 1853 1854 vpxor ($KS), $STATE1, $STATE1 1855 vpxor ($KS), $STATE2, $STATE2 1856 vpxor ($KS), $STATE3, $STATE3 1857 vpxor ($KS), $STATE4, $STATE4 1858 1859 ${\$aes_round->(1)} 1860 vpaddd $ADDER, $CTR1, $CTR1 1861 ${\$aes_round->(2)} 1862 vpaddd $ADDER, $CTR2, $CTR2 1863 ${\$aes_round->(3)} 1864 vpaddd $ADDER, $CTR3, $CTR3 1865 ${\$aes_round->(4)} 1866 vpaddd $ADDER, $CTR4, $CTR4 1867 1868 ${\$aes_round->(5)} 1869 ${\$aes_round->(6)} 1870 ${\$aes_round->(7)} 1871 ${\$aes_round->(8)} 1872 ${\$aes_round->(9)} 1873 ${\$aes_round->(10)} 1874 ${\$aes_round->(11)} 1875 ${\$aes_round->(12)} 1876 ${\$aes_round->(13)} 1877 ${\$aes_lastround->(14)} 1878 1879 # XOR with Plaintext 1880 vpxor 0*16($PT), $STATE1, $STATE1 1881 vpxor 1*16($PT), $STATE2, $STATE2 1882 vpxor 2*16($PT), $STATE3, $STATE3 1883 vpxor 3*16($PT), $STATE4, $STATE4 1884 1885 subq \$1, $LEN 1886 1887 vmovdqu $STATE1, 0*16($CT) 1888 vmovdqu $STATE2, 1*16($CT) 1889 vmovdqu $STATE3, 2*16($CT) 1890 vmovdqu $STATE4, 3*16($CT) 1891 1892 jne .L256_enc_msg_x4_loop1 1893 1894 addq \$64, $CT 1895 addq \$64, $PT 1896 1897.L256_enc_msg_x4_check_remainder: 1898 cmpq \$0, %r10 1899 je .L256_enc_msg_x4_out 1900 1901.L256_enc_msg_x4_loop2: 1902 # encrypt each block separately 1903 # CTR1 is the highest counter (even if no LOOP done) 1904 1905 vmovdqa $CTR1, $STATE1 1906 vpaddd one(%rip), $CTR1, $CTR1 # inc counter 1907 vpxor ($KS), $STATE1, $STATE1 1908 vaesenc 16($KS), $STATE1, $STATE1 1909 vaesenc 32($KS), $STATE1, $STATE1 1910 vaesenc 48($KS), $STATE1, $STATE1 1911 vaesenc 64($KS), $STATE1, $STATE1 1912 vaesenc 80($KS), $STATE1, $STATE1 1913 vaesenc 96($KS), $STATE1, $STATE1 1914 vaesenc 112($KS), $STATE1, $STATE1 1915 vaesenc 128($KS), $STATE1, $STATE1 1916 vaesenc 144($KS), $STATE1, $STATE1 1917 vaesenc 160($KS), $STATE1, $STATE1 1918 vaesenc 176($KS), $STATE1, $STATE1 1919 vaesenc 192($KS), $STATE1, $STATE1 1920 vaesenc 208($KS), $STATE1, $STATE1 1921 vaesenclast 224($KS), $STATE1, $STATE1 1922 1923 # XOR with Plaintext 1924 vpxor ($PT), $STATE1, $STATE1 1925 1926 vmovdqu $STATE1, ($CT) 1927 1928 addq \$16, $PT 1929 addq \$16, $CT 1930 1931 subq \$1, %r10 1932 jne .L256_enc_msg_x4_loop2 1933 
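#
# Reference note (assembly comment only): the four-block loop and the
# single-block tail loop above are AES-256 in counter mode with the
# AES-GCM-SIV counter convention, roughly (pseudo-C, hypothetical helpers):
#
#   base = tag; base[15] |= 0x80;                         // OR_MASK
#   for (size_t i = 0; i < blocks; i++) {
#     block = base; put_le32(block, get_le32(base) + i);  // 32-bit counter
#     ct[i] = xor128(pt[i], aes256_encrypt(ks, block));
#   }
#
# The 32-bit counter in bytes 0..3 wraps modulo 2^32.
#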
1934.L256_enc_msg_x4_out: 1935 ret 1936.cfi_endproc 1937.size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4 1938___ 1939} 1940aes256gcmsiv_enc_msg_x4(); 1941 1942sub aes256gcmsiv_enc_msg_x8() { 1943 my $STATE1 = "%xmm1"; 1944 my $STATE2 = "%xmm2"; 1945 my $STATE3 = "%xmm3"; 1946 my $STATE4 = "%xmm4"; 1947 my $STATE5 = "%xmm5"; 1948 my $STATE6 = "%xmm6"; 1949 my $STATE7 = "%xmm7"; 1950 my $STATE8 = "%xmm8"; 1951 my $CTR1 = "%xmm0"; 1952 my $CTR2 = "%xmm9"; 1953 my $CTR3 = "%xmm10"; 1954 my $CTR4 = "%xmm11"; 1955 my $CTR5 = "%xmm12"; 1956 my $CTR6 = "%xmm13"; 1957 my $CTR7 = "%xmm14"; 1958 my $TMP1 = "%xmm1"; 1959 my $TMP2 = "%xmm2"; 1960 my $KS = "%rcx"; 1961 my $LEN = "%r8"; 1962 my $PT = "%rdi"; 1963 my $CT = "%rsi"; 1964 my $TAG = "%rdx"; 1965 my $SCHED = "%xmm15"; 1966 1967 my $aes_round8 = sub { 1968 my ($i) = @_; 1969 return <<___; 1970 vmovdqu ${\eval($i*16)}($KS), $SCHED 1971 vaesenc $SCHED, $STATE1, $STATE1 1972 vaesenc $SCHED, $STATE2, $STATE2 1973 vaesenc $SCHED, $STATE3, $STATE3 1974 vaesenc $SCHED, $STATE4, $STATE4 1975 vaesenc $SCHED, $STATE5, $STATE5 1976 vaesenc $SCHED, $STATE6, $STATE6 1977 vaesenc $SCHED, $STATE7, $STATE7 1978 vaesenc $SCHED, $STATE8, $STATE8 1979___ 1980 }; 1981 1982 my $aes_lastround8 = sub { 1983 my ($i) = @_; 1984 return <<___; 1985 vmovdqu ${\eval($i*16)}($KS), $SCHED 1986 vaesenclast $SCHED, $STATE1, $STATE1 1987 vaesenclast $SCHED, $STATE2, $STATE2 1988 vaesenclast $SCHED, $STATE3, $STATE3 1989 vaesenclast $SCHED, $STATE4, $STATE4 1990 vaesenclast $SCHED, $STATE5, $STATE5 1991 vaesenclast $SCHED, $STATE6, $STATE6 1992 vaesenclast $SCHED, $STATE7, $STATE7 1993 vaesenclast $SCHED, $STATE8, $STATE8 1994___ 1995 }; 1996 1997 # void ENC_MSG_x8(unsigned char* PT, 1998 # unsigned char* CT, 1999 # unsigned char* TAG, 2000 # unsigned char* KS, 2001 # size_t byte_len); 2002 # parameter 1: %rdi #PT 2003 # parameter 2: %rsi #CT 2004 # parameter 3: %rdx #TAG [127 126 ... 
0] IV=[127...32] 2005 # parameter 4: %rcx #KS 2006 # parameter 5: %r8 #LEN MSG_length in bytes 2007 $code.=<<___; 2008.globl aes256gcmsiv_enc_msg_x8 2009.type aes256gcmsiv_enc_msg_x8,\@function,5 2010.align 16 2011aes256gcmsiv_enc_msg_x8: 2012.cfi_startproc 2013 _CET_ENDBR 2014 test $LEN, $LEN 2015 jnz .L256_enc_msg_x8_start 2016 ret 2017 2018.L256_enc_msg_x8_start: 2019 # Place in stack 2020 movq %rsp, %r11 2021 subq \$16, %r11 2022 andq \$-64, %r11 2023 2024 movq $LEN, %r10 2025 shrq \$4, $LEN # LEN = num of blocks 2026 shlq \$60, %r10 2027 jz .L256_enc_msg_x8_start2 2028 addq \$1, $LEN 2029 2030.L256_enc_msg_x8_start2: 2031 movq $LEN, %r10 2032 shlq \$61, %r10 2033 shrq \$61, %r10 2034 2035 # Make IV from TAG 2036 vmovdqa ($TAG), $TMP1 2037 vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00] 2038 2039 # store counter8 on the stack 2040 vpaddd seven(%rip), $TMP1, $CTR1 2041 vmovdqa $CTR1, (%r11) # CTR8 = TAG[127...32][00..07] 2042 vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01] 2043 vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02] 2044 vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03] 2045 vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04] 2046 vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05] 2047 vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06] 2048 vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00] 2049 2050 shrq \$3, $LEN 2051 jz .L256_enc_msg_x8_check_remainder 2052 2053 subq \$128, $CT 2054 subq \$128, $PT 2055 2056.L256_enc_msg_x8_loop1: 2057 addq \$128, $CT 2058 addq \$128, $PT 2059 2060 vmovdqa $CTR1, $STATE1 2061 vmovdqa $CTR2, $STATE2 2062 vmovdqa $CTR3, $STATE3 2063 vmovdqa $CTR4, $STATE4 2064 vmovdqa $CTR5, $STATE5 2065 vmovdqa $CTR6, $STATE6 2066 vmovdqa $CTR7, $STATE7 2067 # move from stack 2068 vmovdqa (%r11), $STATE8 2069 2070 vpxor ($KS), $STATE1, $STATE1 2071 vpxor ($KS), $STATE2, $STATE2 2072 vpxor ($KS), $STATE3, $STATE3 2073 vpxor ($KS), $STATE4, $STATE4 2074 vpxor ($KS), $STATE5, $STATE5 2075 vpxor ($KS), $STATE6, $STATE6 2076 vpxor ($KS), $STATE7, $STATE7 2077 vpxor ($KS), $STATE8, $STATE8 2078 2079 ${\$aes_round8->(1)} 2080 vmovdqa (%r11), $CTR7 # deal with CTR8 2081 vpaddd eight(%rip), $CTR7, $CTR7 2082 vmovdqa $CTR7, (%r11) 2083 ${\$aes_round8->(2)} 2084 vpsubd one(%rip), $CTR7, $CTR7 2085 ${\$aes_round8->(3)} 2086 vpaddd eight(%rip), $CTR1, $CTR1 2087 ${\$aes_round8->(4)} 2088 vpaddd eight(%rip), $CTR2, $CTR2 2089 ${\$aes_round8->(5)} 2090 vpaddd eight(%rip), $CTR3, $CTR3 2091 ${\$aes_round8->(6)} 2092 vpaddd eight(%rip), $CTR4, $CTR4 2093 ${\$aes_round8->(7)} 2094 vpaddd eight(%rip), $CTR5, $CTR5 2095 ${\$aes_round8->(8)} 2096 vpaddd eight(%rip), $CTR6, $CTR6 2097 ${\$aes_round8->(9)} 2098 ${\$aes_round8->(10)} 2099 ${\$aes_round8->(11)} 2100 ${\$aes_round8->(12)} 2101 ${\$aes_round8->(13)} 2102 ${\$aes_lastround8->(14)} 2103 2104 # XOR with Plaintext 2105 vpxor 0*16($PT), $STATE1, $STATE1 2106 vpxor 1*16($PT), $STATE2, $STATE2 2107 vpxor 2*16($PT), $STATE3, $STATE3 2108 vpxor 3*16($PT), $STATE4, $STATE4 2109 vpxor 4*16($PT), $STATE5, $STATE5 2110 vpxor 5*16($PT), $STATE6, $STATE6 2111 vpxor 6*16($PT), $STATE7, $STATE7 2112 vpxor 7*16($PT), $STATE8, $STATE8 2113 2114 subq \$1, $LEN 2115 2116 vmovdqu $STATE1, 0*16($CT) 2117 vmovdqu $STATE2, 1*16($CT) 2118 vmovdqu $STATE3, 2*16($CT) 2119 vmovdqu $STATE4, 3*16($CT) 2120 vmovdqu $STATE5, 4*16($CT) 2121 vmovdqu $STATE6, 5*16($CT) 2122 vmovdqu $STATE7, 6*16($CT) 2123 vmovdqu $STATE8, 7*16($CT) 2124 2125 jne 
.L256_enc_msg_x8_loop1 2126 2127 addq \$128, $CT 2128 addq \$128, $PT 2129 2130.L256_enc_msg_x8_check_remainder: 2131 cmpq \$0, %r10 2132 je .L256_enc_msg_x8_out 2133 2134.L256_enc_msg_x8_loop2: 2135 # encrypt each block separately 2136 # CTR1 is the highest counter (even if no LOOP done) 2137 vmovdqa $CTR1, $STATE1 2138 vpaddd one(%rip), $CTR1, $CTR1 2139 2140 vpxor ($KS), $STATE1, $STATE1 2141 vaesenc 16($KS), $STATE1, $STATE1 2142 vaesenc 32($KS), $STATE1, $STATE1 2143 vaesenc 48($KS), $STATE1, $STATE1 2144 vaesenc 64($KS), $STATE1, $STATE1 2145 vaesenc 80($KS), $STATE1, $STATE1 2146 vaesenc 96($KS), $STATE1, $STATE1 2147 vaesenc 112($KS), $STATE1, $STATE1 2148 vaesenc 128($KS), $STATE1, $STATE1 2149 vaesenc 144($KS), $STATE1, $STATE1 2150 vaesenc 160($KS), $STATE1, $STATE1 2151 vaesenc 176($KS), $STATE1, $STATE1 2152 vaesenc 192($KS), $STATE1, $STATE1 2153 vaesenc 208($KS), $STATE1, $STATE1 2154 vaesenclast 224($KS), $STATE1, $STATE1 2155 2156 # XOR with Plaintext 2157 vpxor ($PT), $STATE1, $STATE1 2158 2159 vmovdqu $STATE1, ($CT) 2160 2161 addq \$16, $PT 2162 addq \$16, $CT 2163 subq \$1, %r10 2164 jnz .L256_enc_msg_x8_loop2 2165 2166.L256_enc_msg_x8_out: 2167 ret 2168 2169.cfi_endproc 2170.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8 2171___ 2172} 2173aes256gcmsiv_enc_msg_x8(); 2174aesgcmsiv_dec(1); 2175 2176sub aes256gcmsiv_kdf { 2177 my $ONE = "%xmm8"; 2178 my $BLOCK1 = "%xmm4"; 2179 my $BLOCK2 = "%xmm6"; 2180 my $BLOCK3 = "%xmm7"; 2181 my $BLOCK4 = "%xmm11"; 2182 my $BLOCK5 = "%xmm12"; 2183 my $BLOCK6 = "%xmm13"; 2184 2185 my $enc_roundx6 = sub { 2186 my ($i, $j) = @_; 2187 return <<___; 2188 vmovdqa ${\eval($i*16)}(%rdx), $j 2189 vaesenc $j, $BLOCK1, $BLOCK1 2190 vaesenc $j, $BLOCK2, $BLOCK2 2191 vaesenc $j, $BLOCK3, $BLOCK3 2192 vaesenc $j, $BLOCK4, $BLOCK4 2193 vaesenc $j, $BLOCK5, $BLOCK5 2194 vaesenc $j, $BLOCK6, $BLOCK6 2195___ 2196 }; 2197 2198 my $enc_roundlastx6 = sub { 2199 my ($i, $j) = @_; 2200 return <<___; 2201 vmovdqa ${\eval($i*16)}(%rdx), $j 2202 vaesenclast $j, $BLOCK1, $BLOCK1 2203 vaesenclast $j, $BLOCK2, $BLOCK2 2204 vaesenclast $j, $BLOCK3, $BLOCK3 2205 vaesenclast $j, $BLOCK4, $BLOCK4 2206 vaesenclast $j, $BLOCK5, $BLOCK5 2207 vaesenclast $j, $BLOCK6, $BLOCK6 2208___ 2209 }; 2210 2211 # void aes256gcmsiv_kdf(const uint8_t nonce[16], 2212 # uint8_t *out_key_material, 2213 # const uint8_t *key_schedule); 2214 $code.=<<___; 2215.globl aes256gcmsiv_kdf 2216.type aes256gcmsiv_kdf,\@function,3 2217.align 16 2218aes256gcmsiv_kdf: 2219.cfi_startproc 2220 _CET_ENDBR 2221# parameter 1: %rdi Pointer to NONCE 2222# parameter 2: %rsi Pointer to CT 2223# parameter 4: %rdx Pointer to keys 2224 2225 vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key 2226 vmovdqa 0*16(%rdi), $BLOCK1 2227 vmovdqa and_mask(%rip), $BLOCK4 2228 vmovdqa one(%rip), $ONE 2229 vpshufd \$0x90, $BLOCK1, $BLOCK1 2230 vpand $BLOCK4, $BLOCK1, $BLOCK1 2231 vpaddd $ONE, $BLOCK1, $BLOCK2 2232 vpaddd $ONE, $BLOCK2, $BLOCK3 2233 vpaddd $ONE, $BLOCK3, $BLOCK4 2234 vpaddd $ONE, $BLOCK4, $BLOCK5 2235 vpaddd $ONE, $BLOCK5, $BLOCK6 2236 2237 vpxor %xmm1, $BLOCK1, $BLOCK1 2238 vpxor %xmm1, $BLOCK2, $BLOCK2 2239 vpxor %xmm1, $BLOCK3, $BLOCK3 2240 vpxor %xmm1, $BLOCK4, $BLOCK4 2241 vpxor %xmm1, $BLOCK5, $BLOCK5 2242 vpxor %xmm1, $BLOCK6, $BLOCK6 2243 2244 ${\$enc_roundx6->(1, "%xmm1")} 2245 ${\$enc_roundx6->(2, "%xmm2")} 2246 ${\$enc_roundx6->(3, "%xmm1")} 2247 ${\$enc_roundx6->(4, "%xmm2")} 2248 ${\$enc_roundx6->(5, "%xmm1")} 2249 ${\$enc_roundx6->(6, "%xmm2")} 2250 ${\$enc_roundx6->(7, "%xmm1")} 
2251 ${\$enc_roundx6->(8, "%xmm2")} 2252 ${\$enc_roundx6->(9, "%xmm1")} 2253 ${\$enc_roundx6->(10, "%xmm2")} 2254 ${\$enc_roundx6->(11, "%xmm1")} 2255 ${\$enc_roundx6->(12, "%xmm2")} 2256 ${\$enc_roundx6->(13, "%xmm1")} 2257 ${\$enc_roundlastx6->(14, "%xmm2")} 2258 2259 vmovdqa $BLOCK1, 0*16(%rsi) 2260 vmovdqa $BLOCK2, 1*16(%rsi) 2261 vmovdqa $BLOCK3, 2*16(%rsi) 2262 vmovdqa $BLOCK4, 3*16(%rsi) 2263 vmovdqa $BLOCK5, 4*16(%rsi) 2264 vmovdqa $BLOCK6, 5*16(%rsi) 2265 ret 2266.cfi_endproc 2267.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf 2268___ 2269} 2270aes256gcmsiv_kdf(); 2271 2272print $code; 2273 2274close STDOUT or die "error closing STDOUT: $!"; 2275
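
# Reference note (comment only): the aes128gcmsiv_kdf/aes256gcmsiv_kdf
# routines above derive the per-nonce keys by encrypting little-endian
# counter blocks built from the nonce, roughly (pseudo-C, hypothetical
# helpers):
#
#   for (uint32_t i = 0; i < n; i++) {    // n = 4 (AES-128), 6 (AES-256)
#     block = le32(i) || nonce[0..11];
#     out[i] = aes_encrypt(key_schedule, block);
#   }
#
# Per the AES-GCM-SIV specification, the caller keeps only the first eight
# bytes of each output block.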