# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#include "ring_core_generated/prefix_symbols_asm.h"
.text	

.globl	_gcm_init_clmul
.private_extern _gcm_init_clmul

.p2align	4
_gcm_init_clmul:

L$_init_clmul:
	movdqu	(%rsi),%xmm2
	pshufd	$78,%xmm2,%xmm2


	pshufd	$255,%xmm2,%xmm4
	movdqa	%xmm2,%xmm3
	psllq	$1,%xmm2
	pxor	%xmm5,%xmm5
	psrlq	$63,%xmm3
	pcmpgtd	%xmm4,%xmm5
	pslldq	$8,%xmm3
	por	%xmm3,%xmm2


	pand	L$0x1c2_polynomial(%rip),%xmm5
	pxor	%xmm5,%xmm2


	pshufd	$78,%xmm2,%xmm6
	movdqa	%xmm2,%xmm0
	pxor	%xmm2,%xmm6
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	pshufd	$78,%xmm2,%xmm3
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm2,%xmm3
	movdqu	%xmm2,0(%rdi)
	pxor	%xmm0,%xmm4
	movdqu	%xmm0,16(%rdi)
.byte	102,15,58,15,227,8
	movdqu	%xmm4,32(%rdi)
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	movdqa	%xmm0,%xmm5
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,222,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	pshufd	$78,%xmm5,%xmm3
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm5,%xmm3
	movdqu	%xmm5,48(%rdi)
	pxor	%xmm0,%xmm4
	movdqu	%xmm0,64(%rdi)
.byte	102,15,58,15,227,8
	movdqu	%xmm4,80(%rdi)
	.byte	0xf3,0xc3


.globl	_gcm_gmult_clmul
.private_extern _gcm_gmult_clmul

.p2align	4
_gcm_gmult_clmul:

L$_gmult_clmul:
	movdqu	(%rdi),%xmm0
	movdqa	L$bswap_mask(%rip),%xmm5
	movdqu	(%rsi),%xmm2
	movdqu	32(%rsi),%xmm4
.byte	102,15,56,0,197
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,220,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
.byte	102,15,56,0,197
	movdqu	%xmm0,(%rdi)
	.byte	0xf3,0xc3


.globl	_gcm_ghash_clmul
.private_extern _gcm_ghash_clmul

.p2align	5
_gcm_ghash_clmul:

L$_ghash_clmul:
	movdqa	L$bswap_mask(%rip),%xmm10

	movdqu	(%rdi),%xmm0
	movdqu	(%rsi),%xmm2
	movdqu	32(%rsi),%xmm7
.byte	102,65,15,56,0,194

	subq	$0x10,%rcx
	jz	L$odd_tail

	movdqu	16(%rsi),%xmm6
	leaq	_OPENSSL_ia32cap_P(%rip),%rax
	movl	4(%rax),%eax
	cmpq	$0x30,%rcx
	jb	L$skip4x

	andl	$71303168,%eax
	cmpl	$4194304,%eax
	je	L$skip4x

	subq	$0x30,%rcx
	movq	$0xA040608020C0E000,%rax
	movdqu	48(%rsi),%xmm14
	movdqu	64(%rsi),%xmm15




	movdqu	48(%rdx),%xmm3
	movdqu	32(%rdx),%xmm11
.byte	102,65,15,56,0,218
.byte	102,69,15,56,0,218
	movdqa	%xmm3,%xmm5
	pshufd	$78,%xmm3,%xmm4
	pxor	%xmm3,%xmm4
.byte	102,15,58,68,218,0
.byte	102,15,58,68,234,17
.byte	102,15,58,68,231,0

	movdqa	%xmm11,%xmm13
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm11,%xmm12
.byte	102,68,15,58,68,222,0
.byte	102,68,15,58,68,238,17
.byte	102,68,15,58,68,231,16
	xorps	%xmm11,%xmm3
	xorps	%xmm13,%xmm5
	movups	80(%rsi),%xmm7
	xorps	%xmm12,%xmm4

	movdqu	16(%rdx),%xmm11
	movdqu	0(%rdx),%xmm8
.byte	102,69,15,56,0,218
.byte	102,69,15,56,0,194
	movdqa	%xmm11,%xmm13
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm8,%xmm0
	pxor	%xmm11,%xmm12
.byte	102,69,15,58,68,222,0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm8
	pxor	%xmm0,%xmm8
.byte	102,69,15,58,68,238,17
.byte	102,68,15,58,68,231,0
	xorps	%xmm11,%xmm3
	xorps	%xmm13,%xmm5

	leaq	64(%rdx),%rdx
	subq	$0x40,%rcx
	jc	L$tail4x

	jmp	L$mod4_loop
.p2align	5
L$mod4_loop:
.byte	102,65,15,58,68,199,0
	xorps	%xmm12,%xmm4
	movdqu	48(%rdx),%xmm11
.byte	102,69,15,56,0,218
.byte	102,65,15,58,68,207,17
	xorps	%xmm3,%xmm0
	movdqu	32(%rdx),%xmm3
	movdqa	%xmm11,%xmm13
.byte	102,68,15,58,68,199,16
	pshufd	$78,%xmm11,%xmm12
	xorps	%xmm5,%xmm1
	pxor	%xmm11,%xmm12
.byte	102,65,15,56,0,218
	movups	32(%rsi),%xmm7
	xorps	%xmm4,%xmm8
.byte	102,68,15,58,68,218,0
	pshufd	$78,%xmm3,%xmm4

	pxor	%xmm0,%xmm8
	movdqa	%xmm3,%xmm5
	pxor	%xmm1,%xmm8
	pxor	%xmm3,%xmm4
	movdqa	%xmm8,%xmm9
.byte	102,68,15,58,68,234,17
	pslldq	$8,%xmm8
	psrldq	$8,%xmm9
	pxor	%xmm8,%xmm0
	movdqa	L$7_mask(%rip),%xmm8
	pxor	%xmm9,%xmm1
.byte	102,76,15,110,200

	pand	%xmm0,%xmm8
.byte	102,69,15,56,0,200
	pxor	%xmm0,%xmm9
.byte	102,68,15,58,68,231,0
	psllq	$57,%xmm9
	movdqa	%xmm9,%xmm8
	pslldq	$8,%xmm9
.byte	102,15,58,68,222,0
	psrldq	$8,%xmm8
	pxor	%xmm9,%xmm0
	pxor	%xmm8,%xmm1
	movdqu	0(%rdx),%xmm8

	movdqa	%xmm0,%xmm9
	psrlq	$1,%xmm0
.byte	102,15,58,68,238,17
	xorps	%xmm11,%xmm3
	movdqu	16(%rdx),%xmm11
.byte	102,69,15,56,0,218
.byte	102,15,58,68,231,16
	xorps	%xmm13,%xmm5
	movups	80(%rsi),%xmm7
.byte	102,69,15,56,0,194
	pxor	%xmm9,%xmm1
	pxor	%xmm0,%xmm9
	psrlq	$5,%xmm0

	movdqa	%xmm11,%xmm13
	pxor	%xmm12,%xmm4
	pshufd	$78,%xmm11,%xmm12
	pxor	%xmm9,%xmm0
	pxor	%xmm8,%xmm1
	pxor	%xmm11,%xmm12
.byte	102,69,15,58,68,222,0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	movdqa	%xmm0,%xmm1
.byte	102,69,15,58,68,238,17
	xorps	%xmm11,%xmm3
	pshufd	$78,%xmm0,%xmm8
	pxor	%xmm0,%xmm8

.byte	102,68,15,58,68,231,0
	xorps	%xmm13,%xmm5

	leaq	64(%rdx),%rdx
	subq	$0x40,%rcx
	jnc	L$mod4_loop

L$tail4x:
.byte	102,65,15,58,68,199,0
.byte	102,65,15,58,68,207,17
.byte	102,68,15,58,68,199,16
	xorps	%xmm12,%xmm4
	xorps	%xmm3,%xmm0
	xorps	%xmm5,%xmm1
	pxor	%xmm0,%xmm1
	pxor	%xmm4,%xmm8

	pxor	%xmm1,%xmm8
	pxor	%xmm0,%xmm1

	movdqa	%xmm8,%xmm9
	psrldq	$8,%xmm8
	pslldq	$8,%xmm9
	pxor	%xmm8,%xmm1
	pxor	%xmm9,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	addq	$0x40,%rcx
	jz	L$done
	movdqu	32(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	L$odd_tail
L$skip4x:





	movdqu	(%rdx),%xmm8
	movdqu	16(%rdx),%xmm3
.byte	102,69,15,56,0,194
.byte	102,65,15,56,0,218
	pxor	%xmm8,%xmm0

	movdqa	%xmm3,%xmm5
	pshufd	$78,%xmm3,%xmm4
	pxor	%xmm3,%xmm4
.byte	102,15,58,68,218,0
.byte	102,15,58,68,234,17
.byte	102,15,58,68,231,0

	leaq	32(%rdx),%rdx
	nop
	subq	$0x20,%rcx
	jbe	L$even_tail
	nop
	jmp	L$mod_loop

.p2align	5
L$mod_loop:
	movdqa	%xmm0,%xmm1
	movdqa	%xmm4,%xmm8
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm0,%xmm4

.byte	102,15,58,68,198,0
.byte	102,15,58,68,206,17
.byte	102,15,58,68,231,16

	pxor	%xmm3,%xmm0
	pxor	%xmm5,%xmm1
	movdqu	(%rdx),%xmm9
	pxor	%xmm0,%xmm8
.byte	102,69,15,56,0,202
	movdqu	16(%rdx),%xmm3

	pxor	%xmm1,%xmm8
	pxor	%xmm9,%xmm1
	pxor	%xmm8,%xmm4
.byte	102,65,15,56,0,218
	movdqa	%xmm4,%xmm8
	psrldq	$8,%xmm8
	pslldq	$8,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm3,%xmm5

	movdqa	%xmm0,%xmm9
	movdqa	%xmm0,%xmm8
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm8
.byte	102,15,58,68,218,0
	psllq	$1,%xmm0
	pxor	%xmm8,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm8
	pslldq	$8,%xmm0
	psrldq	$8,%xmm8
	pxor	%xmm9,%xmm0
	pshufd	$78,%xmm5,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm5,%xmm4

	movdqa	%xmm0,%xmm9
	psrlq	$1,%xmm0
.byte	102,15,58,68,234,17
	pxor	%xmm9,%xmm1
	pxor	%xmm0,%xmm9
	psrlq	$5,%xmm0
	pxor	%xmm9,%xmm0
	leaq	32(%rdx),%rdx
	psrlq	$1,%xmm0
.byte	102,15,58,68,231,0
	pxor	%xmm1,%xmm0

	subq	$0x20,%rcx
	ja	L$mod_loop

L$even_tail:
	movdqa	%xmm0,%xmm1
	movdqa	%xmm4,%xmm8
	pshufd	$78,%xmm0,%xmm4
	pxor	%xmm0,%xmm4

.byte	102,15,58,68,198,0
.byte	102,15,58,68,206,17
.byte	102,15,58,68,231,16

	pxor	%xmm3,%xmm0
	pxor	%xmm5,%xmm1
	pxor	%xmm0,%xmm8
	pxor	%xmm1,%xmm8
	pxor	%xmm8,%xmm4
	movdqa	%xmm4,%xmm8
	psrldq	$8,%xmm8
	pslldq	$8,%xmm4
	pxor	%xmm8,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
	testq	%rcx,%rcx
	jnz	L$done

L$odd_tail:
	movdqu	(%rdx),%xmm8
.byte	102,69,15,56,0,194
	pxor	%xmm8,%xmm0
	movdqa	%xmm0,%xmm1
	pshufd	$78,%xmm0,%xmm3
	pxor	%xmm0,%xmm3
.byte	102,15,58,68,194,0
.byte	102,15,58,68,202,17
.byte	102,15,58,68,223,0
	pxor	%xmm0,%xmm3
	pxor	%xmm1,%xmm3

	movdqa	%xmm3,%xmm4
	psrldq	$8,%xmm3
	pslldq	$8,%xmm4
	pxor	%xmm3,%xmm1
	pxor	%xmm4,%xmm0

	movdqa	%xmm0,%xmm4
	movdqa	%xmm0,%xmm3
	psllq	$5,%xmm0
	pxor	%xmm0,%xmm3
	psllq	$1,%xmm0
	pxor	%xmm3,%xmm0
	psllq	$57,%xmm0
	movdqa	%xmm0,%xmm3
	pslldq	$8,%xmm0
	psrldq	$8,%xmm3
	pxor	%xmm4,%xmm0
	pxor	%xmm3,%xmm1


	movdqa	%xmm0,%xmm4
	psrlq	$1,%xmm0
	pxor	%xmm4,%xmm1
	pxor	%xmm0,%xmm4
	psrlq	$5,%xmm0
	pxor	%xmm4,%xmm0
	psrlq	$1,%xmm0
	pxor	%xmm1,%xmm0
L$done:
.byte	102,65,15,56,0,194
	movdqu	%xmm0,(%rdi)
	.byte	0xf3,0xc3


.globl	_gcm_init_avx
.private_extern _gcm_init_avx

.p2align	5
_gcm_init_avx:

	vzeroupper

	vmovdqu	(%rsi),%xmm2
	vpshufd	$78,%xmm2,%xmm2


	vpshufd	$255,%xmm2,%xmm4
	vpsrlq	$63,%xmm2,%xmm3
	vpsllq	$1,%xmm2,%xmm2
	vpxor	%xmm5,%xmm5,%xmm5
	vpcmpgtd	%xmm4,%xmm5,%xmm5
	vpslldq	$8,%xmm3,%xmm3
	vpor	%xmm3,%xmm2,%xmm2


	vpand	L$0x1c2_polynomial(%rip),%xmm5,%xmm5
	vpxor	%xmm5,%xmm2,%xmm2

	vpunpckhqdq	%xmm2,%xmm2,%xmm6
	vmovdqa	%xmm2,%xmm0
	vpxor	%xmm2,%xmm6,%xmm6
	movq	$4,%r10
	jmp	L$init_start_avx
.p2align	5
L$init_loop_avx:
	vpalignr	$8,%xmm3,%xmm4,%xmm5
	vmovdqu	%xmm5,-16(%rdi)
	vpunpckhqdq	%xmm0,%xmm0,%xmm3
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
	vpxor	%xmm0,%xmm1,%xmm4
	vpxor	%xmm4,%xmm3,%xmm3

	vpslldq	$8,%xmm3,%xmm4
	vpsrldq	$8,%xmm3,%xmm3
	vpxor	%xmm4,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	vpsllq	$57,%xmm0,%xmm3
	vpsllq	$62,%xmm0,%xmm4
	vpxor	%xmm3,%xmm4,%xmm4
	vpsllq	$63,%xmm0,%xmm3
	vpxor	%xmm3,%xmm4,%xmm4
	vpslldq	$8,%xmm4,%xmm3
	vpsrldq	$8,%xmm4,%xmm4
	vpxor	%xmm3,%xmm0,%xmm0
	vpxor	%xmm4,%xmm1,%xmm1

	vpsrlq	$1,%xmm0,%xmm4
	vpxor	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$5,%xmm4,%xmm4
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$1,%xmm0,%xmm0
	vpxor	%xmm1,%xmm0,%xmm0
L$init_start_avx:
	vmovdqa	%xmm0,%xmm5
	vpunpckhqdq	%xmm0,%xmm0,%xmm3
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
	vpxor	%xmm0,%xmm1,%xmm4
	vpxor	%xmm4,%xmm3,%xmm3

	vpslldq	$8,%xmm3,%xmm4
	vpsrldq	$8,%xmm3,%xmm3
	vpxor	%xmm4,%xmm0,%xmm0
	vpxor	%xmm3,%xmm1,%xmm1
	vpsllq	$57,%xmm0,%xmm3
	vpsllq	$62,%xmm0,%xmm4
	vpxor	%xmm3,%xmm4,%xmm4
	vpsllq	$63,%xmm0,%xmm3
	vpxor	%xmm3,%xmm4,%xmm4
	vpslldq	$8,%xmm4,%xmm3
	vpsrldq	$8,%xmm4,%xmm4
	vpxor	%xmm3,%xmm0,%xmm0
	vpxor	%xmm4,%xmm1,%xmm1

	vpsrlq	$1,%xmm0,%xmm4
	vpxor	%xmm0,%xmm1,%xmm1
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$5,%xmm4,%xmm4
	vpxor	%xmm4,%xmm0,%xmm0
	vpsrlq	$1,%xmm0,%xmm0
	vpxor	%xmm1,%xmm0,%xmm0
	vpshufd	$78,%xmm5,%xmm3
	vpshufd	$78,%xmm0,%xmm4
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqu	%xmm5,0(%rdi)
	vpxor	%xmm0,%xmm4,%xmm4
	vmovdqu	%xmm0,16(%rdi)
	leaq	48(%rdi),%rdi
	subq	$1,%r10
	jnz	L$init_loop_avx

	vpalignr	$8,%xmm4,%xmm3,%xmm5
	vmovdqu	%xmm5,-16(%rdi)

	vzeroupper
	.byte	0xf3,0xc3


.globl	_gcm_ghash_avx
.private_extern _gcm_ghash_avx

.p2align	5
_gcm_ghash_avx:

	vzeroupper

	vmovdqu	(%rdi),%xmm10
	leaq	L$0x1c2_polynomial(%rip),%r10
	leaq	64(%rsi),%rsi
	vmovdqu	L$bswap_mask(%rip),%xmm13
	vpshufb	%xmm13,%xmm10,%xmm10
	cmpq	$0x80,%rcx
	jb	L$short_avx
	subq	$0x80,%rcx

	vmovdqu	112(%rdx),%xmm14
	vmovdqu	0-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm14
	vmovdqu	32-64(%rsi),%xmm7

	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vmovdqu	96(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm14,%xmm9,%xmm9
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	16-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vmovdqu	80(%rdx),%xmm14
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	48-64(%rsi),%xmm6
	vpxor	%xmm14,%xmm9,%xmm9
	vmovdqu	64(%rdx),%xmm15
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	80-64(%rsi),%xmm7

	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vmovdqu	48(%rdx),%xmm14
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm4,%xmm1,%xmm1
	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	96-64(%rsi),%xmm6
	vpxor	%xmm5,%xmm2,%xmm2
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	128-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	32(%rdx),%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8

	vmovdqu	16(%rdx),%xmm14
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm4,%xmm1,%xmm1
	vpshufb	%xmm13,%xmm14,%xmm14
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	144-64(%rsi),%xmm6
	vpxor	%xmm5,%xmm2,%xmm2
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	176-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	(%rdx),%xmm15
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm1,%xmm4,%xmm4
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	160-64(%rsi),%xmm6
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2

	leaq	128(%rdx),%rdx
	cmpq	$0x80,%rcx
	jb	L$tail_avx

	vpxor	%xmm10,%xmm15,%xmm15
	subq	$0x80,%rcx
	jmp	L$oop8x_avx

.p2align	5
L$oop8x_avx:
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vmovdqu	112(%rdx),%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpxor	%xmm15,%xmm8,%xmm8
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
	vmovdqu	0-64(%rsi),%xmm6
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
	vmovdqu	32-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9

	vmovdqu	96(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpxor	%xmm3,%xmm10,%xmm10
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vxorps	%xmm4,%xmm11,%xmm11
	vmovdqu	16-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm5,%xmm12,%xmm12
	vxorps	%xmm15,%xmm8,%xmm8

	vmovdqu	80(%rdx),%xmm14
	vpxor	%xmm10,%xmm12,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpxor	%xmm11,%xmm12,%xmm12
	vpslldq	$8,%xmm12,%xmm9
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vpsrldq	$8,%xmm12,%xmm12
	vpxor	%xmm9,%xmm10,%xmm10
	vmovdqu	48-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm14
	vxorps	%xmm12,%xmm11,%xmm11
	vpxor	%xmm1,%xmm4,%xmm4
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	80-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	64(%rdx),%xmm15
	vpalignr	$8,%xmm10,%xmm10,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm4,%xmm1,%xmm1
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vxorps	%xmm15,%xmm8,%xmm8
	vpxor	%xmm5,%xmm2,%xmm2

	vmovdqu	48(%rdx),%xmm14
	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	96-64(%rsi),%xmm6
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	128-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	32(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpxor	%xmm3,%xmm0,%xmm0
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm4,%xmm1,%xmm1
	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
	vpxor	%xmm15,%xmm8,%xmm8
	vpxor	%xmm5,%xmm2,%xmm2
	vxorps	%xmm12,%xmm10,%xmm10

	vmovdqu	16(%rdx),%xmm14
	vpalignr	$8,%xmm10,%xmm10,%xmm12
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
	vpshufb	%xmm13,%xmm14,%xmm14
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
	vmovdqu	144-64(%rsi),%xmm6
	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
	vxorps	%xmm11,%xmm12,%xmm12
	vpunpckhqdq	%xmm14,%xmm14,%xmm9
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
	vmovdqu	176-64(%rsi),%xmm7
	vpxor	%xmm14,%xmm9,%xmm9
	vpxor	%xmm2,%xmm5,%xmm5

	vmovdqu	(%rdx),%xmm15
	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
	vpshufb	%xmm13,%xmm15,%xmm15
	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
	vmovdqu	160-64(%rsi),%xmm6
	vpxor	%xmm12,%xmm15,%xmm15
	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
	vpxor	%xmm10,%xmm15,%xmm15

	leaq	128(%rdx),%rdx
	subq	$0x80,%rcx
	jnc	L$oop8x_avx

	addq	$0x80,%rcx
	jmp	L$tail_no_xor_avx

.p2align	5
L$short_avx:
	vmovdqu	-16(%rdx,%rcx,1),%xmm14
	leaq	(%rdx,%rcx,1),%rdx
	vmovdqu	0-64(%rsi),%xmm6
	vmovdqu	32-64(%rsi),%xmm7
	vpshufb	%xmm13,%xmm14,%xmm15

	vmovdqa	%xmm0,%xmm3
	vmovdqa	%xmm1,%xmm4
	vmovdqa	%xmm2,%xmm5
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-32(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	16-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-48(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	48-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovdqu	80-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-64(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	64-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-80(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	96-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovdqu	128-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-96(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	112-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vpsrldq	$8,%xmm7,%xmm7
	subq	$0x10,%rcx
	jz	L$tail_avx

	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	-112(%rdx),%xmm14
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vmovdqu	144-64(%rsi),%xmm6
	vpshufb	%xmm13,%xmm14,%xmm15
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
	vmovq	184-64(%rsi),%xmm7
	subq	$0x10,%rcx
	jmp	L$tail_avx

.p2align	5
L$tail_avx:
	vpxor	%xmm10,%xmm15,%xmm15
L$tail_no_xor_avx:
	vpunpckhqdq	%xmm15,%xmm15,%xmm8
	vpxor	%xmm0,%xmm3,%xmm3
	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
	vpxor	%xmm15,%xmm8,%xmm8
	vpxor	%xmm1,%xmm4,%xmm4
	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
	vpxor	%xmm2,%xmm5,%xmm5
	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2

	vmovdqu	(%r10),%xmm12

	vpxor	%xmm0,%xmm3,%xmm10
	vpxor	%xmm1,%xmm4,%xmm11
	vpxor	%xmm2,%xmm5,%xmm5

	vpxor	%xmm10,%xmm5,%xmm5
	vpxor	%xmm11,%xmm5,%xmm5
	vpslldq	$8,%xmm5,%xmm9
	vpsrldq	$8,%xmm5,%xmm5
	vpxor	%xmm9,%xmm10,%xmm10
	vpxor	%xmm5,%xmm11,%xmm11

	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
	vpalignr	$8,%xmm10,%xmm10,%xmm10
	vpxor	%xmm9,%xmm10,%xmm10

	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
	vpalignr	$8,%xmm10,%xmm10,%xmm10
	vpxor	%xmm11,%xmm10,%xmm10
	vpxor	%xmm9,%xmm10,%xmm10

	cmpq	$0,%rcx
	jne	L$short_avx

	vpshufb	%xmm13,%xmm10,%xmm10
	vmovdqu	%xmm10,(%rdi)
	vzeroupper
	.byte	0xf3,0xc3


.p2align	6
L$bswap_mask:
.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
L$0x1c2_polynomial:
.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
L$7_mask:
.long	7,0,7,0
.p2align	6

.byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align	6
#endif