1/* 2 * Intel SHA Extensions optimized implementation of a SHA-256 update function 3 * 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 * redistributing this file, you may do so under either license. 6 * 7 * GPL LICENSE SUMMARY 8 * 9 * Copyright(c) 2015 Intel Corporation. 10 * 11 * This program is free software; you can redistribute it and/or modify 12 * it under the terms of version 2 of the GNU General Public License as 13 * published by the Free Software Foundation. 14 * 15 * This program is distributed in the hope that it will be useful, but 16 * WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * Contact Information: 21 * Sean Gulley <sean.m.gulley@intel.com> 22 * Tim Chen <tim.c.chen@linux.intel.com> 23 * 24 * BSD LICENSE 25 * 26 * Copyright(c) 2015 Intel Corporation. 27 * 28 * Redistribution and use in source and binary forms, with or without 29 * modification, are permitted provided that the following conditions 30 * are met: 31 * 32 * * Redistributions of source code must retain the above copyright 33 * notice, this list of conditions and the following disclaimer. 34 * * Redistributions in binary form must reproduce the above copyright 35 * notice, this list of conditions and the following disclaimer in 36 * the documentation and/or other materials provided with the 37 * distribution. 38 * * Neither the name of Intel Corporation nor the names of its 39 * contributors may be used to endorse or promote products derived 40 * from this software without specific prior written permission. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 43 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 44 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 45 * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

/* Function arguments (SysV AMD64 ABI) */
#define DIGEST_PTR	%rdi	/* 1st arg */
#define DATA_PTR	%rsi	/* 2nd arg */
#define NUM_BLKS	%rdx	/* 3rd arg */

#define SHA256CONSTANTS	%rax

#define MSG		%xmm0	/* sha256rnds2 implicit operand */
#define STATE0		%xmm1
#define STATE1		%xmm2
#define MSG0		%xmm3
#define MSG1		%xmm4
#define MSG2		%xmm5
#define MSG3		%xmm6
#define TMP		%xmm7

#define SHUF_MASK	%xmm8

#define ABEF_SAVE	%xmm9
#define CDGH_SAVE	%xmm10

/*
 * Do 4 rounds of SHA-256.  \i is the round number of the first of the 4
 * rounds (0, 4, 8, ..., 60).  \m0 holds the current group of 4 message
 * schedule words; \m1-\m3 hold the next 3 groups.  For rounds that still
 * need message schedule words, this also computes the next group of 4
 * words (sha256msg1/palignr+paddd/sha256msg2 together implement the
 * SHA-256 message schedule recurrence) so the caller can rotate the
 * \m0-\m3 register assignments between invocations.
 */
.macro do_4rounds	i, m0, m1, m2, m3
.if \i < 16
	/* First 16 words come straight from the input block (big-endian). */
	movdqu		\i*4(DATA_PTR), \m0
	pshufb		SHUF_MASK, \m0
.endif
	/*
	 * SHA256CONSTANTS points 32 words into K256, so (\i-32)*4 indexes
	 * round constants \i..\i+3 with a signed 8-bit displacement.
	 */
	movdqa		(\i-32)*4(SHA256CONSTANTS), MSG
	paddd		\m0, MSG
	/* Rounds \i and \i+1; sha256rnds2 implicitly consumes low half of MSG. */
	sha256rnds2	STATE0, STATE1
.if \i >= 12 && \i < 60
	movdqa		\m0, TMP
	palignr		$4, \m3, TMP
	paddd		TMP, \m1
	sha256msg2	\m0, \m1
.endif
	/* Move the upper two words of MSG down for rounds \i+2 and \i+3. */
	punpckhqdq	MSG, MSG
	sha256rnds2	STATE1, STATE0
.if \i >= 4 && \i < 52
	sha256msg1	\m0, \m3
.endif
.endm

/*
 * Intel SHA Extensions optimized implementation of a SHA-256 update function
 *
 * The function takes a pointer to the current hash values, a pointer to the
 * input data, and a number of 64 byte blocks to process.  Once all blocks have
 * been processed, the digest pointer is updated with the resulting hash value.
 * The function only processes complete blocks, there is no functionality to
 * store partial blocks.  All message padding and hash value initialization must
 * be done outside the update function.
 *
 * void sha256_ni_transform(uint32_t *digest, const void *data,
 *			    uint32_t numBlocks);
 * digest : pointer to digest
 * data: pointer to input data
 * numBlocks: Number of blocks to process
 */

.text
SYM_TYPED_FUNC_START(sha256_ni_transform)

	shl		$6, NUM_BLKS		/* convert to bytes (blocks * 64) */
	jz		.Ldone_hash
	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */

	/*
	 * load initial hash values
	 * Need to reorder these appropriately
	 * DCBA, HGFE -> ABEF, CDGH
	 * (sha256rnds2 requires the state words in ABEF/CDGH layout)
	 */
	movdqu		0*16(DIGEST_PTR), STATE0	/* DCBA */
	movdqu		1*16(DIGEST_PTR), STATE1	/* HGFE */

	movdqa		STATE0, TMP
	punpcklqdq	STATE1, STATE0		/* FEBA */
	punpckhqdq	TMP, STATE1		/* DCHG */
	pshufd		$0x1B, STATE0, STATE0	/* ABEF */
	pshufd		$0xB1, STATE1, STATE1	/* CDGH */

	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
	/* Bias the constant pointer by 32 words; see do_4rounds. */
	lea		K256+32*4(%rip), SHA256CONSTANTS

.Lloop0:
	/* Save hash values for addition after rounds */
	movdqa		STATE0, ABEF_SAVE
	movdqa		STATE1, CDGH_SAVE

	/* 64 rounds, 4 at a time, rotating the message-schedule registers. */
.irp i, 0, 16, 32, 48
	do_4rounds	(\i + 0),  MSG0, MSG1, MSG2, MSG3
	do_4rounds	(\i + 4),  MSG1, MSG2, MSG3, MSG0
	do_4rounds	(\i + 8),  MSG2, MSG3, MSG0, MSG1
	do_4rounds	(\i + 12), MSG3, MSG0, MSG1, MSG2
.endr

	/* Add current hash values with previously saved */
	paddd		ABEF_SAVE, STATE0
	paddd		CDGH_SAVE, STATE1

	/* Increment data pointer and loop if more to process */
	add		$64, DATA_PTR
	cmp		NUM_BLKS, DATA_PTR
	jne		.Lloop0

	/* Write hash values back in the correct order (undo ABEF/CDGH layout) */
	movdqa		STATE0, TMP
	punpcklqdq	STATE1, STATE0		/* GHEF */
	punpckhqdq	TMP, STATE1		/* ABCD */
	pshufd		$0xB1, STATE0, STATE0	/* HGFE */
	pshufd		$0x1B, STATE1, STATE1	/* DCBA */

	movdqu		STATE1, 0*16(DIGEST_PTR)
	movdqu		STATE0, 1*16(DIGEST_PTR)

.Ldone_hash:

	RET
SYM_FUNC_END(sha256_ni_transform)
/* Release the register names used by sha256_ni_transform. */
#undef DIGEST_PTR
#undef DATA_PTR
#undef NUM_BLKS
#undef SHA256CONSTANTS
#undef MSG
#undef STATE0
#undef STATE1
#undef MSG0
#undef MSG1
#undef MSG2
#undef MSG3
#undef TMP
#undef SHUF_MASK
#undef ABEF_SAVE
#undef CDGH_SAVE

// parameters for __sha256_ni_finup2x()
#define SCTX		%rdi
#define DATA1		%rsi
#define DATA2		%rdx
#define LEN		%ecx
#define LEN8		%cl
#define LEN64		%rcx
#define OUT1		%r8
#define OUT2		%r9

// other scalar variables
#define SHA256CONSTANTS	%rax
#define COUNT		%r10
#define COUNT32		%r10d
#define FINAL_STEP	%r11d

// rbx is used as a temporary.

#define MSG		%xmm0	// sha256rnds2 implicit operand
#define STATE0_A	%xmm1
#define STATE1_A	%xmm2
#define STATE0_B	%xmm3
#define STATE1_B	%xmm4
#define TMP_A		%xmm5
#define TMP_B		%xmm6
#define MSG0_A		%xmm7
#define MSG1_A		%xmm8
#define MSG2_A		%xmm9
#define MSG3_A		%xmm10
#define MSG0_B		%xmm11
#define MSG1_B		%xmm12
#define MSG2_B		%xmm13
#define MSG3_B		%xmm14
#define SHUF_MASK	%xmm15

#define OFFSETOF_STATE	0	// offsetof(struct sha256_state, state)
#define OFFSETOF_COUNT	32	// offsetof(struct sha256_state, count)
#define OFFSETOF_BUF	40	// offsetof(struct sha256_state, buf)

// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a and m0_b
// contain the current 4 message schedule words for the first and second message
// respectively.
//
// If not all the message schedule words have been computed yet, then this also
// computes 4 more message schedule words for each message.  m1_a-m3_a contain
// the next 3 groups of 4 message schedule words for the first message, and
// likewise m1_b-m3_b for the second.  After consuming the current value of
// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
// likewise for *_b.  This means that the next (m0_a, m1_a, m2_a, m3_a) is the
// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
// cycle through the registers accordingly.
.macro do_4rounds_2x	i, m0_a, m1_a, m2_a, m3_a, m0_b, m1_b, m2_b, m3_b
	// Round constants \i..\i+3; SHA256CONSTANTS is biased by 32 words so
	// the displacement fits in a signed byte.
	movdqa		(\i-32)*4(SHA256CONSTANTS), TMP_A
	movdqa		TMP_A, TMP_B
	paddd		\m0_a, TMP_A
	paddd		\m0_b, TMP_B
.if \i < 48
	sha256msg1	\m1_a, \m0_a
	sha256msg1	\m1_b, \m0_b
.endif
	// sha256rnds2 implicitly consumes the low half of MSG (%xmm0).
	movdqa		TMP_A, MSG
	sha256rnds2	STATE0_A, STATE1_A
	movdqa		TMP_B, MSG
	sha256rnds2	STATE0_B, STATE1_B
	// pshufd $0x0E moves the high two words down for rounds \i+2, \i+3.
	pshufd		$0x0E, TMP_A, MSG
	sha256rnds2	STATE1_A, STATE0_A
	pshufd		$0x0E, TMP_B, MSG
	sha256rnds2	STATE1_B, STATE0_B
.if \i < 48
	movdqa		\m3_a, TMP_A
	movdqa		\m3_b, TMP_B
	palignr		$4, \m2_a, TMP_A
	palignr		$4, \m2_b, TMP_B
	paddd		TMP_A, \m0_a
	paddd		TMP_B, \m0_b
	sha256msg2	\m3_a, \m0_a
	sha256msg2	\m3_b, \m0_b
.endif
.endm

//
// void __sha256_ni_finup2x(const struct sha256_state *sctx,
//			    const u8 *data1, const u8 *data2, int len,
//			    u8 out1[SHA256_DIGEST_SIZE],
//			    u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial state
// |sctx|.  |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved.  On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
SYM_FUNC_START(__sha256_ni_finup2x)
	// Allocate 128 bytes of stack space, 16-byte aligned.
	push		%rbx			// callee-saved temporary
	push		%rbp
	mov		%rsp, %rbp		// %rbp = frame to restore at exit
	sub		$128, %rsp
	and		$~15, %rsp		// align for movdqa spills

	// Load the shuffle mask for swapping the endianness of 32-bit words.
	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK

	// Set up pointer to the round constants.
	lea		K256+32*4(%rip), SHA256CONSTANTS

	// Initially we're not processing the final blocks.
	xor		FINAL_STEP, FINAL_STEP

	// Load the initial state from sctx->state, converting DCBA,HGFE into
	// the ABEF,CDGH layout that sha256rnds2 operates on.
	movdqu		OFFSETOF_STATE+0*16(SCTX), STATE0_A	// DCBA
	movdqu		OFFSETOF_STATE+1*16(SCTX), STATE1_A	// HGFE
	movdqa		STATE0_A, TMP_A
	punpcklqdq	STATE1_A, STATE0_A			// FEBA
	punpckhqdq	TMP_A, STATE1_A				// DCHG
	pshufd		$0x1B, STATE0_A, STATE0_A		// ABEF
	pshufd		$0xB1, STATE1_A, STATE1_A		// CDGH

	// Load sctx->count.  Take the mod 64 of it to get the number of bytes
	// that are buffered in sctx->buf.  Also save it in a register with LEN
	// added to it.
	mov		LEN, LEN	// 32-bit mov zero-extends LEN into LEN64
	mov		OFFSETOF_COUNT(SCTX), %rbx
	lea		(%rbx, LEN64, 1), COUNT	// COUNT = total message length
	and		$63, %ebx		// %ebx = count % 64 = buffered bytes
	jz		.Lfinup2x_enter_loop	// No bytes buffered?

	// %ebx bytes (1 to 63) are currently buffered in sctx->buf.  Load them
	// followed by the first 64 - %ebx bytes of data.  Since LEN >= 64, we
	// just load 64 bytes from each of sctx->buf, DATA1, and DATA2
	// unconditionally and rearrange the data as needed.

	// Copy sctx->buf to the bottom of the stack scratch area.
	movdqu		OFFSETOF_BUF+0*16(SCTX), MSG0_A
	movdqu		OFFSETOF_BUF+1*16(SCTX), MSG1_A
	movdqu		OFFSETOF_BUF+2*16(SCTX), MSG2_A
	movdqu		OFFSETOF_BUF+3*16(SCTX), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)

	// Overwrite from offset %rbx with DATA1 bytes, then reload the first
	// 64 bytes, which now hold buf[0..%rbx) ++ data1[0..64-%rbx).
	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqu		MSG0_A, 0*16(%rsp,%rbx)
	movdqu		MSG1_A, 1*16(%rsp,%rbx)
	movdqu		MSG2_A, 2*16(%rsp,%rbx)
	movdqu		MSG3_A, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_A
	movdqa		1*16(%rsp), MSG1_A
	movdqa		2*16(%rsp), MSG2_A
	movdqa		3*16(%rsp), MSG3_A

	// Same trick for DATA2 (buf[0..%rbx) is still in place below %rbx).
	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqu		MSG0_B, 0*16(%rsp,%rbx)
	movdqu		MSG1_B, 1*16(%rsp,%rbx)
	movdqu		MSG2_B, 2*16(%rsp,%rbx)
	movdqu		MSG3_B, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_B
	movdqa		1*16(%rsp), MSG1_B
	movdqa		2*16(%rsp), MSG2_B
	movdqa		3*16(%rsp), MSG3_B

	// Account for the 64 - %ebx data bytes just consumed from each stream.
	sub		$64, %rbx	// rbx = buffered - 64
	sub		%rbx, DATA1	// DATA1 += 64 - buffered
	sub		%rbx, DATA2	// DATA2 += 64 - buffered
	add		%ebx, LEN	// LEN += buffered - 64
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
	jmp		.Lfinup2x_loop_have_data

.Lfinup2x_enter_loop:
	sub		$64, LEN
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
.Lfinup2x_loop:
	// Load the next two data blocks.
	movdqu		0*16(DATA1), MSG0_A
	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA1), MSG1_A
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA1), MSG2_A
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA1), MSG3_A
	movdqu		3*16(DATA2), MSG3_B
	add		$64, DATA1
	add		$64, DATA2
.Lfinup2x_loop_have_data:
	// Convert the words of the data blocks from big endian.
	pshufb		SHUF_MASK, MSG0_A
	pshufb		SHUF_MASK, MSG0_B
	pshufb		SHUF_MASK, MSG1_A
	pshufb		SHUF_MASK, MSG1_B
	pshufb		SHUF_MASK, MSG2_A
	pshufb		SHUF_MASK, MSG2_B
	pshufb		SHUF_MASK, MSG3_A
	pshufb		SHUF_MASK, MSG3_B
.Lfinup2x_loop_have_bswapped_data:

	// Save the original state for each block.
	movdqa		STATE0_A, 0*16(%rsp)
	movdqa		STATE0_B, 1*16(%rsp)
	movdqa		STATE1_A, 2*16(%rsp)
	movdqa		STATE1_B, 3*16(%rsp)

	// Do the SHA-256 rounds on each block.
.irp i, 0, 16, 32, 48
	do_4rounds_2x	(\i + 0),  MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
				   MSG0_B, MSG1_B, MSG2_B, MSG3_B
	do_4rounds_2x	(\i + 4),  MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
				   MSG1_B, MSG2_B, MSG3_B, MSG0_B
	do_4rounds_2x	(\i + 8),  MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
				   MSG2_B, MSG3_B, MSG0_B, MSG1_B
	do_4rounds_2x	(\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
				   MSG3_B, MSG0_B, MSG1_B, MSG2_B
.endr

	// Add the original state for each block.
	paddd		0*16(%rsp), STATE0_A
	paddd		1*16(%rsp), STATE0_B
	paddd		2*16(%rsp), STATE1_A
	paddd		3*16(%rsp), STATE1_B

	// Update LEN and loop back if more blocks remain.
	sub		$64, LEN
	jge		.Lfinup2x_loop

	// Check if any final blocks need to be handled.
	// FINAL_STEP = 2: all done
	// FINAL_STEP = 1: need to do count-only padding block
	// FINAL_STEP = 0: need to do the block with 0x80 padding byte
	cmp		$1, FINAL_STEP
	jg		.Lfinup2x_done
	je		.Lfinup2x_finalize_countonly
	add		$64, LEN
	jz		.Lfinup2x_finalize_blockaligned

	// Not block-aligned; 1 <= LEN <= 63 data bytes remain.  Pad the block.
	// To do this, write the padding starting with the 0x80 byte to
	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
	// and load from &sp[64 - LEN] to get the needed padding block.  This
	// code relies on the data buffers being >= 64 bytes in length.
	mov		$64, %ebx
	sub		LEN, %ebx	// ebx = 64 - LEN
	sub		%rbx, DATA1	// DATA1 -= 64 - LEN
	sub		%rbx, DATA2	// DATA2 -= 64 - LEN
	mov		$0x80, FINAL_STEP	// using FINAL_STEP as a temporary
	movd		FINAL_STEP, MSG0_A
	pxor		MSG1_A, MSG1_A
	// Padding bytes {0x80, 0, 0, ...} at &sp[64..128).
	movdqa		MSG0_A, 4*16(%rsp)
	movdqa		MSG1_A, 5*16(%rsp)
	movdqa		MSG1_A, 6*16(%rsp)
	movdqa		MSG1_A, 7*16(%rsp)
	cmp		$56, LEN
	jge		1f		// will COUNT spill into its own block?
	// The bit count fits in this block: store it big-endian at offset 56.
	shl		$3, COUNT	// convert byte count to bit count
	bswap		COUNT
	mov		COUNT, 56(%rsp,%rbx)
	mov		$2, FINAL_STEP	// won't need count-only block
	jmp		2f
1:
	mov		$1, FINAL_STEP	// will need count-only block
2:
	// Build the padded final block for message 1 in MSG0_A..MSG3_A.
	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_A
	movdqu		1*16(%rsp,%rbx), MSG1_A
	movdqu		2*16(%rsp,%rbx), MSG2_A
	movdqu		3*16(%rsp,%rbx), MSG3_A

	// Likewise for message 2 in MSG0_B..MSG3_B.
	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqa		MSG0_B, 0*16(%rsp)
	movdqa		MSG1_B, 1*16(%rsp)
	movdqa		MSG2_B, 2*16(%rsp)
	movdqa		MSG3_B, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_B
	movdqu		1*16(%rsp,%rbx), MSG1_B
	movdqu		2*16(%rsp,%rbx), MSG2_B
	movdqu		3*16(%rsp,%rbx), MSG3_B
	jmp		.Lfinup2x_loop_have_data

	// Prepare a padding block, either:
	//
	//	{0x80, 0, 0, 0, ..., count (as __be64)}
	//	This is for a block aligned message.
	//
	//	{   0, 0, 0, 0, ..., count (as __be64)}
	//	This is for a message whose length mod 64 is >= 56.
	//
	// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
	pxor		MSG0_A, MSG0_A
	jmp		1f

.Lfinup2x_finalize_blockaligned:
	// 0x80 byte, already word-swapped to match the bswapped-data path.
	mov		$0x80000000, %ebx
	movd		%ebx, MSG0_A
1:
	pxor		MSG1_A, MSG1_A
	pxor		MSG2_A, MSG2_A
	// COUNT is the byte count; ror $29 == rol $35, i.e. multiply by 8 to
	// get the bit count and swap the two 32-bit halves (valid since the
	// top bits of a byte count are clear), matching the pre-swapped word
	// convention of .Lfinup2x_loop_have_bswapped_data.
	ror		$29, COUNT
	movq		COUNT, MSG3_A
	pslldq		$8, MSG3_A	// count occupies the last 8 block bytes
	movdqa		MSG0_A, MSG0_B	// both messages get identical padding
	pxor		MSG1_B, MSG1_B
	pxor		MSG2_B, MSG2_B
	movdqa		MSG3_A, MSG3_B
	mov		$2, FINAL_STEP	// this is the last block
	jmp		.Lfinup2x_loop_have_bswapped_data

.Lfinup2x_done:
	// Write the two digests with all bytes in the correct order.
	movdqa		STATE0_A, TMP_A
	movdqa		STATE0_B, TMP_B
	punpcklqdq	STATE1_A, STATE0_A		// GHEF
	punpcklqdq	STATE1_B, STATE0_B
	punpckhqdq	TMP_A, STATE1_A			// ABCD
	punpckhqdq	TMP_B, STATE1_B
	pshufd		$0xB1, STATE0_A, STATE0_A	// HGFE
	pshufd		$0xB1, STATE0_B, STATE0_B
	pshufd		$0x1B, STATE1_A, STATE1_A	// DCBA
	pshufd		$0x1B, STATE1_B, STATE1_B
	// Convert words back to big endian for the output byte arrays.
	pshufb		SHUF_MASK, STATE0_A
	pshufb		SHUF_MASK, STATE0_B
	pshufb		SHUF_MASK, STATE1_A
	pshufb		SHUF_MASK, STATE1_B
	movdqu		STATE0_A, 1*16(OUT1)
	movdqu		STATE0_B, 1*16(OUT2)
	movdqu		STATE1_A, 0*16(OUT1)
	movdqu		STATE1_B, 0*16(OUT2)

	// Restore the stack pointer and callee-saved registers.
	mov		%rbp, %rsp
	pop		%rbp
	pop		%rbx
	RET
SYM_FUNC_END(__sha256_ni_finup2x)

// SHA-256 round constants K[0..63] (FIPS 180-4).
.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

// pshufb mask reversing the byte order within each 32-bit word.
.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203