/*
 * Intel SHA Extensions optimized implementation of a SHA-256 update function
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * 	Sean Gulley <sean.m.gulley@intel.com>
 * 	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

/* Register aliases for sha256_ni_transform() (SysV AMD64 argument regs). */
#define DIGEST_PTR	%rdi	/* 1st arg */
#define DATA_PTR	%rsi	/* 2nd arg */
#define NUM_BLKS	%rdx	/* 3rd arg */

#define SHA256CONSTANTS	%rax

#define MSG		%xmm0	/* sha256rnds2 implicit operand */
#define STATE0		%xmm1
#define STATE1		%xmm2
#define MSGTMP0		%xmm3
#define MSGTMP1		%xmm4
#define MSGTMP2		%xmm5
#define MSGTMP3		%xmm6
#define MSGTMP4		%xmm7

#define SHUF_MASK	%xmm8

#define ABEF_SAVE	%xmm9
#define CDGH_SAVE	%xmm10

/*
 * Intel SHA Extensions optimized implementation of a SHA-256 update function
 *
 * The function takes a pointer to the current hash values, a pointer to the
 * input data, and a number of 64 byte blocks to process.  Once all blocks have
 * been processed, the digest pointer is updated with the resulting hash value.
 * The function only processes complete blocks, there is no functionality to
 * store partial blocks.  All message padding and hash value initialization must
 * be done outside the update function.
 *
 * The indented lines in the loop are instructions related to rounds processing.
 * The non-indented lines are instructions related to the message schedule.
 *
 * void sha256_ni_transform(uint32_t *digest, const void *data,
 *		uint32_t numBlocks);
 * digest : pointer to digest
 * data: pointer to input data
 * numBlocks: Number of blocks to process
 */

.text
SYM_TYPED_FUNC_START(sha256_ni_transform)

	shl		$6, NUM_BLKS		/*  convert to bytes */
	jz		.Ldone_hash
	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */

	/*
	 * load initial hash values
	 * Need to reorder these appropriately
	 * DCBA, HGFE -> ABEF, CDGH
	 */
	movdqu		0*16(DIGEST_PTR), STATE0
	movdqu		1*16(DIGEST_PTR), STATE1

	pshufd		$0xB1, STATE0,  STATE0		/* CDAB */
	pshufd		$0x1B, STATE1,  STATE1		/* EFGH */
	movdqa		STATE0, MSGTMP4
	palignr		$8, STATE1,  STATE0		/* ABEF */
	pblendw		$0xF0, MSGTMP4, STATE1		/* CDGH */

	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
	lea		K256(%rip), SHA256CONSTANTS

.Lloop0:
	/* Save hash values for addition after rounds */
	movdqa		STATE0, ABEF_SAVE
	movdqa		STATE1, CDGH_SAVE

	/* Rounds 0-3 */
	movdqu		0*16(DATA_PTR), MSG
	pshufb		SHUF_MASK, MSG
	movdqa		MSG, MSGTMP0
		paddd		0*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0

	/* Rounds 4-7 */
	movdqu		1*16(DATA_PTR), MSG
	pshufb		SHUF_MASK, MSG
	movdqa		MSG, MSGTMP1
		paddd		1*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 8-11 */
	movdqu		2*16(DATA_PTR), MSG
	pshufb		SHUF_MASK, MSG
	movdqa		MSG, MSGTMP2
		paddd		2*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 12-15 */
	movdqu		3*16(DATA_PTR), MSG
	pshufb		SHUF_MASK, MSG
	movdqa		MSG, MSGTMP3
		paddd		3*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP3, MSGTMP4
	palignr		$4, MSGTMP2, MSGTMP4
	paddd		MSGTMP4, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 16-19 */
	movdqa		MSGTMP0, MSG
		paddd		4*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP0, MSGTMP4
	palignr		$4, MSGTMP3, MSGTMP4
	paddd		MSGTMP4, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 20-23 */
	movdqa		MSGTMP1, MSG
		paddd		5*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP1, MSGTMP4
	palignr		$4, MSGTMP0, MSGTMP4
	paddd		MSGTMP4, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 24-27 */
	movdqa		MSGTMP2, MSG
		paddd		6*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP2, MSGTMP4
	palignr		$4, MSGTMP1, MSGTMP4
	paddd		MSGTMP4, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 28-31 */
	movdqa		MSGTMP3, MSG
		paddd		7*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP3, MSGTMP4
	palignr		$4, MSGTMP2, MSGTMP4
	paddd		MSGTMP4, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 32-35 */
	movdqa		MSGTMP0, MSG
		paddd		8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP0, MSGTMP4
	palignr		$4, MSGTMP3, MSGTMP4
	paddd		MSGTMP4, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 36-39 */
	movdqa		MSGTMP1, MSG
		paddd		9*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP1, MSGTMP4
	palignr		$4, MSGTMP0, MSGTMP4
	paddd		MSGTMP4, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 40-43 */
	movdqa		MSGTMP2, MSG
		paddd		10*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP2, MSGTMP4
	palignr		$4, MSGTMP1, MSGTMP4
	paddd		MSGTMP4, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 44-47 */
	movdqa		MSGTMP3, MSG
		paddd		11*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP3, MSGTMP4
	palignr		$4, MSGTMP2, MSGTMP4
	paddd		MSGTMP4, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 48-51 */
	movdqa		MSGTMP0, MSG
		paddd		12*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP0, MSGTMP4
	palignr		$4, MSGTMP3, MSGTMP4
	paddd		MSGTMP4, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 52-55 */
	movdqa		MSGTMP1, MSG
		paddd		13*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP1, MSGTMP4
	palignr		$4, MSGTMP0, MSGTMP4
	paddd		MSGTMP4, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0

	/* Rounds 56-59 */
	movdqa		MSGTMP2, MSG
		paddd		14*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP2, MSGTMP4
	palignr		$4, MSGTMP1, MSGTMP4
	paddd		MSGTMP4, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0

	/* Rounds 60-63 */
	movdqa		MSGTMP3, MSG
		paddd		15*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0

	/* Add current hash values with previously saved */
	paddd		ABEF_SAVE, STATE0
	paddd		CDGH_SAVE, STATE1

	/* Increment data pointer and loop if more to process */
	add		$64, DATA_PTR
	cmp		NUM_BLKS, DATA_PTR
	jne		.Lloop0

	/* Write hash values back in the correct order */
	pshufd		$0x1B, STATE0,  STATE0		/* FEBA */
	pshufd		$0xB1, STATE1,  STATE1		/* DCHG */
	movdqa		STATE0, MSGTMP4
	pblendw		$0xF0, STATE1,  STATE0		/* DCBA */
	palignr		$8, MSGTMP4, STATE1		/* HGFE */

	movdqu		STATE0, 0*16(DIGEST_PTR)
	movdqu		STATE1, 1*16(DIGEST_PTR)

.Ldone_hash:

	RET
SYM_FUNC_END(sha256_ni_transform)

/*
 * Undefine all the register aliases used by sha256_ni_transform() so they
 * cannot leak into the rest of the file.  (Fixed: this list previously
 * undefined MSG0-MSG3 and TMP, which were never defined above, while leaving
 * MSGTMP0-MSGTMP4 defined.)
 */
#undef DIGEST_PTR
#undef DATA_PTR
#undef NUM_BLKS
#undef SHA256CONSTANTS
#undef MSG
#undef STATE0
#undef STATE1
#undef MSGTMP0
#undef MSGTMP1
#undef MSGTMP2
#undef MSGTMP3
#undef MSGTMP4
#undef SHUF_MASK
#undef ABEF_SAVE
#undef CDGH_SAVE

// parameters for __sha256_ni_finup2x()
#define SCTX		%rdi
#define DATA1		%rsi
#define DATA2		%rdx
#define LEN		%ecx
#define LEN8		%cl
#define LEN64		%rcx
#define OUT1		%r8
#define OUT2		%r9

// other scalar variables
#define SHA256CONSTANTS	%rax
#define COUNT		%r10
#define COUNT32		%r10d
#define FINAL_STEP	%r11d

// rbx is used as a temporary.

#define MSG		%xmm0	// sha256rnds2 implicit operand
#define STATE0_A	%xmm1
#define STATE1_A	%xmm2
#define STATE0_B	%xmm3
#define STATE1_B	%xmm4
#define TMP_A		%xmm5
#define TMP_B		%xmm6
#define MSG0_A		%xmm7
#define MSG1_A		%xmm8
#define MSG2_A		%xmm9
#define MSG3_A		%xmm10
#define MSG0_B		%xmm11
#define MSG1_B		%xmm12
#define MSG2_B		%xmm13
#define MSG3_B		%xmm14
#define SHUF_MASK	%xmm15

#define OFFSETOF_STATE	0	// offsetof(struct sha256_state, state)
#define OFFSETOF_COUNT	32	// offsetof(struct sha256_state, count)
#define OFFSETOF_BUF	40	// offsetof(struct sha256_state, buf)

// Do 4 rounds of SHA-256 for each of two messages (interleaved).
// m0_a and m0_b
// contain the current 4 message schedule words for the first and second message
// respectively.
//
// If not all the message schedule words have been computed yet, then this also
// computes 4 more message schedule words for each message.  m1_a-m3_a contain
// the next 3 groups of 4 message schedule words for the first message, and
// likewise m1_b-m3_b for the second.  After consuming the current value of
// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
// likewise for *_b.  This means that the next (m0_a, m1_a, m2_a, m3_a) is the
// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
// cycle through the registers accordingly.
.macro	do_4rounds_2x	i, m0_a, m1_a, m2_a, m3_a,  m0_b, m1_b, m2_b, m3_b
	// Add the round constants for rounds \i..\i+3 to the current schedule
	// words.  SHA256CONSTANTS points 32*4 bytes into K256 (see the lea in
	// __sha256_ni_finup2x), hence the -32 bias in the displacement.
	movdqa		(\i-32)*4(SHA256CONSTANTS), TMP_A
	movdqa		TMP_A, TMP_B
	paddd		\m0_a, TMP_A
	paddd		\m0_b, TMP_B
.if \i < 48
	// First half of computing the next schedule group: sigma0 part.
	sha256msg1	\m1_a, \m0_a
	sha256msg1	\m1_b, \m0_b
.endif
	// Two sha256rnds2 per message = 4 rounds; MSG (xmm0) is the
	// implicit "wk" operand; the pshufd moves the upper two wk values
	// into the low qword for the second pair of rounds.
	movdqa		TMP_A, MSG
	sha256rnds2	STATE0_A, STATE1_A
	movdqa		TMP_B, MSG
	sha256rnds2	STATE0_B, STATE1_B
	pshufd 		$0x0E, TMP_A, MSG
	sha256rnds2	STATE1_A, STATE0_A
	pshufd 		$0x0E, TMP_B, MSG
	sha256rnds2	STATE1_B, STATE0_B
.if \i < 48
	// Second half of the schedule computation: add w[i-7] (formed by
	// palignr of \m2/\m3) and apply sigma1 via sha256msg2.
	movdqa		\m3_a, TMP_A
	movdqa		\m3_b, TMP_B
	palignr		$4, \m2_a, TMP_A
	palignr		$4, \m2_b, TMP_B
	paddd		TMP_A, \m0_a
	paddd		TMP_B, \m0_b
	sha256msg2	\m3_a, \m0_a
	sha256msg2	\m3_b, \m0_b
.endif
.endm

//
// void __sha256_ni_finup2x(const struct sha256_state *sctx,
//			    const u8 *data1, const u8 *data2, int len,
//			    u8 out1[SHA256_DIGEST_SIZE],
//			    u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial state
// |sctx|.  |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved.
// On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
SYM_FUNC_START(__sha256_ni_finup2x)
	// Allocate 128 bytes of stack space, 16-byte aligned.
	// rbx and rbp are callee-saved (SysV AMD64); rbp also keeps the
	// original rsp so the alignment can be undone on exit.
	push		%rbx
	push		%rbp
	mov		%rsp, %rbp
	sub		$128, %rsp
	and		$~15, %rsp

	// Load the shuffle mask for swapping the endianness of 32-bit words.
	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK

	// Set up pointer to the round constants.
	// Biased 32*4 bytes forward so do_4rounds_2x can use signed 8-bit
	// displacements ((\i-32)*4) for all 64 rounds.
	lea		K256+32*4(%rip), SHA256CONSTANTS

	// Initially we're not processing the final blocks.
	xor		FINAL_STEP, FINAL_STEP

	// Load the initial state from sctx->state.
	movdqu		OFFSETOF_STATE+0*16(SCTX), STATE0_A	// DCBA
	movdqu		OFFSETOF_STATE+1*16(SCTX), STATE1_A	// HGFE
	movdqa		STATE0_A, TMP_A
	punpcklqdq	STATE1_A, STATE0_A			// FEBA
	punpckhqdq	TMP_A, STATE1_A				// DCHG
	pshufd		$0x1B, STATE0_A, STATE0_A		// ABEF
	pshufd		$0xB1, STATE1_A, STATE1_A		// CDGH

	// Load sctx->count.  Take the mod 64 of it to get the number of bytes
	// that are buffered in sctx->buf.  Also save it in a register with LEN
	// added to it.
	mov		LEN, LEN	// zero-extend LEN (32 bits) into LEN64
	mov		OFFSETOF_COUNT(SCTX), %rbx
	lea		(%rbx, LEN64, 1), COUNT
	and		$63, %ebx
	jz		.Lfinup2x_enter_loop	// No bytes buffered?

	// %ebx bytes (1 to 63) are currently buffered in sctx->buf.  Load them
	// followed by the first 64 - %ebx bytes of data.  Since LEN >= 64, we
	// just load 64 bytes from each of sctx->buf, DATA1, and DATA2
	// unconditionally and rearrange the data as needed.

	movdqu		OFFSETOF_BUF+0*16(SCTX), MSG0_A
	movdqu		OFFSETOF_BUF+1*16(SCTX), MSG1_A
	movdqu		OFFSETOF_BUF+2*16(SCTX), MSG2_A
	movdqu		OFFSETOF_BUF+3*16(SCTX), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)

	// Overwrite sp[%rbx..] with DATA1 bytes, then reload the first 64
	// bytes of sp: buffered bytes followed by fresh data.
	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqu		MSG0_A, 0*16(%rsp,%rbx)
	movdqu		MSG1_A, 1*16(%rsp,%rbx)
	movdqu		MSG2_A, 2*16(%rsp,%rbx)
	movdqu		MSG3_A, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_A
	movdqa		1*16(%rsp), MSG1_A
	movdqa		2*16(%rsp), MSG2_A
	movdqa		3*16(%rsp), MSG3_A

	// Same splice for the second message.  (The buffered bytes below
	// %rbx on the stack are still intact from the store above.)
	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqu		MSG0_B, 0*16(%rsp,%rbx)
	movdqu		MSG1_B, 1*16(%rsp,%rbx)
	movdqu		MSG2_B, 2*16(%rsp,%rbx)
	movdqu		MSG3_B, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_B
	movdqa		1*16(%rsp), MSG1_B
	movdqa		2*16(%rsp), MSG2_B
	movdqa		3*16(%rsp), MSG3_B

	sub		$64, %rbx	// rbx = buffered - 64
	sub		%rbx, DATA1	// DATA1 += 64 - buffered
	sub		%rbx, DATA2	// DATA2 += 64 - buffered
	add		%ebx, LEN	// LEN += buffered - 64
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
	jmp		.Lfinup2x_loop_have_data

.Lfinup2x_enter_loop:
	sub		$64, LEN
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
.Lfinup2x_loop:
	// Load the next two data blocks.
	movdqu		0*16(DATA1), MSG0_A
	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA1), MSG1_A
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA1), MSG2_A
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA1), MSG3_A
	movdqu		3*16(DATA2), MSG3_B
	add		$64, DATA1
	add		$64, DATA2
.Lfinup2x_loop_have_data:
	// Convert the words of the data blocks from big endian.
	pshufb		SHUF_MASK, MSG0_A
	pshufb		SHUF_MASK, MSG0_B
	pshufb		SHUF_MASK, MSG1_A
	pshufb		SHUF_MASK, MSG1_B
	pshufb		SHUF_MASK, MSG2_A
	pshufb		SHUF_MASK, MSG2_B
	pshufb		SHUF_MASK, MSG3_A
	pshufb		SHUF_MASK, MSG3_B
.Lfinup2x_loop_have_bswapped_data:

	// Save the original state for each block.
	movdqa		STATE0_A, 0*16(%rsp)
	movdqa		STATE0_B, 1*16(%rsp)
	movdqa		STATE1_A, 2*16(%rsp)
	movdqa		STATE1_B, 3*16(%rsp)

	// Do the SHA-256 rounds on each block.
	// The MSG registers are rotated through each group of 4 rounds, per
	// the register-cycling contract documented on do_4rounds_2x.
.irp i, 0, 16, 32, 48
	do_4rounds_2x	(\i + 0),  MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
				   MSG0_B, MSG1_B, MSG2_B, MSG3_B
	do_4rounds_2x	(\i + 4),  MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
				   MSG1_B, MSG2_B, MSG3_B, MSG0_B
	do_4rounds_2x	(\i + 8),  MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
				   MSG2_B, MSG3_B, MSG0_B, MSG1_B
	do_4rounds_2x	(\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
				   MSG3_B, MSG0_B, MSG1_B, MSG2_B
.endr

	// Add the original state for each block.
	paddd		0*16(%rsp), STATE0_A
	paddd		1*16(%rsp), STATE0_B
	paddd		2*16(%rsp), STATE1_A
	paddd		3*16(%rsp), STATE1_B

	// Update LEN and loop back if more blocks remain.
	sub		$64, LEN
	jge		.Lfinup2x_loop

	// Check if any final blocks need to be handled.
	// FINAL_STEP = 2: all done
	// FINAL_STEP = 1: need to do count-only padding block
	// FINAL_STEP = 0: need to do the block with 0x80 padding byte
	cmp		$1, FINAL_STEP
	jg		.Lfinup2x_done
	je		.Lfinup2x_finalize_countonly
	add		$64, LEN
	jz		.Lfinup2x_finalize_blockaligned

	// Not block-aligned; 1 <= LEN <= 63 data bytes remain.  Pad the block.
	// To do this, write the padding starting with the 0x80 byte to
	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
	// and load from &sp[64 - LEN] to get the needed padding block.  This
	// code relies on the data buffers being >= 64 bytes in length.
	mov		$64, %ebx
	sub		LEN, %ebx	// ebx = 64 - LEN
	sub		%rbx, DATA1	// DATA1 -= 64 - LEN
	sub		%rbx, DATA2	// DATA2 -= 64 - LEN
	mov		$0x80, FINAL_STEP	// using FINAL_STEP as a temporary
	movd		FINAL_STEP, MSG0_A
	pxor		MSG1_A, MSG1_A
	movdqa		MSG0_A, 4*16(%rsp)
	movdqa		MSG1_A, 5*16(%rsp)
	movdqa		MSG1_A, 6*16(%rsp)
	movdqa		MSG1_A, 7*16(%rsp)
	cmp		$56, LEN
	jge		1f	// will COUNT spill into its own block?
	// COUNT fits in this block: store the bit count as __be64 in the last
	// 8 bytes of the padding area.
	shl		$3, COUNT
	bswap		COUNT
	mov		COUNT, 56(%rsp,%rbx)
	mov		$2, FINAL_STEP	// won't need count-only block
	jmp		2f
1:
	mov		$1, FINAL_STEP	// will need count-only block
2:
	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_A
	movdqu		1*16(%rsp,%rbx), MSG1_A
	movdqu		2*16(%rsp,%rbx), MSG2_A
	movdqu		3*16(%rsp,%rbx), MSG3_A

	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqa		MSG0_B, 0*16(%rsp)
	movdqa		MSG1_B, 1*16(%rsp)
	movdqa		MSG2_B, 2*16(%rsp)
	movdqa		MSG3_B, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_B
	movdqu		1*16(%rsp,%rbx), MSG1_B
	movdqu		2*16(%rsp,%rbx), MSG2_B
	movdqu		3*16(%rsp,%rbx), MSG3_B
	jmp		.Lfinup2x_loop_have_data

	// Prepare a padding block, either:
	//
	//	{0x80, 0, 0, 0, ..., count (as __be64)}
	//	This is for a block aligned message.
	//
	//	{   0, 0, 0, 0, ..., count (as __be64)}
	//	This is for a message whose length mod 64 is >= 56.
	//
	// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
	pxor		MSG0_A, MSG0_A
	jmp		1f

.Lfinup2x_finalize_blockaligned:
	// 0x80 byte, already word-swapped (hence in the high byte of the
	// first 32-bit word) since this path skips the pshufb step.
	mov		$0x80000000, %ebx
	movd		%ebx, MSG0_A
1:
	pxor		MSG1_A, MSG1_A
	pxor		MSG2_A, MSG2_A
	// ror $29 == rol $3: convert the byte count to a bit count (COUNT is
	// assumed < 2^61, so this equals a left shift), laid out for the
	// pre-swapped word order; pslldq positions it as the last qword.
	ror		$29, COUNT
	movq		COUNT, MSG3_A
	pslldq		$8, MSG3_A
	// Both messages get the identical padding block.
	movdqa		MSG0_A, MSG0_B
	pxor		MSG1_B, MSG1_B
	pxor		MSG2_B, MSG2_B
	movdqa		MSG3_A, MSG3_B
	mov		$2, FINAL_STEP
	jmp		.Lfinup2x_loop_have_bswapped_data

.Lfinup2x_done:
	// Write the two digests with all bytes in the correct order.
	movdqa		STATE0_A, TMP_A
	movdqa		STATE0_B, TMP_B
	punpcklqdq	STATE1_A, STATE0_A		// GHEF
	punpcklqdq	STATE1_B, STATE0_B
	punpckhqdq	TMP_A, STATE1_A			// ABCD
	punpckhqdq	TMP_B, STATE1_B
	pshufd		$0xB1, STATE0_A, STATE0_A	// HGFE
	pshufd		$0xB1, STATE0_B, STATE0_B
	pshufd		$0x1B, STATE1_A, STATE1_A	// DCBA
	pshufd		$0x1B, STATE1_B, STATE1_B
	pshufb		SHUF_MASK, STATE0_A
	pshufb		SHUF_MASK, STATE0_B
	pshufb		SHUF_MASK, STATE1_A
	pshufb		SHUF_MASK, STATE1_B
	movdqu		STATE0_A, 1*16(OUT1)
	movdqu		STATE0_B, 1*16(OUT2)
	movdqu		STATE1_A, 0*16(OUT1)
	movdqu		STATE1_B, 0*16(OUT2)

	// Restore the original stack pointer and callee-saved registers.
	mov		%rbp, %rsp
	pop		%rbp
	pop		%rbx
	RET
SYM_FUNC_END(__sha256_ni_finup2x)

// The SHA-256 round constants K[0..63] (FIPS 180-4).
.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

// pshufb mask that byte-swaps each 32-bit word (big <-> little endian).
.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203