/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_SHA256

.file "sha2_256_x86_64.S"

.set HashAddr, %rdi
.set InAddr, %rsi
.set NUM, %rdx

.set tempFirst, %ebp
.set tempThird, %ebx
.set tempFifth, %edi
.set avx2Temp1, %ymm4
.set avx2Temp2, %ymm5
.set avx2Temp3, %ymm6
.set avx2Temp4, %ymm7
.set avx2Temp5, %ymm10
.set avx2Temp6, %ymm11
.set avx2Temp7, %ymm15

.set BlockFrontMessageW3_0, %xmm0
.set BlockFrontMessageW7_4, %xmm1
.set BlockFrontMessageW11_8, %xmm2
.set BlockFrontMessageW15_12, %xmm3

.set g_maskMerge, %ymm12
.set g_maskShift, %ymm13
.set g_maskTransformEndian, %ymm14

/* Constants used by SHA-256. For the data source, see RFC 4634. */
.section .rodata
.align 64
.type g_K256, %object
g_K256:
    .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size g_K256, .-g_K256

/* Shuffle masks: byte-order transform (offset 0), merge (offset 32), shift (offset 64). */
.balign 64
.type g_mask, %object
g_mask:
    .long 0x00010203,0x04050607, 0x08090a0b,0x0c0d0e0f
    .long 0x00010203,0x04050607, 0x08090a0b,0x0c0d0e0f
    .long 0x03020100,0x0b0a0908, 0xffffffff,0xffffffff
    .long 0x03020100,0x0b0a0908, 0xffffffff,0xffffffff
    .long 0xffffffff,0xffffffff, 0x03020100,0x0b0a0908
    .long 0xffffffff,0xffffffff, 0x03020100,0x0b0a0908
.size g_mask, .-g_mask

/*
 * Macro description: Performs the message-schedule extension of four words for two blocks at the same time
 *      and completes four rounds of the compression function for the first block.
 * Input register:
 *      WkAddr: Offset of the stack space where wi+kt is located.
 *      a - h:  Intermediate variables of the hash value
 * Modified registers: r8d-r15d, ebp, eax, ebx, ecx, edi, ymm0-ymm7, ymm10, ymm11, ymm15
 * Output register:
 *      a - h: Values after four rounds of the update
 *      B3_0:  Value after the message extension
 * Naming convention:
 *      B3_0:   w3-w0
 *      B7_4:   w7-w4
 *      B11_8:  w11-w8
 *      B15_12: w15-w12
 * Function/Macro Call: None
 * Implementation Description:
 *      ONE_ROUND algorithm implementation:
 *          For t = 0 to 63, T1 = h + BSIG1(e) + CH(e,f,g) + Kt + Wt
 *                           T2 = BSIG0(a) + MAJ(a,b,c)
 *                           h = g, g = f, f = e, e = d + T1, d = c, c = b, b = a, a = T1 + T2
 *          CH(x, y, z) = (x AND y) XOR ((NOT x) AND z)                       CH(e,f,g)
 *          MAJ(a, b, c) = (a AND b) XOR (a AND c) XOR (b AND c)
 *                       = CH(a^b, c, b)
 *                       = ((a XOR b) AND c) XOR ((NOT(a XOR b)) AND b)
 *                       = ((b XOR c) AND (a XOR b)) XOR b
 *          BSIG0(x) = ROTR^2(x) XOR ROTR^13(x) XOR ROTR^22(x)                BSIG0(a)
 *          BSIG1(x) = ROTR^6(x) XOR ROTR^11(x) XOR ROTR^25(x)                BSIG1(e)
 *          Optimization idea: b xor c in the next round of MAJ is a xor b in the previous round of MAJ,
 *                             which avoids a redundant calculation.
 *
 *      UPDATE_4W algorithm implementation:
 *          For t = 0 to 15,  Wt = W0..W15 (input w0-w15)
 *          For t = 16 to 63, Wt = SSIG1(W(t-2)) + W(t-7) + SSIG0(W(t-15)) + W(t-16)
 *          SSIG0(x) = ROTR^7(x) XOR ROTR^18(x) XOR SHR^3(x)
 *          SSIG1(x) = ROTR^17(x) XOR ROTR^19(x) XOR SHR^10(x)
 *          Optimization point 1: Each Wi message word is 32-bit and an xmm register is 128-bit,
 *              so the common operations of four Wi words (SSIG0, W(t-16), W(t-7)) can be performed
 *              at the same time. Because of the dependencies between the Wi values, computing four
 *              Wi per step is the best solution found so far.
 *          Optimization point 2: A ymm register is 256-bit, so two 128-bit lanes can be processed
 *              at the same time, letting two blocks share the same calculation.
 */
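/*
 * For reference, a minimal scalar C sketch of the message schedule that the vectorized steps below
 * implement (illustrative only, not part of the build; names are hypothetical, assuming <stdint.h>):
 *
 *     static inline uint32_t Rotr32(uint32_t x, uint32_t n) { return (x >> n) | (x << (32u - n)); }
 *
 *     static void ScheduleSketch(uint32_t w[64])   // w[0..15] already loaded as big-endian words
 *     {
 *         for (int t = 16; t < 64; t++) {
 *             uint32_t ssig0 = Rotr32(w[t - 15], 7) ^ Rotr32(w[t - 15], 18) ^ (w[t - 15] >> 3);
 *             uint32_t ssig1 = Rotr32(w[t - 2], 17) ^ Rotr32(w[t - 2], 19) ^ (w[t - 2] >> 10);
 *             w[t] = ssig1 + w[t - 7] + ssig0 + w[t - 16];
 *         }
 *     }
 *
 * The macro below computes four such Wt per invocation for two blocks at once (one block per
 * 128-bit lane of each ymm register) and interleaves that arithmetic with four compression rounds.
 */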
.macro FOUR_ROUND_UPDATE_4W a, b, c, d, e, f, g, h, tempSwitch2, tempSwitch4, WkAddr, B3_0, B7_4, B11_8, B15_12
    vpalignr $4, \B3_0, \B7_4, avx2Temp1        // avx2Temp1 -> w4_1
    add \WkAddr(%rsp), \h                       // h += Kt + Wt
    and \e, tempFifth                           // e&f
    rorx $6, \e, \tempSwitch2                   // ROTR^6(e)
    add tempFirst, \a                           // a += BSIG0(a) from last round
    rorx $11, \e, tempThird                     // ROTR^11(e)
    andn \g, \e, tempFirst                      // (~e)&g
    xor \tempSwitch2, tempThird                 // ROTR^6(e) ^ ROTR^11(e)
    xor tempFirst, tempFifth                    // CH(e,f,g)
    vpshufd $250, \B15_12, avx2Temp5
    rorx $25, \e, \tempSwitch2                  // ROTR^25(e)
    add tempFifth, \h                           // h += CH(e,f,g)
    xor \tempSwitch2, tempThird                 // BSIG1(e)
    vpalignr $4, \B11_8, \B15_12, avx2Temp2     // avx2Temp2 -> w12_9
    vpslld $14, avx2Temp1, avx2Temp4            // w4_1 << 14
    rorx $2, \a, tempFirst                      // ROTR^2(a)
    mov \a, \tempSwitch2                        // a
    add tempThird, \h                           // h += BSIG1(e) [h -> T1]
    vpsrld $3, avx2Temp1, avx2Temp3             // w4_1 >> 3
    rorx $13, \a, tempFifth                     // ROTR^13(a)
    xor \b, \tempSwitch2                        // b^a for next round b^c
    add \h, \d                                  // d += T1
    vpsrld $10, avx2Temp5, avx2Temp6            // >>10
    xor tempFifth, tempFirst                    // ROTR^2(a) ^ ROTR^13(a)
    and \tempSwitch2, \tempSwitch4              // (b^a) & (b^c)
    vpsrld $7, avx2Temp1, avx2Temp1             // >>7
    vpaddd avx2Temp2, \B3_0, \B3_0
    rorx $22, \a, tempThird                     // ROTR^22(a)
    add 4+\WkAddr(%rsp), \g                     // h += Kt + Wt
    xor \b, \tempSwitch4                        // Maj(a,b,c)
    vpxor avx2Temp3, avx2Temp4, avx2Temp3       // 3 xor 14
    mov \e, tempFifth                           // for next round f
    xor tempThird, tempFirst                    // BSIG0(a)
    vpsrlq $17, avx2Temp5, avx2Temp7            // >>17
    add \tempSwitch4, \h                        // h += Maj(a,b,c)
    and \d, tempFifth                           // e&f
    rorx $6, \d, \tempSwitch4
    add tempFirst, \h                           // a += BSIG0(a) from last round
    vpxor avx2Temp3, avx2Temp1, avx2Temp3       // 7 xor 14 xor 3
    vpsrlq $19, avx2Temp5, avx2Temp5            // >>19
    rorx $11, \d, tempThird
    andn \f, \d, tempFirst
    xor \tempSwitch4, tempThird
    vpsrld $11, avx2Temp1, avx2Temp1            // >>18 in total
    xor tempFirst, tempFifth
    rorx $25, \d, \tempSwitch4
    add tempFifth, \g
    xor \tempSwitch4, tempThird
    vpslld $11, avx2Temp4, avx2Temp4            // <<25 in total
    rorx $2, \h, tempFirst
    mov \h, \tempSwitch4
    add tempThird, \g
    rorx $13, \h, tempFifth
    xor \a, \tempSwitch4
    vpxor avx2Temp7, avx2Temp6, avx2Temp7       // 17 xor 10
    add \g, \c
    xor tempFifth, tempFirst
    vpxor avx2Temp3, avx2Temp1, avx2Temp3       // 7 xor 14 xor 3 xor 18
    and \tempSwitch4, \tempSwitch2
    rorx $22, \h, tempThird
    add 8+\WkAddr(%rsp), \f
    xor \a, \tempSwitch2
    vpxor avx2Temp7, avx2Temp5, avx2Temp7       // 17 xor 10 xor 19
    mov \d, tempFifth
    xor tempThird, tempFirst
    add \tempSwitch2, \g
    vpshufb g_maskMerge, avx2Temp7, avx2Temp7   // SSIG1(w15_14)
    vpxor avx2Temp3, avx2Temp4, avx2Temp3       // 7 xor 14 xor 3 xor 18 xor 25
    and \c, tempFifth
    rorx $6, \c, \tempSwitch2
    add tempFirst, \g
    rorx $11, \c, tempThird
    vpaddd avx2Temp3, \B3_0, \B3_0              // SSIG0(w(t-15)) + w(t-16) + w(t-7)
    andn \e, \c, tempFirst
    xor \tempSwitch2, tempThird
    xor tempFirst, tempFifth
    rorx $25, \c, \tempSwitch2
    add tempFifth, \f
    xor \tempSwitch2, tempThird
    rorx $2, \g, tempFirst
    mov \g, \tempSwitch2
    add tempThird, \f
    rorx $13, \g, tempFifth
    vpaddd \B3_0, avx2Temp7, \B3_0              // w17_16
    xor \h, \tempSwitch2
    add \f, \b
    xor tempFifth, tempFirst
    and \tempSwitch2, \tempSwitch4
    vpshufd $80, \B3_0, avx2Temp1
    rorx $22, \g, tempThird
    add 12+\WkAddr(%rsp), \e
    xor \h, \tempSwitch4
    mov \c, tempFifth
    xor tempThird, tempFirst
    add \tempSwitch4, \f
    vpsrld $10, avx2Temp1, avx2Temp2            // >>10
    and \b, tempFifth
    rorx $6, \b, \tempSwitch4
    vpsrlq $17, avx2Temp1, avx2Temp3            // >>17
    add tempFirst, \f
    rorx $11, \b, tempThird
    andn \d, \b, tempFirst
    xor \tempSwitch4, tempThird
    vpsrlq $19, avx2Temp1, avx2Temp1            // >>19
    xor tempFirst, tempFifth
    rorx $25, \b, \tempSwitch4
    add tempFifth, \e
    xor \tempSwitch4, tempThird
    vpxor avx2Temp2, avx2Temp3, avx2Temp3       // 10 xor 17
    rorx $2, \f, tempFirst
    mov \f, \tempSwitch4
    add tempThird, \e
    rorx $13, \f, tempFifth
    xor \g, \tempSwitch4
    vpxor avx2Temp3, avx2Temp1, avx2Temp3       // 10 xor 17 xor 19
    add \e, \a
    xor tempFifth, tempFirst
    and \tempSwitch4, \tempSwitch2
    rorx $22, \f, tempThird
    vpshufb g_maskShift, avx2Temp3, avx2Temp3   // move SSIG1(w17_16) to the desired lanes
    xor \g, \tempSwitch2
    mov \b, tempFifth
    xor tempThird, tempFirst
    add \tempSwitch2, \e
    vpaddd avx2Temp3, \B3_0, \B3_0              // W19_16
.endm

/*
 * Macro description: Performs one round of the hash-value update within the 64-round compression.
 * Input register:
 *      WkAddr: Offset of the stack space where wi+kt is located.
 *      a - h:  Intermediate variables of the hash value
 * Modified registers: r8d-r15d, ebp, eax, ebx, ecx, edi
 * Output register:
 *      a - h: Values after one round of the update
 * Function/Macro Call: None
 * ONE_ROUND algorithm implementation:
 *      For t = 0 to 63, T1 = h + BSIG1(e) + CH(e,f,g) + Kt + Wt
 *                       T2 = BSIG0(a) + MAJ(a,b,c)
 *                       h = g, g = f, f = e, e = d + T1, d = c, c = b, b = a, a = T1 + T2
 *      CH(x, y, z) = (x AND y) XOR ((NOT x) AND z)                       CH(e,f,g)
 *      MAJ(a, b, c) = (a AND b) XOR (a AND c) XOR (b AND c)
 *                   = CH(a^b, c, b)
 *                   = ((a XOR b) AND c) XOR ((NOT(a XOR b)) AND b)
 *                   = ((b XOR c) AND (a XOR b)) XOR b
 *      BSIG0(x) = ROTR^2(x) XOR ROTR^13(x) XOR ROTR^22(x)                BSIG0(a)
 *      BSIG1(x) = ROTR^6(x) XOR ROTR^11(x) XOR ROTR^25(x)                BSIG1(e)
 *      Optimization idea: b xor c in the next round of MAJ is a xor b in the previous round of MAJ,
 *                         which avoids a redundant calculation.
 *      Note: At the end of each round, tempSwitch2 and tempSwitch4 must be swapped for the next round.
 */
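/*
 * For reference, a minimal scalar C sketch of one compression round as described above
 * (illustrative only, not part of the build; names are hypothetical, assuming <stdint.h>):
 *
 *     static inline uint32_t Rotr32(uint32_t x, uint32_t n) { return (x >> n) | (x << (32u - n)); }
 *
 *     // s[0..7] holds a..h; wk is Wt + Kt for this round.
 *     static void RoundSketch(uint32_t s[8], uint32_t wk)
 *     {
 *         uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4], f = s[5], g = s[6], h = s[7];
 *         uint32_t bsig1 = Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25);
 *         uint32_t ch = (e & f) ^ (~e & g);
 *         uint32_t t1 = h + bsig1 + ch + wk;
 *         uint32_t bsig0 = Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22);
 *         uint32_t maj = ((b ^ c) & (a ^ b)) ^ b;          // MAJ via the CH(a^b, c, b) rewrite
 *         uint32_t t2 = bsig0 + maj;
 *         s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
 *         s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
 *     }
 *
 * In the assembly the working variables are renamed instead of moved, and a xor b from one round
 * is reused as b xor c in the next round (via tempSwitch2/tempSwitch4), so it is never recomputed.
 */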
.macro ONE_ROUND a, b, c, d, e, f, g, h, tempSwitch2, tempSwitch4, WkAddr
    rorx $11, \e, tempThird                     // ROTR^11(e)
    rorx $6, \e, \tempSwitch2                   // ROTR^6(e)
    add tempFirst, \a                           // a += BSIG0(a) from last round
    and \e, tempFifth                           // e&f
    andn \g, \e, tempFirst                      // (~e)&g
    xor \tempSwitch2, tempThird                 // ROTR^6(e) ^ ROTR^11(e)
    add \WkAddr(%rsp), \h                       // h += Kt + Wt
    xor tempFirst, tempFifth                    // CH(e,f,g)
    rorx $25, \e, \tempSwitch2                  // ROTR^25(e)
    add tempFifth, \h                           // h += CH(e,f,g)
    xor \tempSwitch2, tempThird                 // BSIG1(e)
    rorx $2, \a, tempFirst                      // ROTR^2(a)
    mov \a, \tempSwitch2                        // a
    leal (tempThird, \h), \h                    // h += BSIG1(e) [h -> T1]
    rorx $13, \a, tempFifth                     // ROTR^13(a)
    xor \b, \tempSwitch2                        // b^a for next round b^c
    add \h, \d                                  // d += T1
    xor tempFifth, tempFirst                    // ROTR^2(a) ^ ROTR^13(a)
    and \tempSwitch2, \tempSwitch4              // (b^a) & (b^c)
    rorx $22, \a, tempThird                     // ROTR^22(a)
    xor \b, \tempSwitch4                        // Maj(a,b,c)
    mov \e, tempFifth                           // for next round f
    xor tempThird, tempFirst                    // BSIG0(a)
    add \tempSwitch4, \h                        // h += Maj(a,b,c)
.endm

/*
 * Function description: Performs the 64-round compression calculation on each input block and updates the hash value.
 * Function prototype: void SHA256CompressMultiBlocks(uint32_t hash[8], const uint8_t *in, uint32_t num);
 * Input register:
 *      rdi: Address where the hash value is stored
 *      rsi: Pointer to the input data (Wi)
 *      rdx: Number of 64-byte blocks to process (one 64-round loop per block)
 * Modified registers: rax, rbx, rcx, rdx, rsi, rdi, rbp, r8-r15, ymm0-ymm15
 * Output register: None
 * Function/Macro Call: None
 */
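/*
 * For reference, a minimal C sketch of how this routine is typically driven (illustrative only,
 * not part of the build; the buffer and count names are hypothetical, assuming <stdint.h>):
 *
 *     // SHA-256 initial hash value (FIPS 180-4 / RFC 4634).
 *     uint32_t hash[8] = {
 *         0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
 *         0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
 *     };
 *     // data points to numBlocks * 64 bytes of already-padded message.
 *     SHA256CompressMultiBlocks(hash, data, numBlocks);
 *     // hash[0..7] now holds the updated state.
 *
 * The loop below consumes two 64-byte blocks per iteration when possible: the message schedule for
 * both blocks is computed together in the ymm registers, and the second block is then compressed
 * from the precomputed wi+kt values stored on the stack.
 */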
.text
.globl SHA256CompressMultiBlocks
.type SHA256CompressMultiBlocks,%function
.align 4
SHA256CompressMultiBlocks:
.cfi_startproc
    /* Return immediately if the block count is 0. */
    cmp $0, NUM
    je .LEND_SHA256

    /* Save the callee-saved registers. */
    pushq %r14
    pushq %rbx
    pushq %rbp
    pushq %r12
    pushq %r13
    pushq %r15

    /* Reserve and align the stack space, save the original rsp at 0(%rsp), and load the shuffle masks. */
    mov %rsp, %r14
    mov 0(HashAddr), %r8d
    sub $600, %rsp
    vmovdqa g_mask + 0(%rip), g_maskTransformEndian
    mov 4(HashAddr), %r9d
    mov 8(HashAddr), %r10d
    and $-256, %rsp
    vmovdqa g_mask + 64(%rip), g_maskShift
    mov 12(HashAddr), %r11d
    mov %r14, 0(%rsp)

    /* r8d-r15d: a-h */
    mov 16(HashAddr), %r12d
    mov 20(HashAddr), %r13d
    vmovdqa g_mask + 32(%rip), g_maskMerge
    mov 24(HashAddr), %r14d
    mov 28(HashAddr), %r15d

.LEND_SHA256_LOOP:
    mov InAddr, %rcx

    /* Load the data of the first block into the lower 128 bits of the ymm registers. */
    vmovdqu 0(InAddr), BlockFrontMessageW3_0
    vmovdqu 16(InAddr), BlockFrontMessageW7_4
    vmovdqu 32(InAddr), BlockFrontMessageW11_8
    vmovdqu 48(InAddr), BlockFrontMessageW15_12

    /* Decide whether a second block follows. */
    leaq 64(InAddr), InAddr
    cmp $1, NUM
    cmovne InAddr, %rcx                         // If num is greater than 1, rcx points to the next block.

    /* Load the data of another block into the upper 128 bits of the ymm registers. */
    vinserti128 $1, 0(%rcx), %ymm0, %ymm0
    vinserti128 $1, 16(%rcx), %ymm1, %ymm1
    vpshufb g_maskTransformEndian, %ymm0, %ymm0
    mov NUM, 16(%rsp)
    vinserti128 $1, 32(%rcx), %ymm2, %ymm2
    mov HashAddr, 24(%rsp)
    vpshufb g_maskTransformEndian, %ymm1, %ymm1
    vinserti128 $1, 48(%rcx), %ymm3, %ymm3
    vpshufb g_maskTransformEndian, %ymm2, %ymm2

    add $64, %rcx
    leaq g_K256(%rip), NUM

    /* Convert little-endian to big-endian and compute wi + kt into ymm8-ymm11. */
    mov %rcx, 8(%rsp)
    leaq 32(%rsp), %rsp
    vpaddd 0(NUM), %ymm0, %ymm8
    mov %r9d, %ecx
    vpaddd 32(NUM), %ymm1, %ymm9
    vmovdqa %ymm8, 0(%rsp)
    vpshufb g_maskTransformEndian, %ymm3, %ymm3
    xor %ebp, %ebp
    vpaddd 64(NUM), %ymm2, %ymm10
    vmovdqu %ymm9, 32(%rsp)
    xor %r10d, %ecx
    vpaddd 96(NUM), %ymm3, %ymm11
    mov %r13d, %edi
    vmovdqa %ymm10, 64(%rsp)
    vmovdqu %ymm11, 96(%rsp)

.LEND_SHA256_ROUND_00_47:

    /* 16 rounds of compression + 16 words of message expansion; wi + kt for the next 16 rounds goes to ymm8-ymm11. */
    /* FOUR_ROUND_UPDATE_4W a, b, c, d, e, f, g, h, tempSwitch2, tempSwitch4, WkAddr, B3_0, B7_4, B11_8, B15_12 */
    FOUR_ROUND_UPDATE_4W %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 0, %ymm0, %ymm1, %ymm2, %ymm3
    leaq 128(NUM), NUM
    FOUR_ROUND_UPDATE_4W %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 32, %ymm1, %ymm2, %ymm3, %ymm0
    vpaddd 0(NUM), %ymm0, %ymm8
    FOUR_ROUND_UPDATE_4W %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 64, %ymm2, %ymm3, %ymm0, %ymm1
    vpaddd 32(NUM), %ymm1, %ymm9
    vmovdqa %ymm8, 128(%rsp)
    FOUR_ROUND_UPDATE_4W %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 96, %ymm3, %ymm0, %ymm1, %ymm2
    vpaddd 64(NUM), %ymm2, %ymm10
    vmovdqa %ymm9, 160(%rsp)
    vpaddd 96(NUM), %ymm3, %ymm11
    vmovdqu %ymm10, 192(%rsp)
    vmovdqa %ymm11, 224(%rsp)

    /* 16 rounds of compression + 16 words of message expansion; wi + kt for the next 16 rounds goes to ymm8-ymm11. */
    /* FOUR_ROUND_UPDATE_4W a, b, c, d, e, f, g, h, tempSwitch2, tempSwitch4, WkAddr, B19_16, B23_20, B27_24, B31_28 */
    FOUR_ROUND_UPDATE_4W %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 128, %ymm0, %ymm1, %ymm2, %ymm3
    leaq 128(NUM), NUM
    FOUR_ROUND_UPDATE_4W %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 160, %ymm1, %ymm2, %ymm3, %ymm0
    vpaddd 0(NUM), %ymm0, %ymm8
    FOUR_ROUND_UPDATE_4W %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 192, %ymm2, %ymm3, %ymm0, %ymm1
    vpaddd 32(NUM), %ymm1, %ymm9
    vmovdqa %ymm8, 256(%rsp)
    FOUR_ROUND_UPDATE_4W %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 224, %ymm3, %ymm0, %ymm1, %ymm2
    vpaddd 64(NUM), %ymm2, %ymm10
    vmovdqa %ymm9, 288(%rsp)
    vpaddd 96(NUM), %ymm3, %ymm11
    vmovdqu %ymm10, 320(%rsp)
    vmovdqa %ymm11, 352(%rsp)

    /* 16 rounds of compression + 16 words of message expansion; wi + kt for the next 16 rounds goes to ymm8-ymm11. */
    /* FOUR_ROUND_UPDATE_4W a, b, c, d, e, f, g, h, tempSwitch2, tempSwitch4, WkAddr, B35_32, B39_36, B43_40, B47_44 */
    FOUR_ROUND_UPDATE_4W %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 256, %ymm0, %ymm1, %ymm2, %ymm3
    leaq 128(NUM), NUM
    FOUR_ROUND_UPDATE_4W %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 288, %ymm1, %ymm2, %ymm3, %ymm0
    vpaddd 0(NUM), %ymm0, %ymm8
    FOUR_ROUND_UPDATE_4W %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 320, %ymm2, %ymm3, %ymm0, %ymm1
    vpaddd 32(NUM), %ymm1, %ymm9
    vmovdqa %ymm8, 384(%rsp)
    FOUR_ROUND_UPDATE_4W %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 352, %ymm3, %ymm0, %ymm1, %ymm2
    vpaddd 64(NUM), %ymm2, %ymm10
    vmovdqa %ymm9, 416(%rsp)
    vpaddd 96(NUM), %ymm3, %ymm11
    vmovdqu %ymm10, 448(%rsp)
    vmovdqa %ymm11, 480(%rsp)

.LEND_SHA256_ROUND_48_63:
    /* ONE_ROUND a, b, c, d, e, f, g, h, tempSwitch2, tempSwitch4, WkAddr */
    ONE_ROUND %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 384
    ONE_ROUND %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 388
    ONE_ROUND %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 392
    ONE_ROUND %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 396

    ONE_ROUND %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 416
    ONE_ROUND %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 420
    ONE_ROUND %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 424
    ONE_ROUND %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 428

    ONE_ROUND %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 448
    ONE_ROUND %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 452
    ONE_ROUND %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 456
    ONE_ROUND %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 460

    ONE_ROUND %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 480
    ONE_ROUND %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 484
    ONE_ROUND %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 488
    ONE_ROUND %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 492

    sub $32, %rsp
    add %ebp, %r8d                              // a += BSIG0(a) from the last round
    mov 24(%rsp), HashAddr

    /* Update the stored hash value. */
    add 0(HashAddr), %r8d
    add 4(HashAddr), %r9d
    mov %r8d, 0(HashAddr)
    add 8(HashAddr), %r10d
    mov %r9d, 4(HashAddr)
    add 12(HashAddr), %r11d
    mov %r10d, 8(HashAddr)
    add 16(HashAddr), %r12d
    mov 16(%rsp), NUM
    mov %r11d, 12(HashAddr)
    add 20(HashAddr), %r13d
    mov %r12d, 16(HashAddr)
    add 24(HashAddr), %r14d
    mov %r13d, 20(HashAddr)
    add 28(HashAddr), %r15d
    mov %r14d, 24(HashAddr)
    mov %r15d, 28(HashAddr)

    cmp $1, NUM
    je .LEND_SHA256_FINFISH_INITIAL

    /* Compress the data of the second block. */
    xor %ebp, %ebp
    mov %r9d, %ecx
    xor %r10d, %ecx
    mov %r13d, %edi

.LEND_SHA256_NEXT_BLOCK:
    /* 0-15 */
    /* ONE_ROUND a, b, c, d, e, f, g, h, tempSwitch2, tempSwitch4, WkAddr */
    ONE_ROUND %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 16+32
    ONE_ROUND %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 20+32
    ONE_ROUND %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 24+32
    ONE_ROUND %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 28+32

    ONE_ROUND %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 48+32
    ONE_ROUND %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 52+32
    ONE_ROUND %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 56+32
    ONE_ROUND %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 60+32

    ONE_ROUND %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 80+32
    ONE_ROUND %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 84+32
    ONE_ROUND %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 88+32
    ONE_ROUND %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 92+32

    ONE_ROUND %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 112+32
    ONE_ROUND %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 116+32
    ONE_ROUND %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 120+32
    ONE_ROUND %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 124+32

    /* 16-31 */
    ONE_ROUND %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 16+128+32
    ONE_ROUND %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 20+128+32
    ONE_ROUND %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 24+128+32
    ONE_ROUND %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 28+128+32

    ONE_ROUND %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 48+128+32
    ONE_ROUND %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 52+128+32
    ONE_ROUND %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 56+128+32
    ONE_ROUND %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 60+128+32

    ONE_ROUND %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 80+128+32
    ONE_ROUND %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 84+128+32
    ONE_ROUND %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 88+128+32
    ONE_ROUND %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 92+128+32

    ONE_ROUND %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 112+128+32
    ONE_ROUND %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 116+128+32
    ONE_ROUND %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 120+128+32
    ONE_ROUND %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 124+128+32

    /* 32-47 */
    ONE_ROUND %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 16+256+32
    ONE_ROUND %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 20+256+32
    ONE_ROUND %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 24+256+32
    ONE_ROUND %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 28+256+32

    ONE_ROUND %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 48+256+32
    ONE_ROUND %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 52+256+32
    ONE_ROUND %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 56+256+32
    ONE_ROUND %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 60+256+32

    ONE_ROUND %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 80+256+32
    ONE_ROUND %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 84+256+32
    ONE_ROUND %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 88+256+32
    ONE_ROUND %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 92+256+32

    ONE_ROUND %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 112+256+32
    ONE_ROUND %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 116+256+32
    ONE_ROUND %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 120+256+32
    ONE_ROUND %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 124+256+32

    /* 48-63 */
    ONE_ROUND %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 16+384+32
    ONE_ROUND %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 20+384+32
    ONE_ROUND %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 24+384+32
    ONE_ROUND %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 28+384+32

    ONE_ROUND %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 48+384+32
    ONE_ROUND %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 52+384+32
    ONE_ROUND %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 56+384+32
    ONE_ROUND %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 60+384+32

    ONE_ROUND %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 80+384+32
    ONE_ROUND %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 84+384+32
    ONE_ROUND %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 88+384+32
    ONE_ROUND %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 92+384+32

    ONE_ROUND %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 112+384+32
    ONE_ROUND %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 116+384+32
    ONE_ROUND %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 120+384+32
    ONE_ROUND %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 124+384+32

    mov 24(%rsp), HashAddr
    lea (%ebp, %r8d), %r8d                      // a += BSIG0(a) from the last round

    /* Update the stored hash value. */
    add 0(HashAddr), %r8d
    add 4(HashAddr), %r9d
    mov %r8d, 0(HashAddr)
    add 8(HashAddr), %r10d
    mov %r9d, 4(HashAddr)
    add 12(HashAddr), %r11d
    mov %r10d, 8(HashAddr)
    add 16(HashAddr), %r12d
    mov %r11d, 12(HashAddr)
    add 20(HashAddr), %r13d
    mov %r12d, 16(HashAddr)
    mov 8(%rsp), InAddr
    add 24(HashAddr), %r14d
    mov %r13d, 20(HashAddr)
    mov 16(%rsp), NUM
    add 28(HashAddr), %r15d
    mov %r14d, 24(HashAddr)
    mov %r15d, 28(HashAddr)

    sub $2, NUM
    ja .LEND_SHA256_LOOP

.LEND_SHA256_FINFISH_INITIAL:
    /* Restore rsp and the callee-saved registers. */
    mov 0(%rsp), %rsp
    popq %r15
    popq %r13
    popq %r12
    popq %rbp
    popq %rbx
    popq %r14

.LEND_SHA256:
    ret
.cfi_endproc
.size SHA256CompressMultiBlocks, .-SHA256CompressMultiBlocks

#endif