/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_SHA512

.file "sha2_512_x86_64.S"

.set TEMP1, %rbp
.set TEMP2, %rax
.set TEMP3, %rbx
.set TEMP4, %rcx
.set TEMP5, %rdi

.set YTEMP1, %ymm8
.set YTEMP2, %ymm9
.set YTEMP3, %ymm10
.set YTEMP4, %ymm11
.set YTEMP5, %ymm12
.set YTEMP6, %ymm13
.set YTEMP7, %ymm14

.equ SHA512_wk, 0
.equ SHA512_in, SHA512_wk + 1280
.equ SHA512_hash, SHA512_in + 8
.equ SHA512_num, SHA512_hash + 8
.equ SHA512_rsp, SHA512_num + 8
.equ SHA512_size, SHA512_rsp + 8
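/*
 * For reference only: a C view of the stack frame described by the offsets
 * above (an illustrative sketch, not part of the build; the struct and field
 * names are ours). The function below allocates 1320 bytes, leaving 8 bytes
 * past SHA512_size that are used as a round-loop counter.
 *
 *     #include <stdint.h>
 *     typedef struct {
 *         uint64_t wk[160];   // SHA512_wk:   W[i]+K[i] rows for two interleaved blocks (1280 bytes)
 *         const uint8_t *in;  // SHA512_in:   current input pointer
 *         uint64_t *hash;     // SHA512_hash: pointer to the caller's hash[8]
 *         uint64_t num;       // SHA512_num:  remaining block count
 *         uint64_t rsp;       // SHA512_rsp:  caller's stack pointer, restored on exit
 *     } Sha512Frame;          // SHA512_size == sizeof(Sha512Frame) == 1312 bytes
 */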
.section .rodata
.balign 64
.type g_k512,%object
g_k512:
    .quad 0x428a2f98d728ae22, 0x7137449123ef65cd, 0x428a2f98d728ae22, 0x7137449123ef65cd
    .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
    .quad 0x3956c25bf348b538, 0x59f111f1b605d019, 0x3956c25bf348b538, 0x59f111f1b605d019
    .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
    .quad 0xd807aa98a3030242, 0x12835b0145706fbe, 0xd807aa98a3030242, 0x12835b0145706fbe
    .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
    .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
    .quad 0x9bdc06a725c71235, 0xc19bf174cf692694, 0x9bdc06a725c71235, 0xc19bf174cf692694
    .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
    .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
    .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
    .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
    .quad 0x983e5152ee66dfab, 0xa831c66d2db43210, 0x983e5152ee66dfab, 0xa831c66d2db43210
    .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4, 0xb00327c898fb213f, 0xbf597fc7beef0ee4
    .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0xc6e00bf33da88fc2, 0xd5a79147930aa725
    .quad 0x06ca6351e003826f, 0x142929670a0e6e70, 0x06ca6351e003826f, 0x142929670a0e6e70
    .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x27b70a8546d22ffc, 0x2e1b21385c26c926
    .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
    .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x650a73548baf63de, 0x766a0abb3c77b2a8
    .quad 0x81c2c92e47edaee6, 0x92722c851482353b, 0x81c2c92e47edaee6, 0x92722c851482353b
    .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xa2bfe8a14cf10364, 0xa81a664bbc423001
    .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30, 0xc24b8b70d0f89791, 0xc76c51a30654be30
    .quad 0xd192e819d6ef5218, 0xd69906245565a910, 0xd192e819d6ef5218, 0xd69906245565a910
    .quad 0xf40e35855771202a, 0x106aa07032bbd1b8, 0xf40e35855771202a, 0x106aa07032bbd1b8
    .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
    .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
    .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
    .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
    .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x748f82ee5defb2fc, 0x78a5636f43172f60
    .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec, 0x84c87814a1f0ab72, 0x8cc702081a6439ec
    .quad 0x90befffa23631e28, 0xa4506cebde82bde9, 0x90befffa23631e28, 0xa4506cebde82bde9
    .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b, 0xbef9a3f7b2c67915, 0xc67178f2e372532b
    .quad 0xca273eceea26619c, 0xd186b8c721c0c207, 0xca273eceea26619c, 0xd186b8c721c0c207
    .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
    .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x06f067aa72176fba, 0x0a637dc5a2c898a6
    .quad 0x113f9804bef90dae, 0x1b710b35131c471b, 0x113f9804bef90dae, 0x1b710b35131c471b
    .quad 0x28db77f523047d84, 0x32caab7b40c72493, 0x28db77f523047d84, 0x32caab7b40c72493
    .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
    .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
    .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
.size g_k512, .-g_k512

.balign 64
.type g_endianMask,%object
g_endianMask:
    .quad 0x0001020304050607, 0x08090a0b0c0d0e0f
    .quad 0x0001020304050607, 0x08090a0b0c0d0e0f
.size g_endianMask, .-g_endianMask
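/*
 * g_endianMask is a vpshufb control that reverses the bytes inside each 64-bit
 * lane, turning the big-endian message words of a block into host
 * (little-endian) order. A scalar C equivalent of the per-word conversion
 * (illustrative sketch; LoadBe64 is our name, not a project API):
 *
 *     #include <stdint.h>
 *     static inline uint64_t LoadBe64(const uint8_t *p)
 *     {
 *         uint64_t w = 0;
 *         for (int i = 0; i < 8; i++) {
 *             w = (w << 8) | p[i];    // most significant byte first
 *         }
 *         return w;
 *     }
 */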
/**
 * Macro Description: Performs one round of the 80-round compression, updating the hash state.
 *                    (A C reference sketch follows the macro.)
 * Input register:
 *     addr: base address of the stack workspace
 *     wkOffset: offset of the wi+k512 data from addr
 *     a - h: intermediate hash state variables
 * Registers modified: temp1, temp2, temp3, temp4, temp5
 * Output register:
 *     h: value after this round of update
 *     d: value after this round of update
 *     temp1: BSIG0(a), carried into the next round
 *     temp4: b^a, which becomes b^c in the next round
 * Function/Macro Call: None
 * Implementation Description:
 *     T1 = h + BSIG1(e) + CH(e,f,g) + Kt + Wt
 *     T2 = BSIG0(a) + MAJ(a,b,c)
 *     CH(e, f, g) = (e AND f) XOR ((NOT e) AND g)
 *     MAJ(a, b, c) = (a AND b) XOR (a AND c) XOR (b AND c)
 *                  = CH(a^b, c, b)
 *                  = ((a XOR b) AND c) XOR ((NOT(a XOR b)) AND b)
 *                  = ((b XOR c) AND (a XOR b)) XOR b
 *     BSIG0(x) = ROTR^28(x) XOR ROTR^34(x) XOR ROTR^39(x)
 *     BSIG1(x) = ROTR^14(x) XOR ROTR^18(x) XOR ROTR^41(x)
 *     d += T1; h = T1 + T2
 * Optimization Principle: the macro assumes b^c is in temp4, temp1 equals 0,
 * and f is in temp5 when the round begins, established by:
 *     mov b, temp4
 *     xor temp1, temp1
 *     xor c, temp4
 *     mov f, temp5
 * temp2 and temp4 swap roles between consecutive rounds, and BSIG0(a) is added
 * back to a only after all rounds have finished.
 */
.macro ONE_ROUND a, b, c, d, e, f, g, h, temp1, temp2, temp3, temp4, temp5, addr, wkOffset
    // assumes b^c is in temp4, temp1 equals 0, and f is in temp5 when the round begins
    addq \wkOffset(\addr), \h   // h += Kt + Wt
    and \e, \temp5              // e&f
    rorx $14, \e, \temp2        // ROTR^14(e)
    addq \temp1, \a             // a += BSIG0(a) from last round
    rorx $18, \e, \temp3        // ROTR^18(e)
    andn \g, \e, \temp1         // (~e)&g
    xor \temp2, \temp3          // ROTR^14(e) ^ ROTR^18(e)
    xor \temp1, \temp5          // CH(e,f,g)
    rorx $41, \e, \temp2        // ROTR^41(e)
    addq \temp5, \h             // h += CH(e,f,g)
    xor \temp2, \temp3          // BSIG1(e)
    rorx $28, \a, \temp1        // ROTR^28(a)
    mov \a, \temp2              // a
    addq \temp3, \h             // h += BSIG1(e)
    rorx $34, \a, \temp5        // ROTR^34(a)
    xor \b, \temp2              // b^a, becomes b^c in the next round
    addq \h, \d                 // d += T1
    xor \temp5, \temp1          // ROTR^28(a) ^ ROTR^34(a)
    and \temp2, \temp4          // (b^a) & (b^c)
    rorx $39, \a, \temp3        // ROTR^39(a)
    xor \b, \temp4              // MAJ(a,b,c)
    mov \e, \temp5              // f for the next round
    xor \temp3, \temp1          // BSIG0(a)
    addq \temp4, \h             // h += MAJ(a,b,c)
    // temp2 and temp4 swap roles for the next round
    // BSIG0(a) is added back to a after all rounds have finished
.endm
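/*
 * The macro above can be read against this C reference for one round
 * (an illustrative sketch using the same MAJ identity; Rotr64 and
 * Sha512Round are our names, not project APIs):
 *
 *     #include <stdint.h>
 *     static inline uint64_t Rotr64(uint64_t x, unsigned int n)
 *     {
 *         return (x >> n) | (x << (64u - n));
 *     }
 *
 *     // s[0..7] = a..h; wk = W[t] + K[t] read from the stack workspace.
 *     static void Sha512Round(uint64_t s[8], uint64_t wk)
 *     {
 *         uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
 *         uint64_t e = s[4], f = s[5], g = s[6], h = s[7];
 *         uint64_t t1 = h + (Rotr64(e, 14) ^ Rotr64(e, 18) ^ Rotr64(e, 41)) +
 *                       ((e & f) ^ (~e & g)) + wk;                // BSIG1(e) + CH(e,f,g)
 *         uint64_t t2 = (Rotr64(a, 28) ^ Rotr64(a, 34) ^ Rotr64(a, 39)) +
 *                       (((b ^ c) & (a ^ b)) ^ b);                // BSIG0(a) + MAJ(a,b,c)
 *         s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
 *         s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
 *     }
 */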
/**
 * Macro Description: Performs two rounds of the 80-round compression, updating the
 *                    hash state, and expands the message schedule in parallel.
 *                    (A C reference sketch of the expansion follows the macro.)
 * Input register:
 *     %rsi: base address of the wk stack workspace (implicit)
 *     %rdx: address of the current g_k512 row (implicit)
 *     wkOffset: offset of the wi+k512 data from %rsi
 *     a - h: intermediate hash state variables
 *     wi_17_16: W[i-17..i-16]
 *     wi_15_14: W[i-15..i-14]
 *     wi_9_8: W[i-9..i-8]
 *     wi_7_6: W[i-7..i-6]
 *     wi_3_2: W[i-3..i-2]
 * Registers modified: TEMP1, TEMP2, TEMP3, TEMP4, TEMP5, wi_17_16,
 *     YTEMP1, YTEMP2, YTEMP3, YTEMP4, YTEMP5, YTEMP6, YTEMP7
 * Output register:
 *     h: value after two rounds of update
 *     d: value after two rounds of update
 *     TEMP1: BSIG0(a), carried into the next round
 *     TEMP4: b^a, which becomes b^c in the next round
 *     wi_17_16: expanded message words
 * Function/Macro Call: None
 * Implementation Description:
 *     T1 = h + BSIG1(e) + CH(e,f,g) + Kt + Wt
 *     T2 = BSIG0(a) + MAJ(a,b,c)
 *     CH(e, f, g) = (e AND f) XOR ((NOT e) AND g)
 *     MAJ(a, b, c) = (a AND b) XOR (a AND c) XOR (b AND c)
 *                  = CH(a^b, c, b)
 *                  = ((a XOR b) AND c) XOR ((NOT(a XOR b)) AND b)
 *                  = ((b XOR c) AND (a XOR b)) XOR b
 *     BSIG0(x) = ROTR^28(x) XOR ROTR^34(x) XOR ROTR^39(x)
 *     BSIG1(x) = ROTR^14(x) XOR ROTR^18(x) XOR ROTR^41(x)
 *     d += T1; h = T1 + T2
 *
 *     Message expansion: W[i] = SSIG1(W[i-2]) + W[i-7] + SSIG0(W[i-15]) + W[i-16]
 *     SSIG0(x) = ROTR^1(x) XOR ROTR^8(x) XOR SHR^7(x)
 *     SSIG1(x) = ROTR^19(x) XOR ROTR^61(x) XOR SHR^6(x)
 * Optimization Principle: the macro assumes b^c is in TEMP4, TEMP1 equals 0,
 * and f is in TEMP5 when the round begins, established by:
 *     mov b, TEMP4
 *     xor TEMP1, TEMP1
 *     xor c, TEMP4
 *     mov f, TEMP5
 * TEMP2 and TEMP4 swap roles between consecutive rounds, and BSIG0(a) is added
 * back to a only after all rounds have finished.
 */
.macro TWO_ROUND_UPDATE_2W a, b, c, d, e, f, g, h, wkOffset, wi_17_16, wi_15_14, wi_9_8, wi_7_6, wi_3_2
    // 1st round
    vpalignr $8, \wi_17_16, \wi_15_14, YTEMP1   // W[i-16..i-15]
    vpalignr $8, \wi_9_8, \wi_7_6, YTEMP7       // W[i-8..i-7]
    addq \wkOffset(%rsi), \h                    // h += Kt + Wt
    and \e, TEMP5                               // e&f
    vpsrlq $1, YTEMP1, YTEMP2
    rorx $14, \e, TEMP2                         // ROTR^14(e)
    addq TEMP1, \a                              // a += BSIG0(a) from last round
    vpsrlq $8, YTEMP1, YTEMP3
    rorx $18, \e, TEMP3                         // ROTR^18(e)
    andn \g, \e, TEMP1                          // (~e)&g
    vpsrlq $7, YTEMP1, YTEMP4
    xor TEMP2, TEMP3                            // ROTR^14(e) ^ ROTR^18(e)
    xor TEMP1, TEMP5                            // CH(e,f,g)
    vpsllq $63, YTEMP1, YTEMP5
    rorx $41, \e, TEMP2                         // ROTR^41(e)
    addq TEMP5, \h                              // h += CH(e,f,g)
    vpsllq $56, YTEMP1, YTEMP6
    xor TEMP2, TEMP3                            // BSIG1(e)
    rorx $28, \a, TEMP1                         // ROTR^28(a)
    vpaddq YTEMP7, \wi_17_16, \wi_17_16         // W[i-17..16] + W[i-8..7]
    mov \a, TEMP2                               // a
    addq TEMP3, \h                              // h += BSIG1(e)
    vpxor YTEMP5, YTEMP2, YTEMP2                // ROTR^1(wi_16_15)
    rorx $34, \a, TEMP5                         // ROTR^34(a)
    xor \b, TEMP2                               // b^a, becomes b^c in the next round
    vpxor YTEMP6, YTEMP3, YTEMP3                // ROTR^8(wi_16_15)
    addq \h, \d                                 // d += T1
    xor TEMP5, TEMP1                            // ROTR^28(a) ^ ROTR^34(a)
    vpxor YTEMP4, YTEMP2, YTEMP1                // ROTR^1(wi_16_15) ^ SHR^7(wi_16_15)
    and TEMP2, TEMP4                            // (b^a) & (b^c)
    rorx $39, \a, TEMP3                         // ROTR^39(a)
    vpxor YTEMP3, YTEMP1, YTEMP1                // SSIG0(wi_16_15)
    xor \b, TEMP4                               // MAJ(a,b,c)
    mov \e, TEMP5                               // f for the next round
    vpaddq YTEMP1, \wi_17_16, \wi_17_16         // SSIG0(wi_16_15) + W[i-17..16] + W[i-8..7]
    xor TEMP3, TEMP1                            // BSIG0(a)
    addq TEMP4, \h                              // h += MAJ(a,b,c)
    // TEMP2 and TEMP4 swap roles for the next round

    // 2nd round
    // rotate abcdefgh to habcdefg: the same round runs with the registers shifted by one
    vpsrlq $19, \wi_3_2, YTEMP2
    addq 8+\wkOffset(%rsi), \g                  // h += Kt + Wt
    and \d, TEMP5                               // e&f
    vpsrlq $61, \wi_3_2, YTEMP3
    rorx $14, \d, TEMP4                         // ROTR^14(e)
    addq TEMP1, \h                              // a += BSIG0(a) from last round
    vpsrlq $6, \wi_3_2, YTEMP4
    rorx $18, \d, TEMP3                         // ROTR^18(e)
    andn \f, \d, TEMP1                          // (~e)&g
    vpsllq $45, \wi_3_2, YTEMP5
    xor TEMP4, TEMP3                            // ROTR^14(e) ^ ROTR^18(e)
    xor TEMP1, TEMP5                            // CH(e,f,g)
    vpsllq $3, \wi_3_2, YTEMP6
    rorx $41, \d, TEMP4                         // ROTR^41(e)
    addq TEMP5, \g                              // h += CH(e,f,g)
    vpxor YTEMP5, YTEMP2, YTEMP2                // ROTR^19(wi_3_2)
    xor TEMP4, TEMP3                            // BSIG1(e)
    rorx $28, \h, TEMP1                         // ROTR^28(a)
    vpxor YTEMP6, YTEMP3, YTEMP3                // ROTR^61(wi_3_2)
    mov \h, TEMP4                               // a
    addq TEMP3, \g                              // h += BSIG1(e)
    vpxor YTEMP4, YTEMP2, YTEMP1                // ROTR^19(wi_3_2) ^ SHR^6(wi_3_2)
    rorx $34, \h, TEMP5                         // ROTR^34(a)
    xor \a, TEMP4                               // b^a, becomes b^c in the next round
    vpxor YTEMP3, YTEMP1, YTEMP1                // SSIG1(wi_3_2)
    addq \g, \c                                 // d += T1
    xor TEMP5, TEMP1                            // ROTR^28(a) ^ ROTR^34(a)
    vpaddq YTEMP1, \wi_17_16, \wi_17_16         // SSIG0(wi_16_15) + W[i-17..16] + W[i-8..7] + SSIG1(wi_3_2)
    and TEMP4, TEMP2                            // (b^a) & (b^c)
    rorx $39, \h, TEMP3                         // ROTR^39(a)
    vpaddq \wkOffset(%rdx), \wi_17_16, YTEMP1   // wi + k
    xor \a, TEMP2                               // MAJ(a,b,c)
    mov \d, TEMP5                               // f for the next round
    vmovdqa YTEMP1, \wkOffset + 256(%rsi)       // store W[i]+K for use 16 rounds later
    xor TEMP3, TEMP1                            // BSIG0(a)
    addq TEMP2, \g                              // h += MAJ(a,b,c)
    // TEMP2 and TEMP4 swap roles for the next round
    // BSIG0(a) is added back to a after all rounds have finished
.endm
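/*
 * The vector half of the macro above implements the message schedule. A scalar
 * C reference (illustrative sketch; ExpandW is our name, and Rotr64 is as in
 * the sketch after ONE_ROUND):
 *
 *     // W[i] = SSIG1(W[i-2]) + W[i-7] + SSIG0(W[i-15]) + W[i-16], indices mod 16
 *     static uint64_t ExpandW(uint64_t w[16], int i)   // i >= 16
 *     {
 *         uint64_t w15 = w[(i - 15) & 15], w2 = w[(i - 2) & 15];
 *         uint64_t s0 = Rotr64(w15, 1) ^ Rotr64(w15, 8) ^ (w15 >> 7);  // SSIG0
 *         uint64_t s1 = Rotr64(w2, 19) ^ Rotr64(w2, 61) ^ (w2 >> 6);   // SSIG1
 *         return w[i & 15] = s1 + w[(i - 7) & 15] + s0 + w[i & 15];    // w[i & 15] holds W[i-16]
 *     }
 */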
/**
 * Function description: Performs 80 rounds of compression per 128-byte block on the
 *                       input data and updates the hash value.
 * Function prototype: void SHA512CompressMultiBlocks(uint64_t hash[8], const uint8_t *in, uint32_t num);
 * Input registers:
 *     rdi: pointer to the hash value (hash[8])
 *     rsi: pointer to the input data
 *     rdx: number of 128-byte blocks to process (input length divided by 128)
 * Register usage: ymm0-ymm7 hold the message words of two interleaved blocks.
 *     ymm8-ymm14 are temporary vector registers.
 *     r8-r15 hold the working variables a-h.
 *     The stack temporarily stores wi+k512 (1280 bytes) plus the hash address, in, and num.
 * Output register: None
 * Function/Macro Call: TWO_ROUND_UPDATE_2W, ONE_ROUND
 */
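/*
 * Typical call from C (illustrative only; the IV values are the standard
 * SHA-512 initial hash words, and 'data'/'len' are our names):
 *
 *     uint64_t hash[8] = {
 *         0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
 *         0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
 *     };
 *     // 'len' must be a whole number of 128-byte blocks.
 *     SHA512CompressMultiBlocks(hash, data, (uint32_t)(len / 128));
 */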
.text
.balign 16
.global SHA512CompressMultiBlocks
.type SHA512CompressMultiBlocks, %function
SHA512CompressMultiBlocks:
.cfi_startproc
    cmp $0, %rdx
    je .Lsha512end

    pushq %rbx
    pushq %rbp
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    mov %rsp, %r14
    sub $1320, %rsp
    and $-256, %rsp                 // align rsp down to a 256-byte boundary (vmovdqa needs 32-byte alignment)
    mov %r14, SHA512_rsp(%rsp)      // save the original rsp in the stack frame

    /* load A-H */
    mov 0(%rdi), %r8
    mov 8(%rdi), %r9
    mov 16(%rdi), %r10
    mov 24(%rdi), %r11
    mov 32(%rdi), %r12
    mov 40(%rdi), %r13
    mov 48(%rdi), %r14
    mov 56(%rdi), %r15

    mov %rdi, SHA512_hash(%rsp)
    mov %rsi, SHA512_in(%rsp)       // save the input data address in the stack frame

.Lsha512_loop:
    mov SHA512_in(%rsp), %rsi

    /* Load the first block into the lower 128 bits of the ymm registers. */
    vmovdqu 0(%rsi), %xmm0
    vmovdqu 16(%rsi), %xmm1
    vmovdqu 32(%rsi), %xmm2
    vmovdqu 48(%rsi), %xmm3
    vmovdqu 64(%rsi), %xmm4
    vmovdqu 80(%rsi), %xmm5
    vmovdqu 96(%rsi), %xmm6
    vmovdqu 112(%rsi), %xmm7

    mov %rsi, %rcx
    add $128, %rsi
    cmp $1, %rdx
    cmovne %rsi, %rcx               // if more than one block remains, rcx points to the next block

    mov %rdx, SHA512_num(%rsp)      // save the remaining block count in the stack frame

    /* Load the second block into the upper 128 bits of the ymm registers. */
    vinserti128 $1, 0(%rcx), %ymm0, %ymm0
    vinserti128 $1, 16(%rcx), %ymm1, %ymm1
    vinserti128 $1, 32(%rcx), %ymm2, %ymm2
    vinserti128 $1, 48(%rcx), %ymm3, %ymm3
    vinserti128 $1, 64(%rcx), %ymm4, %ymm4
    vinserti128 $1, 80(%rcx), %ymm5, %ymm5
    vinserti128 $1, 96(%rcx), %ymm6, %ymm6
    vinserti128 $1, 112(%rcx), %ymm7, %ymm7
    add $128, %rcx
    mov %rcx, SHA512_in(%rsp)       // save the updated input data address in the stack frame

    vmovdqa g_endianMask + 0(%rip), %ymm8
    leaq g_k512 + 0(%rip), %rdx
    /* Byte-swap each 64-bit message word (big-endian input to host order). */
    vpshufb %ymm8, %ymm0, %ymm0
    vpshufb %ymm8, %ymm1, %ymm1
    vpshufb %ymm8, %ymm2, %ymm2
    vpshufb %ymm8, %ymm3, %ymm3
    vpshufb %ymm8, %ymm4, %ymm4
    vpshufb %ymm8, %ymm5, %ymm5
    vpshufb %ymm8, %ymm6, %ymm6
    vpshufb %ymm8, %ymm7, %ymm7
    /* w[0..15] + k */
    vpaddq 0(%rdx), %ymm0, %ymm8
    vpaddq 32(%rdx), %ymm1, %ymm9
    vpaddq 64(%rdx), %ymm2, %ymm10
    vpaddq 96(%rdx), %ymm3, %ymm11
    vpaddq 128(%rdx), %ymm4, %ymm12
    vpaddq 160(%rdx), %ymm5, %ymm13
    vpaddq 192(%rdx), %ymm6, %ymm14
    vpaddq 224(%rdx), %ymm7, %ymm15
    /* store wk on the stack */
    vmovdqa %ymm8, 0(%rsp)
    vmovdqa %ymm9, 32(%rsp)
    vmovdqa %ymm10, 64(%rsp)
    vmovdqa %ymm11, 96(%rsp)
    vmovdqa %ymm12, 128(%rsp)
    vmovdqa %ymm13, 160(%rsp)
    vmovdqa %ymm14, 192(%rsp)
    vmovdqa %ymm15, 224(%rsp)

    movq $4, 1312(%rsp)             // loop counter (4 x 16 rounds), stored at offset SHA512_size in the 1320-byte frame
    leaq 0(%rsp), %rsi

    mov %r9, %rcx                   // mov b, TEMP4
    xor %rbp, %rbp                  // xor TEMP1, TEMP1
    xor %r10, %rcx                  // xor c, TEMP4
    mov %r13, %rdi                  // mov f, TEMP5
.Lround00_63:
    leaq 256(%rdx), %rdx

    TWO_ROUND_UPDATE_2W %r8, %r9, %r10, %r11, %r12, %r13, %r14, %r15, 0, %ymm0, %ymm1, %ymm4, %ymm5, %ymm7
    TWO_ROUND_UPDATE_2W %r14, %r15, %r8, %r9, %r10, %r11, %r12, %r13, 32, %ymm1, %ymm2, %ymm5, %ymm6, %ymm0
    TWO_ROUND_UPDATE_2W %r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11, 64, %ymm2, %ymm3, %ymm6, %ymm7, %ymm1
    TWO_ROUND_UPDATE_2W %r10, %r11, %r12, %r13, %r14, %r15, %r8, %r9, 96, %ymm3, %ymm4, %ymm7, %ymm0, %ymm2
    TWO_ROUND_UPDATE_2W %r8, %r9, %r10, %r11, %r12, %r13, %r14, %r15, 128, %ymm4, %ymm5, %ymm0, %ymm1, %ymm3
    TWO_ROUND_UPDATE_2W %r14, %r15, %r8, %r9, %r10, %r11, %r12, %r13, 160, %ymm5, %ymm6, %ymm1, %ymm2, %ymm4
    TWO_ROUND_UPDATE_2W %r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11, 192, %ymm6, %ymm7, %ymm2, %ymm3, %ymm5
    TWO_ROUND_UPDATE_2W %r10, %r11, %r12, %r13, %r14, %r15, %r8, %r9, 224, %ymm7, %ymm0, %ymm3, %ymm4, %ymm6

    leaq 256(%rsi), %rsi
    decq 1312(%rsp)
    jne .Lround00_63
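    /*
     * Each 32-byte wk row on the stack interleaves the two blocks: W[i]+K and
     * W[i+1]+K of the first block sit in the low 16 bytes, and those of the
     * second block in the high 16 bytes. Rounds 64-79 below therefore read the
     * first block's values at offsets 0, 8, 32, 40, ..., skipping the second
     * block's halves; .Lnext_block later replays all 80 rounds for the second
     * block from offsets 16, 24, 48, 56, ...
     */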
    /* rounds 64-79 */
    ONE_ROUND %r8, %r9, %r10, %r11, %r12, %r13, %r14, %r15, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 0
    ONE_ROUND %r15, %r8, %r9, %r10, %r11, %r12, %r13, %r14, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 8
    ONE_ROUND %r14, %r15, %r8, %r9, %r10, %r11, %r12, %r13, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 32
    ONE_ROUND %r13, %r14, %r15, %r8, %r9, %r10, %r11, %r12, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 40
    ONE_ROUND %r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 64
    ONE_ROUND %r11, %r12, %r13, %r14, %r15, %r8, %r9, %r10, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 72
    ONE_ROUND %r10, %r11, %r12, %r13, %r14, %r15, %r8, %r9, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 96
    ONE_ROUND %r9, %r10, %r11, %r12, %r13, %r14, %r15, %r8, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 104

    ONE_ROUND %r8, %r9, %r10, %r11, %r12, %r13, %r14, %r15, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 128
    ONE_ROUND %r15, %r8, %r9, %r10, %r11, %r12, %r13, %r14, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 136
    ONE_ROUND %r14, %r15, %r8, %r9, %r10, %r11, %r12, %r13, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 160
    ONE_ROUND %r13, %r14, %r15, %r8, %r9, %r10, %r11, %r12, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 168
    ONE_ROUND %r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 192
    ONE_ROUND %r11, %r12, %r13, %r14, %r15, %r8, %r9, %r10, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 200
    ONE_ROUND %r10, %r11, %r12, %r13, %r14, %r15, %r8, %r9, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 224
    ONE_ROUND %r9, %r10, %r11, %r12, %r13, %r14, %r15, %r8, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 232
    addq %rbp, %r8                  // a += BSIG0(a) from last round

    leaq -1024(%rsi), %rsi          // rewind rsi to the base of the wk buffer
    /* Update the hash value. */
    mov SHA512_hash(%rsp), %rdi
    mov SHA512_num(%rsp), %rdx
    addq 0(%rdi), %r8
    addq 8(%rdi), %r9
    addq 16(%rdi), %r10
    addq 24(%rdi), %r11
    addq 32(%rdi), %r12
    addq 40(%rdi), %r13
    addq 48(%rdi), %r14
    addq 56(%rdi), %r15
    mov %r8, 0(%rdi)
    mov %r9, 8(%rdi)
    mov %r10, 16(%rdi)
    mov %r11, 24(%rdi)
    mov %r12, 32(%rdi)
    mov %r13, 40(%rdi)
    mov %r14, 48(%rdi)
    mov %r15, 56(%rdi)

    cmp $1, %rdx
    je .Lsha512_finish

    movq $10, 1312(%rsp)            // loop counter (10 x 8 rounds) for the second block

    mov %r9, %rcx                   // mov b, TEMP4
    xor %rbp, %rbp                  // xor TEMP1, TEMP1
    xor %r10, %rcx                  // xor c, TEMP4
    mov %r13, %rdi                  // mov f, TEMP5
.Lnext_block:
    ONE_ROUND %r8, %r9, %r10, %r11, %r12, %r13, %r14, %r15, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 16
    ONE_ROUND %r15, %r8, %r9, %r10, %r11, %r12, %r13, %r14, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 24
    ONE_ROUND %r14, %r15, %r8, %r9, %r10, %r11, %r12, %r13, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 48
    ONE_ROUND %r13, %r14, %r15, %r8, %r9, %r10, %r11, %r12, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 56
    ONE_ROUND %r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 80
    ONE_ROUND %r11, %r12, %r13, %r14, %r15, %r8, %r9, %r10, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 88
    ONE_ROUND %r10, %r11, %r12, %r13, %r14, %r15, %r8, %r9, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 112
    ONE_ROUND %r9, %r10, %r11, %r12, %r13, %r14, %r15, %r8, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 120
    leaq 128(%rsi), %rsi
    decq 1312(%rsp)
    jne .Lnext_block

    addq %rbp, %r8                  // a += BSIG0(a) from last round
    leaq -1280(%rsi), %rsi          // rewind rsi to the base of the wk buffer
    /* Update the hash value. */
    mov SHA512_hash(%rsp), %rdi
    addq 0(%rdi), %r8
    addq 8(%rdi), %r9
    addq 16(%rdi), %r10
    addq 24(%rdi), %r11
    addq 32(%rdi), %r12
    addq 40(%rdi), %r13
    addq 48(%rdi), %r14
    addq 56(%rdi), %r15
    mov %r8, 0(%rdi)
    mov %r9, 8(%rdi)
    mov %r10, 16(%rdi)
    mov %r11, 24(%rdi)
    mov %r12, 32(%rdi)
    mov %r13, 40(%rdi)
    mov %r14, 48(%rdi)
    mov %r15, 56(%rdi)

    sub $2, %rdx
    jne .Lsha512_loop

.Lsha512_finish:
    mov SHA512_rsp(%rsp), %rsp
    popq %r15
    popq %r14
    popq %r13
    popq %r12
    popq %rbp
    popq %rbx

.Lsha512end:
    ret
.cfi_endproc
.size SHA512CompressMultiBlocks, .-SHA512CompressMultiBlocks

#endif