/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_SHA1

.file   "sha1_x86_64.S"
.text

.set INPUT, %rdi
.set LEN, %rsi
.set HASH, %rdx

.set A, %r8d
.set B, %r9d
.set C, %r10d
.set D, %r11d
.set E, %r12d

.set TEMP, %r13d
.set TEMP1, %r15d
.set TEMP2, %ebx
.set TEMP3, %eax
.set BLK0, %xmm0
.set BLK1, %xmm1
.set BLK2, %xmm2
.set BLK3, %xmm3

.set ZERO, %ymm4
.set EXPAND0, %ymm5
.set EXPAND1, %ymm6
.set EXPAND2, %ymm7
.set EXPAND3, %ymm8
.set TEMP_W0, %ymm9
.set TEMP_W1, %ymm10
.set TEMP_W2, %ymm11
.set KNUM, %ymm12

/* SHA-1 round constants, replicated across all eight dword lanes */
.section .rodata
.balign 64
.type g_k, %object
g_k:
    .long 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999 // K_00_19
    .long 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1 // K_20_39
    .long 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc // K_40_59
    .long 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 // K_60_79
.size g_k, .-g_k

/* byte-order inversion mask */
.balign 64
.type endian_mask, %object
endian_mask:
    .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
    .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
.size endian_mask, .-endian_mask

/**
 * Macro Description: Message compression for rounds 0-18; pre-computes the next round's F0 and b.
 * Input registers:
 *     a - e, temp: intermediate variables of the hash value
 *     addr: stack address of W + Kt
 *     wkOffset: read offset of W + Kt
 *     temp1-2: temporary registers
 * Modified registers: a e temp temp1 temp2
 * Output registers:
 *     a: next round's F0
 *     e: value after one round of update
 *     temp: next round's b
 * Macro implementation: F0(b,c,d) = (b AND c) OR ((NOT b) AND d)
 *                                 = (((b) & (c)) | ((~(b)) & (d)))
 *     e = S^5(a) + F0(b,c,d) + e + W(i) + K(i)
 *     temp = S^30(b)
 */
.macro ROUND00_18 a, temp, b, c, d, e, addr, wkOffset, temp1, temp2
    addl \wkOffset(\addr), \e   // e = e + W + KT
    andn \c, \a, \temp1         // Next (~(b)) & (d)
    addl \temp, \e              // e = F0(b, c, d) + e + W + KT
    rorxl $27, \a, \temp2       // temp2 = ROTL32(a, 5)
    rorxl $2, \a, \temp         // Next ROTL32(b, 30)
    and \b, \a                  // Next ((b) & (c))
    addl \temp2, \e             // e = F0(b, c, d) + e + W + KT + S^5(a)
    or \temp1, \a               // Next (((b) & (c)) | ((~(b)) & (d)))
.endm
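
/*
 * Reference model (not assembled): a minimal C sketch of one F0 round as
 * scheduled by ROUND00_18, assuming wk already holds W(i) + K(i). The
 * macro's rotating register assignment is written out as plain moves here.
 *
 *     static inline uint32_t Rotl32(uint32_t x, uint32_t n)
 *     {
 *         return (x << n) | (x >> (32 - n));
 *     }
 *
 *     // One round for t in [0, 19]: F0 = Ch(b, c, d)
 *     static void Round00To19(uint32_t s[5], uint32_t wk)
 *     {
 *         uint32_t f0 = (s[1] & s[2]) | (~s[1] & s[3]);
 *         uint32_t e  = Rotl32(s[0], 5) + f0 + s[4] + wk;
 *         s[4] = s[3]; s[3] = s[2];
 *         s[2] = Rotl32(s[1], 30);
 *         s[1] = s[0]; s[0] = e;
 *     }
 */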

/**
 * Macro Description: Four rounds of 0-18 message compression plus rounds 16-31 message
 *     expansion; pre-computes the next round's F0 and b.
 * Input registers:
 *     a - e, temp: intermediate variables of the hash value
 *     addr: stack address of W + Kt
 *     wkOffset: read offset of W + Kt
 *     temp1-2: temporary registers
 *     wt_16_13: w(t-16) ~ w(t-13)
 *     wt_12_9: w(t-12) ~ w(t-9)
 *     wt_8_5: w(t-8) ~ w(t-5)
 *     wt_4_1: w(t-4) ~ w(t-1)
 *     expand0: receives w(t) ~ w(t+3)
 *     tempw0-2: temporary registers
 *     zero: register holding the value zero
 *     knum: K constant
 * Modified registers: a b c d e temp temp1 temp2 expand0 tempw0 tempw1 tempw2
 * Output registers:
 *     a: third round's b
 *     b: value after four rounds of update
 *     c: next round's F0
 *     d: next round's b
 *     e: fourth round's b
 *     temp: next b
 *     expand0: value after one round of expansion
 * Macro implementation: F0(b,c,d) = (b AND c) OR ((NOT b) AND d)
 *                                 = (((b) & (c)) | ((~(b)) & (d)))
 *     temp = S^5(a) + F0(b,c,d) + e + W(i) + K(i)
 *     b = S^30(b)
 *     W(t  ) = ROL(W(t-3) ^ W(t-8) ^ W(t-14) ^ W(t-16), 1)
 *     W(t+1) = ROL(W(t-2) ^ W(t-7) ^ W(t-13) ^ W(t-15), 1)
 *     W(t+2) = ROL(W(t-1) ^ W(t-6) ^ W(t-12) ^ W(t-14), 1)
 *     W(t+3) = ROL(0 ^ W(t-5) ^ W(t-11) ^ W(t-13), 1)
 *     W(t+3) = W(t+3) ^ ROL(W(t), 1)
 */
.macro ROUND00_18_EXPAND a, temp, b, c, d, e, addr, wkOffset, wt_16_13, wt_12_9, wt_8_5, wt_4_1, expand0
    vpalignr $8, \wt_16_13, \wt_12_9, TEMP_W1   // Expand w(t-14) w(t-13) w(t-12) w(t-11)
    addl \wkOffset(\addr), \e                   // e = e + W + KT
    andn \c, \a, TEMP1                          // Next (~(b)) & (d)
    addl \temp, \e                              // e = F0 + e + W + KT
    vpalignr $4, \wt_4_1, ZERO, TEMP_W0         // Expand w(t-3) w(t-2) w(t-1) 0
    vpxor \wt_8_5, \wt_16_13, \expand0          // Expand w(t-8) ^ w(t-16)
    rorxl $27, \a, TEMP2                        // temp2 = ROTL32(a, 5)
    rorxl $2, \a, \temp                         // Next ROTL32(b, 30)
    and \b, \a                                  // Next ((b) & (c))
    vpxor TEMP_W1, \expand0, \expand0           // Expand w(t-14) ^ w(t-8) ^ w(t-16)
    addl TEMP2, \e                              // e = F0 + e + W + KT + S^5(a)
    or TEMP1, \a                                // Next F0 done

    addl \wkOffset + 4(\addr), \d               // Next d = d + W + KT
    vpxor TEMP_W0, \expand0, TEMP_W0            // Expand tempw0 = w[t:t+4] before rol 1
    andn \b, \e, TEMP1                          // Next F0
    addl \a, \d                                 // d = F0 + d + W + KT
    rorxl $27, \e, TEMP2                        // temp2 = ROTL32(e, 5)
    rorxl $2, \e, \a                            // Next ROTL32(e, 30)
    vpalignr $4, ZERO, TEMP_W0, TEMP_W1         // Expand tempw1 = 0 0 0 w(t)
    and \temp, \e                               // Next F0
    addl TEMP2, \d                              // d = F0 + d + W + KT + S^5(e)
    or TEMP1, \e                                // Next F0 done

    vpsrld $31, TEMP_W0, \expand0               // Expand ROL(w(t) ... w(t+3), 1)
    addl \wkOffset + 8(\addr), \c               // c = c + W + KT
    vpaddd TEMP_W0, TEMP_W0, TEMP_W0            // Expand ROL(w(t) ... w(t+3), 1)
    andn \temp, \d, TEMP1                       // Next F0
    addl \e, \c                                 // c = F0 + c + W + KT
    rorxl $27, \d, TEMP2                        // temp2 = ROTL32(d, 5)
    rorxl $2, \d, \e                            // Next ROTL32(d, 30)
    vpsrld $30, TEMP_W1, TEMP_W2                // Expand ROL(w(t), 2)
    and \a, \d                                  // Next F0
    addl TEMP2, \c                              // c = F0 + c + W + KT + S^5(d)
    or TEMP1, \d                                // Next F0 done

    vpslld $2, TEMP_W1, TEMP_W1                 // Expand ROL(w(t), 2)
    vpxor \expand0, TEMP_W0, \expand0           // Expand ROL(w(t) ... w(t+3), 1)
    addl \wkOffset + 12(\addr), \b              // b = b + W + KT
    andn \a, \c, TEMP1                          // Next F0
    vpxor TEMP_W2, TEMP_W1, TEMP_W0             // Expand ROL(w(t), 2)
    addl \d, \b                                 // b = F0 + b + W + KT
    rorxl $27, \c, TEMP2                        // temp2 = ROTL32(c, 5)
    rorxl $2, \c, \d                            // Next ROTL32(c, 30)
    vpxor \expand0, TEMP_W0, \expand0           // Expand w[t:t+4]
    and \e, \c                                  // Next F0
    addl TEMP2, \b                              // b = F0 + b + W + KT + S^5(c)
    vpaddd KNUM, \expand0, TEMP_W0              // Expand w + k
    or TEMP1, \c                                // Next F0 done
    vmovdqa TEMP_W0, \wkOffset + 128(\addr)
.endm
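
/*
 * Reference model (not assembled): the four-lane expansion performed by
 * ROUND00_18_EXPAND, written lane by lane in C. The last lane is computed
 * with a zero placeholder and patched afterwards, because w(t) is produced
 * in the same step it is consumed. Rotl32 is the helper sketched above.
 *
 *     // Expand w[t..t+3] for t in [16, 31]
 *     static void Expand4(uint32_t *w, int t)
 *     {
 *         w[t]     = Rotl32(w[t - 3] ^ w[t - 8] ^ w[t - 14] ^ w[t - 16], 1);
 *         w[t + 1] = Rotl32(w[t - 2] ^ w[t - 7] ^ w[t - 13] ^ w[t - 15], 1);
 *         w[t + 2] = Rotl32(w[t - 1] ^ w[t - 6] ^ w[t - 12] ^ w[t - 14], 1);
 *         w[t + 3] = Rotl32(0       ^ w[t - 5] ^ w[t - 11] ^ w[t - 13], 1);
 *         w[t + 3] ^= Rotl32(w[t], 1);   // fixup: w(t) was still 0 above
 *     }
 */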

/**
 * Macro Description: Message compression for rounds 20-39 and 60-79; pre-computes the
 *     next round's F1 and b.
 * Input registers:
 *     a - e, temp: intermediate variables of the hash value
 *     addr: stack address of W + Kt
 *     wkOffset: read offset of W + Kt
 *     temp1-2: temporary registers
 * Modified registers: a e temp temp2
 * Output registers:
 *     a: next round's F1
 *     e: value after one round of update
 *     temp: next round's b
 * Macro implementation: F1(b,c,d) = b XOR c XOR d
 *                                 = (((b) ^ (c)) ^ (d))
 *     e = S^5(a) + F1(b,c,d) + e + W(i) + K(i)
 *     temp = S^30(b)
 */
.macro ROUND20_39 a, temp, b, c, d, e, addr, wkOffset, temp1, temp2
    addl \wkOffset(\addr), \e   // e = e + W + KT
    addl \temp, \e              // e = F1(b, c, d) + e + W + KT
    rorx $27, \a, TEMP2         // temp2 = ROTL32(a, 5)
    rorx $2, \a, \temp          // Next ROTL32(b, 30)
    xor \b, \a                  // Next (b) ^ (c)
    addl TEMP2, \e              // e = F1(b, c, d) + e + W + KT + S^5(a)
    xor \c, \a                  // Next (b) ^ (c) ^ (d)
.endm
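
/*
 * Reference model (not assembled): the parity rounds in C, assuming wk
 * already holds W(i) + K(i). Rotl32 is the helper sketched above.
 *
 *     // One round for t in [20, 39] and [60, 79]: F1 = parity(b, c, d)
 *     static void Round20To39(uint32_t s[5], uint32_t wk)
 *     {
 *         uint32_t f1 = s[1] ^ s[2] ^ s[3];
 *         uint32_t e  = Rotl32(s[0], 5) + f1 + s[4] + wk;
 *         s[4] = s[3]; s[3] = s[2];
 *         s[2] = Rotl32(s[1], 30);
 *         s[1] = s[0]; s[0] = e;
 *     }
 */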

/**
 * Macro Description: Four rounds of 20-39/60-79 message compression plus rounds 32-79 message
 *     expansion; pre-computes the next round's F1 and b.
 * Input registers:
 *     a - e, temp: intermediate variables of the hash value
 *     addr: stack address of W + Kt
 *     wkOffset: read offset of W + Kt
 *     wt_32_29: w(t-32) ~ w(t-29)
 *     wt_28_25: w(t-28) ~ w(t-25)
 *     wt_16_13: w(t-16) ~ w(t-13)
 *     wt_8_5: w(t-8) ~ w(t-5)
 *     wt_4_1: w(t-4) ~ w(t-1)
 *     wkOffset2: write offset for the newly expanded W + Kt
 *     knum: K constant
 * Modified registers: a b c d e temp temp2 wt_32_29 tempw0
 * Output registers:
 *     a: third round's b
 *     b: value after four rounds of update
 *     c: next round's F1
 *     d: next round's b
 *     e: fourth round's b
 *     temp: next b
 *     wt_32_29: w(t) ~ w(t+3) after one round of expansion
 * Macro implementation: F1(b,c,d) = b XOR c XOR d
 *                                 = (((b) ^ (c)) ^ (d))
 *     e = S^5(a) + F1(b,c,d) + e + W(i) + K(i)
 *     temp = S^30(b)
 *     w(t) = ROL(w(t-3) ^ w(t-8) ^ w(t-14) ^ w(t-16), 1)
 *          = ROL(w(t-6) ^ w(t-11) ^ w(t-17) ^ w(t-19) ^
 *                w(t-11) ^ w(t-16) ^ w(t-22) ^ w(t-24) ^
 *                w(t-17) ^ w(t-22) ^ w(t-28) ^ w(t-30) ^
 *                w(t-19) ^ w(t-24) ^ w(t-30) ^ w(t-32), 2)
 *          = ROL(w(t-6) ^ w(t-16) ^ w(t-28) ^ w(t-32), 2)
 *     w(t+1), w(t+2), w(t+3) in the same way
 */
.macro ROUND20_39_EXPAND a, temp, b, c, d, e, addr, wkOffset, wt_32_29, wt_28_25, wt_16_13, wt_8_5, wt_4_1, wkOffset2
    vpalignr $8, \wt_8_5, \wt_4_1, TEMP_W0   // Expand w(t-6), w(t-5), w(t-4), w(t-3)
    vpxor \wt_32_29, \wt_16_13, \wt_32_29    // Expand wt_32_29 = w[t-32:t-28] ^ w[t-16:t-12]
    addl \wkOffset(\addr), \e                // e = e + W + KT
    addl \temp, \e                           // e = F1(b, c, d) + e + W + KT
    rorx $27, \a, TEMP2                      // temp2 = ROTL32(a, 5)
    rorx $2, \a, \temp                       // Next ROTL32(b, 30)
    vpxor \wt_32_29, \wt_28_25, \wt_32_29    // Expand wt_32_29 ^= w[t-28:t-24]
    xor \b, \a                               // Next (b) ^ (c)
    addl TEMP2, \e                           // e = F1(b, c, d) + e + W + KT + S^5(a)
    xor \c, \a                               // Next F1 done

    addl \wkOffset + 4(\addr), \d            // d = d + W + KT
    vpxor \wt_32_29, TEMP_W0, \wt_32_29      // Expand wt_32_29 = w(t-32) ^ w(t-16) ^ w(t-28) ^ w(t-6)
    addl \a, \d                              // d = F1 + d + W + KT
    rorx $27, \e, TEMP2                      // temp2 = ROTL32(e, 5)
    rorx $2, \e, \a                          // Next ROTL32(e, 30)
    xor \temp, \e                            // Next F1
    addl TEMP2, \d                           // d = F1 + d + W + KT + S^5(e)
    vpsrld $30, \wt_32_29, TEMP_W0           // Expand ROL(wt_32_29, 2)
    xor \b, \e                               // Next F1 done

    addl \wkOffset + 8(\addr), \c            // c = c + W + KT
    addl \e, \c                              // c = F1 + c + W + KT
    rorx $27, \d, TEMP2                      // temp2 = ROTL32(d, 5)
    rorx $2, \d, \e                          // Next ROTL32(d, 30)
    vpslld $2, \wt_32_29, \wt_32_29
    xor \a, \d                               // Next F1
    addl TEMP2, \c                           // c = F1 + c + W + KT + S^5(d)
    xor \temp, \d                            // Next F1 done

    addl \wkOffset + 12(\addr), \b           // b = b + W + KT
    vpxor \wt_32_29, TEMP_W0, \wt_32_29      // Expand ROL(wt_32_29, 2)
    rorx $27, \c, TEMP2                      // temp2 = ROTL32(c, 5)
    addl \d, \b                              // b = F1 + b + W + KT
    rorx $2, \c, \d                          // Next ROTL32(c, 30)
    vpaddd KNUM, \wt_32_29, TEMP_W0
    xor \e, \c                               // Next F1
    addl TEMP2, \b                           // b = F1 + b + W + KT + S^5(c)
    xor \a, \c                               // Next F1 done
    vmovdqa TEMP_W0, \wkOffset2(\addr)
.endm
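
/*
 * Reference model (not assembled): the t >= 32 expansion used by
 * ROUND20_39_EXPAND and ROUND40_59_EXPAND. Stepping the recurrence back
 * twice removes the dependency on w(t-3), so all four lanes can be computed
 * at once with no fixup. Rotl32 is the helper sketched above.
 *
 *     // Expand w[t..t+3] for t in [32, 79]
 *     static void Expand4From32(uint32_t *w, int t)
 *     {
 *         for (int i = 0; i < 4; i++) {
 *             w[t + i] = Rotl32(w[t + i - 6] ^ w[t + i - 16] ^
 *                               w[t + i - 28] ^ w[t + i - 32], 2);
 *         }
 *     }
 */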

/**
 * Macro Description: Message compression for rounds 40-59; pre-computes the next round's F2 and b.
 * Input registers:
 *     a - e, temp: intermediate variables of the hash value
 *     addr: stack address of W + Kt
 *     wkOffset: read offset of W + Kt
 *     temp1-2: temporary registers
 * Modified registers: a e temp temp1 temp2
 * Output registers:
 *     a: next round's F2
 *     e: value after one round of update
 *     temp: next round's b
 * Macro implementation: F2(b,c,d) = (b AND c) OR (b AND d) OR (c AND d)
 *                                 = (((b) ^ (c)) & ((c) ^ (d))) ^ (c)
 *     e = S^5(a) + F2(b,c,d) + e + W(i) + K(i)
 *     temp = S^30(b)
 */
.macro ROUND40_59 a, temp, b, c, d, e, addr, wkOffset, temp1, temp2
    addl \wkOffset(\addr), \e   // e = e + W + KT
    mov \c, \temp1
    addl \temp, \e              // e = F2(b, c, d) + e + W + KT
    xor \b, \temp1              // Next (c^d)
    rorx $27, \a, \temp2        // temp2 = ROTL32(a, 5)
    rorx $2, \a, \temp          // Next ROTL32(b, 30)
    xor \b, \a                  // Next (b^c)
    addl \temp2, \e             // e = F2(b, c, d) + e + W + KT + S^5(a)
    and \temp1, \a              // Next (b^c) & (c^d)
    xor \b, \a                  // Next ((b^c) & (c^d)) ^ c
.endm
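
/*
 * Reference model (not assembled): the majority rounds in C. The assembly
 * uses the equivalent form ((b^c) & (c^d)) ^ c, which needs one temporary
 * register instead of the three AND/OR terms of the textbook definition.
 * Rotl32 is the helper sketched above.
 *
 *     // One round for t in [40, 59]: F2 = Maj(b, c, d)
 *     static void Round40To59(uint32_t s[5], uint32_t wk)
 *     {
 *         uint32_t f2 = ((s[1] ^ s[2]) & (s[2] ^ s[3])) ^ s[2];
 *         uint32_t e  = Rotl32(s[0], 5) + f2 + s[4] + wk;
 *         s[4] = s[3]; s[3] = s[2];
 *         s[2] = Rotl32(s[1], 30);
 *         s[1] = s[0]; s[0] = e;
 *     }
 */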

/**
 * Macro Description: Four rounds of 40-59 message compression plus rounds 32-79 message
 *     expansion; pre-computes the next round's F2 and b.
 * Input registers:
 *     a - e, temp: intermediate variables of the hash value
 *     addr: stack address of W + Kt
 *     wkOffset: read offset of W + Kt
 *     temp1-2: temporary registers
 *     wt_32_29: w(t-32) ~ w(t-29)
 *     wt_28_25: w(t-28) ~ w(t-25)
 *     wt_16_13: w(t-16) ~ w(t-13)
 *     wt_8_5: w(t-8) ~ w(t-5)
 *     wt_4_1: w(t-4) ~ w(t-1)
 *     wkOffset2: write offset for the newly expanded W + Kt
 *     knum: K constant
 * Modified registers: a b c d e temp temp1 temp2 wt_32_29 tempw0
 * Output registers:
 *     a: third round's b
 *     b: value after four rounds of update
 *     c: next round's F2
 *     d: next round's b
 *     e: fourth round's b
 *     temp: next b
 *     wt_32_29: w(t) ~ w(t+3) after one round of expansion
 * Macro implementation: F2(b,c,d) = (b AND c) OR (b AND d) OR (c AND d)
 *                                 = (((b) ^ (c)) & ((c) ^ (d))) ^ (c)
 *     e = S^5(a) + F2(b,c,d) + e + W(i) + K(i)
 *     w(t) = ROL(w(t-3) ^ w(t-8) ^ w(t-14) ^ w(t-16), 1)
 *          = ROL(w(t-6) ^ w(t-11) ^ w(t-17) ^ w(t-19) ^
 *                w(t-11) ^ w(t-16) ^ w(t-22) ^ w(t-24) ^
 *                w(t-17) ^ w(t-22) ^ w(t-28) ^ w(t-30) ^
 *                w(t-19) ^ w(t-24) ^ w(t-30) ^ w(t-32), 2)
 *          = ROL(w(t-6) ^ w(t-16) ^ w(t-28) ^ w(t-32), 2)
 *     w(t+1), w(t+2), w(t+3) in the same way
 */
.macro ROUND40_59_EXPAND a, temp, b, c, d, e, addr, wkOffset, wt_32_29, wt_28_25, wt_16_13, wt_8_5, wt_4_1, wkOffset2
    vpalignr $8, \wt_8_5, \wt_4_1, TEMP_W0   // Expand w(t-6), w(t-5), w(t-4), w(t-3)
    vpxor \wt_32_29, \wt_16_13, \wt_32_29    // Expand wt_32_29 = w[t-32:t-28] ^ w[t-16:t-12]
    addl \wkOffset(\addr), \e                // e = e + W + KT
    mov \c, TEMP1
    addl \temp, \e                           // e = F2(b, c, d) + e + W + KT
    xor \b, TEMP1                            // Next temp1 = (c^d)
    rorx $27, \a, TEMP2                      // temp2 = ROTL32(a, 5)
    rorx $2, \a, \temp                       // Next ROTL32(b, 30)
    vpxor \wt_32_29, \wt_28_25, \wt_32_29    // Expand wt_32_29 ^= w[t-28:t-24]
    xor \b, \a                               // Next (b^c)
    addl TEMP2, \e                           // e = F2(b, c, d) + e + W + KT + S^5(a)
    and TEMP1, \a                            // Next (b^c) & (c^d)
    addl \wkOffset + 4(\addr), \d            // d = d + W + KT
    xor \b, \a                               // Next ((b^c) & (c^d)) ^ c

    vpxor \wt_32_29, TEMP_W0, \wt_32_29      // Expand wt_32_29 = w(t-32) ^ w(t-16) ^ w(t-28) ^ w(t-6)
    mov \b, TEMP1
    addl \a, \d                              // d = F2 + d + W + KT
    xor \temp, TEMP1                         // Next F2
    rorx $27, \e, TEMP2                      // temp2 = ROTL32(e, 5)
    rorx $2, \e, \a                          // Next ROTL32(e, 30)
    addl \wkOffset + 8(\addr), \c            // c = c + W + KT
    xor \temp, \e                            // Next F2
    vpsrld $30, \wt_32_29, TEMP_W0           // Expand ROL(wt_32_29, 2)
    and TEMP1, \e                            // Next F2
    addl TEMP2, \d                           // d = F2 + d + W + KT + S^5(e)
    xor \temp, \e                            // Next F2 done

    mov \temp, TEMP1
    addl \e, \c                              // c = F2 + c + W + KT
    xor \a, TEMP1                            // Next F2
    vpslld $2, \wt_32_29, \wt_32_29
    rorx $27, \d, TEMP2                      // temp2 = ROTL32(d, 5)
    rorx $2, \d, \e                          // Next ROTL32(d, 30)
    xor \a, \d                               // Next F2
    addl TEMP2, \c                           // c = F2 + c + W + KT + S^5(d)
    and TEMP1, \d                            // Next F2
    addl \wkOffset + 12(\addr), \b           // b = b + W + KT
    vpxor \wt_32_29, TEMP_W0, \wt_32_29      // Expand ROL(wt_32_29, 2)
    xor \a, \d                               // Next F2 done

    mov \a, TEMP1
    addl \d, \b                              // b = F2 + b + W + KT
    xor \e, TEMP1                            // Next F2
    rorx $27, \c, TEMP2                      // temp2 = ROTL32(c, 5)
    rorx $2, \c, \d                          // Next ROTL32(c, 30)
    xor \e, \c                               // Next F2
    vpaddd KNUM, \wt_32_29, TEMP_W0
    addl TEMP2, \b                           // b = F2 + b + W + KT + S^5(c)
    and TEMP1, \c                            // Next F2
    xor \e, \c                               // Next F2 done
    vmovdqa TEMP_W0, \wkOffset2(\addr)
.endm

/**
 * Function Description: Performs the SHA-1 compression on the input message and updates the hash value.
 * Function prototype: const uint8_t *SHA1_Step(const uint8_t *input, uint32_t len, uint32_t *h)
 * Input registers:
 *     rdi: pointer to the input data
 *     rsi: message length
 *     rdx: address of the hash state
 * Register usage: r8d~r12d: A~E, r13d: TEMP, r15d, ebx, eax: temporary registers,
 *     ymm0~ymm3: w0~w15 message blocks, ymm4: zero, ymm5~ymm8: expanded message blocks,
 *     ymm9~ymm11: temporary registers, ymm12: K constant, ymm13: temporary W + K value
 * Output register: rax returns the address of the first message byte not yet processed.
 * Function/Macro Call: ROUND00_18, ROUND00_18_EXPAND, ROUND20_39, ROUND20_39_EXPAND,
 *     ROUND40_59, ROUND40_59_EXPAND
 */
.text
.globl SHA1_Step
    .type SHA1_Step, @function
SHA1_Step:
    .cfi_startproc
    cmp $64, LEN
    jb .Lend_sha1

    push %rbx
    push %rbp
    push %r12
    push %r13
    push %r14
    push %r15
    mov %rsp, %r14
    lea -1024(%rsp), %rsp       // Reserve 1024 bytes of stack space.

    mov 0(HASH), A              // r8d~r12d: a~e
    mov 4(HASH), B
    andq $-256, %rsp            // Align the stack to 256 bytes.
    mov 8(HASH), C
    mov 12(HASH), D
    mov 16(HASH), E

.Lloop_sha1_compress:
.align 16
    vmovdqu (INPUT), BLK0       // Load one block into the lower 128 bits
                                // of the ymm registers.
    vmovdqu 16(INPUT), BLK1
    vmovdqu 32(INPUT), BLK2
    sub $64, LEN
    vmovdqu 48(INPUT), BLK3
    add $64, INPUT

    cmp $64, LEN                // Check whether at least 64 bytes remain.
    jb .Lsha1_compress
    vinserti128 $1, 0(INPUT), %ymm0, %ymm0      // Load a second block into the upper
                                                // 128 bits of the ymm registers.
    vinserti128 $1, 16(INPUT), %ymm1, %ymm1
    vinserti128 $1, 32(INPUT), %ymm2, %ymm2
    vinserti128 $1, 48(INPUT), %ymm3, %ymm3
    add $64, INPUT
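
/*
 * Layout note (inferred from the offsets used below): each 32-byte stack
 * slot holds W + K for four rounds of the two interleaved blocks - bytes
 * 0-15 belong to the block in the low ymm lane, bytes 16-31 to the block
 * in the high lane. The first pass therefore reads offsets 0, 4, 8, 12,
 * 32, ... and the second pass reads 16, 20, 24, 28, 48, ...
 *
 *     // Equivalent C view of one slot on the 256-byte-aligned stack:
 *     struct WkSlot {
 *         uint32_t blk0[4];   // W(t..t+3) + K for the first block
 *         uint32_t blk1[4];   // W(t..t+3) + K for the second block
 *     };
 */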

.Lsha1_compress:
    vmovdqa endian_mask + 0(%rip), %ymm8    // Byte-order inversion mask
    leaq g_k + 0(%rip), %rbp                // Get the K constants

    vpshufb %ymm8, %ymm0, %ymm0             // Little endian to big endian
    vmovdqa 0(%rbp), KNUM
    vpshufb %ymm8, %ymm1, %ymm1
    vpaddd KNUM, %ymm0, %ymm13              // w[0:3] + K0 (both lanes)
    vpshufb %ymm8, %ymm2, %ymm2
    vmovdqa %ymm13, 0(%rsp)                 // Push W + K onto the stack
    vpaddd KNUM, %ymm1, %ymm9
    vpshufb %ymm8, %ymm3, %ymm3
    vmovdqa %ymm9, 32(%rsp)
    vpaddd KNUM, %ymm2, %ymm10
    vpxor %ymm4, %ymm4, %ymm4               // ZERO = 0

    mov C, TEMP                             // Pre-compute the first round's F0
    vmovdqa %ymm10, 64(%rsp)
    and B, TEMP                             // Round0 ((b) & (c))
    andn D, B, TEMP2                        // Round0 (~(b)) & (d)
    vpaddd KNUM, %ymm3, %ymm11
    or TEMP2, TEMP                          // Round0 (((b) & (c)) | ((~(b)) & (d)))
    rol $30, B                              // Round0 B = ROTL32(B, 30)
    vmovdqa %ymm11, 96(%rsp)
    ROUND00_18_EXPAND A, TEMP, B, C, D, E, %rsp, 0, %ymm0, %ymm1, %ymm2, %ymm3, EXPAND0
    vmovdqa 32(%rbp), KNUM
    ROUND00_18_EXPAND B, C, D, E, A, TEMP, %rsp, 32, %ymm1, %ymm2, %ymm3, EXPAND0, EXPAND1
    ROUND00_18_EXPAND D, E, A, TEMP, B, C, %rsp, 64, %ymm2, %ymm3, EXPAND0, EXPAND1, EXPAND2
    ROUND00_18_EXPAND A, TEMP, B, C, D, E, %rsp, 96, %ymm3, EXPAND0, EXPAND1, EXPAND2, EXPAND3
    ROUND00_18 B, C, D, E, A, TEMP, %rsp, 128, TEMP1, TEMP2
    ROUND00_18 TEMP, B, C, D, E, A, %rsp, 132, TEMP1, TEMP2
    ROUND00_18 A, TEMP, B, C, D, E, %rsp, 136, TEMP1, TEMP2    // 18
    addl 140(%rsp), D           // D = D + W + KT
    rorx $27, E, TEMP2          // TEMP2 = ROTL32(E, 5)
    addl A, D                   // D = F0 + D + W + KT
    rorx $2, E, A               // Round20 ROTL32(E, 30)
    xor TEMP, E                 // Round20 (TEMP) ^ (E)
    addl TEMP2, D               // D = F0 + D + W + KT + S^5(E)
    xor B, E                    // Round20 F1

    ROUND20_39_EXPAND D, E, A, TEMP, B, C, %rsp, 160, %ymm0, %ymm1, EXPAND0, EXPAND2, EXPAND3, 256
    ROUND20_39_EXPAND A, TEMP, B, C, D, E, %rsp, 192, %ymm1, %ymm2, EXPAND1, EXPAND3, %ymm0, 288
    vmovdqa 64(%rbp), KNUM
    ROUND20_39_EXPAND B, C, D, E, A, TEMP, %rsp, 224, %ymm2, %ymm3, EXPAND2, %ymm0, %ymm1, 320
    ROUND20_39_EXPAND D, E, A, TEMP, B, C, %rsp, 256, %ymm3, EXPAND0, EXPAND3, %ymm1, %ymm2, 352
    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 288, TEMP1, TEMP2
    ROUND20_39 E, A, TEMP, B, C, D, %rsp, 292, TEMP1, TEMP2
    ROUND20_39 D, E, A, TEMP, B, C, %rsp, 296, TEMP1, TEMP2    // 38
    addl 300(%rsp), B           // B = B + W + KT
    mov A, TEMP1
    addl D, B                   // B = F1 + B + W + KT
    xor E, TEMP1                // Round40 (E^A)
    rorx $27, C, TEMP2          // TEMP2 = ROTL32(C, 5)
    rorx $2, C, D               // Round40 ROTL32(C, 30)
    xor E, C                    // Round40 (E^C)
    addl TEMP2, B               // B = F1 + B + W + KT + S^5(C)
    and TEMP1, C                // Round40 (E^A) & (E^C)
    xor E, C                    // Round40 F2

    ROUND40_59_EXPAND B, C, D, E, A, TEMP, %rsp, 320, EXPAND0, EXPAND1, %ymm0, %ymm2, %ymm3, 384
    ROUND40_59_EXPAND D, E, A, TEMP, B, C, %rsp, 352, EXPAND1, EXPAND2, %ymm1, %ymm3, EXPAND0, 416
    ROUND40_59_EXPAND A, TEMP, B, C, D, E, %rsp, 384, EXPAND2, EXPAND3, %ymm2, EXPAND0, EXPAND1, 448
    vmovdqa 96(%rbp), KNUM
    ROUND40_59_EXPAND B, C, D, E, A, TEMP, %rsp, 416, EXPAND3, %ymm0, %ymm3, EXPAND1, EXPAND2, 480
    ROUND40_59 D, E, A, TEMP, B, C, %rsp, 448, TEMP1, TEMP2
    ROUND40_59 C, D, E, A, TEMP, B, %rsp, 452, TEMP1, TEMP2
    ROUND40_59 B, C, D, E, A, TEMP, %rsp, 456, TEMP1, TEMP2    // 58
    addl 460(%rsp), A           // A = A + W + KT
    rorx $27, TEMP, TEMP2       // TEMP2 = ROTL32(TEMP, 5)
    addl B, A                   // A = F2 + A + W + KT
    rorx $2, TEMP, B            // Round60 ROTL32(TEMP, 30)
    xor C, TEMP                 // Round60 (C) ^ (TEMP)
    addl TEMP2, A               // A = F2 + A + W + KT + S^5(TEMP)
    xor D, TEMP                 // Round60 F1

    ROUND20_39_EXPAND A, TEMP, B, C, D, E, %rsp, 480, %ymm0, %ymm1, EXPAND0, EXPAND2, EXPAND3, 512
    ROUND20_39_EXPAND B, C, D, E, A, TEMP, %rsp, 512, %ymm1, %ymm2, EXPAND1, EXPAND3, %ymm0, 544
    ROUND20_39_EXPAND D, E, A, TEMP, B, C, %rsp, 544, %ymm2, %ymm3, EXPAND2, %ymm0, %ymm1, 576
    ROUND20_39_EXPAND A, TEMP, B, C, D, E, %rsp, 576, %ymm3, EXPAND0, EXPAND3, %ymm1, %ymm2, 608
    ROUND20_39 B, C, D, E, A, TEMP, %rsp, 608, TEMP1, TEMP2
    ROUND20_39 TEMP, B, C, D, E, A, %rsp, 612, TEMP1, TEMP2
    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 616, TEMP1, TEMP2    // 78
    addl 620(%rsp), D           // D = D + W + KT
    add E, 4(HASH)              // Update HASH
    lea (A, D), D               // D = F1 + D + W + KT
    add TEMP, 8(HASH)
    rorx $27, E, TEMP2          // TEMP2 = ROTL32(E, 5)

    add B, 12(HASH)
    addl TEMP2, D               // D = F1 + D + W + KT + S^5(E)
    add C, 16(HASH)
    mov 4(HASH), B
    add D, 0(HASH)
    mov 8(HASH), C
    mov 16(HASH), E
    mov 12(HASH), D
    mov 0(HASH), A

    cmp $64, LEN                // Check whether a second block was loaded into the upper lane.
    jb .Lend_sha1_pre
    sub $64, LEN

    mov C, TEMP
    andn D, B, TEMP2            // TEMP2 = (~(b)) & (d)
    and B, TEMP                 // TEMP = ((b) & (c))
    or TEMP2, TEMP              // TEMP = (((b) & (c)) | ((~(b)) & (d)))
    rol $30, B                  // B = ROTL32(B, 30)
    ROUND00_18 A, TEMP, B, C, D, E, %rsp, 16, TEMP1, TEMP2
    ROUND00_18 E, A, TEMP, B, C, D, %rsp, 20, TEMP1, TEMP2
    ROUND00_18 D, E, A, TEMP, B, C, %rsp, 24, TEMP1, TEMP2
    ROUND00_18 C, D, E, A, TEMP, B, %rsp, 28, TEMP1, TEMP2     // Round 3

    ROUND00_18 B, C, D, E, A, TEMP, %rsp, 48, TEMP1, TEMP2
    ROUND00_18 TEMP, B, C, D, E, A, %rsp, 52, TEMP1, TEMP2
    ROUND00_18 A, TEMP, B, C, D, E, %rsp, 56, TEMP1, TEMP2
    ROUND00_18 E, A, TEMP, B, C, D, %rsp, 60, TEMP1, TEMP2     // Round 7

    ROUND00_18 D, E, A, TEMP, B, C, %rsp, 80, TEMP1, TEMP2
    ROUND00_18 C, D, E, A, TEMP, B, %rsp, 84, TEMP1, TEMP2
    ROUND00_18 B, C, D, E, A, TEMP, %rsp, 88, TEMP1, TEMP2
    ROUND00_18 TEMP, B, C, D, E, A, %rsp, 92, TEMP1, TEMP2     // Round 11

    ROUND00_18 A, TEMP, B, C, D, E, %rsp, 112, TEMP1, TEMP2
    ROUND00_18 E, A, TEMP, B, C, D, %rsp, 116, TEMP1, TEMP2
    ROUND00_18 D, E, A, TEMP, B, C, %rsp, 120, TEMP1, TEMP2
    ROUND00_18 C, D, E, A, TEMP, B, %rsp, 124, TEMP1, TEMP2    // Round 15

    ROUND00_18 B, C, D, E, A, TEMP, %rsp, 144, TEMP1, TEMP2
    ROUND00_18 TEMP, B, C, D, E, A, %rsp, 148, TEMP1, TEMP2
    ROUND00_18 A, TEMP, B, C, D, E, %rsp, 152, TEMP1, TEMP2    // Round 18
    addl 156(%rsp), D           // D = D + W + KT
    rorx $27, E, TEMP2          // TEMP2 = ROTL32(E, 5)
    addl A, D                   // D = F0 + D + W + KT
    rorx $2, E, A               // Round20 ROTL32(E, 30)
    xor TEMP, E                 // Round20 (TEMP) ^ (E)
    addl TEMP2, D               // D = F0 + D + W + KT + S^5(E)
    xor B, E                    // Round20 F1

    ROUND20_39 D, E, A, TEMP, B, C, %rsp, 176, TEMP1, TEMP2
    ROUND20_39 C, D, E, A, TEMP, B, %rsp, 180, TEMP1, TEMP2
    ROUND20_39 B, C, D, E, A, TEMP, %rsp, 184, TEMP1, TEMP2
    ROUND20_39 TEMP, B, C, D, E, A, %rsp, 188, TEMP1, TEMP2    // Round 23

    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 208, TEMP1, TEMP2
    ROUND20_39 E, A, TEMP, B, C, D, %rsp, 212, TEMP1, TEMP2
    ROUND20_39 D, E, A, TEMP, B, C, %rsp, 216, TEMP1, TEMP2
    ROUND20_39 C, D, E, A, TEMP, B, %rsp, 220, TEMP1, TEMP2    // Round 27

    ROUND20_39 B, C, D, E, A, TEMP, %rsp, 240, TEMP1, TEMP2
    ROUND20_39 TEMP, B, C, D, E, A, %rsp, 244, TEMP1, TEMP2
    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 248, TEMP1, TEMP2
    ROUND20_39 E, A, TEMP, B, C, D, %rsp, 252, TEMP1, TEMP2    // Round 31

    ROUND20_39 D, E, A, TEMP, B, C, %rsp, 272, TEMP1, TEMP2
    ROUND20_39 C, D, E, A, TEMP, B, %rsp, 276, TEMP1, TEMP2
    ROUND20_39 B, C, D, E, A, TEMP, %rsp, 280, TEMP1, TEMP2
    ROUND20_39 TEMP, B, C, D, E, A, %rsp, 284, TEMP1, TEMP2    // Round 35

    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 304, TEMP1, TEMP2
    ROUND20_39 E, A, TEMP, B, C, D, %rsp, 308, TEMP1, TEMP2
    ROUND20_39 D, E, A, TEMP, B, C, %rsp, 312, TEMP1, TEMP2    // Round 38
    addl 316(%rsp), B           // B = B + W + KT
    mov A, TEMP1
    addl D, B                   // B = F1 + B + W + KT
    xor E, TEMP1                // Round40 (A^E)
    rorx $2, C, D               // Round40 ROTL32(C, 30)
    rorx $27, C, TEMP2          // TEMP2 = ROTL32(C, 5)
    xor E, C                    // Round40 (E^C)
    addl TEMP2, B               // B = F1 + B + W + KT + S^5(C)
    and TEMP1, C                // Round40 (A^E) & (E^C)
    xor E, C                    // Round40 F2

    ROUND40_59 B, C, D, E, A, TEMP, %rsp, 336, TEMP1, TEMP2
    ROUND40_59 TEMP, B, C, D, E, A, %rsp, 340, TEMP1, TEMP2
    ROUND40_59 A, TEMP, B, C, D, E, %rsp, 344, TEMP1, TEMP2
    ROUND40_59 E, A, TEMP, B, C, D, %rsp, 348, TEMP1, TEMP2    // Round 43

    ROUND40_59 D, E, A, TEMP, B, C, %rsp, 368, TEMP1, TEMP2
    ROUND40_59 C, D, E, A, TEMP, B, %rsp, 372, TEMP1, TEMP2
    ROUND40_59 B, C, D, E, A, TEMP, %rsp, 376, TEMP1, TEMP2
    ROUND40_59 TEMP, B, C, D, E, A, %rsp, 380, TEMP1, TEMP2    // Round 47

    ROUND40_59 A, TEMP, B, C, D, E, %rsp, 400, TEMP1, TEMP2
    ROUND40_59 E, A, TEMP, B, C, D, %rsp, 404, TEMP1, TEMP2
    ROUND40_59 D, E, A, TEMP, B, C, %rsp, 408, TEMP1, TEMP2
    ROUND40_59 C, D, E, A, TEMP, B, %rsp, 412, TEMP1, TEMP2    // Round 51

    ROUND40_59 B, C, D, E, A, TEMP, %rsp, 432, TEMP1, TEMP2
    ROUND40_59 TEMP, B, C, D, E, A, %rsp, 436, TEMP1, TEMP2
    ROUND40_59 A, TEMP, B, C, D, E, %rsp, 440, TEMP1, TEMP2
    ROUND40_59 E, A, TEMP, B, C, D, %rsp, 444, TEMP1, TEMP2    // Round 55

    ROUND40_59 D, E, A, TEMP, B, C, %rsp, 464, TEMP1, TEMP2
    ROUND40_59 C, D, E, A, TEMP, B, %rsp, 468, TEMP1, TEMP2
    ROUND40_59 B, C, D, E, A, TEMP, %rsp, 472, TEMP1, TEMP2    // Round 58
    addl 476(%rsp), A           // A = A + W + KT
    rorx $27, TEMP, TEMP2       // TEMP2 = ROTL32(TEMP, 5)
    addl B, A                   // A = F2 + A + W + KT
    rorx $2, TEMP, B            // Round60 ROTL32(TEMP, 30)
    xor C, TEMP                 // Round60 (TEMP) ^ (C)
    addl TEMP2, A               // A = F2 + A + W + KT + S^5(TEMP)
    xor D, TEMP                 // Round60 F1

    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 496, TEMP1, TEMP2
    ROUND20_39 E, A, TEMP, B, C, D, %rsp, 500, TEMP1, TEMP2
    ROUND20_39 D, E, A, TEMP, B, C, %rsp, 504, TEMP1, TEMP2
    ROUND20_39 C, D, E, A, TEMP, B, %rsp, 508, TEMP1, TEMP2    // Round 63

    ROUND20_39 B, C, D, E, A, TEMP, %rsp, 528, TEMP1, TEMP2
    ROUND20_39 TEMP, B, C, D, E, A, %rsp, 532, TEMP1, TEMP2
    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 536, TEMP1, TEMP2
    ROUND20_39 E, A, TEMP, B, C, D, %rsp, 540, TEMP1, TEMP2    // Round 67

    ROUND20_39 D, E, A, TEMP, B, C, %rsp, 560, TEMP1, TEMP2
    ROUND20_39 C, D, E, A, TEMP, B, %rsp, 564, TEMP1, TEMP2
    ROUND20_39 B, C, D, E, A, TEMP, %rsp, 568, TEMP1, TEMP2
    ROUND20_39 TEMP, B, C, D, E, A, %rsp, 572, TEMP1, TEMP2    // Round 71

    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 592, TEMP1, TEMP2
    ROUND20_39 E, A, TEMP, B, C, D, %rsp, 596, TEMP1, TEMP2
    ROUND20_39 D, E, A, TEMP, B, C, %rsp, 600, TEMP1, TEMP2
    ROUND20_39 C, D, E, A, TEMP, B, %rsp, 604, TEMP1, TEMP2    // Round 75

    ROUND20_39 B, C, D, E, A, TEMP, %rsp, 624, TEMP1, TEMP2
    ROUND20_39 TEMP, B, C, D, E, A, %rsp, 628, TEMP1, TEMP2
    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 632, TEMP1, TEMP2    // Round 78
    addl 636(%rsp), D           // D = D + W + KT
    add E, 4(HASH)              // Update HASH
    add TEMP, 8(HASH)           // Update H0~H4
    lea (A, D), D               // D = F1 + D + W + KT
    rorx $27, E, TEMP2          // TEMP2 = ROTL32(E, 5)
    add B, 12(HASH)
    add C, 16(HASH)
    addl TEMP2, D               // D = F1 + D + W + KT + S^5(E)
    mov 4(HASH), B
    mov 8(HASH), C
    add D, 0(HASH)
    mov 16(HASH), E
    mov 12(HASH), D
    mov 0(HASH), A
    cmp $64, LEN
    jae .Lloop_sha1_compress

.Lend_sha1_pre:
    mov %r14, %rsp
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %rbp
    pop %rbx
.Lend_sha1:
    mov INPUT, %rax
    ret
    .cfi_endproc
    .size SHA1_Step, .-SHA1_Step

#endif
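
/*
 * Usage sketch (illustrative, not part of this file): SHA1_Step consumes
 * only whole 64-byte blocks; the caller buffers the tail and applies the
 * SHA-1 padding itself. The variable names below are hypothetical.
 *
 *     uint32_t h[5] = { 0x67452301, 0xefcdab89, 0x98badcfe,
 *                       0x10325476, 0xc3d2e1f0 };   // standard SHA-1 IV
 *     const uint8_t *rest = SHA1_Step(data, dataLen, h);
 *     size_t remain = dataLen - (size_t)(rest - data);   // < 64 bytes left
 */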