1/*
2 * This file is part of the openHiTLS project.
3 *
4 * openHiTLS is licensed under the Mulan PSL v2.
5 * You can use this software according to the terms and conditions of the Mulan PSL v2.
6 * You may obtain a copy of Mulan PSL v2 at:
7 *
8 *     http://license.coscl.org.cn/MulanPSL2
9 *
10 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13 * See the Mulan PSL v2 for more details.
14 */
15
16#include "hitls_build.h"
17#ifdef HITLS_CRYPTO_SHA1
18
19.file   "sha1_x86_64.S"
20.text
21
/* Function arguments (SysV AMD64 calling convention: rdi, rsi, rdx) */
.set    INPUT, %rdi
.set    LEN, %rsi
.set    HASH, %rdx

/* SHA-1 working variables a..e */
.set    A, %r8d
.set    B, %r9d
.set    C, %r10d
.set    D, %r11d
.set    E, %r12d

/* Scalar scratch registers used by the round macros */
.set    TEMP, %r13d
.set    TEMP1, %r15d
.set    TEMP2, %ebx
.set    TEMP3, %eax
/* Message block words w0..w15, 16 bytes per register */
.set    BLK0, %xmm0
.set    BLK1, %xmm1
.set    BLK2, %xmm2
.set    BLK3, %xmm3

/* Vector registers: ZERO holds all-zero, EXPANDx hold expanded schedule words,
 * TEMP_Wx are vector scratch, KNUM holds the broadcast round constant K */
.set    ZERO, %ymm4
.set    EXPAND0, %ymm5
.set    EXPAND1, %ymm6
.set    EXPAND2, %ymm7
.set    EXPAND3, %ymm8
.set    TEMP_W0, %ymm9
.set    TEMP_W1, %ymm10
.set    TEMP_W2, %ymm11
.set    KNUM, %ymm12
50
/* SHA-1 round constants K (FIPS 180-4), each broadcast to 8 lanes for ymm vpaddd */
.section .rodata
.balign    64
.type    g_k, %object
g_k:
    .long   0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999  // K_00_19
    .long   0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1  // K_20_39
    .long   0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc  // K_40_59
    .long   0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6  // K_60_79
    .size   g_k, .-g_k

/* Byte-swap shuffle mask for vpshufb: reverses the bytes of each 32-bit word
 * (little endian -> big endian), duplicated for both 128-bit lanes */
.balign    64
.type    endian_mask, %object
endian_mask:
    .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
    .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
.size   endian_mask, .-endian_mask
69
/**
 *  Macro Description: message compression for rounds 0-18; also precomputes the
 *                     next round's F0 and b.
 *  Input register:
 * a - e, temp:  intermediate SHA-1 working variables (temp carries the F0 value
 *               precomputed by the previous round)
 *        addr:  stack address holding the precomputed W(t)+K(t) values
 *    wkOffset:  byte offset of W(t)+K(t) relative to addr
 *     temp1-2:  temporary registers
 *  Modified registers:  a e temp temp1 temp2
 *  Output register:
 *           a:  next round F0
 *           e:  value after one round of update
 *        temp:  next round b (already rotated)
 *  Macro implementation: F0(b,c,d) = (b AND c) OR ((NOT b) AND d)
 *                    = (((b) & (c)) | ((~(b)) & (d)))
 *          e = S^5(a) + F0(b,c,d) + e + W(i) + K(i)
 *          temp = S^30(b)
 */
.macro ROUND00_18 a, temp, b, c, d, e, addr, wkOffset, temp1, temp2
    addl  \wkOffset(\addr), \e                          // e = e + W + KT
    andn \c, \a, \temp1                                 // Next (~(b)) & (d)
    addl  \temp, \e                                     // e = F0(b, c, d) + e + W + KT
    rorxl   $27, \a, \temp2                             // temp2 = ROTL32(a, 5)
    rorxl   $2, \a, \temp                               // Next ROTL32(b, 30)
    and   \b, \a                                        // Next ((b) & (c))
    addl  \temp2, \e                                    // e = F0(b, c, d) + e + W + KT + S^5(a)
    or   \temp1, \a                                     // Next (((b) & (c)) | ((~(b)) & (d)))
.endm
97
/**
 *  Macro Description: four rounds of 0-18 message compression interleaved with
 *                     message expansion for W(16)-W(31); also precomputes the
 *                     next round's F0 and b.
 *  Input register:
 * a - e, temp: intermediate SHA-1 working variables (temp carries the F0 value
 *              precomputed by the previous round)
 *        addr: stack address holding the precomputed W(t)+K(t) values
 *    wkOffset: byte offset of W(t)+K(t) relative to addr
 *    wt_16_13: w(t-16) ~ w(t-13)
 *     wt_12_9: w(t-12) ~ w(t-9)
 *      wt_8_5: w(t-8)  ~ w(t-5)
 *      wt_4_1: w(t-4)  ~ w(t-1)
 *     expand0: receives w(t) ~ w(t+3)
 *  Implicitly used:  TEMP1, TEMP2 (scalar scratch), TEMP_W0-TEMP_W2 (vector
 *        scratch), ZERO (all-zero register), KNUM (broadcast K constant)
 *  Modified registers:  a b c d e temp TEMP1 TEMP2 expand0 TEMP_W0 TEMP_W1 TEMP_W2
 *  Output register:
 *           a:  b of the third following round
 *           b:  value after four rounds of update
 *           c:  next round F0
 *           d:  next round b
 *           e:  b of the fourth following round
 *        temp:  next b
 *     expand0:  newly expanded words; W+K is also stored at wkOffset+128(addr)
 *  Macro implementation: F0(b,c,d) = (b AND c) OR ((NOT b) AND d)
 *                    = (((b) & (c)) | ((~(b)) & (d)))
 *          temp = S^5(a) + F0(b,c,d) + e + W(i) + K(i)
 *          b = S^30(b)
 *      W(t  ) = ROL(W(t-3) ^ W(t-8) ^ W(t-14) ^ W(t-16), 1)
 *      W(t+1) = ROL(W(t-2) ^ W(t-7) ^ W(t-13) ^ W(t-15), 1)
 *      W(t+2) = ROL(W(t-1) ^ W(t-6) ^ W(t-12) ^ W(t-14), 1)
 *      W(t+3) = ROL(0      ^ W(t-5) ^ W(t-11) ^ W(t-13), 1)
 *      W(t+3) = W(t+3) ^ ROL(W(t), 1)
 */
.macro ROUND00_18_EXPAND a, temp, b, c, d, e, addr, wkOffset, wt_16_13, wt_12_9, wt_8_5, wt_4_1, expand0
    vpalignr $8, \wt_16_13, \wt_12_9, TEMP_W1            // Expand w(t-14) w(t-13) w(t-12) w(t-11)
    addl  \wkOffset(\addr), \e                          // e = e + W + KT
    andn \c, \a, TEMP1                                  // Next (~(b)) & (d)
    addl  \temp, \e                                     // e = F0 + e + W + KT
    vpalignr $4, \wt_4_1, ZERO, TEMP_W0                  // Expand w(t-3)  w(t-2)  w(t-1)  0
    vpxor   \wt_8_5, \wt_16_13, \expand0                // Expand w(t-8) ^ w(t-16)
    rorxl   $27, \a, TEMP2                              // TEMP2 = ROTL32(a, 5)
    rorxl   $2, \a, \temp                               // Next ROTL32(b, 30)
    and   \b, \a                                        // Next ((b) & (c))
    vpxor   TEMP_W1, \expand0, \expand0                  // Expand w(t-14) ^ w(t-8) ^ w(t-16)
    addl  TEMP2, \e                                     // e = F0 + e + W + KT + S^5(a)
    or   TEMP1, \a                                      // Next F0 done

    addl  \wkOffset + 4(\addr), \d                      // Next d = d + W + KT
    vpxor   TEMP_W0, \expand0, TEMP_W0                    // Expand tempw0 = w[t:t+4] before rol 1
    andn \b, \e, TEMP1                                  // Next F0
    addl  \a, \d                                        // d = F0 + d + W + KT
    rorxl   $27, \e, TEMP2                              // TEMP2 = ROTL32(E, 5)
    rorxl   $2, \e, \a                                  // Next ROTL32(E, 30)
    vpalignr $4, ZERO, TEMP_W0, TEMP_W1                   // Expand tempw1 = 0 0 0 w(t)
    and   \temp, \e                                     // Next F0
    addl  TEMP2, \d                                     // d = F0 + d + W + KT + S^5(E)
    or   TEMP1, \e                                      // Next F0 done

    vpsrld  $31, TEMP_W0, \expand0                       // Expand ROL(w(t), w(t+1), w(t+2), w(t+3), 1)
    addl  \wkOffset + 8(\addr), \c                      // c = c + W + KT
    vpaddd TEMP_W0, TEMP_W0, TEMP_W0                       // Expand ROL(w(t), w(t+1), w(t+2), w(t+3), 1)
    andn \temp, \d, TEMP1                               // Next F0
    addl  \e, \c                                        // c = F0 + c + W + KT
    rorxl   $27, \d, TEMP2                              // TEMP2 = ROTL32(D, 5)
    rorxl   $2, \d, \e                                  // Next ROTL32(D, 30)
    vpsrld  $30, TEMP_W1, TEMP_W2                         // Expand ROL(w(t), 2)
    and   \a, \d                                        // Next F0
    addl  TEMP2, \c                                     // c = F0 + c + W + KT + S^5(D)
    or   TEMP1, \d                                      // Next F0 done

    vpslld  $2, TEMP_W1, TEMP_W1                          // Expand ROL(w(t), 2)
    vpxor \expand0, TEMP_W0, \expand0                    // Expand ROL(w(t), w(t+1), w(t+2), w(t+3), 1)
    addl  \wkOffset + 12(\addr), \b                     // b = b + W + KT
    andn \a, \c, TEMP1                                  // Next F0
    vpxor TEMP_W2, TEMP_W1, TEMP_W0                        // Expand ROL(w(t), 2)
    addl  \d, \b                                        // b = F0 + b + W + KT
    rorxl   $27, \c, TEMP2                              // TEMP2 = ROTL32(C, 5)
    rorxl   $2, \c, \d                                  // Next ROTL32(C, 30)
    vpxor \expand0, TEMP_W0, \expand0                    // Expand w[t:t+4]
    and   \e, \c                                        // Next F0
    addl  TEMP2, \b                                     // b = F0 + b + W + KT + S^5(C)
    vpaddd KNUM,\expand0, TEMP_W0                        // Expand w + k
    or   TEMP1, \c                                      // Next F0 done
    vmovdqa TEMP_W0, \wkOffset + 128(\addr)             // Store W+K for the rounds that consume it later
.endm
185
/**
 *  Macro Description: message compression for rounds 20-39 and 60-79; also
 *                     precomputes the next round's F1 and b.
 *  Input register:
 * a - e, temp:  intermediate SHA-1 working variables (temp carries the F1 value
 *               precomputed by the previous round)
 *        addr:  stack address holding the precomputed W(t)+K(t) values
 *    wkOffset:  byte offset of W(t)+K(t) relative to addr
 *     temp1-2:  temporary registers (temp1 is not needed by this round type;
 *               the parameter is kept so all ROUNDxx macros share one signature)
 *  Modified registers:  a e temp temp2
 *  Output register:
 *           a:  next round F1
 *           e:  value after one round of update
 *        temp:  next round b (already rotated)
 *  Macro implementation: F1(b,c,d) = b XOR c XOR d
 *                    = (((b) ^ (c)) ^ (d))
 *          e = S^5(a) + F1(b,c,d) + e + W(i) + K(i)
 *          temp = S^30(b)
 */
.macro ROUND20_39 a, temp, b, c, d, e, addr, wkOffset, temp1, temp2
    addl  \wkOffset(\addr), \e                          // e = e + W + KT
    addl  \temp, \e                                     // e = F1(b, c, d) + e + W + KT
    rorx   $27, \a, \temp2                              // temp2 = ROTL32(a, 5)
    rorx   $2, \a, \temp                                // Next ROTL32(b, 30)
    xor   \b, \a                                        // Next (b) ^ (c)
    addl  \temp2, \e                                    // e = F1(b, c, d) + e + W + KT + S^5(a)
    xor   \c, \a                                        // Next (b) ^ (c) ^ (d)
.endm
212
/**
 *  Macro Description: four rounds of 20-39/60-79 compression interleaved with
 *                     message expansion; also precomputes the next round's F1 and b.
 *  Input register:
 * a - e, temp: intermediate SHA-1 working variables (temp carries the F1 value
 *              precomputed by the previous round)
 *        addr: stack address holding the precomputed W(t)+K(t) values
 *    wkOffset: byte offset of W(t)+K(t) used by these four rounds
 *    wt_32_29: w(t-32) ~ w(t-29)
 *    wt_28_25: w(t-28) ~ w(t-25)
 *    wt_16_13: w(t-16) ~ w(t-13)
 *      wt_8_5: w(t-8)  ~ w(t-5)
 *      wt_4_1: w(t-4)  ~ w(t-1)
 *   wkOffset2: byte offset at which the newly expanded W+K is stored
 *  Implicitly used:  TEMP2 (scalar scratch), TEMP_W0 (vector scratch),
 *        KNUM (broadcast K constant)
 *  Modified registers:  a b c d e temp TEMP2 wt_32_29 TEMP_W0
 *  Output register:
 *           a:  b of the third following round
 *           b:  value after four rounds of update
 *           c:  next round F1
 *           d:  next round b
 *           e:  b of the fourth following round
 *        temp:  next b
 *    wt_32_29:  newly expanded words w(t) ~ w(t+3)
 *  Macro implementation: F1(b,c,d) = b XOR c XOR d
 *                    = (((b) ^ (c)) ^ (d))
 *          e = S^5(a) + F1(b,c,d) + e + W(i) + K(i)
 *          temp = S^30(b)
 *          w(t) = ROL(w(t-3)  ^ w(t-8)  ^ w(t-14) ^ w(t-16), 1)
 *               = ROL(w(t-6)  ^ w(t-11) ^ w(t-17) ^ w(t-19) ^
 *                     w(t-11) ^ w(t-16) ^ w(t-22) ^ w(t-24) ^
 *                     w(t-17) ^ w(t-22) ^ w(t-28) ^ w(t-30) ^
 *                     w(t-19) ^ w(t-24) ^ w(t-30) ^ w(t-32), 2)
 *               = ROL(w(t-6)  ^ w(t-16) ^ w(t-28) ^ w(t-32), 2)
 *          w(t+1), w(t+2), w(t+3) in the same way
 */
.macro ROUND20_39_EXPAND a, temp, b, c, d, e, addr, wkOffset, wt_32_29, wt_28_25, wt_16_13, wt_8_5, wt_4_1, wkOffset2
    vpalignr $8, \wt_8_5, \wt_4_1, TEMP_W0               // Expand w(t-6), w(t-5), w(t-4), w(t-3)
    vpxor   \wt_32_29, \wt_16_13, \wt_32_29             // Expand wt_32_29 = w[t-32:t-28] ^ w[t-16:t-12]
    addl  \wkOffset(\addr), \e                          // e = e + W + KT
    addl  \temp, \e                                     // e = F1(b, c, d) + e + W + KT
    rorx   $27, \a, TEMP2                               // TEMP2 = ROTL32(a, 5)
    rorx   $2, \a, \temp                                // Next ROTL32(b, 30)
    vpxor   \wt_32_29, \wt_28_25, \wt_32_29             // Expand wt_32_29 ^= w[t-28:t-24]
    xor   \b, \a                                        // Next (b) ^ (c)
    addl  TEMP2, \e                                     // e = F1(b, c, d) + e + W + KT + S^5(a)
    xor   \c, \a                                        // Next F1 done

    addl  \wkOffset + 4(\addr), \d                      // d = d + W + KT
    vpxor   \wt_32_29, TEMP_W0, \wt_32_29                // Expand wt_32_29 = w[t-32] ^ w[t-16] ^ w[t-28] ^ w[t-6]
    addl  \a, \d                                        // d = F1 + d + W + KT
    rorx   $27, \e, TEMP2                               // TEMP2 = ROTL32(e, 5)
    rorx   $2, \e, \a                                   // Next ROTL32(e, 30)
    xor   \temp, \e                                     // Next F1
    addl  TEMP2, \d                                     // d = F1 + d + W + KT + S^5(e)
    vpsrld  $30, \wt_32_29, TEMP_W0                      // Expand ROL(wt_32_29, 2), high part
    xor   \b, \e                                        // Next F1 done

    addl  \wkOffset + 8(\addr), \c                      // c = c + W + KT
    addl  \e, \c                                        // c = F1 + c + W + KT
    rorx   $27, \d, TEMP2                               // TEMP2 = ROTL32(d, 5)
    rorx   $2, \d, \e                                   // Next ROTL32(d, 30)
    vpslld  $2, \wt_32_29, \wt_32_29                    // Expand ROL(wt_32_29, 2), low part
    xor   \a, \d                                        // Next F1
    addl  TEMP2, \c                                     // c = F1 + c + W + KT + S^5(d)
    xor   \temp, \d                                     // Next F1 done

    addl  \wkOffset + 12(\addr), \b                     // b = b + W + KT
    vpxor \wt_32_29, TEMP_W0, \wt_32_29                  // Expand ROL(wt_32_29, 2) combined
    rorx   $27, \c, TEMP2                               // TEMP2 = ROTL32(c, 5)
    addl  \d, \b                                        // b = F1 + b + W + KT
    rorx   $2, \c, \d                                   // Next ROTL32(c, 30)
    vpaddd KNUM, \wt_32_29, TEMP_W0                     // Expand w + k
    xor   \e, \c                                        // Next F1
    addl  TEMP2, \b                                     // b = F1 + b + W + KT + S^5(c)
    xor   \a, \c                                        // Next F1 done
    vmovdqa TEMP_W0, \wkOffset2(\addr)                  // Store W+K for the rounds that consume it later
.endm
290
/**
 *  Macro Description: message compression for rounds 40-59; also precomputes the
 *                     next round's F2 (majority) and b.
 *  Input register:
 * a - e, temp:  intermediate SHA-1 working variables (temp carries the F2 value
 *               precomputed by the previous round)
 *        addr:  stack address holding the precomputed W(t)+K(t) values
 *    wkOffset:  byte offset of W(t)+K(t) relative to addr
 *     temp1-2:  temporary registers
 *  Modified registers:  a e temp temp1 temp2
 *  Output register:
 *           a:  next round F2
 *           e:  value after one round of update
 *        temp:  next round b (already rotated)
 *  Macro implementation: F2(b,c,d) = (b AND c) OR (b AND d) OR (c AND d)
 *                    = ((b^c) & (c^d) ^ c)
 *          e = S^5(a) + F2(b,c,d) + e + W(i) + K(i)
 *          temp = S^30(b)
 */
.macro ROUND40_59 a, temp, b, c, d, e, addr, wkOffset, temp1, temp2
    addl  \wkOffset(\addr), \e                          // e = e + W + KT
    mov   \c, \temp1
    addl  \temp, \e                                     // e = F2(b, c, d) + e + W + KT
    xor   \b, \temp1                                    // Next (c^d)
    rorx   $27, \a, \temp2                              // temp2 = ROTL32(a, 5)
    rorx   $2, \a, \temp                                // Next ROTL32(b, 30)
    xor   \b, \a                                        // Next (b^c)
    addl  \temp2, \e                                    // e = F2(b, c, d) + e + W + KT + S^5(a)
    and   \temp1, \a                                    // Next (b^c) & (c^d)
    xor   \b, \a                                        // Next (((b^c)) & (c^d) ^ c)
.endm
320
/**
 *  Macro Description: four rounds of 40-59 compression interleaved with message
 *                     expansion (used for the 32-79 schedule); also precomputes
 *                     the next round's F2 and b.
 *  Input register:
 * a - e, temp: intermediate SHA-1 working variables (temp carries the F2 value
 *              precomputed by the previous round)
 *        addr: stack address holding the precomputed W(t)+K(t) values
 *    wkOffset: byte offset of W(t)+K(t) used by these four rounds
 *    wt_32_29: w(t-32) ~ w(t-29)
 *    wt_28_25: w(t-28) ~ w(t-25)
 *    wt_16_13: w(t-16) ~ w(t-13)
 *      wt_8_5: w(t-8)  ~ w(t-5)
 *      wt_4_1: w(t-4)  ~ w(t-1)
 *   wkOffset2: byte offset at which the newly expanded W+K is stored
 *  Implicitly used:  TEMP1, TEMP2 (scalar scratch), TEMP_W0 (vector scratch),
 *        KNUM (broadcast K constant)
 *  Modified registers:  a b c d e temp TEMP1 TEMP2 wt_32_29 TEMP_W0
 *  Output register:
 *           a:  b of the third following round
 *           b:  value after four rounds of update
 *           c:  next round F2
 *           d:  next round b
 *           e:  b of the fourth following round
 *        temp:  next b
 *    wt_32_29:  newly expanded words w(t) ~ w(t+3)
 *  Macro implementation: F2(b,c,d) = (b AND c) OR (b AND d) OR (c AND d)
 *                    = ((b^c) & (c^d) ^ c)
 *          e = S^5(a) + F2(b,c,d) + e + W(i) + K(i)
 *          w(t) = ROL(w(t-3)  ^ w(t-8)  ^ w(t-14) ^ w(t-16), 1)
 *               = ROL(w(t-6)  ^ w(t-11) ^ w(t-17) ^ w(t-19) ^
 *                     w(t-11) ^ w(t-16) ^ w(t-22) ^ w(t-24) ^
 *                     w(t-17) ^ w(t-22) ^ w(t-28) ^ w(t-30) ^
 *                     w(t-19) ^ w(t-24) ^ w(t-30) ^ w(t-32), 2)
 *               = ROL(w(t-6)  ^ w(t-16) ^ w(t-28) ^ w(t-32), 2)
 *          w(t+1), w(t+2), w(t+3) in the same way
 */
.macro ROUND40_59_EXPAND a, temp, b, c, d, e, addr, wkOffset, wt_32_29, wt_28_25, wt_16_13, wt_8_5, wt_4_1, wkOffset2
    vpalignr $8, \wt_8_5, \wt_4_1, TEMP_W0               // Expand w(t-6), w(t-5), w(t-4), w(t-3)
    vpxor   \wt_32_29, \wt_16_13, \wt_32_29             // Expand wt_32_29 = w[t-32:t-28] ^ w[t-16:t-12]
    addl  \wkOffset(\addr), \e                          // e = e + W + KT
    mov   \c, TEMP1
    addl  \temp, \e                                     // e = F2(b, c, d) + e + W + KT
    xor   \b, TEMP1                                     // Next temp1 = (c^d)
    rorx   $27, \a, TEMP2                               // TEMP2 = ROTL32(a, 5)
    rorx   $2, \a, \temp                                // Next ROTL32(b, 30)
    vpxor   \wt_32_29, \wt_28_25, \wt_32_29             // Expand wt_32_29 ^= w[t-28:t-24]
    xor   \b, \a                                        // Next (b^c)
    addl  TEMP2, \e                                     // e = F2(b, c, d) + e + W + KT + S^5(a)
    and   TEMP1, \a                                     // Next (b^c) & (c^d)
    addl  \wkOffset + 4(\addr), \d                      // d = d + W + KT
    xor   \b, \a                                        // Next (((b^c)) & (c^d) ^ c)

    vpxor   \wt_32_29, TEMP_W0, \wt_32_29                // Expand wt_32_29 = w[t-32] ^ w[t-16] ^ w[t-28] ^ w[t-6]
    mov   \b, TEMP1
    addl  \a, \d                                        // d = F2 + d + W + KT
    xor   \temp, TEMP1                                  // Next F2
    rorx   $27, \e, TEMP2                               // TEMP2 = ROTL32(e, 5)
    rorx   $2, \e, \a                                   // Next ROTL32(e, 30)
    addl  \wkOffset + 8(\addr), \c                      // c = c + W + KT
    xor   \temp, \e                                     // Next F2
    vpsrld  $30, \wt_32_29, TEMP_W0                      // Expand ROL(wt_32_29, 2), high part
    and   TEMP1, \e                                     // Next F2
    addl  TEMP2, \d                                     // d = F2 + d + W + KT + S^5(e)
    xor   \temp, \e                                     // Next F2 done

    mov   \temp, TEMP1
    addl  \e, \c                                        // c = F2 + c + W + KT
    xor   \a, TEMP1                                     // Next F2
    vpslld  $2, \wt_32_29, \wt_32_29                    // Expand ROL(wt_32_29, 2), low part
    rorx   $27, \d, TEMP2                               // TEMP2 = ROTL32(d, 5)
    rorx   $2, \d, \e                                   // Next ROTL32(d, 30)
    xor   \a, \d                                        // Next F2
    addl  TEMP2, \c                                     // c = F2 + c + W + KT + S^5(d)
    and   TEMP1, \d                                     // Next F2
    addl  \wkOffset + 12(\addr), \b                     // b = b + W + KT
    vpxor \wt_32_29, TEMP_W0, \wt_32_29                  // Expand ROL(wt_32_29, 2) combined
    xor   \a, \d                                        // Next F2 done

    mov   \a, TEMP1
    addl  \d, \b                                        // b = F2 + b + W + KT
    xor   \e, TEMP1                                     // Next F2
    rorx   $27, \c, TEMP2                               // TEMP2 = ROTL32(c, 5)
    rorx   $2, \c, \d                                   // Next ROTL32(c, 30)
    xor   \e, \c                                        // Next F2
    vpaddd KNUM, \wt_32_29, TEMP_W0                     // Expand w + k
    addl  TEMP2, \b                                     // b = F2 + b + W + KT + S^5(c)
    and   TEMP1, \c                                     // Next F2
    xor   \e, \c                                        // Next F2 done
    vmovdqa TEMP_W0, \wkOffset2(\addr)                  // Store W+K for the rounds that consume it later
.endm
410
/**
 *  Function Description: Perform SHA1 compression calculation based on the input message and update the hash value.
 *  Function prototype: static const uint8_t *SHA1_Step(const uint8_t *input, uint32_t len, uint32_t *h)
 *  Input register:
 *         rdi:  Pointer to the input data address
 *         rsi:  Message length
 *         rdx:  Storage address of the hash value
 *  Register usage:  r8d~r12d: A~E, r13d: TEMP, r15d, ebx, eax: temporary registers, ymm0~ymm3: w0~w15 message block,
 * ymm4: zero, ymm5~ymm8: extended message blocks, ymm9~ymm11: temporary registers, ymm12: k constant, ymm13: k+w value
 *  Output register:  rax Returns the address of the message for which SHA1 calculation is not performed.
 *  Function/Macro Call: ROUND00_18, ROUND00_18_EXPAND, ROUND20_39, ROUND20_39_EXPAND, ROUND40_59, ROUND40_59_EXPAND
 */
423.text
424.globl  SHA1_Step
425    .type   SHA1_Step, @function
426SHA1_Step:
427    .cfi_startproc
428    cmp     $64, LEN
429    jb      .Lend_sha1
430
431    push    %rbx
432    push    %rbp
433    push    %r12
434    push    %r13
435    push    %r14
436    push    %r15
437    mov     %rsp, %r14
438    lea     -1024(%rsp), %rsp                            // Apply for 1024-byte stack space.
439
440    mov     0(HASH), A      // r8~r13: a~e
441    mov     4(HASH), B
442    andq    $-256, %rsp
443    mov     8(HASH), C
444    mov     12(HASH), D
445    mov     16(HASH), E
446
447.Lloop_sha1_compress:
448.align  16
449    vmovdqu (INPUT), BLK0                                // Loads the data of a block to the lower 128 bits
450                                                         // of the YMM register.
451    vmovdqu 16(INPUT), BLK1
452    vmovdqu 32(INPUT), BLK2
453    sub     $64, LEN
454    vmovdqu 48(INPUT), BLK3
455    add     $64, INPUT
456
457    cmp     $64, LEN                                     // Check whether the remaining length is greater than 64.
458    jb .Lsha1_compress
459    vinserti128 $1, 0(INPUT), %ymm0, %ymm0               // Loads the data of a block to the upper 128 bits
460                                                         // of the ymm register.
461    vinserti128 $1, 16(INPUT), %ymm1, %ymm1
462    vinserti128 $1, 32(INPUT), %ymm2, %ymm2
463    vinserti128 $1, 48(INPUT), %ymm3, %ymm3
464    add     $64, INPUT
465
466.Lsha1_compress:
467    vmovdqa endian_mask + 0(%rip), %ymm8                // Endian inversion mask
468    leaq g_k + 0(%rip), %rbp                            // Get k
469
470    vpshufb %ymm8, %ymm0, %ymm0                         // Little endian to big endian
471    vmovdqa 0(%rbp), KNUM
472    vpshufb %ymm8, %ymm1, %ymm1
473    vpaddd  KNUM, %ymm0, %ymm13                         // w[0:15] + k0
474    vpshufb %ymm8, %ymm2, %ymm2
475    vmovdqa %ymm13, 0(%rsp)                             // wk push stack
476    vpaddd  KNUM, %ymm1, %ymm9
477    vpshufb %ymm8, %ymm3, %ymm3
478    vmovdqa %ymm9, 32(%rsp)
479    vpaddd  KNUM, %ymm2, %ymm10
480    vpxor   %ymm4, %ymm4, %ymm4
481
482    mov     C, TEMP                                      // The first round F0
483    vmovdqa %ymm10, 64(%rsp)
484    and     B, TEMP                                      // Round0 ((b) & (c))
485    andn    D, B, TEMP2                                  // Round0 (~(b)) & (d)
486    vpaddd  KNUM, %ymm3, %ymm11
487    or      TEMP2, TEMP                                  // Round0 (((b) & (c)) | ((~(b)) & (d)))
488    rol     $30, B                                       // Round0 B = ROTL32(B, 30)
489    vmovdqa %ymm11, 96(%rsp)
490    ROUND00_18_EXPAND A, TEMP, B, C, D, E, %rsp, 0, %ymm0, %ymm1, %ymm2, %ymm3, EXPAND0
491    vmovdqa 32(%rbp), KNUM
492    ROUND00_18_EXPAND B, C, D, E, A, TEMP, %rsp, 32, %ymm1, %ymm2, %ymm3, EXPAND0, EXPAND1
493    ROUND00_18_EXPAND D, E, A, TEMP, B, C, %rsp, 64, %ymm2, %ymm3, EXPAND0, EXPAND1, EXPAND2
494    ROUND00_18_EXPAND A, TEMP, B, C, D, E, %rsp, 96, %ymm3, EXPAND0, EXPAND1, EXPAND2, EXPAND3
495    ROUND00_18 B, C, D, E, A, TEMP, %rsp, 128, TEMP1, TEMP2
496    ROUND00_18 TEMP, B, C, D, E, A, %rsp, 132, TEMP1, TEMP2
497    ROUND00_18 A, TEMP, B, C, D, E, %rsp, 136, TEMP1, TEMP2     // 18
498    addl    140( %rsp), D                                 // D = DE + W + KT
499    rorx    $27, E, TEMP2                                 // TEMP2 = ROTL32(E, 5)
500    addl    A, D                                          // D = F0 + D + W + KT
501    rorx    $2, E, A                                      // Round20 ROTL32(E, 30)
502    xor     TEMP, E                                       // Round20 (TEMP) ^ (E)
503    addl    TEMP2, D                                      // D = F0 + D + W + KT + S^5(E)
504    xor     B, E                                          // Round20 F1
505
506    ROUND20_39_EXPAND D, E, A, TEMP, B, C, %rsp, 160, %ymm0, %ymm1, EXPAND0, EXPAND2, EXPAND3, 256
507    ROUND20_39_EXPAND A, TEMP, B, C, D, E, %rsp, 192, %ymm1, %ymm2, EXPAND1, EXPAND3, %ymm0, 288
508    vmovdqa 64(%rbp), KNUM
509    ROUND20_39_EXPAND B, C, D, E, A, TEMP, %rsp, 224, %ymm2, %ymm3, EXPAND2, %ymm0, %ymm1, 320
510    ROUND20_39_EXPAND D, E, A, TEMP, B, C, %rsp, 256, %ymm3, EXPAND0, EXPAND3, %ymm1, %ymm2, 352
511    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 288, TEMP1, TEMP2
512    ROUND20_39 E, A, TEMP, B, C, D, %rsp, 292, TEMP1, TEMP2
513    ROUND20_39 D, E, A, TEMP, B, C, %rsp, 296, TEMP1, TEMP2     // 38
514    addl    300(%rsp), B                                  // B = B + W + KT
515    mov     A, TEMP1
516    addl    D, B                                          // B = F1 + B + W + KT
517    xor     E, TEMP1                                      // Round40 (E^A)
518    rorx    $27, C, TEMP2                                 // TEMP2 = ROTL32(C, 5)
519    rorx    $2, C, D                                      // Round40 ROTL32(C, 30)
520    xor     E, C                                          // Round40 (E^C)
521    addl    TEMP2, B                                      // B = F1 + B + W + KT + S^5(C)
522    and     TEMP1, C                                      // Round40 (E^A) & (E^C)
523    xor     E, C                                          // Round40 F2
524
525    ROUND40_59_EXPAND B, C, D, E, A, TEMP, %rsp, 320, EXPAND0, EXPAND1, %ymm0, %ymm2, %ymm3, 384
526    ROUND40_59_EXPAND D, E, A, TEMP, B, C, %rsp, 352, EXPAND1, EXPAND2, %ymm1, %ymm3, EXPAND0, 416
527    ROUND40_59_EXPAND A, TEMP, B, C, D, E, %rsp, 384, EXPAND2, EXPAND3, %ymm2, EXPAND0, EXPAND1, 448
528    vmovdqa 96(%rbp), KNUM
529    ROUND40_59_EXPAND B, C, D, E, A, TEMP, %rsp, 416, EXPAND3, %ymm0, %ymm3, EXPAND1, EXPAND2, 480
530    ROUND40_59 D, E, A, TEMP, B, C, %rsp, 448, TEMP1, TEMP2
531    ROUND40_59 C, D, E, A, TEMP, B, %rsp, 452, TEMP1, TEMP2
532    ROUND40_59 B, C, D, E, A, TEMP, %rsp, 456, TEMP1, TEMP2 // 58
533    addl    460(%rsp), A                                  // A = A + W + KT
534    rorx    $27, TEMP, TEMP2                              // TEMP2 = ROTL32(TEMP, 5)
535    addl    B, A                                          // A = F2 + A + W + KT
536    rorx    $2, TEMP, B                                   // Round60 ROTL32(TEMP, 30)
537    xor     C, TEMP                                       // Round60 (C) ^ (TEMP)
538    addl    TEMP2, A                                      // A = F2 + A + W + KT + S^5(TEMP)
539    xor     D, TEMP                                       // Round60 F0
540
541    ROUND20_39_EXPAND A, TEMP, B, C, D, E, %rsp, 480, %ymm0, %ymm1, EXPAND0, EXPAND2, EXPAND3, 512
542    ROUND20_39_EXPAND B, C, D, E, A, TEMP, %rsp, 512, %ymm1, %ymm2, EXPAND1, EXPAND3, %ymm0, 544
543    ROUND20_39_EXPAND D, E, A, TEMP, B, C, %rsp, 544, %ymm2, %ymm3, EXPAND2, %ymm0, %ymm1, 576
544    ROUND20_39_EXPAND A, TEMP, B, C, D, E, %rsp, 576, %ymm3, EXPAND0, EXPAND3, %ymm1, %ymm2, 608
545    ROUND20_39 B, C, D, E, A, TEMP, %rsp, 608, TEMP1, TEMP2
546    ROUND20_39 TEMP, B, C, D, E, A, %rsp, 612, TEMP1, TEMP2
547    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 616, TEMP1, TEMP2 // 78
548    addl    620(%rsp), D                                   // D = D + W + KT
549    add     E, 4(HASH)                                     // Update HASH
550    lea     (A, D), D                                      // D = F1 + D + W + KT
551    add     TEMP, 8(HASH)
552    rorx    $27, E, TEMP2                                  // TEMP2 = ROTL32(E, 5)
553
554    add     B, 12(HASH)
555    addl    TEMP2, D                                       // D = F1 + D + W + KT + S^5(E)
556    add     C, 16(HASH)
557    mov     4(HASH), B
558    add     D, 0(HASH)
559    mov     8(HASH), C
560    mov     16(HASH), E
561    mov     12(HASH), D
562    mov     0(HASH), A
563
564    cmp     $64, LEN                                       // Check whether the upper-bit register is calculated.
565    jb      .Lend_sha1_pre
566    sub     $64, LEN
567
568    mov     C, TEMP
569    andn    D, B, TEMP2                                    // TEMP2 = (~(b)) & (d)
570    and     B, TEMP                                        // TEMP=((b) & (c))
571    or      TEMP2, TEMP                                    // TEMP = (((b) & (c)) | ((~(b)) & (d)))
572    rol     $30, B                                         // B = ROTL32(B, 30)
573    ROUND00_18 A, TEMP, B, C, D, E, %rsp, 16, TEMP1, TEMP2
574    ROUND00_18 E, A, TEMP, B, C, D, %rsp, 20, TEMP1, TEMP2
575    ROUND00_18 D, E, A, TEMP, B, C, %rsp, 24, TEMP1, TEMP2
576    ROUND00_18 C, D, E, A, TEMP, B, %rsp, 28, TEMP1, TEMP2          // Round 3
577
578    ROUND00_18 B, C, D, E, A, TEMP, %rsp, 48, TEMP1, TEMP2
579    ROUND00_18 TEMP, B, C, D, E, A, %rsp, 52, TEMP1, TEMP2
580    ROUND00_18 A, TEMP, B, C, D, E, %rsp, 56, TEMP1, TEMP2
581    ROUND00_18 E, A, TEMP, B, C, D, %rsp, 60, TEMP1, TEMP2          // Round 7
582
583    ROUND00_18 D, E, A, TEMP, B, C, %rsp, 80, TEMP1, TEMP2
584    ROUND00_18 C, D, E, A, TEMP, B, %rsp, 84, TEMP1, TEMP2
585    ROUND00_18 B, C, D, E, A, TEMP, %rsp, 88, TEMP1, TEMP2
586    ROUND00_18 TEMP, B, C, D, E, A, %rsp, 92, TEMP1, TEMP2          // Round 11
587
588    ROUND00_18 A, TEMP, B, C, D, E, %rsp, 112, TEMP1, TEMP2
589    ROUND00_18 E, A, TEMP, B, C, D, %rsp, 116, TEMP1, TEMP2
590    ROUND00_18 D, E, A, TEMP, B, C, %rsp, 120, TEMP1, TEMP2
591    ROUND00_18 C, D, E, A, TEMP, B, %rsp, 124, TEMP1, TEMP2         // Round 15
592
593    ROUND00_18 B, C, D, E, A, TEMP, %rsp, 144, TEMP1, TEMP2
594    ROUND00_18 TEMP, B, C, D, E, A, %rsp, 148, TEMP1, TEMP2
595    ROUND00_18 A, TEMP, B, C, D, E, %rsp, 152, TEMP1, TEMP2         // Round 18
596    addl    156( %rsp), D                                  // D = D + W + KT
597    rorx    $27, E, TEMP2                                  // TEMP2 = ROTL32(E, 5)
598    addl    A, D                                           // D = F0 + D + W + KT
599    rorx    $2, E, A                                       // Round20 ROTL32(E, 30)
600    xor     TEMP, E                                        // Round20 (TEMP) ^ (E)
601    addl    TEMP2, D                                       // D = F0 + D + W + KT + S^5(E)
602    xor     B, E                                           // Round20 F1
603
604    ROUND20_39 D, E, A, TEMP, B, C, %rsp, 176, TEMP1, TEMP2
605    ROUND20_39 C, D, E, A, TEMP, B, %rsp, 180, TEMP1, TEMP2
606    ROUND20_39 B, C, D, E, A, TEMP, %rsp, 184, TEMP1, TEMP2
607    ROUND20_39 TEMP, B, C, D, E, A, %rsp, 188, TEMP1, TEMP2         // Round 23
608
609    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 208, TEMP1, TEMP2
610    ROUND20_39 E, A, TEMP, B, C, D, %rsp, 212, TEMP1, TEMP2
611    ROUND20_39 D, E, A, TEMP, B, C, %rsp, 216, TEMP1, TEMP2
612    ROUND20_39 C, D, E, A, TEMP, B, %rsp, 220, TEMP1, TEMP2         // Round 27
613
614    ROUND20_39 B, C, D, E, A, TEMP, %rsp, 240, TEMP1, TEMP2
615    ROUND20_39 TEMP, B, C, D, E, A, %rsp, 244, TEMP1, TEMP2
616    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 248, TEMP1, TEMP2
617    ROUND20_39 E, A, TEMP, B, C, D, %rsp, 252, TEMP1, TEMP2         // Round 31
618
619    ROUND20_39 D, E, A, TEMP, B, C, %rsp, 272, TEMP1, TEMP2
620    ROUND20_39 C, D, E, A, TEMP, B, %rsp, 276, TEMP1, TEMP2
621    ROUND20_39 B, C, D, E, A, TEMP, %rsp, 280, TEMP1, TEMP2
622    ROUND20_39 TEMP, B, C, D, E, A, %rsp, 284, TEMP1, TEMP2         // Round 35
623
624    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 304, TEMP1, TEMP2
625    ROUND20_39 E, A, TEMP, B, C, D, %rsp, 308, TEMP1, TEMP2
626    ROUND20_39 D, E, A, TEMP, B, C, %rsp, 312, TEMP1, TEMP2         // Round 38
627    addl    316(%rsp), B                                            // B = B + W + KT
628    mov     A, TEMP1
629    addl    D, B                                                    // B = F1 + B + W + KT
630    xor     E, TEMP1                                                // Round40 (A^E)
631    rorx    $2, C, D                                                // Round40 ROTL32(C, 30)
632    rorx    $27, C, TEMP2                                           // TEMP2 = ROTL32(C, 5)
633    xor     E, C                                                    // Round40 (E^C)
634    addl    TEMP2, B                                                // B = F1 + B + W + KT + S^5(C)
635    and     TEMP1, C                                                // Round40 (A^E) & (E^C)
636    xor     E, C                                                    // Round40 F2
637
638    ROUND40_59 B, C, D, E, A, TEMP, %rsp, 336, TEMP1, TEMP2
639    ROUND40_59 TEMP, B, C, D, E, A, %rsp, 340, TEMP1, TEMP2
640    ROUND40_59 A, TEMP, B, C, D, E, %rsp, 344, TEMP1, TEMP2
641    ROUND40_59 E, A, TEMP, B, C, D, %rsp, 348, TEMP1, TEMP2         // Round 43
642
643    ROUND40_59 D, E, A, TEMP, B, C, %rsp, 368, TEMP1, TEMP2
644    ROUND40_59 C, D, E, A, TEMP, B, %rsp, 372, TEMP1, TEMP2
645    ROUND40_59 B, C, D, E, A, TEMP, %rsp, 376, TEMP1, TEMP2
646    ROUND40_59 TEMP, B, C, D, E, A, %rsp, 380, TEMP1, TEMP2         // Round 47
647
648    ROUND40_59 A, TEMP, B, C, D, E, %rsp, 400, TEMP1, TEMP2
649    ROUND40_59 E, A, TEMP, B, C, D, %rsp, 404, TEMP1, TEMP2
650    ROUND40_59 D, E, A, TEMP, B, C, %rsp, 408, TEMP1, TEMP2
651    ROUND40_59 C, D, E, A, TEMP, B, %rsp, 412, TEMP1, TEMP2         // Round 51
652
653    ROUND40_59 B, C, D, E, A, TEMP, %rsp, 432, TEMP1, TEMP2
654    ROUND40_59 TEMP, B, C, D, E, A, %rsp, 436, TEMP1, TEMP2
655    ROUND40_59 A, TEMP, B, C, D, E, %rsp, 440, TEMP1, TEMP2
656    ROUND40_59 E, A, TEMP, B, C, D, %rsp, 444, TEMP1, TEMP2         // Round 55
657
658    ROUND40_59 D, E, A, TEMP, B, C, %rsp, 464, TEMP1, TEMP2
659    ROUND40_59 C, D, E, A, TEMP, B, %rsp, 468, TEMP1, TEMP2
660    ROUND40_59 B, C, D, E, A, TEMP, %rsp, 472, TEMP1, TEMP2         // Round 58
661    addl    476(%rsp), A                                            // A = A + W + KT
662    rorx    $27, TEMP, TEMP2                                        // TEMP2 = ROTL32(TEMP, 5)
663    addl    B, A                                                    // A = F2 + A + W + KT
664    rorx    $2, TEMP, B                                             // Round60 ROTL32(TEMP, 30)
665    xor     C, TEMP                                                 // Round60 (TEMP) ^ (c)
666    addl    TEMP2, A                                                // A = F2 + A + W + KT + S^5(TEMP)
667    xor     D, TEMP                                                 // Round60 F1
668
669    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 496, TEMP1, TEMP2
670    ROUND20_39 E, A, TEMP, B, C, D, %rsp, 500, TEMP1, TEMP2
671    ROUND20_39 D, E, A, TEMP, B, C, %rsp, 504, TEMP1, TEMP2
672    ROUND20_39 C, D, E, A, TEMP, B, %rsp, 508, TEMP1, TEMP2         // Round 63
673
674    ROUND20_39 B, C, D, E, A, TEMP, %rsp, 528, TEMP1, TEMP2
675    ROUND20_39 TEMP, B, C, D, E, A, %rsp, 532, TEMP1, TEMP2
676    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 536, TEMP1, TEMP2
677    ROUND20_39 E, A, TEMP, B, C, D, %rsp, 540, TEMP1, TEMP2         // Round 67
678
679    ROUND20_39 D, E, A, TEMP, B, C, %rsp, 560, TEMP1, TEMP2
680    ROUND20_39 C, D, E, A, TEMP, B, %rsp, 564, TEMP1, TEMP2
681    ROUND20_39 B, C, D, E, A, TEMP, %rsp, 568, TEMP1, TEMP2
682    ROUND20_39 TEMP, B, C, D, E, A, %rsp, 572, TEMP1, TEMP2         // Round 71
683
684    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 592, TEMP1, TEMP2
685    ROUND20_39 E, A, TEMP, B, C, D, %rsp, 596, TEMP1, TEMP2
686    ROUND20_39 D, E, A, TEMP, B, C, %rsp, 600, TEMP1, TEMP2
687    ROUND20_39 C, D, E, A, TEMP, B, %rsp, 604, TEMP1, TEMP2         // Round 75
688
689    ROUND20_39 B, C, D, E, A, TEMP, %rsp, 624, TEMP1, TEMP2
690    ROUND20_39 TEMP, B, C, D, E, A, %rsp, 628, TEMP1, TEMP2
691    ROUND20_39 A, TEMP, B, C, D, E, %rsp, 632, TEMP1, TEMP2         // Round 78
692    addl    636(%rsp), D                                            // D = D + W + KT
693    add     E, 4(HASH)                                              // Update HASH
694    add     TEMP, 8(HASH)                                           // Upadate H0~H5
695    lea     (A, D), D                                               // D = F1 + D + W + KT
696    rorx    $27, E, TEMP2                                           // TEMP2 = ROTL32(E, 5)
697    add     B, 12(HASH)
698    add     C, 16(HASH)
699    addl    TEMP2, D                                                // D = F1 + D + W + KT + S^5(E)
700    mov     4(HASH), B
701    mov     8(HASH), C
702    add     D, 0(HASH)
703    mov     16(HASH), E
704    mov     12(HASH), D
705    mov     0(HASH), A
706    cmp     $64, LEN
707    jae    .Lloop_sha1_compress
708
709.Lend_sha1_pre:
710    mov %r14, %rsp
711    pop %r15
712    pop %r14
713    pop %r13
714    pop %r12
715    pop %rbp
716    pop %rbx
717.Lend_sha1:
718    mov INPUT, %rax
719    ret
720    .cfi_endproc
721    .size SHA1_Step, .-SHA1_Step
722
723#endif
724