/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_SHA512

.file   "sha2_512_x86_64.S"

.set TEMP1, %rbp
.set TEMP2, %rax
.set TEMP3, %rbx
.set TEMP4, %rcx
.set TEMP5, %rdi

.set YTEMP1, %ymm8
.set YTEMP2, %ymm9
.set YTEMP3, %ymm10
.set YTEMP4, %ymm11
.set YTEMP5, %ymm12
.set YTEMP6, %ymm13
.set YTEMP7, %ymm14

/* Stack frame layout: 1280 bytes of precomputed W[t] + K512[t] values, followed by the saved in, hash and num arguments and the original rsp. */
.equ SHA512_wk, 0
.equ SHA512_in, SHA512_wk + 1280
.equ SHA512_hash, SHA512_in + 8
.equ SHA512_num, SHA512_hash + 8
.equ SHA512_rsp, SHA512_num + 8
.equ SHA512_size, SHA512_rsp + 8

.section .rodata
.balign    64
.type    g_k512,%object
/* SHA-512 round constants. Each pair is stored twice so that one 256-bit load serves both message lanes (two blocks) at once. */
g_k512:
    .quad    0x428a2f98d728ae22, 0x7137449123ef65cd,    0x428a2f98d728ae22, 0x7137449123ef65cd
    .quad    0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,    0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
    .quad    0x3956c25bf348b538, 0x59f111f1b605d019,    0x3956c25bf348b538, 0x59f111f1b605d019
    .quad    0x923f82a4af194f9b, 0xab1c5ed5da6d8118,    0x923f82a4af194f9b, 0xab1c5ed5da6d8118
    .quad    0xd807aa98a3030242, 0x12835b0145706fbe,    0xd807aa98a3030242, 0x12835b0145706fbe
    .quad    0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,    0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
    .quad    0x72be5d74f27b896f, 0x80deb1fe3b1696b1,    0x72be5d74f27b896f, 0x80deb1fe3b1696b1
    .quad    0x9bdc06a725c71235, 0xc19bf174cf692694,    0x9bdc06a725c71235, 0xc19bf174cf692694
    .quad    0xe49b69c19ef14ad2, 0xefbe4786384f25e3,    0xe49b69c19ef14ad2, 0xefbe4786384f25e3
    .quad    0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65,    0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
    .quad    0x2de92c6f592b0275, 0x4a7484aa6ea6e483,    0x2de92c6f592b0275, 0x4a7484aa6ea6e483
    .quad    0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,    0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
    .quad    0x983e5152ee66dfab, 0xa831c66d2db43210,    0x983e5152ee66dfab, 0xa831c66d2db43210
    .quad    0xb00327c898fb213f, 0xbf597fc7beef0ee4,    0xb00327c898fb213f, 0xbf597fc7beef0ee4
    .quad    0xc6e00bf33da88fc2, 0xd5a79147930aa725,    0xc6e00bf33da88fc2, 0xd5a79147930aa725
    .quad    0x06ca6351e003826f, 0x142929670a0e6e70,    0x06ca6351e003826f, 0x142929670a0e6e70
    .quad    0x27b70a8546d22ffc, 0x2e1b21385c26c926,    0x27b70a8546d22ffc, 0x2e1b21385c26c926
    .quad    0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,    0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
    .quad    0x650a73548baf63de, 0x766a0abb3c77b2a8,    0x650a73548baf63de, 0x766a0abb3c77b2a8
    .quad    0x81c2c92e47edaee6, 0x92722c851482353b,    0x81c2c92e47edaee6, 0x92722c851482353b
    .quad    0xa2bfe8a14cf10364, 0xa81a664bbc423001,    0xa2bfe8a14cf10364, 0xa81a664bbc423001
    .quad    0xc24b8b70d0f89791, 0xc76c51a30654be30,    0xc24b8b70d0f89791, 0xc76c51a30654be30
    .quad    0xd192e819d6ef5218, 0xd69906245565a910,    0xd192e819d6ef5218, 0xd69906245565a910
    .quad    0xf40e35855771202a, 0x106aa07032bbd1b8,    0xf40e35855771202a, 0x106aa07032bbd1b8
    .quad    0x19a4c116b8d2d0c8, 0x1e376c085141ab53,    0x19a4c116b8d2d0c8, 0x1e376c085141ab53
    .quad    0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8,    0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
    .quad    0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,    0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
    .quad    0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3,    0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
    .quad    0x748f82ee5defb2fc, 0x78a5636f43172f60,    0x748f82ee5defb2fc, 0x78a5636f43172f60
    .quad    0x84c87814a1f0ab72, 0x8cc702081a6439ec,    0x84c87814a1f0ab72, 0x8cc702081a6439ec
    .quad    0x90befffa23631e28, 0xa4506cebde82bde9,    0x90befffa23631e28, 0xa4506cebde82bde9
    .quad    0xbef9a3f7b2c67915, 0xc67178f2e372532b,    0xbef9a3f7b2c67915, 0xc67178f2e372532b
    .quad    0xca273eceea26619c, 0xd186b8c721c0c207,    0xca273eceea26619c, 0xd186b8c721c0c207
    .quad    0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178,    0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
    .quad    0x06f067aa72176fba, 0x0a637dc5a2c898a6,    0x06f067aa72176fba, 0x0a637dc5a2c898a6
    .quad    0x113f9804bef90dae, 0x1b710b35131c471b,    0x113f9804bef90dae, 0x1b710b35131c471b
    .quad    0x28db77f523047d84, 0x32caab7b40c72493,    0x28db77f523047d84, 0x32caab7b40c72493
    .quad    0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c,    0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
    .quad    0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,    0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
    .quad    0x5fcb6fab3ad6faec, 0x6c44198c4a475817,    0x5fcb6fab3ad6faec, 0x6c44198c4a475817
.size    g_k512, .-g_k512

    .balign    64
    .type    g_endianMask,%object
g_endianMask:
    .quad 0x0001020304050607, 0x08090a0b0c0d0e0f
    .quad 0x0001020304050607, 0x08090a0b0c0d0e0f
.size   g_endianMask, .-g_endianMask

/**
 *  Macro Description: Performs one round of the hash-value update within the 80 compression rounds.
 *  Input registers:
 *       addr: start address of the stack space
 *   wkOffset: offset of the Wt + K512[t] value within that space
 *      a - h: intermediate hash-value variables
 *  Modified registers: temp1, temp2, temp3, temp4, temp5
 *  Output registers:
 *          h: value after this round's update
 *          d: value after this round's update
 *          temp1: BSIG0(a), to be folded into a by the next round
 *          temp4: b^a, which becomes b^c for the next round
 *  Function/Macro Call: None
 *  Implementation Description:
 *          T1 = h + BSIG1(e) + CH(e,f,g) + Kt + Wt
 *          T2 = BSIG0(a) + MAJ(a,b,c)
 *          CH(e, f, g) = (e AND f) XOR ((NOT e) AND g)
 *          MAJ(a, b, c) = (a AND b) XOR (a AND c) XOR (b AND c)
 *                       = CH(a^b, c, b)
 *                       = ((a XOR b) AND c) XOR ((NOT(a XOR b)) AND b)
 *                       = ((b XOR c) AND (a XOR b)) XOR b
 *          BSIG0(x) = ROTR^28(x) XOR ROTR^34(x) XOR ROTR^39(x)
 *          BSIG1(x) = ROTR^14(x) XOR ROTR^18(x) XOR ROTR^41(x)
 *          d += T1;        h = T1 + T2
 *  Optimization principle: the macro assumes that b^c is in temp4, temp1 equals 0, and f is in temp5
 *          when the round begins, i.e. the caller has executed:
 *              mov  b, temp4
 *              xor  temp1, temp1
 *              xor  c, temp4
 *              mov  f, temp5
 *          temp2 and temp4 are swapped for the next round;
 *          BSIG0(a) is added back to a after the last round.
 *          A scalar C reference of one round is sketched after this macro.
 */
    .macro ONE_ROUND    a, b, c, d, e, f, g, h, temp1, temp2, temp3, temp4, temp5, addr, wkOffset
        // assumes b^c is in temp4, temp1 equals 0, and f is in temp5 when the round begins
        addq \wkOffset(\addr), \h       // h += Kt + Wt
        and  \e, \temp5                 // e&f
        rorx $14, \e, \temp2            // ROTR^14(e)
        addq \temp1, \a                 // a += BSIG0(a) from last round
        rorx $18, \e, \temp3            // ROTR^18(e)
        andn \g, \e, \temp1             // (~e)&g
        xor  \temp2, \temp3             // ROTR^14(e) ^ ROTR^18(e)
        xor  \temp1, \temp5             // CH(e,f,g)
        rorx $41, \e, \temp2            // ROTR^41(e)
        addq \temp5, \h                 // h += CH(e,f,g)
        xor  \temp2, \temp3             // BSIG1(e)
        rorx $28, \a, \temp1            // ROTR^28(a)
        mov  \a, \temp2                 // a
        addq \temp3, \h                 // h += BSIG1(e)
        rorx $34, \a, \temp5            // ROTR^34(a)
        xor  \b, \temp2                 // a^b, which is b^c for the next round
        addq \h, \d                     // d += T1
        xor  \temp5, \temp1             // ROTR^28(a) ^ ROTR^34(a)
        and  \temp2, \temp4             // (b^a) & (b^c)
        rorx $39, \a, \temp3            // ROTR^39(a)
        xor  \b, \temp4                 // Maj(a,b,c)
        mov  \e, \temp5                 // for next round f
        xor  \temp3, \temp1             // BSIG0(a)
        addq \temp4, \h                 // h += Maj(a,b,c)
        // temp2 and temp4 are swapped for the next round
        // BSIG0(a) is added back to a when all rounds are finished
    .endm
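
/*
 * For reference only (not assembled): one SHA-512 round in C, using the same MAJ rewrite
 * as ONE_ROUND above. The helper name rotr64() and the state layout are illustrative
 * assumptions, not part of this file.
 *
 *   #include <stdint.h>
 *
 *   static inline uint64_t rotr64(uint64_t x, unsigned n) { return (x >> n) | (x << (64u - n)); }
 *
 *   // s[0..7] = a..h; wk is the precomputed K[t] + W[t], as stored on the stack above.
 *   static inline void Sha512RoundRef(uint64_t s[8], uint64_t wk)
 *   {
 *       uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
 *       uint64_t e = s[4], f = s[5], g = s[6], h = s[7];
 *       uint64_t bsig1 = rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41);
 *       uint64_t ch    = (e & f) ^ (~e & g);
 *       uint64_t t1    = h + bsig1 + ch + wk;
 *       uint64_t bsig0 = rotr64(a, 28) ^ rotr64(a, 34) ^ rotr64(a, 39);
 *       uint64_t maj   = ((b ^ c) & (a ^ b)) ^ b;     // equals (a&b) ^ (a&c) ^ (b&c)
 *       uint64_t t2    = bsig0 + maj;
 *       s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;  // e' = d + T1
 *       s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2; // a' = T1 + T2
 *   }
 */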

/**
 *  Macro Description: Performs two rounds of the hash-value update within the 80 compression rounds
 *                      and expands two message words.
 *  Input registers:
 *       addr: start address of the stack space
 *       wkOffset: offset of the Wt + K512[t] value within that space
 *       a - h: intermediate hash-value variables
 *       wi_17_16: W[i-17..i-16]
 *       wi_15_14: W[i-15..i-14]
 *       wi_9_8: W[i-9..i-8]
 *       wi_7_6: W[i-7..i-6]
 *       wi_3_2: W[i-3..i-2]
 *  Modified registers: TEMP1, TEMP2, TEMP3, TEMP4, TEMP5, wi_17_16, YTEMP1, YTEMP2, YTEMP3, YTEMP4, YTEMP5, YTEMP6
 *  Output registers:
 *       h: value after the two rounds of updates
 *       d: value after the two rounds of updates
 *       TEMP1: BSIG0(a), to be folded into a by the next round
 *       TEMP4: b^a, which becomes b^c for the next round
 *       wi_17_16: the two newly expanded message words
 *  Function/Macro Call: None
 *  Implementation Description:
 *          T1 = h + BSIG1(e) + CH(e,f,g) + Kt + Wt
 *          T2 = BSIG0(a) + MAJ(a,b,c)
 *          CH(e, f, g) = (e AND f) XOR ((NOT e) AND g)
 *          MAJ(a, b, c) = (a AND b) XOR (a AND c) XOR (b AND c)
 *                       = CH(a^b, c, b)
 *                       = ((a XOR b) AND c) XOR ((NOT(a XOR b)) AND b)
 *                       = ((b XOR c) AND (a XOR b)) XOR b
 *          BSIG0(x) = ROTR^28(x) XOR ROTR^34(x) XOR ROTR^39(x)
 *          BSIG1(x) = ROTR^14(x) XOR ROTR^18(x) XOR ROTR^41(x)
 *          d += T1;        h = T1 + T2
 *
 *          wi_16: latest W[i] value, W[i] = SSIG1(W[i-2]) + W[i-7] + SSIG0(W[i-15]) + W[i-16]
 *          SSIG0(x) = ROTR^1(x) XOR ROTR^8(x) XOR SHR^7(x)
 *          SSIG1(x) = ROTR^19(x) XOR ROTR^61(x) XOR SHR^6(x)
 *  Optimization principle: the macro assumes that b^c is in TEMP4, TEMP1 equals 0, and f is in TEMP5
 *          when the round begins, i.e. the caller has executed:
 *              mov  b, TEMP4
 *              xor  TEMP1, TEMP1
 *              xor  c, TEMP4
 *              mov  f, TEMP5
 *          TEMP2 and TEMP4 are swapped for the next round;
 *          BSIG0(a) is added back to a after the last round.
 *          A scalar C reference of the message expansion is sketched after this macro.
 */
    .macro TWO_ROUND_UPDATE_2W    a, b, c, d, e, f, g, h, wkOffset, wi_17_16, wi_15_14, wi_9_8, wi_7_6, wi_3_2
        // 1st round
        vpalignr $8, \wi_17_16, \wi_15_14, YTEMP1       // wi_16_15
        vpalignr $8, \wi_9_8, \wi_7_6, YTEMP7           // wi_8_7
        addq \wkOffset(%rsi), \h        // h += Kt + Wt
        and  \e, TEMP5                  // e&f
        vpsrlq   $1, YTEMP1, YTEMP2
        rorx $14, \e, TEMP2             // ROTR^14(e)
        addq TEMP1, \a                  // a += BSIG0(a) from last round
        vpsrlq   $8, YTEMP1, YTEMP3
        rorx $18, \e, TEMP3             // ROTR^18(e)
        andn \g, \e, TEMP1              // (~e)&g
        vpsrlq   $7, YTEMP1, YTEMP4
        xor  TEMP2, TEMP3               // ROTR^14(e) ^ ROTR^18(e)
        xor  TEMP1, TEMP5               // CH(e,f,g)
        vpsllq   $63, YTEMP1, YTEMP5
        rorx $41, \e, TEMP2             // ROTR^41(e)
        addq TEMP5, \h                  // h += CH(e,f,g)
        vpsllq   $56, YTEMP1, YTEMP6
        xor  TEMP2, TEMP3               // BSIG1(e)
        rorx $28, \a, TEMP1             // ROTR^28(a)
        vpaddq   YTEMP7, \wi_17_16, \wi_17_16           // W[i-17..i-16] + W[i-8..i-7]
        mov  \a, TEMP2                  // a
        addq TEMP3, \h                  // h += BSIG1(e)
        vpxor    YTEMP5, YTEMP2, YTEMP2                 // ROTR^1(wi_16_15)
        rorx $34, \a, TEMP5             // ROTR^34(a)
        xor  \b, TEMP2                  // a^b, which is b^c for the next round
        vpxor    YTEMP6, YTEMP3, YTEMP3                 // ROTR^8(wi_16_15)
        addq \h, \d                     // d += T1
        xor  TEMP5, TEMP1               // ROTR^28(a) ^ ROTR^34(a)
        vpxor    YTEMP4, YTEMP2, YTEMP1
        and  TEMP2, TEMP4               // (b^a) & (b^c)
        rorx $39, \a, TEMP3             // ROTR^39(a)
        vpxor    YTEMP3, YTEMP1, YTEMP1                 // SSIG0(wi_16_15)
        xor  \b, TEMP4                  // Maj(a,b,c)
        mov  \e, TEMP5                  // for next round f
        vpaddq   YTEMP1, \wi_17_16, \wi_17_16           // SSIG0(wi_16_15) + W[i-17..i-16] + W[i-8..i-7]
        xor  TEMP3, TEMP1               // BSIG0(a)
        addq TEMP4, \h                  // h += Maj(a,b,c)
        // TEMP2 and TEMP4 are swapped for the next round

        // 2nd round
        // the working variables rotate: a..h of the 1st round are referenced as h, a, b, c, d, e, f, g here
        vpsrlq   $19, \wi_3_2, YTEMP2
        addq 8+\wkOffset(%rsi), \g      // h += Kt + Wt
        and  \d, TEMP5                  // e&f
        vpsrlq   $61, \wi_3_2, YTEMP3
        rorx $14, \d, TEMP4             // ROTR^14(e)
        addq TEMP1, \h                  // a += BSIG0(a) from last round
        vpsrlq   $6, \wi_3_2, YTEMP4
        rorx $18, \d, TEMP3             // ROTR^18(e)
        andn \f, \d, TEMP1              // (~e)&g
        vpsllq   $45, \wi_3_2, YTEMP5
        xor  TEMP4, TEMP3               // ROTR^14(e) ^ ROTR^18(e)
        xor  TEMP1, TEMP5               // CH(e,f,g)
        vpsllq   $3, \wi_3_2, YTEMP6
        rorx $41, \d, TEMP4             // ROTR^41(e)
        addq TEMP5, \g                  // h += CH(e,f,g)
        vpxor    YTEMP5, YTEMP2, YTEMP2                 // ROTR^19(wi_3_2)
        xor  TEMP4, TEMP3               // BSIG1(e)
        rorx $28, \h, TEMP1             // ROTR^28(a)
        vpxor    YTEMP6, YTEMP3, YTEMP3                 // ROTR^61(wi_3_2)
        mov  \h, TEMP4                  // a
        addq TEMP3, \g                  // h += BSIG1(e)
        vpxor    YTEMP4, YTEMP2, YTEMP1
        rorx $34, \h, TEMP5             // ROTR^34(a)
        xor  \a, TEMP4                  // a^b, which is b^c for the next round
        vpxor    YTEMP3, YTEMP1, YTEMP1                 // SSIG1(wi_3_2)
        addq \g, \c                     // d += T1
        xor  TEMP5, TEMP1               // ROTR^28(a) ^ ROTR^34(a)
        vpaddq   YTEMP1, \wi_17_16, \wi_17_16           // SSIG0(wi_16_15) + W[i-17..i-16] + W[i-8..i-7] + SSIG1(wi_3_2)
        and  TEMP4, TEMP2               // (b^a) & (b^c)
        rorx $39, \h, TEMP3             // ROTR^39(a)
        vpaddq   \wkOffset(%rdx), \wi_17_16, YTEMP1     // wi + k
        xor  \a, TEMP2                  // Maj(a,b,c)
        mov  \d, TEMP5                  // for next round f
        vmovdqa  YTEMP1, \wkOffset + 256(%rsi)
        xor  TEMP3, TEMP1               // BSIG0(a)
        addq TEMP2, \g                  // h += Maj(a,b,c)
        // TEMP2 and TEMP4 are swapped for the next round
        // BSIG0(a) is added back to a when all rounds are finished
    .endm
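
/*
 * For reference only (not assembled): the scalar form of the message expansion that the
 * vector half of TWO_ROUND_UPDATE_2W computes two words at a time. rotr64() is the
 * illustrative helper from the sketch above; w[] is a rolling 16-word window.
 *
 *   // W[i] = SSIG1(W[i-2]) + W[i-7] + SSIG0(W[i-15]) + W[i-16]
 *   static inline uint64_t Sha512NextWRef(const uint64_t w[16], unsigned i)
 *   {
 *       uint64_t w2  = w[(i - 2u) & 15u],  w7  = w[(i - 7u) & 15u];
 *       uint64_t w15 = w[(i - 15u) & 15u], w16 = w[(i - 16u) & 15u];
 *       uint64_t ssig0 = rotr64(w15, 1) ^ rotr64(w15, 8) ^ (w15 >> 7);
 *       uint64_t ssig1 = rotr64(w2, 19) ^ rotr64(w2, 61) ^ (w2 >> 6);
 *       return ssig1 + w7 + ssig0 + w16;
 *   }
 */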

/**
 *  Function description: Performs 80 rounds of compression per block on the input data and updates the hash value.
 *  Function prototype: void SHA512CompressMultiBlocks(uint64_t hash[8], const uint8_t *in, uint32_t num);
 *  Input registers:
 *         rdi: pointer to the hash value (hash[8])
 *         rsi: pointer to the input data
 *         rdx: number of 128-byte blocks to compress, i.e. the input length divided by 128
 *  Register usage: ymm0-ymm7 hold the message words of two data blocks (one block per 128-bit lane).
 *                  ymm8-ymm15 are temporary vector registers.
 *                  r8-r15 hold the working variables a-h.
 *                  The stack temporarily stores W[t]+K512[t] (1280 bytes) and the hash, in and num values.
 *  Output registers: None
 *  Function/Macro Call: TWO_ROUND_UPDATE_2W, ONE_ROUND
 *
 *  A scalar C reference of the per-block flow is sketched below.
 */
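
/*
 * For reference only (not assembled): the per-block flow this routine implements, written as
 * plain scalar C using the illustrative helpers sketched above. K512[] here denotes the plain
 * 80-entry constant table (g_k512 above stores every pair twice for the two-lane layout), and
 * the function name is hypothetical.
 *
 *   void Sha512CompressRef(uint64_t hash[8], const uint8_t *in, uint32_t num)
 *   {
 *       while (num-- > 0) {
 *           uint64_t w[16], s[8];
 *           for (int i = 0; i < 16; i++) {          // big-endian load of the 16 message words
 *               w[i] = 0;
 *               for (int j = 0; j < 8; j++) {
 *                   w[i] = (w[i] << 8) | in[i * 8 + j];
 *               }
 *           }
 *           for (int i = 0; i < 8; i++) {           // working variables start from the current hash
 *               s[i] = hash[i];
 *           }
 *           for (int t = 0; t < 80; t++) {
 *               uint64_t wt = (t < 16) ? w[t] : (w[t & 15] = Sha512NextWRef(w, (unsigned)t));
 *               Sha512RoundRef(s, K512[t] + wt);
 *           }
 *           for (int i = 0; i < 8; i++) {           // feed the compressed state back into the hash
 *               hash[i] += s[i];
 *           }
 *           in += 128;
 *       }
 *   }
 */
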
    .text
    .balign 16
    .global SHA512CompressMultiBlocks
    .type SHA512CompressMultiBlocks, %function
SHA512CompressMultiBlocks:
.cfi_startproc
    cmp $0, %rdx
    je .Lsha512end

    pushq %rbx
    pushq %rbp
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    mov %rsp, %r14
    sub $1320, %rsp
    and $-256, %rsp     // align the stack to a 256-byte boundary (satisfies the 32-byte alignment required by vmovdqa)
    mov %r14, SHA512_rsp(%rsp) // save the original rsp on the stack

    /* load A-H */
    mov 0(%rdi), %r8
    mov 8(%rdi), %r9
    mov 16(%rdi), %r10
    mov 24(%rdi), %r11
    mov 32(%rdi), %r12
    mov 40(%rdi), %r13
    mov 48(%rdi), %r14
    mov 56(%rdi), %r15

    mov %rdi, SHA512_hash(%rsp)
    mov %rsi, SHA512_in(%rsp) // save the input data address on the stack

.Lsha512_loop:
    mov SHA512_in(%rsp), %rsi

    /* Load the current block into the lower 128 bits of the ymm registers. */
    vmovdqu 0(%rsi), %xmm0
    vmovdqu 16(%rsi), %xmm1
    vmovdqu 32(%rsi), %xmm2
    vmovdqu 48(%rsi), %xmm3
    vmovdqu 64(%rsi), %xmm4
    vmovdqu 80(%rsi), %xmm5
    vmovdqu 96(%rsi), %xmm6
    vmovdqu 112(%rsi), %xmm7

    mov %rsi, %rcx
    add $128, %rsi
    cmp $1, %rdx
    cmovne %rsi, %rcx // if num is greater than 1, rcx points to the next block

    mov %rdx, SHA512_num(%rsp) // save the remaining block count on the stack

    /* Load the next block (or the same block again when it is the last one) into the upper 128 bits of the ymm registers. */
    vinserti128 $1, 0(%rcx),  %ymm0, %ymm0
    vinserti128 $1, 16(%rcx), %ymm1, %ymm1
    vinserti128 $1, 32(%rcx), %ymm2, %ymm2
    vinserti128 $1, 48(%rcx), %ymm3, %ymm3
    vinserti128 $1, 64(%rcx), %ymm4, %ymm4
    vinserti128 $1, 80(%rcx), %ymm5, %ymm5
    vinserti128 $1, 96(%rcx), %ymm6, %ymm6
    vinserti128 $1, 112(%rcx), %ymm7, %ymm7
    add $128, %rcx
    mov %rcx, SHA512_in(%rsp)  // save the address of the next unprocessed data on the stack

    vmovdqa g_endianMask + 0(%rip), %ymm8
    leaq g_k512 + 0(%rip), %rdx
    /* Byte-swap each 64-bit word: convert the big-endian message to host (little-endian) order. */
    vpshufb %ymm8, %ymm0, %ymm0
    vpshufb %ymm8, %ymm1, %ymm1
    vpshufb %ymm8, %ymm2, %ymm2
    vpshufb %ymm8, %ymm3, %ymm3
    vpshufb %ymm8, %ymm4, %ymm4
    vpshufb %ymm8, %ymm5, %ymm5
    vpshufb %ymm8, %ymm6, %ymm6
    vpshufb %ymm8, %ymm7, %ymm7
    /* w[0..15] + k */
    vpaddq 0(%rdx), %ymm0, %ymm8
    vpaddq 32(%rdx), %ymm1, %ymm9
    vpaddq 64(%rdx), %ymm2, %ymm10
    vpaddq 96(%rdx), %ymm3, %ymm11
    vpaddq 128(%rdx), %ymm4, %ymm12
    vpaddq 160(%rdx), %ymm5, %ymm13
    vpaddq 192(%rdx), %ymm6, %ymm14
    vpaddq 224(%rdx), %ymm7, %ymm15
    /* store w[0..15] + k on the stack */
    vmovdqa %ymm8, 0(%rsp)
    vmovdqa %ymm9, 32(%rsp)
    vmovdqa %ymm10, 64(%rsp)
    vmovdqa %ymm11, 96(%rsp)
    vmovdqa %ymm12, 128(%rsp)
    vmovdqa %ymm13, 160(%rsp)
    vmovdqa %ymm14, 192(%rsp)
    vmovdqa %ymm15, 224(%rsp)

    movq $4, 1312(%rsp)     // round-loop counter at SHA512_size(%rsp): 4 iterations of 16 rounds with message expansion
    leaq 0(%rsp), %rsi

    mov  %r9, %rcx          // mov  b, TEMP4
    xor  %rbp, %rbp         // xor  TEMP1, TEMP1
    xor  %r10, %rcx         // xor  c, TEMP4
    mov  %r13, %rdi         // mov  f, TEMP5
.Lround00_63:
    leaq 256(%rdx), %rdx

    TWO_ROUND_UPDATE_2W %r8, %r9, %r10, %r11, %r12, %r13, %r14, %r15, 0, %ymm0, %ymm1, %ymm4, %ymm5, %ymm7
    TWO_ROUND_UPDATE_2W %r14, %r15, %r8, %r9, %r10, %r11, %r12, %r13, 32, %ymm1, %ymm2, %ymm5, %ymm6, %ymm0
    TWO_ROUND_UPDATE_2W %r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11, 64, %ymm2, %ymm3, %ymm6, %ymm7, %ymm1
    TWO_ROUND_UPDATE_2W %r10, %r11, %r12, %r13, %r14, %r15, %r8, %r9, 96, %ymm3, %ymm4, %ymm7, %ymm0, %ymm2
    TWO_ROUND_UPDATE_2W %r8, %r9, %r10, %r11, %r12, %r13, %r14, %r15, 128, %ymm4, %ymm5, %ymm0, %ymm1, %ymm3
    TWO_ROUND_UPDATE_2W %r14, %r15, %r8, %r9, %r10, %r11, %r12, %r13, 160, %ymm5, %ymm6, %ymm1, %ymm2, %ymm4
    TWO_ROUND_UPDATE_2W %r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11, 192, %ymm6, %ymm7, %ymm2, %ymm3, %ymm5
    TWO_ROUND_UPDATE_2W %r10, %r11, %r12, %r13, %r14, %r15, %r8, %r9, 224, %ymm7, %ymm0, %ymm3, %ymm4, %ymm6

    leaq 256(%rsi), %rsi
    decq 1312(%rsp)
    jne .Lround00_63

    /* Rounds 64-79 of the first block. Each 32-byte stack entry holds the first lane's W+K at
       offsets 0 and 8 and the second lane's at offsets 16 and 24, hence the offset pattern below. */
    ONE_ROUND %r8, %r9, %r10, %r11, %r12, %r13, %r14, %r15, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 0
    ONE_ROUND %r15, %r8, %r9, %r10, %r11, %r12, %r13, %r14, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 8
    ONE_ROUND %r14, %r15, %r8, %r9, %r10, %r11, %r12, %r13, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 32
    ONE_ROUND %r13, %r14, %r15, %r8, %r9, %r10, %r11, %r12, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 40
    ONE_ROUND %r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 64
    ONE_ROUND %r11, %r12, %r13, %r14, %r15, %r8, %r9, %r10, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 72
    ONE_ROUND %r10, %r11, %r12, %r13, %r14, %r15, %r8, %r9, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 96
    ONE_ROUND %r9, %r10, %r11, %r12, %r13, %r14, %r15, %r8, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 104

    ONE_ROUND %r8, %r9, %r10, %r11, %r12, %r13, %r14, %r15, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 128
    ONE_ROUND %r15, %r8, %r9, %r10, %r11, %r12, %r13, %r14, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 136
    ONE_ROUND %r14, %r15, %r8, %r9, %r10, %r11, %r12, %r13, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 160
    ONE_ROUND %r13, %r14, %r15, %r8, %r9, %r10, %r11, %r12, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 168
    ONE_ROUND %r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 192
    ONE_ROUND %r11, %r12, %r13, %r14, %r15, %r8, %r9, %r10, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 200
    ONE_ROUND %r10, %r11, %r12, %r13, %r14, %r15, %r8, %r9, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 224
    ONE_ROUND %r9, %r10, %r11, %r12, %r13, %r14, %r15, %r8, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 232
    addq %rbp, %r8          // a += BSIG0(a) from last round

    leaq -1024(%rsi), %rsi  // rsi points back to the start of the W+K area
    /* Update the hash value. */
    mov SHA512_hash(%rsp), %rdi
    mov SHA512_num(%rsp), %rdx
    addq 0(%rdi), %r8
    addq 8(%rdi), %r9
    addq 16(%rdi), %r10
    addq 24(%rdi), %r11
    addq 32(%rdi), %r12
    addq 40(%rdi), %r13
    addq 48(%rdi), %r14
    addq 56(%rdi), %r15
    mov %r8, 0(%rdi)
    mov %r9, 8(%rdi)
    mov %r10, 16(%rdi)
    mov %r11, 24(%rdi)
    mov %r12, 32(%rdi)
    mov %r13, 40(%rdi)
    mov %r14, 48(%rdi)
    mov %r15, 56(%rdi)

    cmp $1, %rdx
    je .Lsha512_finish

    /* All 80 rounds of the second block: its W+K values sit at offsets 16 and 24 of each 32-byte stack entry. */
    movq $10, 1312(%rsp)    // loop counter: 10 iterations of 8 rounds

    mov  %r9, %rcx          // mov  b, TEMP4
    xor  %rbp, %rbp         // xor  TEMP1, TEMP1
    xor  %r10, %rcx         // xor  c, TEMP4
    mov  %r13, %rdi         // mov  f, TEMP5
.Lnext_block:
    ONE_ROUND %r8, %r9, %r10, %r11, %r12, %r13, %r14, %r15, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 16
    ONE_ROUND %r15, %r8, %r9, %r10, %r11, %r12, %r13, %r14, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 24
    ONE_ROUND %r14, %r15, %r8, %r9, %r10, %r11, %r12, %r13, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 48
    ONE_ROUND %r13, %r14, %r15, %r8, %r9, %r10, %r11, %r12, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 56
    ONE_ROUND %r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 80
    ONE_ROUND %r11, %r12, %r13, %r14, %r15, %r8, %r9, %r10, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 88
    ONE_ROUND %r10, %r11, %r12, %r13, %r14, %r15, %r8, %r9, %rbp, %rax, %rbx, %rcx, %rdi, %rsi, 112
    ONE_ROUND %r9, %r10, %r11, %r12, %r13, %r14, %r15, %r8, %rbp, %rcx, %rbx, %rax, %rdi, %rsi, 120
    leaq 128(%rsi), %rsi
    decq 1312(%rsp)
    jne .Lnext_block

    addq %rbp, %r8          // a += BSIG0(a) from last round
    leaq -1280(%rsi), %rsi // rsi points back to the start of the W+K area
    /* Update the hash value. */
    mov SHA512_hash(%rsp), %rdi
    addq 0(%rdi), %r8
    addq 8(%rdi), %r9
    addq 16(%rdi), %r10
    addq 24(%rdi), %r11
    addq 32(%rdi), %r12
    addq 40(%rdi), %r13
    addq 48(%rdi), %r14
    addq 56(%rdi), %r15
    mov %r8, 0(%rdi)
    mov %r9, 8(%rdi)
    mov %r10, 16(%rdi)
    mov %r11, 24(%rdi)
    mov %r12, 32(%rdi)
    mov %r13, 40(%rdi)
    mov %r14, 48(%rdi)
    mov %r15, 56(%rdi)

    sub $2, %rdx            // two blocks were processed in this iteration
    jne .Lsha512_loop

.Lsha512_finish:
    mov SHA512_rsp(%rsp), %rsp
    popq %r15
    popq %r14
    popq %r13
    popq %r12
    popq %rbp
    popq %rbx

.Lsha512end:
    ret
.cfi_endproc
    .size SHA512CompressMultiBlocks, .-SHA512CompressMultiBlocks

#endif