/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_SHA256

.file   "sha2_256_x86_64.S"

.set HashAddr, %rdi
.set InAddr, %rsi
.set NUM, %rdx

.set tempFirst, %ebp
.set tempThird, %ebx
.set tempFifth, %edi
.set avx2Temp1, %ymm4
.set avx2Temp2, %ymm5
.set avx2Temp3, %ymm6
.set avx2Temp4, %ymm7
.set avx2Temp5, %ymm10
.set avx2Temp6, %ymm11
.set avx2Temp7, %ymm15

.set BlockFrontMessageW3_0, %xmm0
.set BlockFrontMessageW7_4, %xmm1
.set BlockFrontMessageW11_8, %xmm2
.set BlockFrontMessageW15_12, %xmm3

.set g_maskMerge, %ymm12
.set g_maskShift, %ymm13
.set g_maskTransformEndian, %ymm14

/* Round constants used by SHA-256. For details about the data source, see RFC 4634. */
.section .rodata
.align 64
.type   g_K256, %object
g_K256:
    .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
    .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
    .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
    .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
    .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
    .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
    .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
    .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
    .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
    .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
    .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
    .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
    .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
    .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
    .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
    .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
    .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size    g_K256, .-g_K256

/* Masks used with vpshufb: byte-order transform (offset 0, g_maskTransformEndian),
   merge (offset 32, g_maskMerge), and shift (offset 64, g_maskShift). */
.balign    64
.type    g_mask, %object
g_mask:
    .long   0x00010203,0x04050607, 0x08090a0b,0x0c0d0e0f
    .long   0x00010203,0x04050607, 0x08090a0b,0x0c0d0e0f
    .long   0x03020100,0x0b0a0908, 0xffffffff,0xffffffff
    .long   0x03020100,0x0b0a0908, 0xffffffff,0xffffffff
    .long   0xffffffff,0xffffffff, 0x03020100,0x0b0a0908
    .long   0xffffffff,0xffffffff, 0x03020100,0x0b0a0908
.size   g_mask, .-g_mask

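/*
 * Reference sketch (not assembled): the effect of g_maskTransformEndian (the first 32 bytes
 * of g_mask) when used with vpshufb, shown with AVX2 intrinsics. It byte-swaps every 32-bit
 * word, converting the little-endian input words into the big-endian order SHA-256 expects.
 * The helper name Bswap32x8 is illustrative only.
 *
 *   #include <immintrin.h>
 *
 *   static __m256i Bswap32x8(__m256i v)
 *   {
 *       const __m256i mask = _mm256_setr_epi8(
 *           3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
 *           3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
 *       return _mm256_shuffle_epi8(v, mask);   // same effect as vpshufb with g_maskTransformEndian
 *   }
 */
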
/*
 *   Macro description: Performs the fast message expansion of four words for two blocks at the same time
 *                      and completes four rounds of the compression function for the first block.
 *   Input registers:
 *       WkAddr: Offset into the stack area where wi + kt is stored.
 *       a - h: Intermediate variables of the hash value.
 *   Modified registers: r8d-r15d, ebp, eax, ebx, ecx, edi, ymm0-ymm7, ymm10, ymm11, ymm15
 *   Output registers:
 *          a - h: Values after four rounds of the compression loop.
 *          B3_0: The four newly expanded message words.
 *   Naming convention:
 *          B3_0:   w3-w0
 *          B7_4:   w7-w4
 *          B11_8:  w11-w8
 *          B15_12: w15-w12
 *   Function/Macro calls: None
 *   Implementation description:
 *          ONE_ROUND algorithm:
 *          For t = 0 to 63: T1 = h + BSIG1(e) + CH(e,f,g) + Kt + Wt
 *          T2 = BSIG0(a) + MAJ(a,b,c)
 *          h = g, g = f, f = e, e = d + T1, d = c, c = b, b = a, a = T1 + T2
 *          CH(x, y, z) = (x AND y) XOR ((NOT x) AND z)              CH(e,f,g)
 *          MAJ(a, b, c) = (a AND b) XOR (a AND c) XOR (b AND c)
 *                       = CH(a^b, c, b)
 *                       = ((a XOR b) AND c) XOR ((NOT(a XOR b)) AND b)
 *                       = ((b XOR c) AND (a XOR b)) XOR b
 *          BSIG0(x) = ROTR^2(x) XOR ROTR^13(x) XOR ROTR^22(x)       BSIG0(a)
 *          BSIG1(x) = ROTR^6(x) XOR ROTR^11(x) XOR ROTR^25(x)       BSIG1(e)
 *          Optimization idea: the b xor c needed by MAJ in the next round is the a xor b of the
 *                             current round, so it is carried over to avoid redundant calculation.
 *
 *          UPDATE_4W algorithm:
 *          For t = 0 to 15:    Wt = input w0-w15
 *          For t = 16 to 63:   Wt = SSIG1(W(t-2)) + W(t-7) + SSIG0(W(t-15)) + W(t-16)
 *          SSIG0(x) = ROTR^7(x) XOR ROTR^18(x) XOR SHR^3(x)
 *          SSIG1(x) = ROTR^17(x) XOR ROTR^19(x) XOR SHR^10(x)
 *          Optimization idea: Optimization point 1: Each Wi message word is 32 bits and an xmm register
 *                             is 128 bits wide, so the common operations of four Wi words can be
 *                             performed at the same time (SSIG0, W(t-16), W(t-7)).
 *                             Because of the dependency chain between the Wi values, expanding four
 *                             Wi per step is the best schedule found so far.
 *                             Optimization point 2: A ymm register is 256 bits wide, so two 128-bit
 *                             computations can be performed at the same time, applying the same
 *                             expansion to two blocks at once.
 *          A scalar C reference sketch of the UPDATE_4W recurrence follows the macro body below.
 */
.macro FOUR_ROUND_UPDATE_4W a, b, c, d, e, f, g, h, tempSwitch2, tempSwitch4, WkAddr, B3_0, B7_4, B11_8, B15_12
    vpalignr   $4,\B3_0,\B7_4,avx2Temp1                  // avx2Temp1->w4_1
    add  \WkAddr(%rsp),\h                                // h += Kt + Wt
    and  \e, tempFifth                                   // e&f
    rorx $6, \e, \tempSwitch2                            // ROTR^6(e)
    add  tempFirst, \a                                   // a += BSIG0(a) from last round
    rorx $11, \e, tempThird                              // ROTR^11(e)
    andn \g, \e, tempFirst                               // (~e)&g
    xor  \tempSwitch2, tempThird                         // ROTR^6(e) ^ ROTR^11(e)
    xor  tempFirst, tempFifth                            // CH(e,f,g)
    vpshufd     $250, \B15_12, avx2Temp5
    rorx $25, \e, \tempSwitch2                           // ROTR^25(e)
    add  tempFifth, \h                                   // h += CH(e,f,g)
    xor  \tempSwitch2, tempThird                         // BSIG1(e)
    vpalignr   $4, \B11_8, \B15_12, avx2Temp2            // avx2Temp2->w12_9
    vpslld      $14, avx2Temp1, avx2Temp4                // w4_1 << 14
    rorx $2, \a, tempFirst                               // ROTR^2(a)
    mov  \a, \tempSwitch2                                // a
    add  tempThird, \h                                   // h += BSIG1(e)[h->T1]
    vpsrld      $3, avx2Temp1, avx2Temp3                 // w4_1 >> 3
    rorx $13, \a, tempFifth                              // ROTR^13(a)
    xor  \b, \tempSwitch2                                // b^a for next round b^c
    add  \h, \d                                          // d += T1
    vpsrld      $10, avx2Temp5, avx2Temp6                // >>10
    xor  tempFifth, tempFirst                            // ROTR^2(a) ^ ROTR^13(a)
    and  \tempSwitch2, \tempSwitch4                      // (b^a) & (b^c)
    vpsrld      $7, avx2Temp1, avx2Temp1                 // >>7
    vpaddd      avx2Temp2, \B3_0, \B3_0
    rorx $22, \a, tempThird                              // ROTR^22(a)
    add  4+\WkAddr(%rsp),\g                              // h += Kt + Wt
    xor  \b, \tempSwitch4                                // Maj(a,b,c)
    vpxor       avx2Temp3, avx2Temp4, avx2Temp3          // 3 xor 14
    mov  \e, tempFifth                                   // for next round f
    xor  tempThird, tempFirst                            // BSIG0(a)
    vpsrlq      $17, avx2Temp5, avx2Temp7                // >>17
    add  \tempSwitch4, \h                                // h += Maj(a,b,c)
    and  \d, tempFifth                                   // e&f
    rorx $6, \d, \tempSwitch4
    add  tempFirst, \h                                   // a += BSIG0(a) from last round
    vpxor       avx2Temp3, avx2Temp1, avx2Temp3          // 7xor14xor3
    vpsrlq      $19, avx2Temp5, avx2Temp5                // >>19
    rorx $11, \d, tempThird
    andn \f, \d, tempFirst
    xor  \tempSwitch4, tempThird
    vpsrld      $11, avx2Temp1, avx2Temp1                // >>18 in total (>>7 then >>11)
    xor  tempFirst, tempFifth
    rorx $25, \d, \tempSwitch4
    add  tempFifth, \g
    xor  \tempSwitch4, tempThird
    vpslld      $11, avx2Temp4, avx2Temp4                // <<25 in total (<<14 then <<11)
    rorx $2, \h, tempFirst
    mov  \h, \tempSwitch4
    add  tempThird, \g
    rorx $13, \h, tempFifth
    xor  \a, \tempSwitch4
    vpxor       avx2Temp7, avx2Temp6, avx2Temp7          // 17xor10
    add  \g, \c
    xor  tempFifth, tempFirst
    vpxor       avx2Temp3, avx2Temp1, avx2Temp3          // 7xor14xor3xor18
    and  \tempSwitch4, \tempSwitch2
    rorx $22, \h, tempThird
    add  8+\WkAddr(%rsp),\f
    xor  \a, \tempSwitch2
    vpxor       avx2Temp7, avx2Temp5, avx2Temp7          // 17xor10xor19
    mov  \d, tempFifth
    xor  tempThird, tempFirst
    add  \tempSwitch2, \g
    vpshufb       g_maskMerge, avx2Temp7, avx2Temp7       // SSIG1(w15_14)
    vpxor       avx2Temp3, avx2Temp4, avx2Temp3           // 7xor14xor3xor18xor25
    and  \c, tempFifth
    rorx $6, \c, \tempSwitch2
    add  tempFirst, \g
    rorx $11, \c, tempThird
    vpaddd      avx2Temp3, \B3_0, \B3_0                   // SSIG0 + w(t-16) + w(t-7)
    andn \e, \c, tempFirst
    xor  \tempSwitch2, tempThird
    xor  tempFirst, tempFifth
    rorx $25, \c, \tempSwitch2
    add  tempFifth, \f
    xor  \tempSwitch2, tempThird
    rorx $2, \g, tempFirst
    mov  \g, \tempSwitch2
    add  tempThird, \f
    rorx $13, \g, tempFifth
    vpaddd      \B3_0, avx2Temp7, \B3_0                   // w17_16
    xor  \h, \tempSwitch2
    add  \f, \b
    xor  tempFifth, tempFirst
    and  \tempSwitch2, \tempSwitch4
    vpshufd       $80, \B3_0, avx2Temp1
    rorx $22, \g, tempThird
    add  12+\WkAddr(%rsp),\e
    xor  \h, \tempSwitch4
    mov  \c, tempFifth
    xor  tempThird, tempFirst
    add  \tempSwitch4, \f
    vpsrld       $10, avx2Temp1, avx2Temp2                 // >>10
    and  \b, tempFifth
    rorx $6, \b, \tempSwitch4
    vpsrlq       $17, avx2Temp1, avx2Temp3                 // >>17
    add  tempFirst, \f
    rorx $11, \b, tempThird
    andn \d, \b, tempFirst
    xor  \tempSwitch4, tempThird
    vpsrlq     $19, avx2Temp1, avx2Temp1                   // >>19
    xor  tempFirst, tempFifth
    rorx $25, \b, \tempSwitch4
    add  tempFifth, \e
    xor  \tempSwitch4, tempThird
    vpxor       avx2Temp2, avx2Temp3, avx2Temp3            // 10xor17
    rorx $2, \f, tempFirst
    mov  \f, \tempSwitch4
    add  tempThird, \e
    rorx $13, \f, tempFifth
    xor  \g, \tempSwitch4
    vpxor       avx2Temp3, avx2Temp1, avx2Temp3            // 10xor17xor19
    add  \e, \a
    xor  tempFifth, tempFirst
    and  \tempSwitch4, \tempSwitch2
    rorx $22, \f, tempThird
    vpshufb       g_maskShift, avx2Temp3, avx2Temp3        // SSIG1(w17_16) moved to the desired lanes
    xor  \g, \tempSwitch2
    mov  \b, tempFifth
    xor  tempThird, tempFirst
    add  \tempSwitch2, \e
    vpaddd       avx2Temp3, \B3_0, \B3_0                    // w19_16
.endm
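/*
 * Reference sketch (not assembled): the message expansion computed by the UPDATE_4W part of
 * the macro above, written as scalar C. The macro produces four W values per invocation, for
 * two independent blocks at once; the loop below produces the same values one at a time.
 * Helper names such as Sha256Schedule are illustrative and not part of openHiTLS.
 *
 *   #include <stdint.h>
 *
 *   static inline uint32_t Rotr32(uint32_t x, uint32_t n) { return (x >> n) | (x << (32 - n)); }
 *   static inline uint32_t Ssig0(uint32_t x) { return Rotr32(x, 7) ^ Rotr32(x, 18) ^ (x >> 3); }
 *   static inline uint32_t Ssig1(uint32_t x) { return Rotr32(x, 17) ^ Rotr32(x, 19) ^ (x >> 10); }
 *
 *   // Expand W[16..63] from W[0..15] (RFC 4634, Section 6.2).
 *   static void Sha256Schedule(uint32_t w[64])
 *   {
 *       for (int t = 16; t < 64; t++) {
 *           w[t] = Ssig1(w[t - 2]) + w[t - 7] + Ssig0(w[t - 15]) + w[t - 16];
 *       }
 *   }
 */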

/*
 *   Macro description: Processes one round of the hash-value update within the 64-round compression.
 *   Input registers:
 *      WkAddr: Offset into the stack area where wi + kt is stored.
 *       a - h: Intermediate variables of the hash value.
 *   Modified registers: r8d-r15d, ebp, eax, ebx, ecx, edi
 *   Output registers:
 *         a - h: Values after one round of the compression loop.
 *   Function/Macro calls: None
 *          ONE_ROUND algorithm:
 *          For t = 0 to 63: T1 = h + BSIG1(e) + CH(e,f,g) + Kt + Wt
 *          T2 = BSIG0(a) + MAJ(a,b,c)
 *          h = g, g = f, f = e, e = d + T1, d = c, c = b, b = a, a = T1 + T2
 *          CH(x, y, z) = (x AND y) XOR ((NOT x) AND z)              CH(e,f,g)
 *          MAJ(a, b, c) = (a AND b) XOR (a AND c) XOR (b AND c)
 *                       = CH(a^b, c, b)
 *                       = ((a XOR b) AND c) XOR ((NOT(a XOR b)) AND b)
 *                       = ((b XOR c) AND (a XOR b)) XOR b
 *          BSIG0(x) = ROTR^2(x) XOR ROTR^13(x) XOR ROTR^22(x)       BSIG0(a)
 *          BSIG1(x) = ROTR^6(x) XOR ROTR^11(x) XOR ROTR^25(x)       BSIG1(e)
 *          Optimization idea: the b xor c needed by MAJ in the next round is the a xor b of the
 *                             current round, so it is carried over to avoid redundant calculation.
 *          Note: At the end of each round, tempSwitch2 and tempSwitch4 must be swapped for the next round.
 *          A scalar C reference sketch of one round follows the macro body below.
 */
.macro ONE_ROUND         a, b, c, d, e, f, g, h, tempSwitch2, tempSwitch4, WkAddr
    rorx $11, \e, tempThird                          // ROTR^11(e)
    rorx $6, \e, \tempSwitch2                        // ROTR^6(e)
    add  tempFirst, \a                               // a += BSIG0(a) from last round
    and  \e, tempFifth                               // e&f
    andn \g, \e, tempFirst                           // (~e)&g
    xor  \tempSwitch2, tempThird                     // ROTR^6(e) ^ ROTR^11(e)
    add  \WkAddr(%rsp),\h                            // h += Kt + Wt
    xor  tempFirst, tempFifth                        // CH(e,f,g)
    rorx $25, \e, \tempSwitch2                       // ROTR^25(e)
    add  tempFifth, \h                               // h += CH(e,f,g)
    xor  \tempSwitch2, tempThird                     // BSIG1(e)
    rorx $2, \a, tempFirst                           // ROTR^2(a)
    mov  \a, \tempSwitch2                            // a
    leal  (tempThird, \h), \h                        // h += BSIG1(e)[h->T1]
    rorx $13, \a, tempFifth                          // ROTR^13(a)
    xor  \b, \tempSwitch2                            // b^a for next round b^c
    add  \h, \d                                      // d += T1
    xor  tempFifth, tempFirst                        // ROTR^2(a) ^ ROTR^13(a)
    and  \tempSwitch2, \tempSwitch4                  // (b^a) & (b^c)
    rorx $22, \a, tempThird                          // ROTR^22(a)
    xor  \b, \tempSwitch4                            // Maj(a,b,c)
    mov  \e, tempFifth                               // for next round f
    xor  tempThird, tempFirst                        // BSIG0(a)
    add  \tempSwitch4, \h                            // h += Maj(a,b,c)
.endm
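/*
 * Reference sketch (not assembled): one compression round in scalar C, including the Maj
 * optimization used above (the a^b of the current round is reused as the b^c of the next
 * round). Helper names such as Sha256Round are illustrative and not part of openHiTLS;
 * Rotr32 is as in the sketch after FOUR_ROUND_UPDATE_4W.
 *
 *   static inline uint32_t Bsig0(uint32_t x) { return Rotr32(x, 2) ^ Rotr32(x, 13) ^ Rotr32(x, 22); }
 *   static inline uint32_t Bsig1(uint32_t x) { return Rotr32(x, 6) ^ Rotr32(x, 11) ^ Rotr32(x, 25); }
 *   static inline uint32_t Ch(uint32_t x, uint32_t y, uint32_t z) { return (x & y) ^ (~x & z); }
 *
 *   // One round; *bXorC caches b ^ c so that Maj(a,b,c) = ((a^b) & (b^c)) ^ b costs one AND
 *   // and two XORs per round. Initialize *bXorC to s[1] ^ s[2] before round 0.
 *   static void Sha256Round(uint32_t s[8], uint32_t wPlusK, uint32_t *bXorC)
 *   {
 *       uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
 *       uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
 *       uint32_t t1 = h + Bsig1(e) + Ch(e, f, g) + wPlusK;   // wPlusK = Kt + Wt
 *       uint32_t aXorB = a ^ b;
 *       uint32_t maj = (aXorB & *bXorC) ^ b;
 *       uint32_t t2 = Bsig0(a) + maj;
 *       *bXorC = aXorB;                                      // becomes the next round's b ^ c
 *       s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
 *       s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
 *   }
 */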

/*
 *  Function description: Performs the 64-round compression on the input data and updates the hash value.
 *  Function prototype: void SHA256CompressMultiBlocks(uint32_t hash[8], const uint8_t *in, uint32_t num);
 *  Input registers:
 *         rdi: Address where the hash value is stored
 *         rsi: Pointer to the input data (Wi)
 *         rdx: Number of 64-byte blocks to process (each block takes one 64-round compression loop)
 *  Modified registers: rax, rbx, rcx, rdx, rsi, rdi, rbp, r8-r15, ymm0-ymm15
 *  Output registers: None
 *  Function/Macro calls: None
 *  A scalar C reference sketch of this routine's behaviour follows this comment.
 */
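/*
 * Reference sketch (not assembled): the overall behaviour of this routine in scalar C,
 * assuming num counts 64-byte blocks. Sha256Schedule and Sha256Round are the illustrative
 * helpers sketched above; K256[64] stands for the standard SHA-256 round-constant table
 * (each row of g_K256 above is that table with every 16-byte group duplicated for the two
 * 128-bit lanes).
 *
 *   static void Sha256CompressRef(uint32_t hash[8], const uint8_t *in, uint32_t num)
 *   {
 *       for (uint32_t blk = 0; blk < num; blk++, in += 64) {
 *           uint32_t w[64], s[8];
 *           for (int i = 0; i < 16; i++) {                 // big-endian load of the block
 *               w[i] = ((uint32_t)in[4 * i] << 24) | ((uint32_t)in[4 * i + 1] << 16) |
 *                      ((uint32_t)in[4 * i + 2] << 8) | (uint32_t)in[4 * i + 3];
 *           }
 *           Sha256Schedule(w);
 *           for (int i = 0; i < 8; i++) { s[i] = hash[i]; }
 *           uint32_t bXorC = s[1] ^ s[2];
 *           for (int t = 0; t < 64; t++) { Sha256Round(s, w[t] + K256[t], &bXorC); }
 *           for (int i = 0; i < 8; i++) { hash[i] += s[i]; }
 *       }
 *   }
 */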
.text
.globl SHA256CompressMultiBlocks
.type SHA256CompressMultiBlocks,%function
.align 4
SHA256CompressMultiBlocks:
.cfi_startproc
    /* Return directly if there are no blocks to process. */
    cmp $0, NUM
    je .LEND_SHA256

    /* Save the callee-saved registers. */
    pushq %r14
    pushq %rbx
    pushq %rbp
    pushq %r12
    pushq %r13
    pushq %r15

    /* Reserve stack space and align it for 32-byte vector stores,
       save the original rsp on the stack, and load the masks. */
    mov %rsp, %r14
    mov 0(HashAddr), %r8d
    sub $600, %rsp
    vmovdqa g_mask + 0(%rip), g_maskTransformEndian
    mov 4(HashAddr), %r9d
    mov 8(HashAddr), %r10d
    and $-256, %rsp
    vmovdqa g_mask + 64(%rip), g_maskShift
    mov 12(HashAddr), %r11d
    mov %r14, 0(%rsp)

    /* r8d-r15d: a-h */
    mov 16(HashAddr), %r12d
    mov 20(HashAddr), %r13d
    vmovdqa g_mask + 32(%rip), g_maskMerge
    mov 24(HashAddr), %r14d
    mov 28(HashAddr), %r15d

.LEND_SHA256_LOOP:
    mov InAddr, %rcx

    /* Load the data of one block into the lower 128 bits of the ymm registers. */
    vmovdqu 0(InAddr), BlockFrontMessageW3_0
    vmovdqu 16(InAddr), BlockFrontMessageW7_4
    vmovdqu 32(InAddr), BlockFrontMessageW11_8
    vmovdqu 48(InAddr), BlockFrontMessageW15_12

    /* Decide whether a second block is available. */
    leaq 64(InAddr), InAddr
    cmp $1, NUM
    cmovne InAddr, %rcx                   // If num is greater than 1, rcx points to the next block.

    /* Load the data of the other block into the upper 128 bits of the ymm registers. */
    vinserti128 $1, 0(%rcx),  %ymm0, %ymm0
    vinserti128 $1, 16(%rcx), %ymm1, %ymm1
    vpshufb g_maskTransformEndian, %ymm0, %ymm0
    mov NUM, 16(%rsp)
    vinserti128 $1, 32(%rcx), %ymm2, %ymm2
    mov HashAddr, 24(%rsp)
    vpshufb g_maskTransformEndian, %ymm1, %ymm1
    vinserti128 $1, 48(%rcx), %ymm3, %ymm3
    vpshufb g_maskTransformEndian, %ymm2, %ymm2

    add $64, %rcx
    leaq    g_K256(%rip), NUM

    /* Convert from little-endian to big-endian byte order and compute wi + kt into ymm8-ymm11. */
    mov %rcx, 8(%rsp)
    leaq    32(%rsp), %rsp
    vpaddd 0(NUM), %ymm0, %ymm8
    mov %r9d, %ecx
    vpaddd 32(NUM), %ymm1, %ymm9
    vmovdqa %ymm8, 0(%rsp)
    vpshufb g_maskTransformEndian, %ymm3, %ymm3
    xor %ebp, %ebp
    vpaddd 64(NUM), %ymm2, %ymm10
    vmovdqu %ymm9, 32(%rsp)
    xor %r10d, %ecx
    vpaddd 96(NUM), %ymm3, %ymm11
    mov %r13d, %edi
    vmovdqa %ymm10, 64(%rsp)
    vmovdqu %ymm11, 96(%rsp)

.LEND_SHA256_ROUND_00_47:

    /* Next rounds' wi + kt into ymm8-ymm11; 16 rounds of compression plus four 4-word message-expansion steps. */
    /* FOUR_ROUND_UPDATE_4W a, b, c, d, e, f, g, h, tempSwitch2,tempSwitch4, WkAddr,B3_0, B7_4, B11_8, B15_12 */
    FOUR_ROUND_UPDATE_4W %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 0, %ymm0, %ymm1, %ymm2, %ymm3
    leaq 128(NUM), NUM
    FOUR_ROUND_UPDATE_4W %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 32, %ymm1, %ymm2, %ymm3, %ymm0
    vpaddd 0(NUM), %ymm0, %ymm8
    FOUR_ROUND_UPDATE_4W   %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 64, %ymm2, %ymm3, %ymm0, %ymm1
    vpaddd 32(NUM), %ymm1, %ymm9
    vmovdqa %ymm8, 128(%rsp)
    FOUR_ROUND_UPDATE_4W %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 96, %ymm3, %ymm0, %ymm1, %ymm2
    vpaddd 64(NUM), %ymm2, %ymm10
    vmovdqa %ymm9, 160(%rsp)
    vpaddd 96(NUM), %ymm3, %ymm11
    vmovdqu %ymm10, 192(%rsp)
    vmovdqa %ymm11, 224(%rsp)

    /* Next rounds' wi + kt into ymm8-ymm11; 16 rounds of compression plus four 4-word message-expansion steps. */
    /* FOUR_ROUND_UPDATE_4W a, b, c, d, e, f, g, h, tempSwitch2,tempSwitch4, WkAddr,B19_16, B23_20, B27_24, B31_28 */
    FOUR_ROUND_UPDATE_4W %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 128, %ymm0, %ymm1, %ymm2, %ymm3
    leaq 128(NUM), NUM
    FOUR_ROUND_UPDATE_4W %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 160, %ymm1, %ymm2, %ymm3, %ymm0
    vpaddd 0(NUM), %ymm0, %ymm8
    FOUR_ROUND_UPDATE_4W   %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 192, %ymm2, %ymm3, %ymm0, %ymm1
    vpaddd 32(NUM), %ymm1, %ymm9
    vmovdqa %ymm8, 256(%rsp)
    FOUR_ROUND_UPDATE_4W %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 224, %ymm3, %ymm0, %ymm1, %ymm2
    vpaddd 64(NUM), %ymm2, %ymm10
    vmovdqa %ymm9, 288(%rsp)
    vpaddd 96(NUM), %ymm3, %ymm11
    vmovdqu %ymm10, 320(%rsp)
    vmovdqa %ymm11, 352(%rsp)

    /* Next rounds' wi + kt into ymm8-ymm11; 16 rounds of compression plus four 4-word message-expansion steps. */
    /* FOUR_ROUND_UPDATE_4W a, b, c, d, e, f, g, h, tempSwitch2,tempSwitch4, WkAddr,B35_32, B39_36, B43_40, B47_44 */
    FOUR_ROUND_UPDATE_4W %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 256, %ymm0, %ymm1, %ymm2, %ymm3
    leaq 128(NUM), NUM
    FOUR_ROUND_UPDATE_4W %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 288, %ymm1, %ymm2, %ymm3, %ymm0
    vpaddd 0(NUM), %ymm0, %ymm8
    FOUR_ROUND_UPDATE_4W   %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 320, %ymm2, %ymm3, %ymm0, %ymm1
    vpaddd 32(NUM), %ymm1, %ymm9
    vmovdqa %ymm8, 384(%rsp)
    FOUR_ROUND_UPDATE_4W %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 352, %ymm3, %ymm0, %ymm1, %ymm2
    vpaddd 64(NUM), %ymm2, %ymm10
    vmovdqa %ymm9, 416(%rsp)
    vpaddd 96(NUM), %ymm3, %ymm11
    vmovdqu %ymm10, 448(%rsp)
    vmovdqa %ymm11, 480(%rsp)

.LEND_SHA256_ROUND_48_63:
    /* ONE_ROUND a, b, c, d, e, f, g, h, tempSwitch2, tempSwitch4, WkAddr */
    ONE_ROUND   %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 384
    ONE_ROUND   %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 388
    ONE_ROUND   %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 392
    ONE_ROUND   %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 396

    ONE_ROUND   %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 416
    ONE_ROUND   %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 420
    ONE_ROUND   %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 424
    ONE_ROUND   %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 428

    ONE_ROUND   %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 448
    ONE_ROUND   %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 452
    ONE_ROUND   %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 456
    ONE_ROUND   %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 460

    ONE_ROUND   %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 480
    ONE_ROUND   %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 484
    ONE_ROUND   %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 488
    ONE_ROUND   %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 492

    sub    $32, %rsp
    add %ebp, %r8d                      // a += BSIG0(a) from last round
    mov 24(%rsp), HashAddr

    /* Update the stored hash value. */
    add 0(HashAddr), %r8d
    add 4(HashAddr), %r9d
    mov %r8d, 0(HashAddr)
    add 8(HashAddr), %r10d
    mov %r9d, 4(HashAddr)
    add 12(HashAddr), %r11d
    mov %r10d, 8(HashAddr)
    add 16(HashAddr), %r12d
    mov 16(%rsp), NUM
    mov %r11d, 12(HashAddr)
    add 20(HashAddr), %r13d
    mov %r12d, 16(HashAddr)
    add 24(HashAddr), %r14d
    mov %r13d, 20(HashAddr)
    add 28(HashAddr), %r15d
    mov %r14d, 24(HashAddr)
    mov %r15d, 28(HashAddr)

    cmp $1, NUM
    je .LEND_SHA256_FINFISH_INITIAL

    /* Compress the data of the second block. */
    xor %ebp, %ebp
    mov %r9d, %ecx
    xor %r10d, %ecx
    mov %r13d, %edi

.LEND_SHA256_NEXT_BLOCK:
    /* 0-15 */
    /* ONE_ROUND a,   b,    c,     d,     e,     f,    g,       h,    tempSwitch2,tempSwitch4, WkAddr */
    ONE_ROUND   %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 16+32
    ONE_ROUND   %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 20+32
    ONE_ROUND   %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 24+32
    ONE_ROUND   %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 28+32

    ONE_ROUND   %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 48+32
    ONE_ROUND   %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 52+32
    ONE_ROUND   %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 56+32
    ONE_ROUND   %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 60+32

    ONE_ROUND   %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 80+32
    ONE_ROUND   %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 84+32
    ONE_ROUND   %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 88+32
    ONE_ROUND   %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 92+32

    ONE_ROUND   %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 112+32
    ONE_ROUND   %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 116+32
    ONE_ROUND   %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 120+32
    ONE_ROUND   %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 124+32

    /* 16-31 */
    ONE_ROUND   %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 16+128+32
    ONE_ROUND   %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 20+128+32
    ONE_ROUND   %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 24+128+32
    ONE_ROUND   %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 28+128+32

    ONE_ROUND   %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 48+128+32
    ONE_ROUND   %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 52+128+32
    ONE_ROUND   %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 56+128+32
    ONE_ROUND   %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 60+128+32

    ONE_ROUND   %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 80+128+32
    ONE_ROUND   %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 84+128+32
    ONE_ROUND   %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 88+128+32
    ONE_ROUND   %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 92+128+32

    ONE_ROUND   %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 112+128+32
    ONE_ROUND   %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 116+128+32
    ONE_ROUND   %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 120+128+32
    ONE_ROUND   %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 124+128+32

    /* 32-47 */
    ONE_ROUND   %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 16+256+32
    ONE_ROUND   %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 20+256+32
    ONE_ROUND   %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 24+256+32
    ONE_ROUND   %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 28+256+32

    ONE_ROUND   %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 48+256+32
    ONE_ROUND   %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 52+256+32
    ONE_ROUND   %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 56+256+32
    ONE_ROUND   %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 60+256+32

    ONE_ROUND   %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 80+256+32
    ONE_ROUND   %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 84+256+32
    ONE_ROUND   %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 88+256+32
    ONE_ROUND   %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 92+256+32

    ONE_ROUND   %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 112+256+32
    ONE_ROUND   %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 116+256+32
    ONE_ROUND   %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 120+256+32
    ONE_ROUND   %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 124+256+32

    /* 48-63 */
    ONE_ROUND   %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 16+384+32
    ONE_ROUND   %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 20+384+32
    ONE_ROUND   %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 24+384+32
    ONE_ROUND   %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 28+384+32

    ONE_ROUND   %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 48+384+32
    ONE_ROUND   %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 52+384+32
    ONE_ROUND   %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 56+384+32
    ONE_ROUND   %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 60+384+32

    ONE_ROUND   %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %eax, %ecx, 80+384+32
    ONE_ROUND   %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %ecx, %eax, 84+384+32
    ONE_ROUND   %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %r13d, %eax, %ecx, 88+384+32
    ONE_ROUND   %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %r12d, %ecx, %eax, 92+384+32

    ONE_ROUND   %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %r11d, %eax, %ecx, 112+384+32
    ONE_ROUND   %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %r10d, %ecx, %eax, 116+384+32
    ONE_ROUND   %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %r9d, %eax, %ecx, 120+384+32
    ONE_ROUND   %r9d, %r10d, %r11d, %r12d, %r13d, %r14d, %r15d, %r8d, %ecx, %eax, 124+384+32

    mov 24(%rsp), HashAddr
    lea (%ebp, %r8d), %r8d              // a += BSIG0(a) from last round

    /* Update the stored hash value. */
    add 0(HashAddr), %r8d
    add 4(HashAddr), %r9d
    mov %r8d, 0(HashAddr)
    add 8(HashAddr), %r10d
    mov %r9d, 4(HashAddr)
    add 12(HashAddr), %r11d
    mov %r10d, 8(HashAddr)
    add 16(HashAddr), %r12d
    mov %r11d, 12(HashAddr)
    add 20(HashAddr), %r13d
    mov %r12d, 16(HashAddr)
    mov 8(%rsp), InAddr
    add 24(HashAddr), %r14d
    mov %r13d, 20(HashAddr)
    mov 16(%rsp), NUM
    add 28(HashAddr), %r15d
    mov %r14d, 24(HashAddr)
    mov %r15d, 28(HashAddr)

    sub $2, NUM
    ja .LEND_SHA256_LOOP

.LEND_SHA256_FINFISH_INITIAL:
    /* Restore the stack pointer and the callee-saved registers. */
    mov 0(%rsp), %rsp
    popq %r15
    popq %r13
    popq %r12
    popq %rbp
    popq %rbx
    popq %r14

.LEND_SHA256:
    ret
.cfi_endproc
    .size   SHA256CompressMultiBlocks, .-SHA256CompressMultiBlocks

#endif