/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

17#include "hitls_build.h"
18#ifdef HITLS_CRYPTO_SM3
19
20#include "crypt_arm.h"
21.arch    armv8-a+crypto
22
// first16: one SM3 compression round for rounds 0..15, where the boolean
// functions are FF(X, Y, Z) = GG(X, Y, Z) = X ^ Y ^ Z.
//
// Arguments (all 32-bit w registers):
//   A..H  working variables as seen by this round
//   W1    message word W[j]
//   W2    message word W[j + 4]; the round uses W1 ^ W2 as the second word
//
// w13 holds the round constant. It is rotated left by one bit (ror #31) at
// the top of every round, so the caller preloads it with the constant for the
// first round rotated right by one bit.
//
// Results are written in place instead of shifting the eight variables:
//   D <- TT1, H <- P0(TT2), B <- B <<< 9, F <- F <<< 19
// so the caller rotates the argument order by one position per round.
//
// Clobbers: w9, w10, w11, w12. Updates: w13.
.macro first16 A:req, B:req, C:req, D:req, E:req, F:req, G:req, H:req, W1:req, W2:req
    ror w13, w13, #31           // Tj <<< 1 for this round
    ror w10, \A, #20            // w10 = A <<< 12
    add w9, \E, w10
    eor w12, \E, \F
    ror \F, \F, #13             // F <<< 19 (old F already consumed above)
    eor w12, w12, \G            // w12 = GG(E, F, G)
    add w12, w12, \H
    add w9, w9, w13
    ror w9, w9, #25             // w9 = SS1 = ((A <<< 12) + E + Tj) <<< 7
    add w12, w12, w9
    eor w10, w10, w9            // w10 = SS2 = SS1 ^ (A <<< 12)
    add w12, w12, \W1           // w12 = TT2
    eor \H, w12, w12, ror #23   // P0(TT2) = TT2 ^ (TT2 <<< 9) ^ (TT2 <<< 17)
    ror w9, w12, #15
    eor \H, \H, w9
    eor w11, \A, \B
    ror \B, \B, #23             // B <<< 9 (old B already consumed above)
    eor w11, w11, \C            // w11 = FF(A, B, C)
    add w11, w11, \D
    add w11, w11, w10
    eor w9, \W1, \W2            // second message word: W[j] ^ W[j + 4]
    add \D, w11, w9             // D = TT1
.endm
48
// second48: one SM3 compression round for rounds 16..63, where
//   FF(X, Y, Z) = (X & Y) | (X & Z) | (Y & Z)   computed as (X & (Y | Z)) | (Y & Z)
//   GG(X, Y, Z) = (X & Y) | (~X & Z)            computed as ((Y ^ Z) & X) ^ Z
//
// Arguments (all 32-bit w registers):
//   A..H  working variables as seen by this round
//   W1    message word W[j]
//   W2    message word W[j + 4]; the round uses W1 ^ W2 as the second word
//
// w13 holds the round constant. It is rotated left by one bit (ror #31) at
// the top of every round, so the caller preloads it with the constant for the
// first round rotated right by one bit.
//
// Results are written in place instead of shifting the eight variables:
//   D <- TT1, H <- P0(TT2), B <- B <<< 9, F <- F <<< 19
// so the caller rotates the argument order by one position per round.
//
// Clobbers: w9, w10, w11, w12, w14. Updates: w13.
.macro second48 A:req, B:req, C:req, D:req, E:req, F:req, G:req, H:req, W1:req, W2:req
    ror w13, w13, #31           // Tj <<< 1 for this round
    orr w11, \B, \C
    eor w12, \F, \G
    ror \F, \F, #13             // F <<< 19 (old F already consumed above)
    ror w10, \A, #20            // w10 = A <<< 12
    add w9, w10, \E
    and w14, \A, w11            // w14 = A & (B | C)
    and w12, w12, \E
    eor w12, w12, \G            // w12 = GG(E, F, G)
    add w12, w12, \H
    add w9, w9, w13
    ror w9, w9, #25             // w9 = SS1 = ((A <<< 12) + E + Tj) <<< 7
    add w12, w12, w9
    eor w10, w10, w9            // w10 = SS2 = SS1 ^ (A <<< 12)
    add w12, w12, \W1           // w12 = TT2
    and w11, \B, \C
    ror \B, \B, #23             // B <<< 9 (old B already consumed above)
    orr w11, w11, w14           // w11 = FF(A, B, C)
    eor w9, \W1, \W2            // second message word: W[j] ^ W[j + 4]
    add w11, w11, \D
    add w11, w11, w10
    add \D, w11, w9             // D = TT1
    eor \H, w12, w12, ror #23   // P0(TT2) = TT2 ^ (TT2 <<< 9) ^ (TT2 <<< 17)
    ror w9, w12, #15
    eor \H, \H, w9
.endm
77
// void SM3_CompressAsm(uint32_t state[8], const uint8_t *data, uint32_t blockCnt);
//
// Runs the SM3 compression function over blockCnt consecutive 64-byte blocks
// starting at data and folds each block into state, which is updated in place.
//
// In:
//   x0  state     eight 32-bit chaining words
//   x1  data      input, read 64 bytes per block
//   w2  blockCnt  number of blocks, treated as unsigned; 0 returns with state untouched
//
// Register roles inside the function:
//   w0-w7     working variables A..H (hold the chaining value between blocks)
//   w9-w14    scratch of the first16 / second48 round macros (w13 = round constant)
//   w15, w20  W[j] for the next two rounds
//   w19, w21  W[j + 4] for the next two rounds
//   x22       state pointer, x23 data pointer, w24 blocks left
//   x25, x26  two 64-byte stack buffers holding expanded message words
//   v0-v3     W0..W15, v4-v21 expanded words (three valid words per vector)
//   v21-v30   temporaries of the message expansion
//
// Stack frame: 128 bytes of saved x19-x26 / d8-d15 plus 2 x 64 bytes of scratch.
.globl SM3_CompressAsm
.type  SM3_CompressAsm, %function
.align 4
SM3_CompressAsm:
AARCH64_PACIASP
    sub sp, sp, #128
    stp x19, x20, [sp]
    stp x21, x22, [sp, #16]
    stp x23, x24, [sp, #32]
    stp x25, x26, [sp, #48]
    // v8-v15 are used below; their low 64 bits are callee-saved under AAPCS64.
    stp d8, d9, [sp, #64]
    stp d10, d11, [sp, #80]
    stp d12, d13, [sp, #96]
    stp d14, d15, [sp, #112]

    // Two 64-byte scratch buffers for expanded message words: x25 (upper), x26 (lower).
    sub sp, sp, #64
    mov x25, sp
    sub sp, sp, #64
    mov x26, sp

    mov x22, x0 // x22: state
    mov x23, x1 // x23: data
    mov w24, w2 // w24: blockCnt

    // w0-w7: ABCDEFGH word register in "SM3 cryptographic hash algorithm"
    ldp w0, w1, [x22]
    ldp w2, w3, [x22, #8]
    ldp w4, w5, [x22, #16]
    ldp w6, w7, [x22, #24]

    prfm pldl1keep, [x23, #64]
.Lsm3_block_loop:
    // blockCnt is unsigned: leave only when no blocks remain.
    cbz w24, .Lsm3_done
    sub w24, w24, #1
    // Due to the SM3 feature, only three messages can be extended in parallel.
    // You need to use ext to ensure that the data meets the requirements for calculation.
    // To reduce the delay, the message expansion is calculated together with the compression function,
    // and the compression function is calculated three times for every three Ws.

    // v0-v3 message group w0-w15
    ld1 {v0.4s-v3.4s}, [x23]
#ifndef HITLS_BIG_ENDIAN
    rev32 v0.16B, v0.16B
    rev32 v1.16B, v1.16B
    rev32 v2.16B, v2.16B
    rev32 v3.16B, v3.16B
#endif

    ldp w15, w20, [x23]
    ldp w19, w21, [x23, #16]
#ifndef HITLS_BIG_ENDIAN
    rev w15, w15
    rev w19, w19
    rev w20, w20
    rev w21, w21
#endif

    // 1:16, 17, 18
    ext v24.16b, v3.16b, v3.16b, #4   // 13, 14, 15
    ext v25.16b, v0.16b, v1.16b, #12  // 3, 4, 5
    ext v23.16b, v1.16b, v2.16b, #12  // 7, 8, 9
    ext v26.16b, v2.16b, v3.16b, #8   // 10, 11, 12
    eor v27.16b, v0.16b, v23.16b
    // w13: constant Tj, 0 <= j <= 15 (0x79cc4519 rotated right by one bit;
    // the round macros rotate it left by one before each use)
    mov w13, #0x228c
    movk w13, #0xbce6, lsl #16

    // Message grouping: W(j-3) <<< 15, W(j-13) <<< 7
    shl v21.4s, v24.4s, #15
    shl v22.4s, v25.4s, #7
    sri v21.4s, v24.4s, #17  // 13, 14, 15<<<15
    sri v22.4s, v25.4s, #25  // 3, 4, 5<<<7
    first16 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v26.16b
    first16 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21
    // permutation function P1: X ^ (X <<< 15) ^ (X <<< 23)
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    ldp w15, w20, [x23, #8]
    ldp w19, w21, [x23, #24]
#ifndef HITLS_BIG_ENDIAN
    rev w15, w15
    rev w19, w19
    rev w20, w20
    rev w21, w21
#endif
    eor v27.16b, v27.16b, v30.16b
    first16 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19
    eor v4.16b, v27.16b, v28.16b

    // 2:19, 20, 21
    ext v23.16b, v1.16b, v2.16b, #8  // 6, 7, 8
    eor v27.16b, v25.16b, v26.16b
    first16 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21
    shl v21.4s, v4.4s, #15
    shl v22.4s, v23.4s, #7
    sri v21.4s, v4.4s, #17   // 16, 17, 18<<<15
    sri v22.4s, v23.4s, #25  // 6, 7, 8<<<7
    ldp w15, w20, [x23, #16]
    ldp w19, w21, [x23, #32]
#ifndef HITLS_BIG_ENDIAN
    rev w15, w15
    rev w19, w19
    rev w20, w20
    rev w21, w21
#endif
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v24.16b
    first16 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    first16 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21
    eor v27.16b, v27.16b, v30.16b
    mov v4.s[3], v4.s[2]  // Due to ext requirements, to fill s[3]
    eor v5.16b, v27.16b, v28.16b

    // 3:22, 23, 24
    ext v25.16b, v2.16b, v3.16b, #4  // 9, 10, 11
    eor v27.16b, v23.16b, v24.16b
    ldp w15, w20, [x23, #24]
    ldp w19, w21, [x23, #40]
#ifndef HITLS_BIG_ENDIAN
    rev w15, w15
    rev w19, w19
    rev w20, w20
    rev w21, w21
#endif
    shl v21.4s, v5.4s, #15
    shl v22.4s, v25.4s, #7
    sri v21.4s, v5.4s, #17   // 19, 20, 21<<<15
    sri v22.4s, v25.4s, #25  // 9, 10, 11<<<7
    first16 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v4.16b
    first16 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    ldp w15, w20, [x23, #32]
    ldp w19, w21, [x23, #48]
#ifndef HITLS_BIG_ENDIAN
    rev w15, w15
    rev w19, w19
    rev w20, w20
    rev w21, w21
#endif
    first16 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19
    eor v27.16b, v27.16b, v30.16b
    mov v5.s[3], v5.s[2]  // Due to ext requirements, to fill s[3]
    eor v6.16b, v27.16b, v28.16b

    // 4:25, 26, 27
    eor v27.16b, v25.16b, v4.16b
    shl v21.4s, v6.4s, #15
    shl v22.4s, v3.4s, #7
    sri v21.4s, v6.4s, #17  // 22, 23, 24<<<15
    sri v22.4s, v3.4s, #25  // 12, 13, 14<<<7
    first16 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v5.16b
    ldp w15, w20, [x23, #40]
    ldp w19, w21, [x23, #56]
#ifndef HITLS_BIG_ENDIAN
    rev w15, w15
    rev w19, w19
    rev w20, w20
    rev w21, w21
#endif
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    first16 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19
    eor v27.16b, v27.16b, v29.16b
    first16 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21
    eor v27.16b, v27.16b, v30.16b
    mov v6.s[3], v6.s[2]  // Due to ext requirements, to fill s[3]
    eor v7.16b, v27.16b, v28.16b

    // 5:28, 29, 30
    ext v23.16b, v3.16b, v4.16b, #12  // 15, 16, 17
    eor v27.16b, v3.16b, v5.16b
    // Each stored vector carries three valid words; the fourth 32-bit slot is
    // filler, so the scalar loads from the buffer skip offsets 12, 28, 44 and 60.
    st1 {v4.4s-v7.4s}, [x25]
    shl v21.4s, v7.4s, #15
    shl v22.4s, v23.4s, #7
    sri v21.4s, v7.4s, #17   // 25, 26, 27<<<15
    sri v22.4s, v23.4s, #25  // 15, 16, 17<<<7
    ldp w15, w20, [x23, #48]
    ldp w19, w21, [x25]      // expanded words are already in host order: no rev
#ifndef HITLS_BIG_ENDIAN
    rev w15, w15
    rev w20, w20
#endif
    first16 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v6.16b
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    first16 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21
    ldp w15, w20, [x23, #56]
#ifndef HITLS_BIG_ENDIAN
    rev w15, w15
    rev w20, w20
#endif
    // Last read of this block's input: advance to the next block and prefetch past it.
    add x23, x23, #64
    prfm pldl1keep, [x23, #64]
    ldr w19, [x25, #8]
    ldr w21, [x25, #16]
    eor v27.16b, v27.16b, v30.16b
    first16 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19
    mov v7.s[3], v7.s[2]  // Due to ext requirements, to fill s[3]
    eor v8.16b, v27.16b, v28.16b

    // 6:31, 32, 33
    // The first 16 rounds finish in this group; rounds 16..63 use second48.
    ext v24.16b, v4.16b, v5.16b, #12  // 18, 19, 20
    eor v27.16b, v23.16b, v6.16b
    first16 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21
    shl v21.4s, v8.4s, #15
    shl v22.4s, v24.4s, #7
    sri v21.4s, v8.4s, #17   // 28, 29, 30<<<15
    sri v22.4s, v24.4s, #25  // 18, 19, 20<<<7
    ldp w15, w20, [x25]
    ldp w19, w21, [x25, #20]
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v7.16b
    // w13: constant Tj, 16 <= j <= 63 ((0x7a879d8a <<< 16) rotated right by one bit)
    mov w13, #0x3d43
    movk w13, #0xcec5, lsl #16
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21
    eor v27.16b, v27.16b, v30.16b
    mov v8.s[3], v8.s[2]  // Due to ext requirements, to fill s[3]
    eor v9.16b, v27.16b, v28.16b

    // 7:34, 35, 36
    ext v23.16b, v5.16b, v6.16b, #12  // 21, 22, 23
    eor v27.16b, v24.16b, v7.16b
    ldr w15, [x25, #8]
    ldr w20, [x25, #16]
    ldp w19, w21, [x25, #32]
    shl v21.4s, v9.4s, #15
    shl v22.4s, v23.4s, #7
    sri v21.4s, v9.4s, #17   // 31, 32, 33<<<15
    sri v22.4s, v23.4s, #25  // 21, 22, 23<<<7
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v8.16b
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    ldp w15, w20, [x25, #20]
    ldr w19, [x25, #40]
    ldr w21, [x25, #48]
    eor v27.16b, v27.16b, v30.16b
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19
    mov v9.s[3], v9.s[2]  // Due to ext requirements, to fill s[3]
    eor v10.16b, v27.16b, v28.16b

    // 8:37, 38, 39
    ext v24.16b, v6.16b, v7.16b, #12  // 24, 25, 26
    eor v27.16b, v23.16b, v8.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21
    shl v21.4s, v10.4s, #15
    shl v22.4s, v24.4s, #7
    sri v21.4s, v10.4s, #17  // 34, 35, 36<<<15
    sri v22.4s, v24.4s, #25  // 24, 25, 26<<<7
    ldp w15, w20, [x25, #32]
    ldp w19, w21, [x25, #52]
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v9.16b
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21
    eor v27.16b, v27.16b, v30.16b
    mov v10.s[3], v10.s[2]  // Due to ext requirements, to fill s[3]
    eor v11.16b, v27.16b, v28.16b

    // 9:40, 41, 42
    ext v23.16b, v7.16b, v8.16b, #12  // 27, 28, 29
    eor v27.16b, v24.16b, v9.16b
    st1 {v8.4s-v11.4s}, [x26]
    shl v21.4s, v11.4s, #15
    shl v22.4s, v23.4s, #7
    sri v21.4s, v11.4s, #17  // 37, 38, 39<<<15
    sri v22.4s, v23.4s, #25  // 27, 28, 29<<<7
    ldr w15, [x25, #40]
    ldr w20, [x25, #48]
    ldp w19, w21, [x26]
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v10.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    ldp w15, w20, [x25, #52]
    ldr w19, [x26, #8]
    ldr w21, [x26, #16]
    eor v27.16b, v27.16b, v30.16b
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19
    mov v11.s[3], v11.s[2]  // Due to ext requirements, to fill s[3]
    eor v12.16b, v27.16b, v28.16b

    // 10:43, 44, 45
    ext v24.16b, v8.16b, v9.16b, #12  // 30, 31, 32
    eor v27.16b, v23.16b, v10.16b
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21
    shl v21.4s, v12.4s, #15
    shl v22.4s, v24.4s, #7
    sri v21.4s, v12.4s, #17  // 40, 41, 42<<<15
    sri v22.4s, v24.4s, #25  // 30, 31, 32<<<7
    ldp w15, w20, [x26]
    ldp w19, w21, [x26, #20]
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v11.16b
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21
    eor v27.16b, v27.16b, v30.16b
    mov v12.s[3], v12.s[2]  // Due to ext requirements, to fill s[3]
    eor v13.16b, v27.16b, v28.16b

    // 11:46, 47, 48
    ext v23.16b, v9.16b, v10.16b, #12  // 33, 34, 35
    eor v27.16b, v24.16b, v11.16b
    ldr w15, [x26, #8]
    ldr w20, [x26, #16]
    ldp w19, w21, [x26, #32]
    shl v21.4s, v13.4s, #15
    shl v22.4s, v23.4s, #7
    sri v21.4s, v13.4s, #17  // 43, 44, 45<<<15
    sri v22.4s, v23.4s, #25  // 33, 34, 35<<<7
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v12.16b
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    ldp w15, w20, [x26, #20]
    ldr w19, [x26, #40]
    ldr w21, [x26, #48]
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19
    eor v27.16b, v27.16b, v30.16b
    mov v13.s[3], v13.s[2]  // Due to ext requirements, to fill s[3]
    eor v14.16b, v27.16b, v28.16b

    // 12:49, 50, 51
    ext v24.16b, v10.16b, v11.16b, #12  // 36, 37, 38
    eor v27.16b, v23.16b, v12.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21
    shl v21.4s, v14.4s, #15
    shl v22.4s, v24.4s, #7
    sri v21.4s, v14.4s, #17  // 46, 47, 48<<<15
    sri v22.4s, v24.4s, #25  // 36, 37, 38<<<7
    ldp w15, w20, [x26, #32]
    ldp w19, w21, [x26, #52]
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v13.16b
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21
    eor v27.16b, v27.16b, v30.16b
    mov v14.s[3], v14.s[2]  // Due to ext requirements, to fill s[3]
    eor v15.16b, v27.16b, v28.16b

    // 13:52, 53, 54
    ext v23.16b, v11.16b, v12.16b, #12  // 39, 40, 41
    eor v27.16b, v24.16b, v13.16b
    st1 {v12.4s-v15.4s}, [x25]
    shl v21.4s, v15.4s, #15
    shl v22.4s, v23.4s, #7
    sri v21.4s, v15.4s, #17  // 49, 50, 51<<<15
    sri v22.4s, v23.4s, #25  // 39, 40, 41<<<7
    ldr w15, [x26, #40]
    ldr w20, [x26, #48]
    ldp w19, w21, [x25]
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v14.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    ldp w15, w20, [x26, #52]
    ldr w19, [x25, #8]
    ldr w21, [x25, #16]
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19
    eor v27.16b, v27.16b, v30.16b
    mov v15.s[3], v15.s[2]  // Due to ext requirements, to fill s[3]
    eor v16.16b, v27.16b, v28.16b

    // 14:55, 56, 57
    ext v24.16b, v12.16b, v13.16b, #12  // 42, 43, 44
    eor v27.16b, v23.16b, v14.16b
    shl v21.4s, v16.4s, #15
    shl v22.4s, v24.4s, #7
    sri v21.4s, v16.4s, #17  // 52, 53, 54<<<15
    sri v22.4s, v24.4s, #25  // 42, 43, 44<<<7
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v15.16b
    ldp w15, w20, [x25]
    ldp w19, w21, [x25, #20]
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19
    eor v27.16b, v27.16b, v30.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21
    mov v16.s[3], v16.s[2]  // Due to ext requirements, to fill s[3]
    eor v17.16b, v27.16b, v28.16b

    // 15:58, 59, 60
    ext v23.16b, v13.16b, v14.16b, #12  // 45, 46, 47
    eor v27.16b, v24.16b, v15.16b
    shl v21.4s, v17.4s, #15
    shl v22.4s, v23.4s, #7
    sri v21.4s, v17.4s, #17  // 55, 56, 57<<<15
    sri v22.4s, v23.4s, #25  // 45, 46, 47<<<7
    ldr w15, [x25, #8]
    ldr w20, [x25, #16]
    ldp w19, w21, [x25, #32]
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v16.16b
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    ldp w15, w20, [x25, #20]
    ldr w19, [x25, #40]
    ldr w21, [x25, #48]
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19
    eor v27.16b, v27.16b, v30.16b
    eor v18.16b, v27.16b, v28.16b

    // 16:61, 62, 63
    ext v24.16b, v14.16b, v15.16b, #12  // 48, 49, 50
    eor v27.16b, v23.16b, v16.16b
    shl v21.4s, v18.4s, #15
    shl v22.4s, v24.4s, #7
    sri v21.4s, v18.4s, #17  // 58, 59, 60<<<15
    sri v22.4s, v24.4s, #25  // 48, 49, 50<<<7
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v17.16b
    ldp w15, w20, [x25, #32]
    ldp w19, w21, [x25, #52]
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19
    eor v27.16b, v27.16b, v30.16b
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21
    eor v19.16b, v27.16b, v28.16b

    // 17:64, 65, 66
    ext v23.16b, v15.16b, v16.16b, #12  // 51, 52, 53
    eor v27.16b, v24.16b, v17.16b
    st1 {v16.4s-v19.4s}, [x26]
    shl v21.4s, v19.4s, #15
    shl v22.4s, v23.4s, #7
    sri v21.4s, v19.4s, #17  // 61, 62, 63<<<15
    sri v22.4s, v23.4s, #25  // 51, 52, 53<<<7
    ldr w15, [x25, #40]
    ldr w20, [x25, #48]
    ldp w19, w21, [x26]
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v18.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    ldp w15, w20, [x25, #52]
    ldr w19, [x26, #8]
    ldr w21, [x26, #16]
    eor v27.16b, v27.16b, v30.16b
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19
    eor v20.16b, v27.16b, v28.16b

    // 18:67
    ext v24.16b, v16.16b, v17.16b, #12  // 54, 55, 56
    eor v27.16b, v23.16b, v18.16b
    shl v21.4s, v20.4s, #15
    shl v22.4s, v24.4s, #7
    sri v21.4s, v20.4s, #17  // 64, 65, 66<<<15
    sri v22.4s, v24.4s, #25  // 54, 55, 56<<<7
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21
    eor v27.16b, v21.16b, v27.16b
    eor v28.16b, v22.16b, v19.16b
    ldp w15, w20, [x26]
    ldp w19, w21, [x26, #20]
    shl v29.4s, v27.4s, #15
    shl v30.4s, v27.4s, #23
    sri v29.4s, v27.4s, #17
    sri v30.4s, v27.4s, #9
    eor v27.16b, v27.16b, v29.16b
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19
    eor v27.16b, v27.16b, v30.16b
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21
    eor v21.16b, v27.16b, v28.16b

    // Expansion is complete; the remaining rounds only read stored words.
    ldr w15, [x26, #8]
    ldr w20, [x26, #16]
    ldp w19, w21, [x26, #32]
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19
    st1 {v20.4s-v21.4s}, [x25]  // 64, 65, 66 at [x25], 67 at [x25, #16]
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21
    ldp w15, w20, [x26, #20]
    ldr w19, [x26, #40]
    ldr w21, [x26, #48]
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19

    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21
    ldp w15, w20, [x26, #32]
    ldp w19, w21, [x26, #52]
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21
    ldr w15, [x26, #40]
    ldr w20, [x26, #48]
    ldp w19, w21, [x25]
    second48 w0 w1 w2 w3 w4 w5 w6 w7 w15 w19
    second48 w3 w0 w1 w2 w7 w4 w5 w6 w20 w21
    ldp w15, w20, [x26, #52]
    ldr w19, [x25, #8]
    ldr w21, [x25, #16]
    second48 w2 w3 w0 w1 w6 w7 w4 w5 w15 w19
    second48 w1 w2 w3 w0 w5 w6 w7 w4 w20 w21
    ldp w9, w10, [x22]  // XOR with the previous hash result
    ldp w11, w12, [x22, #8]
    ldp w13, w14, [x22, #16]
    ldp w15, w19, [x22, #24]
    eor w0, w0, w9
    eor w1, w1, w10
    eor w2, w2, w11
    eor w3, w3, w12
    eor w4, w4, w13
    eor w5, w5, w14
    eor w6, w6, w15
    eor w7, w7, w19
    stp w0, w1, [x22]  // Result saving
    stp w2, w3, [x22, #8]
    stp w4, w5, [x22, #16]
    stp w6, w7, [x22, #24]
    b .Lsm3_block_loop
.Lsm3_done:

    // Drop the two 64-byte scratch buffers.
    add sp, sp, #128

    ldp x19, x20, [sp]
    ldp x21, x22, [sp, #16]
    ldp x23, x24, [sp, #32]
    ldp x25, x26, [sp, #48]
    ldp d8, d9, [sp, #64]
    ldp d10, d11, [sp, #80]
    ldp d12, d13, [sp, #96]
    ldp d14, d15, [sp, #112]
    add sp, sp, #128

AARCH64_AUTIASP
    ret
.size SM3_CompressAsm,.-SM3_CompressAsm
690
691#endif
692