/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */
#include "hitls_build.h"
#ifdef HITLS_CRYPTO_CHACHA20

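// ChaCha20 multi-block keystream generation for AArch64 NEON. The CHA512_* macros below
// implement the 512-byte path: six blocks are kept in vector registers (VREG01..VREG54,
// four rows per block), while the interleaved WCHA_* macros advance block state held in
// general-purpose registers. The vector counter rows start at Counter + 2 and the counter
// is stepped by 8 blocks per iteration, so the blocks for Counter + 0/1 are presumably
// produced by the WCHA_* path. The register aliases (VREG*, VCUR*, VSIGMA, VKEY01/02,
// VADDER, QCUR*, REGINC, REGOUT) and the paired helpers (VEXT2, VADD2, VEOR2, VEORX,
// VUSHR2, VSLI2, VREV322, WCHA_*) are assumed to be defined elsewhere in this backend.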
.text

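// Rotate the b rows (VREGx2) by one word and the d rows (VREGx4) by three words within each
// vector, moving the six block states between column and diagonal form. The matching c-row
// rotation (two words) is symmetric and is performed at the end of CHA512_ROUND. VEXT2 is
// assumed to apply a self-EXT by the given byte count to each of its two register operands.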
.macro CHA512_EXTA
    VEXT2 VREG04.16b, VREG14.16b, #12
    VEXT2 VREG24.16b, VREG34.16b, #12
    VEXT2 VREG44.16b, VREG54.16b, #12
    VEXT2 VREG02.16b, VREG12.16b, #4
    VEXT2 VREG22.16b, VREG32.16b, #4
    VEXT2 VREG42.16b, VREG52.16b, #4
.endm

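// Inverse shuffle of CHA512_EXTA: rotate the d rows by one word and the b rows by three
// words to restore the previous word order.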
.macro CHA512_EXTB
    VEXT2 VREG04.16b, VREG14.16b, #4
    VEXT2 VREG24.16b, VREG34.16b, #4
    VEXT2 VREG44.16b, VREG54.16b, #4
    VEXT2 VREG02.16b, VREG12.16b, #12
    VEXT2 VREG22.16b, VREG32.16b, #12
    VEXT2 VREG42.16b, VREG52.16b, #12
.endm

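// Initialize the six parallel block states: row 0 is the ChaCha constant (VSIGMA), rows 1
// and 2 are the key words (VKEY01/VKEY02), and row 3 is the per-block counter/nonce.
// VCUR01..VCUR04 are expected to hold the rows for Counter + 2..5; the rows for
// Counter + 6/7 are derived by adding VADDER (the +4 counter increment).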
.macro CHA512_SET_VDATA
    mov VREG01.16b, VSIGMA.16b
    mov VREG11.16b, VSIGMA.16b
    mov VREG21.16b, VSIGMA.16b
    mov VREG31.16b, VSIGMA.16b
    mov VREG41.16b, VSIGMA.16b
    mov VREG51.16b, VSIGMA.16b
    mov VREG02.16b, VKEY01.16b
    mov VREG12.16b, VKEY01.16b
    mov VREG22.16b, VKEY01.16b
    mov VREG32.16b, VKEY01.16b
    mov VREG42.16b, VKEY01.16b
    mov VREG52.16b, VKEY01.16b
    mov VREG03.16b, VKEY02.16b
    mov VREG13.16b, VKEY02.16b
    mov VREG23.16b, VKEY02.16b
    mov VREG33.16b, VKEY02.16b
    mov VREG43.16b, VKEY02.16b
    mov VREG53.16b, VKEY02.16b
    mov VREG04.16b, VCUR01.16b              // Counter + 2
    mov VREG14.16b, VCUR02.16b              // Counter + 3
    mov VREG24.16b, VCUR03.16b              // Counter + 4
    mov VREG34.16b, VCUR04.16b              // Counter + 5
    add VREG44.4s, VREG04.4s, VADDER.4s     // Counter + 6 = 4 + 2
    add VREG54.4s, VREG14.4s, VADDER.4s     // Counter + 7 = 4 + 3
.endm

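// Feed-forward step: after the rounds finish, add the original input words (constant, key,
// and per-block counter rows) back into each of the six block states to form the keystream.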
.macro CHA512_ROUND_END
    add VREG01.4s, VREG01.4s, VSIGMA.4s     // After the loop is complete, add input.
    add VREG11.4s, VREG11.4s, VSIGMA.4s
    add VREG21.4s, VREG21.4s, VSIGMA.4s
    add VREG31.4s, VREG31.4s, VSIGMA.4s
    add VREG41.4s, VREG41.4s, VSIGMA.4s
    add VREG51.4s, VREG51.4s, VSIGMA.4s
    add VREG02.4s, VREG02.4s, VKEY01.4s     // After the loop is complete, add input.
    add VREG12.4s, VREG12.4s, VKEY01.4s
    add VREG22.4s, VREG22.4s, VKEY01.4s
    add VREG32.4s, VREG32.4s, VKEY01.4s
    add VREG42.4s, VREG42.4s, VKE01.4s
    add VREG52.4s, VREG52.4s, VKEY01.4s
    add VREG03.4s, VREG03.4s, VKEY02.4s     // After the loop is complete, add input.
    add VREG13.4s, VREG13.4s, VKEY02.4s
    add VREG23.4s, VREG23.4s, VKEY02.4s
    add VREG33.4s, VREG33.4s, VKEY02.4s
    add VREG43.4s, VREG43.4s, VKEY02.4s
    add VREG53.4s, VREG53.4s, VKEY02.4s
    add VREG44.4s, VREG44.4s, VCUR01.4s     // 2
    add VREG54.4s, VREG54.4s, VCUR02.4s     // 3
    add VREG04.4s, VREG04.4s, VCUR01.4s     // 2
    add VREG14.4s, VREG14.4s, VCUR02.4s     // 3
    add VREG24.4s, VREG24.4s, VCUR03.4s     // 4
    add VREG34.4s, VREG34.4s, VCUR04.4s     // 5
    add VREG44.4s, VREG44.4s, VADDER.4s     // 4 + 2
    add VREG54.4s, VREG54.4s, VADDER.4s     // 4 + 3
.endm

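// XOR the six 64-byte keystream blocks with 384 bytes of input and write the result,
// reloading the next 64-byte chunk between stores so loads, XORs and stores can overlap.
// Finally reload the saved counter vectors from the stack and advance them by 8 blocks
// (VADDER << 1) for the next iteration.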
.macro CHA512_WRITE_BACK
    ld1 {VCUR01.16b, VCUR02.16b, VCUR03.16b, VCUR04.16b}, [REGINC], #64  // Load 64 bytes.
    eor VREG01.16b, VREG01.16b, VCUR01.16b
    eor VREG02.16b, VREG02.16b, VCUR02.16b
    eor VREG03.16b, VREG03.16b, VCUR03.16b
    eor VREG04.16b, VREG04.16b, VCUR04.16b
    ld1 {VCUR01.16b, VCUR02.16b, VCUR03.16b, VCUR04.16b}, [REGINC], #64  // Load 64 bytes.
    st1 {VREG01.16b, VREG02.16b, VREG03.16b, VREG04.16b}, [REGOUT], #64  // Write 64 bytes.
    eor VREG11.16b, VREG11.16b, VCUR01.16b
    eor VREG12.16b, VREG12.16b, VCUR02.16b
    eor VREG13.16b, VREG13.16b, VCUR03.16b
    eor VREG14.16b, VREG14.16b, VCUR04.16b
    ld1 {VREG01.16b, VREG02.16b, VREG03.16b, VREG04.16b}, [REGINC], #64  // Load 64 bytes.
    st1 {VREG11.16b, VREG12.16b, VREG13.16b, VREG14.16b}, [REGOUT], #64  // Write 64 bytes.
    eor VREG21.16b, VREG21.16b, VREG01.16b
    eor VREG22.16b, VREG22.16b, VREG02.16b
    eor VREG23.16b, VREG23.16b, VREG03.16b
    eor VREG24.16b, VREG24.16b, VREG04.16b
    ld1 {VREG11.16b, VREG12.16b, VREG13.16b, VREG14.16b}, [REGINC], #64  // Load 64 bytes.
    st1 {VREG21.16b, VREG22.16b, VREG23.16b, VREG24.16b}, [REGOUT], #64  // Write 64 bytes.
    eor VREG31.16b, VREG31.16b, VREG11.16b
    eor VREG32.16b, VREG32.16b, VREG12.16b
    eor VREG33.16b, VREG33.16b, VREG13.16b
    eor VREG34.16b, VREG34.16b, VREG14.16b
    ld1 {VREG01.16b, VREG02.16b, VREG03.16b, VREG04.16b}, [REGINC], #64  // Load 64 bytes.
    st1 {VREG31.16b, VREG32.16b, VREG33.16b, VREG34.16b}, [REGOUT], #64  // Write 64 bytes.
    shl VREG21.4s, VADDER.4s, #1                                        // 4 -> 8
    eor VREG41.16b, VREG41.16b, VREG01.16b
    eor VREG42.16b, VREG42.16b, VREG02.16b
    eor VREG43.16b, VREG43.16b, VREG03.16b
    eor VREG44.16b, VREG44.16b, VREG04.16b
    ld1 {VREG11.16b, VREG12.16b, VREG13.16b, VREG14.16b}, [REGINC], #64  // Load 64 bytes.
    st1 {VREG41.16b, VREG42.16b, VREG43.16b, VREG44.16b}, [REGOUT], #64  // Write 64 bytes.
    ldp QCUR01, QCUR02, [sp, #32]           // Restore the saved counter vectors VCUR01-VCUR04.
    ldp QCUR03, QCUR04, [sp, #64]
    eor VREG51.16b, VREG51.16b, VREG11.16b
    eor VREG52.16b, VREG52.16b, VREG12.16b
    eor VREG53.16b, VREG53.16b, VREG13.16b
    eor VREG54.16b, VREG54.16b, VREG14.16b
    st1 {VREG51.16b, VREG52.16b, VREG53.16b, VREG54.16b}, [REGOUT], #64  // Write 64 bytes.
    add VCUR01.4s, VCUR01.4s, VREG21.4s     // Advance the counter vectors by 8 blocks.
    add VCUR02.4s, VCUR02.4s, VREG21.4s
    add VCUR03.4s, VCUR03.4s, VREG21.4s
    add VCUR04.4s, VCUR04.4s, VREG21.4s
.endm

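// One full quarter-round pass over the six vector block states, interleaved with the
// WCHA_* / WCHA_*2 steps that advance block state kept in general-purpose registers.
// Reference quarter round (RFC 8439):
//     a += b; d ^= a; d <<<= 16;
//     c += d; b ^= c; b <<<= 12;
//     a += b; d ^= a; d <<<= 8;
//     c += d; b ^= c; b <<<= 7;
// The <<< 16 rotation uses VREV322 (rev32 of 16-bit lanes); the 12/8/7 rotations use a
// VUSHR2/VSLI2 (ushr + sli) pair with VCUR01..VCUR06 as scratch. The trailing VEXT2 #8
// rotates the c rows, the part of the row shuffle shared by CHA512_EXTA and CHA512_EXTB.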
.macro CHA512_ROUND
    WCHA_ADD_A_B                                            // a += b
    VADD2 VREG02.4s, VREG01.4s, VREG12.4s, VREG11.4s        // a[0,1,2,3] += b[4,5,6,7]
    VADD2 VREG22.4s, VREG21.4s, VREG32.4s, VREG31.4s
    WCHA_EOR_D_A                                            // d ^= a
    VADD2 VREG42.4s, VREG41.4s, VREG52.4s, VREG51.4s
    VEOR2 VREG01.16b, VREG04.16b, VREG11.16b, VREG14.16b    // d[12,13,14,15] ^= a[0,1,2,3]
    WCHA_ROR_D #16                                          // d <<<= 16 (ror by 16 equals a left rotate by 16 for 32-bit words).
    VEOR2 VREG21.16b, VREG24.16b, VREG31.16b, VREG34.16b
    VEOR2 VREG41.16b, VREG44.16b, VREG51.16b, VREG54.16b
    WCHA_ADD_C_D                                            // c += d
    VREV322 VREG04.8h, VREG14.8h                            // d[12,13,14,15] <<<= 16 (rev32 of 16-bit lanes).
    VREV322 VREG24.8h, VREG34.8h
    WCHA_EOR_B_C                                            // b ^= c
    VREV322 VREG44.8h, VREG54.8h
    VADD2 VREG04.4s, VREG03.4s, VREG14.4s, VREG13.4s        // c[8,9,10,11] += d[12,13,14,15]
    WCHA_ROR_B #20
    VADD2 VREG24.4s, VREG23.4s, VREG34.4s, VREG33.4s
    VADD2 VREG44.4s, VREG43.4s, VREG54.4s, VREG53.4s
    WCHA_ADD_A_B                                                                    // a += b
    VEORX VREG03.16b, VREG02.16b, VCUR01.16b, VREG13.16b, VREG12.16b, VCUR02.16b    // m = b[4,5,6,7] ^ c[8,9,10,11]
    VEORX VREG23.16b, VREG22.16b, VCUR03.16b, VREG33.16b, VREG32.16b, VCUR04.16b
    WCHA_EOR_D_A
    VEORX VREG43.16b, VREG42.16b, VCUR05.16b, VREG53.16b, VREG52.16b, VCUR06.16b
    VUSHR2 VCUR01.4s, VREG02.4s, VCUR02.4s, VREG12.4s, #20  // b[4,5,6,7] = m >> 20
    WCHA_ROR_D #24
    VUSHR2 VCUR03.4s, VREG22.4s, VCUR04.4s, VREG32.4s, #20
    VUSHR2 VCUR05.4s, VREG42.4s, VCUR06.4s, VREG52.4s, #20
    WCHA_ADD_C_D                                            // c += d
    VSLI2 VCUR01.4s, VREG02.4s, VCUR02.4s, VREG12.4s, #12   // b[4,5,6,7] |= m << 12 (b = m <<< 12)
    VSLI2 VCUR03.4s, VREG22.4s, VCUR04.4s, VREG32.4s, #12
    WCHA_EOR_B_C
    VSLI2 VCUR05.4s, VREG42.4s, VCUR06.4s, VREG52.4s, #12
    VADD2 VREG02.4s, VREG01.4s, VREG12.4s, VREG11.4s        // a[0,1,2,3] += b[4,5,6,7]
    WCHA_ROR_B #25
    VADD2 VREG22.4s, VREG21.4s, VREG32.4s, VREG31.4s
    VADD2 VREG42.4s, VREG41.4s, VREG52.4s, VREG51.4s
    WCHA_ADD2_A_B
    VEORX VREG04.16b, VREG01.16b, VCUR01.16b, VREG14.16b, VREG11.16b, VCUR02.16b // m = d[12,13,14,15] ^ a[0,1,2,3]
    VEORX VREG24.16b, VREG21.16b, VCUR03.16b, VREG34.16b, VREG31.16b, VCUR04.16b
    WCHA_EOR2_D_A
    VEORX VREG44.16b, VREG41.16b, VCUR05.16b, VREG54.16b, VREG51.16b, VCUR06.16b
    VUSHR2 VCUR01.4s, VREG04.4s, VCUR02.4s, VREG14.4s, #24  // d[12,13,14,15] = m >> 24
    WCHA_ROR_D #16
    VUSHR2 VCUR03.4s, VREG24.4s, VCUR04.4s, VREG34.4s, #24
    VUSHR2 VCUR05.4s, VREG44.4s, VCUR06.4s, VREG54.4s, #24
    WCHA_ADD2_C_D
    VSLI2 VCUR01.4s, VREG04.4s, VCUR02.4s, VREG14.4s, #8    // d[12,13,14,15] |= m << 8 (d = m <<< 8)
    VSLI2 VCUR03.4s, VREG24.4s, VCUR04.4s, VREG34.4s, #8
    WCHA_EOR2_B_C
    VSLI2 VCUR05.4s, VREG44.4s, VCUR06.4s, VREG54.4s, #8
    VADD2 VREG04.4s, VREG03.4s, VREG14.4s, VREG13.4s        // c[8,9,10,11] += d[12,13,14,15]
    WCHA_ROR_B #20
    VADD2 VREG24.4s, VREG23.4s, VREG34.4s, VREG33.4s
    VADD2 VREG44.4s, VREG43.4s, VREG54.4s, VREG53.4s
    WCHA_ADD2_A_B
    VEORX VREG03.16b, VREG02.16b, VCUR01.16b, VREG13.16b, VREG12.16b, VCUR02.16b // m = b[4,5,6,7] ^ c[8,9,10,11]
    VEORX VREG23.16b, VREG22.16b, VCUR03.16b, VREG33.16b, VREG32.16b, VCUR04.16b
    WCHA_EOR2_D_A
    VEORX VREG43.16b, VREG42.16b, VCUR05.16b, VREG53.16b, VREG52.16b, VCUR06.16b
    VUSHR2 VCUR01.4s, VREG02.4s, VCUR02.4s, VREG12.4s, #25  // b[4,5,6,7] = m >> 25
    WCHA_ROR_D #24
    VUSHR2 VCUR03.4s, VREG22.4s, VCUR04.4s, VREG32.4s, #25
    VUSHR2 VCUR05.4s, VREG42.4s, VCUR06.4s, VREG52.4s, #25
    WCHA_ADD2_C_D
    VSLI2 VCUR01.4s, VREG02.4s, VCUR02.4s, VREG12.4s, #7    // b[4,5,6,7] |= m << 7 (b = m <<< 7)
    VSLI2 VCUR03.4s, VREG22.4s, VCUR04.4s, VREG32.4s, #7
    WCHA_EOR2_B_C
    VSLI2 VCUR05.4s, VREG42.4s, VCUR06.4s, VREG52.4s, #7
    VEXT2 VREG03.16b, VREG13.16b, #8                        // Rotate the c rows by 8 bytes (2 words).
    WCHA_ROR_B #25
    VEXT2 VREG23.16b, VREG33.16b, #8
    VEXT2 VREG43.16b, VREG53.16b, #8
.endm

#endif // HITLS_CRYPTO_CHACHA20