• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * This file is part of the openHiTLS project.
3 *
4 * openHiTLS is licensed under the Mulan PSL v2.
5 * You can use this software according to the terms and conditions of the Mulan PSL v2.
6 * You may obtain a copy of Mulan PSL v2 at:
7 *
8 *     http://license.coscl.org.cn/MulanPSL2
9 *
10 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13 * See the Mulan PSL v2 for more details.
14 */
15
16#include "hitls_build.h"
17#ifdef HITLS_CRYPTO_CHACHA20
18
19#include "crypt_arm.h"
20#include "chacha20_common_aarch64.S"
21#include "chacha20_64block_aarch64.S"
22#include "chacha20_256block_aarch64.S"
23#include "chacha20_512block_aarch64.S"
24
.section .rodata
.align 4                                    // 2^4 = 16-byte alignment for the 128-bit constant read with ld1.
.ADD_LONG:
.long 1,0,0,0                               // Vector {1, 0, 0, 0}: adds 1 to lane 0 (the block counter lane).
/**
 * @Interconnection with the C interface:void CHACHA20_Update(CRYPT_CHACHA20_Ctx *ctx, const uint8_t *in, uint8_t *out, uint32_t len);
 * @brief ChaCha20 bulk processing for AArch64.
 *        Only whole 64-byte blocks are handled: len is divided by 64 on entry and any
 *        remainder bytes are not processed by this routine.
 *        Dispatch on the block count:
 *          0      -> nothing to do, go straight to the epilogue
 *          1      -> scalar single-block path (.Lchacha64)
 *          2..7   -> .Lchacha256 loop (consumes up to 4 blocks per pass)
 *          >= 8   -> .Loop_512_start (8 blocks per pass), then the .Lchacha256 loop
 *                    for whatever is left
 *        The counter word(s) at ctx + 48 are advanced by the block count and written back
 *        before any block is produced.
 *        REGLEN, the X/W/V/Q register aliases and the CHA, WCHA and VEXT2 macros are not defined in this
 *        file; they are expected to come from the included chacha20_*_aarch64.S files.
 * @param ctx [IN/OUT] Algorithm context, which is set by the C interface and transferred.
 *                     Read at offsets 0 (sigma), 16 and 32 (key), 48 (counter); offset 48 is updated.
 * @param in [IN] Data to be encrypted
 * @param out [OUT] Data after encryption
 * @param len [IN] Encrypted length in bytes
 */

.text
.globl CHACHA20_Update
.type CHACHA20_Update,%function
.align 4
CHACHA20_Update:
AARCH64_PACIASP
    // NOTE(review): len is declared uint32_t, so AAPCS64 leaves the upper 32 bits of the
    // argument register unspecified. If REGLEN is the 64-bit view of that register this
    // shift relies on the caller having zero-extended it - confirm against the alias definition.
    lsr REGLEN, REGLEN, #6                  // REGLEN = len / 64 = number of whole blocks.
    stp x29, x30, [sp, #-96]!               // Allocate a 96-byte frame; save fp/lr at its base.
    add x29, sp, #0                         // x29 = sp (frame pointer used by the epilogue).
    stp x19, x20, [sp, #80]                 // Save callee-saved x19-x28 in the frame.
    stp x21, x22, [sp, #64]
    cmp REGLEN, #1                          // Flags are consumed by b.lo / b.eq below.
    stp x23, x24, [sp, #48]
    stp x25, x26, [sp, #32]
    stp x27, x28, [sp, #16]
    sub sp, sp, #128+64                     // 192 bytes of scratch: vector spills + d8-d15 save area.
    b.lo .Lchacha_end                       // Less than 1 block.
    b.eq .Lchacha64                         // Equals 1 block.
    adrp x5, .ADD_LONG
    add x5, x5, :lo12:.ADD_LONG             // x5 = address of the {1, 0, 0, 0} constant.

    cmp REGLEN, #8                          // >= 512(64*8); flags are consumed by b.lo after the loads.
#ifdef HITLS_BIG_ENDIAN
    ldp XSIG01, XSIG02, [x0]
    ld1 {VSIGMA.4s}, [x0], #16              // Sigma words; x0 advances to ctx + 16.
    ldp XKEY01, XKEY02, [x0]
    ldp XKEY03, XKEY04, [x0, #16]
    ld1 {VKEY01.4s, VKEY02.4s}, [x0], #32   // Key words; x0 advances to ctx + 48.
    ldp XCOUN1, XCOUN2, [x0]
    ld1 {VCOUN0.4s}, [x0]                   // Counter words.

    // Big-endian: swap the two 32-bit halves of every 64-bit register loaded with ldp.
    ror XCOUN1, XCOUN1, #32
    ror XCOUN2, XCOUN2, #32
    ror XSIG01, XSIG01, #32
    ror XSIG02, XSIG02, #32
    add WINPUT2, WCOUN1, w3                 // 32-bit counter + block count.
    ror XKEY01, XKEY01, #32
    ror XKEY02, XKEY02, #32
    ror XKEY03, XKEY03, #32
    ror XKEY04, XKEY04, #32
    str WINPUT2, [x0]                       // Write back the counter (32-bit store at ctx + 48).
#else
    ldp XSIG01, XSIG02, [x0]
    ld1 {VSIGMA.4s}, [x0], #16              // Sigma words; x0 advances to ctx + 16.
    ldp XKEY01, XKEY02, [x0]
    ldp XKEY03, XKEY04, [x0, #16]
    ld1 {VKEY01.4s, VKEY02.4s}, [x0], #32   // Key words; x0 advances to ctx + 48.
    ldp XCOUN1, XCOUN2, [x0]
    ld1 {VCOUN0.4s}, [x0]                   // Counter words.
    // NOTE(review): this is a 64-bit add and a 64-bit store, so a carry out of the low
    // 32-bit word propagates into the word at ctx + 52 (the big-endian branch adds 32-bit
    // only) - confirm this matches the intended counter width.
    add x6, XCOUN1, REGLEN
    str x6, [x0]                            // Write back the counter.
#endif
    b.lo .Lchacha256                        // < 8 blocks (flags from cmp REGLEN, #8 above).

    stp QCUR05, QCUR06, [sp, #0]            // Spill QCUR05/QCUR06 (sigma and key1 per the original note).
    ld1 {VADDER.4s}, [x5]                   // VADDER = {1, 0, 0, 0}.
    add VCUR01.4s, VCOUN0.4s, VADDER.4s     // counter + 1 (intermediate)
    add VCUR01.4s, VCUR01.4s, VADDER.4s     // VCUR01 = counter + 2
    add VCUR02.4s, VCUR01.4s, VADDER.4s     // VCUR02 = counter + 3
    add VCUR03.4s, VCUR02.4s, VADDER.4s     // VCUR03 = counter + 4
    add VCUR04.4s, VCUR03.4s, VADDER.4s     // VCUR04 = counter + 5
    shl VADDER.4s, VADDER.4s, #2            // VADDER = {4, 0, 0, 0}.

    stp d8, d9,[sp,#128+0]                  // d8-d15 are callee-saved under AAPCS64; save them.
    stp d10, d11,[sp,#128+16]
    stp d12, d13,[sp,#128+32]
    stp d14, d15,[sp,#128+48]

// 8 blocks per pass
.Loop_512_start:
    cmp REGLEN, #8
    b.lo .L512ToChacha256                   // Fewer than 8 blocks left: hand over to the 256 path.
    CHA64_SET_WDATA                         // General-purpose register 1 x 64 bytes.
    CHA512_SET_VDATA                        // Wide register 6 x 64 bytes.

    stp QCUR01, QCUR02, [sp, #32]           // Spill the four counter vectors VCUR01-VCUR04.
    stp QCUR03, QCUR04, [sp, #64]
    mov x4, #5                              // First half: 5 loop iterations.
    sub REGLEN, REGLEN, #8                  // This pass consumes 8 blocks.
    // NOTE(review): the scalar block is finalised after each 5-iteration half, so
    // CHA512_ROUND presumably interleaves scalar rounds with the vector rounds - confirm
    // in chacha20_512block_aarch64.S.
.Loop_512_a_run:
    sub x4, x4, #1
    CHA512_ROUND
    CHA512_EXTA
    CHA512_ROUND
    CHA512_EXTB
    cbnz x4, .Loop_512_a_run

    CHA64_ROUND_END                         // Add to input after the loop is complete.
    CHA64_WRITE_BACK                        // First scalar block of this pass: write 64 bytes.
    add XCOUN1, XCOUN1, #1                  // Scalar counter +1 for the second scalar block.
    CHA64_SET_WDATA                         // Reload the scalar working state.

    mov x4, #5                              // Second half: 5 more loop iterations.
.Loop_512_b_run:
    sub x4, x4, #1
    CHA512_ROUND
    CHA512_EXTA
    CHA512_ROUND
    CHA512_EXTB
    cbnz x4, .Loop_512_b_run

    CHA64_ROUND_END                         // Add to input after the loop is complete.
    CHA64_WRITE_BACK                        // Second scalar block of this pass: write 64 bytes.
    add XCOUN1, XCOUN1, #7                  // +7, so the scalar counter has advanced by 8 in total.

    ldp QCUR05, QCUR06, [sp, #0]            // Restore QCUR05/QCUR06 (sigma and key1).
    ldp QCUR01, QCUR02, [sp, #32]           // Restore the four counter vectors VCUR01-VCUR04.
    ldp QCUR03, QCUR04, [sp, #64]

    CHA512_ROUND_END                        // Add to input after the loop is complete.
    CHA512_WRITE_BACK                       // Write back data.
    b .Loop_512_start                       // Next pass.

// 1 block (entered only when REGLEN == 1; x0 still points at ctx)
.Lchacha64:
#ifdef HITLS_BIG_ENDIAN
    ldp XCOUN1, XCOUN2, [x0, #48]
    ldp XSIG01, XSIG02, [x0]
    ldp XKEY01, XKEY02, [x0, #16]
    // Big-endian: swap the two 32-bit halves of every 64-bit register loaded with ldp.
    ror XCOUN1, XCOUN1, #32
    ror XCOUN2, XCOUN2, #32
    ror XSIG01, XSIG01, #32
    ror XSIG02, XSIG02, #32
    ldp XKEY03, XKEY04, [x0, #32]
    add WINPUT0, WCOUN1, w3                 // 32-bit counter + block count.
    ror XKEY01, XKEY01, #32
    ror XKEY02, XKEY02, #32
    ror XKEY03, XKEY03, #32
    ror XKEY04, XKEY04, #32
    str WINPUT0, [x0, #48]                  // Write back the counter.
#else
    ldp XCOUN1, XCOUN2, [x0, #48]
    ldp XSIG01, XSIG02, [x0]
    ldp XKEY01, XKEY02, [x0, #16]
    add XINPUT0, XCOUN1, REGLEN             // Counter + block count (64-bit add).
    ldp XKEY03, XKEY04, [x0, #32]
    str XINPUT0, [x0, #48]                  // Write back the counter.
#endif

.Loop_64_start:
    CHA64_SET_WDATA                         // General-purpose register, 1x64byte.
    mov x4, #10                             // 10 iterations, each with two groups of quarter-round steps.
.Loop_64_run:
    sub x4, x4, #1
    // First group of quarter-round steps.
    WCHA_ADD_A_B                            // a += b
    WCHA_EOR_D_A                            // d ^= a
    WCHA_ROR_D #16                          // d: rotate right 16 (same as rotate left 16 on 32 bits)
    WCHA_ADD_C_D                            // c += d
    WCHA_EOR_B_C                            // b ^= c
    WCHA_ROR_B #20                          // b: rotate right 20 (rotate left 12 on 32 bits)
    WCHA_ADD_A_B                            // a += b
    WCHA_EOR_D_A                            // d ^= a
    WCHA_ROR_D #24                          // d: rotate right 24 (rotate left 8 on 32 bits)
    WCHA_ADD_C_D                            // c += d
    WCHA_EOR_B_C                            // b ^= c
    WCHA_ROR_B #25                          // b: rotate right 25 (rotate left 7 on 32 bits)

    // Second group: same step sequence through the ADD2/EOR2 macro variants.
    WCHA_ADD2_A_B
    WCHA_EOR2_D_A
    WCHA_ROR_D #16
    WCHA_ADD2_C_D
    WCHA_EOR2_B_C
    WCHA_ROR_B #20
    WCHA_ADD2_A_B
    WCHA_EOR2_D_A
    WCHA_ROR_D #24
    WCHA_ADD2_C_D
    WCHA_EOR2_B_C
    WCHA_ROR_B #25
    cbnz x4, .Loop_64_run
    CHA64_ROUND_END                         // Add to input after the loop is complete.
    subs REGLEN, REGLEN, #1                 // Flags are consumed by b.le below, after the write-back.
    CHA64_WRITE_BACK                        // Write 64 bytes.
    add XCOUN1, XCOUN1, #1
    b.le .Lchacha_end                       // No blocks left.
    b .Loop_64_start

.L512ToChacha256:
    ldp d8,d9,[sp,#128+0]                   // Restore callee-saved d8-d15.
    ldp d10,d11,[sp,#128+16]
    ldp d12,d13,[sp,#128+32]
    ldp d14,d15,[sp,#128+48]
    cbz REGLEN, .Lchacha_end                // The length is 0.
    ushr VADDER.4s, VADDER.4s, #2           // VADDER: {4, 0, 0, 0} -> {1, 0, 0, 0}
    sub VREG52.4s, VCUR01.4s, VADDER.4s     // VREG52 = VCUR01 - 1
    sub VREG53.4s, VCUR02.4s, VADDER.4s     // VREG53 = VCUR02 - 1
    sub VREG54.4s, VCUR03.4s, VADDER.4s     // VREG54 = VCUR03 - 1
    shl VCUR01.4s, VADDER.4s, #2            // VCUR01 = {4, 0, 0, 0}: per-pass increment for the 256 loop.
    b .Loop_256_start

// up to 4 blocks per pass
.Lchacha256:
    ld1 {VADDER.4s}, [x5]                   // VADDER = {1, 0, 0, 0}.
    mov VREG51.16b, VCOUN0.16b              // counter + 0
    add VREG52.4s, VCOUN0.4s, VADDER.4s     // counter + 1
    add VREG53.4s, VREG52.4s, VADDER.4s     // counter + 2
    add VREG54.4s, VREG53.4s, VADDER.4s     // counter + 3
    shl VCUR01.4s, VADDER.4s, #2            // VCUR01 = {4, 0, 0, 0}: per-pass increment.

.Loop_256_start:
    CHA64_SET_WDATA                         // Scalar working state for one block.
    CHA256_SET_VDATA                        // Vector working state for the remaining blocks.
    mov x4, #10                             // 10 iterations of ROUND_A + ROUND_B.
.Loop_256_run:
    sub x4, x4, #1
    CHA256_ROUND_A
    VEXT2 VREG04.16b, VREG14.16b, #12
    VEXT2 VREG24.16b, VREG34.16b, #12
    VEXT2 VREG02.16b, VREG12.16b, #4
    VEXT2 VREG22.16b, VREG32.16b, #4
    CHA256_ROUND_B
    VEXT2 VREG04.16b, VREG14.16b, #4
    VEXT2 VREG24.16b, VREG34.16b, #4
    VEXT2 VREG02.16b, VREG12.16b, #12
    VEXT2 VREG22.16b, VREG32.16b, #12
    cbnz x4, .Loop_256_run
    subs REGLEN, REGLEN, #4                 // This pass covers 4 blocks; flags feed b.lo and b.le below.
    CHA256_ROUND_END
    b.lo .Lchacha_less_than_256             // Borrow: fewer than 4 blocks were left.
    CHA64_ROUND_END
    CHA256_WRITE_BACK                       // Write back data.
    b.le .Lchacha_end                       // Exactly 4 blocks were left: done.
    add XCOUN1, XCOUN1, #4                  // Counter+4.
    add VREG52.4s, VREG52.4s, VCUR01.4s     // Counter+4.
    add VREG53.4s, VREG53.4s, VCUR01.4s
    add VREG54.4s, VREG54.4s, VCUR01.4s
    b .Loop_256_start

// Tail: fewer than 4 blocks left; emit them one at a time from the state already computed.
.Lchacha_less_than_256:
    add REGLEN, REGLEN, #4                  // Undo the subtraction: REGLEN = blocks still to write.
    cmp REGLEN, #1
    b.lo .Lchacha_end                       // No blocks left.
    CHA64_ROUND_END
    CHA64_WRITE_BACK                        // Block from the scalar registers.

    sub REGLEN, REGLEN, #1
    cmp REGLEN, #1
    b.lo .Lchacha_end
    CHA256_WRITE_BACKB VREG01.16b, VREG02.16b, VREG03.16b, VREG04.16b

    sub REGLEN, REGLEN, #1
    cmp REGLEN, #1
    b.lo .Lchacha_end
    CHA256_WRITE_BACKB VREG11.16b, VREG12.16b, VREG13.16b, VREG14.16b

.Lchacha_end:
    // Clear key material from registers before returning.
    eor XKEY01, XKEY01, XKEY01
    eor XKEY02, XKEY02, XKEY02
    eor XKEY03, XKEY03, XKEY03
    eor XKEY04, XKEY04, XKEY04
    eor XCOUN2, XCOUN2, XCOUN2
    eor VKEY01.16b, VKEY01.16b, VKEY01.16b
    eor VKEY02.16b, VKEY02.16b, VKEY02.16b
    eor VCUR01.16b, VCUR01.16b, VCUR01.16b
    ldp x19, x20, [x29, #80]                // Restore callee-saved x19-x28 via the frame pointer.
    add sp, sp, #128+64                     // Release the 192-byte scratch area.
    ldp x21, x22, [x29, #64]
    ldp x23, x24, [x29, #48]
    ldp x25, x26, [x29, #32]
    ldp x27, x28, [x29, #16]
    ldp x29, x30, [sp], #96                 // Restore fp/lr and free the 96-byte frame.

.Labort:
AARCH64_AUTIASP
    ret
.size CHACHA20_Update,.-CHACHA20_Update
306
307#endif
308