/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_CHACHA20

#include "crypt_arm.h"
#include "chacha20_common_aarch64.S"
#include "chacha20_64block_aarch64.S"
#include "chacha20_256block_aarch64.S"
#include "chacha20_512block_aarch64.S"

.section .rodata
// Vector increment {1, 0, 0, 0}: adds 1 to the first 32-bit lane only.
.ADD_LONG:
.long 1,0,0,0

/**
 * @Interconnection with the C interface:
 *     void CHACHA20_Update(CRYPT_CHACHA20_Ctx *ctx, const uint8_t *in, uint8_t *out, uint32_t len);
 * @brief ChaCha20 algorithm
 * @param ctx [IN] Algorithm context, which is set by the C interface and transferred.
 * @param in  [IN] Data to be encrypted
 * @param out [OUT] Data after encryption
 * @param len [IN] Encrypted length
 *
 * Control flow (block count = len >> 6, any remainder below 64 bytes is not handled here):
 *   - 0 blocks       : jump straight to the epilogue.
 *   - exactly 1 block: .Lchacha64 (general-purpose registers only).
 *   - 2..7 blocks    : .Lchacha256 loop, 4 blocks per pass, with a tail for 1..3 blocks.
 *   - 8 or more      : .Loop_512_start, 8 blocks per pass; the remainder falls through
 *                      .L512ToChacha256 into the 4-block loop.
 *
 * Stack frame:
 *   - 96 bytes addressed through x29: x29/x30 and callee-saved x19..x28.
 *   - 192 bytes below that: [sp, #0..#95] vector spills used by the 8-block loop,
 *     [sp, #128..#191] d8..d15 (saved only on the 8-block path).
 *
 * Register aliases (REGLEN, XKEY0n, XCOUNn, VCURnn, ...) and the CHA64_, CHA256_, CHA512_
 * and WCHA_ macros come from the included chacha20_*_aarch64.S files.
 */

.text
.globl CHACHA20_Update
.type CHACHA20_Update,%function
.align 4
CHACHA20_Update:
AARCH64_PACIASP
    // NOTE(review): the big-endian paths read the length as w3, so REGLEN presumably
    // aliases x3. len is a uint32_t, so confirm the upper 32 bits are zero before this
    // 64-bit shift - TODO confirm against the REGLEN definition.
    lsr REGLEN, REGLEN, #6                      // Divide by 64 to get the number of blocks.
    stp x29, x30, [sp, #-96]!                   // Allocate the 96-byte frame, save FP and LR.
    add x29, sp, #0                             // x29 = sp (frame pointer used by the epilogue).
    stp x19, x20, [sp, #80]                     // Save callee-saved x19..x28 in the frame.
    stp x21, x22, [sp, #64]
    cmp REGLEN, #1                              // Flags are consumed by b.lo / b.eq below.
    stp x23, x24, [sp, #48]
    stp x25, x26, [sp, #32]
    stp x27, x28, [sp, #16]
    sub sp, sp, #128+64                         // sp -= 192 (scratch area); flags untouched.
    b.lo .Lchacha_end                           // Less than 1 block.
    b.eq .Lchacha64                             // Equals 1 block.

    adrp x5, .ADD_LONG
    add x5, x5, :lo12:.ADD_LONG                 // x5 = address of {1, 0, 0, 0}.

    cmp REGLEN, #8                              // >= 512 (64 * 8)? Consumed by b.lo after the loads.
#ifdef HITLS_BIG_ENDIAN
    ldp XSIG01, XSIG02, [x0]
    ld1 {VSIGMA.4s}, [x0], #16                  // {sigma0, sigma1, key0, key1, key3, key4, counter1, counter2}
    ldp XKEY01, XKEY02, [x0]
    ldp XKEY03, XKEY04, [x0, #16]
    ld1 {VKEY01.4s, VKEY02.4s}, [x0], #32
    ldp XCOUN1, XCOUN2, [x0]
    ld1 {VCOUN0.4s}, [x0]

    // Swap the 32-bit halves of each 64-bit pair loaded on a big-endian machine.
    ror XCOUN1, XCOUN1, #32
    ror XCOUN2, XCOUN2, #32
    ror XSIG01, XSIG01, #32
    ror XSIG02, XSIG02, #32
    add WINPUT2, WCOUN1, w3                     // New counter = counter + block count.
    ror XKEY01, XKEY01, #32
    ror XKEY02, XKEY02, #32
    ror XKEY03, XKEY03, #32
    ror XKEY04, XKEY04, #32
    str WINPUT2, [x0]                           // Write back the counter (x0 now points at it).
#else
    ldp XSIG01, XSIG02, [x0]
    ld1 {VSIGMA.4s}, [x0], #16                  // {sigma0, sigma1, key0, key1, key3, key4, counter1, counter2}
    ldp XKEY01, XKEY02, [x0]
    ldp XKEY03, XKEY04, [x0, #16]
    ld1 {VKEY01.4s, VKEY02.4s}, [x0], #32
    ldp XCOUN1, XCOUN2, [x0]
    ld1 {VCOUN0.4s}, [x0]
    // NOTE(review): this is a 64-bit add and store over the first counter pair.
    add x6, XCOUN1, REGLEN
    str x6, [x0]                                // Write back the counter.
#endif
    b.lo .Lchacha256                            // < 512 bytes: use the 4-block loop.

    stp QCUR05, QCUR06, [sp, #0]                // Write sigma, key1 to the stack.
    ld1 {VADDER.4s}, [x5]                       // Load the {1, 0, 0, 0} increment.
    add VCUR01.4s, VCOUN0.4s, VADDER.4s         // counter + 1 (intermediate)
    add VCUR01.4s, VCUR01.4s, VADDER.4s         // counter + 2
    add VCUR02.4s, VCUR01.4s, VADDER.4s         // counter + 3
    add VCUR03.4s, VCUR02.4s, VADDER.4s         // counter + 4
    add VCUR04.4s, VCUR03.4s, VADDER.4s         // counter + 5
    shl VADDER.4s, VADDER.4s, #2                // Increment 1 -> 4.

    stp d8, d9, [sp, #128+0]                    // Save d8..d15 to meet ABI requirements.
    stp d10, d11, [sp, #128+16]
    stp d12, d13, [sp, #128+32]
    stp d14, d15, [sp, #128+48]

// 8 blocks per pass
.Loop_512_start:
    cmp REGLEN, #8
    b.lo .L512ToChacha256                       // Fewer than 8 blocks left.
    CHA64_SET_WDATA                             // General-purpose registers, 1 x 64 bytes.
    CHA512_SET_VDATA                            // Wide registers, 6 x 64 bytes.

    stp QCUR01, QCUR02, [sp, #32]               // Spill the counter vectors to the stack.
    stp QCUR03, QCUR04, [sp, #64]
    mov x4, #5
    sub REGLEN, REGLEN, #8                      // 8 blocks (512 bytes) are consumed per pass.
.Loop_512_a_run:
    sub x4, x4, #1
    CHA512_ROUND
    CHA512_EXTA
    CHA512_ROUND
    CHA512_EXTB
    cbnz x4, .Loop_512_a_run

    CHA64_ROUND_END                             // Add to input after the loop is complete.
    CHA64_WRITE_BACK                            // Write 64 bytes produced in the first half.
    add XCOUN1, XCOUN1, #1                      // Counter + 1 for the second scalar block.
    CHA64_SET_WDATA                             // Reset the general-purpose state.

    mov x4, #5
.Loop_512_b_run:
    sub x4, x4, #1
    CHA512_ROUND
    CHA512_EXTA
    CHA512_ROUND
    CHA512_EXTB
    cbnz x4, .Loop_512_b_run

    CHA64_ROUND_END                             // Add to input after the loop is complete.
    CHA64_WRITE_BACK                            // Write 64 bytes produced in the second half.
    add XCOUN1, XCOUN1, #7                      // Counter + 7 (+8 in total for this pass).

    ldp QCUR05, QCUR06, [sp, #0]                // Restore sigma and key1.
    ldp QCUR01, QCUR02, [sp, #32]               // Restore the spilled counter vectors.
    ldp QCUR03, QCUR04, [sp, #64]

    CHA512_ROUND_END                            // Add to input after the loop is complete.
    CHA512_WRITE_BACK                           // Write back data.
    b .Loop_512_start                           // Next 8-block pass.

// 1 block
.Lchacha64:
#ifdef HITLS_BIG_ENDIAN
    ldp XCOUN1, XCOUN2, [x0, #48]
    ldp XSIG01, XSIG02, [x0]
    ldp XKEY01, XKEY02, [x0, #16]
    // Swap the 32-bit halves of each 64-bit pair loaded on a big-endian machine.
    ror XCOUN1, XCOUN1, #32
    ror XCOUN2, XCOUN2, #32
    ror XSIG01, XSIG01, #32
    ror XSIG02, XSIG02, #32
    ldp XKEY03, XKEY04, [x0, #32]
    add WINPUT0, WCOUN1, w3                     // New counter = counter + block count.
    ror XKEY01, XKEY01, #32
    ror XKEY02, XKEY02, #32
    ror XKEY03, XKEY03, #32
    ror XKEY04, XKEY04, #32
    str WINPUT0, [x0, #48]                      // Write back the counter.
#else
    ldp XCOUN1, XCOUN2, [x0, #48]
    ldp XSIG01, XSIG02, [x0]
    ldp XKEY01, XKEY02, [x0, #16]
    add XINPUT0, XCOUN1, REGLEN                 // New counter = counter + block count.
    ldp XKEY03, XKEY04, [x0, #32]
    str XINPUT0, [x0, #48]                      // Write back the counter.
#endif

.Loop_64_start:
    CHA64_SET_WDATA                             // General-purpose registers, 1 x 64 bytes.
    mov x4, #10                                 // 10 iterations of the round pair below.
.Loop_64_run:
    sub x4, x4, #1
    // First half of the iteration.
    WCHA_ADD_A_B                                // a += b
    WCHA_EOR_D_A                                // d ^= a
    WCHA_ROR_D #16                              // d <<<= 16 (rotate right by 16)
    WCHA_ADD_C_D                                // c += d
    WCHA_EOR_B_C                                // b ^= c
    WCHA_ROR_B #20                              // b <<<= 12 (rotate right by 20)
    WCHA_ADD_A_B                                // a += b
    WCHA_EOR_D_A                                // d ^= a
    WCHA_ROR_D #24                              // d <<<= 8 (rotate right by 24)
    WCHA_ADD_C_D                                // c += d
    WCHA_EOR_B_C                                // b ^= c
    WCHA_ROR_B #25                              // b <<<= 7 (rotate right by 25)

    // Second half: same sequence using the *2 macro variants.
    WCHA_ADD2_A_B
    WCHA_EOR2_D_A
    WCHA_ROR_D #16
    WCHA_ADD2_C_D
    WCHA_EOR2_B_C
    WCHA_ROR_B #20
    WCHA_ADD2_A_B
    WCHA_EOR2_D_A
    WCHA_ROR_D #24
    WCHA_ADD2_C_D
    WCHA_EOR2_B_C
    WCHA_ROR_B #25
    cbnz x4, .Loop_64_run
    CHA64_ROUND_END                             // Add to input after the loop is complete.
    subs REGLEN, REGLEN, #1                     // Flags are consumed by b.le below.
    CHA64_WRITE_BACK                            // Write 64 bytes.
    add XCOUN1, XCOUN1, #1
    b.le .Lchacha_end
    b .Loop_64_start

// Transition from the 8-block loop to the 4-block loop.
.L512ToChacha256:
    ldp d8, d9, [sp, #128+0]                    // Restore d8..d15 to meet ABI requirements.
    ldp d10, d11, [sp, #128+16]
    ldp d12, d13, [sp, #128+32]
    ldp d14, d15, [sp, #128+48]
    cbz REGLEN, .Lchacha_end                    // Nothing left to process.
    ushr VADDER.4s, VADDER.4s, #2               // Increment 4 -> 1.
    sub VREG52.4s, VCUR01.4s, VADDER.4s         // VCUR01 - 1
    sub VREG53.4s, VCUR02.4s, VADDER.4s         // VCUR02 - 1
    sub VREG54.4s, VCUR03.4s, VADDER.4s         // VCUR03 - 1
    shl VCUR01.4s, VADDER.4s, #2                // VCUR01 = per-pass increment of 4.
    b .Loop_256_start

// 4 blocks per pass
.Lchacha256:
    ld1 {VADDER.4s}, [x5]                       // Load the {1, 0, 0, 0} increment.
    mov VREG51.16b, VCOUN0.16b                  // counter + 0
    add VREG52.4s, VCOUN0.4s, VADDER.4s         // counter + 1
    add VREG53.4s, VREG52.4s, VADDER.4s         // counter + 2
    add VREG54.4s, VREG53.4s, VADDER.4s         // counter + 3
    shl VCUR01.4s, VADDER.4s, #2                // VCUR01 = per-pass increment of 4.

.Loop_256_start:
    CHA64_SET_WDATA                             // General-purpose register state.
    CHA256_SET_VDATA                            // Neon register state.
    mov x4, #10                                 // 10 iterations of ROUND_A + ROUND_B.
.Loop_256_run:
    sub x4, x4, #1
    CHA256_ROUND_A
    VEXT2 VREG04.16b, VREG14.16b, #12
    VEXT2 VREG24.16b, VREG34.16b, #12
    VEXT2 VREG02.16b, VREG12.16b, #4
    VEXT2 VREG22.16b, VREG32.16b, #4
    CHA256_ROUND_B
    VEXT2 VREG04.16b, VREG14.16b, #4
    VEXT2 VREG24.16b, VREG34.16b, #4
    VEXT2 VREG02.16b, VREG12.16b, #12
    VEXT2 VREG22.16b, VREG32.16b, #12
    cbnz x4, .Loop_256_run
    subs REGLEN, REGLEN, #4                     // 4 blocks per pass; flags used by b.lo and b.le.
    CHA256_ROUND_END
    b.lo .Lchacha_less_than_256                 // Fewer than 4 blocks were left.
    CHA64_ROUND_END
    CHA256_WRITE_BACK                           // Write back data.
    b.le .Lchacha_end                           // Exactly 0 blocks left.
    add XCOUN1, XCOUN1, #4                      // Counter + 4.
    add VREG52.4s, VREG52.4s, VCUR01.4s         // Counter + 4.
    add VREG53.4s, VREG53.4s, VCUR01.4s
    add VREG54.4s, VREG54.4s, VCUR01.4s
    b .Loop_256_start

// Tail of the 4-block loop: 1..3 blocks remain.
.Lchacha_less_than_256:
    add REGLEN, REGLEN, #4                      // Undo the subtraction: REGLEN = blocks left.
    cmp REGLEN, #1
    b.lo .Lchacha_end                           // No block left.
    CHA64_ROUND_END
    CHA64_WRITE_BACK                            // First remaining block (general-purpose state).

    sub REGLEN, REGLEN, #1
    cmp REGLEN, #1
    b.lo .Lchacha_end
    CHA256_WRITE_BACKB VREG01.16b, VREG02.16b, VREG03.16b, VREG04.16b

    sub REGLEN, REGLEN, #1
    cmp REGLEN, #1
    b.lo .Lchacha_end
    CHA256_WRITE_BACKB VREG11.16b, VREG12.16b, VREG13.16b, VREG14.16b

.Lchacha_end:
    // Scrub the key and counter state held in registers before returning.
    eor XKEY01, XKEY01, XKEY01
    eor XKEY02, XKEY02, XKEY02
    eor XKEY03, XKEY03, XKEY03
    eor XKEY04, XKEY04, XKEY04
    eor XCOUN1, XCOUN1, XCOUN1                  // Cleared together with XCOUN2.
    eor XCOUN2, XCOUN2, XCOUN2
    eor VKEY01.16b, VKEY01.16b, VKEY01.16b
    eor VKEY02.16b, VKEY02.16b, VKEY02.16b
    eor VCUR01.16b, VCUR01.16b, VCUR01.16b
    ldp x19, x20, [x29, #80]                    // Restore callee-saved x19..x28 via the frame pointer.
    add sp, sp, #128+64                         // Release the 192-byte scratch area.
    ldp x21, x22, [x29, #64]
    ldp x23, x24, [x29, #48]
    ldp x25, x26, [x29, #32]
    ldp x27, x28, [x29, #16]
    ldp x29, x30, [sp], #96                     // Restore FP and LR, release the frame.

.Labort:
AARCH64_AUTIASP
    ret
.size CHACHA20_Update,.-CHACHA20_Update

#endif