/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_CHACHA20

.text

/*
 * AArch64 helper macros for a ChaCha20 kernel that produces 256 bytes per
 * pass: one 64-byte block in general-purpose registers plus three 64-byte
 * blocks in NEON registers.
 *
 * Naming as used by the macros in this section (the aliases themselves are
 * defined outside this section):
 *   WINPUT0..WINPUT15   32-bit scalar registers used as state words 0..15.
 *   XINPUT0..XINPUT15   64-bit scalar registers used by CHA256_WRITE_BACK;
 *                       presumably the 64-bit views of WINPUTn - confirm.
 *   VREGbr              NEON register of vector block b (0..2), row r (1..4);
 *                       each register carries four 32-bit lanes.
 *   VREG41..VREG44      NEON scratch registers.
 *   VSIGMA, VKEY01, VKEY02, VREG52..VREG54
 *                       read-only inputs for rows 1, 2, 3 and 4.
 *   REGINC, REGOUT      input and output pointers, post-incremented.
 *
 * The ld1/st1 forms below take four-register lists, which the A64 encoding
 * requires to be consecutively numbered registers.
 */

/*
 * CHA256_SET_VDATA
 * Initialise the three vector blocks from the saved inputs.
 *   row 1 <- VSIGMA, row 2 <- VKEY01, row 3 <- VKEY02 (same for every block)
 *   row 4 <- VREG52 / VREG53 / VREG54 (one register per block)
 * The original annotations label the row-4 sources 1, 2 and 3; presumably
 * these are block-counter offsets - confirm against the code that fills
 * VREG52..VREG54.
 * Clobbers: VREG01-04, VREG11-14, VREG21-24.
 */
.macro CHA256_SET_VDATA
    // Vector block 0.
    mov     VREG01.16b, VSIGMA.16b
    mov     VREG02.16b, VKEY01.16b
    mov     VREG03.16b, VKEY02.16b
    mov     VREG04.16b, VREG52.16b              // 1

    // Vector block 1.
    mov     VREG11.16b, VSIGMA.16b
    mov     VREG12.16b, VKEY01.16b
    mov     VREG13.16b, VKEY02.16b
    mov     VREG14.16b, VREG53.16b              // 2

    // Vector block 2.
    mov     VREG21.16b, VSIGMA.16b
    mov     VREG22.16b, VKEY01.16b
    mov     VREG23.16b, VKEY02.16b
    mov     VREG24.16b, VREG54.16b              // 3
.endm

/*
 * CHA256_ROUND_A
 * One ChaCha round over all four blocks, scalar and NEON instructions
 * interleaved one-for-one. The instruction order is deliberate scheduling
 * and must be kept as is.
 *
 * Scalar block: four quarter-rounds on the columns
 *   (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15)
 * with a = WINPUT0..3, b = WINPUT4..7, c = WINPUT8..11, d = WINPUT12..15.
 * ror by 16/20/24/25 is a left rotate by 16/12/8/7.
 *
 * Vector blocks: the same quarter-round on whole rows (a = VREGx1 ... d =
 * VREGx4). Left rotates are built as:
 *   16     : rev32 on .8h (swap the halfwords of each 32-bit lane)
 *   12/8/7 : ushr by (32 - n) into the destination, then sli by n
 * The closing ext by 8 bytes rotates row 3 by two lanes. Rows 2 and 4 are
 * not rotated here; presumably the caller does that - confirm.
 *
 * Clobbers: WINPUT0-15, VREG01-04, VREG11-14, VREG21-24, VREG41-43.
 */
.macro CHA256_ROUND_A
    // ---- a += b ; d ^= a ; d <<<= 16 --------------------------------------
    add     WINPUT0, WINPUT0, WINPUT4           // a += b
    add     VREG01.4s, VREG01.4s, VREG02.4s
    add     WINPUT1, WINPUT1, WINPUT5
    add     VREG11.4s, VREG11.4s, VREG12.4s
    add     WINPUT2, WINPUT2, WINPUT6
    add     VREG21.4s, VREG21.4s, VREG22.4s
    add     WINPUT3, WINPUT3, WINPUT7
    eor     VREG04.16b, VREG04.16b, VREG01.16b

    eor     WINPUT12, WINPUT12, WINPUT0         // d ^= a
    eor     VREG14.16b, VREG14.16b, VREG11.16b
    eor     WINPUT13, WINPUT13, WINPUT1
    eor     VREG24.16b, VREG24.16b, VREG21.16b
    eor     WINPUT14, WINPUT14, WINPUT2
    rev32   VREG04.8h, VREG04.8h
    eor     WINPUT15, WINPUT15, WINPUT3
    rev32   VREG14.8h, VREG14.8h

    ror     WINPUT12, WINPUT12, #16             // d = d >>> 16
    rev32   VREG24.8h, VREG24.8h
    ror     WINPUT13, WINPUT13, #16
    add     VREG03.4s, VREG03.4s, VREG04.4s
    ror     WINPUT14, WINPUT14, #16
    add     VREG13.4s, VREG13.4s, VREG14.4s
    ror     WINPUT15, WINPUT15, #16
    add     VREG23.4s, VREG23.4s, VREG24.4s

    // ---- c += d ; b ^= c ; b <<<= 12 --------------------------------------
    add     WINPUT8, WINPUT8, WINPUT12          // c += d
    eor     VREG41.16b, VREG03.16b, VREG02.16b
    add     WINPUT9, WINPUT9, WINPUT13
    eor     VREG42.16b, VREG13.16b, VREG12.16b
    add     WINPUT10, WINPUT10, WINPUT14
    eor     VREG43.16b, VREG23.16b, VREG22.16b
    add     WINPUT11, WINPUT11, WINPUT15
    ushr    VREG02.4s, VREG41.4s, #20

    eor     WINPUT4, WINPUT4, WINPUT8           // b ^= c
    ushr    VREG12.4s, VREG42.4s, #20
    eor     WINPUT5, WINPUT5, WINPUT9
    ushr    VREG22.4s, VREG43.4s, #20
    eor     WINPUT6, WINPUT6, WINPUT10
    sli     VREG02.4s, VREG41.4s, #12
    eor     WINPUT7, WINPUT7, WINPUT11
    sli     VREG12.4s, VREG42.4s, #12

    ror     WINPUT4, WINPUT4, #20               // b = b >>> 20
    sli     VREG22.4s, VREG43.4s, #12
    ror     WINPUT5, WINPUT5, #20
    add     VREG01.4s, VREG01.4s, VREG02.4s
    ror     WINPUT6, WINPUT6, #20
    add     VREG11.4s, VREG11.4s, VREG12.4s
    ror     WINPUT7, WINPUT7, #20
    add     VREG21.4s, VREG21.4s, VREG22.4s

    // ---- a += b ; d ^= a ; d <<<= 8 ---------------------------------------
    add     WINPUT0, WINPUT0, WINPUT4           // a += b
    eor     VREG41.16b, VREG04.16b, VREG01.16b
    add     WINPUT1, WINPUT1, WINPUT5
    eor     VREG42.16b, VREG14.16b, VREG11.16b
    add     WINPUT2, WINPUT2, WINPUT6
    eor     VREG43.16b, VREG24.16b, VREG21.16b
    add     WINPUT3, WINPUT3, WINPUT7
    ushr    VREG04.4s, VREG41.4s, #24

    eor     WINPUT12, WINPUT12, WINPUT0         // d ^= a
    ushr    VREG14.4s, VREG42.4s, #24
    eor     WINPUT13, WINPUT13, WINPUT1
    ushr    VREG24.4s, VREG43.4s, #24
    eor     WINPUT14, WINPUT14, WINPUT2
    sli     VREG04.4s, VREG41.4s, #8
    eor     WINPUT15, WINPUT15, WINPUT3
    sli     VREG14.4s, VREG42.4s, #8

    ror     WINPUT12, WINPUT12, #24             // d = d >>> 24
    sli     VREG24.4s, VREG43.4s, #8
    ror     WINPUT13, WINPUT13, #24
    add     VREG03.4s, VREG03.4s, VREG04.4s
    ror     WINPUT14, WINPUT14, #24
    add     VREG13.4s, VREG13.4s, VREG14.4s
    ror     WINPUT15, WINPUT15, #24
    add     VREG23.4s, VREG23.4s, VREG24.4s

    // ---- c += d ; b ^= c ; b <<<= 7 ---------------------------------------
    add     WINPUT8, WINPUT8, WINPUT12          // c += d
    eor     VREG41.16b, VREG03.16b, VREG02.16b
    add     WINPUT9, WINPUT9, WINPUT13
    eor     VREG42.16b, VREG13.16b, VREG12.16b
    add     WINPUT10, WINPUT10, WINPUT14
    eor     VREG43.16b, VREG23.16b, VREG22.16b
    add     WINPUT11, WINPUT11, WINPUT15
    ushr    VREG02.4s, VREG41.4s, #25

    eor     WINPUT4, WINPUT4, WINPUT8           // b ^= c
    ushr    VREG12.4s, VREG42.4s, #25
    eor     WINPUT5, WINPUT5, WINPUT9
    ushr    VREG22.4s, VREG43.4s, #25
    eor     WINPUT6, WINPUT6, WINPUT10
    sli     VREG02.4s, VREG41.4s, #7
    eor     WINPUT7, WINPUT7, WINPUT11
    sli     VREG12.4s, VREG42.4s, #7

    ror     WINPUT4, WINPUT4, #25               // b = b >>> 25
    sli     VREG22.4s, VREG43.4s, #7
    ror     WINPUT5, WINPUT5, #25
    ext     VREG03.16b, VREG03.16b, VREG03.16b, #8   // row 3: rotate by two lanes
    ror     WINPUT6, WINPUT6, #25
    ext     VREG13.16b, VREG13.16b, VREG13.16b, #8
    ror     WINPUT7, WINPUT7, #25
    ext     VREG23.16b, VREG23.16b, VREG23.16b, #8
.endm

/*
 * CHA256_ROUND_B
 * Companion of CHA256_ROUND_A. The NEON half is instruction-for-instruction
 * the same; the scalar half works on the diagonals
 *   (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14)
 * by pairing a = WINPUT0..3 with b = WINPUT5,6,7,4, c = WINPUT10,11,8,9 and
 * d = WINPUT15,12,13,14. The instruction order is deliberate scheduling and
 * must be kept as is.
 *
 * The closing ext by 8 bytes rotates row 3 of each vector block by two
 * lanes. Rows 2 and 4 are not rotated here; presumably the caller does
 * that - confirm.
 *
 * Clobbers: WINPUT0-15, VREG01-04, VREG11-14, VREG21-24, VREG41-43.
 */
.macro CHA256_ROUND_B
    // ---- a += b ; d ^= a ; d <<<= 16 --------------------------------------
    add     WINPUT0, WINPUT0, WINPUT5           // a += b
    add     VREG01.4s, VREG01.4s, VREG02.4s
    add     WINPUT1, WINPUT1, WINPUT6
    add     VREG11.4s, VREG11.4s, VREG12.4s
    add     WINPUT2, WINPUT2, WINPUT7
    add     VREG21.4s, VREG21.4s, VREG22.4s
    add     WINPUT3, WINPUT3, WINPUT4
    eor     VREG04.16b, VREG04.16b, VREG01.16b

    eor     WINPUT15, WINPUT15, WINPUT0         // d ^= a
    eor     VREG14.16b, VREG14.16b, VREG11.16b
    eor     WINPUT12, WINPUT12, WINPUT1
    eor     VREG24.16b, VREG24.16b, VREG21.16b
    eor     WINPUT13, WINPUT13, WINPUT2
    rev32   VREG04.8h, VREG04.8h
    eor     WINPUT14, WINPUT14, WINPUT3
    rev32   VREG14.8h, VREG14.8h

    ror     WINPUT12, WINPUT12, #16             // d = d >>> 16
    rev32   VREG24.8h, VREG24.8h
    ror     WINPUT13, WINPUT13, #16
    add     VREG03.4s, VREG03.4s, VREG04.4s
    ror     WINPUT14, WINPUT14, #16
    add     VREG13.4s, VREG13.4s, VREG14.4s
    ror     WINPUT15, WINPUT15, #16
    add     VREG23.4s, VREG23.4s, VREG24.4s

    // ---- c += d ; b ^= c ; b <<<= 12 --------------------------------------
    add     WINPUT10, WINPUT10, WINPUT15        // c += d
    eor     VREG41.16b, VREG03.16b, VREG02.16b
    add     WINPUT11, WINPUT11, WINPUT12
    eor     VREG42.16b, VREG13.16b, VREG12.16b
    add     WINPUT8, WINPUT8, WINPUT13
    eor     VREG43.16b, VREG23.16b, VREG22.16b
    add     WINPUT9, WINPUT9, WINPUT14
    ushr    VREG02.4s, VREG41.4s, #20

    eor     WINPUT5, WINPUT5, WINPUT10          // b ^= c
    ushr    VREG12.4s, VREG42.4s, #20
    eor     WINPUT6, WINPUT6, WINPUT11
    ushr    VREG22.4s, VREG43.4s, #20
    eor     WINPUT7, WINPUT7, WINPUT8
    sli     VREG02.4s, VREG41.4s, #12
    eor     WINPUT4, WINPUT4, WINPUT9
    sli     VREG12.4s, VREG42.4s, #12

    ror     WINPUT4, WINPUT4, #20               // b = b >>> 20
    sli     VREG22.4s, VREG43.4s, #12
    ror     WINPUT5, WINPUT5, #20
    add     VREG01.4s, VREG01.4s, VREG02.4s
    ror     WINPUT6, WINPUT6, #20
    add     VREG11.4s, VREG11.4s, VREG12.4s
    ror     WINPUT7, WINPUT7, #20
    add     VREG21.4s, VREG21.4s, VREG22.4s

    // ---- a += b ; d ^= a ; d <<<= 8 ---------------------------------------
    add     WINPUT0, WINPUT0, WINPUT5           // a += b
    eor     VREG41.16b, VREG04.16b, VREG01.16b
    add     WINPUT1, WINPUT1, WINPUT6
    eor     VREG42.16b, VREG14.16b, VREG11.16b
    add     WINPUT2, WINPUT2, WINPUT7
    eor     VREG43.16b, VREG24.16b, VREG21.16b
    add     WINPUT3, WINPUT3, WINPUT4
    ushr    VREG04.4s, VREG41.4s, #24

    eor     WINPUT15, WINPUT15, WINPUT0         // d ^= a
    ushr    VREG14.4s, VREG42.4s, #24
    eor     WINPUT12, WINPUT12, WINPUT1
    ushr    VREG24.4s, VREG43.4s, #24
    eor     WINPUT13, WINPUT13, WINPUT2
    sli     VREG04.4s, VREG41.4s, #8
    eor     WINPUT14, WINPUT14, WINPUT3
    sli     VREG14.4s, VREG42.4s, #8

    ror     WINPUT12, WINPUT12, #24             // d = d >>> 24
    sli     VREG24.4s, VREG43.4s, #8
    ror     WINPUT13, WINPUT13, #24
    add     VREG03.4s, VREG03.4s, VREG04.4s
    ror     WINPUT14, WINPUT14, #24
    add     VREG13.4s, VREG13.4s, VREG14.4s
    ror     WINPUT15, WINPUT15, #24
    add     VREG23.4s, VREG23.4s, VREG24.4s

    // ---- c += d ; b ^= c ; b <<<= 7 ---------------------------------------
    add     WINPUT10, WINPUT10, WINPUT15        // c += d
    eor     VREG41.16b, VREG03.16b, VREG02.16b
    add     WINPUT11, WINPUT11, WINPUT12
    eor     VREG42.16b, VREG13.16b, VREG12.16b
    add     WINPUT8, WINPUT8, WINPUT13
    eor     VREG43.16b, VREG23.16b, VREG22.16b
    add     WINPUT9, WINPUT9, WINPUT14
    ushr    VREG02.4s, VREG41.4s, #25

    eor     WINPUT5, WINPUT5, WINPUT10          // b ^= c
    ushr    VREG12.4s, VREG42.4s, #25
    eor     WINPUT6, WINPUT6, WINPUT11
    ushr    VREG22.4s, VREG43.4s, #25
    eor     WINPUT7, WINPUT7, WINPUT8
    sli     VREG02.4s, VREG41.4s, #7
    eor     WINPUT4, WINPUT4, WINPUT9
    sli     VREG12.4s, VREG42.4s, #7

    ror     WINPUT4, WINPUT4, #25               // b = b >>> 25
    sli     VREG22.4s, VREG43.4s, #7
    ror     WINPUT5, WINPUT5, #25
    ext     VREG03.16b, VREG03.16b, VREG03.16b, #8   // row 3: rotate by two lanes
    ror     WINPUT6, WINPUT6, #25
    ext     VREG13.16b, VREG13.16b, VREG13.16b, #8
    ror     WINPUT7, WINPUT7, #25
    ext     VREG23.16b, VREG23.16b, VREG23.16b, #8
.endm

/*
 * CHA256_ROUND_END
 * Feed-forward for the three vector blocks: add back, lane by lane, the
 * same inputs that CHA256_SET_VDATA copied in (VSIGMA, VKEY01, VKEY02 and
 * VREG52/53/54). The scalar block is not touched by this macro.
 *
 * NOTE(review): the original annotations on the row-4 additions read
 * 0, 1, 2 here but 1, 2, 3 in CHA256_SET_VDATA for the same registers;
 * they are kept verbatim below - confirm which numbering is intended.
 *
 * Clobbers: VREG01-04, VREG11-14, VREG21-24.
 */
.macro CHA256_ROUND_END
    // Vector block 0.
    add     VREG01.4s, VREG01.4s, VSIGMA.4s
    add     VREG02.4s, VREG02.4s, VKEY01.4s
    add     VREG03.4s, VREG03.4s, VKEY02.4s
    add     VREG04.4s, VREG04.4s, VREG52.4s     // 0

    // Vector block 1.
    add     VREG11.4s, VREG11.4s, VSIGMA.4s
    add     VREG12.4s, VREG12.4s, VKEY01.4s
    add     VREG13.4s, VREG13.4s, VKEY02.4s
    add     VREG14.4s, VREG14.4s, VREG53.4s     // 1

    // Vector block 2.
    add     VREG21.4s, VREG21.4s, VSIGMA.4s
    add     VREG22.4s, VREG22.4s, VKEY01.4s
    add     VREG23.4s, VREG23.4s, VKEY02.4s
    add     VREG24.4s, VREG24.4s, VREG54.4s     // 2
.endm

/*
 * CHA256_WRITE_BACK
 * XOR and store 256 bytes: 64 bytes from the scalar registers followed by
 * 3 x 64 bytes from the vector blocks.
 *
 * Scalar part: each even-numbered XINPUT register is XORed with its
 * odd-numbered neighbour and the even ones are stored in pairs. Presumably
 * the caller has already packed the keystream into the even registers and
 * loaded the first 64 input bytes into the odd ones - confirm at the call
 * site; this macro itself loads only 192 bytes from [REGINC].
 *
 * Vector part: input is loaded 64 bytes at a time. VREG41-44 serve as the
 * load buffer for blocks 0 and 1; VREG01-04 are reused as the load buffer
 * for block 2 once block 0 has been stored. Loads, XORs and stores are
 * interleaved on purpose; keep the order.
 *
 * Effects: REGINC advances by 192, REGOUT advances by 256.
 * Clobbers: even-numbered XINPUT registers, VREG01-04, VREG11-14,
 *           VREG21-24, VREG41-44.
 */
.macro CHA256_WRITE_BACK
    ld1     {VREG41.16b, VREG42.16b, VREG43.16b, VREG44.16b}, [REGINC], #64   // input for vector block 0
    eor     XINPUT0, XINPUT0, XINPUT1
    eor     XINPUT2, XINPUT2, XINPUT3
    eor     XINPUT4, XINPUT4, XINPUT5
    eor     XINPUT6, XINPUT6, XINPUT7
    eor     XINPUT8, XINPUT8, XINPUT9
    stp     XINPUT0, XINPUT2, [REGOUT], #16     // scalar bytes 0..15
    eor     VREG01.16b, VREG01.16b, VREG41.16b
    stp     XINPUT4, XINPUT6, [REGOUT], #16     // scalar bytes 16..31
    eor     XINPUT10, XINPUT10, XINPUT11
    eor     VREG02.16b, VREG02.16b, VREG42.16b
    eor     XINPUT12, XINPUT12, XINPUT13
    eor     VREG03.16b, VREG03.16b, VREG43.16b
    eor     XINPUT14, XINPUT14, XINPUT15
    stp     XINPUT8, XINPUT10, [REGOUT], #16    // scalar bytes 32..47
    eor     VREG04.16b, VREG04.16b, VREG44.16b

    ld1     {VREG41.16b, VREG42.16b, VREG43.16b, VREG44.16b}, [REGINC], #64   // input for vector block 1
    stp     XINPUT12, XINPUT14, [REGOUT], #16   // scalar bytes 48..63

    eor     VREG11.16b, VREG11.16b, VREG41.16b
    eor     VREG12.16b, VREG12.16b, VREG42.16b

    st1     {VREG01.16b, VREG02.16b, VREG03.16b, VREG04.16b}, [REGOUT], #64   // vector block 0 out

    eor     VREG13.16b, VREG13.16b, VREG43.16b
    eor     VREG14.16b, VREG14.16b, VREG44.16b

    // Block 0 has been stored, so its registers can take the block 2 input.
    ld1     {VREG01.16b, VREG02.16b, VREG03.16b, VREG04.16b}, [REGINC], #64   // input for vector block 2
    st1     {VREG11.16b, VREG12.16b, VREG13.16b, VREG14.16b}, [REGOUT], #64   // vector block 1 out

    eor     VREG21.16b, VREG21.16b, VREG01.16b
    eor     VREG22.16b, VREG22.16b, VREG02.16b
    eor     VREG23.16b, VREG23.16b, VREG03.16b
    eor     VREG24.16b, VREG24.16b, VREG04.16b
    st1     {VREG21.16b, VREG22.16b, VREG23.16b, VREG24.16b}, [REGOUT], #64   // vector block 2 out
.endm

/*
 * CHA256_WRITE_BACKB src1, src2, src3, src4
 * XOR and store a single 64-byte vector block.
 *   src1..src4  NEON operands written with their arrangement, as they are
 *               used directly in eor and in the st1 list (so they must be
 *               four consecutively numbered registers, e.g. the .16b views
 *               of one VREGb1..VREGb4 group).
 * Loads 64 bytes from [REGINC] into VREG41-44, XORs them into src1..src4
 * and stores the result to [REGOUT]; both pointers advance by 64.
 * Clobbers: src1..src4, VREG41-44.
 */
.macro CHA256_WRITE_BACKB src1, src2, src3, src4
    ld1     {VREG41.16b, VREG42.16b, VREG43.16b, VREG44.16b}, [REGINC], #64   // 64 input bytes
    eor     \src1, \src1, VREG41.16b
    eor     \src2, \src2, VREG42.16b
    eor     \src3, \src3, VREG43.16b
    eor     \src4, \src4, VREG44.16b
    st1     {\src1, \src2, \src3, \src4}, [REGOUT], #64                       // 64 output bytes
.endm

#endif