/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */

/*
 * Target: AArch64, GNU assembler syntax, preprocessed by cpp.
 *
 * All exported routines process the input in groups of up to eight blocks.
 * Every exported mode routine returns immediately, without touching any
 * buffer, when there is nothing to process.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text

	/*
	 * Aliases used by the 8-block core routines. The mode routines below
	 * load them from x3 (rounds) and x2 (key schedule) before each call.
	 */
	rounds		.req	x11
	bskey		.req	x12

	/*
	 * in_bs_ch / out_bs_ch: XOR-only transforms that sbox applies before
	 * and after inv_gf256. They operate in place on their eight operands.
	 */
	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor		\b2, \b2, \b1
	eor		\b5, \b5, \b6
	eor		\b3, \b3, \b0
	eor		\b6, \b6, \b2
	eor		\b5, \b5, \b0
	eor		\b6, \b6, \b3
	eor		\b3, \b3, \b7
	eor		\b7, \b7, \b5
	eor		\b3, \b3, \b4
	eor		\b4, \b4, \b5
	eor		\b2, \b2, \b7
	eor		\b3, \b3, \b1
	eor		\b1, \b1, \b5
	.endm

	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor		\b0, \b0, \b6
	eor		\b1, \b1, \b4
	eor		\b4, \b4, \b6
	eor		\b2, \b2, \b0
	eor		\b6, \b6, \b1
	eor		\b1, \b1, \b5
	eor		\b5, \b5, \b3
	eor		\b3, \b3, \b7
	eor		\b7, \b7, \b5
	eor		\b2, \b2, \b5
	eor		\b4, \b4, \b7
	.endm

	/*
	 * inv_in_bs_ch / inv_out_bs_ch: the XOR-only transforms that inv_sbox
	 * applies before and after inv_gf256. Note that the formal parameter
	 * names are declared in a permuted order.
	 */
	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	eor		\b1, \b1, \b7
	eor		\b4, \b4, \b7
	eor		\b7, \b7, \b5
	eor		\b1, \b1, \b3
	eor		\b2, \b2, \b5
	eor		\b3, \b3, \b7
	eor		\b6, \b6, \b1
	eor		\b2, \b2, \b0
	eor		\b5, \b5, \b3
	eor		\b4, \b4, \b6
	eor		\b0, \b0, \b6
	eor		\b1, \b1, \b4
	.endm

	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	eor		\b1, \b1, \b5
	eor		\b2, \b2, \b7
	eor		\b3, \b3, \b1
	eor		\b4, \b4, \b5
	eor		\b7, \b7, \b5
	eor		\b3, \b3, \b4
	eor		\b5, \b5, \b0
	eor		\b3, \b3, \b7
	eor		\b6, \b6, \b2
	eor		\b2, \b2, \b1
	eor		\b6, \b6, \b3
	eor		\b3, \b3, \b0
	eor		\b5, \b5, \b6
	.endm

	/*
	 * mul_gf4: combine the pair (x0, x1) with the pair (y0, y1) using
	 * AND/XOR only; the result replaces x0/x1, y0/y1 are preserved and
	 * t0/t1 are clobbered. Judging by the name this is a GF(2^2)
	 * multiplication.
	 */
	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
	eor		\t0, \y0, \y1
	and		\t0, \t0, \x0
	eor		\x0, \x0, \x1
	and		\t1, \x1, \y0
	and		\x0, \x0, \y1
	eor		\x1, \t1, \t0
	eor		\x0, \x0, \t1
	.endm

	/*
	 * mul_gf4_n_gf4: two interleaved AND/XOR products, (x0, x1) with
	 * (y0, y1) and (x2, x3) with (y2, y3). Results replace x0-x3; t0 and
	 * t1 are clobbered.
	 */
	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	eor		\t0, \y0, \y1
	eor		\t1, \y2, \y3
	and		\t0, \t0, \x0
	and		\t1, \t1, \x2
	eor		\x0, \x0, \x1
	eor		\x2, \x2, \x3
	and		\x1, \x1, \y0
	and		\x3, \x3, \y2
	and		\x0, \x0, \y1
	and		\x2, \x2, \y3
	eor		\x1, \x1, \x0
	eor		\x2, \x2, \x3
	eor		\x0, \x0, \t0
	eor		\x3, \x3, \t1
	.endm

	/*
	 * mul_gf16_2: combine both (x0-x3) and (x4-x7) with (y0-y3), built
	 * from the two macros above. Results replace x0-x7. y0 and y1 are
	 * modified along the way and t0-t3 are clobbered.
	 */
	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
				    y0, y1, y2, y3, t0, t1, t2, t3
	eor		\t0, \x0, \x2
	eor		\t1, \x1, \x3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	eor		\y0, \y0, \y2
	eor		\y1, \y1, \y3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	eor		\x0, \x0, \t0
	eor		\x2, \x2, \t0
	eor		\x1, \x1, \t1
	eor		\x3, \x3, \t1
	eor		\t0, \x4, \x6
	eor		\t1, \x5, \x7
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	eor		\y0, \y0, \y2
	eor		\y1, \y1, \y3
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3
	eor		\x4, \x4, \t0
	eor		\x6, \x6, \t0
	eor		\x5, \x5, \t1
	eor		\x7, \x7, \t1
	.endm

	/*
	 * inv_gf256: the non-linear core shared by sbox and inv_sbox (by its
	 * name, the inversion in GF(2^8)). Operates in place on x0-x7 and
	 * clobbers t0-t3 and s0-s3. The instruction order is significant:
	 * temporaries are reused as soon as their previous value is dead.
	 */
	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
				   t0, t1, t2, t3, s0, s1, s2, s3
	eor		\t3, \x4, \x6
	eor		\t0, \x5, \x7
	eor		\t1, \x1, \x3
	eor		\s1, \x7, \x6
	eor		\s0, \x0, \x2
	eor		\s3, \t3, \t0
	orr		\t2, \t0, \t1
	and		\s2, \t3, \s0
	orr		\t3, \t3, \s0
	eor		\s0, \s0, \t1
	and		\t0, \t0, \t1
	eor		\t1, \x3, \x2
	and		\s3, \s3, \s0
	and		\s1, \s1, \t1
	eor		\t1, \x4, \x5
	eor		\s0, \x1, \x0
	eor		\t3, \t3, \s1
	eor		\t2, \t2, \s1
	and		\s1, \t1, \s0
	orr		\t1, \t1, \s0
	eor		\t3, \t3, \s3
	eor		\t0, \t0, \s1
	eor		\t2, \t2, \s2
	eor		\t1, \t1, \s3
	eor		\t0, \t0, \s2
	and		\s0, \x7, \x3
	eor		\t1, \t1, \s2
	and		\s1, \x6, \x2
	and		\s2, \x5, \x1
	orr		\s3, \x4, \x0
	eor		\t3, \t3, \s0
	eor		\t1, \t1, \s2
	eor		\s0, \t0, \s3
	eor		\t2, \t2, \s1
	and		\s2, \t3, \t1
	eor		\s1, \t2, \s2
	eor		\s3, \s0, \s2
	bsl		\s1, \t1, \s0
	not		\t0, \s0
	bsl		\s0, \s1, \s3
	bsl		\t0, \s1, \s3
	bsl		\s3, \t3, \t2
	eor		\t3, \t3, \t2
	and		\s2, \s0, \s3
	eor		\t1, \t1, \t0
	eor		\s2, \s2, \t3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	.endm

	/*
	 * sbox: S-box layer on eight bit-sliced registers b0-b7, composed as
	 * in_bs_ch -> inv_gf256 -> out_bs_ch. The three stages are invoked
	 * with differently permuted operand lists, so the results end up in
	 * a permuted register order; aesbs_encrypt8 accounts for that in the
	 * operand order it passes to mix_cols and bitslice. Clobbers t0-t3
	 * and s0-s3.
	 */
	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
	.endm

	/*
	 * inv_sbox: counterpart of sbox used by aesbs_decrypt8, composed as
	 * inv_in_bs_ch -> inv_gf256 -> inv_out_bs_ch. The results are again
	 * left in a permuted register order. Clobbers t0-t3 and s0-s3.
	 */
	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
				  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
	.endm

	/*
	 * enc_next_rk: load the 128-byte bit-sliced round key at bskey into
	 * v16-v23 and advance bskey past it.
	 */
	.macro		enc_next_rk
	ldp		q16, q17, [bskey], #128
	ldp		q18, q19, [bskey, #-96]
	ldp		q20, q21, [bskey, #-64]
	ldp		q22, q23, [bskey, #-32]
	.endm

	/*
	 * dec_next_rk: step bskey back by 128 bytes and load the bit-sliced
	 * round key found there into v16-v23.
	 */
	.macro		dec_next_rk
	ldp		q16, q17, [bskey, #-128]!
	ldp		q18, q19, [bskey, #32]
	ldp		q20, q21, [bskey, #64]
	ldp		q22, q23, [bskey, #96]
	.endm

	/*
	 * add_round_key: XOR the round key held in v16-v23 into x0-x7
	 * (x0 ^= v16, x1 ^= v17, ... x7 ^= v23).
	 */
	.macro		add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
	eor		\x0\().16b, \x0\().16b, v16.16b
	eor		\x1\().16b, \x1\().16b, v17.16b
	eor		\x2\().16b, \x2\().16b, v18.16b
	eor		\x3\().16b, \x3\().16b, v19.16b
	eor		\x4\().16b, \x4\().16b, v20.16b
	eor		\x5\().16b, \x5\().16b, v21.16b
	eor		\x6\().16b, \x6\().16b, v22.16b
	eor		\x7\().16b, \x7\().16b, v23.16b
	.endm

	/*
	 * shift_rows: apply the same byte permutation (tbl index vector
	 * \mask) to each of x0-x7 in place.
	 */
	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
	tbl		\x0\().16b, {\x0\().16b}, \mask\().16b
	tbl		\x1\().16b, {\x1\().16b}, \mask\().16b
	tbl		\x2\().16b, {\x2\().16b}, \mask\().16b
	tbl		\x3\().16b, {\x3\().16b}, \mask\().16b
	tbl		\x4\().16b, {\x4\().16b}, \mask\().16b
	tbl		\x5\().16b, {\x5\().16b}, \mask\().16b
	tbl		\x6\().16b, {\x6\().16b}, \mask\().16b
	tbl		\x7\().16b, {\x7\().16b}, \mask\().16b
	.endm

	/*
	 * mix_cols: MixColumns step on x0-x7, in place, using byte rotations
	 * (ext by 12 and by 8) and XOR. Clobbers t0-t7.
	 *
	 * \inv: leave blank for encryption. inv_mix_cols passes a non-blank
	 * value, which selects the alternate final recombination in the
	 * .else branch below.
	 */
	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				  t0, t1, t2, t3, t4, t5, t6, t7, inv
	ext		\t0\().16b, \x0\().16b, \x0\().16b, #12
	ext		\t1\().16b, \x1\().16b, \x1\().16b, #12
	eor		\x0\().16b, \x0\().16b, \t0\().16b
	ext		\t2\().16b, \x2\().16b, \x2\().16b, #12
	eor		\x1\().16b, \x1\().16b, \t1\().16b
	ext		\t3\().16b, \x3\().16b, \x3\().16b, #12
	eor		\x2\().16b, \x2\().16b, \t2\().16b
	ext		\t4\().16b, \x4\().16b, \x4\().16b, #12
	eor		\x3\().16b, \x3\().16b, \t3\().16b
	ext		\t5\().16b, \x5\().16b, \x5\().16b, #12
	eor		\x4\().16b, \x4\().16b, \t4\().16b
	ext		\t6\().16b, \x6\().16b, \x6\().16b, #12
	eor		\x5\().16b, \x5\().16b, \t5\().16b
	ext		\t7\().16b, \x7\().16b, \x7\().16b, #12
	eor		\x6\().16b, \x6\().16b, \t6\().16b
	eor		\t1\().16b, \t1\().16b, \x0\().16b
	eor		\x7\().16b, \x7\().16b, \t7\().16b
	ext		\x0\().16b, \x0\().16b, \x0\().16b, #8
	eor		\t2\().16b, \t2\().16b, \x1\().16b
	eor		\t0\().16b, \t0\().16b, \x7\().16b
	eor		\t1\().16b, \t1\().16b, \x7\().16b
	ext		\x1\().16b, \x1\().16b, \x1\().16b, #8
	eor		\t5\().16b, \t5\().16b, \x4\().16b
	eor		\x0\().16b, \x0\().16b, \t0\().16b
	eor		\t6\().16b, \t6\().16b, \x5\().16b
	eor		\x1\().16b, \x1\().16b, \t1\().16b
	ext		\t0\().16b, \x4\().16b, \x4\().16b, #8
	eor		\t4\().16b, \t4\().16b, \x3\().16b
	ext		\t1\().16b, \x5\().16b, \x5\().16b, #8
	eor		\t7\().16b, \t7\().16b, \x6\().16b
	ext		\x4\().16b, \x3\().16b, \x3\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x2\().16b
	ext		\x5\().16b, \x7\().16b, \x7\().16b, #8
	eor		\t4\().16b, \t4\().16b, \x7\().16b
	ext		\x3\().16b, \x6\().16b, \x6\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x7\().16b
	ext		\x6\().16b, \x2\().16b, \x2\().16b, #8
	eor		\x7\().16b, \t1\().16b, \t5\().16b
	.ifb		\inv
	eor		\x2\().16b, \t0\().16b, \t4\().16b
	eor		\x4\().16b, \x4\().16b, \t3\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x3\().16b, \x3\().16b, \t6\().16b
	eor		\x6\().16b, \x6\().16b, \t2\().16b
	.else
	eor		\t3\().16b, \t3\().16b, \x4\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x2\().16b, \x3\().16b, \t6\().16b
	eor		\x3\().16b, \t0\().16b, \t4\().16b
	eor		\x4\().16b, \x6\().16b, \t2\().16b
	mov		\x6\().16b, \t3\().16b
	.endif
	.endm

	/*
	 * inv_mix_cols: inverse MixColumns on x0-x7, in place. It applies an
	 * XOR-only pre-transform (built from byte rotations by 8) and then
	 * runs mix_cols with the \inv argument set. Clobbers t0-t7.
	 */
	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				      t0, t1, t2, t3, t4, t5, t6, t7
	ext		\t0\().16b, \x0\().16b, \x0\().16b, #8
	ext		\t6\().16b, \x6\().16b, \x6\().16b, #8
	ext		\t7\().16b, \x7\().16b, \x7\().16b, #8
	eor		\t0\().16b, \t0\().16b, \x0\().16b
	ext		\t1\().16b, \x1\().16b, \x1\().16b, #8
	eor		\t6\().16b, \t6\().16b, \x6\().16b
	ext		\t2\().16b, \x2\().16b, \x2\().16b, #8
	eor		\t7\().16b, \t7\().16b, \x7\().16b
	ext		\t3\().16b, \x3\().16b, \x3\().16b, #8
	eor		\t1\().16b, \t1\().16b, \x1\().16b
	ext		\t4\().16b, \x4\().16b, \x4\().16b, #8
	eor		\t2\().16b, \t2\().16b, \x2\().16b
	ext		\t5\().16b, \x5\().16b, \x5\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x3\().16b
	eor		\t4\().16b, \t4\().16b, \x4\().16b
	eor		\t5\().16b, \t5\().16b, \x5\().16b
	eor		\x0\().16b, \x0\().16b, \t6\().16b
	eor		\x1\().16b, \x1\().16b, \t6\().16b
	eor		\x2\().16b, \x2\().16b, \t0\().16b
	eor		\x4\().16b, \x4\().16b, \t2\().16b
	eor		\x3\().16b, \x3\().16b, \t1\().16b
	eor		\x1\().16b, \x1\().16b, \t7\().16b
	eor		\x2\().16b, \x2\().16b, \t7\().16b
	eor		\x4\().16b, \x4\().16b, \t6\().16b
	eor		\x5\().16b, \x5\().16b, \t3\().16b
	eor		\x3\().16b, \x3\().16b, \t6\().16b
	eor		\x6\().16b, \x6\().16b, \t4\().16b
	eor		\x4\().16b, \x4\().16b, \t7\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x7\().16b, \x7\().16b, \t5\().16b
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.endm

	/*
	 * swapmove_2x: two independent "swapmove" steps. For each pair
	 * (a, b): t = ((b >> n) ^ a) & mask; a ^= t; b ^= t << n. This
	 * exchanges the bits of a selected by \mask with the bits of b that
	 * sit \n positions higher. The shifts act on 64-bit lanes. Clobbers
	 * t0 and t1.
	 */
	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	ushr		\t0\().2d, \b0\().2d, #\n
	ushr		\t1\().2d, \b1\().2d, #\n
	eor		\t0\().16b, \t0\().16b, \a0\().16b
	eor		\t1\().16b, \t1\().16b, \a1\().16b
	and		\t0\().16b, \t0\().16b, \mask\().16b
	and		\t1\().16b, \t1\().16b, \mask\().16b
	eor		\a0\().16b, \a0\().16b, \t0\().16b
	shl		\t0\().2d, \t0\().2d, #\n
	eor		\a1\().16b, \a1\().16b, \t1\().16b
	shl		\t1\().2d, \t1\().2d, #\n
	eor		\b0\().16b, \b0\().16b, \t0\().16b
	eor		\b1\().16b, \b1\().16b, \t1\().16b
	.endm

	/*
	 * bitslice: bit transposition across eight registers, done as three
	 * rounds of swapmove with masks 0x55, 0x33, 0x0f and shifts 1, 2, 4.
	 * The core routines use it both to enter and to leave the bit-sliced
	 * representation. Note that the formal parameters are declared in
	 * descending order (x7 first). Clobbers t0-t3.
	 */
	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	movi		\t0\().16b, #0x55
	movi		\t1\().16b, #0x33
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	movi		\t0\().16b, #0x0f
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
	.endm


	/*
	 * Byte permutation vectors for tbl, fetched with PC-relative literal
	 * loads and therefore kept in the same section as the code.
	 *
	 *   M0            applied to each inner round key in aesbs_convert_key
	 *   M0SR / M0ISR  applied to the input blocks on entry
	 *   SR / ISR      applied by shift_rows in the inner rounds
	 *   SRM0 / ISRM0  applied by shift_rows in the final round
	 *
	 * NOTE(review): by their names, SR/ISR are (inverse) ShiftRows and the
	 * combined tables fold the M0 byte ordering in or out - confirm
	 * against the referenced paper before changing any of them.
	 */
	.align		6
M0:	.octa		0x0004080c0105090d02060a0e03070b0f

M0SR:	.octa		0x0004080c05090d010a0e02060f03070b
SR:	.octa		0x0f0e0d0c0a09080b0504070600030201
SRM0:	.octa		0x01060b0c0207080d0304090e00050a0f

M0ISR:	.octa		0x0004080c0d0105090a0e0206070b0f03
ISR:	.octa		0x0f0e0d0c080b0a090504070602010003
ISRM0:	.octa		0x0306090c00070a0d01040b0e0205080f

	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 *
	 * In:  x0 = out, x1 = rk (rounds + 1 round keys of 16 bytes each),
	 *      x2 = rounds
	 *
	 * Layout written to out[]:
	 *   - 16 bytes: round 0 key, copied unmodified
	 *   - (rounds - 1) * 128 bytes: for each inner round, eight 16-byte
	 *     vectors; vector i holds, per byte of the M0-permuted round key,
	 *     0xff if bit i of that byte is set and 0x00 otherwise. The
	 *     vectors for bits 0, 1, 5 and 6 (the bits set in 0x63) are
	 *     stored inverted.
	 *   - 16 bytes: final round key with 0x63 XORed into every byte
	 *
	 * Clobbers: x0-x2, v0-v17, flags.
	 */
ENTRY(aesbs_convert_key)
	ld1		{v7.4s}, [x1], #16		// load round 0 key
	ld1		{v17.4s}, [x1], #16		// load round 1 key

	movi		v8.16b,  #0x01			// bit masks
	movi		v9.16b,  #0x02
	movi		v10.16b, #0x04
	movi		v11.16b, #0x08
	movi		v12.16b, #0x10
	movi		v13.16b, #0x20
	movi		v14.16b, #0x40
	movi		v15.16b, #0x80
	ldr		q16, M0

	sub		x2, x2, #1			// rounds - 1 inner round keys
	str		q7, [x0], #16			// save round 0 key

.Lkey_loop:
	tbl		v7.16b, {v17.16b}, v16.16b	// permute current round key
	ld1		{v17.4s}, [x1], #16		// load next round key

	cmtst		v0.16b, v7.16b, v8.16b		// 0xff where bit 0 is set
	cmtst		v1.16b, v7.16b, v9.16b
	cmtst		v2.16b, v7.16b, v10.16b
	cmtst		v3.16b, v7.16b, v11.16b
	cmtst		v4.16b, v7.16b, v12.16b
	cmtst		v5.16b, v7.16b, v13.16b
	cmtst		v6.16b, v7.16b, v14.16b
	cmtst		v7.16b, v7.16b, v15.16b
	not		v0.16b, v0.16b			// invert bits 0, 1, 5, 6
	not		v1.16b, v1.16b
	not		v5.16b, v5.16b
	not		v6.16b, v6.16b

	subs		x2, x2, #1
	stp		q0, q1, [x0], #128
	stp		q2, q3, [x0, #-96]
	stp		q4, q5, [x0, #-64]
	stp		q6, q7, [x0, #-32]
	b.ne		.Lkey_loop

	movi		v7.16b, #0x63			// compose .L63
	eor		v17.16b, v17.16b, v7.16b
	str		q17, [x0]
	ret
ENDPROC(aesbs_convert_key)

	/*
	 * aesbs_encrypt8 - encrypt eight blocks in parallel
	 *
	 * In:  v0-v7  = input blocks
	 *      bskey  = key schedule in the aesbs_convert_key() layout
	 *      rounds = number of rounds
	 * Out: the eight results, in input order, are found in
	 *      v0, v1, v4, v6, v3, v7, v2, v5
	 * Clobbers: v8-v15 (scratch), v16-v23 (round key), v24 (tbl mask),
	 *      bskey, rounds, flags.
	 *
	 * Local to this file; reached with bl or, from __xts_crypt8, br.
	 */
	.align		4
aesbs_encrypt8:
	ldr		q9, [bskey], #16		// round 0 key
	ldr		q8, M0SR
	ldr		q24, SR

	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor		v11.16b, v1.16b, v9.16b
	tbl		v0.16b, {v10.16b}, v8.16b
	eor		v12.16b, v2.16b, v9.16b
	tbl		v1.16b, {v11.16b}, v8.16b
	eor		v13.16b, v3.16b, v9.16b
	tbl		v2.16b, {v12.16b}, v8.16b
	eor		v14.16b, v4.16b, v9.16b
	tbl		v3.16b, {v13.16b}, v8.16b
	eor		v15.16b, v5.16b, v9.16b
	tbl		v4.16b, {v14.16b}, v8.16b
	eor		v10.16b, v6.16b, v9.16b
	tbl		v5.16b, {v15.16b}, v8.16b
	eor		v11.16b, v7.16b, v9.16b
	tbl		v6.16b, {v10.16b}, v8.16b
	tbl		v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub		rounds, rounds, #1
	b		.Lenc_sbox			// first round: no shift_rows

.Lenc_loop:
	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
.Lenc_sbox:
	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
								v13, v14, v15
	subs		rounds, rounds, #1
	b.cc		.Lenc_done			// borrow: that was the last sbox

	enc_next_rk

	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
								v13, v14, v15

	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7

	/*
	 * The flags still come from the subs above; only loads and NEON
	 * instructions were executed in between. When rounds has just
	 * reached zero, one more pass remains and it uses the SRM0
	 * permutation instead of SR.
	 */
	b.ne		.Lenc_loop
	ldr		q24, SRM0
	b		.Lenc_loop

.Lenc_done:
	ldr		q12, [bskey]			// last round key

	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

	eor		v0.16b, v0.16b, v12.16b
	eor		v1.16b, v1.16b, v12.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v6.16b, v6.16b, v12.16b
	eor		v3.16b, v3.16b, v12.16b
	eor		v7.16b, v7.16b, v12.16b
	eor		v2.16b, v2.16b, v12.16b
	eor		v5.16b, v5.16b, v12.16b
	ret
ENDPROC(aesbs_encrypt8)

	/*
	 * aesbs_decrypt8 - decrypt eight blocks in parallel
	 *
	 * In:  v0-v7  = input blocks
	 *      bskey  = start of the key schedule (aesbs_convert_key() layout)
	 *      rounds = number of rounds
	 * Out: the eight results, in input order, are found in
	 *      v0, v1, v6, v4, v2, v7, v3, v5
	 * Clobbers: x9, v8-v15 (scratch), v16-v23 (round key), v24 (tbl mask),
	 *      bskey, rounds, flags.
	 *
	 * The key schedule is walked backwards: the first key applied is the
	 * 16-byte entry at offset rounds * 128 - 112 (the last entry of the
	 * schedule) and the final one is the 16-byte entry at offset 0.
	 */
	.align		4
aesbs_decrypt8:
	lsl		x9, rounds, #7
	add		bskey, bskey, x9

	ldr		q9, [bskey, #-112]!		// round 0 key
	ldr		q8, M0ISR
	ldr		q24, ISR

	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor		v11.16b, v1.16b, v9.16b
	tbl		v0.16b, {v10.16b}, v8.16b
	eor		v12.16b, v2.16b, v9.16b
	tbl		v1.16b, {v11.16b}, v8.16b
	eor		v13.16b, v3.16b, v9.16b
	tbl		v2.16b, {v12.16b}, v8.16b
	eor		v14.16b, v4.16b, v9.16b
	tbl		v3.16b, {v13.16b}, v8.16b
	eor		v15.16b, v5.16b, v9.16b
	tbl		v4.16b, {v14.16b}, v8.16b
	eor		v10.16b, v6.16b, v9.16b
	tbl		v5.16b, {v15.16b}, v8.16b
	eor		v11.16b, v7.16b, v9.16b
	tbl		v6.16b, {v10.16b}, v8.16b
	tbl		v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub		rounds, rounds, #1
	b		.Ldec_sbox			// first round: no shift_rows

.Ldec_loop:
	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
.Ldec_sbox:
	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
								v13, v14, v15
	subs		rounds, rounds, #1
	b.cc		.Ldec_done			// borrow: that was the last inv_sbox

	dec_next_rk

	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5

	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
								v13, v14, v15

	// Flags are still those of the subs above (see aesbs_encrypt8).
	b.ne		.Ldec_loop
	ldr		q24, ISRM0
	b		.Ldec_loop
.Ldec_done:
	ldr		q12, [bskey, #-16]		// last round key

	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

	eor		v0.16b, v0.16b, v12.16b
	eor		v1.16b, v1.16b, v12.16b
	eor		v6.16b, v6.16b, v12.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v2.16b, v2.16b, v12.16b
	eor		v7.16b, v7.16b, v12.16b
	eor		v3.16b, v3.16b, v12.16b
	eor		v5.16b, v5.16b, v12.16b
	ret
ENDPROC(aesbs_decrypt8)

	/*
	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 *
	 * In:  x0 = out, x1 = in, x2 = rk, x3 = rounds, w4 = blocks
	 *
	 * \do8 is the 8-block core routine and \o0-\o7 name the registers in
	 * which it returns blocks 0-7.
	 *
	 * For every group of up to eight blocks, x5 is set to 1 << blocks
	 * when fewer than eight blocks remain and to zero otherwise; the
	 * tbnz chains test bits 1-7 of it to stop loading/storing after the
	 * last valid block. Bit 0 is never tested, so a zero block count has
	 * to be rejected up front: without the guard a full group of eight
	 * blocks would be read and written.
	 */
	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp
	cbz		w4, 1f				// nothing to do

99:	mov		x5, #1
	lsl		x5, x5, x4
	subs		w4, w4, #8
	csel		x4, x4, xzr, pl			// blocks left after this group
	csel		x5, x5, xzr, mi			// partial group: keep the mask

	ld1		{v0.16b}, [x1], #16
	tbnz		x5, #1, 0f
	ld1		{v1.16b}, [x1], #16
	tbnz		x5, #2, 0f
	ld1		{v2.16b}, [x1], #16
	tbnz		x5, #3, 0f
	ld1		{v3.16b}, [x1], #16
	tbnz		x5, #4, 0f
	ld1		{v4.16b}, [x1], #16
	tbnz		x5, #5, 0f
	ld1		{v5.16b}, [x1], #16
	tbnz		x5, #6, 0f
	ld1		{v6.16b}, [x1], #16
	tbnz		x5, #7, 0f
	ld1		{v7.16b}, [x1], #16

0:	mov		bskey, x2
	mov		rounds, x3
	bl		\do8

	st1		{\o0\().16b}, [x0], #16
	tbnz		x5, #1, 1f
	st1		{\o1\().16b}, [x0], #16
	tbnz		x5, #2, 1f
	st1		{\o2\().16b}, [x0], #16
	tbnz		x5, #3, 1f
	st1		{\o3\().16b}, [x0], #16
	tbnz		x5, #4, 1f
	st1		{\o4\().16b}, [x0], #16
	tbnz		x5, #5, 1f
	st1		{\o5\().16b}, [x0], #16
	tbnz		x5, #6, 1f
	st1		{\o6\().16b}, [x0], #16
	tbnz		x5, #7, 1f
	st1		{\o7\().16b}, [x0], #16

	cbnz		x4, 99b

1:	ldp		x29, x30, [sp], #16
	ret
	.endm

	.align		4
ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_ecb_encrypt)

	.align		4
ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_ecb_decrypt)

	/*
	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 *
	 * In:  x0 = out, x1 = in, x2 = rk, x3 = rounds, w4 = blocks, x5 = iv
	 *
	 * x6 plays the role x5 plays in __ecb_crypt. Copies of ciphertext
	 * blocks 0-6 of each group are kept in v25-v31 for the chaining XOR.
	 * Block 7 has no spare register: it is loaded without advancing x1
	 * and read again from memory afterwards to become the next IV.
	 * iv[] is updated after every group. A zero block count returns
	 * without touching out[] or iv[].
	 */
	.align		4
ENTRY(aesbs_cbc_decrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp
	cbz		w4, .Lcbc_dec_done		// nothing to do

99:	mov		x6, #1
	lsl		x6, x6, x4
	subs		w4, w4, #8
	csel		x4, x4, xzr, pl			// blocks left after this group
	csel		x6, x6, xzr, mi			// partial group: keep the mask

	ld1		{v0.16b}, [x1], #16
	mov		v25.16b, v0.16b
	tbnz		x6, #1, 0f
	ld1		{v1.16b}, [x1], #16
	mov		v26.16b, v1.16b
	tbnz		x6, #2, 0f
	ld1		{v2.16b}, [x1], #16
	mov		v27.16b, v2.16b
	tbnz		x6, #3, 0f
	ld1		{v3.16b}, [x1], #16
	mov		v28.16b, v3.16b
	tbnz		x6, #4, 0f
	ld1		{v4.16b}, [x1], #16
	mov		v29.16b, v4.16b
	tbnz		x6, #5, 0f
	ld1		{v5.16b}, [x1], #16
	mov		v30.16b, v5.16b
	tbnz		x6, #6, 0f
	ld1		{v6.16b}, [x1], #16
	mov		v31.16b, v6.16b
	tbnz		x6, #7, 0f
	ld1		{v7.16b}, [x1]			// no increment: reloaded below

0:	mov		bskey, x2
	mov		rounds, x3
	bl		aesbs_decrypt8

	ld1		{v24.16b}, [x5]			// load IV

	// Block n is XORed with ciphertext block n - 1, block 0 with the IV.
	eor		v1.16b, v1.16b, v25.16b
	eor		v6.16b, v6.16b, v26.16b
	eor		v4.16b, v4.16b, v27.16b
	eor		v2.16b, v2.16b, v28.16b
	eor		v7.16b, v7.16b, v29.16b
	eor		v0.16b, v0.16b, v24.16b
	eor		v3.16b, v3.16b, v30.16b
	eor		v5.16b, v5.16b, v31.16b

	// After each store, v24 tracks the last ciphertext block consumed.
	st1		{v0.16b}, [x0], #16
	mov		v24.16b, v25.16b
	tbnz		x6, #1, 1f
	st1		{v1.16b}, [x0], #16
	mov		v24.16b, v26.16b
	tbnz		x6, #2, 1f
	st1		{v6.16b}, [x0], #16
	mov		v24.16b, v27.16b
	tbnz		x6, #3, 1f
	st1		{v4.16b}, [x0], #16
	mov		v24.16b, v28.16b
	tbnz		x6, #4, 1f
	st1		{v2.16b}, [x0], #16
	mov		v24.16b, v29.16b
	tbnz		x6, #5, 1f
	st1		{v7.16b}, [x0], #16
	mov		v24.16b, v30.16b
	tbnz		x6, #6, 1f
	st1		{v3.16b}, [x0], #16
	mov		v24.16b, v31.16b
	tbnz		x6, #7, 1f
	ld1		{v24.16b}, [x1], #16		// ciphertext block 7
	st1		{v5.16b}, [x0], #16
1:	st1		{v24.16b}, [x5]			// store IV

	cbnz		x4, 99b

.Lcbc_dec_done:
	ldp		x29, x30, [sp], #16
	ret
ENDPROC(aesbs_cbc_decrypt)

	/*
	 * next_tweak: \out = \in doubled as a 128-bit value, with the bit
	 * shifted out of the top folded back in as 0x87. Each 64-bit half is
	 * doubled separately; the sign mask of each half is ANDed with
	 * \const ({1, 0x87}, see .Lxts_mul_x) and the halves are swapped, so
	 * the carry out of the low half enters the high half and the carry
	 * out of the high half XORs 0x87 into the low half. Clobbers \tmp.
	 */
	.macro		next_tweak, out, in, const, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, \const\().16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

	.align		4
.Lxts_mul_x:
CPU_LE(	.quad		1, 0x87		)
CPU_BE(	.quad		0x87, 1		)

	/*
	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 */

	/*
	 * __xts_crypt8 - load up to eight blocks, XOR each with its tweak and
	 * tail-call the core routine.
	 *
	 * In:  x1 = in, x2 = rk, x3 = rounds, w4 = blocks (at least 1),
	 *      x7 = address of the core routine, v25 = tweak of block 0,
	 *      v30 = constant from .Lxts_mul_x,
	 *      [sp, #16] .. [sp, #79] = scratch provided by the caller
	 * Out: x6 = partial-group mask (as x5 in __ecb_crypt),
	 *      v25-v28 = tweaks of blocks 0-3,
	 *      [sp, #16 + 16 * n] = tweak of block 4 + n, for each such block
	 *      that was loaded,
	 *      v29 = tweak following the last one used, once four or more
	 *      blocks were loaded,
	 *      plus everything the core routine returns.
	 * Clobbers v31, and whatever the core routine clobbers.
	 *
	 * The routine ends in br x7, leaving x30 untouched, so the core
	 * routine returns straight to the caller of __xts_crypt8.
	 */
__xts_crypt8:
	mov		x6, #1
	lsl		x6, x6, x4
	subs		w4, w4, #8
	csel		x4, x4, xzr, pl			// blocks left after this group
	csel		x6, x6, xzr, mi			// partial group: keep the mask

	ld1		{v0.16b}, [x1], #16
	next_tweak	v26, v25, v30, v31
	eor		v0.16b, v0.16b, v25.16b
	tbnz		x6, #1, 0f

	ld1		{v1.16b}, [x1], #16
	next_tweak	v27, v26, v30, v31
	eor		v1.16b, v1.16b, v26.16b
	tbnz		x6, #2, 0f

	ld1		{v2.16b}, [x1], #16
	next_tweak	v28, v27, v30, v31
	eor		v2.16b, v2.16b, v27.16b
	tbnz		x6, #3, 0f

	ld1		{v3.16b}, [x1], #16
	next_tweak	v29, v28, v30, v31
	eor		v3.16b, v3.16b, v28.16b
	tbnz		x6, #4, 0f

	// From here on v29 is recycled, so each tweak is saved on the stack.
	ld1		{v4.16b}, [x1], #16
	str		q29, [sp, #16]
	eor		v4.16b, v4.16b, v29.16b
	next_tweak	v29, v29, v30, v31
	tbnz		x6, #5, 0f

	ld1		{v5.16b}, [x1], #16
	str		q29, [sp, #32]
	eor		v5.16b, v5.16b, v29.16b
	next_tweak	v29, v29, v30, v31
	tbnz		x6, #6, 0f

	ld1		{v6.16b}, [x1], #16
	str		q29, [sp, #48]
	eor		v6.16b, v6.16b, v29.16b
	next_tweak	v29, v29, v30, v31
	tbnz		x6, #7, 0f

	ld1		{v7.16b}, [x1], #16
	str		q29, [sp, #64]
	eor		v7.16b, v7.16b, v29.16b
	next_tweak	v29, v29, v30, v31

0:	mov		bskey, x2
	mov		rounds, x3
	br		x7				// tail call
ENDPROC(__xts_crypt8)

	/*
	 * __xts_crypt: body of aesbs_xts_encrypt / aesbs_xts_decrypt.
	 *
	 * In:  x0 = out, x1 = in, x2 = rk, x3 = rounds, w4 = blocks, x5 = iv
	 *
	 * \do8 is the 8-block core routine and \o0-\o7 name the registers in
	 * which it returns blocks 0-7. The frame reserves 64 bytes at
	 * [sp, #16] for the tweaks that __xts_crypt8 saves. v25 carries the
	 * running tweak from group to group and is written back to iv[] on
	 * exit. A zero block count returns without touching out[] or iv[].
	 */
	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	stp		x29, x30, [sp, #-80]!
	mov		x29, sp
	cbz		w4, 2f				// nothing to do

	ldr		q30, .Lxts_mul_x
	ld1		{v25.16b}, [x5]

99:	adr		x7, \do8
	bl		__xts_crypt8

	ldp		q16, q17, [sp, #16]		// tweaks of blocks 4-7
	ldp		q18, q19, [sp, #48]

	eor		\o0\().16b, \o0\().16b, v25.16b
	eor		\o1\().16b, \o1\().16b, v26.16b
	eor		\o2\().16b, \o2\().16b, v27.16b
	eor		\o3\().16b, \o3\().16b, v28.16b

	// After each store, v25 becomes the tweak of the next block.
	st1		{\o0\().16b}, [x0], #16
	mov		v25.16b, v26.16b
	tbnz		x6, #1, 1f
	st1		{\o1\().16b}, [x0], #16
	mov		v25.16b, v27.16b
	tbnz		x6, #2, 1f
	st1		{\o2\().16b}, [x0], #16
	mov		v25.16b, v28.16b
	tbnz		x6, #3, 1f
	st1		{\o3\().16b}, [x0], #16
	mov		v25.16b, v29.16b		// tweak after the last block used
	tbnz		x6, #4, 1f

	eor		\o4\().16b, \o4\().16b, v16.16b
	eor		\o5\().16b, \o5\().16b, v17.16b
	eor		\o6\().16b, \o6\().16b, v18.16b
	eor		\o7\().16b, \o7\().16b, v19.16b

	st1		{\o4\().16b}, [x0], #16
	tbnz		x6, #5, 1f
	st1		{\o5\().16b}, [x0], #16
	tbnz		x6, #6, 1f
	st1		{\o6\().16b}, [x0], #16
	tbnz		x6, #7, 1f
	st1		{\o7\().16b}, [x0], #16

	cbnz		x4, 99b

1:	st1		{v25.16b}, [x5]
2:	ldp		x29, x30, [sp], #80
	ret
	.endm

ENTRY(aesbs_xts_encrypt)
	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_xts_decrypt)

	/*
	 * next_ctr: materialise the 128-bit counter held in x7 (upper half)
	 * and x8 (lower half) as a byte-reversed block in \v, then increment
	 * the x7:x8 pair with carry. The mov to d[0] does not affect the
	 * flags, so the adc consumes the carry of the preceding adds.
	 */
	.macro		next_ctr, v
	mov		\v\().d[1], x8
	adds		x8, x8, #1
	mov		\v\().d[0], x7
	adc		x7, x7, xzr
	rev64		\v\().16b, \v\().16b
	.endm

	/*
	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 iv[], u8 final[])
	 *
	 * In:  x0 = out, x1 = in, x2 = rk, x3 = rounds, w4 = blocks,
	 *      x5 = iv, x6 = final (may be NULL)
	 *
	 * When final is not NULL, one keystream block beyond the requested
	 * number is generated and stored to final[] instead of being XORed
	 * with input. x10 is 1 in that case and 0 otherwise.
	 *
	 * v0 always holds the counter block for block 0 of the current group
	 * and x7:x8 the counter value that follows the last block generated.
	 * x9 is the partial-group mask; it is kept when at most eight blocks
	 * (including the extra one) remain and is shifted right by x10 after
	 * encryption so that the store chain stops before the extra block.
	 *
	 * The next counter value is written back to iv[] on exit. When there
	 * are no blocks and final is NULL the routine returns without
	 * touching out[] or iv[].
	 */
ENTRY(aesbs_ctr_encrypt)
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	cmp		x6, #0
	cset		x10, ne
	add		x4, x4, x10		// do one extra block if final
	cbz		w4, .Lctr_done		// no blocks, no final: nothing to do

	ldp		x7, x8, [x5]
	ld1		{v0.16b}, [x5]
CPU_LE(	rev		x7, x7		)
CPU_LE(	rev		x8, x8		)
	adds		x8, x8, #1
	adc		x7, x7, xzr

99:	mov		x9, #1
	lsl		x9, x9, x4
	subs		w4, w4, #8
	csel		x4, x4, xzr, pl
	csel		x9, x9, xzr, le		// keep the mask for <= 8 blocks

	tbnz		x9, #1, 0f
	next_ctr	v1
	tbnz		x9, #2, 0f
	next_ctr	v2
	tbnz		x9, #3, 0f
	next_ctr	v3
	tbnz		x9, #4, 0f
	next_ctr	v4
	tbnz		x9, #5, 0f
	next_ctr	v5
	tbnz		x9, #6, 0f
	next_ctr	v6
	tbnz		x9, #7, 0f
	next_ctr	v7

0:	mov		bskey, x2
	mov		rounds, x3
	bl		aesbs_encrypt8

	lsr		x9, x9, x10		// disregard the extra block
	tbnz		x9, #0, 0f

	ld1		{v8.16b}, [x1], #16
	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x0], #16
	tbnz		x9, #1, 1f

	ld1		{v9.16b}, [x1], #16
	eor		v1.16b, v1.16b, v9.16b
	st1		{v1.16b}, [x0], #16
	tbnz		x9, #2, 2f

	ld1		{v10.16b}, [x1], #16
	eor		v4.16b, v4.16b, v10.16b
	st1		{v4.16b}, [x0], #16
	tbnz		x9, #3, 3f

	ld1		{v11.16b}, [x1], #16
	eor		v6.16b, v6.16b, v11.16b
	st1		{v6.16b}, [x0], #16
	tbnz		x9, #4, 4f

	ld1		{v12.16b}, [x1], #16
	eor		v3.16b, v3.16b, v12.16b
	st1		{v3.16b}, [x0], #16
	tbnz		x9, #5, 5f

	ld1		{v13.16b}, [x1], #16
	eor		v7.16b, v7.16b, v13.16b
	st1		{v7.16b}, [x0], #16
	tbnz		x9, #6, 6f

	ld1		{v14.16b}, [x1], #16
	eor		v2.16b, v2.16b, v14.16b
	st1		{v2.16b}, [x0], #16
	tbnz		x9, #7, 7f

	ld1		{v15.16b}, [x1], #16
	eor		v5.16b, v5.16b, v15.16b
	st1		{v5.16b}, [x0], #16

8:	next_ctr	v0
	cbnz		x4, 99b

	st1		{v0.16b}, [x5]
.Lctr_done:
	ldp		x29, x30, [sp], #16
	ret

	/*
	 * If we are handling the tail of the input (x6 != NULL), return the
	 * final keystream block back to the caller.
	 *
	 * Label n is reached after n blocks of the group were stored; the
	 * register stored to final[] is the one holding keystream block n.
	 */
0:	cbz		x6, 8b
	st1		{v0.16b}, [x6]
	b		8b
1:	cbz		x6, 8b
	st1		{v1.16b}, [x6]
	b		8b
2:	cbz		x6, 8b
	st1		{v4.16b}, [x6]
	b		8b
3:	cbz		x6, 8b
	st1		{v6.16b}, [x6]
	b		8b
4:	cbz		x6, 8b
	st1		{v3.16b}, [x6]
	b		8b
5:	cbz		x6, 8b
	st1		{v7.16b}, [x6]
	b		8b
6:	cbz		x6, 8b
	st1		{v2.16b}, [x6]
	b		8b
7:	cbz		x6, 8b
	st1		{v5.16b}, [x6]
	b		8b
ENDPROC(aesbs_ctr_encrypt)