/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* included by aes-ce.S and aes-neon.S */

        .text
        .align          4

/*
 * There are several ways to instantiate this code:
 * - no interleave, all inline
 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
 *
 * Macros imported by this code:
 * - enc_prepare       - setup NEON registers for encryption
 * - dec_prepare       - setup NEON registers for decryption
 * - enc_switch_key    - change to new key after having prepared for encryption
 * - encrypt_block     - encrypt a single block
 * - decrypt_block     - decrypt a single block
 * - encrypt_block2x   - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - decrypt_block2x   - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - encrypt_block4x   - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
 * - decrypt_block4x   - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
 */

#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
#define FRAME_PUSH      stp x29, x30, [sp,#-16]! ; mov x29, sp
#define FRAME_POP       ldp x29, x30, [sp],#16

#if INTERLEAVE == 2

aes_encrypt_block2x:
        encrypt_block2x v0, v1, w3, x2, x6, w7
        ret
ENDPROC(aes_encrypt_block2x)

aes_decrypt_block2x:
        decrypt_block2x v0, v1, w3, x2, x6, w7
        ret
ENDPROC(aes_decrypt_block2x)

#elif INTERLEAVE == 4

aes_encrypt_block4x:
        encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
        ret
ENDPROC(aes_encrypt_block4x)

aes_decrypt_block4x:
        decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
        ret
ENDPROC(aes_decrypt_block4x)

#else
#error INTERLEAVE should equal 2 or 4
#endif

        .macro          do_encrypt_block2x
        bl              aes_encrypt_block2x
        .endm

        .macro          do_decrypt_block2x
        bl              aes_decrypt_block2x
        .endm

        .macro          do_encrypt_block4x
        bl              aes_encrypt_block4x
        .endm

        .macro          do_decrypt_block4x
        bl              aes_decrypt_block4x
        .endm

#else
#define FRAME_PUSH
#define FRAME_POP

        .macro          do_encrypt_block2x
        encrypt_block2x v0, v1, w3, x2, x6, w7
        .endm

        .macro          do_decrypt_block2x
        decrypt_block2x v0, v1, w3, x2, x6, w7
        .endm

        .macro          do_encrypt_block4x
        encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
        .endm

        .macro          do_decrypt_block4x
        decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
        .endm

#endif

        /*
         * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, int first)
         * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, int first)
         */
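
        /*
         * Arguments follow the AAPCS64 convention: out is in x0, in is in
         * x1, rk in x2, rounds in w3, blocks in w4 and first in w5.  When
         * 'first' is non-zero, the round keys are loaded into the NEON
         * register file via enc_prepare/dec_prepare before any data is
         * processed.
         *
         * As a rough C level sketch (illustrative only, assuming a
         * hypothetical aes_encrypt_one() helper that encrypts a single
         * 16 byte block with the expanded key rk/rounds), ECB encryption
         * amounts to:
         *
         *      while (blocks--) {
         *              aes_encrypt_one(out, in, rk, rounds);
         *              in  += 16;
         *              out += 16;
         *      }
         *
         * The code below additionally handles INTERLEAVE blocks per
         * iteration on the fast path, and only falls back to this one
         * block at a time loop for the remainder.
         */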

AES_ENTRY(aes_ecb_encrypt)
        FRAME_PUSH
        cbz             w5, .LecbencloopNx

        enc_prepare     w3, x2, x5

.LecbencloopNx:
#if INTERLEAVE >= 2
        subs            w4, w4, #INTERLEAVE
        bmi             .Lecbenc1x
#if INTERLEAVE == 2
        ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 pt blocks */
        do_encrypt_block2x
        st1             {v0.16b-v1.16b}, [x0], #32
#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        do_encrypt_block4x
        st1             {v0.16b-v3.16b}, [x0], #64
#endif
        b               .LecbencloopNx
.Lecbenc1x:
        adds            w4, w4, #INTERLEAVE
        beq             .Lecbencout
#endif
.Lecbencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
        encrypt_block   v0, w3, x2, x5, w6
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lecbencloop
.Lecbencout:
        FRAME_POP
        ret
AES_ENDPROC(aes_ecb_encrypt)


AES_ENTRY(aes_ecb_decrypt)
        FRAME_PUSH
        cbz             w5, .LecbdecloopNx

        dec_prepare     w3, x2, x5

.LecbdecloopNx:
#if INTERLEAVE >= 2
        subs            w4, w4, #INTERLEAVE
        bmi             .Lecbdec1x
#if INTERLEAVE == 2
        ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
        do_decrypt_block2x
        st1             {v0.16b-v1.16b}, [x0], #32
#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        do_decrypt_block4x
        st1             {v0.16b-v3.16b}, [x0], #64
#endif
        b               .LecbdecloopNx
.Lecbdec1x:
        adds            w4, w4, #INTERLEAVE
        beq             .Lecbdecout
#endif
.Lecbdecloop:
        ld1             {v0.16b}, [x1], #16             /* get next ct block */
        decrypt_block   v0, w3, x2, x5, w6
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lecbdecloop
.Lecbdecout:
        FRAME_POP
        ret
AES_ENDPROC(aes_ecb_decrypt)


        /*
         * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 iv[], int first)
         * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 iv[], int first)
         */
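
        /*
         * For CBC, the IV is passed in x5 and 'first' in w6; the updated
         * chaining value is written back through x5 so the caller can
         * continue where it left off.
         *
         * A minimal C level sketch of the two directions (illustrative
         * only, assuming hypothetical aes_encrypt_one()/aes_decrypt_one()
         * and xor_block() helpers operating on 16 byte blocks):
         *
         *      encrypt:        xor_block(tmp, in, iv);
         *                      aes_encrypt_one(out, tmp, rk, rounds);
         *                      memcpy(iv, out, 16);
         *
         *      decrypt:        aes_decrypt_one(tmp, in, rk, rounds);
         *                      xor_block(out, tmp, iv);
         *                      memcpy(iv, in, 16);
         *
         * The interleaved decrypt path below keeps copies of the incoming
         * ciphertext blocks around (v2/v3, or v4-v6 plus a reload of the
         * last block) so they can serve as the chaining values for the
         * blocks that follow.
         */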

AES_ENTRY(aes_cbc_encrypt)
        cbz             w6, .Lcbcencloop

        ld1             {v0.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6

.Lcbcencloop:
        ld1             {v1.16b}, [x1], #16             /* get next pt block */
        eor             v0.16b, v0.16b, v1.16b          /* ..and xor with iv */
        encrypt_block   v0, w3, x2, x6, w7
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcencloop
        st1             {v0.16b}, [x5]                  /* return iv */
        ret
AES_ENDPROC(aes_cbc_encrypt)


AES_ENTRY(aes_cbc_decrypt)
        FRAME_PUSH
        cbz             w6, .LcbcdecloopNx

        ld1             {v7.16b}, [x5]                  /* get iv */
        dec_prepare     w3, x2, x6

.LcbcdecloopNx:
#if INTERLEAVE >= 2
        subs            w4, w4, #INTERLEAVE
        bmi             .Lcbcdec1x
#if INTERLEAVE == 2
        ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
        mov             v2.16b, v0.16b
        mov             v3.16b, v1.16b
        do_decrypt_block2x
        eor             v0.16b, v0.16b, v7.16b
        eor             v1.16b, v1.16b, v2.16b
        mov             v7.16b, v3.16b
        st1             {v0.16b-v1.16b}, [x0], #32
#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        mov             v4.16b, v0.16b
        mov             v5.16b, v1.16b
        mov             v6.16b, v2.16b
        do_decrypt_block4x
        sub             x1, x1, #16
        eor             v0.16b, v0.16b, v7.16b
        eor             v1.16b, v1.16b, v4.16b
        ld1             {v7.16b}, [x1], #16             /* reload 1 ct block */
        eor             v2.16b, v2.16b, v5.16b
        eor             v3.16b, v3.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
#endif
        b               .LcbcdecloopNx
.Lcbcdec1x:
        adds            w4, w4, #INTERLEAVE
        beq             .Lcbcdecout
#endif
.Lcbcdecloop:
        ld1             {v1.16b}, [x1], #16             /* get next ct block */
        mov             v0.16b, v1.16b                  /* ...and copy to v0 */
        decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, v7.16b          /* xor with iv => pt */
        mov             v7.16b, v1.16b                  /* ct is next iv */
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcdecloop
.Lcbcdecout:
        FRAME_POP
        st1             {v7.16b}, [x5]                  /* return iv */
        ret
AES_ENDPROC(aes_cbc_decrypt)


        /*
         * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                 int blocks, u8 ctr[], int first)
         */
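
        /*
         * The counter block passed in x5 is treated as a 128-bit big-endian
         * integer.  Its low 64 bits are kept byte swapped in x8 so they can
         * be incremented with ordinary arithmetic; a carry out of those 64
         * bits is propagated into the upper half at .Lctrcarry.  The
         * interleaved path cannot propagate such a carry, so it is bypassed
         * up front (cmn w8, w4; bcs .Lctrloop) whenever adding 'blocks'
         * could overflow the low 32 bits of the counter.
         *
         * A C level sketch of one block (illustrative only, assuming a
         * hypothetical aes_encrypt_one() helper, an xor_block() helper and
         * a 128-bit big-endian increment):
         *
         *      aes_encrypt_one(keystream, ctr, rk, rounds);
         *      xor_block(out, in, keystream);
         *      be128_inc(ctr);
         *
         * A block count that goes negative in the tail loop means the final
         * block is a half block of 8 bytes; see .Lctrhalfblock.
         */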

AES_ENTRY(aes_ctr_encrypt)
        FRAME_PUSH
        cbz             w6, .Lctrnotfirst       /* 1st time around? */
        enc_prepare     w3, x2, x6
        ld1             {v4.16b}, [x5]

.Lctrnotfirst:
        umov            x8, v4.d[1]             /* keep swabbed ctr in reg */
        rev             x8, x8
#if INTERLEAVE >= 2
        cmn             w8, w4                  /* 32 bit overflow? */
        bcs             .Lctrloop
.LctrloopNx:
        subs            w4, w4, #INTERLEAVE
        bmi             .Lctr1x
#if INTERLEAVE == 2
        mov             v0.8b, v4.8b
        mov             v1.8b, v4.8b
        rev             x7, x8
        add             x8, x8, #1
        ins             v0.d[1], x7
        rev             x7, x8
        add             x8, x8, #1
        ins             v1.d[1], x7
        ld1             {v2.16b-v3.16b}, [x1], #32      /* get 2 input blocks */
        do_encrypt_block2x
        eor             v0.16b, v0.16b, v2.16b
        eor             v1.16b, v1.16b, v3.16b
        st1             {v0.16b-v1.16b}, [x0], #32
#else
        ldr             q8, =0x30000000200000001        /* addends 1,2,3[,0] */
        dup             v7.4s, w8
        mov             v0.16b, v4.16b
        add             v7.4s, v7.4s, v8.4s
        mov             v1.16b, v4.16b
        rev32           v8.16b, v7.16b
        mov             v2.16b, v4.16b
        mov             v3.16b, v4.16b
        mov             v1.s[3], v8.s[0]
        mov             v2.s[3], v8.s[1]
        mov             v3.s[3], v8.s[2]
        ld1             {v5.16b-v7.16b}, [x1], #48      /* get 3 input blocks */
        do_encrypt_block4x
        eor             v0.16b, v5.16b, v0.16b
        ld1             {v5.16b}, [x1], #16             /* get 1 input block */
        eor             v1.16b, v6.16b, v1.16b
        eor             v2.16b, v7.16b, v2.16b
        eor             v3.16b, v5.16b, v3.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        add             x8, x8, #INTERLEAVE
#endif
        rev             x7, x8
        ins             v4.d[1], x7
        cbz             w4, .Lctrout
        b               .LctrloopNx
.Lctr1x:
        adds            w4, w4, #INTERLEAVE
        beq             .Lctrout
#endif
.Lctrloop:
        mov             v0.16b, v4.16b
        encrypt_block   v0, w3, x2, x6, w7

        adds            x8, x8, #1              /* increment BE ctr */
        rev             x7, x8
        ins             v4.d[1], x7
        bcs             .Lctrcarry              /* overflow? */

.Lctrcarrydone:
        subs            w4, w4, #1
        bmi             .Lctrhalfblock          /* blocks < 0 means 1/2 block */
        ld1             {v3.16b}, [x1], #16
        eor             v3.16b, v0.16b, v3.16b
        st1             {v3.16b}, [x0], #16
        bne             .Lctrloop

.Lctrout:
        st1             {v4.16b}, [x5]          /* return next CTR value */
        FRAME_POP
        ret

.Lctrhalfblock:
        ld1             {v3.8b}, [x1]
        eor             v3.8b, v0.8b, v3.8b
        st1             {v3.8b}, [x0]
        FRAME_POP
        ret

.Lctrcarry:
        umov            x7, v4.d[0]             /* load upper word of ctr */
        rev             x7, x7                  /* ... to handle the carry */
        add             x7, x7, #1
        rev             x7, x7
        ins             v4.d[0], x7
        b               .Lctrcarrydone
AES_ENDPROC(aes_ctr_encrypt)
        .ltorg


        /*
         * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
         *                 int blocks, u8 const rk2[], u8 iv[], int first)
         * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
         *                 int blocks, u8 const rk2[], u8 iv[], int first)
         */
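
        /*
         * XTS uses two keys: rk1 (x2) encrypts the data, rk2 (x5) encrypts
         * the IV into the initial tweak; the IV is passed in x6 and 'first'
         * in w7.  In rough C terms (illustrative only, assuming hypothetical
         * single block and xor helpers), each 16 byte block is handled as:
         *
         *      aes_encrypt_one(T, iv, rk2, rounds);    only when 'first' is
         *                                              set; otherwise the
         *                                              tweak left in v4 by
         *                                              the previous call is
         *                                              advanced and reused
         *      xor_block(tmp, in, T);
         *      aes_encrypt_one(tmp, tmp, rk1, rounds);
         *      xor_block(out, tmp, T);
         *      gf128_mul_x(T);                         advance the tweak
         *
         * Decryption is identical except that the data blocks go through
         * the decryption rounds; the tweak is always computed with the
         * encryption key schedule of rk2.  gf128_mul_x() doubles the tweak
         * in GF(2^128) modulo the polynomial x^128 + x^7 + x^2 + x + 1.
         * The next_tweak macro below implements that doubling: it shifts
         * both 64-bit halves left by one (add .2d to itself) and folds the
         * bits shifted out of each half back in via the .Lxts_mul_x
         * constant: 0x87 for the bit leaving the top of the tweak, 1 for
         * the carry between the two halves.
         */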

        .macro          next_tweak, out, in, const, tmp
        sshr            \tmp\().2d,  \in\().2d,   #63
        and             \tmp\().16b, \tmp\().16b, \const\().16b
        add             \out\().2d,  \in\().2d,   \in\().2d
        ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor             \out\().16b, \out\().16b, \tmp\().16b
        .endm

.Lxts_mul_x:
CPU_LE( .quad           1, 0x87         )
CPU_BE( .quad           0x87, 1         )

AES_ENTRY(aes_xts_encrypt)
        FRAME_PUSH
        cbz             w7, .LxtsencloopNx

        ld1             {v4.16b}, [x6]
        enc_prepare     w3, x5, x6
        encrypt_block   v4, w3, x5, x6, w7              /* first tweak */
        enc_switch_key  w3, x2, x6
        ldr             q7, .Lxts_mul_x
        b               .LxtsencNx

.LxtsencloopNx:
        ldr             q7, .Lxts_mul_x
        next_tweak      v4, v4, v7, v8
.LxtsencNx:
#if INTERLEAVE >= 2
        subs            w4, w4, #INTERLEAVE
        bmi             .Lxtsenc1x
#if INTERLEAVE == 2
        ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 pt blocks */
        next_tweak      v5, v4, v7, v8
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        do_encrypt_block2x
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        st1             {v0.16b-v1.16b}, [x0], #32
        cbz             w4, .LxtsencoutNx
        next_tweak      v4, v5, v7, v8
        b               .LxtsencNx
.LxtsencoutNx:
        mov             v4.16b, v5.16b
        b               .Lxtsencout
#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        next_tweak      v5, v4, v7, v8
        eor             v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v7, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v7, v8
        eor             v3.16b, v3.16b, v7.16b
        do_encrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsencout
        b               .LxtsencloopNx
#endif
.Lxtsenc1x:
        adds            w4, w4, #INTERLEAVE
        beq             .Lxtsencout
#endif
.Lxtsencloop:
        ld1             {v1.16b}, [x1], #16
        eor             v0.16b, v1.16b, v4.16b
        encrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        beq             .Lxtsencout
        next_tweak      v4, v4, v7, v8
        b               .Lxtsencloop
.Lxtsencout:
        FRAME_POP
        ret
AES_ENDPROC(aes_xts_encrypt)


AES_ENTRY(aes_xts_decrypt)
        FRAME_PUSH
        cbz             w7, .LxtsdecloopNx

        ld1             {v4.16b}, [x6]
        enc_prepare     w3, x5, x6
        encrypt_block   v4, w3, x5, x6, w7              /* first tweak */
        dec_prepare     w3, x2, x6
        ldr             q7, .Lxts_mul_x
        b               .LxtsdecNx

.LxtsdecloopNx:
        ldr             q7, .Lxts_mul_x
        next_tweak      v4, v4, v7, v8
.LxtsdecNx:
#if INTERLEAVE >= 2
        subs            w4, w4, #INTERLEAVE
        bmi             .Lxtsdec1x
#if INTERLEAVE == 2
        ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
        next_tweak      v5, v4, v7, v8
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        do_decrypt_block2x
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        st1             {v0.16b-v1.16b}, [x0], #32
        cbz             w4, .LxtsdecoutNx
        next_tweak      v4, v5, v7, v8
        b               .LxtsdecNx
.LxtsdecoutNx:
        mov             v4.16b, v5.16b
        b               .Lxtsdecout
#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        next_tweak      v5, v4, v7, v8
        eor             v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v7, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v7, v8
        eor             v3.16b, v3.16b, v7.16b
        do_decrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsdecout
        b               .LxtsdecloopNx
#endif
.Lxtsdec1x:
        adds            w4, w4, #INTERLEAVE
        beq             .Lxtsdecout
#endif
.Lxtsdecloop:
        ld1             {v1.16b}, [x1], #16
        eor             v0.16b, v1.16b, v4.16b
        decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        beq             .Lxtsdecout
        next_tweak      v4, v4, v7, v8
        b               .Lxtsdecloop
.Lxtsdecout:
        FRAME_POP
        ret
AES_ENDPROC(aes_xts_decrypt)