// AES for AArch64 built on the aese/aesd/aesmc/aesimc instructions.
// NOTE(review): this text carries embedded line numbers and is wrapped at
// arbitrary points (the line break inside this span splits a "sub"
// instruction from its operands), so it looks like an extraction artifact
// rather than a buildable file - confirm against the real source. The
// instruction text below is untouched; only these comments are added.
//
// Lrcon - three vectors of four 32-bit words used by the key expansion:
//   +0   0x01 in every word       initial round constant, loaded into v1 and
//                                 shifted left by one (shl) after each use
//   +16  0x0c0f0e0d in every word tbl index vector, loaded into v2
//                                 ("rotate-n-splat"); L192 subtracts 8 from
//                                 every byte of it ("adjust the mask")
//   +32  0x1b in every word       loaded into v1 once Loop128 has finished
//                                 its 8 passes
//
// _aes_v8_set_encrypt_key
//   Also entered through the local label Lenc_key, which
//   _aes_v8_set_decrypt_key reaches with "bl Lenc_key".
//   In : x0 = address the key bytes are loaded from
//        w1 = key length in bits
//        x2 = address the round keys are stored to
//   Out: x0 = -1 if x0 or x2 is zero
//             -2 if w1 < 128 or w1 > 256 (signed compares) or w1 is not a
//                multiple of 64 (tst w1,#0x3f)
//              0 on success
//   On success the round count (10, 12 or 14) is left in w12 and stored as a
//   32-bit word at byte offset 240 from the original x2; x2 is left pointing
//   at that word. Each of the three paths arrives at offset 240 through its
//   own store/adjust sequence (add #0x50 for 128-bit, add #0x20 for 192-bit,
//   none needed for 256-bit).
//   Uses x0, w1, x2, x3, w12, v0-v6 and the flags.
//
//   "cmp w1,#192" sets the flags that select Loop128 / L192 / L256; the
//   eor, ld1 and mov between the compare and the branches do not write flags.
//   w1 is then reused as the loop counter (8, or 7 for L256).
//   The epilogue reloads only x29 (ldr x29,[sp],#16); x30 is not written
//   anywhere between the prologue and ret, so the saved copy is not needed.
//
// The span ends with the first three lines of _aes_v8_set_decrypt_key: its
// label, paciasp (emitted as .long 0xd503233f) and the frame push.
1#include "arm_arch.h" 2 3#if __ARM_MAX_ARCH__>=7 4 5.text 6.align 5 7Lrcon: 8.long 0x01,0x01,0x01,0x01 9.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat 10.long 0x1b,0x1b,0x1b,0x1b 11 12.globl _aes_v8_set_encrypt_key 13 14.align 5 15_aes_v8_set_encrypt_key: 16Lenc_key: 17 stp x29,x30,[sp,#-16]! 18 add x29,sp,#0 19 mov x3,#-1 20 cmp x0,#0 21 b.eq Lenc_key_abort 22 cmp x2,#0 23 b.eq Lenc_key_abort 24 mov x3,#-2 25 cmp w1,#128 26 b.lt Lenc_key_abort 27 cmp w1,#256 28 b.gt Lenc_key_abort 29 tst w1,#0x3f 30 b.ne Lenc_key_abort 31 32 adr x3,Lrcon 33 cmp w1,#192 34 35 eor v0.16b,v0.16b,v0.16b 36 ld1 {v3.16b},[x0],#16 37 mov w1,#8 // reuse w1 38 ld1 {v1.4s,v2.4s},[x3],#32 39 40 b.lt Loop128 41 b.eq L192 42 b L256 43 44.align 4 45Loop128: 46 tbl v6.16b,{v3.16b},v2.16b 47 ext v5.16b,v0.16b,v3.16b,#12 48 st1 {v3.4s},[x2],#16 49 aese v6.16b,v0.16b 50 subs w1,w1,#1 51 52 eor v3.16b,v3.16b,v5.16b 53 ext v5.16b,v0.16b,v5.16b,#12 54 eor v3.16b,v3.16b,v5.16b 55 ext v5.16b,v0.16b,v5.16b,#12 56 eor v6.16b,v6.16b,v1.16b 57 eor v3.16b,v3.16b,v5.16b 58 shl v1.16b,v1.16b,#1 59 eor v3.16b,v3.16b,v6.16b 60 b.ne Loop128 61 62 ld1 {v1.4s},[x3] 63 64 tbl v6.16b,{v3.16b},v2.16b 65 ext v5.16b,v0.16b,v3.16b,#12 66 st1 {v3.4s},[x2],#16 67 aese v6.16b,v0.16b 68 69 eor v3.16b,v3.16b,v5.16b 70 ext v5.16b,v0.16b,v5.16b,#12 71 eor v3.16b,v3.16b,v5.16b 72 ext v5.16b,v0.16b,v5.16b,#12 73 eor v6.16b,v6.16b,v1.16b 74 eor v3.16b,v3.16b,v5.16b 75 shl v1.16b,v1.16b,#1 76 eor v3.16b,v3.16b,v6.16b 77 78 tbl v6.16b,{v3.16b},v2.16b 79 ext v5.16b,v0.16b,v3.16b,#12 80 st1 {v3.4s},[x2],#16 81 aese v6.16b,v0.16b 82 83 eor v3.16b,v3.16b,v5.16b 84 ext v5.16b,v0.16b,v5.16b,#12 85 eor v3.16b,v3.16b,v5.16b 86 ext v5.16b,v0.16b,v5.16b,#12 87 eor v6.16b,v6.16b,v1.16b 88 eor v3.16b,v3.16b,v5.16b 89 eor v3.16b,v3.16b,v6.16b 90 st1 {v3.4s},[x2] 91 add x2,x2,#0x50 92 93 mov w12,#10 94 b Ldone 95 96.align 4 97L192: 98 ld1 {v4.8b},[x0],#8 99 movi v6.16b,#8 // borrow v6.16b 100 st1 {v3.4s},[x2],#16 101 sub 
v2.16b,v2.16b,v6.16b // adjust the mask 102 103Loop192: 104 tbl v6.16b,{v4.16b},v2.16b 105 ext v5.16b,v0.16b,v3.16b,#12 106#ifdef __ARMEB__ 107 st1 {v4.4s},[x2],#16 108 sub x2,x2,#8 109#else 110 st1 {v4.8b},[x2],#8 111#endif 112 aese v6.16b,v0.16b 113 subs w1,w1,#1 114 115 eor v3.16b,v3.16b,v5.16b 116 ext v5.16b,v0.16b,v5.16b,#12 117 eor v3.16b,v3.16b,v5.16b 118 ext v5.16b,v0.16b,v5.16b,#12 119 eor v3.16b,v3.16b,v5.16b 120 121 dup v5.4s,v3.s[3] 122 eor v5.16b,v5.16b,v4.16b 123 eor v6.16b,v6.16b,v1.16b 124 ext v4.16b,v0.16b,v4.16b,#12 125 shl v1.16b,v1.16b,#1 126 eor v4.16b,v4.16b,v5.16b 127 eor v3.16b,v3.16b,v6.16b 128 eor v4.16b,v4.16b,v6.16b 129 st1 {v3.4s},[x2],#16 130 b.ne Loop192 131 132 mov w12,#12 133 add x2,x2,#0x20 134 b Ldone 135 136.align 4 137L256: 138 ld1 {v4.16b},[x0] 139 mov w1,#7 140 mov w12,#14 141 st1 {v3.4s},[x2],#16 142 143Loop256: 144 tbl v6.16b,{v4.16b},v2.16b 145 ext v5.16b,v0.16b,v3.16b,#12 146 st1 {v4.4s},[x2],#16 147 aese v6.16b,v0.16b 148 subs w1,w1,#1 149 150 eor v3.16b,v3.16b,v5.16b 151 ext v5.16b,v0.16b,v5.16b,#12 152 eor v3.16b,v3.16b,v5.16b 153 ext v5.16b,v0.16b,v5.16b,#12 154 eor v6.16b,v6.16b,v1.16b 155 eor v3.16b,v3.16b,v5.16b 156 shl v1.16b,v1.16b,#1 157 eor v3.16b,v3.16b,v6.16b 158 st1 {v3.4s},[x2],#16 159 b.eq Ldone 160 161 dup v6.4s,v3.s[3] // just splat 162 ext v5.16b,v0.16b,v4.16b,#12 163 aese v6.16b,v0.16b 164 165 eor v4.16b,v4.16b,v5.16b 166 ext v5.16b,v0.16b,v5.16b,#12 167 eor v4.16b,v4.16b,v5.16b 168 ext v5.16b,v0.16b,v5.16b,#12 169 eor v4.16b,v4.16b,v5.16b 170 171 eor v4.16b,v4.16b,v6.16b 172 b Loop256 173 174Ldone: 175 str w12,[x2] 176 mov x3,#0 177 178Lenc_key_abort: 179 mov x0,x3 // return value 180 ldr x29,[sp],#16 181 ret 182 183 184.globl _aes_v8_set_decrypt_key 185 186.align 5 187_aes_v8_set_decrypt_key: 188.long 0xd503233f // paciasp 189 stp x29,x30,[sp,#-16]! 
// This line of text continues _aes_v8_set_decrypt_key (its label, paciasp and
// frame push precede it), then holds _aes_v8_encrypt, _aes_v8_decrypt and the
// first instructions of _aes_v8_ecb_encrypt.
//
// _aes_v8_set_decrypt_key
//   x0, w1 and x2 are passed unchanged to Lenc_key (bl Lenc_key). A non-zero
//   result from it is returned as is through Ldec_key_abort.
//   On success the code relies on what Lenc_key leaves behind: x2 (240 bytes
//   past the start of the schedule) and x12 (round count).
//     x2 = start of schedule, x0 = x2 + x12*16 = last round key, x4 = -16.
//     The first and last round keys are swapped unmodified.
//     Loop_imc swaps the remaining keys pairwise from the outside in, passing
//     each through aesimc, while x0 > x2 (unsigned, b.hi).
//     With 10/12/14 rounds the key count is odd, so the pointers meet on the
//     middle key, which is passed through aesimc in place.
//   Returns x0 = 0. The epilogue restores x29 and x30 and executes autiasp
//   (emitted as .long 0xd50323bf).
//   Uses x0, x2, x4, v0, v1, flags, plus everything Lenc_key uses.
//
// _aes_v8_encrypt - one 16-byte block
//   In : x0 = address of the input block
//        x1 = address the result is stored to
//        x2 = key schedule; the round count is read from [x2,#240]
//   Leaf routine, no stack use. Uses w3, x2, v0, v1, v2, flags.
//   w3 starts at rounds-2; Loop_enc performs two rounds per pass (round keys
//   alternate between v0 and v1) and repeats while the decremented w3 > 0.
//   After the loop: one more aese/aesmc, a final aese without aesmc, and an
//   eor with the last round key.
//
// _aes_v8_decrypt
//   Same register usage and loop shape as _aes_v8_encrypt, using aesd/aesimc.
//
// _aes_v8_ecb_encrypt (entry only; the rest follows this line)
//   x2 is the length in bytes: "subs x2,x2,#16" leaves Z set only for a
//   length of exactly 16, which takes the single-block path that starts here;
//   any other length branches to Lecb_big_size.
//   x0 = input block, w4 is compared with zero to pick the direction (the
//   flags are consumed by a b.eq a few instructions later).
190 add x29,sp,#0 191 bl Lenc_key 192 193 cmp x0,#0 194 b.ne Ldec_key_abort 195 196 sub x2,x2,#240 // restore original x2 197 mov x4,#-16 198 add x0,x2,x12,lsl#4 // end of key schedule 199 200 ld1 {v0.4s},[x2] 201 ld1 {v1.4s},[x0] 202 st1 {v0.4s},[x0],x4 203 st1 {v1.4s},[x2],#16 204 205Loop_imc: 206 ld1 {v0.4s},[x2] 207 ld1 {v1.4s},[x0] 208 aesimc v0.16b,v0.16b 209 aesimc v1.16b,v1.16b 210 st1 {v0.4s},[x0],x4 211 st1 {v1.4s},[x2],#16 212 cmp x0,x2 213 b.hi Loop_imc 214 215 ld1 {v0.4s},[x2] 216 aesimc v0.16b,v0.16b 217 st1 {v0.4s},[x0] 218 219 eor x0,x0,x0 // return value 220Ldec_key_abort: 221 ldp x29,x30,[sp],#16 222.long 0xd50323bf // autiasp 223 ret 224 225.globl _aes_v8_encrypt 226 227.align 5 228_aes_v8_encrypt: 229 ldr w3,[x2,#240] 230 ld1 {v0.4s},[x2],#16 231 ld1 {v2.16b},[x0] 232 sub w3,w3,#2 233 ld1 {v1.4s},[x2],#16 234 235Loop_enc: 236 aese v2.16b,v0.16b 237 aesmc v2.16b,v2.16b 238 ld1 {v0.4s},[x2],#16 239 subs w3,w3,#2 240 aese v2.16b,v1.16b 241 aesmc v2.16b,v2.16b 242 ld1 {v1.4s},[x2],#16 243 b.gt Loop_enc 244 245 aese v2.16b,v0.16b 246 aesmc v2.16b,v2.16b 247 ld1 {v0.4s},[x2] 248 aese v2.16b,v1.16b 249 eor v2.16b,v2.16b,v0.16b 250 251 st1 {v2.16b},[x1] 252 ret 253 254.globl _aes_v8_decrypt 255 256.align 5 257_aes_v8_decrypt: 258 ldr w3,[x2,#240] 259 ld1 {v0.4s},[x2],#16 260 ld1 {v2.16b},[x0] 261 sub w3,w3,#2 262 ld1 {v1.4s},[x2],#16 263 264Loop_dec: 265 aesd v2.16b,v0.16b 266 aesimc v2.16b,v2.16b 267 ld1 {v0.4s},[x2],#16 268 subs w3,w3,#2 269 aesd v2.16b,v1.16b 270 aesimc v2.16b,v2.16b 271 ld1 {v1.4s},[x2],#16 272 b.gt Loop_dec 273 274 aesd v2.16b,v0.16b 275 aesimc v2.16b,v2.16b 276 ld1 {v0.4s},[x2] 277 aesd v2.16b,v1.16b 278 eor v2.16b,v2.16b,v0.16b 279 280 st1 {v2.16b},[x1] 281 ret 282 283.globl _aes_v8_ecb_encrypt 284 285.align 5 286_aes_v8_ecb_encrypt: 287 subs x2,x2,#16 288 // Original input data size bigger than 16, jump to big size processing. 289 b.ne Lecb_big_size 290 ld1 {v0.16b},[x0] 291 cmp w4,#0 // en- or decrypting? 
// _aes_v8_ecb_encrypt, single-block path (length == 16), continuing from the
// entry sequence that precedes this line.
//   Registers as used here:
//     v0 = the block (already loaded from [x0])
//     x1 = address the result is stored to
//     x3 = key schedule; w5 = round count read from [x3,#240]
//     flags = result of "cmp w4,#0": equal (w4 == 0) branches to
//             Lecb_small_dec, otherwise the encrypt code falls through. The
//             ldr/ld1 between the compare and the b.eq do not write flags.
//   v5,v6 hold the first two round keys and v16,v17 the next two. After the
//   first two rounds "subs w5,w5,#10" tests for a 10-round key: if so the
//   round loop is skipped, otherwise Lecb_round_loop / Lecb_dec_round_loop
//   runs two rounds per pass, reloading v16/v17, while the decremented w5 > 0.
//   Lecb_128_enc / Lecb_128_dec then perform the final eight rounds with
//   v16-v23 (the last one without aesmc/aesimc) and xor in the last round
//   key v7. Lecb_128_dec starts on this line and finishes on the next.
//   No frame is pushed on this path, so it leaves through Lecb_Final_abort
//   (a bare ret) rather than Lecb_done.
//   Uses x3, w5, v0, v5-v7, v16-v23, flags.
292 ldr w5,[x3,#240] 293 ld1 {v5.4s,v6.4s},[x3],#32 // load key schedule... 294 295 b.eq Lecb_small_dec 296 aese v0.16b,v5.16b 297 aesmc v0.16b,v0.16b 298 ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule... 299 aese v0.16b,v6.16b 300 aesmc v0.16b,v0.16b 301 subs w5,w5,#10 // if rounds==10, jump to aes-128-ecb processing 302 b.eq Lecb_128_enc 303Lecb_round_loop: 304 aese v0.16b,v16.16b 305 aesmc v0.16b,v0.16b 306 ld1 {v16.4s},[x3],#16 // load key schedule... 307 aese v0.16b,v17.16b 308 aesmc v0.16b,v0.16b 309 ld1 {v17.4s},[x3],#16 // load key schedule... 310 subs w5,w5,#2 // bias 311 b.gt Lecb_round_loop 312Lecb_128_enc: 313 ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule... 314 aese v0.16b,v16.16b 315 aesmc v0.16b,v0.16b 316 aese v0.16b,v17.16b 317 aesmc v0.16b,v0.16b 318 ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule... 319 aese v0.16b,v18.16b 320 aesmc v0.16b,v0.16b 321 aese v0.16b,v19.16b 322 aesmc v0.16b,v0.16b 323 ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule... 324 aese v0.16b,v20.16b 325 aesmc v0.16b,v0.16b 326 aese v0.16b,v21.16b 327 aesmc v0.16b,v0.16b 328 ld1 {v7.4s},[x3] 329 aese v0.16b,v22.16b 330 aesmc v0.16b,v0.16b 331 aese v0.16b,v23.16b 332 eor v0.16b,v0.16b,v7.16b 333 st1 {v0.16b},[x1] 334 b Lecb_Final_abort 335Lecb_small_dec: 336 aesd v0.16b,v5.16b 337 aesimc v0.16b,v0.16b 338 ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule... 339 aesd v0.16b,v6.16b 340 aesimc v0.16b,v0.16b 341 subs w5,w5,#10 // bias 342 b.eq Lecb_128_dec 343Lecb_dec_round_loop: 344 aesd v0.16b,v16.16b 345 aesimc v0.16b,v0.16b 346 ld1 {v16.4s},[x3],#16 // load key schedule... 347 aesd v0.16b,v17.16b 348 aesimc v0.16b,v0.16b 349 ld1 {v17.4s},[x3],#16 // load key schedule... 350 subs w5,w5,#2 // bias 351 b.gt Lecb_dec_round_loop 352Lecb_128_dec: 353 ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule... 354 aesd v0.16b,v16.16b 355 aesimc v0.16b,v0.16b 356 aesd v0.16b,v17.16b 357 aesimc v0.16b,v0.16b 358 ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule... 
// This span holds, in order: the tail of Lecb_128_dec (end of the
// single-block ECB path), the multi-block ECB path Lecb_big_size ... Lecb_done,
// and then the entry, the encrypt paths and the start of the decrypt path of
// _aes_v8_cbc_encrypt. The text is wrapped mid-instruction in several places,
// so all notes are collected here rather than placed next to the code.
//
// Lecb_big_size (length != 16)
//   Pushes x29/x30. The flags still come from "subs x2,x2,#16" at function
//   entry (stp, add and mov do not write them): b.lo leaves through Lecb_done
//   when the length is below 16. Because this path is only reached with Z
//   clear, the following "csel x8,xzr,x8,eq" keeps x8 = 16.
//   x2 is rounded down to a multiple of 16 (and x2,x2,#-16).
//   Round keys: v16,v17 = first two (re-loaded through x7 as rounds advance),
//   v18-v23 and v7 = last seven (loaded via x7 = x3 + (rounds-6)*16);
//   w5 = rounds-8; w6 = inner round-loop counter; x7 is reset to walk the
//   schedule from x3+32.
//   "cmp w4,#0" selects the direction; its flags survive the ldr/and/ld1/
//   sub/add/mov that follow and are consumed by "b.eq Lecb_dec".
//   The encrypt and decrypt sides have the same shape:
//     Loop5x_ecb_enc / Loop5x_ecb_dec  five blocks in v0,v1,v24,v25,v26
//     Loop3x_ecb_enc / Loop3x_ecb_dec  three blocks in v0,v1,v24; entered
//         with w6 = w5+2, so the inner loop covers two more rounds and the
//         code after it goes from v17 straight to v20 (v18/v19 unused)
//     Lecb_enc_tail / Lecb_dec_tail    blocks in v1 and v24; "cmn x2,#0x20"
//         selects Lecb_enc_one / Lecb_dec_one, which stores only v24
//     Lecb_enc_tail4x / Lecb_tail4x    taken from the 5x code when
//         x2 + 0x60 == 0 after the bias adjustments; stores v1,v24,v25,v26
//   Lecb_done reloads only x29 (x30 is never written on this path) and falls
//   into Lecb_Final_abort, the shared ret.
//
// _aes_v8_cbc_encrypt
//   In : x0 = input, x1 = output, x2 = length in bytes, x3 = key schedule,
//        x4 = address of the 16-byte IV (loaded into v6),
//        w5 = direction (zero branches to Lcbc_dec); w5 is then overwritten
//             with the round count from [x3,#240].
//   A length below 16 leaves through Lcbc_abort. x2 is rounded down to a
//   multiple of 16.
//   x8 is the post-increment applied to x0 by the input loads; it is forced
//   to zero whenever the remaining count reaches zero (csel ...,eq), so the
//   look-ahead load of the "next" block then stops advancing x0.
//   Key registers are set up as in Lecb_big_size (v16,v17, v18-v23, v7,
//   w5 = rounds-8).
//   Encrypt:
//     v5 = v16 ^ v7 (first round key xor last round key). Each following
//     input block is xored with v5 and used directly as the operand of the
//     first aese of the next pass, where v0 still holds the previous result
//     before its final xor with v7.
//     w5 == 2 (10 rounds) -> Lcbc_enc128, with every round key in registers
//     (v16,v17,v2,v3,v18-v23,v7).
//     Otherwise x7,x6,x12,x14,x3 are pointed at round keys 1,4,5,6,7 and
//     v16/v17 are reloaded from them each pass; w5 == 4 (12 rounds) jumps to
//     Lcbc_enc192, skipping two rounds.
//     v6 holds the most recent output block; it is stored at the top of the
//     next pass, or after the loop for the last block.
//   Decrypt (Lcbc_dec, at the end of this span): loads up to five input
//   blocks into v0,v1,v24,v25,v26, keeps copies in v2,v3,v27,v28,v29, and
//   enters Loop5x_cbc_dec, Loop3x_cbc_dec or Lcbc_dec_tail depending on the
//   remaining length.
359 aesd v0.16b,v18.16b 360 aesimc v0.16b,v0.16b 361 aesd v0.16b,v19.16b 362 aesimc v0.16b,v0.16b 363 ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule... 364 aesd v0.16b,v20.16b 365 aesimc v0.16b,v0.16b 366 aesd v0.16b,v21.16b 367 aesimc v0.16b,v0.16b 368 ld1 {v7.4s},[x3] 369 aesd v0.16b,v22.16b 370 aesimc v0.16b,v0.16b 371 aesd v0.16b,v23.16b 372 eor v0.16b,v0.16b,v7.16b 373 st1 {v0.16b},[x1] 374 b Lecb_Final_abort 375Lecb_big_size: 376 stp x29,x30,[sp,#-16]! 377 add x29,sp,#0 378 mov x8,#16 379 b.lo Lecb_done 380 csel x8,xzr,x8,eq 381 382 cmp w4,#0 // en- or decrypting? 383 ldr w5,[x3,#240] 384 and x2,x2,#-16 385 ld1 {v0.16b},[x0],x8 386 387 ld1 {v16.4s,v17.4s},[x3] // load key schedule... 388 sub w5,w5,#6 389 add x7,x3,x5,lsl#4 // pointer to last 7 round keys 390 sub w5,w5,#2 391 ld1 {v18.4s,v19.4s},[x7],#32 392 ld1 {v20.4s,v21.4s},[x7],#32 393 ld1 {v22.4s,v23.4s},[x7],#32 394 ld1 {v7.4s},[x7] 395 396 add x7,x3,#32 397 mov w6,w5 398 b.eq Lecb_dec 399 400 ld1 {v1.16b},[x0],#16 401 subs x2,x2,#32 // bias 402 add w6,w5,#2 403 orr v3.16b,v1.16b,v1.16b 404 orr v24.16b,v1.16b,v1.16b 405 orr v1.16b,v0.16b,v0.16b 406 b.lo Lecb_enc_tail 407 408 orr v1.16b,v3.16b,v3.16b 409 ld1 {v24.16b},[x0],#16 410 cmp x2,#32 411 b.lo Loop3x_ecb_enc 412 413 ld1 {v25.16b},[x0],#16 414 ld1 {v26.16b},[x0],#16 415 sub x2,x2,#32 // bias 416 mov w6,w5 417 418Loop5x_ecb_enc: 419 aese v0.16b,v16.16b 420 aesmc v0.16b,v0.16b 421 aese v1.16b,v16.16b 422 aesmc v1.16b,v1.16b 423 aese v24.16b,v16.16b 424 aesmc v24.16b,v24.16b 425 aese v25.16b,v16.16b 426 aesmc v25.16b,v25.16b 427 aese v26.16b,v16.16b 428 aesmc v26.16b,v26.16b 429 ld1 {v16.4s},[x7],#16 430 subs w6,w6,#2 431 aese v0.16b,v17.16b 432 aesmc v0.16b,v0.16b 433 aese v1.16b,v17.16b 434 aesmc v1.16b,v1.16b 435 aese v24.16b,v17.16b 436 aesmc v24.16b,v24.16b 437 aese v25.16b,v17.16b 438 aesmc v25.16b,v25.16b 439 aese v26.16b,v17.16b 440 aesmc v26.16b,v26.16b 441 ld1 {v17.4s},[x7],#16 442 b.gt Loop5x_ecb_enc 443 444 aese v0.16b,v16.16b 445 aesmc 
v0.16b,v0.16b 446 aese v1.16b,v16.16b 447 aesmc v1.16b,v1.16b 448 aese v24.16b,v16.16b 449 aesmc v24.16b,v24.16b 450 aese v25.16b,v16.16b 451 aesmc v25.16b,v25.16b 452 aese v26.16b,v16.16b 453 aesmc v26.16b,v26.16b 454 cmp x2,#0x40 // because Lecb_enc_tail4x 455 sub x2,x2,#0x50 456 457 aese v0.16b,v17.16b 458 aesmc v0.16b,v0.16b 459 aese v1.16b,v17.16b 460 aesmc v1.16b,v1.16b 461 aese v24.16b,v17.16b 462 aesmc v24.16b,v24.16b 463 aese v25.16b,v17.16b 464 aesmc v25.16b,v25.16b 465 aese v26.16b,v17.16b 466 aesmc v26.16b,v26.16b 467 csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo 468 mov x7,x3 469 470 aese v0.16b,v18.16b 471 aesmc v0.16b,v0.16b 472 aese v1.16b,v18.16b 473 aesmc v1.16b,v1.16b 474 aese v24.16b,v18.16b 475 aesmc v24.16b,v24.16b 476 aese v25.16b,v18.16b 477 aesmc v25.16b,v25.16b 478 aese v26.16b,v18.16b 479 aesmc v26.16b,v26.16b 480 add x0,x0,x6 // x0 is adjusted in such way that 481 // at exit from the loop v1.16b-v26.16b 482 // are loaded with last "words" 483 add x6,x2,#0x60 // because Lecb_enc_tail4x 484 485 aese v0.16b,v19.16b 486 aesmc v0.16b,v0.16b 487 aese v1.16b,v19.16b 488 aesmc v1.16b,v1.16b 489 aese v24.16b,v19.16b 490 aesmc v24.16b,v24.16b 491 aese v25.16b,v19.16b 492 aesmc v25.16b,v25.16b 493 aese v26.16b,v19.16b 494 aesmc v26.16b,v26.16b 495 496 aese v0.16b,v20.16b 497 aesmc v0.16b,v0.16b 498 aese v1.16b,v20.16b 499 aesmc v1.16b,v1.16b 500 aese v24.16b,v20.16b 501 aesmc v24.16b,v24.16b 502 aese v25.16b,v20.16b 503 aesmc v25.16b,v25.16b 504 aese v26.16b,v20.16b 505 aesmc v26.16b,v26.16b 506 507 aese v0.16b,v21.16b 508 aesmc v0.16b,v0.16b 509 aese v1.16b,v21.16b 510 aesmc v1.16b,v1.16b 511 aese v24.16b,v21.16b 512 aesmc v24.16b,v24.16b 513 aese v25.16b,v21.16b 514 aesmc v25.16b,v25.16b 515 aese v26.16b,v21.16b 516 aesmc v26.16b,v26.16b 517 518 aese v0.16b,v22.16b 519 aesmc v0.16b,v0.16b 520 aese v1.16b,v22.16b 521 aesmc v1.16b,v1.16b 522 aese v24.16b,v22.16b 523 aesmc v24.16b,v24.16b 524 aese v25.16b,v22.16b 525 aesmc v25.16b,v25.16b 526 
aese v26.16b,v22.16b 527 aesmc v26.16b,v26.16b 528 529 aese v0.16b,v23.16b 530 ld1 {v2.16b},[x0],#16 531 aese v1.16b,v23.16b 532 ld1 {v3.16b},[x0],#16 533 aese v24.16b,v23.16b 534 ld1 {v27.16b},[x0],#16 535 aese v25.16b,v23.16b 536 ld1 {v28.16b},[x0],#16 537 aese v26.16b,v23.16b 538 ld1 {v29.16b},[x0],#16 539 cbz x6,Lecb_enc_tail4x 540 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] 541 eor v4.16b,v7.16b,v0.16b 542 orr v0.16b,v2.16b,v2.16b 543 eor v5.16b,v7.16b,v1.16b 544 orr v1.16b,v3.16b,v3.16b 545 eor v17.16b,v7.16b,v24.16b 546 orr v24.16b,v27.16b,v27.16b 547 eor v30.16b,v7.16b,v25.16b 548 orr v25.16b,v28.16b,v28.16b 549 eor v31.16b,v7.16b,v26.16b 550 st1 {v4.16b},[x1],#16 551 orr v26.16b,v29.16b,v29.16b 552 st1 {v5.16b},[x1],#16 553 mov w6,w5 554 st1 {v17.16b},[x1],#16 555 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] 556 st1 {v30.16b},[x1],#16 557 st1 {v31.16b},[x1],#16 558 b.hs Loop5x_ecb_enc 559 560 add x2,x2,#0x50 561 cbz x2,Lecb_done 562 563 add w6,w5,#2 564 subs x2,x2,#0x30 565 orr v0.16b,v27.16b,v27.16b 566 orr v1.16b,v28.16b,v28.16b 567 orr v24.16b,v29.16b,v29.16b 568 b.lo Lecb_enc_tail 569 570 b Loop3x_ecb_enc 571 572.align 4 573Lecb_enc_tail4x: 574 eor v5.16b,v7.16b,v1.16b 575 eor v17.16b,v7.16b,v24.16b 576 eor v30.16b,v7.16b,v25.16b 577 eor v31.16b,v7.16b,v26.16b 578 st1 {v5.16b},[x1],#16 579 st1 {v17.16b},[x1],#16 580 st1 {v30.16b},[x1],#16 581 st1 {v31.16b},[x1],#16 582 583 b Lecb_done 584.align 4 585Loop3x_ecb_enc: 586 aese v0.16b,v16.16b 587 aesmc v0.16b,v0.16b 588 aese v1.16b,v16.16b 589 aesmc v1.16b,v1.16b 590 aese v24.16b,v16.16b 591 aesmc v24.16b,v24.16b 592 ld1 {v16.4s},[x7],#16 593 subs w6,w6,#2 594 aese v0.16b,v17.16b 595 aesmc v0.16b,v0.16b 596 aese v1.16b,v17.16b 597 aesmc v1.16b,v1.16b 598 aese v24.16b,v17.16b 599 aesmc v24.16b,v24.16b 600 ld1 {v17.4s},[x7],#16 601 b.gt Loop3x_ecb_enc 602 603 aese v0.16b,v16.16b 604 aesmc v0.16b,v0.16b 605 aese v1.16b,v16.16b 606 aesmc v1.16b,v1.16b 607 aese v24.16b,v16.16b 608 aesmc v24.16b,v24.16b 609 
subs x2,x2,#0x30 610 csel x6,x2,x6,lo // x6, w6, is zero at this point 611 aese v0.16b,v17.16b 612 aesmc v0.16b,v0.16b 613 aese v1.16b,v17.16b 614 aesmc v1.16b,v1.16b 615 aese v24.16b,v17.16b 616 aesmc v24.16b,v24.16b 617 add x0,x0,x6 // x0 is adjusted in such way that 618 // at exit from the loop v1.16b-v24.16b 619 // are loaded with last "words" 620 mov x7,x3 621 aese v0.16b,v20.16b 622 aesmc v0.16b,v0.16b 623 aese v1.16b,v20.16b 624 aesmc v1.16b,v1.16b 625 aese v24.16b,v20.16b 626 aesmc v24.16b,v24.16b 627 ld1 {v2.16b},[x0],#16 628 aese v0.16b,v21.16b 629 aesmc v0.16b,v0.16b 630 aese v1.16b,v21.16b 631 aesmc v1.16b,v1.16b 632 aese v24.16b,v21.16b 633 aesmc v24.16b,v24.16b 634 ld1 {v3.16b},[x0],#16 635 aese v0.16b,v22.16b 636 aesmc v0.16b,v0.16b 637 aese v1.16b,v22.16b 638 aesmc v1.16b,v1.16b 639 aese v24.16b,v22.16b 640 aesmc v24.16b,v24.16b 641 ld1 {v27.16b},[x0],#16 642 aese v0.16b,v23.16b 643 aese v1.16b,v23.16b 644 aese v24.16b,v23.16b 645 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] 646 add w6,w5,#2 647 eor v4.16b,v7.16b,v0.16b 648 eor v5.16b,v7.16b,v1.16b 649 eor v24.16b,v24.16b,v7.16b 650 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] 651 st1 {v4.16b},[x1],#16 652 orr v0.16b,v2.16b,v2.16b 653 st1 {v5.16b},[x1],#16 654 orr v1.16b,v3.16b,v3.16b 655 st1 {v24.16b},[x1],#16 656 orr v24.16b,v27.16b,v27.16b 657 b.hs Loop3x_ecb_enc 658 659 cmn x2,#0x30 660 b.eq Lecb_done 661 nop 662 663Lecb_enc_tail: 664 aese v1.16b,v16.16b 665 aesmc v1.16b,v1.16b 666 aese v24.16b,v16.16b 667 aesmc v24.16b,v24.16b 668 ld1 {v16.4s},[x7],#16 669 subs w6,w6,#2 670 aese v1.16b,v17.16b 671 aesmc v1.16b,v1.16b 672 aese v24.16b,v17.16b 673 aesmc v24.16b,v24.16b 674 ld1 {v17.4s},[x7],#16 675 b.gt Lecb_enc_tail 676 677 aese v1.16b,v16.16b 678 aesmc v1.16b,v1.16b 679 aese v24.16b,v16.16b 680 aesmc v24.16b,v24.16b 681 aese v1.16b,v17.16b 682 aesmc v1.16b,v1.16b 683 aese v24.16b,v17.16b 684 aesmc v24.16b,v24.16b 685 aese v1.16b,v20.16b 686 aesmc v1.16b,v1.16b 687 aese v24.16b,v20.16b 688 
aesmc v24.16b,v24.16b 689 cmn x2,#0x20 690 aese v1.16b,v21.16b 691 aesmc v1.16b,v1.16b 692 aese v24.16b,v21.16b 693 aesmc v24.16b,v24.16b 694 aese v1.16b,v22.16b 695 aesmc v1.16b,v1.16b 696 aese v24.16b,v22.16b 697 aesmc v24.16b,v24.16b 698 aese v1.16b,v23.16b 699 aese v24.16b,v23.16b 700 b.eq Lecb_enc_one 701 eor v5.16b,v7.16b,v1.16b 702 eor v17.16b,v7.16b,v24.16b 703 st1 {v5.16b},[x1],#16 704 st1 {v17.16b},[x1],#16 705 b Lecb_done 706 707Lecb_enc_one: 708 eor v5.16b,v7.16b,v24.16b 709 st1 {v5.16b},[x1],#16 710 b Lecb_done 711.align 5 712Lecb_dec: 713 ld1 {v1.16b},[x0],#16 714 subs x2,x2,#32 // bias 715 add w6,w5,#2 716 orr v3.16b,v1.16b,v1.16b 717 orr v24.16b,v1.16b,v1.16b 718 orr v1.16b,v0.16b,v0.16b 719 b.lo Lecb_dec_tail 720 721 orr v1.16b,v3.16b,v3.16b 722 ld1 {v24.16b},[x0],#16 723 cmp x2,#32 724 b.lo Loop3x_ecb_dec 725 726 ld1 {v25.16b},[x0],#16 727 ld1 {v26.16b},[x0],#16 728 sub x2,x2,#32 // bias 729 mov w6,w5 730 731Loop5x_ecb_dec: 732 aesd v0.16b,v16.16b 733 aesimc v0.16b,v0.16b 734 aesd v1.16b,v16.16b 735 aesimc v1.16b,v1.16b 736 aesd v24.16b,v16.16b 737 aesimc v24.16b,v24.16b 738 aesd v25.16b,v16.16b 739 aesimc v25.16b,v25.16b 740 aesd v26.16b,v16.16b 741 aesimc v26.16b,v26.16b 742 ld1 {v16.4s},[x7],#16 743 subs w6,w6,#2 744 aesd v0.16b,v17.16b 745 aesimc v0.16b,v0.16b 746 aesd v1.16b,v17.16b 747 aesimc v1.16b,v1.16b 748 aesd v24.16b,v17.16b 749 aesimc v24.16b,v24.16b 750 aesd v25.16b,v17.16b 751 aesimc v25.16b,v25.16b 752 aesd v26.16b,v17.16b 753 aesimc v26.16b,v26.16b 754 ld1 {v17.4s},[x7],#16 755 b.gt Loop5x_ecb_dec 756 757 aesd v0.16b,v16.16b 758 aesimc v0.16b,v0.16b 759 aesd v1.16b,v16.16b 760 aesimc v1.16b,v1.16b 761 aesd v24.16b,v16.16b 762 aesimc v24.16b,v24.16b 763 aesd v25.16b,v16.16b 764 aesimc v25.16b,v25.16b 765 aesd v26.16b,v16.16b 766 aesimc v26.16b,v26.16b 767 cmp x2,#0x40 // because Lecb_tail4x 768 sub x2,x2,#0x50 769 770 aesd v0.16b,v17.16b 771 aesimc v0.16b,v0.16b 772 aesd v1.16b,v17.16b 773 aesimc v1.16b,v1.16b 774 aesd 
v24.16b,v17.16b 775 aesimc v24.16b,v24.16b 776 aesd v25.16b,v17.16b 777 aesimc v25.16b,v25.16b 778 aesd v26.16b,v17.16b 779 aesimc v26.16b,v26.16b 780 csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo 781 mov x7,x3 782 783 aesd v0.16b,v18.16b 784 aesimc v0.16b,v0.16b 785 aesd v1.16b,v18.16b 786 aesimc v1.16b,v1.16b 787 aesd v24.16b,v18.16b 788 aesimc v24.16b,v24.16b 789 aesd v25.16b,v18.16b 790 aesimc v25.16b,v25.16b 791 aesd v26.16b,v18.16b 792 aesimc v26.16b,v26.16b 793 add x0,x0,x6 // x0 is adjusted in such way that 794 // at exit from the loop v1.16b-v26.16b 795 // are loaded with last "words" 796 add x6,x2,#0x60 // because Lecb_tail4x 797 798 aesd v0.16b,v19.16b 799 aesimc v0.16b,v0.16b 800 aesd v1.16b,v19.16b 801 aesimc v1.16b,v1.16b 802 aesd v24.16b,v19.16b 803 aesimc v24.16b,v24.16b 804 aesd v25.16b,v19.16b 805 aesimc v25.16b,v25.16b 806 aesd v26.16b,v19.16b 807 aesimc v26.16b,v26.16b 808 809 aesd v0.16b,v20.16b 810 aesimc v0.16b,v0.16b 811 aesd v1.16b,v20.16b 812 aesimc v1.16b,v1.16b 813 aesd v24.16b,v20.16b 814 aesimc v24.16b,v24.16b 815 aesd v25.16b,v20.16b 816 aesimc v25.16b,v25.16b 817 aesd v26.16b,v20.16b 818 aesimc v26.16b,v26.16b 819 820 aesd v0.16b,v21.16b 821 aesimc v0.16b,v0.16b 822 aesd v1.16b,v21.16b 823 aesimc v1.16b,v1.16b 824 aesd v24.16b,v21.16b 825 aesimc v24.16b,v24.16b 826 aesd v25.16b,v21.16b 827 aesimc v25.16b,v25.16b 828 aesd v26.16b,v21.16b 829 aesimc v26.16b,v26.16b 830 831 aesd v0.16b,v22.16b 832 aesimc v0.16b,v0.16b 833 aesd v1.16b,v22.16b 834 aesimc v1.16b,v1.16b 835 aesd v24.16b,v22.16b 836 aesimc v24.16b,v24.16b 837 aesd v25.16b,v22.16b 838 aesimc v25.16b,v25.16b 839 aesd v26.16b,v22.16b 840 aesimc v26.16b,v26.16b 841 842 aesd v0.16b,v23.16b 843 ld1 {v2.16b},[x0],#16 844 aesd v1.16b,v23.16b 845 ld1 {v3.16b},[x0],#16 846 aesd v24.16b,v23.16b 847 ld1 {v27.16b},[x0],#16 848 aesd v25.16b,v23.16b 849 ld1 {v28.16b},[x0],#16 850 aesd v26.16b,v23.16b 851 ld1 {v29.16b},[x0],#16 852 cbz x6,Lecb_tail4x 853 ld1 {v16.4s},[x7],#16 // 
re-pre-load rndkey[0] 854 eor v4.16b,v7.16b,v0.16b 855 orr v0.16b,v2.16b,v2.16b 856 eor v5.16b,v7.16b,v1.16b 857 orr v1.16b,v3.16b,v3.16b 858 eor v17.16b,v7.16b,v24.16b 859 orr v24.16b,v27.16b,v27.16b 860 eor v30.16b,v7.16b,v25.16b 861 orr v25.16b,v28.16b,v28.16b 862 eor v31.16b,v7.16b,v26.16b 863 st1 {v4.16b},[x1],#16 864 orr v26.16b,v29.16b,v29.16b 865 st1 {v5.16b},[x1],#16 866 mov w6,w5 867 st1 {v17.16b},[x1],#16 868 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] 869 st1 {v30.16b},[x1],#16 870 st1 {v31.16b},[x1],#16 871 b.hs Loop5x_ecb_dec 872 873 add x2,x2,#0x50 874 cbz x2,Lecb_done 875 876 add w6,w5,#2 877 subs x2,x2,#0x30 878 orr v0.16b,v27.16b,v27.16b 879 orr v1.16b,v28.16b,v28.16b 880 orr v24.16b,v29.16b,v29.16b 881 b.lo Lecb_dec_tail 882 883 b Loop3x_ecb_dec 884 885.align 4 886Lecb_tail4x: 887 eor v5.16b,v7.16b,v1.16b 888 eor v17.16b,v7.16b,v24.16b 889 eor v30.16b,v7.16b,v25.16b 890 eor v31.16b,v7.16b,v26.16b 891 st1 {v5.16b},[x1],#16 892 st1 {v17.16b},[x1],#16 893 st1 {v30.16b},[x1],#16 894 st1 {v31.16b},[x1],#16 895 896 b Lecb_done 897.align 4 898Loop3x_ecb_dec: 899 aesd v0.16b,v16.16b 900 aesimc v0.16b,v0.16b 901 aesd v1.16b,v16.16b 902 aesimc v1.16b,v1.16b 903 aesd v24.16b,v16.16b 904 aesimc v24.16b,v24.16b 905 ld1 {v16.4s},[x7],#16 906 subs w6,w6,#2 907 aesd v0.16b,v17.16b 908 aesimc v0.16b,v0.16b 909 aesd v1.16b,v17.16b 910 aesimc v1.16b,v1.16b 911 aesd v24.16b,v17.16b 912 aesimc v24.16b,v24.16b 913 ld1 {v17.4s},[x7],#16 914 b.gt Loop3x_ecb_dec 915 916 aesd v0.16b,v16.16b 917 aesimc v0.16b,v0.16b 918 aesd v1.16b,v16.16b 919 aesimc v1.16b,v1.16b 920 aesd v24.16b,v16.16b 921 aesimc v24.16b,v24.16b 922 subs x2,x2,#0x30 923 csel x6,x2,x6,lo // x6, w6, is zero at this point 924 aesd v0.16b,v17.16b 925 aesimc v0.16b,v0.16b 926 aesd v1.16b,v17.16b 927 aesimc v1.16b,v1.16b 928 aesd v24.16b,v17.16b 929 aesimc v24.16b,v24.16b 930 add x0,x0,x6 // x0 is adjusted in such way that 931 // at exit from the loop v1.16b-v24.16b 932 // are loaded with last "words" 933 
mov x7,x3 934 aesd v0.16b,v20.16b 935 aesimc v0.16b,v0.16b 936 aesd v1.16b,v20.16b 937 aesimc v1.16b,v1.16b 938 aesd v24.16b,v20.16b 939 aesimc v24.16b,v24.16b 940 ld1 {v2.16b},[x0],#16 941 aesd v0.16b,v21.16b 942 aesimc v0.16b,v0.16b 943 aesd v1.16b,v21.16b 944 aesimc v1.16b,v1.16b 945 aesd v24.16b,v21.16b 946 aesimc v24.16b,v24.16b 947 ld1 {v3.16b},[x0],#16 948 aesd v0.16b,v22.16b 949 aesimc v0.16b,v0.16b 950 aesd v1.16b,v22.16b 951 aesimc v1.16b,v1.16b 952 aesd v24.16b,v22.16b 953 aesimc v24.16b,v24.16b 954 ld1 {v27.16b},[x0],#16 955 aesd v0.16b,v23.16b 956 aesd v1.16b,v23.16b 957 aesd v24.16b,v23.16b 958 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] 959 add w6,w5,#2 960 eor v4.16b,v7.16b,v0.16b 961 eor v5.16b,v7.16b,v1.16b 962 eor v24.16b,v24.16b,v7.16b 963 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] 964 st1 {v4.16b},[x1],#16 965 orr v0.16b,v2.16b,v2.16b 966 st1 {v5.16b},[x1],#16 967 orr v1.16b,v3.16b,v3.16b 968 st1 {v24.16b},[x1],#16 969 orr v24.16b,v27.16b,v27.16b 970 b.hs Loop3x_ecb_dec 971 972 cmn x2,#0x30 973 b.eq Lecb_done 974 nop 975 976Lecb_dec_tail: 977 aesd v1.16b,v16.16b 978 aesimc v1.16b,v1.16b 979 aesd v24.16b,v16.16b 980 aesimc v24.16b,v24.16b 981 ld1 {v16.4s},[x7],#16 982 subs w6,w6,#2 983 aesd v1.16b,v17.16b 984 aesimc v1.16b,v1.16b 985 aesd v24.16b,v17.16b 986 aesimc v24.16b,v24.16b 987 ld1 {v17.4s},[x7],#16 988 b.gt Lecb_dec_tail 989 990 aesd v1.16b,v16.16b 991 aesimc v1.16b,v1.16b 992 aesd v24.16b,v16.16b 993 aesimc v24.16b,v24.16b 994 aesd v1.16b,v17.16b 995 aesimc v1.16b,v1.16b 996 aesd v24.16b,v17.16b 997 aesimc v24.16b,v24.16b 998 aesd v1.16b,v20.16b 999 aesimc v1.16b,v1.16b 1000 aesd v24.16b,v20.16b 1001 aesimc v24.16b,v24.16b 1002 cmn x2,#0x20 1003 aesd v1.16b,v21.16b 1004 aesimc v1.16b,v1.16b 1005 aesd v24.16b,v21.16b 1006 aesimc v24.16b,v24.16b 1007 aesd v1.16b,v22.16b 1008 aesimc v1.16b,v1.16b 1009 aesd v24.16b,v22.16b 1010 aesimc v24.16b,v24.16b 1011 aesd v1.16b,v23.16b 1012 aesd v24.16b,v23.16b 1013 b.eq Lecb_dec_one 1014 eor 
v5.16b,v7.16b,v1.16b 1015 eor v17.16b,v7.16b,v24.16b 1016 st1 {v5.16b},[x1],#16 1017 st1 {v17.16b},[x1],#16 1018 b Lecb_done 1019 1020Lecb_dec_one: 1021 eor v5.16b,v7.16b,v24.16b 1022 st1 {v5.16b},[x1],#16 1023 1024Lecb_done: 1025 ldr x29,[sp],#16 1026Lecb_Final_abort: 1027 ret 1028 1029.globl _aes_v8_cbc_encrypt 1030 1031.align 5 1032_aes_v8_cbc_encrypt: 1033 stp x29,x30,[sp,#-16]! 1034 add x29,sp,#0 1035 subs x2,x2,#16 1036 mov x8,#16 1037 b.lo Lcbc_abort 1038 csel x8,xzr,x8,eq 1039 1040 cmp w5,#0 // en- or decrypting? 1041 ldr w5,[x3,#240] 1042 and x2,x2,#-16 1043 ld1 {v6.16b},[x4] 1044 ld1 {v0.16b},[x0],x8 1045 1046 ld1 {v16.4s,v17.4s},[x3] // load key schedule... 1047 sub w5,w5,#6 1048 add x7,x3,x5,lsl#4 // pointer to last 7 round keys 1049 sub w5,w5,#2 1050 ld1 {v18.4s,v19.4s},[x7],#32 1051 ld1 {v20.4s,v21.4s},[x7],#32 1052 ld1 {v22.4s,v23.4s},[x7],#32 1053 ld1 {v7.4s},[x7] 1054 1055 add x7,x3,#32 1056 mov w6,w5 1057 b.eq Lcbc_dec 1058 1059 cmp w5,#2 1060 eor v0.16b,v0.16b,v6.16b 1061 eor v5.16b,v16.16b,v7.16b 1062 b.eq Lcbc_enc128 1063 1064 ld1 {v2.4s,v3.4s},[x7] 1065 add x7,x3,#16 1066 add x6,x3,#16*4 1067 add x12,x3,#16*5 1068 aese v0.16b,v16.16b 1069 aesmc v0.16b,v0.16b 1070 add x14,x3,#16*6 1071 add x3,x3,#16*7 1072 b Lenter_cbc_enc 1073 1074.align 4 1075Loop_cbc_enc: 1076 aese v0.16b,v16.16b 1077 aesmc v0.16b,v0.16b 1078 st1 {v6.16b},[x1],#16 1079Lenter_cbc_enc: 1080 aese v0.16b,v17.16b 1081 aesmc v0.16b,v0.16b 1082 aese v0.16b,v2.16b 1083 aesmc v0.16b,v0.16b 1084 ld1 {v16.4s},[x6] 1085 cmp w5,#4 1086 aese v0.16b,v3.16b 1087 aesmc v0.16b,v0.16b 1088 ld1 {v17.4s},[x12] 1089 b.eq Lcbc_enc192 1090 1091 aese v0.16b,v16.16b 1092 aesmc v0.16b,v0.16b 1093 ld1 {v16.4s},[x14] 1094 aese v0.16b,v17.16b 1095 aesmc v0.16b,v0.16b 1096 ld1 {v17.4s},[x3] 1097 nop 1098 1099Lcbc_enc192: 1100 aese v0.16b,v16.16b 1101 aesmc v0.16b,v0.16b 1102 subs x2,x2,#16 1103 aese v0.16b,v17.16b 1104 aesmc v0.16b,v0.16b 1105 csel x8,xzr,x8,eq 1106 aese v0.16b,v18.16b 1107 aesmc 
v0.16b,v0.16b 1108 aese v0.16b,v19.16b 1109 aesmc v0.16b,v0.16b 1110 ld1 {v16.16b},[x0],x8 1111 aese v0.16b,v20.16b 1112 aesmc v0.16b,v0.16b 1113 eor v16.16b,v16.16b,v5.16b 1114 aese v0.16b,v21.16b 1115 aesmc v0.16b,v0.16b 1116 ld1 {v17.4s},[x7] // re-pre-load rndkey[1] 1117 aese v0.16b,v22.16b 1118 aesmc v0.16b,v0.16b 1119 aese v0.16b,v23.16b 1120 eor v6.16b,v0.16b,v7.16b 1121 b.hs Loop_cbc_enc 1122 1123 st1 {v6.16b},[x1],#16 1124 b Lcbc_done 1125 1126.align 5 1127Lcbc_enc128: 1128 ld1 {v2.4s,v3.4s},[x7] 1129 aese v0.16b,v16.16b 1130 aesmc v0.16b,v0.16b 1131 b Lenter_cbc_enc128 1132Loop_cbc_enc128: 1133 aese v0.16b,v16.16b 1134 aesmc v0.16b,v0.16b 1135 st1 {v6.16b},[x1],#16 1136Lenter_cbc_enc128: 1137 aese v0.16b,v17.16b 1138 aesmc v0.16b,v0.16b 1139 subs x2,x2,#16 1140 aese v0.16b,v2.16b 1141 aesmc v0.16b,v0.16b 1142 csel x8,xzr,x8,eq 1143 aese v0.16b,v3.16b 1144 aesmc v0.16b,v0.16b 1145 aese v0.16b,v18.16b 1146 aesmc v0.16b,v0.16b 1147 aese v0.16b,v19.16b 1148 aesmc v0.16b,v0.16b 1149 ld1 {v16.16b},[x0],x8 1150 aese v0.16b,v20.16b 1151 aesmc v0.16b,v0.16b 1152 aese v0.16b,v21.16b 1153 aesmc v0.16b,v0.16b 1154 aese v0.16b,v22.16b 1155 aesmc v0.16b,v0.16b 1156 eor v16.16b,v16.16b,v5.16b 1157 aese v0.16b,v23.16b 1158 eor v6.16b,v0.16b,v7.16b 1159 b.hs Loop_cbc_enc128 1160 1161 st1 {v6.16b},[x1],#16 1162 b Lcbc_done 1163.align 5 1164Lcbc_dec: 1165 ld1 {v24.16b},[x0],#16 1166 subs x2,x2,#32 // bias 1167 add w6,w5,#2 1168 orr v3.16b,v0.16b,v0.16b 1169 orr v1.16b,v0.16b,v0.16b 1170 orr v27.16b,v24.16b,v24.16b 1171 b.lo Lcbc_dec_tail 1172 1173 orr v1.16b,v24.16b,v24.16b 1174 ld1 {v24.16b},[x0],#16 1175 orr v2.16b,v0.16b,v0.16b 1176 orr v3.16b,v1.16b,v1.16b 1177 orr v27.16b,v24.16b,v24.16b 1178 cmp x2,#32 1179 b.lo Loop3x_cbc_dec 1180 1181 ld1 {v25.16b},[x0],#16 1182 ld1 {v26.16b},[x0],#16 1183 sub x2,x2,#32 // bias 1184 mov w6,w5 1185 orr v28.16b,v25.16b,v25.16b 1186 orr v29.16b,v26.16b,v26.16b 1187 1188Loop5x_cbc_dec: 1189 aesd v0.16b,v16.16b 1190 aesimc v0.16b,v0.16b 
1191 aesd v1.16b,v16.16b 1192 aesimc v1.16b,v1.16b 1193 aesd v24.16b,v16.16b 1194 aesimc v24.16b,v24.16b 1195 aesd v25.16b,v16.16b 1196 aesimc v25.16b,v25.16b 1197 aesd v26.16b,v16.16b 1198 aesimc v26.16b,v26.16b 1199 ld1 {v16.4s},[x7],#16 1200 subs w6,w6,#2 1201 aesd v0.16b,v17.16b 1202 aesimc v0.16b,v0.16b 1203 aesd v1.16b,v17.16b 1204 aesimc v1.16b,v1.16b 1205 aesd v24.16b,v17.16b 1206 aesimc v24.16b,v24.16b 1207 aesd v25.16b,v17.16b 1208 aesimc v25.16b,v25.16b 1209 aesd v26.16b,v17.16b 1210 aesimc v26.16b,v26.16b 1211 ld1 {v17.4s},[x7],#16 1212 b.gt Loop5x_cbc_dec 1213 1214 aesd v0.16b,v16.16b 1215 aesimc v0.16b,v0.16b 1216 aesd v1.16b,v16.16b 1217 aesimc v1.16b,v1.16b 1218 aesd v24.16b,v16.16b 1219 aesimc v24.16b,v24.16b 1220 aesd v25.16b,v16.16b 1221 aesimc v25.16b,v25.16b 1222 aesd v26.16b,v16.16b 1223 aesimc v26.16b,v26.16b 1224 cmp x2,#0x40 // because Lcbc_tail4x 1225 sub x2,x2,#0x50 1226 1227 aesd v0.16b,v17.16b 1228 aesimc v0.16b,v0.16b 1229 aesd v1.16b,v17.16b 1230 aesimc v1.16b,v1.16b 1231 aesd v24.16b,v17.16b 1232 aesimc v24.16b,v24.16b 1233 aesd v25.16b,v17.16b 1234 aesimc v25.16b,v25.16b 1235 aesd v26.16b,v17.16b 1236 aesimc v26.16b,v26.16b 1237 csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo 1238 mov x7,x3 1239 1240 aesd v0.16b,v18.16b 1241 aesimc v0.16b,v0.16b 1242 aesd v1.16b,v18.16b 1243 aesimc v1.16b,v1.16b 1244 aesd v24.16b,v18.16b 1245 aesimc v24.16b,v24.16b 1246 aesd v25.16b,v18.16b 1247 aesimc v25.16b,v25.16b 1248 aesd v26.16b,v18.16b 1249 aesimc v26.16b,v26.16b 1250 add x0,x0,x6 // x0 is adjusted in such way that 1251 // at exit from the loop v1.16b-v26.16b 1252 // are loaded with last "words" 1253 add x6,x2,#0x60 // because Lcbc_tail4x 1254 1255 aesd v0.16b,v19.16b 1256 aesimc v0.16b,v0.16b 1257 aesd v1.16b,v19.16b 1258 aesimc v1.16b,v1.16b 1259 aesd v24.16b,v19.16b 1260 aesimc v24.16b,v24.16b 1261 aesd v25.16b,v19.16b 1262 aesimc v25.16b,v25.16b 1263 aesd v26.16b,v19.16b 1264 aesimc v26.16b,v26.16b 1265 1266 aesd v0.16b,v20.16b 1267 
aesimc v0.16b,v0.16b 1268 aesd v1.16b,v20.16b 1269 aesimc v1.16b,v1.16b 1270 aesd v24.16b,v20.16b 1271 aesimc v24.16b,v24.16b 1272 aesd v25.16b,v20.16b 1273 aesimc v25.16b,v25.16b 1274 aesd v26.16b,v20.16b 1275 aesimc v26.16b,v26.16b 1276 1277 aesd v0.16b,v21.16b 1278 aesimc v0.16b,v0.16b 1279 aesd v1.16b,v21.16b 1280 aesimc v1.16b,v1.16b 1281 aesd v24.16b,v21.16b 1282 aesimc v24.16b,v24.16b 1283 aesd v25.16b,v21.16b 1284 aesimc v25.16b,v25.16b 1285 aesd v26.16b,v21.16b 1286 aesimc v26.16b,v26.16b 1287 1288 aesd v0.16b,v22.16b 1289 aesimc v0.16b,v0.16b 1290 aesd v1.16b,v22.16b 1291 aesimc v1.16b,v1.16b 1292 aesd v24.16b,v22.16b 1293 aesimc v24.16b,v24.16b 1294 aesd v25.16b,v22.16b 1295 aesimc v25.16b,v25.16b 1296 aesd v26.16b,v22.16b 1297 aesimc v26.16b,v26.16b 1298 1299 eor v4.16b,v6.16b,v7.16b 1300 aesd v0.16b,v23.16b 1301 eor v5.16b,v2.16b,v7.16b 1302 ld1 {v2.16b},[x0],#16 1303 aesd v1.16b,v23.16b 1304 eor v17.16b,v3.16b,v7.16b 1305 ld1 {v3.16b},[x0],#16 1306 aesd v24.16b,v23.16b 1307 eor v30.16b,v27.16b,v7.16b 1308 ld1 {v27.16b},[x0],#16 1309 aesd v25.16b,v23.16b 1310 eor v31.16b,v28.16b,v7.16b 1311 ld1 {v28.16b},[x0],#16 1312 aesd v26.16b,v23.16b 1313 orr v6.16b,v29.16b,v29.16b 1314 ld1 {v29.16b},[x0],#16 1315 cbz x6,Lcbc_tail4x 1316 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] 1317 eor v4.16b,v4.16b,v0.16b 1318 orr v0.16b,v2.16b,v2.16b 1319 eor v5.16b,v5.16b,v1.16b 1320 orr v1.16b,v3.16b,v3.16b 1321 eor v17.16b,v17.16b,v24.16b 1322 orr v24.16b,v27.16b,v27.16b 1323 eor v30.16b,v30.16b,v25.16b 1324 orr v25.16b,v28.16b,v28.16b 1325 eor v31.16b,v31.16b,v26.16b 1326 st1 {v4.16b},[x1],#16 1327 orr v26.16b,v29.16b,v29.16b 1328 st1 {v5.16b},[x1],#16 1329 mov w6,w5 1330 st1 {v17.16b},[x1],#16 1331 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] 1332 st1 {v30.16b},[x1],#16 1333 st1 {v31.16b},[x1],#16 1334 b.hs Loop5x_cbc_dec 1335 1336 add x2,x2,#0x50 1337 cbz x2,Lcbc_done 1338 1339 add w6,w5,#2 1340 subs x2,x2,#0x30 1341 orr v0.16b,v27.16b,v27.16b 1342 orr 
v2.16b,v27.16b,v27.16b 1343 orr v1.16b,v28.16b,v28.16b 1344 orr v3.16b,v28.16b,v28.16b 1345 orr v24.16b,v29.16b,v29.16b 1346 orr v27.16b,v29.16b,v29.16b 1347 b.lo Lcbc_dec_tail 1348 1349 b Loop3x_cbc_dec 1350 1351.align 4 1352Lcbc_tail4x: 1353 eor v5.16b,v4.16b,v1.16b 1354 eor v17.16b,v17.16b,v24.16b 1355 eor v30.16b,v30.16b,v25.16b 1356 eor v31.16b,v31.16b,v26.16b 1357 st1 {v5.16b},[x1],#16 1358 st1 {v17.16b},[x1],#16 1359 st1 {v30.16b},[x1],#16 1360 st1 {v31.16b},[x1],#16 1361 1362 b Lcbc_done 1363.align 4 1364Loop3x_cbc_dec: 1365 aesd v0.16b,v16.16b 1366 aesimc v0.16b,v0.16b 1367 aesd v1.16b,v16.16b 1368 aesimc v1.16b,v1.16b 1369 aesd v24.16b,v16.16b 1370 aesimc v24.16b,v24.16b 1371 ld1 {v16.4s},[x7],#16 1372 subs w6,w6,#2 1373 aesd v0.16b,v17.16b 1374 aesimc v0.16b,v0.16b 1375 aesd v1.16b,v17.16b 1376 aesimc v1.16b,v1.16b 1377 aesd v24.16b,v17.16b 1378 aesimc v24.16b,v24.16b 1379 ld1 {v17.4s},[x7],#16 1380 b.gt Loop3x_cbc_dec 1381 1382 aesd v0.16b,v16.16b 1383 aesimc v0.16b,v0.16b 1384 aesd v1.16b,v16.16b 1385 aesimc v1.16b,v1.16b 1386 aesd v24.16b,v16.16b 1387 aesimc v24.16b,v24.16b 1388 eor v4.16b,v6.16b,v7.16b 1389 subs x2,x2,#0x30 1390 eor v5.16b,v2.16b,v7.16b 1391 csel x6,x2,x6,lo // x6, w6, is zero at this point 1392 aesd v0.16b,v17.16b 1393 aesimc v0.16b,v0.16b 1394 aesd v1.16b,v17.16b 1395 aesimc v1.16b,v1.16b 1396 aesd v24.16b,v17.16b 1397 aesimc v24.16b,v24.16b 1398 eor v17.16b,v3.16b,v7.16b 1399 add x0,x0,x6 // x0 is adjusted in such way that 1400 // at exit from the loop v1.16b-v24.16b 1401 // are loaded with last "words" 1402 orr v6.16b,v27.16b,v27.16b 1403 mov x7,x3 1404 aesd v0.16b,v20.16b 1405 aesimc v0.16b,v0.16b 1406 aesd v1.16b,v20.16b 1407 aesimc v1.16b,v1.16b 1408 aesd v24.16b,v20.16b 1409 aesimc v24.16b,v24.16b 1410 ld1 {v2.16b},[x0],#16 1411 aesd v0.16b,v21.16b 1412 aesimc v0.16b,v0.16b 1413 aesd v1.16b,v21.16b 1414 aesimc v1.16b,v1.16b 1415 aesd v24.16b,v21.16b 1416 aesimc v24.16b,v24.16b 1417 ld1 {v3.16b},[x0],#16 1418 aesd 
v0.16b,v22.16b 1419 aesimc v0.16b,v0.16b 1420 aesd v1.16b,v22.16b 1421 aesimc v1.16b,v1.16b 1422 aesd v24.16b,v22.16b 1423 aesimc v24.16b,v24.16b 1424 ld1 {v27.16b},[x0],#16 1425 aesd v0.16b,v23.16b 1426 aesd v1.16b,v23.16b 1427 aesd v24.16b,v23.16b 1428 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] 1429 add w6,w5,#2 1430 eor v4.16b,v4.16b,v0.16b 1431 eor v5.16b,v5.16b,v1.16b 1432 eor v24.16b,v24.16b,v17.16b 1433 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] 1434 st1 {v4.16b},[x1],#16 1435 orr v0.16b,v2.16b,v2.16b 1436 st1 {v5.16b},[x1],#16 1437 orr v1.16b,v3.16b,v3.16b 1438 st1 {v24.16b},[x1],#16 1439 orr v24.16b,v27.16b,v27.16b 1440 b.hs Loop3x_cbc_dec 1441 1442 cmn x2,#0x30 1443 b.eq Lcbc_done 1444 nop 1445 1446Lcbc_dec_tail: 1447 aesd v1.16b,v16.16b 1448 aesimc v1.16b,v1.16b 1449 aesd v24.16b,v16.16b 1450 aesimc v24.16b,v24.16b 1451 ld1 {v16.4s},[x7],#16 1452 subs w6,w6,#2 1453 aesd v1.16b,v17.16b 1454 aesimc v1.16b,v1.16b 1455 aesd v24.16b,v17.16b 1456 aesimc v24.16b,v24.16b 1457 ld1 {v17.4s},[x7],#16 1458 b.gt Lcbc_dec_tail 1459 1460 aesd v1.16b,v16.16b 1461 aesimc v1.16b,v1.16b 1462 aesd v24.16b,v16.16b 1463 aesimc v24.16b,v24.16b 1464 aesd v1.16b,v17.16b 1465 aesimc v1.16b,v1.16b 1466 aesd v24.16b,v17.16b 1467 aesimc v24.16b,v24.16b 1468 aesd v1.16b,v20.16b 1469 aesimc v1.16b,v1.16b 1470 aesd v24.16b,v20.16b 1471 aesimc v24.16b,v24.16b 1472 cmn x2,#0x20 1473 aesd v1.16b,v21.16b 1474 aesimc v1.16b,v1.16b 1475 aesd v24.16b,v21.16b 1476 aesimc v24.16b,v24.16b 1477 eor v5.16b,v6.16b,v7.16b 1478 aesd v1.16b,v22.16b 1479 aesimc v1.16b,v1.16b 1480 aesd v24.16b,v22.16b 1481 aesimc v24.16b,v24.16b 1482 eor v17.16b,v3.16b,v7.16b 1483 aesd v1.16b,v23.16b 1484 aesd v24.16b,v23.16b 1485 b.eq Lcbc_dec_one 1486 eor v5.16b,v5.16b,v1.16b 1487 eor v17.16b,v17.16b,v24.16b 1488 orr v6.16b,v27.16b,v27.16b 1489 st1 {v5.16b},[x1],#16 1490 st1 {v17.16b},[x1],#16 1491 b Lcbc_done 1492 1493Lcbc_dec_one: 1494 eor v5.16b,v5.16b,v24.16b 1495 orr v6.16b,v27.16b,v27.16b 1496 st1 
{v5.16b},[x1],#16

Lcbc_done:
	st1	{v6.16b},[x4]		// NOTE(review): presumably the IV write-back; function head is outside this view
Lcbc_abort:
	ldr	x29,[sp],#16
	ret

.globl	_aes_v8_ctr32_encrypt_blocks

// _aes_v8_ctr32_encrypt_blocks
//
// XORs the input with an AES keystream produced by encrypting a 16-byte
// counter block whose last 32-bit word is incremented once per block.
//
// Register usage on entry, as read by the code below:
//   x0  input pointer  (16-byte blocks are loaded from it)
//   x1  output pointer (16-byte blocks are stored to it)
//   x2  number of 16-byte blocks
//   x3  key schedule; the 32-bit word at offset 240 is the round count
//   x4  16-byte counter block; only read here, never written back
//
// Internal register roles:
//   v6            untouched copy of the counter block, used to rebuild
//                 v0/v1/v18/v24/v25 before a new counter word is inserted
//   v16,v17       rolling pair of round keys, reloaded through x7
//   v20-v23,v7    the last five round keys
//   w8            running counter in native byte order
//   w5 / w6       rounds-6 (reload value) / live round-loop counter
//   x12           post-increment step for the first load in Lctr32_tail
//
// Blocks are handled five at a time (Loop5x_ctr32), then three at a time
// (Loop3x_ctr32), then one or two in Lctr32_tail.
.align	5
_aes_v8_ctr32_encrypt_blocks:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	ldr	w5,[x3,#240]		// w5 = round count

	ldr	w8, [x4, #12]		// w8 = counter word (bytes 12..15 of the block)
#ifdef __ARMEB__
	ld1	{v0.16b},[x4]
#else
	ld1	{v0.4s},[x4]
#endif
	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#4
	mov	x12,#16
	cmp	x2,#2			// flags stay live until the b.ls below
	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
	sub	w5,w5,#2		// w5 = rounds-6
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]
	add	x7,x3,#32		// x7 -> third round key
	mov	w6,w5
	csel	x12,xzr,x12,lo		// fewer than 2 blocks: step 0, so the tail re-reads the same block
#ifndef __ARMEB__
	rev	w8, w8
#endif
	orr	v1.16b,v0.16b,v0.16b
	add	w10, w8, #1
	orr	v18.16b,v0.16b,v0.16b
	add	w8, w8, #2
	orr	v6.16b,v0.16b,v0.16b	// v6 = untouched copy of the counter block
	rev	w10, w10
	mov	v1.s[3],w10		// v1 = counter+1
	b.ls	Lctr32_tail		// 2 blocks or fewer (flags from cmp x2,#2)
	rev	w12, w8			// w12 is scratch here; x12 is recomputed before the tail is entered
	sub	x2,x2,#3		// bias
	mov	v18.s[3],w12		// v18 = counter+2
	cmp	x2,#32
	b.lo	Loop3x_ctr32

	// Set up two more counter blocks for the five-way loop.
	add	w13,w8,#1
	add	w14,w8,#2
	orr	v24.16b,v0.16b,v0.16b
	rev	w13,w13
	orr	v25.16b,v0.16b,v0.16b
	rev	w14,w14
	mov	v24.s[3],w13		// v24 = counter+3
	sub	x2,x2,#2		// bias
	mov	v25.s[3],w14		// v25 = counter+4
	add	w8,w8,#2		// w8 = counter of the last block in flight (v25)
	b	Loop5x_ctr32

.align	4
Loop5x_ctr32:
	// Two AES rounds per pass on all five blocks; w6 counts down by 2.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v17.16b
	aesmc	v18.16b,v18.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Loop5x_ctr32

	mov	x7,x3			// rewind the round-key pointer to the start of the schedule
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]

	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v17.16b
	aesmc	v18.16b,v18.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]

	// Remaining rounds use the preloaded keys v20-v23. In between, the
	// counters for the next five blocks (w8+1 .. w8+5) are computed and
	// byte-reversed.
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	add	w9,w8,#1
	add	w10,w8,#2
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	add	w12,w8,#3
	add	w13,w8,#4
	aese	v18.16b,v20.16b
	aesmc	v18.16b,v18.16b
	add	w14,w8,#5
	rev	w9,w9
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	rev	w10,w10
	rev	w12,w12
	aese	v25.16b,v20.16b
	aesmc	v25.16b,v25.16b
	rev	w13,w13
	rev	w14,w14

	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v21.16b
	aesmc	v18.16b,v18.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v21.16b
	aesmc	v25.16b,v25.16b

	// Load the five input blocks while the v22 round runs.
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	ld1	{v2.16b},[x0],#16
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	ld1	{v3.16b},[x0],#16
	aese	v18.16b,v22.16b
	aesmc	v18.16b,v18.16b
	ld1	{v19.16b},[x0],#16
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	ld1	{v26.16b},[x0],#16
	aese	v25.16b,v22.16b
	aesmc	v25.16b,v25.16b
	ld1	{v27.16b},[x0],#16

	// Last round has no aesmc; the final round key v7 is XORed into the
	// input blocks instead of into the cipher state.
	aese	v0.16b,v23.16b
	eor	v2.16b,v2.16b,v7.16b
	aese	v1.16b,v23.16b
	eor	v3.16b,v3.16b,v7.16b
	aese	v18.16b,v23.16b
	eor	v19.16b,v19.16b,v7.16b
	aese	v24.16b,v23.16b
	eor	v26.16b,v26.16b,v7.16b
	aese	v25.16b,v23.16b
	eor	v27.16b,v27.16b,v7.16b

	// Combine with the keystream, then reset each state register from v6.
	eor	v2.16b,v2.16b,v0.16b
	orr	v0.16b,v6.16b,v6.16b
	eor	v3.16b,v3.16b,v1.16b
	orr	v1.16b,v6.16b,v6.16b
	eor	v19.16b,v19.16b,v18.16b
	orr	v18.16b,v6.16b,v6.16b
	eor	v26.16b,v26.16b,v24.16b
	orr	v24.16b,v6.16b,v6.16b
	eor	v27.16b,v27.16b,v25.16b
	orr	v25.16b,v6.16b,v6.16b

	// Store five output blocks and install the next five counter words.
	st1	{v2.16b},[x1],#16
	mov	v0.s[3],w9
	st1	{v3.16b},[x1],#16
	mov	v1.s[3],w10
	st1	{v19.16b},[x1],#16
	mov	v18.s[3],w12
	st1	{v26.16b},[x1],#16
	mov	v24.s[3],w13
	st1	{v27.16b},[x1],#16
	mov	v25.s[3],w14

	mov	w6,w5			// reload the round-loop counter
	cbz	x2,Lctr32_done		// biased count reached zero: all blocks done

	add	w8,w8,#5
	subs	x2,x2,#5
	b.hs	Loop5x_ctr32		// at least five more blocks

	add	x2,x2,#5		// fewer than five left: undo the two updates above
	sub	w8,w8,#5

	cmp	x2,#2
	mov	x12,#16
	csel	x12,xzr,x12,lo		// one block left: step 0 for the tail
	b.ls	Lctr32_tail		// one or two blocks left

	sub	x2,x2,#3		// bias
	add	w8,w8,#3		// w8 = counter of the last block in flight (v18)
	b	Loop3x_ctr32

.align	4
Loop3x_ctr32:
	// Two AES rounds per pass on three blocks; w6 counts down by 2.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v17.16b
	aesmc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Loop3x_ctr32

	// The states move to v4/v5/v17 so that v0/v1/v18 can be reset from v6
	// and receive the next counter words while the remaining rounds run.
	aese	v0.16b,v16.16b
	aesmc	v4.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v5.16b,v1.16b
	ld1	{v2.16b},[x0],#16
	orr	v0.16b,v6.16b,v6.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v3.16b},[x0],#16
	orr	v1.16b,v6.16b,v6.16b
	aese	v4.16b,v17.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v17.16b
	aesmc	v5.16b,v5.16b
	ld1	{v19.16b},[x0],#16
	mov	x7,x3			// rewind the round-key pointer
	aese	v18.16b,v17.16b
	aesmc	v17.16b,v18.16b		// third state now lives in v17
	orr	v18.16b,v6.16b,v6.16b
	add	w9,w8,#1
	aese	v4.16b,v20.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v20.16b
	aesmc	v5.16b,v5.16b
	eor	v2.16b,v2.16b,v7.16b	// fold the final round key into the input
	add	w10,w8,#2
	aese	v17.16b,v20.16b
	aesmc	v17.16b,v17.16b
	eor	v3.16b,v3.16b,v7.16b
	add	w8,w8,#3
	aese	v4.16b,v21.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v21.16b
	aesmc	v5.16b,v5.16b
	eor	v19.16b,v19.16b,v7.16b
	rev	w9,w9
	aese	v17.16b,v21.16b
	aesmc	v17.16b,v17.16b
	mov	v0.s[3], w9
	rev	w10,w10
	aese	v4.16b,v22.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v22.16b
	aesmc	v5.16b,v5.16b
	mov	v1.s[3], w10
	rev	w12,w8
	aese	v17.16b,v22.16b
	aesmc	v17.16b,v17.16b
	mov	v18.s[3], w12
	subs	x2,x2,#3		// flags consumed by the b.hs at the bottom of the loop
	aese	v4.16b,v23.16b
	aese	v5.16b,v23.16b
	aese	v17.16b,v23.16b

	eor	v2.16b,v2.16b,v4.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	st1	{v2.16b},[x1],#16
	eor	v3.16b,v3.16b,v5.16b
	mov	w6,w5
	st1	{v3.16b},[x1],#16
	eor	v19.16b,v19.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v19.16b},[x1],#16
	b.hs	Loop3x_ctr32

	adds	x2,x2,#3		// remove the bias
	b.eq	Lctr32_done
	cmp	x2,#1
	mov	x12,#16
	csel	x12,xzr,x12,eq		// exactly one block left: step 0 for the tail

Lctr32_tail:
	// Encrypts the counter blocks in v0 and v1.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Lctr32_tail

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v2.16b},[x0],x12	// when x12 is 0 the next load reads this same block again
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	ld1	{v3.16b},[x0]
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	eor	v2.16b,v2.16b,v7.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	eor	v3.16b,v3.16b,v7.16b
	aese	v0.16b,v23.16b
	aese	v1.16b,v23.16b

	cmp	x2,#1
	eor	v2.16b,v2.16b,v0.16b
	eor	v3.16b,v3.16b,v1.16b
	st1	{v2.16b},[x1],#16
	b.eq	Lctr32_done		// the second block is stored only when x2 != 1
	st1	{v3.16b},[x1]

Lctr32_done:
	ldr	x29,[sp],#16		// restore x29 and pop the frame; x30 is never modified in this function
	ret

.globl	_aes_v8_xts_encrypt

// _aes_v8_xts_encrypt
//
// Register usage on entry, as read by the code below:
//   x0  input pointer
//   x1  output pointer
//   x2  length in bytes
//   x3  key schedule applied to the data blocks (round count at offset 240)
//   x4  key schedule used to encrypt the IV ("key2" in the comments below)
//   x5  pointer to the 16-byte IV
//
// A length of exactly 16 bytes is handled by the straight-line code that
// follows, without a stack frame. Every other length branches to
// Lxts_enc_big_size.
.align	5
_aes_v8_xts_encrypt:
	cmp	x2,#16
	// Any length other than exactly 16 bytes takes the big-size path.
	b.ne	Lxts_enc_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	w6,[x4,#240]		// w6 = round count of key2
	ld1	{v0.4s},[x4],#16
	ld1	{v6.16b},[x5]		// v6 = IV
	sub	w6,w6,#2
	ld1	{v1.4s},[x4],#16

Loop_enc_iv_enc:
	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4],#16
	subs	w6,w6,#2
	aese	v6.16b,v1.16b
	aesmc	v6.16b,v6.16b
	ld1	{v1.4s},[x4],#16
	b.gt	Loop_enc_iv_enc

	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4]
	aese	v6.16b,v1.16b
	eor	v6.16b,v6.16b,v0.16b	// v6 = encrypted IV (the tweak)

	ld1	{v0.16b},[x0]
	eor	v0.16b,v6.16b,v0.16b	// XOR the input block with the tweak before encryption

	ldr	w6,[x3,#240]		// w6 = round count of the data key
	ld1	{v28.4s,v29.4s},[x3],#32		// load key schedule...

	aese	v0.16b,v28.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s,v17.4s},[x3],#32		// load key schedule...
	aese	v0.16b,v29.16b
	aesmc	v0.16b,v0.16b
	subs	w6,w6,#10		// if rounds==10, jump to aes-128-xts processing
	b.eq	Lxts_128_enc
Lxts_enc_round_loop:
	// Extra round pairs when there are more than 10 rounds.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x3],#16		// load key schedule...
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x3],#16		// load key schedule...
	subs	w6,w6,#2		// bias
	b.gt	Lxts_enc_round_loop
Lxts_128_enc:
	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
// _aes_v8_xts_encrypt, continued: remaining rounds of the single-block
// (length == 16) path. v0 holds the cipher state and v6 the tweak.
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	ld1	{v7.4s},[x3]		// v7 = final round key
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v23.16b		// last round has no aesmc
	eor	v0.16b,v0.16b,v7.16b
	eor	v0.16b,v0.16b,v6.16b	// XOR with the tweak again after encryption
	st1	{v0.16b},[x1]
	b	Lxts_enc_final_abort	// no frame was pushed on this path

.align	4
Lxts_enc_big_size:
	// 64-byte frame: x19/x20 at sp, d10/d11 at sp+16, d8/d9 at sp+32,
	// x21/x22 at sp+48.
	stp	x19,x20,[sp,#-64]!
	stp	x21,x22,[sp,#48]
	stp	d8,d9,[sp,#32]
	stp	d10,d11,[sp,#16]

	// x21 (tailcnt) = length % 16, the number of trailing bytes.
	and	x21,x2,#0xf
	and	x2,x2,#-16		// round the length down to whole blocks
	subs	x2,x2,#16
	mov	x8,#16
	b.lo	Lxts_abort		// less than one whole block: return without processing
	csel	x8,xzr,x8,eq		// exactly one whole block: step 0 for the first load

	// Firstly, encrypt the iv with key2, as the first iv of XEX.
	ldr	w6,[x4,#240]
	ld1	{v0.4s},[x4],#16
	ld1	{v6.16b},[x5]
	sub	w6,w6,#2
	ld1	{v1.4s},[x4],#16

Loop_iv_enc:
	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4],#16
	subs	w6,w6,#2
	aese	v6.16b,v1.16b
	aesmc	v6.16b,v6.16b
	ld1	{v1.4s},[x4],#16
	b.gt	Loop_iv_enc

	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4]
	aese	v6.16b,v1.16b
	eor	v6.16b,v6.16b,v0.16b

	// The iv for second block
	// x9- iv(low), x10 - iv(high)
	// the five ivs are kept in v6.16b, v8.16b, v9.16b, v10.16b, v11.16b
	//
	// Each iv update below shifts the 128-bit value x10:x9 left by one bit
	// and XORs 0x87 (held in w19) into the low half when the bit shifted
	// out of x10 was set. x22 must be taken from x10 before x10 is updated.
	fmov	x9,d6
	fmov	x10,v6.d[1]
	mov	w19,#0x87
	extr	x22,x10,x10,#32		// low word of x22 = high word of x10
	extr	x10,x10,x9,#63		// x10 = (x10 << 1) | (x9 >> 63)
	and	w11,w19,w22,asr#31	// w11 = 0x87 if bit 63 of the old x10 was set, else 0
	eor	x9,x11,x9,lsl#1
	fmov	d8,x9
	fmov	v8.d[1],x10

	ldr	w5,[x3,#240]		// next starting point (w5 = word at offset 240 of the data key)
	ld1	{v0.16b},[x0],x8	// first input block

	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#6
	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
	sub	w5,w5,#2		// w5 = rounds-8
	ld1	{v18.4s,v19.4s},[x7],#32
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]

	add	x7,x3,#32		// x7 -> third round key
	mov	w6,w5

	// Encryption
Lxts_enc:
	ld1	{v24.16b},[x0],#16
	subs	x2,x2,#32			// bias
	add	w6,w5,#2
	orr	v3.16b,v0.16b,v0.16b
	orr	v1.16b,v0.16b,v0.16b
	orr	v28.16b,v0.16b,v0.16b
	orr	v27.16b,v24.16b,v24.16b
	orr	v29.16b,v24.16b,v24.16b
	b.lo	Lxts_inner_enc_tail	// flags from the subs above
	eor	v0.16b,v0.16b,v6.16b			// before encryption, xor with iv
	eor	v24.16b,v24.16b,v8.16b

	// The iv for third block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d9,x9
	fmov	v9.d[1],x10


	orr	v1.16b,v24.16b,v24.16b
	ld1	{v24.16b},[x0],#16
	orr	v2.16b,v0.16b,v0.16b
	orr	v3.16b,v1.16b,v1.16b
	eor	v27.16b,v24.16b,v9.16b		// the third block
	eor	v24.16b,v24.16b,v9.16b
	cmp	x2,#32
	b.lo	Lxts_outer_enc_tail

	// The iv for fourth block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d10,x9
	fmov	v10.d[1],x10

	ld1	{v25.16b},[x0],#16
	// The iv for fifth block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d11,x9
	fmov	v11.d[1],x10

	ld1	{v26.16b},[x0],#16
	eor	v25.16b,v25.16b,v10.16b		// the fourth block
	eor	v26.16b,v26.16b,v11.16b
	sub	x2,x2,#32			// bias
	mov	w6,w5
	b	Loop5x_xts_enc

.align	4
Loop5x_xts_enc:
	// Two AES rounds per pass on five blocks (v0, v1, v24, v25, v26);
	// w6 counts down by 2.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v16.16b
	aesmc	v26.16b,v26.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v17.16b
	aesmc	v26.16b,v26.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Loop5x_xts_enc

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v16.16b
	aesmc	v26.16b,v26.16b
	subs	x2,x2,#0x50			// because Lxts_enc_tail4x
						// (these flags feed the csel below and the b.hs at the loop bottom)

	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v17.16b
	aesmc	v26.16b,v26.16b
	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
	mov	x7,x3			// rewind the round-key pointer

	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v18.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v18.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v18.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v18.16b
	aesmc	v26.16b,v26.16b
	add	x0,x0,x6		// x0 is adjusted in such way that
					// at exit from the loop v1.16b-v26.16b
					// are loaded with last "words"
	add	x6,x2,#0x60		// because Lxts_enc_tail4x

	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v19.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v19.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v19.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v19.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v20.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v20.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v21.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v21.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v22.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v22.16b
	aesmc	v26.16b,v26.16b

	// Final round, interleaved with the next five ivs and input loads.
	// Each "final round key XOR current iv" value (v4, v5, v17, v30, v31)
	// is formed before the matching iv register is overwritten.
	eor	v4.16b,v7.16b,v6.16b
	aese	v0.16b,v23.16b
	// The iv for first block of one iteration
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d6,x9
	fmov	v6.d[1],x10
	eor	v5.16b,v7.16b,v8.16b
	ld1	{v2.16b},[x0],#16
	aese	v1.16b,v23.16b
	// The iv for second block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d8,x9
	fmov	v8.d[1],x10
	eor	v17.16b,v7.16b,v9.16b
	ld1	{v3.16b},[x0],#16
	aese	v24.16b,v23.16b
	// The iv for third block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d9,x9
	fmov	v9.d[1],x10
	eor	v30.16b,v7.16b,v10.16b
	ld1	{v27.16b},[x0],#16
	aese	v25.16b,v23.16b
	// The iv for fourth block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d10,x9
	fmov	v10.d[1],x10
	eor	v31.16b,v7.16b,v11.16b
	ld1	{v28.16b},[x0],#16
	aese	v26.16b,v23.16b

	// The iv for fifth block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d11,x9
	fmov	v11.d[1],x10

	ld1	{v29.16b},[x0],#16
	cbz	x6,Lxts_enc_tail4x	// x6 = x2+0x60 at this point
	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
	eor	v4.16b,v4.16b,v0.16b
	eor	v0.16b,v2.16b,v6.16b
	eor	v5.16b,v5.16b,v1.16b
	eor	v1.16b,v3.16b,v8.16b
	eor	v17.16b,v17.16b,v24.16b
	eor	v24.16b,v27.16b,v9.16b
	eor	v30.16b,v30.16b,v25.16b
	eor	v25.16b,v28.16b,v10.16b
	eor	v31.16b,v31.16b,v26.16b
	st1	{v4.16b},[x1],#16
	eor	v26.16b,v29.16b,v11.16b
	st1	{v5.16b},[x1],#16
	mov	w6,w5
	st1	{v17.16b},[x1],#16
	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
	st1	{v30.16b},[x1],#16
	st1	{v31.16b},[x1],#16
	b.hs	Loop5x_xts_enc		// flags from subs x2,x2,#0x50 above


	// If exactly 4 blocks are left (x2 == -0x10), reuse the five-block loop:
	// the ivs move up one register and x10:x9 is reloaded from v11.
	cmn	x2,#0x10
	b.ne	Loop5x_enc_after
	orr	v11.16b,v10.16b,v10.16b
	orr	v10.16b,v9.16b,v9.16b
	orr	v9.16b,v8.16b,v8.16b
	orr	v8.16b,v6.16b,v6.16b
	fmov	x9,d11
	fmov	x10,v11.d[1]
	eor	v0.16b,v6.16b,v2.16b
	eor	v1.16b,v8.16b,v3.16b
	eor	v24.16b,v27.16b,v9.16b
	eor	v25.16b,v28.16b,v10.16b
	eor	v26.16b,v29.16b,v11.16b
	b.eq	Loop5x_xts_enc		// flags still from the cmn above, so this is always taken here

Loop5x_enc_after:
	add	x2,x2,#0x50		// undo the 0x50 subtracted inside the loop
	cbz	x2,Lxts_enc_done

	add	w6,w5,#2
	subs	x2,x2,#0x30
	b.lo	Lxts_inner_enc_tail

	eor	v0.16b,v6.16b,v27.16b
	eor	v1.16b,v8.16b,v28.16b
	eor	v24.16b,v29.16b,v9.16b
	b	Lxts_outer_enc_tail

.align	4
Lxts_enc_tail4x:
	// Only the results in v1, v24, v25 and v26 are stored on this path;
	// v0 and v4 are not written out.
	add	x0,x0,#16
	eor	v5.16b,v1.16b,v5.16b
	st1	{v5.16b},[x1],#16
	eor	v17.16b,v24.16b,v17.16b
	st1	{v17.16b},[x1],#16
	eor	v30.16b,v25.16b,v30.16b
	eor	v31.16b,v26.16b,v31.16b
	st1	{v30.16b,v31.16b},[x1],#32

	b	Lxts_enc_done
.align	4
Lxts_outer_enc_tail:
	// Three-block path: v0, v1 and v24 are encrypted together.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Lxts_outer_enc_tail

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	eor	v4.16b,v6.16b,v7.16b
	subs	x2,x2,#0x30
	// The iv for first block
	fmov	x9,d9
	fmov	x10,v9.d[1]
	//mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d6,x9
	fmov	v6.d[1],x10
	eor	v5.16b,v8.16b,v7.16b
	csel	x6,x2,x6,lo		// x6, w6, is zero at this point
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	eor	v17.16b,v9.16b,v7.16b

	add	x6,x6,#0x20
	add	x0,x0,x6
	mov	x7,x3			// rewind the round-key pointer

	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	aese	v0.16b,v23.16b
	aese	v1.16b,v23.16b
	aese	v24.16b,v23.16b
	ld1	{v27.16b},[x0],#16
	add	w6,w5,#2
	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
	eor	v4.16b,v4.16b,v0.16b
	eor	v5.16b,v5.16b,v1.16b
	eor	v24.16b,v24.16b,v17.16b
	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	st1	{v5.16b},[x1],#16
	st1	{v24.16b},[x1],#16
	cmn	x2,#0x30
	b.eq	Lxts_enc_done
Lxts_encxor_one:
	orr	v28.16b,v3.16b,v3.16b
	orr	v29.16b,v27.16b,v27.16b
	nop

Lxts_inner_enc_tail:
	// One- or two-block path: v1 and v24 are encrypted together.
	cmn	x2,#0x10
	eor	v1.16b,v28.16b,v6.16b
	eor	v24.16b,v29.16b,v8.16b
	b.eq	Lxts_enc_tail_loop
	eor	v24.16b,v29.16b,v6.16b	// x2 != -0x10: v24 uses iv v6 instead of v8
Lxts_enc_tail_loop:
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Lxts_enc_tail_loop

	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	cmn	x2,#0x20		// flags consumed by the b.eq Lxts_enc_one below
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	eor	v5.16b,v6.16b,v7.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	eor	v17.16b,v8.16b,v7.16b
	aese	v1.16b,v23.16b
	aese	v24.16b,v23.16b
	b.eq	Lxts_enc_one
	// Two blocks: store both, then derive the next iv from v8 into v6.
	eor	v5.16b,v5.16b,v1.16b
	st1	{v5.16b},[x1],#16
	eor	v17.16b,v17.16b,v24.16b
	orr	v6.16b,v8.16b,v8.16b
	st1	{v17.16b},[x1],#16
	fmov	x9,d8
	fmov	x10,v8.d[1]
	mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d6,x9
	fmov	v6.d[1],x10
	b	Lxts_enc_done

Lxts_enc_one:
	// One block: store the v24 result, then advance the iv in v6.
	eor	v5.16b,v5.16b,v24.16b
	orr	v6.16b,v6.16b,v6.16b
	st1	{v5.16b},[x1],#16
	fmov	x9,d6
	fmov	x10,v6.d[1]
	mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d6,x9
	fmov	v6.d[1],x10
	b	Lxts_enc_done
.align	5
Lxts_enc_done:
	// Process the tail block with cipher stealing.
	tst	x21,#0xf
	b.eq	Lxts_abort		// no trailing bytes: done

	mov	x20,x0			// x20 = source of the trailing input bytes
	mov	x13,x1			// x13 = where the final partial output block goes
	sub	x1,x1,#16		// x1 = 16 bytes before the current output position
	// NOTE(review): this is the only label in the function without the L
	// prefix; confirm whether it is meant to be an assembler-local label.
.composite_enc_loop:
	// For byte index x21-1 down to 0: the byte at [x1] is copied to the
	// partial block at x13 and replaced by the trailing input byte.
	subs	x21,x21,#1
	ldrb	w15,[x1,x21]
	ldrb	w14,[x20,x21]
	strb	w15,[x13,x21]
	strb	w14,[x1,x21]
	b.gt	.composite_enc_loop
Lxts_enc_load_done:
	ld1	{v26.16b},[x1]
	eor	v26.16b,v26.16b,v6.16b	// XOR the composite block with iv v6 before encryption

	// Encrypt the composite block to get the second-to-last ciphertext block
	ldr	w6,[x3,#240]		// w6 = round count
	ld1	{v0.4s},[x3],#16
	sub	w6,w6,#2
	ld1	{v1.4s},[x3],#16		// load key schedule...
Loop_final_enc:
	aese	v26.16b,v0.16b
	aesmc	v26.16b,v26.16b
	ld1	{v0.4s},[x3],#16
	subs	w6,w6,#2
	aese	v26.16b,v1.16b
	aesmc	v26.16b,v26.16b
	ld1	{v1.4s},[x3],#16
	b.gt	Loop_final_enc

	aese	v26.16b,v0.16b
	aesmc	v26.16b,v26.16b
	ld1	{v0.4s},[x3]
	aese	v26.16b,v1.16b
	eor	v26.16b,v26.16b,v0.16b
	eor	v26.16b,v26.16b,v6.16b	// XOR with iv v6 again after encryption
	st1	{v26.16b},[x1]

Lxts_abort:
	// Restore the registers saved at Lxts_enc_big_size and pop the frame.
	ldp	x21,x22,[sp,#48]
	ldp	d8,d9,[sp,#32]
	ldp	d10,d11,[sp,#16]
	ldp	x19,x20,[sp],#64
Lxts_enc_final_abort:
	ret

.globl	_aes_v8_xts_decrypt

.align	5
_aes_v8_xts_decrypt:
	cmp	x2,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	Lxts_dec_big_size
	// Encrypt the iv with key2, as the first XEX iv.
2502 ldr w6,[x4,#240] 2503 ld1 {v0.4s},[x4],#16 2504 ld1 {v6.16b},[x5] 2505 sub w6,w6,#2 2506 ld1 {v1.4s},[x4],#16 2507 2508Loop_dec_small_iv_enc: 2509 aese v6.16b,v0.16b 2510 aesmc v6.16b,v6.16b 2511 ld1 {v0.4s},[x4],#16 2512 subs w6,w6,#2 2513 aese v6.16b,v1.16b 2514 aesmc v6.16b,v6.16b 2515 ld1 {v1.4s},[x4],#16 2516 b.gt Loop_dec_small_iv_enc 2517 2518 aese v6.16b,v0.16b 2519 aesmc v6.16b,v6.16b 2520 ld1 {v0.4s},[x4] 2521 aese v6.16b,v1.16b 2522 eor v6.16b,v6.16b,v0.16b 2523 2524 ld1 {v0.16b},[x0] 2525 eor v0.16b,v6.16b,v0.16b 2526 2527 ldr w6,[x3,#240] 2528 ld1 {v28.4s,v29.4s},[x3],#32 // load key schedule... 2529 2530 aesd v0.16b,v28.16b 2531 aesimc v0.16b,v0.16b 2532 ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule... 2533 aesd v0.16b,v29.16b 2534 aesimc v0.16b,v0.16b 2535 subs w6,w6,#10 // bias 2536 b.eq Lxts_128_dec 2537Lxts_dec_round_loop: 2538 aesd v0.16b,v16.16b 2539 aesimc v0.16b,v0.16b 2540 ld1 {v16.4s},[x3],#16 // load key schedule... 2541 aesd v0.16b,v17.16b 2542 aesimc v0.16b,v0.16b 2543 ld1 {v17.4s},[x3],#16 // load key schedule... 2544 subs w6,w6,#2 // bias 2545 b.gt Lxts_dec_round_loop 2546Lxts_128_dec: 2547 ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule... 2548 aesd v0.16b,v16.16b 2549 aesimc v0.16b,v0.16b 2550 aesd v0.16b,v17.16b 2551 aesimc v0.16b,v0.16b 2552 ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule... 2553 aesd v0.16b,v18.16b 2554 aesimc v0.16b,v0.16b 2555 aesd v0.16b,v19.16b 2556 aesimc v0.16b,v0.16b 2557 ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule... 2558 aesd v0.16b,v20.16b 2559 aesimc v0.16b,v0.16b 2560 aesd v0.16b,v21.16b 2561 aesimc v0.16b,v0.16b 2562 ld1 {v7.4s},[x3] 2563 aesd v0.16b,v22.16b 2564 aesimc v0.16b,v0.16b 2565 aesd v0.16b,v23.16b 2566 eor v0.16b,v0.16b,v7.16b 2567 eor v0.16b,v6.16b,v0.16b 2568 st1 {v0.16b},[x1] 2569 b Lxts_dec_final_abort 2570Lxts_dec_big_size: 2571 stp x19,x20,[sp,#-64]! 
2572 stp x21,x22,[sp,#48] 2573 stp d8,d9,[sp,#32] 2574 stp d10,d11,[sp,#16] 2575 2576 and x21,x2,#0xf 2577 and x2,x2,#-16 2578 subs x2,x2,#16 2579 mov x8,#16 2580 b.lo Lxts_dec_abort 2581 2582 // Encrypt the iv with key2, as the first XEX iv 2583 ldr w6,[x4,#240] 2584 ld1 {v0.4s},[x4],#16 2585 ld1 {v6.16b},[x5] 2586 sub w6,w6,#2 2587 ld1 {v1.4s},[x4],#16 2588 2589Loop_dec_iv_enc: 2590 aese v6.16b,v0.16b 2591 aesmc v6.16b,v6.16b 2592 ld1 {v0.4s},[x4],#16 2593 subs w6,w6,#2 2594 aese v6.16b,v1.16b 2595 aesmc v6.16b,v6.16b 2596 ld1 {v1.4s},[x4],#16 2597 b.gt Loop_dec_iv_enc 2598 2599 aese v6.16b,v0.16b 2600 aesmc v6.16b,v6.16b 2601 ld1 {v0.4s},[x4] 2602 aese v6.16b,v1.16b 2603 eor v6.16b,v6.16b,v0.16b 2604 2605 // The iv for second block 2606 // x9- iv(low), x10 - iv(high) 2607 // the five ivs stored into, v6.16b,v8.16b,v9.16b,v10.16b,v11.16b 2608 fmov x9,d6 2609 fmov x10,v6.d[1] 2610 mov w19,#0x87 2611 extr x22,x10,x10,#32 2612 extr x10,x10,x9,#63 2613 and w11,w19,w22,asr #31 2614 eor x9,x11,x9,lsl #1 2615 fmov d8,x9 2616 fmov v8.d[1],x10 2617 2618 ldr w5,[x3,#240] // load rounds number 2619 2620 // The iv for third block 2621 extr x22,x10,x10,#32 2622 extr x10,x10,x9,#63 2623 and w11,w19,w22,asr #31 2624 eor x9,x11,x9,lsl #1 2625 fmov d9,x9 2626 fmov v9.d[1],x10 2627 2628 ld1 {v16.4s,v17.4s},[x3] // load key schedule... 2629 sub w5,w5,#6 2630 add x7,x3,x5,lsl#4 // pointer to last 7 round keys 2631 sub w5,w5,#2 2632 ld1 {v18.4s,v19.4s},[x7],#32 // load key schedule... 
2633 ld1 {v20.4s,v21.4s},[x7],#32 2634 ld1 {v22.4s,v23.4s},[x7],#32 2635 ld1 {v7.4s},[x7] 2636 2637 // The iv for fourth block 2638 extr x22,x10,x10,#32 2639 extr x10,x10,x9,#63 2640 and w11,w19,w22,asr #31 2641 eor x9,x11,x9,lsl #1 2642 fmov d10,x9 2643 fmov v10.d[1],x10 2644 2645 add x7,x3,#32 2646 mov w6,w5 2647 b Lxts_dec 2648 2649 // Decryption 2650.align 5 2651Lxts_dec: 2652 tst x21,#0xf 2653 b.eq Lxts_dec_begin 2654 subs x2,x2,#16 2655 csel x8,xzr,x8,eq 2656 ld1 {v0.16b},[x0],#16 2657 b.lo Lxts_done 2658 sub x0,x0,#16 2659Lxts_dec_begin: 2660 ld1 {v0.16b},[x0],x8 2661 subs x2,x2,#32 // bias 2662 add w6,w5,#2 2663 orr v3.16b,v0.16b,v0.16b 2664 orr v1.16b,v0.16b,v0.16b 2665 orr v28.16b,v0.16b,v0.16b 2666 ld1 {v24.16b},[x0],#16 2667 orr v27.16b,v24.16b,v24.16b 2668 orr v29.16b,v24.16b,v24.16b 2669 b.lo Lxts_inner_dec_tail 2670 eor v0.16b,v0.16b,v6.16b // before decryt, xor with iv 2671 eor v24.16b,v24.16b,v8.16b 2672 2673 orr v1.16b,v24.16b,v24.16b 2674 ld1 {v24.16b},[x0],#16 2675 orr v2.16b,v0.16b,v0.16b 2676 orr v3.16b,v1.16b,v1.16b 2677 eor v27.16b,v24.16b,v9.16b // third block xox with third iv 2678 eor v24.16b,v24.16b,v9.16b 2679 cmp x2,#32 2680 b.lo Lxts_outer_dec_tail 2681 2682 ld1 {v25.16b},[x0],#16 2683 2684 // The iv for fifth block 2685 extr x22,x10,x10,#32 2686 extr x10,x10,x9,#63 2687 and w11,w19,w22,asr #31 2688 eor x9,x11,x9,lsl #1 2689 fmov d11,x9 2690 fmov v11.d[1],x10 2691 2692 ld1 {v26.16b},[x0],#16 2693 eor v25.16b,v25.16b,v10.16b // the fourth block 2694 eor v26.16b,v26.16b,v11.16b 2695 sub x2,x2,#32 // bias 2696 mov w6,w5 2697 b Loop5x_xts_dec 2698 2699.align 4 2700Loop5x_xts_dec: 2701 aesd v0.16b,v16.16b 2702 aesimc v0.16b,v0.16b 2703 aesd v1.16b,v16.16b 2704 aesimc v1.16b,v1.16b 2705 aesd v24.16b,v16.16b 2706 aesimc v24.16b,v24.16b 2707 aesd v25.16b,v16.16b 2708 aesimc v25.16b,v25.16b 2709 aesd v26.16b,v16.16b 2710 aesimc v26.16b,v26.16b 2711 ld1 {v16.4s},[x7],#16 // load key schedule... 
2712 subs w6,w6,#2 2713 aesd v0.16b,v17.16b 2714 aesimc v0.16b,v0.16b 2715 aesd v1.16b,v17.16b 2716 aesimc v1.16b,v1.16b 2717 aesd v24.16b,v17.16b 2718 aesimc v24.16b,v24.16b 2719 aesd v25.16b,v17.16b 2720 aesimc v25.16b,v25.16b 2721 aesd v26.16b,v17.16b 2722 aesimc v26.16b,v26.16b 2723 ld1 {v17.4s},[x7],#16 // load key schedule... 2724 b.gt Loop5x_xts_dec 2725 2726 aesd v0.16b,v16.16b 2727 aesimc v0.16b,v0.16b 2728 aesd v1.16b,v16.16b 2729 aesimc v1.16b,v1.16b 2730 aesd v24.16b,v16.16b 2731 aesimc v24.16b,v24.16b 2732 aesd v25.16b,v16.16b 2733 aesimc v25.16b,v25.16b 2734 aesd v26.16b,v16.16b 2735 aesimc v26.16b,v26.16b 2736 subs x2,x2,#0x50 // because Lxts_dec_tail4x 2737 2738 aesd v0.16b,v17.16b 2739 aesimc v0.16b,v0.16b 2740 aesd v1.16b,v17.16b 2741 aesimc v1.16b,v1.16b 2742 aesd v24.16b,v17.16b 2743 aesimc v24.16b,v24.16b 2744 aesd v25.16b,v17.16b 2745 aesimc v25.16b,v25.16b 2746 aesd v26.16b,v17.16b 2747 aesimc v26.16b,v26.16b 2748 csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo 2749 mov x7,x3 2750 2751 aesd v0.16b,v18.16b 2752 aesimc v0.16b,v0.16b 2753 aesd v1.16b,v18.16b 2754 aesimc v1.16b,v1.16b 2755 aesd v24.16b,v18.16b 2756 aesimc v24.16b,v24.16b 2757 aesd v25.16b,v18.16b 2758 aesimc v25.16b,v25.16b 2759 aesd v26.16b,v18.16b 2760 aesimc v26.16b,v26.16b 2761 add x0,x0,x6 // x0 is adjusted in such way that 2762 // at exit from the loop v1.16b-v26.16b 2763 // are loaded with last "words" 2764 add x6,x2,#0x60 // because Lxts_dec_tail4x 2765 2766 aesd v0.16b,v19.16b 2767 aesimc v0.16b,v0.16b 2768 aesd v1.16b,v19.16b 2769 aesimc v1.16b,v1.16b 2770 aesd v24.16b,v19.16b 2771 aesimc v24.16b,v24.16b 2772 aesd v25.16b,v19.16b 2773 aesimc v25.16b,v25.16b 2774 aesd v26.16b,v19.16b 2775 aesimc v26.16b,v26.16b 2776 2777 aesd v0.16b,v20.16b 2778 aesimc v0.16b,v0.16b 2779 aesd v1.16b,v20.16b 2780 aesimc v1.16b,v1.16b 2781 aesd v24.16b,v20.16b 2782 aesimc v24.16b,v24.16b 2783 aesd v25.16b,v20.16b 2784 aesimc v25.16b,v25.16b 2785 aesd v26.16b,v20.16b 2786 aesimc 
v26.16b,v26.16b 2787 2788 aesd v0.16b,v21.16b 2789 aesimc v0.16b,v0.16b 2790 aesd v1.16b,v21.16b 2791 aesimc v1.16b,v1.16b 2792 aesd v24.16b,v21.16b 2793 aesimc v24.16b,v24.16b 2794 aesd v25.16b,v21.16b 2795 aesimc v25.16b,v25.16b 2796 aesd v26.16b,v21.16b 2797 aesimc v26.16b,v26.16b 2798 2799 aesd v0.16b,v22.16b 2800 aesimc v0.16b,v0.16b 2801 aesd v1.16b,v22.16b 2802 aesimc v1.16b,v1.16b 2803 aesd v24.16b,v22.16b 2804 aesimc v24.16b,v24.16b 2805 aesd v25.16b,v22.16b 2806 aesimc v25.16b,v25.16b 2807 aesd v26.16b,v22.16b 2808 aesimc v26.16b,v26.16b 2809 2810 eor v4.16b,v7.16b,v6.16b 2811 aesd v0.16b,v23.16b 2812 // The iv for first block of next iteration. 2813 extr x22,x10,x10,#32 2814 extr x10,x10,x9,#63 2815 and w11,w19,w22,asr #31 2816 eor x9,x11,x9,lsl #1 2817 fmov d6,x9 2818 fmov v6.d[1],x10 2819 eor v5.16b,v7.16b,v8.16b 2820 ld1 {v2.16b},[x0],#16 2821 aesd v1.16b,v23.16b 2822 // The iv for second block 2823 extr x22,x10,x10,#32 2824 extr x10,x10,x9,#63 2825 and w11,w19,w22,asr #31 2826 eor x9,x11,x9,lsl #1 2827 fmov d8,x9 2828 fmov v8.d[1],x10 2829 eor v17.16b,v7.16b,v9.16b 2830 ld1 {v3.16b},[x0],#16 2831 aesd v24.16b,v23.16b 2832 // The iv for third block 2833 extr x22,x10,x10,#32 2834 extr x10,x10,x9,#63 2835 and w11,w19,w22,asr #31 2836 eor x9,x11,x9,lsl #1 2837 fmov d9,x9 2838 fmov v9.d[1],x10 2839 eor v30.16b,v7.16b,v10.16b 2840 ld1 {v27.16b},[x0],#16 2841 aesd v25.16b,v23.16b 2842 // The iv for fourth block 2843 extr x22,x10,x10,#32 2844 extr x10,x10,x9,#63 2845 and w11,w19,w22,asr #31 2846 eor x9,x11,x9,lsl #1 2847 fmov d10,x9 2848 fmov v10.d[1],x10 2849 eor v31.16b,v7.16b,v11.16b 2850 ld1 {v28.16b},[x0],#16 2851 aesd v26.16b,v23.16b 2852 2853 // The iv for fifth block 2854 extr x22,x10,x10,#32 2855 extr x10,x10,x9,#63 2856 and w11,w19,w22,asr #31 2857 eor x9,x11,x9,lsl #1 2858 fmov d11,x9 2859 fmov v11.d[1],x10 2860 2861 ld1 {v29.16b},[x0],#16 2862 cbz x6,Lxts_dec_tail4x 2863 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] 2864 eor v4.16b,v4.16b,v0.16b 
2865 eor v0.16b,v2.16b,v6.16b 2866 eor v5.16b,v5.16b,v1.16b 2867 eor v1.16b,v3.16b,v8.16b 2868 eor v17.16b,v17.16b,v24.16b 2869 eor v24.16b,v27.16b,v9.16b 2870 eor v30.16b,v30.16b,v25.16b 2871 eor v25.16b,v28.16b,v10.16b 2872 eor v31.16b,v31.16b,v26.16b 2873 st1 {v4.16b},[x1],#16 2874 eor v26.16b,v29.16b,v11.16b 2875 st1 {v5.16b},[x1],#16 2876 mov w6,w5 2877 st1 {v17.16b},[x1],#16 2878 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] 2879 st1 {v30.16b},[x1],#16 2880 st1 {v31.16b},[x1],#16 2881 b.hs Loop5x_xts_dec 2882 2883 cmn x2,#0x10 2884 b.ne Loop5x_dec_after 2885 // If x2(x2) equal to -0x10, the left blocks is 4. 2886 // After specially processing, utilize the five blocks processing again. 2887 // It will use the following IVs: v6.16b,v6.16b,v8.16b,v9.16b,v10.16b. 2888 orr v11.16b,v10.16b,v10.16b 2889 orr v10.16b,v9.16b,v9.16b 2890 orr v9.16b,v8.16b,v8.16b 2891 orr v8.16b,v6.16b,v6.16b 2892 fmov x9,d11 2893 fmov x10,v11.d[1] 2894 eor v0.16b,v6.16b,v2.16b 2895 eor v1.16b,v8.16b,v3.16b 2896 eor v24.16b,v27.16b,v9.16b 2897 eor v25.16b,v28.16b,v10.16b 2898 eor v26.16b,v29.16b,v11.16b 2899 b.eq Loop5x_xts_dec 2900 2901Loop5x_dec_after: 2902 add x2,x2,#0x50 2903 cbz x2,Lxts_done 2904 2905 add w6,w5,#2 2906 subs x2,x2,#0x30 2907 b.lo Lxts_inner_dec_tail 2908 2909 eor v0.16b,v6.16b,v27.16b 2910 eor v1.16b,v8.16b,v28.16b 2911 eor v24.16b,v29.16b,v9.16b 2912 b Lxts_outer_dec_tail 2913 2914.align 4 2915Lxts_dec_tail4x: 2916 add x0,x0,#16 2917 tst x21,#0xf 2918 eor v5.16b,v1.16b,v4.16b 2919 st1 {v5.16b},[x1],#16 2920 eor v17.16b,v24.16b,v17.16b 2921 st1 {v17.16b},[x1],#16 2922 eor v30.16b,v25.16b,v30.16b 2923 eor v31.16b,v26.16b,v31.16b 2924 st1 {v30.16b,v31.16b},[x1],#32 2925 2926 b.eq Lxts_dec_abort 2927 ld1 {v0.16b},[x0],#16 2928 b Lxts_done 2929.align 4 2930Lxts_outer_dec_tail: 2931 aesd v0.16b,v16.16b 2932 aesimc v0.16b,v0.16b 2933 aesd v1.16b,v16.16b 2934 aesimc v1.16b,v1.16b 2935 aesd v24.16b,v16.16b 2936 aesimc v24.16b,v24.16b 2937 ld1 {v16.4s},[x7],#16 2938 subs 
w6,w6,#2 2939 aesd v0.16b,v17.16b 2940 aesimc v0.16b,v0.16b 2941 aesd v1.16b,v17.16b 2942 aesimc v1.16b,v1.16b 2943 aesd v24.16b,v17.16b 2944 aesimc v24.16b,v24.16b 2945 ld1 {v17.4s},[x7],#16 2946 b.gt Lxts_outer_dec_tail 2947 2948 aesd v0.16b,v16.16b 2949 aesimc v0.16b,v0.16b 2950 aesd v1.16b,v16.16b 2951 aesimc v1.16b,v1.16b 2952 aesd v24.16b,v16.16b 2953 aesimc v24.16b,v24.16b 2954 eor v4.16b,v6.16b,v7.16b 2955 subs x2,x2,#0x30 2956 // The iv for first block 2957 fmov x9,d9 2958 fmov x10,v9.d[1] 2959 mov w19,#0x87 2960 extr x22,x10,x10,#32 2961 extr x10,x10,x9,#63 2962 and w11,w19,w22,asr #31 2963 eor x9,x11,x9,lsl #1 2964 fmov d6,x9 2965 fmov v6.d[1],x10 2966 eor v5.16b,v8.16b,v7.16b 2967 csel x6,x2,x6,lo // x6, w6, is zero at this point 2968 aesd v0.16b,v17.16b 2969 aesimc v0.16b,v0.16b 2970 aesd v1.16b,v17.16b 2971 aesimc v1.16b,v1.16b 2972 aesd v24.16b,v17.16b 2973 aesimc v24.16b,v24.16b 2974 eor v17.16b,v9.16b,v7.16b 2975 // The iv for second block 2976 extr x22,x10,x10,#32 2977 extr x10,x10,x9,#63 2978 and w11,w19,w22,asr #31 2979 eor x9,x11,x9,lsl #1 2980 fmov d8,x9 2981 fmov v8.d[1],x10 2982 2983 add x6,x6,#0x20 2984 add x0,x0,x6 // x0 is adjusted to the last data 2985 2986 mov x7,x3 2987 2988 // The iv for third block 2989 extr x22,x10,x10,#32 2990 extr x10,x10,x9,#63 2991 and w11,w19,w22,asr #31 2992 eor x9,x11,x9,lsl #1 2993 fmov d9,x9 2994 fmov v9.d[1],x10 2995 2996 aesd v0.16b,v20.16b 2997 aesimc v0.16b,v0.16b 2998 aesd v1.16b,v20.16b 2999 aesimc v1.16b,v1.16b 3000 aesd v24.16b,v20.16b 3001 aesimc v24.16b,v24.16b 3002 aesd v0.16b,v21.16b 3003 aesimc v0.16b,v0.16b 3004 aesd v1.16b,v21.16b 3005 aesimc v1.16b,v1.16b 3006 aesd v24.16b,v21.16b 3007 aesimc v24.16b,v24.16b 3008 aesd v0.16b,v22.16b 3009 aesimc v0.16b,v0.16b 3010 aesd v1.16b,v22.16b 3011 aesimc v1.16b,v1.16b 3012 aesd v24.16b,v22.16b 3013 aesimc v24.16b,v24.16b 3014 ld1 {v27.16b},[x0],#16 3015 aesd v0.16b,v23.16b 3016 aesd v1.16b,v23.16b 3017 aesd v24.16b,v23.16b 3018 ld1 {v16.4s},[x7],#16 
// re-pre-load rndkey[0] 3019 add w6,w5,#2 3020 eor v4.16b,v4.16b,v0.16b 3021 eor v5.16b,v5.16b,v1.16b 3022 eor v24.16b,v24.16b,v17.16b 3023 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] 3024 st1 {v4.16b},[x1],#16 3025 st1 {v5.16b},[x1],#16 3026 st1 {v24.16b},[x1],#16 3027 3028 cmn x2,#0x30 3029 add x2,x2,#0x30 3030 b.eq Lxts_done 3031 sub x2,x2,#0x30 3032 orr v28.16b,v3.16b,v3.16b 3033 orr v29.16b,v27.16b,v27.16b 3034 nop 3035 3036Lxts_inner_dec_tail: 3037 // x2 == -0x10 means two blocks left. 3038 cmn x2,#0x10 3039 eor v1.16b,v28.16b,v6.16b 3040 eor v24.16b,v29.16b,v8.16b 3041 b.eq Lxts_dec_tail_loop 3042 eor v24.16b,v29.16b,v6.16b 3043Lxts_dec_tail_loop: 3044 aesd v1.16b,v16.16b 3045 aesimc v1.16b,v1.16b 3046 aesd v24.16b,v16.16b 3047 aesimc v24.16b,v24.16b 3048 ld1 {v16.4s},[x7],#16 3049 subs w6,w6,#2 3050 aesd v1.16b,v17.16b 3051 aesimc v1.16b,v1.16b 3052 aesd v24.16b,v17.16b 3053 aesimc v24.16b,v24.16b 3054 ld1 {v17.4s},[x7],#16 3055 b.gt Lxts_dec_tail_loop 3056 3057 aesd v1.16b,v16.16b 3058 aesimc v1.16b,v1.16b 3059 aesd v24.16b,v16.16b 3060 aesimc v24.16b,v24.16b 3061 aesd v1.16b,v17.16b 3062 aesimc v1.16b,v1.16b 3063 aesd v24.16b,v17.16b 3064 aesimc v24.16b,v24.16b 3065 aesd v1.16b,v20.16b 3066 aesimc v1.16b,v1.16b 3067 aesd v24.16b,v20.16b 3068 aesimc v24.16b,v24.16b 3069 cmn x2,#0x20 3070 aesd v1.16b,v21.16b 3071 aesimc v1.16b,v1.16b 3072 aesd v24.16b,v21.16b 3073 aesimc v24.16b,v24.16b 3074 eor v5.16b,v6.16b,v7.16b 3075 aesd v1.16b,v22.16b 3076 aesimc v1.16b,v1.16b 3077 aesd v24.16b,v22.16b 3078 aesimc v24.16b,v24.16b 3079 eor v17.16b,v8.16b,v7.16b 3080 aesd v1.16b,v23.16b 3081 aesd v24.16b,v23.16b 3082 b.eq Lxts_dec_one 3083 eor v5.16b,v5.16b,v1.16b 3084 eor v17.16b,v17.16b,v24.16b 3085 orr v6.16b,v9.16b,v9.16b 3086 orr v8.16b,v10.16b,v10.16b 3087 st1 {v5.16b},[x1],#16 3088 st1 {v17.16b},[x1],#16 3089 add x2,x2,#16 3090 b Lxts_done 3091 3092Lxts_dec_one: 3093 eor v5.16b,v5.16b,v24.16b 3094 orr v6.16b,v8.16b,v8.16b 3095 orr v8.16b,v9.16b,v9.16b 3096 
st1 {v5.16b},[x1],#16 3097 add x2,x2,#32 3098 3099Lxts_done: 3100 tst x21,#0xf 3101 b.eq Lxts_dec_abort 3102 // Processing the last two blocks with cipher stealing. 3103 mov x7,x3 3104 cbnz x2,Lxts_dec_1st_done 3105 ld1 {v0.16b},[x0],#16 3106 3107 // Decrypt the last secod block to get the last plain text block 3108Lxts_dec_1st_done: 3109 eor v26.16b,v0.16b,v8.16b 3110 ldr w6,[x3,#240] 3111 ld1 {v0.4s},[x3],#16 3112 sub w6,w6,#2 3113 ld1 {v1.4s},[x3],#16 3114Loop_final_2nd_dec: 3115 aesd v26.16b,v0.16b 3116 aesimc v26.16b,v26.16b 3117 ld1 {v0.4s},[x3],#16 // load key schedule... 3118 subs w6,w6,#2 3119 aesd v26.16b,v1.16b 3120 aesimc v26.16b,v26.16b 3121 ld1 {v1.4s},[x3],#16 // load key schedule... 3122 b.gt Loop_final_2nd_dec 3123 3124 aesd v26.16b,v0.16b 3125 aesimc v26.16b,v26.16b 3126 ld1 {v0.4s},[x3] 3127 aesd v26.16b,v1.16b 3128 eor v26.16b,v26.16b,v0.16b 3129 eor v26.16b,v26.16b,v8.16b 3130 st1 {v26.16b},[x1] 3131 3132 mov x20,x0 3133 add x13,x1,#16 3134 3135 // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks 3136 // to get the last encrypted block. 3137.composite_dec_loop: 3138 subs x21,x21,#1 3139 ldrb w15,[x1,x21] 3140 ldrb w14,[x20,x21] 3141 strb w15,[x13,x21] 3142 strb w14,[x1,x21] 3143 b.gt .composite_dec_loop 3144Lxts_dec_load_done: 3145 ld1 {v26.16b},[x1] 3146 eor v26.16b,v26.16b,v6.16b 3147 3148 // Decrypt the composite block to get the last second plain text block 3149 ldr w6,[x7,#240] 3150 ld1 {v0.4s},[x7],#16 3151 sub w6,w6,#2 3152 ld1 {v1.4s},[x7],#16 3153Loop_final_dec: 3154 aesd v26.16b,v0.16b 3155 aesimc v26.16b,v26.16b 3156 ld1 {v0.4s},[x7],#16 // load key schedule... 3157 subs w6,w6,#2 3158 aesd v26.16b,v1.16b 3159 aesimc v26.16b,v26.16b 3160 ld1 {v1.4s},[x7],#16 // load key schedule... 
3161 b.gt Loop_final_dec 3162 3163 aesd v26.16b,v0.16b 3164 aesimc v26.16b,v26.16b 3165 ld1 {v0.4s},[x7] 3166 aesd v26.16b,v1.16b 3167 eor v26.16b,v26.16b,v0.16b 3168 eor v26.16b,v26.16b,v6.16b 3169 st1 {v26.16b},[x1] 3170 3171Lxts_dec_abort: 3172 ldp x21,x22,[sp,#48] 3173 ldp d8,d9,[sp,#32] 3174 ldp d10,d11,[sp,#16] 3175 ldp x19,x20,[sp],#64 3176 3177Lxts_dec_final_abort: 3178 ret 3179 3180#endif 3181