// ChaCha20 stream cipher for AArch64. The syntax is Mach-O flavoured:
// underscore-prefixed globals, L-prefixed local labels, @PAGE/@PAGEOFF
// relocations. The file is run through the C preprocessor (see the
// conditionals below).
#ifndef __KERNEL__
# include "arm_arch.h"

.private_extern	_OPENSSL_armcap_P
#endif

.text

// Constant pool shared by every code path in this file.
.align	5
Lsigma:
// The 16 bytes "expand 32-byte k": the first four words of the state.
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
Lone:
// Per-lane block-counter offsets used by the NEON code.
.long	1,2,3,4
Lrot24:
// TBL byte indices that rotate every 32-bit lane right by 24 bits
// (equivalently left by 8).
.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
// NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by @dot-asm"
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
.align	2

.globl	_ChaCha20_ctr32

//----------------------------------------------------------------------
// _ChaCha20_ctr32: XOR the input with the ChaCha20 keystream.
//
// Register arguments, as used by the code below:
//   x0  output pointer
//   x1  input pointer
//   x2  length in bytes (0 returns immediately via Labort)
//   x3  pointer to the 32-byte key
//   x4  pointer to the 16-byte counter block; its first 32-bit word is the
//       block counter that is stepped once per 64-byte block
// NOTE(review): presumably the C prototype is
//   void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t len,
//                       const uint32_t key[8], const uint32_t counter[4]);
// confirm against the declaring header.
//
// Dispatch: inputs of 192 bytes or more go to LChaCha20_neon when the
// ARMV7_NEON bit of _OPENSSL_armcap_P is set. With __KERNEL__ defined that
// test is compiled out and every length takes the scalar path here.
//
// Scalar path register roles:
//   x22,x23          sigma, two 32-bit words per register
//   x24..x27         key, two words per register
//   x28,x30          counter block, two words per register
//   w5..w8           working state row a (words 0-3)
//   w9..w12          working state row b (words 4-7)
//   w13..w16         working state row c (words 8-11)
//   w17,w19,w20,w21  working state row d (words 12-15); x18 is not used
//   x4               round-loop counter (the counter pointer is dead once
//                    x28/x30 are loaded); scratch pointer in the tail
//
// Stack: 96-byte frame holding x29/x30 and x19..x28, plus a 64-byte scratch
// block below it. The scratch block is only written on the partial-block
// path and is zeroed again before returning.
//----------------------------------------------------------------------
.align	5
_ChaCha20_ctr32:
	cbz	x2,Labort		// nothing to do for len == 0
	cmp	x2,#192
	b.lo	Lshort			// short inputs always stay scalar

#ifndef __KERNEL__
	adrp	x17,_OPENSSL_armcap_P@PAGE
	ldr	w17,[x17,_OPENSSL_armcap_P@PAGEOFF]
	tst	w17,#ARMV7_NEON
	b.ne	LChaCha20_neon		// NEON available: use the vector code
#endif

Lshort:
.long	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0		// x29 = frame base, used by the epilogues

	adr	x5,Lsigma
	stp	x19,x20,[sp,#16]	// save callee-saved x19..x28
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64		// 64-byte scratch for a partial last block

	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef	__AARCH64EB__
	// Big-endian: swap the 32-bit halves so that the word that comes first
	// in memory sits in the low half, as the unpack code below expects.
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

// One iteration per 64-byte block.
Loop_outer:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10			// 10 iterations x (column + diagonal round)
	subs	x2,x2,#64		// flags are consumed by b.lo/b.hi after the
					// loop; nothing in between sets flags
Loop:
	sub	x4,x4,#1
	// Column round: quarter-rounds on (a,b,c,d) = (w5,w9,w13,w17),
	// (w6,w10,w14,w19), (w7,w11,w15,w20), (w8,w12,w16,w21).
	add	w5,w5,w9		// a += b
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5		// d ^= a
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16		// d = rotl(d,16)
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17		// c += d
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13		// b ^= c
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20		// b = rotl(b,12)
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9		// a += b
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5		// d ^= a
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24		// d = rotl(d,8)
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17		// c += d
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13		// b ^= c
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25		// b = rotl(b,7)
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Diagonal round: quarter-rounds on (w5,w10,w15,w21),
	// (w6,w11,w16,w17), (w7,w12,w13,w19), (w8,w9,w14,w20).
	add	w5,w5,w10		// a += b
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5		// d ^= a
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16		// d = rotl(d,16)
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21		// c += d
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15		// b ^= c
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20		// b = rotl(b,12)
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10		// a += b
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5		// d ^= a
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24		// d = rotl(d,8)
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21		// c += d
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15		// b ^= c
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25		// b = rotl(b,7)
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,Loop

	// Add the input block to the permuted state. The odd words use 64-bit
	// adds; any carry above bit 31 is discarded by the lsl #32 when the
	// words are packed.
	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	Ltail			// fewer than 64 bytes were left (subs above)

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__AARCH64EB__
	// Big-endian: byte-swap the packed keystream before it meets the input.
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6		// output = keystream ^ input
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	// NOTE(review): this is a 64-bit add on a register holding two state
	// words, so a wrap of the low 32-bit counter would carry into the next
	// word. Callers presumably never let the counter wrap - confirm.
	add	x28,x28,#1		// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	Loop_outer		// input remains after this block

	ldp	x19,x20,[x29,#16]	// restore callee-saved registers
	add	sp,sp,#64		// drop the scratch block
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.long	0xd50323bf			// autiasp
Labort:
	ret

// Partial final block. Ltail is reached from the scalar loop with x2 equal
// to (remaining - 64); Less_than_64 is also entered from Ltail_neon with x2
// already holding the remaining byte count. In both cases the keystream
// block is still unpacked in w5..w21 and the remaining count is 1..63.
.align	4
Ltail:
	add	x2,x2,#64		// undo the subs: x2 = bytes remaining
Less_than_64:
	// Bias all three pointers to the end of the data and negate the count
	// so that one index (x2) can count up to zero. x0 is biased by one less
	// because the store below happens after x2 has been incremented.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef	__AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	stp	x5,x7,[sp,#0]		// spill the keystream block to the scratch area
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

Loop_tail:
	ldrb	w10,[x1,x2]		// input byte
	ldrb	w11,[x4,x2]		// keystream byte
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,Loop_tail

	stp	xzr,xzr,[sp,#0]		// wipe the keystream from the stack
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]	// same epilogue as the full-block exit
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.long	0xd50323bf			// autiasp
	ret


//----------------------------------------------------------------------
// _ChaCha20_neon / LChaCha20_neon: NEON-assisted ChaCha20.
//
// Entered from _ChaCha20_ctr32 through the local label LChaCha20_neon with
// the same register arguments (x0 out, x1 in, x2 length, x3 key, x4 counter
// block). The symbol is only made global when __KERNEL__ is defined.
// NOTE(review): there is no zero-length check on this entry, unlike
// _ChaCha20_ctr32. A direct caller (the __KERNEL__ export) presumably never
// passes len == 0 - confirm.
//
// Lengths of 512 bytes or more branch to L512_or_more_neon after the common
// prologue. Otherwise each outer iteration produces 320 bytes of keystream:
// one block on the integer pipeline (same register roles as the scalar
// path) interleaved with four blocks held lane-wise in NEON registers.
//
// NEON register roles inside the round loop:
//   v0..v3           the input block rows: sigma, key low, key high, counter
//   v16,v20,v24,v28  state words 0..3   (row a), one block per lane
//   v17,v21,v25,v29  state words 4..7   (row b)
//   v18,v22,v26,v30  state words 8..11  (row c)
//   v19,v23,v27,v31  state words 12..15 (row d)
//   v4..v7           temporaries
//   v8               per-lane counter offsets, loaded from Lone
//   v9               byte-shuffle table loaded from Lrot24
// The scalar block uses the counter as stored in x28; the NEON lanes use
// that counter plus v8. Both are stepped by 5 per outer iteration.
//
// Stack: same 96-byte frame and 64-byte scratch block as the scalar path.
// The first 16 bytes of the scratch block hold d8/d9 until a tail path
// needs the space for keystream.
//----------------------------------------------------------------------
#ifdef	__KERNEL__
.globl	_ChaCha20_neon
#endif

.align	5
_ChaCha20_neon:
LChaCha20_neon:
.long	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	L512_or_more_neon	// large inputs: jump past the other prologue

	sub	sp,sp,#64

	ldp	x22,x23,[x5]		// load sigma
	ld1	{v0.4s},[x5],#16	// x5 now points at Lone
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v1.4s,v2.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v3.4s},[x4]
	stp	d8,d9,[sp]		// meet ABI requirements
	ld1	{v8.4s,v9.4s},[x5]	// v8 = Lone, v9 = Lrot24
#ifdef	__AARCH64EB__
	rev64	v0.4s,v0.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

// One iteration per 320 bytes. Each state word is broadcast to all four
// lanes, and the scalar copy of the block is unpacked in between.
Loop_outer_neon:
	dup	v16.4s,v0.s[0]		// unpack key block
	mov	w5,w22
	dup	v20.4s,v0.s[1]
	lsr	x6,x22,#32
	dup	v24.4s,v0.s[2]
	mov	w7,w23
	dup	v28.4s,v0.s[3]
	lsr	x8,x23,#32
	dup	v17.4s,v1.s[0]
	mov	w9,w24
	dup	v21.4s,v1.s[1]
	lsr	x10,x24,#32
	dup	v25.4s,v1.s[2]
	mov	w11,w25
	dup	v29.4s,v1.s[3]
	lsr	x12,x25,#32
	dup	v19.4s,v3.s[0]
	mov	w13,w26
	dup	v23.4s,v3.s[1]
	lsr	x14,x26,#32
	dup	v27.4s,v3.s[2]
	mov	w15,w27
	dup	v31.4s,v3.s[3]
	lsr	x16,x27,#32
	add	v19.4s,v19.4s,v8.4s	// give each lane its own block counter
	mov	w17,w28
	dup	v18.4s,v2.s[0]
	lsr	x19,x28,#32
	dup	v22.4s,v2.s[1]
	mov	w20,w30
	dup	v26.4s,v2.s[2]
	lsr	x21,x30,#32
	dup	v30.4s,v2.s[3]

	mov	x4,#10			// 10 iterations x (column + diagonal round)
	subs	x2,x2,#320		// flags are consumed by b.lo/b.hi after the loop
Loop_neon:
	sub	x4,x4,#1
	// Column round, vector and scalar instructions interleaved.
	// Rotations: rev32 on halfwords = rotate by 16; ushr+sli = rotate left
	// by 12 or 7; tbl with v9 = rotate left by 8.
	add	v16.4s,v16.4s,v17.4s
	add	w5,w5,w9
	add	v20.4s,v20.4s,v21.4s
	add	w6,w6,w10
	add	v24.4s,v24.4s,v25.4s
	add	w7,w7,w11
	add	v28.4s,v28.4s,v29.4s
	add	w8,w8,w12
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w5
	eor	v23.16b,v23.16b,v20.16b
	eor	w19,w19,w6
	eor	v27.16b,v27.16b,v24.16b
	eor	w20,w20,w7
	eor	v31.16b,v31.16b,v28.16b
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	rev32	v23.8h,v23.8h
	ror	w19,w19,#16
	rev32	v27.8h,v27.8h
	ror	w20,w20,#16
	rev32	v31.8h,v31.8h
	ror	w21,w21,#16
	add	v18.4s,v18.4s,v19.4s
	add	w13,w13,w17
	add	v22.4s,v22.4s,v23.4s
	add	w14,w14,w19
	add	v26.4s,v26.4s,v27.4s
	add	w15,w15,w20
	add	v30.4s,v30.4s,v31.4s
	add	w16,w16,w21
	eor	v4.16b,v17.16b,v18.16b
	eor	w9,w9,w13
	eor	v5.16b,v21.16b,v22.16b
	eor	w10,w10,w14
	eor	v6.16b,v25.16b,v26.16b
	eor	w11,w11,w15
	eor	v7.16b,v29.16b,v30.16b
	eor	w12,w12,w16
	ushr	v17.4s,v4.4s,#20
	ror	w9,w9,#20
	ushr	v21.4s,v5.4s,#20
	ror	w10,w10,#20
	ushr	v25.4s,v6.4s,#20
	ror	w11,w11,#20
	ushr	v29.4s,v7.4s,#20
	ror	w12,w12,#20
	sli	v17.4s,v4.4s,#12
	add	w5,w5,w9
	sli	v21.4s,v5.4s,#12
	add	w6,w6,w10
	sli	v25.4s,v6.4s,#12
	add	w7,w7,w11
	sli	v29.4s,v7.4s,#12
	add	w8,w8,w12
	add	v16.4s,v16.4s,v17.4s
	eor	w17,w17,w5
	add	v20.4s,v20.4s,v21.4s
	eor	w19,w19,w6
	add	v24.4s,v24.4s,v25.4s
	eor	w20,w20,w7
	add	v28.4s,v28.4s,v29.4s
	eor	w21,w21,w8
	eor	v4.16b,v19.16b,v16.16b
	ror	w17,w17,#24
	eor	v5.16b,v23.16b,v20.16b
	ror	w19,w19,#24
	eor	v6.16b,v27.16b,v24.16b
	ror	w20,w20,#24
	eor	v7.16b,v31.16b,v28.16b
	ror	w21,w21,#24
	tbl	v19.16b,{v4.16b},v9.16b
	add	w13,w13,w17
	tbl	v23.16b,{v5.16b},v9.16b
	add	w14,w14,w19
	tbl	v27.16b,{v6.16b},v9.16b
	add	w15,w15,w20
	tbl	v31.16b,{v7.16b},v9.16b
	add	w16,w16,w21
	add	v18.4s,v18.4s,v19.4s
	eor	w9,w9,w13
	add	v22.4s,v22.4s,v23.4s
	eor	w10,w10,w14
	add	v26.4s,v26.4s,v27.4s
	eor	w11,w11,w15
	add	v30.4s,v30.4s,v31.4s
	eor	w12,w12,w16
	eor	v4.16b,v17.16b,v18.16b
	ror	w9,w9,#25
	eor	v5.16b,v21.16b,v22.16b
	ror	w10,w10,#25
	eor	v6.16b,v25.16b,v26.16b
	ror	w11,w11,#25
	eor	v7.16b,v29.16b,v30.16b
	ror	w12,w12,#25
	ushr	v17.4s,v4.4s,#25
	ushr	v21.4s,v5.4s,#25
	ushr	v25.4s,v6.4s,#25
	ushr	v29.4s,v7.4s,#25
	sli	v17.4s,v4.4s,#7
	sli	v21.4s,v5.4s,#7
	sli	v25.4s,v6.4s,#7
	sli	v29.4s,v7.4s,#7
	// Diagonal round: same steps, with the b/c/d operands taken from the
	// neighbouring columns instead of shuffling any data.
	add	v16.4s,v16.4s,v21.4s
	add	w5,w5,w10
	add	v20.4s,v20.4s,v25.4s
	add	w6,w6,w11
	add	v24.4s,v24.4s,v29.4s
	add	w7,w7,w12
	add	v28.4s,v28.4s,v17.4s
	add	w8,w8,w9
	eor	v31.16b,v31.16b,v16.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v20.16b
	eor	w17,w17,w6
	eor	v23.16b,v23.16b,v24.16b
	eor	w19,w19,w7
	eor	v27.16b,v27.16b,v28.16b
	eor	w20,w20,w8
	rev32	v31.8h,v31.8h
	ror	w21,w21,#16
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	rev32	v23.8h,v23.8h
	ror	w19,w19,#16
	rev32	v27.8h,v27.8h
	ror	w20,w20,#16
	add	v26.4s,v26.4s,v31.4s
	add	w15,w15,w21
	add	v30.4s,v30.4s,v19.4s
	add	w16,w16,w17
	add	v18.4s,v18.4s,v23.4s
	add	w13,w13,w19
	add	v22.4s,v22.4s,v27.4s
	add	w14,w14,w20
	eor	v4.16b,v21.16b,v26.16b
	eor	w10,w10,w15
	eor	v5.16b,v25.16b,v30.16b
	eor	w11,w11,w16
	eor	v6.16b,v29.16b,v18.16b
	eor	w12,w12,w13
	eor	v7.16b,v17.16b,v22.16b
	eor	w9,w9,w14
	ushr	v21.4s,v4.4s,#20
	ror	w10,w10,#20
	ushr	v25.4s,v5.4s,#20
	ror	w11,w11,#20
	ushr	v29.4s,v6.4s,#20
	ror	w12,w12,#20
	ushr	v17.4s,v7.4s,#20
	ror	w9,w9,#20
	sli	v21.4s,v4.4s,#12
	add	w5,w5,w10
	sli	v25.4s,v5.4s,#12
	add	w6,w6,w11
	sli	v29.4s,v6.4s,#12
	add	w7,w7,w12
	sli	v17.4s,v7.4s,#12
	add	w8,w8,w9
	add	v16.4s,v16.4s,v21.4s
	eor	w21,w21,w5
	add	v20.4s,v20.4s,v25.4s
	eor	w17,w17,w6
	add	v24.4s,v24.4s,v29.4s
	eor	w19,w19,w7
	add	v28.4s,v28.4s,v17.4s
	eor	w20,w20,w8
	eor	v4.16b,v31.16b,v16.16b
	ror	w21,w21,#24
	eor	v5.16b,v19.16b,v20.16b
	ror	w17,w17,#24
	eor	v6.16b,v23.16b,v24.16b
	ror	w19,w19,#24
	eor	v7.16b,v27.16b,v28.16b
	ror	w20,w20,#24
	tbl	v31.16b,{v4.16b},v9.16b
	add	w15,w15,w21
	tbl	v19.16b,{v5.16b},v9.16b
	add	w16,w16,w17
	tbl	v23.16b,{v6.16b},v9.16b
	add	w13,w13,w19
	tbl	v27.16b,{v7.16b},v9.16b
	add	w14,w14,w20
	add	v26.4s,v26.4s,v31.4s
	eor	w10,w10,w15
	add	v30.4s,v30.4s,v19.4s
	eor	w11,w11,w16
	add	v18.4s,v18.4s,v23.4s
	eor	w12,w12,w13
	add	v22.4s,v22.4s,v27.4s
	eor	w9,w9,w14
	eor	v4.16b,v21.16b,v26.16b
	ror	w10,w10,#25
	eor	v5.16b,v25.16b,v30.16b
	ror	w11,w11,#25
	eor	v6.16b,v29.16b,v18.16b
	ror	w12,w12,#25
	eor	v7.16b,v17.16b,v22.16b
	ror	w9,w9,#25
	ushr	v21.4s,v4.4s,#25
	ushr	v25.4s,v5.4s,#25
	ushr	v29.4s,v6.4s,#25
	ushr	v17.4s,v7.4s,#25
	sli	v21.4s,v4.4s,#7
	sli	v25.4s,v5.4s,#7
	sli	v29.4s,v6.4s,#7
	sli	v17.4s,v7.4s,#7
	cbnz	x4,Loop_neon

	// Add the per-lane counter offsets to word 12 now; the base counter is
	// added with v3 after the transpose.
	add	v19.4s,v19.4s,v8.4s

	// Transpose each group of four word-vectors so that afterwards
	// v16..v19 hold rows a,b,c,d of the lane-0 block, v20..v23 those of
	// lane 1, v24..v27 lane 2 and v28..v31 lane 3.
	zip1	v4.4s,v16.4s,v20.4s	// transpose data
	zip1	v5.4s,v24.4s,v28.4s
	zip2	v6.4s,v16.4s,v20.4s
	zip2	v7.4s,v24.4s,v28.4s
	zip1	v16.2d,v4.2d,v5.2d
	zip2	v20.2d,v4.2d,v5.2d
	zip1	v24.2d,v6.2d,v7.2d
	zip2	v28.2d,v6.2d,v7.2d

	zip1	v4.4s,v17.4s,v21.4s
	zip1	v5.4s,v25.4s,v29.4s
	zip2	v6.4s,v17.4s,v21.4s
	zip2	v7.4s,v25.4s,v29.4s
	zip1	v17.2d,v4.2d,v5.2d
	zip2	v21.2d,v4.2d,v5.2d
	zip1	v25.2d,v6.2d,v7.2d
	zip2	v29.2d,v6.2d,v7.2d

	zip1	v4.4s,v18.4s,v22.4s
	add	w5,w5,w22		// accumulate key block
	zip1	v5.4s,v26.4s,v30.4s
	add	x6,x6,x22,lsr#32
	zip2	v6.4s,v18.4s,v22.4s
	add	w7,w7,w23
	zip2	v7.4s,v26.4s,v30.4s
	add	x8,x8,x23,lsr#32
	zip1	v18.2d,v4.2d,v5.2d
	add	w9,w9,w24
	zip2	v22.2d,v4.2d,v5.2d
	add	x10,x10,x24,lsr#32
	zip1	v26.2d,v6.2d,v7.2d
	add	w11,w11,w25
	zip2	v30.2d,v6.2d,v7.2d
	add	x12,x12,x25,lsr#32

	zip1	v4.4s,v19.4s,v23.4s
	add	w13,w13,w26
	zip1	v5.4s,v27.4s,v31.4s
	add	x14,x14,x26,lsr#32
	zip2	v6.4s,v19.4s,v23.4s
	add	w15,w15,w27
	zip2	v7.4s,v27.4s,v31.4s
	add	x16,x16,x27,lsr#32
	zip1	v19.2d,v4.2d,v5.2d
	add	w17,w17,w28
	zip2	v23.2d,v4.2d,v5.2d
	add	x19,x19,x28,lsr#32
	zip1	v27.2d,v6.2d,v7.2d
	add	w20,w20,w30
	zip2	v31.2d,v6.2d,v7.2d
	add	x21,x21,x30,lsr#32

	b.lo	Ltail_neon		// fewer than 320 bytes were left (subs above)

	// Full 320 bytes: the scalar block goes out first, then the four NEON
	// blocks in lane order. Loads, XORs and stores are software-pipelined.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	v16.4s,v16.4s,v0.4s	// accumulate key block
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	v17.4s,v17.4s,v1.4s
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	v18.4s,v18.4s,v2.4s
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	v19.4s,v19.4s,v3.4s
	add	x1,x1,#64
#ifdef	__AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	x5,x5,x6
	add	v20.4s,v20.4s,v0.4s
	eor	x7,x7,x8
	add	v21.4s,v21.4s,v1.4s
	eor	x9,x9,x10
	add	v22.4s,v22.4s,v2.4s
	eor	x11,x11,x12
	add	v23.4s,v23.4s,v3.4s
	eor	x13,x13,x14
	eor	v16.16b,v16.16b,v4.16b
	movi	v4.4s,#5
	eor	x15,x15,x16
	eor	v17.16b,v17.16b,v5.16b
	eor	x17,x17,x19
	eor	v18.16b,v18.16b,v6.16b
	eor	x20,x20,x21
	eor	v19.16b,v19.16b,v7.16b
	add	v8.4s,v8.4s,v4.4s	// += 5
	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	// 64-bit add on a register holding two state words; see the hedged
	// note on counter wrap in the scalar path - the same applies here.
	add	x28,x28,#5		// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
	add	v24.4s,v24.4s,v0.4s
	add	v25.4s,v25.4s,v1.4s
	add	v26.4s,v26.4s,v2.4s
	add	v27.4s,v27.4s,v3.4s
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64

	eor	v20.16b,v20.16b,v4.16b
	eor	v21.16b,v21.16b,v5.16b
	eor	v22.16b,v22.16b,v6.16b
	eor	v23.16b,v23.16b,v7.16b
	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
	add	v28.4s,v28.4s,v0.4s
	add	v29.4s,v29.4s,v1.4s
	add	v30.4s,v30.4s,v2.4s
	add	v31.4s,v31.4s,v3.4s
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	eor	v24.16b,v24.16b,v16.16b
	eor	v25.16b,v25.16b,v17.16b
	eor	v26.16b,v26.16b,v18.16b
	eor	v27.16b,v27.16b,v19.16b
	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64

	eor	v28.16b,v28.16b,v20.16b
	eor	v29.16b,v29.16b,v21.16b
	eor	v30.16b,v30.16b,v22.16b
	eor	v31.16b,v31.16b,v23.16b
	st1	{v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64

	b.hi	Loop_outer_neon		// flags still from the subs: input remains

	ldp	d8,d9,[sp]		// meet ABI requirements

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.long	0xd50323bf			// autiasp
	ret

// Fewer than 320 bytes remain, but keystream for five blocks has already
// been computed. Whole 64-byte blocks are consumed one at a time (scalar
// block first, then the NEON blocks in lane order); a partial final block
// is staged in the stack scratch area.
.align	4
Ltail_neon:
	add	x2,x2,#320		// undo the subs: x2 = bytes remaining
	// Restore d8/d9 now: their save slot at [sp] is about to be reused for
	// keystream by Less_than_64 or Last_neon.
	ldp	d8,d9,[sp]		// meet ABI requirements
	cmp	x2,#64
	b.lo	Less_than_64		// shared partial-block code in _ChaCha20_ctr32

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	v16.4s,v16.4s,v0.4s	// accumulate key block
	stp	x9,x11,[x0,#16]
	add	v17.4s,v17.4s,v1.4s
	stp	x13,x15,[x0,#32]
	add	v18.4s,v18.4s,v2.4s
	stp	x17,x20,[x0,#48]
	add	v19.4s,v19.4s,v3.4s
	add	x0,x0,#64
	b.eq	Ldone_neon		// flags from the cmp above: exactly 64 bytes
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	Last_neon		// v16..v19 hold the next keystream block

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v16.16b,v16.16b,v4.16b
	eor	v17.16b,v17.16b,v5.16b
	eor	v18.16b,v18.16b,v6.16b
	eor	v19.16b,v19.16b,v7.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
	b.eq	Ldone_neon

	// Move the next block into v16..v19 (adding the input block on the way)
	// so that Last_neon always finds the pending keystream there.
	add	v16.4s,v20.4s,v0.4s
	add	v17.4s,v21.4s,v1.4s
	sub	x2,x2,#64
	add	v18.4s,v22.4s,v2.4s
	cmp	x2,#64
	add	v19.4s,v23.4s,v3.4s
	b.lo	Last_neon

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v20.16b,v16.16b,v4.16b
	eor	v21.16b,v17.16b,v5.16b
	eor	v22.16b,v18.16b,v6.16b
	eor	v23.16b,v19.16b,v7.16b
	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
	b.eq	Ldone_neon

	add	v16.4s,v24.4s,v0.4s
	add	v17.4s,v25.4s,v1.4s
	sub	x2,x2,#64
	add	v18.4s,v26.4s,v2.4s
	cmp	x2,#64
	add	v19.4s,v27.4s,v3.4s
	b.lo	Last_neon

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v24.16b,v16.16b,v4.16b
	eor	v25.16b,v17.16b,v5.16b
	eor	v26.16b,v18.16b,v6.16b
	eor	v27.16b,v19.16b,v7.16b
	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
	b.eq	Ldone_neon

	add	v16.4s,v28.4s,v0.4s
	add	v17.4s,v29.4s,v1.4s
	add	v18.4s,v30.4s,v2.4s
	add	v19.4s,v31.4s,v3.4s
	sub	x2,x2,#64

// Partial final block: x2 = 1..63 bytes left, keystream in v16..v19.
Last_neon:
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]

	// Same end-biased indexing as Less_than_64: x2 counts up to zero and
	// x0 is biased by one less because the store follows the increment.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

Loop_tail_neon:
	ldrb	w10,[x1,x2]		// input byte
	ldrb	w11,[x4,x2]		// keystream byte
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,Loop_tail_neon

	stp	xzr,xzr,[sp,#0]		// wipe the keystream from the stack
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

Ldone_neon:
	ldp	x19,x20,[x29,#16]	// d8/d9 were already restored at Ltail_neon
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.long	0xd50323bf			// autiasp
	ret


// ChaCha20_512_neon starts here; no .globl for it is visible in this part
// of the file. Only the first instructions of its prologue fall inside
// this span; the remainder follows immediately after.
.align	5
ChaCha20_512_neon:
.long	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
890 add x29,sp,#0 891 892 adr x5,Lsigma 893 stp x19,x20,[sp,#16] 894 stp x21,x22,[sp,#32] 895 stp x23,x24,[sp,#48] 896 stp x25,x26,[sp,#64] 897 stp x27,x28,[sp,#80] 898 899L512_or_more_neon: 900 sub sp,sp,#128+64 901 902 eor v7.16b,v7.16b,v7.16b 903 ldp x22,x23,[x5] // load sigma 904 ld1 {v0.4s},[x5],#16 905 ldp x24,x25,[x3] // load key 906 ldp x26,x27,[x3,#16] 907 ld1 {v1.4s,v2.4s},[x3] 908 ldp x28,x30,[x4] // load counter 909 ld1 {v3.4s},[x4] 910 ld1 {v7.s}[0],[x5] 911 add x3,x5,#16 // Lrot24 912#ifdef __AARCH64EB__ 913 rev64 v0.4s,v0.4s 914 ror x24,x24,#32 915 ror x25,x25,#32 916 ror x26,x26,#32 917 ror x27,x27,#32 918 ror x28,x28,#32 919 ror x30,x30,#32 920#endif 921 add v3.4s,v3.4s,v7.4s // += 1 922 stp q0,q1,[sp,#0] // off-load key block, invariant part 923 add v3.4s,v3.4s,v7.4s // not typo 924 str q2,[sp,#32] 925 add v4.4s,v3.4s,v7.4s 926 add v5.4s,v4.4s,v7.4s 927 add v6.4s,v5.4s,v7.4s 928 shl v7.4s,v7.4s,#2 // 1 -> 4 929 930 stp d8,d9,[sp,#128+0] // meet ABI requirements 931 stp d10,d11,[sp,#128+16] 932 stp d12,d13,[sp,#128+32] 933 stp d14,d15,[sp,#128+48] 934 935 sub x2,x2,#512 // not typo 936 937Loop_outer_512_neon: 938 mov v8.16b,v0.16b 939 mov v12.16b,v0.16b 940 mov v16.16b,v0.16b 941 mov v20.16b,v0.16b 942 mov v24.16b,v0.16b 943 mov v28.16b,v0.16b 944 mov v9.16b,v1.16b 945 mov w5,w22 // unpack key block 946 mov v13.16b,v1.16b 947 lsr x6,x22,#32 948 mov v17.16b,v1.16b 949 mov w7,w23 950 mov v21.16b,v1.16b 951 lsr x8,x23,#32 952 mov v25.16b,v1.16b 953 mov w9,w24 954 mov v29.16b,v1.16b 955 lsr x10,x24,#32 956 mov v11.16b,v3.16b 957 mov w11,w25 958 mov v15.16b,v4.16b 959 lsr x12,x25,#32 960 mov v19.16b,v5.16b 961 mov w13,w26 962 mov v23.16b,v6.16b 963 lsr x14,x26,#32 964 mov v10.16b,v2.16b 965 mov w15,w27 966 mov v14.16b,v2.16b 967 lsr x16,x27,#32 968 add v27.4s,v11.4s,v7.4s // +4 969 mov w17,w28 970 add v31.4s,v15.4s,v7.4s // +4 971 lsr x19,x28,#32 972 mov v18.16b,v2.16b 973 mov w20,w30 974 mov v22.16b,v2.16b 975 lsr x21,x30,#32 976 mov v26.16b,v2.16b 977 
stp q3,q4,[sp,#48] // off-load key block, variable part 978 mov v30.16b,v2.16b 979 stp q5,q6,[sp,#80] 980 981 mov x4,#5 982 ld1 {v6.4s},[x3] 983 subs x2,x2,#512 984Loop_upper_neon: 985 sub x4,x4,#1 986 add v8.4s,v8.4s,v9.4s 987 add w5,w5,w9 988 add v12.4s,v12.4s,v13.4s 989 add w6,w6,w10 990 add v16.4s,v16.4s,v17.4s 991 add w7,w7,w11 992 add v20.4s,v20.4s,v21.4s 993 add w8,w8,w12 994 add v24.4s,v24.4s,v25.4s 995 eor w17,w17,w5 996 add v28.4s,v28.4s,v29.4s 997 eor w19,w19,w6 998 eor v11.16b,v11.16b,v8.16b 999 eor w20,w20,w7 1000 eor v15.16b,v15.16b,v12.16b 1001 eor w21,w21,w8 1002 eor v19.16b,v19.16b,v16.16b 1003 ror w17,w17,#16 1004 eor v23.16b,v23.16b,v20.16b 1005 ror w19,w19,#16 1006 eor v27.16b,v27.16b,v24.16b 1007 ror w20,w20,#16 1008 eor v31.16b,v31.16b,v28.16b 1009 ror w21,w21,#16 1010 rev32 v11.8h,v11.8h 1011 add w13,w13,w17 1012 rev32 v15.8h,v15.8h 1013 add w14,w14,w19 1014 rev32 v19.8h,v19.8h 1015 add w15,w15,w20 1016 rev32 v23.8h,v23.8h 1017 add w16,w16,w21 1018 rev32 v27.8h,v27.8h 1019 eor w9,w9,w13 1020 rev32 v31.8h,v31.8h 1021 eor w10,w10,w14 1022 add v10.4s,v10.4s,v11.4s 1023 eor w11,w11,w15 1024 add v14.4s,v14.4s,v15.4s 1025 eor w12,w12,w16 1026 add v18.4s,v18.4s,v19.4s 1027 ror w9,w9,#20 1028 add v22.4s,v22.4s,v23.4s 1029 ror w10,w10,#20 1030 add v26.4s,v26.4s,v27.4s 1031 ror w11,w11,#20 1032 add v30.4s,v30.4s,v31.4s 1033 ror w12,w12,#20 1034 eor v0.16b,v9.16b,v10.16b 1035 add w5,w5,w9 1036 eor v1.16b,v13.16b,v14.16b 1037 add w6,w6,w10 1038 eor v2.16b,v17.16b,v18.16b 1039 add w7,w7,w11 1040 eor v3.16b,v21.16b,v22.16b 1041 add w8,w8,w12 1042 eor v4.16b,v25.16b,v26.16b 1043 eor w17,w17,w5 1044 eor v5.16b,v29.16b,v30.16b 1045 eor w19,w19,w6 1046 ushr v9.4s,v0.4s,#20 1047 eor w20,w20,w7 1048 ushr v13.4s,v1.4s,#20 1049 eor w21,w21,w8 1050 ushr v17.4s,v2.4s,#20 1051 ror w17,w17,#24 1052 ushr v21.4s,v3.4s,#20 1053 ror w19,w19,#24 1054 ushr v25.4s,v4.4s,#20 1055 ror w20,w20,#24 1056 ushr v29.4s,v5.4s,#20 1057 ror w21,w21,#24 1058 sli v9.4s,v0.4s,#12 1059 add 
w13,w13,w17 1060 sli v13.4s,v1.4s,#12 1061 add w14,w14,w19 1062 sli v17.4s,v2.4s,#12 1063 add w15,w15,w20 1064 sli v21.4s,v3.4s,#12 1065 add w16,w16,w21 1066 sli v25.4s,v4.4s,#12 1067 eor w9,w9,w13 1068 sli v29.4s,v5.4s,#12 1069 eor w10,w10,w14 1070 add v8.4s,v8.4s,v9.4s 1071 eor w11,w11,w15 1072 add v12.4s,v12.4s,v13.4s 1073 eor w12,w12,w16 1074 add v16.4s,v16.4s,v17.4s 1075 ror w9,w9,#25 1076 add v20.4s,v20.4s,v21.4s 1077 ror w10,w10,#25 1078 add v24.4s,v24.4s,v25.4s 1079 ror w11,w11,#25 1080 add v28.4s,v28.4s,v29.4s 1081 ror w12,w12,#25 1082 eor v11.16b,v11.16b,v8.16b 1083 add w5,w5,w10 1084 eor v15.16b,v15.16b,v12.16b 1085 add w6,w6,w11 1086 eor v19.16b,v19.16b,v16.16b 1087 add w7,w7,w12 1088 eor v23.16b,v23.16b,v20.16b 1089 add w8,w8,w9 1090 eor v27.16b,v27.16b,v24.16b 1091 eor w21,w21,w5 1092 eor v31.16b,v31.16b,v28.16b 1093 eor w17,w17,w6 1094 tbl v11.16b,{v11.16b},v6.16b 1095 eor w19,w19,w7 1096 tbl v15.16b,{v15.16b},v6.16b 1097 eor w20,w20,w8 1098 tbl v19.16b,{v19.16b},v6.16b 1099 ror w21,w21,#16 1100 tbl v23.16b,{v23.16b},v6.16b 1101 ror w17,w17,#16 1102 tbl v27.16b,{v27.16b},v6.16b 1103 ror w19,w19,#16 1104 tbl v31.16b,{v31.16b},v6.16b 1105 ror w20,w20,#16 1106 add v10.4s,v10.4s,v11.4s 1107 add w15,w15,w21 1108 add v14.4s,v14.4s,v15.4s 1109 add w16,w16,w17 1110 add v18.4s,v18.4s,v19.4s 1111 add w13,w13,w19 1112 add v22.4s,v22.4s,v23.4s 1113 add w14,w14,w20 1114 add v26.4s,v26.4s,v27.4s 1115 eor w10,w10,w15 1116 add v30.4s,v30.4s,v31.4s 1117 eor w11,w11,w16 1118 eor v0.16b,v9.16b,v10.16b 1119 eor w12,w12,w13 1120 eor v1.16b,v13.16b,v14.16b 1121 eor w9,w9,w14 1122 eor v2.16b,v17.16b,v18.16b 1123 ror w10,w10,#20 1124 eor v3.16b,v21.16b,v22.16b 1125 ror w11,w11,#20 1126 eor v4.16b,v25.16b,v26.16b 1127 ror w12,w12,#20 1128 eor v5.16b,v29.16b,v30.16b 1129 ror w9,w9,#20 1130 ushr v9.4s,v0.4s,#25 1131 add w5,w5,w10 1132 ushr v13.4s,v1.4s,#25 1133 add w6,w6,w11 1134 ushr v17.4s,v2.4s,#25 1135 add w7,w7,w12 1136 ushr v21.4s,v3.4s,#25 1137 add w8,w8,w9 1138 ushr 
v25.4s,v4.4s,#25 1139 eor w21,w21,w5 1140 ushr v29.4s,v5.4s,#25 1141 eor w17,w17,w6 1142 sli v9.4s,v0.4s,#7 1143 eor w19,w19,w7 1144 sli v13.4s,v1.4s,#7 1145 eor w20,w20,w8 1146 sli v17.4s,v2.4s,#7 1147 ror w21,w21,#24 1148 sli v21.4s,v3.4s,#7 1149 ror w17,w17,#24 1150 sli v25.4s,v4.4s,#7 1151 ror w19,w19,#24 1152 sli v29.4s,v5.4s,#7 1153 ror w20,w20,#24 1154 ext v10.16b,v10.16b,v10.16b,#8 1155 add w15,w15,w21 1156 ext v14.16b,v14.16b,v14.16b,#8 1157 add w16,w16,w17 1158 ext v18.16b,v18.16b,v18.16b,#8 1159 add w13,w13,w19 1160 ext v22.16b,v22.16b,v22.16b,#8 1161 add w14,w14,w20 1162 ext v26.16b,v26.16b,v26.16b,#8 1163 eor w10,w10,w15 1164 ext v30.16b,v30.16b,v30.16b,#8 1165 eor w11,w11,w16 1166 ext v11.16b,v11.16b,v11.16b,#12 1167 eor w12,w12,w13 1168 ext v15.16b,v15.16b,v15.16b,#12 1169 eor w9,w9,w14 1170 ext v19.16b,v19.16b,v19.16b,#12 1171 ror w10,w10,#25 1172 ext v23.16b,v23.16b,v23.16b,#12 1173 ror w11,w11,#25 1174 ext v27.16b,v27.16b,v27.16b,#12 1175 ror w12,w12,#25 1176 ext v31.16b,v31.16b,v31.16b,#12 1177 ror w9,w9,#25 1178 ext v9.16b,v9.16b,v9.16b,#4 1179 ext v13.16b,v13.16b,v13.16b,#4 1180 ext v17.16b,v17.16b,v17.16b,#4 1181 ext v21.16b,v21.16b,v21.16b,#4 1182 ext v25.16b,v25.16b,v25.16b,#4 1183 ext v29.16b,v29.16b,v29.16b,#4 1184 add v8.4s,v8.4s,v9.4s 1185 add w5,w5,w9 1186 add v12.4s,v12.4s,v13.4s 1187 add w6,w6,w10 1188 add v16.4s,v16.4s,v17.4s 1189 add w7,w7,w11 1190 add v20.4s,v20.4s,v21.4s 1191 add w8,w8,w12 1192 add v24.4s,v24.4s,v25.4s 1193 eor w17,w17,w5 1194 add v28.4s,v28.4s,v29.4s 1195 eor w19,w19,w6 1196 eor v11.16b,v11.16b,v8.16b 1197 eor w20,w20,w7 1198 eor v15.16b,v15.16b,v12.16b 1199 eor w21,w21,w8 1200 eor v19.16b,v19.16b,v16.16b 1201 ror w17,w17,#16 1202 eor v23.16b,v23.16b,v20.16b 1203 ror w19,w19,#16 1204 eor v27.16b,v27.16b,v24.16b 1205 ror w20,w20,#16 1206 eor v31.16b,v31.16b,v28.16b 1207 ror w21,w21,#16 1208 rev32 v11.8h,v11.8h 1209 add w13,w13,w17 1210 rev32 v15.8h,v15.8h 1211 add w14,w14,w19 1212 rev32 v19.8h,v19.8h 1213 add 
w15,w15,w20 1214 rev32 v23.8h,v23.8h 1215 add w16,w16,w21 1216 rev32 v27.8h,v27.8h 1217 eor w9,w9,w13 1218 rev32 v31.8h,v31.8h 1219 eor w10,w10,w14 1220 add v10.4s,v10.4s,v11.4s 1221 eor w11,w11,w15 1222 add v14.4s,v14.4s,v15.4s 1223 eor w12,w12,w16 1224 add v18.4s,v18.4s,v19.4s 1225 ror w9,w9,#20 1226 add v22.4s,v22.4s,v23.4s 1227 ror w10,w10,#20 1228 add v26.4s,v26.4s,v27.4s 1229 ror w11,w11,#20 1230 add v30.4s,v30.4s,v31.4s 1231 ror w12,w12,#20 1232 eor v0.16b,v9.16b,v10.16b 1233 add w5,w5,w9 1234 eor v1.16b,v13.16b,v14.16b 1235 add w6,w6,w10 1236 eor v2.16b,v17.16b,v18.16b 1237 add w7,w7,w11 1238 eor v3.16b,v21.16b,v22.16b 1239 add w8,w8,w12 1240 eor v4.16b,v25.16b,v26.16b 1241 eor w17,w17,w5 1242 eor v5.16b,v29.16b,v30.16b 1243 eor w19,w19,w6 1244 ushr v9.4s,v0.4s,#20 1245 eor w20,w20,w7 1246 ushr v13.4s,v1.4s,#20 1247 eor w21,w21,w8 1248 ushr v17.4s,v2.4s,#20 1249 ror w17,w17,#24 1250 ushr v21.4s,v3.4s,#20 1251 ror w19,w19,#24 1252 ushr v25.4s,v4.4s,#20 1253 ror w20,w20,#24 1254 ushr v29.4s,v5.4s,#20 1255 ror w21,w21,#24 1256 sli v9.4s,v0.4s,#12 1257 add w13,w13,w17 1258 sli v13.4s,v1.4s,#12 1259 add w14,w14,w19 1260 sli v17.4s,v2.4s,#12 1261 add w15,w15,w20 1262 sli v21.4s,v3.4s,#12 1263 add w16,w16,w21 1264 sli v25.4s,v4.4s,#12 1265 eor w9,w9,w13 1266 sli v29.4s,v5.4s,#12 1267 eor w10,w10,w14 1268 add v8.4s,v8.4s,v9.4s 1269 eor w11,w11,w15 1270 add v12.4s,v12.4s,v13.4s 1271 eor w12,w12,w16 1272 add v16.4s,v16.4s,v17.4s 1273 ror w9,w9,#25 1274 add v20.4s,v20.4s,v21.4s 1275 ror w10,w10,#25 1276 add v24.4s,v24.4s,v25.4s 1277 ror w11,w11,#25 1278 add v28.4s,v28.4s,v29.4s 1279 ror w12,w12,#25 1280 eor v11.16b,v11.16b,v8.16b 1281 add w5,w5,w10 1282 eor v15.16b,v15.16b,v12.16b 1283 add w6,w6,w11 1284 eor v19.16b,v19.16b,v16.16b 1285 add w7,w7,w12 1286 eor v23.16b,v23.16b,v20.16b 1287 add w8,w8,w9 1288 eor v27.16b,v27.16b,v24.16b 1289 eor w21,w21,w5 1290 eor v31.16b,v31.16b,v28.16b 1291 eor w17,w17,w6 1292 tbl v11.16b,{v11.16b},v6.16b 1293 eor w19,w19,w7 1294 tbl 
v15.16b,{v15.16b},v6.16b 1295 eor w20,w20,w8 1296 tbl v19.16b,{v19.16b},v6.16b 1297 ror w21,w21,#16 1298 tbl v23.16b,{v23.16b},v6.16b 1299 ror w17,w17,#16 1300 tbl v27.16b,{v27.16b},v6.16b 1301 ror w19,w19,#16 1302 tbl v31.16b,{v31.16b},v6.16b 1303 ror w20,w20,#16 1304 add v10.4s,v10.4s,v11.4s 1305 add w15,w15,w21 1306 add v14.4s,v14.4s,v15.4s 1307 add w16,w16,w17 1308 add v18.4s,v18.4s,v19.4s 1309 add w13,w13,w19 1310 add v22.4s,v22.4s,v23.4s 1311 add w14,w14,w20 1312 add v26.4s,v26.4s,v27.4s 1313 eor w10,w10,w15 1314 add v30.4s,v30.4s,v31.4s 1315 eor w11,w11,w16 1316 eor v0.16b,v9.16b,v10.16b 1317 eor w12,w12,w13 1318 eor v1.16b,v13.16b,v14.16b 1319 eor w9,w9,w14 1320 eor v2.16b,v17.16b,v18.16b 1321 ror w10,w10,#20 1322 eor v3.16b,v21.16b,v22.16b 1323 ror w11,w11,#20 1324 eor v4.16b,v25.16b,v26.16b 1325 ror w12,w12,#20 1326 eor v5.16b,v29.16b,v30.16b 1327 ror w9,w9,#20 1328 ushr v9.4s,v0.4s,#25 1329 add w5,w5,w10 1330 ushr v13.4s,v1.4s,#25 1331 add w6,w6,w11 1332 ushr v17.4s,v2.4s,#25 1333 add w7,w7,w12 1334 ushr v21.4s,v3.4s,#25 1335 add w8,w8,w9 1336 ushr v25.4s,v4.4s,#25 1337 eor w21,w21,w5 1338 ushr v29.4s,v5.4s,#25 1339 eor w17,w17,w6 1340 sli v9.4s,v0.4s,#7 1341 eor w19,w19,w7 1342 sli v13.4s,v1.4s,#7 1343 eor w20,w20,w8 1344 sli v17.4s,v2.4s,#7 1345 ror w21,w21,#24 1346 sli v21.4s,v3.4s,#7 1347 ror w17,w17,#24 1348 sli v25.4s,v4.4s,#7 1349 ror w19,w19,#24 1350 sli v29.4s,v5.4s,#7 1351 ror w20,w20,#24 1352 ext v10.16b,v10.16b,v10.16b,#8 1353 add w15,w15,w21 1354 ext v14.16b,v14.16b,v14.16b,#8 1355 add w16,w16,w17 1356 ext v18.16b,v18.16b,v18.16b,#8 1357 add w13,w13,w19 1358 ext v22.16b,v22.16b,v22.16b,#8 1359 add w14,w14,w20 1360 ext v26.16b,v26.16b,v26.16b,#8 1361 eor w10,w10,w15 1362 ext v30.16b,v30.16b,v30.16b,#8 1363 eor w11,w11,w16 1364 ext v11.16b,v11.16b,v11.16b,#4 1365 eor w12,w12,w13 1366 ext v15.16b,v15.16b,v15.16b,#4 1367 eor w9,w9,w14 1368 ext v19.16b,v19.16b,v19.16b,#4 1369 ror w10,w10,#25 1370 ext v23.16b,v23.16b,v23.16b,#4 1371 ror 
w11,w11,#25 1372 ext v27.16b,v27.16b,v27.16b,#4 1373 ror w12,w12,#25 1374 ext v31.16b,v31.16b,v31.16b,#4 1375 ror w9,w9,#25 1376 ext v9.16b,v9.16b,v9.16b,#12 1377 ext v13.16b,v13.16b,v13.16b,#12 1378 ext v17.16b,v17.16b,v17.16b,#12 1379 ext v21.16b,v21.16b,v21.16b,#12 1380 ext v25.16b,v25.16b,v25.16b,#12 1381 ext v29.16b,v29.16b,v29.16b,#12 1382 cbnz x4,Loop_upper_neon 1383 1384 add w5,w5,w22 // accumulate key block 1385 add x6,x6,x22,lsr#32 1386 add w7,w7,w23 1387 add x8,x8,x23,lsr#32 1388 add w9,w9,w24 1389 add x10,x10,x24,lsr#32 1390 add w11,w11,w25 1391 add x12,x12,x25,lsr#32 1392 add w13,w13,w26 1393 add x14,x14,x26,lsr#32 1394 add w15,w15,w27 1395 add x16,x16,x27,lsr#32 1396 add w17,w17,w28 1397 add x19,x19,x28,lsr#32 1398 add w20,w20,w30 1399 add x21,x21,x30,lsr#32 1400 1401 add x5,x5,x6,lsl#32 // pack 1402 add x7,x7,x8,lsl#32 1403 ldp x6,x8,[x1,#0] // load input 1404 add x9,x9,x10,lsl#32 1405 add x11,x11,x12,lsl#32 1406 ldp x10,x12,[x1,#16] 1407 add x13,x13,x14,lsl#32 1408 add x15,x15,x16,lsl#32 1409 ldp x14,x16,[x1,#32] 1410 add x17,x17,x19,lsl#32 1411 add x20,x20,x21,lsl#32 1412 ldp x19,x21,[x1,#48] 1413 add x1,x1,#64 1414#ifdef __AARCH64EB__ 1415 rev x5,x5 1416 rev x7,x7 1417 rev x9,x9 1418 rev x11,x11 1419 rev x13,x13 1420 rev x15,x15 1421 rev x17,x17 1422 rev x20,x20 1423#endif 1424 eor x5,x5,x6 1425 eor x7,x7,x8 1426 eor x9,x9,x10 1427 eor x11,x11,x12 1428 eor x13,x13,x14 1429 eor x15,x15,x16 1430 eor x17,x17,x19 1431 eor x20,x20,x21 1432 1433 stp x5,x7,[x0,#0] // store output 1434 add x28,x28,#1 // increment counter 1435 mov w5,w22 // unpack key block 1436 lsr x6,x22,#32 1437 stp x9,x11,[x0,#16] 1438 mov w7,w23 1439 lsr x8,x23,#32 1440 stp x13,x15,[x0,#32] 1441 mov w9,w24 1442 lsr x10,x24,#32 1443 stp x17,x20,[x0,#48] 1444 add x0,x0,#64 1445 mov w11,w25 1446 lsr x12,x25,#32 1447 mov w13,w26 1448 lsr x14,x26,#32 1449 mov w15,w27 1450 lsr x16,x27,#32 1451 mov w17,w28 1452 lsr x19,x28,#32 1453 mov w20,w30 1454 lsr x21,x30,#32 1455 1456 mov x4,#5 
1457Loop_lower_neon: 1458 sub x4,x4,#1 1459 add v8.4s,v8.4s,v9.4s 1460 add w5,w5,w9 1461 add v12.4s,v12.4s,v13.4s 1462 add w6,w6,w10 1463 add v16.4s,v16.4s,v17.4s 1464 add w7,w7,w11 1465 add v20.4s,v20.4s,v21.4s 1466 add w8,w8,w12 1467 add v24.4s,v24.4s,v25.4s 1468 eor w17,w17,w5 1469 add v28.4s,v28.4s,v29.4s 1470 eor w19,w19,w6 1471 eor v11.16b,v11.16b,v8.16b 1472 eor w20,w20,w7 1473 eor v15.16b,v15.16b,v12.16b 1474 eor w21,w21,w8 1475 eor v19.16b,v19.16b,v16.16b 1476 ror w17,w17,#16 1477 eor v23.16b,v23.16b,v20.16b 1478 ror w19,w19,#16 1479 eor v27.16b,v27.16b,v24.16b 1480 ror w20,w20,#16 1481 eor v31.16b,v31.16b,v28.16b 1482 ror w21,w21,#16 1483 rev32 v11.8h,v11.8h 1484 add w13,w13,w17 1485 rev32 v15.8h,v15.8h 1486 add w14,w14,w19 1487 rev32 v19.8h,v19.8h 1488 add w15,w15,w20 1489 rev32 v23.8h,v23.8h 1490 add w16,w16,w21 1491 rev32 v27.8h,v27.8h 1492 eor w9,w9,w13 1493 rev32 v31.8h,v31.8h 1494 eor w10,w10,w14 1495 add v10.4s,v10.4s,v11.4s 1496 eor w11,w11,w15 1497 add v14.4s,v14.4s,v15.4s 1498 eor w12,w12,w16 1499 add v18.4s,v18.4s,v19.4s 1500 ror w9,w9,#20 1501 add v22.4s,v22.4s,v23.4s 1502 ror w10,w10,#20 1503 add v26.4s,v26.4s,v27.4s 1504 ror w11,w11,#20 1505 add v30.4s,v30.4s,v31.4s 1506 ror w12,w12,#20 1507 eor v0.16b,v9.16b,v10.16b 1508 add w5,w5,w9 1509 eor v1.16b,v13.16b,v14.16b 1510 add w6,w6,w10 1511 eor v2.16b,v17.16b,v18.16b 1512 add w7,w7,w11 1513 eor v3.16b,v21.16b,v22.16b 1514 add w8,w8,w12 1515 eor v4.16b,v25.16b,v26.16b 1516 eor w17,w17,w5 1517 eor v5.16b,v29.16b,v30.16b 1518 eor w19,w19,w6 1519 ushr v9.4s,v0.4s,#20 1520 eor w20,w20,w7 1521 ushr v13.4s,v1.4s,#20 1522 eor w21,w21,w8 1523 ushr v17.4s,v2.4s,#20 1524 ror w17,w17,#24 1525 ushr v21.4s,v3.4s,#20 1526 ror w19,w19,#24 1527 ushr v25.4s,v4.4s,#20 1528 ror w20,w20,#24 1529 ushr v29.4s,v5.4s,#20 1530 ror w21,w21,#24 1531 sli v9.4s,v0.4s,#12 1532 add w13,w13,w17 1533 sli v13.4s,v1.4s,#12 1534 add w14,w14,w19 1535 sli v17.4s,v2.4s,#12 1536 add w15,w15,w20 1537 sli v21.4s,v3.4s,#12 1538 add 
w16,w16,w21 1539 sli v25.4s,v4.4s,#12 1540 eor w9,w9,w13 1541 sli v29.4s,v5.4s,#12 1542 eor w10,w10,w14 1543 add v8.4s,v8.4s,v9.4s 1544 eor w11,w11,w15 1545 add v12.4s,v12.4s,v13.4s 1546 eor w12,w12,w16 1547 add v16.4s,v16.4s,v17.4s 1548 ror w9,w9,#25 1549 add v20.4s,v20.4s,v21.4s 1550 ror w10,w10,#25 1551 add v24.4s,v24.4s,v25.4s 1552 ror w11,w11,#25 1553 add v28.4s,v28.4s,v29.4s 1554 ror w12,w12,#25 1555 eor v11.16b,v11.16b,v8.16b 1556 add w5,w5,w10 1557 eor v15.16b,v15.16b,v12.16b 1558 add w6,w6,w11 1559 eor v19.16b,v19.16b,v16.16b 1560 add w7,w7,w12 1561 eor v23.16b,v23.16b,v20.16b 1562 add w8,w8,w9 1563 eor v27.16b,v27.16b,v24.16b 1564 eor w21,w21,w5 1565 eor v31.16b,v31.16b,v28.16b 1566 eor w17,w17,w6 1567 tbl v11.16b,{v11.16b},v6.16b 1568 eor w19,w19,w7 1569 tbl v15.16b,{v15.16b},v6.16b 1570 eor w20,w20,w8 1571 tbl v19.16b,{v19.16b},v6.16b 1572 ror w21,w21,#16 1573 tbl v23.16b,{v23.16b},v6.16b 1574 ror w17,w17,#16 1575 tbl v27.16b,{v27.16b},v6.16b 1576 ror w19,w19,#16 1577 tbl v31.16b,{v31.16b},v6.16b 1578 ror w20,w20,#16 1579 add v10.4s,v10.4s,v11.4s 1580 add w15,w15,w21 1581 add v14.4s,v14.4s,v15.4s 1582 add w16,w16,w17 1583 add v18.4s,v18.4s,v19.4s 1584 add w13,w13,w19 1585 add v22.4s,v22.4s,v23.4s 1586 add w14,w14,w20 1587 add v26.4s,v26.4s,v27.4s 1588 eor w10,w10,w15 1589 add v30.4s,v30.4s,v31.4s 1590 eor w11,w11,w16 1591 eor v0.16b,v9.16b,v10.16b 1592 eor w12,w12,w13 1593 eor v1.16b,v13.16b,v14.16b 1594 eor w9,w9,w14 1595 eor v2.16b,v17.16b,v18.16b 1596 ror w10,w10,#20 1597 eor v3.16b,v21.16b,v22.16b 1598 ror w11,w11,#20 1599 eor v4.16b,v25.16b,v26.16b 1600 ror w12,w12,#20 1601 eor v5.16b,v29.16b,v30.16b 1602 ror w9,w9,#20 1603 ushr v9.4s,v0.4s,#25 1604 add w5,w5,w10 1605 ushr v13.4s,v1.4s,#25 1606 add w6,w6,w11 1607 ushr v17.4s,v2.4s,#25 1608 add w7,w7,w12 1609 ushr v21.4s,v3.4s,#25 1610 add w8,w8,w9 1611 ushr v25.4s,v4.4s,#25 1612 eor w21,w21,w5 1613 ushr v29.4s,v5.4s,#25 1614 eor w17,w17,w6 1615 sli v9.4s,v0.4s,#7 1616 eor w19,w19,w7 1617 sli 
v13.4s,v1.4s,#7 1618 eor w20,w20,w8 1619 sli v17.4s,v2.4s,#7 1620 ror w21,w21,#24 1621 sli v21.4s,v3.4s,#7 1622 ror w17,w17,#24 1623 sli v25.4s,v4.4s,#7 1624 ror w19,w19,#24 1625 sli v29.4s,v5.4s,#7 1626 ror w20,w20,#24 1627 ext v10.16b,v10.16b,v10.16b,#8 1628 add w15,w15,w21 1629 ext v14.16b,v14.16b,v14.16b,#8 1630 add w16,w16,w17 1631 ext v18.16b,v18.16b,v18.16b,#8 1632 add w13,w13,w19 1633 ext v22.16b,v22.16b,v22.16b,#8 1634 add w14,w14,w20 1635 ext v26.16b,v26.16b,v26.16b,#8 1636 eor w10,w10,w15 1637 ext v30.16b,v30.16b,v30.16b,#8 1638 eor w11,w11,w16 1639 ext v11.16b,v11.16b,v11.16b,#12 1640 eor w12,w12,w13 1641 ext v15.16b,v15.16b,v15.16b,#12 1642 eor w9,w9,w14 1643 ext v19.16b,v19.16b,v19.16b,#12 1644 ror w10,w10,#25 1645 ext v23.16b,v23.16b,v23.16b,#12 1646 ror w11,w11,#25 1647 ext v27.16b,v27.16b,v27.16b,#12 1648 ror w12,w12,#25 1649 ext v31.16b,v31.16b,v31.16b,#12 1650 ror w9,w9,#25 1651 ext v9.16b,v9.16b,v9.16b,#4 1652 ext v13.16b,v13.16b,v13.16b,#4 1653 ext v17.16b,v17.16b,v17.16b,#4 1654 ext v21.16b,v21.16b,v21.16b,#4 1655 ext v25.16b,v25.16b,v25.16b,#4 1656 ext v29.16b,v29.16b,v29.16b,#4 1657 add v8.4s,v8.4s,v9.4s 1658 add w5,w5,w9 1659 add v12.4s,v12.4s,v13.4s 1660 add w6,w6,w10 1661 add v16.4s,v16.4s,v17.4s 1662 add w7,w7,w11 1663 add v20.4s,v20.4s,v21.4s 1664 add w8,w8,w12 1665 add v24.4s,v24.4s,v25.4s 1666 eor w17,w17,w5 1667 add v28.4s,v28.4s,v29.4s 1668 eor w19,w19,w6 1669 eor v11.16b,v11.16b,v8.16b 1670 eor w20,w20,w7 1671 eor v15.16b,v15.16b,v12.16b 1672 eor w21,w21,w8 1673 eor v19.16b,v19.16b,v16.16b 1674 ror w17,w17,#16 1675 eor v23.16b,v23.16b,v20.16b 1676 ror w19,w19,#16 1677 eor v27.16b,v27.16b,v24.16b 1678 ror w20,w20,#16 1679 eor v31.16b,v31.16b,v28.16b 1680 ror w21,w21,#16 1681 rev32 v11.8h,v11.8h 1682 add w13,w13,w17 1683 rev32 v15.8h,v15.8h 1684 add w14,w14,w19 1685 rev32 v19.8h,v19.8h 1686 add w15,w15,w20 1687 rev32 v23.8h,v23.8h 1688 add w16,w16,w21 1689 rev32 v27.8h,v27.8h 1690 eor w9,w9,w13 1691 rev32 v31.8h,v31.8h 1692 eor 
w10,w10,w14 1693 add v10.4s,v10.4s,v11.4s 1694 eor w11,w11,w15 1695 add v14.4s,v14.4s,v15.4s 1696 eor w12,w12,w16 1697 add v18.4s,v18.4s,v19.4s 1698 ror w9,w9,#20 1699 add v22.4s,v22.4s,v23.4s 1700 ror w10,w10,#20 1701 add v26.4s,v26.4s,v27.4s 1702 ror w11,w11,#20 1703 add v30.4s,v30.4s,v31.4s 1704 ror w12,w12,#20 1705 eor v0.16b,v9.16b,v10.16b 1706 add w5,w5,w9 1707 eor v1.16b,v13.16b,v14.16b 1708 add w6,w6,w10 1709 eor v2.16b,v17.16b,v18.16b 1710 add w7,w7,w11 1711 eor v3.16b,v21.16b,v22.16b 1712 add w8,w8,w12 1713 eor v4.16b,v25.16b,v26.16b 1714 eor w17,w17,w5 1715 eor v5.16b,v29.16b,v30.16b 1716 eor w19,w19,w6 1717 ushr v9.4s,v0.4s,#20 1718 eor w20,w20,w7 1719 ushr v13.4s,v1.4s,#20 1720 eor w21,w21,w8 1721 ushr v17.4s,v2.4s,#20 1722 ror w17,w17,#24 1723 ushr v21.4s,v3.4s,#20 1724 ror w19,w19,#24 1725 ushr v25.4s,v4.4s,#20 1726 ror w20,w20,#24 1727 ushr v29.4s,v5.4s,#20 1728 ror w21,w21,#24 1729 sli v9.4s,v0.4s,#12 1730 add w13,w13,w17 1731 sli v13.4s,v1.4s,#12 1732 add w14,w14,w19 1733 sli v17.4s,v2.4s,#12 1734 add w15,w15,w20 1735 sli v21.4s,v3.4s,#12 1736 add w16,w16,w21 1737 sli v25.4s,v4.4s,#12 1738 eor w9,w9,w13 1739 sli v29.4s,v5.4s,#12 1740 eor w10,w10,w14 1741 add v8.4s,v8.4s,v9.4s 1742 eor w11,w11,w15 1743 add v12.4s,v12.4s,v13.4s 1744 eor w12,w12,w16 1745 add v16.4s,v16.4s,v17.4s 1746 ror w9,w9,#25 1747 add v20.4s,v20.4s,v21.4s 1748 ror w10,w10,#25 1749 add v24.4s,v24.4s,v25.4s 1750 ror w11,w11,#25 1751 add v28.4s,v28.4s,v29.4s 1752 ror w12,w12,#25 1753 eor v11.16b,v11.16b,v8.16b 1754 add w5,w5,w10 1755 eor v15.16b,v15.16b,v12.16b 1756 add w6,w6,w11 1757 eor v19.16b,v19.16b,v16.16b 1758 add w7,w7,w12 1759 eor v23.16b,v23.16b,v20.16b 1760 add w8,w8,w9 1761 eor v27.16b,v27.16b,v24.16b 1762 eor w21,w21,w5 1763 eor v31.16b,v31.16b,v28.16b 1764 eor w17,w17,w6 1765 tbl v11.16b,{v11.16b},v6.16b 1766 eor w19,w19,w7 1767 tbl v15.16b,{v15.16b},v6.16b 1768 eor w20,w20,w8 1769 tbl v19.16b,{v19.16b},v6.16b 1770 ror w21,w21,#16 1771 tbl v23.16b,{v23.16b},v6.16b 
1772 ror w17,w17,#16 1773 tbl v27.16b,{v27.16b},v6.16b 1774 ror w19,w19,#16 1775 tbl v31.16b,{v31.16b},v6.16b 1776 ror w20,w20,#16 1777 add v10.4s,v10.4s,v11.4s 1778 add w15,w15,w21 1779 add v14.4s,v14.4s,v15.4s 1780 add w16,w16,w17 1781 add v18.4s,v18.4s,v19.4s 1782 add w13,w13,w19 1783 add v22.4s,v22.4s,v23.4s 1784 add w14,w14,w20 1785 add v26.4s,v26.4s,v27.4s 1786 eor w10,w10,w15 1787 add v30.4s,v30.4s,v31.4s 1788 eor w11,w11,w16 1789 eor v0.16b,v9.16b,v10.16b 1790 eor w12,w12,w13 1791 eor v1.16b,v13.16b,v14.16b 1792 eor w9,w9,w14 1793 eor v2.16b,v17.16b,v18.16b 1794 ror w10,w10,#20 1795 eor v3.16b,v21.16b,v22.16b 1796 ror w11,w11,#20 1797 eor v4.16b,v25.16b,v26.16b 1798 ror w12,w12,#20 1799 eor v5.16b,v29.16b,v30.16b 1800 ror w9,w9,#20 1801 ushr v9.4s,v0.4s,#25 1802 add w5,w5,w10 1803 ushr v13.4s,v1.4s,#25 1804 add w6,w6,w11 1805 ushr v17.4s,v2.4s,#25 1806 add w7,w7,w12 1807 ushr v21.4s,v3.4s,#25 1808 add w8,w8,w9 1809 ushr v25.4s,v4.4s,#25 1810 eor w21,w21,w5 1811 ushr v29.4s,v5.4s,#25 1812 eor w17,w17,w6 1813 sli v9.4s,v0.4s,#7 1814 eor w19,w19,w7 1815 sli v13.4s,v1.4s,#7 1816 eor w20,w20,w8 1817 sli v17.4s,v2.4s,#7 1818 ror w21,w21,#24 1819 sli v21.4s,v3.4s,#7 1820 ror w17,w17,#24 1821 sli v25.4s,v4.4s,#7 1822 ror w19,w19,#24 1823 sli v29.4s,v5.4s,#7 1824 ror w20,w20,#24 1825 ext v10.16b,v10.16b,v10.16b,#8 1826 add w15,w15,w21 1827 ext v14.16b,v14.16b,v14.16b,#8 1828 add w16,w16,w17 1829 ext v18.16b,v18.16b,v18.16b,#8 1830 add w13,w13,w19 1831 ext v22.16b,v22.16b,v22.16b,#8 1832 add w14,w14,w20 1833 ext v26.16b,v26.16b,v26.16b,#8 1834 eor w10,w10,w15 1835 ext v30.16b,v30.16b,v30.16b,#8 1836 eor w11,w11,w16 1837 ext v11.16b,v11.16b,v11.16b,#4 1838 eor w12,w12,w13 1839 ext v15.16b,v15.16b,v15.16b,#4 1840 eor w9,w9,w14 1841 ext v19.16b,v19.16b,v19.16b,#4 1842 ror w10,w10,#25 1843 ext v23.16b,v23.16b,v23.16b,#4 1844 ror w11,w11,#25 1845 ext v27.16b,v27.16b,v27.16b,#4 1846 ror w12,w12,#25 1847 ext v31.16b,v31.16b,v31.16b,#4 1848 ror w9,w9,#25 1849 ext 
v9.16b,v9.16b,v9.16b,#12 1850 ext v13.16b,v13.16b,v13.16b,#12 1851 ext v17.16b,v17.16b,v17.16b,#12 1852 ext v21.16b,v21.16b,v21.16b,#12 1853 ext v25.16b,v25.16b,v25.16b,#12 1854 ext v29.16b,v29.16b,v29.16b,#12 1855 cbnz x4,Loop_lower_neon 1856 1857 add w5,w5,w22 // accumulate key block 1858 ldp q0,q1,[sp,#0] 1859 add x6,x6,x22,lsr#32 1860 ldp q2,q3,[sp,#32] 1861 add w7,w7,w23 1862 ldp q4,q5,[sp,#64] 1863 add x8,x8,x23,lsr#32 1864 ldr q6,[sp,#96] 1865 add v8.4s,v8.4s,v0.4s 1866 add w9,w9,w24 1867 add v12.4s,v12.4s,v0.4s 1868 add x10,x10,x24,lsr#32 1869 add v16.4s,v16.4s,v0.4s 1870 add w11,w11,w25 1871 add v20.4s,v20.4s,v0.4s 1872 add x12,x12,x25,lsr#32 1873 add v24.4s,v24.4s,v0.4s 1874 add w13,w13,w26 1875 add v28.4s,v28.4s,v0.4s 1876 add x14,x14,x26,lsr#32 1877 add v10.4s,v10.4s,v2.4s 1878 add w15,w15,w27 1879 add v14.4s,v14.4s,v2.4s 1880 add x16,x16,x27,lsr#32 1881 add v18.4s,v18.4s,v2.4s 1882 add w17,w17,w28 1883 add v22.4s,v22.4s,v2.4s 1884 add x19,x19,x28,lsr#32 1885 add v26.4s,v26.4s,v2.4s 1886 add w20,w20,w30 1887 add v30.4s,v30.4s,v2.4s 1888 add x21,x21,x30,lsr#32 1889 add v27.4s,v27.4s,v7.4s // +4 1890 add x5,x5,x6,lsl#32 // pack 1891 add v31.4s,v31.4s,v7.4s // +4 1892 add x7,x7,x8,lsl#32 1893 add v11.4s,v11.4s,v3.4s 1894 ldp x6,x8,[x1,#0] // load input 1895 add v15.4s,v15.4s,v4.4s 1896 add x9,x9,x10,lsl#32 1897 add v19.4s,v19.4s,v5.4s 1898 add x11,x11,x12,lsl#32 1899 add v23.4s,v23.4s,v6.4s 1900 ldp x10,x12,[x1,#16] 1901 add v27.4s,v27.4s,v3.4s 1902 add x13,x13,x14,lsl#32 1903 add v31.4s,v31.4s,v4.4s 1904 add x15,x15,x16,lsl#32 1905 add v9.4s,v9.4s,v1.4s 1906 ldp x14,x16,[x1,#32] 1907 add v13.4s,v13.4s,v1.4s 1908 add x17,x17,x19,lsl#32 1909 add v17.4s,v17.4s,v1.4s 1910 add x20,x20,x21,lsl#32 1911 add v21.4s,v21.4s,v1.4s 1912 ldp x19,x21,[x1,#48] 1913 add v25.4s,v25.4s,v1.4s 1914 add x1,x1,#64 1915 add v29.4s,v29.4s,v1.4s 1916 1917#ifdef __AARCH64EB__ 1918 rev x5,x5 1919 rev x7,x7 1920 rev x9,x9 1921 rev x11,x11 1922 rev x13,x13 1923 rev x15,x15 1924 rev 
x17,x17 1925 rev x20,x20 1926#endif 1927 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1928 eor x5,x5,x6 1929 eor x7,x7,x8 1930 eor x9,x9,x10 1931 eor x11,x11,x12 1932 eor x13,x13,x14 1933 eor v8.16b,v8.16b,v0.16b 1934 eor x15,x15,x16 1935 eor v9.16b,v9.16b,v1.16b 1936 eor x17,x17,x19 1937 eor v10.16b,v10.16b,v2.16b 1938 eor x20,x20,x21 1939 eor v11.16b,v11.16b,v3.16b 1940 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1941 1942 stp x5,x7,[x0,#0] // store output 1943 add x28,x28,#7 // increment counter 1944 stp x9,x11,[x0,#16] 1945 stp x13,x15,[x0,#32] 1946 stp x17,x20,[x0,#48] 1947 add x0,x0,#64 1948 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1949 1950 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1951 eor v12.16b,v12.16b,v0.16b 1952 eor v13.16b,v13.16b,v1.16b 1953 eor v14.16b,v14.16b,v2.16b 1954 eor v15.16b,v15.16b,v3.16b 1955 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1956 1957 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1958 eor v16.16b,v16.16b,v8.16b 1959 ldp q0,q1,[sp,#0] 1960 eor v17.16b,v17.16b,v9.16b 1961 ldp q2,q3,[sp,#32] 1962 eor v18.16b,v18.16b,v10.16b 1963 eor v19.16b,v19.16b,v11.16b 1964 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1965 1966 ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 1967 eor v20.16b,v20.16b,v12.16b 1968 eor v21.16b,v21.16b,v13.16b 1969 eor v22.16b,v22.16b,v14.16b 1970 eor v23.16b,v23.16b,v15.16b 1971 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1972 1973 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 1974 eor v24.16b,v24.16b,v16.16b 1975 eor v25.16b,v25.16b,v17.16b 1976 eor v26.16b,v26.16b,v18.16b 1977 eor v27.16b,v27.16b,v19.16b 1978 st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 1979 1980 shl v8.4s,v7.4s,#1 // 4 -> 8 1981 eor v28.16b,v28.16b,v20.16b 1982 eor v29.16b,v29.16b,v21.16b 1983 eor v30.16b,v30.16b,v22.16b 1984 eor v31.16b,v31.16b,v23.16b 1985 st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64 1986 1987 add v3.4s,v3.4s,v8.4s // += 8 1988 add v4.4s,v4.4s,v8.4s 1989 add v5.4s,v5.4s,v8.4s 1990 add v6.4s,v6.4s,v8.4s 1991 
1992 b.hs Loop_outer_512_neon 1993 1994 adds x2,x2,#512 1995 ushr v7.4s,v7.4s,#1 // 4 -> 2 1996 1997 ldp d10,d11,[sp,#128+16] // meet ABI requirements 1998 ldp d12,d13,[sp,#128+32] 1999 ldp d14,d15,[sp,#128+48] 2000 2001 stp q0,q0,[sp,#0] // wipe off-load area 2002 stp q0,q0,[sp,#32] 2003 stp q0,q0,[sp,#64] 2004 2005 b.eq Ldone_512_neon 2006 2007 sub x3,x3,#16 // Lone 2008 cmp x2,#192 2009 add sp,sp,#128 2010 sub v3.4s,v3.4s,v7.4s // -= 2 2011 ld1 {v8.4s,v9.4s},[x3] 2012 b.hs Loop_outer_neon 2013 2014 ldp d8,d9,[sp,#0] // meet ABI requirements 2015 eor v1.16b,v1.16b,v1.16b 2016 eor v2.16b,v2.16b,v2.16b 2017 eor v3.16b,v3.16b,v3.16b 2018 eor v4.16b,v4.16b,v4.16b 2019 eor v5.16b,v5.16b,v5.16b 2020 eor v6.16b,v6.16b,v6.16b 2021 b Loop_outer 2022 2023Ldone_512_neon: 2024 ldp d8,d9,[sp,#128+0] // meet ABI requirements 2025 ldp x19,x20,[x29,#16] 2026 add sp,sp,#128+64 2027 ldp x21,x22,[x29,#32] 2028 ldp x23,x24,[x29,#48] 2029 ldp x25,x26,[x29,#64] 2030 ldp x27,x28,[x29,#80] 2031 ldp x29,x30,[sp],#96 2032.long 0xd50323bf // autiasp 2033 ret 2034 2035