#include "arm_arch.h"

// AArch64, GNU assembler syntax, run through the C preprocessor.
// This part of the file holds the shared constants, the integer-only
// ChaCha20_ctr32 entry point and the ChaCha20_neon variant.

.text


.hidden	OPENSSL_armcap_P

.align	5
// The 16 bytes at .Lsigma spell the ASCII text "expand 32-byte k".
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Four 32-bit lanes {1,0,0,0}; the NEON code loads them into v31 and uses
// them as the per-block counter increment.
.Lone:
.long	1,0,0,0
// PC-relative distance from this slot to OPENSSL_armcap_P.
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
// NUL-terminated ASCII identification string
// ("ChaCha20 for ARMv8, CRYPTOGAMS by" followed by an e-mail address).
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

//----------------------------------------------------------------------
// ChaCha20_ctr32
//
// Register inputs as used by the code below:
//   x0  output pointer (64 bytes stored per block)
//   x1  input pointer  (64 bytes loaded per block)
//   x2  byte count
//   x3  pointer to 32 bytes of key material
//   x4  pointer to the 16-byte counter block
//
// Returns at once when x2 is zero. When x2 is at least 192 and the
// ARMV7_NEON bit is set in OPENSSL_armcap_P, control is handed to
// ChaCha20_neon; otherwise each outer pass produces one 64-byte block
// using general-purpose registers only.
//
// Stack: a 96-byte frame holding x29, x30 and x19-x28, plus 64 bytes of
// scratch below it that the tail code uses for the last key-stream block.
//----------------------------------------------------------------------
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	x2,.Labort
	adr	x5,.LOPENSSL_armcap_P
	cmp	x2,#192
	b.lo	.Lshort
	// x6 = stored offset; [x6+x5] is the capability word itself
#ifdef	__ILP32__
	ldrsw	x6,[x5]
#else
	ldr	x6,[x5]
#endif
	ldr	w17,[x6,x5]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	// x22-x28 and x30 hold the 16-word input block, two words per register
	ldp	x22,x23,[x5]			// load sigma
	ldp	x24,x25,[x3]			// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]			// load counter
#ifdef	__ARMEB__
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

.Loop_outer:
	// Working state: one 32-bit word in each of
	// w5-w17 and w19-w21 (x18 is left untouched).
	mov	w5,w22				// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10				// ten double rounds
	subs	x2,x2,#64			// flags stay live until b.lo/b.hi below
.Loop:
	sub	x4,x4,#1
	// ---- column round: four quarter-rounds side by side ----
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20			// rotate left by 12
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24			// rotate left by 8
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25			// rotate left by 7
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// ---- diagonal round: same steps on shifted register groups ----
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	// Add the input block back in. The odd-numbered words use 64-bit adds;
	// their upper halves are discarded by the lsl 32 in the pack step.
	add	w5,w5,w22			// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail				// fewer than 64 bytes were left

	// Merge word pairs into 64-bit registers while fetching the input.
	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	.Loop_outer			// still the flags from subs above

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
.Labort:
	ret

.align	4
.Ltail:
	add	x2,x2,#64			// undo the subs: x2 = bytes remaining
.Less_than_64:
	// Also entered from ChaCha20_neon. Point x1, x0 and x4 at the END of
	// input, output and stack scratch, then walk a negative index in x2
	// up to zero. x0 is biased by -1 because the store uses x2 after it
	// has been incremented.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Spill the whole key-stream block to the stack scratch area.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

.Loop_tail:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail

	// Overwrite the spilled key stream with zeros before leaving.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

//----------------------------------------------------------------------
// ChaCha20_neon
//
// Reached by a branch from ChaCha20_ctr32 with the same x0-x4 inputs.
// Requests of 512 bytes or more go on to .L512_or_more_neon. Otherwise
// every outer pass produces 256 bytes: one block on the integer
// registers (counter as given) and three blocks on vector registers
// (counter +1, +2 and +3). Frame layout matches ChaCha20_ctr32, which
// lets the short-tail case reuse .Less_than_64.
//
// Vector register roles in this function:
//   v24       sigma row          v25, v26  key rows
//   v27-v29   counter rows for the three vector blocks
//   v31       counter increment ({1,0,0,0}, then shifted to {4,0,0,0})
//   v0-v3, v4-v7, v16-v19   working state of the three vector blocks
//   v20-v23   temporaries
//----------------------------------------------------------------------
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	x22,x23,[x5]			// load sigma
	ld1	{v24.4s},[x5],#16		// x5 now points at .Lone
	ldp	x24,x25,[x3]			// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]			// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]
#ifdef	__ARMEB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	add	v27.4s,v27.4s,v31.4s		// += 1
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	shl	v31.4s,v31.4s,#2		// 1 -> 4

.Loop_outer_neon:
	// Seed the integer block and the three vector blocks.
	mov	w5,w22				// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	mov	x4,#10				// ten double rounds
	subs	x2,x2,#256			// flags stay live until b.lo/b.hi below
.Loop_neon:
	sub	x4,x4,#1
	// ---- first half: integer and vector instructions interleaved ----
	// Vector rotations: rev32 on halfwords is a rotate by 16; the others
	// are built from a ushr into the destination followed by sli.
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	// Rotate the lanes of three rows so the next half works on diagonals.
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	// ---- second half ----
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	// Rotate the lanes back to column order.
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,.Loop_neon

	// Add the input block back into all four working states.
	add	w5,w5,w22			// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	b.lo	.Ltail_neon			// fewer than 256 bytes were left

	// Full 256-byte pass: integer block first, then the vector blocks.
	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64	// v0-v3 reused for input

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	b.hi	.Loop_outer_neon		// still the flags from subs above

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
	ret

.Ltail_neon:
	// Under 256 bytes remain. Emit whole 64-byte blocks while possible;
	// the first block that is only partly needed is spilled to the stack
	// scratch and finished byte by byte at .Last_neon.
	add	x2,x2,#256
	cmp	x2,#64
	b.lo	.Less_than_64			// shared tail inside ChaCha20_ctr32

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	b.eq	.Ldone_neon			// flags from the latest cmp against 64
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64

	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
	b	.Last_neon

.Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
	b	.Last_neon
.Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	// Same end-relative indexing as .Less_than_64 in ChaCha20_ctr32.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

.Loop_tail_neon:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail_neon

	// Overwrite the spilled key stream with zeros before leaving.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
814 add x29,sp,#0 815 816 adr x5,.Lsigma 817 stp x19,x20,[sp,#16] 818 stp x21,x22,[sp,#32] 819 stp x23,x24,[sp,#48] 820 stp x25,x26,[sp,#64] 821 stp x27,x28,[sp,#80] 822 823.L512_or_more_neon: 824 sub sp,sp,#128+64 825 826 ldp x22,x23,[x5] // load sigma 827 ld1 {v24.4s},[x5],#16 828 ldp x24,x25,[x3] // load key 829 ldp x26,x27,[x3,#16] 830 ld1 {v25.4s,v26.4s},[x3] 831 ldp x28,x30,[x4] // load counter 832 ld1 {v27.4s},[x4] 833 ld1 {v31.4s},[x5] 834#ifdef __ARMEB__ 835 rev64 v24.4s,v24.4s 836 ror x24,x24,#32 837 ror x25,x25,#32 838 ror x26,x26,#32 839 ror x27,x27,#32 840 ror x28,x28,#32 841 ror x30,x30,#32 842#endif 843 add v27.4s,v27.4s,v31.4s // += 1 844 stp q24,q25,[sp,#0] // off-load key block, invariant part 845 add v27.4s,v27.4s,v31.4s // not typo 846 str q26,[sp,#32] 847 add v28.4s,v27.4s,v31.4s 848 add v29.4s,v28.4s,v31.4s 849 add v30.4s,v29.4s,v31.4s 850 shl v31.4s,v31.4s,#2 // 1 -> 4 851 852 stp d8,d9,[sp,#128+0] // meet ABI requirements 853 stp d10,d11,[sp,#128+16] 854 stp d12,d13,[sp,#128+32] 855 stp d14,d15,[sp,#128+48] 856 857 sub x2,x2,#512 // not typo 858 859.Loop_outer_512_neon: 860 mov v0.16b,v24.16b 861 mov v4.16b,v24.16b 862 mov v8.16b,v24.16b 863 mov v12.16b,v24.16b 864 mov v16.16b,v24.16b 865 mov v20.16b,v24.16b 866 mov v1.16b,v25.16b 867 mov w5,w22 // unpack key block 868 mov v5.16b,v25.16b 869 lsr x6,x22,#32 870 mov v9.16b,v25.16b 871 mov w7,w23 872 mov v13.16b,v25.16b 873 lsr x8,x23,#32 874 mov v17.16b,v25.16b 875 mov w9,w24 876 mov v21.16b,v25.16b 877 lsr x10,x24,#32 878 mov v3.16b,v27.16b 879 mov w11,w25 880 mov v7.16b,v28.16b 881 lsr x12,x25,#32 882 mov v11.16b,v29.16b 883 mov w13,w26 884 mov v15.16b,v30.16b 885 lsr x14,x26,#32 886 mov v2.16b,v26.16b 887 mov w15,w27 888 mov v6.16b,v26.16b 889 lsr x16,x27,#32 890 add v19.4s,v3.4s,v31.4s // +4 891 mov w17,w28 892 add v23.4s,v7.4s,v31.4s // +4 893 lsr x19,x28,#32 894 mov v10.16b,v26.16b 895 mov w20,w30 896 mov v14.16b,v26.16b 897 lsr x21,x30,#32 898 mov v18.16b,v26.16b 899 stp 
q27,q28,[sp,#48] // off-load key block, variable part 900 mov v22.16b,v26.16b 901 str q29,[sp,#80] 902 903 mov x4,#5 904 subs x2,x2,#512 905.Loop_upper_neon: 906 sub x4,x4,#1 907 add v0.4s,v0.4s,v1.4s 908 add w5,w5,w9 909 add v4.4s,v4.4s,v5.4s 910 add w6,w6,w10 911 add v8.4s,v8.4s,v9.4s 912 add w7,w7,w11 913 add v12.4s,v12.4s,v13.4s 914 add w8,w8,w12 915 add v16.4s,v16.4s,v17.4s 916 eor w17,w17,w5 917 add v20.4s,v20.4s,v21.4s 918 eor w19,w19,w6 919 eor v3.16b,v3.16b,v0.16b 920 eor w20,w20,w7 921 eor v7.16b,v7.16b,v4.16b 922 eor w21,w21,w8 923 eor v11.16b,v11.16b,v8.16b 924 ror w17,w17,#16 925 eor v15.16b,v15.16b,v12.16b 926 ror w19,w19,#16 927 eor v19.16b,v19.16b,v16.16b 928 ror w20,w20,#16 929 eor v23.16b,v23.16b,v20.16b 930 ror w21,w21,#16 931 rev32 v3.8h,v3.8h 932 add w13,w13,w17 933 rev32 v7.8h,v7.8h 934 add w14,w14,w19 935 rev32 v11.8h,v11.8h 936 add w15,w15,w20 937 rev32 v15.8h,v15.8h 938 add w16,w16,w21 939 rev32 v19.8h,v19.8h 940 eor w9,w9,w13 941 rev32 v23.8h,v23.8h 942 eor w10,w10,w14 943 add v2.4s,v2.4s,v3.4s 944 eor w11,w11,w15 945 add v6.4s,v6.4s,v7.4s 946 eor w12,w12,w16 947 add v10.4s,v10.4s,v11.4s 948 ror w9,w9,#20 949 add v14.4s,v14.4s,v15.4s 950 ror w10,w10,#20 951 add v18.4s,v18.4s,v19.4s 952 ror w11,w11,#20 953 add v22.4s,v22.4s,v23.4s 954 ror w12,w12,#20 955 eor v24.16b,v1.16b,v2.16b 956 add w5,w5,w9 957 eor v25.16b,v5.16b,v6.16b 958 add w6,w6,w10 959 eor v26.16b,v9.16b,v10.16b 960 add w7,w7,w11 961 eor v27.16b,v13.16b,v14.16b 962 add w8,w8,w12 963 eor v28.16b,v17.16b,v18.16b 964 eor w17,w17,w5 965 eor v29.16b,v21.16b,v22.16b 966 eor w19,w19,w6 967 ushr v1.4s,v24.4s,#20 968 eor w20,w20,w7 969 ushr v5.4s,v25.4s,#20 970 eor w21,w21,w8 971 ushr v9.4s,v26.4s,#20 972 ror w17,w17,#24 973 ushr v13.4s,v27.4s,#20 974 ror w19,w19,#24 975 ushr v17.4s,v28.4s,#20 976 ror w20,w20,#24 977 ushr v21.4s,v29.4s,#20 978 ror w21,w21,#24 979 sli v1.4s,v24.4s,#12 980 add w13,w13,w17 981 sli v5.4s,v25.4s,#12 982 add w14,w14,w19 983 sli v9.4s,v26.4s,#12 984 add 
w15,w15,w20 985 sli v13.4s,v27.4s,#12 986 add w16,w16,w21 987 sli v17.4s,v28.4s,#12 988 eor w9,w9,w13 989 sli v21.4s,v29.4s,#12 990 eor w10,w10,w14 991 add v0.4s,v0.4s,v1.4s 992 eor w11,w11,w15 993 add v4.4s,v4.4s,v5.4s 994 eor w12,w12,w16 995 add v8.4s,v8.4s,v9.4s 996 ror w9,w9,#25 997 add v12.4s,v12.4s,v13.4s 998 ror w10,w10,#25 999 add v16.4s,v16.4s,v17.4s 1000 ror w11,w11,#25 1001 add v20.4s,v20.4s,v21.4s 1002 ror w12,w12,#25 1003 eor v24.16b,v3.16b,v0.16b 1004 add w5,w5,w10 1005 eor v25.16b,v7.16b,v4.16b 1006 add w6,w6,w11 1007 eor v26.16b,v11.16b,v8.16b 1008 add w7,w7,w12 1009 eor v27.16b,v15.16b,v12.16b 1010 add w8,w8,w9 1011 eor v28.16b,v19.16b,v16.16b 1012 eor w21,w21,w5 1013 eor v29.16b,v23.16b,v20.16b 1014 eor w17,w17,w6 1015 ushr v3.4s,v24.4s,#24 1016 eor w19,w19,w7 1017 ushr v7.4s,v25.4s,#24 1018 eor w20,w20,w8 1019 ushr v11.4s,v26.4s,#24 1020 ror w21,w21,#16 1021 ushr v15.4s,v27.4s,#24 1022 ror w17,w17,#16 1023 ushr v19.4s,v28.4s,#24 1024 ror w19,w19,#16 1025 ushr v23.4s,v29.4s,#24 1026 ror w20,w20,#16 1027 sli v3.4s,v24.4s,#8 1028 add w15,w15,w21 1029 sli v7.4s,v25.4s,#8 1030 add w16,w16,w17 1031 sli v11.4s,v26.4s,#8 1032 add w13,w13,w19 1033 sli v15.4s,v27.4s,#8 1034 add w14,w14,w20 1035 sli v19.4s,v28.4s,#8 1036 eor w10,w10,w15 1037 sli v23.4s,v29.4s,#8 1038 eor w11,w11,w16 1039 add v2.4s,v2.4s,v3.4s 1040 eor w12,w12,w13 1041 add v6.4s,v6.4s,v7.4s 1042 eor w9,w9,w14 1043 add v10.4s,v10.4s,v11.4s 1044 ror w10,w10,#20 1045 add v14.4s,v14.4s,v15.4s 1046 ror w11,w11,#20 1047 add v18.4s,v18.4s,v19.4s 1048 ror w12,w12,#20 1049 add v22.4s,v22.4s,v23.4s 1050 ror w9,w9,#20 1051 eor v24.16b,v1.16b,v2.16b 1052 add w5,w5,w10 1053 eor v25.16b,v5.16b,v6.16b 1054 add w6,w6,w11 1055 eor v26.16b,v9.16b,v10.16b 1056 add w7,w7,w12 1057 eor v27.16b,v13.16b,v14.16b 1058 add w8,w8,w9 1059 eor v28.16b,v17.16b,v18.16b 1060 eor w21,w21,w5 1061 eor v29.16b,v21.16b,v22.16b 1062 eor w17,w17,w6 1063 ushr v1.4s,v24.4s,#25 1064 eor w19,w19,w7 1065 ushr v5.4s,v25.4s,#25 1066 eor 
w20,w20,w8 1067 ushr v9.4s,v26.4s,#25 1068 ror w21,w21,#24 1069 ushr v13.4s,v27.4s,#25 1070 ror w17,w17,#24 1071 ushr v17.4s,v28.4s,#25 1072 ror w19,w19,#24 1073 ushr v21.4s,v29.4s,#25 1074 ror w20,w20,#24 1075 sli v1.4s,v24.4s,#7 1076 add w15,w15,w21 1077 sli v5.4s,v25.4s,#7 1078 add w16,w16,w17 1079 sli v9.4s,v26.4s,#7 1080 add w13,w13,w19 1081 sli v13.4s,v27.4s,#7 1082 add w14,w14,w20 1083 sli v17.4s,v28.4s,#7 1084 eor w10,w10,w15 1085 sli v21.4s,v29.4s,#7 1086 eor w11,w11,w16 1087 ext v2.16b,v2.16b,v2.16b,#8 1088 eor w12,w12,w13 1089 ext v6.16b,v6.16b,v6.16b,#8 1090 eor w9,w9,w14 1091 ext v10.16b,v10.16b,v10.16b,#8 1092 ror w10,w10,#25 1093 ext v14.16b,v14.16b,v14.16b,#8 1094 ror w11,w11,#25 1095 ext v18.16b,v18.16b,v18.16b,#8 1096 ror w12,w12,#25 1097 ext v22.16b,v22.16b,v22.16b,#8 1098 ror w9,w9,#25 1099 ext v3.16b,v3.16b,v3.16b,#12 1100 ext v7.16b,v7.16b,v7.16b,#12 1101 ext v11.16b,v11.16b,v11.16b,#12 1102 ext v15.16b,v15.16b,v15.16b,#12 1103 ext v19.16b,v19.16b,v19.16b,#12 1104 ext v23.16b,v23.16b,v23.16b,#12 1105 ext v1.16b,v1.16b,v1.16b,#4 1106 ext v5.16b,v5.16b,v5.16b,#4 1107 ext v9.16b,v9.16b,v9.16b,#4 1108 ext v13.16b,v13.16b,v13.16b,#4 1109 ext v17.16b,v17.16b,v17.16b,#4 1110 ext v21.16b,v21.16b,v21.16b,#4 1111 add v0.4s,v0.4s,v1.4s 1112 add w5,w5,w9 1113 add v4.4s,v4.4s,v5.4s 1114 add w6,w6,w10 1115 add v8.4s,v8.4s,v9.4s 1116 add w7,w7,w11 1117 add v12.4s,v12.4s,v13.4s 1118 add w8,w8,w12 1119 add v16.4s,v16.4s,v17.4s 1120 eor w17,w17,w5 1121 add v20.4s,v20.4s,v21.4s 1122 eor w19,w19,w6 1123 eor v3.16b,v3.16b,v0.16b 1124 eor w20,w20,w7 1125 eor v7.16b,v7.16b,v4.16b 1126 eor w21,w21,w8 1127 eor v11.16b,v11.16b,v8.16b 1128 ror w17,w17,#16 1129 eor v15.16b,v15.16b,v12.16b 1130 ror w19,w19,#16 1131 eor v19.16b,v19.16b,v16.16b 1132 ror w20,w20,#16 1133 eor v23.16b,v23.16b,v20.16b 1134 ror w21,w21,#16 1135 rev32 v3.8h,v3.8h 1136 add w13,w13,w17 1137 rev32 v7.8h,v7.8h 1138 add w14,w14,w19 1139 rev32 v11.8h,v11.8h 1140 add w15,w15,w20 1141 rev32 v15.8h,v15.8h 
1142 add w16,w16,w21 1143 rev32 v19.8h,v19.8h 1144 eor w9,w9,w13 1145 rev32 v23.8h,v23.8h 1146 eor w10,w10,w14 1147 add v2.4s,v2.4s,v3.4s 1148 eor w11,w11,w15 1149 add v6.4s,v6.4s,v7.4s 1150 eor w12,w12,w16 1151 add v10.4s,v10.4s,v11.4s 1152 ror w9,w9,#20 1153 add v14.4s,v14.4s,v15.4s 1154 ror w10,w10,#20 1155 add v18.4s,v18.4s,v19.4s 1156 ror w11,w11,#20 1157 add v22.4s,v22.4s,v23.4s 1158 ror w12,w12,#20 1159 eor v24.16b,v1.16b,v2.16b 1160 add w5,w5,w9 1161 eor v25.16b,v5.16b,v6.16b 1162 add w6,w6,w10 1163 eor v26.16b,v9.16b,v10.16b 1164 add w7,w7,w11 1165 eor v27.16b,v13.16b,v14.16b 1166 add w8,w8,w12 1167 eor v28.16b,v17.16b,v18.16b 1168 eor w17,w17,w5 1169 eor v29.16b,v21.16b,v22.16b 1170 eor w19,w19,w6 1171 ushr v1.4s,v24.4s,#20 1172 eor w20,w20,w7 1173 ushr v5.4s,v25.4s,#20 1174 eor w21,w21,w8 1175 ushr v9.4s,v26.4s,#20 1176 ror w17,w17,#24 1177 ushr v13.4s,v27.4s,#20 1178 ror w19,w19,#24 1179 ushr v17.4s,v28.4s,#20 1180 ror w20,w20,#24 1181 ushr v21.4s,v29.4s,#20 1182 ror w21,w21,#24 1183 sli v1.4s,v24.4s,#12 1184 add w13,w13,w17 1185 sli v5.4s,v25.4s,#12 1186 add w14,w14,w19 1187 sli v9.4s,v26.4s,#12 1188 add w15,w15,w20 1189 sli v13.4s,v27.4s,#12 1190 add w16,w16,w21 1191 sli v17.4s,v28.4s,#12 1192 eor w9,w9,w13 1193 sli v21.4s,v29.4s,#12 1194 eor w10,w10,w14 1195 add v0.4s,v0.4s,v1.4s 1196 eor w11,w11,w15 1197 add v4.4s,v4.4s,v5.4s 1198 eor w12,w12,w16 1199 add v8.4s,v8.4s,v9.4s 1200 ror w9,w9,#25 1201 add v12.4s,v12.4s,v13.4s 1202 ror w10,w10,#25 1203 add v16.4s,v16.4s,v17.4s 1204 ror w11,w11,#25 1205 add v20.4s,v20.4s,v21.4s 1206 ror w12,w12,#25 1207 eor v24.16b,v3.16b,v0.16b 1208 add w5,w5,w10 1209 eor v25.16b,v7.16b,v4.16b 1210 add w6,w6,w11 1211 eor v26.16b,v11.16b,v8.16b 1212 add w7,w7,w12 1213 eor v27.16b,v15.16b,v12.16b 1214 add w8,w8,w9 1215 eor v28.16b,v19.16b,v16.16b 1216 eor w21,w21,w5 1217 eor v29.16b,v23.16b,v20.16b 1218 eor w17,w17,w6 1219 ushr v3.4s,v24.4s,#24 1220 eor w19,w19,w7 1221 ushr v7.4s,v25.4s,#24 1222 eor w20,w20,w8 1223 ushr 
v11.4s,v26.4s,#24 1224 ror w21,w21,#16 1225 ushr v15.4s,v27.4s,#24 1226 ror w17,w17,#16 1227 ushr v19.4s,v28.4s,#24 1228 ror w19,w19,#16 1229 ushr v23.4s,v29.4s,#24 1230 ror w20,w20,#16 1231 sli v3.4s,v24.4s,#8 1232 add w15,w15,w21 1233 sli v7.4s,v25.4s,#8 1234 add w16,w16,w17 1235 sli v11.4s,v26.4s,#8 1236 add w13,w13,w19 1237 sli v15.4s,v27.4s,#8 1238 add w14,w14,w20 1239 sli v19.4s,v28.4s,#8 1240 eor w10,w10,w15 1241 sli v23.4s,v29.4s,#8 1242 eor w11,w11,w16 1243 add v2.4s,v2.4s,v3.4s 1244 eor w12,w12,w13 1245 add v6.4s,v6.4s,v7.4s 1246 eor w9,w9,w14 1247 add v10.4s,v10.4s,v11.4s 1248 ror w10,w10,#20 1249 add v14.4s,v14.4s,v15.4s 1250 ror w11,w11,#20 1251 add v18.4s,v18.4s,v19.4s 1252 ror w12,w12,#20 1253 add v22.4s,v22.4s,v23.4s 1254 ror w9,w9,#20 1255 eor v24.16b,v1.16b,v2.16b 1256 add w5,w5,w10 1257 eor v25.16b,v5.16b,v6.16b 1258 add w6,w6,w11 1259 eor v26.16b,v9.16b,v10.16b 1260 add w7,w7,w12 1261 eor v27.16b,v13.16b,v14.16b 1262 add w8,w8,w9 1263 eor v28.16b,v17.16b,v18.16b 1264 eor w21,w21,w5 1265 eor v29.16b,v21.16b,v22.16b 1266 eor w17,w17,w6 1267 ushr v1.4s,v24.4s,#25 1268 eor w19,w19,w7 1269 ushr v5.4s,v25.4s,#25 1270 eor w20,w20,w8 1271 ushr v9.4s,v26.4s,#25 1272 ror w21,w21,#24 1273 ushr v13.4s,v27.4s,#25 1274 ror w17,w17,#24 1275 ushr v17.4s,v28.4s,#25 1276 ror w19,w19,#24 1277 ushr v21.4s,v29.4s,#25 1278 ror w20,w20,#24 1279 sli v1.4s,v24.4s,#7 1280 add w15,w15,w21 1281 sli v5.4s,v25.4s,#7 1282 add w16,w16,w17 1283 sli v9.4s,v26.4s,#7 1284 add w13,w13,w19 1285 sli v13.4s,v27.4s,#7 1286 add w14,w14,w20 1287 sli v17.4s,v28.4s,#7 1288 eor w10,w10,w15 1289 sli v21.4s,v29.4s,#7 1290 eor w11,w11,w16 1291 ext v2.16b,v2.16b,v2.16b,#8 1292 eor w12,w12,w13 1293 ext v6.16b,v6.16b,v6.16b,#8 1294 eor w9,w9,w14 1295 ext v10.16b,v10.16b,v10.16b,#8 1296 ror w10,w10,#25 1297 ext v14.16b,v14.16b,v14.16b,#8 1298 ror w11,w11,#25 1299 ext v18.16b,v18.16b,v18.16b,#8 1300 ror w12,w12,#25 1301 ext v22.16b,v22.16b,v22.16b,#8 1302 ror w9,w9,#25 1303 ext 
v3.16b,v3.16b,v3.16b,#4 1304 ext v7.16b,v7.16b,v7.16b,#4 1305 ext v11.16b,v11.16b,v11.16b,#4 1306 ext v15.16b,v15.16b,v15.16b,#4 1307 ext v19.16b,v19.16b,v19.16b,#4 1308 ext v23.16b,v23.16b,v23.16b,#4 1309 ext v1.16b,v1.16b,v1.16b,#12 1310 ext v5.16b,v5.16b,v5.16b,#12 1311 ext v9.16b,v9.16b,v9.16b,#12 1312 ext v13.16b,v13.16b,v13.16b,#12 1313 ext v17.16b,v17.16b,v17.16b,#12 1314 ext v21.16b,v21.16b,v21.16b,#12 1315 cbnz x4,.Loop_upper_neon 1316 1317 add w5,w5,w22 // accumulate key block 1318 add x6,x6,x22,lsr#32 1319 add w7,w7,w23 1320 add x8,x8,x23,lsr#32 1321 add w9,w9,w24 1322 add x10,x10,x24,lsr#32 1323 add w11,w11,w25 1324 add x12,x12,x25,lsr#32 1325 add w13,w13,w26 1326 add x14,x14,x26,lsr#32 1327 add w15,w15,w27 1328 add x16,x16,x27,lsr#32 1329 add w17,w17,w28 1330 add x19,x19,x28,lsr#32 1331 add w20,w20,w30 1332 add x21,x21,x30,lsr#32 1333 1334 add x5,x5,x6,lsl#32 // pack 1335 add x7,x7,x8,lsl#32 1336 ldp x6,x8,[x1,#0] // load input 1337 add x9,x9,x10,lsl#32 1338 add x11,x11,x12,lsl#32 1339 ldp x10,x12,[x1,#16] 1340 add x13,x13,x14,lsl#32 1341 add x15,x15,x16,lsl#32 1342 ldp x14,x16,[x1,#32] 1343 add x17,x17,x19,lsl#32 1344 add x20,x20,x21,lsl#32 1345 ldp x19,x21,[x1,#48] 1346 add x1,x1,#64 1347#ifdef __ARMEB__ 1348 rev x5,x5 1349 rev x7,x7 1350 rev x9,x9 1351 rev x11,x11 1352 rev x13,x13 1353 rev x15,x15 1354 rev x17,x17 1355 rev x20,x20 1356#endif 1357 eor x5,x5,x6 1358 eor x7,x7,x8 1359 eor x9,x9,x10 1360 eor x11,x11,x12 1361 eor x13,x13,x14 1362 eor x15,x15,x16 1363 eor x17,x17,x19 1364 eor x20,x20,x21 1365 1366 stp x5,x7,[x0,#0] // store output 1367 add x28,x28,#1 // increment counter 1368 mov w5,w22 // unpack key block 1369 lsr x6,x22,#32 1370 stp x9,x11,[x0,#16] 1371 mov w7,w23 1372 lsr x8,x23,#32 1373 stp x13,x15,[x0,#32] 1374 mov w9,w24 1375 lsr x10,x24,#32 1376 stp x17,x20,[x0,#48] 1377 add x0,x0,#64 1378 mov w11,w25 1379 lsr x12,x25,#32 1380 mov w13,w26 1381 lsr x14,x26,#32 1382 mov w15,w27 1383 lsr x16,x27,#32 1384 mov w17,w28 1385 lsr 
x19,x28,#32 1386 mov w20,w30 1387 lsr x21,x30,#32 1388 1389 mov x4,#5 1390.Loop_lower_neon: 1391 sub x4,x4,#1 1392 add v0.4s,v0.4s,v1.4s 1393 add w5,w5,w9 1394 add v4.4s,v4.4s,v5.4s 1395 add w6,w6,w10 1396 add v8.4s,v8.4s,v9.4s 1397 add w7,w7,w11 1398 add v12.4s,v12.4s,v13.4s 1399 add w8,w8,w12 1400 add v16.4s,v16.4s,v17.4s 1401 eor w17,w17,w5 1402 add v20.4s,v20.4s,v21.4s 1403 eor w19,w19,w6 1404 eor v3.16b,v3.16b,v0.16b 1405 eor w20,w20,w7 1406 eor v7.16b,v7.16b,v4.16b 1407 eor w21,w21,w8 1408 eor v11.16b,v11.16b,v8.16b 1409 ror w17,w17,#16 1410 eor v15.16b,v15.16b,v12.16b 1411 ror w19,w19,#16 1412 eor v19.16b,v19.16b,v16.16b 1413 ror w20,w20,#16 1414 eor v23.16b,v23.16b,v20.16b 1415 ror w21,w21,#16 1416 rev32 v3.8h,v3.8h 1417 add w13,w13,w17 1418 rev32 v7.8h,v7.8h 1419 add w14,w14,w19 1420 rev32 v11.8h,v11.8h 1421 add w15,w15,w20 1422 rev32 v15.8h,v15.8h 1423 add w16,w16,w21 1424 rev32 v19.8h,v19.8h 1425 eor w9,w9,w13 1426 rev32 v23.8h,v23.8h 1427 eor w10,w10,w14 1428 add v2.4s,v2.4s,v3.4s 1429 eor w11,w11,w15 1430 add v6.4s,v6.4s,v7.4s 1431 eor w12,w12,w16 1432 add v10.4s,v10.4s,v11.4s 1433 ror w9,w9,#20 1434 add v14.4s,v14.4s,v15.4s 1435 ror w10,w10,#20 1436 add v18.4s,v18.4s,v19.4s 1437 ror w11,w11,#20 1438 add v22.4s,v22.4s,v23.4s 1439 ror w12,w12,#20 1440 eor v24.16b,v1.16b,v2.16b 1441 add w5,w5,w9 1442 eor v25.16b,v5.16b,v6.16b 1443 add w6,w6,w10 1444 eor v26.16b,v9.16b,v10.16b 1445 add w7,w7,w11 1446 eor v27.16b,v13.16b,v14.16b 1447 add w8,w8,w12 1448 eor v28.16b,v17.16b,v18.16b 1449 eor w17,w17,w5 1450 eor v29.16b,v21.16b,v22.16b 1451 eor w19,w19,w6 1452 ushr v1.4s,v24.4s,#20 1453 eor w20,w20,w7 1454 ushr v5.4s,v25.4s,#20 1455 eor w21,w21,w8 1456 ushr v9.4s,v26.4s,#20 1457 ror w17,w17,#24 1458 ushr v13.4s,v27.4s,#20 1459 ror w19,w19,#24 1460 ushr v17.4s,v28.4s,#20 1461 ror w20,w20,#24 1462 ushr v21.4s,v29.4s,#20 1463 ror w21,w21,#24 1464 sli v1.4s,v24.4s,#12 1465 add w13,w13,w17 1466 sli v5.4s,v25.4s,#12 1467 add w14,w14,w19 1468 sli v9.4s,v26.4s,#12 
1469 add w15,w15,w20 1470 sli v13.4s,v27.4s,#12 1471 add w16,w16,w21 1472 sli v17.4s,v28.4s,#12 1473 eor w9,w9,w13 1474 sli v21.4s,v29.4s,#12 1475 eor w10,w10,w14 1476 add v0.4s,v0.4s,v1.4s 1477 eor w11,w11,w15 1478 add v4.4s,v4.4s,v5.4s 1479 eor w12,w12,w16 1480 add v8.4s,v8.4s,v9.4s 1481 ror w9,w9,#25 1482 add v12.4s,v12.4s,v13.4s 1483 ror w10,w10,#25 1484 add v16.4s,v16.4s,v17.4s 1485 ror w11,w11,#25 1486 add v20.4s,v20.4s,v21.4s 1487 ror w12,w12,#25 1488 eor v24.16b,v3.16b,v0.16b 1489 add w5,w5,w10 1490 eor v25.16b,v7.16b,v4.16b 1491 add w6,w6,w11 1492 eor v26.16b,v11.16b,v8.16b 1493 add w7,w7,w12 1494 eor v27.16b,v15.16b,v12.16b 1495 add w8,w8,w9 1496 eor v28.16b,v19.16b,v16.16b 1497 eor w21,w21,w5 1498 eor v29.16b,v23.16b,v20.16b 1499 eor w17,w17,w6 1500 ushr v3.4s,v24.4s,#24 1501 eor w19,w19,w7 1502 ushr v7.4s,v25.4s,#24 1503 eor w20,w20,w8 1504 ushr v11.4s,v26.4s,#24 1505 ror w21,w21,#16 1506 ushr v15.4s,v27.4s,#24 1507 ror w17,w17,#16 1508 ushr v19.4s,v28.4s,#24 1509 ror w19,w19,#16 1510 ushr v23.4s,v29.4s,#24 1511 ror w20,w20,#16 1512 sli v3.4s,v24.4s,#8 1513 add w15,w15,w21 1514 sli v7.4s,v25.4s,#8 1515 add w16,w16,w17 1516 sli v11.4s,v26.4s,#8 1517 add w13,w13,w19 1518 sli v15.4s,v27.4s,#8 1519 add w14,w14,w20 1520 sli v19.4s,v28.4s,#8 1521 eor w10,w10,w15 1522 sli v23.4s,v29.4s,#8 1523 eor w11,w11,w16 1524 add v2.4s,v2.4s,v3.4s 1525 eor w12,w12,w13 1526 add v6.4s,v6.4s,v7.4s 1527 eor w9,w9,w14 1528 add v10.4s,v10.4s,v11.4s 1529 ror w10,w10,#20 1530 add v14.4s,v14.4s,v15.4s 1531 ror w11,w11,#20 1532 add v18.4s,v18.4s,v19.4s 1533 ror w12,w12,#20 1534 add v22.4s,v22.4s,v23.4s 1535 ror w9,w9,#20 1536 eor v24.16b,v1.16b,v2.16b 1537 add w5,w5,w10 1538 eor v25.16b,v5.16b,v6.16b 1539 add w6,w6,w11 1540 eor v26.16b,v9.16b,v10.16b 1541 add w7,w7,w12 1542 eor v27.16b,v13.16b,v14.16b 1543 add w8,w8,w9 1544 eor v28.16b,v17.16b,v18.16b 1545 eor w21,w21,w5 1546 eor v29.16b,v21.16b,v22.16b 1547 eor w17,w17,w6 1548 ushr v1.4s,v24.4s,#25 1549 eor w19,w19,w7 1550 ushr 
v5.4s,v25.4s,#25 1551 eor w20,w20,w8 1552 ushr v9.4s,v26.4s,#25 1553 ror w21,w21,#24 1554 ushr v13.4s,v27.4s,#25 1555 ror w17,w17,#24 1556 ushr v17.4s,v28.4s,#25 1557 ror w19,w19,#24 1558 ushr v21.4s,v29.4s,#25 1559 ror w20,w20,#24 1560 sli v1.4s,v24.4s,#7 1561 add w15,w15,w21 1562 sli v5.4s,v25.4s,#7 1563 add w16,w16,w17 1564 sli v9.4s,v26.4s,#7 1565 add w13,w13,w19 1566 sli v13.4s,v27.4s,#7 1567 add w14,w14,w20 1568 sli v17.4s,v28.4s,#7 1569 eor w10,w10,w15 1570 sli v21.4s,v29.4s,#7 1571 eor w11,w11,w16 1572 ext v2.16b,v2.16b,v2.16b,#8 1573 eor w12,w12,w13 1574 ext v6.16b,v6.16b,v6.16b,#8 1575 eor w9,w9,w14 1576 ext v10.16b,v10.16b,v10.16b,#8 1577 ror w10,w10,#25 1578 ext v14.16b,v14.16b,v14.16b,#8 1579 ror w11,w11,#25 1580 ext v18.16b,v18.16b,v18.16b,#8 1581 ror w12,w12,#25 1582 ext v22.16b,v22.16b,v22.16b,#8 1583 ror w9,w9,#25 1584 ext v3.16b,v3.16b,v3.16b,#12 1585 ext v7.16b,v7.16b,v7.16b,#12 1586 ext v11.16b,v11.16b,v11.16b,#12 1587 ext v15.16b,v15.16b,v15.16b,#12 1588 ext v19.16b,v19.16b,v19.16b,#12 1589 ext v23.16b,v23.16b,v23.16b,#12 1590 ext v1.16b,v1.16b,v1.16b,#4 1591 ext v5.16b,v5.16b,v5.16b,#4 1592 ext v9.16b,v9.16b,v9.16b,#4 1593 ext v13.16b,v13.16b,v13.16b,#4 1594 ext v17.16b,v17.16b,v17.16b,#4 1595 ext v21.16b,v21.16b,v21.16b,#4 1596 add v0.4s,v0.4s,v1.4s 1597 add w5,w5,w9 1598 add v4.4s,v4.4s,v5.4s 1599 add w6,w6,w10 1600 add v8.4s,v8.4s,v9.4s 1601 add w7,w7,w11 1602 add v12.4s,v12.4s,v13.4s 1603 add w8,w8,w12 1604 add v16.4s,v16.4s,v17.4s 1605 eor w17,w17,w5 1606 add v20.4s,v20.4s,v21.4s 1607 eor w19,w19,w6 1608 eor v3.16b,v3.16b,v0.16b 1609 eor w20,w20,w7 1610 eor v7.16b,v7.16b,v4.16b 1611 eor w21,w21,w8 1612 eor v11.16b,v11.16b,v8.16b 1613 ror w17,w17,#16 1614 eor v15.16b,v15.16b,v12.16b 1615 ror w19,w19,#16 1616 eor v19.16b,v19.16b,v16.16b 1617 ror w20,w20,#16 1618 eor v23.16b,v23.16b,v20.16b 1619 ror w21,w21,#16 1620 rev32 v3.8h,v3.8h 1621 add w13,w13,w17 1622 rev32 v7.8h,v7.8h 1623 add w14,w14,w19 1624 rev32 v11.8h,v11.8h 1625 add 
w15,w15,w20 1626 rev32 v15.8h,v15.8h 1627 add w16,w16,w21 1628 rev32 v19.8h,v19.8h 1629 eor w9,w9,w13 1630 rev32 v23.8h,v23.8h 1631 eor w10,w10,w14 1632 add v2.4s,v2.4s,v3.4s 1633 eor w11,w11,w15 1634 add v6.4s,v6.4s,v7.4s 1635 eor w12,w12,w16 1636 add v10.4s,v10.4s,v11.4s 1637 ror w9,w9,#20 1638 add v14.4s,v14.4s,v15.4s 1639 ror w10,w10,#20 1640 add v18.4s,v18.4s,v19.4s 1641 ror w11,w11,#20 1642 add v22.4s,v22.4s,v23.4s 1643 ror w12,w12,#20 1644 eor v24.16b,v1.16b,v2.16b 1645 add w5,w5,w9 1646 eor v25.16b,v5.16b,v6.16b 1647 add w6,w6,w10 1648 eor v26.16b,v9.16b,v10.16b 1649 add w7,w7,w11 1650 eor v27.16b,v13.16b,v14.16b 1651 add w8,w8,w12 1652 eor v28.16b,v17.16b,v18.16b 1653 eor w17,w17,w5 1654 eor v29.16b,v21.16b,v22.16b 1655 eor w19,w19,w6 1656 ushr v1.4s,v24.4s,#20 1657 eor w20,w20,w7 1658 ushr v5.4s,v25.4s,#20 1659 eor w21,w21,w8 1660 ushr v9.4s,v26.4s,#20 1661 ror w17,w17,#24 1662 ushr v13.4s,v27.4s,#20 1663 ror w19,w19,#24 1664 ushr v17.4s,v28.4s,#20 1665 ror w20,w20,#24 1666 ushr v21.4s,v29.4s,#20 1667 ror w21,w21,#24 1668 sli v1.4s,v24.4s,#12 1669 add w13,w13,w17 1670 sli v5.4s,v25.4s,#12 1671 add w14,w14,w19 1672 sli v9.4s,v26.4s,#12 1673 add w15,w15,w20 1674 sli v13.4s,v27.4s,#12 1675 add w16,w16,w21 1676 sli v17.4s,v28.4s,#12 1677 eor w9,w9,w13 1678 sli v21.4s,v29.4s,#12 1679 eor w10,w10,w14 1680 add v0.4s,v0.4s,v1.4s 1681 eor w11,w11,w15 1682 add v4.4s,v4.4s,v5.4s 1683 eor w12,w12,w16 1684 add v8.4s,v8.4s,v9.4s 1685 ror w9,w9,#25 1686 add v12.4s,v12.4s,v13.4s 1687 ror w10,w10,#25 1688 add v16.4s,v16.4s,v17.4s 1689 ror w11,w11,#25 1690 add v20.4s,v20.4s,v21.4s 1691 ror w12,w12,#25 1692 eor v24.16b,v3.16b,v0.16b 1693 add w5,w5,w10 1694 eor v25.16b,v7.16b,v4.16b 1695 add w6,w6,w11 1696 eor v26.16b,v11.16b,v8.16b 1697 add w7,w7,w12 1698 eor v27.16b,v15.16b,v12.16b 1699 add w8,w8,w9 1700 eor v28.16b,v19.16b,v16.16b 1701 eor w21,w21,w5 1702 eor v29.16b,v23.16b,v20.16b 1703 eor w17,w17,w6 1704 ushr v3.4s,v24.4s,#24 1705 eor w19,w19,w7 1706 ushr 
v7.4s,v25.4s,#24 1707 eor w20,w20,w8 1708 ushr v11.4s,v26.4s,#24 1709 ror w21,w21,#16 1710 ushr v15.4s,v27.4s,#24 1711 ror w17,w17,#16 1712 ushr v19.4s,v28.4s,#24 1713 ror w19,w19,#16 1714 ushr v23.4s,v29.4s,#24 1715 ror w20,w20,#16 1716 sli v3.4s,v24.4s,#8 1717 add w15,w15,w21 1718 sli v7.4s,v25.4s,#8 1719 add w16,w16,w17 1720 sli v11.4s,v26.4s,#8 1721 add w13,w13,w19 1722 sli v15.4s,v27.4s,#8 1723 add w14,w14,w20 1724 sli v19.4s,v28.4s,#8 1725 eor w10,w10,w15 1726 sli v23.4s,v29.4s,#8 1727 eor w11,w11,w16 1728 add v2.4s,v2.4s,v3.4s 1729 eor w12,w12,w13 1730 add v6.4s,v6.4s,v7.4s 1731 eor w9,w9,w14 1732 add v10.4s,v10.4s,v11.4s 1733 ror w10,w10,#20 1734 add v14.4s,v14.4s,v15.4s 1735 ror w11,w11,#20 1736 add v18.4s,v18.4s,v19.4s 1737 ror w12,w12,#20 1738 add v22.4s,v22.4s,v23.4s 1739 ror w9,w9,#20 1740 eor v24.16b,v1.16b,v2.16b 1741 add w5,w5,w10 1742 eor v25.16b,v5.16b,v6.16b 1743 add w6,w6,w11 1744 eor v26.16b,v9.16b,v10.16b 1745 add w7,w7,w12 1746 eor v27.16b,v13.16b,v14.16b 1747 add w8,w8,w9 1748 eor v28.16b,v17.16b,v18.16b 1749 eor w21,w21,w5 1750 eor v29.16b,v21.16b,v22.16b 1751 eor w17,w17,w6 1752 ushr v1.4s,v24.4s,#25 1753 eor w19,w19,w7 1754 ushr v5.4s,v25.4s,#25 1755 eor w20,w20,w8 1756 ushr v9.4s,v26.4s,#25 1757 ror w21,w21,#24 1758 ushr v13.4s,v27.4s,#25 1759 ror w17,w17,#24 1760 ushr v17.4s,v28.4s,#25 1761 ror w19,w19,#24 1762 ushr v21.4s,v29.4s,#25 1763 ror w20,w20,#24 1764 sli v1.4s,v24.4s,#7 1765 add w15,w15,w21 1766 sli v5.4s,v25.4s,#7 1767 add w16,w16,w17 1768 sli v9.4s,v26.4s,#7 1769 add w13,w13,w19 1770 sli v13.4s,v27.4s,#7 1771 add w14,w14,w20 1772 sli v17.4s,v28.4s,#7 1773 eor w10,w10,w15 1774 sli v21.4s,v29.4s,#7 1775 eor w11,w11,w16 1776 ext v2.16b,v2.16b,v2.16b,#8 1777 eor w12,w12,w13 1778 ext v6.16b,v6.16b,v6.16b,#8 1779 eor w9,w9,w14 1780 ext v10.16b,v10.16b,v10.16b,#8 1781 ror w10,w10,#25 1782 ext v14.16b,v14.16b,v14.16b,#8 1783 ror w11,w11,#25 1784 ext v18.16b,v18.16b,v18.16b,#8 1785 ror w12,w12,#25 1786 ext v22.16b,v22.16b,v22.16b,#8 
1787 ror w9,w9,#25 1788 ext v3.16b,v3.16b,v3.16b,#4 1789 ext v7.16b,v7.16b,v7.16b,#4 1790 ext v11.16b,v11.16b,v11.16b,#4 1791 ext v15.16b,v15.16b,v15.16b,#4 1792 ext v19.16b,v19.16b,v19.16b,#4 1793 ext v23.16b,v23.16b,v23.16b,#4 1794 ext v1.16b,v1.16b,v1.16b,#12 1795 ext v5.16b,v5.16b,v5.16b,#12 1796 ext v9.16b,v9.16b,v9.16b,#12 1797 ext v13.16b,v13.16b,v13.16b,#12 1798 ext v17.16b,v17.16b,v17.16b,#12 1799 ext v21.16b,v21.16b,v21.16b,#12 1800 cbnz x4,.Loop_lower_neon 1801 1802 add w5,w5,w22 // accumulate key block 1803 ldp q24,q25,[sp,#0] 1804 add x6,x6,x22,lsr#32 1805 ldp q26,q27,[sp,#32] 1806 add w7,w7,w23 1807 ldp q28,q29,[sp,#64] 1808 add x8,x8,x23,lsr#32 1809 add v0.4s,v0.4s,v24.4s 1810 add w9,w9,w24 1811 add v4.4s,v4.4s,v24.4s 1812 add x10,x10,x24,lsr#32 1813 add v8.4s,v8.4s,v24.4s 1814 add w11,w11,w25 1815 add v12.4s,v12.4s,v24.4s 1816 add x12,x12,x25,lsr#32 1817 add v16.4s,v16.4s,v24.4s 1818 add w13,w13,w26 1819 add v20.4s,v20.4s,v24.4s 1820 add x14,x14,x26,lsr#32 1821 add v2.4s,v2.4s,v26.4s 1822 add w15,w15,w27 1823 add v6.4s,v6.4s,v26.4s 1824 add x16,x16,x27,lsr#32 1825 add v10.4s,v10.4s,v26.4s 1826 add w17,w17,w28 1827 add v14.4s,v14.4s,v26.4s 1828 add x19,x19,x28,lsr#32 1829 add v18.4s,v18.4s,v26.4s 1830 add w20,w20,w30 1831 add v22.4s,v22.4s,v26.4s 1832 add x21,x21,x30,lsr#32 1833 add v19.4s,v19.4s,v31.4s // +4 1834 add x5,x5,x6,lsl#32 // pack 1835 add v23.4s,v23.4s,v31.4s // +4 1836 add x7,x7,x8,lsl#32 1837 add v3.4s,v3.4s,v27.4s 1838 ldp x6,x8,[x1,#0] // load input 1839 add v7.4s,v7.4s,v28.4s 1840 add x9,x9,x10,lsl#32 1841 add v11.4s,v11.4s,v29.4s 1842 add x11,x11,x12,lsl#32 1843 add v15.4s,v15.4s,v30.4s 1844 ldp x10,x12,[x1,#16] 1845 add v19.4s,v19.4s,v27.4s 1846 add x13,x13,x14,lsl#32 1847 add v23.4s,v23.4s,v28.4s 1848 add x15,x15,x16,lsl#32 1849 add v1.4s,v1.4s,v25.4s 1850 ldp x14,x16,[x1,#32] 1851 add v5.4s,v5.4s,v25.4s 1852 add x17,x17,x19,lsl#32 1853 add v9.4s,v9.4s,v25.4s 1854 add x20,x20,x21,lsl#32 1855 add v13.4s,v13.4s,v25.4s 1856 ldp 
x19,x21,[x1,#48] 1857 add v17.4s,v17.4s,v25.4s 1858 add x1,x1,#64 1859 add v21.4s,v21.4s,v25.4s 1860 1861#ifdef __ARMEB__ 1862 rev x5,x5 1863 rev x7,x7 1864 rev x9,x9 1865 rev x11,x11 1866 rev x13,x13 1867 rev x15,x15 1868 rev x17,x17 1869 rev x20,x20 1870#endif 1871 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1872 eor x5,x5,x6 1873 eor x7,x7,x8 1874 eor x9,x9,x10 1875 eor x11,x11,x12 1876 eor x13,x13,x14 1877 eor v0.16b,v0.16b,v24.16b 1878 eor x15,x15,x16 1879 eor v1.16b,v1.16b,v25.16b 1880 eor x17,x17,x19 1881 eor v2.16b,v2.16b,v26.16b 1882 eor x20,x20,x21 1883 eor v3.16b,v3.16b,v27.16b 1884 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1885 1886 stp x5,x7,[x0,#0] // store output 1887 add x28,x28,#7 // increment counter 1888 stp x9,x11,[x0,#16] 1889 stp x13,x15,[x0,#32] 1890 stp x17,x20,[x0,#48] 1891 add x0,x0,#64 1892 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 1893 1894 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1895 eor v4.16b,v4.16b,v24.16b 1896 eor v5.16b,v5.16b,v25.16b 1897 eor v6.16b,v6.16b,v26.16b 1898 eor v7.16b,v7.16b,v27.16b 1899 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 1900 1901 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 1902 eor v8.16b,v8.16b,v0.16b 1903 ldp q24,q25,[sp,#0] 1904 eor v9.16b,v9.16b,v1.16b 1905 ldp q26,q27,[sp,#32] 1906 eor v10.16b,v10.16b,v2.16b 1907 eor v11.16b,v11.16b,v3.16b 1908 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1909 1910 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1911 eor v12.16b,v12.16b,v4.16b 1912 eor v13.16b,v13.16b,v5.16b 1913 eor v14.16b,v14.16b,v6.16b 1914 eor v15.16b,v15.16b,v7.16b 1915 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1916 1917 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1918 eor v16.16b,v16.16b,v8.16b 1919 eor v17.16b,v17.16b,v9.16b 1920 eor v18.16b,v18.16b,v10.16b 1921 eor v19.16b,v19.16b,v11.16b 1922 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1923 1924 shl v0.4s,v31.4s,#1 // 4 -> 8 1925 eor v20.16b,v20.16b,v12.16b 1926 eor v21.16b,v21.16b,v13.16b 1927 eor v22.16b,v22.16b,v14.16b 
1928 eor v23.16b,v23.16b,v15.16b 1929 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1930 1931 add v27.4s,v27.4s,v0.4s // += 8 1932 add v28.4s,v28.4s,v0.4s 1933 add v29.4s,v29.4s,v0.4s 1934 add v30.4s,v30.4s,v0.4s 1935 1936 b.hs .Loop_outer_512_neon 1937 1938 adds x2,x2,#512 1939 ushr v0.4s,v31.4s,#2 // 4 -> 1 1940 1941 ldp d8,d9,[sp,#128+0] // meet ABI requirements 1942 ldp d10,d11,[sp,#128+16] 1943 ldp d12,d13,[sp,#128+32] 1944 ldp d14,d15,[sp,#128+48] 1945 1946 stp q24,q31,[sp,#0] // wipe off-load area 1947 stp q24,q31,[sp,#32] 1948 stp q24,q31,[sp,#64] 1949 1950 b.eq .Ldone_512_neon 1951 1952 cmp x2,#192 1953 sub v27.4s,v27.4s,v0.4s // -= 1 1954 sub v28.4s,v28.4s,v0.4s 1955 sub v29.4s,v29.4s,v0.4s 1956 add sp,sp,#128 1957 b.hs .Loop_outer_neon 1958 1959 eor v25.16b,v25.16b,v25.16b 1960 eor v26.16b,v26.16b,v26.16b 1961 eor v27.16b,v27.16b,v27.16b 1962 eor v28.16b,v28.16b,v28.16b 1963 eor v29.16b,v29.16b,v29.16b 1964 eor v30.16b,v30.16b,v30.16b 1965 b .Loop_outer 1966 1967.Ldone_512_neon: 1968 ldp x19,x20,[x29,#16] 1969 add sp,sp,#128+64 1970 ldp x21,x22,[x29,#32] 1971 ldp x23,x24,[x29,#48] 1972 ldp x25,x26,[x29,#64] 1973 ldp x27,x28,[x29,#80] 1974 ldp x29,x30,[sp],#96 1975.inst 0xd50323bf // autiasp 1976 ret 1977.size ChaCha20_512_neon,.-ChaCha20_512_neon 1978