// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#include <GFp/arm_arch.h>


.hidden	GFp_armcap_P

.section	.rodata

.align	5
// Constant row of the state. Read as little-endian bytes, the two quads are
// the ASCII text: expand 32-byte k
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Vector {1,0,0,0}. It sits 16 bytes after .Lsigma, and the NEON code paths
// reach it by post-incrementing the .Lsigma pointer by 16.
.Lone:
.long	1,0,0,0
// NUL-terminated ASCII: ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

.text

// GFp_ChaCha20_ctr32
//
// Registers on entry, as consumed by the code below:
//   x0 = output pointer (written 64 bytes at a time, then bytewise in the tail)
//   x1 = input pointer  (read in step with x0 and XORed with the keystream)
//   x2 = byte count; zero returns immediately through .Labort
//   x3 = pointer to 32 bytes of key        (loaded into x24-x27)
//   x4 = pointer to 16 bytes of counter    (loaded into x28 and x30)
// NOTE(review): presumably the C prototype is (out, in, len, key, counter);
// confirm against the declaring header.
//
// Dispatch: when x2 >= 192 and the ARMV7_NEON bit is set in GFp_armcap_P,
// control transfers to ChaCha20_neon. Otherwise the scalar code at .Lshort
// produces one 64-byte block per .Loop_outer iteration.
//
// Scalar register layout inside the loops:
//   x22,x23         = sigma, two 32-bit words per register
//   x24-x27         = key, two 32-bit words per register
//   x28,x30         = counter block, two 32-bit words per register
//   w5-w17,w19-w21  = the 16 working state words, in order (x18 is not used)
//   x4              = round-loop counter (the counter pointer is dead by then)
// x30 is used as a data register here; the epilogue reloads the real link
// register from the frame.
//
// Stack: 96-byte frame holding x29,x30,x19-x28, plus 64 bytes of scratch at
// sp that hold the last keystream block when the length is not a multiple of
// 64. The scratch is overwritten with zeros before returning.
.globl	GFp_ChaCha20_ctr32
.hidden	GFp_ChaCha20_ctr32
.type	GFp_ChaCha20_ctr32,%function
.align	5
GFp_ChaCha20_ctr32:
	// Macro from <GFp/arm_arch.h>; presumably a BTI landing pad -- confirm.
	AARCH64_VALID_CALL_TARGET
	cbz	x2,.Labort
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
	adrp	x5,:pg_hi21_nc:GFp_armcap_P
#else
	adrp	x5,GFp_armcap_P
#endif
	cmp	x2,#192
	b.lo	.Lshort
	ldr	w17,[x5,:lo12:GFp_armcap_P]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
	// Macro from <GFp/arm_arch.h>; presumably signs x30 when pointer
	// authentication is enabled -- confirm.
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,.Lsigma
	add	x5,x5,:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64		// 64-byte scratch block for the tail path

	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef	__ARMEB__
	// Big-endian: swap the 32-bit halves so that the low half of each
	// register is the first word in memory, as the unpack code expects.
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

.Loop_outer:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10			// 10 iterations, two rounds each
	// The flags set here are not consumed until the b.lo/b.hi after the
	// round loop; nothing in between is a flag-setting instruction.
	subs	x2,x2,#64
.Loop:
	// One iteration runs four quarter-rounds on the columns
	// (words 0,4,8,12 / 1,5,9,13 / 2,6,10,14 / 3,7,11,15) and then four on
	// the diagonals (0,5,10,15 / 1,6,11,12 / 2,7,8,13 / 3,4,9,14).
	// ror by 16, 20, 24, 25 is a left rotate by 16, 12, 8, 7.
	sub	x4,x4,#1
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	// The odd-numbered words use 64-bit adds, so their upper halves may
	// hold carry bits; the lsl #32 in the pack step shifts those out.
	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	// Flags from subs x2,x2,#64: fewer than 64 bytes were left.
	b.lo	.Ltail

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	// NOTE(review): this is a 64-bit add on the register that packs the
	// first two counter words, so a wrap of the low word would carry into
	// the second word. Presumably callers keep the counter from wrapping --
	// confirm.
	add	x28,x28,#1		// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	// Still the flags from subs x2,x2,#64: loop while bytes remain.
	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
.Labort:
	ret

.align	4
.Ltail:
	add	x2,x2,#64		// undo the subs: x2 = bytes left (1..63)
.Less_than_64:
	// Also entered from .Ltail_neon in ChaCha20_neon with the same register
	// state: x2 = bytes left, keystream block unpacked in w5-w21.
	// Bias the pointers so that a negative index counting up to zero walks
	// the final bytes. x0 is biased by one more because the index is
	// incremented between the loads and the store.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Spill the keystream block to the stack scratch for bytewise use.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

.Loop_tail:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail

	// Overwrite the spilled keystream before releasing the stack.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	GFp_ChaCha20_ctr32,.-GFp_ChaCha20_ctr32

// ChaCha20_neon: file-local entry reached by a branch from
// GFp_ChaCha20_ctr32, with x0-x4 unchanged from that function's entry.
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
// ChaCha20_neon, continued: the label and the first frame store come
// immediately before this point.
//
// Lengths of 512 bytes or more branch to .L512_or_more_neon. Otherwise each
// .Loop_outer_neon iteration produces 256 bytes of keystream: one block in
// the scalar registers (same layout as the scalar routine: w5-w17,w19-w21)
// and three blocks in NEON registers, with the two instruction streams
// interleaved.
//
// NEON register layout:
//   v24       = sigma row, v25,v26 = key rows
//   v27,v28,v29 = counter rows for the three NEON blocks (+1, +2, +3 relative
//               to the scalar block)
//   v31       = {1,0,0,0} at load, shifted left by 2 to become {4,0,0,0}
//   v0-v3, v4-v7, v16-v19 = working rows of the three NEON blocks
//   v20-v23   = temporaries for rotates and for input data
// Only v0-v7 and v16-v31 are touched on this path, so d8-d15 need no saving.
	add	x29,sp,#0

	adrp	x5,.Lsigma
	add	x5,x5,:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64		// 64-byte scratch block for the tail path

	ldp	x22,x23,[x5]		// load sigma
	ld1	{v24.4s},[x5],#16	// x5 now points at .Lone
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]		// v31 = {1,0,0,0}
#ifdef	__ARMEB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	add	v27.4s,v27.4s,v31.4s		// += 1
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	shl	v31.4s,v31.4s,#2			// 1 -> 4

.Loop_outer_neon:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	mov	x4,#10			// 10 iterations, two rounds each
	// The flags set here are consumed by the b.lo/b.hi after the round
	// loop; nothing in between is a flag-setting instruction.
	subs	x2,x2,#256
.Loop_neon:
	// Scalar stream: same column-then-diagonal sequence as the scalar
	// routine. NEON stream: each vector holds one row, so a quarter-round
	// step processes all four columns at once. Rotates are done as
	//   rev32 on .8h lanes          -> rotate by 16
	//   ushr #20 then sli #12       -> rotate left by 12
	//   ushr #24 then sli #8        -> rotate left by 8
	//   ushr #25 then sli #7        -> rotate left by 7
	// The ext #4/#8/#12 groups rotate rows 1-3 by whole lanes to line up
	// the diagonals, and the second group in the iteration rotates them
	// back.
	sub	x4,x4,#1
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,.Loop_neon

	// Add the input state back into all four blocks. Each NEON block adds
	// its own counter row (v27, v28 or v29).
	add	w5,w5,w22		// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	// Flags from subs x2,x2,#256: fewer than 256 bytes were left.
	b.lo	.Ltail_neon

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	// v0-v3 are free once stored, so they are reused to hold the input for
	// the fourth block.
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	// Still the flags from subs x2,x2,#256: loop while bytes remain.
	b.hi	.Loop_outer_neon

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.Ltail_neon:
	// Fewer than 256 bytes remain. Consume the four prepared blocks in
	// order (scalar, v0-v3, v4-v7, v16-v19), 64 bytes at a time, then hand
	// any partial block to the bytewise loop.
	add	x2,x2,#256		// undo the subs: x2 = bytes left (1..255)
	cmp	x2,#64
	// Under 64 bytes: the shared scalar tail in GFp_ChaCha20_ctr32 packs
	// w5-w21 and finishes the job.
	b.lo	.Less_than_64

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	// Flags are still those of the preceding cmp x2,#64: exactly 64 left.
	b.eq	.Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64

	// Each path below spills the next unused keystream block to the stack
	// scratch for the bytewise loop.
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
	b	.Last_neon

.Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
	b	.Last_neon
.Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	// x2 = bytes left (under 64), keystream block at [sp]. Same biased
	// pointer and negative index scheme as the scalar tail loop.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

.Loop_tail_neon:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail_neon

	// Overwrite the spilled keystream before releasing the stack.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
// ChaCha20_512_neon: its body continues past this point. ChaCha20_neon
// branches to the interior label .L512_or_more_neon when x2 >= 512.
// NOTE(review): no branch to the ChaCha20_512_neon label itself appears in
// the visible code; confirm whether this prologue is ever executed.
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
825 add x29,sp,#0 826 827 adrp x5,.Lsigma 828 add x5,x5,:lo12:.Lsigma 829 stp x19,x20,[sp,#16] 830 stp x21,x22,[sp,#32] 831 stp x23,x24,[sp,#48] 832 stp x25,x26,[sp,#64] 833 stp x27,x28,[sp,#80] 834 835.L512_or_more_neon: 836 sub sp,sp,#128+64 837 838 ldp x22,x23,[x5] // load sigma 839 ld1 {v24.4s},[x5],#16 840 ldp x24,x25,[x3] // load key 841 ldp x26,x27,[x3,#16] 842 ld1 {v25.4s,v26.4s},[x3] 843 ldp x28,x30,[x4] // load counter 844 ld1 {v27.4s},[x4] 845 ld1 {v31.4s},[x5] 846#ifdef __ARMEB__ 847 rev64 v24.4s,v24.4s 848 ror x24,x24,#32 849 ror x25,x25,#32 850 ror x26,x26,#32 851 ror x27,x27,#32 852 ror x28,x28,#32 853 ror x30,x30,#32 854#endif 855 add v27.4s,v27.4s,v31.4s // += 1 856 stp q24,q25,[sp,#0] // off-load key block, invariant part 857 add v27.4s,v27.4s,v31.4s // not typo 858 str q26,[sp,#32] 859 add v28.4s,v27.4s,v31.4s 860 add v29.4s,v28.4s,v31.4s 861 add v30.4s,v29.4s,v31.4s 862 shl v31.4s,v31.4s,#2 // 1 -> 4 863 864 stp d8,d9,[sp,#128+0] // meet ABI requirements 865 stp d10,d11,[sp,#128+16] 866 stp d12,d13,[sp,#128+32] 867 stp d14,d15,[sp,#128+48] 868 869 sub x2,x2,#512 // not typo 870 871.Loop_outer_512_neon: 872 mov v0.16b,v24.16b 873 mov v4.16b,v24.16b 874 mov v8.16b,v24.16b 875 mov v12.16b,v24.16b 876 mov v16.16b,v24.16b 877 mov v20.16b,v24.16b 878 mov v1.16b,v25.16b 879 mov w5,w22 // unpack key block 880 mov v5.16b,v25.16b 881 lsr x6,x22,#32 882 mov v9.16b,v25.16b 883 mov w7,w23 884 mov v13.16b,v25.16b 885 lsr x8,x23,#32 886 mov v17.16b,v25.16b 887 mov w9,w24 888 mov v21.16b,v25.16b 889 lsr x10,x24,#32 890 mov v3.16b,v27.16b 891 mov w11,w25 892 mov v7.16b,v28.16b 893 lsr x12,x25,#32 894 mov v11.16b,v29.16b 895 mov w13,w26 896 mov v15.16b,v30.16b 897 lsr x14,x26,#32 898 mov v2.16b,v26.16b 899 mov w15,w27 900 mov v6.16b,v26.16b 901 lsr x16,x27,#32 902 add v19.4s,v3.4s,v31.4s // +4 903 mov w17,w28 904 add v23.4s,v7.4s,v31.4s // +4 905 lsr x19,x28,#32 906 mov v10.16b,v26.16b 907 mov w20,w30 908 mov v14.16b,v26.16b 909 lsr x21,x30,#32 910 mov 
v18.16b,v26.16b 911 stp q27,q28,[sp,#48] // off-load key block, variable part 912 mov v22.16b,v26.16b 913 str q29,[sp,#80] 914 915 mov x4,#5 916 subs x2,x2,#512 917.Loop_upper_neon: 918 sub x4,x4,#1 919 add v0.4s,v0.4s,v1.4s 920 add w5,w5,w9 921 add v4.4s,v4.4s,v5.4s 922 add w6,w6,w10 923 add v8.4s,v8.4s,v9.4s 924 add w7,w7,w11 925 add v12.4s,v12.4s,v13.4s 926 add w8,w8,w12 927 add v16.4s,v16.4s,v17.4s 928 eor w17,w17,w5 929 add v20.4s,v20.4s,v21.4s 930 eor w19,w19,w6 931 eor v3.16b,v3.16b,v0.16b 932 eor w20,w20,w7 933 eor v7.16b,v7.16b,v4.16b 934 eor w21,w21,w8 935 eor v11.16b,v11.16b,v8.16b 936 ror w17,w17,#16 937 eor v15.16b,v15.16b,v12.16b 938 ror w19,w19,#16 939 eor v19.16b,v19.16b,v16.16b 940 ror w20,w20,#16 941 eor v23.16b,v23.16b,v20.16b 942 ror w21,w21,#16 943 rev32 v3.8h,v3.8h 944 add w13,w13,w17 945 rev32 v7.8h,v7.8h 946 add w14,w14,w19 947 rev32 v11.8h,v11.8h 948 add w15,w15,w20 949 rev32 v15.8h,v15.8h 950 add w16,w16,w21 951 rev32 v19.8h,v19.8h 952 eor w9,w9,w13 953 rev32 v23.8h,v23.8h 954 eor w10,w10,w14 955 add v2.4s,v2.4s,v3.4s 956 eor w11,w11,w15 957 add v6.4s,v6.4s,v7.4s 958 eor w12,w12,w16 959 add v10.4s,v10.4s,v11.4s 960 ror w9,w9,#20 961 add v14.4s,v14.4s,v15.4s 962 ror w10,w10,#20 963 add v18.4s,v18.4s,v19.4s 964 ror w11,w11,#20 965 add v22.4s,v22.4s,v23.4s 966 ror w12,w12,#20 967 eor v24.16b,v1.16b,v2.16b 968 add w5,w5,w9 969 eor v25.16b,v5.16b,v6.16b 970 add w6,w6,w10 971 eor v26.16b,v9.16b,v10.16b 972 add w7,w7,w11 973 eor v27.16b,v13.16b,v14.16b 974 add w8,w8,w12 975 eor v28.16b,v17.16b,v18.16b 976 eor w17,w17,w5 977 eor v29.16b,v21.16b,v22.16b 978 eor w19,w19,w6 979 ushr v1.4s,v24.4s,#20 980 eor w20,w20,w7 981 ushr v5.4s,v25.4s,#20 982 eor w21,w21,w8 983 ushr v9.4s,v26.4s,#20 984 ror w17,w17,#24 985 ushr v13.4s,v27.4s,#20 986 ror w19,w19,#24 987 ushr v17.4s,v28.4s,#20 988 ror w20,w20,#24 989 ushr v21.4s,v29.4s,#20 990 ror w21,w21,#24 991 sli v1.4s,v24.4s,#12 992 add w13,w13,w17 993 sli v5.4s,v25.4s,#12 994 add w14,w14,w19 995 sli 
v9.4s,v26.4s,#12 996 add w15,w15,w20 997 sli v13.4s,v27.4s,#12 998 add w16,w16,w21 999 sli v17.4s,v28.4s,#12 1000 eor w9,w9,w13 1001 sli v21.4s,v29.4s,#12 1002 eor w10,w10,w14 1003 add v0.4s,v0.4s,v1.4s 1004 eor w11,w11,w15 1005 add v4.4s,v4.4s,v5.4s 1006 eor w12,w12,w16 1007 add v8.4s,v8.4s,v9.4s 1008 ror w9,w9,#25 1009 add v12.4s,v12.4s,v13.4s 1010 ror w10,w10,#25 1011 add v16.4s,v16.4s,v17.4s 1012 ror w11,w11,#25 1013 add v20.4s,v20.4s,v21.4s 1014 ror w12,w12,#25 1015 eor v24.16b,v3.16b,v0.16b 1016 add w5,w5,w10 1017 eor v25.16b,v7.16b,v4.16b 1018 add w6,w6,w11 1019 eor v26.16b,v11.16b,v8.16b 1020 add w7,w7,w12 1021 eor v27.16b,v15.16b,v12.16b 1022 add w8,w8,w9 1023 eor v28.16b,v19.16b,v16.16b 1024 eor w21,w21,w5 1025 eor v29.16b,v23.16b,v20.16b 1026 eor w17,w17,w6 1027 ushr v3.4s,v24.4s,#24 1028 eor w19,w19,w7 1029 ushr v7.4s,v25.4s,#24 1030 eor w20,w20,w8 1031 ushr v11.4s,v26.4s,#24 1032 ror w21,w21,#16 1033 ushr v15.4s,v27.4s,#24 1034 ror w17,w17,#16 1035 ushr v19.4s,v28.4s,#24 1036 ror w19,w19,#16 1037 ushr v23.4s,v29.4s,#24 1038 ror w20,w20,#16 1039 sli v3.4s,v24.4s,#8 1040 add w15,w15,w21 1041 sli v7.4s,v25.4s,#8 1042 add w16,w16,w17 1043 sli v11.4s,v26.4s,#8 1044 add w13,w13,w19 1045 sli v15.4s,v27.4s,#8 1046 add w14,w14,w20 1047 sli v19.4s,v28.4s,#8 1048 eor w10,w10,w15 1049 sli v23.4s,v29.4s,#8 1050 eor w11,w11,w16 1051 add v2.4s,v2.4s,v3.4s 1052 eor w12,w12,w13 1053 add v6.4s,v6.4s,v7.4s 1054 eor w9,w9,w14 1055 add v10.4s,v10.4s,v11.4s 1056 ror w10,w10,#20 1057 add v14.4s,v14.4s,v15.4s 1058 ror w11,w11,#20 1059 add v18.4s,v18.4s,v19.4s 1060 ror w12,w12,#20 1061 add v22.4s,v22.4s,v23.4s 1062 ror w9,w9,#20 1063 eor v24.16b,v1.16b,v2.16b 1064 add w5,w5,w10 1065 eor v25.16b,v5.16b,v6.16b 1066 add w6,w6,w11 1067 eor v26.16b,v9.16b,v10.16b 1068 add w7,w7,w12 1069 eor v27.16b,v13.16b,v14.16b 1070 add w8,w8,w9 1071 eor v28.16b,v17.16b,v18.16b 1072 eor w21,w21,w5 1073 eor v29.16b,v21.16b,v22.16b 1074 eor w17,w17,w6 1075 ushr v1.4s,v24.4s,#25 1076 eor w19,w19,w7 
1077 ushr v5.4s,v25.4s,#25 1078 eor w20,w20,w8 1079 ushr v9.4s,v26.4s,#25 1080 ror w21,w21,#24 1081 ushr v13.4s,v27.4s,#25 1082 ror w17,w17,#24 1083 ushr v17.4s,v28.4s,#25 1084 ror w19,w19,#24 1085 ushr v21.4s,v29.4s,#25 1086 ror w20,w20,#24 1087 sli v1.4s,v24.4s,#7 1088 add w15,w15,w21 1089 sli v5.4s,v25.4s,#7 1090 add w16,w16,w17 1091 sli v9.4s,v26.4s,#7 1092 add w13,w13,w19 1093 sli v13.4s,v27.4s,#7 1094 add w14,w14,w20 1095 sli v17.4s,v28.4s,#7 1096 eor w10,w10,w15 1097 sli v21.4s,v29.4s,#7 1098 eor w11,w11,w16 1099 ext v2.16b,v2.16b,v2.16b,#8 1100 eor w12,w12,w13 1101 ext v6.16b,v6.16b,v6.16b,#8 1102 eor w9,w9,w14 1103 ext v10.16b,v10.16b,v10.16b,#8 1104 ror w10,w10,#25 1105 ext v14.16b,v14.16b,v14.16b,#8 1106 ror w11,w11,#25 1107 ext v18.16b,v18.16b,v18.16b,#8 1108 ror w12,w12,#25 1109 ext v22.16b,v22.16b,v22.16b,#8 1110 ror w9,w9,#25 1111 ext v3.16b,v3.16b,v3.16b,#12 1112 ext v7.16b,v7.16b,v7.16b,#12 1113 ext v11.16b,v11.16b,v11.16b,#12 1114 ext v15.16b,v15.16b,v15.16b,#12 1115 ext v19.16b,v19.16b,v19.16b,#12 1116 ext v23.16b,v23.16b,v23.16b,#12 1117 ext v1.16b,v1.16b,v1.16b,#4 1118 ext v5.16b,v5.16b,v5.16b,#4 1119 ext v9.16b,v9.16b,v9.16b,#4 1120 ext v13.16b,v13.16b,v13.16b,#4 1121 ext v17.16b,v17.16b,v17.16b,#4 1122 ext v21.16b,v21.16b,v21.16b,#4 1123 add v0.4s,v0.4s,v1.4s 1124 add w5,w5,w9 1125 add v4.4s,v4.4s,v5.4s 1126 add w6,w6,w10 1127 add v8.4s,v8.4s,v9.4s 1128 add w7,w7,w11 1129 add v12.4s,v12.4s,v13.4s 1130 add w8,w8,w12 1131 add v16.4s,v16.4s,v17.4s 1132 eor w17,w17,w5 1133 add v20.4s,v20.4s,v21.4s 1134 eor w19,w19,w6 1135 eor v3.16b,v3.16b,v0.16b 1136 eor w20,w20,w7 1137 eor v7.16b,v7.16b,v4.16b 1138 eor w21,w21,w8 1139 eor v11.16b,v11.16b,v8.16b 1140 ror w17,w17,#16 1141 eor v15.16b,v15.16b,v12.16b 1142 ror w19,w19,#16 1143 eor v19.16b,v19.16b,v16.16b 1144 ror w20,w20,#16 1145 eor v23.16b,v23.16b,v20.16b 1146 ror w21,w21,#16 1147 rev32 v3.8h,v3.8h 1148 add w13,w13,w17 1149 rev32 v7.8h,v7.8h 1150 add w14,w14,w19 1151 rev32 v11.8h,v11.8h 1152 add 
w15,w15,w20 1153 rev32 v15.8h,v15.8h 1154 add w16,w16,w21 1155 rev32 v19.8h,v19.8h 1156 eor w9,w9,w13 1157 rev32 v23.8h,v23.8h 1158 eor w10,w10,w14 1159 add v2.4s,v2.4s,v3.4s 1160 eor w11,w11,w15 1161 add v6.4s,v6.4s,v7.4s 1162 eor w12,w12,w16 1163 add v10.4s,v10.4s,v11.4s 1164 ror w9,w9,#20 1165 add v14.4s,v14.4s,v15.4s 1166 ror w10,w10,#20 1167 add v18.4s,v18.4s,v19.4s 1168 ror w11,w11,#20 1169 add v22.4s,v22.4s,v23.4s 1170 ror w12,w12,#20 1171 eor v24.16b,v1.16b,v2.16b 1172 add w5,w5,w9 1173 eor v25.16b,v5.16b,v6.16b 1174 add w6,w6,w10 1175 eor v26.16b,v9.16b,v10.16b 1176 add w7,w7,w11 1177 eor v27.16b,v13.16b,v14.16b 1178 add w8,w8,w12 1179 eor v28.16b,v17.16b,v18.16b 1180 eor w17,w17,w5 1181 eor v29.16b,v21.16b,v22.16b 1182 eor w19,w19,w6 1183 ushr v1.4s,v24.4s,#20 1184 eor w20,w20,w7 1185 ushr v5.4s,v25.4s,#20 1186 eor w21,w21,w8 1187 ushr v9.4s,v26.4s,#20 1188 ror w17,w17,#24 1189 ushr v13.4s,v27.4s,#20 1190 ror w19,w19,#24 1191 ushr v17.4s,v28.4s,#20 1192 ror w20,w20,#24 1193 ushr v21.4s,v29.4s,#20 1194 ror w21,w21,#24 1195 sli v1.4s,v24.4s,#12 1196 add w13,w13,w17 1197 sli v5.4s,v25.4s,#12 1198 add w14,w14,w19 1199 sli v9.4s,v26.4s,#12 1200 add w15,w15,w20 1201 sli v13.4s,v27.4s,#12 1202 add w16,w16,w21 1203 sli v17.4s,v28.4s,#12 1204 eor w9,w9,w13 1205 sli v21.4s,v29.4s,#12 1206 eor w10,w10,w14 1207 add v0.4s,v0.4s,v1.4s 1208 eor w11,w11,w15 1209 add v4.4s,v4.4s,v5.4s 1210 eor w12,w12,w16 1211 add v8.4s,v8.4s,v9.4s 1212 ror w9,w9,#25 1213 add v12.4s,v12.4s,v13.4s 1214 ror w10,w10,#25 1215 add v16.4s,v16.4s,v17.4s 1216 ror w11,w11,#25 1217 add v20.4s,v20.4s,v21.4s 1218 ror w12,w12,#25 1219 eor v24.16b,v3.16b,v0.16b 1220 add w5,w5,w10 1221 eor v25.16b,v7.16b,v4.16b 1222 add w6,w6,w11 1223 eor v26.16b,v11.16b,v8.16b 1224 add w7,w7,w12 1225 eor v27.16b,v15.16b,v12.16b 1226 add w8,w8,w9 1227 eor v28.16b,v19.16b,v16.16b 1228 eor w21,w21,w5 1229 eor v29.16b,v23.16b,v20.16b 1230 eor w17,w17,w6 1231 ushr v3.4s,v24.4s,#24 1232 eor w19,w19,w7 1233 ushr 
v7.4s,v25.4s,#24 1234 eor w20,w20,w8 1235 ushr v11.4s,v26.4s,#24 1236 ror w21,w21,#16 1237 ushr v15.4s,v27.4s,#24 1238 ror w17,w17,#16 1239 ushr v19.4s,v28.4s,#24 1240 ror w19,w19,#16 1241 ushr v23.4s,v29.4s,#24 1242 ror w20,w20,#16 1243 sli v3.4s,v24.4s,#8 1244 add w15,w15,w21 1245 sli v7.4s,v25.4s,#8 1246 add w16,w16,w17 1247 sli v11.4s,v26.4s,#8 1248 add w13,w13,w19 1249 sli v15.4s,v27.4s,#8 1250 add w14,w14,w20 1251 sli v19.4s,v28.4s,#8 1252 eor w10,w10,w15 1253 sli v23.4s,v29.4s,#8 1254 eor w11,w11,w16 1255 add v2.4s,v2.4s,v3.4s 1256 eor w12,w12,w13 1257 add v6.4s,v6.4s,v7.4s 1258 eor w9,w9,w14 1259 add v10.4s,v10.4s,v11.4s 1260 ror w10,w10,#20 1261 add v14.4s,v14.4s,v15.4s 1262 ror w11,w11,#20 1263 add v18.4s,v18.4s,v19.4s 1264 ror w12,w12,#20 1265 add v22.4s,v22.4s,v23.4s 1266 ror w9,w9,#20 1267 eor v24.16b,v1.16b,v2.16b 1268 add w5,w5,w10 1269 eor v25.16b,v5.16b,v6.16b 1270 add w6,w6,w11 1271 eor v26.16b,v9.16b,v10.16b 1272 add w7,w7,w12 1273 eor v27.16b,v13.16b,v14.16b 1274 add w8,w8,w9 1275 eor v28.16b,v17.16b,v18.16b 1276 eor w21,w21,w5 1277 eor v29.16b,v21.16b,v22.16b 1278 eor w17,w17,w6 1279 ushr v1.4s,v24.4s,#25 1280 eor w19,w19,w7 1281 ushr v5.4s,v25.4s,#25 1282 eor w20,w20,w8 1283 ushr v9.4s,v26.4s,#25 1284 ror w21,w21,#24 1285 ushr v13.4s,v27.4s,#25 1286 ror w17,w17,#24 1287 ushr v17.4s,v28.4s,#25 1288 ror w19,w19,#24 1289 ushr v21.4s,v29.4s,#25 1290 ror w20,w20,#24 1291 sli v1.4s,v24.4s,#7 1292 add w15,w15,w21 1293 sli v5.4s,v25.4s,#7 1294 add w16,w16,w17 1295 sli v9.4s,v26.4s,#7 1296 add w13,w13,w19 1297 sli v13.4s,v27.4s,#7 1298 add w14,w14,w20 1299 sli v17.4s,v28.4s,#7 1300 eor w10,w10,w15 1301 sli v21.4s,v29.4s,#7 1302 eor w11,w11,w16 1303 ext v2.16b,v2.16b,v2.16b,#8 1304 eor w12,w12,w13 1305 ext v6.16b,v6.16b,v6.16b,#8 1306 eor w9,w9,w14 1307 ext v10.16b,v10.16b,v10.16b,#8 1308 ror w10,w10,#25 1309 ext v14.16b,v14.16b,v14.16b,#8 1310 ror w11,w11,#25 1311 ext v18.16b,v18.16b,v18.16b,#8 1312 ror w12,w12,#25 1313 ext v22.16b,v22.16b,v22.16b,#8 
1314 ror w9,w9,#25 1315 ext v3.16b,v3.16b,v3.16b,#4 1316 ext v7.16b,v7.16b,v7.16b,#4 1317 ext v11.16b,v11.16b,v11.16b,#4 1318 ext v15.16b,v15.16b,v15.16b,#4 1319 ext v19.16b,v19.16b,v19.16b,#4 1320 ext v23.16b,v23.16b,v23.16b,#4 1321 ext v1.16b,v1.16b,v1.16b,#12 1322 ext v5.16b,v5.16b,v5.16b,#12 1323 ext v9.16b,v9.16b,v9.16b,#12 1324 ext v13.16b,v13.16b,v13.16b,#12 1325 ext v17.16b,v17.16b,v17.16b,#12 1326 ext v21.16b,v21.16b,v21.16b,#12 1327 cbnz x4,.Loop_upper_neon 1328 1329 add w5,w5,w22 // accumulate key block 1330 add x6,x6,x22,lsr#32 1331 add w7,w7,w23 1332 add x8,x8,x23,lsr#32 1333 add w9,w9,w24 1334 add x10,x10,x24,lsr#32 1335 add w11,w11,w25 1336 add x12,x12,x25,lsr#32 1337 add w13,w13,w26 1338 add x14,x14,x26,lsr#32 1339 add w15,w15,w27 1340 add x16,x16,x27,lsr#32 1341 add w17,w17,w28 1342 add x19,x19,x28,lsr#32 1343 add w20,w20,w30 1344 add x21,x21,x30,lsr#32 1345 1346 add x5,x5,x6,lsl#32 // pack 1347 add x7,x7,x8,lsl#32 1348 ldp x6,x8,[x1,#0] // load input 1349 add x9,x9,x10,lsl#32 1350 add x11,x11,x12,lsl#32 1351 ldp x10,x12,[x1,#16] 1352 add x13,x13,x14,lsl#32 1353 add x15,x15,x16,lsl#32 1354 ldp x14,x16,[x1,#32] 1355 add x17,x17,x19,lsl#32 1356 add x20,x20,x21,lsl#32 1357 ldp x19,x21,[x1,#48] 1358 add x1,x1,#64 1359#ifdef __ARMEB__ 1360 rev x5,x5 1361 rev x7,x7 1362 rev x9,x9 1363 rev x11,x11 1364 rev x13,x13 1365 rev x15,x15 1366 rev x17,x17 1367 rev x20,x20 1368#endif 1369 eor x5,x5,x6 1370 eor x7,x7,x8 1371 eor x9,x9,x10 1372 eor x11,x11,x12 1373 eor x13,x13,x14 1374 eor x15,x15,x16 1375 eor x17,x17,x19 1376 eor x20,x20,x21 1377 1378 stp x5,x7,[x0,#0] // store output 1379 add x28,x28,#1 // increment counter 1380 mov w5,w22 // unpack key block 1381 lsr x6,x22,#32 1382 stp x9,x11,[x0,#16] 1383 mov w7,w23 1384 lsr x8,x23,#32 1385 stp x13,x15,[x0,#32] 1386 mov w9,w24 1387 lsr x10,x24,#32 1388 stp x17,x20,[x0,#48] 1389 add x0,x0,#64 1390 mov w11,w25 1391 lsr x12,x25,#32 1392 mov w13,w26 1393 lsr x14,x26,#32 1394 mov w15,w27 1395 lsr x16,x27,#32 1396 
mov w17,w28 1397 lsr x19,x28,#32 1398 mov w20,w30 1399 lsr x21,x30,#32 1400 1401 mov x4,#5 1402.Loop_lower_neon: 1403 sub x4,x4,#1 1404 add v0.4s,v0.4s,v1.4s 1405 add w5,w5,w9 1406 add v4.4s,v4.4s,v5.4s 1407 add w6,w6,w10 1408 add v8.4s,v8.4s,v9.4s 1409 add w7,w7,w11 1410 add v12.4s,v12.4s,v13.4s 1411 add w8,w8,w12 1412 add v16.4s,v16.4s,v17.4s 1413 eor w17,w17,w5 1414 add v20.4s,v20.4s,v21.4s 1415 eor w19,w19,w6 1416 eor v3.16b,v3.16b,v0.16b 1417 eor w20,w20,w7 1418 eor v7.16b,v7.16b,v4.16b 1419 eor w21,w21,w8 1420 eor v11.16b,v11.16b,v8.16b 1421 ror w17,w17,#16 1422 eor v15.16b,v15.16b,v12.16b 1423 ror w19,w19,#16 1424 eor v19.16b,v19.16b,v16.16b 1425 ror w20,w20,#16 1426 eor v23.16b,v23.16b,v20.16b 1427 ror w21,w21,#16 1428 rev32 v3.8h,v3.8h 1429 add w13,w13,w17 1430 rev32 v7.8h,v7.8h 1431 add w14,w14,w19 1432 rev32 v11.8h,v11.8h 1433 add w15,w15,w20 1434 rev32 v15.8h,v15.8h 1435 add w16,w16,w21 1436 rev32 v19.8h,v19.8h 1437 eor w9,w9,w13 1438 rev32 v23.8h,v23.8h 1439 eor w10,w10,w14 1440 add v2.4s,v2.4s,v3.4s 1441 eor w11,w11,w15 1442 add v6.4s,v6.4s,v7.4s 1443 eor w12,w12,w16 1444 add v10.4s,v10.4s,v11.4s 1445 ror w9,w9,#20 1446 add v14.4s,v14.4s,v15.4s 1447 ror w10,w10,#20 1448 add v18.4s,v18.4s,v19.4s 1449 ror w11,w11,#20 1450 add v22.4s,v22.4s,v23.4s 1451 ror w12,w12,#20 1452 eor v24.16b,v1.16b,v2.16b 1453 add w5,w5,w9 1454 eor v25.16b,v5.16b,v6.16b 1455 add w6,w6,w10 1456 eor v26.16b,v9.16b,v10.16b 1457 add w7,w7,w11 1458 eor v27.16b,v13.16b,v14.16b 1459 add w8,w8,w12 1460 eor v28.16b,v17.16b,v18.16b 1461 eor w17,w17,w5 1462 eor v29.16b,v21.16b,v22.16b 1463 eor w19,w19,w6 1464 ushr v1.4s,v24.4s,#20 1465 eor w20,w20,w7 1466 ushr v5.4s,v25.4s,#20 1467 eor w21,w21,w8 1468 ushr v9.4s,v26.4s,#20 1469 ror w17,w17,#24 1470 ushr v13.4s,v27.4s,#20 1471 ror w19,w19,#24 1472 ushr v17.4s,v28.4s,#20 1473 ror w20,w20,#24 1474 ushr v21.4s,v29.4s,#20 1475 ror w21,w21,#24 1476 sli v1.4s,v24.4s,#12 1477 add w13,w13,w17 1478 sli v5.4s,v25.4s,#12 1479 add w14,w14,w19 1480 sli 
v9.4s,v26.4s,#12 1481 add w15,w15,w20 1482 sli v13.4s,v27.4s,#12 1483 add w16,w16,w21 1484 sli v17.4s,v28.4s,#12 1485 eor w9,w9,w13 1486 sli v21.4s,v29.4s,#12 1487 eor w10,w10,w14 1488 add v0.4s,v0.4s,v1.4s 1489 eor w11,w11,w15 1490 add v4.4s,v4.4s,v5.4s 1491 eor w12,w12,w16 1492 add v8.4s,v8.4s,v9.4s 1493 ror w9,w9,#25 1494 add v12.4s,v12.4s,v13.4s 1495 ror w10,w10,#25 1496 add v16.4s,v16.4s,v17.4s 1497 ror w11,w11,#25 1498 add v20.4s,v20.4s,v21.4s 1499 ror w12,w12,#25 1500 eor v24.16b,v3.16b,v0.16b 1501 add w5,w5,w10 1502 eor v25.16b,v7.16b,v4.16b 1503 add w6,w6,w11 1504 eor v26.16b,v11.16b,v8.16b 1505 add w7,w7,w12 1506 eor v27.16b,v15.16b,v12.16b 1507 add w8,w8,w9 1508 eor v28.16b,v19.16b,v16.16b 1509 eor w21,w21,w5 1510 eor v29.16b,v23.16b,v20.16b 1511 eor w17,w17,w6 1512 ushr v3.4s,v24.4s,#24 1513 eor w19,w19,w7 1514 ushr v7.4s,v25.4s,#24 1515 eor w20,w20,w8 1516 ushr v11.4s,v26.4s,#24 1517 ror w21,w21,#16 1518 ushr v15.4s,v27.4s,#24 1519 ror w17,w17,#16 1520 ushr v19.4s,v28.4s,#24 1521 ror w19,w19,#16 1522 ushr v23.4s,v29.4s,#24 1523 ror w20,w20,#16 1524 sli v3.4s,v24.4s,#8 1525 add w15,w15,w21 1526 sli v7.4s,v25.4s,#8 1527 add w16,w16,w17 1528 sli v11.4s,v26.4s,#8 1529 add w13,w13,w19 1530 sli v15.4s,v27.4s,#8 1531 add w14,w14,w20 1532 sli v19.4s,v28.4s,#8 1533 eor w10,w10,w15 1534 sli v23.4s,v29.4s,#8 1535 eor w11,w11,w16 1536 add v2.4s,v2.4s,v3.4s 1537 eor w12,w12,w13 1538 add v6.4s,v6.4s,v7.4s 1539 eor w9,w9,w14 1540 add v10.4s,v10.4s,v11.4s 1541 ror w10,w10,#20 1542 add v14.4s,v14.4s,v15.4s 1543 ror w11,w11,#20 1544 add v18.4s,v18.4s,v19.4s 1545 ror w12,w12,#20 1546 add v22.4s,v22.4s,v23.4s 1547 ror w9,w9,#20 1548 eor v24.16b,v1.16b,v2.16b 1549 add w5,w5,w10 1550 eor v25.16b,v5.16b,v6.16b 1551 add w6,w6,w11 1552 eor v26.16b,v9.16b,v10.16b 1553 add w7,w7,w12 1554 eor v27.16b,v13.16b,v14.16b 1555 add w8,w8,w9 1556 eor v28.16b,v17.16b,v18.16b 1557 eor w21,w21,w5 1558 eor v29.16b,v21.16b,v22.16b 1559 eor w17,w17,w6 1560 ushr v1.4s,v24.4s,#25 1561 eor 
w19,w19,w7 1562 ushr v5.4s,v25.4s,#25 1563 eor w20,w20,w8 1564 ushr v9.4s,v26.4s,#25 1565 ror w21,w21,#24 1566 ushr v13.4s,v27.4s,#25 1567 ror w17,w17,#24 1568 ushr v17.4s,v28.4s,#25 1569 ror w19,w19,#24 1570 ushr v21.4s,v29.4s,#25 1571 ror w20,w20,#24 1572 sli v1.4s,v24.4s,#7 1573 add w15,w15,w21 1574 sli v5.4s,v25.4s,#7 1575 add w16,w16,w17 1576 sli v9.4s,v26.4s,#7 1577 add w13,w13,w19 1578 sli v13.4s,v27.4s,#7 1579 add w14,w14,w20 1580 sli v17.4s,v28.4s,#7 1581 eor w10,w10,w15 1582 sli v21.4s,v29.4s,#7 1583 eor w11,w11,w16 1584 ext v2.16b,v2.16b,v2.16b,#8 1585 eor w12,w12,w13 1586 ext v6.16b,v6.16b,v6.16b,#8 1587 eor w9,w9,w14 1588 ext v10.16b,v10.16b,v10.16b,#8 1589 ror w10,w10,#25 1590 ext v14.16b,v14.16b,v14.16b,#8 1591 ror w11,w11,#25 1592 ext v18.16b,v18.16b,v18.16b,#8 1593 ror w12,w12,#25 1594 ext v22.16b,v22.16b,v22.16b,#8 1595 ror w9,w9,#25 1596 ext v3.16b,v3.16b,v3.16b,#12 1597 ext v7.16b,v7.16b,v7.16b,#12 1598 ext v11.16b,v11.16b,v11.16b,#12 1599 ext v15.16b,v15.16b,v15.16b,#12 1600 ext v19.16b,v19.16b,v19.16b,#12 1601 ext v23.16b,v23.16b,v23.16b,#12 1602 ext v1.16b,v1.16b,v1.16b,#4 1603 ext v5.16b,v5.16b,v5.16b,#4 1604 ext v9.16b,v9.16b,v9.16b,#4 1605 ext v13.16b,v13.16b,v13.16b,#4 1606 ext v17.16b,v17.16b,v17.16b,#4 1607 ext v21.16b,v21.16b,v21.16b,#4 1608 add v0.4s,v0.4s,v1.4s 1609 add w5,w5,w9 1610 add v4.4s,v4.4s,v5.4s 1611 add w6,w6,w10 1612 add v8.4s,v8.4s,v9.4s 1613 add w7,w7,w11 1614 add v12.4s,v12.4s,v13.4s 1615 add w8,w8,w12 1616 add v16.4s,v16.4s,v17.4s 1617 eor w17,w17,w5 1618 add v20.4s,v20.4s,v21.4s 1619 eor w19,w19,w6 1620 eor v3.16b,v3.16b,v0.16b 1621 eor w20,w20,w7 1622 eor v7.16b,v7.16b,v4.16b 1623 eor w21,w21,w8 1624 eor v11.16b,v11.16b,v8.16b 1625 ror w17,w17,#16 1626 eor v15.16b,v15.16b,v12.16b 1627 ror w19,w19,#16 1628 eor v19.16b,v19.16b,v16.16b 1629 ror w20,w20,#16 1630 eor v23.16b,v23.16b,v20.16b 1631 ror w21,w21,#16 1632 rev32 v3.8h,v3.8h 1633 add w13,w13,w17 1634 rev32 v7.8h,v7.8h 1635 add w14,w14,w19 1636 rev32 
v11.8h,v11.8h 1637 add w15,w15,w20 1638 rev32 v15.8h,v15.8h 1639 add w16,w16,w21 1640 rev32 v19.8h,v19.8h 1641 eor w9,w9,w13 1642 rev32 v23.8h,v23.8h 1643 eor w10,w10,w14 1644 add v2.4s,v2.4s,v3.4s 1645 eor w11,w11,w15 1646 add v6.4s,v6.4s,v7.4s 1647 eor w12,w12,w16 1648 add v10.4s,v10.4s,v11.4s 1649 ror w9,w9,#20 1650 add v14.4s,v14.4s,v15.4s 1651 ror w10,w10,#20 1652 add v18.4s,v18.4s,v19.4s 1653 ror w11,w11,#20 1654 add v22.4s,v22.4s,v23.4s 1655 ror w12,w12,#20 1656 eor v24.16b,v1.16b,v2.16b 1657 add w5,w5,w9 1658 eor v25.16b,v5.16b,v6.16b 1659 add w6,w6,w10 1660 eor v26.16b,v9.16b,v10.16b 1661 add w7,w7,w11 1662 eor v27.16b,v13.16b,v14.16b 1663 add w8,w8,w12 1664 eor v28.16b,v17.16b,v18.16b 1665 eor w17,w17,w5 1666 eor v29.16b,v21.16b,v22.16b 1667 eor w19,w19,w6 1668 ushr v1.4s,v24.4s,#20 1669 eor w20,w20,w7 1670 ushr v5.4s,v25.4s,#20 1671 eor w21,w21,w8 1672 ushr v9.4s,v26.4s,#20 1673 ror w17,w17,#24 1674 ushr v13.4s,v27.4s,#20 1675 ror w19,w19,#24 1676 ushr v17.4s,v28.4s,#20 1677 ror w20,w20,#24 1678 ushr v21.4s,v29.4s,#20 1679 ror w21,w21,#24 1680 sli v1.4s,v24.4s,#12 1681 add w13,w13,w17 1682 sli v5.4s,v25.4s,#12 1683 add w14,w14,w19 1684 sli v9.4s,v26.4s,#12 1685 add w15,w15,w20 1686 sli v13.4s,v27.4s,#12 1687 add w16,w16,w21 1688 sli v17.4s,v28.4s,#12 1689 eor w9,w9,w13 1690 sli v21.4s,v29.4s,#12 1691 eor w10,w10,w14 1692 add v0.4s,v0.4s,v1.4s 1693 eor w11,w11,w15 1694 add v4.4s,v4.4s,v5.4s 1695 eor w12,w12,w16 1696 add v8.4s,v8.4s,v9.4s 1697 ror w9,w9,#25 1698 add v12.4s,v12.4s,v13.4s 1699 ror w10,w10,#25 1700 add v16.4s,v16.4s,v17.4s 1701 ror w11,w11,#25 1702 add v20.4s,v20.4s,v21.4s 1703 ror w12,w12,#25 1704 eor v24.16b,v3.16b,v0.16b 1705 add w5,w5,w10 1706 eor v25.16b,v7.16b,v4.16b 1707 add w6,w6,w11 1708 eor v26.16b,v11.16b,v8.16b 1709 add w7,w7,w12 1710 eor v27.16b,v15.16b,v12.16b 1711 add w8,w8,w9 1712 eor v28.16b,v19.16b,v16.16b 1713 eor w21,w21,w5 1714 eor v29.16b,v23.16b,v20.16b 1715 eor w17,w17,w6 1716 ushr v3.4s,v24.4s,#24 1717 eor w19,w19,w7 
1718 ushr v7.4s,v25.4s,#24 1719 eor w20,w20,w8 1720 ushr v11.4s,v26.4s,#24 1721 ror w21,w21,#16 1722 ushr v15.4s,v27.4s,#24 1723 ror w17,w17,#16 1724 ushr v19.4s,v28.4s,#24 1725 ror w19,w19,#16 1726 ushr v23.4s,v29.4s,#24 1727 ror w20,w20,#16 1728 sli v3.4s,v24.4s,#8 1729 add w15,w15,w21 1730 sli v7.4s,v25.4s,#8 1731 add w16,w16,w17 1732 sli v11.4s,v26.4s,#8 1733 add w13,w13,w19 1734 sli v15.4s,v27.4s,#8 1735 add w14,w14,w20 1736 sli v19.4s,v28.4s,#8 1737 eor w10,w10,w15 1738 sli v23.4s,v29.4s,#8 1739 eor w11,w11,w16 1740 add v2.4s,v2.4s,v3.4s 1741 eor w12,w12,w13 1742 add v6.4s,v6.4s,v7.4s 1743 eor w9,w9,w14 1744 add v10.4s,v10.4s,v11.4s 1745 ror w10,w10,#20 1746 add v14.4s,v14.4s,v15.4s 1747 ror w11,w11,#20 1748 add v18.4s,v18.4s,v19.4s 1749 ror w12,w12,#20 1750 add v22.4s,v22.4s,v23.4s 1751 ror w9,w9,#20 1752 eor v24.16b,v1.16b,v2.16b 1753 add w5,w5,w10 1754 eor v25.16b,v5.16b,v6.16b 1755 add w6,w6,w11 1756 eor v26.16b,v9.16b,v10.16b 1757 add w7,w7,w12 1758 eor v27.16b,v13.16b,v14.16b 1759 add w8,w8,w9 1760 eor v28.16b,v17.16b,v18.16b 1761 eor w21,w21,w5 1762 eor v29.16b,v21.16b,v22.16b 1763 eor w17,w17,w6 1764 ushr v1.4s,v24.4s,#25 1765 eor w19,w19,w7 1766 ushr v5.4s,v25.4s,#25 1767 eor w20,w20,w8 1768 ushr v9.4s,v26.4s,#25 1769 ror w21,w21,#24 1770 ushr v13.4s,v27.4s,#25 1771 ror w17,w17,#24 1772 ushr v17.4s,v28.4s,#25 1773 ror w19,w19,#24 1774 ushr v21.4s,v29.4s,#25 1775 ror w20,w20,#24 1776 sli v1.4s,v24.4s,#7 1777 add w15,w15,w21 1778 sli v5.4s,v25.4s,#7 1779 add w16,w16,w17 1780 sli v9.4s,v26.4s,#7 1781 add w13,w13,w19 1782 sli v13.4s,v27.4s,#7 1783 add w14,w14,w20 1784 sli v17.4s,v28.4s,#7 1785 eor w10,w10,w15 1786 sli v21.4s,v29.4s,#7 1787 eor w11,w11,w16 1788 ext v2.16b,v2.16b,v2.16b,#8 1789 eor w12,w12,w13 1790 ext v6.16b,v6.16b,v6.16b,#8 1791 eor w9,w9,w14 1792 ext v10.16b,v10.16b,v10.16b,#8 1793 ror w10,w10,#25 1794 ext v14.16b,v14.16b,v14.16b,#8 1795 ror w11,w11,#25 1796 ext v18.16b,v18.16b,v18.16b,#8 1797 ror w12,w12,#25 1798 ext 
v22.16b,v22.16b,v22.16b,#8 1799 ror w9,w9,#25 1800 ext v3.16b,v3.16b,v3.16b,#4 1801 ext v7.16b,v7.16b,v7.16b,#4 1802 ext v11.16b,v11.16b,v11.16b,#4 1803 ext v15.16b,v15.16b,v15.16b,#4 1804 ext v19.16b,v19.16b,v19.16b,#4 1805 ext v23.16b,v23.16b,v23.16b,#4 1806 ext v1.16b,v1.16b,v1.16b,#12 1807 ext v5.16b,v5.16b,v5.16b,#12 1808 ext v9.16b,v9.16b,v9.16b,#12 1809 ext v13.16b,v13.16b,v13.16b,#12 1810 ext v17.16b,v17.16b,v17.16b,#12 1811 ext v21.16b,v21.16b,v21.16b,#12 1812 cbnz x4,.Loop_lower_neon 1813 1814 add w5,w5,w22 // accumulate key block 1815 ldp q24,q25,[sp,#0] 1816 add x6,x6,x22,lsr#32 1817 ldp q26,q27,[sp,#32] 1818 add w7,w7,w23 1819 ldp q28,q29,[sp,#64] 1820 add x8,x8,x23,lsr#32 1821 add v0.4s,v0.4s,v24.4s 1822 add w9,w9,w24 1823 add v4.4s,v4.4s,v24.4s 1824 add x10,x10,x24,lsr#32 1825 add v8.4s,v8.4s,v24.4s 1826 add w11,w11,w25 1827 add v12.4s,v12.4s,v24.4s 1828 add x12,x12,x25,lsr#32 1829 add v16.4s,v16.4s,v24.4s 1830 add w13,w13,w26 1831 add v20.4s,v20.4s,v24.4s 1832 add x14,x14,x26,lsr#32 1833 add v2.4s,v2.4s,v26.4s 1834 add w15,w15,w27 1835 add v6.4s,v6.4s,v26.4s 1836 add x16,x16,x27,lsr#32 1837 add v10.4s,v10.4s,v26.4s 1838 add w17,w17,w28 1839 add v14.4s,v14.4s,v26.4s 1840 add x19,x19,x28,lsr#32 1841 add v18.4s,v18.4s,v26.4s 1842 add w20,w20,w30 1843 add v22.4s,v22.4s,v26.4s 1844 add x21,x21,x30,lsr#32 1845 add v19.4s,v19.4s,v31.4s // +4 1846 add x5,x5,x6,lsl#32 // pack 1847 add v23.4s,v23.4s,v31.4s // +4 1848 add x7,x7,x8,lsl#32 1849 add v3.4s,v3.4s,v27.4s 1850 ldp x6,x8,[x1,#0] // load input 1851 add v7.4s,v7.4s,v28.4s 1852 add x9,x9,x10,lsl#32 1853 add v11.4s,v11.4s,v29.4s 1854 add x11,x11,x12,lsl#32 1855 add v15.4s,v15.4s,v30.4s 1856 ldp x10,x12,[x1,#16] 1857 add v19.4s,v19.4s,v27.4s 1858 add x13,x13,x14,lsl#32 1859 add v23.4s,v23.4s,v28.4s 1860 add x15,x15,x16,lsl#32 1861 add v1.4s,v1.4s,v25.4s 1862 ldp x14,x16,[x1,#32] 1863 add v5.4s,v5.4s,v25.4s 1864 add x17,x17,x19,lsl#32 1865 add v9.4s,v9.4s,v25.4s 1866 add x20,x20,x21,lsl#32 1867 add 
v13.4s,v13.4s,v25.4s 1868 ldp x19,x21,[x1,#48] 1869 add v17.4s,v17.4s,v25.4s 1870 add x1,x1,#64 1871 add v21.4s,v21.4s,v25.4s 1872 1873#ifdef __ARMEB__ 1874 rev x5,x5 1875 rev x7,x7 1876 rev x9,x9 1877 rev x11,x11 1878 rev x13,x13 1879 rev x15,x15 1880 rev x17,x17 1881 rev x20,x20 1882#endif 1883 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1884 eor x5,x5,x6 1885 eor x7,x7,x8 1886 eor x9,x9,x10 1887 eor x11,x11,x12 1888 eor x13,x13,x14 1889 eor v0.16b,v0.16b,v24.16b 1890 eor x15,x15,x16 1891 eor v1.16b,v1.16b,v25.16b 1892 eor x17,x17,x19 1893 eor v2.16b,v2.16b,v26.16b 1894 eor x20,x20,x21 1895 eor v3.16b,v3.16b,v27.16b 1896 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1897 1898 stp x5,x7,[x0,#0] // store output 1899 add x28,x28,#7 // increment counter 1900 stp x9,x11,[x0,#16] 1901 stp x13,x15,[x0,#32] 1902 stp x17,x20,[x0,#48] 1903 add x0,x0,#64 1904 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 1905 1906 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1907 eor v4.16b,v4.16b,v24.16b 1908 eor v5.16b,v5.16b,v25.16b 1909 eor v6.16b,v6.16b,v26.16b 1910 eor v7.16b,v7.16b,v27.16b 1911 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 1912 1913 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 1914 eor v8.16b,v8.16b,v0.16b 1915 ldp q24,q25,[sp,#0] 1916 eor v9.16b,v9.16b,v1.16b 1917 ldp q26,q27,[sp,#32] 1918 eor v10.16b,v10.16b,v2.16b 1919 eor v11.16b,v11.16b,v3.16b 1920 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1921 1922 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1923 eor v12.16b,v12.16b,v4.16b 1924 eor v13.16b,v13.16b,v5.16b 1925 eor v14.16b,v14.16b,v6.16b 1926 eor v15.16b,v15.16b,v7.16b 1927 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1928 1929 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1930 eor v16.16b,v16.16b,v8.16b 1931 eor v17.16b,v17.16b,v9.16b 1932 eor v18.16b,v18.16b,v10.16b 1933 eor v19.16b,v19.16b,v11.16b 1934 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1935 1936 shl v0.4s,v31.4s,#1 // 4 -> 8 1937 eor v20.16b,v20.16b,v12.16b 1938 eor v21.16b,v21.16b,v13.16b 1939 
eor v22.16b,v22.16b,v14.16b 1940 eor v23.16b,v23.16b,v15.16b 1941 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1942 1943 add v27.4s,v27.4s,v0.4s // += 8 1944 add v28.4s,v28.4s,v0.4s 1945 add v29.4s,v29.4s,v0.4s 1946 add v30.4s,v30.4s,v0.4s 1947 1948 b.hs .Loop_outer_512_neon 1949 1950 adds x2,x2,#512 1951 ushr v0.4s,v31.4s,#2 // 4 -> 1 1952 1953 ldp d8,d9,[sp,#128+0] // meet ABI requirements 1954 ldp d10,d11,[sp,#128+16] 1955 ldp d12,d13,[sp,#128+32] 1956 ldp d14,d15,[sp,#128+48] 1957 1958 stp q24,q31,[sp,#0] // wipe off-load area 1959 stp q24,q31,[sp,#32] 1960 stp q24,q31,[sp,#64] 1961 1962 b.eq .Ldone_512_neon 1963 1964 cmp x2,#192 1965 sub v27.4s,v27.4s,v0.4s // -= 1 1966 sub v28.4s,v28.4s,v0.4s 1967 sub v29.4s,v29.4s,v0.4s 1968 add sp,sp,#128 1969 b.hs .Loop_outer_neon 1970 1971 eor v25.16b,v25.16b,v25.16b 1972 eor v26.16b,v26.16b,v26.16b 1973 eor v27.16b,v27.16b,v27.16b 1974 eor v28.16b,v28.16b,v28.16b 1975 eor v29.16b,v29.16b,v29.16b 1976 eor v30.16b,v30.16b,v30.16b 1977 b .Loop_outer 1978 1979.Ldone_512_neon: 1980 ldp x19,x20,[x29,#16] 1981 add sp,sp,#128+64 1982 ldp x21,x22,[x29,#32] 1983 ldp x23,x24,[x29,#48] 1984 ldp x25,x26,[x29,#64] 1985 ldp x27,x28,[x29,#80] 1986 ldp x29,x30,[sp],#96 1987 AARCH64_VALIDATE_LINK_REGISTER 1988 ret 1989.size ChaCha20_512_neon,.-ChaCha20_512_neon 1990#endif 1991#endif // !OPENSSL_NO_ASM 1992.section .note.GNU-stack,"",%progbits 1993