// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>




.section	.rodata

.align	5
// Lsigma: 16 constant bytes. Read as little-endian bytes the two quads
// spell the ASCII text: expand 32-byte k
Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Lone: four 32-bit lanes {1,0,0,0}. It sits directly after Lsigma, so the
// NEON code reaches it by post-incrementing the Lsigma pointer by 16.
Lone:
.long	1,0,0,0
// ASCII, NUL-terminated: ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

.text

.globl	ChaCha20_ctr32

.def ChaCha20_ctr32
   .type 32
.endef
.align	5
//-----------------------------------------------------------------------
// ChaCha20_ctr32
//
// Register roles, as used by the code below:
//   x0  output pointer  (target of the stp / strb stores)
//   x1  input pointer   (source of the ldp / ldrb loads)
//   x2  byte count      (zero returns at once through Labort)
//   x3  pointer to 32 bytes of key      (two ldp loads)
//   x4  pointer to 16 bytes of counter  (one ldp load); x4 is then
//       reused as the round-loop counter and as a tail pointer
//
// Dispatch: when x2 >= 192 and the ARMV7_NEON bit is set in
// OPENSSL_armcap_P, control transfers to ChaCha20_neon. Otherwise the
// integer-only path at Lshort handles 64 bytes per outer iteration.
//
// Frame: 96 bytes holding x29/x30 and x19-x28, plus 64 bytes of scratch
// below that (at sp) which only the tail path writes to.
// x30 is reused as a data register after being saved; the return address
// is reloaded from the frame in each epilogue.
//-----------------------------------------------------------------------
ChaCha20_ctr32:
	AARCH64_VALID_CALL_TARGET
	cbz	x2,Labort
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
	adrp	x5,:pg_hi21_nc:OPENSSL_armcap_P
#else
	adrp	x5,OPENSSL_armcap_P
#endif
	cmp	x2,#192
	b.lo	Lshort
	ldr	w17,[x5,:lo12:OPENSSL_armcap_P]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

Lshort:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,Lsigma
	add	x5,x5,:lo12:Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64			// 64-byte scratch used by the tail path

	// x22..x28 and x30 hold the 64-byte input block, two 32-bit words
	// per 64-bit register. They stay untouched across the rounds.
	ldp	x22,x23,[x5]			// load sigma
	ldp	x24,x25,[x3]			// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]			// load counter
#ifdef	__ARMEB__
	// Big-endian builds: swap the two 32-bit halves of each register.
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

Loop_outer:
	// Split each packed pair into two working words: the low half via a
	// w-register move, the high half via a 32-bit right shift.
	mov	w5,w22				// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10				// 10 passes through Loop
	// The flags set here are consumed by b.lo Ltail and b.hi Loop_outer
	// further down; nothing in between writes the condition flags.
	subs	x2,x2,#64
Loop:
	sub	x4,x4,#1
	// First half: words paired as (5,9,13,17) (6,10,14,19) (7,11,15,20)
	// (8,12,16,21).
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Second half: same sequence with the operands rotated, pairing
	// (5,10,15,21) (6,11,16,17) (7,12,13,19) (8,9,14,20).
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,Loop

	// Add the original input words back onto the working words.
	add	w5,w5,w22			// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	Ltail				// fewer than 64 bytes were left

	// Re-pack word pairs into 64-bit registers while loading 64 input
	// bytes into the registers that have just been folded in.
	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	Loop_outer			// more input remains

	// Epilogue: drop the scratch area, restore saved registers, return.
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
Labort:
	ret

.align	4
Ltail:
	add	x2,x2,#64			// undo the subs: x2 = bytes left
	// Less_than_64 is also a branch target of Ltail_neon, which arrives
	// here with the same frame layout and the same working registers.
Less_than_64:
	// Point x1, x0 and x4 just past the data and walk x2 from -count up
	// to zero. x0 carries an extra -1 because x2 is incremented before
	// the store in Loop_tail.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Park the 64 computed bytes in the stack scratch area.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

Loop_tail:
	// One byte per pass: input byte XOR scratch byte -> output byte.
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,Loop_tail

	// Overwrite the scratch area with zeros before releasing it.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret


.def ChaCha20_neon
   .type 32
.endef
.align	5
// ChaCha20_neon: branched to from ChaCha20_ctr32 with x0-x4 unchanged.
// It builds the same 96-byte frame as the integer path.
ChaCha20_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
// ChaCha20_neon, continued (frame setup, main loop, tail handling).
//
// Each outer iteration produces 256 bytes: one 64-byte block in the
// integer registers (same register roles as ChaCha20_ctr32) and three
// 64-byte blocks in NEON registers.
//   v24        sigma constant
//   v25,v26    key
//   v27,v28,v29  counter blocks for the three NEON states (+1,+2,+3
//              relative to the block handled in integer registers)
//   v31        counter increment: {1,0,0,0}, later shifted to {4,0,0,0}
//   v0-v3, v4-v7, v16-v19   the three working states
//   v20-v23    scratch during the rounds, input data afterwards
// Only v0-v7 and v16-v31 are touched here.
// A byte count of 512 or more is handed to L512_or_more_neon.
	add	x29,sp,#0

	adrp	x5,Lsigma
	add	x5,x5,:lo12:Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	L512_or_more_neon

	sub	sp,sp,#64			// 64-byte scratch used by the tail path

	ldp	x22,x23,[x5]			// load sigma
	ld1	{v24.4s},[x5],#16		// x5 now points at Lone
	ldp	x24,x25,[x3]			// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]			// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]			// v31 = {1,0,0,0}
#ifdef	__ARMEB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	add	v27.4s,v27.4s,v31.4s		// += 1
	add	v28.4s,v27.4s,v31.4s		// v28 = v27 + 1
	add	v29.4s,v28.4s,v31.4s		// v29 = v28 + 1
	shl	v31.4s,v31.4s,#2		// 1 -> 4

Loop_outer_neon:
	// Seed the integer working words and the three NEON states from the
	// saved input block. Integer and NEON instructions are interleaved.
	mov	w5,w22				// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	mov	x4,#10				// 10 passes through Loop_neon
	// Flags from this subs feed b.lo Ltail_neon and b.hi Loop_outer_neon
	// below; nothing in between writes the condition flags.
	subs	x2,x2,#256
Loop_neon:
	// NEON rotations used below: rev32 on .8h lanes swaps the 16-bit
	// halves of each word (rotate by 16); each ushr #(32-n) + sli #n
	// pair is a left rotate by n for n = 12, 8 and 7.
	sub	x4,x4,#1
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	// Rotate the lanes of rows 1-3 by 4, 8 and 12 bytes so the second
	// half operates on rotated word positions.
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	// Undo the lane rotation applied before the second half.
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,Loop_neon

	// Add the original input block back onto all four states.
	add	w5,w5,w22			// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	b.lo	Ltail_neon			// fewer than 256 bytes were left

	// Full 256-byte step: XOR four 64-byte blocks of input with the four
	// computed blocks and advance the counters by 4.
	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64	// v0-v3 reused for input

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	b.hi	Loop_outer_neon			// more input remains

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

Ltail_neon:
	// Between 1 and 255 bytes remain. Whole 64-byte blocks are emitted
	// in order (integer block, then v0-v3, then v4-v7); the block that
	// covers a partial remainder is spilled to the stack scratch area
	// and finished byte by byte in Last_neon.
	add	x2,x2,#256			// undo the subs: x2 = bytes left
	cmp	x2,#64
	b.lo	Less_than_64			// shared tail in ChaCha20_ctr32

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	b.eq	Ldone_neon			// flags from cmp x2,#64: exactly 64 left
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	Ldone_neon			// flags from the cmp just above
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	Ldone_neon			// flags from the cmp just above
	sub	x2,x2,#64

	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
	b	Last_neon

Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
	b	Last_neon
Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
	b	Last_neon

.align	4
Last_neon:
	// Same negative-index byte loop as Loop_tail in ChaCha20_ctr32:
	// pointers are set just past the data and x2 runs from -count to 0.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

Loop_tail_neon:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,Loop_tail_neon

	// Overwrite the scratch area with zeros before releasing it.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.def ChaCha20_512_neon
   .type 32
.endef
.align	5
// ChaCha20_512_neon: its frame setup runs straight into L512_or_more_neon,
// the label ChaCha20_neon branches to when x2 >= 512.
ChaCha20_512_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
834 add x29,sp,#0 835 836 adrp x5,Lsigma 837 add x5,x5,:lo12:Lsigma 838 stp x19,x20,[sp,#16] 839 stp x21,x22,[sp,#32] 840 stp x23,x24,[sp,#48] 841 stp x25,x26,[sp,#64] 842 stp x27,x28,[sp,#80] 843 844L512_or_more_neon: 845 sub sp,sp,#128+64 846 847 ldp x22,x23,[x5] // load sigma 848 ld1 {v24.4s},[x5],#16 849 ldp x24,x25,[x3] // load key 850 ldp x26,x27,[x3,#16] 851 ld1 {v25.4s,v26.4s},[x3] 852 ldp x28,x30,[x4] // load counter 853 ld1 {v27.4s},[x4] 854 ld1 {v31.4s},[x5] 855#ifdef __ARMEB__ 856 rev64 v24.4s,v24.4s 857 ror x24,x24,#32 858 ror x25,x25,#32 859 ror x26,x26,#32 860 ror x27,x27,#32 861 ror x28,x28,#32 862 ror x30,x30,#32 863#endif 864 add v27.4s,v27.4s,v31.4s // += 1 865 stp q24,q25,[sp,#0] // off-load key block, invariant part 866 add v27.4s,v27.4s,v31.4s // not typo 867 str q26,[sp,#32] 868 add v28.4s,v27.4s,v31.4s 869 add v29.4s,v28.4s,v31.4s 870 add v30.4s,v29.4s,v31.4s 871 shl v31.4s,v31.4s,#2 // 1 -> 4 872 873 stp d8,d9,[sp,#128+0] // meet ABI requirements 874 stp d10,d11,[sp,#128+16] 875 stp d12,d13,[sp,#128+32] 876 stp d14,d15,[sp,#128+48] 877 878 sub x2,x2,#512 // not typo 879 880Loop_outer_512_neon: 881 mov v0.16b,v24.16b 882 mov v4.16b,v24.16b 883 mov v8.16b,v24.16b 884 mov v12.16b,v24.16b 885 mov v16.16b,v24.16b 886 mov v20.16b,v24.16b 887 mov v1.16b,v25.16b 888 mov w5,w22 // unpack key block 889 mov v5.16b,v25.16b 890 lsr x6,x22,#32 891 mov v9.16b,v25.16b 892 mov w7,w23 893 mov v13.16b,v25.16b 894 lsr x8,x23,#32 895 mov v17.16b,v25.16b 896 mov w9,w24 897 mov v21.16b,v25.16b 898 lsr x10,x24,#32 899 mov v3.16b,v27.16b 900 mov w11,w25 901 mov v7.16b,v28.16b 902 lsr x12,x25,#32 903 mov v11.16b,v29.16b 904 mov w13,w26 905 mov v15.16b,v30.16b 906 lsr x14,x26,#32 907 mov v2.16b,v26.16b 908 mov w15,w27 909 mov v6.16b,v26.16b 910 lsr x16,x27,#32 911 add v19.4s,v3.4s,v31.4s // +4 912 mov w17,w28 913 add v23.4s,v7.4s,v31.4s // +4 914 lsr x19,x28,#32 915 mov v10.16b,v26.16b 916 mov w20,w30 917 mov v14.16b,v26.16b 918 lsr x21,x30,#32 919 mov 
v18.16b,v26.16b 920 stp q27,q28,[sp,#48] // off-load key block, variable part 921 mov v22.16b,v26.16b 922 str q29,[sp,#80] 923 924 mov x4,#5 925 subs x2,x2,#512 926Loop_upper_neon: 927 sub x4,x4,#1 928 add v0.4s,v0.4s,v1.4s 929 add w5,w5,w9 930 add v4.4s,v4.4s,v5.4s 931 add w6,w6,w10 932 add v8.4s,v8.4s,v9.4s 933 add w7,w7,w11 934 add v12.4s,v12.4s,v13.4s 935 add w8,w8,w12 936 add v16.4s,v16.4s,v17.4s 937 eor w17,w17,w5 938 add v20.4s,v20.4s,v21.4s 939 eor w19,w19,w6 940 eor v3.16b,v3.16b,v0.16b 941 eor w20,w20,w7 942 eor v7.16b,v7.16b,v4.16b 943 eor w21,w21,w8 944 eor v11.16b,v11.16b,v8.16b 945 ror w17,w17,#16 946 eor v15.16b,v15.16b,v12.16b 947 ror w19,w19,#16 948 eor v19.16b,v19.16b,v16.16b 949 ror w20,w20,#16 950 eor v23.16b,v23.16b,v20.16b 951 ror w21,w21,#16 952 rev32 v3.8h,v3.8h 953 add w13,w13,w17 954 rev32 v7.8h,v7.8h 955 add w14,w14,w19 956 rev32 v11.8h,v11.8h 957 add w15,w15,w20 958 rev32 v15.8h,v15.8h 959 add w16,w16,w21 960 rev32 v19.8h,v19.8h 961 eor w9,w9,w13 962 rev32 v23.8h,v23.8h 963 eor w10,w10,w14 964 add v2.4s,v2.4s,v3.4s 965 eor w11,w11,w15 966 add v6.4s,v6.4s,v7.4s 967 eor w12,w12,w16 968 add v10.4s,v10.4s,v11.4s 969 ror w9,w9,#20 970 add v14.4s,v14.4s,v15.4s 971 ror w10,w10,#20 972 add v18.4s,v18.4s,v19.4s 973 ror w11,w11,#20 974 add v22.4s,v22.4s,v23.4s 975 ror w12,w12,#20 976 eor v24.16b,v1.16b,v2.16b 977 add w5,w5,w9 978 eor v25.16b,v5.16b,v6.16b 979 add w6,w6,w10 980 eor v26.16b,v9.16b,v10.16b 981 add w7,w7,w11 982 eor v27.16b,v13.16b,v14.16b 983 add w8,w8,w12 984 eor v28.16b,v17.16b,v18.16b 985 eor w17,w17,w5 986 eor v29.16b,v21.16b,v22.16b 987 eor w19,w19,w6 988 ushr v1.4s,v24.4s,#20 989 eor w20,w20,w7 990 ushr v5.4s,v25.4s,#20 991 eor w21,w21,w8 992 ushr v9.4s,v26.4s,#20 993 ror w17,w17,#24 994 ushr v13.4s,v27.4s,#20 995 ror w19,w19,#24 996 ushr v17.4s,v28.4s,#20 997 ror w20,w20,#24 998 ushr v21.4s,v29.4s,#20 999 ror w21,w21,#24 1000 sli v1.4s,v24.4s,#12 1001 add w13,w13,w17 1002 sli v5.4s,v25.4s,#12 1003 add w14,w14,w19 1004 sli 
v9.4s,v26.4s,#12 1005 add w15,w15,w20 1006 sli v13.4s,v27.4s,#12 1007 add w16,w16,w21 1008 sli v17.4s,v28.4s,#12 1009 eor w9,w9,w13 1010 sli v21.4s,v29.4s,#12 1011 eor w10,w10,w14 1012 add v0.4s,v0.4s,v1.4s 1013 eor w11,w11,w15 1014 add v4.4s,v4.4s,v5.4s 1015 eor w12,w12,w16 1016 add v8.4s,v8.4s,v9.4s 1017 ror w9,w9,#25 1018 add v12.4s,v12.4s,v13.4s 1019 ror w10,w10,#25 1020 add v16.4s,v16.4s,v17.4s 1021 ror w11,w11,#25 1022 add v20.4s,v20.4s,v21.4s 1023 ror w12,w12,#25 1024 eor v24.16b,v3.16b,v0.16b 1025 add w5,w5,w10 1026 eor v25.16b,v7.16b,v4.16b 1027 add w6,w6,w11 1028 eor v26.16b,v11.16b,v8.16b 1029 add w7,w7,w12 1030 eor v27.16b,v15.16b,v12.16b 1031 add w8,w8,w9 1032 eor v28.16b,v19.16b,v16.16b 1033 eor w21,w21,w5 1034 eor v29.16b,v23.16b,v20.16b 1035 eor w17,w17,w6 1036 ushr v3.4s,v24.4s,#24 1037 eor w19,w19,w7 1038 ushr v7.4s,v25.4s,#24 1039 eor w20,w20,w8 1040 ushr v11.4s,v26.4s,#24 1041 ror w21,w21,#16 1042 ushr v15.4s,v27.4s,#24 1043 ror w17,w17,#16 1044 ushr v19.4s,v28.4s,#24 1045 ror w19,w19,#16 1046 ushr v23.4s,v29.4s,#24 1047 ror w20,w20,#16 1048 sli v3.4s,v24.4s,#8 1049 add w15,w15,w21 1050 sli v7.4s,v25.4s,#8 1051 add w16,w16,w17 1052 sli v11.4s,v26.4s,#8 1053 add w13,w13,w19 1054 sli v15.4s,v27.4s,#8 1055 add w14,w14,w20 1056 sli v19.4s,v28.4s,#8 1057 eor w10,w10,w15 1058 sli v23.4s,v29.4s,#8 1059 eor w11,w11,w16 1060 add v2.4s,v2.4s,v3.4s 1061 eor w12,w12,w13 1062 add v6.4s,v6.4s,v7.4s 1063 eor w9,w9,w14 1064 add v10.4s,v10.4s,v11.4s 1065 ror w10,w10,#20 1066 add v14.4s,v14.4s,v15.4s 1067 ror w11,w11,#20 1068 add v18.4s,v18.4s,v19.4s 1069 ror w12,w12,#20 1070 add v22.4s,v22.4s,v23.4s 1071 ror w9,w9,#20 1072 eor v24.16b,v1.16b,v2.16b 1073 add w5,w5,w10 1074 eor v25.16b,v5.16b,v6.16b 1075 add w6,w6,w11 1076 eor v26.16b,v9.16b,v10.16b 1077 add w7,w7,w12 1078 eor v27.16b,v13.16b,v14.16b 1079 add w8,w8,w9 1080 eor v28.16b,v17.16b,v18.16b 1081 eor w21,w21,w5 1082 eor v29.16b,v21.16b,v22.16b 1083 eor w17,w17,w6 1084 ushr v1.4s,v24.4s,#25 1085 eor 
w19,w19,w7 1086 ushr v5.4s,v25.4s,#25 1087 eor w20,w20,w8 1088 ushr v9.4s,v26.4s,#25 1089 ror w21,w21,#24 1090 ushr v13.4s,v27.4s,#25 1091 ror w17,w17,#24 1092 ushr v17.4s,v28.4s,#25 1093 ror w19,w19,#24 1094 ushr v21.4s,v29.4s,#25 1095 ror w20,w20,#24 1096 sli v1.4s,v24.4s,#7 1097 add w15,w15,w21 1098 sli v5.4s,v25.4s,#7 1099 add w16,w16,w17 1100 sli v9.4s,v26.4s,#7 1101 add w13,w13,w19 1102 sli v13.4s,v27.4s,#7 1103 add w14,w14,w20 1104 sli v17.4s,v28.4s,#7 1105 eor w10,w10,w15 1106 sli v21.4s,v29.4s,#7 1107 eor w11,w11,w16 1108 ext v2.16b,v2.16b,v2.16b,#8 1109 eor w12,w12,w13 1110 ext v6.16b,v6.16b,v6.16b,#8 1111 eor w9,w9,w14 1112 ext v10.16b,v10.16b,v10.16b,#8 1113 ror w10,w10,#25 1114 ext v14.16b,v14.16b,v14.16b,#8 1115 ror w11,w11,#25 1116 ext v18.16b,v18.16b,v18.16b,#8 1117 ror w12,w12,#25 1118 ext v22.16b,v22.16b,v22.16b,#8 1119 ror w9,w9,#25 1120 ext v3.16b,v3.16b,v3.16b,#12 1121 ext v7.16b,v7.16b,v7.16b,#12 1122 ext v11.16b,v11.16b,v11.16b,#12 1123 ext v15.16b,v15.16b,v15.16b,#12 1124 ext v19.16b,v19.16b,v19.16b,#12 1125 ext v23.16b,v23.16b,v23.16b,#12 1126 ext v1.16b,v1.16b,v1.16b,#4 1127 ext v5.16b,v5.16b,v5.16b,#4 1128 ext v9.16b,v9.16b,v9.16b,#4 1129 ext v13.16b,v13.16b,v13.16b,#4 1130 ext v17.16b,v17.16b,v17.16b,#4 1131 ext v21.16b,v21.16b,v21.16b,#4 1132 add v0.4s,v0.4s,v1.4s 1133 add w5,w5,w9 1134 add v4.4s,v4.4s,v5.4s 1135 add w6,w6,w10 1136 add v8.4s,v8.4s,v9.4s 1137 add w7,w7,w11 1138 add v12.4s,v12.4s,v13.4s 1139 add w8,w8,w12 1140 add v16.4s,v16.4s,v17.4s 1141 eor w17,w17,w5 1142 add v20.4s,v20.4s,v21.4s 1143 eor w19,w19,w6 1144 eor v3.16b,v3.16b,v0.16b 1145 eor w20,w20,w7 1146 eor v7.16b,v7.16b,v4.16b 1147 eor w21,w21,w8 1148 eor v11.16b,v11.16b,v8.16b 1149 ror w17,w17,#16 1150 eor v15.16b,v15.16b,v12.16b 1151 ror w19,w19,#16 1152 eor v19.16b,v19.16b,v16.16b 1153 ror w20,w20,#16 1154 eor v23.16b,v23.16b,v20.16b 1155 ror w21,w21,#16 1156 rev32 v3.8h,v3.8h 1157 add w13,w13,w17 1158 rev32 v7.8h,v7.8h 1159 add w14,w14,w19 1160 rev32 
v11.8h,v11.8h 1161 add w15,w15,w20 1162 rev32 v15.8h,v15.8h 1163 add w16,w16,w21 1164 rev32 v19.8h,v19.8h 1165 eor w9,w9,w13 1166 rev32 v23.8h,v23.8h 1167 eor w10,w10,w14 1168 add v2.4s,v2.4s,v3.4s 1169 eor w11,w11,w15 1170 add v6.4s,v6.4s,v7.4s 1171 eor w12,w12,w16 1172 add v10.4s,v10.4s,v11.4s 1173 ror w9,w9,#20 1174 add v14.4s,v14.4s,v15.4s 1175 ror w10,w10,#20 1176 add v18.4s,v18.4s,v19.4s 1177 ror w11,w11,#20 1178 add v22.4s,v22.4s,v23.4s 1179 ror w12,w12,#20 1180 eor v24.16b,v1.16b,v2.16b 1181 add w5,w5,w9 1182 eor v25.16b,v5.16b,v6.16b 1183 add w6,w6,w10 1184 eor v26.16b,v9.16b,v10.16b 1185 add w7,w7,w11 1186 eor v27.16b,v13.16b,v14.16b 1187 add w8,w8,w12 1188 eor v28.16b,v17.16b,v18.16b 1189 eor w17,w17,w5 1190 eor v29.16b,v21.16b,v22.16b 1191 eor w19,w19,w6 1192 ushr v1.4s,v24.4s,#20 1193 eor w20,w20,w7 1194 ushr v5.4s,v25.4s,#20 1195 eor w21,w21,w8 1196 ushr v9.4s,v26.4s,#20 1197 ror w17,w17,#24 1198 ushr v13.4s,v27.4s,#20 1199 ror w19,w19,#24 1200 ushr v17.4s,v28.4s,#20 1201 ror w20,w20,#24 1202 ushr v21.4s,v29.4s,#20 1203 ror w21,w21,#24 1204 sli v1.4s,v24.4s,#12 1205 add w13,w13,w17 1206 sli v5.4s,v25.4s,#12 1207 add w14,w14,w19 1208 sli v9.4s,v26.4s,#12 1209 add w15,w15,w20 1210 sli v13.4s,v27.4s,#12 1211 add w16,w16,w21 1212 sli v17.4s,v28.4s,#12 1213 eor w9,w9,w13 1214 sli v21.4s,v29.4s,#12 1215 eor w10,w10,w14 1216 add v0.4s,v0.4s,v1.4s 1217 eor w11,w11,w15 1218 add v4.4s,v4.4s,v5.4s 1219 eor w12,w12,w16 1220 add v8.4s,v8.4s,v9.4s 1221 ror w9,w9,#25 1222 add v12.4s,v12.4s,v13.4s 1223 ror w10,w10,#25 1224 add v16.4s,v16.4s,v17.4s 1225 ror w11,w11,#25 1226 add v20.4s,v20.4s,v21.4s 1227 ror w12,w12,#25 1228 eor v24.16b,v3.16b,v0.16b 1229 add w5,w5,w10 1230 eor v25.16b,v7.16b,v4.16b 1231 add w6,w6,w11 1232 eor v26.16b,v11.16b,v8.16b 1233 add w7,w7,w12 1234 eor v27.16b,v15.16b,v12.16b 1235 add w8,w8,w9 1236 eor v28.16b,v19.16b,v16.16b 1237 eor w21,w21,w5 1238 eor v29.16b,v23.16b,v20.16b 1239 eor w17,w17,w6 1240 ushr v3.4s,v24.4s,#24 1241 eor w19,w19,w7 
1242 ushr v7.4s,v25.4s,#24 1243 eor w20,w20,w8 1244 ushr v11.4s,v26.4s,#24 1245 ror w21,w21,#16 1246 ushr v15.4s,v27.4s,#24 1247 ror w17,w17,#16 1248 ushr v19.4s,v28.4s,#24 1249 ror w19,w19,#16 1250 ushr v23.4s,v29.4s,#24 1251 ror w20,w20,#16 1252 sli v3.4s,v24.4s,#8 1253 add w15,w15,w21 1254 sli v7.4s,v25.4s,#8 1255 add w16,w16,w17 1256 sli v11.4s,v26.4s,#8 1257 add w13,w13,w19 1258 sli v15.4s,v27.4s,#8 1259 add w14,w14,w20 1260 sli v19.4s,v28.4s,#8 1261 eor w10,w10,w15 1262 sli v23.4s,v29.4s,#8 1263 eor w11,w11,w16 1264 add v2.4s,v2.4s,v3.4s 1265 eor w12,w12,w13 1266 add v6.4s,v6.4s,v7.4s 1267 eor w9,w9,w14 1268 add v10.4s,v10.4s,v11.4s 1269 ror w10,w10,#20 1270 add v14.4s,v14.4s,v15.4s 1271 ror w11,w11,#20 1272 add v18.4s,v18.4s,v19.4s 1273 ror w12,w12,#20 1274 add v22.4s,v22.4s,v23.4s 1275 ror w9,w9,#20 1276 eor v24.16b,v1.16b,v2.16b 1277 add w5,w5,w10 1278 eor v25.16b,v5.16b,v6.16b 1279 add w6,w6,w11 1280 eor v26.16b,v9.16b,v10.16b 1281 add w7,w7,w12 1282 eor v27.16b,v13.16b,v14.16b 1283 add w8,w8,w9 1284 eor v28.16b,v17.16b,v18.16b 1285 eor w21,w21,w5 1286 eor v29.16b,v21.16b,v22.16b 1287 eor w17,w17,w6 1288 ushr v1.4s,v24.4s,#25 1289 eor w19,w19,w7 1290 ushr v5.4s,v25.4s,#25 1291 eor w20,w20,w8 1292 ushr v9.4s,v26.4s,#25 1293 ror w21,w21,#24 1294 ushr v13.4s,v27.4s,#25 1295 ror w17,w17,#24 1296 ushr v17.4s,v28.4s,#25 1297 ror w19,w19,#24 1298 ushr v21.4s,v29.4s,#25 1299 ror w20,w20,#24 1300 sli v1.4s,v24.4s,#7 1301 add w15,w15,w21 1302 sli v5.4s,v25.4s,#7 1303 add w16,w16,w17 1304 sli v9.4s,v26.4s,#7 1305 add w13,w13,w19 1306 sli v13.4s,v27.4s,#7 1307 add w14,w14,w20 1308 sli v17.4s,v28.4s,#7 1309 eor w10,w10,w15 1310 sli v21.4s,v29.4s,#7 1311 eor w11,w11,w16 1312 ext v2.16b,v2.16b,v2.16b,#8 1313 eor w12,w12,w13 1314 ext v6.16b,v6.16b,v6.16b,#8 1315 eor w9,w9,w14 1316 ext v10.16b,v10.16b,v10.16b,#8 1317 ror w10,w10,#25 1318 ext v14.16b,v14.16b,v14.16b,#8 1319 ror w11,w11,#25 1320 ext v18.16b,v18.16b,v18.16b,#8 1321 ror w12,w12,#25 1322 ext 
v22.16b,v22.16b,v22.16b,#8 1323 ror w9,w9,#25 1324 ext v3.16b,v3.16b,v3.16b,#4 1325 ext v7.16b,v7.16b,v7.16b,#4 1326 ext v11.16b,v11.16b,v11.16b,#4 1327 ext v15.16b,v15.16b,v15.16b,#4 1328 ext v19.16b,v19.16b,v19.16b,#4 1329 ext v23.16b,v23.16b,v23.16b,#4 1330 ext v1.16b,v1.16b,v1.16b,#12 1331 ext v5.16b,v5.16b,v5.16b,#12 1332 ext v9.16b,v9.16b,v9.16b,#12 1333 ext v13.16b,v13.16b,v13.16b,#12 1334 ext v17.16b,v17.16b,v17.16b,#12 1335 ext v21.16b,v21.16b,v21.16b,#12 1336 cbnz x4,Loop_upper_neon 1337 1338 add w5,w5,w22 // accumulate key block 1339 add x6,x6,x22,lsr#32 1340 add w7,w7,w23 1341 add x8,x8,x23,lsr#32 1342 add w9,w9,w24 1343 add x10,x10,x24,lsr#32 1344 add w11,w11,w25 1345 add x12,x12,x25,lsr#32 1346 add w13,w13,w26 1347 add x14,x14,x26,lsr#32 1348 add w15,w15,w27 1349 add x16,x16,x27,lsr#32 1350 add w17,w17,w28 1351 add x19,x19,x28,lsr#32 1352 add w20,w20,w30 1353 add x21,x21,x30,lsr#32 1354 1355 add x5,x5,x6,lsl#32 // pack 1356 add x7,x7,x8,lsl#32 1357 ldp x6,x8,[x1,#0] // load input 1358 add x9,x9,x10,lsl#32 1359 add x11,x11,x12,lsl#32 1360 ldp x10,x12,[x1,#16] 1361 add x13,x13,x14,lsl#32 1362 add x15,x15,x16,lsl#32 1363 ldp x14,x16,[x1,#32] 1364 add x17,x17,x19,lsl#32 1365 add x20,x20,x21,lsl#32 1366 ldp x19,x21,[x1,#48] 1367 add x1,x1,#64 1368#ifdef __ARMEB__ 1369 rev x5,x5 1370 rev x7,x7 1371 rev x9,x9 1372 rev x11,x11 1373 rev x13,x13 1374 rev x15,x15 1375 rev x17,x17 1376 rev x20,x20 1377#endif 1378 eor x5,x5,x6 1379 eor x7,x7,x8 1380 eor x9,x9,x10 1381 eor x11,x11,x12 1382 eor x13,x13,x14 1383 eor x15,x15,x16 1384 eor x17,x17,x19 1385 eor x20,x20,x21 1386 1387 stp x5,x7,[x0,#0] // store output 1388 add x28,x28,#1 // increment counter 1389 mov w5,w22 // unpack key block 1390 lsr x6,x22,#32 1391 stp x9,x11,[x0,#16] 1392 mov w7,w23 1393 lsr x8,x23,#32 1394 stp x13,x15,[x0,#32] 1395 mov w9,w24 1396 lsr x10,x24,#32 1397 stp x17,x20,[x0,#48] 1398 add x0,x0,#64 1399 mov w11,w25 1400 lsr x12,x25,#32 1401 mov w13,w26 1402 lsr x14,x26,#32 1403 mov w15,w27 
1404 lsr x16,x27,#32 1405 mov w17,w28 1406 lsr x19,x28,#32 1407 mov w20,w30 1408 lsr x21,x30,#32 1409 1410 mov x4,#5 1411Loop_lower_neon: 1412 sub x4,x4,#1 1413 add v0.4s,v0.4s,v1.4s 1414 add w5,w5,w9 1415 add v4.4s,v4.4s,v5.4s 1416 add w6,w6,w10 1417 add v8.4s,v8.4s,v9.4s 1418 add w7,w7,w11 1419 add v12.4s,v12.4s,v13.4s 1420 add w8,w8,w12 1421 add v16.4s,v16.4s,v17.4s 1422 eor w17,w17,w5 1423 add v20.4s,v20.4s,v21.4s 1424 eor w19,w19,w6 1425 eor v3.16b,v3.16b,v0.16b 1426 eor w20,w20,w7 1427 eor v7.16b,v7.16b,v4.16b 1428 eor w21,w21,w8 1429 eor v11.16b,v11.16b,v8.16b 1430 ror w17,w17,#16 1431 eor v15.16b,v15.16b,v12.16b 1432 ror w19,w19,#16 1433 eor v19.16b,v19.16b,v16.16b 1434 ror w20,w20,#16 1435 eor v23.16b,v23.16b,v20.16b 1436 ror w21,w21,#16 1437 rev32 v3.8h,v3.8h 1438 add w13,w13,w17 1439 rev32 v7.8h,v7.8h 1440 add w14,w14,w19 1441 rev32 v11.8h,v11.8h 1442 add w15,w15,w20 1443 rev32 v15.8h,v15.8h 1444 add w16,w16,w21 1445 rev32 v19.8h,v19.8h 1446 eor w9,w9,w13 1447 rev32 v23.8h,v23.8h 1448 eor w10,w10,w14 1449 add v2.4s,v2.4s,v3.4s 1450 eor w11,w11,w15 1451 add v6.4s,v6.4s,v7.4s 1452 eor w12,w12,w16 1453 add v10.4s,v10.4s,v11.4s 1454 ror w9,w9,#20 1455 add v14.4s,v14.4s,v15.4s 1456 ror w10,w10,#20 1457 add v18.4s,v18.4s,v19.4s 1458 ror w11,w11,#20 1459 add v22.4s,v22.4s,v23.4s 1460 ror w12,w12,#20 1461 eor v24.16b,v1.16b,v2.16b 1462 add w5,w5,w9 1463 eor v25.16b,v5.16b,v6.16b 1464 add w6,w6,w10 1465 eor v26.16b,v9.16b,v10.16b 1466 add w7,w7,w11 1467 eor v27.16b,v13.16b,v14.16b 1468 add w8,w8,w12 1469 eor v28.16b,v17.16b,v18.16b 1470 eor w17,w17,w5 1471 eor v29.16b,v21.16b,v22.16b 1472 eor w19,w19,w6 1473 ushr v1.4s,v24.4s,#20 1474 eor w20,w20,w7 1475 ushr v5.4s,v25.4s,#20 1476 eor w21,w21,w8 1477 ushr v9.4s,v26.4s,#20 1478 ror w17,w17,#24 1479 ushr v13.4s,v27.4s,#20 1480 ror w19,w19,#24 1481 ushr v17.4s,v28.4s,#20 1482 ror w20,w20,#24 1483 ushr v21.4s,v29.4s,#20 1484 ror w21,w21,#24 1485 sli v1.4s,v24.4s,#12 1486 add w13,w13,w17 1487 sli v5.4s,v25.4s,#12 1488 
add w14,w14,w19 1489 sli v9.4s,v26.4s,#12 1490 add w15,w15,w20 1491 sli v13.4s,v27.4s,#12 1492 add w16,w16,w21 1493 sli v17.4s,v28.4s,#12 1494 eor w9,w9,w13 1495 sli v21.4s,v29.4s,#12 1496 eor w10,w10,w14 1497 add v0.4s,v0.4s,v1.4s 1498 eor w11,w11,w15 1499 add v4.4s,v4.4s,v5.4s 1500 eor w12,w12,w16 1501 add v8.4s,v8.4s,v9.4s 1502 ror w9,w9,#25 1503 add v12.4s,v12.4s,v13.4s 1504 ror w10,w10,#25 1505 add v16.4s,v16.4s,v17.4s 1506 ror w11,w11,#25 1507 add v20.4s,v20.4s,v21.4s 1508 ror w12,w12,#25 1509 eor v24.16b,v3.16b,v0.16b 1510 add w5,w5,w10 1511 eor v25.16b,v7.16b,v4.16b 1512 add w6,w6,w11 1513 eor v26.16b,v11.16b,v8.16b 1514 add w7,w7,w12 1515 eor v27.16b,v15.16b,v12.16b 1516 add w8,w8,w9 1517 eor v28.16b,v19.16b,v16.16b 1518 eor w21,w21,w5 1519 eor v29.16b,v23.16b,v20.16b 1520 eor w17,w17,w6 1521 ushr v3.4s,v24.4s,#24 1522 eor w19,w19,w7 1523 ushr v7.4s,v25.4s,#24 1524 eor w20,w20,w8 1525 ushr v11.4s,v26.4s,#24 1526 ror w21,w21,#16 1527 ushr v15.4s,v27.4s,#24 1528 ror w17,w17,#16 1529 ushr v19.4s,v28.4s,#24 1530 ror w19,w19,#16 1531 ushr v23.4s,v29.4s,#24 1532 ror w20,w20,#16 1533 sli v3.4s,v24.4s,#8 1534 add w15,w15,w21 1535 sli v7.4s,v25.4s,#8 1536 add w16,w16,w17 1537 sli v11.4s,v26.4s,#8 1538 add w13,w13,w19 1539 sli v15.4s,v27.4s,#8 1540 add w14,w14,w20 1541 sli v19.4s,v28.4s,#8 1542 eor w10,w10,w15 1543 sli v23.4s,v29.4s,#8 1544 eor w11,w11,w16 1545 add v2.4s,v2.4s,v3.4s 1546 eor w12,w12,w13 1547 add v6.4s,v6.4s,v7.4s 1548 eor w9,w9,w14 1549 add v10.4s,v10.4s,v11.4s 1550 ror w10,w10,#20 1551 add v14.4s,v14.4s,v15.4s 1552 ror w11,w11,#20 1553 add v18.4s,v18.4s,v19.4s 1554 ror w12,w12,#20 1555 add v22.4s,v22.4s,v23.4s 1556 ror w9,w9,#20 1557 eor v24.16b,v1.16b,v2.16b 1558 add w5,w5,w10 1559 eor v25.16b,v5.16b,v6.16b 1560 add w6,w6,w11 1561 eor v26.16b,v9.16b,v10.16b 1562 add w7,w7,w12 1563 eor v27.16b,v13.16b,v14.16b 1564 add w8,w8,w9 1565 eor v28.16b,v17.16b,v18.16b 1566 eor w21,w21,w5 1567 eor v29.16b,v21.16b,v22.16b 1568 eor w17,w17,w6 1569 ushr 
v1.4s,v24.4s,#25 1570 eor w19,w19,w7 1571 ushr v5.4s,v25.4s,#25 1572 eor w20,w20,w8 1573 ushr v9.4s,v26.4s,#25 1574 ror w21,w21,#24 1575 ushr v13.4s,v27.4s,#25 1576 ror w17,w17,#24 1577 ushr v17.4s,v28.4s,#25 1578 ror w19,w19,#24 1579 ushr v21.4s,v29.4s,#25 1580 ror w20,w20,#24 1581 sli v1.4s,v24.4s,#7 1582 add w15,w15,w21 1583 sli v5.4s,v25.4s,#7 1584 add w16,w16,w17 1585 sli v9.4s,v26.4s,#7 1586 add w13,w13,w19 1587 sli v13.4s,v27.4s,#7 1588 add w14,w14,w20 1589 sli v17.4s,v28.4s,#7 1590 eor w10,w10,w15 1591 sli v21.4s,v29.4s,#7 1592 eor w11,w11,w16 1593 ext v2.16b,v2.16b,v2.16b,#8 1594 eor w12,w12,w13 1595 ext v6.16b,v6.16b,v6.16b,#8 1596 eor w9,w9,w14 1597 ext v10.16b,v10.16b,v10.16b,#8 1598 ror w10,w10,#25 1599 ext v14.16b,v14.16b,v14.16b,#8 1600 ror w11,w11,#25 1601 ext v18.16b,v18.16b,v18.16b,#8 1602 ror w12,w12,#25 1603 ext v22.16b,v22.16b,v22.16b,#8 1604 ror w9,w9,#25 1605 ext v3.16b,v3.16b,v3.16b,#12 1606 ext v7.16b,v7.16b,v7.16b,#12 1607 ext v11.16b,v11.16b,v11.16b,#12 1608 ext v15.16b,v15.16b,v15.16b,#12 1609 ext v19.16b,v19.16b,v19.16b,#12 1610 ext v23.16b,v23.16b,v23.16b,#12 1611 ext v1.16b,v1.16b,v1.16b,#4 1612 ext v5.16b,v5.16b,v5.16b,#4 1613 ext v9.16b,v9.16b,v9.16b,#4 1614 ext v13.16b,v13.16b,v13.16b,#4 1615 ext v17.16b,v17.16b,v17.16b,#4 1616 ext v21.16b,v21.16b,v21.16b,#4 1617 add v0.4s,v0.4s,v1.4s 1618 add w5,w5,w9 1619 add v4.4s,v4.4s,v5.4s 1620 add w6,w6,w10 1621 add v8.4s,v8.4s,v9.4s 1622 add w7,w7,w11 1623 add v12.4s,v12.4s,v13.4s 1624 add w8,w8,w12 1625 add v16.4s,v16.4s,v17.4s 1626 eor w17,w17,w5 1627 add v20.4s,v20.4s,v21.4s 1628 eor w19,w19,w6 1629 eor v3.16b,v3.16b,v0.16b 1630 eor w20,w20,w7 1631 eor v7.16b,v7.16b,v4.16b 1632 eor w21,w21,w8 1633 eor v11.16b,v11.16b,v8.16b 1634 ror w17,w17,#16 1635 eor v15.16b,v15.16b,v12.16b 1636 ror w19,w19,#16 1637 eor v19.16b,v19.16b,v16.16b 1638 ror w20,w20,#16 1639 eor v23.16b,v23.16b,v20.16b 1640 ror w21,w21,#16 1641 rev32 v3.8h,v3.8h 1642 add w13,w13,w17 1643 rev32 v7.8h,v7.8h 1644 add 
w14,w14,w19 1645 rev32 v11.8h,v11.8h 1646 add w15,w15,w20 1647 rev32 v15.8h,v15.8h 1648 add w16,w16,w21 1649 rev32 v19.8h,v19.8h 1650 eor w9,w9,w13 1651 rev32 v23.8h,v23.8h 1652 eor w10,w10,w14 1653 add v2.4s,v2.4s,v3.4s 1654 eor w11,w11,w15 1655 add v6.4s,v6.4s,v7.4s 1656 eor w12,w12,w16 1657 add v10.4s,v10.4s,v11.4s 1658 ror w9,w9,#20 1659 add v14.4s,v14.4s,v15.4s 1660 ror w10,w10,#20 1661 add v18.4s,v18.4s,v19.4s 1662 ror w11,w11,#20 1663 add v22.4s,v22.4s,v23.4s 1664 ror w12,w12,#20 1665 eor v24.16b,v1.16b,v2.16b 1666 add w5,w5,w9 1667 eor v25.16b,v5.16b,v6.16b 1668 add w6,w6,w10 1669 eor v26.16b,v9.16b,v10.16b 1670 add w7,w7,w11 1671 eor v27.16b,v13.16b,v14.16b 1672 add w8,w8,w12 1673 eor v28.16b,v17.16b,v18.16b 1674 eor w17,w17,w5 1675 eor v29.16b,v21.16b,v22.16b 1676 eor w19,w19,w6 1677 ushr v1.4s,v24.4s,#20 1678 eor w20,w20,w7 1679 ushr v5.4s,v25.4s,#20 1680 eor w21,w21,w8 1681 ushr v9.4s,v26.4s,#20 1682 ror w17,w17,#24 1683 ushr v13.4s,v27.4s,#20 1684 ror w19,w19,#24 1685 ushr v17.4s,v28.4s,#20 1686 ror w20,w20,#24 1687 ushr v21.4s,v29.4s,#20 1688 ror w21,w21,#24 1689 sli v1.4s,v24.4s,#12 1690 add w13,w13,w17 1691 sli v5.4s,v25.4s,#12 1692 add w14,w14,w19 1693 sli v9.4s,v26.4s,#12 1694 add w15,w15,w20 1695 sli v13.4s,v27.4s,#12 1696 add w16,w16,w21 1697 sli v17.4s,v28.4s,#12 1698 eor w9,w9,w13 1699 sli v21.4s,v29.4s,#12 1700 eor w10,w10,w14 1701 add v0.4s,v0.4s,v1.4s 1702 eor w11,w11,w15 1703 add v4.4s,v4.4s,v5.4s 1704 eor w12,w12,w16 1705 add v8.4s,v8.4s,v9.4s 1706 ror w9,w9,#25 1707 add v12.4s,v12.4s,v13.4s 1708 ror w10,w10,#25 1709 add v16.4s,v16.4s,v17.4s 1710 ror w11,w11,#25 1711 add v20.4s,v20.4s,v21.4s 1712 ror w12,w12,#25 1713 eor v24.16b,v3.16b,v0.16b 1714 add w5,w5,w10 1715 eor v25.16b,v7.16b,v4.16b 1716 add w6,w6,w11 1717 eor v26.16b,v11.16b,v8.16b 1718 add w7,w7,w12 1719 eor v27.16b,v15.16b,v12.16b 1720 add w8,w8,w9 1721 eor v28.16b,v19.16b,v16.16b 1722 eor w21,w21,w5 1723 eor v29.16b,v23.16b,v20.16b 1724 eor w17,w17,w6 1725 ushr 
v3.4s,v24.4s,#24 1726 eor w19,w19,w7 1727 ushr v7.4s,v25.4s,#24 1728 eor w20,w20,w8 1729 ushr v11.4s,v26.4s,#24 1730 ror w21,w21,#16 1731 ushr v15.4s,v27.4s,#24 1732 ror w17,w17,#16 1733 ushr v19.4s,v28.4s,#24 1734 ror w19,w19,#16 1735 ushr v23.4s,v29.4s,#24 1736 ror w20,w20,#16 1737 sli v3.4s,v24.4s,#8 1738 add w15,w15,w21 1739 sli v7.4s,v25.4s,#8 1740 add w16,w16,w17 1741 sli v11.4s,v26.4s,#8 1742 add w13,w13,w19 1743 sli v15.4s,v27.4s,#8 1744 add w14,w14,w20 1745 sli v19.4s,v28.4s,#8 1746 eor w10,w10,w15 1747 sli v23.4s,v29.4s,#8 1748 eor w11,w11,w16 1749 add v2.4s,v2.4s,v3.4s 1750 eor w12,w12,w13 1751 add v6.4s,v6.4s,v7.4s 1752 eor w9,w9,w14 1753 add v10.4s,v10.4s,v11.4s 1754 ror w10,w10,#20 1755 add v14.4s,v14.4s,v15.4s 1756 ror w11,w11,#20 1757 add v18.4s,v18.4s,v19.4s 1758 ror w12,w12,#20 1759 add v22.4s,v22.4s,v23.4s 1760 ror w9,w9,#20 1761 eor v24.16b,v1.16b,v2.16b 1762 add w5,w5,w10 1763 eor v25.16b,v5.16b,v6.16b 1764 add w6,w6,w11 1765 eor v26.16b,v9.16b,v10.16b 1766 add w7,w7,w12 1767 eor v27.16b,v13.16b,v14.16b 1768 add w8,w8,w9 1769 eor v28.16b,v17.16b,v18.16b 1770 eor w21,w21,w5 1771 eor v29.16b,v21.16b,v22.16b 1772 eor w17,w17,w6 1773 ushr v1.4s,v24.4s,#25 1774 eor w19,w19,w7 1775 ushr v5.4s,v25.4s,#25 1776 eor w20,w20,w8 1777 ushr v9.4s,v26.4s,#25 1778 ror w21,w21,#24 1779 ushr v13.4s,v27.4s,#25 1780 ror w17,w17,#24 1781 ushr v17.4s,v28.4s,#25 1782 ror w19,w19,#24 1783 ushr v21.4s,v29.4s,#25 1784 ror w20,w20,#24 1785 sli v1.4s,v24.4s,#7 1786 add w15,w15,w21 1787 sli v5.4s,v25.4s,#7 1788 add w16,w16,w17 1789 sli v9.4s,v26.4s,#7 1790 add w13,w13,w19 1791 sli v13.4s,v27.4s,#7 1792 add w14,w14,w20 1793 sli v17.4s,v28.4s,#7 1794 eor w10,w10,w15 1795 sli v21.4s,v29.4s,#7 1796 eor w11,w11,w16 1797 ext v2.16b,v2.16b,v2.16b,#8 1798 eor w12,w12,w13 1799 ext v6.16b,v6.16b,v6.16b,#8 1800 eor w9,w9,w14 1801 ext v10.16b,v10.16b,v10.16b,#8 1802 ror w10,w10,#25 1803 ext v14.16b,v14.16b,v14.16b,#8 1804 ror w11,w11,#25 1805 ext v18.16b,v18.16b,v18.16b,#8 1806 ror 
w12,w12,#25 1807 ext v22.16b,v22.16b,v22.16b,#8 1808 ror w9,w9,#25 1809 ext v3.16b,v3.16b,v3.16b,#4 1810 ext v7.16b,v7.16b,v7.16b,#4 1811 ext v11.16b,v11.16b,v11.16b,#4 1812 ext v15.16b,v15.16b,v15.16b,#4 1813 ext v19.16b,v19.16b,v19.16b,#4 1814 ext v23.16b,v23.16b,v23.16b,#4 1815 ext v1.16b,v1.16b,v1.16b,#12 1816 ext v5.16b,v5.16b,v5.16b,#12 1817 ext v9.16b,v9.16b,v9.16b,#12 1818 ext v13.16b,v13.16b,v13.16b,#12 1819 ext v17.16b,v17.16b,v17.16b,#12 1820 ext v21.16b,v21.16b,v21.16b,#12 1821 cbnz x4,Loop_lower_neon 1822 1823 add w5,w5,w22 // accumulate key block 1824 ldp q24,q25,[sp,#0] 1825 add x6,x6,x22,lsr#32 1826 ldp q26,q27,[sp,#32] 1827 add w7,w7,w23 1828 ldp q28,q29,[sp,#64] 1829 add x8,x8,x23,lsr#32 1830 add v0.4s,v0.4s,v24.4s 1831 add w9,w9,w24 1832 add v4.4s,v4.4s,v24.4s 1833 add x10,x10,x24,lsr#32 1834 add v8.4s,v8.4s,v24.4s 1835 add w11,w11,w25 1836 add v12.4s,v12.4s,v24.4s 1837 add x12,x12,x25,lsr#32 1838 add v16.4s,v16.4s,v24.4s 1839 add w13,w13,w26 1840 add v20.4s,v20.4s,v24.4s 1841 add x14,x14,x26,lsr#32 1842 add v2.4s,v2.4s,v26.4s 1843 add w15,w15,w27 1844 add v6.4s,v6.4s,v26.4s 1845 add x16,x16,x27,lsr#32 1846 add v10.4s,v10.4s,v26.4s 1847 add w17,w17,w28 1848 add v14.4s,v14.4s,v26.4s 1849 add x19,x19,x28,lsr#32 1850 add v18.4s,v18.4s,v26.4s 1851 add w20,w20,w30 1852 add v22.4s,v22.4s,v26.4s 1853 add x21,x21,x30,lsr#32 1854 add v19.4s,v19.4s,v31.4s // +4 1855 add x5,x5,x6,lsl#32 // pack 1856 add v23.4s,v23.4s,v31.4s // +4 1857 add x7,x7,x8,lsl#32 1858 add v3.4s,v3.4s,v27.4s 1859 ldp x6,x8,[x1,#0] // load input 1860 add v7.4s,v7.4s,v28.4s 1861 add x9,x9,x10,lsl#32 1862 add v11.4s,v11.4s,v29.4s 1863 add x11,x11,x12,lsl#32 1864 add v15.4s,v15.4s,v30.4s 1865 ldp x10,x12,[x1,#16] 1866 add v19.4s,v19.4s,v27.4s 1867 add x13,x13,x14,lsl#32 1868 add v23.4s,v23.4s,v28.4s 1869 add x15,x15,x16,lsl#32 1870 add v1.4s,v1.4s,v25.4s 1871 ldp x14,x16,[x1,#32] 1872 add v5.4s,v5.4s,v25.4s 1873 add x17,x17,x19,lsl#32 1874 add v9.4s,v9.4s,v25.4s 1875 add 
x20,x20,x21,lsl#32 1876 add v13.4s,v13.4s,v25.4s 1877 ldp x19,x21,[x1,#48] 1878 add v17.4s,v17.4s,v25.4s 1879 add x1,x1,#64 1880 add v21.4s,v21.4s,v25.4s 1881 1882#ifdef __ARMEB__ 1883 rev x5,x5 1884 rev x7,x7 1885 rev x9,x9 1886 rev x11,x11 1887 rev x13,x13 1888 rev x15,x15 1889 rev x17,x17 1890 rev x20,x20 1891#endif 1892 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1893 eor x5,x5,x6 1894 eor x7,x7,x8 1895 eor x9,x9,x10 1896 eor x11,x11,x12 1897 eor x13,x13,x14 1898 eor v0.16b,v0.16b,v24.16b 1899 eor x15,x15,x16 1900 eor v1.16b,v1.16b,v25.16b 1901 eor x17,x17,x19 1902 eor v2.16b,v2.16b,v26.16b 1903 eor x20,x20,x21 1904 eor v3.16b,v3.16b,v27.16b 1905 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1906 1907 stp x5,x7,[x0,#0] // store output 1908 add x28,x28,#7 // increment counter 1909 stp x9,x11,[x0,#16] 1910 stp x13,x15,[x0,#32] 1911 stp x17,x20,[x0,#48] 1912 add x0,x0,#64 1913 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 1914 1915 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1916 eor v4.16b,v4.16b,v24.16b 1917 eor v5.16b,v5.16b,v25.16b 1918 eor v6.16b,v6.16b,v26.16b 1919 eor v7.16b,v7.16b,v27.16b 1920 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 1921 1922 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 1923 eor v8.16b,v8.16b,v0.16b 1924 ldp q24,q25,[sp,#0] 1925 eor v9.16b,v9.16b,v1.16b 1926 ldp q26,q27,[sp,#32] 1927 eor v10.16b,v10.16b,v2.16b 1928 eor v11.16b,v11.16b,v3.16b 1929 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1930 1931 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1932 eor v12.16b,v12.16b,v4.16b 1933 eor v13.16b,v13.16b,v5.16b 1934 eor v14.16b,v14.16b,v6.16b 1935 eor v15.16b,v15.16b,v7.16b 1936 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1937 1938 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1939 eor v16.16b,v16.16b,v8.16b 1940 eor v17.16b,v17.16b,v9.16b 1941 eor v18.16b,v18.16b,v10.16b 1942 eor v19.16b,v19.16b,v11.16b 1943 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1944 1945 shl v0.4s,v31.4s,#1 // 4 -> 8 1946 eor v20.16b,v20.16b,v12.16b 1947 eor 
v21.16b,v21.16b,v13.16b 1948 eor v22.16b,v22.16b,v14.16b 1949 eor v23.16b,v23.16b,v15.16b 1950 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1951 1952 add v27.4s,v27.4s,v0.4s // += 8 1953 add v28.4s,v28.4s,v0.4s 1954 add v29.4s,v29.4s,v0.4s 1955 add v30.4s,v30.4s,v0.4s 1956 1957 b.hs Loop_outer_512_neon 1958 1959 adds x2,x2,#512 1960 ushr v0.4s,v31.4s,#2 // 4 -> 1 1961 1962 ldp d8,d9,[sp,#128+0] // meet ABI requirements 1963 ldp d10,d11,[sp,#128+16] 1964 ldp d12,d13,[sp,#128+32] 1965 ldp d14,d15,[sp,#128+48] 1966 1967 stp q24,q31,[sp,#0] // wipe off-load area 1968 stp q24,q31,[sp,#32] 1969 stp q24,q31,[sp,#64] 1970 1971 b.eq Ldone_512_neon 1972 1973 cmp x2,#192 1974 sub v27.4s,v27.4s,v0.4s // -= 1 1975 sub v28.4s,v28.4s,v0.4s 1976 sub v29.4s,v29.4s,v0.4s 1977 add sp,sp,#128 1978 b.hs Loop_outer_neon 1979 1980 eor v25.16b,v25.16b,v25.16b 1981 eor v26.16b,v26.16b,v26.16b 1982 eor v27.16b,v27.16b,v27.16b 1983 eor v28.16b,v28.16b,v28.16b 1984 eor v29.16b,v29.16b,v29.16b 1985 eor v30.16b,v30.16b,v30.16b 1986 b Loop_outer 1987 1988Ldone_512_neon: 1989 ldp x19,x20,[x29,#16] 1990 add sp,sp,#128+64 1991 ldp x21,x22,[x29,#32] 1992 ldp x23,x24,[x29,#48] 1993 ldp x25,x26,[x29,#64] 1994 ldp x27,x28,[x29,#80] 1995 ldp x29,x30,[sp],#96 1996 AARCH64_VALIDATE_LINK_REGISTER 1997 ret 1998 1999#endif 2000#endif // !OPENSSL_NO_ASM 2001