// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#include "ring_core_generated/prefix_symbols_asm.h"
#include <ring-core/arm_arch.h>


.hidden	OPENSSL_armcap_P

.section	.rodata

.align	5
// First 16 bytes of every block. Read as little-endian bytes the two
// quads spell the ASCII text "expand 32-byte k".
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Vector {1,0,0,0}: added to the counter block to step its first 32-bit lane.
.Lone:
.long	1,0,0,0
// NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

.text

//----------------------------------------------------------------------
// ChaCha20_ctr32
//
// XORs the input with a ChaCha20 keystream, 64 bytes per block.
//
// Register use on entry, as read by the code below:
//   x0 = output pointer (written 64 bytes at a time, then byte-wise)
//   x1 = input pointer
//   x2 = length in bytes; zero returns immediately
//   x3 = pointer to 32 bytes of key
//   x4 = pointer to 16 bytes of counter block
// NOTE(review): the C prototype is not visible in this file - confirm
// the argument types against the caller.
//
// Lengths of 192 bytes or more branch to ChaCha20_neon when the
// ARMV7_NEON bit is set in OPENSSL_armcap_P; everything else is handled
// by the scalar code at .Lshort.
//
// Stack: 96 bytes for x29/x30 and x19-x28, plus 64 bytes of scratch
// that the tail path uses to stage one keystream block.
//----------------------------------------------------------------------
.globl	ChaCha20_ctr32
.hidden	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	// Macro supplied by the included headers.
	// NOTE(review): presumably a branch-target landing pad - confirm.
	AARCH64_VALID_CALL_TARGET
	cbz	x2,.Labort			// nothing to do for a zero length
	// Page address of the capability word; it is only dereferenced on
	// the long-input path below.
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
	adrp	x5,:pg_hi21_nc:OPENSSL_armcap_P
#else
	adrp	x5,OPENSSL_armcap_P
#endif
	cmp	x2,#192
	b.lo	.Lshort				// short inputs always use the scalar code
	ldr	w17,[x5,:lo12:OPENSSL_armcap_P]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon			// plain branch: no frame has been set up yet

.Lshort:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0			// x29 = frame base, used to reload saved registers

	adrp	x5,.Lsigma
	add	x5,x5,:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64			// 64-byte scratch area for a partial final block

	// The initial state is kept packed, two 32-bit words per register,
	// in x22-x28 and x30 for the whole call.
	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef	__ARMEB__
	// Big-endian build: swap the two 32-bit halves of each loaded doubleword.
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

// One pass of .Loop_outer produces one 64-byte keystream block.
.Loop_outer:
	// Split the packed state into sixteen 32-bit working words held in
	// w5-w17 and w19-w21 (x18 is not used).
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10				// 10 iterations of the double round below
	// The flags set here stay live until the b.lo/b.hi after the loop;
	// no instruction in between writes the condition flags.
	subs	x2,x2,#64
.Loop:
	sub	x4,x4,#1
	// Column round: four quarter-rounds on (w5,w9,w13,w17),
	// (w6,w10,w14,w19), (w7,w11,w15,w20) and (w8,w12,w16,w21).
	// ror by 16/20/24/25 is a 32-bit rotate left by 16/12/8/7.
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Diagonal round: the same quarter-round on (w5,w10,w15,w21),
	// (w6,w11,w16,w17), (w7,w12,w13,w19) and (w8,w9,w14,w20).
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	// Add the initial state back into the working words. The 64-bit
	// adds may carry above bit 31; the lsl#32 in the pack step below
	// shifts any such carry out.
	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	// Flags are from "subs x2,x2,#64": fewer than 64 bytes were left.
	b.lo	.Ltail

	// Full block: re-pack word pairs into 64-bit registers while the
	// 64 input bytes are loaded into the freed odd-word registers.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	// Big-endian build: byte-reverse the keystream before the XOR.
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	// Flags are still from "subs x2,x2,#64": loop while bytes remain.
	b.hi	.Loop_outer

	// Epilogue: drop the scratch area and restore callee-saved registers.
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
.Labort:
	ret

.align	4
.Ltail:
	add	x2,x2,#64			// undo the subs: x2 = bytes still to process
// Also branched to from .Ltail_neon in ChaCha20_neon, with x2 below 64
// and the scalar keystream block still unpacked in w5-w21.
.Less_than_64:
	// Bias the pointers so that a single negative index counting up to
	// zero walks input, keystream and output. x0 is biased by one less
	// because the store happens after the index has been incremented.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Stage the keystream block in the stack scratch area.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

// Byte-wise XOR of the remaining input with the staged keystream.
.Loop_tail:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail

	// Overwrite the staged keystream with zeros before releasing the stack.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

// ChaCha20_neon: reached by a branch from ChaCha20_ctr32, so it builds
// its own 96-byte frame. Its body continues below.
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	// Continuation of ChaCha20_neon: x29/x30 were pushed just above.
	// x0-x4 are used exactly as in ChaCha20_ctr32 (output, input,
	// length, key, counter block).
	add	x29,sp,#0			// x29 = frame base, used to reload saved registers

	adrp	x5,.Lsigma
	add	x5,x5,:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	.L512_or_more_neon		// 512 bytes or more: continue in ChaCha20_512_neon

	sub	sp,sp,#64			// 64-byte scratch area for a partial final block

	// The initial state is held twice: packed in x22-x28/x30 for the
	// scalar block, and in v24 (sigma), v25/v26 (key) and v27 (counter
	// block) for the NEON blocks. v31 receives .Lone = {1,0,0,0}.
	ldp	x22,x23,[x5]		// load sigma
	ld1	{v24.4s},[x5],#16
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]
#ifdef	__ARMEB__
	// Big-endian build: swap 32-bit halves so word order matches.
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	// The scalar block uses the counter as loaded (x28); the three NEON
	// blocks use counter+1, +2 and +3 in v27, v28 and v29.
	add	v27.4s,v27.4s,v31.4s		// += 1
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	shl	v31.4s,v31.4s,#2		// 1 -> 4

// One pass produces 256 bytes of keystream: one block in scalar
// registers and three in NEON registers (v0-v3, v4-v7, v16-v19).
// Scalar and vector instructions are interleaved.
.Loop_outer_neon:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	mov	x4,#10				// 10 iterations of the double round below
	// The flags set here stay live until the b.lo/b.hi after the loop;
	// no instruction in between writes the condition flags.
	subs	x2,x2,#256
.Loop_neon:
	sub	x4,x4,#1
	// Column round. Scalar: quarter-rounds on (w5,w9,w13,w17) etc.
	// NEON: each vector is one row, so the four columns are processed
	// in parallel. rev32 on .8h lanes is a 32-bit rotate by 16; each
	// ushr/sli pair is a rotate left by 12, 8 or 7.
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	// Rotate the second, third and fourth rows by one, two and three
	// 32-bit lanes so the next vector round works on the diagonals.
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	// Diagonal round. Scalar: quarter-rounds on (w5,w10,w15,w21) etc.
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	// Undo the lane rotation so rows are column-aligned again.
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,.Loop_neon

	// Add the initial state back into all four blocks. Each NEON block
	// adds its own counter row (v27, v28 or v29).
	add	w5,w5,w22		// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	// Flags are from "subs x2,x2,#256": fewer than 256 bytes were left.
	b.lo	.Ltail_neon

	// Full 256 bytes. Output order: scalar block, then v0-v3, v4-v7
	// and v16-v19.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	// Big-endian build: byte-reverse the scalar keystream before the XOR.
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	// v0-v3 have been stored, so they are reused as the input buffer
	// for the last 64 bytes.
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	// Flags are still from "subs x2,x2,#256": loop while bytes remain.
	b.hi	.Loop_outer_neon

	// Epilogue: drop the scratch area and restore callee-saved registers.
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

// Fewer than 256 bytes remain. Whole 64-byte blocks are consumed in
// output order (scalar, v0-v3, v4-v7); the keystream block covering a
// final partial chunk is staged on the stack for .Last_neon.
.Ltail_neon:
	add	x2,x2,#256			// undo the subs: x2 = bytes still to process
	cmp	x2,#64
	b.lo	.Less_than_64			// label inside ChaCha20_ctr32; uses the scalar block

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	// Each b.eq below tests the most recent "cmp x2,#64": exactly 64
	// bytes were left, so the block just written was the last one.
	b.eq	.Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64

	// Stage the fourth keystream block for the final partial chunk.
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
	b	.Last_neon

.Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
	b	.Last_neon
.Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	// Bias the pointers so that a single negative index counting up to
	// zero walks input, keystream and output. x0 is biased by one less
	// because the store happens after the index has been incremented.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

// Byte-wise XOR of the remaining input with the staged keystream.
.Loop_tail_neon:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail_neon

	// Overwrite the staged keystream with zeros before releasing the stack.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
// ChaCha20_512_neon: its prologue mirrors the 96-byte frame above, and
// ChaCha20_neon branches into it at .L512_or_more_neon with that frame
// already built. Its body continues below.
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
826 add x29,sp,#0 827 828 adrp x5,.Lsigma 829 add x5,x5,:lo12:.Lsigma 830 stp x19,x20,[sp,#16] 831 stp x21,x22,[sp,#32] 832 stp x23,x24,[sp,#48] 833 stp x25,x26,[sp,#64] 834 stp x27,x28,[sp,#80] 835 836.L512_or_more_neon: 837 sub sp,sp,#128+64 838 839 ldp x22,x23,[x5] // load sigma 840 ld1 {v24.4s},[x5],#16 841 ldp x24,x25,[x3] // load key 842 ldp x26,x27,[x3,#16] 843 ld1 {v25.4s,v26.4s},[x3] 844 ldp x28,x30,[x4] // load counter 845 ld1 {v27.4s},[x4] 846 ld1 {v31.4s},[x5] 847#ifdef __ARMEB__ 848 rev64 v24.4s,v24.4s 849 ror x24,x24,#32 850 ror x25,x25,#32 851 ror x26,x26,#32 852 ror x27,x27,#32 853 ror x28,x28,#32 854 ror x30,x30,#32 855#endif 856 add v27.4s,v27.4s,v31.4s // += 1 857 stp q24,q25,[sp,#0] // off-load key block, invariant part 858 add v27.4s,v27.4s,v31.4s // not typo 859 str q26,[sp,#32] 860 add v28.4s,v27.4s,v31.4s 861 add v29.4s,v28.4s,v31.4s 862 add v30.4s,v29.4s,v31.4s 863 shl v31.4s,v31.4s,#2 // 1 -> 4 864 865 stp d8,d9,[sp,#128+0] // meet ABI requirements 866 stp d10,d11,[sp,#128+16] 867 stp d12,d13,[sp,#128+32] 868 stp d14,d15,[sp,#128+48] 869 870 sub x2,x2,#512 // not typo 871 872.Loop_outer_512_neon: 873 mov v0.16b,v24.16b 874 mov v4.16b,v24.16b 875 mov v8.16b,v24.16b 876 mov v12.16b,v24.16b 877 mov v16.16b,v24.16b 878 mov v20.16b,v24.16b 879 mov v1.16b,v25.16b 880 mov w5,w22 // unpack key block 881 mov v5.16b,v25.16b 882 lsr x6,x22,#32 883 mov v9.16b,v25.16b 884 mov w7,w23 885 mov v13.16b,v25.16b 886 lsr x8,x23,#32 887 mov v17.16b,v25.16b 888 mov w9,w24 889 mov v21.16b,v25.16b 890 lsr x10,x24,#32 891 mov v3.16b,v27.16b 892 mov w11,w25 893 mov v7.16b,v28.16b 894 lsr x12,x25,#32 895 mov v11.16b,v29.16b 896 mov w13,w26 897 mov v15.16b,v30.16b 898 lsr x14,x26,#32 899 mov v2.16b,v26.16b 900 mov w15,w27 901 mov v6.16b,v26.16b 902 lsr x16,x27,#32 903 add v19.4s,v3.4s,v31.4s // +4 904 mov w17,w28 905 add v23.4s,v7.4s,v31.4s // +4 906 lsr x19,x28,#32 907 mov v10.16b,v26.16b 908 mov w20,w30 909 mov v14.16b,v26.16b 910 lsr x21,x30,#32 911 mov 
v18.16b,v26.16b 912 stp q27,q28,[sp,#48] // off-load key block, variable part 913 mov v22.16b,v26.16b 914 str q29,[sp,#80] 915 916 mov x4,#5 917 subs x2,x2,#512 918.Loop_upper_neon: 919 sub x4,x4,#1 920 add v0.4s,v0.4s,v1.4s 921 add w5,w5,w9 922 add v4.4s,v4.4s,v5.4s 923 add w6,w6,w10 924 add v8.4s,v8.4s,v9.4s 925 add w7,w7,w11 926 add v12.4s,v12.4s,v13.4s 927 add w8,w8,w12 928 add v16.4s,v16.4s,v17.4s 929 eor w17,w17,w5 930 add v20.4s,v20.4s,v21.4s 931 eor w19,w19,w6 932 eor v3.16b,v3.16b,v0.16b 933 eor w20,w20,w7 934 eor v7.16b,v7.16b,v4.16b 935 eor w21,w21,w8 936 eor v11.16b,v11.16b,v8.16b 937 ror w17,w17,#16 938 eor v15.16b,v15.16b,v12.16b 939 ror w19,w19,#16 940 eor v19.16b,v19.16b,v16.16b 941 ror w20,w20,#16 942 eor v23.16b,v23.16b,v20.16b 943 ror w21,w21,#16 944 rev32 v3.8h,v3.8h 945 add w13,w13,w17 946 rev32 v7.8h,v7.8h 947 add w14,w14,w19 948 rev32 v11.8h,v11.8h 949 add w15,w15,w20 950 rev32 v15.8h,v15.8h 951 add w16,w16,w21 952 rev32 v19.8h,v19.8h 953 eor w9,w9,w13 954 rev32 v23.8h,v23.8h 955 eor w10,w10,w14 956 add v2.4s,v2.4s,v3.4s 957 eor w11,w11,w15 958 add v6.4s,v6.4s,v7.4s 959 eor w12,w12,w16 960 add v10.4s,v10.4s,v11.4s 961 ror w9,w9,#20 962 add v14.4s,v14.4s,v15.4s 963 ror w10,w10,#20 964 add v18.4s,v18.4s,v19.4s 965 ror w11,w11,#20 966 add v22.4s,v22.4s,v23.4s 967 ror w12,w12,#20 968 eor v24.16b,v1.16b,v2.16b 969 add w5,w5,w9 970 eor v25.16b,v5.16b,v6.16b 971 add w6,w6,w10 972 eor v26.16b,v9.16b,v10.16b 973 add w7,w7,w11 974 eor v27.16b,v13.16b,v14.16b 975 add w8,w8,w12 976 eor v28.16b,v17.16b,v18.16b 977 eor w17,w17,w5 978 eor v29.16b,v21.16b,v22.16b 979 eor w19,w19,w6 980 ushr v1.4s,v24.4s,#20 981 eor w20,w20,w7 982 ushr v5.4s,v25.4s,#20 983 eor w21,w21,w8 984 ushr v9.4s,v26.4s,#20 985 ror w17,w17,#24 986 ushr v13.4s,v27.4s,#20 987 ror w19,w19,#24 988 ushr v17.4s,v28.4s,#20 989 ror w20,w20,#24 990 ushr v21.4s,v29.4s,#20 991 ror w21,w21,#24 992 sli v1.4s,v24.4s,#12 993 add w13,w13,w17 994 sli v5.4s,v25.4s,#12 995 add w14,w14,w19 996 sli 
v9.4s,v26.4s,#12 997 add w15,w15,w20 998 sli v13.4s,v27.4s,#12 999 add w16,w16,w21 1000 sli v17.4s,v28.4s,#12 1001 eor w9,w9,w13 1002 sli v21.4s,v29.4s,#12 1003 eor w10,w10,w14 1004 add v0.4s,v0.4s,v1.4s 1005 eor w11,w11,w15 1006 add v4.4s,v4.4s,v5.4s 1007 eor w12,w12,w16 1008 add v8.4s,v8.4s,v9.4s 1009 ror w9,w9,#25 1010 add v12.4s,v12.4s,v13.4s 1011 ror w10,w10,#25 1012 add v16.4s,v16.4s,v17.4s 1013 ror w11,w11,#25 1014 add v20.4s,v20.4s,v21.4s 1015 ror w12,w12,#25 1016 eor v24.16b,v3.16b,v0.16b 1017 add w5,w5,w10 1018 eor v25.16b,v7.16b,v4.16b 1019 add w6,w6,w11 1020 eor v26.16b,v11.16b,v8.16b 1021 add w7,w7,w12 1022 eor v27.16b,v15.16b,v12.16b 1023 add w8,w8,w9 1024 eor v28.16b,v19.16b,v16.16b 1025 eor w21,w21,w5 1026 eor v29.16b,v23.16b,v20.16b 1027 eor w17,w17,w6 1028 ushr v3.4s,v24.4s,#24 1029 eor w19,w19,w7 1030 ushr v7.4s,v25.4s,#24 1031 eor w20,w20,w8 1032 ushr v11.4s,v26.4s,#24 1033 ror w21,w21,#16 1034 ushr v15.4s,v27.4s,#24 1035 ror w17,w17,#16 1036 ushr v19.4s,v28.4s,#24 1037 ror w19,w19,#16 1038 ushr v23.4s,v29.4s,#24 1039 ror w20,w20,#16 1040 sli v3.4s,v24.4s,#8 1041 add w15,w15,w21 1042 sli v7.4s,v25.4s,#8 1043 add w16,w16,w17 1044 sli v11.4s,v26.4s,#8 1045 add w13,w13,w19 1046 sli v15.4s,v27.4s,#8 1047 add w14,w14,w20 1048 sli v19.4s,v28.4s,#8 1049 eor w10,w10,w15 1050 sli v23.4s,v29.4s,#8 1051 eor w11,w11,w16 1052 add v2.4s,v2.4s,v3.4s 1053 eor w12,w12,w13 1054 add v6.4s,v6.4s,v7.4s 1055 eor w9,w9,w14 1056 add v10.4s,v10.4s,v11.4s 1057 ror w10,w10,#20 1058 add v14.4s,v14.4s,v15.4s 1059 ror w11,w11,#20 1060 add v18.4s,v18.4s,v19.4s 1061 ror w12,w12,#20 1062 add v22.4s,v22.4s,v23.4s 1063 ror w9,w9,#20 1064 eor v24.16b,v1.16b,v2.16b 1065 add w5,w5,w10 1066 eor v25.16b,v5.16b,v6.16b 1067 add w6,w6,w11 1068 eor v26.16b,v9.16b,v10.16b 1069 add w7,w7,w12 1070 eor v27.16b,v13.16b,v14.16b 1071 add w8,w8,w9 1072 eor v28.16b,v17.16b,v18.16b 1073 eor w21,w21,w5 1074 eor v29.16b,v21.16b,v22.16b 1075 eor w17,w17,w6 1076 ushr v1.4s,v24.4s,#25 1077 eor 
w19,w19,w7 1078 ushr v5.4s,v25.4s,#25 1079 eor w20,w20,w8 1080 ushr v9.4s,v26.4s,#25 1081 ror w21,w21,#24 1082 ushr v13.4s,v27.4s,#25 1083 ror w17,w17,#24 1084 ushr v17.4s,v28.4s,#25 1085 ror w19,w19,#24 1086 ushr v21.4s,v29.4s,#25 1087 ror w20,w20,#24 1088 sli v1.4s,v24.4s,#7 1089 add w15,w15,w21 1090 sli v5.4s,v25.4s,#7 1091 add w16,w16,w17 1092 sli v9.4s,v26.4s,#7 1093 add w13,w13,w19 1094 sli v13.4s,v27.4s,#7 1095 add w14,w14,w20 1096 sli v17.4s,v28.4s,#7 1097 eor w10,w10,w15 1098 sli v21.4s,v29.4s,#7 1099 eor w11,w11,w16 1100 ext v2.16b,v2.16b,v2.16b,#8 1101 eor w12,w12,w13 1102 ext v6.16b,v6.16b,v6.16b,#8 1103 eor w9,w9,w14 1104 ext v10.16b,v10.16b,v10.16b,#8 1105 ror w10,w10,#25 1106 ext v14.16b,v14.16b,v14.16b,#8 1107 ror w11,w11,#25 1108 ext v18.16b,v18.16b,v18.16b,#8 1109 ror w12,w12,#25 1110 ext v22.16b,v22.16b,v22.16b,#8 1111 ror w9,w9,#25 1112 ext v3.16b,v3.16b,v3.16b,#12 1113 ext v7.16b,v7.16b,v7.16b,#12 1114 ext v11.16b,v11.16b,v11.16b,#12 1115 ext v15.16b,v15.16b,v15.16b,#12 1116 ext v19.16b,v19.16b,v19.16b,#12 1117 ext v23.16b,v23.16b,v23.16b,#12 1118 ext v1.16b,v1.16b,v1.16b,#4 1119 ext v5.16b,v5.16b,v5.16b,#4 1120 ext v9.16b,v9.16b,v9.16b,#4 1121 ext v13.16b,v13.16b,v13.16b,#4 1122 ext v17.16b,v17.16b,v17.16b,#4 1123 ext v21.16b,v21.16b,v21.16b,#4 1124 add v0.4s,v0.4s,v1.4s 1125 add w5,w5,w9 1126 add v4.4s,v4.4s,v5.4s 1127 add w6,w6,w10 1128 add v8.4s,v8.4s,v9.4s 1129 add w7,w7,w11 1130 add v12.4s,v12.4s,v13.4s 1131 add w8,w8,w12 1132 add v16.4s,v16.4s,v17.4s 1133 eor w17,w17,w5 1134 add v20.4s,v20.4s,v21.4s 1135 eor w19,w19,w6 1136 eor v3.16b,v3.16b,v0.16b 1137 eor w20,w20,w7 1138 eor v7.16b,v7.16b,v4.16b 1139 eor w21,w21,w8 1140 eor v11.16b,v11.16b,v8.16b 1141 ror w17,w17,#16 1142 eor v15.16b,v15.16b,v12.16b 1143 ror w19,w19,#16 1144 eor v19.16b,v19.16b,v16.16b 1145 ror w20,w20,#16 1146 eor v23.16b,v23.16b,v20.16b 1147 ror w21,w21,#16 1148 rev32 v3.8h,v3.8h 1149 add w13,w13,w17 1150 rev32 v7.8h,v7.8h 1151 add w14,w14,w19 1152 rev32 
v11.8h,v11.8h 1153 add w15,w15,w20 1154 rev32 v15.8h,v15.8h 1155 add w16,w16,w21 1156 rev32 v19.8h,v19.8h 1157 eor w9,w9,w13 1158 rev32 v23.8h,v23.8h 1159 eor w10,w10,w14 1160 add v2.4s,v2.4s,v3.4s 1161 eor w11,w11,w15 1162 add v6.4s,v6.4s,v7.4s 1163 eor w12,w12,w16 1164 add v10.4s,v10.4s,v11.4s 1165 ror w9,w9,#20 1166 add v14.4s,v14.4s,v15.4s 1167 ror w10,w10,#20 1168 add v18.4s,v18.4s,v19.4s 1169 ror w11,w11,#20 1170 add v22.4s,v22.4s,v23.4s 1171 ror w12,w12,#20 1172 eor v24.16b,v1.16b,v2.16b 1173 add w5,w5,w9 1174 eor v25.16b,v5.16b,v6.16b 1175 add w6,w6,w10 1176 eor v26.16b,v9.16b,v10.16b 1177 add w7,w7,w11 1178 eor v27.16b,v13.16b,v14.16b 1179 add w8,w8,w12 1180 eor v28.16b,v17.16b,v18.16b 1181 eor w17,w17,w5 1182 eor v29.16b,v21.16b,v22.16b 1183 eor w19,w19,w6 1184 ushr v1.4s,v24.4s,#20 1185 eor w20,w20,w7 1186 ushr v5.4s,v25.4s,#20 1187 eor w21,w21,w8 1188 ushr v9.4s,v26.4s,#20 1189 ror w17,w17,#24 1190 ushr v13.4s,v27.4s,#20 1191 ror w19,w19,#24 1192 ushr v17.4s,v28.4s,#20 1193 ror w20,w20,#24 1194 ushr v21.4s,v29.4s,#20 1195 ror w21,w21,#24 1196 sli v1.4s,v24.4s,#12 1197 add w13,w13,w17 1198 sli v5.4s,v25.4s,#12 1199 add w14,w14,w19 1200 sli v9.4s,v26.4s,#12 1201 add w15,w15,w20 1202 sli v13.4s,v27.4s,#12 1203 add w16,w16,w21 1204 sli v17.4s,v28.4s,#12 1205 eor w9,w9,w13 1206 sli v21.4s,v29.4s,#12 1207 eor w10,w10,w14 1208 add v0.4s,v0.4s,v1.4s 1209 eor w11,w11,w15 1210 add v4.4s,v4.4s,v5.4s 1211 eor w12,w12,w16 1212 add v8.4s,v8.4s,v9.4s 1213 ror w9,w9,#25 1214 add v12.4s,v12.4s,v13.4s 1215 ror w10,w10,#25 1216 add v16.4s,v16.4s,v17.4s 1217 ror w11,w11,#25 1218 add v20.4s,v20.4s,v21.4s 1219 ror w12,w12,#25 1220 eor v24.16b,v3.16b,v0.16b 1221 add w5,w5,w10 1222 eor v25.16b,v7.16b,v4.16b 1223 add w6,w6,w11 1224 eor v26.16b,v11.16b,v8.16b 1225 add w7,w7,w12 1226 eor v27.16b,v15.16b,v12.16b 1227 add w8,w8,w9 1228 eor v28.16b,v19.16b,v16.16b 1229 eor w21,w21,w5 1230 eor v29.16b,v23.16b,v20.16b 1231 eor w17,w17,w6 1232 ushr v3.4s,v24.4s,#24 1233 eor w19,w19,w7 
1234 ushr v7.4s,v25.4s,#24 1235 eor w20,w20,w8 1236 ushr v11.4s,v26.4s,#24 1237 ror w21,w21,#16 1238 ushr v15.4s,v27.4s,#24 1239 ror w17,w17,#16 1240 ushr v19.4s,v28.4s,#24 1241 ror w19,w19,#16 1242 ushr v23.4s,v29.4s,#24 1243 ror w20,w20,#16 1244 sli v3.4s,v24.4s,#8 1245 add w15,w15,w21 1246 sli v7.4s,v25.4s,#8 1247 add w16,w16,w17 1248 sli v11.4s,v26.4s,#8 1249 add w13,w13,w19 1250 sli v15.4s,v27.4s,#8 1251 add w14,w14,w20 1252 sli v19.4s,v28.4s,#8 1253 eor w10,w10,w15 1254 sli v23.4s,v29.4s,#8 1255 eor w11,w11,w16 1256 add v2.4s,v2.4s,v3.4s 1257 eor w12,w12,w13 1258 add v6.4s,v6.4s,v7.4s 1259 eor w9,w9,w14 1260 add v10.4s,v10.4s,v11.4s 1261 ror w10,w10,#20 1262 add v14.4s,v14.4s,v15.4s 1263 ror w11,w11,#20 1264 add v18.4s,v18.4s,v19.4s 1265 ror w12,w12,#20 1266 add v22.4s,v22.4s,v23.4s 1267 ror w9,w9,#20 1268 eor v24.16b,v1.16b,v2.16b 1269 add w5,w5,w10 1270 eor v25.16b,v5.16b,v6.16b 1271 add w6,w6,w11 1272 eor v26.16b,v9.16b,v10.16b 1273 add w7,w7,w12 1274 eor v27.16b,v13.16b,v14.16b 1275 add w8,w8,w9 1276 eor v28.16b,v17.16b,v18.16b 1277 eor w21,w21,w5 1278 eor v29.16b,v21.16b,v22.16b 1279 eor w17,w17,w6 1280 ushr v1.4s,v24.4s,#25 1281 eor w19,w19,w7 1282 ushr v5.4s,v25.4s,#25 1283 eor w20,w20,w8 1284 ushr v9.4s,v26.4s,#25 1285 ror w21,w21,#24 1286 ushr v13.4s,v27.4s,#25 1287 ror w17,w17,#24 1288 ushr v17.4s,v28.4s,#25 1289 ror w19,w19,#24 1290 ushr v21.4s,v29.4s,#25 1291 ror w20,w20,#24 1292 sli v1.4s,v24.4s,#7 1293 add w15,w15,w21 1294 sli v5.4s,v25.4s,#7 1295 add w16,w16,w17 1296 sli v9.4s,v26.4s,#7 1297 add w13,w13,w19 1298 sli v13.4s,v27.4s,#7 1299 add w14,w14,w20 1300 sli v17.4s,v28.4s,#7 1301 eor w10,w10,w15 1302 sli v21.4s,v29.4s,#7 1303 eor w11,w11,w16 1304 ext v2.16b,v2.16b,v2.16b,#8 1305 eor w12,w12,w13 1306 ext v6.16b,v6.16b,v6.16b,#8 1307 eor w9,w9,w14 1308 ext v10.16b,v10.16b,v10.16b,#8 1309 ror w10,w10,#25 1310 ext v14.16b,v14.16b,v14.16b,#8 1311 ror w11,w11,#25 1312 ext v18.16b,v18.16b,v18.16b,#8 1313 ror w12,w12,#25 1314 ext 
v22.16b,v22.16b,v22.16b,#8 1315 ror w9,w9,#25 1316 ext v3.16b,v3.16b,v3.16b,#4 1317 ext v7.16b,v7.16b,v7.16b,#4 1318 ext v11.16b,v11.16b,v11.16b,#4 1319 ext v15.16b,v15.16b,v15.16b,#4 1320 ext v19.16b,v19.16b,v19.16b,#4 1321 ext v23.16b,v23.16b,v23.16b,#4 1322 ext v1.16b,v1.16b,v1.16b,#12 1323 ext v5.16b,v5.16b,v5.16b,#12 1324 ext v9.16b,v9.16b,v9.16b,#12 1325 ext v13.16b,v13.16b,v13.16b,#12 1326 ext v17.16b,v17.16b,v17.16b,#12 1327 ext v21.16b,v21.16b,v21.16b,#12 1328 cbnz x4,.Loop_upper_neon 1329 1330 add w5,w5,w22 // accumulate key block 1331 add x6,x6,x22,lsr#32 1332 add w7,w7,w23 1333 add x8,x8,x23,lsr#32 1334 add w9,w9,w24 1335 add x10,x10,x24,lsr#32 1336 add w11,w11,w25 1337 add x12,x12,x25,lsr#32 1338 add w13,w13,w26 1339 add x14,x14,x26,lsr#32 1340 add w15,w15,w27 1341 add x16,x16,x27,lsr#32 1342 add w17,w17,w28 1343 add x19,x19,x28,lsr#32 1344 add w20,w20,w30 1345 add x21,x21,x30,lsr#32 1346 1347 add x5,x5,x6,lsl#32 // pack 1348 add x7,x7,x8,lsl#32 1349 ldp x6,x8,[x1,#0] // load input 1350 add x9,x9,x10,lsl#32 1351 add x11,x11,x12,lsl#32 1352 ldp x10,x12,[x1,#16] 1353 add x13,x13,x14,lsl#32 1354 add x15,x15,x16,lsl#32 1355 ldp x14,x16,[x1,#32] 1356 add x17,x17,x19,lsl#32 1357 add x20,x20,x21,lsl#32 1358 ldp x19,x21,[x1,#48] 1359 add x1,x1,#64 1360#ifdef __ARMEB__ 1361 rev x5,x5 1362 rev x7,x7 1363 rev x9,x9 1364 rev x11,x11 1365 rev x13,x13 1366 rev x15,x15 1367 rev x17,x17 1368 rev x20,x20 1369#endif 1370 eor x5,x5,x6 1371 eor x7,x7,x8 1372 eor x9,x9,x10 1373 eor x11,x11,x12 1374 eor x13,x13,x14 1375 eor x15,x15,x16 1376 eor x17,x17,x19 1377 eor x20,x20,x21 1378 1379 stp x5,x7,[x0,#0] // store output 1380 add x28,x28,#1 // increment counter 1381 mov w5,w22 // unpack key block 1382 lsr x6,x22,#32 1383 stp x9,x11,[x0,#16] 1384 mov w7,w23 1385 lsr x8,x23,#32 1386 stp x13,x15,[x0,#32] 1387 mov w9,w24 1388 lsr x10,x24,#32 1389 stp x17,x20,[x0,#48] 1390 add x0,x0,#64 1391 mov w11,w25 1392 lsr x12,x25,#32 1393 mov w13,w26 1394 lsr x14,x26,#32 1395 mov w15,w27 
1396 lsr x16,x27,#32 1397 mov w17,w28 1398 lsr x19,x28,#32 1399 mov w20,w30 1400 lsr x21,x30,#32 1401 1402 mov x4,#5 1403.Loop_lower_neon: 1404 sub x4,x4,#1 1405 add v0.4s,v0.4s,v1.4s 1406 add w5,w5,w9 1407 add v4.4s,v4.4s,v5.4s 1408 add w6,w6,w10 1409 add v8.4s,v8.4s,v9.4s 1410 add w7,w7,w11 1411 add v12.4s,v12.4s,v13.4s 1412 add w8,w8,w12 1413 add v16.4s,v16.4s,v17.4s 1414 eor w17,w17,w5 1415 add v20.4s,v20.4s,v21.4s 1416 eor w19,w19,w6 1417 eor v3.16b,v3.16b,v0.16b 1418 eor w20,w20,w7 1419 eor v7.16b,v7.16b,v4.16b 1420 eor w21,w21,w8 1421 eor v11.16b,v11.16b,v8.16b 1422 ror w17,w17,#16 1423 eor v15.16b,v15.16b,v12.16b 1424 ror w19,w19,#16 1425 eor v19.16b,v19.16b,v16.16b 1426 ror w20,w20,#16 1427 eor v23.16b,v23.16b,v20.16b 1428 ror w21,w21,#16 1429 rev32 v3.8h,v3.8h 1430 add w13,w13,w17 1431 rev32 v7.8h,v7.8h 1432 add w14,w14,w19 1433 rev32 v11.8h,v11.8h 1434 add w15,w15,w20 1435 rev32 v15.8h,v15.8h 1436 add w16,w16,w21 1437 rev32 v19.8h,v19.8h 1438 eor w9,w9,w13 1439 rev32 v23.8h,v23.8h 1440 eor w10,w10,w14 1441 add v2.4s,v2.4s,v3.4s 1442 eor w11,w11,w15 1443 add v6.4s,v6.4s,v7.4s 1444 eor w12,w12,w16 1445 add v10.4s,v10.4s,v11.4s 1446 ror w9,w9,#20 1447 add v14.4s,v14.4s,v15.4s 1448 ror w10,w10,#20 1449 add v18.4s,v18.4s,v19.4s 1450 ror w11,w11,#20 1451 add v22.4s,v22.4s,v23.4s 1452 ror w12,w12,#20 1453 eor v24.16b,v1.16b,v2.16b 1454 add w5,w5,w9 1455 eor v25.16b,v5.16b,v6.16b 1456 add w6,w6,w10 1457 eor v26.16b,v9.16b,v10.16b 1458 add w7,w7,w11 1459 eor v27.16b,v13.16b,v14.16b 1460 add w8,w8,w12 1461 eor v28.16b,v17.16b,v18.16b 1462 eor w17,w17,w5 1463 eor v29.16b,v21.16b,v22.16b 1464 eor w19,w19,w6 1465 ushr v1.4s,v24.4s,#20 1466 eor w20,w20,w7 1467 ushr v5.4s,v25.4s,#20 1468 eor w21,w21,w8 1469 ushr v9.4s,v26.4s,#20 1470 ror w17,w17,#24 1471 ushr v13.4s,v27.4s,#20 1472 ror w19,w19,#24 1473 ushr v17.4s,v28.4s,#20 1474 ror w20,w20,#24 1475 ushr v21.4s,v29.4s,#20 1476 ror w21,w21,#24 1477 sli v1.4s,v24.4s,#12 1478 add w13,w13,w17 1479 sli v5.4s,v25.4s,#12 
1480 add w14,w14,w19 1481 sli v9.4s,v26.4s,#12 1482 add w15,w15,w20 1483 sli v13.4s,v27.4s,#12 1484 add w16,w16,w21 1485 sli v17.4s,v28.4s,#12 1486 eor w9,w9,w13 1487 sli v21.4s,v29.4s,#12 1488 eor w10,w10,w14 1489 add v0.4s,v0.4s,v1.4s 1490 eor w11,w11,w15 1491 add v4.4s,v4.4s,v5.4s 1492 eor w12,w12,w16 1493 add v8.4s,v8.4s,v9.4s 1494 ror w9,w9,#25 1495 add v12.4s,v12.4s,v13.4s 1496 ror w10,w10,#25 1497 add v16.4s,v16.4s,v17.4s 1498 ror w11,w11,#25 1499 add v20.4s,v20.4s,v21.4s 1500 ror w12,w12,#25 1501 eor v24.16b,v3.16b,v0.16b 1502 add w5,w5,w10 1503 eor v25.16b,v7.16b,v4.16b 1504 add w6,w6,w11 1505 eor v26.16b,v11.16b,v8.16b 1506 add w7,w7,w12 1507 eor v27.16b,v15.16b,v12.16b 1508 add w8,w8,w9 1509 eor v28.16b,v19.16b,v16.16b 1510 eor w21,w21,w5 1511 eor v29.16b,v23.16b,v20.16b 1512 eor w17,w17,w6 1513 ushr v3.4s,v24.4s,#24 1514 eor w19,w19,w7 1515 ushr v7.4s,v25.4s,#24 1516 eor w20,w20,w8 1517 ushr v11.4s,v26.4s,#24 1518 ror w21,w21,#16 1519 ushr v15.4s,v27.4s,#24 1520 ror w17,w17,#16 1521 ushr v19.4s,v28.4s,#24 1522 ror w19,w19,#16 1523 ushr v23.4s,v29.4s,#24 1524 ror w20,w20,#16 1525 sli v3.4s,v24.4s,#8 1526 add w15,w15,w21 1527 sli v7.4s,v25.4s,#8 1528 add w16,w16,w17 1529 sli v11.4s,v26.4s,#8 1530 add w13,w13,w19 1531 sli v15.4s,v27.4s,#8 1532 add w14,w14,w20 1533 sli v19.4s,v28.4s,#8 1534 eor w10,w10,w15 1535 sli v23.4s,v29.4s,#8 1536 eor w11,w11,w16 1537 add v2.4s,v2.4s,v3.4s 1538 eor w12,w12,w13 1539 add v6.4s,v6.4s,v7.4s 1540 eor w9,w9,w14 1541 add v10.4s,v10.4s,v11.4s 1542 ror w10,w10,#20 1543 add v14.4s,v14.4s,v15.4s 1544 ror w11,w11,#20 1545 add v18.4s,v18.4s,v19.4s 1546 ror w12,w12,#20 1547 add v22.4s,v22.4s,v23.4s 1548 ror w9,w9,#20 1549 eor v24.16b,v1.16b,v2.16b 1550 add w5,w5,w10 1551 eor v25.16b,v5.16b,v6.16b 1552 add w6,w6,w11 1553 eor v26.16b,v9.16b,v10.16b 1554 add w7,w7,w12 1555 eor v27.16b,v13.16b,v14.16b 1556 add w8,w8,w9 1557 eor v28.16b,v17.16b,v18.16b 1558 eor w21,w21,w5 1559 eor v29.16b,v21.16b,v22.16b 1560 eor w17,w17,w6 1561 ushr 
v1.4s,v24.4s,#25 1562 eor w19,w19,w7 1563 ushr v5.4s,v25.4s,#25 1564 eor w20,w20,w8 1565 ushr v9.4s,v26.4s,#25 1566 ror w21,w21,#24 1567 ushr v13.4s,v27.4s,#25 1568 ror w17,w17,#24 1569 ushr v17.4s,v28.4s,#25 1570 ror w19,w19,#24 1571 ushr v21.4s,v29.4s,#25 1572 ror w20,w20,#24 1573 sli v1.4s,v24.4s,#7 1574 add w15,w15,w21 1575 sli v5.4s,v25.4s,#7 1576 add w16,w16,w17 1577 sli v9.4s,v26.4s,#7 1578 add w13,w13,w19 1579 sli v13.4s,v27.4s,#7 1580 add w14,w14,w20 1581 sli v17.4s,v28.4s,#7 1582 eor w10,w10,w15 1583 sli v21.4s,v29.4s,#7 1584 eor w11,w11,w16 1585 ext v2.16b,v2.16b,v2.16b,#8 1586 eor w12,w12,w13 1587 ext v6.16b,v6.16b,v6.16b,#8 1588 eor w9,w9,w14 1589 ext v10.16b,v10.16b,v10.16b,#8 1590 ror w10,w10,#25 1591 ext v14.16b,v14.16b,v14.16b,#8 1592 ror w11,w11,#25 1593 ext v18.16b,v18.16b,v18.16b,#8 1594 ror w12,w12,#25 1595 ext v22.16b,v22.16b,v22.16b,#8 1596 ror w9,w9,#25 1597 ext v3.16b,v3.16b,v3.16b,#12 1598 ext v7.16b,v7.16b,v7.16b,#12 1599 ext v11.16b,v11.16b,v11.16b,#12 1600 ext v15.16b,v15.16b,v15.16b,#12 1601 ext v19.16b,v19.16b,v19.16b,#12 1602 ext v23.16b,v23.16b,v23.16b,#12 1603 ext v1.16b,v1.16b,v1.16b,#4 1604 ext v5.16b,v5.16b,v5.16b,#4 1605 ext v9.16b,v9.16b,v9.16b,#4 1606 ext v13.16b,v13.16b,v13.16b,#4 1607 ext v17.16b,v17.16b,v17.16b,#4 1608 ext v21.16b,v21.16b,v21.16b,#4 1609 add v0.4s,v0.4s,v1.4s 1610 add w5,w5,w9 1611 add v4.4s,v4.4s,v5.4s 1612 add w6,w6,w10 1613 add v8.4s,v8.4s,v9.4s 1614 add w7,w7,w11 1615 add v12.4s,v12.4s,v13.4s 1616 add w8,w8,w12 1617 add v16.4s,v16.4s,v17.4s 1618 eor w17,w17,w5 1619 add v20.4s,v20.4s,v21.4s 1620 eor w19,w19,w6 1621 eor v3.16b,v3.16b,v0.16b 1622 eor w20,w20,w7 1623 eor v7.16b,v7.16b,v4.16b 1624 eor w21,w21,w8 1625 eor v11.16b,v11.16b,v8.16b 1626 ror w17,w17,#16 1627 eor v15.16b,v15.16b,v12.16b 1628 ror w19,w19,#16 1629 eor v19.16b,v19.16b,v16.16b 1630 ror w20,w20,#16 1631 eor v23.16b,v23.16b,v20.16b 1632 ror w21,w21,#16 1633 rev32 v3.8h,v3.8h 1634 add w13,w13,w17 1635 rev32 v7.8h,v7.8h 1636 add 
w14,w14,w19 1637 rev32 v11.8h,v11.8h 1638 add w15,w15,w20 1639 rev32 v15.8h,v15.8h 1640 add w16,w16,w21 1641 rev32 v19.8h,v19.8h 1642 eor w9,w9,w13 1643 rev32 v23.8h,v23.8h 1644 eor w10,w10,w14 1645 add v2.4s,v2.4s,v3.4s 1646 eor w11,w11,w15 1647 add v6.4s,v6.4s,v7.4s 1648 eor w12,w12,w16 1649 add v10.4s,v10.4s,v11.4s 1650 ror w9,w9,#20 1651 add v14.4s,v14.4s,v15.4s 1652 ror w10,w10,#20 1653 add v18.4s,v18.4s,v19.4s 1654 ror w11,w11,#20 1655 add v22.4s,v22.4s,v23.4s 1656 ror w12,w12,#20 1657 eor v24.16b,v1.16b,v2.16b 1658 add w5,w5,w9 1659 eor v25.16b,v5.16b,v6.16b 1660 add w6,w6,w10 1661 eor v26.16b,v9.16b,v10.16b 1662 add w7,w7,w11 1663 eor v27.16b,v13.16b,v14.16b 1664 add w8,w8,w12 1665 eor v28.16b,v17.16b,v18.16b 1666 eor w17,w17,w5 1667 eor v29.16b,v21.16b,v22.16b 1668 eor w19,w19,w6 1669 ushr v1.4s,v24.4s,#20 1670 eor w20,w20,w7 1671 ushr v5.4s,v25.4s,#20 1672 eor w21,w21,w8 1673 ushr v9.4s,v26.4s,#20 1674 ror w17,w17,#24 1675 ushr v13.4s,v27.4s,#20 1676 ror w19,w19,#24 1677 ushr v17.4s,v28.4s,#20 1678 ror w20,w20,#24 1679 ushr v21.4s,v29.4s,#20 1680 ror w21,w21,#24 1681 sli v1.4s,v24.4s,#12 1682 add w13,w13,w17 1683 sli v5.4s,v25.4s,#12 1684 add w14,w14,w19 1685 sli v9.4s,v26.4s,#12 1686 add w15,w15,w20 1687 sli v13.4s,v27.4s,#12 1688 add w16,w16,w21 1689 sli v17.4s,v28.4s,#12 1690 eor w9,w9,w13 1691 sli v21.4s,v29.4s,#12 1692 eor w10,w10,w14 1693 add v0.4s,v0.4s,v1.4s 1694 eor w11,w11,w15 1695 add v4.4s,v4.4s,v5.4s 1696 eor w12,w12,w16 1697 add v8.4s,v8.4s,v9.4s 1698 ror w9,w9,#25 1699 add v12.4s,v12.4s,v13.4s 1700 ror w10,w10,#25 1701 add v16.4s,v16.4s,v17.4s 1702 ror w11,w11,#25 1703 add v20.4s,v20.4s,v21.4s 1704 ror w12,w12,#25 1705 eor v24.16b,v3.16b,v0.16b 1706 add w5,w5,w10 1707 eor v25.16b,v7.16b,v4.16b 1708 add w6,w6,w11 1709 eor v26.16b,v11.16b,v8.16b 1710 add w7,w7,w12 1711 eor v27.16b,v15.16b,v12.16b 1712 add w8,w8,w9 1713 eor v28.16b,v19.16b,v16.16b 1714 eor w21,w21,w5 1715 eor v29.16b,v23.16b,v20.16b 1716 eor w17,w17,w6 1717 ushr 
v3.4s,v24.4s,#24 1718 eor w19,w19,w7 1719 ushr v7.4s,v25.4s,#24 1720 eor w20,w20,w8 1721 ushr v11.4s,v26.4s,#24 1722 ror w21,w21,#16 1723 ushr v15.4s,v27.4s,#24 1724 ror w17,w17,#16 1725 ushr v19.4s,v28.4s,#24 1726 ror w19,w19,#16 1727 ushr v23.4s,v29.4s,#24 1728 ror w20,w20,#16 1729 sli v3.4s,v24.4s,#8 1730 add w15,w15,w21 1731 sli v7.4s,v25.4s,#8 1732 add w16,w16,w17 1733 sli v11.4s,v26.4s,#8 1734 add w13,w13,w19 1735 sli v15.4s,v27.4s,#8 1736 add w14,w14,w20 1737 sli v19.4s,v28.4s,#8 1738 eor w10,w10,w15 1739 sli v23.4s,v29.4s,#8 1740 eor w11,w11,w16 1741 add v2.4s,v2.4s,v3.4s 1742 eor w12,w12,w13 1743 add v6.4s,v6.4s,v7.4s 1744 eor w9,w9,w14 1745 add v10.4s,v10.4s,v11.4s 1746 ror w10,w10,#20 1747 add v14.4s,v14.4s,v15.4s 1748 ror w11,w11,#20 1749 add v18.4s,v18.4s,v19.4s 1750 ror w12,w12,#20 1751 add v22.4s,v22.4s,v23.4s 1752 ror w9,w9,#20 1753 eor v24.16b,v1.16b,v2.16b 1754 add w5,w5,w10 1755 eor v25.16b,v5.16b,v6.16b 1756 add w6,w6,w11 1757 eor v26.16b,v9.16b,v10.16b 1758 add w7,w7,w12 1759 eor v27.16b,v13.16b,v14.16b 1760 add w8,w8,w9 1761 eor v28.16b,v17.16b,v18.16b 1762 eor w21,w21,w5 1763 eor v29.16b,v21.16b,v22.16b 1764 eor w17,w17,w6 1765 ushr v1.4s,v24.4s,#25 1766 eor w19,w19,w7 1767 ushr v5.4s,v25.4s,#25 1768 eor w20,w20,w8 1769 ushr v9.4s,v26.4s,#25 1770 ror w21,w21,#24 1771 ushr v13.4s,v27.4s,#25 1772 ror w17,w17,#24 1773 ushr v17.4s,v28.4s,#25 1774 ror w19,w19,#24 1775 ushr v21.4s,v29.4s,#25 1776 ror w20,w20,#24 1777 sli v1.4s,v24.4s,#7 1778 add w15,w15,w21 1779 sli v5.4s,v25.4s,#7 1780 add w16,w16,w17 1781 sli v9.4s,v26.4s,#7 1782 add w13,w13,w19 1783 sli v13.4s,v27.4s,#7 1784 add w14,w14,w20 1785 sli v17.4s,v28.4s,#7 1786 eor w10,w10,w15 1787 sli v21.4s,v29.4s,#7 1788 eor w11,w11,w16 1789 ext v2.16b,v2.16b,v2.16b,#8 1790 eor w12,w12,w13 1791 ext v6.16b,v6.16b,v6.16b,#8 1792 eor w9,w9,w14 1793 ext v10.16b,v10.16b,v10.16b,#8 1794 ror w10,w10,#25 1795 ext v14.16b,v14.16b,v14.16b,#8 1796 ror w11,w11,#25 1797 ext v18.16b,v18.16b,v18.16b,#8 1798 ror 
w12,w12,#25 1799 ext v22.16b,v22.16b,v22.16b,#8 1800 ror w9,w9,#25 1801 ext v3.16b,v3.16b,v3.16b,#4 1802 ext v7.16b,v7.16b,v7.16b,#4 1803 ext v11.16b,v11.16b,v11.16b,#4 1804 ext v15.16b,v15.16b,v15.16b,#4 1805 ext v19.16b,v19.16b,v19.16b,#4 1806 ext v23.16b,v23.16b,v23.16b,#4 1807 ext v1.16b,v1.16b,v1.16b,#12 1808 ext v5.16b,v5.16b,v5.16b,#12 1809 ext v9.16b,v9.16b,v9.16b,#12 1810 ext v13.16b,v13.16b,v13.16b,#12 1811 ext v17.16b,v17.16b,v17.16b,#12 1812 ext v21.16b,v21.16b,v21.16b,#12 1813 cbnz x4,.Loop_lower_neon 1814 1815 add w5,w5,w22 // accumulate key block 1816 ldp q24,q25,[sp,#0] 1817 add x6,x6,x22,lsr#32 1818 ldp q26,q27,[sp,#32] 1819 add w7,w7,w23 1820 ldp q28,q29,[sp,#64] 1821 add x8,x8,x23,lsr#32 1822 add v0.4s,v0.4s,v24.4s 1823 add w9,w9,w24 1824 add v4.4s,v4.4s,v24.4s 1825 add x10,x10,x24,lsr#32 1826 add v8.4s,v8.4s,v24.4s 1827 add w11,w11,w25 1828 add v12.4s,v12.4s,v24.4s 1829 add x12,x12,x25,lsr#32 1830 add v16.4s,v16.4s,v24.4s 1831 add w13,w13,w26 1832 add v20.4s,v20.4s,v24.4s 1833 add x14,x14,x26,lsr#32 1834 add v2.4s,v2.4s,v26.4s 1835 add w15,w15,w27 1836 add v6.4s,v6.4s,v26.4s 1837 add x16,x16,x27,lsr#32 1838 add v10.4s,v10.4s,v26.4s 1839 add w17,w17,w28 1840 add v14.4s,v14.4s,v26.4s 1841 add x19,x19,x28,lsr#32 1842 add v18.4s,v18.4s,v26.4s 1843 add w20,w20,w30 1844 add v22.4s,v22.4s,v26.4s 1845 add x21,x21,x30,lsr#32 1846 add v19.4s,v19.4s,v31.4s // +4 1847 add x5,x5,x6,lsl#32 // pack 1848 add v23.4s,v23.4s,v31.4s // +4 1849 add x7,x7,x8,lsl#32 1850 add v3.4s,v3.4s,v27.4s 1851 ldp x6,x8,[x1,#0] // load input 1852 add v7.4s,v7.4s,v28.4s 1853 add x9,x9,x10,lsl#32 1854 add v11.4s,v11.4s,v29.4s 1855 add x11,x11,x12,lsl#32 1856 add v15.4s,v15.4s,v30.4s 1857 ldp x10,x12,[x1,#16] 1858 add v19.4s,v19.4s,v27.4s 1859 add x13,x13,x14,lsl#32 1860 add v23.4s,v23.4s,v28.4s 1861 add x15,x15,x16,lsl#32 1862 add v1.4s,v1.4s,v25.4s 1863 ldp x14,x16,[x1,#32] 1864 add v5.4s,v5.4s,v25.4s 1865 add x17,x17,x19,lsl#32 1866 add v9.4s,v9.4s,v25.4s 1867 add 
x20,x20,x21,lsl#32 1868 add v13.4s,v13.4s,v25.4s 1869 ldp x19,x21,[x1,#48] 1870 add v17.4s,v17.4s,v25.4s 1871 add x1,x1,#64 1872 add v21.4s,v21.4s,v25.4s 1873 1874#ifdef __ARMEB__ 1875 rev x5,x5 1876 rev x7,x7 1877 rev x9,x9 1878 rev x11,x11 1879 rev x13,x13 1880 rev x15,x15 1881 rev x17,x17 1882 rev x20,x20 1883#endif 1884 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1885 eor x5,x5,x6 1886 eor x7,x7,x8 1887 eor x9,x9,x10 1888 eor x11,x11,x12 1889 eor x13,x13,x14 1890 eor v0.16b,v0.16b,v24.16b 1891 eor x15,x15,x16 1892 eor v1.16b,v1.16b,v25.16b 1893 eor x17,x17,x19 1894 eor v2.16b,v2.16b,v26.16b 1895 eor x20,x20,x21 1896 eor v3.16b,v3.16b,v27.16b 1897 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1898 1899 stp x5,x7,[x0,#0] // store output 1900 add x28,x28,#7 // increment counter 1901 stp x9,x11,[x0,#16] 1902 stp x13,x15,[x0,#32] 1903 stp x17,x20,[x0,#48] 1904 add x0,x0,#64 1905 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 1906 1907 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1908 eor v4.16b,v4.16b,v24.16b 1909 eor v5.16b,v5.16b,v25.16b 1910 eor v6.16b,v6.16b,v26.16b 1911 eor v7.16b,v7.16b,v27.16b 1912 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 1913 1914 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 1915 eor v8.16b,v8.16b,v0.16b 1916 ldp q24,q25,[sp,#0] 1917 eor v9.16b,v9.16b,v1.16b 1918 ldp q26,q27,[sp,#32] 1919 eor v10.16b,v10.16b,v2.16b 1920 eor v11.16b,v11.16b,v3.16b 1921 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1922 1923 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1924 eor v12.16b,v12.16b,v4.16b 1925 eor v13.16b,v13.16b,v5.16b 1926 eor v14.16b,v14.16b,v6.16b 1927 eor v15.16b,v15.16b,v7.16b 1928 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1929 1930 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1931 eor v16.16b,v16.16b,v8.16b 1932 eor v17.16b,v17.16b,v9.16b 1933 eor v18.16b,v18.16b,v10.16b 1934 eor v19.16b,v19.16b,v11.16b 1935 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1936 1937 shl v0.4s,v31.4s,#1 // 4 -> 8 1938 eor v20.16b,v20.16b,v12.16b 1939 eor 
v21.16b,v21.16b,v13.16b 1940 eor v22.16b,v22.16b,v14.16b 1941 eor v23.16b,v23.16b,v15.16b 1942 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1943 1944 add v27.4s,v27.4s,v0.4s // += 8 1945 add v28.4s,v28.4s,v0.4s 1946 add v29.4s,v29.4s,v0.4s 1947 add v30.4s,v30.4s,v0.4s 1948 1949 b.hs .Loop_outer_512_neon 1950 1951 adds x2,x2,#512 1952 ushr v0.4s,v31.4s,#2 // 4 -> 1 1953 1954 ldp d8,d9,[sp,#128+0] // meet ABI requirements 1955 ldp d10,d11,[sp,#128+16] 1956 ldp d12,d13,[sp,#128+32] 1957 ldp d14,d15,[sp,#128+48] 1958 1959 stp q24,q31,[sp,#0] // wipe off-load area 1960 stp q24,q31,[sp,#32] 1961 stp q24,q31,[sp,#64] 1962 1963 b.eq .Ldone_512_neon 1964 1965 cmp x2,#192 1966 sub v27.4s,v27.4s,v0.4s // -= 1 1967 sub v28.4s,v28.4s,v0.4s 1968 sub v29.4s,v29.4s,v0.4s 1969 add sp,sp,#128 1970 b.hs .Loop_outer_neon 1971 1972 eor v25.16b,v25.16b,v25.16b 1973 eor v26.16b,v26.16b,v26.16b 1974 eor v27.16b,v27.16b,v27.16b 1975 eor v28.16b,v28.16b,v28.16b 1976 eor v29.16b,v29.16b,v29.16b 1977 eor v30.16b,v30.16b,v30.16b 1978 b .Loop_outer 1979 1980.Ldone_512_neon: 1981 ldp x19,x20,[x29,#16] 1982 add sp,sp,#128+64 1983 ldp x21,x22,[x29,#32] 1984 ldp x23,x24,[x29,#48] 1985 ldp x25,x26,[x29,#64] 1986 ldp x27,x28,[x29,#80] 1987 ldp x29,x30,[sp],#96 1988 AARCH64_VALIDATE_LINK_REGISTER 1989 ret 1990.size ChaCha20_512_neon,.-ChaCha20_512_neon 1991#endif 1992#endif // !OPENSSL_NO_ASM 1993.section .note.GNU-stack,"",%progbits 1994