1// This file is generated from a similarly-named Perl script in the BoringSSL 2// source tree. Do not edit by hand. 3 4#if !defined(__has_feature) 5#define __has_feature(x) 0 6#endif 7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) 8#define OPENSSL_NO_ASM 9#endif 10 11#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__) 12#if defined(BORINGSSL_PREFIX) 13#include <boringssl_prefix_symbols_asm.h> 14#endif 15#include <openssl/arm_arch.h> 16.section .rodata 17 18.align 7 19.Lchacha20_consts: 20.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' 21.Linc: 22.long 1,2,3,4 23.Lrol8: 24.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 25.Lclamp: 26.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC 27 28.text 29 30.type .Lpoly_hash_ad_internal,%function 31.align 6 32.Lpoly_hash_ad_internal: 33.cfi_startproc 34 cbnz x4, .Lpoly_hash_intro 35 ret 36 37.Lpoly_hash_intro: 38 cmp x4, #16 39 b.lt .Lpoly_hash_ad_tail 40 ldp x11, x12, [x3], 16 41 adds x8, x8, x11 42 adcs x9, x9, x12 43 adc x10, x10, x15 44 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 45 umulh x12, x8, x16 46 mul x13, x9, x16 47 umulh x14, x9, x16 48 adds x12, x12, x13 49 mul x13, x10, x16 50 adc x13, x13, x14 51 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 52 umulh x8, x8, x17 53 adds x12, x12, x14 54 mul x14, x9, x17 55 umulh x9, x9, x17 56 adcs x14, x14, x8 57 mul x10, x10, x17 58 adc x10, x10, x9 59 adds x13, x13, x14 60 adc x14, x10, xzr 61 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 62 and x8, x13, #-4 63 extr x13, x14, x13, #2 64 adds x8, x8, x11 65 lsr x11, x14, #2 66 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 67 adds x8, x8, x13 68 adcs x9, x9, x12 69 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 70 sub x4, x4, #16 71 b .Lpoly_hash_ad_internal 72 73.Lpoly_hash_ad_tail: 74 cbz x4, .Lpoly_hash_ad_ret 75 76 eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD 77 sub x4, x4, #1 78 79.Lpoly_hash_tail_16_compose: 80 ext v20.16b, v20.16b, v20.16b, #15 81 ldrb w11, [x3, x4] 82 mov v20.b[0], w11 83 subs x4, x4, #1 84 b.ge .Lpoly_hash_tail_16_compose 85 mov x11, v20.d[0] 86 mov x12, v20.d[1] 87 adds x8, x8, x11 88 adcs x9, x9, x12 89 adc x10, x10, x15 90 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 91 umulh x12, x8, x16 92 mul x13, x9, x16 93 umulh x14, x9, x16 94 adds x12, x12, x13 95 mul x13, x10, x16 96 adc x13, x13, x14 97 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 98 umulh x8, x8, x17 99 adds x12, x12, x14 100 mul x14, x9, x17 101 umulh x9, x9, x17 102 adcs x14, x14, x8 103 mul x10, x10, x17 104 adc x10, x10, x9 105 adds x13, x13, x14 106 adc x14, x10, xzr 107 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 108 and x8, x13, #-4 109 extr x13, x14, x13, #2 110 adds x8, x8, x11 111 lsr x11, x14, #2 112 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 113 adds x8, x8, x13 114 adcs x9, x9, x12 115 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 116 117.Lpoly_hash_ad_ret: 118 ret 119.cfi_endproc 120.size .Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal 121 122///////////////////////////////// 123// 124// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); 125// 126.globl chacha20_poly1305_seal 127.hidden chacha20_poly1305_seal 128.type chacha20_poly1305_seal,%function 129.align 6 130chacha20_poly1305_seal: 131 AARCH64_SIGN_LINK_REGISTER 132.cfi_startproc 133 stp x29, x30, [sp, #-80]! 134.cfi_def_cfa_offset 80 135.cfi_offset w30, -72 136.cfi_offset w29, -80 137 mov x29, sp 138 // We probably could do .cfi_def_cfa w29, 80 at this point, but since 139 // we don't actually use the frame pointer like that, it's probably not 140 // worth bothering. 141 stp d8, d9, [sp, #16] 142 stp d10, d11, [sp, #32] 143 stp d12, d13, [sp, #48] 144 stp d14, d15, [sp, #64] 145.cfi_offset b15, -8 146.cfi_offset b14, -16 147.cfi_offset b13, -24 148.cfi_offset b12, -32 149.cfi_offset b11, -40 150.cfi_offset b10, -48 151.cfi_offset b9, -56 152.cfi_offset b8, -64 153 154 adrp x11, .Lchacha20_consts 155 add x11, x11, :lo12:.Lchacha20_consts 156 157 ld1 {v24.16b - v27.16b}, [x11] // .Load the CONSTS, INC, ROL8 and CLAMP values 158 ld1 {v28.16b - v30.16b}, [x5] 159 160 mov x15, #1 // Prepare the Poly1305 state 161 mov x8, #0 162 mov x9, #0 163 mov x10, #0 164 165 ldr x12, [x5, #56] // The total cipher text length includes extra_in_len 166 add x12, x12, x2 167 mov v31.d[0], x4 // Store the input and aad lengths 168 mov v31.d[1], x12 169 170 cmp x2, #128 171 b.le .Lseal_128 // Optimization for smaller buffers 172 173 // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, 174 // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, 175 // the fifth block (A4-D4) horizontally. 176 ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] 177 mov v4.16b, v24.16b 178 179 ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 180 mov v9.16b, v28.16b 181 182 ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 183 mov v14.16b, v29.16b 184 185 ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] 186 add v15.4s, v15.4s, v25.4s 187 mov v19.16b, v30.16b 188 189 sub x5, x5, #32 190 191 mov x6, #10 192 193.align 5 194.Lseal_init_rounds: 195 add v0.4s, v0.4s, v5.4s 196 add v1.4s, v1.4s, v6.4s 197 add v2.4s, v2.4s, v7.4s 198 add v3.4s, v3.4s, v8.4s 199 add v4.4s, v4.4s, v9.4s 200 201 eor v15.16b, v15.16b, v0.16b 202 eor v16.16b, v16.16b, v1.16b 203 eor v17.16b, v17.16b, v2.16b 204 eor v18.16b, v18.16b, v3.16b 205 eor v19.16b, v19.16b, v4.16b 206 207 rev32 v15.8h, v15.8h 208 rev32 v16.8h, v16.8h 209 rev32 v17.8h, v17.8h 210 rev32 v18.8h, v18.8h 211 rev32 v19.8h, v19.8h 212 213 add v10.4s, v10.4s, v15.4s 214 add v11.4s, v11.4s, v16.4s 215 add v12.4s, v12.4s, v17.4s 216 add v13.4s, v13.4s, v18.4s 217 add v14.4s, v14.4s, v19.4s 218 219 eor v5.16b, v5.16b, v10.16b 220 eor v6.16b, v6.16b, v11.16b 221 eor v7.16b, v7.16b, v12.16b 222 eor v8.16b, v8.16b, v13.16b 223 eor v9.16b, v9.16b, v14.16b 224 225 ushr v20.4s, v5.4s, #20 226 sli v20.4s, v5.4s, #12 227 ushr v5.4s, v6.4s, #20 228 sli v5.4s, v6.4s, #12 229 ushr v6.4s, v7.4s, #20 230 sli v6.4s, v7.4s, #12 231 ushr v7.4s, v8.4s, #20 232 sli v7.4s, v8.4s, #12 233 ushr v8.4s, v9.4s, #20 234 sli v8.4s, v9.4s, #12 235 236 add v0.4s, v0.4s, v20.4s 237 add v1.4s, v1.4s, v5.4s 238 add v2.4s, v2.4s, v6.4s 239 add v3.4s, v3.4s, v7.4s 240 add v4.4s, v4.4s, v8.4s 241 242 eor v15.16b, v15.16b, v0.16b 243 eor v16.16b, v16.16b, v1.16b 244 eor v17.16b, v17.16b, v2.16b 245 eor v18.16b, v18.16b, v3.16b 246 eor v19.16b, v19.16b, v4.16b 247 248 tbl v15.16b, {v15.16b}, v26.16b 249 tbl v16.16b, {v16.16b}, v26.16b 250 tbl v17.16b, {v17.16b}, v26.16b 251 tbl v18.16b, {v18.16b}, v26.16b 252 tbl v19.16b, {v19.16b}, v26.16b 253 254 add v10.4s, v10.4s, v15.4s 255 add v11.4s, v11.4s, v16.4s 256 add v12.4s, v12.4s, v17.4s 257 add v13.4s, v13.4s, v18.4s 258 add v14.4s, v14.4s, v19.4s 259 260 eor v20.16b, v20.16b, v10.16b 261 eor v5.16b, v5.16b, v11.16b 262 eor v6.16b, v6.16b, v12.16b 263 eor v7.16b, v7.16b, v13.16b 264 eor v8.16b, v8.16b, v14.16b 265 266 ushr v9.4s, v8.4s, #25 267 sli v9.4s, v8.4s, #7 268 ushr v8.4s, v7.4s, #25 269 sli v8.4s, v7.4s, #7 270 ushr v7.4s, v6.4s, #25 271 sli v7.4s, v6.4s, #7 272 ushr v6.4s, v5.4s, #25 273 sli v6.4s, v5.4s, #7 274 ushr v5.4s, v20.4s, #25 275 sli v5.4s, v20.4s, #7 276 277 ext v9.16b, v9.16b, v9.16b, #4 278 ext v14.16b, v14.16b, v14.16b, #8 279 ext v19.16b, v19.16b, v19.16b, #12 280 add v0.4s, v0.4s, v6.4s 281 add v1.4s, v1.4s, v7.4s 282 add v2.4s, v2.4s, v8.4s 283 add v3.4s, v3.4s, v5.4s 284 add v4.4s, v4.4s, v9.4s 285 286 eor v18.16b, v18.16b, v0.16b 287 eor v15.16b, v15.16b, v1.16b 288 eor v16.16b, v16.16b, v2.16b 289 eor v17.16b, v17.16b, v3.16b 290 eor v19.16b, v19.16b, v4.16b 291 292 rev32 v18.8h, v18.8h 293 rev32 v15.8h, v15.8h 294 rev32 v16.8h, v16.8h 295 rev32 v17.8h, v17.8h 296 rev32 v19.8h, v19.8h 297 298 add v12.4s, v12.4s, v18.4s 299 add v13.4s, v13.4s, v15.4s 300 add v10.4s, v10.4s, v16.4s 301 add v11.4s, v11.4s, v17.4s 302 add v14.4s, v14.4s, v19.4s 303 304 eor v6.16b, v6.16b, v12.16b 305 eor v7.16b, v7.16b, v13.16b 306 eor v8.16b, v8.16b, v10.16b 307 eor v5.16b, v5.16b, v11.16b 308 eor v9.16b, v9.16b, v14.16b 309 310 ushr v20.4s, v6.4s, #20 311 sli v20.4s, v6.4s, #12 312 ushr v6.4s, v7.4s, #20 313 sli v6.4s, v7.4s, #12 314 ushr v7.4s, v8.4s, #20 315 sli v7.4s, v8.4s, #12 316 ushr v8.4s, v5.4s, #20 317 sli v8.4s, v5.4s, #12 318 ushr v5.4s, v9.4s, #20 319 sli v5.4s, v9.4s, #12 320 321 add v0.4s, v0.4s, v20.4s 322 add v1.4s, v1.4s, v6.4s 323 add v2.4s, v2.4s, v7.4s 324 add v3.4s, v3.4s, v8.4s 325 add v4.4s, v4.4s, v5.4s 326 327 eor v18.16b, v18.16b, v0.16b 328 eor v15.16b, v15.16b, v1.16b 329 eor v16.16b, v16.16b, v2.16b 330 eor v17.16b, v17.16b, v3.16b 331 eor v19.16b, v19.16b, v4.16b 332 333 tbl v18.16b, {v18.16b}, v26.16b 334 tbl v15.16b, {v15.16b}, v26.16b 335 tbl v16.16b, {v16.16b}, v26.16b 336 tbl v17.16b, {v17.16b}, v26.16b 337 tbl v19.16b, {v19.16b}, v26.16b 338 339 add v12.4s, v12.4s, v18.4s 340 add v13.4s, v13.4s, v15.4s 341 add v10.4s, v10.4s, v16.4s 342 add v11.4s, v11.4s, v17.4s 343 add v14.4s, v14.4s, v19.4s 344 345 eor v20.16b, v20.16b, v12.16b 346 eor v6.16b, v6.16b, v13.16b 347 eor v7.16b, v7.16b, v10.16b 348 eor v8.16b, v8.16b, v11.16b 349 eor v5.16b, v5.16b, v14.16b 350 351 ushr v9.4s, v5.4s, #25 352 sli v9.4s, v5.4s, #7 353 ushr v5.4s, v8.4s, #25 354 sli v5.4s, v8.4s, #7 355 ushr v8.4s, v7.4s, #25 356 sli v8.4s, v7.4s, #7 357 ushr v7.4s, v6.4s, #25 358 sli v7.4s, v6.4s, #7 359 ushr v6.4s, v20.4s, #25 360 sli v6.4s, v20.4s, #7 361 362 ext v9.16b, v9.16b, v9.16b, #12 363 ext v14.16b, v14.16b, v14.16b, #8 364 ext v19.16b, v19.16b, v19.16b, #4 365 subs x6, x6, #1 366 b.hi .Lseal_init_rounds 367 368 add v15.4s, v15.4s, v25.4s 369 mov x11, #4 370 dup v20.4s, w11 371 add v25.4s, v25.4s, v20.4s 372 373 zip1 v20.4s, v0.4s, v1.4s 374 zip2 v21.4s, v0.4s, v1.4s 375 zip1 v22.4s, v2.4s, v3.4s 376 zip2 v23.4s, v2.4s, v3.4s 377 378 zip1 v0.2d, v20.2d, v22.2d 379 zip2 v1.2d, v20.2d, v22.2d 380 zip1 v2.2d, v21.2d, v23.2d 381 zip2 v3.2d, v21.2d, v23.2d 382 383 zip1 v20.4s, v5.4s, v6.4s 384 zip2 v21.4s, v5.4s, v6.4s 385 zip1 v22.4s, v7.4s, v8.4s 386 zip2 v23.4s, v7.4s, v8.4s 387 388 zip1 v5.2d, v20.2d, v22.2d 389 zip2 v6.2d, v20.2d, v22.2d 390 zip1 v7.2d, v21.2d, v23.2d 391 zip2 v8.2d, v21.2d, v23.2d 392 393 zip1 v20.4s, v10.4s, v11.4s 394 zip2 v21.4s, v10.4s, v11.4s 395 zip1 v22.4s, v12.4s, v13.4s 396 zip2 v23.4s, v12.4s, v13.4s 397 398 zip1 v10.2d, v20.2d, v22.2d 399 zip2 v11.2d, v20.2d, v22.2d 400 zip1 v12.2d, v21.2d, v23.2d 401 zip2 v13.2d, v21.2d, v23.2d 402 403 zip1 v20.4s, v15.4s, v16.4s 404 zip2 v21.4s, v15.4s, v16.4s 405 zip1 v22.4s, v17.4s, v18.4s 406 zip2 v23.4s, v17.4s, v18.4s 407 408 zip1 v15.2d, v20.2d, v22.2d 409 zip2 v16.2d, v20.2d, v22.2d 410 zip1 v17.2d, v21.2d, v23.2d 411 zip2 v18.2d, v21.2d, v23.2d 412 413 add v4.4s, v4.4s, v24.4s 414 add v9.4s, v9.4s, v28.4s 415 and v4.16b, v4.16b, v27.16b 416 417 add v0.4s, v0.4s, v24.4s 418 add v5.4s, v5.4s, v28.4s 419 add v10.4s, v10.4s, v29.4s 420 add v15.4s, v15.4s, v30.4s 421 422 add v1.4s, v1.4s, v24.4s 423 add v6.4s, v6.4s, v28.4s 424 add v11.4s, v11.4s, v29.4s 425 add v16.4s, v16.4s, v30.4s 426 427 add v2.4s, v2.4s, v24.4s 428 add v7.4s, v7.4s, v28.4s 429 add v12.4s, v12.4s, v29.4s 430 add v17.4s, v17.4s, v30.4s 431 432 add v3.4s, v3.4s, v24.4s 433 add v8.4s, v8.4s, v28.4s 434 add v13.4s, v13.4s, v29.4s 435 add v18.4s, v18.4s, v30.4s 436 437 mov x16, v4.d[0] // Move the R key to GPRs 438 mov x17, v4.d[1] 439 mov v27.16b, v9.16b // Store the S key 440 441 bl .Lpoly_hash_ad_internal 442 443 mov x3, x0 444 cmp x2, #256 445 b.le .Lseal_tail 446 447 ld1 {v20.16b - v23.16b}, [x1], #64 448 eor v20.16b, v20.16b, v0.16b 449 eor v21.16b, v21.16b, v5.16b 450 eor v22.16b, v22.16b, v10.16b 451 eor v23.16b, v23.16b, v15.16b 452 st1 {v20.16b - v23.16b}, [x0], #64 453 454 ld1 {v20.16b - v23.16b}, [x1], #64 455 eor v20.16b, v20.16b, v1.16b 456 eor v21.16b, v21.16b, v6.16b 457 eor v22.16b, v22.16b, v11.16b 458 eor v23.16b, v23.16b, v16.16b 459 st1 {v20.16b - v23.16b}, [x0], #64 460 461 ld1 {v20.16b - v23.16b}, [x1], #64 462 eor v20.16b, v20.16b, v2.16b 463 eor v21.16b, v21.16b, v7.16b 464 eor v22.16b, v22.16b, v12.16b 465 eor v23.16b, v23.16b, v17.16b 466 st1 {v20.16b - v23.16b}, [x0], #64 467 468 ld1 {v20.16b - v23.16b}, [x1], #64 469 eor v20.16b, v20.16b, v3.16b 470 eor v21.16b, v21.16b, v8.16b 471 eor v22.16b, v22.16b, v13.16b 472 eor v23.16b, v23.16b, v18.16b 473 st1 {v20.16b - v23.16b}, [x0], #64 474 475 sub x2, x2, #256 476 477 mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds 478 mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 479 480.Lseal_main_loop: 481 adrp x11, .Lchacha20_consts 482 add x11, x11, :lo12:.Lchacha20_consts 483 484 ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] 485 mov v4.16b, v24.16b 486 487 ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 488 mov v9.16b, v28.16b 489 490 ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 491 mov v14.16b, v29.16b 492 493 ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] 494 add v15.4s, v15.4s, v25.4s 495 mov v19.16b, v30.16b 496 497 eor v20.16b, v20.16b, v20.16b //zero 498 not v21.16b, v20.16b // -1 499 sub v21.4s, v25.4s, v21.4s // Add +1 500 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) 501 add v19.4s, v19.4s, v20.4s 502 503 sub x5, x5, #32 504.align 5 505.Lseal_main_loop_rounds: 506 add v0.4s, v0.4s, v5.4s 507 add v1.4s, v1.4s, v6.4s 508 add v2.4s, v2.4s, v7.4s 509 add v3.4s, v3.4s, v8.4s 510 add v4.4s, v4.4s, v9.4s 511 512 eor v15.16b, v15.16b, v0.16b 513 eor v16.16b, v16.16b, v1.16b 514 eor v17.16b, v17.16b, v2.16b 515 eor v18.16b, v18.16b, v3.16b 516 eor v19.16b, v19.16b, v4.16b 517 518 rev32 v15.8h, v15.8h 519 rev32 v16.8h, v16.8h 520 rev32 v17.8h, v17.8h 521 rev32 v18.8h, v18.8h 522 rev32 v19.8h, v19.8h 523 524 add v10.4s, v10.4s, v15.4s 525 add v11.4s, v11.4s, v16.4s 526 add v12.4s, v12.4s, v17.4s 527 add v13.4s, v13.4s, v18.4s 528 add v14.4s, v14.4s, v19.4s 529 530 eor v5.16b, v5.16b, v10.16b 531 eor v6.16b, v6.16b, v11.16b 532 eor v7.16b, v7.16b, v12.16b 533 eor v8.16b, v8.16b, v13.16b 534 eor v9.16b, v9.16b, v14.16b 535 536 ushr v20.4s, v5.4s, #20 537 sli v20.4s, v5.4s, #12 538 ushr v5.4s, v6.4s, #20 539 sli v5.4s, v6.4s, #12 540 ushr v6.4s, v7.4s, #20 541 sli v6.4s, v7.4s, #12 542 ushr v7.4s, v8.4s, #20 543 sli v7.4s, v8.4s, #12 544 ushr v8.4s, v9.4s, #20 545 sli v8.4s, v9.4s, #12 546 547 add v0.4s, v0.4s, v20.4s 548 add v1.4s, v1.4s, v5.4s 549 add v2.4s, v2.4s, v6.4s 550 add v3.4s, v3.4s, v7.4s 551 add v4.4s, v4.4s, v8.4s 552 553 eor v15.16b, v15.16b, v0.16b 554 eor v16.16b, v16.16b, v1.16b 555 eor v17.16b, v17.16b, v2.16b 556 eor v18.16b, v18.16b, v3.16b 557 eor v19.16b, v19.16b, v4.16b 558 559 tbl v15.16b, {v15.16b}, v26.16b 560 tbl v16.16b, {v16.16b}, v26.16b 561 tbl v17.16b, {v17.16b}, v26.16b 562 tbl v18.16b, {v18.16b}, v26.16b 563 tbl v19.16b, {v19.16b}, v26.16b 564 565 add v10.4s, v10.4s, v15.4s 566 add v11.4s, v11.4s, v16.4s 567 add v12.4s, v12.4s, v17.4s 568 add v13.4s, v13.4s, v18.4s 569 add v14.4s, v14.4s, v19.4s 570 571 eor v20.16b, v20.16b, v10.16b 572 eor v5.16b, v5.16b, v11.16b 573 eor v6.16b, v6.16b, v12.16b 574 eor v7.16b, v7.16b, v13.16b 575 eor v8.16b, v8.16b, v14.16b 576 577 ushr v9.4s, v8.4s, #25 578 sli v9.4s, v8.4s, #7 579 ushr v8.4s, v7.4s, #25 580 sli v8.4s, v7.4s, #7 581 ushr v7.4s, v6.4s, #25 582 sli v7.4s, v6.4s, #7 583 ushr v6.4s, v5.4s, #25 584 sli v6.4s, v5.4s, #7 585 ushr v5.4s, v20.4s, #25 586 sli v5.4s, v20.4s, #7 587 588 ext v9.16b, v9.16b, v9.16b, #4 589 ext v14.16b, v14.16b, v14.16b, #8 590 ext v19.16b, v19.16b, v19.16b, #12 591 ldp x11, x12, [x3], 16 592 adds x8, x8, x11 593 adcs x9, x9, x12 594 adc x10, x10, x15 595 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 596 umulh x12, x8, x16 597 mul x13, x9, x16 598 umulh x14, x9, x16 599 adds x12, x12, x13 600 mul x13, x10, x16 601 adc x13, x13, x14 602 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 603 umulh x8, x8, x17 604 adds x12, x12, x14 605 mul x14, x9, x17 606 umulh x9, x9, x17 607 adcs x14, x14, x8 608 mul x10, x10, x17 609 adc x10, x10, x9 610 adds x13, x13, x14 611 adc x14, x10, xzr 612 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 613 and x8, x13, #-4 614 extr x13, x14, x13, #2 615 adds x8, x8, x11 616 lsr x11, x14, #2 617 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 618 adds x8, x8, x13 619 adcs x9, x9, x12 620 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 621 add v0.4s, v0.4s, v6.4s 622 add v1.4s, v1.4s, v7.4s 623 add v2.4s, v2.4s, v8.4s 624 add v3.4s, v3.4s, v5.4s 625 add v4.4s, v4.4s, v9.4s 626 627 eor v18.16b, v18.16b, v0.16b 628 eor v15.16b, v15.16b, v1.16b 629 eor v16.16b, v16.16b, v2.16b 630 eor v17.16b, v17.16b, v3.16b 631 eor v19.16b, v19.16b, v4.16b 632 633 rev32 v18.8h, v18.8h 634 rev32 v15.8h, v15.8h 635 rev32 v16.8h, v16.8h 636 rev32 v17.8h, v17.8h 637 rev32 v19.8h, v19.8h 638 639 add v12.4s, v12.4s, v18.4s 640 add v13.4s, v13.4s, v15.4s 641 add v10.4s, v10.4s, v16.4s 642 add v11.4s, v11.4s, v17.4s 643 add v14.4s, v14.4s, v19.4s 644 645 eor v6.16b, v6.16b, v12.16b 646 eor v7.16b, v7.16b, v13.16b 647 eor v8.16b, v8.16b, v10.16b 648 eor v5.16b, v5.16b, v11.16b 649 eor v9.16b, v9.16b, v14.16b 650 651 ushr v20.4s, v6.4s, #20 652 sli v20.4s, v6.4s, #12 653 ushr v6.4s, v7.4s, #20 654 sli v6.4s, v7.4s, #12 655 ushr v7.4s, v8.4s, #20 656 sli v7.4s, v8.4s, #12 657 ushr v8.4s, v5.4s, #20 658 sli v8.4s, v5.4s, #12 659 ushr v5.4s, v9.4s, #20 660 sli v5.4s, v9.4s, #12 661 662 add v0.4s, v0.4s, v20.4s 663 add v1.4s, v1.4s, v6.4s 664 add v2.4s, v2.4s, v7.4s 665 add v3.4s, v3.4s, v8.4s 666 add v4.4s, v4.4s, v5.4s 667 668 eor v18.16b, v18.16b, v0.16b 669 eor v15.16b, v15.16b, v1.16b 670 eor v16.16b, v16.16b, v2.16b 671 eor v17.16b, v17.16b, v3.16b 672 eor v19.16b, v19.16b, v4.16b 673 674 tbl v18.16b, {v18.16b}, v26.16b 675 tbl v15.16b, {v15.16b}, v26.16b 676 tbl v16.16b, {v16.16b}, v26.16b 677 tbl v17.16b, {v17.16b}, v26.16b 678 tbl v19.16b, {v19.16b}, v26.16b 679 680 add v12.4s, v12.4s, v18.4s 681 add v13.4s, v13.4s, v15.4s 682 add v10.4s, v10.4s, v16.4s 683 add v11.4s, v11.4s, v17.4s 684 add v14.4s, v14.4s, v19.4s 685 686 eor v20.16b, v20.16b, v12.16b 687 eor v6.16b, v6.16b, v13.16b 688 eor v7.16b, v7.16b, v10.16b 689 eor v8.16b, v8.16b, v11.16b 690 eor v5.16b, v5.16b, v14.16b 691 692 ushr v9.4s, v5.4s, #25 693 sli v9.4s, v5.4s, #7 694 ushr v5.4s, v8.4s, #25 695 sli v5.4s, v8.4s, #7 696 ushr v8.4s, v7.4s, #25 697 sli v8.4s, v7.4s, #7 698 ushr v7.4s, v6.4s, #25 699 sli v7.4s, v6.4s, #7 700 ushr v6.4s, v20.4s, #25 701 sli v6.4s, v20.4s, #7 702 703 ext v9.16b, v9.16b, v9.16b, #12 704 ext v14.16b, v14.16b, v14.16b, #8 705 ext v19.16b, v19.16b, v19.16b, #4 706 subs x6, x6, #1 707 b.ge .Lseal_main_loop_rounds 708 ldp x11, x12, [x3], 16 709 adds x8, x8, x11 710 adcs x9, x9, x12 711 adc x10, x10, x15 712 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 713 umulh x12, x8, x16 714 mul x13, x9, x16 715 umulh x14, x9, x16 716 adds x12, x12, x13 717 mul x13, x10, x16 718 adc x13, x13, x14 719 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 720 umulh x8, x8, x17 721 adds x12, x12, x14 722 mul x14, x9, x17 723 umulh x9, x9, x17 724 adcs x14, x14, x8 725 mul x10, x10, x17 726 adc x10, x10, x9 727 adds x13, x13, x14 728 adc x14, x10, xzr 729 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 730 and x8, x13, #-4 731 extr x13, x14, x13, #2 732 adds x8, x8, x11 733 lsr x11, x14, #2 734 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 735 adds x8, x8, x13 736 adcs x9, x9, x12 737 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 738 subs x7, x7, #1 739 b.gt .Lseal_main_loop_rounds 740 741 eor v20.16b, v20.16b, v20.16b //zero 742 not v21.16b, v20.16b // -1 743 sub v21.4s, v25.4s, v21.4s // Add +1 744 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) 745 add v19.4s, v19.4s, v20.4s 746 747 add v15.4s, v15.4s, v25.4s 748 mov x11, #5 749 dup v20.4s, w11 750 add v25.4s, v25.4s, v20.4s 751 752 zip1 v20.4s, v0.4s, v1.4s 753 zip2 v21.4s, v0.4s, v1.4s 754 zip1 v22.4s, v2.4s, v3.4s 755 zip2 v23.4s, v2.4s, v3.4s 756 757 zip1 v0.2d, v20.2d, v22.2d 758 zip2 v1.2d, v20.2d, v22.2d 759 zip1 v2.2d, v21.2d, v23.2d 760 zip2 v3.2d, v21.2d, v23.2d 761 762 zip1 v20.4s, v5.4s, v6.4s 763 zip2 v21.4s, v5.4s, v6.4s 764 zip1 v22.4s, v7.4s, v8.4s 765 zip2 v23.4s, v7.4s, v8.4s 766 767 zip1 v5.2d, v20.2d, v22.2d 768 zip2 v6.2d, v20.2d, v22.2d 769 zip1 v7.2d, v21.2d, v23.2d 770 zip2 v8.2d, v21.2d, v23.2d 771 772 zip1 v20.4s, v10.4s, v11.4s 773 zip2 v21.4s, v10.4s, v11.4s 774 zip1 v22.4s, v12.4s, v13.4s 775 zip2 v23.4s, v12.4s, v13.4s 776 777 zip1 v10.2d, v20.2d, v22.2d 778 zip2 v11.2d, v20.2d, v22.2d 779 zip1 v12.2d, v21.2d, v23.2d 780 zip2 v13.2d, v21.2d, v23.2d 781 782 zip1 v20.4s, v15.4s, v16.4s 783 zip2 v21.4s, v15.4s, v16.4s 784 zip1 v22.4s, v17.4s, v18.4s 785 zip2 v23.4s, v17.4s, v18.4s 786 787 zip1 v15.2d, v20.2d, v22.2d 788 zip2 v16.2d, v20.2d, v22.2d 789 zip1 v17.2d, v21.2d, v23.2d 790 zip2 v18.2d, v21.2d, v23.2d 791 792 add v0.4s, v0.4s, v24.4s 793 add v5.4s, v5.4s, v28.4s 794 add v10.4s, v10.4s, v29.4s 795 add v15.4s, v15.4s, v30.4s 796 797 add v1.4s, v1.4s, v24.4s 798 add v6.4s, v6.4s, v28.4s 799 add v11.4s, v11.4s, v29.4s 800 add v16.4s, v16.4s, v30.4s 801 802 add v2.4s, v2.4s, v24.4s 803 add v7.4s, v7.4s, v28.4s 804 add v12.4s, v12.4s, v29.4s 805 add v17.4s, v17.4s, v30.4s 806 807 add v3.4s, v3.4s, v24.4s 808 add v8.4s, v8.4s, v28.4s 809 add v13.4s, v13.4s, v29.4s 810 add v18.4s, v18.4s, v30.4s 811 812 add v4.4s, v4.4s, v24.4s 813 add v9.4s, v9.4s, v28.4s 814 add v14.4s, v14.4s, v29.4s 815 add v19.4s, v19.4s, v30.4s 816 817 cmp x2, #320 818 b.le .Lseal_tail 819 820 ld1 {v20.16b - v23.16b}, [x1], #64 821 eor v20.16b, v20.16b, v0.16b 822 eor v21.16b, v21.16b, v5.16b 823 eor v22.16b, v22.16b, v10.16b 824 eor v23.16b, v23.16b, v15.16b 825 st1 {v20.16b - v23.16b}, [x0], #64 826 827 ld1 {v20.16b - v23.16b}, [x1], #64 828 eor v20.16b, v20.16b, v1.16b 829 eor v21.16b, v21.16b, v6.16b 830 eor v22.16b, v22.16b, v11.16b 831 eor v23.16b, v23.16b, v16.16b 832 st1 {v20.16b - v23.16b}, [x0], #64 833 834 ld1 {v20.16b - v23.16b}, [x1], #64 835 eor v20.16b, v20.16b, v2.16b 836 eor v21.16b, v21.16b, v7.16b 837 eor v22.16b, v22.16b, v12.16b 838 eor v23.16b, v23.16b, v17.16b 839 st1 {v20.16b - v23.16b}, [x0], #64 840 841 ld1 {v20.16b - v23.16b}, [x1], #64 842 eor v20.16b, v20.16b, v3.16b 843 eor v21.16b, v21.16b, v8.16b 844 eor v22.16b, v22.16b, v13.16b 845 eor v23.16b, v23.16b, v18.16b 846 st1 {v20.16b - v23.16b}, [x0], #64 847 848 ld1 {v20.16b - v23.16b}, [x1], #64 849 eor v20.16b, v20.16b, v4.16b 850 eor v21.16b, v21.16b, v9.16b 851 eor v22.16b, v22.16b, v14.16b 852 eor v23.16b, v23.16b, v19.16b 853 st1 {v20.16b - v23.16b}, [x0], #64 854 855 sub x2, x2, #320 856 857 mov x6, #0 858 mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration 859 860 b .Lseal_main_loop 861 862.Lseal_tail: 863 // This part of the function handles the storage and authentication of the last [0,320) bytes 864 // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 865 cmp x2, #64 866 b.lt .Lseal_tail_64 867 868 // Store and authenticate 64B blocks per iteration 869 ld1 {v20.16b - v23.16b}, [x1], #64 870 871 eor v20.16b, v20.16b, v0.16b 872 eor v21.16b, v21.16b, v5.16b 873 eor v22.16b, v22.16b, v10.16b 874 eor v23.16b, v23.16b, v15.16b 875 mov x11, v20.d[0] 876 mov x12, v20.d[1] 877 adds x8, x8, x11 878 adcs x9, x9, x12 879 adc x10, x10, x15 880 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 881 umulh x12, x8, x16 882 mul x13, x9, x16 883 umulh x14, x9, x16 884 adds x12, x12, x13 885 mul x13, x10, x16 886 adc x13, x13, x14 887 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 888 umulh x8, x8, x17 889 adds x12, x12, x14 890 mul x14, x9, x17 891 umulh x9, x9, x17 892 adcs x14, x14, x8 893 mul x10, x10, x17 894 adc x10, x10, x9 895 adds x13, x13, x14 896 adc x14, x10, xzr 897 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 898 and x8, x13, #-4 899 extr x13, x14, x13, #2 900 adds x8, x8, x11 901 lsr x11, x14, #2 902 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 903 adds x8, x8, x13 904 adcs x9, x9, x12 905 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 906 mov x11, v21.d[0] 907 mov x12, v21.d[1] 908 adds x8, x8, x11 909 adcs x9, x9, x12 910 adc x10, x10, x15 911 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 912 umulh x12, x8, x16 913 mul x13, x9, x16 914 umulh x14, x9, x16 915 adds x12, x12, x13 916 mul x13, x10, x16 917 adc x13, x13, x14 918 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 919 umulh x8, x8, x17 920 adds x12, x12, x14 921 mul x14, x9, x17 922 umulh x9, x9, x17 923 adcs x14, x14, x8 924 mul x10, x10, x17 925 adc x10, x10, x9 926 adds x13, x13, x14 927 adc x14, x10, xzr 928 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 929 and x8, x13, #-4 930 extr x13, x14, x13, #2 931 adds x8, x8, x11 932 lsr x11, x14, #2 933 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 934 adds x8, x8, x13 935 adcs x9, x9, x12 936 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 937 mov x11, v22.d[0] 938 mov x12, v22.d[1] 939 adds x8, x8, x11 940 adcs x9, x9, x12 941 adc x10, x10, x15 942 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 943 umulh x12, x8, x16 944 mul x13, x9, x16 945 umulh x14, x9, x16 946 adds x12, x12, x13 947 mul x13, x10, x16 948 adc x13, x13, x14 949 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 950 umulh x8, x8, x17 951 adds x12, x12, x14 952 mul x14, x9, x17 953 umulh x9, x9, x17 954 adcs x14, x14, x8 955 mul x10, x10, x17 956 adc x10, x10, x9 957 adds x13, x13, x14 958 adc x14, x10, xzr 959 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 960 and x8, x13, #-4 961 extr x13, x14, x13, #2 962 adds x8, x8, x11 963 lsr x11, x14, #2 964 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 965 adds x8, x8, x13 966 adcs x9, x9, x12 967 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 968 mov x11, v23.d[0] 969 mov x12, v23.d[1] 970 adds x8, x8, x11 971 adcs x9, x9, x12 972 adc x10, x10, x15 973 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 974 umulh x12, x8, x16 975 mul x13, x9, x16 976 umulh x14, x9, x16 977 adds x12, x12, x13 978 mul x13, x10, x16 979 adc x13, x13, x14 980 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 981 umulh x8, x8, x17 982 adds x12, x12, x14 983 mul x14, x9, x17 984 umulh x9, x9, x17 985 adcs x14, x14, x8 986 mul x10, x10, x17 987 adc x10, x10, x9 988 adds x13, x13, x14 989 adc x14, x10, xzr 990 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 991 and x8, x13, #-4 992 extr x13, x14, x13, #2 993 adds x8, x8, x11 994 lsr x11, x14, #2 995 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 996 adds x8, x8, x13 997 adcs x9, x9, x12 998 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 999 st1 {v20.16b - v23.16b}, [x0], #64 1000 sub x2, x2, #64 1001 1002 // Shift the state left by 64 bytes for the next iteration of the loop 1003 mov v0.16b, v1.16b 1004 mov v5.16b, v6.16b 1005 mov v10.16b, v11.16b 1006 mov v15.16b, v16.16b 1007 1008 mov v1.16b, v2.16b 1009 mov v6.16b, v7.16b 1010 mov v11.16b, v12.16b 1011 mov v16.16b, v17.16b 1012 1013 mov v2.16b, v3.16b 1014 mov v7.16b, v8.16b 1015 mov v12.16b, v13.16b 1016 mov v17.16b, v18.16b 1017 1018 mov v3.16b, v4.16b 1019 mov v8.16b, v9.16b 1020 mov v13.16b, v14.16b 1021 mov v18.16b, v19.16b 1022 1023 b .Lseal_tail 1024 1025.Lseal_tail_64: 1026 ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr 1027 1028 // Here we handle the last [0,64) bytes of plaintext 1029 cmp x2, #16 1030 b.lt .Lseal_tail_16 1031 // Each iteration encrypt and authenticate a 16B block 1032 ld1 {v20.16b}, [x1], #16 1033 eor v20.16b, v20.16b, v0.16b 1034 mov x11, v20.d[0] 1035 mov x12, v20.d[1] 1036 adds x8, x8, x11 1037 adcs x9, x9, x12 1038 adc x10, x10, x15 1039 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1040 umulh x12, x8, x16 1041 mul x13, x9, x16 1042 umulh x14, x9, x16 1043 adds x12, x12, x13 1044 mul x13, x10, x16 1045 adc x13, x13, x14 1046 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1047 umulh x8, x8, x17 1048 adds x12, x12, x14 1049 mul x14, x9, x17 1050 umulh x9, x9, x17 1051 adcs x14, x14, x8 1052 mul x10, x10, x17 1053 adc x10, x10, x9 1054 adds x13, x13, x14 1055 adc x14, x10, xzr 1056 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1057 and x8, x13, #-4 1058 extr x13, x14, x13, #2 1059 adds x8, x8, x11 1060 lsr x11, x14, #2 1061 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1062 adds x8, x8, x13 1063 adcs x9, x9, x12 1064 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1065 st1 {v20.16b}, [x0], #16 1066 1067 sub x2, x2, #16 1068 1069 // Shift the state left by 16 bytes for the next iteration of the loop 1070 mov v0.16b, v5.16b 1071 mov v5.16b, v10.16b 1072 mov v10.16b, v15.16b 1073 1074 b .Lseal_tail_64 1075 1076.Lseal_tail_16: 1077 // Here we handle the last [0,16) bytes of ciphertext that require a padded block 1078 cbz x2, .Lseal_hash_extra 1079 1080 eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in 1081 eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes 1082 not v22.16b, v20.16b 1083 1084 mov x6, x2 1085 add x1, x1, x2 1086 1087 cbz x4, .Lseal_tail_16_compose // No extra data to pad with, zero padding 1088 1089 mov x7, #16 // We need to load some extra_in first for padding 1090 sub x7, x7, x2 1091 cmp x4, x7 1092 csel x7, x4, x7, lt // .Load the minimum of extra_in_len and the amount needed to fill the register 1093 mov x12, x7 1094 add x3, x3, x7 1095 sub x4, x4, x7 1096 1097.Lseal_tail16_compose_extra_in: 1098 ext v20.16b, v20.16b, v20.16b, #15 1099 ldrb w11, [x3, #-1]! 1100 mov v20.b[0], w11 1101 subs x7, x7, #1 1102 b.gt .Lseal_tail16_compose_extra_in 1103 1104 add x3, x3, x12 1105 1106.Lseal_tail_16_compose: 1107 ext v20.16b, v20.16b, v20.16b, #15 1108 ldrb w11, [x1, #-1]! 1109 mov v20.b[0], w11 1110 ext v21.16b, v22.16b, v21.16b, #15 1111 subs x2, x2, #1 1112 b.gt .Lseal_tail_16_compose 1113 1114 and v0.16b, v0.16b, v21.16b 1115 eor v20.16b, v20.16b, v0.16b 1116 mov v21.16b, v20.16b 1117 1118.Lseal_tail_16_store: 1119 umov w11, v20.b[0] 1120 strb w11, [x0], #1 1121 ext v20.16b, v20.16b, v20.16b, #1 1122 subs x6, x6, #1 1123 b.gt .Lseal_tail_16_store 1124 1125 // Hash in the final ct block concatenated with extra_in 1126 mov x11, v21.d[0] 1127 mov x12, v21.d[1] 1128 adds x8, x8, x11 1129 adcs x9, x9, x12 1130 adc x10, x10, x15 1131 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1132 umulh x12, x8, x16 1133 mul x13, x9, x16 1134 umulh x14, x9, x16 1135 adds x12, x12, x13 1136 mul x13, x10, x16 1137 adc x13, x13, x14 1138 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1139 umulh x8, x8, x17 1140 adds x12, x12, x14 1141 mul x14, x9, x17 1142 umulh x9, x9, x17 1143 adcs x14, x14, x8 1144 mul x10, x10, x17 1145 adc x10, x10, x9 1146 adds x13, x13, x14 1147 adc x14, x10, xzr 1148 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1149 and x8, x13, #-4 1150 extr x13, x14, x13, #2 1151 adds x8, x8, x11 1152 lsr x11, x14, #2 1153 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1154 adds x8, x8, x13 1155 adcs x9, x9, x12 1156 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1157 1158.Lseal_hash_extra: 1159 cbz x4, .Lseal_finalize 1160 1161.Lseal_hash_extra_loop: 1162 cmp x4, #16 1163 b.lt .Lseal_hash_extra_tail 1164 ld1 {v20.16b}, [x3], #16 1165 mov x11, v20.d[0] 1166 mov x12, v20.d[1] 1167 adds x8, x8, x11 1168 adcs x9, x9, x12 1169 adc x10, x10, x15 1170 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1171 umulh x12, x8, x16 1172 mul x13, x9, x16 1173 umulh x14, x9, x16 1174 adds x12, x12, x13 1175 mul x13, x10, x16 1176 adc x13, x13, x14 1177 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1178 umulh x8, x8, x17 1179 adds x12, x12, x14 1180 mul x14, x9, x17 1181 umulh x9, x9, x17 1182 adcs x14, x14, x8 1183 mul x10, x10, x17 1184 adc x10, x10, x9 1185 adds x13, x13, x14 1186 adc x14, x10, xzr 1187 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1188 and x8, x13, #-4 1189 extr x13, x14, x13, #2 1190 adds x8, x8, x11 1191 lsr x11, x14, #2 1192 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1193 adds x8, x8, x13 1194 adcs x9, x9, x12 1195 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1196 sub x4, x4, #16 1197 b .Lseal_hash_extra_loop 1198 1199.Lseal_hash_extra_tail: 1200 cbz x4, .Lseal_finalize 1201 eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext 1202 add x3, x3, x4 1203 1204.Lseal_hash_extra_load: 1205 ext v20.16b, v20.16b, v20.16b, #15 1206 ldrb w11, [x3, #-1]! 1207 mov v20.b[0], w11 1208 subs x4, x4, #1 1209 b.gt .Lseal_hash_extra_load 1210 1211 // Hash in the final padded extra_in blcok 1212 mov x11, v20.d[0] 1213 mov x12, v20.d[1] 1214 adds x8, x8, x11 1215 adcs x9, x9, x12 1216 adc x10, x10, x15 1217 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1218 umulh x12, x8, x16 1219 mul x13, x9, x16 1220 umulh x14, x9, x16 1221 adds x12, x12, x13 1222 mul x13, x10, x16 1223 adc x13, x13, x14 1224 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1225 umulh x8, x8, x17 1226 adds x12, x12, x14 1227 mul x14, x9, x17 1228 umulh x9, x9, x17 1229 adcs x14, x14, x8 1230 mul x10, x10, x17 1231 adc x10, x10, x9 1232 adds x13, x13, x14 1233 adc x14, x10, xzr 1234 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1235 and x8, x13, #-4 1236 extr x13, x14, x13, #2 1237 adds x8, x8, x11 1238 lsr x11, x14, #2 1239 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1240 adds x8, x8, x13 1241 adcs x9, x9, x12 1242 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1243 1244.Lseal_finalize: 1245 mov x11, v31.d[0] 1246 mov x12, v31.d[1] 1247 adds x8, x8, x11 1248 adcs x9, x9, x12 1249 adc x10, x10, x15 1250 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1251 umulh x12, x8, x16 1252 mul x13, x9, x16 1253 umulh x14, x9, x16 1254 adds x12, x12, x13 1255 mul x13, x10, x16 1256 adc x13, x13, x14 1257 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1258 umulh x8, x8, x17 1259 adds x12, x12, x14 1260 mul x14, x9, x17 1261 umulh x9, x9, x17 1262 adcs x14, x14, x8 1263 mul x10, x10, x17 1264 adc x10, x10, x9 1265 adds x13, x13, x14 1266 adc x14, x10, xzr 1267 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1268 and x8, x13, #-4 1269 extr x13, x14, x13, #2 1270 adds x8, x8, x11 1271 lsr x11, x14, #2 1272 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1273 adds x8, x8, x13 1274 adcs x9, x9, x12 1275 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1276 // Final reduction step 1277 sub x12, xzr, x15 1278 orr x13, xzr, #3 1279 subs x11, x8, #-5 1280 sbcs x12, x9, x12 1281 sbcs x13, x10, x13 1282 csel x8, x11, x8, cs 1283 csel x9, x12, x9, cs 1284 csel x10, x13, x10, cs 1285 mov x11, v27.d[0] 1286 mov x12, v27.d[1] 1287 adds x8, x8, x11 1288 adcs x9, x9, x12 1289 adc x10, x10, x15 1290 1291 stp x8, x9, [x5] 1292 1293 ldp d8, d9, [sp, #16] 1294 ldp d10, d11, [sp, #32] 1295 ldp d12, d13, [sp, #48] 1296 ldp d14, d15, [sp, #64] 1297.cfi_restore b15 1298.cfi_restore b14 1299.cfi_restore b13 1300.cfi_restore b12 1301.cfi_restore b11 1302.cfi_restore b10 1303.cfi_restore b9 1304.cfi_restore b8 1305 ldp x29, x30, [sp], 80 1306.cfi_restore w29 1307.cfi_restore w30 1308.cfi_def_cfa_offset 0 1309 AARCH64_VALIDATE_LINK_REGISTER 1310 ret 1311 1312.Lseal_128: 1313 // On some architectures preparing 5 blocks for small buffers is wasteful 1314 eor v25.16b, v25.16b, v25.16b 1315 mov x11, #1 1316 mov v25.s[0], w11 1317 mov v0.16b, v24.16b 1318 mov v1.16b, v24.16b 1319 mov v2.16b, v24.16b 1320 mov v5.16b, v28.16b 1321 mov v6.16b, v28.16b 1322 mov v7.16b, v28.16b 1323 mov v10.16b, v29.16b 1324 mov v11.16b, v29.16b 1325 mov v12.16b, v29.16b 1326 mov v17.16b, v30.16b 1327 add v15.4s, v17.4s, v25.4s 1328 add v16.4s, v15.4s, v25.4s 1329 1330 mov x6, #10 1331 1332.Lseal_128_rounds: 1333 add v0.4s, v0.4s, v5.4s 1334 add v1.4s, v1.4s, v6.4s 1335 add v2.4s, v2.4s, v7.4s 1336 eor v15.16b, v15.16b, v0.16b 1337 eor v16.16b, v16.16b, v1.16b 1338 eor v17.16b, v17.16b, v2.16b 1339 rev32 v15.8h, v15.8h 1340 rev32 v16.8h, v16.8h 1341 rev32 v17.8h, v17.8h 1342 1343 add v10.4s, v10.4s, v15.4s 1344 add v11.4s, v11.4s, v16.4s 1345 add v12.4s, v12.4s, v17.4s 1346 eor v5.16b, v5.16b, v10.16b 1347 eor v6.16b, v6.16b, v11.16b 1348 eor v7.16b, v7.16b, v12.16b 1349 ushr v20.4s, v5.4s, #20 1350 sli v20.4s, v5.4s, #12 1351 ushr v5.4s, v6.4s, #20 1352 sli v5.4s, v6.4s, #12 1353 ushr v6.4s, v7.4s, #20 1354 sli v6.4s, v7.4s, #12 1355 1356 add v0.4s, v0.4s, v20.4s 1357 add v1.4s, v1.4s, v5.4s 1358 add v2.4s, v2.4s, v6.4s 1359 eor v15.16b, v15.16b, v0.16b 1360 eor v16.16b, v16.16b, v1.16b 1361 eor v17.16b, v17.16b, v2.16b 1362 tbl v15.16b, {v15.16b}, v26.16b 1363 tbl v16.16b, {v16.16b}, v26.16b 1364 tbl v17.16b, {v17.16b}, v26.16b 1365 1366 add v10.4s, v10.4s, v15.4s 1367 add v11.4s, v11.4s, v16.4s 1368 add v12.4s, v12.4s, v17.4s 1369 eor v20.16b, v20.16b, v10.16b 1370 eor v5.16b, v5.16b, v11.16b 1371 eor v6.16b, v6.16b, v12.16b 1372 ushr v7.4s, v6.4s, #25 1373 sli v7.4s, v6.4s, #7 1374 ushr v6.4s, v5.4s, #25 1375 sli v6.4s, v5.4s, #7 1376 ushr v5.4s, v20.4s, #25 1377 sli v5.4s, v20.4s, #7 1378 1379 ext v5.16b, v5.16b, v5.16b, #4 1380 ext v6.16b, v6.16b, v6.16b, #4 1381 ext v7.16b, v7.16b, v7.16b, #4 1382 1383 ext v10.16b, v10.16b, v10.16b, #8 1384 ext v11.16b, v11.16b, v11.16b, #8 1385 ext v12.16b, v12.16b, v12.16b, #8 1386 1387 ext v15.16b, v15.16b, v15.16b, #12 1388 ext v16.16b, v16.16b, v16.16b, #12 1389 ext v17.16b, v17.16b, v17.16b, #12 1390 add v0.4s, v0.4s, v5.4s 1391 add v1.4s, v1.4s, v6.4s 1392 add v2.4s, v2.4s, v7.4s 1393 eor v15.16b, v15.16b, v0.16b 1394 eor v16.16b, v16.16b, v1.16b 1395 eor v17.16b, v17.16b, v2.16b 1396 rev32 v15.8h, v15.8h 1397 rev32 v16.8h, v16.8h 1398 rev32 v17.8h, v17.8h 1399 1400 add v10.4s, v10.4s, v15.4s 1401 add v11.4s, v11.4s, v16.4s 1402 add v12.4s, v12.4s, v17.4s 1403 eor v5.16b, v5.16b, v10.16b 1404 eor v6.16b, v6.16b, v11.16b 1405 eor v7.16b, v7.16b, v12.16b 1406 ushr v20.4s, v5.4s, #20 1407 sli v20.4s, v5.4s, #12 1408 ushr v5.4s, v6.4s, #20 1409 sli v5.4s, v6.4s, #12 1410 ushr v6.4s, v7.4s, #20 1411 sli v6.4s, v7.4s, #12 1412 1413 add v0.4s, v0.4s, v20.4s 1414 add v1.4s, v1.4s, v5.4s 1415 add v2.4s, v2.4s, v6.4s 1416 eor v15.16b, v15.16b, v0.16b 1417 eor v16.16b, v16.16b, v1.16b 1418 eor v17.16b, v17.16b, v2.16b 1419 tbl v15.16b, {v15.16b}, v26.16b 1420 tbl v16.16b, {v16.16b}, v26.16b 1421 tbl v17.16b, {v17.16b}, v26.16b 1422 1423 add v10.4s, v10.4s, v15.4s 1424 add v11.4s, v11.4s, v16.4s 1425 add v12.4s, v12.4s, v17.4s 1426 eor v20.16b, v20.16b, v10.16b 1427 eor v5.16b, v5.16b, v11.16b 1428 eor v6.16b, v6.16b, v12.16b 1429 ushr v7.4s, v6.4s, #25 1430 sli v7.4s, v6.4s, #7 1431 ushr v6.4s, v5.4s, #25 1432 sli v6.4s, v5.4s, #7 1433 ushr v5.4s, v20.4s, #25 1434 sli v5.4s, v20.4s, #7 1435 1436 ext v5.16b, v5.16b, v5.16b, #12 1437 ext v6.16b, v6.16b, v6.16b, #12 1438 ext v7.16b, v7.16b, v7.16b, #12 1439 1440 ext v10.16b, v10.16b, v10.16b, #8 1441 ext v11.16b, v11.16b, v11.16b, #8 1442 ext v12.16b, v12.16b, v12.16b, #8 1443 1444 ext v15.16b, v15.16b, v15.16b, #4 1445 ext v16.16b, v16.16b, v16.16b, #4 1446 ext v17.16b, v17.16b, v17.16b, #4 1447 subs x6, x6, #1 1448 b.hi .Lseal_128_rounds 1449 1450 add v0.4s, v0.4s, v24.4s 1451 add v1.4s, v1.4s, v24.4s 1452 add v2.4s, v2.4s, v24.4s 1453 1454 add v5.4s, v5.4s, v28.4s 1455 add v6.4s, v6.4s, v28.4s 1456 add v7.4s, v7.4s, v28.4s 1457 1458 // Only the first 32 bytes of the third block (counter = 0) are needed, 1459 // so skip updating v12 and v17. 1460 add v10.4s, v10.4s, v29.4s 1461 add v11.4s, v11.4s, v29.4s 1462 1463 add v30.4s, v30.4s, v25.4s 1464 add v15.4s, v15.4s, v30.4s 1465 add v30.4s, v30.4s, v25.4s 1466 add v16.4s, v16.4s, v30.4s 1467 1468 and v2.16b, v2.16b, v27.16b 1469 mov x16, v2.d[0] // Move the R key to GPRs 1470 mov x17, v2.d[1] 1471 mov v27.16b, v7.16b // Store the S key 1472 1473 bl .Lpoly_hash_ad_internal 1474 b .Lseal_tail 1475.cfi_endproc 1476.size chacha20_poly1305_seal,.-chacha20_poly1305_seal 1477 1478///////////////////////////////// 1479// 1480// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); 1481// 1482.globl chacha20_poly1305_open 1483.hidden chacha20_poly1305_open 1484.type chacha20_poly1305_open,%function 1485.align 6 1486chacha20_poly1305_open: 1487 AARCH64_SIGN_LINK_REGISTER 1488.cfi_startproc 1489 stp x29, x30, [sp, #-80]! 1490.cfi_def_cfa_offset 80 1491.cfi_offset w30, -72 1492.cfi_offset w29, -80 1493 mov x29, sp 1494 // We probably could do .cfi_def_cfa w29, 80 at this point, but since 1495 // we don't actually use the frame pointer like that, it's probably not 1496 // worth bothering. 1497 stp d8, d9, [sp, #16] 1498 stp d10, d11, [sp, #32] 1499 stp d12, d13, [sp, #48] 1500 stp d14, d15, [sp, #64] 1501.cfi_offset b15, -8 1502.cfi_offset b14, -16 1503.cfi_offset b13, -24 1504.cfi_offset b12, -32 1505.cfi_offset b11, -40 1506.cfi_offset b10, -48 1507.cfi_offset b9, -56 1508.cfi_offset b8, -64 1509 1510 adrp x11, .Lchacha20_consts 1511 add x11, x11, :lo12:.Lchacha20_consts 1512 1513 ld1 {v24.16b - v27.16b}, [x11] // .Load the CONSTS, INC, ROL8 and CLAMP values 1514 ld1 {v28.16b - v30.16b}, [x5] 1515 1516 mov x15, #1 // Prepare the Poly1305 state 1517 mov x8, #0 1518 mov x9, #0 1519 mov x10, #0 1520 1521 mov v31.d[0], x4 // Store the input and aad lengths 1522 mov v31.d[1], x2 1523 1524 cmp x2, #128 1525 b.le .Lopen_128 // Optimization for smaller buffers 1526 1527 // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys 1528 mov v0.16b, v24.16b 1529 mov v5.16b, v28.16b 1530 mov v10.16b, v29.16b 1531 mov v15.16b, v30.16b 1532 1533 mov x6, #10 1534 1535.align 5 1536.Lopen_init_rounds: 1537 add v0.4s, v0.4s, v5.4s 1538 eor v15.16b, v15.16b, v0.16b 1539 rev32 v15.8h, v15.8h 1540 1541 add v10.4s, v10.4s, v15.4s 1542 eor v5.16b, v5.16b, v10.16b 1543 ushr v20.4s, v5.4s, #20 1544 sli v20.4s, v5.4s, #12 1545 add v0.4s, v0.4s, v20.4s 1546 eor v15.16b, v15.16b, v0.16b 1547 tbl v15.16b, {v15.16b}, v26.16b 1548 1549 add v10.4s, v10.4s, v15.4s 1550 eor v20.16b, v20.16b, v10.16b 1551 ushr v5.4s, v20.4s, #25 1552 sli v5.4s, v20.4s, #7 1553 ext v5.16b, v5.16b, v5.16b, #4 1554 ext v10.16b, v10.16b, v10.16b, #8 1555 ext v15.16b, v15.16b, v15.16b, #12 1556 add v0.4s, v0.4s, v5.4s 1557 eor v15.16b, v15.16b, v0.16b 1558 rev32 v15.8h, v15.8h 1559 1560 add v10.4s, v10.4s, v15.4s 1561 eor v5.16b, v5.16b, v10.16b 1562 ushr v20.4s, v5.4s, #20 1563 sli v20.4s, v5.4s, #12 1564 add v0.4s, v0.4s, v20.4s 1565 eor v15.16b, v15.16b, v0.16b 1566 tbl v15.16b, {v15.16b}, v26.16b 1567 1568 add v10.4s, v10.4s, v15.4s 1569 eor v20.16b, v20.16b, v10.16b 1570 ushr v5.4s, v20.4s, #25 1571 sli v5.4s, v20.4s, #7 1572 ext v5.16b, v5.16b, v5.16b, #12 1573 ext v10.16b, v10.16b, v10.16b, #8 1574 ext v15.16b, v15.16b, v15.16b, #4 1575 subs x6, x6, #1 1576 b.hi .Lopen_init_rounds 1577 1578 add v0.4s, v0.4s, v24.4s 1579 add v5.4s, v5.4s, v28.4s 1580 1581 and v0.16b, v0.16b, v27.16b 1582 mov x16, v0.d[0] // Move the R key to GPRs 1583 mov x17, v0.d[1] 1584 mov v27.16b, v5.16b // Store the S key 1585 1586 bl .Lpoly_hash_ad_internal 1587 1588.Lopen_ad_done: 1589 mov x3, x1 1590 1591// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes 1592.Lopen_main_loop: 1593 1594 cmp x2, #192 1595 b.lt .Lopen_tail 1596 1597 adrp x11, .Lchacha20_consts 1598 add x11, x11, :lo12:.Lchacha20_consts 1599 1600 ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] 1601 mov v4.16b, v24.16b 1602 1603 ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 1604 mov v9.16b, v28.16b 1605 1606 ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 1607 mov v14.16b, v29.16b 1608 1609 ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] 1610 sub x5, x5, #32 1611 add v15.4s, v15.4s, v25.4s 1612 mov v19.16b, v30.16b 1613 1614 eor v20.16b, v20.16b, v20.16b //zero 1615 not v21.16b, v20.16b // -1 1616 sub v21.4s, v25.4s, v21.4s // Add +1 1617 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) 1618 add v19.4s, v19.4s, v20.4s 1619 1620 lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 1621 sub x4, x4, #10 1622 1623 mov x7, #10 1624 subs x6, x7, x4 1625 subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash 1626 csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full 1627 1628 cbz x7, .Lopen_main_loop_rounds_short 1629 1630.align 5 1631.Lopen_main_loop_rounds: 1632 ldp x11, x12, [x3], 16 1633 adds x8, x8, x11 1634 adcs x9, x9, x12 1635 adc x10, x10, x15 1636 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1637 umulh x12, x8, x16 1638 mul x13, x9, x16 1639 umulh x14, x9, x16 1640 adds x12, x12, x13 1641 mul x13, x10, x16 1642 adc x13, x13, x14 1643 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1644 umulh x8, x8, x17 1645 adds x12, x12, x14 1646 mul x14, x9, x17 1647 umulh x9, x9, x17 1648 adcs x14, x14, x8 1649 mul x10, x10, x17 1650 adc x10, x10, x9 1651 adds x13, x13, x14 1652 adc x14, x10, xzr 1653 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1654 and x8, x13, #-4 1655 extr x13, x14, x13, #2 1656 adds x8, x8, x11 1657 lsr x11, x14, #2 1658 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1659 adds x8, x8, x13 1660 adcs x9, x9, x12 1661 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1662.Lopen_main_loop_rounds_short: 1663 add v0.4s, v0.4s, v5.4s 1664 add v1.4s, v1.4s, v6.4s 1665 add v2.4s, v2.4s, v7.4s 1666 add v3.4s, v3.4s, v8.4s 1667 add v4.4s, v4.4s, v9.4s 1668 1669 eor v15.16b, v15.16b, v0.16b 1670 eor v16.16b, v16.16b, v1.16b 1671 eor v17.16b, v17.16b, v2.16b 1672 eor v18.16b, v18.16b, v3.16b 1673 eor v19.16b, v19.16b, v4.16b 1674 1675 rev32 v15.8h, v15.8h 1676 rev32 v16.8h, v16.8h 1677 rev32 v17.8h, v17.8h 1678 rev32 v18.8h, v18.8h 1679 rev32 v19.8h, v19.8h 1680 1681 add v10.4s, v10.4s, v15.4s 1682 add v11.4s, v11.4s, v16.4s 1683 add v12.4s, v12.4s, v17.4s 1684 add v13.4s, v13.4s, v18.4s 1685 add v14.4s, v14.4s, v19.4s 1686 1687 eor v5.16b, v5.16b, v10.16b 1688 eor v6.16b, v6.16b, v11.16b 1689 eor v7.16b, v7.16b, v12.16b 1690 eor v8.16b, v8.16b, v13.16b 1691 eor v9.16b, v9.16b, v14.16b 1692 1693 ushr v20.4s, v5.4s, #20 1694 sli v20.4s, v5.4s, #12 1695 ushr v5.4s, v6.4s, #20 1696 sli v5.4s, v6.4s, #12 1697 ushr v6.4s, v7.4s, #20 1698 sli v6.4s, v7.4s, #12 1699 ushr v7.4s, v8.4s, #20 1700 sli v7.4s, v8.4s, #12 1701 ushr v8.4s, v9.4s, #20 1702 sli v8.4s, v9.4s, #12 1703 1704 add v0.4s, v0.4s, v20.4s 1705 add v1.4s, v1.4s, v5.4s 1706 add v2.4s, v2.4s, v6.4s 1707 add v3.4s, v3.4s, v7.4s 1708 add v4.4s, v4.4s, v8.4s 1709 1710 eor v15.16b, v15.16b, v0.16b 1711 eor v16.16b, v16.16b, v1.16b 1712 eor v17.16b, v17.16b, v2.16b 1713 eor v18.16b, v18.16b, v3.16b 1714 eor v19.16b, v19.16b, v4.16b 1715 1716 tbl v15.16b, {v15.16b}, v26.16b 1717 tbl v16.16b, {v16.16b}, v26.16b 1718 tbl v17.16b, {v17.16b}, v26.16b 1719 tbl v18.16b, {v18.16b}, v26.16b 1720 tbl v19.16b, {v19.16b}, v26.16b 1721 1722 add v10.4s, v10.4s, v15.4s 1723 add v11.4s, v11.4s, v16.4s 1724 add v12.4s, v12.4s, v17.4s 1725 add v13.4s, v13.4s, v18.4s 1726 add v14.4s, v14.4s, v19.4s 1727 1728 eor v20.16b, v20.16b, v10.16b 1729 eor v5.16b, v5.16b, v11.16b 1730 eor v6.16b, v6.16b, v12.16b 1731 eor v7.16b, v7.16b, v13.16b 1732 eor v8.16b, v8.16b, v14.16b 1733 1734 ushr v9.4s, v8.4s, #25 1735 sli v9.4s, v8.4s, #7 1736 ushr v8.4s, v7.4s, #25 1737 sli v8.4s, v7.4s, #7 1738 ushr v7.4s, v6.4s, #25 1739 sli v7.4s, v6.4s, #7 1740 ushr v6.4s, v5.4s, #25 1741 sli v6.4s, v5.4s, #7 1742 ushr v5.4s, v20.4s, #25 1743 sli v5.4s, v20.4s, #7 1744 1745 ext v9.16b, v9.16b, v9.16b, #4 1746 ext v14.16b, v14.16b, v14.16b, #8 1747 ext v19.16b, v19.16b, v19.16b, #12 1748 ldp x11, x12, [x3], 16 1749 adds x8, x8, x11 1750 adcs x9, x9, x12 1751 adc x10, x10, x15 1752 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 1753 umulh x12, x8, x16 1754 mul x13, x9, x16 1755 umulh x14, x9, x16 1756 adds x12, x12, x13 1757 mul x13, x10, x16 1758 adc x13, x13, x14 1759 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 1760 umulh x8, x8, x17 1761 adds x12, x12, x14 1762 mul x14, x9, x17 1763 umulh x9, x9, x17 1764 adcs x14, x14, x8 1765 mul x10, x10, x17 1766 adc x10, x10, x9 1767 adds x13, x13, x14 1768 adc x14, x10, xzr 1769 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 1770 and x8, x13, #-4 1771 extr x13, x14, x13, #2 1772 adds x8, x8, x11 1773 lsr x11, x14, #2 1774 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 1775 adds x8, x8, x13 1776 adcs x9, x9, x12 1777 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 1778 add v0.4s, v0.4s, v6.4s 1779 add v1.4s, v1.4s, v7.4s 1780 add v2.4s, v2.4s, v8.4s 1781 add v3.4s, v3.4s, v5.4s 1782 add v4.4s, v4.4s, v9.4s 1783 1784 eor v18.16b, v18.16b, v0.16b 1785 eor v15.16b, v15.16b, v1.16b 1786 eor v16.16b, v16.16b, v2.16b 1787 eor v17.16b, v17.16b, v3.16b 1788 eor v19.16b, v19.16b, v4.16b 1789 1790 rev32 v18.8h, v18.8h 1791 rev32 v15.8h, v15.8h 1792 rev32 v16.8h, v16.8h 1793 rev32 v17.8h, v17.8h 1794 rev32 v19.8h, v19.8h 1795 1796 add v12.4s, v12.4s, v18.4s 1797 add v13.4s, v13.4s, v15.4s 1798 add v10.4s, v10.4s, v16.4s 1799 add v11.4s, v11.4s, v17.4s 1800 add v14.4s, v14.4s, v19.4s 1801 1802 eor v6.16b, v6.16b, v12.16b 1803 eor v7.16b, v7.16b, v13.16b 1804 eor v8.16b, v8.16b, v10.16b 1805 eor v5.16b, v5.16b, v11.16b 1806 eor v9.16b, v9.16b, v14.16b 1807 1808 ushr v20.4s, v6.4s, #20 1809 sli v20.4s, v6.4s, #12 1810 ushr v6.4s, v7.4s, #20 1811 sli v6.4s, v7.4s, #12 1812 ushr v7.4s, v8.4s, #20 1813 sli v7.4s, v8.4s, #12 1814 ushr v8.4s, v5.4s, #20 1815 sli v8.4s, v5.4s, #12 1816 ushr v5.4s, v9.4s, #20 1817 sli v5.4s, v9.4s, #12 1818 1819 add v0.4s, v0.4s, v20.4s 1820 add v1.4s, v1.4s, v6.4s 1821 add v2.4s, v2.4s, v7.4s 1822 add v3.4s, v3.4s, v8.4s 1823 add v4.4s, v4.4s, v5.4s 1824 1825 eor v18.16b, v18.16b, v0.16b 1826 eor v15.16b, v15.16b, v1.16b 1827 eor v16.16b, v16.16b, v2.16b 1828 eor v17.16b, v17.16b, v3.16b 1829 eor v19.16b, v19.16b, v4.16b 1830 1831 tbl v18.16b, {v18.16b}, v26.16b 1832 tbl v15.16b, {v15.16b}, v26.16b 1833 tbl v16.16b, {v16.16b}, v26.16b 1834 tbl v17.16b, {v17.16b}, v26.16b 1835 tbl v19.16b, {v19.16b}, v26.16b 1836 1837 add v12.4s, v12.4s, v18.4s 1838 add v13.4s, v13.4s, v15.4s 1839 add v10.4s, v10.4s, v16.4s 1840 add v11.4s, v11.4s, v17.4s 1841 add v14.4s, v14.4s, v19.4s 1842 1843 eor v20.16b, v20.16b, v12.16b 1844 eor v6.16b, v6.16b, v13.16b 1845 eor v7.16b, v7.16b, v10.16b 1846 eor v8.16b, v8.16b, v11.16b 1847 eor v5.16b, v5.16b, v14.16b 1848 1849 ushr v9.4s, v5.4s, #25 1850 sli v9.4s, v5.4s, #7 1851 ushr v5.4s, v8.4s, #25 1852 sli v5.4s, v8.4s, #7 1853 ushr v8.4s, v7.4s, #25 1854 sli v8.4s, v7.4s, #7 1855 ushr v7.4s, v6.4s, #25 1856 sli v7.4s, v6.4s, #7 1857 ushr v6.4s, v20.4s, #25 1858 sli v6.4s, v20.4s, #7 1859 1860 ext v9.16b, v9.16b, v9.16b, #12 1861 ext v14.16b, v14.16b, v14.16b, #8 1862 ext v19.16b, v19.16b, v19.16b, #4 1863 subs x7, x7, #1 1864 b.gt .Lopen_main_loop_rounds 1865 subs x6, x6, #1 1866 b.ge .Lopen_main_loop_rounds_short 1867 1868 eor v20.16b, v20.16b, v20.16b //zero 1869 not v21.16b, v20.16b // -1 1870 sub v21.4s, v25.4s, v21.4s // Add +1 1871 ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) 1872 add v19.4s, v19.4s, v20.4s 1873 1874 add v15.4s, v15.4s, v25.4s 1875 mov x11, #5 1876 dup v20.4s, w11 1877 add v25.4s, v25.4s, v20.4s 1878 1879 zip1 v20.4s, v0.4s, v1.4s 1880 zip2 v21.4s, v0.4s, v1.4s 1881 zip1 v22.4s, v2.4s, v3.4s 1882 zip2 v23.4s, v2.4s, v3.4s 1883 1884 zip1 v0.2d, v20.2d, v22.2d 1885 zip2 v1.2d, v20.2d, v22.2d 1886 zip1 v2.2d, v21.2d, v23.2d 1887 zip2 v3.2d, v21.2d, v23.2d 1888 1889 zip1 v20.4s, v5.4s, v6.4s 1890 zip2 v21.4s, v5.4s, v6.4s 1891 zip1 v22.4s, v7.4s, v8.4s 1892 zip2 v23.4s, v7.4s, v8.4s 1893 1894 zip1 v5.2d, v20.2d, v22.2d 1895 zip2 v6.2d, v20.2d, v22.2d 1896 zip1 v7.2d, v21.2d, v23.2d 1897 zip2 v8.2d, v21.2d, v23.2d 1898 1899 zip1 v20.4s, v10.4s, v11.4s 1900 zip2 v21.4s, v10.4s, v11.4s 1901 zip1 v22.4s, v12.4s, v13.4s 1902 zip2 v23.4s, v12.4s, v13.4s 1903 1904 zip1 v10.2d, v20.2d, v22.2d 1905 zip2 v11.2d, v20.2d, v22.2d 1906 zip1 v12.2d, v21.2d, v23.2d 1907 zip2 v13.2d, v21.2d, v23.2d 1908 1909 zip1 v20.4s, v15.4s, v16.4s 1910 zip2 v21.4s, v15.4s, v16.4s 1911 zip1 v22.4s, v17.4s, v18.4s 1912 zip2 v23.4s, v17.4s, v18.4s 1913 1914 zip1 v15.2d, v20.2d, v22.2d 1915 zip2 v16.2d, v20.2d, v22.2d 1916 zip1 v17.2d, v21.2d, v23.2d 1917 zip2 v18.2d, v21.2d, v23.2d 1918 1919 add v0.4s, v0.4s, v24.4s 1920 add v5.4s, v5.4s, v28.4s 1921 add v10.4s, v10.4s, v29.4s 1922 add v15.4s, v15.4s, v30.4s 1923 1924 add v1.4s, v1.4s, v24.4s 1925 add v6.4s, v6.4s, v28.4s 1926 add v11.4s, v11.4s, v29.4s 1927 add v16.4s, v16.4s, v30.4s 1928 1929 add v2.4s, v2.4s, v24.4s 1930 add v7.4s, v7.4s, v28.4s 1931 add v12.4s, v12.4s, v29.4s 1932 add v17.4s, v17.4s, v30.4s 1933 1934 add v3.4s, v3.4s, v24.4s 1935 add v8.4s, v8.4s, v28.4s 1936 add v13.4s, v13.4s, v29.4s 1937 add v18.4s, v18.4s, v30.4s 1938 1939 add v4.4s, v4.4s, v24.4s 1940 add v9.4s, v9.4s, v28.4s 1941 add v14.4s, v14.4s, v29.4s 1942 add v19.4s, v19.4s, v30.4s 1943 1944 // We can always safely store 192 bytes 1945 ld1 {v20.16b - v23.16b}, [x1], #64 1946 eor v20.16b, v20.16b, v0.16b 1947 eor v21.16b, v21.16b, v5.16b 1948 eor v22.16b, v22.16b, v10.16b 1949 eor v23.16b, v23.16b, v15.16b 1950 st1 {v20.16b - v23.16b}, [x0], #64 1951 1952 ld1 {v20.16b - v23.16b}, [x1], #64 1953 eor v20.16b, v20.16b, v1.16b 1954 eor v21.16b, v21.16b, v6.16b 1955 eor v22.16b, v22.16b, v11.16b 1956 eor v23.16b, v23.16b, v16.16b 1957 st1 {v20.16b - v23.16b}, [x0], #64 1958 1959 ld1 {v20.16b - v23.16b}, [x1], #64 1960 eor v20.16b, v20.16b, v2.16b 1961 eor v21.16b, v21.16b, v7.16b 1962 eor v22.16b, v22.16b, v12.16b 1963 eor v23.16b, v23.16b, v17.16b 1964 st1 {v20.16b - v23.16b}, [x0], #64 1965 1966 sub x2, x2, #192 1967 1968 mov v0.16b, v3.16b 1969 mov v5.16b, v8.16b 1970 mov v10.16b, v13.16b 1971 mov v15.16b, v18.16b 1972 1973 cmp x2, #64 1974 b.lt .Lopen_tail_64_store 1975 1976 ld1 {v20.16b - v23.16b}, [x1], #64 1977 eor v20.16b, v20.16b, v3.16b 1978 eor v21.16b, v21.16b, v8.16b 1979 eor v22.16b, v22.16b, v13.16b 1980 eor v23.16b, v23.16b, v18.16b 1981 st1 {v20.16b - v23.16b}, [x0], #64 1982 1983 sub x2, x2, #64 1984 1985 mov v0.16b, v4.16b 1986 mov v5.16b, v9.16b 1987 mov v10.16b, v14.16b 1988 mov v15.16b, v19.16b 1989 1990 cmp x2, #64 1991 b.lt .Lopen_tail_64_store 1992 1993 ld1 {v20.16b - v23.16b}, [x1], #64 1994 eor v20.16b, v20.16b, v4.16b 1995 eor v21.16b, v21.16b, v9.16b 1996 eor v22.16b, v22.16b, v14.16b 1997 eor v23.16b, v23.16b, v19.16b 1998 st1 {v20.16b - v23.16b}, [x0], #64 1999 2000 sub x2, x2, #64 2001 b .Lopen_main_loop 2002 2003.Lopen_tail: 2004 2005 cbz x2, .Lopen_finalize 2006 2007 lsr x4, x2, #4 // How many whole blocks we have to hash 2008 2009 cmp x2, #64 2010 b.le .Lopen_tail_64 2011 cmp x2, #128 2012 b.le .Lopen_tail_128 2013 2014.Lopen_tail_192: 2015 // We need three more blocks 2016 mov v0.16b, v24.16b 2017 mov v1.16b, v24.16b 2018 mov v2.16b, v24.16b 2019 mov v5.16b, v28.16b 2020 mov v6.16b, v28.16b 2021 mov v7.16b, v28.16b 2022 mov v10.16b, v29.16b 2023 mov v11.16b, v29.16b 2024 mov v12.16b, v29.16b 2025 mov v15.16b, v30.16b 2026 mov v16.16b, v30.16b 2027 mov v17.16b, v30.16b 2028 eor v23.16b, v23.16b, v23.16b 2029 eor v21.16b, v21.16b, v21.16b 2030 ins v23.s[0], v25.s[0] 2031 ins v21.d[0], x15 2032 2033 add v22.4s, v23.4s, v21.4s 2034 add v21.4s, v22.4s, v21.4s 2035 2036 add v15.4s, v15.4s, v21.4s 2037 add v16.4s, v16.4s, v23.4s 2038 add v17.4s, v17.4s, v22.4s 2039 2040 mov x7, #10 2041 subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash 2042 csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing 2043 sub x4, x4, x7 2044 2045 cbz x7, .Lopen_tail_192_rounds_no_hash 2046 2047.Lopen_tail_192_rounds: 2048 ldp x11, x12, [x3], 16 2049 adds x8, x8, x11 2050 adcs x9, x9, x12 2051 adc x10, x10, x15 2052 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 2053 umulh x12, x8, x16 2054 mul x13, x9, x16 2055 umulh x14, x9, x16 2056 adds x12, x12, x13 2057 mul x13, x10, x16 2058 adc x13, x13, x14 2059 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 2060 umulh x8, x8, x17 2061 adds x12, x12, x14 2062 mul x14, x9, x17 2063 umulh x9, x9, x17 2064 adcs x14, x14, x8 2065 mul x10, x10, x17 2066 adc x10, x10, x9 2067 adds x13, x13, x14 2068 adc x14, x10, xzr 2069 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 2070 and x8, x13, #-4 2071 extr x13, x14, x13, #2 2072 adds x8, x8, x11 2073 lsr x11, x14, #2 2074 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 2075 adds x8, x8, x13 2076 adcs x9, x9, x12 2077 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 2078.Lopen_tail_192_rounds_no_hash: 2079 add v0.4s, v0.4s, v5.4s 2080 add v1.4s, v1.4s, v6.4s 2081 add v2.4s, v2.4s, v7.4s 2082 eor v15.16b, v15.16b, v0.16b 2083 eor v16.16b, v16.16b, v1.16b 2084 eor v17.16b, v17.16b, v2.16b 2085 rev32 v15.8h, v15.8h 2086 rev32 v16.8h, v16.8h 2087 rev32 v17.8h, v17.8h 2088 2089 add v10.4s, v10.4s, v15.4s 2090 add v11.4s, v11.4s, v16.4s 2091 add v12.4s, v12.4s, v17.4s 2092 eor v5.16b, v5.16b, v10.16b 2093 eor v6.16b, v6.16b, v11.16b 2094 eor v7.16b, v7.16b, v12.16b 2095 ushr v20.4s, v5.4s, #20 2096 sli v20.4s, v5.4s, #12 2097 ushr v5.4s, v6.4s, #20 2098 sli v5.4s, v6.4s, #12 2099 ushr v6.4s, v7.4s, #20 2100 sli v6.4s, v7.4s, #12 2101 2102 add v0.4s, v0.4s, v20.4s 2103 add v1.4s, v1.4s, v5.4s 2104 add v2.4s, v2.4s, v6.4s 2105 eor v15.16b, v15.16b, v0.16b 2106 eor v16.16b, v16.16b, v1.16b 2107 eor v17.16b, v17.16b, v2.16b 2108 tbl v15.16b, {v15.16b}, v26.16b 2109 tbl v16.16b, {v16.16b}, v26.16b 2110 tbl v17.16b, {v17.16b}, v26.16b 2111 2112 add v10.4s, v10.4s, v15.4s 2113 add v11.4s, v11.4s, v16.4s 2114 add v12.4s, v12.4s, v17.4s 2115 eor v20.16b, v20.16b, v10.16b 2116 eor v5.16b, v5.16b, v11.16b 2117 eor v6.16b, v6.16b, v12.16b 2118 ushr v7.4s, v6.4s, #25 2119 sli v7.4s, v6.4s, #7 2120 ushr v6.4s, v5.4s, #25 2121 sli v6.4s, v5.4s, #7 2122 ushr v5.4s, v20.4s, #25 2123 sli v5.4s, v20.4s, #7 2124 2125 ext v5.16b, v5.16b, v5.16b, #4 2126 ext v6.16b, v6.16b, v6.16b, #4 2127 ext v7.16b, v7.16b, v7.16b, #4 2128 2129 ext v10.16b, v10.16b, v10.16b, #8 2130 ext v11.16b, v11.16b, v11.16b, #8 2131 ext v12.16b, v12.16b, v12.16b, #8 2132 2133 ext v15.16b, v15.16b, v15.16b, #12 2134 ext v16.16b, v16.16b, v16.16b, #12 2135 ext v17.16b, v17.16b, v17.16b, #12 2136 add v0.4s, v0.4s, v5.4s 2137 add v1.4s, v1.4s, v6.4s 2138 add v2.4s, v2.4s, v7.4s 2139 eor v15.16b, v15.16b, v0.16b 2140 eor v16.16b, v16.16b, v1.16b 2141 eor v17.16b, v17.16b, v2.16b 2142 rev32 v15.8h, v15.8h 2143 rev32 v16.8h, v16.8h 2144 rev32 v17.8h, v17.8h 2145 2146 add v10.4s, v10.4s, v15.4s 2147 add v11.4s, v11.4s, v16.4s 2148 add v12.4s, v12.4s, v17.4s 2149 eor v5.16b, v5.16b, v10.16b 2150 eor v6.16b, v6.16b, v11.16b 2151 eor v7.16b, v7.16b, v12.16b 2152 ushr v20.4s, v5.4s, #20 2153 sli v20.4s, v5.4s, #12 2154 ushr v5.4s, v6.4s, #20 2155 sli v5.4s, v6.4s, #12 2156 ushr v6.4s, v7.4s, #20 2157 sli v6.4s, v7.4s, #12 2158 2159 add v0.4s, v0.4s, v20.4s 2160 add v1.4s, v1.4s, v5.4s 2161 add v2.4s, v2.4s, v6.4s 2162 eor v15.16b, v15.16b, v0.16b 2163 eor v16.16b, v16.16b, v1.16b 2164 eor v17.16b, v17.16b, v2.16b 2165 tbl v15.16b, {v15.16b}, v26.16b 2166 tbl v16.16b, {v16.16b}, v26.16b 2167 tbl v17.16b, {v17.16b}, v26.16b 2168 2169 add v10.4s, v10.4s, v15.4s 2170 add v11.4s, v11.4s, v16.4s 2171 add v12.4s, v12.4s, v17.4s 2172 eor v20.16b, v20.16b, v10.16b 2173 eor v5.16b, v5.16b, v11.16b 2174 eor v6.16b, v6.16b, v12.16b 2175 ushr v7.4s, v6.4s, #25 2176 sli v7.4s, v6.4s, #7 2177 ushr v6.4s, v5.4s, #25 2178 sli v6.4s, v5.4s, #7 2179 ushr v5.4s, v20.4s, #25 2180 sli v5.4s, v20.4s, #7 2181 2182 ext v5.16b, v5.16b, v5.16b, #12 2183 ext v6.16b, v6.16b, v6.16b, #12 2184 ext v7.16b, v7.16b, v7.16b, #12 2185 2186 ext v10.16b, v10.16b, v10.16b, #8 2187 ext v11.16b, v11.16b, v11.16b, #8 2188 ext v12.16b, v12.16b, v12.16b, #8 2189 2190 ext v15.16b, v15.16b, v15.16b, #4 2191 ext v16.16b, v16.16b, v16.16b, #4 2192 ext v17.16b, v17.16b, v17.16b, #4 2193 subs x7, x7, #1 2194 b.gt .Lopen_tail_192_rounds 2195 subs x6, x6, #1 2196 b.ge .Lopen_tail_192_rounds_no_hash 2197 2198 // We hashed 160 bytes at most, may still have 32 bytes left 2199.Lopen_tail_192_hash: 2200 cbz x4, .Lopen_tail_192_hash_done 2201 ldp x11, x12, [x3], 16 2202 adds x8, x8, x11 2203 adcs x9, x9, x12 2204 adc x10, x10, x15 2205 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 2206 umulh x12, x8, x16 2207 mul x13, x9, x16 2208 umulh x14, x9, x16 2209 adds x12, x12, x13 2210 mul x13, x10, x16 2211 adc x13, x13, x14 2212 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 2213 umulh x8, x8, x17 2214 adds x12, x12, x14 2215 mul x14, x9, x17 2216 umulh x9, x9, x17 2217 adcs x14, x14, x8 2218 mul x10, x10, x17 2219 adc x10, x10, x9 2220 adds x13, x13, x14 2221 adc x14, x10, xzr 2222 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 2223 and x8, x13, #-4 2224 extr x13, x14, x13, #2 2225 adds x8, x8, x11 2226 lsr x11, x14, #2 2227 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 2228 adds x8, x8, x13 2229 adcs x9, x9, x12 2230 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 2231 sub x4, x4, #1 2232 b .Lopen_tail_192_hash 2233 2234.Lopen_tail_192_hash_done: 2235 2236 add v0.4s, v0.4s, v24.4s 2237 add v1.4s, v1.4s, v24.4s 2238 add v2.4s, v2.4s, v24.4s 2239 add v5.4s, v5.4s, v28.4s 2240 add v6.4s, v6.4s, v28.4s 2241 add v7.4s, v7.4s, v28.4s 2242 add v10.4s, v10.4s, v29.4s 2243 add v11.4s, v11.4s, v29.4s 2244 add v12.4s, v12.4s, v29.4s 2245 add v15.4s, v15.4s, v30.4s 2246 add v16.4s, v16.4s, v30.4s 2247 add v17.4s, v17.4s, v30.4s 2248 2249 add v15.4s, v15.4s, v21.4s 2250 add v16.4s, v16.4s, v23.4s 2251 add v17.4s, v17.4s, v22.4s 2252 2253 ld1 {v20.16b - v23.16b}, [x1], #64 2254 2255 eor v20.16b, v20.16b, v1.16b 2256 eor v21.16b, v21.16b, v6.16b 2257 eor v22.16b, v22.16b, v11.16b 2258 eor v23.16b, v23.16b, v16.16b 2259 2260 st1 {v20.16b - v23.16b}, [x0], #64 2261 2262 ld1 {v20.16b - v23.16b}, [x1], #64 2263 2264 eor v20.16b, v20.16b, v2.16b 2265 eor v21.16b, v21.16b, v7.16b 2266 eor v22.16b, v22.16b, v12.16b 2267 eor v23.16b, v23.16b, v17.16b 2268 2269 st1 {v20.16b - v23.16b}, [x0], #64 2270 2271 sub x2, x2, #128 2272 b .Lopen_tail_64_store 2273 2274.Lopen_tail_128: 2275 // We need two more blocks 2276 mov v0.16b, v24.16b 2277 mov v1.16b, v24.16b 2278 mov v5.16b, v28.16b 2279 mov v6.16b, v28.16b 2280 mov v10.16b, v29.16b 2281 mov v11.16b, v29.16b 2282 mov v15.16b, v30.16b 2283 mov v16.16b, v30.16b 2284 eor v23.16b, v23.16b, v23.16b 2285 eor v22.16b, v22.16b, v22.16b 2286 ins v23.s[0], v25.s[0] 2287 ins v22.d[0], x15 2288 add v22.4s, v22.4s, v23.4s 2289 2290 add v15.4s, v15.4s, v22.4s 2291 add v16.4s, v16.4s, v23.4s 2292 2293 mov x6, #10 2294 sub x6, x6, x4 2295 2296.Lopen_tail_128_rounds: 2297 add v0.4s, v0.4s, v5.4s 2298 eor v15.16b, v15.16b, v0.16b 2299 rev32 v15.8h, v15.8h 2300 2301 add v10.4s, v10.4s, v15.4s 2302 eor v5.16b, v5.16b, v10.16b 2303 ushr v20.4s, v5.4s, #20 2304 sli v20.4s, v5.4s, #12 2305 add v0.4s, v0.4s, v20.4s 2306 eor v15.16b, v15.16b, v0.16b 2307 tbl v15.16b, {v15.16b}, v26.16b 2308 2309 add v10.4s, v10.4s, v15.4s 2310 eor v20.16b, v20.16b, v10.16b 2311 ushr v5.4s, v20.4s, #25 2312 sli v5.4s, v20.4s, #7 2313 ext v5.16b, v5.16b, v5.16b, #4 2314 ext v10.16b, v10.16b, v10.16b, #8 2315 ext v15.16b, v15.16b, v15.16b, #12 2316 add v1.4s, v1.4s, v6.4s 2317 eor v16.16b, v16.16b, v1.16b 2318 rev32 v16.8h, v16.8h 2319 2320 add v11.4s, v11.4s, v16.4s 2321 eor v6.16b, v6.16b, v11.16b 2322 ushr v20.4s, v6.4s, #20 2323 sli v20.4s, v6.4s, #12 2324 add v1.4s, v1.4s, v20.4s 2325 eor v16.16b, v16.16b, v1.16b 2326 tbl v16.16b, {v16.16b}, v26.16b 2327 2328 add v11.4s, v11.4s, v16.4s 2329 eor v20.16b, v20.16b, v11.16b 2330 ushr v6.4s, v20.4s, #25 2331 sli v6.4s, v20.4s, #7 2332 ext v6.16b, v6.16b, v6.16b, #4 2333 ext v11.16b, v11.16b, v11.16b, #8 2334 ext v16.16b, v16.16b, v16.16b, #12 2335 add v0.4s, v0.4s, v5.4s 2336 eor v15.16b, v15.16b, v0.16b 2337 rev32 v15.8h, v15.8h 2338 2339 add v10.4s, v10.4s, v15.4s 2340 eor v5.16b, v5.16b, v10.16b 2341 ushr v20.4s, v5.4s, #20 2342 sli v20.4s, v5.4s, #12 2343 add v0.4s, v0.4s, v20.4s 2344 eor v15.16b, v15.16b, v0.16b 2345 tbl v15.16b, {v15.16b}, v26.16b 2346 2347 add v10.4s, v10.4s, v15.4s 2348 eor v20.16b, v20.16b, v10.16b 2349 ushr v5.4s, v20.4s, #25 2350 sli v5.4s, v20.4s, #7 2351 ext v5.16b, v5.16b, v5.16b, #12 2352 ext v10.16b, v10.16b, v10.16b, #8 2353 ext v15.16b, v15.16b, v15.16b, #4 2354 add v1.4s, v1.4s, v6.4s 2355 eor v16.16b, v16.16b, v1.16b 2356 rev32 v16.8h, v16.8h 2357 2358 add v11.4s, v11.4s, v16.4s 2359 eor v6.16b, v6.16b, v11.16b 2360 ushr v20.4s, v6.4s, #20 2361 sli v20.4s, v6.4s, #12 2362 add v1.4s, v1.4s, v20.4s 2363 eor v16.16b, v16.16b, v1.16b 2364 tbl v16.16b, {v16.16b}, v26.16b 2365 2366 add v11.4s, v11.4s, v16.4s 2367 eor v20.16b, v20.16b, v11.16b 2368 ushr v6.4s, v20.4s, #25 2369 sli v6.4s, v20.4s, #7 2370 ext v6.16b, v6.16b, v6.16b, #12 2371 ext v11.16b, v11.16b, v11.16b, #8 2372 ext v16.16b, v16.16b, v16.16b, #4 2373 subs x6, x6, #1 2374 b.gt .Lopen_tail_128_rounds 2375 cbz x4, .Lopen_tail_128_rounds_done 2376 subs x4, x4, #1 2377 ldp x11, x12, [x3], 16 2378 adds x8, x8, x11 2379 adcs x9, x9, x12 2380 adc x10, x10, x15 2381 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 2382 umulh x12, x8, x16 2383 mul x13, x9, x16 2384 umulh x14, x9, x16 2385 adds x12, x12, x13 2386 mul x13, x10, x16 2387 adc x13, x13, x14 2388 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 2389 umulh x8, x8, x17 2390 adds x12, x12, x14 2391 mul x14, x9, x17 2392 umulh x9, x9, x17 2393 adcs x14, x14, x8 2394 mul x10, x10, x17 2395 adc x10, x10, x9 2396 adds x13, x13, x14 2397 adc x14, x10, xzr 2398 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 2399 and x8, x13, #-4 2400 extr x13, x14, x13, #2 2401 adds x8, x8, x11 2402 lsr x11, x14, #2 2403 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 2404 adds x8, x8, x13 2405 adcs x9, x9, x12 2406 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 2407 b .Lopen_tail_128_rounds 2408 2409.Lopen_tail_128_rounds_done: 2410 add v0.4s, v0.4s, v24.4s 2411 add v1.4s, v1.4s, v24.4s 2412 add v5.4s, v5.4s, v28.4s 2413 add v6.4s, v6.4s, v28.4s 2414 add v10.4s, v10.4s, v29.4s 2415 add v11.4s, v11.4s, v29.4s 2416 add v15.4s, v15.4s, v30.4s 2417 add v16.4s, v16.4s, v30.4s 2418 add v15.4s, v15.4s, v22.4s 2419 add v16.4s, v16.4s, v23.4s 2420 2421 ld1 {v20.16b - v23.16b}, [x1], #64 2422 2423 eor v20.16b, v20.16b, v1.16b 2424 eor v21.16b, v21.16b, v6.16b 2425 eor v22.16b, v22.16b, v11.16b 2426 eor v23.16b, v23.16b, v16.16b 2427 2428 st1 {v20.16b - v23.16b}, [x0], #64 2429 sub x2, x2, #64 2430 2431 b .Lopen_tail_64_store 2432 2433.Lopen_tail_64: 2434 // We just need a single block 2435 mov v0.16b, v24.16b 2436 mov v5.16b, v28.16b 2437 mov v10.16b, v29.16b 2438 mov v15.16b, v30.16b 2439 eor v23.16b, v23.16b, v23.16b 2440 ins v23.s[0], v25.s[0] 2441 add v15.4s, v15.4s, v23.4s 2442 2443 mov x6, #10 2444 sub x6, x6, x4 2445 2446.Lopen_tail_64_rounds: 2447 add v0.4s, v0.4s, v5.4s 2448 eor v15.16b, v15.16b, v0.16b 2449 rev32 v15.8h, v15.8h 2450 2451 add v10.4s, v10.4s, v15.4s 2452 eor v5.16b, v5.16b, v10.16b 2453 ushr v20.4s, v5.4s, #20 2454 sli v20.4s, v5.4s, #12 2455 add v0.4s, v0.4s, v20.4s 2456 eor v15.16b, v15.16b, v0.16b 2457 tbl v15.16b, {v15.16b}, v26.16b 2458 2459 add v10.4s, v10.4s, v15.4s 2460 eor v20.16b, v20.16b, v10.16b 2461 ushr v5.4s, v20.4s, #25 2462 sli v5.4s, v20.4s, #7 2463 ext v5.16b, v5.16b, v5.16b, #4 2464 ext v10.16b, v10.16b, v10.16b, #8 2465 ext v15.16b, v15.16b, v15.16b, #12 2466 add v0.4s, v0.4s, v5.4s 2467 eor v15.16b, v15.16b, v0.16b 2468 rev32 v15.8h, v15.8h 2469 2470 add v10.4s, v10.4s, v15.4s 2471 eor v5.16b, v5.16b, v10.16b 2472 ushr v20.4s, v5.4s, #20 2473 sli v20.4s, v5.4s, #12 2474 add v0.4s, v0.4s, v20.4s 2475 eor v15.16b, v15.16b, v0.16b 2476 tbl v15.16b, {v15.16b}, v26.16b 2477 2478 add v10.4s, v10.4s, v15.4s 2479 eor v20.16b, v20.16b, v10.16b 2480 ushr v5.4s, v20.4s, #25 2481 sli v5.4s, v20.4s, #7 2482 ext v5.16b, v5.16b, v5.16b, #12 2483 ext v10.16b, v10.16b, v10.16b, #8 2484 ext v15.16b, v15.16b, v15.16b, #4 2485 subs x6, x6, #1 2486 b.gt .Lopen_tail_64_rounds 2487 cbz x4, .Lopen_tail_64_rounds_done 2488 subs x4, x4, #1 2489 ldp x11, x12, [x3], 16 2490 adds x8, x8, x11 2491 adcs x9, x9, x12 2492 adc x10, x10, x15 2493 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 2494 umulh x12, x8, x16 2495 mul x13, x9, x16 2496 umulh x14, x9, x16 2497 adds x12, x12, x13 2498 mul x13, x10, x16 2499 adc x13, x13, x14 2500 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 2501 umulh x8, x8, x17 2502 adds x12, x12, x14 2503 mul x14, x9, x17 2504 umulh x9, x9, x17 2505 adcs x14, x14, x8 2506 mul x10, x10, x17 2507 adc x10, x10, x9 2508 adds x13, x13, x14 2509 adc x14, x10, xzr 2510 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 2511 and x8, x13, #-4 2512 extr x13, x14, x13, #2 2513 adds x8, x8, x11 2514 lsr x11, x14, #2 2515 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 2516 adds x8, x8, x13 2517 adcs x9, x9, x12 2518 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 2519 b .Lopen_tail_64_rounds 2520 2521.Lopen_tail_64_rounds_done: 2522 add v0.4s, v0.4s, v24.4s 2523 add v5.4s, v5.4s, v28.4s 2524 add v10.4s, v10.4s, v29.4s 2525 add v15.4s, v15.4s, v30.4s 2526 add v15.4s, v15.4s, v23.4s 2527 2528.Lopen_tail_64_store: 2529 cmp x2, #16 2530 b.lt .Lopen_tail_16 2531 2532 ld1 {v20.16b}, [x1], #16 2533 eor v20.16b, v20.16b, v0.16b 2534 st1 {v20.16b}, [x0], #16 2535 mov v0.16b, v5.16b 2536 mov v5.16b, v10.16b 2537 mov v10.16b, v15.16b 2538 sub x2, x2, #16 2539 b .Lopen_tail_64_store 2540 2541.Lopen_tail_16: 2542 // Here we handle the last [0,16) bytes that require a padded block 2543 cbz x2, .Lopen_finalize 2544 2545 eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext 2546 eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask 2547 not v22.16b, v20.16b 2548 2549 add x7, x1, x2 2550 mov x6, x2 2551 2552.Lopen_tail_16_compose: 2553 ext v20.16b, v20.16b, v20.16b, #15 2554 ldrb w11, [x7, #-1]! 2555 mov v20.b[0], w11 2556 ext v21.16b, v22.16b, v21.16b, #15 2557 subs x2, x2, #1 2558 b.gt .Lopen_tail_16_compose 2559 2560 and v20.16b, v20.16b, v21.16b 2561 // Hash in the final padded block 2562 mov x11, v20.d[0] 2563 mov x12, v20.d[1] 2564 adds x8, x8, x11 2565 adcs x9, x9, x12 2566 adc x10, x10, x15 2567 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 2568 umulh x12, x8, x16 2569 mul x13, x9, x16 2570 umulh x14, x9, x16 2571 adds x12, x12, x13 2572 mul x13, x10, x16 2573 adc x13, x13, x14 2574 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 2575 umulh x8, x8, x17 2576 adds x12, x12, x14 2577 mul x14, x9, x17 2578 umulh x9, x9, x17 2579 adcs x14, x14, x8 2580 mul x10, x10, x17 2581 adc x10, x10, x9 2582 adds x13, x13, x14 2583 adc x14, x10, xzr 2584 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 2585 and x8, x13, #-4 2586 extr x13, x14, x13, #2 2587 adds x8, x8, x11 2588 lsr x11, x14, #2 2589 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 2590 adds x8, x8, x13 2591 adcs x9, x9, x12 2592 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 2593 eor v20.16b, v20.16b, v0.16b 2594 2595.Lopen_tail_16_store: 2596 umov w11, v20.b[0] 2597 strb w11, [x0], #1 2598 ext v20.16b, v20.16b, v20.16b, #1 2599 subs x6, x6, #1 2600 b.gt .Lopen_tail_16_store 2601 2602.Lopen_finalize: 2603 mov x11, v31.d[0] 2604 mov x12, v31.d[1] 2605 adds x8, x8, x11 2606 adcs x9, x9, x12 2607 adc x10, x10, x15 2608 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 2609 umulh x12, x8, x16 2610 mul x13, x9, x16 2611 umulh x14, x9, x16 2612 adds x12, x12, x13 2613 mul x13, x10, x16 2614 adc x13, x13, x14 2615 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 2616 umulh x8, x8, x17 2617 adds x12, x12, x14 2618 mul x14, x9, x17 2619 umulh x9, x9, x17 2620 adcs x14, x14, x8 2621 mul x10, x10, x17 2622 adc x10, x10, x9 2623 adds x13, x13, x14 2624 adc x14, x10, xzr 2625 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 2626 and x8, x13, #-4 2627 extr x13, x14, x13, #2 2628 adds x8, x8, x11 2629 lsr x11, x14, #2 2630 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 2631 adds x8, x8, x13 2632 adcs x9, x9, x12 2633 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 2634 // Final reduction step 2635 sub x12, xzr, x15 2636 orr x13, xzr, #3 2637 subs x11, x8, #-5 2638 sbcs x12, x9, x12 2639 sbcs x13, x10, x13 2640 csel x8, x11, x8, cs 2641 csel x9, x12, x9, cs 2642 csel x10, x13, x10, cs 2643 mov x11, v27.d[0] 2644 mov x12, v27.d[1] 2645 adds x8, x8, x11 2646 adcs x9, x9, x12 2647 adc x10, x10, x15 2648 2649 stp x8, x9, [x5] 2650 2651 ldp d8, d9, [sp, #16] 2652 ldp d10, d11, [sp, #32] 2653 ldp d12, d13, [sp, #48] 2654 ldp d14, d15, [sp, #64] 2655.cfi_restore b15 2656.cfi_restore b14 2657.cfi_restore b13 2658.cfi_restore b12 2659.cfi_restore b11 2660.cfi_restore b10 2661.cfi_restore b9 2662.cfi_restore b8 2663 ldp x29, x30, [sp], 80 2664.cfi_restore w29 2665.cfi_restore w30 2666.cfi_def_cfa_offset 0 2667 AARCH64_VALIDATE_LINK_REGISTER 2668 ret 2669 2670.Lopen_128: 2671 // On some architectures preparing 5 blocks for small buffers is wasteful 2672 eor v25.16b, v25.16b, v25.16b 2673 mov x11, #1 2674 mov v25.s[0], w11 2675 mov v0.16b, v24.16b 2676 mov v1.16b, v24.16b 2677 mov v2.16b, v24.16b 2678 mov v5.16b, v28.16b 2679 mov v6.16b, v28.16b 2680 mov v7.16b, v28.16b 2681 mov v10.16b, v29.16b 2682 mov v11.16b, v29.16b 2683 mov v12.16b, v29.16b 2684 mov v17.16b, v30.16b 2685 add v15.4s, v17.4s, v25.4s 2686 add v16.4s, v15.4s, v25.4s 2687 2688 mov x6, #10 2689 2690.Lopen_128_rounds: 2691 add v0.4s, v0.4s, v5.4s 2692 add v1.4s, v1.4s, v6.4s 2693 add v2.4s, v2.4s, v7.4s 2694 eor v15.16b, v15.16b, v0.16b 2695 eor v16.16b, v16.16b, v1.16b 2696 eor v17.16b, v17.16b, v2.16b 2697 rev32 v15.8h, v15.8h 2698 rev32 v16.8h, v16.8h 2699 rev32 v17.8h, v17.8h 2700 2701 add v10.4s, v10.4s, v15.4s 2702 add v11.4s, v11.4s, v16.4s 2703 add v12.4s, v12.4s, v17.4s 2704 eor v5.16b, v5.16b, v10.16b 2705 eor v6.16b, v6.16b, v11.16b 2706 eor v7.16b, v7.16b, v12.16b 2707 ushr v20.4s, v5.4s, #20 2708 sli v20.4s, v5.4s, #12 2709 ushr v5.4s, v6.4s, #20 2710 sli v5.4s, v6.4s, #12 2711 ushr v6.4s, v7.4s, #20 2712 sli v6.4s, v7.4s, #12 2713 2714 add v0.4s, v0.4s, v20.4s 2715 add v1.4s, v1.4s, v5.4s 2716 add v2.4s, v2.4s, v6.4s 2717 eor v15.16b, v15.16b, v0.16b 2718 eor v16.16b, v16.16b, v1.16b 2719 eor v17.16b, v17.16b, v2.16b 2720 tbl v15.16b, {v15.16b}, v26.16b 2721 tbl v16.16b, {v16.16b}, v26.16b 2722 tbl v17.16b, {v17.16b}, v26.16b 2723 2724 add v10.4s, v10.4s, v15.4s 2725 add v11.4s, v11.4s, v16.4s 2726 add v12.4s, v12.4s, v17.4s 2727 eor v20.16b, v20.16b, v10.16b 2728 eor v5.16b, v5.16b, v11.16b 2729 eor v6.16b, v6.16b, v12.16b 2730 ushr v7.4s, v6.4s, #25 2731 sli v7.4s, v6.4s, #7 2732 ushr v6.4s, v5.4s, #25 2733 sli v6.4s, v5.4s, #7 2734 ushr v5.4s, v20.4s, #25 2735 sli v5.4s, v20.4s, #7 2736 2737 ext v5.16b, v5.16b, v5.16b, #4 2738 ext v6.16b, v6.16b, v6.16b, #4 2739 ext v7.16b, v7.16b, v7.16b, #4 2740 2741 ext v10.16b, v10.16b, v10.16b, #8 2742 ext v11.16b, v11.16b, v11.16b, #8 2743 ext v12.16b, v12.16b, v12.16b, #8 2744 2745 ext v15.16b, v15.16b, v15.16b, #12 2746 ext v16.16b, v16.16b, v16.16b, #12 2747 ext v17.16b, v17.16b, v17.16b, #12 2748 add v0.4s, v0.4s, v5.4s 2749 add v1.4s, v1.4s, v6.4s 2750 add v2.4s, v2.4s, v7.4s 2751 eor v15.16b, v15.16b, v0.16b 2752 eor v16.16b, v16.16b, v1.16b 2753 eor v17.16b, v17.16b, v2.16b 2754 rev32 v15.8h, v15.8h 2755 rev32 v16.8h, v16.8h 2756 rev32 v17.8h, v17.8h 2757 2758 add v10.4s, v10.4s, v15.4s 2759 add v11.4s, v11.4s, v16.4s 2760 add v12.4s, v12.4s, v17.4s 2761 eor v5.16b, v5.16b, v10.16b 2762 eor v6.16b, v6.16b, v11.16b 2763 eor v7.16b, v7.16b, v12.16b 2764 ushr v20.4s, v5.4s, #20 2765 sli v20.4s, v5.4s, #12 2766 ushr v5.4s, v6.4s, #20 2767 sli v5.4s, v6.4s, #12 2768 ushr v6.4s, v7.4s, #20 2769 sli v6.4s, v7.4s, #12 2770 2771 add v0.4s, v0.4s, v20.4s 2772 add v1.4s, v1.4s, v5.4s 2773 add v2.4s, v2.4s, v6.4s 2774 eor v15.16b, v15.16b, v0.16b 2775 eor v16.16b, v16.16b, v1.16b 2776 eor v17.16b, v17.16b, v2.16b 2777 tbl v15.16b, {v15.16b}, v26.16b 2778 tbl v16.16b, {v16.16b}, v26.16b 2779 tbl v17.16b, {v17.16b}, v26.16b 2780 2781 add v10.4s, v10.4s, v15.4s 2782 add v11.4s, v11.4s, v16.4s 2783 add v12.4s, v12.4s, v17.4s 2784 eor v20.16b, v20.16b, v10.16b 2785 eor v5.16b, v5.16b, v11.16b 2786 eor v6.16b, v6.16b, v12.16b 2787 ushr v7.4s, v6.4s, #25 2788 sli v7.4s, v6.4s, #7 2789 ushr v6.4s, v5.4s, #25 2790 sli v6.4s, v5.4s, #7 2791 ushr v5.4s, v20.4s, #25 2792 sli v5.4s, v20.4s, #7 2793 2794 ext v5.16b, v5.16b, v5.16b, #12 2795 ext v6.16b, v6.16b, v6.16b, #12 2796 ext v7.16b, v7.16b, v7.16b, #12 2797 2798 ext v10.16b, v10.16b, v10.16b, #8 2799 ext v11.16b, v11.16b, v11.16b, #8 2800 ext v12.16b, v12.16b, v12.16b, #8 2801 2802 ext v15.16b, v15.16b, v15.16b, #4 2803 ext v16.16b, v16.16b, v16.16b, #4 2804 ext v17.16b, v17.16b, v17.16b, #4 2805 subs x6, x6, #1 2806 b.hi .Lopen_128_rounds 2807 2808 add v0.4s, v0.4s, v24.4s 2809 add v1.4s, v1.4s, v24.4s 2810 add v2.4s, v2.4s, v24.4s 2811 2812 add v5.4s, v5.4s, v28.4s 2813 add v6.4s, v6.4s, v28.4s 2814 add v7.4s, v7.4s, v28.4s 2815 2816 add v10.4s, v10.4s, v29.4s 2817 add v11.4s, v11.4s, v29.4s 2818 2819 add v30.4s, v30.4s, v25.4s 2820 add v15.4s, v15.4s, v30.4s 2821 add v30.4s, v30.4s, v25.4s 2822 add v16.4s, v16.4s, v30.4s 2823 2824 and v2.16b, v2.16b, v27.16b 2825 mov x16, v2.d[0] // Move the R key to GPRs 2826 mov x17, v2.d[1] 2827 mov v27.16b, v7.16b // Store the S key 2828 2829 bl .Lpoly_hash_ad_internal 2830 2831.Lopen_128_store: 2832 cmp x2, #64 2833 b.lt .Lopen_128_store_64 2834 2835 ld1 {v20.16b - v23.16b}, [x1], #64 2836 2837 mov x11, v20.d[0] 2838 mov x12, v20.d[1] 2839 adds x8, x8, x11 2840 adcs x9, x9, x12 2841 adc x10, x10, x15 2842 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 2843 umulh x12, x8, x16 2844 mul x13, x9, x16 2845 umulh x14, x9, x16 2846 adds x12, x12, x13 2847 mul x13, x10, x16 2848 adc x13, x13, x14 2849 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 2850 umulh x8, x8, x17 2851 adds x12, x12, x14 2852 mul x14, x9, x17 2853 umulh x9, x9, x17 2854 adcs x14, x14, x8 2855 mul x10, x10, x17 2856 adc x10, x10, x9 2857 adds x13, x13, x14 2858 adc x14, x10, xzr 2859 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 2860 and x8, x13, #-4 2861 extr x13, x14, x13, #2 2862 adds x8, x8, x11 2863 lsr x11, x14, #2 2864 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 2865 adds x8, x8, x13 2866 adcs x9, x9, x12 2867 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 2868 mov x11, v21.d[0] 2869 mov x12, v21.d[1] 2870 adds x8, x8, x11 2871 adcs x9, x9, x12 2872 adc x10, x10, x15 2873 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 2874 umulh x12, x8, x16 2875 mul x13, x9, x16 2876 umulh x14, x9, x16 2877 adds x12, x12, x13 2878 mul x13, x10, x16 2879 adc x13, x13, x14 2880 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 2881 umulh x8, x8, x17 2882 adds x12, x12, x14 2883 mul x14, x9, x17 2884 umulh x9, x9, x17 2885 adcs x14, x14, x8 2886 mul x10, x10, x17 2887 adc x10, x10, x9 2888 adds x13, x13, x14 2889 adc x14, x10, xzr 2890 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 2891 and x8, x13, #-4 2892 extr x13, x14, x13, #2 2893 adds x8, x8, x11 2894 lsr x11, x14, #2 2895 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 2896 adds x8, x8, x13 2897 adcs x9, x9, x12 2898 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 2899 mov x11, v22.d[0] 2900 mov x12, v22.d[1] 2901 adds x8, x8, x11 2902 adcs x9, x9, x12 2903 adc x10, x10, x15 2904 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 2905 umulh x12, x8, x16 2906 mul x13, x9, x16 2907 umulh x14, x9, x16 2908 adds x12, x12, x13 2909 mul x13, x10, x16 2910 adc x13, x13, x14 2911 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 2912 umulh x8, x8, x17 2913 adds x12, x12, x14 2914 mul x14, x9, x17 2915 umulh x9, x9, x17 2916 adcs x14, x14, x8 2917 mul x10, x10, x17 2918 adc x10, x10, x9 2919 adds x13, x13, x14 2920 adc x14, x10, xzr 2921 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 2922 and x8, x13, #-4 2923 extr x13, x14, x13, #2 2924 adds x8, x8, x11 2925 lsr x11, x14, #2 2926 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 2927 adds x8, x8, x13 2928 adcs x9, x9, x12 2929 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 2930 mov x11, v23.d[0] 2931 mov x12, v23.d[1] 2932 adds x8, x8, x11 2933 adcs x9, x9, x12 2934 adc x10, x10, x15 2935 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 2936 umulh x12, x8, x16 2937 mul x13, x9, x16 2938 umulh x14, x9, x16 2939 adds x12, x12, x13 2940 mul x13, x10, x16 2941 adc x13, x13, x14 2942 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 2943 umulh x8, x8, x17 2944 adds x12, x12, x14 2945 mul x14, x9, x17 2946 umulh x9, x9, x17 2947 adcs x14, x14, x8 2948 mul x10, x10, x17 2949 adc x10, x10, x9 2950 adds x13, x13, x14 2951 adc x14, x10, xzr 2952 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 2953 and x8, x13, #-4 2954 extr x13, x14, x13, #2 2955 adds x8, x8, x11 2956 lsr x11, x14, #2 2957 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 2958 adds x8, x8, x13 2959 adcs x9, x9, x12 2960 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 2961 2962 eor v20.16b, v20.16b, v0.16b 2963 eor v21.16b, v21.16b, v5.16b 2964 eor v22.16b, v22.16b, v10.16b 2965 eor v23.16b, v23.16b, v15.16b 2966 2967 st1 {v20.16b - v23.16b}, [x0], #64 2968 2969 sub x2, x2, #64 2970 2971 mov v0.16b, v1.16b 2972 mov v5.16b, v6.16b 2973 mov v10.16b, v11.16b 2974 mov v15.16b, v16.16b 2975 2976.Lopen_128_store_64: 2977 2978 lsr x4, x2, #4 2979 mov x3, x1 2980 2981.Lopen_128_hash_64: 2982 cbz x4, .Lopen_tail_64_store 2983 ldp x11, x12, [x3], 16 2984 adds x8, x8, x11 2985 adcs x9, x9, x12 2986 adc x10, x10, x15 2987 mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 2988 umulh x12, x8, x16 2989 mul x13, x9, x16 2990 umulh x14, x9, x16 2991 adds x12, x12, x13 2992 mul x13, x10, x16 2993 adc x13, x13, x14 2994 mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 2995 umulh x8, x8, x17 2996 adds x12, x12, x14 2997 mul x14, x9, x17 2998 umulh x9, x9, x17 2999 adcs x14, x14, x8 3000 mul x10, x10, x17 3001 adc x10, x10, x9 3002 adds x13, x13, x14 3003 adc x14, x10, xzr 3004 and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) 3005 and x8, x13, #-4 3006 extr x13, x14, x13, #2 3007 adds x8, x8, x11 3008 lsr x11, x14, #2 3009 adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits 3010 adds x8, x8, x13 3011 adcs x9, x9, x12 3012 adc x10, x10, xzr // At this point acc2 has the value of 4 at most 3013 sub x4, x4, #1 3014 b .Lopen_128_hash_64 3015.cfi_endproc 3016.size chacha20_poly1305_open,.-chacha20_poly1305_open 3017#endif // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__) 3018#if defined(__ELF__) 3019// See https://www.airs.com/blog/archives/518. 3020.section .note.GNU-stack,"",%progbits 3021#endif 3022