// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>
.section .rodata

.align 7
.Lchacha20_consts:
.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
.Linc:
.long 1,2,3,4
.Lrol8:
.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
.Lclamp:
.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC

.text

.type .Lpoly_hash_ad_internal,%function
.align 6
.Lpoly_hash_ad_internal:
.cfi_startproc
    cbnz x4, .Lpoly_hash_intro
    ret

.Lpoly_hash_intro:
    cmp x4, #16
    b.lt .Lpoly_hash_ad_tail
    ldp x11, x12, [x3], 16
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    sub x4, x4, #16
    b .Lpoly_hash_ad_internal

.Lpoly_hash_ad_tail:
    cbz x4, .Lpoly_hash_ad_ret

    eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD
    sub x4, x4, #1

.Lpoly_hash_tail_16_compose:
    ext v20.16b, v20.16b, v20.16b, #15
    ldrb w11, [x3, x4]
    mov v20.b[0], w11
    subs x4, x4, #1
    b.ge .Lpoly_hash_tail_16_compose
    mov x11, v20.d[0]
    mov x12, v20.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most

.Lpoly_hash_ad_ret:
    ret
.cfi_endproc
.size .Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal

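// ---------------------------------------------------------------------------
// Editor's note (illustrative, not generated code): the mul/umulh ladder that
// repeats throughout this file is one Poly1305 step, h = (h + block) * r mod
// 2^130 - 5, carried in three 64-bit limbs [acc2:acc1:acc0] with the clamped
// key halves in x16 (r0) and x17 (r1). A minimal C sketch of the same
// multiply-and-partially-reduce step, assuming GCC/Clang's unsigned __int128
// (names h, r0, r1, t0..t3 are illustrative only):
//
//   #include <stdint.h>
//   typedef unsigned __int128 u128;
//
//   // h <- h * r mod 2^130 - 5, partially reduced (acc2 stays <= 4),
//   // mirroring the and/extr/lsr reduction used in the ladder.
//   static void poly1305_mul_reduce(uint64_t h[3], uint64_t r0, uint64_t r1) {
//       u128 d0 = (u128)h[0] * r0;
//       u128 d1 = (u128)h[1] * r0 + (u128)h[0] * r1 + (uint64_t)(d0 >> 64);
//       u128 d2 = (u128)h[2] * r0 + (u128)h[1] * r1 + (uint64_t)(d1 >> 64);
//       u128 d3 = (u128)h[2] * r1 + (uint64_t)(d2 >> 64);
//       uint64_t t0 = (uint64_t)d0, t1 = (uint64_t)d1;
//       uint64_t t2 = (uint64_t)d2, t3 = (uint64_t)d3;
//       // t mod 2^130-5 == (low 130 bits of t) + 5 * (t >> 130); 5*c is
//       // added as 4*c (the "and #-4" limbs) plus c (the extr/lsr limbs).
//       u128 s = (u128)t0 + (t2 & ~(uint64_t)3) + ((t2 >> 2) | (t3 << 62));
//       h[0] = (uint64_t)s;
//       s = (s >> 64) + t1 + t3 + (t3 >> 2);
//       h[1] = (uint64_t)s;
//       h[2] = (t2 & 3) + (uint64_t)(s >> 64);
//   }
//
// The adds/adcs/adc triple that precedes each ladder is the "h += block"
// step; x15 holds the constant 1 that pads every full 16-byte block.
// ---------------------------------------------------------------------------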
/////////////////////////////////
//
// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
//
.globl chacha20_poly1305_seal
.hidden chacha20_poly1305_seal
.type chacha20_poly1305_seal,%function
.align 6
chacha20_poly1305_seal:
    AARCH64_SIGN_LINK_REGISTER
.cfi_startproc
    stp x29, x30, [sp, #-80]!
.cfi_def_cfa_offset 80
.cfi_offset w30, -72
.cfi_offset w29, -80
    mov x29, sp
// We probably could do .cfi_def_cfa w29, 80 at this point, but since
// we don't actually use the frame pointer like that, it's probably not
// worth bothering.
    stp d8, d9, [sp, #16]
    stp d10, d11, [sp, #32]
    stp d12, d13, [sp, #48]
    stp d14, d15, [sp, #64]
.cfi_offset b15, -8
.cfi_offset b14, -16
.cfi_offset b13, -24
.cfi_offset b12, -32
.cfi_offset b11, -40
.cfi_offset b10, -48
.cfi_offset b9, -56
.cfi_offset b8, -64

    adrp x11, .Lchacha20_consts
    add x11, x11, :lo12:.Lchacha20_consts

    ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
    ld1 {v28.16b - v30.16b}, [x5]

    mov x15, #1 // Prepare the Poly1305 state
    mov x8, #0
    mov x9, #0
    mov x10, #0

    ldr x12, [x5, #56] // The total cipher text length includes extra_in_len
    add x12, x12, x2
    mov v31.d[0], x4 // Store the input and aad lengths
    mov v31.d[1], x12

    cmp x2, #128
    b.le .Lseal_128 // Optimization for smaller buffers

// Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
// and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
// the fifth block (A4-D4) horizontally.
    ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
    mov v4.16b, v24.16b

    ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
    mov v9.16b, v28.16b

    ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
    mov v14.16b, v29.16b

    ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
    add v15.4s, v15.4s, v25.4s
    mov v19.16b, v30.16b

    sub x5, x5, #32

    mov x6, #10

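// Editor's note (illustrative): each iteration of .Lseal_init_rounds below is
// one ChaCha20 double round over five blocks at once. v0-v3/v5-v8/v10-v13/
// v15-v18 hold four blocks transposed (one state word spread across four
// lanes), while v4/v9/v14/v19 carry the fifth block in the usual horizontal
// layout. For orientation, the scalar quarter round being vectorized is, in C:
//
//   #include <stdint.h>
//   static inline uint32_t rotl32(uint32_t x, int n) {
//       return (x << n) | (x >> (32 - n));
//   }
//   static void quarter_round(uint32_t *a, uint32_t *b,
//                             uint32_t *c, uint32_t *d) {
//       *a += *b; *d ^= *a; *d = rotl32(*d, 16); // rev32 .8h
//       *c += *d; *b ^= *c; *b = rotl32(*b, 12); // ushr #20 + sli #12
//       *a += *b; *d ^= *a; *d = rotl32(*d, 8);  // tbl with .Lrol8
//       *c += *d; *b ^= *c; *b = rotl32(*b, 7);  // ushr #25 + sli #7
//   }
//
// The ext instructions at the end of each half of the double round perform
// the diagonalization/undiagonalization that scalar code expresses as index
// rotation.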
.align 5
.Lseal_init_rounds:
    add v0.4s, v0.4s, v5.4s
    add v1.4s, v1.4s, v6.4s
    add v2.4s, v2.4s, v7.4s
    add v3.4s, v3.4s, v8.4s
    add v4.4s, v4.4s, v9.4s

    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    eor v18.16b, v18.16b, v3.16b
    eor v19.16b, v19.16b, v4.16b

    rev32 v15.8h, v15.8h
    rev32 v16.8h, v16.8h
    rev32 v17.8h, v17.8h
    rev32 v18.8h, v18.8h
    rev32 v19.8h, v19.8h

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    add v13.4s, v13.4s, v18.4s
    add v14.4s, v14.4s, v19.4s

    eor v5.16b, v5.16b, v10.16b
    eor v6.16b, v6.16b, v11.16b
    eor v7.16b, v7.16b, v12.16b
    eor v8.16b, v8.16b, v13.16b
    eor v9.16b, v9.16b, v14.16b

    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    ushr v5.4s, v6.4s, #20
    sli v5.4s, v6.4s, #12
    ushr v6.4s, v7.4s, #20
    sli v6.4s, v7.4s, #12
    ushr v7.4s, v8.4s, #20
    sli v7.4s, v8.4s, #12
    ushr v8.4s, v9.4s, #20
    sli v8.4s, v9.4s, #12

    add v0.4s, v0.4s, v20.4s
    add v1.4s, v1.4s, v5.4s
    add v2.4s, v2.4s, v6.4s
    add v3.4s, v3.4s, v7.4s
    add v4.4s, v4.4s, v8.4s

    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    eor v18.16b, v18.16b, v3.16b
    eor v19.16b, v19.16b, v4.16b

    tbl v15.16b, {v15.16b}, v26.16b
    tbl v16.16b, {v16.16b}, v26.16b
    tbl v17.16b, {v17.16b}, v26.16b
    tbl v18.16b, {v18.16b}, v26.16b
    tbl v19.16b, {v19.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    add v13.4s, v13.4s, v18.4s
    add v14.4s, v14.4s, v19.4s

    eor v20.16b, v20.16b, v10.16b
    eor v5.16b, v5.16b, v11.16b
    eor v6.16b, v6.16b, v12.16b
    eor v7.16b, v7.16b, v13.16b
    eor v8.16b, v8.16b, v14.16b

    ushr v9.4s, v8.4s, #25
    sli v9.4s, v8.4s, #7
    ushr v8.4s, v7.4s, #25
    sli v8.4s, v7.4s, #7
    ushr v7.4s, v6.4s, #25
    sli v7.4s, v6.4s, #7
    ushr v6.4s, v5.4s, #25
    sli v6.4s, v5.4s, #7
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7

    ext v9.16b, v9.16b, v9.16b, #4
    ext v14.16b, v14.16b, v14.16b, #8
    ext v19.16b, v19.16b, v19.16b, #12
    add v0.4s, v0.4s, v6.4s
    add v1.4s, v1.4s, v7.4s
    add v2.4s, v2.4s, v8.4s
    add v3.4s, v3.4s, v5.4s
    add v4.4s, v4.4s, v9.4s

    eor v18.16b, v18.16b, v0.16b
    eor v15.16b, v15.16b, v1.16b
    eor v16.16b, v16.16b, v2.16b
    eor v17.16b, v17.16b, v3.16b
    eor v19.16b, v19.16b, v4.16b

    rev32 v18.8h, v18.8h
    rev32 v15.8h, v15.8h
    rev32 v16.8h, v16.8h
    rev32 v17.8h, v17.8h
    rev32 v19.8h, v19.8h

    add v12.4s, v12.4s, v18.4s
    add v13.4s, v13.4s, v15.4s
    add v10.4s, v10.4s, v16.4s
    add v11.4s, v11.4s, v17.4s
    add v14.4s, v14.4s, v19.4s

    eor v6.16b, v6.16b, v12.16b
    eor v7.16b, v7.16b, v13.16b
    eor v8.16b, v8.16b, v10.16b
    eor v5.16b, v5.16b, v11.16b
    eor v9.16b, v9.16b, v14.16b

    ushr v20.4s, v6.4s, #20
    sli v20.4s, v6.4s, #12
    ushr v6.4s, v7.4s, #20
    sli v6.4s, v7.4s, #12
    ushr v7.4s, v8.4s, #20
    sli v7.4s, v8.4s, #12
    ushr v8.4s, v5.4s, #20
    sli v8.4s, v5.4s, #12
    ushr v5.4s, v9.4s, #20
    sli v5.4s, v9.4s, #12

    add v0.4s, v0.4s, v20.4s
    add v1.4s, v1.4s, v6.4s
    add v2.4s, v2.4s, v7.4s
    add v3.4s, v3.4s, v8.4s
    add v4.4s, v4.4s, v5.4s

    eor v18.16b, v18.16b, v0.16b
    eor v15.16b, v15.16b, v1.16b
    eor v16.16b, v16.16b, v2.16b
    eor v17.16b, v17.16b, v3.16b
    eor v19.16b, v19.16b, v4.16b

    tbl v18.16b, {v18.16b}, v26.16b
    tbl v15.16b, {v15.16b}, v26.16b
    tbl v16.16b, {v16.16b}, v26.16b
    tbl v17.16b, {v17.16b}, v26.16b
    tbl v19.16b, {v19.16b}, v26.16b

    add v12.4s, v12.4s, v18.4s
    add v13.4s, v13.4s, v15.4s
    add v10.4s, v10.4s, v16.4s
    add v11.4s, v11.4s, v17.4s
    add v14.4s, v14.4s, v19.4s

    eor v20.16b, v20.16b, v12.16b
    eor v6.16b, v6.16b, v13.16b
    eor v7.16b, v7.16b, v10.16b
    eor v8.16b, v8.16b, v11.16b
    eor v5.16b, v5.16b, v14.16b

    ushr v9.4s, v5.4s, #25
    sli v9.4s, v5.4s, #7
    ushr v5.4s, v8.4s, #25
    sli v5.4s, v8.4s, #7
    ushr v8.4s, v7.4s, #25
    sli v8.4s, v7.4s, #7
    ushr v7.4s, v6.4s, #25
    sli v7.4s, v6.4s, #7
    ushr v6.4s, v20.4s, #25
    sli v6.4s, v20.4s, #7

    ext v9.16b, v9.16b, v9.16b, #12
    ext v14.16b, v14.16b, v14.16b, #8
    ext v19.16b, v19.16b, v19.16b, #4
    subs x6, x6, #1
    b.hi .Lseal_init_rounds

    add v15.4s, v15.4s, v25.4s
    mov x11, #4
    dup v20.4s, w11
    add v25.4s, v25.4s, v20.4s

    zip1 v20.4s, v0.4s, v1.4s
    zip2 v21.4s, v0.4s, v1.4s
    zip1 v22.4s, v2.4s, v3.4s
    zip2 v23.4s, v2.4s, v3.4s

    zip1 v0.2d, v20.2d, v22.2d
    zip2 v1.2d, v20.2d, v22.2d
    zip1 v2.2d, v21.2d, v23.2d
    zip2 v3.2d, v21.2d, v23.2d

    zip1 v20.4s, v5.4s, v6.4s
    zip2 v21.4s, v5.4s, v6.4s
    zip1 v22.4s, v7.4s, v8.4s
    zip2 v23.4s, v7.4s, v8.4s

    zip1 v5.2d, v20.2d, v22.2d
    zip2 v6.2d, v20.2d, v22.2d
    zip1 v7.2d, v21.2d, v23.2d
    zip2 v8.2d, v21.2d, v23.2d

    zip1 v20.4s, v10.4s, v11.4s
    zip2 v21.4s, v10.4s, v11.4s
    zip1 v22.4s, v12.4s, v13.4s
    zip2 v23.4s, v12.4s, v13.4s

    zip1 v10.2d, v20.2d, v22.2d
    zip2 v11.2d, v20.2d, v22.2d
    zip1 v12.2d, v21.2d, v23.2d
    zip2 v13.2d, v21.2d, v23.2d

    zip1 v20.4s, v15.4s, v16.4s
    zip2 v21.4s, v15.4s, v16.4s
    zip1 v22.4s, v17.4s, v18.4s
    zip2 v23.4s, v17.4s, v18.4s

    zip1 v15.2d, v20.2d, v22.2d
    zip2 v16.2d, v20.2d, v22.2d
    zip1 v17.2d, v21.2d, v23.2d
    zip2 v18.2d, v21.2d, v23.2d

    add v4.4s, v4.4s, v24.4s
    add v9.4s, v9.4s, v28.4s
    and v4.16b, v4.16b, v27.16b

    add v0.4s, v0.4s, v24.4s
    add v5.4s, v5.4s, v28.4s
    add v10.4s, v10.4s, v29.4s
    add v15.4s, v15.4s, v30.4s

    add v1.4s, v1.4s, v24.4s
    add v6.4s, v6.4s, v28.4s
    add v11.4s, v11.4s, v29.4s
    add v16.4s, v16.4s, v30.4s

    add v2.4s, v2.4s, v24.4s
    add v7.4s, v7.4s, v28.4s
    add v12.4s, v12.4s, v29.4s
    add v17.4s, v17.4s, v30.4s

    add v3.4s, v3.4s, v24.4s
    add v8.4s, v8.4s, v28.4s
    add v13.4s, v13.4s, v29.4s
    add v18.4s, v18.4s, v30.4s

    mov x16, v4.d[0] // Move the R key to GPRs
    mov x17, v4.d[1]
    mov v27.16b, v9.16b // Store the S key

    bl .Lpoly_hash_ad_internal

    mov x3, x0
    cmp x2, #256
    b.le .Lseal_tail

    ld1 {v20.16b - v23.16b}, [x1], #64
    eor v20.16b, v20.16b, v0.16b
    eor v21.16b, v21.16b, v5.16b
    eor v22.16b, v22.16b, v10.16b
    eor v23.16b, v23.16b, v15.16b
    st1 {v20.16b - v23.16b}, [x0], #64

    ld1 {v20.16b - v23.16b}, [x1], #64
    eor v20.16b, v20.16b, v1.16b
    eor v21.16b, v21.16b, v6.16b
    eor v22.16b, v22.16b, v11.16b
    eor v23.16b, v23.16b, v16.16b
    st1 {v20.16b - v23.16b}, [x0], #64

    ld1 {v20.16b - v23.16b}, [x1], #64
    eor v20.16b, v20.16b, v2.16b
    eor v21.16b, v21.16b, v7.16b
    eor v22.16b, v22.16b, v12.16b
    eor v23.16b, v23.16b, v17.16b
    st1 {v20.16b - v23.16b}, [x0], #64

    ld1 {v20.16b - v23.16b}, [x1], #64
    eor v20.16b, v20.16b, v3.16b
    eor v21.16b, v21.16b, v8.16b
    eor v22.16b, v22.16b, v13.16b
    eor v23.16b, v23.16b, v18.16b
    st1 {v20.16b - v23.16b}, [x0], #64

    sub x2, x2, #256

    mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
    mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256

.Lseal_main_loop:
    adrp x11, .Lchacha20_consts
    add x11, x11, :lo12:.Lchacha20_consts

    ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
    mov v4.16b, v24.16b

    ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
    mov v9.16b, v28.16b

    ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
    mov v14.16b, v29.16b

    ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
    add v15.4s, v15.4s, v25.4s
    mov v19.16b, v30.16b

    eor v20.16b, v20.16b, v20.16b // zero
    not v21.16b, v20.16b // -1
    sub v21.4s, v25.4s, v21.4s // Add +1
    ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
    add v19.4s, v19.4s, v20.4s

    sub x5, x5, #32
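// Editor's note (illustrative): the eor/not/sub/ext group above builds the
// vector {v25[3] + 1, 0, 0, 0} without a temporary GPR: v21 = v25 - (-1) is
// v25 + 1, and the ext shifts its top lane into lane 0 with zeroes elsewhere.
// Added to v19, it advances the fifth block's counter past the four counters
// (v25) used by the vertical blocks. In C terms, roughly:
//
//   uint32_t bump = inc[3] + 1;  /* inc mirrors v25; names illustrative */
//   d4[0] += bump;               /* d4 mirrors v19 */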
.align 5
.Lseal_main_loop_rounds:
    add v0.4s, v0.4s, v5.4s
    add v1.4s, v1.4s, v6.4s
    add v2.4s, v2.4s, v7.4s
    add v3.4s, v3.4s, v8.4s
    add v4.4s, v4.4s, v9.4s

    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    eor v18.16b, v18.16b, v3.16b
    eor v19.16b, v19.16b, v4.16b

    rev32 v15.8h, v15.8h
    rev32 v16.8h, v16.8h
    rev32 v17.8h, v17.8h
    rev32 v18.8h, v18.8h
    rev32 v19.8h, v19.8h

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    add v13.4s, v13.4s, v18.4s
    add v14.4s, v14.4s, v19.4s

    eor v5.16b, v5.16b, v10.16b
    eor v6.16b, v6.16b, v11.16b
    eor v7.16b, v7.16b, v12.16b
    eor v8.16b, v8.16b, v13.16b
    eor v9.16b, v9.16b, v14.16b

    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    ushr v5.4s, v6.4s, #20
    sli v5.4s, v6.4s, #12
    ushr v6.4s, v7.4s, #20
    sli v6.4s, v7.4s, #12
    ushr v7.4s, v8.4s, #20
    sli v7.4s, v8.4s, #12
    ushr v8.4s, v9.4s, #20
    sli v8.4s, v9.4s, #12

    add v0.4s, v0.4s, v20.4s
    add v1.4s, v1.4s, v5.4s
    add v2.4s, v2.4s, v6.4s
    add v3.4s, v3.4s, v7.4s
    add v4.4s, v4.4s, v8.4s

    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    eor v18.16b, v18.16b, v3.16b
    eor v19.16b, v19.16b, v4.16b

    tbl v15.16b, {v15.16b}, v26.16b
    tbl v16.16b, {v16.16b}, v26.16b
    tbl v17.16b, {v17.16b}, v26.16b
    tbl v18.16b, {v18.16b}, v26.16b
    tbl v19.16b, {v19.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    add v13.4s, v13.4s, v18.4s
    add v14.4s, v14.4s, v19.4s

    eor v20.16b, v20.16b, v10.16b
    eor v5.16b, v5.16b, v11.16b
    eor v6.16b, v6.16b, v12.16b
    eor v7.16b, v7.16b, v13.16b
    eor v8.16b, v8.16b, v14.16b

    ushr v9.4s, v8.4s, #25
    sli v9.4s, v8.4s, #7
    ushr v8.4s, v7.4s, #25
    sli v8.4s, v7.4s, #7
    ushr v7.4s, v6.4s, #25
    sli v7.4s, v6.4s, #7
    ushr v6.4s, v5.4s, #25
    sli v6.4s, v5.4s, #7
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7

    ext v9.16b, v9.16b, v9.16b, #4
    ext v14.16b, v14.16b, v14.16b, #8
    ext v19.16b, v19.16b, v19.16b, #12
    ldp x11, x12, [x3], 16
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    add v0.4s, v0.4s, v6.4s
    add v1.4s, v1.4s, v7.4s
    add v2.4s, v2.4s, v8.4s
    add v3.4s, v3.4s, v5.4s
    add v4.4s, v4.4s, v9.4s

    eor v18.16b, v18.16b, v0.16b
    eor v15.16b, v15.16b, v1.16b
    eor v16.16b, v16.16b, v2.16b
    eor v17.16b, v17.16b, v3.16b
    eor v19.16b, v19.16b, v4.16b

    rev32 v18.8h, v18.8h
    rev32 v15.8h, v15.8h
    rev32 v16.8h, v16.8h
    rev32 v17.8h, v17.8h
    rev32 v19.8h, v19.8h

    add v12.4s, v12.4s, v18.4s
    add v13.4s, v13.4s, v15.4s
    add v10.4s, v10.4s, v16.4s
    add v11.4s, v11.4s, v17.4s
    add v14.4s, v14.4s, v19.4s

    eor v6.16b, v6.16b, v12.16b
    eor v7.16b, v7.16b, v13.16b
    eor v8.16b, v8.16b, v10.16b
    eor v5.16b, v5.16b, v11.16b
    eor v9.16b, v9.16b, v14.16b

    ushr v20.4s, v6.4s, #20
    sli v20.4s, v6.4s, #12
    ushr v6.4s, v7.4s, #20
    sli v6.4s, v7.4s, #12
    ushr v7.4s, v8.4s, #20
    sli v7.4s, v8.4s, #12
    ushr v8.4s, v5.4s, #20
    sli v8.4s, v5.4s, #12
    ushr v5.4s, v9.4s, #20
    sli v5.4s, v9.4s, #12

    add v0.4s, v0.4s, v20.4s
    add v1.4s, v1.4s, v6.4s
    add v2.4s, v2.4s, v7.4s
    add v3.4s, v3.4s, v8.4s
    add v4.4s, v4.4s, v5.4s

    eor v18.16b, v18.16b, v0.16b
    eor v15.16b, v15.16b, v1.16b
    eor v16.16b, v16.16b, v2.16b
    eor v17.16b, v17.16b, v3.16b
    eor v19.16b, v19.16b, v4.16b

    tbl v18.16b, {v18.16b}, v26.16b
    tbl v15.16b, {v15.16b}, v26.16b
    tbl v16.16b, {v16.16b}, v26.16b
    tbl v17.16b, {v17.16b}, v26.16b
    tbl v19.16b, {v19.16b}, v26.16b

    add v12.4s, v12.4s, v18.4s
    add v13.4s, v13.4s, v15.4s
    add v10.4s, v10.4s, v16.4s
    add v11.4s, v11.4s, v17.4s
    add v14.4s, v14.4s, v19.4s

    eor v20.16b, v20.16b, v12.16b
    eor v6.16b, v6.16b, v13.16b
    eor v7.16b, v7.16b, v10.16b
    eor v8.16b, v8.16b, v11.16b
    eor v5.16b, v5.16b, v14.16b

    ushr v9.4s, v5.4s, #25
    sli v9.4s, v5.4s, #7
    ushr v5.4s, v8.4s, #25
    sli v5.4s, v8.4s, #7
    ushr v8.4s, v7.4s, #25
    sli v8.4s, v7.4s, #7
    ushr v7.4s, v6.4s, #25
    sli v7.4s, v6.4s, #7
    ushr v6.4s, v20.4s, #25
    sli v6.4s, v20.4s, #7

    ext v9.16b, v9.16b, v9.16b, #12
    ext v14.16b, v14.16b, v14.16b, #8
    ext v19.16b, v19.16b, v19.16b, #4
    subs x6, x6, #1
    b.ge .Lseal_main_loop_rounds
    ldp x11, x12, [x3], 16
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    subs x7, x7, #1
    b.gt .Lseal_main_loop_rounds

    eor v20.16b, v20.16b, v20.16b // zero
    not v21.16b, v20.16b // -1
    sub v21.4s, v25.4s, v21.4s // Add +1
    ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
    add v19.4s, v19.4s, v20.4s

    add v15.4s, v15.4s, v25.4s
    mov x11, #5
    dup v20.4s, w11
    add v25.4s, v25.4s, v20.4s

    zip1 v20.4s, v0.4s, v1.4s
    zip2 v21.4s, v0.4s, v1.4s
    zip1 v22.4s, v2.4s, v3.4s
    zip2 v23.4s, v2.4s, v3.4s

    zip1 v0.2d, v20.2d, v22.2d
    zip2 v1.2d, v20.2d, v22.2d
    zip1 v2.2d, v21.2d, v23.2d
    zip2 v3.2d, v21.2d, v23.2d

    zip1 v20.4s, v5.4s, v6.4s
    zip2 v21.4s, v5.4s, v6.4s
    zip1 v22.4s, v7.4s, v8.4s
    zip2 v23.4s, v7.4s, v8.4s

    zip1 v5.2d, v20.2d, v22.2d
    zip2 v6.2d, v20.2d, v22.2d
    zip1 v7.2d, v21.2d, v23.2d
    zip2 v8.2d, v21.2d, v23.2d

    zip1 v20.4s, v10.4s, v11.4s
    zip2 v21.4s, v10.4s, v11.4s
    zip1 v22.4s, v12.4s, v13.4s
    zip2 v23.4s, v12.4s, v13.4s

    zip1 v10.2d, v20.2d, v22.2d
    zip2 v11.2d, v20.2d, v22.2d
    zip1 v12.2d, v21.2d, v23.2d
    zip2 v13.2d, v21.2d, v23.2d

    zip1 v20.4s, v15.4s, v16.4s
    zip2 v21.4s, v15.4s, v16.4s
    zip1 v22.4s, v17.4s, v18.4s
    zip2 v23.4s, v17.4s, v18.4s

    zip1 v15.2d, v20.2d, v22.2d
    zip2 v16.2d, v20.2d, v22.2d
    zip1 v17.2d, v21.2d, v23.2d
    zip2 v18.2d, v21.2d, v23.2d

    add v0.4s, v0.4s, v24.4s
    add v5.4s, v5.4s, v28.4s
    add v10.4s, v10.4s, v29.4s
    add v15.4s, v15.4s, v30.4s

    add v1.4s, v1.4s, v24.4s
    add v6.4s, v6.4s, v28.4s
    add v11.4s, v11.4s, v29.4s
    add v16.4s, v16.4s, v30.4s

    add v2.4s, v2.4s, v24.4s
    add v7.4s, v7.4s, v28.4s
    add v12.4s, v12.4s, v29.4s
    add v17.4s, v17.4s, v30.4s

    add v3.4s, v3.4s, v24.4s
    add v8.4s, v8.4s, v28.4s
    add v13.4s, v13.4s, v29.4s
    add v18.4s, v18.4s, v30.4s

    add v4.4s, v4.4s, v24.4s
    add v9.4s, v9.4s, v28.4s
    add v14.4s, v14.4s, v29.4s
    add v19.4s, v19.4s, v30.4s

    cmp x2, #320
    b.le .Lseal_tail

    ld1 {v20.16b - v23.16b}, [x1], #64
    eor v20.16b, v20.16b, v0.16b
    eor v21.16b, v21.16b, v5.16b
    eor v22.16b, v22.16b, v10.16b
    eor v23.16b, v23.16b, v15.16b
    st1 {v20.16b - v23.16b}, [x0], #64

    ld1 {v20.16b - v23.16b}, [x1], #64
    eor v20.16b, v20.16b, v1.16b
    eor v21.16b, v21.16b, v6.16b
    eor v22.16b, v22.16b, v11.16b
    eor v23.16b, v23.16b, v16.16b
    st1 {v20.16b - v23.16b}, [x0], #64

    ld1 {v20.16b - v23.16b}, [x1], #64
    eor v20.16b, v20.16b, v2.16b
    eor v21.16b, v21.16b, v7.16b
    eor v22.16b, v22.16b, v12.16b
    eor v23.16b, v23.16b, v17.16b
    st1 {v20.16b - v23.16b}, [x0], #64

    ld1 {v20.16b - v23.16b}, [x1], #64
    eor v20.16b, v20.16b, v3.16b
    eor v21.16b, v21.16b, v8.16b
    eor v22.16b, v22.16b, v13.16b
    eor v23.16b, v23.16b, v18.16b
    st1 {v20.16b - v23.16b}, [x0], #64

    ld1 {v20.16b - v23.16b}, [x1], #64
    eor v20.16b, v20.16b, v4.16b
    eor v21.16b, v21.16b, v9.16b
    eor v22.16b, v22.16b, v14.16b
    eor v23.16b, v23.16b, v19.16b
    st1 {v20.16b - v23.16b}, [x0], #64

    sub x2, x2, #320

    mov x6, #0
    mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration

    b .Lseal_main_loop

.Lseal_tail:
// This part of the function handles the storage and authentication of the last [0,320) bytes
// We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
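// Editor's note (illustrative): the tail logic below is, in C-like pseudocode
// (extra_in handling elided):
//
//   while (len >= 64) { ct = pt ^ ks64(); poly1305_update(ct, 64); len -= 64; }
//   while (len >= 16) { ct = pt ^ ks16(); poly1305_update(ct, 16); len -= 16; }
//   if (len > 0) {
//       // compose a padded block, mask the keystream so that only the
//       // plaintext bytes are encrypted, then hash the padded block
//   }
//
// ks64()/ks16() stand for the next unused bytes of the precomputed keystream
// held in v0-v19 (hypothetical helpers, named here only for the sketch).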
    cmp x2, #64
    b.lt .Lseal_tail_64

// Store and authenticate 64B blocks per iteration
    ld1 {v20.16b - v23.16b}, [x1], #64

    eor v20.16b, v20.16b, v0.16b
    eor v21.16b, v21.16b, v5.16b
    eor v22.16b, v22.16b, v10.16b
    eor v23.16b, v23.16b, v15.16b
    mov x11, v20.d[0]
    mov x12, v20.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    mov x11, v21.d[0]
    mov x12, v21.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    mov x11, v22.d[0]
    mov x12, v22.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    mov x11, v23.d[0]
    mov x12, v23.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    st1 {v20.16b - v23.16b}, [x0], #64
    sub x2, x2, #64

// Shift the state left by 64 bytes for the next iteration of the loop
    mov v0.16b, v1.16b
    mov v5.16b, v6.16b
    mov v10.16b, v11.16b
    mov v15.16b, v16.16b

    mov v1.16b, v2.16b
    mov v6.16b, v7.16b
    mov v11.16b, v12.16b
    mov v16.16b, v17.16b

    mov v2.16b, v3.16b
    mov v7.16b, v8.16b
    mov v12.16b, v13.16b
    mov v17.16b, v18.16b

    mov v3.16b, v4.16b
    mov v8.16b, v9.16b
    mov v13.16b, v14.16b
    mov v18.16b, v19.16b

    b .Lseal_tail

.Lseal_tail_64:
    ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr

// Here we handle the last [0,64) bytes of plaintext
    cmp x2, #16
    b.lt .Lseal_tail_16
// Each iteration encrypts and authenticates a 16B block
    ld1 {v20.16b}, [x1], #16
    eor v20.16b, v20.16b, v0.16b
    mov x11, v20.d[0]
    mov x12, v20.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    st1 {v20.16b}, [x0], #16

    sub x2, x2, #16

// Shift the state left by 16 bytes for the next iteration of the loop
    mov v0.16b, v5.16b
    mov v5.16b, v10.16b
    mov v10.16b, v15.16b

    b .Lseal_tail_64

.Lseal_tail_16:
// Here we handle the last [0,16) bytes of ciphertext that require a padded block
    cbz x2, .Lseal_hash_extra

    eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in
    eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
    not v22.16b, v20.16b

    mov x6, x2
    add x1, x1, x2

    cbz x4, .Lseal_tail_16_compose // No extra data to pad with, zero padding

    mov x7, #16 // We need to load some extra_in first for padding
    sub x7, x7, x2
    cmp x4, x7
    csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register
    mov x12, x7
    add x3, x3, x7
    sub x4, x4, x7

.Lseal_tail16_compose_extra_in:
    ext v20.16b, v20.16b, v20.16b, #15
    ldrb w11, [x3, #-1]!
    mov v20.b[0], w11
    subs x7, x7, #1
    b.gt .Lseal_tail16_compose_extra_in

    add x3, x3, x12

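// Editor's note (illustrative): the compose loop below builds the final padded
// block byte-by-byte from the back (ext by #15 shifts the register up one
// byte, then the new byte lands in lane 0). In parallel it builds a mask that
// is 0xff exactly over the plaintext positions, so only those bytes get
// encrypted while any extra_in bytes are hashed as-is. In C terms, roughly:
//
//   for (i = len - 1; i >= 0; i--) {     /* len = remaining pt bytes */
//       block = (block << 8) | pt[i];    /* v20: low bytes = plaintext */
//       mask  = (mask  << 8) | 0xff;     /* v21 */
//   }
//   ct = block ^ (keystream & mask);     /* and v0,v21 / eor v20,v0 */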
.Lseal_tail_16_compose:
    ext v20.16b, v20.16b, v20.16b, #15
    ldrb w11, [x1, #-1]!
    mov v20.b[0], w11
    ext v21.16b, v22.16b, v21.16b, #15
    subs x2, x2, #1
    b.gt .Lseal_tail_16_compose

    and v0.16b, v0.16b, v21.16b
    eor v20.16b, v20.16b, v0.16b
    mov v21.16b, v20.16b

.Lseal_tail_16_store:
    umov w11, v20.b[0]
    strb w11, [x0], #1
    ext v20.16b, v20.16b, v20.16b, #1
    subs x6, x6, #1
    b.gt .Lseal_tail_16_store

// Hash in the final ct block concatenated with extra_in
    mov x11, v21.d[0]
    mov x12, v21.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most

.Lseal_hash_extra:
    cbz x4, .Lseal_finalize

.Lseal_hash_extra_loop:
    cmp x4, #16
    b.lt .Lseal_hash_extra_tail
    ld1 {v20.16b}, [x3], #16
    mov x11, v20.d[0]
    mov x12, v20.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    sub x4, x4, #16
    b .Lseal_hash_extra_loop

.Lseal_hash_extra_tail:
    cbz x4, .Lseal_finalize
    eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext
    add x3, x3, x4

.Lseal_hash_extra_load:
    ext v20.16b, v20.16b, v20.16b, #15
    ldrb w11, [x3, #-1]!
    mov v20.b[0], w11
    subs x4, x4, #1
    b.gt .Lseal_hash_extra_load

// Hash in the final padded extra_in block
    mov x11, v20.d[0]
    mov x12, v20.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most

.Lseal_finalize:
    mov x11, v31.d[0]
    mov x12, v31.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
// Final reduction step
    sub x12, xzr, x15
    orr x13, xzr, #3
    subs x11, x8, #-5
    sbcs x12, x9, x12
    sbcs x13, x10, x13
    csel x8, x11, x8, cs
    csel x9, x12, x9, cs
    csel x10, x13, x10, cs
    mov x11, v27.d[0]
    mov x12, v27.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15

    stp x8, x9, [x5]
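// Editor's note (illustrative): the finalization above is the standard
// Poly1305 tag computation: hash in the length block (v31), conditionally
// subtract p = 2^130 - 5, then add the S key; only the low 128 bits are
// stored. A C sketch of the reduce-and-add-s step (names are illustrative):
//
//   #include <stdint.h>
//   typedef unsigned __int128 u128;
//   static void poly1305_tag(uint64_t h[3], uint64_t s0, uint64_t s1,
//                            uint64_t tag[2]) {
//       // h - p == h + 5 - 2^130; keep the difference only if h >= p,
//       // i.e. if h + 5 carries into bit 130 (g2 >= 4).
//       u128 t = (u128)h[0] + 5;
//       uint64_t g0 = (uint64_t)t;
//       t = (t >> 64) + h[1];
//       uint64_t g1 = (uint64_t)t;
//       uint64_t g2 = (uint64_t)(t >> 64) + h[2];
//       if (g2 >= 4) { h[0] = g0; h[1] = g1; }
//       t = (u128)h[0] + s0;             // tag = (h + s) mod 2^128
//       tag[0] = (uint64_t)t;
//       tag[1] = h[1] + s1 + (uint64_t)(t >> 64);
//   }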

    ldp d8, d9, [sp, #16]
    ldp d10, d11, [sp, #32]
    ldp d12, d13, [sp, #48]
    ldp d14, d15, [sp, #64]
.cfi_restore b15
.cfi_restore b14
.cfi_restore b13
.cfi_restore b12
.cfi_restore b11
.cfi_restore b10
.cfi_restore b9
.cfi_restore b8
    ldp x29, x30, [sp], 80
.cfi_restore w29
.cfi_restore w30
.cfi_def_cfa_offset 0
    AARCH64_VALIDATE_LINK_REGISTER
    ret

.Lseal_128:
// On some architectures preparing 5 blocks for small buffers is wasteful
    eor v25.16b, v25.16b, v25.16b
    mov x11, #1
    mov v25.s[0], w11
    mov v0.16b, v24.16b
    mov v1.16b, v24.16b
    mov v2.16b, v24.16b
    mov v5.16b, v28.16b
    mov v6.16b, v28.16b
    mov v7.16b, v28.16b
    mov v10.16b, v29.16b
    mov v11.16b, v29.16b
    mov v12.16b, v29.16b
    mov v17.16b, v30.16b
    add v15.4s, v17.4s, v25.4s
    add v16.4s, v15.4s, v25.4s

    mov x6, #10

.Lseal_128_rounds:
    add v0.4s, v0.4s, v5.4s
    add v1.4s, v1.4s, v6.4s
    add v2.4s, v2.4s, v7.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    rev32 v15.8h, v15.8h
    rev32 v16.8h, v16.8h
    rev32 v17.8h, v17.8h

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v5.16b, v5.16b, v10.16b
    eor v6.16b, v6.16b, v11.16b
    eor v7.16b, v7.16b, v12.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    ushr v5.4s, v6.4s, #20
    sli v5.4s, v6.4s, #12
    ushr v6.4s, v7.4s, #20
    sli v6.4s, v7.4s, #12

    add v0.4s, v0.4s, v20.4s
    add v1.4s, v1.4s, v5.4s
    add v2.4s, v2.4s, v6.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    tbl v15.16b, {v15.16b}, v26.16b
    tbl v16.16b, {v16.16b}, v26.16b
    tbl v17.16b, {v17.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v20.16b, v20.16b, v10.16b
    eor v5.16b, v5.16b, v11.16b
    eor v6.16b, v6.16b, v12.16b
    ushr v7.4s, v6.4s, #25
    sli v7.4s, v6.4s, #7
    ushr v6.4s, v5.4s, #25
    sli v6.4s, v5.4s, #7
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7

    ext v5.16b, v5.16b, v5.16b, #4
    ext v6.16b, v6.16b, v6.16b, #4
    ext v7.16b, v7.16b, v7.16b, #4

    ext v10.16b, v10.16b, v10.16b, #8
    ext v11.16b, v11.16b, v11.16b, #8
    ext v12.16b, v12.16b, v12.16b, #8

    ext v15.16b, v15.16b, v15.16b, #12
    ext v16.16b, v16.16b, v16.16b, #12
    ext v17.16b, v17.16b, v17.16b, #12
    add v0.4s, v0.4s, v5.4s
    add v1.4s, v1.4s, v6.4s
    add v2.4s, v2.4s, v7.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    rev32 v15.8h, v15.8h
    rev32 v16.8h, v16.8h
    rev32 v17.8h, v17.8h

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v5.16b, v5.16b, v10.16b
    eor v6.16b, v6.16b, v11.16b
    eor v7.16b, v7.16b, v12.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    ushr v5.4s, v6.4s, #20
    sli v5.4s, v6.4s, #12
    ushr v6.4s, v7.4s, #20
    sli v6.4s, v7.4s, #12

    add v0.4s, v0.4s, v20.4s
    add v1.4s, v1.4s, v5.4s
    add v2.4s, v2.4s, v6.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    tbl v15.16b, {v15.16b}, v26.16b
    tbl v16.16b, {v16.16b}, v26.16b
    tbl v17.16b, {v17.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v20.16b, v20.16b, v10.16b
    eor v5.16b, v5.16b, v11.16b
    eor v6.16b, v6.16b, v12.16b
    ushr v7.4s, v6.4s, #25
    sli v7.4s, v6.4s, #7
    ushr v6.4s, v5.4s, #25
    sli v6.4s, v5.4s, #7
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7

    ext v5.16b, v5.16b, v5.16b, #12
    ext v6.16b, v6.16b, v6.16b, #12
    ext v7.16b, v7.16b, v7.16b, #12

    ext v10.16b, v10.16b, v10.16b, #8
    ext v11.16b, v11.16b, v11.16b, #8
    ext v12.16b, v12.16b, v12.16b, #8

    ext v15.16b, v15.16b, v15.16b, #4
    ext v16.16b, v16.16b, v16.16b, #4
    ext v17.16b, v17.16b, v17.16b, #4
    subs x6, x6, #1
    b.hi .Lseal_128_rounds

    add v0.4s, v0.4s, v24.4s
    add v1.4s, v1.4s, v24.4s
    add v2.4s, v2.4s, v24.4s

    add v5.4s, v5.4s, v28.4s
    add v6.4s, v6.4s, v28.4s
    add v7.4s, v7.4s, v28.4s

// Only the first 32 bytes of the third block (counter = 0) are needed,
// so skip updating v12 and v17.
    add v10.4s, v10.4s, v29.4s
    add v11.4s, v11.4s, v29.4s

    add v30.4s, v30.4s, v25.4s
    add v15.4s, v15.4s, v30.4s
    add v30.4s, v30.4s, v25.4s
    add v16.4s, v16.4s, v30.4s

    and v2.16b, v2.16b, v27.16b
    mov x16, v2.d[0] // Move the R key to GPRs
    mov x17, v2.d[1]
    mov v27.16b, v7.16b // Store the S key

    bl .Lpoly_hash_ad_internal
    b .Lseal_tail
.cfi_endproc
.size chacha20_poly1305_seal,.-chacha20_poly1305_seal

/////////////////////////////////
//
// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
//
.globl chacha20_poly1305_open
.hidden chacha20_poly1305_open
.type chacha20_poly1305_open,%function
.align 6
chacha20_poly1305_open:
    AARCH64_SIGN_LINK_REGISTER
.cfi_startproc
    stp x29, x30, [sp, #-80]!
.cfi_def_cfa_offset 80
.cfi_offset w30, -72
.cfi_offset w29, -80
    mov x29, sp
// We probably could do .cfi_def_cfa w29, 80 at this point, but since
// we don't actually use the frame pointer like that, it's probably not
// worth bothering.
    stp d8, d9, [sp, #16]
    stp d10, d11, [sp, #32]
    stp d12, d13, [sp, #48]
    stp d14, d15, [sp, #64]
.cfi_offset b15, -8
.cfi_offset b14, -16
.cfi_offset b13, -24
.cfi_offset b12, -32
.cfi_offset b11, -40
.cfi_offset b10, -48
.cfi_offset b9, -56
.cfi_offset b8, -64

    adrp x11, .Lchacha20_consts
    add x11, x11, :lo12:.Lchacha20_consts

    ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
    ld1 {v28.16b - v30.16b}, [x5]

    mov x15, #1 // Prepare the Poly1305 state
    mov x8, #0
    mov x9, #0
    mov x10, #0

    mov v31.d[0], x4 // Store the input and aad lengths
    mov v31.d[1], x2

    cmp x2, #128
    b.le .Lopen_128 // Optimization for smaller buffers

// Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
    mov v0.16b, v24.16b
    mov v5.16b, v28.16b
    mov v10.16b, v29.16b
    mov v15.16b, v30.16b

    mov x6, #10

.align 5
.Lopen_init_rounds:
    add v0.4s, v0.4s, v5.4s
    eor v15.16b, v15.16b, v0.16b
    rev32 v15.8h, v15.8h

    add v10.4s, v10.4s, v15.4s
    eor v5.16b, v5.16b, v10.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    add v0.4s, v0.4s, v20.4s
    eor v15.16b, v15.16b, v0.16b
    tbl v15.16b, {v15.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    eor v20.16b, v20.16b, v10.16b
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7
    ext v5.16b, v5.16b, v5.16b, #4
    ext v10.16b, v10.16b, v10.16b, #8
    ext v15.16b, v15.16b, v15.16b, #12
    add v0.4s, v0.4s, v5.4s
    eor v15.16b, v15.16b, v0.16b
    rev32 v15.8h, v15.8h

    add v10.4s, v10.4s, v15.4s
    eor v5.16b, v5.16b, v10.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    add v0.4s, v0.4s, v20.4s
    eor v15.16b, v15.16b, v0.16b
    tbl v15.16b, {v15.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    eor v20.16b, v20.16b, v10.16b
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7
    ext v5.16b, v5.16b, v5.16b, #12
    ext v10.16b, v10.16b, v10.16b, #8
    ext v15.16b, v15.16b, v15.16b, #4
    subs x6, x6, #1
    b.hi .Lopen_init_rounds

    add v0.4s, v0.4s, v24.4s
    add v5.4s, v5.4s, v28.4s

    and v0.16b, v0.16b, v27.16b
    mov x16, v0.d[0] // Move the R key to GPRs
    mov x17, v0.d[1]
    mov v27.16b, v5.16b // Store the S key

    bl .Lpoly_hash_ad_internal

.Lopen_ad_done:
    mov x3, x1

// Each iteration of the loop hashes 320 bytes and prepares keystream for 320 bytes
.Lopen_main_loop:

    cmp x2, #192
    b.lt .Lopen_tail

    adrp x11, .Lchacha20_consts
    add x11, x11, :lo12:.Lchacha20_consts

    ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
    mov v4.16b, v24.16b

    ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
    mov v9.16b, v28.16b

    ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
    mov v14.16b, v29.16b

    ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
    sub x5, x5, #32
    add v15.4s, v15.4s, v25.4s
    mov v19.16b, v30.16b

    eor v20.16b, v20.16b, v20.16b // zero
    not v21.16b, v20.16b // -1
    sub v21.4s, v25.4s, v21.4s // Add +1
    ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
    add v19.4s, v19.4s, v20.4s

    lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12
    sub x4, x4, #10

    mov x7, #10
    subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
    csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full

    cbz x7, .Lopen_main_loop_rounds_short

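// Editor's note (interpretive sketch of the scheduling above): every
// double-round of the loop below hashes one 16-byte ciphertext block
// mid-round; the first x7 ("itr2") of the ten double-rounds additionally
// hash a block at the loop head, while the remaining x6 ("itr1")
// double-rounds, entered at .Lopen_main_loop_rounds_short, hash only the
// mid-round block. With x4 = len/16 - 10 that totals
//
//   2*x7 + (10 - x7) = x4 + 10 = len/16 blocks   (when len < 320),
//
// and 20 blocks (320 bytes) once x4 >= 10, matching the keystream
// produced per pass.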
.align 5
.Lopen_main_loop_rounds:
    ldp x11, x12, [x3], 16
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
.Lopen_main_loop_rounds_short:
    add v0.4s, v0.4s, v5.4s
    add v1.4s, v1.4s, v6.4s
    add v2.4s, v2.4s, v7.4s
    add v3.4s, v3.4s, v8.4s
    add v4.4s, v4.4s, v9.4s

    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    eor v18.16b, v18.16b, v3.16b
    eor v19.16b, v19.16b, v4.16b

    rev32 v15.8h, v15.8h
    rev32 v16.8h, v16.8h
    rev32 v17.8h, v17.8h
    rev32 v18.8h, v18.8h
    rev32 v19.8h, v19.8h

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    add v13.4s, v13.4s, v18.4s
    add v14.4s, v14.4s, v19.4s

    eor v5.16b, v5.16b, v10.16b
    eor v6.16b, v6.16b, v11.16b
    eor v7.16b, v7.16b, v12.16b
    eor v8.16b, v8.16b, v13.16b
    eor v9.16b, v9.16b, v14.16b

    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    ushr v5.4s, v6.4s, #20
    sli v5.4s, v6.4s, #12
    ushr v6.4s, v7.4s, #20
    sli v6.4s, v7.4s, #12
    ushr v7.4s, v8.4s, #20
    sli v7.4s, v8.4s, #12
    ushr v8.4s, v9.4s, #20
    sli v8.4s, v9.4s, #12

    add v0.4s, v0.4s, v20.4s
    add v1.4s, v1.4s, v5.4s
    add v2.4s, v2.4s, v6.4s
    add v3.4s, v3.4s, v7.4s
    add v4.4s, v4.4s, v8.4s

    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    eor v18.16b, v18.16b, v3.16b
    eor v19.16b, v19.16b, v4.16b

    tbl v15.16b, {v15.16b}, v26.16b
    tbl v16.16b, {v16.16b}, v26.16b
    tbl v17.16b, {v17.16b}, v26.16b
    tbl v18.16b, {v18.16b}, v26.16b
    tbl v19.16b, {v19.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    add v13.4s, v13.4s, v18.4s
    add v14.4s, v14.4s, v19.4s

    eor v20.16b, v20.16b, v10.16b
    eor v5.16b, v5.16b, v11.16b
    eor v6.16b, v6.16b, v12.16b
    eor v7.16b, v7.16b, v13.16b
    eor v8.16b, v8.16b, v14.16b

    ushr v9.4s, v8.4s, #25
    sli v9.4s, v8.4s, #7
    ushr v8.4s, v7.4s, #25
    sli v8.4s, v7.4s, #7
    ushr v7.4s, v6.4s, #25
    sli v7.4s, v6.4s, #7
    ushr v6.4s, v5.4s, #25
    sli v6.4s, v5.4s, #7
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7

    ext v9.16b, v9.16b, v9.16b, #4
    ext v14.16b, v14.16b, v14.16b, #8
    ext v19.16b, v19.16b, v19.16b, #12
    ldp x11, x12, [x3], 16
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    add v0.4s, v0.4s, v6.4s
    add v1.4s, v1.4s, v7.4s
    add v2.4s, v2.4s, v8.4s
    add v3.4s, v3.4s, v5.4s
    add v4.4s, v4.4s, v9.4s

    eor v18.16b, v18.16b, v0.16b
    eor v15.16b, v15.16b, v1.16b
    eor v16.16b, v16.16b, v2.16b
    eor v17.16b, v17.16b, v3.16b
    eor v19.16b, v19.16b, v4.16b

    rev32 v18.8h, v18.8h
    rev32 v15.8h, v15.8h
    rev32 v16.8h, v16.8h
    rev32 v17.8h, v17.8h
    rev32 v19.8h, v19.8h

    add v12.4s, v12.4s, v18.4s
    add v13.4s, v13.4s, v15.4s
    add v10.4s, v10.4s, v16.4s
    add v11.4s, v11.4s, v17.4s
    add v14.4s, v14.4s, v19.4s

    eor v6.16b, v6.16b, v12.16b
    eor v7.16b, v7.16b, v13.16b
    eor v8.16b, v8.16b, v10.16b
    eor v5.16b, v5.16b, v11.16b
    eor v9.16b, v9.16b, v14.16b

    ushr v20.4s, v6.4s, #20
    sli v20.4s, v6.4s, #12
    ushr v6.4s, v7.4s, #20
    sli v6.4s, v7.4s, #12
    ushr v7.4s, v8.4s, #20
    sli v7.4s, v8.4s, #12
    ushr v8.4s, v5.4s, #20
    sli v8.4s, v5.4s, #12
    ushr v5.4s, v9.4s, #20
    sli v5.4s, v9.4s, #12

    add v0.4s, v0.4s, v20.4s
    add v1.4s, v1.4s, v6.4s
    add v2.4s, v2.4s, v7.4s
    add v3.4s, v3.4s, v8.4s
    add v4.4s, v4.4s, v5.4s

    eor v18.16b, v18.16b, v0.16b
    eor v15.16b, v15.16b, v1.16b
    eor v16.16b, v16.16b, v2.16b
    eor v17.16b, v17.16b, v3.16b
    eor v19.16b, v19.16b, v4.16b

    tbl v18.16b, {v18.16b}, v26.16b
    tbl v15.16b, {v15.16b}, v26.16b
    tbl v16.16b, {v16.16b}, v26.16b
    tbl v17.16b, {v17.16b}, v26.16b
    tbl v19.16b, {v19.16b}, v26.16b

    add v12.4s, v12.4s, v18.4s
    add v13.4s, v13.4s, v15.4s
    add v10.4s, v10.4s, v16.4s
    add v11.4s, v11.4s, v17.4s
    add v14.4s, v14.4s, v19.4s

    eor v20.16b, v20.16b, v12.16b
    eor v6.16b, v6.16b, v13.16b
    eor v7.16b, v7.16b, v10.16b
    eor v8.16b, v8.16b, v11.16b
    eor v5.16b, v5.16b, v14.16b

    ushr v9.4s, v5.4s, #25
    sli v9.4s, v5.4s, #7
    ushr v5.4s, v8.4s, #25
    sli v5.4s, v8.4s, #7
    ushr v8.4s, v7.4s, #25
    sli v8.4s, v7.4s, #7
    ushr v7.4s, v6.4s, #25
    sli v7.4s, v6.4s, #7
    ushr v6.4s, v20.4s, #25
    sli v6.4s, v20.4s, #7

    ext v9.16b, v9.16b, v9.16b, #12
    ext v14.16b, v14.16b, v14.16b, #8
    ext v19.16b, v19.16b, v19.16b, #4
    subs x7, x7, #1
    b.gt .Lopen_main_loop_rounds
    subs x6, x6, #1
    b.ge .Lopen_main_loop_rounds_short

    eor v20.16b, v20.16b, v20.16b // zero
    not v21.16b, v20.16b // -1
    sub v21.4s, v25.4s, v21.4s // Add +1
    ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
    add v19.4s, v19.4s, v20.4s

    add v15.4s, v15.4s, v25.4s
    mov x11, #5
    dup v20.4s, w11
    add v25.4s, v25.4s, v20.4s

    zip1 v20.4s, v0.4s, v1.4s
    zip2 v21.4s, v0.4s, v1.4s
    zip1 v22.4s, v2.4s, v3.4s
    zip2 v23.4s, v2.4s, v3.4s

    zip1 v0.2d, v20.2d, v22.2d
    zip2 v1.2d, v20.2d, v22.2d
    zip1 v2.2d, v21.2d, v23.2d
    zip2 v3.2d, v21.2d, v23.2d

    zip1 v20.4s, v5.4s, v6.4s
    zip2 v21.4s, v5.4s, v6.4s
    zip1 v22.4s, v7.4s, v8.4s
    zip2 v23.4s, v7.4s, v8.4s

    zip1 v5.2d, v20.2d, v22.2d
    zip2 v6.2d, v20.2d, v22.2d
    zip1 v7.2d, v21.2d, v23.2d
    zip2 v8.2d, v21.2d, v23.2d

    zip1 v20.4s, v10.4s, v11.4s
    zip2 v21.4s, v10.4s, v11.4s
    zip1 v22.4s, v12.4s, v13.4s
    zip2 v23.4s, v12.4s, v13.4s

    zip1 v10.2d, v20.2d, v22.2d
    zip2 v11.2d, v20.2d, v22.2d
    zip1 v12.2d, v21.2d, v23.2d
    zip2 v13.2d, v21.2d, v23.2d

    zip1 v20.4s, v15.4s, v16.4s
    zip2 v21.4s, v15.4s, v16.4s
    zip1 v22.4s, v17.4s, v18.4s
    zip2 v23.4s, v17.4s, v18.4s

    zip1 v15.2d, v20.2d, v22.2d
    zip2 v16.2d, v20.2d, v22.2d
    zip1 v17.2d, v21.2d, v23.2d
    zip2 v18.2d, v21.2d, v23.2d

    add v0.4s, v0.4s, v24.4s
    add v5.4s, v5.4s, v28.4s
    add v10.4s, v10.4s, v29.4s
    add v15.4s, v15.4s, v30.4s

    add v1.4s, v1.4s, v24.4s
    add v6.4s, v6.4s, v28.4s
    add v11.4s, v11.4s, v29.4s
    add v16.4s, v16.4s, v30.4s

    add v2.4s, v2.4s, v24.4s
    add v7.4s, v7.4s, v28.4s
    add v12.4s, v12.4s, v29.4s
    add v17.4s, v17.4s, v30.4s

    add v3.4s, v3.4s, v24.4s
    add v8.4s, v8.4s, v28.4s
    add v13.4s, v13.4s, v29.4s
    add v18.4s, v18.4s, v30.4s

    add v4.4s, v4.4s, v24.4s
    add v9.4s, v9.4s, v28.4s
    add v14.4s, v14.4s, v29.4s
    add v19.4s, v19.4s, v30.4s

// We can always safely store 192 bytes
    ld1 {v20.16b - v23.16b}, [x1], #64
    eor v20.16b, v20.16b, v0.16b
    eor v21.16b, v21.16b, v5.16b
    eor v22.16b, v22.16b, v10.16b
    eor v23.16b, v23.16b, v15.16b
    st1 {v20.16b - v23.16b}, [x0], #64

    ld1 {v20.16b - v23.16b}, [x1], #64
    eor v20.16b, v20.16b, v1.16b
    eor v21.16b, v21.16b, v6.16b
    eor v22.16b, v22.16b, v11.16b
    eor v23.16b, v23.16b, v16.16b
    st1 {v20.16b - v23.16b}, [x0], #64

    ld1 {v20.16b - v23.16b}, [x1], #64
    eor v20.16b, v20.16b, v2.16b
    eor v21.16b, v21.16b, v7.16b
    eor v22.16b, v22.16b, v12.16b
    eor v23.16b, v23.16b, v17.16b
    st1 {v20.16b - v23.16b}, [x0], #64

    sub x2, x2, #192

    mov v0.16b, v3.16b
    mov v5.16b, v8.16b
    mov v10.16b, v13.16b
    mov v15.16b, v18.16b

    cmp x2, #64
    b.lt .Lopen_tail_64_store

    ld1 {v20.16b - v23.16b}, [x1], #64
    eor v20.16b, v20.16b, v3.16b
    eor v21.16b, v21.16b, v8.16b
    eor v22.16b, v22.16b, v13.16b
    eor v23.16b, v23.16b, v18.16b
    st1 {v20.16b - v23.16b}, [x0], #64

    sub x2, x2, #64

    mov v0.16b, v4.16b
    mov v5.16b, v9.16b
    mov v10.16b, v14.16b
    mov v15.16b, v19.16b

    cmp x2, #64
    b.lt .Lopen_tail_64_store

    ld1 {v20.16b - v23.16b}, [x1], #64
    eor v20.16b, v20.16b, v4.16b
    eor v21.16b, v21.16b, v9.16b
    eor v22.16b, v22.16b, v14.16b
    eor v23.16b, v23.16b, v19.16b
    st1 {v20.16b - v23.16b}, [x0], #64

    sub x2, x2, #64
    b .Lopen_main_loop

.Lopen_tail:

    cbz x2, .Lopen_finalize

    lsr x4, x2, #4 // How many whole blocks we have to hash

    cmp x2, #64
    b.le .Lopen_tail_64
    cmp x2, #128
    b.le .Lopen_tail_128

.Lopen_tail_192:
// We need three more blocks
    mov v0.16b, v24.16b
    mov v1.16b, v24.16b
    mov v2.16b, v24.16b
    mov v5.16b, v28.16b
    mov v6.16b, v28.16b
    mov v7.16b, v28.16b
    mov v10.16b, v29.16b
    mov v11.16b, v29.16b
    mov v12.16b, v29.16b
    mov v15.16b, v30.16b
    mov v16.16b, v30.16b
    mov v17.16b, v30.16b
    eor v23.16b, v23.16b, v23.16b
    eor v21.16b, v21.16b, v21.16b
    ins v23.s[0], v25.s[0]
    ins v21.d[0], x15

    add v22.4s, v23.4s, v21.4s
    add v21.4s, v22.4s, v21.4s

    add v15.4s, v15.4s, v21.4s
    add v16.4s, v16.4s, v23.4s
    add v17.4s, v17.4s, v22.4s

    mov x7, #10
    subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash
    csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
    sub x4, x4, x7

    cbz x7, .Lopen_tail_192_rounds_no_hash

.Lopen_tail_192_rounds:
    ldp x11, x12, [x3], 16
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
.Lopen_tail_192_rounds_no_hash:
    add v0.4s, v0.4s, v5.4s
    add v1.4s, v1.4s, v6.4s
    add v2.4s, v2.4s, v7.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    rev32 v15.8h, v15.8h
    rev32 v16.8h, v16.8h
    rev32 v17.8h, v17.8h

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v5.16b, v5.16b, v10.16b

    add v0.4s, v0.4s, v20.4s
    add v1.4s, v1.4s, v5.4s
    add v2.4s, v2.4s, v6.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    tbl v15.16b, {v15.16b}, v26.16b
    tbl v16.16b, {v16.16b}, v26.16b
    tbl v17.16b, {v17.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v20.16b, v20.16b, v10.16b
    eor v5.16b, v5.16b, v11.16b
    eor v6.16b, v6.16b, v12.16b
    ushr v7.4s, v6.4s, #25
    sli v7.4s, v6.4s, #7
    ushr v6.4s, v5.4s, #25
    sli v6.4s, v5.4s, #7
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7

    ext v5.16b, v5.16b, v5.16b, #4
    ext v6.16b, v6.16b, v6.16b, #4
    ext v7.16b, v7.16b, v7.16b, #4

    ext v10.16b, v10.16b, v10.16b, #8
    ext v11.16b, v11.16b, v11.16b, #8
    ext v12.16b, v12.16b, v12.16b, #8

    ext v15.16b, v15.16b, v15.16b, #12
    ext v16.16b, v16.16b, v16.16b, #12
    ext v17.16b, v17.16b, v17.16b, #12
    add v0.4s, v0.4s, v5.4s
    add v1.4s, v1.4s, v6.4s
    add v2.4s, v2.4s, v7.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    rev32 v15.8h, v15.8h
    rev32 v16.8h, v16.8h
    rev32 v17.8h, v17.8h

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v5.16b, v5.16b, v10.16b
    eor v6.16b, v6.16b, v11.16b
    eor v7.16b, v7.16b, v12.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    ushr v5.4s, v6.4s, #20
    sli v5.4s, v6.4s, #12
    ushr v6.4s, v7.4s, #20
    sli v6.4s, v7.4s, #12

    add v0.4s, v0.4s, v20.4s
    add v1.4s, v1.4s, v5.4s
    add v2.4s, v2.4s, v6.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    tbl v15.16b, {v15.16b}, v26.16b
    tbl v16.16b, {v16.16b}, v26.16b
    tbl v17.16b, {v17.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v20.16b, v20.16b, v10.16b
    eor v5.16b, v5.16b, v11.16b
    eor v6.16b, v6.16b, v12.16b
    ushr v7.4s, v6.4s, #25
    sli v7.4s, v6.4s, #7
    ushr v6.4s, v5.4s, #25
    sli v6.4s, v5.4s, #7
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7

    ext v5.16b, v5.16b, v5.16b, #12
    ext v6.16b, v6.16b, v6.16b, #12
    ext v7.16b, v7.16b, v7.16b, #12

    ext v10.16b, v10.16b, v10.16b, #8
    ext v11.16b, v11.16b, v11.16b, #8
    ext v12.16b, v12.16b, v12.16b, #8

    ext v15.16b, v15.16b, v15.16b, #4
    ext v16.16b, v16.16b, v16.16b, #4
    ext v17.16b, v17.16b, v17.16b, #4
    subs x7, x7, #1
    b.gt .Lopen_tail_192_rounds
    subs x6, x6, #1
    b.ge .Lopen_tail_192_rounds_no_hash

    // We hashed 160 bytes at most, may still have 32 bytes left
.Lopen_tail_192_hash:
    cbz x4, .Lopen_tail_192_hash_done
    ldp x11, x12, [x3], 16
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    sub x4, x4, #1
    b .Lopen_tail_192_hash
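
    // Each block step above is the standard Poly1305 update; with h the
    // 130-bit accumulator in x8:x9:x10 and r0:r1 the clamped key halves
    // in x16:x17, it computes roughly:
    //   h = (h + block + 2^128) * r  mod  2^130 - 5
    // The partial reduction folds the product's top bits back in using
    // 2^130 == 5 (mod 2^130 - 5), i.e. h = low130(t) + 4*c + c with
    // c = t >> 130, which is what the and/extr/lsr arithmetic implements.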

.Lopen_tail_192_hash_done:

    add v0.4s, v0.4s, v24.4s
    add v1.4s, v1.4s, v24.4s
    add v2.4s, v2.4s, v24.4s
    add v5.4s, v5.4s, v28.4s
    add v6.4s, v6.4s, v28.4s
    add v7.4s, v7.4s, v28.4s
    add v10.4s, v10.4s, v29.4s
    add v11.4s, v11.4s, v29.4s
    add v12.4s, v12.4s, v29.4s
    add v15.4s, v15.4s, v30.4s
    add v16.4s, v16.4s, v30.4s
    add v17.4s, v17.4s, v30.4s

    add v15.4s, v15.4s, v21.4s
    add v16.4s, v16.4s, v23.4s
    add v17.4s, v17.4s, v22.4s

    ld1 {v20.16b - v23.16b}, [x1], #64

    eor v20.16b, v20.16b, v1.16b
    eor v21.16b, v21.16b, v6.16b
    eor v22.16b, v22.16b, v11.16b
    eor v23.16b, v23.16b, v16.16b

    st1 {v20.16b - v23.16b}, [x0], #64

    ld1 {v20.16b - v23.16b}, [x1], #64

    eor v20.16b, v20.16b, v2.16b
    eor v21.16b, v21.16b, v7.16b
    eor v22.16b, v22.16b, v12.16b
    eor v23.16b, v23.16b, v17.16b

    st1 {v20.16b - v23.16b}, [x0], #64

    sub x2, x2, #128
    b .Lopen_tail_64_store

.Lopen_tail_128:
    // We need two more blocks
    mov v0.16b, v24.16b
    mov v1.16b, v24.16b
    mov v5.16b, v28.16b
    mov v6.16b, v28.16b
    mov v10.16b, v29.16b
    mov v11.16b, v29.16b
    mov v15.16b, v30.16b
    mov v16.16b, v30.16b
    eor v23.16b, v23.16b, v23.16b
    eor v22.16b, v22.16b, v22.16b
    ins v23.s[0], v25.s[0]
    ins v22.d[0], x15
    add v22.4s, v22.4s, v23.4s

    add v15.4s, v15.4s, v22.4s
    add v16.4s, v16.4s, v23.4s

    mov x6, #10
    sub x6, x6, x4

.Lopen_tail_128_rounds:
    add v0.4s, v0.4s, v5.4s
    eor v15.16b, v15.16b, v0.16b
    rev32 v15.8h, v15.8h

    add v10.4s, v10.4s, v15.4s
    eor v5.16b, v5.16b, v10.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    add v0.4s, v0.4s, v20.4s
    eor v15.16b, v15.16b, v0.16b
    tbl v15.16b, {v15.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    eor v20.16b, v20.16b, v10.16b
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7
    ext v5.16b, v5.16b, v5.16b, #4
    ext v10.16b, v10.16b, v10.16b, #8
    ext v15.16b, v15.16b, v15.16b, #12
    add v1.4s, v1.4s, v6.4s
    eor v16.16b, v16.16b, v1.16b
    rev32 v16.8h, v16.8h

    add v11.4s, v11.4s, v16.4s
    eor v6.16b, v6.16b, v11.16b
    ushr v20.4s, v6.4s, #20
    sli v20.4s, v6.4s, #12
    add v1.4s, v1.4s, v20.4s
    eor v16.16b, v16.16b, v1.16b
    tbl v16.16b, {v16.16b}, v26.16b

    add v11.4s, v11.4s, v16.4s
    eor v20.16b, v20.16b, v11.16b
    ushr v6.4s, v20.4s, #25
    sli v6.4s, v20.4s, #7
    ext v6.16b, v6.16b, v6.16b, #4
    ext v11.16b, v11.16b, v11.16b, #8
    ext v16.16b, v16.16b, v16.16b, #12
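
    // The ext rotations above shift the second, third and fourth rows of
    // each state by one, two and three lanes, so the quarter-rounds that
    // follow operate on the state diagonals; the mirrored #12/#8/#4
    // pattern later in the iteration moves the rows back into column
    // position, completing one ChaCha20 double round.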
    add v0.4s, v0.4s, v5.4s
    eor v15.16b, v15.16b, v0.16b
    rev32 v15.8h, v15.8h

    add v10.4s, v10.4s, v15.4s
    eor v5.16b, v5.16b, v10.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    add v0.4s, v0.4s, v20.4s
    eor v15.16b, v15.16b, v0.16b
    tbl v15.16b, {v15.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    eor v20.16b, v20.16b, v10.16b
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7
    ext v5.16b, v5.16b, v5.16b, #12
    ext v10.16b, v10.16b, v10.16b, #8
    ext v15.16b, v15.16b, v15.16b, #4
    add v1.4s, v1.4s, v6.4s
    eor v16.16b, v16.16b, v1.16b
    rev32 v16.8h, v16.8h

    add v11.4s, v11.4s, v16.4s
    eor v6.16b, v6.16b, v11.16b
    ushr v20.4s, v6.4s, #20
    sli v20.4s, v6.4s, #12
    add v1.4s, v1.4s, v20.4s
    eor v16.16b, v16.16b, v1.16b
    tbl v16.16b, {v16.16b}, v26.16b

    add v11.4s, v11.4s, v16.4s
    eor v20.16b, v20.16b, v11.16b
    ushr v6.4s, v20.4s, #25
    sli v6.4s, v20.4s, #7
    ext v6.16b, v6.16b, v6.16b, #12
    ext v11.16b, v11.16b, v11.16b, #8
    ext v16.16b, v16.16b, v16.16b, #4
    subs x6, x6, #1
    b.gt .Lopen_tail_128_rounds
    cbz x4, .Lopen_tail_128_rounds_done
    subs x4, x4, #1
    ldp x11, x12, [x3], 16
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    b .Lopen_tail_128_rounds

.Lopen_tail_128_rounds_done:
    add v0.4s, v0.4s, v24.4s
    add v1.4s, v1.4s, v24.4s
    add v5.4s, v5.4s, v28.4s
    add v6.4s, v6.4s, v28.4s
    add v10.4s, v10.4s, v29.4s
    add v11.4s, v11.4s, v29.4s
    add v15.4s, v15.4s, v30.4s
    add v16.4s, v16.4s, v30.4s
    add v15.4s, v15.4s, v22.4s
    add v16.4s, v16.4s, v23.4s

    ld1 {v20.16b - v23.16b}, [x1], #64

    eor v20.16b, v20.16b, v1.16b
    eor v21.16b, v21.16b, v6.16b
    eor v22.16b, v22.16b, v11.16b
    eor v23.16b, v23.16b, v16.16b

    st1 {v20.16b - v23.16b}, [x0], #64
    sub x2, x2, #64

    b .Lopen_tail_64_store

.Lopen_tail_64:
    // We just need a single block
    mov v0.16b, v24.16b
    mov v5.16b, v28.16b
    mov v10.16b, v29.16b
    mov v15.16b, v30.16b
    eor v23.16b, v23.16b, v23.16b
    ins v23.s[0], v25.s[0]
    add v15.4s, v15.4s, v23.4s

    mov x6, #10
    sub x6, x6, x4
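
    // x4 counts the 16-byte ciphertext blocks still to be authenticated.
    // The first (10 - x4) double rounds below run without hashing; each
    // remaining iteration folds one Poly1305 block in alongside the
    // rounds, so the hash finishes together with the keystream block.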

.Lopen_tail_64_rounds:
    add v0.4s, v0.4s, v5.4s
    eor v15.16b, v15.16b, v0.16b
    rev32 v15.8h, v15.8h

    add v10.4s, v10.4s, v15.4s
    eor v5.16b, v5.16b, v10.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    add v0.4s, v0.4s, v20.4s
    eor v15.16b, v15.16b, v0.16b
    tbl v15.16b, {v15.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    eor v20.16b, v20.16b, v10.16b
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7
    ext v5.16b, v5.16b, v5.16b, #4
    ext v10.16b, v10.16b, v10.16b, #8
    ext v15.16b, v15.16b, v15.16b, #12
    add v0.4s, v0.4s, v5.4s
    eor v15.16b, v15.16b, v0.16b
    rev32 v15.8h, v15.8h

    add v10.4s, v10.4s, v15.4s
    eor v5.16b, v5.16b, v10.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    add v0.4s, v0.4s, v20.4s
    eor v15.16b, v15.16b, v0.16b
    tbl v15.16b, {v15.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    eor v20.16b, v20.16b, v10.16b
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7
    ext v5.16b, v5.16b, v5.16b, #12
    ext v10.16b, v10.16b, v10.16b, #8
    ext v15.16b, v15.16b, v15.16b, #4
    subs x6, x6, #1
    b.gt .Lopen_tail_64_rounds
    cbz x4, .Lopen_tail_64_rounds_done
    subs x4, x4, #1
    ldp x11, x12, [x3], 16
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    b .Lopen_tail_64_rounds

.Lopen_tail_64_rounds_done:
    add v0.4s, v0.4s, v24.4s
    add v5.4s, v5.4s, v28.4s
    add v10.4s, v10.4s, v29.4s
    add v15.4s, v15.4s, v30.4s
    add v15.4s, v15.4s, v23.4s

.Lopen_tail_64_store:
    cmp x2, #16
    b.lt .Lopen_tail_16

    ld1 {v20.16b}, [x1], #16
    eor v20.16b, v20.16b, v0.16b
    st1 {v20.16b}, [x0], #16
    mov v0.16b, v5.16b
    mov v5.16b, v10.16b
    mov v10.16b, v15.16b
    sub x2, x2, #16
    b .Lopen_tail_64_store

.Lopen_tail_16:
    // Here we handle the last [0,16) bytes that require a padded block
    cbz x2, .Lopen_finalize

    eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
    eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
    not v22.16b, v20.16b

    add x7, x1, x2
    mov x6, x2

.Lopen_tail_16_compose:
    ext v20.16b, v20.16b, v20.16b, #15
    ldrb w11, [x7, #-1]!
    mov v20.b[0], w11
    ext v21.16b, v22.16b, v21.16b, #15
    subs x2, x2, #1
    b.gt .Lopen_tail_16_compose
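
    // v20 now holds the x6 trailing ciphertext bytes, shifted in one at
    // a time from the end, and v21 holds an 0xff byte for each of them.
    // The AND below zero-pads the partial block, roughly:
    //   block[i] = (i < n) ? ct[len - n + i] : 0;
    // so it can be hashed like a full 16-byte Poly1305 block.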

    and v20.16b, v20.16b, v21.16b
    // Hash in the final padded block
    mov x11, v20.d[0]
    mov x12, v20.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    eor v20.16b, v20.16b, v0.16b

.Lopen_tail_16_store:
    umov w11, v20.b[0]
    strb w11, [x0], #1
    ext v20.16b, v20.16b, v20.16b, #1
    subs x6, x6, #1
    b.gt .Lopen_tail_16_store

.Lopen_finalize:
    mov x11, v31.d[0]
    mov x12, v31.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
# Final reduction step
    sub x12, xzr, x15
    orr x13, xzr, #3
    subs x11, x8, #-5
    sbcs x12, x9, x12
    sbcs x13, x10, x13
    csel x8, x11, x8, cs
    csel x9, x12, x9, cs
    csel x10, x13, x10, cs
    mov x11, v27.d[0]
    mov x12, v27.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15

    stp x8, x9, [x5]

    ldp d8, d9, [sp, #16]
    ldp d10, d11, [sp, #32]
    ldp d12, d13, [sp, #48]
    ldp d14, d15, [sp, #64]
.cfi_restore b15
.cfi_restore b14
.cfi_restore b13
.cfi_restore b12
.cfi_restore b11
.cfi_restore b10
.cfi_restore b9
.cfi_restore b8
    ldp x29, x30, [sp], 80
.cfi_restore w29
.cfi_restore w30
.cfi_def_cfa_offset 0
    AARCH64_VALIDATE_LINK_REGISTER
    ret

.Lopen_128:
    // On some architectures preparing 5 blocks for small buffers is wasteful
    eor v25.16b, v25.16b, v25.16b
    mov x11, #1
    mov v25.s[0], w11
    mov v0.16b, v24.16b
    mov v1.16b, v24.16b
    mov v2.16b, v24.16b
    mov v5.16b, v28.16b
    mov v6.16b, v28.16b
    mov v7.16b, v28.16b
    mov v10.16b, v29.16b
    mov v11.16b, v29.16b
    mov v12.16b, v29.16b
    mov v17.16b, v30.16b
    add v15.4s, v17.4s, v25.4s
    add v16.4s, v15.4s, v25.4s
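
    // For at most 128 bytes only three ChaCha20 blocks are computed: the
    // counter-0 block in the v2/v7/v12/v17 column supplies the Poly1305
    // one-time key, while the counter-1 and counter-2 blocks (v15 and
    // v16 hold the bumped counters) supply keystream for the data.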

    mov x6, #10

.Lopen_128_rounds:
    add v0.4s, v0.4s, v5.4s
    add v1.4s, v1.4s, v6.4s
    add v2.4s, v2.4s, v7.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    rev32 v15.8h, v15.8h
    rev32 v16.8h, v16.8h
    rev32 v17.8h, v17.8h

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v5.16b, v5.16b, v10.16b
    eor v6.16b, v6.16b, v11.16b
    eor v7.16b, v7.16b, v12.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    ushr v5.4s, v6.4s, #20
    sli v5.4s, v6.4s, #12
    ushr v6.4s, v7.4s, #20
    sli v6.4s, v7.4s, #12

    add v0.4s, v0.4s, v20.4s
    add v1.4s, v1.4s, v5.4s
    add v2.4s, v2.4s, v6.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    tbl v15.16b, {v15.16b}, v26.16b
    tbl v16.16b, {v16.16b}, v26.16b
    tbl v17.16b, {v17.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v20.16b, v20.16b, v10.16b
    eor v5.16b, v5.16b, v11.16b
    eor v6.16b, v6.16b, v12.16b
    ushr v7.4s, v6.4s, #25
    sli v7.4s, v6.4s, #7
    ushr v6.4s, v5.4s, #25
    sli v6.4s, v5.4s, #7
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7

    ext v5.16b, v5.16b, v5.16b, #4
    ext v6.16b, v6.16b, v6.16b, #4
    ext v7.16b, v7.16b, v7.16b, #4

    ext v10.16b, v10.16b, v10.16b, #8
    ext v11.16b, v11.16b, v11.16b, #8
    ext v12.16b, v12.16b, v12.16b, #8

    ext v15.16b, v15.16b, v15.16b, #12
    ext v16.16b, v16.16b, v16.16b, #12
    ext v17.16b, v17.16b, v17.16b, #12
    add v0.4s, v0.4s, v5.4s
    add v1.4s, v1.4s, v6.4s
    add v2.4s, v2.4s, v7.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    rev32 v15.8h, v15.8h
    rev32 v16.8h, v16.8h
    rev32 v17.8h, v17.8h

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v5.16b, v5.16b, v10.16b
    eor v6.16b, v6.16b, v11.16b
    eor v7.16b, v7.16b, v12.16b
    ushr v20.4s, v5.4s, #20
    sli v20.4s, v5.4s, #12
    ushr v5.4s, v6.4s, #20
    sli v5.4s, v6.4s, #12
    ushr v6.4s, v7.4s, #20
    sli v6.4s, v7.4s, #12

    add v0.4s, v0.4s, v20.4s
    add v1.4s, v1.4s, v5.4s
    add v2.4s, v2.4s, v6.4s
    eor v15.16b, v15.16b, v0.16b
    eor v16.16b, v16.16b, v1.16b
    eor v17.16b, v17.16b, v2.16b
    tbl v15.16b, {v15.16b}, v26.16b
    tbl v16.16b, {v16.16b}, v26.16b
    tbl v17.16b, {v17.16b}, v26.16b

    add v10.4s, v10.4s, v15.4s
    add v11.4s, v11.4s, v16.4s
    add v12.4s, v12.4s, v17.4s
    eor v20.16b, v20.16b, v10.16b
    eor v5.16b, v5.16b, v11.16b
    eor v6.16b, v6.16b, v12.16b
    ushr v7.4s, v6.4s, #25
    sli v7.4s, v6.4s, #7
    ushr v6.4s, v5.4s, #25
    sli v6.4s, v5.4s, #7
    ushr v5.4s, v20.4s, #25
    sli v5.4s, v20.4s, #7

    ext v5.16b, v5.16b, v5.16b, #12
    ext v6.16b, v6.16b, v6.16b, #12
    ext v7.16b, v7.16b, v7.16b, #12

    ext v10.16b, v10.16b, v10.16b, #8
    ext v11.16b, v11.16b, v11.16b, #8
    ext v12.16b, v12.16b, v12.16b, #8

    ext v15.16b, v15.16b, v15.16b, #4
    ext v16.16b, v16.16b, v16.16b, #4
    ext v17.16b, v17.16b, v17.16b, #4
    subs x6, x6, #1
    b.hi .Lopen_128_rounds

    add v0.4s, v0.4s, v24.4s
    add v1.4s, v1.4s, v24.4s
    add v2.4s, v2.4s, v24.4s

    add v5.4s, v5.4s, v28.4s
    add v6.4s, v6.4s, v28.4s
    add v7.4s, v7.4s, v28.4s

    add v10.4s, v10.4s, v29.4s
    add v11.4s, v11.4s, v29.4s

    add v30.4s, v30.4s, v25.4s
    add v15.4s, v15.4s, v30.4s
    add v30.4s, v30.4s, v25.4s
    add v16.4s, v16.4s, v30.4s

    and v2.16b, v2.16b, v27.16b
    mov x16, v2.d[0] // Move the R key to GPRs
    mov x17, v2.d[1]
    mov v27.16b, v7.16b // Store the S key
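
    // Poly1305 key setup as in RFC 8439: the r half of the one-time key
    // is clamped with the .Lclamp mask, i.e.
    //   r &= 0x0ffffffc0ffffffc0ffffffc0fffffff
    // r lives in x16:x17 for the scalar multiplies, while s is parked in
    // v27 and only added once the accumulator is fully reduced.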

    bl .Lpoly_hash_ad_internal

.Lopen_128_store:
    cmp x2, #64
    b.lt .Lopen_128_store_64

    ld1 {v20.16b - v23.16b}, [x1], #64

    mov x11, v20.d[0]
    mov x12, v20.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    mov x11, v21.d[0]
    mov x12, v21.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    mov x11, v22.d[0]
    mov x12, v22.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    mov x11, v23.d[0]
    mov x12, v23.d[1]
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
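
    // This is the open (decrypt) direction, so the four 16-byte words of
    // ciphertext in v20-v23 are absorbed into the Poly1305 state above
    // and only then XORed with the keystream below to yield plaintext;
    // the tag is always computed over the ciphertext, never the plaintext.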

    eor v20.16b, v20.16b, v0.16b
    eor v21.16b, v21.16b, v5.16b
    eor v22.16b, v22.16b, v10.16b
    eor v23.16b, v23.16b, v15.16b

    st1 {v20.16b - v23.16b}, [x0], #64

    sub x2, x2, #64

    mov v0.16b, v1.16b
    mov v5.16b, v6.16b
    mov v10.16b, v11.16b
    mov v15.16b, v16.16b

.Lopen_128_store_64:

    lsr x4, x2, #4
    mov x3, x1

.Lopen_128_hash_64:
    cbz x4, .Lopen_tail_64_store
    ldp x11, x12, [x3], 16
    adds x8, x8, x11
    adcs x9, x9, x12
    adc x10, x10, x15
    mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0
    umulh x12, x8, x16
    mul x13, x9, x16
    umulh x14, x9, x16
    adds x12, x12, x13
    mul x13, x10, x16
    adc x13, x13, x14
    mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
    umulh x8, x8, x17
    adds x12, x12, x14
    mul x14, x9, x17
    umulh x9, x9, x17
    adcs x14, x14, x8
    mul x10, x10, x17
    adc x10, x10, x9
    adds x13, x13, x14
    adc x14, x10, xzr
    and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3)
    and x8, x13, #-4
    extr x13, x14, x13, #2
    adds x8, x8, x11
    lsr x11, x14, #2
    adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits
    adds x8, x8, x13
    adcs x9, x9, x12
    adc x10, x10, xzr // At this point acc2 has the value of 4 at most
    sub x4, x4, #1
    b .Lopen_128_hash_64
.cfi_endproc
.size chacha20_poly1305_open,.-chacha20_poly1305_open
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits