#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]		// save key value

	tst	w17,#ARMV7_NEON

	adr	x12,.Lpoly1305_blocks
	adr	x7,.Lpoly1305_blocks_neon
	adr	x13,.Lpoly1305_emit
	adr	x8,.Lpoly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]		// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]		// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
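
// poly1305_mult below performs the same 130-bit multiply-and-reduce as
// the body of .Loop above, factored out as a subroutine for the NEON
// setup code. An illustrative sketch in C-like pseudocode (hypothetical
// names, not part of the generated file): with h = h0 + h1*2^64 +
// h2*2^128, r = r0 + r1*2^64 and s1 = r1 + (r1 >> 2) -- exact because
// key clamping makes r1 divisible by 4 and 2^128 == 2^130/4
// (mod 2^130-5) -- the product reduces to
//
//	d0 = h0*r0 + h1*s1;		// 128-bit products
//	d1 = h0*r1 + h1*r0 + h2*s1;
//	d2 = h2*r0;			// h2 holds only a few bits
//	// carry-propagate d0..d2, then fold bits >= 2^130 back:
//	c  = d2 >> 2;			// h div 2^130
//	d2 &= 3;
//	d0 += c + (c << 2);		// += 5*c, as 2^130 == 5 (mod p)
//	// ...and ripple the resulting carry through d1 and d2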
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat
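
// The NEON code keeps the hash in five 26-bit limbs,
// h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104, so each limb product
// fits in 64 bits with headroom to accumulate. poly1305_splat also
// stores s_i = r_i*5, because a partial product h_i*r_j with i+j >= 5
// overshoots 2^130 by a power of 2^26 and, since 2^130 == 5
// (mod 2^130-5), re-enters the low limbs multiplied by 5. In C-like
// terms (an illustrative sketch, not part of the generated file):
//
//	d[k] =      sum over i+j == k   of h[i]*r[j]
//	     + 5 * (sum over i+j == k+5 of h[i]*r[j]);	// k = 0..4
//
// which is exactly the d0..d4 schedule spelled out in .Loop_neon below.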

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,.Lpoly1305_blocks

.Lblocks_neon:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]		// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	stur	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12
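
	// The block above splits each 128-bit block into five 26-bit
	// limbs and packs the same limb of two consecutive blocks into
	// one 64-bit register (low word: first block, high word: second
	// block), so one 2-lane NEON multiply advances two blocks at
	// once. The padbit in x3 (pre-shifted left by 24) supplies bit
	// 128 of each block, i.e. bit 24 of the top limb. Per 128-bit
	// block b, an illustrative C sketch (hypothetical names):
	//
	//	t0 =  b         & 0x3ffffff;
	//	t1 = (b >>  26) & 0x3ffffff;
	//	t2 = (b >>  52) & 0x3ffffff;
	//	t3 = (b >>  78) & 0x3ffffff;
	//	t4 = (b >> 104) | (padbit << 24);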

	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   \___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   \___________________/ \____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10
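
	// Note the interleaving above: the scalar and/ubfx/extr/add
	// sequence converting the next inp[2:3] pair to base 2^26 runs
	// on the integer pipeline while the umull/umlal chain occupies
	// the SIMD pipeline, hiding the conversion latency inside the
	// multiplication.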

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]
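	//
	// One carry pass in scalar terms (an illustrative sketch; the
	// vector code below runs two such chains, one per lane, and
	// leaves the limbs only partially reduced):
	//
	//	c = d3 >> 26; d3 &= 0x3ffffff; d4 += c;	// h3 -> h4
	//	c = d0 >> 26; d0 &= 0x3ffffff; d1 += c;	// h0 -> h1
	//	c = d4 >> 26; d4 &= 0x3ffffff; d0 += c*5;	// h4 -> h0
	//	c = d1 >> 26; d1 &= 0x3ffffff; d2 += c;	// h1 -> h2
	//	c = d2 >> 26; d2 &= 0x3ffffff; d3 += c;	// h2 -> h3
	//	c = d0 >> 26; d0 &= 0x3ffffff; d1 += c;	// h0 -> h1
	//	c = d3 >> 26; d3 &= 0x3ffffff; d4 += c;	// h3 -> h4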

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d
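
	// Each .2d accumulator above still holds two partial sums, one
	// per 64-bit lane (the two interleaved block streams). The addp
	// instructions fold the two lanes together, leaving one set of
	// limbs d0..d4 for the final carry pass below.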

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80
.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
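
// For reference, C prototypes consistent with the register usage in
// this file (an assumption inferred from the code above; the
// authoritative declarations live in the calling C code, not here):
//
//	int  poly1305_init  (void *ctx, const unsigned char key[16],
//	                     void *func[2]);
//	void poly1305_blocks(void *ctx, const unsigned char *inp,
//	                     size_t len, unsigned int padbit);
//	void poly1305_emit  (void *ctx, unsigned char mac[16],
//	                     const unsigned int nonce[4]);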