1#include "arm_arch.h" 2 3.text 4 5// forward "declarations" are required for Apple 6 7.hidden OPENSSL_armcap_P 8.globl poly1305_init 9.hidden poly1305_init 10.globl poly1305_blocks 11.hidden poly1305_blocks 12.globl poly1305_emit 13.hidden poly1305_emit 14 15.type poly1305_init,%function 16.align 5 17poly1305_init: 18 cmp x1,xzr 19 stp xzr,xzr,[x0] // zero hash value 20 stp xzr,xzr,[x0,#16] // [along with is_base2_26] 21 22 csel x0,xzr,x0,eq 23 b.eq .Lno_key 24 25#ifdef __ILP32__ 26 ldrsw x11,.LOPENSSL_armcap_P 27#else 28 ldr x11,.LOPENSSL_armcap_P 29#endif 30 adr x10,.LOPENSSL_armcap_P 31 32 ldp x7,x8,[x1] // load key 33 mov x9,#0xfffffffc0fffffff 34 movk x9,#0x0fff,lsl#48 35 ldr w17,[x10,x11] 36#ifdef __ARMEB__ 37 rev x7,x7 // flip bytes 38 rev x8,x8 39#endif 40 and x7,x7,x9 // &=0ffffffc0fffffff 41 and x9,x9,#-4 42 and x8,x8,x9 // &=0ffffffc0ffffffc 43 stp x7,x8,[x0,#32] // save key value 44 45 tst w17,#ARMV7_NEON 46 47 adr x12,poly1305_blocks 48 adr x7,poly1305_blocks_neon 49 adr x13,poly1305_emit 50 adr x8,poly1305_emit_neon 51 52 csel x12,x12,x7,eq 53 csel x13,x13,x8,eq 54 55#ifdef __ILP32__ 56 stp w12,w13,[x2] 57#else 58 stp x12,x13,[x2] 59#endif 60 61 mov x0,#1 62.Lno_key: 63 ret 64.size poly1305_init,.-poly1305_init 65 66.type poly1305_blocks,%function 67.align 5 68poly1305_blocks: 69 ands x2,x2,#-16 70 b.eq .Lno_data 71 72 ldp x4,x5,[x0] // load hash value 73 ldp x7,x8,[x0,#32] // load key value 74 ldr x6,[x0,#16] 75 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) 76 b .Loop 77 78.align 5 79.Loop: 80 ldp x10,x11,[x1],#16 // load input 81 sub x2,x2,#16 82#ifdef __ARMEB__ 83 rev x10,x10 84 rev x11,x11 85#endif 86 adds x4,x4,x10 // accumulate input 87 adcs x5,x5,x11 88 89 mul x12,x4,x7 // h0*r0 90 adc x6,x6,x3 91 umulh x13,x4,x7 92 93 mul x10,x5,x9 // h1*5*r1 94 umulh x11,x5,x9 95 96 adds x12,x12,x10 97 mul x10,x4,x8 // h0*r1 98 adc x13,x13,x11 99 umulh x14,x4,x8 100 101 adds x13,x13,x10 102 mul x10,x5,x7 // h1*r0 103 adc x14,x14,xzr 104 umulh x11,x5,x7 105 106 adds x13,x13,x10 107 mul x10,x6,x9 // h2*5*r1 108 adc x14,x14,x11 109 mul x11,x6,x7 // h2*r0 110 111 adds x13,x13,x10 112 adc x14,x14,x11 113 114 and x10,x14,#-4 // final reduction 115 and x6,x14,#3 116 add x10,x10,x14,lsr#2 117 adds x4,x12,x10 118 adcs x5,x13,xzr 119 adc x6,x6,xzr 120 121 cbnz x2,.Loop 122 123 stp x4,x5,[x0] // store hash value 124 str x6,[x0,#16] 125 126.Lno_data: 127 ret 128.size poly1305_blocks,.-poly1305_blocks 129 130.type poly1305_emit,%function 131.align 5 132poly1305_emit: 133 ldp x4,x5,[x0] // load hash base 2^64 134 ldr x6,[x0,#16] 135 ldp x10,x11,[x2] // load nonce 136 137 adds x12,x4,#5 // compare to modulus 138 adcs x13,x5,xzr 139 adc x14,x6,xzr 140 141 tst x14,#-4 // see if it's carried/borrowed 142 143 csel x4,x4,x12,eq 144 csel x5,x5,x13,eq 145 146#ifdef __ARMEB__ 147 ror x10,x10,#32 // flip nonce words 148 ror x11,x11,#32 149#endif 150 adds x4,x4,x10 // accumulate nonce 151 adc x5,x5,x11 152#ifdef __ARMEB__ 153 rev x4,x4 // flip output bytes 154 rev x5,x5 155#endif 156 stp x4,x5,[x1] // write result 157 158 ret 159.size poly1305_emit,.-poly1305_emit 160.type poly1305_mult,%function 161.align 5 162poly1305_mult: 163 mul x12,x4,x7 // h0*r0 164 umulh x13,x4,x7 165 166 mul x10,x5,x9 // h1*5*r1 167 umulh x11,x5,x9 168 169 adds x12,x12,x10 170 mul x10,x4,x8 // h0*r1 171 adc x13,x13,x11 172 umulh x14,x4,x8 173 174 adds x13,x13,x10 175 mul x10,x5,x7 // h1*r0 176 adc x14,x14,xzr 177 umulh x11,x5,x7 178 179 adds x13,x13,x10 180 mul x10,x6,x9 // h2*5*r1 181 adc x14,x14,x11 182 mul x11,x6,x7 // h2*r0 183 184 
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat
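
// poly1305_splat converts a power of r to five 26-bit limbs and stores
// them at a 16-byte stride, together with the precomputed s_i = 5*r_i
// needed whenever a product crosses 2^130. poly1305_blocks_neon calls
// it four times, stepping x0 back by 4 bytes each time, so lanes 0..3
// of each table vector end up holding r^4, r^3, r^2, r^1 respectively.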
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,poly1305_blocks

.Lblocks_neon:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]		// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	str	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14
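
	////////////////////////////////////////////////////////////////
	// From here on, two 128-bit blocks are processed per vector:
	// each block is split into five 26-bit limbs (limb4 picks up
	// the 128th "pad" bit via padbit<<24), and the two blocks'
	// limbs are packed side by side into 64-bit lanes with the
	// add ...,lsl#32 (bfi-equivalent) sequences before being moved
	// into the low halves of v9-v13 and v14-v18.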
.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop
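
	////////////////////////////////////////////////////////////////
	// The main loop below handles 64 bytes (four blocks) per
	// iteration. The interleaving suggests the intent: scalar
	// pipes convert the next four blocks to base 2^26 while the
	// vector pipes work through the umull/umlal chains, hiding the
	// conversion latency. v31 is the limb mask 2^26-1 (all-ones
	// shifted right by 38).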
.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   \___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   \___________________/ \____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on the reduction in the previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]
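	//
	// Each limb only has to stay comfortably below 2^32 between
	// iterations, not become fully reduced: ushr #26 extracts the
	// carry, bic/and keep the low 26 bits, and the h4 -> h0 carry
	// is multiplied by 5 as c + (c << 2).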
	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s
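
	////////////////////////////////////////////////////////////////
	// v19-v23 now hold h0-h4 as two 64-bit partial sums per
	// register, one per interleaved block stream; the tail below
	// folds the two lanes together with addp, runs one more carry
	// pass and stores the hash in base 2^26.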
.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80
.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2