#ifndef __KERNEL__
# include "arm_arch.h"
.extern	OPENSSL_armcap_P
#endif

.text

// forward "declarations" are required for Apple
.globl	poly1305_blocks
.globl	poly1305_emit

.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifndef __KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
#endif

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	mov	w9,#-1
	stp	x7,x8,[x0,#32]		// save key value
	str	w9,[x0,#48]		// impossible key power value

#ifndef __KERNEL__
	tst	w17,#ARMV7_NEON

	adr	x12,.Lpoly1305_blocks
	adr	x7,.Lpoly1305_blocks_neon
	adr	x13,.Lpoly1305_emit

	csel	x12,x12,x7,eq

# ifdef	__ILP32__
	stp	w12,w13,[x2]
# else
	stp	x12,x13,[x2]
# endif
#endif
	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x6,x17,[x0,#16]		// [along with is_base2_26]
	ldp	x7,x8,[x0,#32]		// load key value

#ifdef	__AARCH64EB__
	lsr	x12,x4,#32
	mov	w13,w4
	lsr	x14,x5,#32
	mov	w15,w5
	lsr	x16,x6,#32
#else
	mov	w12,w4
	lsr	x13,x4,#32
	mov	w14,w5
	lsr	x15,x5,#32
	mov	w16,w6
#endif

	add	x12,x12,x13,lsl#26	// base 2^26 -> base 2^64
	lsr	x13,x14,#12
	adds	x12,x12,x14,lsl#52
	add	x13,x13,x15,lsl#14
	adc	x13,x13,xzr
	lsr	x14,x16,#24
	adds	x13,x13,x16,lsl#40
	adc	x14,x14,xzr

	cmp	x17,#0			// is_base2_26?
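	// If is_base2_26 is set, the recombined x12:x13:x14 value
	// (h = x12 + 2^64*x13 + 2^128*x14) is selected below; otherwise the
	// hash in x4:x5:x6 is already in base 2^64.  s1 = r1 + (r1>>2) equals
	// 5*r1/4 (r1 is clamped to a multiple of 4), which folds the
	// 2^130 = 5 (mod p) reduction into the h1*s1 and h2*s1 products.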
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	csel	x4,x4,x12,eq		// choose between radixes
	csel	x5,x5,x13,eq
	csel	x6,x6,x14,eq

.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__AARCH64EB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	stp	x6,xzr,[x0,#16]		// [and clear is_base2_26]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldp	x6,x7,[x0,#16]		// [along with is_base2_26]
	ldp	x10,x11,[x2]		// load nonce

#ifdef	__AARCH64EB__
	lsr	x12,x4,#32
	mov	w13,w4
	lsr	x14,x5,#32
	mov	w15,w5
	lsr	x16,x6,#32
#else
	mov	w12,w4
	lsr	x13,x4,#32
	mov	w14,w5
	lsr	x15,x5,#32
	mov	w16,w6
#endif

	add	x12,x12,x13,lsl#26	// base 2^26 -> base 2^64
	lsr	x13,x14,#12
	adds	x12,x12,x14,lsl#52
	add	x13,x13,x15,lsl#14
	adc	x13,x13,xzr
	lsr	x14,x16,#24
	adds	x13,x13,x16,lsl#40
	adc	x14,x14,xzr

	cmp	x7,#0			// is_base2_26?
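	// The csel sequence below again picks the recombined base 2^64 value
	// when the hash was left in base 2^26.  The final reduction then forms
	// h+5 and keeps its low 128 bits only if the sum carried into bit 130
	// (tst x14,#-4), i.e. only if h >= 2^130-5, before the 128-bit nonce
	// is added.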
	csel	x4,x4,x12,eq		// choose between radixes
	csel	x5,x5,x13,eq
	csel	x6,x6,x14,eq

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__AARCH64EB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__AARCH64EB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit

.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	4
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat

#ifdef __KERNEL__
.globl	poly1305_blocks_neon
#endif
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.lo	.Lpoly1305_blocks

	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...
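	// h2 (x6) may be a few bits wide at this point rather than the usual
	// two, because the base 2^26 limbs written back by the NEON code are
	// only partially reduced; the scalar multiply that follows can absorb
	// this, since its final reduction handles an h2 of more than two bits.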
	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

#ifdef	__AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	b	.Leven_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef	__AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	ldr	w17,[x0,#48]		// first table element
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cmp	w17,#-1			// is value impossible?
	b.ne	.Leven_neon

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x1,#32]		// inp[2:3]
	subs	x2,x2,#64
	ldp	x9,x13,[x1,#48]
	add	x16,x1,#96
	adr	x17,.Lzeros

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s
	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d
	ldr	x30,[sp,#8]

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	mov	x4,#1
	st1	{v23.s}[0],[x0]
	str	x4,[x0,#8]		// set is_base2_26

	ldr	x29,[sp],#80
	.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
.align	2
#if !defined(__KERNEL__) && !defined(_WIN64)
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif