/*
 * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

/*
 * Constant masks are read-only: keep them in .rodata, not in the writable
 * .data section, so they cannot be corrupted and stay W^X-friendly.
 */
.section .rodata
.align 16

/* per-32-bit-lane mask selecting the low 26 bits of each radix-2^26 limb */
ANMASK:	.octa 0x0000000003ffffff0000000003ffffff
/* per-lane OR-mask setting bit 24: the "1" appended to each 16-byte block */
ORMASK:	.octa 0x00000000010000000000000001000000

.text

/* accumulator h[5], five radix-2^26 limbs, via %rdi */
#define h0 0x00(%rdi)
#define h1 0x04(%rdi)
#define h2 0x08(%rdi)
#define h3 0x0c(%rdi)
#define h4 0x10(%rdi)
/* key r[5], five radix-2^26 limbs, via %rdx */
#define r0 0x00(%rdx)
#define r1 0x04(%rdx)
#define r2 0x08(%rdx)
#define r3 0x0c(%rdx)
#define r4 0x10(%rdx)
/* precomputed s[i] = r[i] * 5, kept in the 16-byte stack scratch area */
#define s1 0x00(%rsp)
#define s2 0x04(%rsp)
#define s3 0x08(%rsp)
#define s4 0x0c(%rsp)
#define m %rsi
#define h01 %xmm0
#define h23 %xmm1
#define h44 %xmm2
#define t1 %xmm3
#define t2 %xmm4
#define t3 %xmm5
#define t4 %xmm6
#define mask %xmm7
/* 64-bit unreduced products d[0..4] */
#define d0 %r8
#define d1 %r9
#define d2 %r10
#define d3 %r11
#define d4 %r12

/*
 * void poly1305_block_sse2(u32 h[5], const u8 *m, u32 r[5], unsigned int blocks)
 *
 * SysV AMD64 ABI.  Processes 'blocks' 16-byte message blocks, updating the
 * accumulator in place: h = (h + m_i) * r (mod 2^130 - 5), limbs carried
 * back below 2^26 before each store.  Preserves %rbx/%r12 (pushed below);
 * clobbers %rax, %rcx, %rsi, xmm0-xmm7 and flags.  Uses 16 bytes of stack
 * for s1..s4.
 */
ENTRY(poly1305_block_sse2)
	# %rdi: Accumulator h[5]
	# %rsi: 16 byte input block m
	# %rdx: Poly1305 key r[5]
	# %rcx: Block count

	# This single block variant tries to improve performance by doing two
	# multiplications in parallel using SSE instructions. There is quite
	# some quadword packing involved, hence the speedup is marginal.

	push		%rbx
	push		%r12
	sub		$0x10,%rsp

	# s1..s4 = r1..r4 * 5  (lea computes x + 4x without touching flags)
	mov		r1,%eax
	lea		(%eax,%eax,4),%eax
	mov		%eax,s1
	mov		r2,%eax
	lea		(%eax,%eax,4),%eax
	mov		%eax,s2
	mov		r3,%eax
	lea		(%eax,%eax,4),%eax
	mov		%eax,s3
	mov		r4,%eax
	lea		(%eax,%eax,4),%eax
	mov		%eax,s4

	movdqa		ANMASK(%rip),mask

.Ldoblock:
	# Pack the five limbs into qword lanes for pmuludq:
	# h01 = [0, h1, 0, h0]
	# h23 = [0, h3, 0, h2]
	# h44 = [0, h4, 0, h4]
	movd		h0,h01
	movd		h1,t1
	movd		h2,h23
	movd		h3,t2
	movd		h4,h44
	punpcklqdq	t1,h01
	punpcklqdq	t2,h23
	punpcklqdq	h44,h44

	# Add the next message block, split into 26-bit limbs, into h.
	# h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
	movd		0x00(m),t1
	movd		0x03(m),t2
	psrld		$2,t2
	punpcklqdq	t2,t1
	pand		mask,t1
	paddd		t1,h01
	# h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
	movd		0x06(m),t1
	movd		0x09(m),t2
	psrld		$4,t1
	psrld		$6,t2
	punpcklqdq	t2,t1
	pand		mask,t1
	paddd		t1,h23
	# h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
	# (1 << 24 is the high "1" bit appended to every full block)
	mov		0x0c(m),%eax
	shr		$8,%eax
	or		$0x01000000,%eax
	movd		%eax,t1
	pshufd		$0xc4,t1,t1
	paddd		t1,h44

	# Schoolbook multiply h * r mod 2^130 - 5, two lanes at a time;
	# s[i] = r[i]*5 folds the mod-p wraparound into the products.
	# t1[0] = h0 * r0 + h2 * s3
	# t1[1] = h1 * s4 + h3 * s2
	movd		r0,t1
	movd		s4,t2
	punpcklqdq	t2,t1
	pmuludq		h01,t1
	movd		s3,t2
	movd		s2,t3
	punpcklqdq	t3,t2
	pmuludq		h23,t2
	paddq		t2,t1
	# t2[0] = h0 * r1 + h2 * s4
	# t2[1] = h1 * r0 + h3 * s3
	movd		r1,t2
	movd		r0,t3
	punpcklqdq	t3,t2
	pmuludq		h01,t2
	movd		s4,t3
	movd		s3,t4
	punpcklqdq	t4,t3
	pmuludq		h23,t3
	paddq		t3,t2
	# t3[0] = h4 * s1
	# t3[1] = h4 * s2
	movd		s1,t3
	movd		s2,t4
	punpcklqdq	t4,t3
	pmuludq		h44,t3
	# d0 = t1[0] + t1[1] + t3[0]
	# d1 = t2[0] + t2[1] + t3[1]
	movdqa		t1,t4
	punpcklqdq	t2,t4
	punpckhqdq	t2,t1
	paddq		t4,t1
	paddq		t3,t1
	movq		t1,d0
	psrldq		$8,t1
	movq		t1,d1

	# t1[0] = h0 * r2 + h2 * r0
	# t1[1] = h1 * r1 + h3 * s4
	movd		r2,t1
	movd		r1,t2
	punpcklqdq	t2,t1
	pmuludq		h01,t1
	movd		r0,t2
	movd		s4,t3
	punpcklqdq	t3,t2
	pmuludq		h23,t2
	paddq		t2,t1
	# t2[0] = h0 * r3 + h2 * r1
	# t2[1] = h1 * r2 + h3 * r0
	movd		r3,t2
	movd		r2,t3
	punpcklqdq	t3,t2
	pmuludq		h01,t2
	movd		r1,t3
	movd		r0,t4
	punpcklqdq	t4,t3
	pmuludq		h23,t3
	paddq		t3,t2
	# t3[0] = h4 * s3
	# t3[1] = h4 * s4
	movd		s3,t3
	movd		s4,t4
	punpcklqdq	t4,t3
	pmuludq		h44,t3
	# d2 = t1[0] + t1[1] + t3[0]
	# d3 = t2[0] + t2[1] + t3[1]
	movdqa		t1,t4
	punpcklqdq	t2,t4
	punpckhqdq	t2,t1
	paddq		t4,t1
	paddq		t3,t1
	movq		t1,d2
	psrldq		$8,t1
	movq		t1,d3

	# t1[0] = h0 * r4 + h2 * r2
	# t1[1] = h1 * r3 + h3 * r1
	movd		r4,t1
	movd		r3,t2
	punpcklqdq	t2,t1
	pmuludq		h01,t1
	movd		r2,t2
	movd		r1,t3
	punpcklqdq	t3,t2
	pmuludq		h23,t2
	paddq		t2,t1
	# t3[0] = h4 * r0
	movd		r0,t3
	pmuludq		h44,t3
	# d4 = t1[0] + t1[1] + t3[0]
	movdqa		t1,t4
	psrldq		$8,t4
	paddq		t4,t1
	paddq		t3,t1
	movq		t1,d4

	# Carry propagation: bring each limb back below 2^26, folding the
	# final carry back into h0 via *5 (since 2^130 == 5 mod p).
	# d1 += d0 >> 26
	mov		d0,%rax
	shr		$26,%rax
	add		%rax,d1
	# h0 = d0 & 0x3ffffff
	mov		d0,%rbx
	and		$0x3ffffff,%ebx

	# d2 += d1 >> 26
	mov		d1,%rax
	shr		$26,%rax
	add		%rax,d2
	# h1 = d1 & 0x3ffffff
	mov		d1,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h1

	# d3 += d2 >> 26
	mov		d2,%rax
	shr		$26,%rax
	add		%rax,d3
	# h2 = d2 & 0x3ffffff
	mov		d2,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h2

	# d4 += d3 >> 26
	mov		d3,%rax
	shr		$26,%rax
	add		%rax,d4
	# h3 = d3 & 0x3ffffff
	mov		d3,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h3

	# h0 += (d4 >> 26) * 5
	mov		d4,%rax
	shr		$26,%rax
	lea		(%eax,%eax,4),%eax
	add		%eax,%ebx
	# h4 = d4 & 0x3ffffff
	mov		d4,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h4

	# h1 += h0 >> 26
	mov		%ebx,%eax
	shr		$26,%eax
	add		%eax,h1
	# h0 = h0 & 0x3ffffff
	andl		$0x3ffffff,%ebx
	mov		%ebx,h0

	add		$0x10,m
	dec		%rcx
	jnz		.Ldoblock

	add		$0x10,%rsp
	pop		%r12
	pop		%rbx
	ret
ENDPROC(poly1305_block_sse2)


/* derived key u[5] = r^2 limbs, via %r8 */
#define u0 0x00(%r8)
#define u1 0x04(%r8)
#define u2 0x08(%r8)
#define u3 0x0c(%r8)
#define u4 0x10(%r8)
/* hc[i] holds both blocks' limb i: [h+m1 limb, m2 limb] */
#define hc0 %xmm0
#define hc1 %xmm1
#define hc2 %xmm2
#define hc3 %xmm5
#define hc4 %xmm6
/* ru[i] = [r_i, u_i]; sv[i] = [r_i*5, u_i*5] */
#define ru0 %xmm7
#define ru1 %xmm8
#define ru2 %xmm9
#define ru3 %xmm10
#define ru4 %xmm11
#define sv1 %xmm12
#define sv2 %xmm13
#define sv3 %xmm14
#define sv4 %xmm15
/* %r8 now carries the u[] pointer, so move d0 to callee-saved %r13 */
#undef d0
#define d0 %r13

/*
 * void poly1305_2block_sse2(u32 h[5], const u8 *m, u32 r[5],
 *			     unsigned int blocks, u32 u[5])
 *
 * SysV AMD64 ABI.  Processes 'blocks' pairs of 16-byte message blocks:
 * h = (h + m1) * r^2 + m2 * r, with u[] = r^2 precomputed by the caller.
 * Preserves %rbx/%r12/%r13 (pushed below); clobbers %rax, %rcx, %rsi,
 * xmm0-xmm15 and flags.  No stack scratch needed: s/v values live in
 * xmm12-xmm15.
 */
ENTRY(poly1305_2block_sse2)
	# %rdi: Accumulator h[5]
	# %rsi: 16 byte input block m
	# %rdx: Poly1305 key r[5]
	# %rcx: Doubleblock count
	# %r8:  Poly1305 derived key r^2 u[5]

	# This two-block variant further improves performance by using loop
	# unrolled block processing. This is more straight forward and does
	# less byte shuffling, but requires a second Poly1305 key r^2:
	# h = (h + m) * r => h = (h + m1) * r^2 + m2 * r

	push		%rbx
	push		%r12
	push		%r13

	# combine r0,u0
	movd		u0,ru0
	movd		r0,t1
	punpcklqdq	t1,ru0

	# combine r1,u1 and s1=r1*5,v1=u1*5  (x*5 = (x<<2) + x)
	movd		u1,ru1
	movd		r1,t1
	punpcklqdq	t1,ru1
	movdqa		ru1,sv1
	pslld		$2,sv1
	paddd		ru1,sv1

	# combine r2,u2 and s2=r2*5,v2=u2*5
	movd		u2,ru2
	movd		r2,t1
	punpcklqdq	t1,ru2
	movdqa		ru2,sv2
	pslld		$2,sv2
	paddd		ru2,sv2

	# combine r3,u3 and s3=r3*5,v3=u3*5
	movd		u3,ru3
	movd		r3,t1
	punpcklqdq	t1,ru3
	movdqa		ru3,sv3
	pslld		$2,sv3
	paddd		ru3,sv3

	# combine r4,u4 and s4=r4*5,v4=u4*5
	movd		u4,ru4
	movd		r4,t1
	punpcklqdq	t1,ru4
	movdqa		ru4,sv4
	pslld		$2,sv4
	paddd		ru4,sv4

.Ldoblock2:
	# Load both blocks' limbs side by side; only block 1 gets h added.
	# hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
	movd		0x00(m),hc0
	movd		0x10(m),t1
	punpcklqdq	t1,hc0
	pand		ANMASK(%rip),hc0
	movd		h0,t1
	paddd		t1,hc0
	# hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
	movd		0x03(m),hc1
	movd		0x13(m),t1
	punpcklqdq	t1,hc1
	psrld		$2,hc1
	pand		ANMASK(%rip),hc1
	movd		h1,t1
	paddd		t1,hc1
	# hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
	movd		0x06(m),hc2
	movd		0x16(m),t1
	punpcklqdq	t1,hc2
	psrld		$4,hc2
	pand		ANMASK(%rip),hc2
	movd		h2,t1
	paddd		t1,hc2
	# hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
	movd		0x09(m),hc3
	movd		0x19(m),t1
	punpcklqdq	t1,hc3
	psrld		$6,hc3
	pand		ANMASK(%rip),hc3
	movd		h3,t1
	paddd		t1,hc3
	# hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
	movd		0x0c(m),hc4
	movd		0x1c(m),t1
	punpcklqdq	t1,hc4
	psrld		$8,hc4
	por		ORMASK(%rip),hc4
	movd		h4,t1
	paddd		t1,hc4

	# Multiply block 1 by u=r^2 (low lane) and block 2 by r (high lane)
	# simultaneously; lane sums then realize h = (h+m1)*r^2 + m2*r.
	# t1 = [ hc0[1] * r0, hc0[0] * u0 ]
	movdqa		ru0,t1
	pmuludq		hc0,t1
	# t1 += [ hc1[1] * s4, hc1[0] * v4 ]
	movdqa		sv4,t2
	pmuludq		hc1,t2
	paddq		t2,t1
	# t1 += [ hc2[1] * s3, hc2[0] * v3 ]
	movdqa		sv3,t2
	pmuludq		hc2,t2
	paddq		t2,t1
	# t1 += [ hc3[1] * s2, hc3[0] * v2 ]
	movdqa		sv2,t2
	pmuludq		hc3,t2
	paddq		t2,t1
	# t1 += [ hc4[1] * s1, hc4[0] * v1 ]
	movdqa		sv1,t2
	pmuludq		hc4,t2
	paddq		t2,t1
	# d0 = t1[0] + t1[1]
	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d0

	# t1 = [ hc0[1] * r1, hc0[0] * u1 ]
	movdqa		ru1,t1
	pmuludq		hc0,t1
	# t1 += [ hc1[1] * r0, hc1[0] * u0 ]
	movdqa		ru0,t2
	pmuludq		hc1,t2
	paddq		t2,t1
	# t1 += [ hc2[1] * s4, hc2[0] * v4 ]
	movdqa		sv4,t2
	pmuludq		hc2,t2
	paddq		t2,t1
	# t1 += [ hc3[1] * s3, hc3[0] * v3 ]
	movdqa		sv3,t2
	pmuludq		hc3,t2
	paddq		t2,t1
	# t1 += [ hc4[1] * s2, hc4[0] * v2 ]
	movdqa		sv2,t2
	pmuludq		hc4,t2
	paddq		t2,t1
	# d1 = t1[0] + t1[1]
	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d1

	# t1 = [ hc0[1] * r2, hc0[0] * u2 ]
	movdqa		ru2,t1
	pmuludq		hc0,t1
	# t1 += [ hc1[1] * r1, hc1[0] * u1 ]
	movdqa		ru1,t2
	pmuludq		hc1,t2
	paddq		t2,t1
	# t1 += [ hc2[1] * r0, hc2[0] * u0 ]
	movdqa		ru0,t2
	pmuludq		hc2,t2
	paddq		t2,t1
	# t1 += [ hc3[1] * s4, hc3[0] * v4 ]
	movdqa		sv4,t2
	pmuludq		hc3,t2
	paddq		t2,t1
	# t1 += [ hc4[1] * s3, hc4[0] * v3 ]
	movdqa		sv3,t2
	pmuludq		hc4,t2
	paddq		t2,t1
	# d2 = t1[0] + t1[1]
	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d2

	# t1 = [ hc0[1] * r3, hc0[0] * u3 ]
	movdqa		ru3,t1
	pmuludq		hc0,t1
	# t1 += [ hc1[1] * r2, hc1[0] * u2 ]
	movdqa		ru2,t2
	pmuludq		hc1,t2
	paddq		t2,t1
	# t1 += [ hc2[1] * r1, hc2[0] * u1 ]
	movdqa		ru1,t2
	pmuludq		hc2,t2
	paddq		t2,t1
	# t1 += [ hc3[1] * r0, hc3[0] * u0 ]
	movdqa		ru0,t2
	pmuludq		hc3,t2
	paddq		t2,t1
	# t1 += [ hc4[1] * s4, hc4[0] * v4 ]
	movdqa		sv4,t2
	pmuludq		hc4,t2
	paddq		t2,t1
	# d3 = t1[0] + t1[1]
	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d3

	# t1 = [ hc0[1] * r4, hc0[0] * u4 ]
	movdqa		ru4,t1
	pmuludq		hc0,t1
	# t1 += [ hc1[1] * r3, hc1[0] * u3 ]
	movdqa		ru3,t2
	pmuludq		hc1,t2
	paddq		t2,t1
	# t1 += [ hc2[1] * r2, hc2[0] * u2 ]
	movdqa		ru2,t2
	pmuludq		hc2,t2
	paddq		t2,t1
	# t1 += [ hc3[1] * r1, hc3[0] * u1 ]
	movdqa		ru1,t2
	pmuludq		hc3,t2
	paddq		t2,t1
	# t1 += [ hc4[1] * r0, hc4[0] * u0 ]
	movdqa		ru0,t2
	pmuludq		hc4,t2
	paddq		t2,t1
	# d4 = t1[0] + t1[1]
	movdqa		t1,t2
	psrldq		$8,t2
	paddq		t2,t1
	movq		t1,d4

	# Carry propagation (same scheme as the single-block version).
	# d1 += d0 >> 26
	mov		d0,%rax
	shr		$26,%rax
	add		%rax,d1
	# h0 = d0 & 0x3ffffff
	mov		d0,%rbx
	and		$0x3ffffff,%ebx

	# d2 += d1 >> 26
	mov		d1,%rax
	shr		$26,%rax
	add		%rax,d2
	# h1 = d1 & 0x3ffffff
	mov		d1,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h1

	# d3 += d2 >> 26
	mov		d2,%rax
	shr		$26,%rax
	add		%rax,d3
	# h2 = d2 & 0x3ffffff
	mov		d2,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h2

	# d4 += d3 >> 26
	mov		d3,%rax
	shr		$26,%rax
	add		%rax,d4
	# h3 = d3 & 0x3ffffff
	mov		d3,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h3

	# h0 += (d4 >> 26) * 5
	mov		d4,%rax
	shr		$26,%rax
	lea		(%eax,%eax,4),%eax
	add		%eax,%ebx
	# h4 = d4 & 0x3ffffff
	mov		d4,%rax
	and		$0x3ffffff,%eax
	mov		%eax,h4

	# h1 += h0 >> 26
	mov		%ebx,%eax
	shr		$26,%eax
	add		%eax,h1
	# h0 = h0 & 0x3ffffff
	andl		$0x3ffffff,%ebx
	mov		%ebx,h0

	add		$0x20,m
	dec		%rcx
	jnz		.Ldoblock2

	pop		%r13
	pop		%r12
	pop		%rbx
	ret
ENDPROC(poly1305_2block_sse2)