/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%ebp
#define REG_E	%edx

#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10

/* we keep a 64-byte window of 16 pre-calculated w[i]+K values in a circular buffer */
#define WK(t)	(((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD	16

/*
 * This macro implements the SHA-1 function's body, processing the input in
 * 64-byte blocks.
 * param: function's name
 */
.macro SHA1_VECTOR_ASM name
	.global	\name
	.type	\name, @function
	.align	32
\name:
	push	%rbx
	push	%rbp
	push	%r12

	mov	%rsp, %r12
	sub	$64, %rsp	# allocate workspace
	and	$~15, %rsp	# align stack

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT		# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace
	mov	$8, %ecx
	mov	%rsp, %rdi
	xor	%rax, %rax
	rep	stosq

	mov	%r12, %rsp	# deallocate workspace

	pop	%r12
	pop	%rbp
	pop	%rbx
	ret

	.size	\name, .-\name
.endm
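
/*
 * For orientation, the function emitted by SHA1_VECTOR_ASM has roughly the
 * shape of the C sketch below (illustrative only, not part of the build;
 * process_block() is a hypothetical placeholder for SHA1_PIPELINED_MAIN_BODY,
 * and the block loop, which really lives inside that macro, is hoisted out
 * here for readability):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	// stands in for SHA1_PIPELINED_MAIN_BODY: 80 rounds + hash update
 *	extern void process_block(uint32_t *digest, const unsigned char *block,
 *				  uint32_t *wk);
 *
 *	static void sha1_transform_sketch(uint32_t *digest,
 *					  const unsigned char *data,
 *					  unsigned int blocks)
 *	{
 *		// 64-byte workspace, 16-byte aligned (sub $64 / and $~15, %rsp)
 *		uint32_t wk[16] __attribute__((aligned(16)));
 *		const unsigned char *end = data + (size_t)blocks * 64; // shl $6, CNT
 *
 *		while (data < end) {
 *			process_block(digest, data, wk);
 *			data += 64;
 *		}
 *
 *		memset(wk, 0, sizeof(wk));	// rep stosq: wipe the workspace
 *	}
 */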

/*
 * This macro implements the 80 rounds of SHA-1 for each 64-byte block of the
 * input buffer
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	.set i, 0
	.rept W_PRECALC_AHEAD
	W_PRECALC i
	.set i, (i+1)
	.endr

.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current block is the last one, use
	cmovae	K_BASE, BUFFER_PTR	# a dummy source to avoid a buffer overrun

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# K_BASE means we reached the end
	jne	1b
.endm

.macro INIT_REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set T1, REG_T1
	.set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

.macro SWAP_REG_NAMES a, b
	.set _T, \a
	.set \a, \b
	.set \b, _T
.endm

.macro F1 b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

.macro F2 b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

.macro F3 b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

.macro F4 b, c, d
	F2 \b, \c, \d
.endm

.macro UPDATE_HASH hash, val
	add	\hash, \val
	mov	\val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR F, a, b, c, d, e, round
	add	WK(\round), \e
	\F	\b, \c, \d	# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F	\a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# (a <<r 5) >>r 7 => a <<r 30

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

.macro W_PRECALC r
	.set i, \r

	.if (i < 20)
		.set K_XMM, 0
	.elseif (i < 40)
		.set K_XMM, 16
	.elseif (i < 60)
		.set K_XMM, 32
	.elseif (i < 80)
		.set K_XMM, 48
	.endif

	.if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
		.set i, ((\r) % 80)	# pre-compute for the next iteration
		.if (i == 0)
			W_PRECALC_RESET
		.endif
		W_PRECALC_00_15
	.elseif (i < 32)
		W_PRECALC_16_31
	.elseif (i < 80)		// rounds 32-79
		W_PRECALC_32_79
	.endif
.endm
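
/*
 * A scalar view of the software pipelining done by RR and W_PRECALC above:
 * WK(t) addresses a 16-entry ring on the stack, and every round consumes a
 * w[t]+K value that was pre-calculated W_PRECALC_AHEAD (16) rounds earlier,
 * while the pre-calculation keeps running 16 rounds ahead.  Past round 79 it
 * already starts on the next block's w[0..15], which is why BUFFER_PTR is
 * advanced before round 60 (and switched to the dummy K_BASE source for the
 * last block, see the cmovae above).  Illustrative C sketch only, not part
 * of the build; compute_w_plus_k() and do_round() are hypothetical
 * placeholders for the W_PRECALC_* blocks and the round pair done by RR:
 *
 *	#include <stdint.h>
 *
 *	#define W_AHEAD	16
 *
 *	extern uint32_t compute_w_plus_k(int t);	// w[t] + K[t]
 *	extern void do_round(int t, uint32_t wk_t);	// one SHA-1 round
 *
 *	static void pipelined_rounds(void)
 *	{
 *		uint32_t wk[16];	// circular buffer: WK(t) = wk[t & 15]
 *		int t;
 *
 *		for (t = 0; t < W_AHEAD; t++)	// prologue: fill the ring
 *			wk[t & 15] = compute_w_plus_k(t);
 *
 *		for (t = 0; t < 80; t++) {
 *			do_round(t, wk[t & 15]);  // prepared 16 rounds ago
 *			// refill the slot just consumed, 16 rounds ahead
 *			if (t + W_AHEAD < 80)
 *				wk[(t + W_AHEAD) & 15] =
 *					compute_w_plus_k(t + W_AHEAD);
 *		}
 *	}
 */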

.macro W_PRECALC_RESET
	.set W,          W0
	.set W_minus_04, W4
	.set W_minus_08, W8
	.set W_minus_12, W12
	.set W_minus_16, W16
	.set W_minus_20, W20
	.set W_minus_24, W24
	.set W_minus_28, W28
	.set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
	.set W_minus_32, W_minus_28
	.set W_minus_28, W_minus_24
	.set W_minus_24, W_minus_20
	.set W_minus_20, W_minus_16
	.set W_minus_16, W_minus_12
	.set W_minus_12, W_minus_08
	.set W_minus_08, W_minus_04
	.set W_minus_04, W
	.set W, W_minus_32
.endm

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
	.if ((i & 3) == 0)
		movdqu	(i*4)(BUFFER_PTR), W_TMP1
	.elseif ((i & 3) == 1)
		pshufb	XMM_SHUFB_BSWAP, W_TMP1
		movdqa	W_TMP1, W
	.elseif ((i & 3) == 2)
		paddd	(K_BASE), W_TMP1
	.elseif ((i & 3) == 3)
		movdqa	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - calculate the last 32 w[i] values in 8 XMM registers
 * - pre-calculate the K+w[i] values and store them to memory, for a later
 *   load by the scalar ALU add instruction
 *
 * some "heavy-lifting" vectorization is needed for rounds 16-31 due to the
 * w[i]->w[i-3] dependency, but it improves for rounds 32-79
 */
.macro W_PRECALC_16_31_SSSE3
	# blended scheduling of vector and scalar instruction streams, one 4-wide
	# vector iteration / 4 scalar rounds
	.if ((i & 3) == 0)
		movdqa	W_minus_12, W
		palignr	$8, W_minus_16, W	# w[i-14]
		movdqa	W_minus_04, W_TMP1
		psrldq	$4, W_TMP1		# w[i-3]
		pxor	W_minus_08, W
	.elseif ((i & 3) == 1)
		pxor	W_minus_16, W_TMP1
		pxor	W_TMP1, W
		movdqa	W, W_TMP2
		movdqa	W, W_TMP1
		pslldq	$12, W_TMP2
	.elseif ((i & 3) == 2)
		psrld	$31, W
		pslld	$1, W_TMP1
		por	W, W_TMP1
		movdqa	W_TMP2, W
		psrld	$30, W_TMP2
		pslld	$2, W
	.elseif ((i & 3) == 3)
		pxor	W, W_TMP1
		pxor	W_TMP2, W_TMP1
		movdqa	W_TMP1, W
		paddd	K_XMM(K_BASE), W_TMP1
		movdqa	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * in the SHA-1 specification:
 *	w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * instead we use the equivalent:
 *	w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization since the w[i]=>w[i-3] dependency
 * is broken
 */
.macro W_PRECALC_32_79_SSSE3
	.if ((i & 3) == 0)
		movdqa	W_minus_04, W_TMP1
		pxor	W_minus_28, W		# W is W_minus_32 before xor
		palignr	$8, W_minus_08, W_TMP1
	.elseif ((i & 3) == 1)
		pxor	W_minus_16, W
		pxor	W_TMP1, W
		movdqa	W, W_TMP1
	.elseif ((i & 3) == 2)
		psrld	$30, W
		pslld	$2, W_TMP1
		por	W, W_TMP1
	.elseif ((i & 3) == 3)
		movdqa	W_TMP1, W
		paddd	K_XMM(K_BASE), W_TMP1
		movdqa	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

.endm		// W_PRECALC_SSSE3


#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f


.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
	movdqu	\a,\b
.endm
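
/*
 * The W_PRECALC_32_79 comment above relies on the identity
 *	w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2	for i >= 32
 * being equivalent to the FIPS 180 recurrence
 *	w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1.
 * The stand-alone C sketch below (illustrative only, not part of this file's
 * build) checks the two forms against each other on arbitrary input:
 *
 *	#include <assert.h>
 *	#include <stdint.h>
 *	#include <stdlib.h>
 *
 *	static uint32_t rol32(uint32_t x, int n)
 *	{
 *		return (x << n) | (x >> (32 - n));
 *	}
 *
 *	int main(void)
 *	{
 *		uint32_t w[80];
 *		int i;
 *
 *		for (i = 0; i < 16; i++)	// arbitrary message block
 *			w[i] = (uint32_t)rand();
 *		for (i = 16; i < 80; i++)	// FIPS 180 schedule
 *			w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *		for (i = 32; i < 80; i++)	// form used for rounds 32-79
 *			assert(w[i] ==
 *			       rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2));
 *		return 0;
 *	}
 */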

/* SSSE3 optimized implementation:
 * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws,
 *                                      unsigned int rounds);
 */
SHA1_VECTOR_ASM	sha1_transform_ssse3

#ifdef SHA1_ENABLE_AVX_SUPPORT

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
	W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
	W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
	W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
	.if ((i & 3) == 0)
		vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
	.elseif ((i & 3) == 1)
		vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
	.elseif ((i & 3) == 2)
		vpaddd	(K_BASE), W, W_TMP1
	.elseif ((i & 3) == 3)
		vmovdqa	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

.macro W_PRECALC_16_31_AVX
	.if ((i & 3) == 0)
		vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
		vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
		vpxor	W_minus_08, W, W
		vpxor	W_minus_16, W_TMP1, W_TMP1
	.elseif ((i & 3) == 1)
		vpxor	W_TMP1, W, W
		vpslldq	$12, W, W_TMP2
		vpslld	$1, W, W_TMP1
	.elseif ((i & 3) == 2)
		vpsrld	$31, W, W
		vpor	W, W_TMP1, W_TMP1
		vpslld	$2, W_TMP2, W
		vpsrld	$30, W_TMP2, W_TMP2
	.elseif ((i & 3) == 3)
		vpxor	W, W_TMP1, W_TMP1
		vpxor	W_TMP2, W_TMP1, W
		vpaddd	K_XMM(K_BASE), W, W_TMP1
		vmovdqu	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

.macro W_PRECALC_32_79_AVX
	.if ((i & 3) == 0)
		vpalignr $8, W_minus_08, W_minus_04, W_TMP1
		vpxor	W_minus_28, W, W	# W is W_minus_32 before xor
	.elseif ((i & 3) == 1)
		vpxor	W_minus_16, W_TMP1, W_TMP1
		vpxor	W_TMP1, W, W
	.elseif ((i & 3) == 2)
		vpslld	$2, W, W_TMP1
		vpsrld	$30, W, W
		vpor	W, W_TMP1, W
	.elseif ((i & 3) == 3)
		vpaddd	K_XMM(K_BASE), W, W_TMP1
		vmovdqu	W_TMP1, WK(i&~3)
		W_PRECALC_ROTATE
	.endif
.endm

.endm		// W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws,
 *                                    unsigned int rounds);
 */
SHA1_VECTOR_ASM	sha1_transform_avx

#endif
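
/*
 * For reference and testing, the per-block transform implemented by the
 * routines above corresponds to the plain C sketch below (illustrative only,
 * not part of the build; it handles one 64-byte block per call, and message
 * padding plus length encoding are the caller's job).  F1-F4, the K constants
 * and the final hash update mirror the macros defined earlier in this file:
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rol32(uint32_t x, int n)
 *	{
 *		return (x << n) | (x >> (32 - n));
 *	}
 *
 *	static void sha1_block_ref(uint32_t state[5], const unsigned char block[64])
 *	{
 *		static const uint32_t k[4] = {
 *			0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
 *		};
 *		uint32_t w[80], a, b, c, d, e, f, t;
 *		int i;
 *
 *		for (i = 0; i < 16; i++)	// big-endian words, cf. pshufb bswap
 *			w[i] = ((uint32_t)block[i*4] << 24) |
 *			       ((uint32_t)block[i*4 + 1] << 16) |
 *			       ((uint32_t)block[i*4 + 2] << 8) |
 *			       block[i*4 + 3];
 *		for (i = 16; i < 80; i++)
 *			w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
 *
 *		a = state[0]; b = state[1]; c = state[2];
 *		d = state[3]; e = state[4];
 *
 *		for (i = 0; i < 80; i++) {
 *			if (i < 20)
 *				f = (b & c) | (~b & d);			// F1
 *			else if (i < 40)
 *				f = b ^ c ^ d;				// F2
 *			else if (i < 60)
 *				f = (b & c) | (b & d) | (c & d);	// F3
 *			else
 *				f = b ^ c ^ d;				// F4 == F2
 *			t = rol32(a, 5) + f + e + w[i] + k[i / 20];
 *			e = d;
 *			d = c;
 *			c = rol32(b, 30);
 *			b = a;
 *			a = t;
 *		}
 *
 *		state[0] += a;		// UPDATE_HASH
 *		state[1] += b;
 *		state[2] += c;
 *		state[3] += d;
 *		state[4] += e;
 *	}
 */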