/*
 * Multi-buffer SHA1 algorithm hash compute routine
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 *	James Guilford <james.guilford@intel.com>
 *	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <linux/linkage.h>
#include "sha1_mb_mgr_datastruct.S"

## code to compute oct SHA1 using AVX2 (256-bit ymm registers)
## outer calling routine takes care of save and restore of XMM registers

## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
##
## Linux clobbers:    rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
## Linux preserves:   rdi rbp r8
##
## clobbers ymm0-15


# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
# "transpose" data in {r0...r7} using temps {t0...t1}
# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
# r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
# r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
# r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
# r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
#
# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
# r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
# r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
# r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
# r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
# r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
# r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
# r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
#

.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
	# process top half (r0..r3) {a...d}
	vshufps	$0x44, \r1, \r0, \t0	# t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
	vshufps	$0xEE, \r1, \r0, \r0	# r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
	vshufps	$0x44, \r3, \r2, \t1	# t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
	vshufps	$0xEE, \r3, \r2, \r2	# r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
	vshufps	$0xDD, \t1, \t0, \r3	# r3 = {d5 c5 b5 a5   d1 c1 b1 a1}
	vshufps	$0x88, \r2, \r0, \r1	# r1 = {d6 c6 b6 a6   d2 c2 b2 a2}
	vshufps	$0xDD, \r2, \r0, \r0	# r0 = {d7 c7 b7 a7   d3 c3 b3 a3}
	vshufps	$0x88, \t1, \t0, \t0	# t0 = {d4 c4 b4 a4   d0 c0 b0 a0}

	# use r2 in place of t0
	# process bottom half (r4..r7) {e...h}
	vshufps	$0x44, \r5, \r4, \r2	# r2 = {f5 f4 e5 e4   f1 f0 e1 e0}
	vshufps	$0xEE, \r5, \r4, \r4	# r4 = {f7 f6 e7 e6   f3 f2 e3 e2}
	vshufps	$0x44, \r7, \r6, \t1	# t1 = {h5 h4 g5 g4   h1 h0 g1 g0}
	vshufps	$0xEE, \r7, \r6, \r6	# r6 = {h7 h6 g7 g6   h3 h2 g3 g2}
	vshufps	$0xDD, \t1, \r2, \r7	# r7 = {h5 g5 f5 e5   h1 g1 f1 e1}
	vshufps	$0x88, \r6, \r4, \r5	# r5 = {h6 g6 f6 e6   h2 g2 f2 e2}
	vshufps	$0xDD, \r6, \r4, \r4	# r4 = {h7 g7 f7 e7   h3 g3 f3 e3}
	vshufps	$0x88, \t1, \r2, \t1	# t1 = {h4 g4 f4 e4   h0 g0 f0 e0}

	vperm2f128	$0x13, \r1, \r5, \r6	# h6...a6
	vperm2f128	$0x02, \r1, \r5, \r2	# h2...a2
	vperm2f128	$0x13, \r3, \r7, \r5	# h5...a5
	vperm2f128	$0x02, \r3, \r7, \r1	# h1...a1
	vperm2f128	$0x13, \r0, \r4, \r7	# h7...a7
	vperm2f128	$0x02, \r0, \r4, \r3	# h3...a3
	vperm2f128	$0x13, \t0, \t1, \r4	# h4...a4
	vperm2f128	$0x02, \t0, \t1, \r0	# h0...a0

.endm
##
## Magic functions defined in FIPS 180-1
##
# macro MAGIC_F0 F,B,C,D,T   ## F = (D ^ (B & (C ^ D)))
.macro MAGIC_F0 regF regB regC regD regT
	vpxor	\regD, \regC, \regF
	vpand	\regB, \regF, \regF
	vpxor	\regD, \regF, \regF
.endm

# macro MAGIC_F1 F,B,C,D,T   ## F = (B ^ C ^ D)
.macro MAGIC_F1 regF regB regC regD regT
	vpxor	\regC, \regD, \regF
	vpxor	\regB, \regF, \regF
.endm

# macro MAGIC_F2 F,B,C,D,T   ## F = ((B & C) | (B & D) | (C & D))
.macro MAGIC_F2 regF regB regC regD regT
	vpor	\regC, \regB, \regF
	vpand	\regC, \regB, \regT
	vpand	\regD, \regF, \regF
	vpor	\regT, \regF, \regF
.endm

# macro MAGIC_F3 F,B,C,D,T   ## F = (B ^ C ^ D)
.macro MAGIC_F3 regF regB regC regD regT
	MAGIC_F1 \regF,\regB,\regC,\regD,\regT
.endm
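##
## For reference, a scalar sketch of what each vectorized round computes
## per lane (this mirrors FIPS 180-1; W[], the K constants and the rotate
## amounts below are the standard SHA-1 values, not symbols defined here):
##
##	F0(B,C,D) = D ^ (B & (C ^ D))			/* rounds  0..19 */
##	F1(B,C,D) = B ^ C ^ D				/* rounds 20..39 */
##	F2(B,C,D) = (B & C) | (B & D) | (C & D)		/* rounds 40..59 */
##	F3(B,C,D) = B ^ C ^ D				/* rounds 60..79 */
##
##	TEMP = ROTL32(A, 5) + Ft(B, C, D) + E + K + W[t]
##	E = D; D = C; C = ROTL32(B, 30); B = A; A = TEMP
##
## Each ymm register holds the same working variable for 8 independent
## streams, so one SHA1_STEP_* invocation advances all 8 digests by one
## round.
##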
# PROLD reg, imm, tmp
.macro PROLD reg imm tmp
	vpsrld	$(32-\imm), \reg, \tmp
	vpslld	$\imm, \reg, \reg
	vpor	\tmp, \reg, \reg
.endm

.macro PROLD_nd reg imm tmp src
	vpsrld	$(32-\imm), \src, \tmp
	vpslld	$\imm, \src, \reg
	vpor	\tmp, \reg, \reg
.endm

.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC
	vpaddd	\immCNT, \regE, \regE
	vpaddd	\memW*32(%rsp), \regE, \regE
	PROLD_nd \regT, 5, \regF, \regA
	vpaddd	\regT, \regE, \regE
	\MAGIC	\regF, \regB, \regC, \regD, \regT
	PROLD	\regB, 30, \regT
	vpaddd	\regF, \regE, \regE
.endm

.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC
	vpaddd	\immCNT, \regE, \regE
	offset = ((\memW - 14) & 15) * 32
	vmovdqu	offset(%rsp), W14
	vpxor	W14, W16, W16
	offset = ((\memW - 8) & 15) * 32
	vpxor	offset(%rsp), W16, W16
	offset = ((\memW - 3) & 15) * 32
	vpxor	offset(%rsp), W16, W16
	vpsrld	$(32-1), W16, \regF
	vpslld	$1, W16, W16
	vpor	W16, \regF, \regF

	ROTATE_W

	offset = ((\memW - 0) & 15) * 32
	vmovdqu	\regF, offset(%rsp)
	vpaddd	\regF, \regE, \regE
	PROLD_nd \regT, 5, \regF, \regA
	vpaddd	\regT, \regE, \regE
	\MAGIC	\regF,\regB,\regC,\regD,\regT	## FUN = MAGIC_Fi(B,C,D)
	PROLD	\regB,30, \regT
	vpaddd	\regF, \regE, \regE
.endm

########################################################################
########################################################################
########################################################################

## FRAMESZ plus pushes must be an odd multiple of 8
YMM_SAVE = (15-15)*32
FRAMESZ = 32*16 + YMM_SAVE
_YMM = FRAMESZ - YMM_SAVE

#define VMOVPS	vmovups

IDX  = %rax
inp0 = %r9
inp1 = %r10
inp2 = %r11
inp3 = %r12
inp4 = %r13
inp5 = %r14
inp6 = %r15
inp7 = %rcx
arg1 = %rdi
arg2 = %rsi
RSP_SAVE = %rdx

# ymm0	A
# ymm1	B
# ymm2	C
# ymm3	D
# ymm4	E
# ymm5		F	AA
# ymm6		T0	BB
# ymm7		T1	CC
# ymm8		T2	DD
# ymm9		T3	EE
# ymm10		T4	TMP
# ymm11		T5	FUN
# ymm12		T6	K
# ymm13		T7	W14
# ymm14		T8	W15
# ymm15		T9	W16


A  = %ymm0
B  = %ymm1
C  = %ymm2
D  = %ymm3
E  = %ymm4
F  = %ymm5
T0 = %ymm6
T1 = %ymm7
T2 = %ymm8
T3 = %ymm9
T4 = %ymm10
T5 = %ymm11
T6 = %ymm12
T7 = %ymm13
T8 = %ymm14
T9 = %ymm15

AA  = %ymm5
BB  = %ymm6
CC  = %ymm7
DD  = %ymm8
EE  = %ymm9
TMP = %ymm10
FUN = %ymm11
K   = %ymm12
W14 = %ymm13
W15 = %ymm14
W16 = %ymm15

.macro ROTATE_ARGS
 TMP_ = E
 E = D
 D = C
 C = B
 B = A
 A = TMP_
.endm

.macro ROTATE_W
TMP_ = W16
W16  = W15
W15  = W14
W14  = TMP_
.endm

# 8 streams x 5 32bit words per digest x 4 bytes per word
#define DIGEST_SIZE (8*5*4)

.align 32

# void sha1_x8_avx2(void **input_data, UINT128 *digest, UINT32 size)
# arg 1 : pointer to array[8] of pointer to input data
# arg 2 : size (in blocks) ;; assumed to be >= 1
#
ENTRY(sha1_x8_avx2)

	push	RSP_SAVE

	# save rsp
	mov	%rsp, RSP_SAVE
	sub	$FRAMESZ, %rsp

	# align rsp to 32 bytes
	and	$~0x1F, %rsp

	## Initialize digests
	vmovdqu	0*32(arg1), A
	vmovdqu	1*32(arg1), B
	vmovdqu	2*32(arg1), C
	vmovdqu	3*32(arg1), D
	vmovdqu	4*32(arg1), E

	## transpose input onto stack
	mov	_data_ptr+0*8(arg1),inp0
	mov	_data_ptr+1*8(arg1),inp1
	mov	_data_ptr+2*8(arg1),inp2
	mov	_data_ptr+3*8(arg1),inp3
	mov	_data_ptr+4*8(arg1),inp4
	mov	_data_ptr+5*8(arg1),inp5
	mov	_data_ptr+6*8(arg1),inp6
	mov	_data_ptr+7*8(arg1),inp7
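	## inp0..inp7 now point at the 8 independent input buffers and IDX
	## is the common byte offset into all of them.  Each pass through
	## lloop consumes one 64-byte block per lane (two 32-byte loads per
	## lane), byte-swapped and transposed onto the stack as 16 message
	## words x 8 lanes.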

	xor	IDX, IDX
lloop:
	vmovdqu	PSHUFFLE_BYTE_FLIP_MASK(%rip), F
	I = 0
.rep 2
	VMOVPS	(inp0, IDX), T0
	VMOVPS	(inp1, IDX), T1
	VMOVPS	(inp2, IDX), T2
	VMOVPS	(inp3, IDX), T3
	VMOVPS	(inp4, IDX), T4
	VMOVPS	(inp5, IDX), T5
	VMOVPS	(inp6, IDX), T6
	VMOVPS	(inp7, IDX), T7

	TRANSPOSE8	T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
	vpshufb	F, T0, T0
	vmovdqu	T0, (I*8)*32(%rsp)
	vpshufb	F, T1, T1
	vmovdqu	T1, (I*8+1)*32(%rsp)
	vpshufb	F, T2, T2
	vmovdqu	T2, (I*8+2)*32(%rsp)
	vpshufb	F, T3, T3
	vmovdqu	T3, (I*8+3)*32(%rsp)
	vpshufb	F, T4, T4
	vmovdqu	T4, (I*8+4)*32(%rsp)
	vpshufb	F, T5, T5
	vmovdqu	T5, (I*8+5)*32(%rsp)
	vpshufb	F, T6, T6
	vmovdqu	T6, (I*8+6)*32(%rsp)
	vpshufb	F, T7, T7
	vmovdqu	T7, (I*8+7)*32(%rsp)
	add	$32, IDX
	I = (I+1)
.endr
	# save old digests
	vmovdqu	A,AA
	vmovdqu	B,BB
	vmovdqu	C,CC
	vmovdqu	D,DD
	vmovdqu	E,EE

##
## perform 0-79 steps
##
	vmovdqu	K00_19(%rip), K
## do rounds 0...15
	I = 0
.rep 16
	SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
	ROTATE_ARGS
	I = (I+1)
.endr

## do rounds 16...19
	vmovdqu	((16 - 16) & 15) * 32 (%rsp), W16
	vmovdqu	((16 - 15) & 15) * 32 (%rsp), W15
.rep 4
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
	ROTATE_ARGS
	I = (I+1)
.endr

## do rounds 20...39
	vmovdqu	K20_39(%rip), K
.rep 20
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
	ROTATE_ARGS
	I = (I+1)
.endr

## do rounds 40...59
	vmovdqu	K40_59(%rip), K
.rep 20
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
	ROTATE_ARGS
	I = (I+1)
.endr

## do rounds 60...79
	vmovdqu	K60_79(%rip), K
.rep 20
	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
	ROTATE_ARGS
	I = (I+1)
.endr

	vpaddd	AA,A,A
	vpaddd	BB,B,B
	vpaddd	CC,C,C
	vpaddd	DD,D,D
	vpaddd	EE,E,E

	sub	$1, arg2
	jne	lloop

	# write out digests
	vmovdqu	A, 0*32(arg1)
	vmovdqu	B, 1*32(arg1)
	vmovdqu	C, 2*32(arg1)
	vmovdqu	D, 3*32(arg1)
	vmovdqu	E, 4*32(arg1)

	# update input pointers
	add	IDX, inp0
	add	IDX, inp1
	add	IDX, inp2
	add	IDX, inp3
	add	IDX, inp4
	add	IDX, inp5
	add	IDX, inp6
	add	IDX, inp7
	mov	inp0, _data_ptr(arg1)
	mov	inp1, _data_ptr + 1*8(arg1)
	mov	inp2, _data_ptr + 2*8(arg1)
	mov	inp3, _data_ptr + 3*8(arg1)
	mov	inp4, _data_ptr + 4*8(arg1)
	mov	inp5, _data_ptr + 5*8(arg1)
	mov	inp6, _data_ptr + 6*8(arg1)
	mov	inp7, _data_ptr + 7*8(arg1)

	################
	## Postamble

	mov	RSP_SAVE, %rsp
	pop	RSP_SAVE

	ret
ENDPROC(sha1_x8_avx2)


.data

.align 32
K00_19:
.octa 0x5A8279995A8279995A8279995A827999
.octa 0x5A8279995A8279995A8279995A827999
K20_39:
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
K40_59:
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
K60_79:
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203
.octa 0x0c0d0e0f08090a0b0405060700010203
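
# K00_19..K60_79 are the four standard SHA-1 round constants, each replicated
# across all eight 32-bit lanes of a ymm register.  PSHUFFLE_BYTE_FLIP_MASK
# reverses the bytes within each 32-bit word, so the vpshufb in lloop converts
# the little-endian loads into the big-endian word order SHA-1 requires.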