/*
 * Flush routine for SHA1 multibuffer
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 *	James Guilford <james.guilford@intel.com>
 *	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <linux/linkage.h>
#include "sha1_mb_mgr_datastruct.S"

/*
 * Syntax: GNU as, AT&T operand order (src, dst), preprocessed by cpp.
 * Structure offsets (_ldata, _lens, _args, _args_digest, _data_ptr,
 * _job_in_lane, _unused_lanes, _status, _result_digest,
 * _LANE_DATA_size) and STS_COMPLETED are not defined in this file;
 * presumably they come from sha1_mb_mgr_datastruct.S.
 */

.extern sha1_x8_avx2

# LINUX register definitions
#define arg1	%rdi
#define arg2	%rsi

# Common definitions
#define state	arg1
#define job	arg2
#define len2	arg2

# idx must be a register not clobbered by sha1_x8_avx2
#define idx		%r8
#define DWORD_idx	%r8d

/* NOTE: unused_lanes, lane_data and tmp2 all alias %rbx. */
#define unused_lanes	%rbx
#define lane_data	%rbx
#define tmp2		%rbx
#define tmp2_w		%ebx

/* NOTE: job_rax, tmp1, size_offset, tmp and start_offset all alias %rax. */
#define job_rax		%rax
#define tmp1		%rax
#define size_offset	%rax
#define tmp		%rax
#define start_offset	%rax

/*
 * arg1/arg2 already expand to a complete register name, so no extra
 * percent sign is written here (it would produce an invalid operand).
 */
#define tmp3		arg1

#define extra_blocks	arg2
#define p		arg2


# STACK_SPACE needs to be an odd multiple of 8
/*
 * Frame layout used by sha1_mb_mgr_flush_avx2, relative to the
 * realigned %rsp:
 *   _XMM_SAVE .. _XMM_SAVE+_XMM_SAVE_SIZE-1 : reserved, never written
 *                                             by the code in this file
 *   _GPR_SAVE + 8*0 : %rbx
 *   _GPR_SAVE + 8*1 : caller %rsp (value before the frame was built)
 *   _GPR_SAVE + 8*2 : not used
 *   _GPR_SAVE + 8*3 : %rbp
 *   _GPR_SAVE + 8*4 : %r12
 *   _GPR_SAVE + 8*5 : %r13
 *   _GPR_SAVE + 8*6 : %r14
 *   _GPR_SAVE + 8*7 : %r15
 */
_XMM_SAVE_SIZE	= 10*16
_GPR_SAVE_SIZE	= 8*8
_ALIGN_SIZE	= 8

_XMM_SAVE	= 0
_GPR_SAVE	= _XMM_SAVE + _XMM_SAVE_SIZE
STACK_SPACE	= _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE

/* Emit the label "<prefix><n>:". */
.macro LABEL prefix n
\prefix\n\():
.endm

/* Jump to the label "skip_<i>" when ZF is clear. */
.macro JNE_SKIP i
jne	skip_\i
.endm

.altmacro
.macro SET_OFFSET _offset
offset = \_offset
.endm
.noaltmacro

/*
 * JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state)
 *
 * In:   arg1 (%rdi) = state
 * Out:  %rax = pointer to the job that was completed, or 0 when bit
 *              (32+3) of state->unused_lanes is set (all lanes empty).
 *
 * Steps:
 *   1. Pick a lane index whose job_in_lane pointer is non-zero (the
 *      highest such lane among 1..7, otherwise 0).
 *   2. For every lane whose job_in_lane is zero, copy the picked
 *      lane's data pointer into it and set its length entry to
 *      0xFFFFFFFF.
 *   3. Take the unsigned minimum of the eight 32-bit length entries.
 *      The low 4 bits of an entry are the lane index; the remaining
 *      bits are the length passed on to sha1_x8_avx2.
 *   4. If that length is non-zero, subtract it from every entry and
 *      call sha1_x8_avx2(state, len).
 *   5. Retire the job in the minimum lane: clear job_in_lane, mark the
 *      job STS_COMPLETED, push the lane index onto unused_lanes, and
 *      copy the five digest words of that lane into the job.
 *
 * %rbx, %rbp and %r12-%r15 are saved in a 32-byte aligned frame on
 * entry and restored before returning.
 * NOTE(review): %rbp and %r12-%r15 are not touched by this function
 * itself; presumably they are saved because sha1_x8_avx2 may clobber
 * them - confirm against that routine.
 */
ENTRY(sha1_mb_mgr_flush_avx2)
	mov	%rsp, %r10
	sub	$STACK_SPACE, %rsp
	and	$~31, %rsp			# realign frame to 32 bytes
	mov	%rbx, _GPR_SAVE(%rsp)
	mov	%r10, _GPR_SAVE+8*1(%rsp)	#save rsp
	mov	%rbp, _GPR_SAVE+8*3(%rsp)
	mov	%r12, _GPR_SAVE+8*4(%rsp)
	mov	%r13, _GPR_SAVE+8*5(%rsp)
	mov	%r14, _GPR_SAVE+8*6(%rsp)
	mov	%r15, _GPR_SAVE+8*7(%rsp)

	# If bit (32+3) is set, then all lanes are empty
	mov	_unused_lanes(state), unused_lanes
	bt	$32+3, unused_lanes
	jc	return_null

	# find a lane with a non-null job
	/*
	 * idx starts at 0 and is overwritten by each later lane that
	 * holds a job, so the highest occupied lane wins. cmov takes no
	 * immediate operand, hence the constants loaded from memory.
	 */
	xor	idx, idx
	offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	one(%rip), idx
	offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	two(%rip), idx
	offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	three(%rip), idx
	offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	four(%rip), idx
	offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	five(%rip), idx
	offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	six(%rip), idx
	offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
	cmovne	seven(%rip), idx

	# copy idx to empty lanes
copy_lane_data:
	offset = (_args + _data_ptr)
	mov	offset(state,idx,8), tmp	# tmp = data pointer of lane idx

	/*
	 * Unrolled over lanes 0..7: a lane with a job is skipped; an
	 * empty lane receives the data pointer held in tmp and a length
	 * entry of 0xFFFFFFFF.
	 */
	I = 0
.rep 8
	offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
	cmpq	$0, offset(state)
.altmacro
	JNE_SKIP %I
	offset = (_args + _data_ptr + 8*I)
	mov	tmp, offset(state)
	offset = (_lens + 4*I)
	movl	$0xFFFFFFFF, offset(state)
LABEL skip_ %I
	I = (I+1)
.noaltmacro
.endr

	# Find min length
	vmovdqu	_lens+0*16(state), %xmm0
	vmovdqu	_lens+1*16(state), %xmm1

	vpminud	%xmm1, %xmm0, %xmm2		# xmm2 has {D,C,B,A}
	vpalignr $8, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,D,C}
	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has {x,x,E,F}
	vpalignr $4, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,x,E}
	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has min value in low dword

	vmovd	%xmm2, DWORD_idx
	mov	idx, len2
	and	$0xF, idx			# idx  = lane index (low nibble)
	shr	$4, len2			# len2 = length part of the minimum
	jz	len_is_0

	/*
	 * Keep only the length bits of the minimum, broadcast them to
	 * all four dwords and subtract from every length entry; the
	 * lane-index nibbles are left untouched.
	 */
	vpand	clear_low_nibble(%rip), %xmm2, %xmm2
	vpshufd	$0, %xmm2, %xmm2

	vpsubd	%xmm2, %xmm0, %xmm0
	vpsubd	%xmm2, %xmm1, %xmm1

	vmovdqu	%xmm0, _lens+0*16(state)
	vmovdqu	%xmm1, _lens+1*16(state)

	# "state" and "args" are the same address, arg1
	# len is arg2
	call	sha1_x8_avx2
	# state and idx are intact


len_is_0:
	# process completed job "idx"
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	mov	_job_in_lane(lane_data), job_rax
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)
	/* Push idx onto unused_lanes as a new low nibble. */
	mov	_unused_lanes(state), unused_lanes
	shl	$4, unused_lanes
	or	idx, unused_lanes
	mov	unused_lanes, _unused_lanes(state)

	movl	$0xFFFFFFFF, _lens(state, idx, 4)

	/*
	 * Digest word k of lane idx is read from
	 * _args_digest + k*32 + idx*4. Words 0..3 are gathered into
	 * xmm0, word 4 goes through tmp2_w.
	 */
	vmovd	_args_digest(state , idx, 4) , %xmm0
	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
	movl	_args_digest+4*32(state, idx, 4), tmp2_w

	vmovdqu	%xmm0, _result_digest(job_rax)
	offset = (_result_digest + 1*16)
	mov	tmp2_w, offset(job_rax)

return:

	mov	_GPR_SAVE(%rsp), %rbx
	mov	_GPR_SAVE+8*1(%rsp), %r10	#saved rsp
	mov	_GPR_SAVE+8*3(%rsp), %rbp
	mov	_GPR_SAVE+8*4(%rsp), %r12
	mov	_GPR_SAVE+8*5(%rsp), %r13
	mov	_GPR_SAVE+8*6(%rsp), %r14
	mov	_GPR_SAVE+8*7(%rsp), %r15
	mov	%r10, %rsp

	ret

return_null:
	xor	job_rax, job_rax
	jmp	return
ENDPROC(sha1_mb_mgr_flush_avx2)


#################################################################

/*
 * sha1_mb_mgr_get_comp_job_avx2
 *
 * NOTE(review): no prototype is given in this file; presumably
 * JOB* sha1_mb_mgr_get_comp_job_avx2(MB_MGR *state) - confirm against
 * the C caller.
 *
 * In:   arg1 (%rdi) = state
 * Out:  %rax = pointer to a finished job, or 0 when either bit (32+3)
 *              of state->unused_lanes is set or the minimum length
 *              entry has any bit set above its low nibble.
 *
 * Unlike the flush routine this one never calls sha1_x8_avx2. It only
 * retires a lane whose length part is already zero: it clears
 * job_in_lane, marks the job STS_COMPLETED, pushes the lane index onto
 * unused_lanes and copies the five digest words into the job.
 * %rbx is preserved with push/pop.
 */
.align 16
ENTRY(sha1_mb_mgr_get_comp_job_avx2)
	push	%rbx

	## if bit 32+3 is set, then all lanes are empty
	mov	_unused_lanes(state), unused_lanes
	bt	$(32+3), unused_lanes
	jc	.return_null

	# Find min length
	vmovdqu	_lens(state), %xmm0
	vmovdqu	_lens+1*16(state), %xmm1

	vpminud	%xmm1, %xmm0, %xmm2		# xmm2 has {D,C,B,A}
	vpalignr $8, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,D,C}
	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has {x,x,E,F}
	vpalignr $4, %xmm2, %xmm3, %xmm3	# xmm3 has {x,x,x,E}
	vpminud	%xmm3, %xmm2, %xmm2		# xmm2 has min value in low dword

	vmovd	%xmm2, DWORD_idx
	/* Any bit above the low nibble means the length is non-zero. */
	test	$~0xF, idx
	jnz	.return_null

	# process completed job "idx"
	imul	$_LANE_DATA_size, idx, lane_data
	lea	_ldata(state, lane_data), lane_data

	mov	_job_in_lane(lane_data), job_rax
	movq	$0, _job_in_lane(lane_data)
	movl	$STS_COMPLETED, _status(job_rax)
	/* Push idx onto unused_lanes as a new low nibble. */
	mov	_unused_lanes(state), unused_lanes
	shl	$4, unused_lanes
	or	idx, unused_lanes
	mov	unused_lanes, _unused_lanes(state)

	movl	$0xFFFFFFFF, _lens(state, idx, 4)

	/* Gather digest words 0..3 into xmm0, word 4 into tmp2_w. */
	vmovd	_args_digest(state, idx, 4), %xmm0
	vpinsrd	$1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
	vpinsrd	$3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
	movl	_args_digest+4*32(state, idx, 4), tmp2_w

	vmovdqu	%xmm0, _result_digest(job_rax)
	movl	tmp2_w, _result_digest+1*16(job_rax)

	pop	%rbx

	ret

.return_null:
	xor	job_rax, job_rax
	pop	%rbx
	ret
ENDPROC(sha1_mb_mgr_get_comp_job_avx2)

/*
 * Constants below are only ever read by the code above, so they are
 * placed in read-only data.
 */
.section .rodata

.align 16
/* Mask for dword 0: keeps bits 4..31, clears the low nibble; dwords 1..3 are zeroed. */
clear_low_nibble:
.octa	0x000000000000000000000000FFFFFFF0
/* 64-bit lane indices used as memory operands for cmovne. */
one:
.quad	1
two:
.quad	2
three:
.quad	3
four:
.quad	4
five:
.quad	5
six:
.quad	6
seven:
.quad	7