; Sha256Opt.asm -- SHA-256 optimized code for SHA-256 x86 hardware instructions
; 2022-04-17 : Igor Pavlov : Public domain

include 7zAsm.asm

MY_ASM_START

; .data
; public K

; we can use an external SHA256_K_ARRAY defined in Sha256.c,
; but then we must guarantee that SHA256_K_ARRAY is 16-byte aligned

COMMENT @
ifdef x64
K_CONST equ SHA256_K_ARRAY
else
K_CONST equ _SHA256_K_ARRAY
endif
EXTRN K_CONST:xmmword
@

CONST SEGMENT

align 16
Reverse_Endian_Mask db 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12

; COMMENT @
align 16
K_CONST \
DD 0428a2f98H, 071374491H, 0b5c0fbcfH, 0e9b5dba5H
DD 03956c25bH, 059f111f1H, 0923f82a4H, 0ab1c5ed5H
DD 0d807aa98H, 012835b01H, 0243185beH, 0550c7dc3H
DD 072be5d74H, 080deb1feH, 09bdc06a7H, 0c19bf174H
DD 0e49b69c1H, 0efbe4786H, 00fc19dc6H, 0240ca1ccH
DD 02de92c6fH, 04a7484aaH, 05cb0a9dcH, 076f988daH
DD 0983e5152H, 0a831c66dH, 0b00327c8H, 0bf597fc7H
DD 0c6e00bf3H, 0d5a79147H, 006ca6351H, 014292967H
DD 027b70a85H, 02e1b2138H, 04d2c6dfcH, 053380d13H
DD 0650a7354H, 0766a0abbH, 081c2c92eH, 092722c85H
DD 0a2bfe8a1H, 0a81a664bH, 0c24b8b70H, 0c76c51a3H
DD 0d192e819H, 0d6990624H, 0f40e3585H, 0106aa070H
DD 019a4c116H, 01e376c08H, 02748774cH, 034b0bcb5H
DD 0391c0cb3H, 04ed8aa4aH, 05b9cca4fH, 0682e6ff3H
DD 0748f82eeH, 078a5636fH, 084c87814H, 08cc70208H
DD 090befffaH, 0a4506cebH, 0bef9a3f7H, 0c67178f2H
; @

CONST ENDS

; _TEXT$SHA256OPT SEGMENT 'CODE'

ifndef x64
    .686
    .xmm
endif

; jwasm-based assemblers for Linux and the linkers from newer versions of
; binutils can generate incorrect code for [ARRAY + offset] load instructions.
; 22.00: we load the K_CONST offset into the (rTable) register to avoid that
; jwasm+binutils problem
        rTable  equ r0
        ; rTable  equ K_CONST

ifdef x64
        rNum    equ REG_ABI_PARAM_2
  if (IS_LINUX eq 0)
        LOCAL_SIZE equ (16 * 2)
  endif
else
        rNum    equ r3
        LOCAL_SIZE equ (16 * 1)
endif

rState equ REG_ABI_PARAM_0
rData  equ REG_ABI_PARAM_1


MY_SHA_INSTR macro cmd, a1, a2
        db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2)
endm

cmd_sha256rnds2 equ 0cbH
cmd_sha256msg1  equ 0ccH
cmd_sha256msg2  equ 0cdH

MY_sha256rnds2 macro a1, a2
        MY_SHA_INSTR  cmd_sha256rnds2, a1, a2
endm

MY_sha256msg1 macro a1, a2
        MY_SHA_INSTR  cmd_sha256msg1, a1, a2
endm

MY_sha256msg2 macro a1, a2
        MY_SHA_INSTR  cmd_sha256msg2, a1, a2
endm
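
; Note: the macros above emit the SHA-NI instructions as raw bytes
; (NP 0F 38 CB/CC/CD /r), so this file assembles even with tools that
; do not know the sha256* mnemonics. The ModRM byte (0c0H + a1 * 8 + a2)
; selects the register-register form: reg = a1 (destination), rm = a2.
; For example, "MY_sha256rnds2 2, 3" emits
;       db 0fH, 038H, 0cbH, 0d3H
; that is "sha256rnds2 xmm2, xmm3". sha256rnds2 also reads xmm0 as an
; implicit operand (the two wk values), which is why (msg) below is
; pinned to xmm0.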

MY_PROLOG macro
  ifdef x64
    if (IS_LINUX eq 0)
        ; Windows x64 ABI: xmm6-xmm9 are non-volatile and must be saved
        movdqa  [r4 + 8], xmm6
        movdqa  [r4 + 8 + 16], xmm7
        sub     r4, LOCAL_SIZE + 8
        movdqa  [r4     ], xmm8
        movdqa  [r4 + 16], xmm9
    endif
  else ; x86
        push    r3
        push    r5
        mov     r5, r4
        NUM_PUSH_REGS  equ 2
        PARAM_OFFSET   equ (REG_SIZE * (1 + NUM_PUSH_REGS))
    if (IS_CDECL gt 0)
        mov     rState, [r4 + PARAM_OFFSET]
        mov     rData,  [r4 + PARAM_OFFSET + REG_SIZE * 1]
        mov     rNum,   [r4 + PARAM_OFFSET + REG_SIZE * 2]
    else ; fastcall
        mov     rNum,   [r4 + PARAM_OFFSET]
    endif
        ; 16-byte align the stack for the (movdqa) state0 spill slot
        and     r4, -16
        sub     r4, LOCAL_SIZE
  endif
endm

MY_EPILOG macro
  ifdef x64
    if (IS_LINUX eq 0)
        movdqa  xmm8, [r4]
        movdqa  xmm9, [r4 + 16]
        add     r4, LOCAL_SIZE + 8
        movdqa  xmm6, [r4 + 8]
        movdqa  xmm7, [r4 + 8 + 16]
    endif
  else ; x86
        mov     r4, r5
        pop     r5
        pop     r3
  endif
    MY_ENDP
endm


msg      equ xmm0
tmp      equ xmm0
state0_N equ 2
state1_N equ 3
w_regs   equ 4


state1_save equ xmm1
state0  equ @CatStr(xmm, %state0_N)
state1  equ @CatStr(xmm, %state1_N)


ifdef x64
        state0_save equ xmm8
        mask2       equ xmm9
else
        ; x86 has only 8 xmm registers: state0 is spilled to the stack,
        ; and mask2 shares xmm0 with (msg), so the mask is reloaded
        ; for each block
        state0_save equ [r4]
        mask2       equ xmm0
endif

LOAD_MASK macro
        movdqa  mask2, XMMWORD PTR Reverse_Endian_Mask
endm

LOAD_W macro k:req
        movdqu  @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))]
        pshufb  @CatStr(xmm, %(w_regs + k)), mask2
endm


; pre1 <= 4 && pre2 >= 1 && pre1 > pre2 && (pre1 - pre2) <= 1
pre1 equ 3
pre2 equ 2


; RND4 k computes rounds 4*k .. 4*k+3 (sha256rnds2 does 2 rounds per call;
; the pshufd moves the upper wk pair into the low qword for the second call)
; and pipelines the message-schedule update pre1/pre2 groups of rounds ahead
RND4 macro k
        movdqa  msg, xmmword ptr [rTable + (k) * 16]
        paddd   msg, @CatStr(xmm, %(w_regs + ((k + 0) mod 4)))
        MY_sha256rnds2 state0_N, state1_N
        pshufd  msg, msg, 0eH

    if (k GE (4 - pre1)) AND (k LT (16 - pre1))
        ; w4[0] = msg1(w4[-4], w4[-3])
        MY_sha256msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4))
    endif

        MY_sha256rnds2 state1_N, state0_N

    if (k GE (4 - pre2)) AND (k LT (16 - pre2))
        movdqa  tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 1) mod 4)))
        palignr tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4))), 4
        paddd   @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), tmp
        ; w4[0] = msg2(w4[0], w4[-1])
        MY_sha256msg2 %(w_regs + ((k + pre2) mod 4)), %(w_regs + ((k + pre2 - 1) mod 4))
    endif
endm


; convert between the (dcba, hgfe) layout of the state in memory and the
; (cdgh, abef) layout required by sha256rnds2; the transform is an
; involution, so the same macro converts back
REVERSE_STATE macro
                                        ; state0 ; dcba
                                        ; state1 ; hgfe
        pshufd  tmp, state0, 01bH       ; abcd
        pshufd  state0, state1, 01bH    ; efgh
        movdqa  state1, state0          ; efgh
        punpcklqdq  state0, tmp         ; cdgh
        punpckhqdq  state1, tmp         ; abef
endm


MY_PROC Sha256_UpdateBlocks_HW, 3
        MY_PROLOG

        lea     rTable, [K_CONST]

        cmp     rNum, 0
        je      end_c

        movdqu  state0, [rState]        ; dcba
        movdqu  state1, [rState + 16]   ; hgfe

        REVERSE_STATE

  ifdef x64
        LOAD_MASK
  endif

align 16
nextBlock:
        movdqa  state0_save, state0
        movdqa  state1_save, state1

  ifndef x64
        LOAD_MASK
  endif

        LOAD_W 0
        LOAD_W 1
        LOAD_W 2
        LOAD_W 3

        k = 0
        rept 16
        RND4 k
        k = k + 1
        endm

        paddd   state0, state0_save
        paddd   state1, state1_save

        add     rData, 64
        sub     rNum, 1
        jnz     nextBlock

        REVERSE_STATE

        movdqu  [rState], state0
        movdqu  [rState + 16], state1

end_c:
MY_EPILOG

; _TEXT$SHA256OPT ENDS

end
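
; Call-site note (for reference; see Sha256.c in the 7-Zip sources):
; the C side declares this function along the lines of
;     void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8],
;                          const Byte *data, size_t numBlocks);
; and is expected to verify SHA-extension support
; (CPUID.(EAX=07H,ECX=0):EBX bit 29) before dispatching here.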