• Home
  • Raw
  • Download

Lines Matching +full:a +full:- +full:h

2 # Implement fast SHA-256 with AVX2 instructions. (x86_64)
11 # This software is available to you under a choice of one of two
21 # - Redistributions of source code must retain the above
25 # - Redistributions in binary form must reproduce the above
32 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
41 # This code is described in an Intel White-Paper:
42 # "Fast SHA-256 Implementations on Intel Architecture Processors"
48 # This code schedules 2 blocks at a time, with 4 lanes per block
51 #include <linux/linkage.h>
52 #include <linux/cfi_types.h>
60 # Add reg to mem using reg-mem add and store
87 SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
88 SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
103 a = %eax define
107 h = %r11d define
140 # Rotate values of symbols a...h
142 old_h = h
143 TMP_ = h
144 h = g define
150 b = a
151 a = TMP_ define
157 mov a, y3 # y3 = a # MAJA
161 addl \disp(%rsp, SRND), h # h = k + w + h # --
162 or c, y3 # y3 = a|c # MAJA
163 vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
165 rorx $13, a, T1 # T1 = a >> 13 # S0B
169 vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16] # y1 = (e >> 6) # S1
174 rorx $22, a, y1 # y1 = a >> 22 # S0A
175 add h, d # d = k + w + h + d # --
177 and b, y3 # y3 = (a|c)&b # MAJA
178 vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
179 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
180 rorx $2, a, T1 # T1 = (a >> 2) # S0
184 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
185 mov a, T1 # T1 = a # MAJB
186 and c, T1 # T1 = a&c # MAJB
188 add y0, y2 # y2 = S1 + CH # --
189 vpslld $(32-7), XTMP1, XTMP3
190 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
191 add y1, h # h = k + w + h + S0 # --
193 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
194 vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
197 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
198 add y3, h # h = t1 + S0 + MAJ # --
205 mov a, y3 # y3 = a # MAJA
209 addl offset(%rsp, SRND), h # h = k + w + h # --
210 or c, y3 # y3 = a|c # MAJA
213 vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
215 rorx $13, a, T1 # T1 = a >> 13 # S0B
222 rorx $22, a, y1 # y1 = a >> 22 # S0A
224 add h, d # d = k + w + h + d # --
226 vpslld $(32-18), XTMP1, XTMP1
227 and b, y3 # y3 = (a|c)&b # MAJA
228 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
231 rorx $2, a, T1 # T1 = (a >> 2) # S0
234 vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
235 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
236 mov a, T1 # T1 = a # MAJB
237 and c, T1 # T1 = a&c # MAJB
238 add y0, y2 # y2 = S1 + CH # --
241 vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
242 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
243 add y1, h # h = k + w + h + S0 # --
245 vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
246 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
247 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
248 add y3, h # h = t1 + S0 + MAJ # --
250 vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
257 mov a, y3 # y3 = a # MAJA
260 addl offset(%rsp, SRND), h # h = k + w + h # --
262 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
264 or c, y3 # y3 = a|c # MAJA
268 rorx $13, a, T1 # T1 = a >> 13 # S0B
270 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
275 add h, d # d = k + w + h + d # --
276 and b, y3 # y3 = (a|c)&b # MAJA
279 rorx $22, a, y1 # y1 = a >> 22 # S0A
284 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
285 rorx $2, a ,T1 # T1 = (a >> 2) # S0
288 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
289 mov a, T1 # T1 = a # MAJB
290 and c, T1 # T1 = a&c # MAJB
291 add y0, y2 # y2 = S1 + CH # --
292 vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
294 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
295 add y1,h # h = k + w + h + S0 # --
296 add y2,d # d = k + w + h + d + S1 + CH = d + t1 # --
297 add y2,h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
299 add y3,h # h = t1 + S0 + MAJ # --
306 mov a, y3 # y3 = a # MAJA
310 addl offset(%rsp, SRND), h # h = k + w + h # --
311 or c, y3 # y3 = a|c # MAJA
314 vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
316 rorx $13, a, T1 # T1 = a >> 13 # S0B
321 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
324 add h, d # d = k + w + h + d # --
325 and b, y3 # y3 = (a|c)&b # MAJA
327 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
332 rorx $22, a, y1 # y1 = a >> 22 # S0A
333 add y0, y2 # y2 = S1 + CH # --
336 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
337 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
339 rorx $2, a, T1 # T1 = (a >> 2) # S0
343 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
344 mov a, T1 # T1 = a # MAJB
345 and c, T1 # T1 = a&c # MAJB
346 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
348 add y1, h # h = k + w + h + S0 # --
349 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
350 add y3, h # h = t1 + S0 + MAJ # --
369 rorx $13, a, T1 # T1 = a >> 13 # S0B
371 rorx $22, a, y1 # y1 = a >> 22 # S0A
372 mov a, y3 # y3 = a # MAJA
374 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
375 rorx $2, a, T1 # T1 = (a >> 2) # S0
376 addl \disp(%rsp, SRND), h # h = k + w + h # --
377 or c, y3 # y3 = a|c # MAJA
379 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
380 mov a, T1 # T1 = a # MAJB
381 and b, y3 # y3 = (a|c)&b # MAJA
382 and c, T1 # T1 = a&c # MAJB
383 add y0, y2 # y2 = S1 + CH # --
386 add h, d # d = k + w + h + d # --
387 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
388 add y1, h # h = k + w + h + S0 # --
389 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
395 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
404 add y3, old_h # h = t1 + S0 + MAJ # --
407 rorx $13, a, T1 # T1 = a >> 13 # S0B
409 rorx $22, a, y1 # y1 = a >> 22 # S0A
410 mov a, y3 # y3 = a # MAJA
412 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
413 rorx $2, a, T1 # T1 = (a >> 2) # S0
415 addl offset(%rsp, SRND), h # h = k + w + h # --
416 or c, y3 # y3 = a|c # MAJA
418 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
419 mov a, T1 # T1 = a # MAJB
420 and b, y3 # y3 = (a|c)&b # MAJA
421 and c, T1 # T1 = a&c # MAJB
422 add y0, y2 # y2 = S1 + CH # --
425 add h, d # d = k + w + h + d # --
426 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
427 add y1, h # h = k + w + h + S0 # --
429 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
435 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
444 add y3, old_h # h = t1 + S0 + MAJ # --
447 rorx $13, a, T1 # T1 = a >> 13 # S0B
449 rorx $22, a, y1 # y1 = a >> 22 # S0A
450 mov a, y3 # y3 = a # MAJA
452 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
453 rorx $2, a, T1 # T1 = (a >> 2) # S0
455 addl offset(%rsp, SRND), h # h = k + w + h # --
456 or c, y3 # y3 = a|c # MAJA
458 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
459 mov a, T1 # T1 = a # MAJB
460 and b, y3 # y3 = (a|c)&b # MAJA
461 and c, T1 # T1 = a&c # MAJB
462 add y0, y2 # y2 = S1 + CH # --
465 add h, d # d = k + w + h + d # --
466 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
467 add y1, h # h = k + w + h + S0 # --
469 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
475 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
484 add y3, old_h # h = t1 + S0 + MAJ # --
487 rorx $13, a, T1 # T1 = a >> 13 # S0B
489 rorx $22, a, y1 # y1 = a >> 22 # S0A
490 mov a, y3 # y3 = a # MAJA
492 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
493 rorx $2, a, T1 # T1 = (a >> 2) # S0
495 addl offset(%rsp, SRND), h # h = k + w + h # --
496 or c, y3 # y3 = a|c # MAJA
498 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
499 mov a, T1 # T1 = a # MAJB
500 and b, y3 # y3 = (a|c)&b # MAJA
501 and c, T1 # T1 = a&c # MAJB
502 add y0, y2 # y2 = S1 + CH # --
505 add h, d # d = k + w + h + d # --
506 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
507 add y1, h # h = k + w + h + S0 # --
509 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
512 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
514 add y3, h # h = t1 + S0 + MAJ # --
538 and $-32, %rsp # align rsp to 32 byte boundary
542 lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
549 mov (CTX), a
556 mov 4*7(CTX), h
638 addm (4*0)(CTX),a
645 addm (4*7)(CTX),h
664 addm (4*0)(CTX),a
671 addm (4*7)(CTX),h
693 mov (4*0)(CTX),a
700 mov (4*7)(CTX),h
763 # shuffle xBxA -> 00BA
769 # shuffle xDxC -> DC00