
Lines Matching +full:a +full:- +full:h

2 # Implement fast SHA-256 with AVX2 instructions. (x86_64)
11 # This software is available to you under a choice of one of two
21 # - Redistributions of source code must retain the above
25 # - Redistributions in binary form must reproduce the above
32 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
41 # This code is described in an Intel White-Paper:
42 # "Fast SHA-256 Implementations on Intel Architecture Processors"
48 # This code schedules 2 blocks at a time, with 4 lanes per block
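
The two-blocks/four-lanes structure maps onto a simple scalar model: each group of YMM operations advances the message schedule of both in-flight blocks by four words. A minimal sketch in C (hypothetical names; sigma0/sigma1 as in FIPS 180-4, not the kernel's code):

    #include <stdint.h>

    static uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
    static uint32_t sigma0(uint32_t x) { return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); }
    static uint32_t sigma1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }

    /* Two blocks in flight, four schedule words per step: one iteration of */
    /* the outer loop models one group of vector operations in the listing. */
    static void schedule_two_blocks(uint32_t W[2][64])
    {
        for (int t = 16; t < 64; t += 4)
            for (int blk = 0; blk < 2; blk++)
                for (int i = t; i < t + 4; i++)
                    W[blk][i] = sigma1(W[blk][i - 2]) + W[blk][i - 7]
                              + sigma0(W[blk][i - 15]) + W[blk][i - 16];
    }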
51 #include <linux/linkage.h>
59 # Add reg to mem using reg-mem add and store
86 SHUF_00BA = %ymm10 # shuffle xBxA -> 00BA
87 SHUF_DC00 = %ymm12 # shuffle xDxC -> DC00
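
The {xBxA}/{xDxC} annotations later in the listing mark vectors in which only alternate dword lanes hold valid sigma1 results; these two shuffle masks compact the valid lanes so the two halves can be combined into a full {D,C,B,A} group. A scalar model of the shuffles (lane order assumed low lane first):

    /* SHUF_00BA keeps the valid alternate lanes of {x,B,x,A} in the low    */
    /* half; SHUF_DC00 mirrors this into the high half, so OR-ing the two   */
    /* results yields {D,C,B,A}.                                            */
    static void shuf_00BA(const uint32_t in[4], uint32_t out[4])
    {
        out[0] = in[0]; out[1] = in[2];   /* A, B */
        out[2] = 0;     out[3] = 0;
    }

    static void shuf_DC00(const uint32_t in[4], uint32_t out[4])
    {
        out[0] = 0;     out[1] = 0;
        out[2] = in[0]; out[3] = in[2];   /* C, D */
    }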
102 a = %eax
106 h = %r11d
141 # Rotate values of symbols a...h
143 old_h = h
144 TMP_ = h
145 h = g
151 b = a
152 a = TMP_
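
The rotation costs nothing at run time: the macro rebinds assembler symbols rather than moving data, so after each round the register that held g is simply referred to as h, and so on. The scalar equivalent is the familiar working-variable shuffle, sketched here for reference:

    /* What the symbol rotation expresses, written as data movement:        */
    /* h = g, g = f, ..., b = a, and the old h comes back in as a.          */
    static void rotate_args(uint32_t v[8])   /* v[0..7] = a..h */
    {
        uint32_t tmp = v[7];                 /* TMP_ = h */
        for (int i = 7; i > 0; i--)
            v[i] = v[i - 1];
        v[0] = tmp;                          /* a = TMP_ */
    }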
158 mov a, y3 # y3 = a # MAJA
162 addl \disp(%rsp, SRND), h # h = k + w + h # --
163 or c, y3 # y3 = a|c # MAJA
164 vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
166 rorx $13, a, T1 # T1 = a >> 13 # S0B
170 vpaddd X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
175 rorx $22, a, y1 # y1 = a >> 22 # S0A
176 add h, d # d = k + w + h + d # --
178 and b, y3 # y3 = (a|c)&b # MAJA
179 vpalignr $4, X0, X1, XTMP1 # XTMP1 = W[-15]
180 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
181 rorx $2, a, T1 # T1 = (a >> 2) # S0
185 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
186 mov a, T1 # T1 = a # MAJB
187 and c, T1 # T1 = a&c # MAJB
189 add y0, y2 # y2 = S1 + CH # --
190 vpslld $(32-7), XTMP1, XTMP3 # XTMP3 = W[-15] << (32-7)
191 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
192 add y1, h # h = k + w + h + S0 # --
194 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
195 vpor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7
198 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
199 add y3, h # h = t1 + S0 + MAJ # --
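
The scalar computation threaded through the lines above is one standard SHA-256 round; note that the rorx comments write `a >> 13` as shorthand for a rotate, and that the vector side builds each rotate from a shift pair plus vpor/vpxor because AVX2 has no 32-bit vector rotate. The comments' S0/S1/CH/MAJ map onto this C sketch (a reference model, not the kernel's code):

    #include <stdint.h>

    static uint32_t ror32(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }

    /* One round: t1 = h + S1(e) + CH(e,f,g) + k + w; t2 = S0(a) + MAJ(a,b,c). */
    static void sha256_round(uint32_t s[8], uint32_t k, uint32_t w)
    {
        uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
        uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

        uint32_t S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
        uint32_t CH  = (e & f) ^ (~e & g);
        uint32_t t1  = h + S1 + CH + k + w;
        uint32_t S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
        uint32_t MAJ = ((a | c) & b) | (a & c);  /* form used in the comments */
        uint32_t t2  = S0 + MAJ;

        s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
        s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
    }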
206 mov a, y3 # y3 = a # MAJA
210 addl offset(%rsp, SRND), h # h = k + w + h # --
211 or c, y3 # y3 = a|c # MAJA
214 vpsrld $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
216 rorx $13, a, T1 # T1 = a >> 13 # S0B
223 rorx $22, a, y1 # y1 = a >> 22 # S0A
225 add h, d # d = k + w + h + d # --
227 vpslld $(32-18), XTMP1, XTMP1 # XTMP1 = W[-15] << (32-18)
228 and b, y3 # y3 = (a|c)&b # MAJA
229 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
232 rorx $2, a, T1 # T1 = (a >> 2) # S0
235 vpxor XTMP2, XTMP3, XTMP3 # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
236 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
237 mov a, T1 # T1 = a # MAJB
238 and c, T1 # T1 = a&c # MAJB
239 add y0, y2 # y2 = S1 + CH # --
242 vpshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
243 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
244 add y1, h # h = k + w + h + S0 # --
246 vpaddd XTMP1, XTMP0, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
247 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
248 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
249 add y3, h # h = t1 + S0 + MAJ # --
251 vpsrld $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
258 mov a, y3 # y3 = a # MAJA
261 addl offset(%rsp, SRND), h # h = k + w + h # --
263 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
265 or c, y3 # y3 = a|c # MAJA
269 rorx $13, a, T1 # T1 = a >> 13 # S0B
271 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
276 add h, d # d = k + w + h + d # --
277 and b, y3 # y3 = (a|c)&b # MAJA
280 rorx $22, a, y1 # y1 = a >> 22 # S0A
285 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
286 rorx $2, a, T1 # T1 = (a >> 2) # S0
289 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
290 mov a, T1 # T1 = a # MAJB
291 and c, T1 # T1 = a&c # MAJB
292 add y0, y2 # y2 = S1 + CH # --
293 vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
295 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
296 add y1, h # h = k + w + h + S0 # --
297 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
298 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
300 add y3, h # h = t1 + S0 + MAJ # --
307 mov a, y3 # y3 = a # MAJA
311 addl offset(%rsp, SRND), h # h = k + w + h # --
312 or c, y3 # y3 = a|c # MAJA
315 vpsrld $10, XTMP2, XTMP5 # XTMP5 = W[-2] >> 10 {DDCC}
317 rorx $13, a, T1 # T1 = a >> 13 # S0B
322 vpsrlq $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
325 add h, d # d = k + w + h + d # --
326 and b, y3 # y3 = (a|c)&b # MAJA
328 vpsrlq $17, XTMP2, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
333 rorx $22, a, y1 # y1 = a >> 22 # S0A
334 add y0, y2 # y2 = S1 + CH # --
337 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
338 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
340 rorx $2, a, T1 # T1 = (a >> 2) # S0
344 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
345 mov a, T1 # T1 = a # MAJB
346 and c, T1 # T1 = a&c # MAJB
347 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
349 add y1, h # h = k + w + h + S0 # --
350 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
351 add y3, h # h = t1 + S0 + MAJ # --
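
The {BBAA}/{DDCC} and {xBxA}/{xDxC} annotations above come from how sigma1's two rotates are realized: vpshufd duplicates each source dword into both halves of a qword, after which vpsrlq $17 and $19 leave the 32-bit rotate in the low dword of each qword (the plain W[-2] >> 10 term is done directly with vpsrld $10). Only alternate lanes are valid afterwards, which is what SHUF_00BA/SHUF_DC00 repack. A scalar model of the qword trick:

    #include <stdint.h>

    /* With x duplicated into both halves of a qword, a 64-bit logical right */
    /* shift by n (0 < n < 32) leaves ror32(x, n) in the low dword.          */
    static uint32_t ror_via_qword(uint32_t x, int n)
    {
        uint64_t q = ((uint64_t)x << 32) | x;
        return (uint32_t)(q >> n);
    }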
370 rorx $13, a, T1 # T1 = a >> 13 # S0B
372 rorx $22, a, y1 # y1 = a >> 22 # S0A
373 mov a, y3 # y3 = a # MAJA
375 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
376 rorx $2, a, T1 # T1 = (a >> 2) # S0
377 addl \disp(%rsp, SRND), h # h = k + w + h # --
378 or c, y3 # y3 = a|c # MAJA
380 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
381 mov a, T1 # T1 = a # MAJB
382 and b, y3 # y3 = (a|c)&b # MAJA
383 and c, T1 # T1 = a&c # MAJB
384 add y0, y2 # y2 = S1 + CH # --
387 add h, d # d = k + w + h + d # --
388 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
389 add y1, h # h = k + w + h + S0 # --
390 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
396 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
405 add y3, old_h # h = t1 + S0 + MAJ # --
408 rorx $13, a, T1 # T1 = a >> 13 # S0B
410 rorx $22, a, y1 # y1 = a >> 22 # S0A
411 mov a, y3 # y3 = a # MAJA
413 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
414 rorx $2, a, T1 # T1 = (a >> 2) # S0
416 addl offset(%rsp, SRND), h # h = k + w + h # --
417 or c, y3 # y3 = a|c # MAJA
419 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
420 mov a, T1 # T1 = a # MAJB
421 and b, y3 # y3 = (a|c)&b # MAJA
422 and c, T1 # T1 = a&c # MAJB
423 add y0, y2 # y2 = S1 + CH # --
426 add h, d # d = k + w + h + d # --
427 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
428 add y1, h # h = k + w + h + S0 # --
430 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
436 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
445 add y3, old_h # h = t1 + S0 + MAJ # --
448 rorx $13, a, T1 # T1 = a >> 13 # S0B
450 rorx $22, a, y1 # y1 = a >> 22 # S0A
451 mov a, y3 # y3 = a # MAJA
453 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
454 rorx $2, a, T1 # T1 = (a >> 2) # S0
456 addl offset(%rsp, SRND), h # h = k + w + h # --
457 or c, y3 # y3 = a|c # MAJA
459 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
460 mov a, T1 # T1 = a # MAJB
461 and b, y3 # y3 = (a|c)&b # MAJA
462 and c, T1 # T1 = a&c # MAJB
463 add y0, y2 # y2 = S1 + CH # --
466 add h, d # d = k + w + h + d # --
467 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
468 add y1, h # h = k + w + h + S0 # --
470 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
476 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
485 add y3, old_h # h = t1 + S0 + MAJ # --
488 rorx $13, a, T1 # T1 = a >> 13 # S0B
490 rorx $22, a, y1 # y1 = a >> 22 # S0A
491 mov a, y3 # y3 = a # MAJA
493 xor T1, y1 # y1 = (a>>22) ^ (a>>13) # S0
494 rorx $2, a, T1 # T1 = (a >> 2) # S0
496 addl offset(%rsp, SRND), h # h = k + w + h # --
497 or c, y3 # y3 = a|c # MAJA
499 xor T1, y1 # y1 = (a>>22) ^ (a>>13) ^ (a>>2) # S0
500 mov a, T1 # T1 = a # MAJB
501 and b, y3 # y3 = (a|c)&b # MAJA
502 and c, T1 # T1 = a&c # MAJB
503 add y0, y2 # y2 = S1 + CH # --
506 add h, d # d = k + w + h + d # --
507 or T1, y3 # y3 = MAJ = ((a|c)&b)|(a&c) # MAJ
508 add y1, h # h = k + w + h + S0 # --
510 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
513 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0 # --
515 add y3, h # h = t1 + S0 + MAJ # --
538 and $-32, %rsp # align rsp to 32 byte boundary
544 lea -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
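
Both prologue lines are plain address arithmetic: the and with -32 clears the low five bits of the stack pointer (rounding down to a 32-byte boundary for aligned YMM spills), and the lea forms a pointer to the start of the final 64-byte block, NUM_BLKS having been scaled to a byte count earlier in the prologue. A hedged C model (hypothetical names):

    #include <stdint.h>
    #include <stddef.h>

    static const uint8_t *last_block(uintptr_t sp, const uint8_t *inp,
                                     size_t nblocks, uintptr_t *aligned_sp)
    {
        *aligned_sp = sp & ~(uintptr_t)31;    /* and $-32, %rsp */
        return inp + (nblocks << 6) - 64;     /* lea -64(INP, NUM_BLKS) */
    }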
551 mov (CTX), a
558 mov 4*7(CTX), h
634 addm (4*0)(CTX), a
641 addm (4*7)(CTX), h
660 addm (4*0)(CTX), a
667 addm (4*7)(CTX), h
689 mov (4*0)(CTX), a
696 mov (4*7)(CTX), h
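
Lines 634-667 use the addm helper described at line 59 (a reg-mem add followed by a store) to fold each working variable back into the digest held in CTX, the per-block feed-forward, while lines 689-696 reload the state for the next pass. In C terms:

    #include <stdint.h>

    /* Equivalent of the addm calls: add each working variable into its     */
    /* state word in memory and store the sum back (feed-forward).          */
    static void feed_forward(uint32_t state[8], const uint32_t v[8])
    {
        for (int i = 0; i < 8; i++)
            state[i] += v[i];
    }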
758 # shuffle xBxA -> 00BA
764 # shuffle xDxC -> DC00