• Home
  • Raw
  • Download

Lines Matching refs:T1

393 my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("xmm$_",(0..7));
431 &movdqa ($T1,$D1);
433 &pslld ($T1,2);
435 &paddd ($T1,$D1); # *5
437 &movdqa (&QWP(16*5,"esp"),$T1);
439 &movdqa ($T1,$D3);
441 &pslld ($T1,2);
443 &paddd ($T1,$D3); # *5
445 &movdqa (&QWP(16*7,"esp"),$T1);
448 &pshufd ($T1,$D0,0b01000100);
454 &movdqa (&QWP(16*0,"edx"),$T1);
471 &pmuludq ($D0,$T1); # h0*r0
483 &movdqa ($T1,$T0);
485 &movdqa ($T2,$T1);
486 &pmuludq ($T1,&QWP(16*2,$base)); # r1*h2
490 &paddq ($D3,$T1);
491 &$load ($T1,5); # s1
494 &pmuludq ($T1,&QWP(16*4,$base)); # s1*h4
500 &paddq ($D0,$T1);
501 &movdqa ($T1,$T0);
505 &pmuludq ($T1,&QWP(16*0,$base)); # r2*h0
509 &paddq ($D2,$T1);
511 &$load ($T1,3); # r3^n
514 &movdqa ($T2,$T1);
515 &pmuludq ($T1,&QWP(16*1,$base)); # r3*h1
519 &paddq ($D4,$T1);
520 &movdqa ($T1,$T0);
523 &movdqa ($T2,$T1);
524 &pmuludq ($T1,&QWP(16*3,$base)); # s3*h3
528 &paddq ($D1,$T1);
530 &$load ($T1,8); # s4^n
533 &movdqa ($T2,$T1);
534 &pmuludq ($T1,&QWP(16*4,$base)); # s4*h4
538 &paddq ($D3,$T1);
539 &movdqa ($T1,$T0);
542 &pmuludq ($T1,&QWP(16*3,$base)); # s4*h3
545 &paddq ($D2,$T1);
565 &movdqa ($T1,$D0);
567 &psrlq ($T1,26);
569 &paddq ($T1,$D1); # h0 -> h1
572 &movdqa ($D1,$T1);
573 &psrlq ($T1,26);
579 &paddq ($T1,$D2); # h1 -> h2
582 &movdqa ($D2,$T1);
583 &psrlq ($T1,26);
585 &paddd ($T1,$D3); # h2 -> h3
588 &movdqa ($D3,$T1);
589 &psrlq ($T1,26);
593 &paddd ($D4,$T1); # h3 -> h4
631 &movdqa ($T1,$D1);
633 &pslld ($T1,2);
635 &paddd ($T1,$D1); # *5
637 &movdqu (&QWP(16*5,"edi"),$T1);
639 &movdqa ($T1,$D3);
641 &pslld ($T1,2);
643 &paddd ($T1,$D3); # *5
645 &movdqu (&QWP(16*7,"edi"),$T1);
728 &movdqu ($T1,&QWP(0,"esi")); # input
731 &movdqa ($T0,$T1); # -> base 2^26 ...
732 &pand ($T1,$MASK);
733 &paddd ($D0,$T1); # ... and accumulate
735 &movdqa ($T1,$T0);
737 &psrldq ($T1,6);
741 &movdqa ($T0,$T1);
742 &psrlq ($T1,4);
743 &pand ($T1,$MASK);
744 &paddd ($D2,$T1);
746 &movdqa ($T1,$T0);
749 &psrldq ($T1,7);
753 &paddd ($D4,$T1);
754 &movd ($T1,&DWP(16*0+12,"edi")); # r0
770 &pmuludq ($D0,$T1); # h4*r0
771 &pmuludq ($D1,$T1); # h3*r0
772 &pmuludq ($D2,$T1); # h2*r0
774 &pmuludq ($D3,$T1); # h1*r0
775 &pmuludq ($D4,$T1); # h0*r0
795 &pshufd ($T1,$T0,0b01000100); # duplicate r^3:r^4
798 &movdqa (&QWP(16*0,"edx"),$T1);
800 &movdqu ($T1,&QWP(16*1,"edi"));
802 &pshufd ($T0,$T1,0b01000100);
803 &pshufd ($T1,$T1,0b11101110);
806 &movdqa (&QWP(16*(1-9),"edx"),$T1);
807 &pshufd ($T1,$T0,0b01000100);
809 &movdqa (&QWP(16*2,"edx"),$T1);
810 &movdqu ($T1,&QWP(16*3,"edi"));
812 &pshufd ($T0,$T1,0b01000100);
813 &pshufd ($T1,$T1,0b11101110);
816 &movdqa (&QWP(16*(3-9),"edx"),$T1);
817 &pshufd ($T1,$T0,0b01000100);
819 &movdqa (&QWP(16*4,"edx"),$T1);
820 &movdqu ($T1,&QWP(16*5,"edi"));
822 &pshufd ($T0,$T1,0b01000100);
823 &pshufd ($T1,$T1,0b11101110);
826 &movdqa (&QWP(16*(5-9),"edx"),$T1);
827 &pshufd ($T1,$T0,0b01000100);
829 &movdqa (&QWP(16*6,"edx"),$T1);
830 &movdqu ($T1,&QWP(16*7,"edi"));
832 &pshufd ($T0,$T1,0b01000100);
833 &pshufd ($T1,$T1,0b11101110);
836 &movdqa (&QWP(16*(7-9),"edx"),$T1);
837 &pshufd ($T1,$T0,0b01000100);
839 &movdqa (&QWP(16*8,"edx"),$T1);
846 &movdqu ($T1,&QWP($inpbase+16,"esi"));
854 &movdqa ($D3,$T1);
859 &punpckhqdq ($D4,$T1); # 4
860 &punpcklqdq ($T0,$T1); # 0:1
865 &movdqa ($T1,$T0);
867 &psrlq ($T1,26);
869 &pand ($T1,$MASK); # 1
893 &movdqa (&QWP(16*1,"eax"),$T1);
907 &movdqa ($D0,$T1);
908 &pmuludq ($T1,$T2); # h1*r0
922 &paddq ($D1,$T1);
923 &movdqa ($T1,$T0);
927 &pmuludq ($T1,&$addr(4)); # h0*r4
932 &paddq ($D4,$T1);
933 &movdqa ($T1,$T0);
937 &pmuludq ($T1,&$addr(3)); # h1*r3
941 &paddq ($D4,$T1);
942 &movdqa ($T1,$T0);
946 &movdqa ($T2,$T1);
947 &pmuludq ($T1,&$addr(1)); # h2*r1
951 &paddq ($D3,$T1);
952 &movdqa ($T1,$T0);
955 &movdqa ($T2,$T1);
956 &pmuludq ($T1,&$addr(7)); # h3*s3
960 &paddq ($D1,$T1);
962 &movdqa ($T1,&QWP(16*4,"eax")); # pull h4
965 &movdqa ($T2,$T1);
966 &pmuludq ($T1,&$addr(8)); # h4*s4
970 &paddq ($D3,$T1);
971 &movdqa ($T1,$T0);
975 &pmuludq ($T1,&$addr(7)); # h4*s3
977 &paddq ($D2,$T1);
986 &paddd ($T1,&QWP(16*(5+1),"esp"));
996 &movdqa (&QWP(16*1,"eax"),$T1);
1011 &movdqa ($D0,$T1);
1012 &pmuludq ($T1,$T2); # h1*r0
1017 &paddq ($T1,&QWP(16*1,"esp"));
1039 &paddd ($T1,$D1);
1047 &movdqa (&QWP(16*1,"eax"),$T1);
1060 &pmuludq ($T1,$T2); # h1*r0
1065 &movdqa ($D1,$T1);
1078 &paddd ($T1,&QWP(16*6,"esp"));
1088 &movdqa (&QWP(16*1,"esp"),$T1);
1089 &pmuludq ($T1,$T2); # h1*r0
1093 &paddq ($D1,$T1);
1094 &movdqa ($T1,$D3);
1100 &movdqa (&QWP(16*3,"esp"),$T1);
1101 &movdqa ($T1,$D4);
1104 &movdqa (&QWP(16*4,"esp"),$T1);
1115 &pshufd ($T1,$D4,0b01001110);
1117 &paddq ($D4,$T1);
1119 &pshufd ($T1,$D0,0b01001110);
1121 &paddq ($D0,$T1);
1123 &pshufd ($T1,$D2,0b01001110);
1126 &lazy_reduction (sub { &paddq ($D2,$T1) });
1267 &vpslld ($T1,$D1,2);
1269 &vpaddd ($T1,$T1,$D1); # *5
1271 &vmovdqa (&QWP(16*5,"esp"),$T1);
1273 &vpslld ($T1,$D3,2);
1275 &vpaddd ($T1,$T1,$D3); # *5
1277 &vmovdqa (&QWP(16*7,"esp"),$T1);
1281 &vmovdqa ($T1,$D1);
1305 &vpmuludq ($T0,$T1,&QWP(16*3,"edx")); # r1*h3
1307 &vpmuludq ($T2,$T1,&QWP(16*2,"edx")); # r1*h2
1309 &vpmuludq ($T0,$T1,&QWP(16*1,"edx")); # r1*h1
1312 &vpmuludq ($T1,$T1,&QWP(16*0,"edx")); # r1*h0
1313 &vpaddq ($D1,$D1,$T1);
1318 &vpmuludq ($T1,$T0,&QWP(16*2,"edx")); # r2*h2
1319 &vpaddq ($D4,$D4,$T1);
1322 &vmovdqa ($T1,&QWP(16*6,"esp")); # s2
1325 &vpmuludq ($T2,$T1,&QWP(16*4,"edx")); # s2*h4
1328 &vpmuludq ($T1,$T1,&QWP(16*3,"edx")); # s2*h3
1329 &vpaddq ($D0,$D0,$T1);
1333 &vmovdqa ($T1,&QWP(16*7,"esp")); # s3
1336 &vpmuludq ($T2,$T1,&QWP(16*4,"edx")); # s3*h4
1338 &vpmuludq ($T0,$T1,&QWP(16*3,"edx")); # s3*h3
1341 &vpmuludq ($T1,$T1,&QWP(16*2,"edx")); # s3*h2
1342 &vpaddq ($D0,$D0,$T1);
1347 &vpmuludq ($T1,$T0,&QWP(16*4,"edx")); # s4*h4
1348 &vpaddq ($D3,$D3,$T1);
1351 &vpmuludq ($T1,$T0,&QWP(16*2,"edx")); # s4*h2
1352 &vpaddq ($D1,$D1,$T1);
1361 &vpsrlq ($T1,$D0,26);
1364 &vpaddq ($D1,$D1,$T1); # h0 -> h1
1367 &vpsrlq ($T1,$D1,26);
1369 &vpaddq ($D2,$D2,$T1); # h1 -> h2
1372 &vpsrlq ($T1,$D2,26);
1375 &vpaddd ($D3,$D3,$T1); # h2 -> h3
1376 &vpsrlq ($T1,$D3,26);
1381 &vpaddd ($D4,$D4,$T1); # h3 -> h4
1417 &vpslld ($T1,$D1,2);
1419 &vpaddd ($T1,$T1,$D1); # *5
1421 &vmovdqu (&QWP(16*5,"edi"),$T1);
1423 &vpslld ($T1,$D3,2);
1425 &vpaddd ($T1,$T1,$D3); # *5
1427 &vmovdqu (&QWP(16*7,"edi"),$T1);
1438 my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("ymm$_",(0..7));
1566 &vmovdqu (&X($T1),&QWP(16*1,"esi"));
1583 &vpxor ($T1,$T1,$T1);
1593 &vmovdqu (&X($T1),&QWP(16*1,"esi"));
1595 &vinserti128 ($T1,$T1,&QWP(16*3,"esi"),1);
1613 &vpsrldq ($D0,$T1,6);
1615 &vpunpckhqdq ($D1,$T0,$T1); # 4
1616 &vpunpcklqdq ($T0,$T0,$T1); # 0:1
1621 &vpsrlq ($T1,$T0,26);
1625 &vpand ($T1,$T1,$MASK); # 1
1639 &vpaddq ($T1,$T1,&QWP(32*1,"esp"));
1651 &vmovdqa (QWP(32*1,"esp"),$T1);
1661 &vpmuludq ($T1,$T0,&$addr(4)); # h0*r4
1662 &vpaddq ($D4,$D4,$T1); # d4 + h0*r4
1666 &vpmuludq ($T1,$T0,&$addr(1)); # h0*r1
1667 &vpaddq ($D1,$D1,$T1); # d1 += h0*r1
1671 &vpmuludq ($T1,$T2,&$addr(2)); # h1*r2
1672 &vpaddq ($D3,$D3,$T1); # d3 += h1*r2
1675 &vpmuludq ($T1,$T2,&$addr(8)); # h1*s4
1676 &vpaddq ($D0,$D0,$T1); # d0 += h1*s4
1677 &vmovdqa ($T1,&QWP(32*3,"esp")); # h3
1683 &vpmuludq ($T0,$T1,&$addr(0)); # h3*r0
1685 &vpmuludq ($T2,$T1,&$addr(1)); # h3*r1
1687 &vpmuludq ($T0,$T1,&$addr(6)); # h3*s2
1690 &vpmuludq ($T2,$T1,&$addr(7)); # h3*s3
1692 &vpmuludq ($T1,$T1,&$addr(8)); # h3*s4
1693 &vpaddq ($D2,$D2,$T1); # d2 += h3*s4
1697 &vpmuludq ($T1,$T0,&$addr(5)); # h4*s1
1698 &vpaddq ($D0,$D0,$T1); # d0 += h4*s1
1702 &vpmuludq ($T1,$T0,&$addr(6)); # h4*s2
1703 &vpaddq ($D1,$D1,$T1); # d1 += h4*s2
1715 &vpsrlq ($T1,$D0,26);
1718 &vpaddq ($D1,$D1,$T1); # h0 -> h1
1721 &vpsrlq ($T1,$D1,26);
1723 &vpaddq ($D2,$D2,$T1); # h1 -> h2
1726 &vpsrlq ($T1,$D2,26);
1729 &vpaddq ($D3,$D3,$T1); # h2 -> h3
1730 &vpsrlq ($T1,$D3,26);
1735 &vpaddq ($D4,$D4,$T1); # h3 -> h4
1740 &vmovdqu (&X($T1),&QWP(16*1,"esi"));
1742 &vinserti128 ($T1,$T1,&QWP(16*3,"esi"),1);
1757 &vpsrldq ($T1,$D3,8);
1760 &vpaddq ($D3,$D3,$T1);
1761 &vpsrldq ($T1,$D1,8);
1764 &vpaddq ($D1,$D1,$T1);
1765 &vpermq ($T1,$D4,2); # keep folding
1768 &vpaddq ($D4,$D4,$T1);
1769 &vpermq ($T1,$D0,2);
1772 &vpaddq ($D0,$D0,$T1);
1773 &vpermq ($T1,$D2,2);
1775 &vpaddq ($D2,$D2,$T1);