Lines Matching refs:SLOW
2 …s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
4 … %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW
6 … %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW
8 … %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW
44 ; SSE3-SLOW-LABEL: haddpd3:
45 ; SSE3-SLOW: # %bb.0:
46 ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
47 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
48 ; SSE3-SLOW-NEXT: addpd %xmm0, %xmm1
49 ; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
50 ; SSE3-SLOW-NEXT: retq
57 ; AVX-SLOW-LABEL: haddpd3:
58 ; AVX-SLOW: # %bb.0:
59 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
60 ; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
61 ; AVX-SLOW-NEXT: retq
154 ; SSE3-SLOW-LABEL: haddps6:
155 ; SSE3-SLOW: # %bb.0:
156 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
157 ; SSE3-SLOW-NEXT: addps %xmm1, %xmm0
158 ; SSE3-SLOW-NEXT: retq
165 ; AVX-SLOW-LABEL: haddps6:
166 ; AVX-SLOW: # %bb.0:
167 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
168 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
169 ; AVX-SLOW-NEXT: retq
214 ; SSE3-SLOW-LABEL: hsubpd2:
215 ; SSE3-SLOW: # %bb.0:
216 ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
217 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
218 ; SSE3-SLOW-NEXT: subpd %xmm1, %xmm0
219 ; SSE3-SLOW-NEXT: retq
226 ; AVX-SLOW-LABEL: hsubpd2:
227 ; AVX-SLOW: # %bb.0:
228 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
229 ; AVX-SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
230 ; AVX-SLOW-NEXT: retq
291 ; SSE3-SLOW-LABEL: hsubps4:
292 ; SSE3-SLOW: # %bb.0:
293 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
294 ; SSE3-SLOW-NEXT: subps %xmm1, %xmm0
295 ; SSE3-SLOW-NEXT: retq
302 ; AVX-SLOW-LABEL: hsubps4:
303 ; AVX-SLOW: # %bb.0:
304 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
305 ; AVX-SLOW-NEXT: vsubps %xmm1, %xmm0, %xmm0
306 ; AVX-SLOW-NEXT: retq
461 ; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
462 ; SSE3-SLOW: # %bb.0:
463 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
464 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
465 ; SSE3-SLOW-NEXT: retq
472 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
473 ; AVX-SLOW: # %bb.0:
474 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
475 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
476 ; AVX-SLOW-NEXT: retq
489 ; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
490 ; SSE3-SLOW: # %bb.0:
491 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
492 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
493 ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
494 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
495 ; SSE3-SLOW-NEXT: retq
503 ; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
504 ; AVX-SLOW: # %bb.0:
505 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
506 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
507 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
508 ; AVX-SLOW-NEXT: retq
522 ; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
523 ; SSE3-SLOW: # %bb.0:
524 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
525 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
526 ; SSE3-SLOW-NEXT: retq
533 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
534 ; AVX-SLOW: # %bb.0:
535 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
536 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
537 ; AVX-SLOW-NEXT: retq
550 ; SSE3-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
551 ; SSE3-SLOW: # %bb.0:
552 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
553 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
554 ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
555 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
556 ; SSE3-SLOW-NEXT: retq
564 ; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
565 ; AVX-SLOW: # %bb.0:
566 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
567 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
568 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
569 ; AVX-SLOW-NEXT: retq
583 ; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
584 ; SSE3-SLOW: # %bb.0:
585 ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
586 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
587 ; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
588 ; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
589 ; SSE3-SLOW-NEXT: retq
596 ; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
597 ; AVX-SLOW: # %bb.0:
598 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
599 ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
600 ; AVX-SLOW-NEXT: retq
613 ; SSE3-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
614 ; SSE3-SLOW: # %bb.0:
615 ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
616 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
617 ; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
618 ; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
619 ; SSE3-SLOW-NEXT: retq
626 ; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
627 ; AVX-SLOW: # %bb.0:
628 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
629 ; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
630 ; AVX-SLOW-NEXT: retq
643 ; SSE3-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
644 ; SSE3-SLOW: # %bb.0:
645 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
646 ; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
647 ; SSE3-SLOW-NEXT: retq
654 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
655 ; AVX-SLOW: # %bb.0:
656 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
657 ; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
658 ; AVX-SLOW-NEXT: retq
671 ; SSE3-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
672 ; SSE3-SLOW: # %bb.0:
673 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
674 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
675 ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
676 ; SSE3-SLOW-NEXT: subss %xmm0, %xmm1
677 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
678 ; SSE3-SLOW-NEXT: retq
686 ; AVX-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
687 ; AVX-SLOW: # %bb.0:
688 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
689 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
690 ; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
691 ; AVX-SLOW-NEXT: retq
745 ; SSE3-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
746 ; SSE3-SLOW: # %bb.0:
747 ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
748 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
749 ; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
750 ; SSE3-SLOW-NEXT: retq
757 ; AVX-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
758 ; AVX-SLOW: # %bb.0:
759 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
760 ; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
761 ; AVX-SLOW-NEXT: retq
796 ; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
797 ; SSE3-SLOW: # %bb.0:
798 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
799 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
800 ; SSE3-SLOW-NEXT: retq
807 ; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
808 ; AVX-SLOW: # %bb.0:
809 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
810 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
811 ; AVX-SLOW-NEXT: vzeroupper
812 ; AVX-SLOW-NEXT: retq
826 ; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
827 ; SSE3-SLOW: # %bb.0:
828 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
829 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
830 ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
831 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
832 ; SSE3-SLOW-NEXT: retq
840 ; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
841 ; AVX-SLOW: # %bb.0:
842 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
843 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
844 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
845 ; AVX-SLOW-NEXT: vzeroupper
846 ; AVX-SLOW-NEXT: retq
861 ; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
862 ; SSE3-SLOW: # %bb.0:
863 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
864 ; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
865 ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
866 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
867 ; SSE3-SLOW-NEXT: retq
875 ; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
876 ; AVX-SLOW: # %bb.0:
877 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
878 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
879 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
880 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
881 ; AVX-SLOW-NEXT: vzeroupper
882 ; AVX-SLOW-NEXT: retq
898 ; SSE3-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
899 ; SSE3-SLOW: # %bb.0:
900 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
901 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
902 ; SSE3-SLOW-NEXT: retq
909 ; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
910 ; AVX-SLOW: # %bb.0:
911 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
912 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
913 ; AVX-SLOW-NEXT: vzeroupper
914 ; AVX-SLOW-NEXT: retq
928 ; SSE3-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
929 ; SSE3-SLOW: # %bb.0:
930 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
931 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
932 ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
933 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
934 ; SSE3-SLOW-NEXT: retq
942 ; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
943 ; AVX-SLOW: # %bb.0:
944 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
945 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
946 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
947 ; AVX-SLOW-NEXT: vzeroupper
948 ; AVX-SLOW-NEXT: retq
963 ; SSE3-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
964 ; SSE3-SLOW: # %bb.0:
965 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
966 ; SSE3-SLOW-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
967 ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
968 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
969 ; SSE3-SLOW-NEXT: retq
977 ; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
978 ; AVX-SLOW: # %bb.0:
979 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
980 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
981 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
982 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
983 ; AVX-SLOW-NEXT: vzeroupper
984 ; AVX-SLOW-NEXT: retq
1000 ; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
1001 ; SSE3-SLOW: # %bb.0:
1002 ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
1003 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1004 ; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
1005 ; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
1006 ; SSE3-SLOW-NEXT: retq
1013 ; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
1014 ; AVX-SLOW: # %bb.0:
1015 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1016 ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1017 ; AVX-SLOW-NEXT: vzeroupper
1018 ; AVX-SLOW-NEXT: retq
1032 ; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
1033 ; SSE3-SLOW: # %bb.0:
1034 ; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
1035 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1036 ; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
1037 ; SSE3-SLOW-NEXT: retq
1045 ; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
1046 ; AVX-SLOW: # %bb.0:
1047 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
1048 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1049 ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1050 ; AVX-SLOW-NEXT: vzeroupper
1051 ; AVX-SLOW-NEXT: retq
1066 ; SSE3-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
1067 ; SSE3-SLOW: # %bb.0:
1068 ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
1069 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1070 ; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
1071 ; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
1072 ; SSE3-SLOW-NEXT: retq
1079 ; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
1080 ; AVX-SLOW: # %bb.0:
1081 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1082 ; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
1083 ; AVX-SLOW-NEXT: vzeroupper
1084 ; AVX-SLOW-NEXT: retq
1098 ; SSE3-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
1099 ; SSE3-SLOW: # %bb.0:
1100 ; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
1101 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
1102 ; SSE3-SLOW-NEXT: addsd %xmm1, %xmm0
1103 ; SSE3-SLOW-NEXT: retq
1111 ; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
1112 ; AVX-SLOW: # %bb.0:
1113 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
1114 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1115 ; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
1116 ; AVX-SLOW-NEXT: vzeroupper
1117 ; AVX-SLOW-NEXT: retq
1132 ; SSE3-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
1133 ; SSE3-SLOW: # %bb.0:
1134 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1135 ; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
1136 ; SSE3-SLOW-NEXT: retq
1143 ; AVX-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
1144 ; AVX-SLOW: # %bb.0:
1145 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1146 ; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
1147 ; AVX-SLOW-NEXT: vzeroupper
1148 ; AVX-SLOW-NEXT: retq
1162 ; SSE3-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
1163 ; SSE3-SLOW: # %bb.0:
1164 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
1165 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1166 ; SSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1167 ; SSE3-SLOW-NEXT: subss %xmm0, %xmm1
1168 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
1169 ; SSE3-SLOW-NEXT: retq
1177 ; AVX-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
1178 ; AVX-SLOW: # %bb.0:
1179 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1180 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1181 ; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
1182 ; AVX-SLOW-NEXT: vzeroupper
1183 ; AVX-SLOW-NEXT: retq
1198 ; SSE3-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
1199 ; SSE3-SLOW: # %bb.0:
1200 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
1201 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
1202 ; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
1203 ; SSE3-SLOW-NEXT: retq
1211 ; AVX-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
1212 ; AVX-SLOW: # %bb.0:
1213 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
1214 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1215 ; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
1216 ; AVX-SLOW-NEXT: vzeroupper
1217 ; AVX-SLOW-NEXT: retq
1254 ; SSE3-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
1255 ; SSE3-SLOW: # %bb.0:
1256 ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
1257 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1258 ; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
1259 ; SSE3-SLOW-NEXT: retq
1266 ; AVX-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
1267 ; AVX-SLOW: # %bb.0:
1268 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1269 ; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
1270 ; AVX-SLOW-NEXT: vzeroupper
1271 ; AVX-SLOW-NEXT: retq
1310 ; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
1311 ; SSE3-SLOW: # %bb.0:
1312 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1313 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
1314 ; SSE3-SLOW-NEXT: retq
1321 ; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
1322 ; AVX-SLOW: # %bb.0:
1323 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1324 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1325 ; AVX-SLOW-NEXT: vzeroupper
1326 ; AVX-SLOW-NEXT: retq
1340 ; SSE3-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
1341 ; SSE3-SLOW: # %bb.0:
1342 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1343 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
1344 ; SSE3-SLOW-NEXT: retq
1351 ; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
1352 ; AVX-SLOW: # %bb.0:
1353 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1354 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
1355 ; AVX-SLOW-NEXT: vzeroupper
1356 ; AVX-SLOW-NEXT: retq
1370 ; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
1371 ; SSE3-SLOW: # %bb.0:
1372 ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
1373 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1374 ; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
1375 ; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
1376 ; SSE3-SLOW-NEXT: retq
1383 ; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
1384 ; AVX-SLOW: # %bb.0:
1385 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1386 ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1387 ; AVX-SLOW-NEXT: vzeroupper
1388 ; AVX-SLOW-NEXT: retq
1402 ; SSE3-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
1403 ; SSE3-SLOW: # %bb.0:
1404 ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
1405 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1406 ; SSE3-SLOW-NEXT: addsd %xmm0, %xmm1
1407 ; SSE3-SLOW-NEXT: movapd %xmm1, %xmm0
1408 ; SSE3-SLOW-NEXT: retq
1415 ; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
1416 ; AVX-SLOW: # %bb.0:
1417 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1418 ; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
1419 ; AVX-SLOW-NEXT: vzeroupper
1420 ; AVX-SLOW-NEXT: retq
1434 ; SSE3-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
1435 ; SSE3-SLOW: # %bb.0:
1436 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1437 ; SSE3-SLOW-NEXT: subss %xmm1, %xmm0
1438 ; SSE3-SLOW-NEXT: retq
1445 ; AVX-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
1446 ; AVX-SLOW: # %bb.0:
1447 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1448 ; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
1449 ; AVX-SLOW-NEXT: vzeroupper
1450 ; AVX-SLOW-NEXT: retq
1484 ; SSE3-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
1485 ; SSE3-SLOW: # %bb.0:
1486 ; SSE3-SLOW-NEXT: movapd %xmm0, %xmm1
1487 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1488 ; SSE3-SLOW-NEXT: subsd %xmm1, %xmm0
1489 ; SSE3-SLOW-NEXT: retq
1496 ; AVX-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
1497 ; AVX-SLOW: # %bb.0:
1498 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1499 ; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
1500 ; AVX-SLOW-NEXT: vzeroupper
1501 ; AVX-SLOW-NEXT: retq
1538 ; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
1539 ; SSE3-SLOW: # %bb.0:
1540 ; SSE3-SLOW-NEXT: movss %xmm0, (%rdi)
1541 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1542 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
1543 ; SSE3-SLOW-NEXT: retq
1551 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
1552 ; AVX-SLOW: # %bb.0:
1553 ; AVX-SLOW-NEXT: vmovss %xmm0, (%rdi)
1554 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1555 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1556 ; AVX-SLOW-NEXT: retq
1571 ; SSE3-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
1572 ; SSE3-SLOW: # %bb.0:
1573 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1574 ; SSE3-SLOW-NEXT: movss %xmm1, (%rdi)
1575 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
1576 ; SSE3-SLOW-NEXT: retq
1585 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
1586 ; AVX-SLOW: # %bb.0:
1587 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1588 ; AVX-SLOW-NEXT: vmovss %xmm1, (%rdi)
1589 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1590 ; AVX-SLOW-NEXT: retq
1635 ; SSE3-SLOW-LABEL: fadd_reduce_v8f32:
1636 ; SSE3-SLOW: # %bb.0:
1637 ; SSE3-SLOW-NEXT: addps %xmm2, %xmm1
1638 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm2
1639 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
1640 ; SSE3-SLOW-NEXT: addps %xmm1, %xmm2
1641 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
1642 ; SSE3-SLOW-NEXT: addss %xmm2, %xmm1
1643 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
1644 ; SSE3-SLOW-NEXT: retq
1654 ; AVX-SLOW-LABEL: fadd_reduce_v8f32:
1655 ; AVX-SLOW: # %bb.0:
1656 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
1657 ; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
1658 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1659 ; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
1660 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1661 ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
1662 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1663 ; AVX-SLOW-NEXT: vzeroupper
1664 ; AVX-SLOW-NEXT: retq
1680 ; SSE3-SLOW-LABEL: fadd_reduce_v4f64:
1681 ; SSE3-SLOW: # %bb.0:
1682 ; SSE3-SLOW-NEXT: addpd %xmm2, %xmm1
1683 ; SSE3-SLOW-NEXT: movapd %xmm1, %xmm2
1684 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
1685 ; SSE3-SLOW-NEXT: addsd %xmm1, %xmm2
1686 ; SSE3-SLOW-NEXT: addsd %xmm2, %xmm0
1687 ; SSE3-SLOW-NEXT: retq
1696 ; AVX-SLOW-LABEL: fadd_reduce_v4f64:
1697 ; AVX-SLOW: # %bb.0:
1698 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
1699 ; AVX-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
1700 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1701 ; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
1702 ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1703 ; AVX-SLOW-NEXT: vzeroupper
1704 ; AVX-SLOW-NEXT: retq
1719 ; SSSE3-SLOW-LABEL: PR39936_v8f32:
1720 ; SSSE3-SLOW: # %bb.0:
1721 ; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
1722 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
1723 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
1724 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
1725 ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
1726 ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1727 ; SSSE3-SLOW-NEXT: addss %xmm1, %xmm0
1728 ; SSSE3-SLOW-NEXT: retq
1737 ; SSE3-SLOW-LABEL: PR39936_v8f32:
1738 ; SSE3-SLOW: # %bb.0:
1739 ; SSE3-SLOW-NEXT: haddps %xmm1, %xmm0
1740 ; SSE3-SLOW-NEXT: haddps %xmm0, %xmm0
1741 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1742 ; SSE3-SLOW-NEXT: addss %xmm1, %xmm0
1743 ; SSE3-SLOW-NEXT: retq
1752 ; AVX-SLOW-LABEL: PR39936_v8f32:
1753 ; AVX-SLOW: # %bb.0:
1754 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
1755 ; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
1756 ; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1757 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1758 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1759 ; AVX-SLOW-NEXT: vzeroupper
1760 ; AVX-SLOW-NEXT: retq
1783 ; SSE3-SLOW-LABEL: hadd32_4:
1784 ; SSE3-SLOW: # %bb.0:
1785 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
1786 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1787 ; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
1788 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1789 ; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
1790 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
1791 ; SSE3-SLOW-NEXT: retq
1802 ; AVX-SLOW-LABEL: hadd32_4:
1803 ; AVX-SLOW: # %bb.0:
1804 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1805 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
1806 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1807 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1808 ; AVX-SLOW-NEXT: retq
1825 ; SSE3-SLOW-LABEL: hadd32_8:
1826 ; SSE3-SLOW: # %bb.0:
1827 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
1828 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1829 ; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
1830 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1831 ; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
1832 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
1833 ; SSE3-SLOW-NEXT: retq
1844 ; AVX-SLOW-LABEL: hadd32_8:
1845 ; AVX-SLOW: # %bb.0:
1846 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1847 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
1848 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1849 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1850 ; AVX-SLOW-NEXT: vzeroupper
1851 ; AVX-SLOW-NEXT: retq
1869 ; SSE3-SLOW-LABEL: hadd32_16:
1870 ; SSE3-SLOW: # %bb.0:
1871 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
1872 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1873 ; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
1874 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1875 ; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
1876 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
1877 ; SSE3-SLOW-NEXT: retq
1888 ; AVX-SLOW-LABEL: hadd32_16:
1889 ; AVX-SLOW: # %bb.0:
1890 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1891 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
1892 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1893 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1894 ; AVX-SLOW-NEXT: vzeroupper
1895 ; AVX-SLOW-NEXT: retq
2061 ; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32:
2062 ; SSE3-SLOW: # %bb.0:
2063 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
2064 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2065 ; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
2066 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
2067 ; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
2068 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
2069 ; SSE3-SLOW-NEXT: retq
2080 ; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32:
2081 ; AVX-SLOW: # %bb.0:
2082 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2083 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
2084 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2085 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
2086 ; AVX-SLOW-NEXT: vzeroupper
2087 ; AVX-SLOW-NEXT: retq
2107 ; SSE3-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
2108 ; SSE3-SLOW: # %bb.0:
2109 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
2110 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2111 ; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
2112 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
2113 ; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
2114 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
2115 ; SSE3-SLOW-NEXT: retq
2126 ; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
2127 ; AVX-SLOW: # %bb.0:
2128 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2129 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
2130 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2131 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
2132 ; AVX-SLOW-NEXT: vzeroupper
2133 ; AVX-SLOW-NEXT: retq
2151 ; SSE3-SLOW-LABEL: partial_reduction_fadd_v16f32:
2152 ; SSE3-SLOW: # %bb.0:
2153 ; SSE3-SLOW-NEXT: movaps %xmm0, %xmm1
2154 ; SSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2155 ; SSE3-SLOW-NEXT: addps %xmm0, %xmm1
2156 ; SSE3-SLOW-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
2157 ; SSE3-SLOW-NEXT: addss %xmm0, %xmm1
2158 ; SSE3-SLOW-NEXT: movaps %xmm1, %xmm0
2159 ; SSE3-SLOW-NEXT: retq
2170 ; AVX-SLOW-LABEL: partial_reduction_fadd_v16f32:
2171 ; AVX-SLOW: # %bb.0:
2172 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2173 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
2174 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2175 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
2176 ; AVX-SLOW-NEXT: vzeroupper
2177 ; AVX-SLOW-NEXT: retq