Lines Matching refs:AVX
4 … %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW
5 … %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
6 … %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW
7 … %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
8 … %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW
9 … %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
17 ; AVX-LABEL: haddpd1:
18 ; AVX: # %bb.0:
19 ; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
20 ; AVX-NEXT: retq
33 ; AVX-LABEL: haddpd2:
34 ; AVX: # %bb.0:
35 ; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
36 ; AVX-NEXT: retq
57 ; AVX-SLOW-LABEL: haddpd3:
58 ; AVX-SLOW: # %bb.0:
59 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
60 ; AVX-SLOW-NEXT: vaddpd %xmm1, %xmm0, %xmm0
61 ; AVX-SLOW-NEXT: retq
63 ; AVX-FAST-LABEL: haddpd3:
64 ; AVX-FAST: # %bb.0:
65 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
66 ; AVX-FAST-NEXT: retq
79 ; AVX-LABEL: haddps1:
80 ; AVX: # %bb.0:
81 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
82 ; AVX-NEXT: retq
95 ; AVX-LABEL: haddps2:
96 ; AVX: # %bb.0:
97 ; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
98 ; AVX-NEXT: retq
111 ; AVX-LABEL: haddps3:
112 ; AVX: # %bb.0:
113 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
114 ; AVX-NEXT: retq
127 ; AVX-LABEL: haddps4:
128 ; AVX: # %bb.0:
129 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
130 ; AVX-NEXT: retq
143 ; AVX-LABEL: haddps5:
144 ; AVX: # %bb.0:
145 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
146 ; AVX-NEXT: retq
165 ; AVX-SLOW-LABEL: haddps6:
166 ; AVX-SLOW: # %bb.0:
167 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
168 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
169 ; AVX-SLOW-NEXT: retq
171 ; AVX-FAST-LABEL: haddps6:
172 ; AVX-FAST: # %bb.0:
173 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
174 ; AVX-FAST-NEXT: retq
187 ; AVX-LABEL: haddps7:
188 ; AVX: # %bb.0:
189 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
190 ; AVX-NEXT: retq
203 ; AVX-LABEL: hsubpd1:
204 ; AVX: # %bb.0:
205 ; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
206 ; AVX-NEXT: retq
226 ; AVX-SLOW-LABEL: hsubpd2:
227 ; AVX-SLOW: # %bb.0:
228 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
229 ; AVX-SLOW-NEXT: vsubpd %xmm1, %xmm0, %xmm0
230 ; AVX-SLOW-NEXT: retq
232 ; AVX-FAST-LABEL: hsubpd2:
233 ; AVX-FAST: # %bb.0:
234 ; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
235 ; AVX-FAST-NEXT: retq
248 ; AVX-LABEL: hsubps1:
249 ; AVX: # %bb.0:
250 ; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
251 ; AVX-NEXT: retq
264 ; AVX-LABEL: hsubps2:
265 ; AVX: # %bb.0:
266 ; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
267 ; AVX-NEXT: retq
280 ; AVX-LABEL: hsubps3:
281 ; AVX: # %bb.0:
282 ; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
283 ; AVX-NEXT: retq
302 ; AVX-SLOW-LABEL: hsubps4:
303 ; AVX-SLOW: # %bb.0:
304 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
305 ; AVX-SLOW-NEXT: vsubps %xmm1, %xmm0, %xmm0
306 ; AVX-SLOW-NEXT: retq
308 ; AVX-FAST-LABEL: hsubps4:
309 ; AVX-FAST: # %bb.0:
310 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
311 ; AVX-FAST-NEXT: retq
325 ; AVX-LABEL: vhaddps1:
326 ; AVX: # %bb.0:
327 ; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
328 ; AVX-NEXT: retq
342 ; AVX-LABEL: vhaddps2:
343 ; AVX: # %bb.0:
344 ; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
345 ; AVX-NEXT: retq
359 ; AVX-LABEL: vhaddps3:
360 ; AVX: # %bb.0:
361 ; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
362 ; AVX-NEXT: retq
376 ; AVX-LABEL: vhsubps1:
377 ; AVX: # %bb.0:
378 ; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
379 ; AVX-NEXT: retq
393 ; AVX-LABEL: vhsubps3:
394 ; AVX: # %bb.0:
395 ; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0
396 ; AVX-NEXT: retq
410 ; AVX-LABEL: vhaddpd1:
411 ; AVX: # %bb.0:
412 ; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
413 ; AVX-NEXT: retq
427 ; AVX-LABEL: vhsubpd1:
428 ; AVX: # %bb.0:
429 ; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
430 ; AVX-NEXT: retq
443 ; AVX-LABEL: haddps_v2f32:
444 ; AVX: # %bb.0:
445 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
446 ; AVX-NEXT: retq
472 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32:
473 ; AVX-SLOW: # %bb.0:
474 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
475 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
476 ; AVX-SLOW-NEXT: retq
478 ; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32:
479 ; AVX-FAST: # %bb.0:
480 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
481 ; AVX-FAST-NEXT: retq
503 ; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32:
504 ; AVX-SLOW: # %bb.0:
505 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
506 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
507 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
508 ; AVX-SLOW-NEXT: retq
510 ; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32:
511 ; AVX-FAST: # %bb.0:
512 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
513 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
514 ; AVX-FAST-NEXT: retq
533 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_commute:
534 ; AVX-SLOW: # %bb.0:
535 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
536 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
537 ; AVX-SLOW-NEXT: retq
539 ; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_commute:
540 ; AVX-FAST: # %bb.0:
541 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
542 ; AVX-FAST-NEXT: retq
564 ; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute:
565 ; AVX-SLOW: # %bb.0:
566 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
567 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
568 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
569 ; AVX-SLOW-NEXT: retq
571 ; AVX-FAST-LABEL: extract_extract23_v4f32_fadd_f32_commute:
572 ; AVX-FAST: # %bb.0:
573 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
574 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
575 ; AVX-FAST-NEXT: retq
596 ; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64:
597 ; AVX-SLOW: # %bb.0:
598 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
599 ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
600 ; AVX-SLOW-NEXT: retq
602 ; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64:
603 ; AVX-FAST: # %bb.0:
604 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
605 ; AVX-FAST-NEXT: retq
626 ; AVX-SLOW-LABEL: extract_extract01_v2f64_fadd_f64_commute:
627 ; AVX-SLOW: # %bb.0:
628 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
629 ; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
630 ; AVX-SLOW-NEXT: retq
632 ; AVX-FAST-LABEL: extract_extract01_v2f64_fadd_f64_commute:
633 ; AVX-FAST: # %bb.0:
634 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
635 ; AVX-FAST-NEXT: retq
654 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fsub_f32:
655 ; AVX-SLOW: # %bb.0:
656 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
657 ; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
658 ; AVX-SLOW-NEXT: retq
660 ; AVX-FAST-LABEL: extract_extract01_v4f32_fsub_f32:
661 ; AVX-FAST: # %bb.0:
662 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
663 ; AVX-FAST-NEXT: retq
686 ; AVX-SLOW-LABEL: extract_extract23_v4f32_fsub_f32:
687 ; AVX-SLOW: # %bb.0:
688 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
689 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
690 ; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
691 ; AVX-SLOW-NEXT: retq
693 ; AVX-FAST-LABEL: extract_extract23_v4f32_fsub_f32:
694 ; AVX-FAST: # %bb.0:
695 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
696 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
697 ; AVX-FAST-NEXT: retq
712 ; AVX-LABEL: extract_extract01_v4f32_fsub_f32_commute:
713 ; AVX: # %bb.0:
714 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
715 ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
716 ; AVX-NEXT: retq
732 ; AVX-LABEL: extract_extract23_v4f32_fsub_f32_commute:
733 ; AVX: # %bb.0:
734 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
735 ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
736 ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
737 ; AVX-NEXT: retq
757 ; AVX-SLOW-LABEL: extract_extract01_v2f64_fsub_f64:
758 ; AVX-SLOW: # %bb.0:
759 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
760 ; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
761 ; AVX-SLOW-NEXT: retq
763 ; AVX-FAST-LABEL: extract_extract01_v2f64_fsub_f64:
764 ; AVX-FAST: # %bb.0:
765 ; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
766 ; AVX-FAST-NEXT: retq
782 ; AVX-LABEL: extract_extract01_v2f64_fsub_f64_commute:
783 ; AVX: # %bb.0:
784 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
785 ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
786 ; AVX-NEXT: retq
807 ; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32:
808 ; AVX-SLOW: # %bb.0:
809 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
810 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
811 ; AVX-SLOW-NEXT: vzeroupper
812 ; AVX-SLOW-NEXT: retq
814 ; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32:
815 ; AVX-FAST: # %bb.0:
816 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
817 ; AVX-FAST-NEXT: vzeroupper
818 ; AVX-FAST-NEXT: retq
840 ; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32:
841 ; AVX-SLOW: # %bb.0:
842 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
843 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
844 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
845 ; AVX-SLOW-NEXT: vzeroupper
846 ; AVX-SLOW-NEXT: retq
848 ; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32:
849 ; AVX-FAST: # %bb.0:
850 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
851 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
852 ; AVX-FAST-NEXT: vzeroupper
853 ; AVX-FAST-NEXT: retq
875 ; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32:
876 ; AVX-SLOW: # %bb.0:
877 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
878 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
879 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
880 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
881 ; AVX-SLOW-NEXT: vzeroupper
882 ; AVX-SLOW-NEXT: retq
884 ; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32:
885 ; AVX-FAST: # %bb.0:
886 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
887 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
888 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
889 ; AVX-FAST-NEXT: vzeroupper
890 ; AVX-FAST-NEXT: retq
909 ; AVX-SLOW-LABEL: extract_extract01_v8f32_fadd_f32_commute:
910 ; AVX-SLOW: # %bb.0:
911 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
912 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
913 ; AVX-SLOW-NEXT: vzeroupper
914 ; AVX-SLOW-NEXT: retq
916 ; AVX-FAST-LABEL: extract_extract01_v8f32_fadd_f32_commute:
917 ; AVX-FAST: # %bb.0:
918 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
919 ; AVX-FAST-NEXT: vzeroupper
920 ; AVX-FAST-NEXT: retq
942 ; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute:
943 ; AVX-SLOW: # %bb.0:
944 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
945 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
946 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
947 ; AVX-SLOW-NEXT: vzeroupper
948 ; AVX-SLOW-NEXT: retq
950 ; AVX-FAST-LABEL: extract_extract23_v8f32_fadd_f32_commute:
951 ; AVX-FAST: # %bb.0:
952 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
953 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
954 ; AVX-FAST-NEXT: vzeroupper
955 ; AVX-FAST-NEXT: retq
977 ; AVX-SLOW-LABEL: extract_extract67_v8f32_fadd_f32_commute:
978 ; AVX-SLOW: # %bb.0:
979 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
980 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
981 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
982 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
983 ; AVX-SLOW-NEXT: vzeroupper
984 ; AVX-SLOW-NEXT: retq
986 ; AVX-FAST-LABEL: extract_extract67_v8f32_fadd_f32_commute:
987 ; AVX-FAST: # %bb.0:
988 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
989 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
990 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
991 ; AVX-FAST-NEXT: vzeroupper
992 ; AVX-FAST-NEXT: retq
1013 ; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64:
1014 ; AVX-SLOW: # %bb.0:
1015 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1016 ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1017 ; AVX-SLOW-NEXT: vzeroupper
1018 ; AVX-SLOW-NEXT: retq
1020 ; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64:
1021 ; AVX-FAST: # %bb.0:
1022 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
1023 ; AVX-FAST-NEXT: vzeroupper
1024 ; AVX-FAST-NEXT: retq
1045 ; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
1046 ; AVX-SLOW: # %bb.0:
1047 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
1048 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1049 ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1050 ; AVX-SLOW-NEXT: vzeroupper
1051 ; AVX-SLOW-NEXT: retq
1053 ; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
1054 ; AVX-FAST: # %bb.0:
1055 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
1056 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
1057 ; AVX-FAST-NEXT: vzeroupper
1058 ; AVX-FAST-NEXT: retq
1079 ; AVX-SLOW-LABEL: extract_extract01_v4f64_fadd_f64_commute:
1080 ; AVX-SLOW: # %bb.0:
1081 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1082 ; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
1083 ; AVX-SLOW-NEXT: vzeroupper
1084 ; AVX-SLOW-NEXT: retq
1086 ; AVX-FAST-LABEL: extract_extract01_v4f64_fadd_f64_commute:
1087 ; AVX-FAST: # %bb.0:
1088 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
1089 ; AVX-FAST-NEXT: vzeroupper
1090 ; AVX-FAST-NEXT: retq
1111 ; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
1112 ; AVX-SLOW: # %bb.0:
1113 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
1114 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1115 ; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
1116 ; AVX-SLOW-NEXT: vzeroupper
1117 ; AVX-SLOW-NEXT: retq
1119 ; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
1120 ; AVX-FAST: # %bb.0:
1121 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
1122 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
1123 ; AVX-FAST-NEXT: vzeroupper
1124 ; AVX-FAST-NEXT: retq
1143 ; AVX-SLOW-LABEL: extract_extract01_v8f32_fsub_f32:
1144 ; AVX-SLOW: # %bb.0:
1145 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1146 ; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
1147 ; AVX-SLOW-NEXT: vzeroupper
1148 ; AVX-SLOW-NEXT: retq
1150 ; AVX-FAST-LABEL: extract_extract01_v8f32_fsub_f32:
1151 ; AVX-FAST: # %bb.0:
1152 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
1153 ; AVX-FAST-NEXT: vzeroupper
1154 ; AVX-FAST-NEXT: retq
1177 ; AVX-SLOW-LABEL: extract_extract23_v8f32_fsub_f32:
1178 ; AVX-SLOW: # %bb.0:
1179 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1180 ; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
1181 ; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0
1182 ; AVX-SLOW-NEXT: vzeroupper
1183 ; AVX-SLOW-NEXT: retq
1185 ; AVX-FAST-LABEL: extract_extract23_v8f32_fsub_f32:
1186 ; AVX-FAST: # %bb.0:
1187 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
1188 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1189 ; AVX-FAST-NEXT: vzeroupper
1190 ; AVX-FAST-NEXT: retq
1211 ; AVX-SLOW-LABEL: extract_extract45_v8f32_fsub_f32:
1212 ; AVX-SLOW: # %bb.0:
1213 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
1214 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1215 ; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
1216 ; AVX-SLOW-NEXT: vzeroupper
1217 ; AVX-SLOW-NEXT: retq
1219 ; AVX-FAST-LABEL: extract_extract45_v8f32_fsub_f32:
1220 ; AVX-FAST: # %bb.0:
1221 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
1222 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
1223 ; AVX-FAST-NEXT: vzeroupper
1224 ; AVX-FAST-NEXT: retq
1241 ; AVX-LABEL: extract_extract01_v8f32_fsub_f32_commute:
1242 ; AVX: # %bb.0:
1243 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1244 ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
1245 ; AVX-NEXT: vzeroupper
1246 ; AVX-NEXT: retq
1266 ; AVX-SLOW-LABEL: extract_extract01_v4f64_fsub_f64:
1267 ; AVX-SLOW: # %bb.0:
1268 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1269 ; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
1270 ; AVX-SLOW-NEXT: vzeroupper
1271 ; AVX-SLOW-NEXT: retq
1273 ; AVX-FAST-LABEL: extract_extract01_v4f64_fsub_f64:
1274 ; AVX-FAST: # %bb.0:
1275 ; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
1276 ; AVX-FAST-NEXT: vzeroupper
1277 ; AVX-FAST-NEXT: retq
1295 ; AVX-LABEL: extract_extract01_v4f64_fsub_f64_commute:
1296 ; AVX: # %bb.0:
1297 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1298 ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
1299 ; AVX-NEXT: vzeroupper
1300 ; AVX-NEXT: retq
1321 ; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32:
1322 ; AVX-SLOW: # %bb.0:
1323 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1324 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1325 ; AVX-SLOW-NEXT: vzeroupper
1326 ; AVX-SLOW-NEXT: retq
1328 ; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32:
1329 ; AVX-FAST: # %bb.0:
1330 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1331 ; AVX-FAST-NEXT: vzeroupper
1332 ; AVX-FAST-NEXT: retq
1351 ; AVX-SLOW-LABEL: extract_extract01_v16f32_fadd_f32_commute:
1352 ; AVX-SLOW: # %bb.0:
1353 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1354 ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0
1355 ; AVX-SLOW-NEXT: vzeroupper
1356 ; AVX-SLOW-NEXT: retq
1358 ; AVX-FAST-LABEL: extract_extract01_v16f32_fadd_f32_commute:
1359 ; AVX-FAST: # %bb.0:
1360 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1361 ; AVX-FAST-NEXT: vzeroupper
1362 ; AVX-FAST-NEXT: retq
1383 ; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64:
1384 ; AVX-SLOW: # %bb.0:
1385 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1386 ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1387 ; AVX-SLOW-NEXT: vzeroupper
1388 ; AVX-SLOW-NEXT: retq
1390 ; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64:
1391 ; AVX-FAST: # %bb.0:
1392 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
1393 ; AVX-FAST-NEXT: vzeroupper
1394 ; AVX-FAST-NEXT: retq
1415 ; AVX-SLOW-LABEL: extract_extract01_v8f64_fadd_f64_commute:
1416 ; AVX-SLOW: # %bb.0:
1417 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1418 ; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
1419 ; AVX-SLOW-NEXT: vzeroupper
1420 ; AVX-SLOW-NEXT: retq
1422 ; AVX-FAST-LABEL: extract_extract01_v8f64_fadd_f64_commute:
1423 ; AVX-FAST: # %bb.0:
1424 ; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
1425 ; AVX-FAST-NEXT: vzeroupper
1426 ; AVX-FAST-NEXT: retq
1445 ; AVX-SLOW-LABEL: extract_extract01_v16f32_fsub_f32:
1446 ; AVX-SLOW: # %bb.0:
1447 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1448 ; AVX-SLOW-NEXT: vsubss %xmm1, %xmm0, %xmm0
1449 ; AVX-SLOW-NEXT: vzeroupper
1450 ; AVX-SLOW-NEXT: retq
1452 ; AVX-FAST-LABEL: extract_extract01_v16f32_fsub_f32:
1453 ; AVX-FAST: # %bb.0:
1454 ; AVX-FAST-NEXT: vhsubps %xmm0, %xmm0, %xmm0
1455 ; AVX-FAST-NEXT: vzeroupper
1456 ; AVX-FAST-NEXT: retq
1471 ; AVX-LABEL: extract_extract01_v16f32_fsub_f32_commute:
1472 ; AVX: # %bb.0:
1473 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1474 ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
1475 ; AVX-NEXT: vzeroupper
1476 ; AVX-NEXT: retq
1496 ; AVX-SLOW-LABEL: extract_extract01_v8f64_fsub_f64:
1497 ; AVX-SLOW: # %bb.0:
1498 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1499 ; AVX-SLOW-NEXT: vsubsd %xmm1, %xmm0, %xmm0
1500 ; AVX-SLOW-NEXT: vzeroupper
1501 ; AVX-SLOW-NEXT: retq
1503 ; AVX-FAST-LABEL: extract_extract01_v8f64_fsub_f64:
1504 ; AVX-FAST: # %bb.0:
1505 ; AVX-FAST-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
1506 ; AVX-FAST-NEXT: vzeroupper
1507 ; AVX-FAST-NEXT: retq
1523 ; AVX-LABEL: extract_extract01_v8f64_fsub_f64_commute:
1524 ; AVX: # %bb.0:
1525 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1526 ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
1527 ; AVX-NEXT: vzeroupper
1528 ; AVX-NEXT: retq
1551 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
1552 ; AVX-SLOW: # %bb.0:
1553 ; AVX-SLOW-NEXT: vmovss %xmm0, (%rdi)
1554 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1555 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1556 ; AVX-SLOW-NEXT: retq
1558 ; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses1:
1559 ; AVX-FAST: # %bb.0:
1560 ; AVX-FAST-NEXT: vmovss %xmm0, (%rdi)
1561 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1562 ; AVX-FAST-NEXT: retq
1585 ; AVX-SLOW-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
1586 ; AVX-SLOW: # %bb.0:
1587 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1588 ; AVX-SLOW-NEXT: vmovss %xmm1, (%rdi)
1589 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1590 ; AVX-SLOW-NEXT: retq
1592 ; AVX-FAST-LABEL: extract_extract01_v4f32_fadd_f32_uses2:
1593 ; AVX-FAST: # %bb.0:
1594 ; AVX-FAST-NEXT: vextractps $1, %xmm0, (%rdi)
1595 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1596 ; AVX-FAST-NEXT: retq
1613 ; AVX-LABEL: extract_extract01_v4f32_fadd_f32_uses3:
1614 ; AVX: # %bb.0:
1615 ; AVX-NEXT: vmovss %xmm0, (%rdi)
1616 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1617 ; AVX-NEXT: vmovss %xmm1, (%rsi)
1618 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
1619 ; AVX-NEXT: retq
1654 ; AVX-SLOW-LABEL: fadd_reduce_v8f32:
1655 ; AVX-SLOW: # %bb.0:
1656 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
1657 ; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
1658 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1659 ; AVX-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
1660 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1661 ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1
1662 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1663 ; AVX-SLOW-NEXT: vzeroupper
1664 ; AVX-SLOW-NEXT: retq
1666 ; AVX-FAST-LABEL: fadd_reduce_v8f32:
1667 ; AVX-FAST: # %bb.0:
1668 ; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
1669 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm2, %xmm1
1670 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
1671 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
1672 ; AVX-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0
1673 ; AVX-FAST-NEXT: vzeroupper
1674 ; AVX-FAST-NEXT: retq
1696 ; AVX-SLOW-LABEL: fadd_reduce_v4f64:
1697 ; AVX-SLOW: # %bb.0:
1698 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
1699 ; AVX-SLOW-NEXT: vaddpd %xmm2, %xmm1, %xmm1
1700 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
1701 ; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
1702 ; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1703 ; AVX-SLOW-NEXT: vzeroupper
1704 ; AVX-SLOW-NEXT: retq
1706 ; AVX-FAST-LABEL: fadd_reduce_v4f64:
1707 ; AVX-FAST: # %bb.0:
1708 ; AVX-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
1709 ; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm2, %xmm1
1710 ; AVX-FAST-NEXT: vhaddpd %xmm1, %xmm1, %xmm1
1711 ; AVX-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
1712 ; AVX-FAST-NEXT: vzeroupper
1713 ; AVX-FAST-NEXT: retq
1752 ; AVX-SLOW-LABEL: PR39936_v8f32:
1753 ; AVX-SLOW: # %bb.0:
1754 ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
1755 ; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
1756 ; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1757 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1758 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1759 ; AVX-SLOW-NEXT: vzeroupper
1760 ; AVX-SLOW-NEXT: retq
1762 ; AVX-FAST-LABEL: PR39936_v8f32:
1763 ; AVX-FAST: # %bb.0:
1764 ; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
1765 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
1766 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1767 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1768 ; AVX-FAST-NEXT: vzeroupper
1769 ; AVX-FAST-NEXT: retq
1802 ; AVX-SLOW-LABEL: hadd32_4:
1803 ; AVX-SLOW: # %bb.0:
1804 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1805 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
1806 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1807 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1808 ; AVX-SLOW-NEXT: retq
1810 ; AVX-FAST-LABEL: hadd32_4:
1811 ; AVX-FAST: # %bb.0:
1812 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1813 ; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
1814 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1815 ; AVX-FAST-NEXT: retq
1844 ; AVX-SLOW-LABEL: hadd32_8:
1845 ; AVX-SLOW: # %bb.0:
1846 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1847 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
1848 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1849 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1850 ; AVX-SLOW-NEXT: vzeroupper
1851 ; AVX-SLOW-NEXT: retq
1853 ; AVX-FAST-LABEL: hadd32_8:
1854 ; AVX-FAST: # %bb.0:
1855 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1856 ; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
1857 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1858 ; AVX-FAST-NEXT: vzeroupper
1859 ; AVX-FAST-NEXT: retq
1888 ; AVX-SLOW-LABEL: hadd32_16:
1889 ; AVX-SLOW: # %bb.0:
1890 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1891 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
1892 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1893 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
1894 ; AVX-SLOW-NEXT: vzeroupper
1895 ; AVX-SLOW-NEXT: retq
1897 ; AVX-FAST-LABEL: hadd32_16:
1898 ; AVX-FAST: # %bb.0:
1899 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1900 ; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
1901 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1902 ; AVX-FAST-NEXT: vzeroupper
1903 ; AVX-FAST-NEXT: retq
1922 ; AVX-LABEL: hadd32_4_optsize:
1923 ; AVX: # %bb.0:
1924 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1925 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
1926 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1927 ; AVX-NEXT: retq
1946 ; AVX-LABEL: hadd32_8_optsize:
1947 ; AVX: # %bb.0:
1948 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1949 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
1950 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1951 ; AVX-NEXT: vzeroupper
1952 ; AVX-NEXT: retq
1971 ; AVX-LABEL: hadd32_16_optsize:
1972 ; AVX: # %bb.0:
1973 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1974 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
1975 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
1976 ; AVX-NEXT: vzeroupper
1977 ; AVX-NEXT: retq
1996 ; AVX-LABEL: hadd32_4_pgso:
1997 ; AVX: # %bb.0:
1998 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
1999 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
2000 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
2001 ; AVX-NEXT: retq
2020 ; AVX-LABEL: hadd32_8_pgso:
2021 ; AVX: # %bb.0:
2022 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2023 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
2024 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
2025 ; AVX-NEXT: vzeroupper
2026 ; AVX-NEXT: retq
2045 ; AVX-LABEL: hadd32_16_pgso:
2046 ; AVX: # %bb.0:
2047 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2048 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
2049 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
2050 ; AVX-NEXT: vzeroupper
2051 ; AVX-NEXT: retq
2080 ; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32:
2081 ; AVX-SLOW: # %bb.0:
2082 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2083 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
2084 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2085 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
2086 ; AVX-SLOW-NEXT: vzeroupper
2087 ; AVX-SLOW-NEXT: retq
2089 ; AVX-FAST-LABEL: partial_reduction_fadd_v8f32:
2090 ; AVX-FAST: # %bb.0:
2091 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
2092 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
2093 ; AVX-FAST-NEXT: vzeroupper
2094 ; AVX-FAST-NEXT: retq
2126 ; AVX-SLOW-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
2127 ; AVX-SLOW: # %bb.0:
2128 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2129 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
2130 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2131 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
2132 ; AVX-SLOW-NEXT: vzeroupper
2133 ; AVX-SLOW-NEXT: retq
2135 ; AVX-FAST-LABEL: partial_reduction_fadd_v8f32_wrong_flags:
2136 ; AVX-FAST: # %bb.0:
2137 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2138 ; AVX-FAST-NEXT: vaddps %xmm1, %xmm0, %xmm0
2139 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
2140 ; AVX-FAST-NEXT: vzeroupper
2141 ; AVX-FAST-NEXT: retq
2170 ; AVX-SLOW-LABEL: partial_reduction_fadd_v16f32:
2171 ; AVX-SLOW: # %bb.0:
2172 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
2173 ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
2174 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
2175 ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0
2176 ; AVX-SLOW-NEXT: vzeroupper
2177 ; AVX-SLOW-NEXT: retq
2179 ; AVX-FAST-LABEL: partial_reduction_fadd_v16f32:
2180 ; AVX-FAST: # %bb.0:
2181 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
2182 ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
2183 ; AVX-FAST-NEXT: vzeroupper
2184 ; AVX-FAST-NEXT: retq