• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+sse2 | FileCheck %s --check-prefix=SSE
3; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+avx  | FileCheck %s --check-prefix=AVX
4; RUN: opt < %s -mtriple=x86_64-apple-macosx10.11.0 -slp-vectorizer -S -mattr=+avx2 | FileCheck %s --check-prefix=AVX
5
6; Verify that the SLP vectorizer is able to figure out that commutativity
7; offers the possibility to splat/broadcast %c and thus make it profitable
8; to vectorize this case
9
10@cle = external unnamed_addr global [32 x i8], align 16
11@cle32 = external unnamed_addr global [32 x i32], align 16
12
13
14; Check that we correctly detect a splat/broadcast by leveraging the
15; commutativity property of `xor`.
16
17define void @splat(i8 %a, i8 %b, i8 %c) {
18; SSE-LABEL: @splat(
19; SSE-NEXT:    [[TMP1:%.*]] = xor i8 [[C:%.*]], [[A:%.*]]
20; SSE-NEXT:    store i8 [[TMP1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
21; SSE-NEXT:    [[TMP2:%.*]] = xor i8 [[A]], [[C]]
22; SSE-NEXT:    store i8 [[TMP2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1)
23; SSE-NEXT:    [[TMP3:%.*]] = xor i8 [[A]], [[C]]
24; SSE-NEXT:    store i8 [[TMP3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2)
25; SSE-NEXT:    [[TMP4:%.*]] = xor i8 [[A]], [[C]]
26; SSE-NEXT:    store i8 [[TMP4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3)
27; SSE-NEXT:    [[TMP5:%.*]] = xor i8 [[C]], [[A]]
28; SSE-NEXT:    store i8 [[TMP5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4)
29; SSE-NEXT:    [[TMP6:%.*]] = xor i8 [[C]], [[B:%.*]]
30; SSE-NEXT:    store i8 [[TMP6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5)
31; SSE-NEXT:    [[TMP7:%.*]] = xor i8 [[C]], [[A]]
32; SSE-NEXT:    store i8 [[TMP7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6)
33; SSE-NEXT:    [[TMP8:%.*]] = xor i8 [[C]], [[B]]
34; SSE-NEXT:    store i8 [[TMP8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7)
35; SSE-NEXT:    [[TMP9:%.*]] = xor i8 [[A]], [[C]]
36; SSE-NEXT:    store i8 [[TMP9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8)
37; SSE-NEXT:    [[TMP10:%.*]] = xor i8 [[A]], [[C]]
38; SSE-NEXT:    store i8 [[TMP10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9)
39; SSE-NEXT:    [[TMP11:%.*]] = xor i8 [[A]], [[C]]
40; SSE-NEXT:    store i8 [[TMP11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10)
41; SSE-NEXT:    [[TMP12:%.*]] = xor i8 [[A]], [[C]]
42; SSE-NEXT:    store i8 [[TMP12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11)
43; SSE-NEXT:    [[TMP13:%.*]] = xor i8 [[A]], [[C]]
44; SSE-NEXT:    store i8 [[TMP13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12)
45; SSE-NEXT:    [[TMP14:%.*]] = xor i8 [[A]], [[C]]
46; SSE-NEXT:    store i8 [[TMP14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13)
47; SSE-NEXT:    [[TMP15:%.*]] = xor i8 [[A]], [[C]]
48; SSE-NEXT:    store i8 [[TMP15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14)
49; SSE-NEXT:    [[TMP16:%.*]] = xor i8 [[A]], [[C]]
50; SSE-NEXT:    store i8 [[TMP16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15)
51; SSE-NEXT:    ret void
52;
53; AVX-LABEL: @splat(
54; AVX-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[C:%.*]], i32 0
55; AVX-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[C]], i32 1
56; AVX-NEXT:    [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[C]], i32 2
57; AVX-NEXT:    [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[C]], i32 3
58; AVX-NEXT:    [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[C]], i32 4
59; AVX-NEXT:    [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[C]], i32 5
60; AVX-NEXT:    [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[C]], i32 6
61; AVX-NEXT:    [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[C]], i32 7
62; AVX-NEXT:    [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[C]], i32 8
63; AVX-NEXT:    [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[C]], i32 9
64; AVX-NEXT:    [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[C]], i32 10
65; AVX-NEXT:    [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[C]], i32 11
66; AVX-NEXT:    [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[C]], i32 12
67; AVX-NEXT:    [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[C]], i32 13
68; AVX-NEXT:    [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[C]], i32 14
69; AVX-NEXT:    [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[C]], i32 15
70; AVX-NEXT:    [[TMP17:%.*]] = insertelement <2 x i8> undef, i8 [[A:%.*]], i32 0
71; AVX-NEXT:    [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[B:%.*]], i32 1
72; AVX-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP18]], <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
73; AVX-NEXT:    [[TMP19:%.*]] = xor <16 x i8> [[TMP16]], [[SHUFFLE]]
74; AVX-NEXT:    store <16 x i8> [[TMP19]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16
75; AVX-NEXT:    ret void
76;
77  %1 = xor i8 %c, %a
78  store i8 %1, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16
79  %2 = xor i8 %a, %c
80  store i8 %2, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 1)
81  %3 = xor i8 %a, %c
82  store i8 %3, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 2)
83  %4 = xor i8 %a, %c
84  store i8 %4, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 3)
85  %5 = xor i8 %c, %a
86  store i8 %5, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 4)
87  %6 = xor i8 %c, %b
88  store i8 %6, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 5)
89  %7 = xor i8 %c, %a
90  store i8 %7, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6)
91  %8 = xor i8 %c, %b
92  store i8 %8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7)
93  %9 = xor i8 %a, %c
94  store i8 %9, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8)
95  %10 = xor i8 %a, %c
96  store i8 %10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9)
97  %11 = xor i8 %a, %c
98  store i8 %11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10)
99  %12 = xor i8 %a, %c
100  store i8 %12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11)
101  %13 = xor i8 %a, %c
102  store i8 %13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12)
103  %14 = xor i8 %a, %c
104  store i8 %14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13)
105  %15 = xor i8 %a, %c
106  store i8 %15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14)
107  %16 = xor i8 %a, %c
108  store i8 %16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15)
109  ret void
110}
111
112; Check that we correctly detect that we can have the same opcode on one side by
113; leveraging the commutativity property of `xor`.
114
115define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) {
116; SSE-LABEL: @same_opcode_on_one_side(
117; SSE-NEXT:    [[ADD1:%.*]] = add i32 [[C:%.*]], [[A:%.*]]
118; SSE-NEXT:    [[ADD2:%.*]] = add i32 [[C]], [[A]]
119; SSE-NEXT:    [[ADD3:%.*]] = add i32 [[A]], [[C]]
120; SSE-NEXT:    [[ADD4:%.*]] = add i32 [[C]], [[A]]
121; SSE-NEXT:    [[TMP1:%.*]] = xor i32 [[ADD1]], [[A]]
122; SSE-NEXT:    store i32 [[TMP1]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16
123; SSE-NEXT:    [[TMP2:%.*]] = xor i32 [[B:%.*]], [[ADD2]]
124; SSE-NEXT:    store i32 [[TMP2]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1)
125; SSE-NEXT:    [[TMP3:%.*]] = xor i32 [[C]], [[ADD3]]
126; SSE-NEXT:    store i32 [[TMP3]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2)
127; SSE-NEXT:    [[TMP4:%.*]] = xor i32 [[A]], [[ADD4]]
128; SSE-NEXT:    store i32 [[TMP4]], i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3)
129; SSE-NEXT:    ret void
130;
131; AVX-LABEL: @same_opcode_on_one_side(
132; AVX-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0
133; AVX-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C]], i32 1
134; AVX-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[C]], i32 2
135; AVX-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[C]], i32 3
136; AVX-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0
137; AVX-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 1
138; AVX-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[A]], i32 2
139; AVX-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[A]], i32 3
140; AVX-NEXT:    [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
141; AVX-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[B:%.*]], i32 1
142; AVX-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[C]], i32 2
143; AVX-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[A]], i32 3
144; AVX-NEXT:    [[TMP13:%.*]] = xor <4 x i32> [[TMP9]], [[TMP12]]
145; AVX-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16
146; AVX-NEXT:    ret void
147;
148  %add1 = add i32 %c, %a
149  %add2 = add i32 %c, %a
150  %add3 = add i32 %a, %c
151  %add4 = add i32 %c, %a
152  %1 = xor i32 %add1, %a
153  store i32 %1, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 0), align 16
154  %2 = xor i32 %b, %add2
155  store i32 %2, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 1)
156  %3 = xor i32 %c, %add3
157  store i32 %3, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 2)
158  %4 = xor i32 %a, %add4
159  store i32 %4, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @cle32, i64 0, i64 3)
160  ret void
161}
162