; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX-FAST

; fold (shl 0, x) -> 0
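; Illustrative note: shifting an all-zero vector left by any amount is still
; zero, so both codegen paths below materialize the zero directly (xorps)
; rather than emit a shift.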
define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (shl x, c >= size(x)) -> undef
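; Illustrative note: a shift amount of 33 or more on an i32 element is
; undefined, so the combiner may fold the whole shift away; each function
; below therefore compiles to a bare retq that returns %x unmodified.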
define <4 x i32> @combine_vec_shl_outofrange0(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_outofrange0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_shl_outofrange1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_outofrange1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_shl_outofrange2(<4 x i32> %a0) {
; CHECK-LABEL: combine_vec_shl_outofrange2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = and <4 x i32> %a0, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
  %2 = shl <4 x i32> %1, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_outofrange3(<4 x i32> %a0) {
; CHECK-LABEL: combine_vec_shl_outofrange3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 33, i32 34, i32 35, i32 undef>
  ret <4 x i32> %1
}

; fold (shl x, 0) -> x
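; Illustrative note: shift-by-zero is the identity, so no instruction is
; emitted at all.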
define <4 x i32> @combine_vec_shl_by_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_shl_by_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shl <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; if (shl x, c) is known to be zero, return 0
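; Worked example for the splat case below: 4294901760 is 0xFFFF0000, so after
; the mask only bits 16-31 can be set, and the following shl by 16 pushes all
; of them out of the i32 lane, leaving a guaranteed zero.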
define <4 x i32> @combine_vec_shl_known_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_known_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_known_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
  %2 = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_known_zero1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_known_zero1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65536,32768,16384,8192]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_known_zero1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_known_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 4294901760, i32 8589803520, i32 17179607040, i32 34359214080>
  %2 = shl <4 x i32> %1, <i32 16, i32 15, i32 14, i32 13>
  ret <4 x i32> %2
}

; fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
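; Illustrative note: truncating before the and lets the shift amounts live in
; a single 128-bit register; both AVX paths below narrow %y to one xmm and
; then issue a single vpsllvd.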
define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE2-LABEL: combine_vec_shl_trunc_and:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE2-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_trunc_and:
; SSE41:       # %bb.0:
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE41-NEXT:    andps {{.*}}(%rip), %xmm1
; SSE41-NEXT:    pslld $23, %xmm1
; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-SLOW-LABEL: combine_vec_shl_trunc_and:
; AVX-SLOW:       # %bb.0:
; AVX-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX-SLOW-NEXT:    vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX-SLOW-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT:    vzeroupper
; AVX-SLOW-NEXT:    retq
;
; AVX-FAST-LABEL: combine_vec_shl_trunc_and:
; AVX-FAST:       # %bb.0:
; AVX-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u>
; AVX-FAST-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX-FAST-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-FAST-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT:    vzeroupper
; AVX-FAST-NEXT:    retq
  %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = shl <4 x i32> %x, %2
  ret <4 x i32> %3
}

; fold (shl (shl x, c1), c2) -> (shl x, (add c1, c2))
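; Worked example: two constant left shifts add up, so for the splat case
; below (x << 2) << 4 == x << 6, which is why a single pslld $6 suffices.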
define <4 x i32> @combine_vec_shl_shl0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $6, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $6, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = shl <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_shl1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_shl1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [16,64,256,1024]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_shl1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
  %2 = shl <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %2
}

; fold (shl (shl x, c1), c2) -> 0 if c1 + c2 >= size(x)
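; Worked example: the summed shift amounts (16+20 in the splat case, 42..48
; per lane in the non-uniform case) all reach or exceed the 32-bit element
; width, so every bit is shifted out and the result folds to zero.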
define <4 x i32> @combine_vec_shl_shlr_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shlr_zero0:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shlr_zero0:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
  %2 = shl <4 x i32> %1, <i32 20, i32 20, i32 20, i32 20>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_shl_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl_zero1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_shl_zero1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
  %2 = shl <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
  ret <4 x i32> %2
}

; fold (shl (ext (shl x, c1)), c2) -> (shl (ext x), (add c1, c2))
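; Worked example for the first test below: the inner i16 shift by 4 and the
; outer i32 shift by 16 merge into a single shift by 20 of the extended
; value; and because the final shift is at least 16, sign- and zero-extension
; give the same result, which is why the SSE41/AVX checks can use pmovzx.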
define <8 x i32> @combine_vec_shl_ext_shl0(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_ext_shl0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $20, %xmm0
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $20, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pslld $20, %xmm1
; SSE41-NEXT:    pslld $20, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    vpslld $20, %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <8 x i32> %3
}

define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
; SSE-LABEL: combine_vec_shl_ext_shl1:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl1:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 31, i32 31, i32 30, i32 30, i32 29, i32 29, i32 28, i32 28>
  ret <8 x i32> %3
}

define <8 x i32> @combine_vec_shl_ext_shl2(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_ext_shl2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [131072,524288,2097152,8388608]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [33554432,134217728,536870912,2147483648]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ext_shl2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm2
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ext_shl2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %1 = shl <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = sext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <8 x i32> %3
}

; fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
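; Worked example for the splat case below: x >>u 4 clears the top four bits
; of each i16, so shifting the zero-extended value left by 4 just clears the
; low four bits, i.e. zext(x & 0xFFF0), matching the pand + zero-extend
; sequences in the checks.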
define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_zext_lshr0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zext_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  ret <8 x i32> %3
}

define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_shl_zext_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pmulhuw {{.*}}(%rip), %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_zext_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pmulhuw {{.*}}(%rip), %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pmullw {{.*}}(%rip), %xmm1
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_zext_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %x, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  %2 = zext <8 x i16> %1 to <8 x i32>
  %3 = shl <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  ret <8 x i32> %3
}

; fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
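; Worked example: with an exact shift the bits shifted out are known zero, so
; in the splat case below (x >>s 3) << 5 == x << (5-3), a single pslld $2.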
define <4 x i32> @combine_vec_shl_ge_ashr_extact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ge_ashr_extact0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ge_ashr_extact0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_ge_ashr_extact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_ge_ashr_extact1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $3, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $5, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    psrad $4, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [32,64,128,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_ge_ashr_extact1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrad $4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $5, %xmm1
; SSE41-NEXT:    psrad $3, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ge_ashr_extact1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 3, i32 4, i32 5, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}

; fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C1-C2)) if C1 > C2
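; Worked example: in the splat case below (x >>s 5 exact) << 3 leaves a net
; arithmetic right shift of 5-3 = 2 bits, hence the lone psrad $2.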
define <4 x i32> @combine_vec_shl_lt_ashr_extact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_lt_ashr_extact0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_extact0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_lt_ashr_extact1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_lt_ashr_extact1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $5, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $7, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $8, %xmm1
; SSE2-NEXT:    psrad $6, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [8,16,32,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_lt_ashr_extact1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrad $6, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $7, %xmm1
; SSE41-NEXT:    psrad $5, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_extact1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr exact <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 3, i32 4, i32 5, i32 8>
  ret <4 x i32> %2
}

; fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1)), MASK) if C2 > C1
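; Worked example for the splat case below: (x >>u 3) << 5 == (x << 2) with
; the low five bits cleared, i.e. pslld $2 followed by pand with
; 0xFFFFFFE0 (4294967264).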
define <4 x i32> @combine_vec_shl_gt_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_gt_lshr0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_gt_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_gt_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_gt_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $3, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $5, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $8, %xmm1
; SSE2-NEXT:    psrld $4, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [32,64,128,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_gt_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld $4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $5, %xmm1
; SSE41-NEXT:    psrld $3, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_gt_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 3, i32 4, i32 5, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}

; fold (shl (srl x, c1), c2) -> (and (srl x, (sub c1, c2)), MASK) if C1 >= C2
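; Worked example for the splat case below: (x >>u 5) << 3 is a net right
; shift by 2 with the low three and top two bits cleared, i.e. psrld $2
; followed by pand with 0x3FFFFFF8 (1073741816).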
define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_le_lshr0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $2, %xmm0
; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_le_lshr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816]
; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_le_lshr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_le_lshr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $5, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld $7, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $8, %xmm1
; SSE2-NEXT:    psrld $6, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [8,16,32,256]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_le_lshr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $8, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrld $6, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $7, %xmm1
; SSE41-NEXT:    psrld $5, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_le_lshr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 3, i32 4, i32 5, i32 8>
  ret <4 x i32> %2
}

; fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
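; Worked example for the splat case below: shifting right then left by the
; same amount only clears the low bits, so the ashr/shl pair by 5 collapses
; to a single and with 0xFFFFFFE0 (4294967264).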
define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr0:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ashr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_ashr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr1:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_ashr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 5, i32 6, i32 7, i32 8>
  ret <4 x i32> %2
}

; fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
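; Worked example: the shift distributes over the add, so in the splat case
; below (x + 5) << 2 == (x << 2) + (5 << 2), which is why the checks add the
; pre-shifted constant 20.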
define <4 x i32> @combine_vec_shl_add0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_add0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_add0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_add1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_add1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2,4,8,16]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_add1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    paddd {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_add1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = add <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

; fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
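; Worked example: likewise for or, (x | 5) << 2 == (x << 2) | 20, so the
; shift is kept and only the constant is adjusted.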
define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_or0:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $2, %xmm0
; SSE-NEXT:    por {{.*}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_or0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = or  <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_or1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2,4,8,16]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_or1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    por {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_or1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = or  <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

; fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
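; Worked example: a constant left shift is a multiply by a power of two, so
; the constants merge: (x * 5) << 2 == x * 20, and per lane
; (x * <5,6,7,8>) << <1,2,3,4> == x * <10,24,56,128>, matching the pmulld
; constants below.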
define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_mul0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [20,20,20,20]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_mul0:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_mul0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_shl_mul1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_shl_mul1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [10,24,56,128]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_shl_mul1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_shl_mul1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = mul <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
  %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
  ret <4 x i32> %2
}

; fold (add (shl x, c1), c2) -> (or (shl x, c1), c2)
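; Worked example: in each test below the shl guarantees that the bits of the
; added constant are already zero (e.g. x << 2 has zero low two bits before
; adding 3), so the add cannot carry and is equivalent to an or, as the
; por/vpor in the checks shows.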
define <4 x i32> @combine_vec_add_shl_nonsplat(<4 x i32> %a0)  {
; SSE2-LABEL: combine_vec_add_shl_nonsplat:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4,8,16,32]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_add_shl_nonsplat:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    por {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shl_nonsplat:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 4, i32 5>
  %2 = add <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_add_shl_and_nonsplat(<4 x i32> %a0)  {
; SSE2-LABEL: combine_vec_add_shl_and_nonsplat:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4,8,16,32]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_add_shl_and_nonsplat:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; SSE41-NEXT:    pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT:    por {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shl_and_nonsplat:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %a0, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
  %2 = shl <4 x i32> %1, <i32 2, i32 3, i32 4, i32 5>
  %3 = add <4 x i32> %2, <i32 15, i32 15, i32 15, i32 15>
  ret <4 x i32> %3
}

define <4 x i32> @combine_vec_add_shuffle_shl(<4 x i32> %a0)  {
; SSE2-LABEL: combine_vec_add_shuffle_shl:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslld $3, %xmm1
; SSE2-NEXT:    pslld $2, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,3,0]
; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_add_shuffle_shl:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pslld $3, %xmm1
; SSE41-NEXT:    pslld $2, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; SSE41-NEXT:    por {{.*}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_add_shuffle_shl:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3]
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 2, i32 3, i32 0, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
  %3 = add <4 x i32> %2, <i32 3, i32 3, i32 3, i32 3>
  ret <4 x i32> %3
}