; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512F
;
; Combine tests involving SSE3/SSSE3 target shuffles (MOVDDUP, MOVSHDUP, MOVSLDUP, PSHUFB)

declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)

define <16 x i8> @combine_vpshufb_as_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_as_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vpshufb_as_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
  %res2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res1, <16 x i8> <i8 0, i8 1, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <16 x i8> %res2
}

define <16 x i8> @combine_vpshufb_as_movq(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_as_movq:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vpshufb_as_movq:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 128, i8 1, i8 128, i8 2, i8 128, i8 3, i8 128, i8 4, i8 128, i8 5, i8 128, i8 6, i8 128, i8 7, i8 128>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 1, i8 3, i8 5, i8 7, i8 9, i8 11, i8 13, i8 15>)
  ret <16 x i8> %res1
}

define <2 x double> @combine_pshufb_as_movsd(<2 x double> %a0, <2 x double> %a1) {
; SSSE3-LABEL: combine_pshufb_as_movsd:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT:    movapd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_pshufb_as_movsd:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_movsd:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 3, i32 0>
  %2 = bitcast <2 x double> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %4 = bitcast <16 x i8> %3 to <2 x double>
  ret <2 x double> %4
}

define <4 x float> @combine_pshufb_as_movss(<4 x float> %a0, <4 x float> %a1) {
; SSSE3-LABEL: combine_pshufb_as_movss:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_pshufb_as_movss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_movss:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 3, i32 2, i32 1>
  %2 = bitcast <4 x float> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 12, i8 13, i8 14, i8 15, i8 8, i8 9, i8 10, i8 11, i8 4, i8 5, i8 6, i8 7>)
  %4 = bitcast <16 x i8> %3 to <4 x float>
  ret <4 x float> %4
}

define <4 x i32> @combine_pshufb_as_zext(<16 x i8> %a0) {
; SSSE3-LABEL: combine_pshufb_as_zext:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_pshufb_as_zext:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_zext:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 3, i8 -1, i8 -1, i8 -1>)
  %2 = bitcast <16 x i8> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <2 x double> @combine_pshufb_as_vzmovl_64(<2 x double> %a0) {
; SSE-LABEL: combine_pshufb_as_vzmovl_64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_vzmovl_64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT:    retq
  %1 = bitcast <2 x double> %a0 to <16 x i8>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <16 x i8> %2 to <2 x double>
  ret <2 x double> %3
}

define <4 x float> @combine_pshufb_as_vzmovl_32(<4 x float> %a0) {
; SSSE3-LABEL: combine_pshufb_as_vzmovl_32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_pshufb_as_vzmovl_32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_vzmovl_32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  ret <4 x float> %3
}

define <4 x float> @combine_pshufb_movddup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movddup:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_movddup:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x float> %4
}

define <4 x float> @combine_pshufb_movshdup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movshdup:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_movshdup:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %4
}

define <4 x float> @combine_pshufb_movsldup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movsldup:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_movsldup:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %4
}

define <16 x i8> @combine_pshufb_palignr(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_pshufb_palignr:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_palignr:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_pslldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_pslldq:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_pslldq:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_psrldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_psrldq:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_psrldq:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <16 x i8> %2
}

define <16 x i8> @combine_and_pshufb(<16 x i8> %a0) {
; SSSE3-LABEL: combine_and_pshufb:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_and_pshufb:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_and_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_and(<16 x i8> %a0) {
; SSSE3-LABEL: combine_pshufb_and:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_pshufb_and:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_and:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_as_palignr(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_palignr:
; SSE:       # %bb.0:
; SSE-NEXT:    palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_palignr:
; AVX:       # %bb.0:
; AVX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 undef, i8 undef, i8 0>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pslldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pslldq:
; SSE:       # %bb.0:
; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_pslldq:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_psrldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrldq:
; SSE:       # %bb.0:
; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_psrldq:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_psrlw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrlw:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_psrlw:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pslld(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pslld:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $24, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_pslld:
; AVX:       # %bb.0:
; AVX-NEXT:    vpslld $24, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_psrlq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrlq:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlq $40, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_psrlq:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlq $40, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pshuflw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshuflw:
; SSE:       # %bb.0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_pshuflw:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pshufhw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshufhw:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_pshufhw:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_not_as_pshufw:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_not_as_pshufw:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <16 x i8> %res1
}

define <16 x i8> @combine_vpshufb_as_pshuflw_not_pslld(<16 x i8> *%a0) {
; SSE-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
; SSE:       # %bb.0:
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
; AVX-NEXT:    retq
  %res0 = load <16 x i8>, <16 x i8> *%a0, align 16
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
  ret <16 x i8> %res1
}

define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpcklbw:
; SSE:       # %bb.0:
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpcklbw:
; AVX:       # %bb.0:
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 undef, i8 undef, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 5, i8 5, i8 6, i8 6, i8 7, i8 7>)
  ret <16 x i8> %1
}

define <16 x i8> @combine_pshufb_as_unary_unpckhwd(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpckhwd:
; SSE:       # %bb.0:
; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpckhwd:
; AVX:       # %bb.0:
; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 8, i8 9, i8 10, i8 11, i8 10, i8 11, i8 12, i8 13, i8 12, i8 13, i8 14, i8 15, i8 undef, i8 undef>)
  ret <16 x i8> %1
}

define <8 x i16> @combine_pshufb_as_unpacklo_undef(<16 x i8> %a0) {
; ALL-LABEL: combine_pshufb_as_unpacklo_undef:
; ALL:       # %bb.0:
; ALL-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 2, i8 3, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 6, i8 7>)
  %2 = bitcast <16 x i8> %1 to <8 x i16>
  %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x i16> %3
}

define <16 x i8> @combine_pshufb_as_unpackhi_undef(<16 x i8> %a0) {
; ALL-LABEL: combine_pshufb_as_unpackhi_undef:
; ALL:       # %bb.0:
; ALL-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 undef, i8 9, i8 undef, i8 10, i8 undef, i8 11, i8 undef, i8 12, i8 undef, i8 13, i8 undef, i8 14, i8 undef, i8 15, i8 undef>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_as_unpacklo_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unpacklo_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_unpacklo_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %1
}

define <16 x i8> @combine_pshufb_as_unpackhi_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unpackhi_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pshufb_as_unpackhi_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 11, i8 -1, i8 12, i8 -1, i8 13, i8 -1, i8 14, i8 -1, i8 15, i8 -1>)
  ret <16 x i8> %1
}

define <16 x i8> @combine_psrlw_pshufb(<8 x i16> %a0) {
; SSE-LABEL: combine_psrlw_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_psrlw_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = bitcast <8 x i16> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %3
}

define <16 x i8> @combine_pslld_pshufb(<4 x i32> %a0) {
; SSE-LABEL: combine_pslld_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,1,0],zero,xmm0[6,5,4],zero,xmm0[10,9,8],zero,xmm0[14,13,12],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_pslld_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,1,0],zero,xmm0[6,5,4],zero,xmm0[10,9,8],zero,xmm0[14,13,12],zero
; AVX-NEXT:    retq
  %1 = shl <4 x i32> %a0, <i32 8, i32 8, i32 8, i32 8>
  %2 = bitcast <4 x i32> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12>)
  ret <16 x i8> %3
}

define <16 x i8> @combine_psrlq_pshufb(<2 x i64> %a0) {
; SSE-LABEL: combine_psrlq_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[7,6],zero,zero,zero,zero,zero,zero,xmm0[15,14]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_psrlq_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[7,6],zero,zero,zero,zero,zero,zero,xmm0[15,14]
; AVX-NEXT:    retq
  %1 = lshr <2 x i64> %a0, <i64 48, i64 48>
  %2 = bitcast <2 x i64> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>)
  ret <16 x i8> %3
}

define <16 x i8> @combine_unpckl_arg0_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_unpckl_arg0_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_unpckl_arg0_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %2
}

define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_unpckl_arg1_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_unpckl_arg1_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %2
}

define <8 x i16> @shuffle_combine_unpack_insert(<8 x i16> %a0) {
; SSE-LABEL: shuffle_combine_unpack_insert:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_combine_unpack_insert:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
; AVX-NEXT:    retq
  %1 = extractelement <8 x i16> %a0, i32 2
  %2 = extractelement <8 x i16> %a0, i32 4
  %3 = insertelement <8 x i16> %a0, i16 %1, i32 4
  %4 = insertelement <8 x i16> %a0, i16 %2, i32 2
  %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  %6 = shufflevector <8 x i16> %5, <8 x i16> %3, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
  %7 = shufflevector <8 x i16> %5, <8 x i16> %a0, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ret <8 x i16> %8
}

define <16 x i8> @shuffle_combine_packssdw_pshufb(<4 x i32> %a0) {
; SSE-LABEL: shuffle_combine_packssdw_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[13,12,9,8,5,4,1,0,13,12,9,8,5,4,1,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_combine_packssdw_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[13,12,9,8,5,4,1,0,13,12,9,8,5,4,1,0]
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
  %2 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %1, <4 x i32> %1)
  %3 = bitcast <8 x i16> %2 to <16 x i8>
  %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>)
  ret <16 x i8> %4
}
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone

define <16 x i8> @shuffle_combine_packsswb_pshufb(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: shuffle_combine_packsswb_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    psraw $15, %xmm0
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_combine_packsswb_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsraw $15, %xmm0, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0]
; AVX-NEXT:    retq
  %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = ashr <8 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %3 = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %1, <8 x i16> %2)
  %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <16 x i8> %4
}
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i8> @shuffle_combine_packuswb_pshufb(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: shuffle_combine_packuswb_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_combine_packuswb_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1]
; AVX-NEXT:    retq
  %1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = lshr <8 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %3 = tail call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %1, <8 x i16> %2)
  %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <16 x i8> %4
}
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i8> @constant_fold_pshufb() {
; SSE-LABEL: constant_fold_pshufb:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9>
; SSE-NEXT:    retq
;
; AVX-LABEL: constant_fold_pshufb:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9>
; AVX-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
  ret <16 x i8> %1
}

; FIXME - unnecessary pshufb/broadcast being used - pshufb mask only needs lowest byte.
define <16 x i8> @constant_fold_pshufb_2() {
; SSE-LABEL: constant_fold_pshufb_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movl $2, %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pshufb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: constant_fold_pshufb_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    movl $2, %eax
; AVX1-NEXT:    vmovd %eax, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_fold_pshufb_2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movl $2, %eax
; AVX2-NEXT:    vmovd %eax, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: constant_fold_pshufb_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movl $2, %eax
; AVX512F-NEXT:    vmovd %eax, %xmm0
; AVX512F-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX512F-NEXT:    retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 2, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
  ret <16 x i8> %1
}

define i32 @mask_zzz3_v16i8(<16 x i8> %a0) {
; SSSE3-LABEL: mask_zzz3_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[14,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    movd %xmm0, %eax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: mask_zzz3_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[14]
; SSE41-NEXT:    pextrd $3, %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: mask_zzz3_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[14]
; AVX-NEXT:    vpextrd $3, %xmm0, %eax
; AVX-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
  %2 = bitcast <16 x i8> %1 to <4 x i32>
  %3 = extractelement <4 x i32> %2, i32 3
  %4 = and i32 %3, 4278190080
  ret i32 %4
}

define i32 @mask_z1z3_v16i8(<16 x i8> %a0) {
; SSSE3-LABEL: mask_z1z3_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,xmm0[10],zero,xmm0[14,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    movd %xmm0, %eax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: mask_z1z3_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,xmm0[10],zero,xmm0[14]
; SSE41-NEXT:    pextrd $3, %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: mask_z1z3_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,xmm0[10],zero,xmm0[14]
; AVX-NEXT:    vpextrd $3, %xmm0, %eax
; AVX-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
  %2 = bitcast <16 x i8> %1 to <4 x i32>
  %3 = extractelement <4 x i32> %2, i32 3
  %4 = and i32 %3, 4278255360
  ret i32 %4
}

define i32 @PR22415(double %a0) {
; SSE-LABEL: PR22415:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: PR22415:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
  %1 = bitcast double %a0 to <8 x i8>
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 undef>
  %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %4 = bitcast <3 x i8> %3 to i24
  %5 = zext i24 %4 to i32
  ret i32 %5
}