; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512

declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)

define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_pslldq:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_pslldq:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_psrldq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_psrldq:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_psrldq:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_vpermd(<8 x i32> %a) {
; X32-LABEL: combine_pshufb_vpermd:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_vpermd:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; X64-NEXT:    retq
  %tmp0 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
  %tmp1 = bitcast <8 x i32> %tmp0 to <32 x i8>
  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
  ret <32 x i8> %tmp2
}

define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
; X32-LABEL: combine_pshufb_vpermps:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_vpermps:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; X64-NEXT:    retq
  %tmp0 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
  %tmp1 = bitcast <8 x float> %tmp0 to <32 x i8>
  %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 30>
  ret <32 x i8> %tmp2
}

define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
; X32-LABEL: combine_and_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X32-NEXT:    retl
;
; X64-LABEL: combine_and_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X64-NEXT:    retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_and:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_and:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X64-NEXT:    retq
  %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %2
}

define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) {
; X32-LABEL: combine_permq_pshufb_as_vperm2i128:
; X32:       # %bb.0:
; X32-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; X32-NEXT:    vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permq_pshufb_as_vperm2i128:
; X64:       # %bb.0:
; X64-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; X64-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  %4 = bitcast <32 x i8> %3 to <4 x i64>
  %5 = add <4 x i64> %4, <i64 1, i64 1, i64 3, i64 3>
  ret <4 x i64> %5
}

define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) {
; X32-LABEL: combine_as_vpermd:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7]
; X32-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_as_vpermd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7]
; X64-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6>)
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 0, i32 8, i32 9, i32 1, i32 15, i32 14, i32 4, i32 3>
  ret <8 x i32> %3
}

define <8 x float> @combine_as_vpermps(<8 x float> %a0) {
; X32-LABEL: combine_as_vpermps:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7>
; X32-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_as_vpermps:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7>
; X64-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> <i32 1, i32 undef, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>)
  %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 15, i32 0, i32 14, i32 1, i32 8, i32 9, i32 4, i32 3>
  ret <8 x float> %3
}

define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
; X32-LABEL: combine_permq_pshufb_as_vpblendd:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; X32-NEXT:    retl
;
; X64-LABEL: combine_permq_pshufb_as_vpblendd:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; X64-NEXT:    retq
  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255, i8 255>)
  ret <32 x i8> %3
}

define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastb128:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastb %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastb128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastb %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> zeroinitializer)
  ret <16 x i8> %1
}

define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastb256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vpbroadcastb %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastb256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vpbroadcastb %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> zeroinitializer)
  %4 = bitcast <32 x i8> %3 to <8 x i32>
  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
  %6 = bitcast <8 x i32> %5 to <32 x i8>
  ret <32 x i8> %6
}

define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastw128:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastw128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
  ret <16 x i8> %1
}

define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastw256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vpbroadcastw %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastw256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vpbroadcastw %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
  %4 = bitcast <32 x i8> %3 to <8 x i32>
  %5 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %4, <8 x i32> zeroinitializer)
  %6 = bitcast <8 x i32> %5 to <32 x i8>
  ret <32 x i8> %6
}

define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastd128:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastd %xmm0, %xmm0
; X32-NEXT:    vpaddb {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastd128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastd %xmm0, %xmm0
; X64-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %2 = add <16 x i8> %1, <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>
  ret <16 x i8> %2
}

define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) {
; X32-LABEL: combine_permd_as_vpbroadcastd256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vpbroadcastd %xmm0, %ymm0
; X32-NEXT:    vpaddd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permd_as_vpbroadcastd256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vpbroadcastd %xmm0, %ymm0
; X64-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> zeroinitializer)
  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %3
}

define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastq128:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastq %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastq128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %1
}

define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) {
; X32-LABEL: combine_permd_as_vpbroadcastq256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vpbroadcastq %xmm0, %ymm0
; X32-NEXT:    vpaddd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permd_as_vpbroadcastq256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
; X64-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %3 = add <8 x i32> %2, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %3
}

define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastss128:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastss %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastss128:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = bitcast <4 x float> %a to <16 x i8>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  ret <4 x float> %3
}

define <8 x float> @combine_permps_as_vpbroadcastss256(<4 x float> %a) {
; X32-LABEL: combine_permps_as_vpbroadcastss256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vbroadcastss %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permps_as_vpbroadcastss256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
  ret <8 x float> %2
}

define <4 x double> @combine_permps_as_vpbroadcastsd256(<2 x double> %a) {
; X32-LABEL: combine_permps_as_vpbroadcastsd256:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permps_as_vpbroadcastsd256:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
  %2 = bitcast <4 x double> %1 to <8 x float>
  %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %4 = bitcast <8 x float> %3 to <4 x double>
  ret <4 x double> %4
}

define <16 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
; X32-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastb %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastb %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> zeroinitializer)
  ret <16 x i8> %2
}

define <32 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb256(<32 x i8> %a) {
; X32-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastb %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastb %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> zeroinitializer)
  ret <32 x i8> %2
}

define <4 x float> @combine_vpbroadcast_pshufb_as_vpbroadcastss128(<4 x float> %a) {
; X32-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastss %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
  %2 = bitcast <4 x float> %1 to <16 x i8>
  %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3>)
  %4 = bitcast <16 x i8> %3 to <4 x float>
  ret <4 x float> %4
}

define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a) {
; X32-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastss %xmm0, %ymm0
; X32-NEXT:    vbroadcastss %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    vbroadcastss %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> zeroinitializer
  %2 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %1, <8 x i32> zeroinitializer)
  ret <8 x float> %2
}

define <4 x double> @combine_vpbroadcast_permd_as_vpbroadcastsd256(<2 x double> %a) {
; X32-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    vbroadcastsd %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> zeroinitializer
  %2 = bitcast <4 x double> %1 to <8 x float>
  %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  %4 = bitcast <8 x float> %3 to <4 x double>
  ret <4 x double> %4
}

define <8 x i32> @combine_permd_as_permq(<8 x i32> %a) {
; X32-LABEL: combine_permd_as_permq:
; X32:       # %bb.0:
; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; X32-NEXT:    retl
;
; X64-LABEL: combine_permd_as_permq:
; X64:       # %bb.0:
; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; X64-NEXT:    retq
  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 2, i32 3>)
  ret <8 x i32> %1
}

define <8 x float> @combine_permps_as_permpd(<8 x float> %a) {
; X32-LABEL: combine_permps_as_permpd:
; X32:       # %bb.0:
; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1]
; X32-NEXT:    retl
;
; X64-LABEL: combine_permps_as_permpd:
; X64:       # %bb.0:
; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1]
; X64-NEXT:    retq
  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>)
  ret <8 x float> %1
}

define <4 x i64> @combine_pshufb_as_zext(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_zext:
; X32:       # %bb.0:
; X32-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_zext:
; X64:       # %bb.0:
; X64-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-NEXT:    retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 10, i8 11, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @combine_pshufb_as_zext128(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_zext128:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_zext128:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 15, i8 14, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 13, i8 12, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
; X32-LABEL: combine_pshufb_as_vzmovl_64:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vzmovl_64:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; X64-NEXT:    retq
  %1 = bitcast <4 x double> %a0 to <32 x i8>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <4 x double>
  ret <4 x double> %3
}

define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {
; X32-LABEL: combine_pshufb_as_vzmovl_32:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_vzmovl_32:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-NEXT:    retq
  %1 = bitcast <8 x float> %a0 to <32 x i8>
  %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <32 x i8> %2 to <8 x float>
  ret <8 x float> %3
}

define <32 x i8> @combine_pshufb_as_pslldq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pslldq:
; X32:       # %bb.0:
; X32-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_pslldq:
; X64:       # %bb.0:
; X64-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21]
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_psrldq:
; X32:       # %bb.0:
; X32-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_psrldq:
; X64:       # %bb.0:
; X64-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrlw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_psrlw:
; X32:       # %bb.0:
; X32-NEXT:    vpsrlw $8, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_psrlw:
; X64:       # %bb.0:
; X64-NEXT:    vpsrlw $8, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128, i8 17, i8 128, i8 19, i8 128, i8 21, i8 128, i8 23, i8 128, i8 25, i8 128, i8 27, i8 128, i8 29, i8 128, i8 31, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pslld(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pslld:
; X32:       # %bb.0:
; X32-NEXT:    vpslld $24, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_pslld:
; X64:       # %bb.0:
; X64-NEXT:    vpslld $24, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12, i8 128, i8 128, i8 128, i8 16, i8 128, i8 128, i8 128, i8 20, i8 128, i8 128, i8 128, i8 24, i8 128, i8 128, i8 128, i8 28>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_psrlq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_psrlq:
; X32:       # %bb.0:
; X32-NEXT:    vpsrlq $40, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_psrlq:
; X64:       # %bb.0:
; X64-NEXT:    vpsrlq $40, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 21, i8 22, i8 23, i8 128, i8 128, i8 128, i8 128, i8 128, i8 29, i8 30, i8 31, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pshuflw:
; X32:       # %bb.0:
; X32-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_pshuflw:
; X64:       # %bb.0:
; X64-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pshufhw:
; X32:       # %bb.0:
; X32-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_pshufhw:
; X64:       # %bb.0:
; X64-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <32 x i8> %res0
}

define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_not_as_pshufw:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_not_as_pshufw:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
; X64-NEXT:    retq
  %res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %res1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %res0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <32 x i8> %res1
}

define <32 x i8> @combine_pshufb_as_unpacklo_undef(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_unpacklo_undef:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_unpacklo_undef:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 undef, i8 0, i8 undef, i8 1, i8 undef, i8 2, i8 undef, i8 3, i8 undef, i8 4, i8 undef, i8 5, i8 undef, i8 6, i8 undef, i8 7, i8 undef, i8 16, i8 undef, i8 17, i8 undef, i8 18, i8 undef, i8 19, i8 undef, i8 20, i8 undef, i8 21, i8 undef, i8 22, i8 undef, i8 23>)
  %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30>
  ret <32 x i8> %2
}

define <32 x i8> @combine_pshufb_as_unpacklo_zero(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_unpacklo_zero:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_unpacklo_zero:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 2, i8 3, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 16, i8 17, i8 -1, i8 -1, i8 18, i8 19, i8 -1, i8 -1, i8 20, i8 21, i8 -1, i8 -1, i8 22, i8 23, i8 -1, i8 -1>)
  ret <32 x i8> %1
}

define <32 x i8> @combine_pshufb_as_unpackhi_zero(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_unpackhi_zero:
; X32:       # %bb.0:
; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_unpackhi_zero:
; X64:       # %bb.0:
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 8, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 11, i8 -1, i8 12, i8 -1, i8 13, i8 -1, i8 14, i8 -1, i8 15, i8 -1, i8 24, i8 -1, i8 25, i8 -1, i8 26, i8 -1, i8 27, i8 -1, i8 28, i8 -1, i8 29, i8 -1, i8 30, i8 -1, i8 31>)
  ret <32 x i8> %1
}

define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
; X32-LABEL: combine_psrlw_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_psrlw_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = bitcast <16 x i16> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 1, i8 0, i8 3, i8 2, i8 5, i8 4, i8 7, i8 6, i8 9, i8 8, i8 11, i8 10, i8 13, i8 12, i8 15, i8 14, i8 17, i8 16, i8 19, i8 18, i8 21, i8 20, i8 23, i8 22, i8 25, i8 24, i8 27, i8 26, i8 29, i8 28, i8 31, i8 30>)
  ret <32 x i8> %3
}

define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) {
; X32-LABEL: combine_pslld_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pslld_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = shl <8 x i32> %a0, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %2 = bitcast <8 x i32> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12, i8 19, i8 18, i8 17, i8 16, i8 23, i8 22, i8 21, i8 20, i8 27, i8 26, i8 25, i8 24, i8 31, i8 30, i8 29, i8 28>)
  ret <32 x i8> %3
}

define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) {
; X32-LABEL: combine_psrlq_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
; X32-NEXT:    retl
;
; X64-LABEL: combine_psrlq_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
; X64-NEXT:    retq
  %1 = lshr <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
  %2 = bitcast <4 x i64> %1 to <32 x i8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %2, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23>)
  ret <32 x i8> %3
}

define <32 x i8> @combine_unpack_unpack_pshufb(<32 x i8> %a0) {
; X32-LABEL: combine_unpack_unpack_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
; X32-NEXT:    retl
;
; X64-LABEL: combine_unpack_unpack_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
; X64-NEXT:    retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %2 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 25, i32 26, i32 27, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %5 = shufflevector <32 x i8> %1, <32 x i8> %3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <32 x i8> %4, <32 x i8> %5, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  ret <32 x i8> %6
}

define <16 x i16> @shuffle_combine_packssdw_pshufb(<8 x i32> %a0) {
; X32-LABEL: shuffle_combine_packssdw_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpsrad $31, %ymm0, %ymm0
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17]
; X32-NEXT:    retl
;
; X64-LABEL: shuffle_combine_packssdw_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpsrad $31, %ymm0, %ymm0
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17]
; X64-NEXT:    retq
  %1 = ashr <8 x i32> %a0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %2 = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %1)
  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i16> %3
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <32 x i8> @shuffle_combine_packsswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
; X32-LABEL: shuffle_combine_packsswb_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpsraw $15, %ymm0, %ymm0
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16]
; X32-NEXT:    retl
;
; X64-LABEL: shuffle_combine_packsswb_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpsraw $15, %ymm0, %ymm0
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16]
; X64-NEXT:    retq
  %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = ashr <16 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %3 = tail call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2)
  %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <32 x i8> %4
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i16> @shuffle_combine_packusdw_pshufb(<8 x i32> %a0, <8 x i32> %a1) {
; X32-LABEL: shuffle_combine_packusdw_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19]
; X32-NEXT:    retl
;
; X64-LABEL: shuffle_combine_packusdw_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19]
; X64-NEXT:    retq
  %1 = lshr <8 x i32> %a0, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %2 = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %1)
  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i16> %3
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

define <32 x i8> @shuffle_combine_packuswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
; X32-LABEL: shuffle_combine_packuswb_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17]
; X32-NEXT:    retl
;
; X64-LABEL: shuffle_combine_packuswb_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17]
; X64-NEXT:    retq
  %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = lshr <16 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %3 = tail call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2)
  %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <32 x i8> %4
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <16 x i8> @combine_pshufb_insertion_as_broadcast_v2i64(i64 %a0) {
; X32-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastq {{[0-9]+}}(%esp), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = insertelement <2 x i64> undef, i64 %a0, i32 0
  %2 = bitcast <2 x i64> %1 to <16 x i8>
  %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %3
}

define <8 x i32> @combine_permd_insertion_as_broadcast_v4i64(i64 %a0) {
; X32-LABEL: combine_permd_insertion_as_broadcast_v4i64:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permd_insertion_as_broadcast_v4i64:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
; X64-NEXT:    retq
  %1 = insertelement <4 x i64> undef, i64 %a0, i32 0
  %2 = bitcast <4 x i64> %1 to <8 x i32>
  %3 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
  ret <8 x i32> %3
}

define <8 x i32> @constant_fold_permd() {
; X32-LABEL: constant_fold_permd:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_permd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
; X64-NEXT:    retq
  %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
  ret <8 x i32> %1
}

define <8 x float> @constant_fold_permps() {
; X32-LABEL: constant_fold_permps:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00]
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_permps:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00]
; X64-NEXT:    retq
  %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
  ret <8 x float> %1
}

define <32 x i8> @constant_fold_pshufb_256() {
; X32-LABEL: constant_fold_pshufb_256:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
; X32-NEXT:    retl
;
; X64-LABEL: constant_fold_pshufb_256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
; X64-NEXT:    retq
  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <32 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6, i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
  ret <32 x i8> %1
}

define <32 x i8> @PR27320(<8 x i32> %a0) {
; X32-LABEL: PR27320:
; X32:       # %bb.0:
; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,28,29,29,30,31,16,16,17,18,19,19,20,21,22,22,23]
; X32-NEXT:    retl
;
; X64-LABEL: PR27320:
; X64:       # %bb.0:
; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,28,29,29,30,31,16,16,17,18,19,19,20,21,22,22,23]
; X64-NEXT:    retq
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 3, i32 4, i32 5, i32 undef>
  %2 = bitcast <8 x i32> %1 to <32 x i8>
  %3 = shufflevector <32 x i8> %2, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 16, i32 17, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 23, i32 23, i32 24, i32 25, i32 26, i32 26, i32 27>
  ret <32 x i8> %3
}

define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) {
; X32-LABEL: PR34577:
; X32:       # %bb.0: # %entry
; X32-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; X32-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; X32-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
; X32-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; X32-NEXT:    retl
;
; X64-LABEL: PR34577:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; X64-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT:    vmovaps {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
; X64-NEXT:    vpermps %ymm1, %ymm2, %ymm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; X64-NEXT:    retq
entry:
  %shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> <i32 1, i32 10, i32 11, i32 13, i32 2, i32 13, i32 5, i32 0>
  %sel = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %shuf0, <8 x float> zeroinitializer
  %shuf1 = shufflevector <8 x float> zeroinitializer, <8 x float> %sel, <8 x i32> <i32 6, i32 11, i32 6, i32 15, i32 12, i32 11, i32 1, i32 3>
  %shuf2 = shufflevector <8 x float> %inp1, <8 x float> %shuf1, <8 x i32> <i32 15, i32 10, i32 7, i32 2, i32 12, i32 undef, i32 3, i32 2>
  ret <8 x float> %shuf2
}