; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=X64

declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)

declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)

declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)
declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>)

declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)

declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)

declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)

declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)

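; A quick orientation note, added alongside this cleanup and not part of the
; original test: each intrinsic above carries its lane mask in the final
; operand, sized one bit per element (i8 for 8 lanes through i64 for 64
; lanes). Passing -1 selects every lane, so the unmasked tests below are the
; fully-active case, and the maskz.vpermt2var.* forms zero masked-off lanes,
; which the CHECK lines surface as the {z} qualifier.
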
define <8 x double> @combine_permvar_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_permvar_8f64_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_permvar_8f64_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
  %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %1, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
  ret <8 x double> %2
}
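; Why the test above folds to a bare return (worked through here as an added
; comment): the variable VPERMPD only reads the low three bits of each qword
; index, so <7,14,5,12,3,10,1,8> reduces modulo 8 to <7,6,5,4,3,2,1,0>.
; Reversing the eight doubles twice is the identity, so the shuffle combiner
; erases both permutes.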
define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X32-LABEL: combine_permvar_8f64_identity_mask:
; X32:       # %bb.0:
; X32-NEXT:    vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovd %eax, %k1
; X32-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
; X32-NEXT:    vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X32-NEXT:    vpermpd %zmm1, %zmm0, %zmm1 {%k1}
; X32-NEXT:    vmovapd %zmm1, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permvar_8f64_identity_mask:
; X64:       # %bb.0:
; X64-NEXT:    vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
; X64-NEXT:    vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-NEXT:    vpermpd %zmm1, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovapd %zmm1, %zmm0
; X64-NEXT:    retq
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
  %2 = bitcast i8 %m to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x1
  %4 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %3, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
  %5 = bitcast i8 %m to <8 x i1>
  %6 = select <8 x i1> %5, <8 x double> %4, <8 x double> %3
  ret <8 x double> %6
}

define <8 x i64> @combine_permvar_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
; X32-LABEL: combine_permvar_8i64_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_permvar_8i64_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
  %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
  ret <8 x i64> %2
}
define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
; X32-LABEL: combine_permvar_8i64_identity_mask:
; X32:       # %bb.0:
; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovd %eax, %k1
; X32-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X32-NEXT:    vpermq %zmm1, %zmm0, %zmm1 {%k1}
; X32-NEXT:    vmovdqa64 %zmm1, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permvar_8i64_identity_mask:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-NEXT:    vpermq %zmm1, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
  %2 = bitcast i8 %m to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1
  %4 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %3, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
  %5 = bitcast i8 %m to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %3
  ret <8 x i64> %6
}

define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_vpermt2var_8f64_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_8f64_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 -1)
  ret <8 x double> %res1
}
define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X32-LABEL: combine_vpermt2var_8f64_identity_mask:
; X32:       # %bb.0:
; X32-NEXT:    vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovd %eax, %k1
; X32-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z}
; X32-NEXT:    vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X32-NEXT:    vpermi2pd %zmm2, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_8f64_identity_mask:
; X64:       # %bb.0:
; X64-NEXT:    vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z}
; X64-NEXT:    vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-NEXT:    vpermi2pd %zmm2, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 %m)
  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 %m)
  ret <8 x double> %res1
}

define <8 x double> @combine_vpermt2var_8f64_movddup(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_vpermt2var_8f64_movddup:
; X32:       # %bb.0:
; X32-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_8f64_movddup:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 undef, i64 undef>, <8 x double> %x0, <8 x double> %x1, i8 -1)
  ret <8 x double> %res0
}
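; Added note on the movddup tests: an index vector of <0,0,2,2,4,4,6,6>
; duplicates each even-numbered double into the adjacent odd lane, which is
; exactly VMOVDDUP's fixed shuffle, so the two-input variable permute
; collapses to the cheaper single-input instruction; the trailing undef
; indices above do not block the match.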
define <8 x double> @combine_vpermt2var_8f64_movddup_load(<8 x double> *%p0, <8 x double> %x1) {
; X32-LABEL: combine_vpermt2var_8f64_movddup_load:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_8f64_movddup_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %x0 = load <8 x double>, <8 x double> *%p0
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 -1)
  ret <8 x double> %res0
}
define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X32-LABEL: combine_vpermt2var_8f64_movddup_mask:
; X32:       # %bb.0:
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovd %eax, %k1
; X32-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_8f64_movddup_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 %m)
  ret <8 x double> %res0
}

define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
; X32-LABEL: combine_vpermt2var_8i64_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_8i64_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
  ret <8 x i64> %res1
}
define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
; X32-LABEL: combine_vpermt2var_8i64_identity_mask:
; X32:       # %bb.0:
; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovd %eax, %k1
; X32-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z}
; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X32-NEXT:    vpermi2q %zmm2, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_8i64_identity_mask:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z}
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-NEXT:    vpermi2q %zmm2, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 %m)
  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 %m)
  ret <8 x i64> %res1
}
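; Added note: with a live mask the permute pairs are no longer removable,
; because each maskz op zeroes its masked-off lanes, so the composition is
; only an identity on the lanes the mask keeps. The checks above therefore
; still expect two vpermi2* instructions under {z} zero-masking.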

define <16 x float> @combine_vpermt2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 -1)
  ret <16 x float> %res1
}
define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16f32_identity_mask:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT:    vpermi2ps %zmm1, %zmm0, %zmm2 {%k1} {z}
; X32-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X32-NEXT:    vpermi2ps %zmm2, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_identity_mask:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermi2ps %zmm1, %zmm0, %zmm2 {%k1} {z}
; X64-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-NEXT:    vpermi2ps %zmm2, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 %m)
  ret <16 x float> %res1
}

define <16 x float> @combine_vpermt2var_16f32_vmovddup(<16 x float> %x0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_vmovddup:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X32-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovddup:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_vmovddup_load:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps (%eax), %zmm2
; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X32-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1
; X32-NEXT:    vmovaps %zmm1, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovddup_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %zmm2
; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1
; X64-NEXT:    vmovaps %zmm1, %zmm0
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovaps (%eax), %zmm2
; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
; X32-NEXT:    vmovaps %zmm1, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %zmm2
; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
; X64-NEXT:    vmovaps %zmm1, %zmm0
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vmovshdup(<16 x float> %x0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_vmovshdup:
; X32:       # %bb.0:
; X32-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovshdup:
; X64:       # %bb.0:
; X64-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovshdup_load(<16 x float> *%p0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovshdup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
; X32:       # %bb.0:
; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vmovsldup(<16 x float> %x0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_vmovsldup:
; X32:       # %bb.0:
; X32-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovsldup:
; X64:       # %bb.0:
; X64-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_load(<16 x float> *%p0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
; X64:       # %bb.0:
; X64-NEXT:    vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
; X32:       # %bb.0:
; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_vpermilps:
; X32:       # %bb.0:
; X32-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vpermilps:
; X64:       # %bb.0:
; X64-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vpermilps_load(<16 x float> *%p0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_vpermilps_load:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vpermilps_load:
; X64:       # %bb.0:
; X64-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
; X32:       # %bb.0:
; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %esi, %k1
; X64-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-NEXT:    retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}

define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
; X32-LABEL: combine_vpermt2var_16i32_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16i32_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
  ret <16 x i32> %res1
}
define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16i32_identity_mask:
; X32:       # %bb.0:
; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2 {%k1} {z}
; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X32-NEXT:    vpermi2d %zmm2, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16i32_identity_mask:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2 {%k1} {z}
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-NEXT:    vpermi2d %zmm2, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %x0, <16 x i32> %x1, i16 %m)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 %m)
  ret <16 x i32> %res1
}

define <32 x i16> @combine_vpermt2var_32i16_identity(<32 x i16> %x0, <32 x i16> %x1) {
; X32-LABEL: combine_vpermt2var_32i16_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_32i16_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 -1)
  %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 -1)
  ret <32 x i16> %res1
}
define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x i16> %x1, i32 %m) {
; X32-LABEL: combine_vpermt2var_32i16_identity_mask:
; X32:       # %bb.0:
; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
; X32-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z}
; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
; X32-NEXT:    vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_32i16_identity_mask:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z}
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
; X64-NEXT:    vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 %m)
  %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 %m)
  ret <32 x i16> %res1
}

define <64 x i8> @combine_pshufb_identity(<64 x i8> %x0) {
; X32-LABEL: combine_pshufb_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %select = bitcast <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <64 x i8>
  %mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 undef, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8>
  %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %mask, <64 x i8> %select, i64 -1)
  %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res0, <64 x i8> %mask, <64 x i8> %select, i64 -1)
  ret <64 x i8> %res1
}
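; Decoding the shuffle constant above (added comment): per 128-bit lane the
; four i32s are 0x0C0D0E0F, 0x08090A0B, 0x04050607 and 0x00010203, whose
; little-endian bytes spell the indices [15,14,...,1,0] - a byte reversal
; within each lane. Applying that PSHUFB twice is the identity, so the
; unmasked test compiles to a bare return.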
define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
; X32-LABEL: combine_pshufb_identity_mask:
; X32:       # %bb.0:
; X32-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
; X32-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3
; X32-NEXT:    vpshufb %zmm2, %zmm0, %zmm3 {%k1}
; X32-NEXT:    vpshufb %zmm2, %zmm3, %zmm1 {%k1}
; X32-NEXT:    vmovdqa64 %zmm1, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_identity_mask:
; X64:       # %bb.0:
; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3
; X64-NEXT:    vpshufb %zmm2, %zmm0, %zmm3 {%k1}
; X64-NEXT:    vpshufb %zmm2, %zmm3, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
  %select = bitcast <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <64 x i8>
  %mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8>
  %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %mask, <64 x i8> %select, i64 %m)
  %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res0, <64 x i8> %mask, <64 x i8> %select, i64 %m)
  ret <64 x i8> %res1
}

define <32 x i16> @combine_permvar_as_vpbroadcastw512(<32 x i16> %x0) {
; X32-LABEL: combine_permvar_as_vpbroadcastw512:
; X32:       # %bb.0:
; X32-NEXT:    vpbroadcastw %xmm0, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permvar_as_vpbroadcastw512:
; X64:       # %bb.0:
; X64-NEXT:    vpbroadcastw %xmm0, %zmm0
; X64-NEXT:    retq
  %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> zeroinitializer)
  ret <32 x i16> %1
}

define <16 x i32> @combine_permvar_as_vpbroadcastd512(<16 x i32> %x0) {
; X32-LABEL: combine_permvar_as_vpbroadcastd512:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastss %xmm0, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permvar_as_vpbroadcastd512:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss %xmm0, %zmm0
; X64-NEXT:    retq
  %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> zeroinitializer)
  ret <16 x i32> %1
}

define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) {
; X32-LABEL: combine_permvar_as_vpbroadcastq512:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastsd %xmm0, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permvar_as_vpbroadcastq512:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd %xmm0, %zmm0
; X64-NEXT:    retq
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> zeroinitializer)
  ret <8 x i64> %1
}
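; Added note on the three broadcast tests above: a zeroinitializer index
; vector makes every result lane select element 0, which is precisely a
; broadcast, so the combiner emits vpbroadcastw / vbroadcastss / vbroadcastsd
; instead of a full variable permute.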

define <8 x i64> @combine_permvar_8i64_as_permq(<8 x i64> %x0, <8 x i64> %x1) {
; X32-LABEL: combine_permvar_8i64_as_permq:
; X32:       # %bb.0:
; X32-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; X32-NEXT:    retl
;
; X64-LABEL: combine_permvar_8i64_as_permq:
; X64:       # %bb.0:
; X64-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; X64-NEXT:    retq
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
  ret <8 x i64> %1
}
define <8 x i64> @combine_permvar_8i64_as_permq_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
; X32-LABEL: combine_permvar_8i64_as_permq_mask:
; X32:       # %bb.0:
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovd %eax, %k1
; X32-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X32-NEXT:    vmovdqa64 %zmm1, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permvar_8i64_as_permq_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
  %2 = bitcast i8 %m to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1
  ret <8 x i64> %3
}

define <8 x double> @combine_permvar_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_permvar_8f64_as_permpd:
; X32:       # %bb.0:
; X32-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; X32-NEXT:    retl
;
; X64-LABEL: combine_permvar_8f64_as_permpd:
; X64:       # %bb.0:
; X64-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; X64-NEXT:    retq
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
  ret <8 x double> %1
}
define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X32-LABEL: combine_permvar_8f64_as_permpd_mask:
; X32:       # %bb.0:
; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    kmovd %eax, %k1
; X32-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X32-NEXT:    vmovapd %zmm1, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_permvar_8f64_as_permpd_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovd %edi, %k1
; X64-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X64-NEXT:    vmovapd %zmm1, %zmm0
; X64-NEXT:    retq
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>)
  %2 = bitcast i8 %m to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x1
  ret <8 x double> %3
}

define <16 x float> @combine_vpermilvar_16f32_230146759A8BCFDE(<16 x float> %x0) {
; X32-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE:
; X32:       # %bb.0:
; X32-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,4,6,7,5,9,10,8,11,12,15,13,14]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE:
; X64:       # %bb.0:
; X64-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,4,6,7,5,9,10,8,11,12,15,13,14]
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 0, i32 3, i32 2>, <16 x float> undef, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %res0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 1, i32 0, i32 2, i32 3, i32 0, i32 2, i32 1, i32 1, i32 2, i32 0, i32 3>, <16 x float> undef, i16 -1)
  ret <16 x float> %res1
}

define <64 x i8> @combine_pshufb_as_pslldq(<64 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pslldq:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_pslldq:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
; X64-NEXT:    retq
  %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>, <64 x i8> undef, i64 -1)
  ret <64 x i8> %res0
}
define <64 x i8> @combine_pshufb_as_pslldq_mask(<64 x i8> %a0, i64 %m) {
; X32-LABEL: combine_pshufb_as_pslldq_mask:
; X32:       # %bb.0:
; X32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
; X32-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_pslldq_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
; X64-NEXT:    retq
  %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>, <64 x i8> zeroinitializer, i64 %m)
  ret <64 x i8> %res0
}
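; Added note on the PSHUFB mask encoding used above: any control byte with
; its high bit set (the value 128 here) forces the corresponding result byte
; to zero. Ten zeroing bytes followed by indices 0-5 in each 16-byte lane
; moves data exactly like shifting each lane left by ten bytes, hence the
; _as_pslldq naming; the psrldq tests below use index 15 plus fifteen
; zeroing bytes per lane for the right-shift analogue.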

define <64 x i8> @combine_pshufb_as_psrldq(<64 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_psrldq:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_psrldq:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
  %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>, <64 x i8> undef, i64 -1)
  ret <64 x i8> %res0
}
define <64 x i8> @combine_pshufb_as_psrldq_mask(<64 x i8> %a0, i64 %m) {
; X32-LABEL: combine_pshufb_as_psrldq_mask:
; X32:       # %bb.0:
; X32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
; X32-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_psrldq_mask:
; X64:       # %bb.0:
; X64-NEXT:    kmovq %rdi, %k1
; X64-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    retq
  %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>, <64 x i8> zeroinitializer, i64 %m)
  ret <64 x i8> %res0
}

define <32 x i16> @combine_permvar_as_pshuflw(<32 x i16> %a0) {
; X32-LABEL: combine_permvar_as_pshuflw:
; X32:       # %bb.0:
; X32-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
; X32-NEXT:    retl
;
; X64-LABEL: combine_permvar_as_pshuflw:
; X64:       # %bb.0:
; X64-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
; X64-NEXT:    retq
  %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>)
  ret <32 x i16> %1
}

define <32 x i16> @combine_pshufb_as_pshufhw(<32 x i16> %a0) {
; X32-LABEL: combine_pshufb_as_pshufhw:
; X32:       # %bb.0:
; X32-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
; X32-NEXT:    retl
;
; X64-LABEL: combine_pshufb_as_pshufhw:
; X64:       # %bb.0:
; X64-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
; X64-NEXT:    retq
  %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>)
  ret <32 x i16> %1
}

define <32 x i16> @combine_vpermi2var_32i16_as_pshufb(<32 x i16> %a0) {
; X32-LABEL: combine_vpermi2var_32i16_as_pshufb:
; X32:       # %bb.0:
; X32-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29,34,35,32,33,38,39,36,37,42,43,40,41,46,47,44,45,50,51,48,49,54,55,52,53,58,59,56,57,62,63,60,61]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_32i16_as_pshufb:
; X64:       # %bb.0:
; X64-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29,34,35,32,33,38,39,36,37,42,43,40,41,46,47,44,45,50,51,48,49,54,55,52,53,58,59,56,57,62,63,60,61]
; X64-NEXT:    retq
  %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>)
  %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %1, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>)
  ret <32 x i16> %2
}
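; Composition check for the test above (added comment): the first permvar
; swaps word pairs in the low half of each 128-bit lane (a pshuflw pattern)
; and the second swaps pairs in the high half (pshufhw), so together every
; adjacent word pair is swapped. Expressed in bytes that is
; [2,3,0,1,6,7,4,5,...], matching the single vpshufb in the checks.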

define <8 x double> @combine_vpermi2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_vpermi2var_8f64_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_8f64_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x1, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, i8 -1)
  ret <8 x double> %res1
}

define <8 x double> @combine_vpermi2var_8f64_as_shufpd(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_vpermi2var_8f64_as_shufpd:
; X32:       # %bb.0:
; X32-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[2],zmm1[2],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_8f64_as_shufpd:
; X64:       # %bb.0:
; X64-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[2],zmm1[2],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
; X64-NEXT:    retq
  %1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 1, i64 8, i64 2, i64 10, i64 5, i64 13, i64 6, i64 15>, <8 x double> %x1, i8 -1)
  ret <8 x double> %1
}

define <8 x i64> @combine_vpermi2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
; X32-LABEL: combine_vpermi2var_8i64_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_8i64_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %res0 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x1, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %res0, <8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, i8 -1)
  ret <8 x i64> %res1
}

define <16 x float> @combine_vpermi2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) {
; X32-LABEL: combine_vpermi2var_16f32_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_16f32_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x1, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, i16 -1)
  ret <16 x float> %res1
}

define <16 x i32> @combine_vpermi2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
; X32-LABEL: combine_vpermi2var_16i32_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_16i32_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %res0, <16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, i16 -1)
  ret <16 x i32> %res1
}

define <16 x float> @combine_vpermt2var_vpermi2var_16f32_as_unpckhps(<16 x float> %a0, <16 x float> %a1) {
; X32-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps:
; X32:       # %bb.0:
; X32-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps:
; X64:       # %bb.0:
; X64-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %a0, <16 x i32> <i32 18, i32 2, i32 19, i32 3, i32 22, i32 6, i32 23, i32 7, i32 26, i32 10, i32 27, i32 11, i32 30, i32 14, i32 31, i32 15>, <16 x float> %a1, i16 -1)
  ret <16 x float> %res0
}

define <16 x i32> @vpermt2var_vpermi2var_16i32_as_unpckldq(<16 x i32> %a0, <16 x i32> %a1) {
; X32-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq:
; X32:       # %bb.0:
; X32-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X32-NEXT:    retl
;
; X64-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq:
; X64:       # %bb.0:
; X64-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X64-NEXT:    retq
  %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %a0, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>, <16 x i32> %a1, i16 -1)
  ret <16 x i32> %res0
}
939
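; Identity combine for i16 elements; since both operands of the second permute are %res0, indices k and k+32 are interchangeable and the mask mixes both forms.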
define <32 x i16> @combine_vpermi2var_32i16_identity(<32 x i16> %x0, <32 x i16> %x1) {
; X32-LABEL: combine_vpermi2var_32i16_identity:
; X32:       # %bb.0:
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_32i16_identity:
; X64:       # %bb.0:
; X64-NEXT:    retq
  %res0 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x1, i32 -1)
  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, i32 -1)
  ret <32 x i16> %res1
}

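; Two single-source f64 permutes should merge into one vpermpd with the composed index vector.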
define <8 x double> @combine_vpermi2var_8f64_as_vpermpd(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_vpermi2var_8f64_as_vpermpd:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X32-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_8f64_as_vpermpd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; X64-NEXT:    retq
  %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 0, i64 7, i64 6, i64 5, i64 4>, <8 x double> %x1, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x double> %res0, i8 -1)
  ret <8 x double> %res1
}

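; The equivalent i64 vpermt2var chain merges the same way (emitted here as vpermpd).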
define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1) {
; X32-LABEL: combine_vpermt2var_8i64_as_vpermq:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X32-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_8i64_as_vpermq:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
; X64-NEXT:    retq
  %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 3, i64 2, i64 1, i64 0, i64 7, i64 6, i64 5, i64 4>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
  ret <8 x i64> %res1
}

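; Two single-source f32 permutes should merge into one vpermps.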
define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x float> %x1) {
; X32-LABEL: combine_vpermi2var_16f32_as_vpermps:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
; X32-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_16f32_as_vpermps:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
; X64-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; X64-NEXT:    retq
  %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x float> %x1, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>, <16 x float> %res0, i16 -1)
  ret <16 x float> %res1
}

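; The matching i32 vpermt2var chain also merges into one shuffle (emitted as vpermps).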
define <16 x i32> @combine_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) {
; X32-LABEL: combine_vpermt2var_16i32_as_vpermd:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
; X32-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16i32_as_vpermd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
; X64-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; X64-NEXT:    retq
  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
  ret <16 x i32> %res1
}

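; Two single-source i16 permutes should merge into one vpermw.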
define <32 x i16> @combine_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) {
; X32-LABEL: combine_vpermi2var_32i16_as_permw:
; X32:       # %bb.0:
; X32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
; X32-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_32i16_as_permw:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
; X64-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; X64-NEXT:    retq
  %res0 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0, i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16>, <32 x i16> %x1, i32 -1)
  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> <i16 0, i16 31, i16 1, i16 30, i16 2, i16 29, i16 3, i16 28, i16 4, i16 27, i16 5, i16 26, i16 6, i16 25, i16 7, i16 24, i16 8, i16 23, i16 9, i16 22, i16 10, i16 21, i16 11, i16 20, i16 12, i16 19, i16 13, i16 18, i16 14, i16 17, i16 15, i16 16>, <32 x i16> %res0, i32 -1)
  ret <32 x i16> %res1
}

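; A vpermi2var feeding a vpermt2var still needs both sources, so the chain should fold to a single vpermi2pd with a merged mask.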
define <8 x double> @combine_vpermi2var_vpermt2var_8f64_as_vperm2(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
; X32:       # %bb.0:
; X32-NEXT:    vmovapd {{.*#+}} zmm2 = [4,0,14,0,3,0,12,0,7,0,8,0,0,0,15,0]
; X32-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
; X32-NEXT:    vmovapd %zmm2, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
; X64:       # %bb.0:
; X64-NEXT:    vmovapd {{.*#+}} zmm2 = [4,14,3,12,7,8,0,15]
; X64-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
  %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 15, i64 0, i64 8, i64 7, i64 12, i64 6, i64 11, i64 4>, <8 x double> %x1, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x double> %res0, <8 x double> %res0, i8 -1)
  ret <8 x double> %res1
}

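; The i32 chain folds to a single vpermt2d with a merged mask.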
define <16 x i32> @combine_vpermi2var_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) {
; X32-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd:
; X32:       # %bb.0:
; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
; X32-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
; X64-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
; X64-NEXT:    retq
  %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 0, i32 31, i32 2, i32 29, i32 4, i32 27, i32 6, i32 25, i32 8, i32 23, i32 10, i32 21, i32 12, i32 19, i32 14, i32 17>, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 0, i32 17, i32 2, i32 18, i32 4, i32 19, i32 6, i32 21, i32 8, i32 23, i32 10, i32 25, i32 12, i32 27, i32 14, i32 29>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
  ret <16 x i32> %res1
}

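; The i16 chain folds to a single vpermi2w with a merged index vector.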
define <32 x i16> @combine_vpermt2var_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) {
; X32-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
; X32:       # %bb.0:
; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
; X32-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
; X32-NEXT:    vmovdqa64 %zmm2, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
; X64-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
; X64-NEXT:    vmovdqa64 %zmm2, %zmm0
; X64-NEXT:    retq
  %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 0, i16 63, i16 1, i16 61, i16 2, i16 59, i16 3, i16 57, i16 4, i16 55, i16 5, i16 53, i16 6, i16 51, i16 7, i16 49, i16 8, i16 47, i16 9, i16 45, i16 10, i16 43, i16 11, i16 41, i16 12, i16 39, i16 13, i16 37, i16 14, i16 35, i16 15, i16 33>, <32 x i16> %x0, <32 x i16> %x1, i32 -1)
  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0, i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16>, <32 x i16> %res0, i32 -1)
  ret <32 x i16> %res1
}

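; Blending with zeroinitializer and then permuting should fold to one vpermt2pd against a zeroed register.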
define <8 x double> @combine_vpermi2var_vpermvar_8f64_as_vperm2_zero(<8 x double> %x0) {
; X32-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero:
; X32:       # %bb.0:
; X32-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X32-NEXT:    vmovapd {{.*#+}} zmm2 = [8,0,3,0,10,0,11,0,1,0,7,0,14,0,5,0]
; X32-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vmovapd {{.*#+}} zmm2 = [8,3,10,11,1,7,14,5]
; X64-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
; X64-NEXT:    retq
  %res0 = shufflevector <8 x double> %x0, <8 x double> zeroinitializer, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %res0, <8 x i64> <i64 3, i64 2, i64 1, i64 7, i64 0, i64 6, i64 5, i64 4>)
  ret <8 x double> %1
}

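; The same zero-blend fold for f32: a single vpermt2ps against a zeroed register.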
define <16 x float> @combine_vpermi2var_vpermvar_16f32_as_vperm2_zero(<16 x float> %x0) {
; X32-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT:    vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8]
; X32-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8]
; X64-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
; X64-NEXT:    retq
  %res0 = shufflevector <16 x float> %x0, <16 x float> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 0, i32 14, i32 2, i32 12, i32 4, i32 10, i32 3, i32 12, i32 4, i32 11, i32 5, i32 10, i32 6, i32 9, i32 7, i32 8>, <16 x float> %res0, i16 -1)
  ret <16 x float> %res1
}

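; A permvar with an all-zero index vector applied to an insertelement is just a broadcast of the inserted scalar.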
define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) {
; X32-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpbroadcastq %xmm0, %zmm0
; X64-NEXT:    retq
  %1 = insertelement <8 x i64> undef, i64 %a0, i32 0
  %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> zeroinitializer)
  ret <8 x i64> %2
}
