; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,X86-AVX512F
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86,X86-AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,X64-AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64,X64-AVX512BW

; Shuffle-combining tests: chained AVX-512 variable permutes (reversal pairs
; that compose to the identity, and duplication/permute patterns) should fold
; to nothing or to a single immediate shuffle (vmovddup/vmovshdup/vmovsldup/
; vpermilps), as the autogenerated CHECK lines verify. Masked and load-folding
; variants are covered for each pattern.

declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)

declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)

declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)

declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)

declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

; Two reversing vpermpd chains compose to the identity -> no instructions.
define <8 x double> @combine_permvar_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_permvar_8f64_identity:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
  %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %1, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
  ret <8 x double> %2
}
; Masked variant: the selects keep both permutes live, so no folding occurs.
define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X86-AVX512F-LABEL: combine_permvar_8f64_identity_mask:
; X86-AVX512F: # %bb.0:
; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT: kmovw %eax, %k1
; X86-AVX512F-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1}
; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-AVX512F-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1}
; X86-AVX512F-NEXT: vmovapd %zmm1, %zmm0
; X86-AVX512F-NEXT: retl
;
; X86-AVX512BW-LABEL: combine_permvar_8f64_identity_mask:
; X86-AVX512BW: # %bb.0:
; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512BW-NEXT: kmovd %eax, %k1
; X86-AVX512BW-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1}
; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-AVX512BW-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1}
; X86-AVX512BW-NEXT: vmovapd %zmm1, %zmm0
; X86-AVX512BW-NEXT: retl
;
; X64-AVX512F-LABEL: combine_permvar_8f64_identity_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-AVX512F-NEXT: kmovw %edi, %k1
; X64-AVX512F-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1}
; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-AVX512F-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1}
; X64-AVX512F-NEXT: vmovapd %zmm1, %zmm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_permvar_8f64_identity_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-AVX512BW-NEXT: kmovd %edi, %k1
; X64-AVX512BW-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1}
; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-AVX512BW-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1}
; X64-AVX512BW-NEXT: vmovapd %zmm1, %zmm0
; X64-AVX512BW-NEXT: retq
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
  %2 = bitcast i8 %m to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x1
  %4 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %3, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
  %5 = bitcast i8 %m to <8 x i1>
  %6 = select <8 x i1> %5, <8 x double> %4, <8 x double> %3
  ret <8 x double> %6
}

; Same identity-composition test for the integer (vpermq) form.
define <8 x i64> @combine_permvar_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
; CHECK-LABEL: combine_permvar_8i64_identity:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
  %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
  ret <8 x i64> %2
}
define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
; X86-AVX512F-LABEL: combine_permvar_8i64_identity_mask:
; X86-AVX512F: # %bb.0:
; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT: kmovw %eax, %k1
; X86-AVX512F-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1}
; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-AVX512F-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1}
; X86-AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-AVX512F-NEXT: retl
;
; X86-AVX512BW-LABEL: combine_permvar_8i64_identity_mask:
; X86-AVX512BW: # %bb.0:
; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512BW-NEXT: kmovd %eax, %k1
; X86-AVX512BW-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1}
; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-AVX512BW-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1}
; X86-AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-AVX512BW-NEXT: retl
;
; X64-AVX512F-LABEL: combine_permvar_8i64_identity_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-AVX512F-NEXT: kmovw %edi, %k1
; X64-AVX512F-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1}
; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-AVX512F-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1}
; X64-AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_permvar_8i64_identity_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-AVX512BW-NEXT: kmovd %edi, %k1
; X64-AVX512BW-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1}
; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-AVX512BW-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1}
; X64-AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; X64-AVX512BW-NEXT: retq
  %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>)
  %2 = bitcast i8 %m to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1
  %4 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %3, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>)
  %5 = bitcast i8 %m to <8 x i1>
  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %3
  ret <8 x i64> %6
}

; vpermt2var chains that compose to the identity.
define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_vpermt2var_8f64_identity:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 -1)
  ret <8 x double> %res1
}
define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X86-AVX512F-LABEL: combine_vpermt2var_8f64_identity_mask:
; X86-AVX512F: # %bb.0:
; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT: kmovw %eax, %k1
; X86-AVX512F-NEXT: vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-AVX512F-NEXT: vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-AVX512F-NEXT: retl
;
; X86-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask:
; X86-AVX512BW: # %bb.0:
; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512BW-NEXT: kmovd %eax, %k1
; X86-AVX512BW-NEXT: vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-AVX512BW-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-AVX512BW-NEXT: vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-AVX512BW-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_8f64_identity_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-AVX512F-NEXT: kmovw %edi, %k1
; X64-AVX512F-NEXT: vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-AVX512F-NEXT: vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_8f64_identity_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-AVX512BW-NEXT: kmovd %edi, %k1
; X64-AVX512BW-NEXT: vpermi2pd %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-AVX512BW-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-AVX512BW-NEXT: vpermi2pd %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512BW-NEXT: retq
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 %m)
  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 %m)
  ret <8 x double> %res1
}

; Even-lane duplication folds to a single vmovddup (undef indices allowed).
define <8 x double> @combine_vpermt2var_8f64_movddup(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_vpermt2var_8f64_movddup:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 undef, i64 undef>, <8 x double> %x0, <8 x double> %x1, i8 -1)
  ret <8 x double> %res0
}
define <8 x double> @combine_vpermt2var_8f64_movddup_load(<8 x double> *%p0, <8 x double> %x1) {
; X86-LABEL: combine_vpermt2var_8f64_movddup_load:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_8f64_movddup_load:
; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
  %x0 = load <8 x double>, <8 x double> *%p0
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 -1)
  ret <8 x double> %res0
}
define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X86-AVX512F-LABEL: combine_vpermt2var_8f64_movddup_mask:
; X86-AVX512F: # %bb.0:
; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT: kmovw %eax, %k1
; X86-AVX512F-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X86-AVX512F-NEXT: retl
;
; X86-AVX512BW-LABEL: combine_vpermt2var_8f64_movddup_mask:
; X86-AVX512BW: # %bb.0:
; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512BW-NEXT: kmovd %eax, %k1
; X86-AVX512BW-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X86-AVX512BW-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_8f64_movddup_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: kmovw %edi, %k1
; X64-AVX512F-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_8f64_movddup_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: kmovd %edi, %k1
; X64-AVX512BW-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X64-AVX512BW-NEXT: retq
  %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 %m)
  ret <8 x double> %res0
}

define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
; CHECK-LABEL: combine_vpermt2var_8i64_identity:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
  %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
  ret <8 x i64> %res1
}
define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
; X86-AVX512F-LABEL: combine_vpermt2var_8i64_identity_mask:
; X86-AVX512F: # %bb.0:
; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT: kmovw %eax, %k1
; X86-AVX512F-NEXT: vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-AVX512F-NEXT: vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-AVX512F-NEXT: retl
;
; X86-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask:
; X86-AVX512BW: # %bb.0:
; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX512BW-NEXT: kmovd %eax, %k1
; X86-AVX512BW-NEXT: vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
; X86-AVX512BW-NEXT: vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-AVX512BW-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_8i64_identity_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-AVX512F-NEXT: kmovw %edi, %k1
; X64-AVX512F-NEXT: vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-AVX512F-NEXT: vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_8i64_identity_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-AVX512BW-NEXT: kmovd %edi, %k1
; X64-AVX512BW-NEXT: vpermi2q %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
; X64-AVX512BW-NEXT: vpermi2q %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512BW-NEXT: retq
  %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 %m)
  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 %m)
  ret <8 x i64> %res1
}

define <16 x float> @combine_vpermt2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_identity:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 -1)
  ret <16 x float> %res1
}
define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_identity_mask:
; X86: # %bb.0:
; X86-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X86-NEXT: vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_identity_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-AVX512F-NEXT: kmovw %edi, %k1
; X64-AVX512F-NEXT: vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-AVX512F-NEXT: vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_identity_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovaps {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-AVX512BW-NEXT: kmovd %edi, %k1
; X64-AVX512BW-NEXT: vpermi2ps %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-AVX512BW-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-AVX512BW-NEXT: vpermi2ps %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512BW-NEXT: retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 %m)
  ret <16 x float> %res1
}

; f32 pair-duplication patterns fold to vmovddup (or vpermilps when masked).
define <16 x float> @combine_vpermt2var_16f32_vmovddup(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <16 x float> %x1) {
; X86-LABEL: combine_vpermt2var_16f32_vmovddup_load:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
; X86-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovddup_load:
; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X86-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: kmovw %edi, %k1
; X64-AVX512F-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: kmovd %edi, %k1
; X64-AVX512BW-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-AVX512BW-NEXT: retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X86-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: kmovw %esi, %k1
; X64-AVX512F-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: kmovd %esi, %k1
; X64-AVX512BW-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-AVX512BW-NEXT: retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vmovshdup(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovshdup:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovshdup_load(<16 x float> *%p0, <16 x float> %x1) {
; X86-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
; X64: # %bb.0:
; X64-NEXT: vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovshdup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: kmovw %edi, %k1
; X64-AVX512F-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: kmovd %edi, %k1
; X64-AVX512BW-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-AVX512BW-NEXT: retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vmovsldup(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vmovsldup:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_load(<16 x float> *%p0, <16 x float> %x1) {
; X86-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
; X64: # %bb.0:
; X64-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: kmovw %edi, %k1
; X64-AVX512F-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: kmovd %edi, %k1
; X64-AVX512BW-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-AVX512BW-NEXT: retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: kmovw %esi, %k1
; X64-AVX512F-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: kmovd %esi, %k1
; X64-AVX512BW-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-AVX512BW-NEXT: retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}

define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x float> %x1) {
; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps:
; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-NEXT: ret{{[l|q]}}
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vpermilps_load(<16 x float> *%p0, <16 x float> %x1) {
; X86-LABEL: combine_vpermt2var_16f32_vpermilps_load:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X86-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vpermilps_load:
; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-NEXT: retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X86-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: kmovw %edi, %k1
; X64-AVX512F-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: kmovd %edi, %k1
; X64-AVX512BW-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-AVX512BW-NEXT: retq
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X86-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: kmovw %esi, %k1
; X64-AVX512F-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: kmovd %esi, %k1
; X64-AVX512BW-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-AVX512BW-NEXT: retq
  %x0 = load <16 x float>, <16 x float> *%p0
  %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
  ret <16 x float> %res0
}

define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
; CHECK-LABEL: combine_vpermt2var_16i32_identity:
; CHECK: # %bb.0:
; CHECK-NEXT: ret{{[l|q]}}
  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
  ret <16 x i32> %res1
}
define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) {
; X86-LABEL: combine_vpermt2var_16i32_identity_mask:
; X86: # %bb.0:
; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X86-NEXT: vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-AVX512F-LABEL: combine_vpermt2var_16i32_identity_mask:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-AVX512F-NEXT: kmovw %edi, %k1
; X64-AVX512F-NEXT: vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-AVX512F-NEXT: vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: combine_vpermt2var_16i32_identity_mask:
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-AVX512BW-NEXT: kmovd %edi, %k1
; X64-AVX512BW-NEXT: vpermi2d %zmm0, %zmm0, %zmm1 {%k1} {z}
; X64-AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-AVX512BW-NEXT: vpermi2d %zmm1, %zmm1, %zmm0 {%k1} {z}
; X64-AVX512BW-NEXT: retq
  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %x0, <16 x i32> %x1, i16 %m)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 %m)
  ret <16 x i32> %res1
}

define <16 x i32> @combine_permvar_as_vpbroadcastd512(<16 x i32> %x0) {
; CHECK-LABEL: combine_permvar_as_vpbroadcastd512:
;
CHECK: # %bb.0: 630; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 631; CHECK-NEXT: ret{{[l|q]}} 632 %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> zeroinitializer) 633 ret <16 x i32> %1 634} 635 636define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) { 637; CHECK-LABEL: combine_permvar_as_vpbroadcastq512: 638; CHECK: # %bb.0: 639; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 640; CHECK-NEXT: ret{{[l|q]}} 641 %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> zeroinitializer) 642 ret <8 x i64> %1 643} 644 645define <8 x i64> @combine_permvar_8i64_as_permq(<8 x i64> %x0, <8 x i64> %x1) { 646; CHECK-LABEL: combine_permvar_8i64_as_permq: 647; CHECK: # %bb.0: 648; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] 649; CHECK-NEXT: ret{{[l|q]}} 650 %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>) 651 ret <8 x i64> %1 652} 653define <8 x i64> @combine_permvar_8i64_as_permq_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) { 654; X86-AVX512F-LABEL: combine_permvar_8i64_as_permq_mask: 655; X86-AVX512F: # %bb.0: 656; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax 657; X86-AVX512F-NEXT: kmovw %eax, %k1 658; X86-AVX512F-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4] 659; X86-AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 660; X86-AVX512F-NEXT: retl 661; 662; X86-AVX512BW-LABEL: combine_permvar_8i64_as_permq_mask: 663; X86-AVX512BW: # %bb.0: 664; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax 665; X86-AVX512BW-NEXT: kmovd %eax, %k1 666; X86-AVX512BW-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4] 667; X86-AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 668; X86-AVX512BW-NEXT: retl 669; 670; X64-AVX512F-LABEL: combine_permvar_8i64_as_permq_mask: 671; X64-AVX512F: # %bb.0: 672; X64-AVX512F-NEXT: kmovw %edi, %k1 673; X64-AVX512F-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4] 674; X64-AVX512F-NEXT: vmovdqa64 %zmm1, 
%zmm0 675; X64-AVX512F-NEXT: retq 676; 677; X64-AVX512BW-LABEL: combine_permvar_8i64_as_permq_mask: 678; X64-AVX512BW: # %bb.0: 679; X64-AVX512BW-NEXT: kmovd %edi, %k1 680; X64-AVX512BW-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4] 681; X64-AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 682; X64-AVX512BW-NEXT: retq 683 %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>) 684 %2 = bitcast i8 %m to <8 x i1> 685 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1 686 ret <8 x i64> %3 687} 688 689define <8 x double> @combine_permvar_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1) { 690; CHECK-LABEL: combine_permvar_8f64_as_permpd: 691; CHECK: # %bb.0: 692; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] 693; CHECK-NEXT: ret{{[l|q]}} 694 %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>) 695 ret <8 x double> %1 696} 697define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) { 698; X86-AVX512F-LABEL: combine_permvar_8f64_as_permpd_mask: 699; X86-AVX512F: # %bb.0: 700; X86-AVX512F-NEXT: movzbl {{[0-9]+}}(%esp), %eax 701; X86-AVX512F-NEXT: kmovw %eax, %k1 702; X86-AVX512F-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4] 703; X86-AVX512F-NEXT: vmovapd %zmm1, %zmm0 704; X86-AVX512F-NEXT: retl 705; 706; X86-AVX512BW-LABEL: combine_permvar_8f64_as_permpd_mask: 707; X86-AVX512BW: # %bb.0: 708; X86-AVX512BW-NEXT: movzbl {{[0-9]+}}(%esp), %eax 709; X86-AVX512BW-NEXT: kmovd %eax, %k1 710; X86-AVX512BW-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4] 711; X86-AVX512BW-NEXT: vmovapd %zmm1, %zmm0 712; X86-AVX512BW-NEXT: retl 713; 714; X64-AVX512F-LABEL: combine_permvar_8f64_as_permpd_mask: 715; X64-AVX512F: # %bb.0: 716; X64-AVX512F-NEXT: kmovw %edi, %k1 717; X64-AVX512F-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = 
zmm0[3,2,1,0,7,6,5,4] 718; X64-AVX512F-NEXT: vmovapd %zmm1, %zmm0 719; X64-AVX512F-NEXT: retq 720; 721; X64-AVX512BW-LABEL: combine_permvar_8f64_as_permpd_mask: 722; X64-AVX512BW: # %bb.0: 723; X64-AVX512BW-NEXT: kmovd %edi, %k1 724; X64-AVX512BW-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4] 725; X64-AVX512BW-NEXT: vmovapd %zmm1, %zmm0 726; X64-AVX512BW-NEXT: retq 727 %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>) 728 %2 = bitcast i8 %m to <8 x i1> 729 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x1 730 ret <8 x double> %3 731} 732 733define <16 x float> @combine_vpermilvar_16f32_230146759A8BCFDE(<16 x float> %x0) { 734; CHECK-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE: 735; CHECK: # %bb.0: 736; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,4,6,7,5,9,10,8,11,12,15,13,14] 737; CHECK-NEXT: ret{{[l|q]}} 738 %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 0, i32 3, i32 2>, <16 x float> undef, i16 -1) 739 %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %res0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 1, i32 0, i32 2, i32 3, i32 0, i32 2, i32 1, i32 1, i32 2, i32 0, i32 3>, <16 x float> undef, i16 -1) 740 ret <16 x float> %res1 741} 742 743define <8 x double> @combine_vpermi2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) { 744; CHECK-LABEL: combine_vpermi2var_8f64_identity: 745; CHECK: # %bb.0: 746; CHECK-NEXT: ret{{[l|q]}} 747 %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x1, i8 -1) 748 %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, 
i64 8>, <8 x double> %res0, i8 -1) 749 ret <8 x double> %res1 750} 751 752define <8 x double> @combine_vpermi2var_8f64_as_shufpd(<8 x double> %x0, <8 x double> %x1) { 753; CHECK-LABEL: combine_vpermi2var_8f64_as_shufpd: 754; CHECK: # %bb.0: 755; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[2],zmm1[2],zmm0[5],zmm1[5],zmm0[6],zmm1[7] 756; CHECK-NEXT: ret{{[l|q]}} 757 %1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 1, i64 8, i64 2, i64 10, i64 5, i64 13, i64 6, i64 15>, <8 x double> %x1, i8 -1) 758 ret <8 x double> %1 759} 760 761define <8 x i64> @combine_vpermi2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) { 762; CHECK-LABEL: combine_vpermi2var_8i64_identity: 763; CHECK: # %bb.0: 764; CHECK-NEXT: ret{{[l|q]}} 765 %res0 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x1, i8 -1) 766 %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %res0, <8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, i8 -1) 767 ret <8 x i64> %res1 768} 769 770define <16 x float> @combine_vpermi2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) { 771; CHECK-LABEL: combine_vpermi2var_16f32_identity: 772; CHECK: # %bb.0: 773; CHECK-NEXT: ret{{[l|q]}} 774 %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x1, i16 -1) 775 %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, i16 -1) 776 ret <16 x float> %res1 777} 778 779define <16 x i32> @combine_vpermi2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) { 780; 
CHECK-LABEL: combine_vpermi2var_16i32_identity: 781; CHECK: # %bb.0: 782; CHECK-NEXT: ret{{[l|q]}} 783 %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x1, i16 -1) 784 %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %res0, <16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, i16 -1) 785 ret <16 x i32> %res1 786} 787 788define <16 x float> @combine_vpermt2var_vpermi2var_16f32_as_unpckhps(<16 x float> %a0, <16 x float> %a1) { 789; CHECK-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps: 790; CHECK: # %bb.0: 791; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] 792; CHECK-NEXT: ret{{[l|q]}} 793 %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %a0, <16 x i32> <i32 18, i32 2, i32 19, i32 3, i32 22, i32 6, i32 23, i32 7, i32 26, i32 10, i32 27, i32 11, i32 30, i32 14, i32 31, i32 15>, <16 x float> %a1, i16 -1) 794 ret <16 x float> %res0 795} 796 797define <16 x i32> @vpermt2var_vpermi2var_16i32_as_unpckldq(<16 x i32> %a0, <16 x i32> %a1) { 798; CHECK-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq: 799; CHECK: # %bb.0: 800; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] 801; CHECK-NEXT: ret{{[l|q]}} 802 %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %a0, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>, <16 x i32> %a1, i16 -1) 803 ret <16 x i32> %res0 804} 805 806define <8 x double> 
@combine_vpermi2var_8f64_as_vpermpd(<8 x double> %x0, <8 x double> %x1) { 807; X86-LABEL: combine_vpermi2var_8f64_as_vpermpd: 808; X86: # %bb.0: 809; X86-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] 810; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 811; X86-NEXT: retl 812; 813; X64-LABEL: combine_vpermi2var_8f64_as_vpermpd: 814; X64: # %bb.0: 815; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] 816; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 817; X64-NEXT: retq 818 %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 0, i64 7, i64 6, i64 5, i64 4>, <8 x double> %x1, i8 -1) 819 %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x double> %res0, i8 -1) 820 ret <8 x double> %res1 821} 822 823define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1) { 824; X86-LABEL: combine_vpermt2var_8i64_as_vpermq: 825; X86: # %bb.0: 826; X86-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] 827; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 828; X86-NEXT: retl 829; 830; X64-LABEL: combine_vpermt2var_8i64_as_vpermq: 831; X64: # %bb.0: 832; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] 833; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 834; X64-NEXT: retq 835 %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 3, i64 2, i64 1, i64 0, i64 7, i64 6, i64 5, i64 4>, <8 x i64> %x0, <8 x i64> %x1, i8 -1) 836 %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x i64> %res0, <8 x i64> %res0, i8 -1) 837 ret <8 x i64> %res1 838} 839 840define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x float> %x1) { 841; CHECK-LABEL: combine_vpermi2var_16f32_as_vpermps: 842; CHECK: # %bb.0: 843; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = 
[7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] 844; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 845; CHECK-NEXT: ret{{[l|q]}} 846 %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x float> %x1, i16 -1) 847 %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>, <16 x float> %res0, i16 -1) 848 ret <16 x float> %res1 849} 850 851define <16 x i32> @combine_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) { 852; CHECK-LABEL: combine_vpermt2var_16i32_as_vpermd: 853; CHECK: # %bb.0: 854; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] 855; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 856; CHECK-NEXT: ret{{[l|q]}} 857 %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x i32> %x0, <16 x i32> %x1, i16 -1) 858 %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>, <16 x i32> %res0, <16 x i32> %res0, i16 -1) 859 ret <16 x i32> %res1 860} 861 862define <16 x i32> @combine_vpermt2var_16i32_as_vpsrlq(<16 x i32> %x0) { 863; CHECK-LABEL: combine_vpermt2var_16i32_as_vpsrlq: 864; CHECK: # %bb.0: 865; CHECK-NEXT: vpsrlq $32, %zmm0, %zmm0 866; CHECK-NEXT: ret{{[l|q]}} 867 %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 1, i32 16, i32 3, i32 16, i32 5, i32 16, i32 7, i32 16, i32 9, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>, <16 x i32> %x0, <16 x i32> zeroinitializer, i16 -1) 868 ret <16 x i32> %res0 
869} 870 871define <16 x i32> @combine_vpermt2var_16i32_as_vpsllq(<16 x i32> %x0) { 872; CHECK-LABEL: combine_vpermt2var_16i32_as_vpsllq: 873; CHECK: # %bb.0: 874; CHECK-NEXT: vpsllq $32, %zmm0, %zmm0 875; CHECK-NEXT: ret{{[l|q]}} 876 %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>, <16 x i32> %x0, <16 x i32> zeroinitializer, i16 -1) 877 ret <16 x i32> %res0 878} 879 880define <8 x double> @combine_vpermi2var_vpermt2var_8f64_as_vperm2(<8 x double> %x0, <8 x double> %x1) { 881; X86-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2: 882; X86: # %bb.0: 883; X86-NEXT: vmovapd {{.*#+}} zmm2 = [4,0,14,0,3,0,12,0,7,0,8,0,0,0,15,0] 884; X86-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 885; X86-NEXT: vmovapd %zmm2, %zmm0 886; X86-NEXT: retl 887; 888; X64-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2: 889; X64: # %bb.0: 890; X64-NEXT: vmovapd {{.*#+}} zmm2 = [4,14,3,12,7,8,0,15] 891; X64-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2 892; X64-NEXT: vmovapd %zmm2, %zmm0 893; X64-NEXT: retq 894 %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 15, i64 0, i64 8, i64 7, i64 12, i64 6, i64 11, i64 4>, <8 x double> %x1, i8 -1) 895 %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x double> %res0, <8 x double> %res0, i8 -1) 896 ret <8 x double> %res1 897} 898 899define <8 x double> @combine_vpermi2var_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1, i64 %a2) { 900; X86-LABEL: combine_vpermi2var_8f64_as_permpd: 901; X86: # %bb.0: 902; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero 903; X86-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0] 904; X86-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm2, %ymm2 905; X86-NEXT: vinsertf64x4 $1, {{\.LCPI.*}}, %zmm2, %zmm2 906; X86-NEXT: vpermi2pd %zmm1, %zmm0, %zmm2 907; X86-NEXT: 
vpermpd {{.*#+}} zmm0 = zmm2[2,3,1,1,6,7,5,5] 908; X86-NEXT: retl 909; 910; X64-LABEL: combine_vpermi2var_8f64_as_permpd: 911; X64: # %bb.0: 912; X64-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,3,2,2,5,7,6,6] 913; X64-NEXT: retq 914 %res0 = insertelement <8 x i64> <i64 0, i64 2, i64 1, i64 3, i64 4, i64 6, i64 5, i64 7>, i64 %a2, i32 0 915 %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %res0, <8 x double> %x1, i8 -1) 916 %res2 = shufflevector <8 x double> %res1, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 1, i32 6, i32 7, i32 5, i32 5> 917 ret <8 x double> %res2 918} 919 920define <16 x i32> @combine_vpermi2var_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) { 921; CHECK-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd: 922; CHECK: # %bb.0: 923; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19] 924; CHECK-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 925; CHECK-NEXT: ret{{[l|q]}} 926 %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 0, i32 31, i32 2, i32 29, i32 4, i32 27, i32 6, i32 25, i32 8, i32 23, i32 10, i32 21, i32 12, i32 19, i32 14, i32 17>, <16 x i32> %x1, i16 -1) 927 %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 0, i32 17, i32 2, i32 18, i32 4, i32 19, i32 6, i32 21, i32 8, i32 23, i32 10, i32 25, i32 12, i32 27, i32 14, i32 29>, <16 x i32> %res0, <16 x i32> %res0, i16 -1) 928 ret <16 x i32> %res1 929} 930 931define <8 x double> @combine_vpermi2var_vpermvar_8f64_as_vperm2_zero(<8 x double> %x0) { 932; X86-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero: 933; X86: # %bb.0: 934; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 935; X86-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,3,0,10,0,11,0,1,0,7,0,14,0,5,0] 936; X86-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 937; X86-NEXT: retl 938; 939; X64-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero: 940; X64: # %bb.0: 941; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 
942; X64-NEXT: vmovapd {{.*#+}} zmm2 = [8,3,10,11,1,7,14,5] 943; X64-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0 944; X64-NEXT: retq 945 %res0 = shufflevector <8 x double> %x0, <8 x double> zeroinitializer, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> 946 %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %res0, <8 x i64> <i64 3, i64 2, i64 1, i64 7, i64 0, i64 6, i64 5, i64 4>) 947 ret <8 x double> %1 948} 949 950define <16 x float> @combine_vpermi2var_vpermvar_16f32_as_vperm2_zero(<16 x float> %x0) { 951; CHECK-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero: 952; CHECK: # %bb.0: 953; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 954; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8] 955; CHECK-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 956; CHECK-NEXT: ret{{[l|q]}} 957 %res0 = shufflevector <16 x float> %x0, <16 x float> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> 958 %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 0, i32 14, i32 2, i32 12, i32 4, i32 10, i32 3, i32 12, i32 4, i32 11, i32 5, i32 10, i32 6, i32 9, i32 7, i32 8>, <16 x float> %res0, i16 -1) 959 ret <16 x float> %res1 960} 961 962define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) { 963; X86-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64: 964; X86: # %bb.0: 965; X86-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 966; X86-NEXT: retl 967; 968; X64-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64: 969; X64: # %bb.0: 970; X64-NEXT: vpbroadcastq %rdi, %zmm0 971; X64-NEXT: retq 972 %1 = insertelement <8 x i64> undef, i64 %a0, i32 0 973 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> zeroinitializer) 974 ret <8 x i64> %2 975} 976