1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=X32 3; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=X64 4 5declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) 6 7declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16) 8 9declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) 10declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) 11declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) 12declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) 13 14declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8) 15declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16) 16 17declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) 18declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) 19declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) 20 21declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8) 22declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16) 23 24declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) 25declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) 26declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) 27 28define <8 x double> @combine_permvar_8f64_identity(<8 x double> %x0, <8 x double> %x1) { 29; X32-LABEL: combine_permvar_8f64_identity: 30; X32: # %bb.0: 31; X32-NEXT: retl 32; 33; X64-LABEL: combine_permvar_8f64_identity: 34; X64: # %bb.0: 35; X64-NEXT: retq 36 %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>) 37 %2 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %1, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>) 38 ret <8 x double> %2 39} 40define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) { 41; X32-LABEL: combine_permvar_8f64_identity_mask: 42; X32: # %bb.0: 43; X32-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] 44; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 45; X32-NEXT: kmovd %eax, %k1 46; X32-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1} 47; X32-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] 48; X32-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1} 49; X32-NEXT: vmovapd %zmm1, %zmm0 50; X32-NEXT: retl 51; 52; X64-LABEL: combine_permvar_8f64_identity_mask: 53; X64: # %bb.0: 54; X64-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] 55; X64-NEXT: kmovd %edi, %k1 56; X64-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1} 57; X64-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] 58; X64-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1} 59; X64-NEXT: vmovapd %zmm1, %zmm0 60; X64-NEXT: retq 61 %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>) 62 %2 = bitcast i8 %m to <8 x i1> 63 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x1 64 %4 = 
call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %3, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>) 65 %5 = bitcast i8 %m to <8 x i1> 66 %6 = select <8 x i1> %5, <8 x double> %4, <8 x double> %3 67 ret <8 x double> %6 68} 69 70define <8 x i64> @combine_permvar_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) { 71; X32-LABEL: combine_permvar_8i64_identity: 72; X32: # %bb.0: 73; X32-NEXT: retl 74; 75; X64-LABEL: combine_permvar_8i64_identity: 76; X64: # %bb.0: 77; X64-NEXT: retq 78 %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>) 79 %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>) 80 ret <8 x i64> %2 81} 82define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) { 83; X32-LABEL: combine_permvar_8i64_identity_mask: 84; X32: # %bb.0: 85; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] 86; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 87; X32-NEXT: kmovd %eax, %k1 88; X32-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1} 89; X32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] 90; X32-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1} 91; X32-NEXT: vmovdqa64 %zmm1, %zmm0 92; X32-NEXT: retl 93; 94; X64-LABEL: combine_permvar_8i64_identity_mask: 95; X64: # %bb.0: 96; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] 97; X64-NEXT: kmovd %edi, %k1 98; X64-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1} 99; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] 100; X64-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1} 101; X64-NEXT: vmovdqa64 %zmm1, %zmm0 102; X64-NEXT: retq 103 %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>) 104 %2 = bitcast i8 %m to <8 x i1> 105 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1 106 %4 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %3, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>) 107 %5 = bitcast i8 %m to <8 x i1> 108 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %3 109 ret <8 x i64> %6 110} 111 112define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) { 113; X32-LABEL: combine_vpermt2var_8f64_identity: 114; X32: # %bb.0: 115; X32-NEXT: retl 116; 117; X64-LABEL: combine_vpermt2var_8f64_identity: 118; X64: # %bb.0: 119; X64-NEXT: retq 120 %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 -1) 121 %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 -1) 122 ret <8 x double> %res1 123} 124define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) { 125; X32-LABEL: combine_vpermt2var_8f64_identity_mask: 126; X32: # %bb.0: 127; X32-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] 128; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 129; X32-NEXT: kmovd %eax, %k1 130; X32-NEXT: vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z} 131; X32-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] 132; X32-NEXT: vpermi2pd %zmm2, %zmm2, %zmm0 {%k1} {z} 133; X32-NEXT: retl 134; 135; X64-LABEL: combine_vpermt2var_8f64_identity_mask: 136; X64: # %bb.0: 137; X64-NEXT: vmovapd {{.*#+}} 
zmm2 = [7,6,5,4,3,2,1,0] 138; X64-NEXT: kmovd %edi, %k1 139; X64-NEXT: vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z} 140; X64-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] 141; X64-NEXT: vpermi2pd %zmm2, %zmm2, %zmm0 {%k1} {z} 142; X64-NEXT: retq 143 %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 %m) 144 %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 %m) 145 ret <8 x double> %res1 146} 147 148define <8 x double> @combine_vpermt2var_8f64_movddup(<8 x double> %x0, <8 x double> %x1) { 149; X32-LABEL: combine_vpermt2var_8f64_movddup: 150; X32: # %bb.0: 151; X32-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6] 152; X32-NEXT: retl 153; 154; X64-LABEL: combine_vpermt2var_8f64_movddup: 155; X64: # %bb.0: 156; X64-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6] 157; X64-NEXT: retq 158 %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 undef, i64 undef>, <8 x double> %x0, <8 x double> %x1, i8 -1) 159 ret <8 x double> %res0 160} 161define <8 x double> @combine_vpermt2var_8f64_movddup_load(<8 x double> *%p0, <8 x double> %x1) { 162; X32-LABEL: combine_vpermt2var_8f64_movddup_load: 163; X32: # %bb.0: 164; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 165; X32-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6] 166; X32-NEXT: retl 167; 168; X64-LABEL: combine_vpermt2var_8f64_movddup_load: 169; X64: # %bb.0: 170; X64-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6] 171; X64-NEXT: retq 172 %x0 = load <8 x double>, <8 x double> *%p0 173 %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 -1) 174 ret <8 x double> %res0 175} 176define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) { 177; X32-LABEL: combine_vpermt2var_8f64_movddup_mask: 178; X32: # %bb.0: 179; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 180; X32-NEXT: kmovd %eax, %k1 181; X32-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] 182; X32-NEXT: retl 183; 184; X64-LABEL: combine_vpermt2var_8f64_movddup_mask: 185; X64: # %bb.0: 186; X64-NEXT: kmovd %edi, %k1 187; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] 188; X64-NEXT: retq 189 %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 %m) 190 ret <8 x double> %res0 191} 192 193define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) { 194; X32-LABEL: combine_vpermt2var_8i64_identity: 195; X32: # %bb.0: 196; X32-NEXT: retl 197; 198; X64-LABEL: combine_vpermt2var_8i64_identity: 199; X64: # %bb.0: 200; X64-NEXT: retq 201 %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 -1) 202 %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 -1) 203 ret <8 x i64> %res1 204} 205define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) { 206; X32-LABEL: 
combine_vpermt2var_8i64_identity_mask: 207; X32: # %bb.0: 208; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] 209; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 210; X32-NEXT: kmovd %eax, %k1 211; X32-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z} 212; X32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] 213; X32-NEXT: vpermi2q %zmm2, %zmm2, %zmm0 {%k1} {z} 214; X32-NEXT: retl 215; 216; X64-LABEL: combine_vpermt2var_8i64_identity_mask: 217; X64: # %bb.0: 218; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] 219; X64-NEXT: kmovd %edi, %k1 220; X64-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z} 221; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] 222; X64-NEXT: vpermi2q %zmm2, %zmm2, %zmm0 {%k1} {z} 223; X64-NEXT: retq 224 %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 %m) 225 %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 %m) 226 ret <8 x i64> %res1 227} 228 229define <16 x float> @combine_vpermt2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) { 230; X32-LABEL: combine_vpermt2var_16f32_identity: 231; X32: # %bb.0: 232; X32-NEXT: retl 233; 234; X64-LABEL: combine_vpermt2var_16f32_identity: 235; X64: # %bb.0: 236; X64-NEXT: retq 237 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 -1) 238 %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 -1) 239 ret <16 x float> %res1 240} 241define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) { 242; X32-LABEL: combine_vpermt2var_16f32_identity_mask: 243; X32: # %bb.0: 244; X32-NEXT: vmovaps {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] 245; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 246; X32-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 {%k1} {z} 247; X32-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] 248; X32-NEXT: vpermi2ps %zmm2, %zmm2, %zmm0 {%k1} {z} 249; X32-NEXT: retl 250; 251; X64-LABEL: combine_vpermt2var_16f32_identity_mask: 252; X64: # %bb.0: 253; X64-NEXT: vmovaps {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] 254; X64-NEXT: kmovd %edi, %k1 255; X64-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 {%k1} {z} 256; X64-NEXT: vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] 257; X64-NEXT: vpermi2ps %zmm2, %zmm2, %zmm0 {%k1} {z} 258; X64-NEXT: retq 259 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 %m) 260 %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 %m) 261 ret <16 x float> %res1 262} 263 264define <16 x float> @combine_vpermt2var_16f32_vmovddup(<16 x float> %x0, <16 x float> %x1) { 265; X32-LABEL: 
combine_vpermt2var_16f32_vmovddup: 266; X32: # %bb.0: 267; X32-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] 268; X32-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 269; X32-NEXT: retl 270; 271; X64-LABEL: combine_vpermt2var_16f32_vmovddup: 272; X64: # %bb.0: 273; X64-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] 274; X64-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 275; X64-NEXT: retq 276 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1) 277 ret <16 x float> %res0 278} 279define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <16 x float> %x1) { 280; X32-LABEL: combine_vpermt2var_16f32_vmovddup_load: 281; X32: # %bb.0: 282; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 283; X32-NEXT: vmovaps (%eax), %zmm2 284; X32-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] 285; X32-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1 286; X32-NEXT: vmovaps %zmm1, %zmm0 287; X32-NEXT: retl 288; 289; X64-LABEL: combine_vpermt2var_16f32_vmovddup_load: 290; X64: # %bb.0: 291; X64-NEXT: vmovaps (%rdi), %zmm2 292; X64-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] 293; X64-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1 294; X64-NEXT: vmovaps %zmm1, %zmm0 295; X64-NEXT: retq 296 %x0 = load <16 x float>, <16 x float> *%p0 297 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 -1) 298 ret <16 x float> %res0 299} 300define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) { 301; X32-LABEL: combine_vpermt2var_16f32_vmovddup_mask: 302; X32: # %bb.0: 303; X32-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] 304; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 305; X32-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z} 306; X32-NEXT: retl 307; 308; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask: 309; X64: # %bb.0: 310; X64-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] 311; X64-NEXT: kmovd %edi, %k1 312; X64-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z} 313; X64-NEXT: retq 314 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m) 315 ret <16 x float> %res0 316} 317define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) { 318; X32-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load: 319; X32: # %bb.0: 320; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 321; X32-NEXT: vmovaps (%eax), %zmm2 322; X32-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] 323; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 324; X32-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1 {%k1} {z} 325; X32-NEXT: vmovaps %zmm1, %zmm0 326; X32-NEXT: retl 327; 328; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load: 329; X64: # %bb.0: 330; X64-NEXT: vmovaps (%rdi), %zmm2 331; X64-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] 332; X64-NEXT: kmovd %esi, %k1 333; X64-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1 {%k1} {z} 334; X64-NEXT: vmovaps %zmm1, %zmm0 335; X64-NEXT: retq 336 %x0 = load <16 x float>, <16 x float> *%p0 
337 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m) 338 ret <16 x float> %res0 339} 340 341define <16 x float> @combine_vpermt2var_16f32_vmovshdup(<16 x float> %x0, <16 x float> %x1) { 342; X32-LABEL: combine_vpermt2var_16f32_vmovshdup: 343; X32: # %bb.0: 344; X32-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] 345; X32-NEXT: retl 346; 347; X64-LABEL: combine_vpermt2var_16f32_vmovshdup: 348; X64: # %bb.0: 349; X64-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] 350; X64-NEXT: retq 351 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1) 352 ret <16 x float> %res0 353} 354define <16 x float> @combine_vpermt2var_16f32_vmovshdup_load(<16 x float> *%p0, <16 x float> %x1) { 355; X32-LABEL: combine_vpermt2var_16f32_vmovshdup_load: 356; X32: # %bb.0: 357; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 358; X32-NEXT: vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] 359; X32-NEXT: retl 360; 361; X64-LABEL: combine_vpermt2var_16f32_vmovshdup_load: 362; X64: # %bb.0: 363; X64-NEXT: vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] 364; X64-NEXT: retq 365 %x0 = load <16 x float>, <16 x float> *%p0 366 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1) 367 ret <16 x float> %res0 368} 369define <16 x float> @combine_vpermt2var_16f32_vmovshdup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) { 370; X32-LABEL: combine_vpermt2var_16f32_vmovshdup_mask: 371; X32: # %bb.0: 372; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 373; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] 374; X32-NEXT: retl 375; 376; X64-LABEL: combine_vpermt2var_16f32_vmovshdup_mask: 377; X64: # %bb.0: 378; X64-NEXT: kmovd %edi, %k1 379; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] 380; X64-NEXT: retq 381 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 %m) 382 ret <16 x float> %res0 383} 384 385define <16 x float> @combine_vpermt2var_16f32_vmovsldup(<16 x float> %x0, <16 x float> %x1) { 386; X32-LABEL: combine_vpermt2var_16f32_vmovsldup: 387; X32: # %bb.0: 388; X32-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] 389; X32-NEXT: retl 390; 391; X64-LABEL: combine_vpermt2var_16f32_vmovsldup: 392; X64: # %bb.0: 393; X64-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] 394; X64-NEXT: retq 395 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1) 396 ret <16 x float> %res0 397} 398define <16 x float> @combine_vpermt2var_16f32_vmovsldup_load(<16 x float> *%p0, <16 x float> %x1) { 399; X32-LABEL: 
combine_vpermt2var_16f32_vmovsldup_load: 400; X32: # %bb.0: 401; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 402; X32-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] 403; X32-NEXT: retl 404; 405; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_load: 406; X64: # %bb.0: 407; X64-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] 408; X64-NEXT: retq 409 %x0 = load <16 x float>, <16 x float> *%p0 410 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1) 411 ret <16 x float> %res0 412} 413define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) { 414; X32-LABEL: combine_vpermt2var_16f32_vmovsldup_mask: 415; X32: # %bb.0: 416; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 417; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] 418; X32-NEXT: retl 419; 420; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_mask: 421; X64: # %bb.0: 422; X64-NEXT: kmovd %edi, %k1 423; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] 424; X64-NEXT: retq 425 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m) 426 ret <16 x float> %res0 427} 428define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) { 429; X32-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load: 430; X32: # %bb.0: 431; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 432; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 433; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] 434; X32-NEXT: retl 435; 436; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load: 437; X64: # %bb.0: 438; X64-NEXT: kmovd %esi, %k1 439; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] 440; X64-NEXT: retq 441 %x0 = load <16 x float>, <16 x float> *%p0 442 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m) 443 ret <16 x float> %res0 444} 445 446define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x float> %x1) { 447; X32-LABEL: combine_vpermt2var_16f32_vpermilps: 448; X32: # %bb.0: 449; X32-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 450; X32-NEXT: retl 451; 452; X64-LABEL: combine_vpermt2var_16f32_vpermilps: 453; X64: # %bb.0: 454; X64-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 455; X64-NEXT: retq 456 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1) 457 ret <16 x float> %res0 458} 459define <16 x float> @combine_vpermt2var_16f32_vpermilps_load(<16 x float> *%p0, <16 x float> %x1) { 460; X32-LABEL: combine_vpermt2var_16f32_vpermilps_load: 461; X32: # %bb.0: 462; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 463; X32-NEXT: vpermilps {{.*#+}} zmm0 = 
mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 464; X32-NEXT: retl 465; 466; X64-LABEL: combine_vpermt2var_16f32_vpermilps_load: 467; X64: # %bb.0: 468; X64-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 469; X64-NEXT: retq 470 %x0 = load <16 x float>, <16 x float> *%p0 471 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1) 472 ret <16 x float> %res0 473} 474define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) { 475; X32-LABEL: combine_vpermt2var_16f32_vpermilps_mask: 476; X32: # %bb.0: 477; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 478; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 479; X32-NEXT: retl 480; 481; X64-LABEL: combine_vpermt2var_16f32_vpermilps_mask: 482; X64: # %bb.0: 483; X64-NEXT: kmovd %edi, %k1 484; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 485; X64-NEXT: retq 486 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m) 487 ret <16 x float> %res0 488} 489define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) { 490; X32-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load: 491; X32: # %bb.0: 492; X32-NEXT: movl {{[0-9]+}}(%esp), %eax 493; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 494; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 495; X32-NEXT: retl 496; 497; X64-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load: 498; X64: # %bb.0: 499; X64-NEXT: kmovd %esi, %k1 500; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] 501; X64-NEXT: retq 502 %x0 = load <16 x float>, <16 x float> *%p0 503 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m) 504 ret <16 x float> %res0 505} 506 507define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) { 508; X32-LABEL: combine_vpermt2var_16i32_identity: 509; X32: # %bb.0: 510; X32-NEXT: retl 511; 512; X64-LABEL: combine_vpermt2var_16i32_identity: 513; X64: # %bb.0: 514; X64-NEXT: retq 515 %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x0, <16 x i32> %x1, i16 -1) 516 %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 -1) 517 ret <16 x i32> %res1 518} 519define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) { 520; X32-LABEL: combine_vpermt2var_16i32_identity_mask: 521; X32: # %bb.0: 522; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] 523; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 524; X32-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 {%k1} {z} 525; X32-NEXT: 
vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] 526; X32-NEXT: vpermi2d %zmm2, %zmm2, %zmm0 {%k1} {z} 527; X32-NEXT: retl 528; 529; X64-LABEL: combine_vpermt2var_16i32_identity_mask: 530; X64: # %bb.0: 531; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] 532; X64-NEXT: kmovd %edi, %k1 533; X64-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 {%k1} {z} 534; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16] 535; X64-NEXT: vpermi2d %zmm2, %zmm2, %zmm0 {%k1} {z} 536; X64-NEXT: retq 537 %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %x0, <16 x i32> %x1, i16 %m) 538 %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 %m) 539 ret <16 x i32> %res1 540} 541 542define <32 x i16> @combine_vpermt2var_32i16_identity(<32 x i16> %x0, <32 x i16> %x1) { 543; X32-LABEL: combine_vpermt2var_32i16_identity: 544; X32: # %bb.0: 545; X32-NEXT: retl 546; 547; X64-LABEL: combine_vpermt2var_32i16_identity: 548; X64: # %bb.0: 549; X64-NEXT: retq 550 %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 -1) 551 %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 -1) 552 ret <32 x i16> %res1 553} 554define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x i16> %x1, i32 %m) { 555; X32-LABEL: combine_vpermt2var_32i16_identity_mask: 556; X32: # %bb.0: 557; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] 558; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 559; X32-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z} 560; X32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32] 561; X32-NEXT: vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z} 562; X32-NEXT: retl 563; 564; X64-LABEL: combine_vpermt2var_32i16_identity_mask: 565; X64: # %bb.0: 566; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] 567; X64-NEXT: kmovd %edi, %k1 568; X64-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z} 569; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32] 570; X64-NEXT: vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z} 571; X64-NEXT: retq 572 %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 
1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 %m) 573 %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 %m) 574 ret <32 x i16> %res1 575} 576 577define <64 x i8> @combine_pshufb_identity(<64 x i8> %x0) { 578; X32-LABEL: combine_pshufb_identity: 579; X32: # %bb.0: 580; X32-NEXT: retl 581; 582; X64-LABEL: combine_pshufb_identity: 583; X64: # %bb.0: 584; X64-NEXT: retq 585 %select = bitcast <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <64 x i8> 586 %mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 undef, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8> 587 %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %mask, <64 x i8> %select, i64 -1) 588 %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res0, <64 x i8> %mask, <64 x i8> %select, i64 -1) 589 ret <64 x i8> %res1 590} 591define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) { 592; X32-LABEL: combine_pshufb_identity_mask: 593; X32: # %bb.0: 594; X32-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 595; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] 596; X32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 597; X32-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 598; X32-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1} 599; X32-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1} 600; X32-NEXT: vmovdqa64 %zmm1, %zmm0 601; X32-NEXT: retl 602; 603; X64-LABEL: combine_pshufb_identity_mask: 604; X64: # %bb.0: 605; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 606; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] 607; X64-NEXT: kmovq %rdi, %k1 608; X64-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 609; X64-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1} 610; X64-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1} 611; X64-NEXT: vmovdqa64 %zmm1, %zmm0 612; X64-NEXT: retq 613 %select = bitcast <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <64 x i8> 614 %mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8> 615 %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %mask, <64 x i8> %select, i64 %m) 616 %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res0, <64 x i8> %mask, <64 x i8> %select, i64 %m) 617 ret <64 x i8> %res1 618} 619 620define <32 x i16> @combine_permvar_as_vpbroadcastw512(<32 x i16> %x0) { 621; X32-LABEL: combine_permvar_as_vpbroadcastw512: 622; X32: # %bb.0: 623; X32-NEXT: vpbroadcastw %xmm0, %zmm0 624; X32-NEXT: retl 625; 626; X64-LABEL: combine_permvar_as_vpbroadcastw512: 627; X64: # %bb.0: 628; X64-NEXT: vpbroadcastw %xmm0, %zmm0 629; X64-NEXT: retq 630 %1 = call <32 x i16> 
@llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> zeroinitializer) 631 ret <32 x i16> %1 632} 633 634define <16 x i32> @combine_permvar_as_vpbroadcastd512(<16 x i32> %x0) { 635; X32-LABEL: combine_permvar_as_vpbroadcastd512: 636; X32: # %bb.0: 637; X32-NEXT: vbroadcastss %xmm0, %zmm0 638; X32-NEXT: retl 639; 640; X64-LABEL: combine_permvar_as_vpbroadcastd512: 641; X64: # %bb.0: 642; X64-NEXT: vbroadcastss %xmm0, %zmm0 643; X64-NEXT: retq 644 %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> zeroinitializer) 645 ret <16 x i32> %1 646} 647 648define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) { 649; X32-LABEL: combine_permvar_as_vpbroadcastq512: 650; X32: # %bb.0: 651; X32-NEXT: vbroadcastsd %xmm0, %zmm0 652; X32-NEXT: retl 653; 654; X64-LABEL: combine_permvar_as_vpbroadcastq512: 655; X64: # %bb.0: 656; X64-NEXT: vbroadcastsd %xmm0, %zmm0 657; X64-NEXT: retq 658 %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> zeroinitializer) 659 ret <8 x i64> %1 660} 661 662define <8 x i64> @combine_permvar_8i64_as_permq(<8 x i64> %x0, <8 x i64> %x1) { 663; X32-LABEL: combine_permvar_8i64_as_permq: 664; X32: # %bb.0: 665; X32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] 666; X32-NEXT: retl 667; 668; X64-LABEL: combine_permvar_8i64_as_permq: 669; X64: # %bb.0: 670; X64-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] 671; X64-NEXT: retq 672 %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>) 673 ret <8 x i64> %1 674} 675define <8 x i64> @combine_permvar_8i64_as_permq_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) { 676; X32-LABEL: combine_permvar_8i64_as_permq_mask: 677; X32: # %bb.0: 678; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 679; X32-NEXT: kmovd %eax, %k1 680; X32-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4] 681; X32-NEXT: vmovdqa64 %zmm1, %zmm0 682; X32-NEXT: retl 683; 684; X64-LABEL: combine_permvar_8i64_as_permq_mask: 685; X64: # %bb.0: 686; X64-NEXT: kmovd %edi, %k1 687; X64-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4] 688; X64-NEXT: vmovdqa64 %zmm1, %zmm0 689; X64-NEXT: retq 690 %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>) 691 %2 = bitcast i8 %m to <8 x i1> 692 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1 693 ret <8 x i64> %3 694} 695 696define <8 x double> @combine_permvar_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1) { 697; X32-LABEL: combine_permvar_8f64_as_permpd: 698; X32: # %bb.0: 699; X32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] 700; X32-NEXT: retl 701; 702; X64-LABEL: combine_permvar_8f64_as_permpd: 703; X64: # %bb.0: 704; X64-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4] 705; X64-NEXT: retq 706 %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>) 707 ret <8 x double> %1 708} 709define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) { 710; X32-LABEL: combine_permvar_8f64_as_permpd_mask: 711; X32: # %bb.0: 712; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 713; X32-NEXT: kmovd %eax, %k1 714; X32-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4] 715; X32-NEXT: vmovapd %zmm1, %zmm0 716; X32-NEXT: retl 717; 718; X64-LABEL: combine_permvar_8f64_as_permpd_mask: 719; X64: # %bb.0: 720; X64-NEXT: kmovd %edi, %k1 721; X64-NEXT: 
vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4] 722; X64-NEXT: vmovapd %zmm1, %zmm0 723; X64-NEXT: retq 724 %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>) 725 %2 = bitcast i8 %m to <8 x i1> 726 %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x1 727 ret <8 x double> %3 728} 729 730define <16 x float> @combine_vpermilvar_16f32_230146759A8BCFDE(<16 x float> %x0) { 731; X32-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE: 732; X32: # %bb.0: 733; X32-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,4,6,7,5,9,10,8,11,12,15,13,14] 734; X32-NEXT: retl 735; 736; X64-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE: 737; X64: # %bb.0: 738; X64-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,4,6,7,5,9,10,8,11,12,15,13,14] 739; X64-NEXT: retq 740 %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 0, i32 3, i32 2>, <16 x float> undef, i16 -1) 741 %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %res0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 1, i32 0, i32 2, i32 3, i32 0, i32 2, i32 1, i32 1, i32 2, i32 0, i32 3>, <16 x float> undef, i16 -1) 742 ret <16 x float> %res1 743} 744 745define <64 x i8> @combine_pshufb_as_pslldq(<64 x i8> %a0) { 746; X32-LABEL: combine_pshufb_as_pslldq: 747; X32: # %bb.0: 748; X32-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53] 749; X32-NEXT: retl 750; 751; X64-LABEL: combine_pshufb_as_pslldq: 752; X64: # %bb.0: 753; X64-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53] 754; X64-NEXT: retq 755 %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>, <64 x i8> undef, i64 -1) 756 ret <64 x i8> %res0 757} 758define <64 x i8> @combine_pshufb_as_pslldq_mask(<64 x i8> %a0, i64 %m) { 759; X32-LABEL: combine_pshufb_as_pslldq_mask: 760; X32: # %bb.0: 761; X32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 762; X32-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53] 763; X32-NEXT: retl 764; 765; X64-LABEL: combine_pshufb_as_pslldq_mask: 766; X64: # %bb.0: 767; X64-NEXT: kmovq %rdi, %k1 768; X64-NEXT: vpshufb {{.*#+}} zmm0 {%k1} 
{z} = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53] 769; X64-NEXT: retq 770 %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>, <64 x i8> zeroinitializer, i64 %m) 771 ret <64 x i8> %res0 772} 773 774define <64 x i8> @combine_pshufb_as_psrldq(<64 x i8> %a0) { 775; X32-LABEL: combine_pshufb_as_psrldq: 776; X32: # %bb.0: 777; X32-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 778; X32-NEXT: retl 779; 780; X64-LABEL: combine_pshufb_as_psrldq: 781; X64: # %bb.0: 782; X64-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 783; X64-NEXT: retq 784 %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>, <64 x i8> undef, i64 -1) 785 ret <64 x i8> %res0 786} 787define <64 x i8> @combine_pshufb_as_psrldq_mask(<64 x i8> %a0, i64 %m) { 788; X32-LABEL: combine_pshufb_as_psrldq_mask: 789; X32: # %bb.0: 790; X32-NEXT: kmovq {{[0-9]+}}(%esp), %k1 791; X32-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 792; X32-NEXT: retl 793; 794; X64-LABEL: combine_pshufb_as_psrldq_mask: 795; X64: # %bb.0: 796; X64-NEXT: kmovq %rdi, %k1 797; X64-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = 
zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 798; X64-NEXT: retq 799 %res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>, <64 x i8> zeroinitializer, i64 %m) 800 ret <64 x i8> %res0 801} 802 803define <32 x i16> @combine_permvar_as_pshuflw(<32 x i16> %a0) { 804; X32-LABEL: combine_permvar_as_pshuflw: 805; X32: # %bb.0: 806; X32-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31] 807; X32-NEXT: retl 808; 809; X64-LABEL: combine_permvar_as_pshuflw: 810; X64: # %bb.0: 811; X64-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31] 812; X64-NEXT: retq 813 %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>) 814 ret <32 x i16> %1 815} 816 817define <32 x i16> @combine_pshufb_as_pshufhw(<32 x i16> %a0) { 818; X32-LABEL: combine_pshufb_as_pshufhw: 819; X32: # %bb.0: 820; X32-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30] 821; X32-NEXT: retl 822; 823; X64-LABEL: combine_pshufb_as_pshufhw: 824; X64: # %bb.0: 825; X64-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30] 826; X64-NEXT: retq 827 %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>) 828 ret <32 x i16> %1 829} 830 831define <32 x i16> @combine_vpermi2var_32i16_as_pshufb(<32 x i16> %a0) { 832; X32-LABEL: combine_vpermi2var_32i16_as_pshufb: 833; X32: # %bb.0: 834; X32-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29,34,35,32,33,38,39,36,37,42,43,40,41,46,47,44,45,50,51,48,49,54,55,52,53,58,59,56,57,62,63,60,61] 835; X32-NEXT: retl 836; 837; X64-LABEL: combine_vpermi2var_32i16_as_pshufb: 838; X64: # %bb.0: 839; X64-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29,34,35,32,33,38,39,36,37,42,43,40,41,46,47,44,45,50,51,48,49,54,55,52,53,58,59,56,57,62,63,60,61] 840; X64-NEXT: retq 841 %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, 
i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>) 842 %2 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %1, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>) 843 ret <32 x i16> %2 844} 845 846define <8 x double> @combine_vpermi2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) { 847; X32-LABEL: combine_vpermi2var_8f64_identity: 848; X32: # %bb.0: 849; X32-NEXT: retl 850; 851; X64-LABEL: combine_vpermi2var_8f64_identity: 852; X64: # %bb.0: 853; X64-NEXT: retq 854 %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x1, i8 -1) 855 %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, i8 -1) 856 ret <8 x double> %res1 857} 858 859define <8 x double> @combine_vpermi2var_8f64_as_shufpd(<8 x double> %x0, <8 x double> %x1) { 860; X32-LABEL: combine_vpermi2var_8f64_as_shufpd: 861; X32: # %bb.0: 862; X32-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[2],zmm1[2],zmm0[5],zmm1[5],zmm0[6],zmm1[7] 863; X32-NEXT: retl 864; 865; X64-LABEL: combine_vpermi2var_8f64_as_shufpd: 866; X64: # %bb.0: 867; X64-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[2],zmm1[2],zmm0[5],zmm1[5],zmm0[6],zmm1[7] 868; X64-NEXT: retq 869 %1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 1, i64 8, i64 2, i64 10, i64 5, i64 13, i64 6, i64 15>, <8 x double> %x1, i8 -1) 870 ret <8 x double> %1 871} 872 873define <8 x i64> @combine_vpermi2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) { 874; X32-LABEL: combine_vpermi2var_8i64_identity: 875; X32: # %bb.0: 876; X32-NEXT: retl 877; 878; X64-LABEL: combine_vpermi2var_8i64_identity: 879; X64: # %bb.0: 880; X64-NEXT: retq 881 %res0 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x1, i8 -1) 882 %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %res0, <8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, i8 -1) 883 ret <8 x i64> %res1 884} 885 886define <16 x float> @combine_vpermi2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) { 887; X32-LABEL: combine_vpermi2var_16f32_identity: 888; X32: # %bb.0: 889; X32-NEXT: retl 890; 891; X64-LABEL: combine_vpermi2var_16f32_identity: 892; X64: # %bb.0: 893; X64-NEXT: retq 894 %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x1, i16 -1) 895 %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, i16 -1) 896 ret <16 x float> %res1 897} 898 899define <16 x i32> @combine_vpermi2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) { 
900; X32-LABEL: combine_vpermi2var_16i32_identity: 901; X32: # %bb.0: 902; X32-NEXT: retl 903; 904; X64-LABEL: combine_vpermi2var_16i32_identity: 905; X64: # %bb.0: 906; X64-NEXT: retq 907 %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x1, i16 -1) 908 %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %res0, <16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, i16 -1) 909 ret <16 x i32> %res1 910} 911 912define <16 x float> @combine_vpermt2var_vpermi2var_16f32_as_unpckhps(<16 x float> %a0, <16 x float> %a1) { 913; X32-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps: 914; X32: # %bb.0: 915; X32-NEXT: vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] 916; X32-NEXT: retl 917; 918; X64-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps: 919; X64: # %bb.0: 920; X64-NEXT: vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15] 921; X64-NEXT: retq 922 %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %a0, <16 x i32> <i32 18, i32 2, i32 19, i32 3, i32 22, i32 6, i32 23, i32 7, i32 26, i32 10, i32 27, i32 11, i32 30, i32 14, i32 31, i32 15>, <16 x float> %a1, i16 -1) 923 ret <16 x float> %res0 924} 925 926define <16 x i32> @vpermt2var_vpermi2var_16i32_as_unpckldq(<16 x i32> %a0, <16 x i32> %a1) { 927; X32-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq: 928; X32: # %bb.0: 929; X32-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] 930; X32-NEXT: retl 931; 932; X64-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq: 933; X64: # %bb.0: 934; X64-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] 935; X64-NEXT: retq 936 %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %a0, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>, <16 x i32> %a1, i16 -1) 937 ret <16 x i32> %res0 938} 939 940define <32 x i16> @combine_vpermi2var_32i16_identity(<32 x i16> %x0, <32 x i16> %x1) { 941; X32-LABEL: combine_vpermi2var_32i16_identity: 942; X32: # %bb.0: 943; X32-NEXT: retl 944; 945; X64-LABEL: combine_vpermi2var_32i16_identity: 946; X64: # %bb.0: 947; X64-NEXT: retq 948 %res0 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x1, i32 -1) 949 %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 
3, i16 34, i16 1, i16 32>, <32 x i16> %res0, i32 -1) 950 ret <32 x i16> %res1 951} 952 953define <8 x double> @combine_vpermi2var_8f64_as_vpermpd(<8 x double> %x0, <8 x double> %x1) { 954; X32-LABEL: combine_vpermi2var_8f64_as_vpermpd: 955; X32: # %bb.0: 956; X32-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] 957; X32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 958; X32-NEXT: retl 959; 960; X64-LABEL: combine_vpermi2var_8f64_as_vpermpd: 961; X64: # %bb.0: 962; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] 963; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 964; X64-NEXT: retq 965 %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 0, i64 7, i64 6, i64 5, i64 4>, <8 x double> %x1, i8 -1) 966 %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x double> %res0, i8 -1) 967 ret <8 x double> %res1 968} 969 970define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1) { 971; X32-LABEL: combine_vpermt2var_8i64_as_vpermq: 972; X32: # %bb.0: 973; X32-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] 974; X32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 975; X32-NEXT: retl 976; 977; X64-LABEL: combine_vpermt2var_8i64_as_vpermq: 978; X64: # %bb.0: 979; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] 980; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 981; X64-NEXT: retq 982 %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 3, i64 2, i64 1, i64 0, i64 7, i64 6, i64 5, i64 4>, <8 x i64> %x0, <8 x i64> %x1, i8 -1) 983 %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x i64> %res0, <8 x i64> %res0, i8 -1) 984 ret <8 x i64> %res1 985} 986 987define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x float> %x1) { 988; X32-LABEL: combine_vpermi2var_16f32_as_vpermps: 989; X32: # %bb.0: 990; X32-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] 991; X32-NEXT: vpermps %zmm0, %zmm1, %zmm0 992; X32-NEXT: retl 993; 994; X64-LABEL: combine_vpermi2var_16f32_as_vpermps: 995; X64: # %bb.0: 996; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] 997; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0 998; X64-NEXT: retq 999 %res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x float> %x1, i16 -1) 1000 %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>, <16 x float> %res0, i16 -1) 1001 ret <16 x float> %res1 1002} 1003 1004define <16 x i32> @combine_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) { 1005; X32-LABEL: combine_vpermt2var_16i32_as_vpermd: 1006; X32: # %bb.0: 1007; X32-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] 1008; X32-NEXT: vpermps %zmm0, %zmm1, %zmm0 1009; X32-NEXT: retl 1010; 1011; X64-LABEL: combine_vpermt2var_16i32_as_vpermd: 1012; X64: # %bb.0: 1013; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9] 1014; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0 1015; X64-NEXT: retq 1016 %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 

define <16 x i32> @combine_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) {
; X32-LABEL: combine_vpermt2var_16i32_as_vpermd:
; X32:       # %bb.0:
; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
; X32-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_16i32_as_vpermd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
; X64-NEXT:    vpermps %zmm0, %zmm1, %zmm0
; X64-NEXT:    retq
  %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
  ret <16 x i32> %res1
}

define <32 x i16> @combine_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) {
; X32-LABEL: combine_vpermi2var_32i16_as_permw:
; X32:       # %bb.0:
; X32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
; X32-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_32i16_as_permw:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
; X64-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; X64-NEXT:    retq
  %res0 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0, i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16>, <32 x i16> %x1, i32 -1)
  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> <i16 0, i16 31, i16 1, i16 30, i16 2, i16 29, i16 3, i16 28, i16 4, i16 27, i16 5, i16 26, i16 6, i16 25, i16 7, i16 24, i16 8, i16 23, i16 9, i16 22, i16 10, i16 21, i16 11, i16 20, i16 12, i16 19, i16 13, i16 18, i16 14, i16 17, i16 15, i16 16>, <32 x i16> %res0, i32 -1)
  ret <32 x i16> %res1
}

define <8 x double> @combine_vpermi2var_vpermt2var_8f64_as_vperm2(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
; X32:       # %bb.0:
; X32-NEXT:    vmovapd {{.*#+}} zmm2 = [4,0,14,0,3,0,12,0,7,0,8,0,0,0,15,0]
; X32-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
; X32-NEXT:    vmovapd %zmm2, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
; X64:       # %bb.0:
; X64-NEXT:    vmovapd {{.*#+}} zmm2 = [4,14,3,12,7,8,0,15]
; X64-NEXT:    vpermi2pd %zmm0, %zmm1, %zmm2
; X64-NEXT:    vmovapd %zmm2, %zmm0
; X64-NEXT:    retq
  %res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 15, i64 0, i64 8, i64 7, i64 12, i64 6, i64 11, i64 4>, <8 x double> %x1, i8 -1)
  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x double> %res0, <8 x double> %res0, i8 -1)
  ret <8 x double> %res1
}
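
; When the composed shuffle still reads both inputs it cannot become a plain
; vpermd, but the two variable permutes below should still merge into a single
; two-source vpermt2d: folding <0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29>
; into <0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17> gives the CHECKed mask
; [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19].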

define <16 x i32> @combine_vpermi2var_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) {
; X32-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd:
; X32:       # %bb.0:
; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
; X32-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
; X64-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
; X64-NEXT:    retq
  %res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 0, i32 31, i32 2, i32 29, i32 4, i32 27, i32 6, i32 25, i32 8, i32 23, i32 10, i32 21, i32 12, i32 19, i32 14, i32 17>, <16 x i32> %x1, i16 -1)
  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 0, i32 17, i32 2, i32 18, i32 4, i32 19, i32 6, i32 21, i32 8, i32 23, i32 10, i32 25, i32 12, i32 27, i32 14, i32 29>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
  ret <16 x i32> %res1
}

define <32 x i16> @combine_vpermt2var_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) {
; X32-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
; X32:       # %bb.0:
; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
; X32-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
; X32-NEXT:    vmovdqa64 %zmm2, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
; X64:       # %bb.0:
; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
; X64-NEXT:    vpermi2w %zmm0, %zmm1, %zmm2
; X64-NEXT:    vmovdqa64 %zmm2, %zmm0
; X64-NEXT:    retq
  %res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 0, i16 63, i16 1, i16 61, i16 2, i16 59, i16 3, i16 57, i16 4, i16 55, i16 5, i16 53, i16 6, i16 51, i16 7, i16 49, i16 8, i16 47, i16 9, i16 45, i16 10, i16 43, i16 11, i16 41, i16 12, i16 39, i16 13, i16 37, i16 14, i16 35, i16 15, i16 33>, <32 x i16> %x0, <32 x i16> %x1, i32 -1)
  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0, i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16>, <32 x i16> %res0, i32 -1)
  ret <32 x i16> %res1
}

define <8 x double> @combine_vpermi2var_vpermvar_8f64_as_vperm2_zero(<8 x double> %x0) {
; X32-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero:
; X32:       # %bb.0:
; X32-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X32-NEXT:    vmovapd {{.*#+}} zmm2 = [8,0,3,0,10,0,11,0,1,0,7,0,14,0,5,0]
; X32-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vmovapd {{.*#+}} zmm2 = [8,3,10,11,1,7,14,5]
; X64-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
; X64-NEXT:    retq
  %res0 = shufflevector <8 x double> %x0, <8 x double> zeroinitializer, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %res0, <8 x i64> <i64 3, i64 2, i64 1, i64 7, i64 0, i64 6, i64 5, i64 4>)
  ret <8 x double> %1
}
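
; As in the test above, a blend with zero feeding a self-permute should fold
; to a single vpermt2ps whose second table operand is a zeroed register:
; composed indices below 16 read %x0, indices 16 and up read the vxorps
; result.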

define <16 x float> @combine_vpermi2var_vpermvar_16f32_as_vperm2_zero(<16 x float> %x0) {
; X32-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero:
; X32:       # %bb.0:
; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT:    vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8]
; X32-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero:
; X64:       # %bb.0:
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8]
; X64-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0
; X64-NEXT:    retq
  %res0 = shufflevector <16 x float> %x0, <16 x float> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 0, i32 14, i32 2, i32 12, i32 4, i32 10, i32 3, i32 12, i32 4, i32 11, i32 5, i32 10, i32 6, i32 9, i32 7, i32 8>, <16 x float> %res0, i16 -1)
  ret <16 x float> %res1
}

define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) {
; X32-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
; X32:       # %bb.0:
; X32-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %zmm0
; X32-NEXT:    retl
;
; X64-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpbroadcastq %xmm0, %zmm0
; X64-NEXT:    retq
  %1 = insertelement <8 x i64> undef, i64 %a0, i32 0
  %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> zeroinitializer)
  ret <8 x i64> %2
}
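
; Note: in the final test above the permute index is all zeroes, so the
; insertelement + vpermq sequence should combine to a splat of the scalar
; argument: X64 broadcasts it via vmovq + vpbroadcastq, while X32 can
; vbroadcastsd the 64-bit value straight from its stack slot.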