; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=VL_BW_DQ

; Codegen tests for shufflevector on vectors of i1 (mask vectors) on x86-64.
; Each function is compiled three times (see the llc invocations above): with
; +avx512f only, with +avx512vl, and with +avx512bw/+avx512vl/+avx512dq; the
; latter two also enable +fast-variable-shuffle.
; The expected assembly is generated by the script named in the NOTE line, so
; it should be regenerated with that script rather than edited by hand.

; Swap the two elements of a <2 x i1> (mask <1, 0>, second operand undef).
define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf2i1_1_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf2i1_1_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i1> %b
}

; Two-operand <2 x i1> shuffle: lane 0 is element 1 of %a, lane 1 is element 0
; of the constant second operand <i1 1, i1 0> (mask <1, 2>).
define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf2i1_1_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [18446744073709551615,0]
; AVX512VL-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf2i1_1_2:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0]
; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
  ret <2 x i1> %b
}


; Reverse the four elements of a <4 x i1> (mask <3, 2, 1, 0>).
define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
; AVX512F-LABEL: shuf4i1_3_2_10:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf4i1_3_2_10:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512VL-NEXT:    vptestmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm1 {%k1} {z}
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; AVX512VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpslld $31, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i1> %b
}

; Shuffle of two <8 x i1> compare results. Every mask index is below 8, so
; only %a2 (the compare of %a with %a1) is actually selected from.
define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
; AVX512VL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %a2 = icmp eq <8 x i64> %a, %a1
  %b2 = icmp eq <8 x i64> %b, %b1
  %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  ret <8 x i1> %c
}

; Shuffle of two <16 x i1> compare results that really uses both inputs: mask
; indices 22 and 21 select from %b2, the remaining indices select from %a2.
define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
; AVX512F-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
; VL_BW_DQ-NEXT:    vpmovm2d %k1, %zmm0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm1
; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT:    vpmovd2m %zmm2, %k0
; VL_BW_DQ-NEXT:    vpmovm2b %k0, %xmm0
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %a2 = icmp eq <16 x i32> %a, %a1
  %b2 = icmp eq <16 x i32> %b, %b1
  %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  ret <16 x i1> %c
}

; Single-input <32 x i1> shuffle (second operand undef) taken directly from a
; <32 x i1> argument. The 32-entry mask is the same 16-entry pattern written
; twice.
define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm1
; AVX512VL-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512VL-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
; VL_BW_DQ-NEXT:    vpmovb2m %ymm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2b %k0, %ymm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  ret <32 x i1> %b
}

; Same 32-entry mask as above, but the <32 x i1> input is produced by comparing
; a <32 x i16> vector against zero, and the shuffled mask is consumed as the
; condition of a <32 x i16> select between %c and %d.
define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16(<32 x i16> %a, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm4
; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm4
; AVX512VL-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512VL-NEXT:    vpternlogq $202, %zmm2, %zmm1, %zmm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmw %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp = icmp eq <32 x i16> %a, zeroinitializer
  %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
  ret <32 x i16> %sel
}

; As the v32i16 variant above, but with <32 x i8> data: the mask comes from an
; i8 compare against zero and drives a <32 x i8> select.
define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8(<32 x i8> %a, <32 x i8> %c, <32 x i8> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm3
; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm3
; AVX512VL-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmb %ymm0, %ymm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmb %ymm1, %ymm2, %ymm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp = icmp eq <32 x i8> %a, zeroinitializer
  %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
  ret <32 x i8> %sel
}

; "split" variant of the v32i16 select test: the <32 x i1> mask is built by
; concatenating two <16 x i1> compares (%cmp1 then %cmp2, identity mask 0..31)
; before the same 32-entry shuffle and <32 x i16> select are applied.
define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT:    vpternlogq $202, %zmm3, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512VL-NEXT:    vpternlogq $202, %zmm3, %zmm2, %zmm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmw %zmm2, %zmm3, %zmm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
  %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
  %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
  ret <32 x i16> %sel
}

; "split" variant of the v32i8 select test: same concatenated two-compare mask
; as the v32i16_split test, but the select operates on <32 x i8> data.
define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split(<16 x i32> %a, <16 x i32> %b, <32 x i8> %c, <32 x i8> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmb %ymm2, %ymm3, %ymm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
  %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
  %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
  ret <32 x i8> %sel
}

; The <8 x i1> input is an i8 argument bitcast to a mask. Every defined lane of
; the shuffle mask selects element 2; all other lanes are undef.
define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpbroadcastq %xmm0, %zmm0
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX512VL-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX512VL-NEXT:    vpslld $31, %ymm1, %ymm1
; AVX512VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; VL_BW_DQ-NEXT:    vpbroadcastq %xmm0, %ymm0
; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
  ret <8 x i1> %c
}

; i8 -> <8 x i1> -> shuffle -> i8. The second shuffle operand is
; zeroinitializer, so mask indices 10 and 9 select zero bits; the mask also
; contains undef lanes.
define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vpslld $31, %ymm2, %ymm0
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
  %d = bitcast <8 x i1> %c to i8
  ret i8 %d
}

; i8 -> <8 x i1> -> shuffle -> i8. Lanes 0-3 take elements 0, 1, 4 and 5 of
; the input; lanes 4-7 are undef.
define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
  %d = bitcast <8 x i1> %c to i8
  ret i8 %d
}

; i8 -> <8 x i1> -> shuffle -> i8 with a zeroinitializer second operand: lane 0
; (mask index 9) is a zero bit, the other lanes permute bits of the input.
define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vptestmd %ymm2, %ymm2, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  %d = bitcast <8 x i1>%c to i8
  ret i8 %d
}

; Operand order is swapped relative to the previous test: the FIRST shuffle
; operand is zeroinitializer and the second is the bitcast input, so only the
; lanes with mask indices 9 and 10 (elements 1 and 2 of %b) carry input bits.
define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
  %d = bitcast <8 x i1>%c to i8
  ret i8 %d
}

; First shuffle operand is the constant <1,1,0,0,1,1,0,0>; the second is the
; bitcast input, of which only element 1 is used (mask index 9 in lane 0).
; NOTE(review): the function name has "10" in position 3, but the shuffle mask
; below has i32 0 there - confirm which was intended before renaming anything.
define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0]
; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
  %c1 = bitcast <8 x i1>%c to i8
  ret i8 %c1
}

define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,1,2,3,4,5,6,7]
; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vptestmd %ymm2, %ymm2, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllw $15, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpmovw2m %xmm0, %k0
; VL_BW_DQ-NEXT:
vpmovm2d %k0, %ymm0 785; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7] 786; VL_BW_DQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 787; VL_BW_DQ-NEXT: vpermt2d %ymm0, %ymm1, %ymm2 788; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0 789; VL_BW_DQ-NEXT: kmovd %k0, %eax 790; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax 791; VL_BW_DQ-NEXT: vzeroupper 792; VL_BW_DQ-NEXT: retq 793 %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0> 794 %c1 = bitcast <8 x i1>%c to i8 795 ret i8 %c1 796} 797 798define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { 799; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: 800; AVX512F: # %bb.0: 801; AVX512F-NEXT: kmovw %edi, %k1 802; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 803; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 804; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 805; AVX512F-NEXT: kmovw %k0, %eax 806; AVX512F-NEXT: # kill: def $ax killed $ax killed $eax 807; AVX512F-NEXT: vzeroupper 808; AVX512F-NEXT: retq 809; 810; AVX512VL-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: 811; AVX512VL: # %bb.0: 812; AVX512VL-NEXT: kmovw %edi, %k1 813; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 814; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0 815; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0 816; AVX512VL-NEXT: kmovw %k0, %eax 817; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax 818; AVX512VL-NEXT: vzeroupper 819; AVX512VL-NEXT: retq 820; 821; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: 822; VL_BW_DQ: # %bb.0: 823; VL_BW_DQ-NEXT: kmovd %edi, %k0 824; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0 825; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0 826; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0 827; VL_BW_DQ-NEXT: kmovd %k0, %eax 828; VL_BW_DQ-NEXT: # kill: def $ax killed $ax killed $eax 829; VL_BW_DQ-NEXT: vzeroupper 830; VL_BW_DQ-NEXT: retq 831 %b = bitcast i16 %a to <16 x i1> 832 %c = shufflevector < 16 x i1> 
%b, <16 x i1> undef, <16 x i32> zeroinitializer 833 %d = bitcast <16 x i1> %c to i16 834 ret i16 %d 835} 836 837define i64 @shuf64i1_zero(i64 %a) { 838; AVX512F-LABEL: shuf64i1_zero: 839; AVX512F: # %bb.0: 840; AVX512F-NEXT: kmovw %edi, %k1 841; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 842; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 843; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 844; AVX512F-NEXT: kmovw %k0, %eax 845; AVX512F-NEXT: kmovw %k0, %ecx 846; AVX512F-NEXT: shll $16, %ecx 847; AVX512F-NEXT: orl %eax, %ecx 848; AVX512F-NEXT: movq %rcx, %rax 849; AVX512F-NEXT: shlq $32, %rax 850; AVX512F-NEXT: orq %rcx, %rax 851; AVX512F-NEXT: vzeroupper 852; AVX512F-NEXT: retq 853; 854; AVX512VL-LABEL: shuf64i1_zero: 855; AVX512VL: # %bb.0: 856; AVX512VL-NEXT: kmovw %edi, %k1 857; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 858; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0 859; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0 860; AVX512VL-NEXT: kmovw %k0, %eax 861; AVX512VL-NEXT: kmovw %k0, %ecx 862; AVX512VL-NEXT: shll $16, %ecx 863; AVX512VL-NEXT: orl %eax, %ecx 864; AVX512VL-NEXT: movq %rcx, %rax 865; AVX512VL-NEXT: shlq $32, %rax 866; AVX512VL-NEXT: orq %rcx, %rax 867; AVX512VL-NEXT: vzeroupper 868; AVX512VL-NEXT: retq 869; 870; VL_BW_DQ-LABEL: shuf64i1_zero: 871; VL_BW_DQ: # %bb.0: 872; VL_BW_DQ-NEXT: kmovq %rdi, %k0 873; VL_BW_DQ-NEXT: vpmovm2b %k0, %zmm0 874; VL_BW_DQ-NEXT: vpbroadcastb %xmm0, %zmm0 875; VL_BW_DQ-NEXT: vpmovb2m %zmm0, %k0 876; VL_BW_DQ-NEXT: kmovq %k0, %rax 877; VL_BW_DQ-NEXT: vzeroupper 878; VL_BW_DQ-NEXT: retq 879 %b = bitcast i64 %a to <64 x i1> 880 %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer 881 %d = bitcast <64 x i1> %c to i64 882 ret i64 %d 883} 884