; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq | FileCheck %s --check-prefix=VL_BW_DQ

; Codegen tests for shufflevector on vectors of i1, from <2 x i1> up to
; <64 x i1>. Every function below is checked once per llc invocation listed
; above. The assertion comments inside each function are generated; regenerate
; them with the script named on the first line instead of editing them by hand.

target triple = "x86_64-unknown-unknown"

; Swaps the two elements of a <2 x i1> argument (mask <1, 0>).
define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_0:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512F-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf2i1_1_0:
; VL_BW_DQ:       # BB#0:
; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i1> %b
}

; Two-operand shuffle with mask <1, 2>: result element 0 is element 1 of %a,
; result element 1 is element 0 of the constant second operand <i1 1, i1 0>.
define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_2:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    movl $1, %eax
; AVX512F-NEXT:    vmovq %rax, %xmm1
; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512F-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf2i1_1_2:
; VL_BW_DQ:       # BB#0:
; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    movb $1, %al
; VL_BW_DQ-NEXT:    kmovb %eax, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm1
; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
  ret <2 x i1> %b
}


; Reverses the four elements of a <4 x i1> argument (mask <3, 2, 1, 0>).
; NOTE(review): the function name ends in "_10" while the mask ends in 1, 0;
; the name is kept because the label assertions match on it.
define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
; AVX512F-LABEL: shuf4i1_3_2_10:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512F-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
; VL_BW_DQ:       # BB#0:
; VL_BW_DQ-NEXT:    vpslld $31, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; VL_BW_DQ-NEXT:    vpslld $31, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vptestmd %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i1> %b
}

; Shuffles the <8 x i1> results of two i64 vector compares. %b2 is passed as
; the second shuffle operand, but every mask index is below 8, so only
; elements of %a2 are selected.
define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0]
; AVX512F-NEXT:    vpermq %zmm1, %zmm2, %zmm1
; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; VL_BW_DQ:       # BB#0:
; VL_BW_DQ-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT:    vpermq %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %a2 = icmp eq <8 x i64> %a, %a1
  %b2 = icmp eq <8 x i64> %b, %b1
  %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  ret <8 x i1> %c
}

; Shuffles the <16 x i1> results of two i32 vector compares. Mask indices 22
; and 21 select from the second operand (%b2); all others select from %a2.
define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
; AVX512F-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k2} {z}
; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1} {z}
; AVX512F-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermt2d %zmm1, %zmm3, %zmm2
; AVX512F-NEXT:    vpslld $31, %zmm2, %zmm1
; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; VL_BW_DQ:       # BB#0:
; VL_BW_DQ-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
; VL_BW_DQ-NEXT:    vpmovm2d %k1, %zmm0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm1
; VL_BW_DQ-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    vpermt2d %zmm0, %zmm2, %zmm1
; VL_BW_DQ-NEXT:    vpslld $31, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2b %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %a2 = icmp eq <16 x i32> %a, %a1
  %b2 = icmp eq <16 x i32> %b, %b1
  %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  ret <16 x i1> %c
}

; Single-source shuffle of a <32 x i1> argument (second operand is undef).
; The 32-entry mask is the same 16-entry pattern written twice.
define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; VL_BW_DQ:       # BB#0:
; VL_BW_DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
; VL_BW_DQ-NEXT:    vpmovb2m %ymm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vmovdqu16 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpsllw $15, %zmm0, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2b %k0, %ymm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  ret <32 x i1> %b
}

; Bitcasts an i8 to <8 x i1> and shuffles it with a partly-undef mask: every
; defined mask entry selects element 2.
define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vextracti32x4 $1, %zmm1, %xmm1
; AVX512F-NEXT:    vpbroadcastq %xmm1, %zmm1
; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; VL_BW_DQ:       # BB#0:
; VL_BW_DQ-NEXT:    kmovb %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT:    vextracti64x2 $1, %zmm0, %xmm0
; VL_BW_DQ-NEXT:    vpbroadcastq %xmm0, %zmm0
; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
  ret <8 x i1> %c
}

; i8 -> <8 x i1> -> shuffle -> i8. The second operand is zeroinitializer, so
; mask indices 10 and 9 select zero elements; undef entries are left free.
define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; VL_BW_DQ:       # BB#0:
; VL_BW_DQ-NEXT:    kmovb %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; VL_BW_DQ-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    kmovb %k0, %eax
; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
  %d = bitcast <8 x i1> %c to i8
  ret i8 %d
}

; i8 -> <8 x i1> -> shuffle -> i8. The low four result elements are elements
; 0, 1, 4, 5 of the input; the high four mask entries are undef.
define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; VL_BW_DQ:       # BB#0:
; VL_BW_DQ-NEXT:    kmovb %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    kmovb %k0, %eax
; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
  %d = bitcast <8 x i1> %c to i8
  ret i8 %d
}

; i8 -> <8 x i1> -> shuffle -> i8. The second operand is zeroinitializer, so
; mask index 9 selects a zero element; all other indices select from %b.
define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; VL_BW_DQ:       # BB#0:
; VL_BW_DQ-NEXT:    kmovb %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    kmovb %k0, %eax
; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  %d = bitcast <8 x i1> %c to i8
  ret i8 %d
}

; Same shape as above with the operands reversed: the first operand is
; zeroinitializer and %b is the second, so only mask indices 9 and 10 select
; from %b (its elements 1 and 2); every other result element is zero.
define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
; AVX512F-NEXT:    vpxord %zmm2, %zmm2, %zmm2
; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; VL_BW_DQ:       # BB#0:
; VL_BW_DQ-NEXT:    kmovb %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
; VL_BW_DQ-NEXT:    vpxord %zmm2, %zmm2, %zmm2
; VL_BW_DQ-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT:    vpsllq $63, %zmm2, %zmm0
; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    kmovb %k0, %eax
; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
  %d = bitcast <8 x i1> %c to i8
  ret i8 %d
}

; Shuffle whose first operand is the constant <1,1,0,0,1,1,0,0> and whose
; second operand is %b; only mask index 9 selects from %b.
; NOTE(review): the name has "10" in the fourth position and the mask has 0
; there (<9, 6, 1, 0, 3, 7, 7, 1>) - confirm which one was intended.
define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT:    movb $51, %al
; AVX512F-NEXT:    kmovw %eax, %k2
; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k2} {z}
; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
; AVX512F-NEXT:    vpermt2q %zmm0, %zmm2, %zmm1
; AVX512F-NEXT:    vpsllq $63, %zmm1, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; VL_BW_DQ:       # BB#0:
; VL_BW_DQ-NEXT:    kmovb %edi, %k0
; VL_BW_DQ-NEXT:    movb $51, %al
; VL_BW_DQ-NEXT:    kmovb %eax, %k1
; VL_BW_DQ-NEXT:    vpmovm2q %k1, %zmm0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm1
; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
; VL_BW_DQ-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
; VL_BW_DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    kmovb %k0, %eax
; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
  %c1 = bitcast <8 x i1> %c to i8
  ret i8 %c1
}

; Shuffle whose first operand is an all-ones <8 x i1> constant and whose
; second operand is the <8 x i1> argument; only mask index 9 selects from %a.
; NOTE(review): the name has "10" in the fourth position and the mask has 0
; there (<9, 6, 1, 0, 3, 7, 7, 0>) - confirm which one was intended.
define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512F-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
; VL_BW_DQ:       # BB#0:
; VL_BW_DQ-NEXT:    vpsllw $15, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpmovw2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
; VL_BW_DQ-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT:    vpsllq $63, %zmm2, %zmm0
; VL_BW_DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    kmovb %k0, %eax
; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
; VL_BW_DQ-NEXT:    retq
  %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  %c1 = bitcast <8 x i1> %c to i8
  ret i8 %c1
}


; i16 -> <16 x i1> -> shuffle -> i16 with an all-zero mask, i.e. element 0 of
; the input is copied into all 16 result elements.
define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpbroadcastd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
; AVX512F-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
; VL_BW_DQ:       # BB#0:
; VL_BW_DQ-NEXT:    kmovw %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm0
; VL_BW_DQ-NEXT:    vpbroadcastd %xmm0, %zmm0
; VL_BW_DQ-NEXT:    vpslld $31, %zmm0, %zmm0
; VL_BW_DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    kmovw %k0, %eax
; VL_BW_DQ-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i16 %a to <16 x i1>
  %c = shufflevector <16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
  %d = bitcast <16 x i1> %c to i16
  ret i16 %d
}

; i64 -> <64 x i1> -> shuffle -> i64 with an all-zero mask, i.e. element 0 of
; the input is copied into all 64 result elements.
define i64 @shuf64i1_zero(i64 %a) {
; AVX512F-LABEL: shuf64i1_zero:
; AVX512F:       # BB#0:
; AVX512F-NEXT:    pushq %rbp
; AVX512F-NEXT:  .Ltmp0:
; AVX512F-NEXT:    .cfi_def_cfa_offset 16
; AVX512F-NEXT:  .Ltmp1:
; AVX512F-NEXT:    .cfi_offset %rbp, -16
; AVX512F-NEXT:    movq %rsp, %rbp
; AVX512F-NEXT:  .Ltmp2:
; AVX512F-NEXT:    .cfi_def_cfa_register %rbp
; AVX512F-NEXT:    andq $-32, %rsp
; AVX512F-NEXT:    subq $96, %rsp
; AVX512F-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, (%rsp)
; AVX512F-NEXT:    movl (%rsp), %ecx
; AVX512F-NEXT:    movq %rcx, %rax
; AVX512F-NEXT:    shlq $32, %rax
; AVX512F-NEXT:    orq %rcx, %rax
; AVX512F-NEXT:    movq %rbp, %rsp
; AVX512F-NEXT:    popq %rbp
; AVX512F-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf64i1_zero:
; VL_BW_DQ:       # BB#0:
; VL_BW_DQ-NEXT:    kmovq %rdi, %k0
; VL_BW_DQ-NEXT:    vpmovm2b %k0, %zmm0
; VL_BW_DQ-NEXT:    vpbroadcastb %xmm0, %zmm0
; VL_BW_DQ-NEXT:    vpsllw $7, %zmm0, %zmm0
; VL_BW_DQ-NEXT:    vpmovb2m %zmm0, %k0
; VL_BW_DQ-NEXT:    kmovq %k0, %rax
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i64 %a to <64 x i1>
  %c = shufflevector <64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
  %d = bitcast <64 x i1> %c to i64
  ret i64 %d
}