; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512NOTDQ

define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $4, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT: vpmovq2m %xmm2, %k1
; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $6, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm2, %k1
; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $6, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v16i1_broadcast_8_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $8, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT: vpmovq2m %xmm2, %k1
; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v16i1_broadcast_8_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $8, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm2
; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512-NEXT: vpmovd2m %xmm2, %k1
; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v16i1_broadcast_15_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $14, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm2, %k1
; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $14, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v16i1_broadcast_15_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vpmovd2m %xmm2, %k1
; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $12, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_16_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $16, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT: vpmovq2m %xmm2, %k1
; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_16_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $16, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm2
; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512-NEXT: vpmovd2m %xmm2, %k1
; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_16_v8i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb 2(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm2
; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw 2(%rdi), %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, <8 x float>* %a3
    ret void
}
define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_31_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $30, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm2, %k1
; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $30, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_31_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $28, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vpmovd2m %xmm2, %k1
; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $28, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
; AVX512-LABEL: load_v32i1_broadcast_31_v8i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb 3(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm2
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 3(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, <8 x float>* %a3
    ret void
}
define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512-NEXT: vpmovq2m %xmm2, %k1
; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm2
; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512-NEXT: vpmovd2m %xmm2, %k1
; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v8i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb 4(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm2
; AVX512-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, <8 x float>* %a3
    ret void
}
define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_32_v16i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw 4(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %zmm2
; AVX512-NEXT: vpbroadcastd %xmm2, %zmm2
; AVX512-NEXT: vpmovd2m %zmm2, %k1
; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vmovaps %zmm1, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1
; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %zmm2
; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
    %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
    store <16 x float> %d2, <16 x float>* %a3
    ret void
}
define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_63_v2i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $62, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm2, %k1
; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $62, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
    %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
    store <2 x double> %d2, <2 x double>* %a3
    ret void
}
define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_63_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $60, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512-NEXT: vpmovd2m %xmm2, %k1
; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $60, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
    %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
    store <4 x float> %d2, <4 x float>* %a3
    ret void
}
define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_63_v8i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb 7(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm2
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpmovd2m %ymm2, %k1
; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512-NEXT: vmovaps %ymm1, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 7(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
    %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
    store <8 x float> %d2, <8 x float>* %a3
    ret void
}
define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
; AVX512-LABEL: load_v64i1_broadcast_63_v16i1:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw 6(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %zmm2
; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpermd %zmm2, %zmm3, %zmm2
; AVX512-NEXT: vpmovd2m %zmm2, %k1
; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vmovaps %zmm1, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw 6(%rdi), %k1
; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512NOTDQ-NEXT: vpermd %zmm2, %zmm3, %zmm2
; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
    %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
    store <16 x float> %d2, <16 x float>* %a3
    ret void
}
define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $1, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v2i1_broadcast_1_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <2 x i1>, <2 x i1>* %a0
    %d1 = shufflevector <2 x i1> %d0,<2 x i1> undef,<1 x i32><i32 1>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: movb (%rdi), %al
; AVX512-NEXT: shrb %al
; AVX512-NEXT: xorl %ecx, %ecx
; AVX512-NEXT: testb $1, %al
; AVX512-NEXT: movl $255, %eax
; AVX512-NEXT: cmovel %ecx, %eax
; AVX512-NEXT: kmovd %eax, %k0
; AVX512-NEXT: kshiftrb $1, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v3i1_broadcast_1_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movb (%rdi), %al
; AVX512NOTDQ-NEXT: shrb %al
; AVX512NOTDQ-NEXT: xorl %ecx, %ecx
; AVX512NOTDQ-NEXT: testb $1, %al
; AVX512NOTDQ-NEXT: movl $255, %eax
; AVX512NOTDQ-NEXT: cmovel %ecx, %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k0
; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <3 x i1>, <3 x i1>* %a0
    %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 1>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: xorl %eax, %eax
; AVX512-NEXT: testb $4, (%rdi)
; AVX512-NEXT: movl $255, %ecx
; AVX512-NEXT: cmovel %eax, %ecx
; AVX512-NEXT: kmovd %ecx, %k0
; AVX512-NEXT: kshiftrb $2, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: xorl %eax, %eax
; AVX512NOTDQ-NEXT: testb $4, (%rdi)
; AVX512NOTDQ-NEXT: movl $255, %ecx
; AVX512NOTDQ-NEXT: cmovel %eax, %ecx
; AVX512NOTDQ-NEXT: kmovd %ecx, %k0
; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <3 x i1>, <3 x i1>* %a0
    %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 2>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $2, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v4i1_broadcast_2_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <4 x i1>, <4 x i1>* %a0
    %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 2>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $3, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v4i1_broadcast_3_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $3, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <4 x i1>, <4 x i1>* %a0
    %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 3>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $4, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 4>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $4, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $7, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 7>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb (%rdi), %k0
; AVX512-NEXT: kshiftrb $6, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $6, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <8 x i1>, <8 x i1>* %a0
    %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_8_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $8, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 8>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_8_v2i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $8, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_8_v4i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $8, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}
define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_15_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $15, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 15>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_15_v2i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $14, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $14, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v16i1_broadcast_15_v4i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw (%rdi), %k0
; AVX512-NEXT: kshiftrw $12, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrw $12, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <16 x i1>, <16 x i1>* %a0
    %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}
define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $16, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 16>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v2i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $16, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v4i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $16, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}
define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_16_v8i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb 2(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw 2(%rdi), %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
    store <8 x i1> %d1, <8 x i1>* %a1
    ret void
}
define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_31_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $31, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $31, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 31>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_31_v2i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $30, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $30, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_31_v4i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovd (%rdi), %k0
; AVX512-NEXT: kshiftrd $28, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrd $28, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}
define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-LABEL: load_v32i1_broadcast_31_v8i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb 3(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 3(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
    %d0 = load <32 x i1>, <32 x i1>* %a0
    %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
    store <8 x i1> %d1, <8 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 32>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v2i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v4i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $32, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v8i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb 4(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
; AVX512-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
    store <8 x i1> %d1, <8 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_32_v16i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw 4(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %zmm0
; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0
; AVX512-NEXT: vpmovd2m %zmm0, %k0
; AVX512-NEXT: kmovw %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw 4(%rdi), %k1
; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %zmm0
; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
    store <16 x i1> %d1, <16 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v1i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $63, %k0, %k0
; AVX512-NEXT: kshiftlb $7, %k0, %k0
; AVX512-NEXT: kshiftrb $7, %k0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v1i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $63, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftlw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 63>
    store <1 x i1> %d1, <1 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v2i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $62, %k0, %k0
; AVX512-NEXT: vpmovm2q %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512-NEXT: vpmovq2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $62, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
    store <2 x i1> %d1, <2 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v4i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovq (%rdi), %k0
; AVX512-NEXT: kshiftrq $60, %k0, %k0
; AVX512-NEXT: vpmovm2d %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vpmovd2m %xmm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
; AVX512NOTDQ-NEXT: kshiftrq $60, %k0, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
    store <4 x i1> %d1, <4 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovb 7(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %ymm0
; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovd2m %ymm0, %k0
; AVX512-NEXT: kmovb %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: movzbl 7(%rdi), %eax
; AVX512NOTDQ-NEXT: kmovd %eax, %k1
; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7]
; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0
; AVX512NOTDQ-NEXT: kmovd %k0, %eax
; AVX512NOTDQ-NEXT: movb %al, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
    store <8 x i1> %d1, <8 x i1>* %a1
    ret void
}
define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
; AVX512-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512: # %bb.0:
; AVX512-NEXT: kmovw 6(%rdi), %k0
; AVX512-NEXT: vpmovm2d %k0, %zmm0
; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vpmovd2m %zmm0, %k0
; AVX512-NEXT: kmovw %k0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1_store:
; AVX512NOTDQ: # %bb.0:
; AVX512NOTDQ-NEXT: kmovw 6(%rdi), %k1
; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512NOTDQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
; AVX512NOTDQ-NEXT: vzeroupper
; AVX512NOTDQ-NEXT: retq
    %d0 = load <64 x i1>, <64 x i1>* %a0
    %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
    store <16 x i1> %d1, <16 x i1>* %a1
    ret void
}