; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl < %s | FileCheck %s --check-prefix=AVX512 --check-prefix=SKX

define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
; AVX512-LABEL: test1:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %addr, i32 4, <16 x i1> %mask, <16 x i32> undef)
  ret <16 x i32> %res
}

define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
; AVX512-LABEL: test2:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %addr, i32 4, <16 x i1> %mask, <16 x i32> zeroinitializer)
  ret <16 x i32> %res
}

define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
; AVX512-LABEL: test3:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %val, <16 x i32>* %addr, i32 4, <16 x i1> %mask)
  ret void
}

define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) {
; AVX512-LABEL: test4:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512-NEXT:    vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512-NEXT:    retq
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %addr, i32 4, <16 x i1> %mask, <16 x float> %dst)
  ret <16 x float> %res
}

define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val) {
; AVX512-LABEL: test13:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512-NEXT:    vmovups %zmm1, (%rdi) {%k1}
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %val, <16 x float>* %addr, i32 4, <16 x i1> %mask)
  ret void
}

; When only one element of the mask is set, the masked store is scalarized to
; a single 64-bit store (element 6 lives at byte offset 48).
define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
; AVX512-LABEL: one_mask_bit_set5:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT:    vmovlps %xmm0, 48(%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
  ret void
}

; Likewise, a single set mask bit reduces the masked load to a scalar load
; blended into the passthru vector.
define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
;
; AVX512-LABEL: load_one_mask_bit_set5:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm1
; AVX512-NEXT:    vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; AVX512-NEXT:    vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
  ret <8 x double> %res
}

declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
declare void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)

declare <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>*, i32, <16 x i1>, <16 x i32*>)

define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) {
; AVX512-LABEL: test23:
; AVX512:       ## %bb.0:
; AVX512-NEXT:    vptestnmq %zmm1, %zmm1, %k1
; AVX512-NEXT:    vptestnmq %zmm0, %zmm0, %k2
; AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k2} {z}
; AVX512-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
; AVX512-NEXT:    retq
  %mask = icmp eq <16 x i32*> %trigger, zeroinitializer
  %res = call <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>* %addr, i32 4, <16 x i1> %mask, <16 x i32*> zeroinitializer)
  ret <16 x i32*> %res
}

%mystruct = type { i16, i16, [1 x i8*] }

declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct.p0v16p0mystruct(<16 x %mystruct*>*, i32, <16 x i1>, <16 x %mystruct*>)

; A 16 x 64-bit operation is split into two 512-bit halves; kshiftrw $8
; extracts the upper half of the mask register.
define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
; AVX512F-LABEL: test24:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
; AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test24:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vmovdqu64 64(%rdi), %zmm1 {%k1} {z}
; SKX-NEXT:    retq
  %res = call <16 x %mystruct*> @llvm.masked.load.v16p0mystruct.p0v16p0mystruct(<16 x %mystruct*>* %addr, i32 4, <16 x i1> %mask, <16 x %mystruct*> zeroinitializer)
  ret <16 x %mystruct*> %res
}

define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; AVX512F-LABEL: test_store_16i64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rdi) {%k1}
; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
; AVX512F-NEXT:    vmovdqu64 %zmm2, 64(%rdi) {%k1}
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test_store_16i64:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vmovdqu64 %zmm1, (%rdi) {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vmovdqu64 %zmm2, 64(%rdi) {%k1}
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  call void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32, <16 x i1> %mask)

define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; AVX512F-LABEL: test_store_16f64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vmovupd %zmm1, (%rdi) {%k1}
; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
; AVX512F-NEXT:    vmovupd %zmm2, 64(%rdi) {%k1}
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test_store_16f64:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vmovupd %zmm1, (%rdi) {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vmovupd %zmm2, 64(%rdi) {%k1}
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
  call void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32, <16 x i1> %mask)

define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; AVX512F-LABEL: test_load_16i64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
; AVX512F-NEXT:    vpblendmq 64(%rdi), %zmm2, %zmm1 {%k1}
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test_load_16i64:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vpblendmq 64(%rdi), %zmm2, %zmm1 {%k1}
; SKX-NEXT:    retq
  %res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
  ret <16 x i64> %res
}
declare <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)

define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; AVX512F-LABEL: test_load_16f64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
; AVX512F-NEXT:    vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test_load_16f64:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
; SKX-NEXT:    vpmovb2m %xmm0, %k1
; SKX-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vblendmpd 64(%rdi), %zmm2, %zmm1 {%k1}
; SKX-NEXT:    retq
  %res = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
  ret <16 x double> %res
}
declare <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)

; A <32 x double> load needs four 512-bit ops. AVX512F re-materializes a
; k-register for each <16 x i1> half of the mask, while SKX forms one 32-bit
; mask with vpmovb2m and shifts it.
define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) {
; AVX512F-LABEL: test_load_32f64:
; AVX512F:       ## %bb.0:
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm5
; AVX512F-NEXT:    vpmovsxbd %xmm5, %zmm5
; AVX512F-NEXT:    vpslld $31, %zmm5, %zmm5
; AVX512F-NEXT:    vptestmd %zmm5, %zmm5, %k1
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k2}
; AVX512F-NEXT:    vblendmpd 128(%rdi), %zmm3, %zmm5 {%k1}
; AVX512F-NEXT:    kshiftrw $8, %k2, %k2
; AVX512F-NEXT:    vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2}
; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
; AVX512F-NEXT:    vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
; AVX512F-NEXT:    vmovapd %zmm5, %zmm2
; AVX512F-NEXT:    retq
;
; SKX-LABEL: test_load_32f64:
; SKX:       ## %bb.0:
; SKX-NEXT:    vpsllw $7, %ymm0, %ymm0
; SKX-NEXT:    vpmovb2m %ymm0, %k1
; SKX-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k2
; SKX-NEXT:    vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2}
; SKX-NEXT:    kshiftrd $16, %k1, %k1
; SKX-NEXT:    vblendmpd 128(%rdi), %zmm3, %zmm2 {%k1}
; SKX-NEXT:    kshiftrw $8, %k1, %k1
; SKX-NEXT:    vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
; SKX-NEXT:    retq
  %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
  ret <32 x double> %res
}

declare <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32, <32 x i1> %mask, <32 x double> %src0)