; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s

define <64 x i8> @test1(i8* %addr) {
; CHECK-LABEL: test1:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovups (%rdi), %zmm0
; CHECK-NEXT:    retq
  %vaddr = bitcast i8* %addr to <64 x i8>*
  %res = load <64 x i8>, <64 x i8>* %vaddr, align 1
  ret <64 x i8> %res
}

define void @test2(i8* %addr, <64 x i8> %data) {
; CHECK-LABEL: test2:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovups %zmm0, (%rdi)
; CHECK-NEXT:    retq
  %vaddr = bitcast i8* %addr to <64 x i8>*
  store <64 x i8> %data, <64 x i8>* %vaddr, align 1
  ret void
}

define <64 x i8> @test3(i8* %addr, <64 x i8> %old, <64 x i8> %mask1) {
; CHECK-LABEL: test3:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vptestmb %zmm1, %zmm1, %k1
; CHECK-NEXT:    vmovdqu8 (%rdi), %zmm0 {%k1}
; CHECK-NEXT:    retq
  %mask = icmp ne <64 x i8> %mask1, zeroinitializer
  %vaddr = bitcast i8* %addr to <64 x i8>*
  %r = load <64 x i8>, <64 x i8>* %vaddr, align 1
  %res = select <64 x i1> %mask, <64 x i8> %r, <64 x i8> %old
  ret <64 x i8> %res
}

define <64 x i8> @test4(i8* %addr, <64 x i8> %mask1) {
; CHECK-LABEL: test4:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vptestmb %zmm0, %zmm0, %k1
; CHECK-NEXT:    vmovdqu8 (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %mask = icmp ne <64 x i8> %mask1, zeroinitializer
  %vaddr = bitcast i8* %addr to <64 x i8>*
  %r = load <64 x i8>, <64 x i8>* %vaddr, align 1
  %res = select <64 x i1> %mask, <64 x i8> %r, <64 x i8> zeroinitializer
  ret <64 x i8> %res
}

define <32 x i16> @test5(i8* %addr) {
; CHECK-LABEL: test5:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovups (%rdi), %zmm0
; CHECK-NEXT:    retq
  %vaddr = bitcast i8* %addr to <32 x i16>*
  %res = load <32 x i16>, <32 x i16>* %vaddr, align 1
  ret <32 x i16> %res
}

define void @test6(i8* %addr, <32 x i16> %data) {
; CHECK-LABEL: test6:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vmovups %zmm0, (%rdi)
; CHECK-NEXT:    retq
  %vaddr = bitcast i8* %addr to <32 x i16>*
  store <32 x i16> %data, <32 x i16>* %vaddr, align 1
  ret void
}

define <32 x i16> @test7(i8* %addr, <32 x i16> %old, <32 x i16> %mask1) {
; CHECK-LABEL: test7:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vptestmw %zmm1, %zmm1, %k1
; CHECK-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1}
; CHECK-NEXT:    retq
  %mask = icmp ne <32 x i16> %mask1, zeroinitializer
  %vaddr = bitcast i8* %addr to <32 x i16>*
  %r = load <32 x i16>, <32 x i16>* %vaddr, align 1
  %res = select <32 x i1> %mask, <32 x i16> %r, <32 x i16> %old
  ret <32 x i16> %res
}

define <32 x i16> @test8(i8* %addr, <32 x i16> %mask1) {
; CHECK-LABEL: test8:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vptestmw %zmm0, %zmm0, %k1
; CHECK-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT:    retq
  %mask = icmp ne <32 x i16> %mask1, zeroinitializer
  %vaddr = bitcast i8* %addr to <32 x i16>*
  %r = load <32 x i16>, <32 x i16>* %vaddr, align 1
  %res = select <32 x i1> %mask, <32 x i16> %r, <32 x i16> zeroinitializer
  ret <32 x i16> %res
}

define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
; CHECK-LABEL: test_mask_load_16xi8:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT:    vpmovb2m %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %k1
; CHECK-NEXT:    vmovdqu8 (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %addr, i32 4, <16 x i1> %mask, <16 x i8> undef)
  ret <16 x i8> %res
}
declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)

define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
; CHECK-LABEL: test_mask_load_32xi8:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsllw $7, %ymm0, %ymm0
; CHECK-NEXT:    vpmovb2m %zmm0, %k0
; CHECK-NEXT:    kmovd %k0, %k1
; CHECK-NEXT:    vmovdqu8 (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    retq
  %res = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %addr, i32 4, <32 x i1> %mask, <32 x i8> zeroinitializer)
  ret <32 x i8> %res
}
declare <32 x i8> @llvm.masked.load.v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)

define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
; CHECK-LABEL: test_mask_load_8xi16:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0
; CHECK-NEXT:    vpmovw2m %zmm0, %k0
; CHECK-NEXT:    kshiftld $24, %k0, %k0
; CHECK-NEXT:    kshiftrd $24, %k0, %k1
; CHECK-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT:    retq
  %res = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %addr, i32 4, <8 x i1> %mask, <8 x i16> undef)
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.masked.load.v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)

define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
; CHECK-LABEL: test_mask_load_16xi16:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT:    vpmovb2m %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %k1
; CHECK-NEXT:    vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT:    retq
  %res = call <16 x i16> @llvm.masked.load.v16i16(<16 x i16>* %addr, i32 4, <16 x i1> %mask, <16 x i16> zeroinitializer)
  ret <16 x i16> %res
}
declare <16 x i16> @llvm.masked.load.v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i16>)

define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
; CHECK-LABEL: test_mask_store_16xi8:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT:    vpmovb2m %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %k1
; CHECK-NEXT:    vmovdqu8 %zmm1, (%rdi) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.masked.store.v16i8(<16 x i8> %val, <16 x i8>* %addr, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)

define void @test_mask_store_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
; CHECK-LABEL: test_mask_store_32xi8:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; CHECK-NEXT:    vpsllw $7, %ymm0, %ymm0
; CHECK-NEXT:    vpmovb2m %zmm0, %k0
; CHECK-NEXT:    kmovd %k0, %k1
; CHECK-NEXT:    vmovdqu8 %zmm1, (%rdi) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.masked.store.v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>)

define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
; CHECK-LABEL: test_mask_store_8xi16:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0
; CHECK-NEXT:    vpmovw2m %zmm0, %k0
; CHECK-NEXT:    kshiftld $24, %k0, %k0
; CHECK-NEXT:    kshiftrd $24, %k0, %k1
; CHECK-NEXT:    vmovdqu16 %zmm1, (%rdi) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.masked.store.v8i16(<8 x i16> %val, <8 x i16>* %addr, i32 4, <8 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)

define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
; CHECK-LABEL: test_mask_store_16xi16:
; CHECK:       ## %bb.0:
; CHECK-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT:    vpmovb2m %zmm0, %k0
; CHECK-NEXT:    kmovw %k0, %k1
; CHECK-NEXT:    vmovdqu16 %zmm1, (%rdi) {%k1}
; CHECK-NEXT:    retq
  call void @llvm.masked.store.v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v16i16(<16 x i16>, <16 x i16>*, i32, <16 x i1>)
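; A note on the expected codegen above: the sub-512-bit masked tests round-trip
; through %zmm registers (see the "## kill" annotations) because -mcpu=knl does
; not provide AVX512VL, so the 128-/256-bit masked loads and stores are
; legalized by widening to 512-bit vmovdqu8/vmovdqu16 operations.
;
; If a codegen change alters any of these sequences, regenerate the CHECK lines
; with the script named in the header rather than editing them by hand. A
; typical invocation is sketched below; the test filename and build path are
; illustrative, not taken from this file:
;
;   llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
;     llvm/test/CodeGen/X86/avx512bw-mov.ll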