; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t

; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning

; Range checks: for all the instructions tested in this file, the
; immediate must be within the range [-8, 7] (4-bit immediate). Out-of-range
; values are tested only in one case (the following function). Valid values
; are tested throughout the rest of the file.

define void @imm_out_of_range(<vscale x 2 x i64> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: imm_out_of_range:
; CHECK-NEXT: rdvl x8, #8
; CHECK-NEXT: add x8, x0, x8
; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x{{[0-9]+}}]
; CHECK-NEXT: rdvl x8, #-9
; CHECK-NEXT: add x8, x0, x8
; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x{{[0-9]+}}]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 8
  %base_load_bc = bitcast <vscale x 2 x i64>* %base_load to i64*
  %data = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1> %mask,
                                                                  i64* %base_load_bc)
  %base_store = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64> * %base, i64 -9
  %base_store_bc = bitcast <vscale x 2 x i64>* %base_store to i64*
  call void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64> %data,
                                            <vscale x 2 x i1> %mask,
                                            i64* %base_store_bc)
  ret void
}

; 2-lane non-temporal load/stores


define void @test_masked_ldst_sv2i64(<vscale x 2 x i64> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i64:
; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base, i64 -8
  %base_load_bc = bitcast <vscale x 2 x i64>* %base_load to i64*
  %data = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1> %mask,
                                                                  i64* %base_load_bc)
  %base_store = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64> * %base, i64 -7
  %base_store_bc = bitcast <vscale x 2 x i64>* %base_store to i64*
  call void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64> %data,
                                            <vscale x 2 x i1> %mask,
                                            i64* %base_store_bc)
  ret void
}

define void @test_masked_ldst_sv2f64(<vscale x 2 x double> * %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f64:
; CHECK-NEXT: ldnt1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, #-6, mul vl]
; CHECK-NEXT: stnt1d { z[[DATA]].d }, p0, [x0, #-5, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %base, i64 -6
  %base_load_bc = bitcast <vscale x 2 x double>* %base_load to double*
  %data = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1> %mask,
                                                                     double* %base_load_bc)
  %base_store = getelementptr <vscale x 2 x double>, <vscale x 2 x double> * %base, i64 -5
  %base_store_bc = bitcast <vscale x 2 x double>* %base_store to double*
  call void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double> %data,
                                            <vscale x 2 x i1> %mask,
                                            double* %base_store_bc)
  ret void
}

; 4-lane non-temporal load/stores.

define void @test_masked_ldst_sv4i32(<vscale x 4 x i32> * %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i32:
; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %base, i64 6
  %base_load_bc = bitcast <vscale x 4 x i32>* %base_load to i32*
  %data = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1> %mask,
                                                                  i32* %base_load_bc)
  %base_store = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32> * %base, i64 7
  %base_store_bc = bitcast <vscale x 4 x i32>* %base_store to i32*
  call void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32> %data,
                                            <vscale x 4 x i1> %mask,
                                            i32* %base_store_bc)
  ret void
}

define void @test_masked_ldst_sv4f32(<vscale x 4 x float> * %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f32:
; CHECK-NEXT: ldnt1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT: stnt1w { z[[DATA]].s }, p0, [x0, #2, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %base, i64 -1
  %base_load_bc = bitcast <vscale x 4 x float>* %base_load to float*
  %data = call <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1> %mask,
                                                                    float* %base_load_bc)
  %base_store = getelementptr <vscale x 4 x float>, <vscale x 4 x float> * %base, i64 2
  %base_store_bc = bitcast <vscale x 4 x float>* %base_store to float*
  call void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float> %data,
                                            <vscale x 4 x i1> %mask,
                                            float* %base_store_bc)
  ret void
}


; 8-lane non-temporal load/stores.

define void @test_masked_ldst_sv8i16(<vscale x 8 x i16> * %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i16:
; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %base, i64 6
  %base_load_bc = bitcast <vscale x 8 x i16>* %base_load to i16*
  %data = call <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1> %mask,
                                                                  i16* %base_load_bc)
  %base_store = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16> * %base, i64 7
  %base_store_bc = bitcast <vscale x 8 x i16>* %base_store to i16*
  call void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16> %data,
                                            <vscale x 8 x i1> %mask,
                                            i16* %base_store_bc)
  ret void
}

define void @test_masked_ldst_sv8f16(<vscale x 8 x half> * %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8f16:
; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, #2, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %base, i64 -1
  %base_load_bc = bitcast <vscale x 8 x half>* %base_load to half*
  %data = call <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1> %mask,
                                                                   half* %base_load_bc)
  %base_store = getelementptr <vscale x 8 x half>, <vscale x 8 x half> * %base, i64 2
  %base_store_bc = bitcast <vscale x 8 x half>* %base_store to half*
  call void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half> %data,
                                            <vscale x 8 x i1> %mask,
                                            half* %base_store_bc)
  ret void
}

define void @test_masked_ldst_sv8bf16(<vscale x 8 x bfloat> * %base, <vscale x 8 x i1> %mask) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK-NEXT: ldnt1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT: stnt1h { z[[DATA]].h }, p0, [x0, #2, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base, i64 -1
  %base_load_bc = bitcast <vscale x 8 x bfloat>* %base_load to bfloat*
  %data = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %mask,
                                                                      bfloat* %base_load_bc)
  %base_store = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat> * %base, i64 2
  %base_store_bc = bitcast <vscale x 8 x bfloat>* %base_store to bfloat*
  call void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat> %data,
                                             <vscale x 8 x i1> %mask,
                                             bfloat* %base_store_bc)
  ret void
}

; 16-lane non-temporal load/stores.

define void @test_masked_ldst_sv16i8(<vscale x 16 x i8> * %base, <vscale x 16 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv16i8:
; CHECK-NEXT: ldnt1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT: stnt1b { z[[DATA]].b }, p0, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_load = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base, i64 6
  %base_load_bc = bitcast <vscale x 16 x i8>* %base_load to i8*
  %data = call <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1> %mask,
                                                                  i8* %base_load_bc)
  %base_store = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8> * %base, i64 7
  %base_store_bc = bitcast <vscale x 16 x i8>* %base_store to i8*
  call void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8> %data,
                                            <vscale x 16 x i1> %mask,
                                            i8* %base_store_bc)
  ret void
}

; 2-element non-temporal loads.
declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, i64*)
declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, double*)

; 4-element non-temporal loads.
declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1>, i32*)
declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, float*)

; 8-element non-temporal loads.
declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, half*)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1>, bfloat*)

; 16-element non-temporal loads.
declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, i8*)

; 2-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
declare void @llvm.aarch64.sve.stnt1.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*)

; 4-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
declare void @llvm.aarch64.sve.stnt1.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float*)

; 8-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16*)
declare void @llvm.aarch64.sve.stnt1.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half*)
declare void @llvm.aarch64.sve.stnt1.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)

; 16-element non-temporal stores.
declare void @llvm.aarch64.sve.stnt1.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8*)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }