; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t

; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning

;
; LD1B
;

define <vscale x 16 x i8> @ld1b_upper_bound(<vscale x 16 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1b_upper_bound:
; CHECK: ld1b { z0.b }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 16 x i8>*
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 16 x i8>* %base to i8*
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pg, i8* %base_scalar)
  ret <vscale x 16 x i8> %load
}

define <vscale x 16 x i8> @ld1b_inbound(<vscale x 16 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1b_inbound:
; CHECK: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 16 x i8>*
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base_scalable, i64 1
  %base_scalar = bitcast <vscale x 16 x i8>* %base to i8*
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pg, i8* %base_scalar)
  ret <vscale x 16 x i8> %load
}

define <vscale x 4 x i32> @ld1b_s_inbound(<vscale x 4 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1b_s_inbound:
; CHECK: ld1b { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 4 x i8>*
  %base = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 4 x i8>* %base to i8*
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1> %pg, i8* %base_scalar)
  %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @ld1sb_s_inbound(<vscale x 4 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1sb_s_inbound:
; CHECK: ld1sb { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 4 x i8>*
  %base = getelementptr <vscale x 4 x i8>, <vscale x 4 x i8>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 4 x i8>* %base to i8*
  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1> %pg, i8* %base_scalar)
  %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 16 x i8> @ld1b_lower_bound(<vscale x 16 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1b_lower_bound:
; CHECK: ld1b { z0.b }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 16 x i8>*
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base_scalable, i64 -8
  %base_scalar = bitcast <vscale x 16 x i8>* %base to i8*
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pg, i8* %base_scalar)
  ret <vscale x 16 x i8> %load
}

define <vscale x 16 x i8> @ld1b_out_of_upper_bound(<vscale x 16 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1b_out_of_upper_bound:
; CHECK: rdvl x[[OFFSET:[0-9]+]], #8
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x[[OFFSET]]]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 16 x i8>*
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base_scalable, i64 8
  %base_scalar = bitcast <vscale x 16 x i8>* %base to i8*
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pg, i8* %base_scalar)
  ret <vscale x 16 x i8> %load
}

define <vscale x 16 x i8> @ld1b_out_of_lower_bound(<vscale x 16 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1b_out_of_lower_bound:
; CHECK: rdvl x[[OFFSET:[0-9]+]], #-9
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x[[OFFSET]]]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 16 x i8>*
  %base = getelementptr <vscale x 16 x i8>, <vscale x 16 x i8>* %base_scalable, i64 -9
  %base_scalar = bitcast <vscale x 16 x i8>* %base to i8*
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1> %pg, i8* %base_scalar)
  ret <vscale x 16 x i8> %load
}

;
; LD1H
;

define <vscale x 8 x i16> @ld1b_h_inbound(<vscale x 8 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1b_h_inbound:
; CHECK: ld1b { z0.h }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 8 x i8>*
  %base = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 8 x i8>* %base to i8*
  %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1> %pg, i8* %base_scalar)
  %res = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x i16> @ld1sb_h_inbound(<vscale x 8 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1sb_h_inbound:
; CHECK: ld1sb { z0.h }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 8 x i8>*
  %base = getelementptr <vscale x 8 x i8>, <vscale x 8 x i8>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 8 x i8>* %base to i8*
  %load = call <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1> %pg, i8* %base_scalar)
  %res = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x i16> @ld1h_inbound(<vscale x 8 x i1> %pg, i16* %a) {
; CHECK-LABEL: ld1h_inbound:
; CHECK: ld1h { z0.h }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i16* %a to <vscale x 8 x i16>*
  %base = getelementptr <vscale x 8 x i16>, <vscale x 8 x i16>* %base_scalable, i64 1
  %base_scalar = bitcast <vscale x 8 x i16>* %base to i16*
  %load = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1> %pg, i16* %base_scalar)
  ret <vscale x 8 x i16> %load
}

define <vscale x 4 x i32> @ld1h_s_inbound(<vscale x 4 x i1> %pg, i16* %a) {
; CHECK-LABEL: ld1h_s_inbound:
; CHECK: ld1h { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i16* %a to <vscale x 4 x i16>*
  %base = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 4 x i16>* %base to i16*
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1> %pg, i16* %base_scalar)
  %res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x i32> @ld1sh_s_inbound(<vscale x 4 x i1> %pg, i16* %a) {
; CHECK-LABEL: ld1sh_s_inbound:
; CHECK: ld1sh { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i16* %a to <vscale x 4 x i16>*
  %base = getelementptr <vscale x 4 x i16>, <vscale x 4 x i16>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 4 x i16>* %base to i16*
  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1> %pg, i16* %base_scalar)
  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @ld1b_d_inbound(<vscale x 2 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1b_d_inbound:
; CHECK: ld1b { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 2 x i8>*
  %base = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 2 x i8>* %base to i8*
  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1> %pg, i8* %base_scalar)
  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @ld1sb_d_inbound(<vscale x 2 x i1> %pg, i8* %a) {
; CHECK-LABEL: ld1sb_d_inbound:
; CHECK: ld1sb { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i8* %a to <vscale x 2 x i8>*
  %base = getelementptr <vscale x 2 x i8>, <vscale x 2 x i8>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 2 x i8>* %base to i8*
  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1> %pg, i8* %base_scalar)
  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @ld1h_d_inbound(<vscale x 2 x i1> %pg, i16* %a) {
; CHECK-LABEL: ld1h_d_inbound:
; CHECK: ld1h { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i16* %a to <vscale x 2 x i16>*
  %base = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 2 x i16>* %base to i16*
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1> %pg, i16* %base_scalar)
  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @ld1sh_d_inbound(<vscale x 2 x i1> %pg, i16* %a) {
; CHECK-LABEL: ld1sh_d_inbound:
; CHECK: ld1sh { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i16* %a to <vscale x 2 x i16>*
  %base = getelementptr <vscale x 2 x i16>, <vscale x 2 x i16>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 2 x i16>* %base to i16*
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1> %pg, i16* %base_scalar)
  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 8 x half> @ld1h_f16_inbound(<vscale x 8 x i1> %pg, half* %a) {
; CHECK-LABEL: ld1h_f16_inbound:
; CHECK: ld1h { z0.h }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast half* %a to <vscale x 8 x half>*
  %base = getelementptr <vscale x 8 x half>, <vscale x 8 x half>* %base_scalable, i64 1
  %base_scalar = bitcast <vscale x 8 x half>* %base to half*
  %load = call <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1> %pg, half* %base_scalar)
  ret <vscale x 8 x half> %load
}

define <vscale x 8 x bfloat> @ld1h_bf16_inbound(<vscale x 8 x i1> %pg, bfloat* %a) #0 {
; CHECK-LABEL: ld1h_bf16_inbound:
; CHECK: ld1h { z0.h }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast bfloat* %a to <vscale x 8 x bfloat>*
  %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base_scalable, i64 1
  %base_scalar = bitcast <vscale x 8 x bfloat>* %base to bfloat*
  %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %base_scalar)
  ret <vscale x 8 x bfloat> %load
}

;
; LD1W
;

define <vscale x 4 x i32> @ld1w_inbound(<vscale x 4 x i1> %pg, i32* %a) {
; CHECK-LABEL: ld1w_inbound:
; CHECK: ld1w { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i32* %a to <vscale x 4 x i32>*
  %base = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 4 x i32>* %base to i32*
  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %pg, i32* %base_scalar)
  ret <vscale x 4 x i32> %load
}

define <vscale x 4 x float> @ld1w_f32_inbound(<vscale x 4 x i1> %pg, float* %a) {
; CHECK-LABEL: ld1w_f32_inbound:
; CHECK: ld1w { z0.s }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast float* %a to <vscale x 4 x float>*
  %base = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 4 x float>* %base to float*
  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1.nxv4f32(<vscale x 4 x i1> %pg, float* %base_scalar)
  ret <vscale x 4 x float> %load
}

;
; LD1D
;

define <vscale x 2 x i64> @ld1d_inbound(<vscale x 2 x i1> %pg, i64* %a) {
; CHECK-LABEL: ld1d_inbound:
; CHECK: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i64* %a to <vscale x 2 x i64>*
  %base = getelementptr <vscale x 2 x i64>, <vscale x 2 x i64>* %base_scalable, i64 1
  %base_scalar = bitcast <vscale x 2 x i64>* %base to i64*
  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.nxv2i64(<vscale x 2 x i1> %pg, i64* %base_scalar)
  ret <vscale x 2 x i64> %load
}

define <vscale x 2 x i64> @ld1w_d_inbound(<vscale x 2 x i1> %pg, i32* %a) {
; CHECK-LABEL: ld1w_d_inbound:
; CHECK: ld1w { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i32* %a to <vscale x 2 x i32>*
  %base = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 2 x i32>* %base to i32*
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> %pg, i32* %base_scalar)
  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x i64> @ld1sw_d_inbound(<vscale x 2 x i1> %pg, i32* %a) {
; CHECK-LABEL: ld1sw_d_inbound:
; CHECK: ld1sw { z0.d }, p0/z, [x0, #7, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast i32* %a to <vscale x 2 x i32>*
  %base = getelementptr <vscale x 2 x i32>, <vscale x 2 x i32>* %base_scalable, i64 7
  %base_scalar = bitcast <vscale x 2 x i32>* %base to i32*
  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1> %pg, i32* %base_scalar)
  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x double> @ld1d_f64_inbound(<vscale x 2 x i1> %pg, double* %a) {
; CHECK-LABEL: ld1d_f64_inbound:
; CHECK: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %base_scalable = bitcast double* %a to <vscale x 2 x double>*
  %base = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %base_scalable, i64 1
  %base_scalar = bitcast <vscale x 2 x double>* %base to double*
  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1> %pg, double* %base_scalar)
  ret <vscale x 2 x double> %load
}

declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1.nxv16i8(<vscale x 16 x i1>, i8*)

declare <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1>, i8*)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1>, half*)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1>, bfloat*)

declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1>, i8*)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1>, i16*)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1>, i32*)
declare <vscale x 4 x float> @llvm.aarch64.sve.ld1.nxv4f32(<vscale x 4 x i1>, float*)

declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.nxv2i8(<vscale x 2 x i1>, i8*)
declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.nxv2i16(<vscale x 2 x i1>, i16*)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.nxv2i32(<vscale x 2 x i1>, i32*)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.nxv2i64(<vscale x 2 x i1>, i64*)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1>, double*)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }