; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 -asm-verbose=0 < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t

; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning

;
; LD1RQB
;

define <vscale x 16 x i8> @ld1rqb_i8(<vscale x 16 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1rqb_i8:
; CHECK: ld1rqb { z0.b }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %addr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_imm(<vscale x 16 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1rqb_i8_imm:
; CHECK: ld1rqb { z0.b }, p0/z, [x0, #16]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i8, i8* %addr, i8 16
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_imm_lower_bound(<vscale x 16 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_lower_bound:
; CHECK: ld1rqb { z0.b }, p0/z, [x0, #-128]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i8, i8* %addr, i8 -128
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_imm_upper_bound(<vscale x 16 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_upper_bound:
; CHECK: ld1rqb { z0.b }, p0/z, [x0, #112]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i8, i8* %addr, i8 112
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_imm_out_of_lower_bound(<vscale x 16 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_out_of_lower_bound:
; CHECK: sub x8, x0, #129
; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x8]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i8, i8* %addr, i64 -129
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_imm_out_of_upper_bound(<vscale x 16 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_out_of_upper_bound:
; CHECK: add x8, x0, #113
; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x8]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i8, i8* %addr, i64 113
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, i8* %ptr)
  ret <vscale x 16 x i8> %res
}
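
; Note: the immediate form of LD1RQ takes a signed byte offset that is a
; multiple of 16 in the range [-128, 112]. Offsets outside that range are
; materialised into a register with a separate add/sub, as the out-of-bound
; tests above check.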

;
; LD1RQH
;

define <vscale x 8 x i16> @ld1rqh_i16(<vscale x 8 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ld1rqh_i16:
; CHECK: ld1rqh { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %pred, i16* %addr)
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x half> @ld1rqh_f16(<vscale x 8 x i1> %pred, half* %addr) {
; CHECK-LABEL: ld1rqh_f16:
; CHECK: ld1rqh { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1> %pred, half* %addr)
  ret <vscale x 8 x half> %res
}

define <vscale x 8 x i16> @ld1rqh_i16_imm(<vscale x 8 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ld1rqh_i16_imm:
; CHECK: ld1rqh { z0.h }, p0/z, [x0, #-64]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i16, i16* %addr, i16 -32
  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %pred, i16* %ptr)
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x half> @ld1rqh_f16_imm(<vscale x 8 x i1> %pred, half* %addr) {
; CHECK-LABEL: ld1rqh_f16_imm:
; CHECK: ld1rqh { z0.h }, p0/z, [x0, #-16]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds half, half* %addr, i16 -8
  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1> %pred, half* %ptr)
  ret <vscale x 8 x half> %res
}

define <vscale x 8 x bfloat> @ld1rqh_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
; CHECK-LABEL: ld1rqh_bf16:
; CHECK: ld1rqh { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
  ret <vscale x 8 x bfloat> %res
}

define <vscale x 8 x bfloat> @ld1rqh_bf16_imm(<vscale x 8 x i1> %pred, bfloat* %addr) {
; CHECK-LABEL: ld1rqh_bf16_imm:
; CHECK: ld1rqh { z0.h }, p0/z, [x0, #-16]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds bfloat, bfloat* %addr, i16 -8
  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %ptr)
  ret <vscale x 8 x bfloat> %res
}

;
; LD1RQW
;

define <vscale x 4 x i32> @ld1rqw_i32(<vscale x 4 x i1> %pred, i32* %addr) {
; CHECK-LABEL: ld1rqw_i32:
; CHECK: ld1rqw { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %pred, i32* %addr)
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x float> @ld1rqw_f32(<vscale x 4 x i1> %pred, float* %addr) {
; CHECK-LABEL: ld1rqw_f32:
; CHECK: ld1rqw { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1> %pred, float* %addr)
  ret <vscale x 4 x float> %res
}

define <vscale x 4 x i32> @ld1rqw_i32_imm(<vscale x 4 x i1> %pred, i32* %addr) {
; CHECK-LABEL: ld1rqw_i32_imm:
; CHECK: ld1rqw { z0.s }, p0/z, [x0, #112]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i32, i32* %addr, i32 28
  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %pred, i32* %ptr)
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x float> @ld1rqw_f32_imm(<vscale x 4 x i1> %pred, float* %addr) {
; CHECK-LABEL: ld1rqw_f32_imm:
; CHECK: ld1rqw { z0.s }, p0/z, [x0, #32]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds float, float* %addr, i32 8
  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1> %pred, float* %ptr)
  ret <vscale x 4 x float> %res
}

;
; LD1RQD
;

define <vscale x 2 x i64> @ld1rqd_i64(<vscale x 2 x i1> %pred, i64* %addr) {
; CHECK-LABEL: ld1rqd_i64:
; CHECK: ld1rqd { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %pred, i64* %addr)
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x double> @ld1rqd_f64(<vscale x 2 x i1> %pred, double* %addr) {
; CHECK-LABEL: ld1rqd_f64:
; CHECK: ld1rqd { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1> %pred, double* %addr)
  ret <vscale x 2 x double> %res
}

define <vscale x 2 x i64> @ld1rqd_i64_imm(<vscale x 2 x i1> %pred, i64* %addr) {
; CHECK-LABEL: ld1rqd_i64_imm:
; CHECK: ld1rqd { z0.d }, p0/z, [x0, #64]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i64, i64* %addr, i64 8
  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %pred, i64* %ptr)
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x double> @ld1rqd_f64_imm(<vscale x 2 x i1> %pred, double* %addr) {
; CHECK-LABEL: ld1rqd_f64_imm:
; CHECK: ld1rqd { z0.d }, p0/z, [x0, #-128]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds double, double* %addr, i64 -16
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1> %pred, double* %ptr)
  ret <vscale x 2 x double> %res
}
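
; LDNT1* are the non-temporal variants of the contiguous predicated loads:
; same addressing and predication as LD1*, but with a hint that the loaded
; data need not be retained in the cache hierarchy.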

;
; LDNT1B
;

define <vscale x 16 x i8> @ldnt1b_i8(<vscale x 16 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ldnt1b_i8:
; CHECK: ldnt1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1> %pred, i8* %addr)
  ret <vscale x 16 x i8> %res
}

;
; LDNT1H
;

define <vscale x 8 x i16> @ldnt1h_i16(<vscale x 8 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ldnt1h_i16:
; CHECK: ldnt1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1> %pred, i16* %addr)
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x half> @ldnt1h_f16(<vscale x 8 x i1> %pred, half* %addr) {
; CHECK-LABEL: ldnt1h_f16:
; CHECK: ldnt1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1> %pred, half* %addr)
  ret <vscale x 8 x half> %res
}

define <vscale x 8 x bfloat> @ldnt1h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
; CHECK-LABEL: ldnt1h_bf16:
; CHECK: ldnt1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
  ret <vscale x 8 x bfloat> %res
}

;
; LDNT1W
;

define <vscale x 4 x i32> @ldnt1w_i32(<vscale x 4 x i1> %pred, i32* %addr) {
; CHECK-LABEL: ldnt1w_i32:
; CHECK: ldnt1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1> %pred, i32* %addr)
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x float> @ldnt1w_f32(<vscale x 4 x i1> %pred, float* %addr) {
; CHECK-LABEL: ldnt1w_f32:
; CHECK: ldnt1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1> %pred, float* %addr)
  ret <vscale x 4 x float> %res
}

;
; LDNT1D
;

define <vscale x 2 x i64> @ldnt1d_i64(<vscale x 2 x i1> %pred, i64* %addr) {
; CHECK-LABEL: ldnt1d_i64:
; CHECK: ldnt1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1> %pred, i64* %addr)
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x double> @ldnt1d_f64(<vscale x 2 x i1> %pred, double* %addr) {
; CHECK-LABEL: ldnt1d_f64:
; CHECK: ldnt1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1> %pred, double* %addr)
  ret <vscale x 2 x double> %res
}
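
; The ldN intrinsics below are the de-interleaving structure loads. Each one
; returns a single wide scalable vector covering the N consecutive destination
; registers, e.g. <vscale x 32 x i8> represents { z0.b, z1.b } for LD2B.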

;
; LD2B
;

define <vscale x 32 x i8> @ld2b_i8(<vscale x 16 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld2b_i8:
; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(<vscale x 16 x i1> %pred, i8* %addr)
  ret <vscale x 32 x i8> %res
}

;
; LD2H
;

define <vscale x 16 x i16> @ld2h_i16(<vscale x 8 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ld2h_i16:
; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16(<vscale x 8 x i1> %pred, i16* %addr)
  ret <vscale x 16 x i16> %res
}

define <vscale x 16 x half> @ld2h_f16(<vscale x 8 x i1> %pred, half* %addr) {
; CHECK-LABEL: ld2h_f16:
; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(<vscale x 8 x i1> %pred, half* %addr)
  ret <vscale x 16 x half> %res
}

define <vscale x 16 x bfloat> @ld2h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
; CHECK-LABEL: ld2h_bf16:
; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 16 x bfloat> @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
  ret <vscale x 16 x bfloat> %res
}

;
; LD2W
;

define <vscale x 8 x i32> @ld2w_i32(<vscale x 4 x i1> %pred, i32* %addr) {
; CHECK-LABEL: ld2w_i32:
; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(<vscale x 4 x i1> %pred, i32* %addr)
  ret <vscale x 8 x i32> %res
}

define <vscale x 8 x float> @ld2w_f32(<vscale x 4 x i1> %pred, float* %addr) {
; CHECK-LABEL: ld2w_f32:
; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(<vscale x 4 x i1> %pred, float* %addr)
  ret <vscale x 8 x float> %res
}

;
; LD2D
;

define <vscale x 4 x i64> @ld2d_i64(<vscale x 2 x i1> %pred, i64* %addr) {
; CHECK-LABEL: ld2d_i64:
; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(<vscale x 2 x i1> %pred, i64* %addr)
  ret <vscale x 4 x i64> %res
}

define <vscale x 4 x double> @ld2d_f64(<vscale x 2 x i1> %pred, double* %addr) {
; CHECK-LABEL: ld2d_f64:
; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(<vscale x 2 x i1> %pred, double* %addr)
  ret <vscale x 4 x double> %res
}

;
; LD3B
;

define <vscale x 48 x i8> @ld3b_i8(<vscale x 16 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld3b_i8:
; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(<vscale x 16 x i1> %pred, i8* %addr)
  ret <vscale x 48 x i8> %res
}

;
; LD3H
;

define <vscale x 24 x i16> @ld3h_i16(<vscale x 8 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ld3h_i16:
; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16(<vscale x 8 x i1> %pred, i16* %addr)
  ret <vscale x 24 x i16> %res
}

define <vscale x 24 x half> @ld3h_f16(<vscale x 8 x i1> %pred, half* %addr) {
; CHECK-LABEL: ld3h_f16:
; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(<vscale x 8 x i1> %pred, half* %addr)
  ret <vscale x 24 x half> %res
}

define <vscale x 24 x bfloat> @ld3h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
; CHECK-LABEL: ld3h_bf16:
; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 24 x bfloat> @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
  ret <vscale x 24 x bfloat> %res
}

;
; LD3W
;

define <vscale x 12 x i32> @ld3w_i32(<vscale x 4 x i1> %pred, i32* %addr) {
; CHECK-LABEL: ld3w_i32:
; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(<vscale x 4 x i1> %pred, i32* %addr)
  ret <vscale x 12 x i32> %res
}

define <vscale x 12 x float> @ld3w_f32(<vscale x 4 x i1> %pred, float* %addr) {
; CHECK-LABEL: ld3w_f32:
; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(<vscale x 4 x i1> %pred, float* %addr)
  ret <vscale x 12 x float> %res
}

;
; LD3D
;

define <vscale x 6 x i64> @ld3d_i64(<vscale x 2 x i1> %pred, i64* %addr) {
; CHECK-LABEL: ld3d_i64:
; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(<vscale x 2 x i1> %pred, i64* %addr)
  ret <vscale x 6 x i64> %res
}

define <vscale x 6 x double> @ld3d_f64(<vscale x 2 x i1> %pred, double* %addr) {
; CHECK-LABEL: ld3d_f64:
; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(<vscale x 2 x i1> %pred, double* %addr)
  ret <vscale x 6 x double> %res
}

;
; LD4B
;

define <vscale x 64 x i8> @ld4b_i8(<vscale x 16 x i1> %pred, i8* %addr) {
; CHECK-LABEL: ld4b_i8:
; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(<vscale x 16 x i1> %pred, i8* %addr)
  ret <vscale x 64 x i8> %res
}

;
; LD4H
;

define <vscale x 32 x i16> @ld4h_i16(<vscale x 8 x i1> %pred, i16* %addr) {
; CHECK-LABEL: ld4h_i16:
; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16(<vscale x 8 x i1> %pred, i16* %addr)
  ret <vscale x 32 x i16> %res
}

define <vscale x 32 x half> @ld4h_f16(<vscale x 8 x i1> %pred, half* %addr) {
; CHECK-LABEL: ld4h_f16:
; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(<vscale x 8 x i1> %pred, half* %addr)
  ret <vscale x 32 x half> %res
}

define <vscale x 32 x bfloat> @ld4h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
; CHECK-LABEL: ld4h_bf16:
; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 32 x bfloat> @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
  ret <vscale x 32 x bfloat> %res
}

;
; LD4W
;

define <vscale x 16 x i32> @ld4w_i32(<vscale x 4 x i1> %pred, i32* %addr) {
; CHECK-LABEL: ld4w_i32:
; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(<vscale x 4 x i1> %pred, i32* %addr)
  ret <vscale x 16 x i32> %res
}

define <vscale x 16 x float> @ld4w_f32(<vscale x 4 x i1> %pred, float* %addr) {
; CHECK-LABEL: ld4w_f32:
; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(<vscale x 4 x i1> %pred, float* %addr)
  ret <vscale x 16 x float> %res
}

;
; LD4D
;

define <vscale x 8 x i64> @ld4d_i64(<vscale x 2 x i1> %pred, i64* %addr) {
; CHECK-LABEL: ld4d_i64:
; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(<vscale x 2 x i1> %pred, i64* %addr)
  ret <vscale x 8 x i64> %res
}

define <vscale x 8 x double> @ld4d_f64(<vscale x 2 x i1> %pred, double* %addr) {
; CHECK-LABEL: ld4d_f64:
; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(<vscale x 2 x i1> %pred, double* %addr)
  ret <vscale x 8 x double> %res
}
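
; Intrinsic declarations. The ldN intrinsics are mangled on the result vector
; type, the predicate type and the pointer type (e.g. .nxv32i8.nxv16i1.p0i8);
; ld1rq and ldnt1 are mangled on the result vector type only.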

declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1>, i8*)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1>, i32*)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1>, i64*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1>, half*)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1>, float*)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1>, double*)

declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, i8*)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1>, i32*)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, i64*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, half*)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, float*)
declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, double*)

declare <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(<vscale x 16 x i1>, i8*)
declare <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16(<vscale x 8 x i1>, i16*)
declare <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
declare <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
declare <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
declare <vscale x 16 x bfloat> @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
declare <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)

declare <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(<vscale x 16 x i1>, i8*)
declare <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16(<vscale x 8 x i1>, i16*)
declare <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
declare <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
declare <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
declare <vscale x 24 x bfloat> @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
declare <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)

declare <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(<vscale x 16 x i1>, i8*)
declare <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16(<vscale x 8 x i1>, i16*)
declare <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
declare <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
declare <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
declare <vscale x 32 x bfloat> @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)