; RUN: llc -mtriple=aarch64--linux-gnu -mattr=sve < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t

; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning

; ld2b
define <vscale x 32 x i8> @ld2.nxv32i8(<vscale x 16 x i1> %Pg, i8 *%addr, i64 %a) {
; CHECK-LABEL: ld2.nxv32i8:
; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0, x1]
; CHECK-NEXT: ret
%addr2 = getelementptr i8, i8 * %addr, i64 %a
%res = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(<vscale x 16 x i1> %Pg, i8 *%addr2)
ret <vscale x 32 x i8> %res
}

; ld2h
define <vscale x 16 x i16> @ld2.nxv16i16(<vscale x 8 x i1> %Pg, i16 *%addr, i64 %a) {
; CHECK-LABEL: ld2.nxv16i16:
; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%addr2 = getelementptr i16, i16 * %addr, i64 %a
%res = call <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16(<vscale x 8 x i1> %Pg, i16 *%addr2)
ret <vscale x 16 x i16> %res
}

define <vscale x 16 x half> @ld2.nxv16f16(<vscale x 8 x i1> %Pg, half *%addr, i64 %a) {
; CHECK-LABEL: ld2.nxv16f16:
; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%addr2 = getelementptr half, half * %addr, i64 %a
%res = call <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(<vscale x 8 x i1> %Pg, half *%addr2)
ret <vscale x 16 x half> %res
}

define <vscale x 16 x bfloat> @ld2.nxv16bf16(<vscale x 8 x i1> %Pg, bfloat *%addr, i64 %a) #0 {
; CHECK-LABEL: ld2.nxv16bf16:
; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%addr2 = getelementptr bfloat, bfloat * %addr, i64 %a
%res = call <vscale x 16 x bfloat> @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(<vscale x 8 x i1> %Pg, bfloat *%addr2)
ret <vscale x 16 x bfloat> %res
}

; ld2w
define <vscale x 8 x i32> @ld2.nxv8i32(<vscale x 4 x i1> %Pg, i32 *%addr, i64 %a) {
; CHECK-LABEL: ld2.nxv8i32:
; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%addr2 = getelementptr i32, i32 * %addr, i64 %a
%res = call <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(<vscale x 4 x i1> %Pg, i32 *%addr2)
ret <vscale x 8 x i32> %res
}

define <vscale x 8 x float> @ld2.nxv8f32(<vscale x 4 x i1> %Pg, float *%addr, i64 %a) {
; CHECK-LABEL: ld2.nxv8f32:
; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%addr2 = getelementptr float, float * %addr, i64 %a
%res = call <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(<vscale x 4 x i1> %Pg, float *%addr2)
ret <vscale x 8 x float> %res
}

; ld2d
define <vscale x 4 x i64> @ld2.nxv4i64(<vscale x 2 x i1> %Pg, i64 *%addr, i64 %a) {
; CHECK-LABEL: ld2.nxv4i64:
; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ret
%addr2 = getelementptr i64, i64 * %addr, i64 %a
%res = call <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(<vscale x 2 x i1> %Pg, i64 *%addr2)
ret <vscale x 4 x i64> %res
}

define <vscale x 4 x double> @ld2.nxv4f64(<vscale x 2 x i1> %Pg, double *%addr, i64 %a) {
; CHECK-LABEL: ld2.nxv4f64:
; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ret
%addr2 = getelementptr double, double * %addr, i64 %a
%res = call <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(<vscale x 2 x i1> %Pg, double *%addr2)
ret <vscale x 4 x double> %res
}

; ld3b
define <vscale x 48 x i8> @ld3.nxv48i8(<vscale x 16 x i1> %Pg, i8 *%addr, i64 %a) {
; CHECK-LABEL: ld3.nxv48i8:
; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0, x1]
; CHECK-NEXT: ret
%addr2 = getelementptr i8, i8 * %addr, i64 %a
%res = call <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(<vscale x 16 x i1> %Pg, i8 *%addr2)
ret <vscale x 48 x i8> %res
}

; ld3h
define <vscale x 24 x i16> @ld3.nxv24i16(<vscale x 8 x i1> %Pg, i16 *%addr, i64 %a) {
; CHECK-LABEL: ld3.nxv24i16:
; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%addr2 = getelementptr i16, i16 * %addr, i64 %a
%res = call <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16(<vscale x 8 x i1> %Pg, i16 *%addr2)
ret <vscale x 24 x i16> %res
}

define <vscale x 24 x half> @ld3.nxv24f16(<vscale x 8 x i1> %Pg, half *%addr, i64 %a) {
; CHECK-LABEL: ld3.nxv24f16:
; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%addr2 = getelementptr half, half * %addr, i64 %a
%res = call <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(<vscale x 8 x i1> %Pg, half *%addr2)
ret <vscale x 24 x half> %res
}

define <vscale x 24 x bfloat> @ld3.nxv24bf16(<vscale x 8 x i1> %Pg, bfloat *%addr, i64 %a) #0 {
; CHECK-LABEL: ld3.nxv24bf16:
; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%addr2 = getelementptr bfloat, bfloat * %addr, i64 %a
%res = call <vscale x 24 x bfloat> @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(<vscale x 8 x i1> %Pg, bfloat *%addr2)
ret <vscale x 24 x bfloat> %res
}

; ld3w
define <vscale x 12 x i32> @ld3.nxv12i32(<vscale x 4 x i1> %Pg, i32 *%addr, i64 %a) {
; CHECK-LABEL: ld3.nxv12i32:
; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%addr2 = getelementptr i32, i32 * %addr, i64 %a
%res = call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(<vscale x 4 x i1> %Pg, i32 *%addr2)
ret <vscale x 12 x i32> %res
}

define <vscale x 12 x float> @ld3.nxv12f32(<vscale x 4 x i1> %Pg, float *%addr, i64 %a) {
; CHECK-LABEL: ld3.nxv12f32:
; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%addr2 = getelementptr float, float * %addr, i64 %a
%res = call <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(<vscale x 4 x i1> %Pg, float *%addr2)
ret <vscale x 12 x float> %res
}

; ld3d
define <vscale x 6 x i64> @ld3.nxv6i64(<vscale x 2 x i1> %Pg, i64 *%addr, i64 %a) {
; CHECK-LABEL: ld3.nxv6i64:
; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ret
%addr2 = getelementptr i64, i64 * %addr, i64 %a
%res = call <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(<vscale x 2 x i1> %Pg, i64 *%addr2)
ret <vscale x 6 x i64> %res
}

define <vscale x 6 x double> @ld3.nxv6f64(<vscale x 2 x i1> %Pg, double *%addr, i64 %a) {
; CHECK-LABEL: ld3.nxv6f64:
; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ret
%addr2 = getelementptr double, double * %addr, i64 %a
%res = call <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(<vscale x 2 x i1> %Pg, double *%addr2)
ret <vscale x 6 x double> %res
}

; ld4b
define <vscale x 64 x i8> @ld4.nxv64i8(<vscale x 16 x i1> %Pg, i8 *%addr, i64 %a) {
; CHECK-LABEL: ld4.nxv64i8:
; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0, x1]
; CHECK-NEXT: ret
%addr2 = getelementptr i8, i8 * %addr, i64 %a
%res = call <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(<vscale x 16 x i1> %Pg, i8 *%addr2)
ret <vscale x 64 x i8> %res
}

; ld4h
define <vscale x 32 x i16> @ld4.nxv32i16(<vscale x 8 x i1> %Pg, i16 *%addr, i64 %a) {
; CHECK-LABEL: ld4.nxv32i16:
; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%addr2 = getelementptr i16, i16 * %addr, i64 %a
%res = call <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16(<vscale x 8 x i1> %Pg, i16 *%addr2)
ret <vscale x 32 x i16> %res
}

define <vscale x 32 x half> @ld4.nxv32f16(<vscale x 8 x i1> %Pg, half *%addr, i64 %a) {
; CHECK-LABEL: ld4.nxv32f16:
; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%addr2 = getelementptr half, half * %addr, i64 %a
%res = call <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(<vscale x 8 x i1> %Pg, half *%addr2)
ret <vscale x 32 x half> %res
}

define <vscale x 32 x bfloat> @ld4.nxv32bf16(<vscale x 8 x i1> %Pg, bfloat *%addr, i64 %a) #0 {
; CHECK-LABEL: ld4.nxv32bf16:
; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
%addr2 = getelementptr bfloat, bfloat * %addr, i64 %a
%res = call <vscale x 32 x bfloat> @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(<vscale x 8 x i1> %Pg, bfloat *%addr2)
ret <vscale x 32 x bfloat> %res
}

; ld4w
define <vscale x 16 x i32> @ld4.nxv16i32(<vscale x 4 x i1> %Pg, i32 *%addr, i64 %a) {
; CHECK-LABEL: ld4.nxv16i32:
; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%addr2 = getelementptr i32, i32 * %addr, i64 %a
%res = call <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(<vscale x 4 x i1> %Pg, i32 *%addr2)
ret <vscale x 16 x i32> %res
}

define <vscale x 16 x float> @ld4.nxv16f32(<vscale x 4 x i1> %Pg, float *%addr, i64 %a) {
; CHECK-LABEL: ld4.nxv16f32:
; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
%addr2 = getelementptr float, float * %addr, i64 %a
%res = call <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(<vscale x 4 x i1> %Pg, float *%addr2)
ret <vscale x 16 x float> %res
}

; ld4d
define <vscale x 8 x i64> @ld4.nxv8i64(<vscale x 2 x i1> %Pg, i64 *%addr, i64 %a) {
; CHECK-LABEL: ld4.nxv8i64:
; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ret
%addr2 = getelementptr i64, i64 * %addr, i64 %a
%res = call <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(<vscale x 2 x i1> %Pg, i64 *%addr2)
ret <vscale x 8 x i64> %res
}

define <vscale x 8 x double> @ld4.nxv8f64(<vscale x 2 x i1> %Pg, double *%addr, i64 %a) {
; CHECK-LABEL: ld4.nxv8f64:
; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ret
%addr2 = getelementptr double, double * %addr, i64 %a
%res = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(<vscale x 2 x i1> %Pg, double *%addr2)
ret <vscale x 8 x double> %res
}

declare <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8.nxv16i1.p0i8(<vscale x 16 x i1>, i8*)
declare <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16.nxv8i1.p0i16(<vscale x 8 x i1>, i16*)
declare <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
declare <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
declare <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
declare <vscale x 16 x bfloat> @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
declare <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)

declare <vscale x 48 x i8> @llvm.aarch64.sve.ld3.nxv48i8.nxv16i1.p0i8(<vscale x 16 x i1>, i8*)
declare <vscale x 24 x i16> @llvm.aarch64.sve.ld3.nxv24i16.nxv8i1.p0i16(<vscale x 8 x i1>, i16*)
declare <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
declare <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
declare <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
declare <vscale x 24 x bfloat> @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
declare <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)

declare <vscale x 64 x i8> @llvm.aarch64.sve.ld4.nxv64i8.nxv16i1.p0i8(<vscale x 16 x i1>, i8*)
declare <vscale x 32 x i16> @llvm.aarch64.sve.ld4.nxv32i16.nxv8i1.p0i16(<vscale x 8 x i1>, i16*)
declare <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
declare <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
declare <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
declare <vscale x 32 x bfloat> @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
declare <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }