; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t

; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning

; 2-lane contiguous load/stores

define void @test_masked_ldst_sv2i8(i8 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %data = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8>* %base_addr, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i16(i16 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i16:
; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %data = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16>* %base_addr, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i32(i32 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i32:
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %data = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32>* %base_addr, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i64(i64 * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i64:
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_i64 = getelementptr i64, i64* %base, i64 %offset
  %base_addr = bitcast i64* %base_i64 to <vscale x 2 x i64>*
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64>* %base_addr, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f16(half * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_half = getelementptr half, half* %base, i64 %offset
  %base_addr = bitcast half* %base_half to <vscale x 2 x half>*
  %data = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
  call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half>* %base_addr, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f32(float * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_float = getelementptr float, float* %base, i64 %offset
  %base_addr = bitcast float* %base_float to <vscale x 2 x float>*
  %data = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
  call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float>* %base_addr, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f64(double * %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f64:
; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_double = getelementptr double, double* %base, i64 %offset
  %base_addr = bitcast double* %base_double to <vscale x 2 x double>*
  %data = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
  call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double>* %base_addr, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane zero/sign extended contiguous loads.

define <vscale x 2 x i64> @masked_zload_sv2i8_to_sv2i64(i8* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i8_to_sv2i64(i8* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(<vscale x 2 x i8>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i16_to_sv2i64(i16* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i16_to_sv2i64(i16* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i32_to_sv2i64(i32* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i32_to_sv2i64(i32* %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>* %base_addr, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

; 2-lane truncating contiguous stores.
define void @masked_trunc_store_sv2i64_to_sv2i8(<vscale x 2 x i64> %val, i8 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8:
; CHECK-NEXT: st1b { z0.d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 2 x i8>*
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc, <vscale x 2 x i8> *%base_addr, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i16(<vscale x 2 x i64> %val, i16 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 2 x i16>*
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc, <vscale x 2 x i16> *%base_addr, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i32(<vscale x 2 x i64> %val, i32 *%base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32:
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 2 x i32>*
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc, <vscale x 2 x i32> *%base_addr, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

; 4-lane contiguous load/stores.

define void @test_masked_ldst_sv4i8(i8 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %data = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %data, <vscale x 4 x i8>* %base_addr, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i16(i16 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i16:
; CHECK-NEXT: ld1sh { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %data = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %data, <vscale x 4 x i16>* %base_addr, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i32(i32 * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, i32* %base, i64 %offset
  %base_addr = bitcast i32* %base_i32 to <vscale x 4 x i32>*
  %data = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32>* %base_addr, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f16(half * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, half* %base, i64 %offset
  %base_addr = bitcast half* %base_f16 to <vscale x 4 x half>*
  %data = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
  call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %data, <vscale x 4 x half>* %base_addr, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f32(float * %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_f32 = getelementptr float, float* %base, i64 %offset
  %base_addr = bitcast float* %base_f32 to <vscale x 4 x float>*
  %data = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
  call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x float>* %base_addr, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

; 4-lane zero/sign extended contiguous loads.

define <vscale x 4 x i32> @masked_zload_sv4i8_to_sv4i32(i8* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i8_to_sv4i32(i8* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(<vscale x 4 x i8>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_zload_sv4i16_to_sv4i32(i16* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i16_to_sv4i32(i16* %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>* %base_addr, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

; 4-lane truncating contiguous stores.

define void @masked_trunc_store_sv4i32_to_sv4i8(<vscale x 4 x i32> %val, i8 *%base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8:
; CHECK-NEXT: st1b { z0.s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 4 x i8>*
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc, <vscale x 4 x i8> *%base_addr, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv4i32_to_sv4i16(<vscale x 4 x i32> %val, i16 *%base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16:
; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 4 x i16>*
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc, <vscale x 4 x i16> *%base_addr, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

; 8-lane contiguous load/stores.
define void @test_masked_ldst_sv8i8(i8 * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i8:
; CHECK-NEXT: ld1sb { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %data = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %data, <vscale x 8 x i8>* %base_addr, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8i16(i16 * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, i16* %base, i64 %offset
  %base_addr = bitcast i16* %base_i16 to <vscale x 8 x i16>*
  %data = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>* %base_addr, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i16> undef)
  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i16>* %base_addr, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8f16(half * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, half* %base, i64 %offset
  %base_addr = bitcast half* %base_f16 to <vscale x 8 x half>*
  %data = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>* %base_addr, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x half> undef)
  call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %data, <vscale x 8 x half>* %base_addr, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8bf16(bfloat * %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr bfloat, bfloat* %base, i64 %offset
  %base_addr = bitcast bfloat* %base_f16 to <vscale x 8 x bfloat>*
  %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>* %base_addr, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> undef)
  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data, <vscale x 8 x bfloat>* %base_addr, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

; 8-lane zero/sign extended contiguous loads.

define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(i8* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

define <vscale x 8 x i16> @masked_sload_sv8i8_to_sv8i16(i8* %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(<vscale x 8 x i8>* %base_addr, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

; 8-lane truncating contiguous stores.

define void @masked_trunc_store_sv8i16_to_sv8i8(<vscale x 8 x i16> %val, i8 *%base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
; CHECK-NEXT: st1b { z0.h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 8 x i8>*
  %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc, <vscale x 8 x i8> *%base_addr, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

; 16-lane contiguous load/stores.
define void @test_masked_ldst_sv16i8(i8 * %base, <vscale x 16 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv16i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, i8* %base, i64 %offset
  %base_addr = bitcast i8* %base_i8 to <vscale x 16 x i8>*
  %data = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>* %base_addr, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
  call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i8>* %base_addr, i32 1, <vscale x 16 x i1> %mask)
  ret void
}

; 2-element contiguous loads.
declare <vscale x 2 x i8>  @llvm.masked.load.nxv2i8 (<vscale x 2 x i8>* , i32, <vscale x 2 x i1>, <vscale x 2 x i8> )
declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(<vscale x 2 x i16>*, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(<vscale x 2 x i32>*, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(<vscale x 2 x i64>*, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(<vscale x 2 x half>*, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
declare <vscale x 2 x float> @llvm.masked.load.nxv2f32(<vscale x 2 x float>*, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(<vscale x 2 x double>*, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

; 4-element contiguous loads.
declare <vscale x 4 x i8>  @llvm.masked.load.nxv4i8 (<vscale x 4 x i8>* , i32, <vscale x 4 x i1>, <vscale x 4 x i8> )
declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(<vscale x 4 x i16>*, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(<vscale x 4 x i32>*, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

; 8-element contiguous loads.
declare <vscale x 8 x i8>  @llvm.masked.load.nxv8i8 (<vscale x 8 x i8>* , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(<vscale x 8 x i16>*, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)

; 16-element contiguous loads.
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(<vscale x 16 x i8>*, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)

; 2-element contiguous stores.
declare void @llvm.masked.store.nxv2i8 (<vscale x 2 x i8> , <vscale x 2 x i8>* , i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>*, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>*, i32, <vscale x 2 x i1>)

; 4-element contiguous stores.
declare void @llvm.masked.store.nxv4i8 (<vscale x 4 x i8> , <vscale x 4 x i8>* , i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>*, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>*, i32, <vscale x 4 x i1>)

; 8-element contiguous stores.
declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , <vscale x 8 x i8>* , i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>*, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>)

; 16-element contiguous stores.
declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>*, i32, <vscale x 16 x i1>)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }