; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=armv8.6a-arm-none-eabi -mattr=+bf16,+neon,+fullfp16 < %s | FileCheck %s
; FIXME: Remove fullfp16 once bfloat arguments and returns lowering stops
; depending on it.
;
; Code generation tests for NEON loads and stores of bfloat vectors on ARM
; (vld1/vld2/vld3/vld4 in whole-vector, lane and dup forms, plus vst1).
; Multi-vector results are bitcast to i32 vectors and returned as arrays so
; that each test's expected output shows which d/q registers the load fills.

; Whole-vector load of <4 x bfloat> through a bitcast pointer; expected to
; select a single vld1.16 into d0.
define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_bf16(bfloat* nocapture readonly %ptr) {
; CHECK-LABEL: test_vld1_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast bfloat* %ptr to <4 x bfloat>*
  %1 = load <4 x bfloat>, <4 x bfloat>* %0, align 2
  ret <4 x bfloat> %1
}

; Whole-vector load of <8 x bfloat>; expected to select one vld1.16 into d0/d1.
define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_bf16(bfloat* nocapture readonly %ptr) {
; CHECK-LABEL: test_vld1q_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0, d1}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast bfloat* %ptr to <8 x bfloat>*
  %1 = load <8 x bfloat>, <8 x bfloat>* %0, align 2
  ret <8 x bfloat> %1
}

; Scalar bfloat load inserted into lane 0 of %src; expected to fold into a
; single-lane vld1.16 of d0[0].
define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_lane_bf16(bfloat* nocapture readonly %ptr, <4 x bfloat> %src) {
; CHECK-LABEL: test_vld1_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0[0]}, [r0:16]
; CHECK-NEXT:    bx lr
entry:
  %0 = load bfloat, bfloat* %ptr, align 2
  %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
  ret <4 x bfloat> %vld1_lane
}

; Scalar bfloat load inserted into lane 7 of an <8 x bfloat>; the expected
; single-lane load targets d1[3].
define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_lane_bf16(bfloat* nocapture readonly %ptr, <8 x bfloat> %src) {
; CHECK-LABEL: test_vld1q_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d1[3]}, [r0:16]
; CHECK-NEXT:    bx lr
entry:
  %0 = load bfloat, bfloat* %ptr, align 2
  %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
  ret <8 x bfloat> %vld1_lane
}

; Scalar bfloat load splatted to all four lanes (insertelement + zero-mask
; shufflevector); expected to select the all-lanes form vld1.16 {d0[]}.
define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_dup_bf16(bfloat* nocapture readonly %ptr) {
; CHECK-LABEL: test_vld1_dup_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0[]}, [r0:16]
; CHECK-NEXT:    bx lr
entry:
  %0 = load bfloat, bfloat* %ptr, align 2
  %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
  %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
  ret <4 x bfloat> %lane
}

; llvm.arm.neon.vld1x2 on <4 x bfloat>; expected to load d0-d1 with one vld1.16.
define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld1_bf16_x2(bfloat* %ptr) {
; CHECK-LABEL: test_vld1_bf16_x2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0, d1}, [r0:64]
; CHECK-NEXT:    bx lr
entry:
  %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0bf16(bfloat* %ptr)
  %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
  %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
  %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
  %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
  %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %0, 0
  %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
  ret [2 x <2 x i32>] %.fca.1.insert
}

; llvm.arm.neon.vld1x2 on <8 x bfloat>; expected to load d0-d3 with one vld1.16.
define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld1q_bf16_x2(bfloat* %ptr) {
; CHECK-LABEL: test_vld1q_bf16_x2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0, d1, d2, d3}, [r0:256]
; CHECK-NEXT:    bx lr
entry:
  %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0bf16(bfloat* %ptr)
  %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
  %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
  %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
  %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
  %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %0, 0
  %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
  ret [2 x <4 x i32>] %.fca.1.insert
}

; llvm.arm.neon.vld1x3 on <4 x bfloat>; expected to load d0-d2 with one vld1.16.
define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld1_bf16_x3(bfloat* %ptr) {
; CHECK-LABEL: test_vld1_bf16_x3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0, d1, d2}, [r0:64]
; CHECK-NEXT:    bx lr
entry:
  %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0bf16(bfloat* %ptr)
  %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
  %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
  %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
  %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
  %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
  %2 = bitcast <4 x bfloat> %vld1xN.fca.2.extract to <2 x i32>
  %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %0, 0
  %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
  %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
  ret [3 x <2 x i32>] %.fca.2.insert
}

; llvm.arm.neon.vld1x3 on <8 x bfloat>; the expected output splits the load
; into two three-register vld1.16, the first with address writeback.
define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld1q_bf16_x3(bfloat* %ptr) {
; CHECK-LABEL: test_vld1q_bf16_x3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0, d1, d2}, [r0:64]!
; CHECK-NEXT:    vld1.16 {d3, d4, d5}, [r0:64]
; CHECK-NEXT:    bx lr
entry:
  %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0bf16(bfloat* %ptr)
  %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
  %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
  %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
  %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
  %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
  %2 = bitcast <8 x bfloat> %vld1xN.fca.2.extract to <4 x i32>
  %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %0, 0
  %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
  %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
  ret [3 x <4 x i32>] %.fca.2.insert
}

; llvm.arm.neon.vld1x4 on <4 x bfloat>; expected to load d0-d3 with one vld1.16.
define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld1_bf16_x4(bfloat* %ptr) {
; CHECK-LABEL: test_vld1_bf16_x4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0, d1, d2, d3}, [r0:256]
; CHECK-NEXT:    bx lr
entry:
  %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0bf16(bfloat* %ptr)
  %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
  %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
  %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
  %vld1xN.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 3
  %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
  %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
  %2 = bitcast <4 x bfloat> %vld1xN.fca.2.extract to <2 x i32>
  %3 = bitcast <4 x bfloat> %vld1xN.fca.3.extract to <2 x i32>
  %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %0, 0
  %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %1, 1
  %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %2, 2
  %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %3, 3
  ret [4 x <2 x i32>] %.fca.3.insert
}

; llvm.arm.neon.vld1x4 on <8 x bfloat>; the expected output splits the load
; into two four-register vld1.16, the first with address writeback.
define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld1q_bf16_x4(bfloat* %ptr) {
; CHECK-LABEL: test_vld1q_bf16_x4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0, d1, d2, d3}, [r0:256]!
; CHECK-NEXT:    vld1.16 {d4, d5, d6, d7}, [r0:256]
; CHECK-NEXT:    bx lr
entry:
  %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0bf16(bfloat* %ptr)
  %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
  %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
  %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
  %vld1xN.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 3
  %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
  %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
  %2 = bitcast <8 x bfloat> %vld1xN.fca.2.extract to <4 x i32>
  %3 = bitcast <8 x bfloat> %vld1xN.fca.3.extract to <4 x i32>
  %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %0, 0
  %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %1, 1
  %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %2, 2
  %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %3, 3
  ret [4 x <4 x i32>] %.fca.3.insert
}

; Scalar bfloat load splatted to all eight lanes; expected to select the
; all-lanes form vld1.16 {d0[], d1[]}.
define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_dup_bf16(bfloat* nocapture readonly %ptr) {
; CHECK-LABEL: test_vld1q_dup_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld1.16 {d0[], d1[]}, [r0:16]
; CHECK-NEXT:    bx lr
entry:
  %0 = load bfloat, bfloat* %ptr, align 2
  %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
  %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
  ret <8 x bfloat> %lane
}

; llvm.arm.neon.vld2 on <4 x bfloat>; expected to select vld2.16 into d0/d1.
define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_bf16(bfloat* %ptr) {
; CHECK-LABEL: test_vld2_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld2.16 {d0, d1}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast bfloat* %ptr to i8*
  %vld2_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0i8(i8* %0, i32 2)
  %vld2_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_v, 0
  %vld2_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_v, 1
  %1 = bitcast <4 x bfloat> %vld2_v.fca.0.extract to <2 x i32>
  %2 = bitcast <4 x bfloat> %vld2_v.fca.1.extract to <2 x i32>
  %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %1, 0
  %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
  ret [2 x <2 x i32>] %.fca.1.insert
}

; llvm.arm.neon.vld2 on <8 x bfloat>; expected to select one vld2.16 into d0-d3.
define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_bf16(bfloat* %ptr) {
; CHECK-LABEL: test_vld2q_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld2.16 {d0, d1, d2, d3}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast bfloat* %ptr to i8*
  %vld2q_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0i8(i8* %0, i32 2)
  %vld2q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_v, 0
  %vld2q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_v, 1
  %1 = bitcast <8 x bfloat> %vld2q_v.fca.0.extract to <4 x i32>
  %2 = bitcast <8 x bfloat> %vld2q_v.fca.1.extract to <4 x i32>
  %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %1, 0
  %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
  ret [2 x <4 x i32>] %.fca.1.insert
}

; llvm.arm.neon.vld2lane on <4 x bfloat>, lane 1, merging into %src.coerce;
; expected to select vld2.16 {d0[1], d1[1]}.
define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_lane_bf16(bfloat* %ptr, [2 x <2 x i32>] %src.coerce) {
; CHECK-LABEL: test_vld2_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0 def $q0
; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0 def $q0
; CHECK-NEXT:    vld2.16 {d0[1], d1[1]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %src.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %src.coerce, 0
  %src.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %src.coerce, 1
  %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
  %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
  %2 = bitcast bfloat* %ptr to i8*
  %vld2_lane_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0i8(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)
  %vld2_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane_v, 0
  %vld2_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane_v, 1
  %3 = bitcast <4 x bfloat> %vld2_lane_v.fca.0.extract to <2 x i32>
  %4 = bitcast <4 x bfloat> %vld2_lane_v.fca.1.extract to <2 x i32>
  %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %3, 0
  %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %4, 1
  ret [2 x <2 x i32>] %.fca.1.insert
}

; llvm.arm.neon.vld2lane on <8 x bfloat>, lane 7; the expected load targets
; d1[3] and d3[3].
define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_lane_bf16(bfloat* %ptr, [2 x <4 x i32>] %src.coerce) {
; CHECK-LABEL: test_vld2q_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    vld2.16 {d1[3], d3[3]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %src.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %src.coerce, 0
  %src.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %src.coerce, 1
  %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
  %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
  %2 = bitcast bfloat* %ptr to i8*
  %vld2q_lane_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0i8(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)
  %vld2q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_lane_v, 0
  %vld2q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_lane_v, 1
  %3 = bitcast <8 x bfloat> %vld2q_lane_v.fca.0.extract to <4 x i32>
  %4 = bitcast <8 x bfloat> %vld2q_lane_v.fca.1.extract to <4 x i32>
  %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %3, 0
  %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %4, 1
  ret [2 x <4 x i32>] %.fca.1.insert
}

; llvm.arm.neon.vld3 on <4 x bfloat>; expected to select vld3.16 into d0-d2.
define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_bf16(bfloat* %ptr) {
; CHECK-LABEL: test_vld3_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld3.16 {d0, d1, d2}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast bfloat* %ptr to i8*
  %vld3_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0i8(i8* %0, i32 2)
  %vld3_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 0
  %vld3_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 1
  %vld3_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 2
  %1 = bitcast <4 x bfloat> %vld3_v.fca.0.extract to <2 x i32>
  %2 = bitcast <4 x bfloat> %vld3_v.fca.1.extract to <2 x i32>
  %3 = bitcast <4 x bfloat> %vld3_v.fca.2.extract to <2 x i32>
  %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %1, 0
  %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
  %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %3, 2
  ret [3 x <2 x i32>] %.fca.2.insert
}

; llvm.arm.neon.vld3 on <8 x bfloat>; the expected output uses two vld3.16,
; even d-registers first (with writeback), then odd d-registers.
define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_bf16(bfloat* %ptr) {
; CHECK-LABEL: test_vld3q_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld3.16 {d0, d2, d4}, [r0]!
; CHECK-NEXT:    vld3.16 {d1, d3, d5}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast bfloat* %ptr to i8*
  %vld3q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0i8(i8* %0, i32 2)
  %vld3q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 0
  %vld3q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 1
  %vld3q_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 2
  %1 = bitcast <8 x bfloat> %vld3q_v.fca.0.extract to <4 x i32>
  %2 = bitcast <8 x bfloat> %vld3q_v.fca.1.extract to <4 x i32>
  %3 = bitcast <8 x bfloat> %vld3q_v.fca.2.extract to <4 x i32>
  %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %1, 0
  %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
  %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %3, 2
  ret [3 x <4 x i32>] %.fca.2.insert
}

; llvm.arm.neon.vld3lane on <4 x bfloat>, lane 1; expected to select
; vld3.16 {d0[1], d1[1], d2[1]}.
define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_lane_bf16(bfloat* %ptr, [3 x <2 x i32>] %src.coerce) {
; CHECK-LABEL: test_vld3_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    vld3.16 {d0[1], d1[1], d2[1]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %src.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %src.coerce, 0
  %src.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %src.coerce, 1
  %src.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %src.coerce, 2
  %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
  %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
  %2 = bitcast <2 x i32> %src.coerce.fca.2.extract to <4 x bfloat>
  %3 = bitcast bfloat* %ptr to i8*
  %vld3_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0i8(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)
  %vld3_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 0
  %vld3_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 1
  %vld3_lane_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_v, 2
  %4 = bitcast <4 x bfloat> %vld3_lane_v.fca.0.extract to <2 x i32>
  %5 = bitcast <4 x bfloat> %vld3_lane_v.fca.1.extract to <2 x i32>
  %6 = bitcast <4 x bfloat> %vld3_lane_v.fca.2.extract to <2 x i32>
  %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %4, 0
  %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %5, 1
  %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %6, 2
  ret [3 x <2 x i32>] %.fca.2.insert
}

; llvm.arm.neon.vld3lane on <8 x bfloat>, lane 7; the expected load targets
; d1[3], d3[3] and d5[3].
define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_lane_bf16(bfloat* %ptr, [3 x <4 x i32>] %src.coerce) {
; CHECK-LABEL: test_vld3q_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT:    vld3.16 {d1[3], d3[3], d5[3]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %src.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %src.coerce, 0
  %src.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %src.coerce, 1
  %src.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %src.coerce, 2
  %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
  %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
  %2 = bitcast <4 x i32> %src.coerce.fca.2.extract to <8 x bfloat>
  %3 = bitcast bfloat* %ptr to i8*
  %vld3q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0i8(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)
  %vld3q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 0
  %vld3q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 1
  %vld3q_lane_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lane_v, 2
  %4 = bitcast <8 x bfloat> %vld3q_lane_v.fca.0.extract to <4 x i32>
  %5 = bitcast <8 x bfloat> %vld3q_lane_v.fca.1.extract to <4 x i32>
  %6 = bitcast <8 x bfloat> %vld3q_lane_v.fca.2.extract to <4 x i32>
  %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %4, 0
  %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %5, 1
  %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %6, 2
  ret [3 x <4 x i32>] %.fca.2.insert
}

; llvm.arm.neon.vld4 on <4 x bfloat>; expected to select vld4.16 into d0-d3.
define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_bf16(bfloat* %ptr) {
; CHECK-LABEL: test_vld4_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld4.16 {d0, d1, d2, d3}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast bfloat* %ptr to i8*
  %vld4_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0i8(i8* %0, i32 2)
  %vld4_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 0
  %vld4_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 1
  %vld4_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 2
  %vld4_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_v, 3
  %1 = bitcast <4 x bfloat> %vld4_v.fca.0.extract to <2 x i32>
  %2 = bitcast <4 x bfloat> %vld4_v.fca.1.extract to <2 x i32>
  %3 = bitcast <4 x bfloat> %vld4_v.fca.2.extract to <2 x i32>
  %4 = bitcast <4 x bfloat> %vld4_v.fca.3.extract to <2 x i32>
  %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %1, 0
  %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
  %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %3, 2
  %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %4, 3
  ret [4 x <2 x i32>] %.fca.3.insert
}

; llvm.arm.neon.vld4 on <8 x bfloat>; the expected output uses two vld4.16,
; even d-registers first (with writeback), then odd d-registers.
define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_bf16(bfloat* %ptr) {
; CHECK-LABEL: test_vld4q_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld4.16 {d0, d2, d4, d6}, [r0]!
; CHECK-NEXT:    vld4.16 {d1, d3, d5, d7}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast bfloat* %ptr to i8*
  %vld4q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0i8(i8* %0, i32 2)
  %vld4q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 0
  %vld4q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 1
  %vld4q_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 2
  %vld4q_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_v, 3
  %1 = bitcast <8 x bfloat> %vld4q_v.fca.0.extract to <4 x i32>
  %2 = bitcast <8 x bfloat> %vld4q_v.fca.1.extract to <4 x i32>
  %3 = bitcast <8 x bfloat> %vld4q_v.fca.2.extract to <4 x i32>
  %4 = bitcast <8 x bfloat> %vld4q_v.fca.3.extract to <4 x i32>
  %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %1, 0
  %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
  %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %3, 2
  %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %4, 3
  ret [4 x <4 x i32>] %.fca.3.insert
}

; llvm.arm.neon.vld4lane on <4 x bfloat>, lane 1; expected to select
; vld4.16 {d0[1], d1[1], d2[1], d3[1]}.
define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_lane_bf16(bfloat* %ptr, [4 x <2 x i32>] %src.coerce) {
; CHECK-LABEL: test_vld4_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1
; CHECK-NEXT:    vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %src.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %src.coerce, 0
  %src.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %src.coerce, 1
  %src.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %src.coerce, 2
  %src.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %src.coerce, 3
  %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
  %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
  %2 = bitcast <2 x i32> %src.coerce.fca.2.extract to <4 x bfloat>
  %3 = bitcast <2 x i32> %src.coerce.fca.3.extract to <4 x bfloat>
  %4 = bitcast bfloat* %ptr to i8*
  %vld4_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0i8(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)
  %vld4_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 0
  %vld4_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 1
  %vld4_lane_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 2
  %vld4_lane_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_lane_v, 3
  %5 = bitcast <4 x bfloat> %vld4_lane_v.fca.0.extract to <2 x i32>
  %6 = bitcast <4 x bfloat> %vld4_lane_v.fca.1.extract to <2 x i32>
  %7 = bitcast <4 x bfloat> %vld4_lane_v.fca.2.extract to <2 x i32>
  %8 = bitcast <4 x bfloat> %vld4_lane_v.fca.3.extract to <2 x i32>
  %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %5, 0
  %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %6, 1
  %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %7, 2
  %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %8, 3
  ret [4 x <2 x i32>] %.fca.3.insert
}

; llvm.arm.neon.vld4lane on <8 x bfloat>, lane 7; the expected load targets
; d1[3], d3[3], d5[3] and d7[3].
define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_lane_bf16(bfloat* %ptr, [4 x <4 x i32>] %src.coerce) {
; CHECK-LABEL: test_vld4q_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
; CHECK-NEXT:    vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %src.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %src.coerce, 0
  %src.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %src.coerce, 1
  %src.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %src.coerce, 2
  %src.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %src.coerce, 3
  %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
  %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
  %2 = bitcast <4 x i32> %src.coerce.fca.2.extract to <8 x bfloat>
  %3 = bitcast <4 x i32> %src.coerce.fca.3.extract to <8 x bfloat>
  %4 = bitcast bfloat* %ptr to i8*
  %vld4q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0i8(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)
  %vld4q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 0
  %vld4q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 1
  %vld4q_lane_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 2
  %vld4q_lane_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_lane_v, 3
  %5 = bitcast <8 x bfloat> %vld4q_lane_v.fca.0.extract to <4 x i32>
  %6 = bitcast <8 x bfloat> %vld4q_lane_v.fca.1.extract to <4 x i32>
  %7 = bitcast <8 x bfloat> %vld4q_lane_v.fca.2.extract to <4 x i32>
  %8 = bitcast <8 x bfloat> %vld4q_lane_v.fca.3.extract to <4 x i32>
  %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %5, 0
  %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %6, 1
  %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %7, 2
  %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %8, 3
  ret [4 x <4 x i32>] %.fca.3.insert
}

; llvm.arm.neon.vld2dup on <4 x bfloat>; expected to select the all-lanes
; form vld2.16 {d0[], d1[]}.
define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_dup_bf16(bfloat* %ptr) {
; CHECK-LABEL: test_vld2_dup_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld2.16 {d0[], d1[]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast bfloat* %ptr to i8*
  %vld2_dup_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0i8(i8* %0, i32 2)
  %vld2_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_dup_v, 0
  %vld2_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_dup_v, 1
  %1 = bitcast <4 x bfloat> %vld2_dup_v.fca.0.extract to <2 x i32>
  %2 = bitcast <4 x bfloat> %vld2_dup_v.fca.1.extract to <2 x i32>
  %.fca.0.insert = insertvalue [2 x <2 x i32>] undef, <2 x i32> %1, 0
  %.fca.1.insert = insertvalue [2 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
  ret [2 x <2 x i32>] %.fca.1.insert
}

; llvm.arm.neon.vld2dup on <8 x bfloat>; the expected output is two all-lanes
; vld2.16 from the same address.
; NOTE(review): unlike the vld3q/vld4q dup tests below, the first expected
; load here writes d16/d18 rather than d0/d2. The assertions are autogenerated,
; so this presumably records what llc emitted at the time - confirm against
; current llc output before regenerating or relying on it.
define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_dup_bf16(bfloat* %ptr) {
; CHECK-LABEL: test_vld2q_dup_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld2.16 {d16[], d18[]}, [r0]
; CHECK-NEXT:    vld2.16 {d1[], d3[]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast bfloat* %ptr to i8*
  %vld2q_dup_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0i8(i8* %0, i32 2)
  %vld2q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_dup_v, 0
  %vld2q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_dup_v, 1
  %1 = bitcast <8 x bfloat> %vld2q_dup_v.fca.0.extract to <4 x i32>
  %2 = bitcast <8 x bfloat> %vld2q_dup_v.fca.1.extract to <4 x i32>
  %.fca.0.insert = insertvalue [2 x <4 x i32>] undef, <4 x i32> %1, 0
  %.fca.1.insert = insertvalue [2 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
  ret [2 x <4 x i32>] %.fca.1.insert
}

; llvm.arm.neon.vld3dup on <4 x bfloat>; expected to select the all-lanes
; form vld3.16 {d0[], d1[], d2[]}.
define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_dup_bf16(bfloat* %ptr) {
; CHECK-LABEL: test_vld3_dup_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld3.16 {d0[], d1[], d2[]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast bfloat* %ptr to i8*
  %vld3_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0i8(i8* %0, i32 2)
  %vld3_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 0
  %vld3_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 1
  %vld3_dup_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v, 2
  %1 = bitcast <4 x bfloat> %vld3_dup_v.fca.0.extract to <2 x i32>
  %2 = bitcast <4 x bfloat> %vld3_dup_v.fca.1.extract to <2 x i32>
  %3 = bitcast <4 x bfloat> %vld3_dup_v.fca.2.extract to <2 x i32>
  %.fca.0.insert = insertvalue [3 x <2 x i32>] undef, <2 x i32> %1, 0
  %.fca.1.insert = insertvalue [3 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
  %.fca.2.insert = insertvalue [3 x <2 x i32>] %.fca.1.insert, <2 x i32> %3, 2
  ret [3 x <2 x i32>] %.fca.2.insert
}

; llvm.arm.neon.vld3dup on <8 x bfloat>; the expected output is two all-lanes
; vld3.16 from the same address, even d-registers then odd d-registers.
define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_dup_bf16(bfloat* %ptr) {
; CHECK-LABEL: test_vld3q_dup_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld3.16 {d0[], d2[], d4[]}, [r0]
; CHECK-NEXT:    vld3.16 {d1[], d3[], d5[]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast bfloat* %ptr to i8*
  %vld3q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0i8(i8* %0, i32 2)
  %vld3q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 0
  %vld3q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 1
  %vld3q_dup_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_v, 2
  %1 = bitcast <8 x bfloat> %vld3q_dup_v.fca.0.extract to <4 x i32>
  %2 = bitcast <8 x bfloat> %vld3q_dup_v.fca.1.extract to <4 x i32>
  %3 = bitcast <8 x bfloat> %vld3q_dup_v.fca.2.extract to <4 x i32>
  %.fca.0.insert = insertvalue [3 x <4 x i32>] undef, <4 x i32> %1, 0
  %.fca.1.insert = insertvalue [3 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
  %.fca.2.insert = insertvalue [3 x <4 x i32>] %.fca.1.insert, <4 x i32> %3, 2
  ret [3 x <4 x i32>] %.fca.2.insert
}

; llvm.arm.neon.vld4dup on <4 x bfloat>; expected to select the all-lanes
; form vld4.16 {d0[], d1[], d2[], d3[]}.
define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_dup_bf16(bfloat* %ptr) {
; CHECK-LABEL: test_vld4_dup_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld4.16 {d0[], d1[], d2[], d3[]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast bfloat* %ptr to i8*
  %vld4_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0i8(i8* %0, i32 2)
  %vld4_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 0
  %vld4_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 1
  %vld4_dup_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 2
  %vld4_dup_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld4_dup_v, 3
  %1 = bitcast <4 x bfloat> %vld4_dup_v.fca.0.extract to <2 x i32>
  %2 = bitcast <4 x bfloat> %vld4_dup_v.fca.1.extract to <2 x i32>
  %3 = bitcast <4 x bfloat> %vld4_dup_v.fca.2.extract to <2 x i32>
  %4 = bitcast <4 x bfloat> %vld4_dup_v.fca.3.extract to <2 x i32>
  %.fca.0.insert = insertvalue [4 x <2 x i32>] undef, <2 x i32> %1, 0
  %.fca.1.insert = insertvalue [4 x <2 x i32>] %.fca.0.insert, <2 x i32> %2, 1
  %.fca.2.insert = insertvalue [4 x <2 x i32>] %.fca.1.insert, <2 x i32> %3, 2
  %.fca.3.insert = insertvalue [4 x <2 x i32>] %.fca.2.insert, <2 x i32> %4, 3
  ret [4 x <2 x i32>] %.fca.3.insert
}

; llvm.arm.neon.vld4dup on <8 x bfloat>; the expected output is two all-lanes
; vld4.16 from the same address, even d-registers then odd d-registers.
define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_dup_bf16(bfloat* %ptr) {
; CHECK-LABEL: test_vld4q_dup_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld4.16 {d0[], d2[], d4[], d6[]}, [r0]
; CHECK-NEXT:    vld4.16 {d1[], d3[], d5[], d7[]}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast bfloat* %ptr to i8*
  %vld4q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0i8(i8* %0, i32 2)
  %vld4q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 0
  %vld4q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 1
  %vld4q_dup_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 2
  %vld4q_dup_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld4q_dup_v, 3
  %1 = bitcast <8 x bfloat> %vld4q_dup_v.fca.0.extract to <4 x i32>
  %2 = bitcast <8 x bfloat> %vld4q_dup_v.fca.1.extract to <4 x i32>
  %3 = bitcast <8 x bfloat> %vld4q_dup_v.fca.2.extract to <4 x i32>
  %4 = bitcast <8 x bfloat> %vld4q_dup_v.fca.3.extract to <4 x i32>
  %.fca.0.insert = insertvalue [4 x <4 x i32>] undef, <4 x i32> %1, 0
  %.fca.1.insert = insertvalue [4 x <4 x i32>] %.fca.0.insert, <4 x i32> %2, 1
  %.fca.2.insert = insertvalue [4 x <4 x i32>] %.fca.1.insert, <4 x i32> %3, 2
  %.fca.3.insert = insertvalue [4 x <4 x i32>] %.fca.2.insert, <4 x i32> %4, 3
  ret [4 x <4 x i32>] %.fca.3.insert
}

; llvm.arm.neon.vst1 of a <4 x bfloat> argument; expected to select one
; vst1.16 of d0.
define arm_aapcs_vfpcc void @test_vst1_bf16(bfloat* %ptr, <4 x bfloat> %val) {
; CHECK-LABEL: test_vst1_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vst1.16 {d0}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.arm.neon.vst1.p0i8.v4bf16(i8* %0, <4 x bfloat> %val, i32 2)
  ret void
}

; llvm.arm.neon.vst1 of an <8 x bfloat> argument; expected to select one
; vst1.16 of d0/d1.
define arm_aapcs_vfpcc void @test_vst1q_bf16(bfloat* %ptr, <8 x bfloat> %val) {
; CHECK-LABEL: test_vst1q_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vst1.16 {d0, d1}, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast bfloat* %ptr to i8*
  tail call void @llvm.arm.neon.vst1.p0i8.v8bf16(i8* %0, <8 x bfloat> %val, i32 2)
  ret void
}

; Store of element 1 of a <4 x bfloat> via extractelement + scalar store; the
; expected output is vmovx.f16 s0, s0 followed by a vstr.16 of s0.
define arm_aapcs_vfpcc void @test_vst1_lane_bf16(bfloat* nocapture %ptr, <4 x bfloat> %val) {
; CHECK-LABEL: test_vst1_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovx.f16 s0, s0
; CHECK-NEXT:    vstr.16 s0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %0 = extractelement <4 x bfloat> %val, i32 1
  store bfloat %0, bfloat* %ptr, align 2
  ret void
}

; Lane-store test for an <8 x bfloat> argument; the expected output is
; vmovx.f16 s0, s3 followed by a vstr.16 of s0.
define arm_aapcs_vfpcc void @test_vst1q_lane_bf16(bfloat* nocapture %ptr, <8 x bfloat> %val) {
; CHECK-LABEL: test_vst1q_lane_bf16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovx.f16 s0, s3
; CHECK-NEXT:    vstr.16 s0, [r0]
; CHECK-NEXT:    bx lr
entry:
%0 = extractelement <8 x bfloat> %val, i32 7 651 store bfloat %0, bfloat* %ptr, align 2 652 ret void 653} 654 655define arm_aapcs_vfpcc void @test_vst1_bf16_x2(bfloat* nocapture %ptr, [2 x <2 x i32>] %val.coerce) { 656; CHECK-LABEL: test_vst1_bf16_x2: 657; CHECK: @ %bb.0: @ %entry 658; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0 659; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0 660; CHECK-NEXT: vst1.16 {d0, d1}, [r0:64] 661; CHECK-NEXT: bx lr 662entry: 663 %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0 664 %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1 665 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 666 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 667 tail call void @llvm.arm.neon.vst1x2.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1) 668 ret void 669} 670 671define arm_aapcs_vfpcc void @test_vst1q_bf16_x2(bfloat* nocapture %ptr, [2 x <4 x i32>] %val.coerce) { 672; CHECK-LABEL: test_vst1q_bf16_x2: 673; CHECK: @ %bb.0: @ %entry 674; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 675; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 676; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256] 677; CHECK-NEXT: bx lr 678entry: 679 %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0 680 %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1 681 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 682 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 683 tail call void @llvm.arm.neon.vst1x2.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1) 684 ret void 685} 686 687define arm_aapcs_vfpcc void @test_vst1_bf16_x3(bfloat* nocapture %ptr, [3 x <2 x i32>] %val.coerce) { 688; CHECK-LABEL: test_vst1_bf16_x3: 689; CHECK: @ %bb.0: @ %entry 690; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1 691; CHECK-NEXT: @ kill: def $d1 killed $d1 
killed $q0_q1 def $q0_q1 692; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 693; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0:64] 694; CHECK-NEXT: bx lr 695entry: 696 %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0 697 %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1 698 %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2 699 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 700 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 701 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat> 702 tail call void @llvm.arm.neon.vst1x3.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2) 703 ret void 704} 705 706define arm_aapcs_vfpcc void @test_vst1q_bf16_x3(bfloat* nocapture %ptr, [3 x <4 x i32>] %val.coerce) { 707; CHECK-LABEL: test_vst1q_bf16_x3: 708; CHECK: @ %bb.0: @ %entry 709; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 710; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 711; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 712; CHECK-NEXT: vst1.16 {d0, d1, d2}, [r0:64]! 
713; CHECK-NEXT: vst1.16 {d3, d4, d5}, [r0:64] 714; CHECK-NEXT: bx lr 715entry: 716 %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0 717 %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1 718 %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2 719 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 720 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 721 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat> 722 tail call void @llvm.arm.neon.vst1x3.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2) 723 ret void 724} 725 726define arm_aapcs_vfpcc void @test_vst1_bf16_x4(bfloat* nocapture %ptr, [4 x <2 x i32>] %val.coerce) { 727; CHECK-LABEL: test_vst1_bf16_x4: 728; CHECK: @ %bb.0: @ %entry 729; CHECK-NEXT: @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1 730; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1 731; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 732; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 733; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256] 734; CHECK-NEXT: bx lr 735entry: 736 %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0 737 %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1 738 %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2 739 %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3 740 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 741 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 742 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat> 743 %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat> 744 tail call void @llvm.arm.neon.vst1x4.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3) 745 ret void 746} 747 748define arm_aapcs_vfpcc void @test_vst1q_bf16_x4(bfloat* nocapture 
%ptr, [4 x <4 x i32>] %val.coerce) { 749; CHECK-LABEL: test_vst1q_bf16_x4: 750; CHECK: @ %bb.0: @ %entry 751; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 752; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 753; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 754; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 755; CHECK-NEXT: vst1.16 {d0, d1, d2, d3}, [r0:256]! 756; CHECK-NEXT: vst1.16 {d4, d5, d6, d7}, [r0:256] 757; CHECK-NEXT: bx lr 758entry: 759 %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0 760 %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1 761 %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2 762 %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3 763 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 764 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 765 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat> 766 %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat> 767 tail call void @llvm.arm.neon.vst1x4.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3) 768 ret void 769} 770 771define arm_aapcs_vfpcc void @test_vst2_bf16(bfloat* %ptr, [2 x <2 x i32>] %val.coerce) { 772; CHECK-LABEL: test_vst2_bf16: 773; CHECK: @ %bb.0: @ %entry 774; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0 775; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0 776; CHECK-NEXT: vst2.16 {d0, d1}, [r0] 777; CHECK-NEXT: bx lr 778entry: 779 %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0 780 %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1 781 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 782 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 783 %2 = bitcast bfloat* %ptr to i8* 784 tail 
call void @llvm.arm.neon.vst2.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 2) 785 ret void 786} 787 788define arm_aapcs_vfpcc void @test_vst2q_bf16(bfloat* %ptr, [2 x <4 x i32>] %val.coerce) { 789; CHECK-LABEL: test_vst2q_bf16: 790; CHECK: @ %bb.0: @ %entry 791; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 792; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 793; CHECK-NEXT: vst2.16 {d0, d1, d2, d3}, [r0] 794; CHECK-NEXT: bx lr 795entry: 796 %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0 797 %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1 798 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 799 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 800 %2 = bitcast bfloat* %ptr to i8* 801 tail call void @llvm.arm.neon.vst2.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 2) 802 ret void 803} 804 805define arm_aapcs_vfpcc void @test_vst2_lane_bf16(bfloat* %ptr, [2 x <2 x i32>] %val.coerce) { 806; CHECK-LABEL: test_vst2_lane_bf16: 807; CHECK: @ %bb.0: @ %entry 808; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0 def $q0 809; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0 def $q0 810; CHECK-NEXT: vst2.16 {d0[1], d1[1]}, [r0] 811; CHECK-NEXT: bx lr 812entry: 813 %val.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %val.coerce, 0 814 %val.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %val.coerce, 1 815 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 816 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 817 %2 = bitcast bfloat* %ptr to i8* 818 tail call void @llvm.arm.neon.vst2lane.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2) 819 ret void 820} 821 822define arm_aapcs_vfpcc void @test_vst2q_lane_bf16(bfloat* %ptr, [2 x <4 x i32>] %val.coerce) { 823; CHECK-LABEL: test_vst2q_lane_bf16: 824; CHECK: @ %bb.0: @ %entry 825; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1 def 
$q0_q1 826; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 827; CHECK-NEXT: vst2.16 {d1[3], d3[3]}, [r0] 828; CHECK-NEXT: bx lr 829entry: 830 %val.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %val.coerce, 0 831 %val.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %val.coerce, 1 832 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 833 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 834 %2 = bitcast bfloat* %ptr to i8* 835 tail call void @llvm.arm.neon.vst2lane.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2) 836 ret void 837} 838 839define arm_aapcs_vfpcc void @test_vst3_bf16(bfloat* %ptr, [3 x <2 x i32>] %val.coerce) { 840; CHECK-LABEL: test_vst3_bf16: 841; CHECK: @ %bb.0: @ %entry 842; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1 843; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 844; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 845; CHECK-NEXT: vst3.16 {d0, d1, d2}, [r0] 846; CHECK-NEXT: bx lr 847entry: 848 %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0 849 %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1 850 %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2 851 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 852 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 853 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat> 854 %3 = bitcast bfloat* %ptr to i8* 855 tail call void @llvm.arm.neon.vst3.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 2) 856 ret void 857} 858 859define arm_aapcs_vfpcc void @test_vst3q_bf16(bfloat* %ptr, [3 x <4 x i32>] %val.coerce) { 860; CHECK-LABEL: test_vst3q_bf16: 861; CHECK: @ %bb.0: @ %entry 862; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 863; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 864; 
CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 865; CHECK-NEXT: vst3.16 {d0, d2, d4}, [r0]! 866; CHECK-NEXT: vst3.16 {d1, d3, d5}, [r0] 867; CHECK-NEXT: bx lr 868entry: 869 %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0 870 %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1 871 %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2 872 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 873 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 874 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat> 875 %3 = bitcast bfloat* %ptr to i8* 876 tail call void @llvm.arm.neon.vst3.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 2) 877 ret void 878} 879 880define arm_aapcs_vfpcc void @test_vst3_lane_bf16(bfloat* %ptr, [3 x <2 x i32>] %val.coerce) { 881; CHECK-LABEL: test_vst3_lane_bf16: 882; CHECK: @ %bb.0: @ %entry 883; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1 884; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 885; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 886; CHECK-NEXT: vst3.16 {d0[1], d1[1], d2[1]}, [r0] 887; CHECK-NEXT: bx lr 888entry: 889 %val.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %val.coerce, 0 890 %val.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %val.coerce, 1 891 %val.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %val.coerce, 2 892 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 893 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 894 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat> 895 %3 = bitcast bfloat* %ptr to i8* 896 tail call void @llvm.arm.neon.vst3lane.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2) 897 ret void 898} 899 900define arm_aapcs_vfpcc void @test_vst3q_lane_bf16(bfloat* %ptr, [3 x <4 x i32>] %val.coerce) { 901; 
CHECK-LABEL: test_vst3q_lane_bf16: 902; CHECK: @ %bb.0: @ %entry 903; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 904; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 905; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 906; CHECK-NEXT: vst3.16 {d1[3], d3[3], d5[3]}, [r0] 907; CHECK-NEXT: bx lr 908entry: 909 %val.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %val.coerce, 0 910 %val.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %val.coerce, 1 911 %val.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %val.coerce, 2 912 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 913 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 914 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat> 915 %3 = bitcast bfloat* %ptr to i8* 916 tail call void @llvm.arm.neon.vst3lane.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2) 917 ret void 918} 919 920define arm_aapcs_vfpcc void @test_vst4_bf16(bfloat* %ptr, [4 x <2 x i32>] %val.coerce) { 921; CHECK-LABEL: test_vst4_bf16: 922; CHECK: @ %bb.0: @ %entry 923; CHECK-NEXT: @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1 924; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1 925; CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 926; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 927; CHECK-NEXT: vst4.16 {d0, d1, d2, d3}, [r0] 928; CHECK-NEXT: bx lr 929entry: 930 %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0 931 %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1 932 %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2 933 %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3 934 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 935 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 936 %2 = bitcast <2 x i32> 
%val.coerce.fca.2.extract to <4 x bfloat> 937 %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat> 938 %4 = bitcast bfloat* %ptr to i8* 939 tail call void @llvm.arm.neon.vst4.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 2) 940 ret void 941} 942 943define arm_aapcs_vfpcc void @test_vst4q_bf16(bfloat* %ptr, [4 x <4 x i32>] %val.coerce) { 944; CHECK-LABEL: test_vst4q_bf16: 945; CHECK: @ %bb.0: @ %entry 946; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 947; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 948; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 949; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 950; CHECK-NEXT: vst4.16 {d0, d2, d4, d6}, [r0]! 951; CHECK-NEXT: vst4.16 {d1, d3, d5, d7}, [r0] 952; CHECK-NEXT: bx lr 953entry: 954 %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0 955 %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1 956 %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2 957 %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3 958 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 959 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat> 960 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat> 961 %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat> 962 %4 = bitcast bfloat* %ptr to i8* 963 tail call void @llvm.arm.neon.vst4.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 2) 964 ret void 965} 966 967define arm_aapcs_vfpcc void @test_vst4_lane_bf16(bfloat* %ptr, [4 x <2 x i32>] %val.coerce) { 968; CHECK-LABEL: test_vst4_lane_bf16: 969; CHECK: @ %bb.0: @ %entry 970; CHECK-NEXT: @ kill: def $d3 killed $d3 killed $q0_q1 def $q0_q1 971; CHECK-NEXT: @ kill: def $d2 killed $d2 killed $q0_q1 def $q0_q1 972; 
CHECK-NEXT: @ kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 973; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 974; CHECK-NEXT: vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0] 975; CHECK-NEXT: bx lr 976entry: 977 %val.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %val.coerce, 0 978 %val.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %val.coerce, 1 979 %val.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %val.coerce, 2 980 %val.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %val.coerce, 3 981 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat> 982 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat> 983 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat> 984 %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat> 985 %4 = bitcast bfloat* %ptr to i8* 986 tail call void @llvm.arm.neon.vst4lane.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2) 987 ret void 988} 989 990define arm_aapcs_vfpcc void @test_vst4q_lane_bf16(bfloat* %ptr, [4 x <4 x i32>] %val.coerce) { 991; CHECK-LABEL: test_vst4q_lane_bf16: 992; CHECK: @ %bb.0: @ %entry 993; CHECK-NEXT: @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 994; CHECK-NEXT: @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 995; CHECK-NEXT: @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 996; CHECK-NEXT: @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3 997; CHECK-NEXT: vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0] 998; CHECK-NEXT: bx lr 999entry: 1000 %val.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %val.coerce, 0 1001 %val.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %val.coerce, 1 1002 %val.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %val.coerce, 2 1003 %val.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %val.coerce, 3 1004 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat> 1005 %1 = bitcast <4 x 
i32> %val.coerce.fca.1.extract to <8 x bfloat> 1006 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat> 1007 %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat> 1008 %4 = bitcast bfloat* %ptr to i8* 1009 tail call void @llvm.arm.neon.vst4lane.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2) 1010 ret void 1011} 1012 1013declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0i8(i8*, i32) 1014declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0i8(i8*, i32) 1015declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0i8(i8*, i32) 1016declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0i8(i8*, i32) 1017declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0i8(i8*, i32) 1018declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0i8(i8*, i32) 1019 1020declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0i8(i8*, i32) 1021declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0i8(i8*, i32) 1022declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0i8(i8*, i32) 1023declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0i8(i8*, i32) 1024declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0i8(i8*, i32) 1025declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0i8(i8*, i32) 1026 1027declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0bf16(bfloat*) 1028declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0bf16(bfloat*) 1029declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0bf16(bfloat*) 1030declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0bf16(bfloat*) 
declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0bf16(bfloat*)
declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0bf16(bfloat*)

; Single-lane structured loads: pointer, the vectors to insert into, then two
; i32 arguments.
declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0i8(i8*, <4 x bfloat>, <4 x bfloat>, i32, i32)
declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0i8(i8*, <8 x bfloat>, <8 x bfloat>, i32, i32)
declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0i8(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0i8(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0i8(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0i8(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)

; Structured stores (vst1..vst4): i8* pointer, the vectors to store, then one
; trailing i32 argument.
declare void @llvm.arm.neon.vst1.p0i8.v4bf16(i8*, <4 x bfloat>, i32)
declare void @llvm.arm.neon.vst1.p0i8.v8bf16(i8*, <8 x bfloat>, i32)
declare void @llvm.arm.neon.vst2.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, i32)
declare void @llvm.arm.neon.vst2.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, i32)
declare void @llvm.arm.neon.vst3.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32)
declare void @llvm.arm.neon.vst3.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32)
declare void @llvm.arm.neon.vst4.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32)
declare void @llvm.arm.neon.vst4.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32)

; Multi-vector vst1 variants; these take a nocapture bfloat* and no trailing
; i32 argument.
declare void @llvm.arm.neon.vst1x2.p0bf16.v4bf16(bfloat* nocapture, <4 x bfloat>, <4 x bfloat>)
declare void @llvm.arm.neon.vst1x2.p0bf16.v8bf16(bfloat* nocapture, <8 x bfloat>, <8 x bfloat>)
declare void @llvm.arm.neon.vst1x3.p0bf16.v4bf16(bfloat* nocapture, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
declare void @llvm.arm.neon.vst1x3.p0bf16.v8bf16(bfloat* nocapture, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>)
declare void @llvm.arm.neon.vst1x4.p0bf16.v4bf16(bfloat* nocapture, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>)
declare void @llvm.arm.neon.vst1x4.p0bf16.v8bf16(bfloat* nocapture, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>)

; Single-lane structured stores: i8* pointer, the vectors, then two i32
; arguments.
declare void @llvm.arm.neon.vst2lane.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, i32, i32)
declare void @llvm.arm.neon.vst2lane.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, i32, i32)
declare void @llvm.arm.neon.vst3lane.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
declare void @llvm.arm.neon.vst3lane.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)
declare void @llvm.arm.neon.vst4lane.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32, i32)
declare void @llvm.arm.neon.vst4lane.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32, i32)