Lines Matching refs:bfloat
3 ; FIXME: Remove fullfp16 once bfloat arguments and returns lowering stops
6 define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_bf16(bfloat* nocapture readonly %ptr) {
12 %0 = bitcast bfloat* %ptr to <4 x bfloat>*
13 %1 = load <4 x bfloat>, <4 x bfloat>* %0, align 2
14 ret <4 x bfloat> %1
17 define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_bf16(bfloat* nocapture readonly %ptr) {
23 %0 = bitcast bfloat* %ptr to <8 x bfloat>*
24 %1 = load <8 x bfloat>, <8 x bfloat>* %0, align 2
25 ret <8 x bfloat> %1
28 define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_lane_bf16(bfloat* nocapture readonly %ptr, <4 x bflo…
34 %0 = load bfloat, bfloat* %ptr, align 2
35 %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
36 ret <4 x bfloat> %vld1_lane
39 define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_lane_bf16(bfloat* nocapture readonly %ptr, <8 x bfl…
45 %0 = load bfloat, bfloat* %ptr, align 2
46 %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
47 ret <8 x bfloat> %vld1_lane
50 define arm_aapcs_vfpcc <4 x bfloat> @test_vld1_dup_bf16(bfloat* nocapture readonly %ptr) {
56 %0 = load bfloat, bfloat* %ptr, align 2
57 %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
58 %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
59 ret <4 x bfloat> %lane
62 define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld1_bf16_x2(bfloat* %ptr) {
68 …%vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0bf16(bfloat* %pt…
69 %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
70 %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
71 %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
72 %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
78 define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld1q_bf16_x2(bfloat* %ptr) {
84 …%vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0bf16(bfloat* %pt…
85 %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
86 %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
87 %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
88 %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
94 define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld1_bf16_x3(bfloat* %ptr) {
100 …%vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0bf…
101 %vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 0
102 %vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 1
103 %vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld1xN, 2
104 %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
105 %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
106 %2 = bitcast <4 x bfloat> %vld1xN.fca.2.extract to <2 x i32>
113 define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld1q_bf16_x3(bfloat* %ptr) {
120 …%vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0bf…
121 %vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 0
122 %vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 1
123 %vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld1xN, 2
124 %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
125 %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
126 %2 = bitcast <8 x bfloat> %vld1xN.fca.2.extract to <4 x i32>
133 define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld1_bf16_x4(bfloat* %ptr) {
139 …%vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1…
140 …%vld1xN.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %v…
141 …%vld1xN.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %v…
142 …%vld1xN.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %v…
143 …%vld1xN.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %v…
144 %0 = bitcast <4 x bfloat> %vld1xN.fca.0.extract to <2 x i32>
145 %1 = bitcast <4 x bfloat> %vld1xN.fca.1.extract to <2 x i32>
146 %2 = bitcast <4 x bfloat> %vld1xN.fca.2.extract to <2 x i32>
147 %3 = bitcast <4 x bfloat> %vld1xN.fca.3.extract to <2 x i32>
155 define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld1q_bf16_x4(bfloat* %ptr) {
162 …%vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1…
163 …%vld1xN.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %v…
164 …%vld1xN.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %v…
165 …%vld1xN.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %v…
166 …%vld1xN.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %v…
167 %0 = bitcast <8 x bfloat> %vld1xN.fca.0.extract to <4 x i32>
168 %1 = bitcast <8 x bfloat> %vld1xN.fca.1.extract to <4 x i32>
169 %2 = bitcast <8 x bfloat> %vld1xN.fca.2.extract to <4 x i32>
170 %3 = bitcast <8 x bfloat> %vld1xN.fca.3.extract to <4 x i32>
178 define arm_aapcs_vfpcc <8 x bfloat> @test_vld1q_dup_bf16(bfloat* nocapture readonly %ptr) {
184 %0 = load bfloat, bfloat* %ptr, align 2
185 %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
186 %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
187 ret <8 x bfloat> %lane
190 define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_bf16(bfloat* %ptr) {
196 %0 = bitcast bfloat* %ptr to i8*
197 %vld2_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0i8(i8* %0, i32 2)
198 %vld2_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_v, 0
199 %vld2_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_v, 1
200 %1 = bitcast <4 x bfloat> %vld2_v.fca.0.extract to <2 x i32>
201 %2 = bitcast <4 x bfloat> %vld2_v.fca.1.extract to <2 x i32>
207 define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_bf16(bfloat* %ptr) {
213 %0 = bitcast bfloat* %ptr to i8*
214 %vld2q_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0i8(i8* %0, i32 2)
215 %vld2q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_v, 0
216 %vld2q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_v, 1
217 %1 = bitcast <8 x bfloat> %vld2q_v.fca.0.extract to <4 x i32>
218 %2 = bitcast <8 x bfloat> %vld2q_v.fca.1.extract to <4 x i32>
224 define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_lane_bf16(bfloat* %ptr, [2 x <2 x i32>] %src.coer…
234 %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
235 %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
236 %2 = bitcast bfloat* %ptr to i8*
237 …2_lane_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0i8(i8* %2, <4…
238 %vld2_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane_v, 0
239 %vld2_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_lane_v, 1
240 %3 = bitcast <4 x bfloat> %vld2_lane_v.fca.0.extract to <2 x i32>
241 %4 = bitcast <4 x bfloat> %vld2_lane_v.fca.1.extract to <2 x i32>
247 define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_lane_bf16(bfloat* %ptr, [2 x <4 x i32>] %src.coe…
257 %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
258 %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
259 %2 = bitcast bfloat* %ptr to i8*
260 …q_lane_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0i8(i8* %2, <8…
261 %vld2q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_lane_v, 0
262 %vld2q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_lane_v, 1
263 %3 = bitcast <8 x bfloat> %vld2q_lane_v.fca.0.extract to <4 x i32>
264 %4 = bitcast <8 x bfloat> %vld2q_lane_v.fca.1.extract to <4 x i32>
270 define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_bf16(bfloat* %ptr) {
276 %0 = bitcast bfloat* %ptr to i8*
277 …%vld3_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0i8(i…
278 %vld3_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 0
279 %vld3_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 1
280 %vld3_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_v, 2
281 %1 = bitcast <4 x bfloat> %vld3_v.fca.0.extract to <2 x i32>
282 %2 = bitcast <4 x bfloat> %vld3_v.fca.1.extract to <2 x i32>
283 %3 = bitcast <4 x bfloat> %vld3_v.fca.2.extract to <2 x i32>
290 define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_bf16(bfloat* %ptr) {
297 %0 = bitcast bfloat* %ptr to i8*
298 …%vld3q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0i8(…
299 %vld3q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 0
300 %vld3q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 1
301 %vld3q_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_v, 2
302 %1 = bitcast <8 x bfloat> %vld3q_v.fca.0.extract to <4 x i32>
303 %2 = bitcast <8 x bfloat> %vld3q_v.fca.1.extract to <4 x i32>
304 %3 = bitcast <8 x bfloat> %vld3q_v.fca.2.extract to <4 x i32>
311 define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_lane_bf16(bfloat* %ptr, [3 x <2 x i32>] %src.coer…
323 %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
324 %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
325 %2 = bitcast <2 x i32> %src.coerce.fca.2.extract to <4 x bfloat>
326 %3 = bitcast bfloat* %ptr to i8*
327 … call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0i8(i8* %3, <4 …
328 …%vld3_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_…
329 …%vld3_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_…
330 …%vld3_lane_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_lane_…
331 %4 = bitcast <4 x bfloat> %vld3_lane_v.fca.0.extract to <2 x i32>
332 %5 = bitcast <4 x bfloat> %vld3_lane_v.fca.1.extract to <2 x i32>
333 %6 = bitcast <4 x bfloat> %vld3_lane_v.fca.2.extract to <2 x i32>
340 define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_lane_bf16(bfloat* %ptr, [3 x <4 x i32>] %src.coe…
352 %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
353 %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
354 %2 = bitcast <4 x i32> %src.coerce.fca.2.extract to <8 x bfloat>
355 %3 = bitcast bfloat* %ptr to i8*
356 … call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0i8(i8* %3, <8 …
357 …%vld3q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lan…
358 …%vld3q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lan…
359 …%vld3q_lane_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_lan…
360 %4 = bitcast <8 x bfloat> %vld3q_lane_v.fca.0.extract to <4 x i32>
361 %5 = bitcast <8 x bfloat> %vld3q_lane_v.fca.1.extract to <4 x i32>
362 %6 = bitcast <8 x bfloat> %vld3q_lane_v.fca.2.extract to <4 x i32>
369 define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_bf16(bfloat* %ptr) {
375 %0 = bitcast bfloat* %ptr to i8*
376 …%vld4_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4…
377 …%vld4_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %v…
378 …%vld4_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %v…
379 …%vld4_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %v…
380 …%vld4_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %v…
381 %1 = bitcast <4 x bfloat> %vld4_v.fca.0.extract to <2 x i32>
382 %2 = bitcast <4 x bfloat> %vld4_v.fca.1.extract to <2 x i32>
383 %3 = bitcast <4 x bfloat> %vld4_v.fca.2.extract to <2 x i32>
384 %4 = bitcast <4 x bfloat> %vld4_v.fca.3.extract to <2 x i32>
392 define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_bf16(bfloat* %ptr) {
399 %0 = bitcast bfloat* %ptr to i8*
400 …%vld4q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld…
401 …%vld4q_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %…
402 …%vld4q_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %…
403 …%vld4q_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %…
404 …%vld4q_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %…
405 %1 = bitcast <8 x bfloat> %vld4q_v.fca.0.extract to <4 x i32>
406 %2 = bitcast <8 x bfloat> %vld4q_v.fca.1.extract to <4 x i32>
407 %3 = bitcast <8 x bfloat> %vld4q_v.fca.2.extract to <4 x i32>
408 %4 = bitcast <8 x bfloat> %vld4q_v.fca.3.extract to <4 x i32>
416 define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_lane_bf16(bfloat* %ptr, [4 x <2 x i32>] %src.coer…
430 %0 = bitcast <2 x i32> %src.coerce.fca.0.extract to <4 x bfloat>
431 %1 = bitcast <2 x i32> %src.coerce.fca.1.extract to <4 x bfloat>
432 %2 = bitcast <2 x i32> %src.coerce.fca.2.extract to <4 x bfloat>
433 %3 = bitcast <2 x i32> %src.coerce.fca.3.extract to <4 x bfloat>
434 %4 = bitcast bfloat* %ptr to i8*
435 …bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0i8(i8* %4, <4…
436 …%vld4_lane_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>…
437 …%vld4_lane_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>…
438 …%vld4_lane_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>…
439 …%vld4_lane_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>…
440 %5 = bitcast <4 x bfloat> %vld4_lane_v.fca.0.extract to <2 x i32>
441 %6 = bitcast <4 x bfloat> %vld4_lane_v.fca.1.extract to <2 x i32>
442 %7 = bitcast <4 x bfloat> %vld4_lane_v.fca.2.extract to <2 x i32>
443 %8 = bitcast <4 x bfloat> %vld4_lane_v.fca.3.extract to <2 x i32>
451 define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_lane_bf16(bfloat* %ptr, [4 x <4 x i32>] %src.coe…
465 %0 = bitcast <4 x i32> %src.coerce.fca.0.extract to <8 x bfloat>
466 %1 = bitcast <4 x i32> %src.coerce.fca.1.extract to <8 x bfloat>
467 %2 = bitcast <4 x i32> %src.coerce.fca.2.extract to <8 x bfloat>
468 %3 = bitcast <4 x i32> %src.coerce.fca.3.extract to <8 x bfloat>
469 %4 = bitcast bfloat* %ptr to i8*
470 …bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0i8(i8* %4, <8…
471 …%vld4q_lane_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat…
472 …%vld4q_lane_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat…
473 …%vld4q_lane_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat…
474 …%vld4q_lane_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat…
475 %5 = bitcast <8 x bfloat> %vld4q_lane_v.fca.0.extract to <4 x i32>
476 %6 = bitcast <8 x bfloat> %vld4q_lane_v.fca.1.extract to <4 x i32>
477 %7 = bitcast <8 x bfloat> %vld4q_lane_v.fca.2.extract to <4 x i32>
478 %8 = bitcast <8 x bfloat> %vld4q_lane_v.fca.3.extract to <4 x i32>
486 define arm_aapcs_vfpcc [2 x <2 x i32>] @test_vld2_dup_bf16(bfloat* %ptr) {
492 %0 = bitcast bfloat* %ptr to i8*
493 …%vld2_dup_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0i8(i8* %0, …
494 %vld2_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_dup_v, 0
495 %vld2_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat> } %vld2_dup_v, 1
496 %1 = bitcast <4 x bfloat> %vld2_dup_v.fca.0.extract to <2 x i32>
497 %2 = bitcast <4 x bfloat> %vld2_dup_v.fca.1.extract to <2 x i32>
503 define arm_aapcs_vfpcc [2 x <4 x i32>] @test_vld2q_dup_bf16(bfloat* %ptr) {
510 %0 = bitcast bfloat* %ptr to i8*
511 …%vld2q_dup_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0i8(i8* %0,…
512 %vld2q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_dup_v, 0
513 %vld2q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat> } %vld2q_dup_v, 1
514 %1 = bitcast <8 x bfloat> %vld2q_dup_v.fca.0.extract to <4 x i32>
515 %2 = bitcast <8 x bfloat> %vld2q_dup_v.fca.1.extract to <4 x i32>
521 define arm_aapcs_vfpcc [3 x <2 x i32>] @test_vld3_dup_bf16(bfloat* %ptr) {
527 %0 = bitcast bfloat* %ptr to i8*
528 …%vld3_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16…
529 …%vld3_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v,…
530 …%vld3_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v,…
531 …%vld3_dup_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } %vld3_dup_v,…
532 %1 = bitcast <4 x bfloat> %vld3_dup_v.fca.0.extract to <2 x i32>
533 %2 = bitcast <4 x bfloat> %vld3_dup_v.fca.1.extract to <2 x i32>
534 %3 = bitcast <4 x bfloat> %vld3_dup_v.fca.2.extract to <2 x i32>
541 define arm_aapcs_vfpcc [3 x <4 x i32>] @test_vld3q_dup_bf16(bfloat* %ptr) {
548 %0 = bitcast bfloat* %ptr to i8*
549 …%vld3q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf1…
550 …%vld3q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_…
551 …%vld3q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_…
552 …%vld3q_dup_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } %vld3q_dup_…
553 %1 = bitcast <8 x bfloat> %vld3q_dup_v.fca.0.extract to <4 x i32>
554 %2 = bitcast <8 x bfloat> %vld3q_dup_v.fca.1.extract to <4 x i32>
555 %3 = bitcast <8 x bfloat> %vld3q_dup_v.fca.2.extract to <4 x i32>
562 define arm_aapcs_vfpcc [4 x <2 x i32>] @test_vld4_dup_bf16(bfloat* %ptr) {
568 %0 = bitcast bfloat* %ptr to i8*
569 …%vld4_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.…
570 …%vld4_dup_v.fca.0.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> …
571 …%vld4_dup_v.fca.1.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> …
572 …%vld4_dup_v.fca.2.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> …
573 …%vld4_dup_v.fca.3.extract = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> …
574 %1 = bitcast <4 x bfloat> %vld4_dup_v.fca.0.extract to <2 x i32>
575 %2 = bitcast <4 x bfloat> %vld4_dup_v.fca.1.extract to <2 x i32>
576 %3 = bitcast <4 x bfloat> %vld4_dup_v.fca.2.extract to <2 x i32>
577 %4 = bitcast <4 x bfloat> %vld4_dup_v.fca.3.extract to <2 x i32>
585 define arm_aapcs_vfpcc [4 x <4 x i32>] @test_vld4q_dup_bf16(bfloat* %ptr) {
592 %0 = bitcast bfloat* %ptr to i8*
593 …%vld4q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon…
594 …%vld4q_dup_v.fca.0.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>…
595 …%vld4q_dup_v.fca.1.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>…
596 …%vld4q_dup_v.fca.2.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>…
597 …%vld4q_dup_v.fca.3.extract = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>…
598 %1 = bitcast <8 x bfloat> %vld4q_dup_v.fca.0.extract to <4 x i32>
599 %2 = bitcast <8 x bfloat> %vld4q_dup_v.fca.1.extract to <4 x i32>
600 %3 = bitcast <8 x bfloat> %vld4q_dup_v.fca.2.extract to <4 x i32>
601 %4 = bitcast <8 x bfloat> %vld4q_dup_v.fca.3.extract to <4 x i32>
609 define arm_aapcs_vfpcc void @test_vst1_bf16(bfloat* %ptr, <4 x bfloat> %val) {
615 %0 = bitcast bfloat* %ptr to i8*
616 tail call void @llvm.arm.neon.vst1.p0i8.v4bf16(i8* %0, <4 x bfloat> %val, i32 2)
620 define arm_aapcs_vfpcc void @test_vst1q_bf16(bfloat* %ptr, <8 x bfloat> %val) {
626 %0 = bitcast bfloat* %ptr to i8*
627 tail call void @llvm.arm.neon.vst1.p0i8.v8bf16(i8* %0, <8 x bfloat> %val, i32 2)
631 define arm_aapcs_vfpcc void @test_vst1_lane_bf16(bfloat* nocapture %ptr, <4 x bfloat> %val) {
638 %0 = extractelement <4 x bfloat> %val, i32 1
639 store bfloat %0, bfloat* %ptr, align 2
643 define arm_aapcs_vfpcc void @test_vst1q_lane_bf16(bfloat* nocapture %ptr, <8 x bfloat> %val) {
650 %0 = extractelement <8 x bfloat> %val, i32 7
651 store bfloat %0, bfloat* %ptr, align 2
655 define arm_aapcs_vfpcc void @test_vst1_bf16_x2(bfloat* nocapture %ptr, [2 x <2 x i32>] %val.coerce)…
665 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
666 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
667 tail call void @llvm.arm.neon.vst1x2.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1)
671 define arm_aapcs_vfpcc void @test_vst1q_bf16_x2(bfloat* nocapture %ptr, [2 x <4 x i32>] %val.coerce…
681 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
682 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
683 tail call void @llvm.arm.neon.vst1x2.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1)
687 define arm_aapcs_vfpcc void @test_vst1_bf16_x3(bfloat* nocapture %ptr, [3 x <2 x i32>] %val.coerce)…
699 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
700 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
701 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
702 … call void @llvm.arm.neon.vst1x3.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 …
706 define arm_aapcs_vfpcc void @test_vst1q_bf16_x3(bfloat* nocapture %ptr, [3 x <4 x i32>] %val.coerce…
719 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
720 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
721 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
722 … call void @llvm.arm.neon.vst1x3.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 …
726 define arm_aapcs_vfpcc void @test_vst1_bf16_x4(bfloat* nocapture %ptr, [4 x <2 x i32>] %val.coerce)…
740 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
741 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
742 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
743 %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat>
744 …d @llvm.arm.neon.vst1x4.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat>…
748 define arm_aapcs_vfpcc void @test_vst1q_bf16_x4(bfloat* nocapture %ptr, [4 x <4 x i32>] %val.coerce…
763 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
764 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
765 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
766 %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat>
767 …d @llvm.arm.neon.vst1x4.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat>…
771 define arm_aapcs_vfpcc void @test_vst2_bf16(bfloat* %ptr, [2 x <2 x i32>] %val.coerce) {
781 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
782 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
783 %2 = bitcast bfloat* %ptr to i8*
784 tail call void @llvm.arm.neon.vst2.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 2)
788 define arm_aapcs_vfpcc void @test_vst2q_bf16(bfloat* %ptr, [2 x <4 x i32>] %val.coerce) {
798 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
799 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
800 %2 = bitcast bfloat* %ptr to i8*
801 tail call void @llvm.arm.neon.vst2.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 2)
805 define arm_aapcs_vfpcc void @test_vst2_lane_bf16(bfloat* %ptr, [2 x <2 x i32>] %val.coerce) {
815 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
816 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
817 %2 = bitcast bfloat* %ptr to i8*
818 …tail call void @llvm.arm.neon.vst2lane.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1…
822 define arm_aapcs_vfpcc void @test_vst2q_lane_bf16(bfloat* %ptr, [2 x <4 x i32>] %val.coerce) {
832 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
833 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
834 %2 = bitcast bfloat* %ptr to i8*
835 …tail call void @llvm.arm.neon.vst2lane.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7…
839 define arm_aapcs_vfpcc void @test_vst3_bf16(bfloat* %ptr, [3 x <2 x i32>] %val.coerce) {
851 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
852 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
853 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
854 %3 = bitcast bfloat* %ptr to i8*
855 … call void @llvm.arm.neon.vst3.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> …
859 define arm_aapcs_vfpcc void @test_vst3q_bf16(bfloat* %ptr, [3 x <4 x i32>] %val.coerce) {
872 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
873 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
874 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
875 %3 = bitcast bfloat* %ptr to i8*
876 … call void @llvm.arm.neon.vst3.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> …
880 define arm_aapcs_vfpcc void @test_vst3_lane_bf16(bfloat* %ptr, [3 x <2 x i32>] %val.coerce) {
892 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
893 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
894 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
895 %3 = bitcast bfloat* %ptr to i8*
896 …l void @llvm.arm.neon.vst3lane.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> …
900 define arm_aapcs_vfpcc void @test_vst3q_lane_bf16(bfloat* %ptr, [3 x <4 x i32>] %val.coerce) {
912 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
913 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
914 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
915 %3 = bitcast bfloat* %ptr to i8*
916 …l void @llvm.arm.neon.vst3lane.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> …
920 define arm_aapcs_vfpcc void @test_vst4_bf16(bfloat* %ptr, [4 x <2 x i32>] %val.coerce) {
934 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
935 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
936 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
937 %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat>
938 %4 = bitcast bfloat* %ptr to i8*
939 …id @llvm.arm.neon.vst4.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x…
943 define arm_aapcs_vfpcc void @test_vst4q_bf16(bfloat* %ptr, [4 x <4 x i32>] %val.coerce) {
958 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
959 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
960 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
961 %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat>
962 %4 = bitcast bfloat* %ptr to i8*
963 …id @llvm.arm.neon.vst4.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x…
967 define arm_aapcs_vfpcc void @test_vst4_lane_bf16(bfloat* %ptr, [4 x <2 x i32>] %val.coerce) {
981 %0 = bitcast <2 x i32> %val.coerce.fca.0.extract to <4 x bfloat>
982 %1 = bitcast <2 x i32> %val.coerce.fca.1.extract to <4 x bfloat>
983 %2 = bitcast <2 x i32> %val.coerce.fca.2.extract to <4 x bfloat>
984 %3 = bitcast <2 x i32> %val.coerce.fca.3.extract to <4 x bfloat>
985 %4 = bitcast bfloat* %ptr to i8*
986 …llvm.arm.neon.vst4lane.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x…
990 define arm_aapcs_vfpcc void @test_vst4q_lane_bf16(bfloat* %ptr, [4 x <4 x i32>] %val.coerce) {
1004 %0 = bitcast <4 x i32> %val.coerce.fca.0.extract to <8 x bfloat>
1005 %1 = bitcast <4 x i32> %val.coerce.fca.1.extract to <8 x bfloat>
1006 %2 = bitcast <4 x i32> %val.coerce.fca.2.extract to <8 x bfloat>
1007 %3 = bitcast <4 x i32> %val.coerce.fca.3.extract to <8 x bfloat>
1008 %4 = bitcast bfloat* %ptr to i8*
1009 …llvm.arm.neon.vst4lane.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x…
1013 declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0i8(i8*, i32)
1014 declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0i8(i8*, i32)
1015 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0i8(i8*, i32)
1016 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0i8(i8*, i32)
1017 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0i8(…
1018 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0i8(…
1020 declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0i8(i8*, i32)
1021 declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0i8(i8*, i32)
1022 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0i8(i8*, i32)
1023 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0i8(i8*, i32)
1024 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0…
1025 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0…
1027 declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0bf16(bfloat*)
1028 declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0bf16(bfloat*)
1029 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0bf16(bfloat*)
1030 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0bf16(bfloat*)
1031 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0b…
1032 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0b…
1034 declare { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0i8(i8*, <4 x bfloat>, <4 x …
1035 declare { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0i8(i8*, <8 x bfloat>, <8 x …
1036 declare { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0i8(i8*, <4 x …
1037 declare { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0i8(i8*, <8 x …
1038 … x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0i8(i8*, <4…
1039 … x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0i8(i8*, <8…
1041 declare void @llvm.arm.neon.vst1.p0i8.v4bf16(i8*, <4 x bfloat>, i32)
1042 declare void @llvm.arm.neon.vst1.p0i8.v8bf16(i8*, <8 x bfloat>, i32)
1043 declare void @llvm.arm.neon.vst2.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, i32)
1044 declare void @llvm.arm.neon.vst2.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, i32)
1045 declare void @llvm.arm.neon.vst3.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32)
1046 declare void @llvm.arm.neon.vst3.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32)
1047 …lare void @llvm.arm.neon.vst4.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bflo…
1048 …lare void @llvm.arm.neon.vst4.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bflo…
1050 declare void @llvm.arm.neon.vst1x2.p0bf16.v4bf16(bfloat* nocapture, <4 x bfloat>, <4 x bfloat>)
1051 declare void @llvm.arm.neon.vst1x2.p0bf16.v8bf16(bfloat* nocapture, <8 x bfloat>, <8 x bfloat>)
1052 …clare void @llvm.arm.neon.vst1x3.p0bf16.v4bf16(bfloat* nocapture, <4 x bfloat>, <4 x bfloat>, <4 x…
1053 …clare void @llvm.arm.neon.vst1x3.p0bf16.v8bf16(bfloat* nocapture, <8 x bfloat>, <8 x bfloat>, <8 x…
1054 …oid @llvm.arm.neon.vst1x4.p0bf16.v4bf16(bfloat* nocapture, <4 x bfloat>, <4 x bfloat>, <4 x bfloat…
1055 …oid @llvm.arm.neon.vst1x4.p0bf16.v8bf16(bfloat* nocapture, <8 x bfloat>, <8 x bfloat>, <8 x bfloat…
1057 declare void @llvm.arm.neon.vst2lane.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, i32, i32)
1058 declare void @llvm.arm.neon.vst2lane.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, i32, i32)
1059 declare void @llvm.arm.neon.vst3lane.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, i32…
1060 declare void @llvm.arm.neon.vst3lane.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i32…
1061 … void @llvm.arm.neon.vst4lane.p0i8.v4bf16(i8*, <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bflo…
1062 … void @llvm.arm.neon.vst4lane.p0i8.v8bf16(i8*, <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bflo…