• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
3
4; F32
5
; maxf32: fast maxnum of fabs(%a) and fabs(%b) on <4 x float>.
; The expected output is a single `vmaxnma.f32 q0, q1` followed by the return,
; i.e. the two fabs calls and the maxnum fold into one instruction.
6define arm_aapcs_vfpcc <4 x float> @maxf32(<4 x float> %a, <4 x float> %b) {
7; CHECK-LABEL: maxf32:
8; CHECK:       @ %bb.0:
9; CHECK-NEXT:    vmaxnma.f32 q0, q1
10; CHECK-NEXT:    bx lr
11  %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
12  %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
13  %c = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %aa, <4 x float> %bb)
14  ret <4 x float> %c
15}
16
; maxf32_c: commuted variant of @maxf32 -- the maxnum operands are swapped
; (fabs(%b) first, fabs(%a) second). The expected output is the same single
; `vmaxnma.f32 q0, q1`, with no extra register move.
17define arm_aapcs_vfpcc <4 x float> @maxf32_c(<4 x float> %a, <4 x float> %b) {
18; CHECK-LABEL: maxf32_c:
19; CHECK:       @ %bb.0:
20; CHECK-NEXT:    vmaxnma.f32 q0, q1
21; CHECK-NEXT:    bx lr
22  %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
23  %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
24  %c = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %bb, <4 x float> %aa)
25  ret <4 x float> %c
26}
27
; minf32: fast minnum of fabs(%a) and fabs(%b) on <4 x float>.
; The expected output is a single `vminnma.f32 q0, q1` followed by the return.
28define arm_aapcs_vfpcc <4 x float> @minf32(<4 x float> %a, <4 x float> %b) {
29; CHECK-LABEL: minf32:
30; CHECK:       @ %bb.0:
31; CHECK-NEXT:    vminnma.f32 q0, q1
32; CHECK-NEXT:    bx lr
33  %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
34  %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
35  %c = tail call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %aa, <4 x float> %bb)
36  ret <4 x float> %c
37}
38
; minf32_c: commuted variant of @minf32 -- the minnum operands are swapped
; (fabs(%b) first, fabs(%a) second). The expected output is the same single
; `vminnma.f32 q0, q1`, with no extra register move.
39define arm_aapcs_vfpcc <4 x float> @minf32_c(<4 x float> %a, <4 x float> %b) {
40; CHECK-LABEL: minf32_c:
41; CHECK:       @ %bb.0:
42; CHECK-NEXT:    vminnma.f32 q0, q1
43; CHECK-NEXT:    bx lr
44  %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
45  %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
46  %c = tail call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %bb, <4 x float> %aa)
47  ret <4 x float> %c
48}
49
50
; maxpredf32: predicated form. The lane mask is `fcmp olt %a, %b` and the
; operation is the llvm.arm.mve.vmaxnma.predicated intrinsic called with
; (%a, %b, mask). The expected output is `vpt.f32 gt, q1, q0` followed by
; `vmaxnmat.f32 q0, q1` and the return.
51define arm_aapcs_vfpcc <4 x float> @maxpredf32(<4 x float> %a, <4 x float> %b) {
52; CHECK-LABEL: maxpredf32:
53; CHECK:       @ %bb.0:
54; CHECK-NEXT:    vpt.f32 gt, q1, q0
55; CHECK-NEXT:    vmaxnmat.f32 q0, q1
56; CHECK-NEXT:    bx lr
57  %c = fcmp olt <4 x float> %a, %b
58  %s = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %c)
59  ret <4 x float> %s
60}
61
; maxpredf32_c: variant of @maxpredf32 with the intrinsic's data operands
; swapped, i.e. called with (%b, %a, mask). The expected output keeps that
; operand order (`vmaxnmat.f32 q1, q0`) and then copies q1 into q0 with a vmov
; before returning.
; NOTE(review): presumably the predicated form is not commuted because the
; inactive lanes take their value from the first operand -- confirm against the
; MVE intrinsic definition.
62define arm_aapcs_vfpcc <4 x float> @maxpredf32_c(<4 x float> %a, <4 x float> %b) {
63; CHECK-LABEL: maxpredf32_c:
64; CHECK:       @ %bb.0:
65; CHECK-NEXT:    vpt.f32 gt, q1, q0
66; CHECK-NEXT:    vmaxnmat.f32 q1, q0
67; CHECK-NEXT:    vmov q0, q1
68; CHECK-NEXT:    bx lr
69  %c = fcmp olt <4 x float> %a, %b
70  %s = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %a, <4 x i1> %c)
71  ret <4 x float> %s
72}
73
; minpredf32: predicated form. The lane mask is `fcmp olt %a, %b` and the
; operation is the llvm.arm.mve.vminnma.predicated intrinsic called with
; (%a, %b, mask). The expected output is `vpt.f32 gt, q1, q0` followed by
; `vminnmat.f32 q0, q1` and the return.
74define arm_aapcs_vfpcc <4 x float> @minpredf32(<4 x float> %a, <4 x float> %b) {
75; CHECK-LABEL: minpredf32:
76; CHECK:       @ %bb.0:
77; CHECK-NEXT:    vpt.f32 gt, q1, q0
78; CHECK-NEXT:    vminnmat.f32 q0, q1
79; CHECK-NEXT:    bx lr
80  %c = fcmp olt <4 x float> %a, %b
81  %s = tail call fast <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %c)
82  ret <4 x float> %s
83}
84
; minpredf32_c: variant of @minpredf32 with the intrinsic's data operands
; swapped, i.e. called with (%b, %a, mask). The expected output keeps that
; operand order (`vminnmat.f32 q1, q0`) and then copies q1 into q0 with a vmov
; before returning.
; NOTE(review): presumably the predicated form is not commuted because the
; inactive lanes take their value from the first operand -- confirm against the
; MVE intrinsic definition.
85define arm_aapcs_vfpcc <4 x float> @minpredf32_c(<4 x float> %a, <4 x float> %b) {
86; CHECK-LABEL: minpredf32_c:
87; CHECK:       @ %bb.0:
88; CHECK-NEXT:    vpt.f32 gt, q1, q0
89; CHECK-NEXT:    vminnmat.f32 q1, q0
90; CHECK-NEXT:    vmov q0, q1
91; CHECK-NEXT:    bx lr
92  %c = fcmp olt <4 x float> %a, %b
93  %s = tail call fast <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %a, <4 x i1> %c)
94  ret <4 x float> %s
95}
96
97
98
99; F16
100
; maxf16: fast maxnum of fabs(%a) and fabs(%b) on <8 x half>.
; The expected output is a single `vmaxnma.f16 q0, q1` followed by the return.
101define arm_aapcs_vfpcc <8 x half> @maxf16(<8 x half> %a, <8 x half> %b) {
102; CHECK-LABEL: maxf16:
103; CHECK:       @ %bb.0:
104; CHECK-NEXT:    vmaxnma.f16 q0, q1
105; CHECK-NEXT:    bx lr
106  %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
107  %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
108  %c = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %aa, <8 x half> %bb)
109  ret <8 x half> %c
110}
111
; maxf16_c: commuted variant of @maxf16 -- the maxnum operands are swapped
; (fabs(%b) first, fabs(%a) second). The expected output is the same single
; `vmaxnma.f16 q0, q1`, with no extra register move.
112define arm_aapcs_vfpcc <8 x half> @maxf16_c(<8 x half> %a, <8 x half> %b) {
113; CHECK-LABEL: maxf16_c:
114; CHECK:       @ %bb.0:
115; CHECK-NEXT:    vmaxnma.f16 q0, q1
116; CHECK-NEXT:    bx lr
117  %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
118  %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
119  %c = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %bb, <8 x half> %aa)
120  ret <8 x half> %c
121}
122
; minf16: fast minnum of fabs(%a) and fabs(%b) on <8 x half>.
; The expected output is a single `vminnma.f16 q0, q1` followed by the return.
123define arm_aapcs_vfpcc <8 x half> @minf16(<8 x half> %a, <8 x half> %b) {
124; CHECK-LABEL: minf16:
125; CHECK:       @ %bb.0:
126; CHECK-NEXT:    vminnma.f16 q0, q1
127; CHECK-NEXT:    bx lr
128  %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
129  %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
130  %c = tail call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %aa, <8 x half> %bb)
131  ret <8 x half> %c
132}
133
; minf16_c: commuted variant of @minf16 -- the minnum operands are swapped
; (fabs(%b) first, fabs(%a) second). The expected output is the same single
; `vminnma.f16 q0, q1`, with no extra register move.
134define arm_aapcs_vfpcc <8 x half> @minf16_c(<8 x half> %a, <8 x half> %b) {
135; CHECK-LABEL: minf16_c:
136; CHECK:       @ %bb.0:
137; CHECK-NEXT:    vminnma.f16 q0, q1
138; CHECK-NEXT:    bx lr
139  %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
140  %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
141  %c = tail call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %bb, <8 x half> %aa)
142  ret <8 x half> %c
143}
144
; maxpredf16: predicated form on <8 x half>. The lane mask is
; `fcmp olt %a, %b` and the operation is the llvm.arm.mve.vmaxnma.predicated
; intrinsic called with (%a, %b, mask). The expected output is
; `vpt.f16 gt, q1, q0` followed by `vmaxnmat.f16 q0, q1` and the return.
145define arm_aapcs_vfpcc <8 x half> @maxpredf16(<8 x half> %a, <8 x half> %b) {
146; CHECK-LABEL: maxpredf16:
147; CHECK:       @ %bb.0:
148; CHECK-NEXT:    vpt.f16 gt, q1, q0
149; CHECK-NEXT:    vmaxnmat.f16 q0, q1
150; CHECK-NEXT:    bx lr
151  %c = fcmp olt <8 x half> %a, %b
152  %s = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %c)
153  ret <8 x half> %s
154}
155
; maxpredf16_c: variant of @maxpredf16 with the intrinsic's data operands
; swapped, i.e. called with (%b, %a, mask). The expected output keeps that
; operand order (`vmaxnmat.f16 q1, q0`) and then copies q1 into q0 with a vmov
; before returning.
; NOTE(review): presumably the predicated form is not commuted because the
; inactive lanes take their value from the first operand -- confirm against the
; MVE intrinsic definition.
156define arm_aapcs_vfpcc <8 x half> @maxpredf16_c(<8 x half> %a, <8 x half> %b) {
157; CHECK-LABEL: maxpredf16_c:
158; CHECK:       @ %bb.0:
159; CHECK-NEXT:    vpt.f16 gt, q1, q0
160; CHECK-NEXT:    vmaxnmat.f16 q1, q0
161; CHECK-NEXT:    vmov q0, q1
162; CHECK-NEXT:    bx lr
163  %c = fcmp olt <8 x half> %a, %b
164  %s = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %a, <8 x i1> %c)
165  ret <8 x half> %s
166}
167
; minpredf16: predicated form on <8 x half>. The lane mask is
; `fcmp olt %a, %b` and the operation is the llvm.arm.mve.vminnma.predicated
; intrinsic called with (%a, %b, mask). The expected output is
; `vpt.f16 gt, q1, q0` followed by `vminnmat.f16 q0, q1` and the return.
168define arm_aapcs_vfpcc <8 x half> @minpredf16(<8 x half> %a, <8 x half> %b) {
169; CHECK-LABEL: minpredf16:
170; CHECK:       @ %bb.0:
171; CHECK-NEXT:    vpt.f16 gt, q1, q0
172; CHECK-NEXT:    vminnmat.f16 q0, q1
173; CHECK-NEXT:    bx lr
174  %c = fcmp olt <8 x half> %a, %b
175  %s = tail call fast <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %c)
176  ret <8 x half> %s
177}
178
; minpredf16_c: variant of @minpredf16 with the intrinsic's data operands
; swapped, i.e. called with (%b, %a, mask). The expected output keeps that
; operand order (`vminnmat.f16 q1, q0`) and then copies q1 into q0 with a vmov
; before returning.
; NOTE(review): presumably the predicated form is not commuted because the
; inactive lanes take their value from the first operand -- confirm against the
; MVE intrinsic definition.
179define arm_aapcs_vfpcc <8 x half> @minpredf16_c(<8 x half> %a, <8 x half> %b) {
180; CHECK-LABEL: minpredf16_c:
181; CHECK:       @ %bb.0:
182; CHECK-NEXT:    vpt.f16 gt, q1, q0
183; CHECK-NEXT:    vminnmat.f16 q1, q0
184; CHECK-NEXT:    vmov q0, q1
185; CHECK-NEXT:    bx lr
186  %c = fcmp olt <8 x half> %a, %b
187  %s = tail call fast <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %a, <8 x i1> %c)
188  ret <8 x half> %s
189}
190
191
192; Loops
193
; loop_absmax32: running absolute maximum over <4 x float> loads from %0,
; reduced to one float that is stored through %2.
;  - The trip count is %1 >> 3; the loop is skipped when that is zero.
;  - Each iteration loads <4 x float>, advances the pointer by 4 floats and
;    computes maxnum(fabs(accumulator), fabs(loaded)). The accumulator starts
;    as zeroinitializer.
;  - After the loop, llvm.arm.mve.maxnmav.f32 is called with 0.0 and the final
;    accumulator (zeroinitializer when the loop did not run).
; The expected output is a wls/le loop whose body is
; `vldrw.u32 q1, [r0], #16` followed by `vmaxnma.f32 q0, q1`.
194define void @loop_absmax32(float* nocapture readonly %0, i32 %1, float* nocapture %2) {
195; CHECK-LABEL: loop_absmax32:
196; CHECK:       @ %bb.0:
197; CHECK-NEXT:    .save {r7, lr}
198; CHECK-NEXT:    push {r7, lr}
199; CHECK-NEXT:    vmov.i32 q0, #0x0
200; CHECK-NEXT:    lsr.w lr, r1, #3
201; CHECK-NEXT:    wls lr, lr, .LBB16_3
202; CHECK-NEXT:  @ %bb.1: @ %.preheader
203; CHECK-NEXT:    vmov.i32 q0, #0x0
204; CHECK-NEXT:  .LBB16_2: @ =>This Inner Loop Header: Depth=1
205; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
206; CHECK-NEXT:    vmaxnma.f32 q0, q1
207; CHECK-NEXT:    le lr, .LBB16_2
208; CHECK-NEXT:  .LBB16_3:
209; CHECK-NEXT:    vldr s4, .LCPI16_0
210; CHECK-NEXT:    vmov r0, s4
211; CHECK-NEXT:    vmaxnmav.f32 r0, q0
212; CHECK-NEXT:    vmov s0, r0
213; CHECK-NEXT:    vstr s0, [r2]
214; CHECK-NEXT:    pop {r7, pc}
215; CHECK-NEXT:    .p2align 2
216; CHECK-NEXT:  @ %bb.4:
217; CHECK-NEXT:  .LCPI16_0:
218; CHECK-NEXT:    .long 0x00000000 @ float 0
; NOTE(review): the trip count is %1 >> 3 although each iteration consumes
; 4 floats -- confirm whether %1 is meant to be an element count.
219  %4 = lshr i32 %1, 3
220  %5 = icmp eq i32 %4, 0
221  br i1 %5, label %18, label %6
222
2236:                                                ; preds = %3, %6
224  %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
225  %8 = phi <4 x float> [ %15, %6 ], [ zeroinitializer, %3 ]
226  %9 = phi float* [ %12, %6 ], [ %0, %3 ]
227  %10 = bitcast float* %9 to <4 x float>*
228  %11 = load <4 x float>, <4 x float>* %10, align 4
229  %12 = getelementptr inbounds float, float* %9, i32 4
230  %13 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %11)
231  %14 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %8)
; Accumulator first, loaded value second.
232  %15 = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %14, <4 x float> %13)
233  %16 = add nsw i32 %7, -1
234  %17 = icmp eq i32 %16, 0
235  br i1 %17, label %18, label %6
236
23718:                                               ; preds = %6, %3
238  %19 = phi <4 x float> [ zeroinitializer, %3 ], [ %15, %6 ]
239  %20 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %19)
240  store float %20, float* %2, align 4
241  ret void
242}
243
; loop_absmax32_c: commuted variant of @loop_absmax32. The IR is the same
; except that the maxnum operands are swapped (fabs(loaded) first,
; fabs(accumulator) second). The expected loop body is unchanged:
; `vldrw.u32 q1, [r0], #16` followed by `vmaxnma.f32 q0, q1`, with no extra
; register move.
244define void @loop_absmax32_c(float* nocapture readonly %0, i32 %1, float* nocapture %2) {
245; CHECK-LABEL: loop_absmax32_c:
246; CHECK:       @ %bb.0:
247; CHECK-NEXT:    .save {r7, lr}
248; CHECK-NEXT:    push {r7, lr}
249; CHECK-NEXT:    vmov.i32 q0, #0x0
250; CHECK-NEXT:    lsr.w lr, r1, #3
251; CHECK-NEXT:    wls lr, lr, .LBB17_3
252; CHECK-NEXT:  @ %bb.1: @ %.preheader
253; CHECK-NEXT:    vmov.i32 q0, #0x0
254; CHECK-NEXT:  .LBB17_2: @ =>This Inner Loop Header: Depth=1
255; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
256; CHECK-NEXT:    vmaxnma.f32 q0, q1
257; CHECK-NEXT:    le lr, .LBB17_2
258; CHECK-NEXT:  .LBB17_3:
259; CHECK-NEXT:    vldr s4, .LCPI17_0
260; CHECK-NEXT:    vmov r0, s4
261; CHECK-NEXT:    vmaxnmav.f32 r0, q0
262; CHECK-NEXT:    vmov s0, r0
263; CHECK-NEXT:    vstr s0, [r2]
264; CHECK-NEXT:    pop {r7, pc}
265; CHECK-NEXT:    .p2align 2
266; CHECK-NEXT:  @ %bb.4:
267; CHECK-NEXT:  .LCPI17_0:
268; CHECK-NEXT:    .long 0x00000000 @ float 0
; NOTE(review): the trip count is %1 >> 3 although each iteration consumes
; 4 floats -- confirm whether %1 is meant to be an element count.
269  %4 = lshr i32 %1, 3
270  %5 = icmp eq i32 %4, 0
271  br i1 %5, label %18, label %6
272
2736:                                                ; preds = %3, %6
274  %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
275  %8 = phi <4 x float> [ %15, %6 ], [ zeroinitializer, %3 ]
276  %9 = phi float* [ %12, %6 ], [ %0, %3 ]
277  %10 = bitcast float* %9 to <4 x float>*
278  %11 = load <4 x float>, <4 x float>* %10, align 4
279  %12 = getelementptr inbounds float, float* %9, i32 4
280  %13 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %11)
281  %14 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %8)
; Loaded value first, accumulator second (the commuted order under test).
282  %15 = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %13, <4 x float> %14)
283  %16 = add nsw i32 %7, -1
284  %17 = icmp eq i32 %16, 0
285  br i1 %17, label %18, label %6
286
28718:                                               ; preds = %6, %3
288  %19 = phi <4 x float> [ zeroinitializer, %3 ], [ %15, %6 ]
289  %20 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %19)
290  store float %20, float* %2, align 4
291  ret void
292}
293
; loop_absmax32_pred: predicated running absolute maximum on <4 x float>.
;  - Each iteration builds a lane mask with llvm.arm.mve.vctp32 from the
;    remaining count %6, does a masked load (zero passthrough) and calls the
;    predicated vmaxnma intrinsic with (accumulator, loaded, mask).
;  - The count drops by 4 per iteration and the loop repeats while %6 > 4
;    (signed). The loop body always runs at least once.
;  - The final accumulator is reduced with llvm.arm.mve.maxnmav.f32 (initial
;    value 0.0) and stored through %2.
; The expected output is a dlstp.32/letp loop whose body is
; `vldrw.u32 q1, [r0], #16` followed by an unpredicated `vmaxnma.f32 q0, q1`.
294define void @loop_absmax32_pred(float* %0, i32 %1, float* nocapture %2) {
295; CHECK-LABEL: loop_absmax32_pred:
296; CHECK:       @ %bb.0:
297; CHECK-NEXT:    .save {r7, lr}
298; CHECK-NEXT:    push {r7, lr}
299; CHECK-NEXT:    vmov.i32 q0, #0x0
300; CHECK-NEXT:    dlstp.32 lr, r1
301; CHECK-NEXT:  .LBB18_1: @ =>This Inner Loop Header: Depth=1
302; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
303; CHECK-NEXT:    vmaxnma.f32 q0, q1
304; CHECK-NEXT:    letp lr, .LBB18_1
305; CHECK-NEXT:  @ %bb.2:
306; CHECK-NEXT:    vldr s4, .LCPI18_0
307; CHECK-NEXT:    vmov r0, s4
308; CHECK-NEXT:    vmaxnmav.f32 r0, q0
309; CHECK-NEXT:    vmov s0, r0
310; CHECK-NEXT:    vstr s0, [r2]
311; CHECK-NEXT:    pop {r7, pc}
312; CHECK-NEXT:    .p2align 2
313; CHECK-NEXT:  @ %bb.3:
314; CHECK-NEXT:  .LCPI18_0:
315; CHECK-NEXT:    .long 0x00000000 @ float 0
316  br label %4
317
3184:                                                ; preds = %4, %3
319  %5 = phi <4 x float> [ zeroinitializer, %3 ], [ %12, %4 ]
320  %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
321  %7 = phi float* [ %0, %3 ], [ %11, %4 ]
322  %8 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
323  %9 = bitcast float* %7 to <4 x float>*
324  %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %9, i32 4, <4 x i1> %8, <4 x float> zeroinitializer)
325  %11 = getelementptr inbounds float, float* %7, i32 4
; Accumulator first, loaded value second.
326  %12 = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %5, <4 x float> %10, <4 x i1> %8)
327  %13 = add nsw i32 %6, -4
328  %14 = icmp sgt i32 %6, 4
329  br i1 %14, label %4, label %15
330
33115:                                               ; preds = %4
332  %16 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %12)
333  store float %16, float* %2, align 4
334  ret void
335}
336
; loop_absmax32_pred_c: variant of @loop_absmax32_pred with the predicated
; intrinsic's data operands swapped, i.e. called with (loaded, accumulator,
; mask). The expected output is still a dlstp.32/letp loop, but the body is
; `vmaxnma.f32 q1, q0` followed by `vmov q0, q1`, and the final vmaxnmav.f32
; reads q1 instead of q0.
; NOTE(review): presumably the operands are not commuted because the
; predicated intrinsic's inactive lanes come from its first operand -- confirm
; against the MVE intrinsic definition.
337define void @loop_absmax32_pred_c(float* %0, i32 %1, float* nocapture %2) {
338; CHECK-LABEL: loop_absmax32_pred_c:
339; CHECK:       @ %bb.0:
340; CHECK-NEXT:    .save {r7, lr}
341; CHECK-NEXT:    push {r7, lr}
342; CHECK-NEXT:    vmov.i32 q0, #0x0
343; CHECK-NEXT:    dlstp.32 lr, r1
344; CHECK-NEXT:  .LBB19_1: @ =>This Inner Loop Header: Depth=1
345; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
346; CHECK-NEXT:    vmaxnma.f32 q1, q0
347; CHECK-NEXT:    vmov q0, q1
348; CHECK-NEXT:    letp lr, .LBB19_1
349; CHECK-NEXT:  @ %bb.2:
350; CHECK-NEXT:    vldr s0, .LCPI19_0
351; CHECK-NEXT:    vmov r0, s0
352; CHECK-NEXT:    vmaxnmav.f32 r0, q1
353; CHECK-NEXT:    vmov s0, r0
354; CHECK-NEXT:    vstr s0, [r2]
355; CHECK-NEXT:    pop {r7, pc}
356; CHECK-NEXT:    .p2align 2
357; CHECK-NEXT:  @ %bb.3:
358; CHECK-NEXT:  .LCPI19_0:
359; CHECK-NEXT:    .long 0x00000000 @ float 0
360  br label %4
361
3624:                                                ; preds = %4, %3
363  %5 = phi <4 x float> [ zeroinitializer, %3 ], [ %12, %4 ]
364  %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
365  %7 = phi float* [ %0, %3 ], [ %11, %4 ]
366  %8 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
367  %9 = bitcast float* %7 to <4 x float>*
368  %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %9, i32 4, <4 x i1> %8, <4 x float> zeroinitializer)
369  %11 = getelementptr inbounds float, float* %7, i32 4
; Loaded value first, accumulator second (the swapped order under test).
370  %12 = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %10, <4 x float> %5, <4 x i1> %8)
371  %13 = add nsw i32 %6, -4
372  %14 = icmp sgt i32 %6, 4
373  br i1 %14, label %4, label %15
374
37515:                                               ; preds = %4
376  %16 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %12)
377  store float %16, float* %2, align 4
378  ret void
379}
380
381
382
383
384
385
; loop_absmax16: running absolute maximum over <8 x half> loads from %0,
; reduced to one half that is stored through %2.
;  - The trip count is %1 >> 3; the loop is skipped when that is zero.
;  - Each iteration loads <8 x half> and computes
;    maxnum(fabs(accumulator), fabs(loaded)). The accumulator starts as
;    zeroinitializer.
;  - After the loop, llvm.arm.mve.maxnmav.f16 is called with 0.0 and the final
;    accumulator (zeroinitializer when the loop did not run).
; The expected output is a wls/le loop whose body is
; `vldrw.u32 q1, [r0], #8` followed by `vmaxnma.f16 q0, q1`.
386define void @loop_absmax16(half* nocapture readonly %0, i32 %1, half* nocapture %2) {
387; CHECK-LABEL: loop_absmax16:
388; CHECK:       @ %bb.0:
389; CHECK-NEXT:    .save {r7, lr}
390; CHECK-NEXT:    push {r7, lr}
391; CHECK-NEXT:    vmov.i32 q0, #0x0
392; CHECK-NEXT:    lsr.w lr, r1, #3
393; CHECK-NEXT:    wls lr, lr, .LBB20_3
394; CHECK-NEXT:  @ %bb.1: @ %.preheader
395; CHECK-NEXT:    vmov.i32 q0, #0x0
396; CHECK-NEXT:  .LBB20_2: @ =>This Inner Loop Header: Depth=1
397; CHECK-NEXT:    vldrw.u32 q1, [r0], #8
398; CHECK-NEXT:    vmaxnma.f16 q0, q1
399; CHECK-NEXT:    le lr, .LBB20_2
400; CHECK-NEXT:  .LBB20_3:
401; CHECK-NEXT:    vldr.16 s4, .LCPI20_0
402; CHECK-NEXT:    vmov r0, s4
403; CHECK-NEXT:    vmaxnmav.f16 r0, q0
404; CHECK-NEXT:    vmov s0, r0
405; CHECK-NEXT:    vstr.16 s0, [r2]
406; CHECK-NEXT:    pop {r7, pc}
407; CHECK-NEXT:    .p2align 1
408; CHECK-NEXT:  @ %bb.4:
409; CHECK-NEXT:  .LCPI20_0:
410; CHECK-NEXT:    .short 0x0000 @ half 0
411  %4 = lshr i32 %1, 3
412  %5 = icmp eq i32 %4, 0
413  br i1 %5, label %18, label %6
414
4156:                                                ; preds = %3, %6
416  %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
417  %8 = phi <8 x half> [ %15, %6 ], [ zeroinitializer, %3 ]
418  %9 = phi half* [ %12, %6 ], [ %0, %3 ]
419  %10 = bitcast half* %9 to <8 x half>*
420  %11 = load <8 x half>, <8 x half>* %10, align 4
; NOTE(review): the load above reads 8 halves but the pointer only advances by
; 4 halves, so consecutive loads overlap. Presumably carried over from the f32
; variant -- confirm whether intended. Changing it would change the expected
; post-increment immediate, so the assertions would need regenerating.
421  %12 = getelementptr inbounds half, half* %9, i32 4
422  %13 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %11)
423  %14 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %8)
; Accumulator first, loaded value second.
424  %15 = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %14, <8 x half> %13)
425  %16 = add nsw i32 %7, -1
426  %17 = icmp eq i32 %16, 0
427  br i1 %17, label %18, label %6
428
42918:                                               ; preds = %6, %3
430  %19 = phi <8 x half> [ zeroinitializer, %3 ], [ %15, %6 ]
431  %20 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %19)
432  store half %20, half* %2, align 4
433  ret void
434}
435
; loop_absmax16_c: commuted variant of @loop_absmax16. The IR is the same
; except that the maxnum operands are swapped (fabs(loaded) first,
; fabs(accumulator) second). The expected loop body is unchanged:
; `vldrw.u32 q1, [r0], #8` followed by `vmaxnma.f16 q0, q1`, with no extra
; register move.
436define void @loop_absmax16_c(half* nocapture readonly %0, i32 %1, half* nocapture %2) {
437; CHECK-LABEL: loop_absmax16_c:
438; CHECK:       @ %bb.0:
439; CHECK-NEXT:    .save {r7, lr}
440; CHECK-NEXT:    push {r7, lr}
441; CHECK-NEXT:    vmov.i32 q0, #0x0
442; CHECK-NEXT:    lsr.w lr, r1, #3
443; CHECK-NEXT:    wls lr, lr, .LBB21_3
444; CHECK-NEXT:  @ %bb.1: @ %.preheader
445; CHECK-NEXT:    vmov.i32 q0, #0x0
446; CHECK-NEXT:  .LBB21_2: @ =>This Inner Loop Header: Depth=1
447; CHECK-NEXT:    vldrw.u32 q1, [r0], #8
448; CHECK-NEXT:    vmaxnma.f16 q0, q1
449; CHECK-NEXT:    le lr, .LBB21_2
450; CHECK-NEXT:  .LBB21_3:
451; CHECK-NEXT:    vldr.16 s4, .LCPI21_0
452; CHECK-NEXT:    vmov r0, s4
453; CHECK-NEXT:    vmaxnmav.f16 r0, q0
454; CHECK-NEXT:    vmov s0, r0
455; CHECK-NEXT:    vstr.16 s0, [r2]
456; CHECK-NEXT:    pop {r7, pc}
457; CHECK-NEXT:    .p2align 1
458; CHECK-NEXT:  @ %bb.4:
459; CHECK-NEXT:  .LCPI21_0:
460; CHECK-NEXT:    .short 0x0000 @ half 0
461  %4 = lshr i32 %1, 3
462  %5 = icmp eq i32 %4, 0
463  br i1 %5, label %18, label %6
464
4656:                                                ; preds = %3, %6
466  %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
467  %8 = phi <8 x half> [ %15, %6 ], [ zeroinitializer, %3 ]
468  %9 = phi half* [ %12, %6 ], [ %0, %3 ]
469  %10 = bitcast half* %9 to <8 x half>*
470  %11 = load <8 x half>, <8 x half>* %10, align 4
; NOTE(review): the load above reads 8 halves but the pointer only advances by
; 4 halves, so consecutive loads overlap. Presumably carried over from the f32
; variant -- confirm whether intended. Changing it would change the expected
; post-increment immediate, so the assertions would need regenerating.
471  %12 = getelementptr inbounds half, half* %9, i32 4
472  %13 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %11)
473  %14 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %8)
; Loaded value first, accumulator second (the commuted order under test).
474  %15 = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %13, <8 x half> %14)
475  %16 = add nsw i32 %7, -1
476  %17 = icmp eq i32 %16, 0
477  br i1 %17, label %18, label %6
478
47918:                                               ; preds = %6, %3
480  %19 = phi <8 x half> [ zeroinitializer, %3 ], [ %15, %6 ]
481  %20 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %19)
482  store half %20, half* %2, align 4
483  ret void
484}
485
; loop_absmax16_pred: predicated running absolute maximum on <8 x half>.
;  - Each iteration builds a lane mask with llvm.arm.mve.vctp16 from the
;    remaining count %6, does a masked load (zero passthrough) and calls the
;    predicated vmaxnma intrinsic with (accumulator, loaded, mask).
;  - The count drops by 8 per iteration and the loop repeats while %6 > 8
;    (signed). The loop body always runs at least once.
;  - The final accumulator is reduced with llvm.arm.mve.maxnmav.f16 (initial
;    value 0.0) and stored through %2.
; The expected output is a dlstp.16/letp loop whose body is
; `vldrh.u16 q1, [r0], #8` followed by an unpredicated `vmaxnma.f16 q0, q1`.
486define void @loop_absmax16_pred(half* %0, i32 %1, half* nocapture %2) {
487; CHECK-LABEL: loop_absmax16_pred:
488; CHECK:       @ %bb.0:
489; CHECK-NEXT:    .save {r7, lr}
490; CHECK-NEXT:    push {r7, lr}
491; CHECK-NEXT:    vmov.i32 q0, #0x0
492; CHECK-NEXT:    dlstp.16 lr, r1
493; CHECK-NEXT:  .LBB22_1: @ =>This Inner Loop Header: Depth=1
494; CHECK-NEXT:    vldrh.u16 q1, [r0], #8
495; CHECK-NEXT:    vmaxnma.f16 q0, q1
496; CHECK-NEXT:    letp lr, .LBB22_1
497; CHECK-NEXT:  @ %bb.2:
498; CHECK-NEXT:    vldr.16 s4, .LCPI22_0
499; CHECK-NEXT:    vmov r0, s4
500; CHECK-NEXT:    vmaxnmav.f16 r0, q0
501; CHECK-NEXT:    vmov s0, r0
502; CHECK-NEXT:    vstr.16 s0, [r2]
503; CHECK-NEXT:    pop {r7, pc}
504; CHECK-NEXT:    .p2align 1
505; CHECK-NEXT:  @ %bb.3:
506; CHECK-NEXT:  .LCPI22_0:
507; CHECK-NEXT:    .short 0x0000 @ half 0
508  br label %4
509
5104:                                                ; preds = %4, %3
511  %5 = phi <8 x half> [ zeroinitializer, %3 ], [ %12, %4 ]
512  %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
513  %7 = phi half* [ %0, %3 ], [ %11, %4 ]
514  %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %6)
515  %9 = bitcast half* %7 to <8 x half>*
516  %10 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %9, i32 4, <8 x i1> %8, <8 x half> zeroinitializer)
; NOTE(review): the masked load covers up to 8 halves and the count drops by 8
; per iteration, but the pointer only advances by 4 halves, so consecutive
; loads overlap. Presumably carried over from the f32 variant -- confirm
; whether intended. Changing it would change the expected post-increment
; immediate, so the assertions would need regenerating.
517  %11 = getelementptr inbounds half, half* %7, i32 4
; Accumulator first, loaded value second.
518  %12 = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %5, <8 x half> %10, <8 x i1> %8)
519  %13 = add nsw i32 %6, -8
520  %14 = icmp sgt i32 %6, 8
521  br i1 %14, label %4, label %15
522
52315:                                               ; preds = %4
524  %16 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %12)
525  store half %16, half* %2, align 4
526  ret void
527}
528
; loop_absmax16_pred_c: variant of @loop_absmax16_pred with the predicated
; intrinsic's data operands swapped, i.e. called with (loaded, accumulator,
; mask). The expected output is still a dlstp.16/letp loop, but the body is
; `vmaxnma.f16 q1, q0` followed by `vmov q0, q1`, and the final vmaxnmav.f16
; reads q1 instead of q0.
; NOTE(review): presumably the operands are not commuted because the
; predicated intrinsic's inactive lanes come from its first operand -- confirm
; against the MVE intrinsic definition.
529define void @loop_absmax16_pred_c(half* %0, i32 %1, half* nocapture %2) {
530; CHECK-LABEL: loop_absmax16_pred_c:
531; CHECK:       @ %bb.0:
532; CHECK-NEXT:    .save {r7, lr}
533; CHECK-NEXT:    push {r7, lr}
534; CHECK-NEXT:    vmov.i32 q0, #0x0
535; CHECK-NEXT:    dlstp.16 lr, r1
536; CHECK-NEXT:  .LBB23_1: @ =>This Inner Loop Header: Depth=1
537; CHECK-NEXT:    vldrh.u16 q1, [r0], #8
538; CHECK-NEXT:    vmaxnma.f16 q1, q0
539; CHECK-NEXT:    vmov q0, q1
540; CHECK-NEXT:    letp lr, .LBB23_1
541; CHECK-NEXT:  @ %bb.2:
542; CHECK-NEXT:    vldr.16 s0, .LCPI23_0
543; CHECK-NEXT:    vmov r0, s0
544; CHECK-NEXT:    vmaxnmav.f16 r0, q1
545; CHECK-NEXT:    vmov s0, r0
546; CHECK-NEXT:    vstr.16 s0, [r2]
547; CHECK-NEXT:    pop {r7, pc}
548; CHECK-NEXT:    .p2align 1
549; CHECK-NEXT:  @ %bb.3:
550; CHECK-NEXT:  .LCPI23_0:
551; CHECK-NEXT:    .short 0x0000 @ half 0
552  br label %4
553
5544:                                                ; preds = %4, %3
555  %5 = phi <8 x half> [ zeroinitializer, %3 ], [ %12, %4 ]
556  %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
557  %7 = phi half* [ %0, %3 ], [ %11, %4 ]
558  %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %6)
559  %9 = bitcast half* %7 to <8 x half>*
560  %10 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %9, i32 4, <8 x i1> %8, <8 x half> zeroinitializer)
; NOTE(review): the masked load covers up to 8 halves and the count drops by 8
; per iteration, but the pointer only advances by 4 halves, so consecutive
; loads overlap. Presumably carried over from the f32 variant -- confirm
; whether intended. Changing it would change the expected post-increment
; immediate, so the assertions would need regenerating.
561  %11 = getelementptr inbounds half, half* %7, i32 4
; Loaded value first, accumulator second (the swapped order under test).
562  %12 = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %10, <8 x half> %5, <8 x i1> %8)
563  %13 = add nsw i32 %6, -8
564  %14 = icmp sgt i32 %6, 8
565  br i1 %14, label %4, label %15
566
56715:                                               ; preds = %4
568  %16 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %12)
569  store half %16, half* %2, align 4
570  ret void
571}
572
573
574
575
576
; Intrinsics used by the <4 x float> tests: tail-predication mask (vctp32),
; masked load, predicated vminnma/vmaxnma, the maxnmav reduction, and the
; generic fabs/maxnum/minnum.
577declare <4 x i1> @llvm.arm.mve.vctp32(i32)
578declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
579declare <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>)
580declare <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>)
581declare float @llvm.arm.mve.maxnmav.f32.v4f32(float, <4 x float>)
582declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
583declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
584declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
585
; The same set of intrinsics for the <8 x half> tests (vctp16 for the mask).
586declare <8 x i1> @llvm.arm.mve.vctp16(i32)
587declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32 immarg, <8 x i1>, <8 x half>)
588declare <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>)
589declare <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>)
590declare half @llvm.arm.mve.maxnmav.f16.v8f16(half, <8 x half>)
591declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
592declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)
593declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>)
594
595
596