; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; F32

define arm_aapcs_vfpcc <4 x float> @maxf32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: maxf32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmaxnma.f32 q0, q1
; CHECK-NEXT: bx lr
  %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  %c = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %aa, <4 x float> %bb)
  ret <4 x float> %c
}

define arm_aapcs_vfpcc <4 x float> @maxf32_c(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: maxf32_c:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmaxnma.f32 q0, q1
; CHECK-NEXT: bx lr
  %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  %c = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %bb, <4 x float> %aa)
  ret <4 x float> %c
}

define arm_aapcs_vfpcc <4 x float> @minf32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: minf32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vminnma.f32 q0, q1
; CHECK-NEXT: bx lr
  %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  %c = tail call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %aa, <4 x float> %bb)
  ret <4 x float> %c
}

define arm_aapcs_vfpcc <4 x float> @minf32_c(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: minf32_c:
; CHECK: @ %bb.0:
; CHECK-NEXT: vminnma.f32 q0, q1
; CHECK-NEXT: bx lr
  %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  %c = tail call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %bb, <4 x float> %aa)
  ret <4 x float> %c
}

define arm_aapcs_vfpcc <4 x float> @maxpredf32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: maxpredf32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vpt.f32 gt, q1, q0
; CHECK-NEXT: vmaxnmat.f32 q0, q1
; CHECK-NEXT: bx lr
  %c = fcmp olt <4 x float> %a, %b
  %s = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %c)
  ret <4 x float> %s
}

define arm_aapcs_vfpcc <4 x float> @maxpredf32_c(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: maxpredf32_c:
; CHECK: @ %bb.0:
; CHECK-NEXT: vpt.f32 gt, q1, q0
; CHECK-NEXT: vmaxnmat.f32 q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
  %c = fcmp olt <4 x float> %a, %b
  %s = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %a, <4 x i1> %c)
  ret <4 x float> %s
}

define arm_aapcs_vfpcc <4 x float> @minpredf32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: minpredf32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vpt.f32 gt, q1, q0
; CHECK-NEXT: vminnmat.f32 q0, q1
; CHECK-NEXT: bx lr
  %c = fcmp olt <4 x float> %a, %b
  %s = tail call fast <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %c)
  ret <4 x float> %s
}

define arm_aapcs_vfpcc <4 x float> @minpredf32_c(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: minpredf32_c:
; CHECK: @ %bb.0:
; CHECK-NEXT: vpt.f32 gt, q1, q0
; CHECK-NEXT: vminnmat.f32 q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
  %c = fcmp olt <4 x float> %a, %b
  %s = tail call fast <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %a, <4 x i1> %c)
  ret <4 x float> %s
}

; F16

define arm_aapcs_vfpcc <8 x half> @maxf16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: maxf16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmaxnma.f16 q0, q1
; CHECK-NEXT: bx lr
  %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
  %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
  %c = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %aa, <8 x half> %bb)
  ret <8 x half> %c
}

define arm_aapcs_vfpcc <8 x half> @maxf16_c(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: maxf16_c:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmaxnma.f16 q0, q1
; CHECK-NEXT: bx lr
  %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
  %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
  %c = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %bb, <8 x half> %aa)
  ret <8 x half> %c
}

define arm_aapcs_vfpcc <8 x half> @minf16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: minf16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vminnma.f16 q0, q1
; CHECK-NEXT: bx lr
  %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
  %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
  %c = tail call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %aa, <8 x half> %bb)
  ret <8 x half> %c
}

define arm_aapcs_vfpcc <8 x half> @minf16_c(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: minf16_c:
; CHECK: @ %bb.0:
; CHECK-NEXT: vminnma.f16 q0, q1
; CHECK-NEXT: bx lr
  %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
  %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
  %c = tail call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %bb, <8 x half> %aa)
  ret <8 x half> %c
}

define arm_aapcs_vfpcc <8 x half> @maxpredf16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: maxpredf16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vpt.f16 gt, q1, q0
; CHECK-NEXT: vmaxnmat.f16 q0, q1
; CHECK-NEXT: bx lr
  %c = fcmp olt <8 x half> %a, %b
  %s = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %c)
  ret <8 x half> %s
}

define arm_aapcs_vfpcc <8 x half> @maxpredf16_c(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: maxpredf16_c:
; CHECK: @ %bb.0:
; CHECK-NEXT: vpt.f16 gt, q1, q0
; CHECK-NEXT: vmaxnmat.f16 q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
  %c = fcmp olt <8 x half> %a, %b
  %s = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %a, <8 x i1> %c)
  ret <8 x half> %s
}

define arm_aapcs_vfpcc <8 x half> @minpredf16(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: minpredf16:
; CHECK: @ %bb.0:
; CHECK-NEXT: vpt.f16 gt, q1, q0
; CHECK-NEXT: vminnmat.f16 q0, q1
; CHECK-NEXT: bx lr
  %c = fcmp olt <8 x half> %a, %b
  %s = tail call fast <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %c)
  ret <8 x half> %s
}

define arm_aapcs_vfpcc <8 x half> @minpredf16_c(<8 x half> %a, <8 x half> %b) {
; CHECK-LABEL: minpredf16_c:
; CHECK: @ %bb.0:
; CHECK-NEXT: vpt.f16 gt, q1, q0
; CHECK-NEXT: vminnmat.f16 q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
  %c = fcmp olt <8 x half> %a, %b
  %s = tail call fast <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %a, <8 x i1> %c)
  ret <8 x half> %s
}

; Loops

define void @loop_absmax32(float* nocapture readonly %0, i32 %1, float* nocapture %2) {
; CHECK-LABEL: loop_absmax32:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: lsr.w lr, r1, #3
; CHECK-NEXT: wls lr, lr, .LBB16_3
; CHECK-NEXT: @ %bb.1: @ %.preheader
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: .LBB16_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vmaxnma.f32 q0, q1
; CHECK-NEXT: le lr, .LBB16_2
; CHECK-NEXT: .LBB16_3:
; CHECK-NEXT: vldr s4, .LCPI16_0
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmaxnmav.f32 r0, q0
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vstr s0, [r2]
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.4:
; CHECK-NEXT: .LCPI16_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
  %4 = lshr i32 %1, 3
  %5 = icmp eq i32 %4, 0
  br i1 %5, label %18, label %6

6:                                                ; preds = %3, %6
  %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
  %8 = phi <4 x float> [ %15, %6 ], [ zeroinitializer, %3 ]
  %9 = phi float* [ %12, %6 ], [ %0, %3 ]
  %10 = bitcast float* %9 to <4 x float>*
  %11 = load <4 x float>, <4 x float>* %10, align 4
  %12 = getelementptr inbounds float, float* %9, i32 4
  %13 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %11)
  %14 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %8)
  %15 = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %14, <4 x float> %13)
  %16 = add nsw i32 %7, -1
  %17 = icmp eq i32 %16, 0
  br i1 %17, label %18, label %6

18:                                               ; preds = %6, %3
  %19 = phi <4 x float> [ zeroinitializer, %3 ], [ %15, %6 ]
  %20 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %19)
  store float %20, float* %2, align 4
  ret void
}

define void @loop_absmax32_c(float* nocapture readonly %0, i32 %1, float* nocapture %2) {
; CHECK-LABEL: loop_absmax32_c:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: lsr.w lr, r1, #3
; CHECK-NEXT: wls lr, lr, .LBB17_3
; CHECK-NEXT: @ %bb.1: @ %.preheader
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: .LBB17_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vmaxnma.f32 q0, q1
; CHECK-NEXT: le lr, .LBB17_2
; CHECK-NEXT: .LBB17_3:
; CHECK-NEXT: vldr s4, .LCPI17_0
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmaxnmav.f32 r0, q0
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vstr s0, [r2]
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.4:
; CHECK-NEXT: .LCPI17_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
  %4 = lshr i32 %1, 3
  %5 = icmp eq i32 %4, 0
  br i1 %5, label %18, label %6

6:                                                ; preds = %3, %6
  %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
  %8 = phi <4 x float> [ %15, %6 ], [ zeroinitializer, %3 ]
  %9 = phi float* [ %12, %6 ], [ %0, %3 ]
  %10 = bitcast float* %9 to <4 x float>*
  %11 = load <4 x float>, <4 x float>* %10, align 4
  %12 = getelementptr inbounds float, float* %9, i32 4
  %13 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %11)
  %14 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %8)
  %15 = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %13, <4 x float> %14)
  %16 = add nsw i32 %7, -1
  %17 = icmp eq i32 %16, 0
  br i1 %17, label %18, label %6

18:                                               ; preds = %6, %3
  %19 = phi <4 x float> [ zeroinitializer, %3 ], [ %15, %6 ]
  %20 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %19)
  store float %20, float* %2, align 4
  ret void
}

define void @loop_absmax32_pred(float* %0, i32 %1, float* nocapture %2) {
; CHECK-LABEL: loop_absmax32_pred:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: dlstp.32 lr, r1
; CHECK-NEXT: .LBB18_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vmaxnma.f32 q0, q1
; CHECK-NEXT: letp lr, .LBB18_1
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s4, .LCPI18_0
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmaxnmav.f32 r0, q0
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vstr s0, [r2]
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI18_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
  br label %4

4:                                                ; preds = %4, %3
  %5 = phi <4 x float> [ zeroinitializer, %3 ], [ %12, %4 ]
  %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
  %7 = phi float* [ %0, %3 ], [ %11, %4 ]
  %8 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
  %9 = bitcast float* %7 to <4 x float>*
  %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %9, i32 4, <4 x i1> %8, <4 x float> zeroinitializer)
  %11 = getelementptr inbounds float, float* %7, i32 4
  %12 = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %5, <4 x float> %10, <4 x i1> %8)
  %13 = add nsw i32 %6, -4
  %14 = icmp sgt i32 %6, 4
  br i1 %14, label %4, label %15

15:                                               ; preds = %4
  %16 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %12)
  store float %16, float* %2, align 4
  ret void
}

define void @loop_absmax32_pred_c(float* %0, i32 %1, float* nocapture %2) {
; CHECK-LABEL: loop_absmax32_pred_c:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: dlstp.32 lr, r1
; CHECK-NEXT: .LBB19_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vmaxnma.f32 q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: letp lr, .LBB19_1
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI19_0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmaxnmav.f32 r0, q1
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vstr s0, [r2]
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI19_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
  br label %4

4:                                                ; preds = %4, %3
  %5 = phi <4 x float> [ zeroinitializer, %3 ], [ %12, %4 ]
  %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
  %7 = phi float* [ %0, %3 ], [ %11, %4 ]
  %8 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
  %9 = bitcast float* %7 to <4 x float>*
  %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %9, i32 4, <4 x i1> %8, <4 x float> zeroinitializer)
  %11 = getelementptr inbounds float, float* %7, i32 4
  %12 = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %10, <4 x float> %5, <4 x i1> %8)
  %13 = add nsw i32 %6, -4
  %14 = icmp sgt i32 %6, 4
  br i1 %14, label %4, label %15

15:                                               ; preds = %4
  %16 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %12)
  store float %16, float* %2, align 4
  ret void
}

define void @loop_absmax16(half* nocapture readonly %0, i32 %1, half* nocapture %2) {
; CHECK-LABEL: loop_absmax16:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: lsr.w lr, r1, #3
; CHECK-NEXT: wls lr, lr, .LBB20_3
; CHECK-NEXT: @ %bb.1: @ %.preheader
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: .LBB20_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #8
; CHECK-NEXT: vmaxnma.f16 q0, q1
; CHECK-NEXT: le lr, .LBB20_2
; CHECK-NEXT: .LBB20_3:
; CHECK-NEXT: vldr.16 s4, .LCPI20_0
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmaxnmav.f16 r0, q0
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vstr.16 s0, [r2]
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 1
; CHECK-NEXT: @ %bb.4:
; CHECK-NEXT: .LCPI20_0:
; CHECK-NEXT: .short 0x0000 @ half 0
  %4 = lshr i32 %1, 3
  %5 = icmp eq i32 %4, 0
  br i1 %5, label %18, label %6

6:                                                ; preds = %3, %6
  %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
  %8 = phi <8 x half> [ %15, %6 ], [ zeroinitializer, %3 ]
  %9 = phi half* [ %12, %6 ], [ %0, %3 ]
  %10 = bitcast half* %9 to <8 x half>*
  %11 = load <8 x half>, <8 x half>* %10, align 4
  %12 = getelementptr inbounds half, half* %9, i32 4
  %13 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %11)
  %14 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %8)
  %15 = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %14, <8 x half> %13)
  %16 = add nsw i32 %7, -1
  %17 = icmp eq i32 %16, 0
  br i1 %17, label %18, label %6

18:                                               ; preds = %6, %3
  %19 = phi <8 x half> [ zeroinitializer, %3 ], [ %15, %6 ]
  %20 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %19)
  store half %20, half* %2, align 4
  ret void
}

define void @loop_absmax16_c(half* nocapture readonly %0, i32 %1, half* nocapture %2) {
; CHECK-LABEL: loop_absmax16_c:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: lsr.w lr, r1, #3
; CHECK-NEXT: wls lr, lr, .LBB21_3
; CHECK-NEXT: @ %bb.1: @ %.preheader
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: .LBB21_2: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #8
; CHECK-NEXT: vmaxnma.f16 q0, q1
; CHECK-NEXT: le lr, .LBB21_2
; CHECK-NEXT: .LBB21_3:
; CHECK-NEXT: vldr.16 s4, .LCPI21_0
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmaxnmav.f16 r0, q0
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vstr.16 s0, [r2]
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 1
; CHECK-NEXT: @ %bb.4:
; CHECK-NEXT: .LCPI21_0:
; CHECK-NEXT: .short 0x0000 @ half 0
  %4 = lshr i32 %1, 3
  %5 = icmp eq i32 %4, 0
  br i1 %5, label %18, label %6

6:                                                ; preds = %3, %6
  %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
  %8 = phi <8 x half> [ %15, %6 ], [ zeroinitializer, %3 ]
  %9 = phi half* [ %12, %6 ], [ %0, %3 ]
  %10 = bitcast half* %9 to <8 x half>*
  %11 = load <8 x half>, <8 x half>* %10, align 4
  %12 = getelementptr inbounds half, half* %9, i32 4
  %13 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %11)
  %14 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %8)
  %15 = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %13, <8 x half> %14)
  %16 = add nsw i32 %7, -1
  %17 = icmp eq i32 %16, 0
  br i1 %17, label %18, label %6

18:                                               ; preds = %6, %3
  %19 = phi <8 x half> [ zeroinitializer, %3 ], [ %15, %6 ]
  %20 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %19)
  store half %20, half* %2, align 4
  ret void
}

define void @loop_absmax16_pred(half* %0, i32 %1, half* nocapture %2) {
; CHECK-LABEL: loop_absmax16_pred:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: dlstp.16 lr, r1
; CHECK-NEXT: .LBB22_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q1, [r0], #8
; CHECK-NEXT: vmaxnma.f16 q0, q1
; CHECK-NEXT: letp lr, .LBB22_1
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr.16 s4, .LCPI22_0
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmaxnmav.f16 r0, q0
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vstr.16 s0, [r2]
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 1
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI22_0:
; CHECK-NEXT: .short 0x0000 @ half 0
  br label %4

4:                                                ; preds = %4, %3
  %5 = phi <8 x half> [ zeroinitializer, %3 ], [ %12, %4 ]
  %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
  %7 = phi half* [ %0, %3 ], [ %11, %4 ]
  %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %6)
  %9 = bitcast half* %7 to <8 x half>*
  %10 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %9, i32 4, <8 x i1> %8, <8 x half> zeroinitializer)
  %11 = getelementptr inbounds half, half* %7, i32 4
  %12 = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %5, <8 x half> %10, <8 x i1> %8)
  %13 = add nsw i32 %6, -8
  %14 = icmp sgt i32 %6, 8
  br i1 %14, label %4, label %15

15:                                               ; preds = %4
  %16 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %12)
  store half %16, half* %2, align 4
  ret void
}

define void @loop_absmax16_pred_c(half* %0, i32 %1, half* nocapture %2) {
; CHECK-LABEL: loop_absmax16_pred_c:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: dlstp.16 lr, r1
; CHECK-NEXT: .LBB23_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q1, [r0], #8
; CHECK-NEXT: vmaxnma.f16 q1, q0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: letp lr, .LBB23_1
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr.16 s0, .LCPI23_0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmaxnmav.f16 r0, q1
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vstr.16 s0, [r2]
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 1
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI23_0:
; CHECK-NEXT: .short 0x0000 @ half 0
  br label %4

4:                                                ; preds = %4, %3
  %5 = phi <8 x half> [ zeroinitializer, %3 ], [ %12, %4 ]
  %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
  %7 = phi half* [ %0, %3 ], [ %11, %4 ]
  %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %6)
  %9 = bitcast half* %7 to <8 x half>*
  %10 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %9, i32 4, <8 x i1> %8, <8 x half> zeroinitializer)
  %11 = getelementptr inbounds half, half* %7, i32 4
  %12 = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %10, <8 x half> %5, <8 x i1> %8)
  %13 = add nsw i32 %6, -8
  %14 = icmp sgt i32 %6, 8
  br i1 %14, label %4, label %15

15:                                               ; preds = %4
  %16 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %12)
  store half %16, half* %2, align 4
  ret void
}

declare <4 x i1> @llvm.arm.mve.vctp32(i32)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>)
declare <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>)
declare float @llvm.arm.mve.maxnmav.f32.v4f32(float, <4 x float>)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)

declare <8 x i1> @llvm.arm.mve.vctp16(i32)
declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32 immarg, <8 x i1>, <8 x half>)
declare <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>)
declare <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>)
declare half @llvm.arm.mve.maxnmav.f16.v8f16(half, <8 x half>)
declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)
declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>)