; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s

define arm_aapcs_vfpcc void @round(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: round:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB0_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB0_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrinta.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.round.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @rint(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: rint:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB1_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintx.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.rint.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @trunc(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: trunc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB2_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB2_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintz.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.trunc.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @ceil(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: ceil:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB3_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB3_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintp.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @floor(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: floor:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB4_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB4_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintm.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB4_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.floor.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; nearbyint shouldn't be tail predicated because it's lowered into multiple instructions
define arm_aapcs_vfpcc void @nearbyint(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: nearbyint:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB5_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB5_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintr.f32 s7, s3
; CHECK-NEXT:    vrintr.f32 s6, s2
; CHECK-NEXT:    vrintr.f32 s5, s1
; CHECK-NEXT:    vrintr.f32 s4, s0
; CHECK-NEXT:    vstrw.32 q1, [r1], #16
; CHECK-NEXT:    letp lr, .LBB5_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1

declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) #2

declare <4 x float> @llvm.trunc.v4f32(<4 x float>) #3

declare <4 x float> @llvm.rint.v4f32(<4 x float>) #3

declare <4 x float> @llvm.round.v4f32(<4 x float>) #3

declare <4 x float> @llvm.ceil.v4f32(<4 x float>) #3

declare <4 x float> @llvm.floor.v4f32(<4 x float>) #3

declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #1

declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) #4