; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s

; Test input (half precision): for each of %numSamples consecutive element
; pairs read from %pSrc, one half equal to first*first + second*second is
; stored to %pDst. The IR is already split into a vector loop producing 8
; results per iteration and a scalar loop for whatever is left.
; (The function name suggests each pair is the real/imaginary part of a
; complex value; nothing else in this file states that.)
; The expected output shows the wide load selected as a vld20.16/vld21.16
; pair, the arithmetic as vmul.f16 + vfma.f16, and both loops driven by
; dls/le. The assertion lines are autogenerated: regenerate them with the
; script named on the first line of this file instead of editing by hand.
define void @arm_cmplx_mag_squared_f16(half* nocapture readonly %pSrc, half* nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: arm_cmplx_mag_squared_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: beq .LBB0_8
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
; CHECK-NEXT: cmp r2, #8
; CHECK-NEXT: blo .LBB0_9
; CHECK-NEXT: @ %bb.2: @ %vector.memcheck
; CHECK-NEXT: add.w r3, r0, r2, lsl #2
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: itt hi
; CHECK-NEXT: addhi.w r3, r1, r2, lsl #1
; CHECK-NEXT: cmphi r3, r0
; CHECK-NEXT: bhi .LBB0_9
; CHECK-NEXT: @ %bb.3: @ %vector.ph
; CHECK-NEXT: bic r4, r2, #7
; CHECK-NEXT: movs r5, #1
; CHECK-NEXT: sub.w r3, r4, #8
; CHECK-NEXT: add.w r12, r1, r4, lsl #1
; CHECK-NEXT: add.w lr, r5, r3, lsr #3
; CHECK-NEXT: add.w r3, r0, r4, lsl #2
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: and r5, r2, #7
; CHECK-NEXT: .LBB0_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vld20.16 {q0, q1}, [r0]
; CHECK-NEXT: vld21.16 {q0, q1}, [r0]!
; CHECK-NEXT: vmul.f16 q2, q0, q0
; CHECK-NEXT: vfma.f16 q2, q1, q1
; CHECK-NEXT: vstrb.8 q2, [r1], #16
; CHECK-NEXT: le lr, .LBB0_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: cmp r4, r2
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, r5, r7, pc}
; CHECK-NEXT: .LBB0_6: @ %while.body.preheader26
; CHECK-NEXT: dls lr, r5
; CHECK-NEXT: .LBB0_7: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr.16 s0, [r3]
; CHECK-NEXT: vldr.16 s2, [r3, #2]
; CHECK-NEXT: adds r3, #4
; CHECK-NEXT: vmul.f16 s0, s0, s0
; CHECK-NEXT: vfma.f16 s0, s2, s2
; CHECK-NEXT: vstr.16 s0, [r12]
; CHECK-NEXT: add.w r12, r12, #2
; CHECK-NEXT: le lr, .LBB0_7
; CHECK-NEXT: .LBB0_8: @ %while.end
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .LBB0_9:
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: mov r12, r1
; CHECK-NEXT: mov r5, r2
; CHECK-NEXT: b .LBB0_6
entry:
  ; A zero sample count returns immediately.
  %cmp.not11 = icmp eq i32 %numSamples, 0
  br i1 %cmp.not11, label %while.end, label %while.body.preheader

while.body.preheader: ; preds = %entry
  ; Fewer than 8 samples: bypass the vector loop.
  %min.iters.check = icmp ult i32 %numSamples, 8
  br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck

vector.memcheck: ; preds = %while.body.preheader
  ; Runtime overlap test. %scevgep = pDst + numSamples and
  ; %scevgep18 = pSrc + 2*numSamples (offsets in half elements). If each end
  ; pointer lies above the start of the other buffer, take the scalar loop.
  %scevgep = getelementptr half, half* %pDst, i32 %numSamples
  %0 = shl i32 %numSamples, 1
  %scevgep18 = getelementptr half, half* %pSrc, i32 %0
  %bound0 = icmp ugt half* %scevgep18, %pDst
  %bound1 = icmp ugt half* %scevgep, %pSrc
  %found.conflict = and i1 %bound0, %bound1
  br i1 %found.conflict, label %while.body.preheader26, label %vector.ph

vector.ph: ; preds = %vector.memcheck
  ; %n.vec is numSamples with the low three bits cleared (a multiple of 8).
  ; %ind.end / %ind.end21 are the source / destination positions just past
  ; the vectorised part; %ind.end23 is the leftover sample count.
  %n.vec = and i32 %numSamples, -8
  %1 = shl i32 %n.vec, 1
  %ind.end = getelementptr half, half* %pSrc, i32 %1
  %ind.end21 = getelementptr half, half* %pDst, i32 %n.vec
  %ind.end23 = and i32 %numSamples, 7
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  ; Per iteration: load 16 consecutive halfs, square every lane, add the
  ; even-lane squares to the odd-lane squares, store the 8 sums.
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %2 = shl i32 %index, 1
  %next.gep = getelementptr half, half* %pSrc, i32 %2
  %next.gep24 = getelementptr half, half* %pDst, i32 %index
  %3 = bitcast half* %next.gep to <16 x half>*
  %wide.vec = load <16 x half>, <16 x half>* %3, align 2
  ; %4 and %6 are the same multiply written twice; each feeds one shuffle
  ; (even lanes, then odd lanes). Left as-is because this is test input.
  %4 = fmul fast <16 x half> %wide.vec, %wide.vec
  %5 = shufflevector <16 x half> %4, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %6 = fmul fast <16 x half> %wide.vec, %wide.vec
  %7 = shufflevector <16 x half> %6, <16 x half> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %8 = fadd fast <8 x half> %7, %5
  %9 = bitcast half* %next.gep24 to <8 x half>*
  store <8 x half> %8, <8 x half>* %9, align 2
  %index.next = add i32 %index, 8
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  ; Finished if the vector loop covered every sample; otherwise hand the
  ; remainder to the scalar loop.
  %cmp.n = icmp eq i32 %n.vec, %numSamples
  br i1 %cmp.n, label %while.end, label %while.body.preheader26

while.body.preheader26: ; preds = %middle.block, %vector.memcheck, %while.body.preheader
  ; Scalar-loop entry state: the original arguments when the vector loop was
  ; skipped, or the positions/count computed in %vector.ph when it ran.
  %pSrc.addr.014.ph = phi half* [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
  %pDst.addr.013.ph = phi half* [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
  %blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
  br label %while.body

while.body: ; preds = %while.body.preheader26, %while.body
  ; One pair per iteration: the source pointer advances by 2 elements, the
  ; destination by 1, and the counter is decremented until it reaches zero.
  %pSrc.addr.014 = phi half* [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
  %pDst.addr.013 = phi half* [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
  %incdec.ptr = getelementptr inbounds half, half* %pSrc.addr.014, i32 1
  %11 = load half, half* %pSrc.addr.014, align 2
  %incdec.ptr1 = getelementptr inbounds half, half* %pSrc.addr.014, i32 2
  %12 = load half, half* %incdec.ptr, align 2
  %mul = fmul fast half %11, %11
  %mul2 = fmul fast half %12, %12
  %add = fadd fast half %mul2, %mul
  %incdec.ptr3 = getelementptr inbounds half, half* %pDst.addr.013, i32 1
  store half %add, half* %pDst.addr.013, align 2
  %dec = add i32 %blkCnt.012, -1
  %cmp.not = icmp eq i32 %dec, 0
  br i1 %cmp.not, label %while.end, label %while.body

while.end: ; preds = %while.body, %middle.block, %entry
  ret void
}

; Single-precision counterpart of the function above: same control flow, with
; a vector loop that produces 4 results per iteration from 8 loaded floats.
; The expected output uses a vld20.32/vld21.32 pair followed by vmul.f32 +
; vfma.f32. The assertion lines are autogenerated: regenerate them with the
; script named on the first line of this file instead of editing by hand.
define void @arm_cmplx_mag_squared_f32(float* nocapture readonly %pSrc, float* nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: arm_cmplx_mag_squared_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: beq .LBB1_8
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader
; CHECK-NEXT: cmp r2, #4
; CHECK-NEXT: blo .LBB1_9
; CHECK-NEXT: @ %bb.2: @ %vector.memcheck
; CHECK-NEXT: add.w r3, r0, r2, lsl #3
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: itt hi
; CHECK-NEXT: addhi.w r3, r1, r2, lsl #2
; CHECK-NEXT: cmphi r3, r0
; CHECK-NEXT: bhi .LBB1_9
; CHECK-NEXT: @ %bb.3: @ %vector.ph
; CHECK-NEXT: bic r4, r2, #3
; CHECK-NEXT: movs r5, #1
; CHECK-NEXT: subs r3, r4, #4
; CHECK-NEXT: add.w r12, r1, r4, lsl #2
; CHECK-NEXT: add.w lr, r5, r3, lsr #2
; CHECK-NEXT: add.w r3, r0, r4, lsl #3
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: and r5, r2, #3
; CHECK-NEXT: .LBB1_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vld20.32 {q0, q1}, [r0]
; CHECK-NEXT: vld21.32 {q0, q1}, [r0]!
; CHECK-NEXT: vmul.f32 q2, q0, q0
; CHECK-NEXT: vfma.f32 q2, q1, q1
; CHECK-NEXT: vstrb.8 q2, [r1], #16
; CHECK-NEXT: le lr, .LBB1_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: cmp r4, r2
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r4, r5, r7, pc}
; CHECK-NEXT: .LBB1_6: @ %while.body.preheader26
; CHECK-NEXT: dls lr, r5
; CHECK-NEXT: .LBB1_7: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr s0, [r3]
; CHECK-NEXT: vldr s2, [r3, #4]
; CHECK-NEXT: adds r3, #8
; CHECK-NEXT: vmul.f32 s0, s0, s0
; CHECK-NEXT: vfma.f32 s0, s2, s2
; CHECK-NEXT: vstr s0, [r12]
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: le lr, .LBB1_7
; CHECK-NEXT: .LBB1_8: @ %while.end
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .LBB1_9:
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: mov r12, r1
; CHECK-NEXT: mov r5, r2
; CHECK-NEXT: b .LBB1_6
entry:
  ; A zero sample count returns immediately.
  %cmp.not11 = icmp eq i32 %numSamples, 0
  br i1 %cmp.not11, label %while.end, label %while.body.preheader

while.body.preheader: ; preds = %entry
  ; Fewer than 4 samples: bypass the vector loop.
  %min.iters.check = icmp ult i32 %numSamples, 4
  br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck

vector.memcheck: ; preds = %while.body.preheader
  ; Runtime overlap test. %scevgep = pDst + numSamples and
  ; %scevgep18 = pSrc + 2*numSamples (offsets in float elements). If each end
  ; pointer lies above the start of the other buffer, take the scalar loop.
  %scevgep = getelementptr float, float* %pDst, i32 %numSamples
  %0 = shl i32 %numSamples, 1
  %scevgep18 = getelementptr float, float* %pSrc, i32 %0
  %bound0 = icmp ugt float* %scevgep18, %pDst
  %bound1 = icmp ugt float* %scevgep, %pSrc
  %found.conflict = and i1 %bound0, %bound1
  br i1 %found.conflict, label %while.body.preheader26, label %vector.ph

vector.ph: ; preds = %vector.memcheck
  ; %n.vec is numSamples with the low two bits cleared (a multiple of 4).
  ; %ind.end / %ind.end21 are the source / destination positions just past
  ; the vectorised part; %ind.end23 is the leftover sample count.
  %n.vec = and i32 %numSamples, -4
  %1 = shl i32 %n.vec, 1
  %ind.end = getelementptr float, float* %pSrc, i32 %1
  %ind.end21 = getelementptr float, float* %pDst, i32 %n.vec
  %ind.end23 = and i32 %numSamples, 3
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  ; Per iteration: load 8 consecutive floats, square every lane, add the
  ; even-lane squares to the odd-lane squares, store the 4 sums.
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %2 = shl i32 %index, 1
  %next.gep = getelementptr float, float* %pSrc, i32 %2
  %next.gep24 = getelementptr float, float* %pDst, i32 %index
  %3 = bitcast float* %next.gep to <8 x float>*
  %wide.vec = load <8 x float>, <8 x float>* %3, align 4
  ; %4 and %6 are the same multiply written twice; each feeds one shuffle
  ; (even lanes, then odd lanes). Left as-is because this is test input.
  %4 = fmul fast <8 x float> %wide.vec, %wide.vec
  %5 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %6 = fmul fast <8 x float> %wide.vec, %wide.vec
  %7 = shufflevector <8 x float> %6, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %8 = fadd fast <4 x float> %7, %5
  %9 = bitcast float* %next.gep24 to <4 x float>*
  store <4 x float> %8, <4 x float>* %9, align 4
  %index.next = add i32 %index, 4
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  ; Finished if the vector loop covered every sample; otherwise hand the
  ; remainder to the scalar loop.
  %cmp.n = icmp eq i32 %n.vec, %numSamples
  br i1 %cmp.n, label %while.end, label %while.body.preheader26

while.body.preheader26: ; preds = %middle.block, %vector.memcheck, %while.body.preheader
  ; Scalar-loop entry state: the original arguments when the vector loop was
  ; skipped, or the positions/count computed in %vector.ph when it ran.
  %pSrc.addr.014.ph = phi float* [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
  %pDst.addr.013.ph = phi float* [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
  %blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
  br label %while.body

while.body: ; preds = %while.body.preheader26, %while.body
  ; One pair per iteration: the source pointer advances by 2 elements, the
  ; destination by 1, and the counter is decremented until it reaches zero.
  %pSrc.addr.014 = phi float* [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
  %pDst.addr.013 = phi float* [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
  %incdec.ptr = getelementptr inbounds float, float* %pSrc.addr.014, i32 1
  %11 = load float, float* %pSrc.addr.014, align 4
  %incdec.ptr1 = getelementptr inbounds float, float* %pSrc.addr.014, i32 2
  %12 = load float, float* %incdec.ptr, align 4
  %mul = fmul fast float %11, %11
  %mul2 = fmul fast float %12, %12
  %add = fadd fast float %mul2, %mul
  %incdec.ptr3 = getelementptr inbounds float, float* %pDst.addr.013, i32 1
  store float %add, float* %pDst.addr.013, align 4
  %dec = add i32 %blkCnt.012, -1
  %cmp.not = icmp eq i32 %dec, 0
  br i1 %cmp.not, label %while.end, label %while.body

while.end: ; preds = %while.body, %middle.block, %entry
  ret void
}