• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s
3
; arm_cmplx_mag_squared_f16: for each of %numSamples samples, read two
; consecutive half values (a, b) from %pSrc and store a*a + b*b to %pDst.
; The IR below is already loop-vectorized: a vector loop producing 8 results
; per iteration (guarded by a minimum-trip-count check and a runtime overlap
; check) followed by a scalar loop that handles the remainder.
; The CHECK lines expect the even/odd shuffles of the wide load to be selected
; as a vld20.16/vld21.16 pair, the multiply/add as vmul.f16 + vfma.f16, and
; both loops to use the dls/le loop instructions.
4define void @arm_cmplx_mag_squared_f16(half* nocapture readonly %pSrc, half* nocapture %pDst, i32 %numSamples) {
5; CHECK-LABEL: arm_cmplx_mag_squared_f16:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    .save {r4, r5, r7, lr}
8; CHECK-NEXT:    push {r4, r5, r7, lr}
9; CHECK-NEXT:    cmp r2, #0
10; CHECK-NEXT:    beq .LBB0_8
11; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
12; CHECK-NEXT:    cmp r2, #8
13; CHECK-NEXT:    blo .LBB0_9
14; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
15; CHECK-NEXT:    add.w r3, r0, r2, lsl #2
16; CHECK-NEXT:    cmp r3, r1
17; CHECK-NEXT:    itt hi
18; CHECK-NEXT:    addhi.w r3, r1, r2, lsl #1
19; CHECK-NEXT:    cmphi r3, r0
20; CHECK-NEXT:    bhi .LBB0_9
21; CHECK-NEXT:  @ %bb.3: @ %vector.ph
22; CHECK-NEXT:    bic r4, r2, #7
23; CHECK-NEXT:    movs r5, #1
24; CHECK-NEXT:    sub.w r3, r4, #8
25; CHECK-NEXT:    add.w r12, r1, r4, lsl #1
26; CHECK-NEXT:    add.w lr, r5, r3, lsr #3
27; CHECK-NEXT:    add.w r3, r0, r4, lsl #2
28; CHECK-NEXT:    dls lr, lr
29; CHECK-NEXT:    and r5, r2, #7
30; CHECK-NEXT:  .LBB0_4: @ %vector.body
31; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
32; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
33; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
34; CHECK-NEXT:    vmul.f16 q2, q0, q0
35; CHECK-NEXT:    vfma.f16 q2, q1, q1
36; CHECK-NEXT:    vstrb.8 q2, [r1], #16
37; CHECK-NEXT:    le lr, .LBB0_4
38; CHECK-NEXT:  @ %bb.5: @ %middle.block
39; CHECK-NEXT:    cmp r4, r2
40; CHECK-NEXT:    it eq
41; CHECK-NEXT:    popeq {r4, r5, r7, pc}
42; CHECK-NEXT:  .LBB0_6: @ %while.body.preheader26
43; CHECK-NEXT:    dls lr, r5
44; CHECK-NEXT:  .LBB0_7: @ %while.body
45; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
46; CHECK-NEXT:    vldr.16 s0, [r3]
47; CHECK-NEXT:    vldr.16 s2, [r3, #2]
48; CHECK-NEXT:    adds r3, #4
49; CHECK-NEXT:    vmul.f16 s0, s0, s0
50; CHECK-NEXT:    vfma.f16 s0, s2, s2
51; CHECK-NEXT:    vstr.16 s0, [r12]
52; CHECK-NEXT:    add.w r12, r12, #2
53; CHECK-NEXT:    le lr, .LBB0_7
54; CHECK-NEXT:  .LBB0_8: @ %while.end
55; CHECK-NEXT:    pop {r4, r5, r7, pc}
56; CHECK-NEXT:  .LBB0_9:
57; CHECK-NEXT:    mov r3, r0
58; CHECK-NEXT:    mov r12, r1
59; CHECK-NEXT:    mov r5, r2
60; CHECK-NEXT:    b .LBB0_6
; Zero samples: return immediately.
61entry:
62  %cmp.not11 = icmp eq i32 %numSamples, 0
63  br i1 %cmp.not11, label %while.end, label %while.body.preheader
64
; Fewer than 8 samples cannot fill one vector iteration: go straight to the
; scalar loop.
65while.body.preheader:                             ; preds = %entry
66  %min.iters.check = icmp ult i32 %numSamples, 8
67  br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck
68
; Runtime overlap check. The source range is [pSrc, pSrc + 2*numSamples) and
; the destination range is [pDst, pDst + numSamples), both in half elements.
; If each range ends above the other's start they overlap, and the scalar loop
; is used instead of the vector loop.
69vector.memcheck:                                  ; preds = %while.body.preheader
70  %scevgep = getelementptr half, half* %pDst, i32 %numSamples
71  %0 = shl i32 %numSamples, 1
72  %scevgep18 = getelementptr half, half* %pSrc, i32 %0
73  %bound0 = icmp ugt half* %scevgep18, %pDst
74  %bound1 = icmp ugt half* %scevgep, %pSrc
75  %found.conflict = and i1 %bound0, %bound1
76  br i1 %found.conflict, label %while.body.preheader26, label %vector.ph
77
; %n.vec is numSamples rounded down to a multiple of 8 (samples handled by the
; vector loop). %ind.end / %ind.end21 / %ind.end23 are the source pointer,
; destination pointer and remaining sample count at which the scalar loop
; resumes afterwards.
78vector.ph:                                        ; preds = %vector.memcheck
79  %n.vec = and i32 %numSamples, -8
80  %1 = shl i32 %n.vec, 1
81  %ind.end = getelementptr half, half* %pSrc, i32 %1
82  %ind.end21 = getelementptr half, half* %pDst, i32 %n.vec
83  %ind.end23 = and i32 %numSamples, 7
84  br label %vector.body
85
; Vector loop: one wide load of 16 halves (8 samples), square every lane, then
; add the even-indexed lanes to the odd-indexed lanes to form 8 results.
; The wide square is written twice (%4 and %6) with identical operands; one
; copy feeds the even-lane shuffle and the other the odd-lane shuffle.
86vector.body:                                      ; preds = %vector.body, %vector.ph
87  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
88  %2 = shl i32 %index, 1
89  %next.gep = getelementptr half, half* %pSrc, i32 %2
90  %next.gep24 = getelementptr half, half* %pDst, i32 %index
91  %3 = bitcast half* %next.gep to <16 x half>*
92  %wide.vec = load <16 x half>, <16 x half>* %3, align 2
93  %4 = fmul fast <16 x half> %wide.vec, %wide.vec
94  %5 = shufflevector <16 x half> %4, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
95  %6 = fmul fast <16 x half> %wide.vec, %wide.vec
96  %7 = shufflevector <16 x half> %6, <16 x half> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
97  %8 = fadd fast <8 x half> %7, %5
98  %9 = bitcast half* %next.gep24 to <8 x half>*
99  store <8 x half> %8, <8 x half>* %9, align 2
100  %index.next = add i32 %index, 8
101  %10 = icmp eq i32 %index.next, %n.vec
102  br i1 %10, label %middle.block, label %vector.body
103
; If the vector loop covered every sample there is no remainder to process.
104middle.block:                                     ; preds = %vector.body
105  %cmp.n = icmp eq i32 %n.vec, %numSamples
106  br i1 %cmp.n, label %while.end, label %while.body.preheader26
107
; Scalar loop entry. Starts from the original pointers and full count when a
; guard above rejected the vector loop, or from the resume values computed in
; vector.ph when arriving from middle.block.
108while.body.preheader26:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
109  %pSrc.addr.014.ph = phi half* [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
110  %pDst.addr.013.ph = phi half* [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
111  %blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
112  br label %while.body
113
; Scalar loop, one sample per iteration:
;   dst[0] = src[0]*src[0] + src[1]*src[1]; src += 2; dst += 1; count -= 1
114while.body:                                       ; preds = %while.body.preheader26, %while.body
115  %pSrc.addr.014 = phi half* [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
116  %pDst.addr.013 = phi half* [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
117  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
118  %incdec.ptr = getelementptr inbounds half, half* %pSrc.addr.014, i32 1
119  %11 = load half, half* %pSrc.addr.014, align 2
120  %incdec.ptr1 = getelementptr inbounds half, half* %pSrc.addr.014, i32 2
121  %12 = load half, half* %incdec.ptr, align 2
122  %mul = fmul fast half %11, %11
123  %mul2 = fmul fast half %12, %12
124  %add = fadd fast half %mul2, %mul
125  %incdec.ptr3 = getelementptr inbounds half, half* %pDst.addr.013, i32 1
126  store half %add, half* %pDst.addr.013, align 2
127  %dec = add i32 %blkCnt.012, -1
128  %cmp.not = icmp eq i32 %dec, 0
129  br i1 %cmp.not, label %while.end, label %while.body
130
131while.end:                                        ; preds = %while.body, %middle.block, %entry
132  ret void
133}
134
; arm_cmplx_mag_squared_f32: for each of %numSamples samples, read two
; consecutive float values (a, b) from %pSrc and store a*a + b*b to %pDst.
; The IR below is already loop-vectorized: a vector loop producing 4 results
; per iteration (guarded by a minimum-trip-count check and a runtime overlap
; check) followed by a scalar loop that handles the remainder.
; The CHECK lines expect the even/odd shuffles of the wide load to be selected
; as a vld20.32/vld21.32 pair, the multiply/add as vmul.f32 + vfma.f32, and
; both loops to use the dls/le loop instructions.
135define void @arm_cmplx_mag_squared_f32(float* nocapture readonly %pSrc, float* nocapture %pDst, i32 %numSamples) {
136; CHECK-LABEL: arm_cmplx_mag_squared_f32:
137; CHECK:       @ %bb.0: @ %entry
138; CHECK-NEXT:    .save {r4, r5, r7, lr}
139; CHECK-NEXT:    push {r4, r5, r7, lr}
140; CHECK-NEXT:    cmp r2, #0
141; CHECK-NEXT:    beq .LBB1_8
142; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
143; CHECK-NEXT:    cmp r2, #4
144; CHECK-NEXT:    blo .LBB1_9
145; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
146; CHECK-NEXT:    add.w r3, r0, r2, lsl #3
147; CHECK-NEXT:    cmp r3, r1
148; CHECK-NEXT:    itt hi
149; CHECK-NEXT:    addhi.w r3, r1, r2, lsl #2
150; CHECK-NEXT:    cmphi r3, r0
151; CHECK-NEXT:    bhi .LBB1_9
152; CHECK-NEXT:  @ %bb.3: @ %vector.ph
153; CHECK-NEXT:    bic r4, r2, #3
154; CHECK-NEXT:    movs r5, #1
155; CHECK-NEXT:    subs r3, r4, #4
156; CHECK-NEXT:    add.w r12, r1, r4, lsl #2
157; CHECK-NEXT:    add.w lr, r5, r3, lsr #2
158; CHECK-NEXT:    add.w r3, r0, r4, lsl #3
159; CHECK-NEXT:    dls lr, lr
160; CHECK-NEXT:    and r5, r2, #3
161; CHECK-NEXT:  .LBB1_4: @ %vector.body
162; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
163; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
164; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]!
165; CHECK-NEXT:    vmul.f32 q2, q0, q0
166; CHECK-NEXT:    vfma.f32 q2, q1, q1
167; CHECK-NEXT:    vstrb.8 q2, [r1], #16
168; CHECK-NEXT:    le lr, .LBB1_4
169; CHECK-NEXT:  @ %bb.5: @ %middle.block
170; CHECK-NEXT:    cmp r4, r2
171; CHECK-NEXT:    it eq
172; CHECK-NEXT:    popeq {r4, r5, r7, pc}
173; CHECK-NEXT:  .LBB1_6: @ %while.body.preheader26
174; CHECK-NEXT:    dls lr, r5
175; CHECK-NEXT:  .LBB1_7: @ %while.body
176; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
177; CHECK-NEXT:    vldr s0, [r3]
178; CHECK-NEXT:    vldr s2, [r3, #4]
179; CHECK-NEXT:    adds r3, #8
180; CHECK-NEXT:    vmul.f32 s0, s0, s0
181; CHECK-NEXT:    vfma.f32 s0, s2, s2
182; CHECK-NEXT:    vstr s0, [r12]
183; CHECK-NEXT:    add.w r12, r12, #4
184; CHECK-NEXT:    le lr, .LBB1_7
185; CHECK-NEXT:  .LBB1_8: @ %while.end
186; CHECK-NEXT:    pop {r4, r5, r7, pc}
187; CHECK-NEXT:  .LBB1_9:
188; CHECK-NEXT:    mov r3, r0
189; CHECK-NEXT:    mov r12, r1
190; CHECK-NEXT:    mov r5, r2
191; CHECK-NEXT:    b .LBB1_6
; Zero samples: return immediately.
192entry:
193  %cmp.not11 = icmp eq i32 %numSamples, 0
194  br i1 %cmp.not11, label %while.end, label %while.body.preheader
195
; Fewer than 4 samples cannot fill one vector iteration: go straight to the
; scalar loop.
196while.body.preheader:                             ; preds = %entry
197  %min.iters.check = icmp ult i32 %numSamples, 4
198  br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck
199
; Runtime overlap check. The source range is [pSrc, pSrc + 2*numSamples) and
; the destination range is [pDst, pDst + numSamples), both in float elements.
; If each range ends above the other's start they overlap, and the scalar loop
; is used instead of the vector loop.
200vector.memcheck:                                  ; preds = %while.body.preheader
201  %scevgep = getelementptr float, float* %pDst, i32 %numSamples
202  %0 = shl i32 %numSamples, 1
203  %scevgep18 = getelementptr float, float* %pSrc, i32 %0
204  %bound0 = icmp ugt float* %scevgep18, %pDst
205  %bound1 = icmp ugt float* %scevgep, %pSrc
206  %found.conflict = and i1 %bound0, %bound1
207  br i1 %found.conflict, label %while.body.preheader26, label %vector.ph
208
; %n.vec is numSamples rounded down to a multiple of 4 (samples handled by the
; vector loop). %ind.end / %ind.end21 / %ind.end23 are the source pointer,
; destination pointer and remaining sample count at which the scalar loop
; resumes afterwards.
209vector.ph:                                        ; preds = %vector.memcheck
210  %n.vec = and i32 %numSamples, -4
211  %1 = shl i32 %n.vec, 1
212  %ind.end = getelementptr float, float* %pSrc, i32 %1
213  %ind.end21 = getelementptr float, float* %pDst, i32 %n.vec
214  %ind.end23 = and i32 %numSamples, 3
215  br label %vector.body
216
; Vector loop: one wide load of 8 floats (4 samples), square every lane, then
; add the even-indexed lanes to the odd-indexed lanes to form 4 results.
; The wide square is written twice (%4 and %6) with identical operands; one
; copy feeds the even-lane shuffle and the other the odd-lane shuffle.
217vector.body:                                      ; preds = %vector.body, %vector.ph
218  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
219  %2 = shl i32 %index, 1
220  %next.gep = getelementptr float, float* %pSrc, i32 %2
221  %next.gep24 = getelementptr float, float* %pDst, i32 %index
222  %3 = bitcast float* %next.gep to <8 x float>*
223  %wide.vec = load <8 x float>, <8 x float>* %3, align 4
224  %4 = fmul fast <8 x float> %wide.vec, %wide.vec
225  %5 = shufflevector <8 x float> %4, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
226  %6 = fmul fast <8 x float> %wide.vec, %wide.vec
227  %7 = shufflevector <8 x float> %6, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
228  %8 = fadd fast <4 x float> %7, %5
229  %9 = bitcast float* %next.gep24 to <4 x float>*
230  store <4 x float> %8, <4 x float>* %9, align 4
231  %index.next = add i32 %index, 4
232  %10 = icmp eq i32 %index.next, %n.vec
233  br i1 %10, label %middle.block, label %vector.body
234
; If the vector loop covered every sample there is no remainder to process.
235middle.block:                                     ; preds = %vector.body
236  %cmp.n = icmp eq i32 %n.vec, %numSamples
237  br i1 %cmp.n, label %while.end, label %while.body.preheader26
238
; Scalar loop entry. Starts from the original pointers and full count when a
; guard above rejected the vector loop, or from the resume values computed in
; vector.ph when arriving from middle.block.
239while.body.preheader26:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
240  %pSrc.addr.014.ph = phi float* [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
241  %pDst.addr.013.ph = phi float* [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
242  %blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
243  br label %while.body
244
; Scalar loop, one sample per iteration:
;   dst[0] = src[0]*src[0] + src[1]*src[1]; src += 2; dst += 1; count -= 1
245while.body:                                       ; preds = %while.body.preheader26, %while.body
246  %pSrc.addr.014 = phi float* [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
247  %pDst.addr.013 = phi float* [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
248  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
249  %incdec.ptr = getelementptr inbounds float, float* %pSrc.addr.014, i32 1
250  %11 = load float, float* %pSrc.addr.014, align 4
251  %incdec.ptr1 = getelementptr inbounds float, float* %pSrc.addr.014, i32 2
252  %12 = load float, float* %incdec.ptr, align 4
253  %mul = fmul fast float %11, %11
254  %mul2 = fmul fast float %12, %12
255  %add = fadd fast float %mul2, %mul
256  %incdec.ptr3 = getelementptr inbounds float, float* %pDst.addr.013, i32 1
257  store float %add, float* %pDst.addr.013, align 4
258  %dec = add i32 %blkCnt.012, -1
259  %cmp.not = icmp eq i32 %dec, 0
260  br i1 %cmp.not, label %while.end, label %while.body
261
262while.end:                                        ; preds = %while.body, %middle.block, %entry
263  ret void
264}
265