• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -o - %s | FileCheck --check-prefix=CHECK %s
3
; @tailpred: element-wise half-precision add, pDst[i] = pSrcB[i] + pSrcA[i]
; for i in [0, blockSize). The IR contains a runtime overlap check that picks
; either a scalar loop (%while.body) or an 8-lane vector loop (%vector.body)
; whose loads and store are masked by llvm.get.active.lane.mask.
; The autogenerated assertions below expect the masked vector loop to come out
; as a dlstp.16/letp loop with no separate predication instructions in its
; body, and the scalar loop to come out as a dls/le loop.
4define void @tailpred(half* nocapture readonly %pSrcA, half* nocapture readonly %pSrcB, half* nocapture %pDst, i32 %blockSize) {
5; CHECK-LABEL: tailpred:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    .save {r4, r5, r7, lr}
8; CHECK-NEXT:    push {r4, r5, r7, lr}
9; CHECK-NEXT:    cmp r3, #0
10; CHECK-NEXT:    beq .LBB0_6
11; CHECK-NEXT:  @ %bb.1: @ %vector.memcheck
12; CHECK-NEXT:    add.w r5, r2, r3, lsl #1
13; CHECK-NEXT:    add.w r4, r1, r3, lsl #1
14; CHECK-NEXT:    cmp r5, r1
15; CHECK-NEXT:    cset r12, hi
16; CHECK-NEXT:    cmp r4, r2
17; CHECK-NEXT:    cset lr, hi
18; CHECK-NEXT:    cmp r5, r0
19; CHECK-NEXT:    add.w r5, r0, r3, lsl #1
20; CHECK-NEXT:    cset r4, hi
21; CHECK-NEXT:    cmp r5, r2
22; CHECK-NEXT:    cset r5, hi
23; CHECK-NEXT:    ands r4, r5
24; CHECK-NEXT:    lsls r4, r4, #31
25; CHECK-NEXT:    itt eq
26; CHECK-NEXT:    andeq.w r5, lr, r12
27; CHECK-NEXT:    lslseq.w r5, r5, #31
28; CHECK-NEXT:    beq .LBB0_4
29; CHECK-NEXT:  @ %bb.2: @ %while.body.preheader
30; CHECK-NEXT:    dls lr, r3
31; CHECK-NEXT:  .LBB0_3: @ %while.body
32; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
33; CHECK-NEXT:    vldr.16 s0, [r0]
34; CHECK-NEXT:    vldr.16 s2, [r1]
35; CHECK-NEXT:    adds r1, #2
36; CHECK-NEXT:    adds r0, #2
37; CHECK-NEXT:    vadd.f16 s0, s2, s0
38; CHECK-NEXT:    vstr.16 s0, [r2]
39; CHECK-NEXT:    adds r2, #2
40; CHECK-NEXT:    le lr, .LBB0_3
41; CHECK-NEXT:    b .LBB0_6
42; CHECK-NEXT:  .LBB0_4: @ %vector.ph
43; CHECK-NEXT:    dlstp.16 lr, r3
44; CHECK-NEXT:  .LBB0_5: @ %vector.body
45; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
46; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
47; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
48; CHECK-NEXT:    vadd.f16 q0, q1, q0
49; CHECK-NEXT:    vstrh.16 q0, [r2], #16
50; CHECK-NEXT:    letp lr, .LBB0_5
51; CHECK-NEXT:  .LBB0_6: @ %while.end
52; CHECK-NEXT:    pop {r4, r5, r7, pc}
; Nothing to do for an empty input: skip straight to the return.
53entry:
54  %cmp.not6 = icmp eq i32 %blockSize, 0
55  br i1 %cmp.not6, label %while.end, label %vector.memcheck
56
; Runtime overlap check. Computes the one-past-the-end pointer of each array
; and flags a conflict when the %pDst range intersects the %pSrcA range
; (%found.conflict) or the %pSrcB range (%found.conflict21). Any conflict
; selects the scalar loop; otherwise the vector loop is used.
57vector.memcheck:                                  ; preds = %entry
58  %scevgep = getelementptr half, half* %pDst, i32 %blockSize
59  %scevgep14 = getelementptr half, half* %pSrcA, i32 %blockSize
60  %scevgep17 = getelementptr half, half* %pSrcB, i32 %blockSize
61  %bound0 = icmp ugt half* %scevgep14, %pDst
62  %bound1 = icmp ugt half* %scevgep, %pSrcA
63  %found.conflict = and i1 %bound0, %bound1
64  %bound019 = icmp ugt half* %scevgep17, %pDst
65  %bound120 = icmp ugt half* %scevgep, %pSrcB
66  %found.conflict21 = and i1 %bound019, %bound120
67  %conflict.rdx = or i1 %found.conflict, %found.conflict21
68  br i1 %conflict.rdx, label %while.body, label %vector.ph
69
; %n.vec is %blockSize rounded up to a multiple of 8 (add 7, clear low 3 bits),
; so the vector loop also covers a partial final group of lanes.
70vector.ph:                                        ; preds = %vector.memcheck
71  %n.rnd.up = add i32 %blockSize, 7
72  %n.vec = and i32 %n.rnd.up, -8
73  br label %vector.body
74
; Vector loop: 8 half lanes per iteration. The lane mask is built from the
; current element index and the total element count, and the same mask guards
; both loads and the store (see the LangRef entry for
; llvm.get.active.lane.mask for the exact lane semantics). The loop exits
; when the index reaches the rounded-up count %n.vec.
75vector.body:                                      ; preds = %vector.body, %vector.ph
76  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
77  %next.gep = getelementptr half, half* %pSrcA, i32 %index
78  %next.gep28 = getelementptr half, half* %pDst, i32 %index
79  %next.gep29 = getelementptr half, half* %pSrcB, i32 %index
80  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %blockSize)
81  %0 = bitcast half* %next.gep to <8 x half>*
82  %wide.masked.load = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %active.lane.mask, <8 x half> undef)
83  %1 = bitcast half* %next.gep29 to <8 x half>*
84  %wide.masked.load32 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x half> undef)
85  %2 = fadd fast <8 x half> %wide.masked.load32, %wide.masked.load
86  %3 = bitcast half* %next.gep28 to <8 x half>*
87  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %2, <8 x half>* %3, i32 2, <8 x i1> %active.lane.mask)
88  %index.next = add i32 %index, 8
89  %4 = icmp eq i32 %index.next, %n.vec
90  br i1 %4, label %while.end, label %vector.body
91
; Scalar fallback, reached only when the overlap check reports a conflict.
; Handles one half element per iteration: %blkCnt.010 counts down from
; %blockSize to 0 while the three pointers each advance by one element.
92while.body:                                       ; preds = %vector.memcheck, %while.body
93  %blkCnt.010 = phi i32 [ %dec, %while.body ], [ %blockSize, %vector.memcheck ]
94  %pSrcA.addr.09 = phi half* [ %incdec.ptr, %while.body ], [ %pSrcA, %vector.memcheck ]
95  %pDst.addr.08 = phi half* [ %incdec.ptr3, %while.body ], [ %pDst, %vector.memcheck ]
96  %pSrcB.addr.07 = phi half* [ %incdec.ptr1, %while.body ], [ %pSrcB, %vector.memcheck ]
97  %incdec.ptr = getelementptr inbounds half, half* %pSrcA.addr.09, i32 1
98  %5 = load half, half* %pSrcA.addr.09, align 2
99  %incdec.ptr1 = getelementptr inbounds half, half* %pSrcB.addr.07, i32 1
100  %6 = load half, half* %pSrcB.addr.07, align 2
101  %7 = fadd fast half %6, %5
102  %incdec.ptr3 = getelementptr inbounds half, half* %pDst.addr.08, i32 1
103  store half %7, half* %pDst.addr.08, align 2
104  %dec = add i32 %blkCnt.010, -1
105  %cmp.not = icmp eq i32 %dec, 0
106  br i1 %cmp.not, label %while.end, label %while.body
107
; Common exit for the empty-input, vector and scalar paths.
108while.end:                                        ; preds = %vector.body, %while.body, %entry
109  ret void
110}
111
; @notailpred: the same element-wise half-precision add as a vector loop
; followed by a scalar remainder loop, with no lane masking. The vector loop
; uses plain <8 x half> loads and stores over %blockSize & -8 elements, and the
; scalar loop handles whatever is left (or the whole input when it is shorter
; than 8 elements or the overlap check reports a conflict).
; The autogenerated assertions below expect both loops to come out as dls/le
; loops; no dlstp/letp instructions are expected for this function.
112define void @notailpred(half* nocapture readonly %pSrcA, half* nocapture readonly %pSrcB, half* nocapture %pDst, i32 %blockSize) {
113; CHECK-LABEL: notailpred:
114; CHECK:       @ %bb.0: @ %entry
115; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
116; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
117; CHECK-NEXT:    cbz r3, .LBB1_6
118; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
119; CHECK-NEXT:    cmp r3, #8
120; CHECK-NEXT:    blo .LBB1_3
121; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
122; CHECK-NEXT:    add.w r5, r2, r3, lsl #1
123; CHECK-NEXT:    add.w r6, r1, r3, lsl #1
124; CHECK-NEXT:    cmp r5, r1
125; CHECK-NEXT:    add.w r4, r0, r3, lsl #1
126; CHECK-NEXT:    cset r7, hi
127; CHECK-NEXT:    cmp r6, r2
128; CHECK-NEXT:    cset r6, hi
129; CHECK-NEXT:    cmp r5, r0
130; CHECK-NEXT:    cset r5, hi
131; CHECK-NEXT:    cmp r4, r2
132; CHECK-NEXT:    cset r4, hi
133; CHECK-NEXT:    ands r5, r4
134; CHECK-NEXT:    lsls r5, r5, #31
135; CHECK-NEXT:    itt eq
136; CHECK-NEXT:    andeq r7, r6
137; CHECK-NEXT:    lslseq.w r7, r7, #31
138; CHECK-NEXT:    beq .LBB1_7
139; CHECK-NEXT:  .LBB1_3:
140; CHECK-NEXT:    mov r5, r3
141; CHECK-NEXT:    mov r12, r0
142; CHECK-NEXT:    mov r7, r2
143; CHECK-NEXT:    mov r4, r1
144; CHECK-NEXT:  .LBB1_4: @ %while.body.preheader31
145; CHECK-NEXT:    dls lr, r5
146; CHECK-NEXT:  .LBB1_5: @ %while.body
147; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
148; CHECK-NEXT:    vldr.16 s0, [r12]
149; CHECK-NEXT:    vldr.16 s2, [r4]
150; CHECK-NEXT:    adds r4, #2
151; CHECK-NEXT:    add.w r12, r12, #2
152; CHECK-NEXT:    vadd.f16 s0, s2, s0
153; CHECK-NEXT:    vstr.16 s0, [r7]
154; CHECK-NEXT:    adds r7, #2
155; CHECK-NEXT:    le lr, .LBB1_5
156; CHECK-NEXT:  .LBB1_6: @ %while.end
157; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
158; CHECK-NEXT:  .LBB1_7: @ %vector.ph
159; CHECK-NEXT:    bic r6, r3, #7
160; CHECK-NEXT:    movs r5, #1
161; CHECK-NEXT:    sub.w r7, r6, #8
162; CHECK-NEXT:    add.w r4, r1, r6, lsl #1
163; CHECK-NEXT:    add.w r12, r0, r6, lsl #1
164; CHECK-NEXT:    add.w lr, r5, r7, lsr #3
165; CHECK-NEXT:    add.w r7, r2, r6, lsl #1
166; CHECK-NEXT:    dls lr, lr
167; CHECK-NEXT:    and r5, r3, #7
168; CHECK-NEXT:  .LBB1_8: @ %vector.body
169; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
170; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
171; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
172; CHECK-NEXT:    vadd.f16 q0, q1, q0
173; CHECK-NEXT:    vstrb.8 q0, [r2], #16
174; CHECK-NEXT:    le lr, .LBB1_8
175; CHECK-NEXT:  @ %bb.9: @ %middle.block
176; CHECK-NEXT:    cmp r6, r3
177; CHECK-NEXT:    bne .LBB1_4
178; CHECK-NEXT:    b .LBB1_6
; Nothing to do for an empty input: skip straight to the return.
179entry:
180  %cmp.not6 = icmp eq i32 %blockSize, 0
181  br i1 %cmp.not6, label %while.end, label %while.body.preheader
182
; Minimum-iteration check: inputs shorter than one full 8-lane vector go
; directly to the scalar loop.
183while.body.preheader:                             ; preds = %entry
184  %min.iters.check = icmp ult i32 %blockSize, 8
185  br i1 %min.iters.check, label %while.body.preheader31, label %vector.memcheck
186
; Runtime overlap check. Computes the one-past-the-end pointer of each array
; and flags a conflict when the %pDst range intersects the %pSrcA range
; (%found.conflict) or the %pSrcB range (%found.conflict21). Any conflict
; selects the scalar loop for the whole input.
187vector.memcheck:                                  ; preds = %while.body.preheader
188  %scevgep = getelementptr half, half* %pDst, i32 %blockSize
189  %scevgep14 = getelementptr half, half* %pSrcA, i32 %blockSize
190  %scevgep17 = getelementptr half, half* %pSrcB, i32 %blockSize
191  %bound0 = icmp ugt half* %scevgep14, %pDst
192  %bound1 = icmp ugt half* %scevgep, %pSrcA
193  %found.conflict = and i1 %bound0, %bound1
194  %bound019 = icmp ugt half* %scevgep17, %pDst
195  %bound120 = icmp ugt half* %scevgep, %pSrcB
196  %found.conflict21 = and i1 %bound019, %bound120
197  %conflict.rdx = or i1 %found.conflict, %found.conflict21
198  br i1 %conflict.rdx, label %while.body.preheader31, label %vector.ph
199
; %n.vec is %blockSize rounded down to a multiple of 8 (elements handled by the
; vector loop); %ind.end is the leftover element count (%blockSize & 7). The
; %ind.end23/25/27 pointers are where the scalar remainder loop resumes.
200vector.ph:                                        ; preds = %vector.memcheck
201  %n.vec = and i32 %blockSize, -8
202  %ind.end = and i32 %blockSize, 7
203  %ind.end23 = getelementptr half, half* %pSrcA, i32 %n.vec
204  %ind.end25 = getelementptr half, half* %pDst, i32 %n.vec
205  %ind.end27 = getelementptr half, half* %pSrcB, i32 %n.vec
206  br label %vector.body
207
; Vector loop: 8 half lanes per iteration using ordinary (unmasked) loads and
; a store with 2-byte alignment. Runs until the index reaches %n.vec.
208vector.body:                                      ; preds = %vector.body, %vector.ph
209  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
210  %next.gep = getelementptr half, half* %pSrcA, i32 %index
211  %next.gep28 = getelementptr half, half* %pDst, i32 %index
212  %next.gep29 = getelementptr half, half* %pSrcB, i32 %index
213  %0 = bitcast half* %next.gep to <8 x half>*
214  %wide.load = load <8 x half>, <8 x half>* %0, align 2
215  %1 = bitcast half* %next.gep29 to <8 x half>*
216  %wide.load30 = load <8 x half>, <8 x half>* %1, align 2
217  %2 = fadd fast <8 x half> %wide.load30, %wide.load
218  %3 = bitcast half* %next.gep28 to <8 x half>*
219  store <8 x half> %2, <8 x half>* %3, align 2
220  %index.next = add i32 %index, 8
221  %4 = icmp eq i32 %index.next, %n.vec
222  br i1 %4, label %middle.block, label %vector.body
223
; After the vector loop: finish if it covered every element, otherwise hand the
; remaining elements to the scalar loop.
224middle.block:                                     ; preds = %vector.body
225  %cmp.n = icmp eq i32 %n.vec, %blockSize
226  br i1 %cmp.n, label %while.end, label %while.body.preheader31
227
; Scalar loop preheader. The phis pick the starting state: the full count and
; original pointers when arriving from the short-input or overlap paths, or
; the remainder count and advanced pointers when arriving from %middle.block.
228while.body.preheader31:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
229  %blkCnt.010.ph = phi i32 [ %blockSize, %vector.memcheck ], [ %blockSize, %while.body.preheader ], [ %ind.end, %middle.block ]
230  %pSrcA.addr.09.ph = phi half* [ %pSrcA, %vector.memcheck ], [ %pSrcA, %while.body.preheader ], [ %ind.end23, %middle.block ]
231  %pDst.addr.08.ph = phi half* [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end25, %middle.block ]
232  %pSrcB.addr.07.ph = phi half* [ %pSrcB, %vector.memcheck ], [ %pSrcB, %while.body.preheader ], [ %ind.end27, %middle.block ]
233  br label %while.body
234
; Scalar loop: one half element per iteration. %blkCnt.010 counts down to 0
; while the three pointers each advance by one element.
235while.body:                                       ; preds = %while.body.preheader31, %while.body
236  %blkCnt.010 = phi i32 [ %dec, %while.body ], [ %blkCnt.010.ph, %while.body.preheader31 ]
237  %pSrcA.addr.09 = phi half* [ %incdec.ptr, %while.body ], [ %pSrcA.addr.09.ph, %while.body.preheader31 ]
238  %pDst.addr.08 = phi half* [ %incdec.ptr3, %while.body ], [ %pDst.addr.08.ph, %while.body.preheader31 ]
239  %pSrcB.addr.07 = phi half* [ %incdec.ptr1, %while.body ], [ %pSrcB.addr.07.ph, %while.body.preheader31 ]
240  %incdec.ptr = getelementptr inbounds half, half* %pSrcA.addr.09, i32 1
241  %5 = load half, half* %pSrcA.addr.09, align 2
242  %incdec.ptr1 = getelementptr inbounds half, half* %pSrcB.addr.07, i32 1
243  %6 = load half, half* %pSrcB.addr.07, align 2
244  %7 = fadd fast half %6, %5
245  %incdec.ptr3 = getelementptr inbounds half, half* %pDst.addr.08, i32 1
246  store half %7, half* %pDst.addr.08, align 2
247  %dec = add i32 %blkCnt.010, -1
248  %cmp.not = icmp eq i32 %dec, 0
249  br i1 %cmp.not, label %while.end, label %while.body
250
; Common exit for the empty-input, vector-only and scalar paths.
251while.end:                                        ; preds = %while.body, %middle.block, %entry
252  ret void
253}
254
; Intrinsics called from the vector body of @tailpred: the lane-mask generator
; plus the masked <8 x half> load and store that consume the mask.
; NOTE(review): attribute groups #1, #2 and #3 are referenced here but no
; matching "attributes #N = { ... }" definitions appear in the visible part of
; this file - confirm whether they are defined elsewhere or intentionally empty.
255declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1
256declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32 immarg, <8 x i1>, <8 x half>) #2
257declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32 immarg, <8 x i1>) #3
258