• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=armv8.1m.main -mattr=+mve -tail-predication=enabled --verify-machineinstrs %s -o - | FileCheck %s

; Complex magnitude squared (Q15) kernel using MVE vld2 de-interleaving loads.
; The CHECK lines verify that the explicit vctp16-predicated do/while loop is
; converted into a tail-predicated low-overhead loop (dlstp.16 / letp), with
; the per-iteration predicate folded out of the vector operations.
define void @arm_cmplx_mag_squared_q15_mve(i16* %pSrc, i16* %pDst, i32 %blockSize) {
; CHECK-LABEL: arm_cmplx_mag_squared_q15_mve:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    subs.w r12, r2, #8
; CHECK-NEXT:    dlstp.16 lr, r2
; CHECK-NEXT:  .LBB0_1: @ %do.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
; CHECK-NEXT:    vmulh.s16 q2, q1, q1
; CHECK-NEXT:    vmulh.s16 q0, q0, q0
; CHECK-NEXT:    vqadd.s16 q0, q0, q2
; CHECK-NEXT:    vshr.s16 q0, q0, #1
; CHECK-NEXT:    vstrh.16 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB0_1
; CHECK-NEXT:  @ %bb.2: @ %do.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %do.body

do.body:                                          ; preds = %do.body, %entry
  %blockSize.addr.0 = phi i32 [ %blockSize, %entry ], [ %sub, %do.body ]
  %pDst.addr.0 = phi i16* [ %pDst, %entry ], [ %add.ptr7, %do.body ]
  %pSrc.addr.0 = phi i16* [ %pSrc, %entry ], [ %add.ptr, %do.body ]
  ; Lane predicate covering the elements still to be processed this iteration.
  %0 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %blockSize.addr.0)
  ; vld2q de-interleaves the complex input into two 8 x i16 vectors.
  %1 = tail call { <8 x i16>, <8 x i16> } @llvm.arm.mve.vld2q.v8i16.p0i16(i16* %pSrc.addr.0)
  %2 = extractvalue { <8 x i16>, <8 x i16> } %1, 0
  %3 = extractvalue { <8 x i16>, <8 x i16> } %1, 1
  ; Predicated high-half multiplies, saturating add, then >> 1; all governed
  ; by the same vctp predicate %0.
  %4 = tail call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> %2, <8 x i16> %2, i32 0, <8 x i1> %0, <8 x i16> undef)
  %5 = tail call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> %3, <8 x i16> %3, i32 0, <8 x i1> %0, <8 x i16> undef)
  %6 = tail call <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16> %4, <8 x i16> %5, i32 0, <8 x i1> %0, <8 x i16> undef)
  %7 = tail call <8 x i16> @llvm.arm.mve.shr.imm.predicated.v8i16.v8i1(<8 x i16> %6, i32 1, i32 0, <8 x i1> %0, <8 x i16> undef)
  %8 = bitcast i16* %pDst.addr.0 to <8 x i16>*
  ; Masked store keeps the tail lanes of the destination untouched.
  tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %7, <8 x i16>* %8, i32 2, <8 x i1> %0)
  ; Advance: 16 i16 elements read (8 complex pairs), 8 results written.
  %add.ptr = getelementptr inbounds i16, i16* %pSrc.addr.0, i32 16
  %add.ptr7 = getelementptr inbounds i16, i16* %pDst.addr.0, i32 8
  %sub = add i32 %blockSize.addr.0, -8
  %cmp = icmp sgt i32 %sub, 0
  br i1 %cmp, label %do.body, label %do.end

do.end:                                           ; preds = %do.body
  ret void
}
48
; "bad": the load of %y is an UNpredicated vector load, so the loop cannot be
; fully tail-predicated. The CHECK lines expect a plain low-overhead loop
; (dls / le) that keeps an explicit vctp.32 in the body and only predicates
; the %x load (vpst + vldrwt).
define i32 @bad(i32* readonly %x, i32* nocapture readonly %y, i32 %n) {
; CHECK-LABEL: bad:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov r3, r2
; CHECK-NEXT:    cmp r2, #4
; CHECK-NEXT:    it ge
; CHECK-NEXT:    movge r3, #4
; CHECK-NEXT:    subs r3, r2, r3
; CHECK-NEXT:    add.w r12, r3, #3
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB1_1: @ %do.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.32 r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vmlava.s32 r12, q0, q1
; CHECK-NEXT:    le lr, .LBB1_1
; CHECK-NEXT:  @ %bb.2: @ %do.end
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %do.body

do.body:                                          ; preds = %do.body, %entry
  %s.0 = phi i32 [ 0, %entry ], [ %5, %do.body ]
  %n.addr.0 = phi i32 [ %n, %entry ], [ %sub, %do.body ]
  %y.addr.0 = phi i32* [ %y, %entry ], [ %add.ptr1, %do.body ]
  %x.addr.0 = phi i32* [ %x, %entry ], [ %add.ptr, %do.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %n.addr.0)
  %1 = bitcast i32* %x.addr.0 to <4 x i32>*
  ; %x is loaded under the vctp predicate ...
  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %add.ptr = getelementptr inbounds i32, i32* %x.addr.0, i32 4
  %3 = bitcast i32* %y.addr.0 to <4 x i32>*
  ; ... but %y is a plain, unpredicated load: this is what blocks
  ; tail-predication of the whole loop.
  %4 = load <4 x i32>, <4 x i32>* %3, align 4
  %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.0, i32 4
  ; Unpredicated multiply-accumulate-across intrinsic.
  %5 = tail call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 0, i32 %s.0, <4 x i32> %2, <4 x i32> %4)
  %sub = add nsw i32 %n.addr.0, -4
  %cmp = icmp sgt i32 %n.addr.0, 4
  br i1 %cmp, label %do.body, label %do.end

do.end:                                           ; preds = %do.body
  ret i32 %5
}
98
; "good": both inputs are read with vctp-predicated masked loads, so the loop
; can be fully tail-predicated. The CHECK lines expect dlstp.32 / letp and no
; explicit vctp / vpst inside the loop body.
define i32 @good(i32* readonly %x, i32* readonly %y, i32 %n) {
; CHECK-LABEL: good:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB2_1: @ %do.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vmlava.s32 r12, q1, q0
; CHECK-NEXT:    letp lr, .LBB2_1
; CHECK-NEXT:  @ %bb.2: @ %do.end
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %do.body

do.body:                                          ; preds = %do.body, %entry
  %s.0 = phi i32 [ 0, %entry ], [ %5, %do.body ]
  %n.addr.0 = phi i32 [ %n, %entry ], [ %sub, %do.body ]
  %y.addr.0 = phi i32* [ %y, %entry ], [ %add.ptr1, %do.body ]
  %x.addr.0 = phi i32* [ %x, %entry ], [ %add.ptr, %do.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %n.addr.0)
  %1 = bitcast i32* %x.addr.0 to <4 x i32>*
  ; Both loads are masked by the vctp predicate %0 — the key difference
  ; from @bad above.
  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %add.ptr = getelementptr inbounds i32, i32* %x.addr.0, i32 4
  %3 = bitcast i32* %y.addr.0 to <4 x i32>*
  %4 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.0, i32 4
  %5 = tail call i32 @llvm.arm.mve.vmldava.v4i32(i32 0, i32 0, i32 0, i32 %s.0, <4 x i32> %2, <4 x i32> %4)
  %sub = add nsw i32 %n.addr.0, -4
  %cmp = icmp sgt i32 %n.addr.0, 4
  br i1 %cmp, label %do.body, label %do.end

do.end:                                           ; preds = %do.body
  ret i32 %5
}
137
; "good2": the loads are unpredicated plain loads, but the reduction itself
; uses the vmldava.predicated intrinsic governed by the vctp predicate, so
; the loop is still tail-predicated. The CHECK lines expect the same
; dlstp.32 / letp code as @good.
define i32 @good2(i32* nocapture readonly %x, i32* nocapture readonly %y, i32 %n) {
; CHECK-LABEL: good2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB3_1: @ %do.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vmlava.s32 r12, q1, q0
; CHECK-NEXT:    letp lr, .LBB3_1
; CHECK-NEXT:  @ %bb.2: @ %do.end
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %do.body

do.body:                                          ; preds = %do.body, %entry
  %s.0 = phi i32 [ 0, %entry ], [ %5, %do.body ]
  %n.addr.0 = phi i32 [ %n, %entry ], [ %sub, %do.body ]
  %y.addr.0 = phi i32* [ %y, %entry ], [ %add.ptr1, %do.body ]
  %x.addr.0 = phi i32* [ %x, %entry ], [ %add.ptr, %do.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %n.addr.0)
  %1 = bitcast i32* %x.addr.0 to <4 x i32>*
  ; Unpredicated loads here ...
  %2 = load <4 x i32>, <4 x i32>* %1, align 4
  %add.ptr = getelementptr inbounds i32, i32* %x.addr.0, i32 4
  %3 = bitcast i32* %y.addr.0 to <4 x i32>*
  %4 = load <4 x i32>, <4 x i32>* %3, align 4
  %add.ptr1 = getelementptr inbounds i32, i32* %y.addr.0, i32 4
  ; ... but the accumulation is predicated by %0, which carries the
  ; tail-predication requirement.
  %5 = tail call i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32 0, i32 0, i32 0, i32 %s.0, <4 x i32> %2, <4 x i32> %4, <4 x i1> %0)
  %sub = add nsw i32 %n.addr.0, -4
  %cmp = icmp sgt i32 %n.addr.0, 4
  br i1 %cmp, label %do.body, label %do.end

do.end:                                           ; preds = %do.body
  ret i32 %5
}
176
; Declarations of the MVE intrinsics and masked-memory intrinsics used above.
declare <8 x i1> @llvm.arm.mve.vctp16(i32)
declare { <8 x i16>, <8 x i16> } @llvm.arm.mve.vld2q.v8i16.p0i16(i16*)
declare <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>)
declare <8 x i16> @llvm.arm.mve.qadd.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>, <8 x i16>)
declare <8 x i16> @llvm.arm.mve.shr.imm.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
declare i32 @llvm.arm.mve.vmldava.predicated.v4i32.v4i1(i32, i32, i32, i32, <4 x i32>, <4 x i32>, <4 x i1>)
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
declare i32 @llvm.arm.mve.vmldava.v4i32(i32, i32, i32, i32, <4 x i32>, <4 x i32>) #1
187