; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s

; sext i8->i32, multiply, ashr #7, clamp at 127: this is the VQDMULH.s8
; pattern, here feeding a signed add-across-vector reduction (VADDV.s8).
define arm_aapcs_vfpcc i32 @vqdmulh_i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vqdmulh_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vaddv.s8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i8> %s0 to <16 x i32>
  %l5 = sext <16 x i8> %s1 to <16 x i32>
  %l6 = mul nsw <16 x i32> %l5, %l2
  %l7 = ashr <16 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l8 = icmp slt <16 x i32> %l7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %l9 = select <16 x i1> %l8, <16 x i32> %l7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %l10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %l9)
  ret i32 %l10
}

; Same VQDMULH.s8 pattern as above, but truncated back to <16 x i8> and
; returned directly: should fold to a single vqdmulh.s8.
define arm_aapcs_vfpcc <16 x i8> @vqdmulh_i8_b(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vqdmulh_i8_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <16 x i8> %s0 to <16 x i32>
  %l5 = sext <16 x i8> %s1 to <16 x i32>
  %l6 = mul nsw <16 x i32> %l5, %l2
  %l7 = ashr <16 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %l8 = icmp slt <16 x i32> %l7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %l9 = select <16 x i1> %l8, <16 x i32> %l7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %l10 = trunc <16 x i32> %l9 to <16 x i8>
  ret <16 x i8> %l10
}

; i16 variant: ashr #15 and clamp at 32767 select VQDMULH.s16, feeding a
; VADDV.s16 reduction.
define arm_aapcs_vfpcc i32 @vqdmulh_i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vaddv.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i32>
  %l5 = sext <8 x i16> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %l5, %l2
  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l8 = icmp slt <8 x i32> %l7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %l9 = select <8 x i1> %l8, <8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %l10 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %l9)
  ret i32 %l10
}

; i16 variant truncated back to <8 x i16>: should fold to a single
; vqdmulh.s16.
define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_b(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_i16_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i32>
  %l5 = sext <8 x i16> %s1 to <8 x i32>
  %l6 = mul nsw <8 x i32> %l5, %l2
  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %l8 = icmp slt <8 x i32> %l7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %l9 = select <8 x i1> %l8, <8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %l10 = trunc <8 x i32> %l9 to <8 x i16>
  ret <8 x i16> %l10
}

; Negative test: the intermediate type is i22, not i32, so the sext width
; does not match the VQDMULH pattern and the combine must NOT fire; the
; expansion below (widen, vmullb, shifts, vmin, repack) is expected.
define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_c(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vqdmulh_i16_c:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vmov q2, q0
; CHECK-NEXT:    vmov.u16 r0, q0[0]
; CHECK-NEXT:    vmov.32 q0[0], r0
; CHECK-NEXT:    vmov.u16 r0, q2[1]
; CHECK-NEXT:    vmov.32 q0[1], r0
; CHECK-NEXT:    vmov.u16 r0, q2[2]
; CHECK-NEXT:    vmov.32 q0[2], r0
; CHECK-NEXT:    vmov.u16 r0, q2[3]
; CHECK-NEXT:    vmov.32 q0[3], r0
; CHECK-NEXT:    vmov.u16 r0, q1[0]
; CHECK-NEXT:    vmov.32 q3[0], r0
; CHECK-NEXT:    vmov.u16 r0, q1[1]
; CHECK-NEXT:    vmov.32 q3[1], r0
; CHECK-NEXT:    vmov.u16 r0, q1[2]
; CHECK-NEXT:    vmov.32 q3[2], r0
; CHECK-NEXT:    vmov.u16 r0, q1[3]
; CHECK-NEXT:    vmov.32 q3[3], r0
; CHECK-NEXT:    vmullb.s16 q0, q3, q0
; CHECK-NEXT:    vmov.i32 q3, #0x7fff
; CHECK-NEXT:    vshl.i32 q0, q0, #10
; CHECK-NEXT:    vshr.s32 q0, q0, #10
; CHECK-NEXT:    vshr.s32 q0, q0, #15
; CHECK-NEXT:    vmin.s32 q4, q0, q3
; CHECK-NEXT:    vmov r0, s16
; CHECK-NEXT:    vmov.16 q0[0], r0
; CHECK-NEXT:    vmov r0, s17
; CHECK-NEXT:    vmov.16 q0[1], r0
; CHECK-NEXT:    vmov r0, s18
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    vmov r0, s19
; CHECK-NEXT:    vmov.16 q0[3], r0
; CHECK-NEXT:    vmov.u16 r0, q2[4]
; CHECK-NEXT:    vmov.32 q4[0], r0
; CHECK-NEXT:    vmov.u16 r0, q2[5]
; CHECK-NEXT:    vmov.32 q4[1], r0
; CHECK-NEXT:    vmov.u16 r0, q2[6]
; CHECK-NEXT:    vmov.32 q4[2], r0
; CHECK-NEXT:    vmov.u16 r0, q2[7]
; CHECK-NEXT:    vmov.32 q4[3], r0
; CHECK-NEXT:    vmov.u16 r0, q1[4]
; CHECK-NEXT:    vmov.32 q2[0], r0
; CHECK-NEXT:    vmov.u16 r0, q1[5]
; CHECK-NEXT:    vmov.32 q2[1], r0
; CHECK-NEXT:    vmov.u16 r0, q1[6]
; CHECK-NEXT:    vmov.32 q2[2], r0
; CHECK-NEXT:    vmov.u16 r0, q1[7]
; CHECK-NEXT:    vmov.32 q2[3], r0
; CHECK-NEXT:    vmullb.s16 q1, q2, q4
; CHECK-NEXT:    vshl.i32 q1, q1, #10
; CHECK-NEXT:    vshr.s32 q1, q1, #10
; CHECK-NEXT:    vshr.s32 q1, q1, #15
; CHECK-NEXT:    vmin.s32 q1, q1, q3
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov r0, s5
; CHECK-NEXT:    vmov.16 q0[5], r0
; CHECK-NEXT:    vmov r0, s6
; CHECK-NEXT:    vmov.16 q0[6], r0
; CHECK-NEXT:    vmov r0, s7
; CHECK-NEXT:    vmov.16 q0[7], r0
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <8 x i16> %s0 to <8 x i22>
  %l5 = sext <8 x i16> %s1 to <8 x i22>
  %l6 = mul nsw <8 x i22> %l5, %l2
  %l7 = ashr <8 x i22> %l6, <i22 15, i22 15, i22 15, i22 15, i22 15, i22 15, i22 15, i22 15>
  %l8 = icmp slt <8 x i22> %l7, <i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767>
  %l9 = select <8 x i1> %l8, <8 x i22> %l7, <8 x i22> <i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767>
  %l10 = trunc <8 x i22> %l9 to <8 x i16>
  ret <8 x i16> %l10
}

; i32 variant via i64 intermediates: ashr #31 and clamp at INT32_MAX select
; VQDMULH.s32, feeding a long add-across-vector reduction (VADDLV.s32).
define arm_aapcs_vfpcc i64 @vqdmulh_i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vqdmulh_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    vaddlv.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i32> %s0 to <4 x i64>
  %l5 = sext <4 x i32> %s1 to <4 x i64>
  %l6 = mul nsw <4 x i64> %l5, %l2
  %l7 = ashr <4 x i64> %l6, <i64 31, i64 31, i64 31, i64 31>
  %l8 = icmp slt <4 x i64> %l7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %l9 = select <4 x i1> %l8, <4 x i64> %l7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %l10 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %l9)
  ret i64 %l10
}

; i32 variant truncated back to <4 x i32>: should fold to a single
; vqdmulh.s32.
define arm_aapcs_vfpcc <4 x i32> @vqdmulh_i32_b(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vqdmulh_i32_b:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %l2 = sext <4 x i32> %s0 to <4 x i64>
  %l5 = sext <4 x i32> %s1 to <4 x i64>
  %l6 = mul nsw <4 x i64> %l5, %l2
  %l7 = ashr <4 x i64> %l6, <i64 31, i64 31, i64 31, i64 31>
  %l8 = icmp slt <4 x i64> %l7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %l9 = select <4 x i1> %l8, <4 x i64> %l7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %l10 = trunc <4 x i64> %l9 to <4 x i32>
  ret <4 x i32> %l10
}

; Vectorized loop form of the i8 pattern: the VQDMULH combine should still
; fire inside a low-overhead (dls/le) hardware loop over 1024 elements.
define void @vqdmulh_loop_i8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) local_unnamed_addr #0 {
; CHECK-LABEL: vqdmulh_loop_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB7_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB7_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = sext <16 x i8> %wide.load to <16 x i32>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load26 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = sext <16 x i8> %wide.load26 to <16 x i32>
  %6 = mul nsw <16 x i32> %5, %2
  %7 = ashr <16 x i32> %6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  %8 = icmp slt <16 x i32> %7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %9 = select <16 x i1> %8, <16 x i32> %7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
  %10 = trunc <16 x i32> %9 to <16 x i8>
  %11 = getelementptr inbounds i8, i8* %z, i32 %index
  %12 = bitcast i8* %11 to <16 x i8>*
  store <16 x i8> %10, <16 x i8>* %12, align 1
  %index.next = add i32 %index, 16
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Vectorized loop form of the i16 pattern (8 lanes per iteration, 1024
; elements total): expects vqdmulh.s16 inside a dls/le hardware loop.
define void @vqdmulh_loop_i16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vqdmulh_loop_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB8_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB8_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = sext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load30 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = sext <8 x i16> %wide.load30 to <8 x i32>
  %6 = mul nsw <8 x i32> %5, %2
  %7 = ashr <8 x i32> %6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %8 = icmp slt <8 x i32> %7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %9 = select <8 x i1> %8, <8 x i32> %7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
  %10 = trunc <8 x i32> %9 to <8 x i16>
  %11 = getelementptr inbounds i16, i16* %z, i32 %index
  %12 = bitcast i16* %11 to <8 x i16>*
  store <8 x i16> %10, <8 x i16>* %12, align 2
  %index.next = add i32 %index, 8
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Vectorized loop form of the i32 pattern (4 lanes per iteration, 1024
; elements total): expects vqdmulh.s32 inside a dls/le hardware loop.
define void @vqdmulh_loop_i32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vqdmulh_loop_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:    dls lr, lr
; CHECK-NEXT:  .LBB9_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB9_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = sext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load30 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = sext <4 x i32> %wide.load30 to <4 x i64>
  %6 = mul nsw <4 x i64> %5, %2
  %7 = ashr <4 x i64> %6, <i64 31, i64 31, i64 31, i64 31>
  %8 = icmp slt <4 x i64> %7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %9 = select <4 x i1> %8, <4 x i64> %7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %10 = trunc <4 x i64> %9 to <4 x i32>
  %11 = getelementptr inbounds i32, i32* %z, i32 %index
  %12 = bitcast i32* %11 to <4 x i32>*
  store <4 x i32> %10, <4 x i32>* %12, align 4
  %index.next = add i32 %index, 4
  %13 = icmp eq i32 %index.next, 1024
  br i1 %13, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Reduction intrinsics used by the vaddv/vaddlv test functions above.
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)