; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s

; Plain (unpredicated) <16 x i8> multiply of the two vector arguments.
; Expected codegen: a single vmul.i8 on q0/q1 followed by the return.
define arm_aapcs_vfpcc <16 x i8> @test_vmulq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulq_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.i8 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <16 x i8> %b, %a
  ret <16 x i8> %0
}

; Plain (unpredicated) <8 x i16> multiply of the two vector arguments.
; Expected codegen: a single vmul.i16 on q0/q1 followed by the return.
define arm_aapcs_vfpcc <8 x i16> @test_vmulq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulq_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.i16 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <8 x i16> %b, %a
  ret <8 x i16> %0
}

; Plain (unpredicated) <4 x i32> multiply of the two vector arguments.
; Expected codegen: a single vmul.i32 on q0/q1 followed by the return.
define arm_aapcs_vfpcc <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulq_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.i32 q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %0 = mul <4 x i32> %b, %a
  ret <4 x i32> %0
}

; Plain (unpredicated) <4 x float> fmul of the two vector arguments.
; Expected codegen: a single vmul.f32 on q0/q1 followed by the return.
define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 {
; CHECK-LABEL: test_vmulq_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.f32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = fmul <4 x float> %a, %b
  ret <4 x float> %0
}

; Predicated <16 x i8> multiply with a caller-supplied %inactive vector.
; %p is zero-extended to i32, converted to a <16 x i1> mask by pred.i2v, and
; passed to mul.predicated together with %a, %b and %inactive.
; Expected codegen: vmsr p0, r0 / vpst / vmult.i8 q0, q1, q2.
define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_m_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i8 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = tail call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive)
  ret <16 x i8> %2
}

; Intrinsics used by the 16-lane predicated tests in this file.
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2

declare <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2

; Predicated <8 x i16> multiply with a caller-supplied %inactive vector.
; %p is zero-extended to i32, converted to an <8 x i1> mask by pred.i2v, and
; passed to mul.predicated together with %a, %b and %inactive.
; Expected codegen: vmsr p0, r0 / vpst / vmult.i16 q0, q1, q2.
define arm_aapcs_vfpcc <8 x i16> @test_vmulq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_m_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i16 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive)
  ret <8 x i16> %2
}

; Intrinsics used by the 8-lane predicated tests in this file.
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2

declare <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2

; Predicated <4 x i32> multiply with a caller-supplied %inactive vector.
; %p is zero-extended to i32, converted to a <4 x i1> mask by pred.i2v, and
; passed to mul.predicated together with %a, %b and %inactive.
; Expected codegen: vmsr p0, r0 / vpst / vmult.i32 q0, q1, q2.
define arm_aapcs_vfpcc <4 x i32> @test_vmulq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_m_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i32 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive)
  ret <4 x i32> %2
}

; Intrinsics used by the 4-lane predicated tests in this file.
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2

declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2

; Predicated <8 x half> multiply with a caller-supplied %inactive vector.
; %p is zero-extended to i32, converted to an <8 x i1> mask by pred.i2v, and
; passed to mul.predicated together with %a, %b and %inactive.
; Expected codegen: vmsr p0, r0 / vpst / vmult.f16 q0, q1, q2.
define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_m_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.f16 q0, q1, q2
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive)
  ret <8 x half> %2
}

; Half-precision predicated multiply intrinsic used by the f16 tests in this file.
declare <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2

; Predicated <16 x i8> multiply where the last operand of mul.predicated is
; undef instead of a caller-supplied vector.
; Expected codegen: vmsr p0, r0 / vpst / vmult.i8 q0, q0, q1.
define arm_aapcs_vfpcc <16 x i8> @test_vmulq_x_u8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_x_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = tail call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> undef)
  ret <16 x i8> %2
}

; Predicated <8 x i16> multiply where the last operand of mul.predicated is
; undef instead of a caller-supplied vector.
; Expected codegen: vmsr p0, r0 / vpst / vmult.i16 q0, q0, q1.
define arm_aapcs_vfpcc <8 x i16> @test_vmulq_x_s16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_x_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> undef)
  ret <8 x i16> %2
}

; Predicated <4 x i32> multiply where the last operand of mul.predicated is
; undef instead of a caller-supplied vector.
; Expected codegen: vmsr p0, r0 / vpst / vmult.i32 q0, q0, q1.
define arm_aapcs_vfpcc <4 x i32> @test_vmulq_x_u32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_x_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> undef)
  ret <4 x i32> %2
}

; Predicated <4 x float> multiply where the last operand of mul.predicated is
; undef instead of a caller-supplied vector.
; Expected codegen: vmsr p0, r0 / vpst / vmult.f32 q0, q0, q1.
; NOTE(review): despite the "_m_" in its name, this function takes no
; %inactive argument and passes undef, which is the shape of the "_x_" tests
; in this file; the name looks like it was meant to be test_vmulq_x_f32.
; Confirm against the generating source before renaming.
define arm_aapcs_vfpcc <4 x float> @test_vmulq_m_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 {
; CHECK-LABEL: test_vmulq_m_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.f32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> undef)
  ret <4 x float> %2
}

; Single-precision predicated multiply intrinsic used by the f32 tests in this file.
declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2

; Vector-by-scalar <16 x i8> multiply: %b is splatted to every lane
; (insertelement into lane 0 + shufflevector with an all-zero mask) and
; multiplied with %a.
; Expected codegen: a single vmul.i8 taking the scalar operand from r0.
define arm_aapcs_vfpcc <16 x i8> @test_vmulq_n_u8(<16 x i8> %a, i8 zeroext %b) {
; CHECK-LABEL: test_vmulq_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.i8 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
  %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  %0 = mul <16 x i8> %.splat, %a
  ret <16 x i8> %0
}

; Vector-by-scalar <8 x i16> multiply: %b is splatted to every lane
; (insertelement into lane 0 + shufflevector with an all-zero mask) and
; multiplied with %a.
; Expected codegen: a single vmul.i16 taking the scalar operand from r0.
define arm_aapcs_vfpcc <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 signext %b) {
; CHECK-LABEL: test_vmulq_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.i16 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  %0 = mul <8 x i16> %.splat, %a
  ret <8 x i16> %0
}

; Vector-by-scalar <4 x i32> multiply: %b is splatted to every lane
; (insertelement into lane 0 + shufflevector with an all-zero mask) and
; multiplied with %a.
; Expected codegen: a single vmul.i32 taking the scalar operand from r0.
define arm_aapcs_vfpcc <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) {
; CHECK-LABEL: test_vmulq_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmul.i32 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %0 = mul <4 x i32> %.splat, %a
  ret <4 x i32> %0
}

; Vector-by-scalar <4 x float> multiply: %b is splatted to every lane
; (insertelement into lane 0 + shufflevector with an all-zero mask) and
; fmul'ed with %a.
; Expected codegen: the scalar is first moved from s4 into r0, then a single
; vmul.f32 takes it as the scalar operand.
define arm_aapcs_vfpcc <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) {
; CHECK-LABEL: test_vmulq_n_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmul.f32 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x float> undef, float %b, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = fmul <4 x float> %.splat, %a
  ret <4 x float> %0
}

; Predicated vector-by-scalar <16 x i8> multiply with a caller-supplied
; %inactive vector: %b is splatted to every lane, %p is zero-extended and
; converted to a <16 x i1> mask, and both go to mul.predicated with %a.
; Expected codegen: vmsr p0, r1 / vpst / vmult.i8 q0, q1, r0.
define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i8 signext %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_m_n_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i8 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
  %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> %inactive)
  ret <16 x i8> %2
}

; Predicated vector-by-scalar <8 x i16> multiply with a caller-supplied
; %inactive vector: %b is splatted to every lane, %p is zero-extended and
; converted to an <8 x i1> mask, and both go to mul.predicated with %a.
; Expected codegen: vmsr p0, r1 / vpst / vmult.i16 q0, q1, r0.
define arm_aapcs_vfpcc <8 x i16> @test_vmulq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_m_n_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i16 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> %inactive)
  ret <8 x i16> %2
}

; Predicated vector-by-scalar <4 x i32> multiply with a caller-supplied
; %inactive vector: %b is splatted to every lane, %p is zero-extended and
; converted to a <4 x i1> mask, and both go to mul.predicated with %a.
; Expected codegen: vmsr p0, r1 / vpst / vmult.i32 q0, q1, r0.
define arm_aapcs_vfpcc <4 x i32> @test_vmulq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_m_n_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i32 q0, q1, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> %inactive)
  ret <4 x i32> %2
}

; Predicated vector-by-scalar <8 x half> multiply with a caller-supplied
; %inactive vector. The scalar arrives as float %b.coerce; its low 16 bits are
; reinterpreted as a half (bitcast to i32, trunc to i16, bitcast to half)
; before being splatted to every lane.
; Expected codegen: vmov r1, s8 / vmsr p0, r0 / vpst / vmult.f16 q0, q1, r1.
define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_n_f16(<8 x half> %inactive, <8 x half> %a, float %b.coerce, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_m_n_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.f16 q0, q1, r1
; CHECK-NEXT:    bx lr
entry:
  %0 = bitcast float %b.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to half
  %.splatinsert = insertelement <8 x half> undef, half %1, i32 0
  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
  %2 = zext i16 %p to i32
  %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2)
  %4 = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %.splat, <8 x i1> %3, <8 x half> %inactive)
  ret <8 x half> %4
}

; Predicated vector-by-scalar <16 x i8> multiply where the last operand of
; mul.predicated is undef: %b is splatted to every lane and %p is
; zero-extended and converted to a <16 x i1> mask.
; Expected codegen: vmsr p0, r1 / vpst / vmult.i8 q0, q0, r0.
define arm_aapcs_vfpcc <16 x i8> @test_vmulq_x_n_u8(<16 x i8> %a, i8 zeroext %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_x_n_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i8 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <16 x i8> undef, i8 %b, i32 0
  %.splat = shufflevector <16 x i8> %.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
  %2 = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %.splat, <16 x i1> %1, <16 x i8> undef)
  ret <16 x i8> %2
}

; Predicated vector-by-scalar <8 x i16> multiply where the last operand of
; mul.predicated is undef: %b is splatted to every lane and %p is
; zero-extended and converted to an <8 x i1> mask.
; Expected codegen: vmsr p0, r1 / vpst / vmult.i16 q0, q0, r0.
define arm_aapcs_vfpcc <8 x i16> @test_vmulq_x_n_s16(<8 x i16> %a, i16 signext %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_x_n_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i16 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <8 x i16> undef, i16 %b, i32 0
  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %.splat, <8 x i1> %1, <8 x i16> undef)
  ret <8 x i16> %2
}

; Predicated vector-by-scalar <4 x i32> multiply where the last operand of
; mul.predicated is undef: %b is splatted to every lane and %p is
; zero-extended and converted to a <4 x i1> mask.
; Expected codegen: vmsr p0, r1 / vpst / vmult.i32 q0, q0, r0.
define arm_aapcs_vfpcc <4 x i32> @test_vmulq_x_n_u32(<4 x i32> %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_x_n_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmsr p0, r1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.i32 q0, q0, r0
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x i32> undef, i32 %b, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %.splat, <4 x i1> %1, <4 x i32> undef)
  ret <4 x i32> %2
}

; Predicated vector-by-scalar <4 x float> multiply where the last operand of
; mul.predicated is undef: %b is splatted to every lane and %p is
; zero-extended and converted to a <4 x i1> mask.
; Expected codegen: vmov r1, s4 / vmsr p0, r0 / vpst / vmult.f32 q0, q0, r1.
define arm_aapcs_vfpcc <4 x float> @test_vmulq_x_n_f32(<4 x float> %a, float %b, i16 zeroext %p) {
; CHECK-LABEL: test_vmulq_x_n_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    vmsr p0, r0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vmult.f32 q0, q0, r1
; CHECK-NEXT:    bx lr
entry:
  %.splatinsert = insertelement <4 x float> undef, float %b, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = zext i16 %p to i32
  %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = call <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %.splat, <4 x i1> %1, <4 x float> undef)
  ret <4 x float> %2
}
