• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast                 | FileCheck %s --check-prefixes=CHECK,GENERIC
2; The instruction latencies of Exynos-M1 trigger the transform we see under the Exynos check.
3; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck %s --check-prefixes=CHECK,EXYNOSM1
4; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m3 | FileCheck %s --check-prefixes=CHECK,EXYNOSM3
5
6declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
7
8declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
9
10declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
11
12declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
13
14declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
15
16declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
17
18declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
19
20declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
21
22declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
23
24declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
25
26declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
27
28declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
29
30declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
31
32declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
33
34declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
35
36declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
37
38declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
39
40declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
41
42declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
43
44declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
45
46declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
47
48define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
49; CHECK-LABEL: test_vmla_lane_s16:
50; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
51entry:
52  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
53  %mul = mul <4 x i16> %shuffle, %b
54  %add = add <4 x i16> %mul, %a
55  ret <4 x i16> %add
56}
57
58define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
59; CHECK-LABEL: test_vmlaq_lane_s16:
60; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
61entry:
62  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
63  %mul = mul <8 x i16> %shuffle, %b
64  %add = add <8 x i16> %mul, %a
65  ret <8 x i16> %add
66}
67
68define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
69; CHECK-LABEL: test_vmla_lane_s32:
70; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
71entry:
72  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
73  %mul = mul <2 x i32> %shuffle, %b
74  %add = add <2 x i32> %mul, %a
75  ret <2 x i32> %add
76}
77
78define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
79; CHECK-LABEL: test_vmlaq_lane_s32:
80; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
81entry:
82  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
83  %mul = mul <4 x i32> %shuffle, %b
84  %add = add <4 x i32> %mul, %a
85  ret <4 x i32> %add
86}
87
88define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
89; CHECK-LABEL: test_vmla_laneq_s16:
90; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
91entry:
92  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
93  %mul = mul <4 x i16> %shuffle, %b
94  %add = add <4 x i16> %mul, %a
95  ret <4 x i16> %add
96}
97
98define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
99; CHECK-LABEL: test_vmlaq_laneq_s16:
100; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
101entry:
102  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
103  %mul = mul <8 x i16> %shuffle, %b
104  %add = add <8 x i16> %mul, %a
105  ret <8 x i16> %add
106}
107
108define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
109; CHECK-LABEL: test_vmla_laneq_s32:
110; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
111entry:
112  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
113  %mul = mul <2 x i32> %shuffle, %b
114  %add = add <2 x i32> %mul, %a
115  ret <2 x i32> %add
116}
117
118define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
119; CHECK-LABEL: test_vmlaq_laneq_s32:
120; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
121entry:
122  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
123  %mul = mul <4 x i32> %shuffle, %b
124  %add = add <4 x i32> %mul, %a
125  ret <4 x i32> %add
126}
127
128define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
129; CHECK-LABEL: test_vmls_lane_s16:
130; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
131entry:
132  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
133  %mul = mul <4 x i16> %shuffle, %b
134  %sub = sub <4 x i16> %a, %mul
135  ret <4 x i16> %sub
136}
137
138define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
139; CHECK-LABEL: test_vmlsq_lane_s16:
140; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
141entry:
142  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
143  %mul = mul <8 x i16> %shuffle, %b
144  %sub = sub <8 x i16> %a, %mul
145  ret <8 x i16> %sub
146}
147
148define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
149; CHECK-LABEL: test_vmls_lane_s32:
150; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
151entry:
152  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
153  %mul = mul <2 x i32> %shuffle, %b
154  %sub = sub <2 x i32> %a, %mul
155  ret <2 x i32> %sub
156}
157
158define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
159; CHECK-LABEL: test_vmlsq_lane_s32:
160; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
161entry:
162  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
163  %mul = mul <4 x i32> %shuffle, %b
164  %sub = sub <4 x i32> %a, %mul
165  ret <4 x i32> %sub
166}
167
168define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
169; CHECK-LABEL: test_vmls_laneq_s16:
170; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
171entry:
172  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
173  %mul = mul <4 x i16> %shuffle, %b
174  %sub = sub <4 x i16> %a, %mul
175  ret <4 x i16> %sub
176}
177
178define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
179; CHECK-LABEL: test_vmlsq_laneq_s16:
180; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
181entry:
182  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
183  %mul = mul <8 x i16> %shuffle, %b
184  %sub = sub <8 x i16> %a, %mul
185  ret <8 x i16> %sub
186}
187
188define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
189; CHECK-LABEL: test_vmls_laneq_s32:
190; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
191entry:
192  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
193  %mul = mul <2 x i32> %shuffle, %b
194  %sub = sub <2 x i32> %a, %mul
195  ret <2 x i32> %sub
196}
197
198define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
199; CHECK-LABEL: test_vmlsq_laneq_s32:
200; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
201entry:
202  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
203  %mul = mul <4 x i32> %shuffle, %b
204  %sub = sub <4 x i32> %a, %mul
205  ret <4 x i32> %sub
206}
207
208define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) {
209; CHECK-LABEL: test_vmul_lane_s16:
210; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
211entry:
212  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
213  %mul = mul <4 x i16> %shuffle, %a
214  ret <4 x i16> %mul
215}
216
217define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
218; CHECK-LABEL: test_vmulq_lane_s16:
219; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
220entry:
221  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
222  %mul = mul <8 x i16> %shuffle, %a
223  ret <8 x i16> %mul
224}
225
226define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) {
227; CHECK-LABEL: test_vmul_lane_s32:
228; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
229entry:
230  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
231  %mul = mul <2 x i32> %shuffle, %a
232  ret <2 x i32> %mul
233}
234
235define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
236; CHECK-LABEL: test_vmulq_lane_s32:
237; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
238entry:
239  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
240  %mul = mul <4 x i32> %shuffle, %a
241  ret <4 x i32> %mul
242}
243
244define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) {
245; CHECK-LABEL: test_vmul_lane_u16:
246; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
247entry:
248  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
249  %mul = mul <4 x i16> %shuffle, %a
250  ret <4 x i16> %mul
251}
252
253define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) {
254; CHECK-LABEL: test_vmulq_lane_u16:
255; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
256entry:
257  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
258  %mul = mul <8 x i16> %shuffle, %a
259  ret <8 x i16> %mul
260}
261
262define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) {
263; CHECK-LABEL: test_vmul_lane_u32:
264; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
265entry:
266  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
267  %mul = mul <2 x i32> %shuffle, %a
268  ret <2 x i32> %mul
269}
270
271define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) {
272; CHECK-LABEL: test_vmulq_lane_u32:
273; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
274entry:
275  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
276  %mul = mul <4 x i32> %shuffle, %a
277  ret <4 x i32> %mul
278}
279
280define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
281; CHECK-LABEL: test_vmul_laneq_s16:
282; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
283entry:
284  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
285  %mul = mul <4 x i16> %shuffle, %a
286  ret <4 x i16> %mul
287}
288
289define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
290; CHECK-LABEL: test_vmulq_laneq_s16:
291; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
292entry:
293  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
294  %mul = mul <8 x i16> %shuffle, %a
295  ret <8 x i16> %mul
296}
297
298define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
299; CHECK-LABEL: test_vmul_laneq_s32:
300; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
301entry:
302  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
303  %mul = mul <2 x i32> %shuffle, %a
304  ret <2 x i32> %mul
305}
306
307define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
308; CHECK-LABEL: test_vmulq_laneq_s32:
309; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
310entry:
311  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
312  %mul = mul <4 x i32> %shuffle, %a
313  ret <4 x i32> %mul
314}
315
316define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
317; CHECK-LABEL: test_vmul_laneq_u16:
318; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
319entry:
320  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
321  %mul = mul <4 x i16> %shuffle, %a
322  ret <4 x i16> %mul
323}
324
325define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
326; CHECK-LABEL: test_vmulq_laneq_u16:
327; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
328entry:
329  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
330  %mul = mul <8 x i16> %shuffle, %a
331  ret <8 x i16> %mul
332}
333
334define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
335; CHECK-LABEL: test_vmul_laneq_u32:
336; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
337entry:
338  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
339  %mul = mul <2 x i32> %shuffle, %a
340  ret <2 x i32> %mul
341}
342
343define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
344; CHECK-LABEL: test_vmulq_laneq_u32:
345; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
346entry:
347  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
348  %mul = mul <4 x i32> %shuffle, %a
349  ret <4 x i32> %mul
350}
351
352define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
353; CHECK-LABEL: test_vfma_lane_f32:
354; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
355; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
356; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
357; EXYNOSM3: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
358entry:
359  %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
360  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
361  ret <2 x float> %0
362}
363
364declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
365
366define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
367; CHECK-LABEL: test_vfmaq_lane_f32:
368; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
369; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
370; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
371; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
372entry:
373  %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
374  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
375  ret <4 x float> %0
376}
377
378declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
379
380define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
381; CHECK-LABEL: test_vfma_laneq_f32:
382; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
383; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
384; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
385; EXYNOSM3: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
386entry:
387  %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
388  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
389  ret <2 x float> %0
390}
391
392define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
393; CHECK-LABEL: test_vfmaq_laneq_f32:
394; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
395; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
396; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
397; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
398entry:
399  %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
400  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
401  ret <4 x float> %0
402}
403
404define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
405; CHECK-LABEL: test_vfms_lane_f32:
406; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
407; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
408; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
409; EXYNOSM3: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
410entry:
411  %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
412  %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1>
413  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
414  ret <2 x float> %0
415}
416
417define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
418; CHECK-LABEL: test_vfmsq_lane_f32:
419; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
420; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
421; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
422; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
423entry:
424  %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
425  %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
426  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
427  ret <4 x float> %0
428}
429
430define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
431; CHECK-LABEL: test_vfms_laneq_f32:
432; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
433; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
434; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
435; EXYNOSM3: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
436entry:
437  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
438  %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3>
439  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
440  ret <2 x float> %0
441}
442
443define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
444; CHECK-LABEL: test_vfmsq_laneq_f32:
445; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
446; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
447; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
448; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
449entry:
450  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
451  %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
452  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
453  ret <4 x float> %0
454}
455
456define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
457; CHECK-LABEL: test_vfmaq_lane_f64:
458; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
459; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
460; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
461; EXYNOSM3: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
462entry:
463  %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
464  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
465  ret <2 x double> %0
466}
467
468declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
469
470define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
471; CHECK-LABEL: test_vfmaq_laneq_f64:
472; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
473; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
474; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
475; EXYNOSM3: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
476entry:
477  %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
478  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
479  ret <2 x double> %0
480}
481
482define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
483; CHECK-LABEL: test_vfmsq_lane_f64:
484; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
485; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
486; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
487; EXYNOSM3: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
488entry:
489  %sub = fsub <1 x double> <double -0.000000e+00>, %v
490  %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer
491  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
492  ret <2 x double> %0
493}
494
495define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
496; CHECK-LABEL: test_vfmsq_laneq_f64:
497; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
498; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
499; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
500; EXYNOSM3: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
501entry:
502  %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
503  %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1>
504  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
505  ret <2 x double> %0
506}
507
508define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) {
509; CHECK-LABEL: test_vfmas_laneq_f32
510; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
511entry:
512  %extract = extractelement <4 x float> %v, i32 3
513  %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
514  ret float %0
515}
516
517declare float @llvm.fma.f32(float, float, float)
518
519define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) {
520; CHECK-LABEL: test_vfmsd_lane_f64
521; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
522entry:
523  %extract.rhs = extractelement <1 x double> %v, i32 0
524  %extract = fsub double -0.000000e+00, %extract.rhs
525  %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a)
526  ret double %0
527}
528
529declare double @llvm.fma.f64(double, double, double)
530
531define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %v) {
532; CHECK-LABEL: test_vfmss_lane_f32
533; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
534entry:
535  %extract.rhs = extractelement <2 x float> %v, i32 1
536  %extract = fsub float -0.000000e+00, %extract.rhs
537  %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
538  ret float %0
539}
540
541define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) {
542; CHECK-LABEL: test_vfmss_laneq_f32
543; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
544entry:
545  %extract.rhs = extractelement <4 x float> %v, i32 3
546  %extract = fsub float -0.000000e+00, %extract.rhs
547  %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
548  ret float %0
549}
550
551define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) {
552; CHECK-LABEL: test_vfmsd_laneq_f64
553; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
554entry:
555  %extract.rhs = extractelement <2 x double> %v, i32 1
556  %extract = fsub double -0.000000e+00, %extract.rhs
557  %0 = tail call double @llvm.fma.f64(double %b, double %extract, double %a)
558  ret double %0
559}
560
561define double @test_vfmsd_lane_f64_0(double %a, double %b, <1 x double> %v) {
562; CHCK-LABEL: test_vfmsd_lane_f64_0
563; CHCK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
564; CHCK-NEXT: ret
565entry:
566  %tmp0 = fsub <1 x double> <double -0.000000e+00>, %v
567  %tmp1 = extractelement <1 x double> %tmp0, i32 0
568  %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a)
569  ret double %0
570}
571
572define float @test_vfmss_lane_f32_0(float %a, float %b, <2 x float> %v) {
573; CHECK-LABEL: test_vfmss_lane_f32_0
574; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
575entry:
576  %tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
577  %tmp1 = extractelement <2 x float> %tmp0, i32 1
578  %0 = tail call float @llvm.fma.f32(float %b, float %tmp1, float %a)
579  ret float %0
580}
581
582define float @test_vfmss_laneq_f32_0(float %a, float %b, <4 x float> %v) {
583; CHECK-LABEL: test_vfmss_laneq_f32_0
584; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
585entry:
586  %tmp0 = fsub <4 x float><float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
587  %tmp1 = extractelement <4 x float> %tmp0, i32 3
588  %0 = tail call float @llvm.fma.f32(float %b, float %tmp1, float %a)
589  ret float %0
590}
591
592define double @test_vfmsd_laneq_f64_0(double %a, double %b, <2 x double> %v) {
593; CHECK-LABEL: test_vfmsd_laneq_f64_0
594; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
595entry:
596  %tmp0 = fsub <2 x double><double -0.000000e+00, double -0.000000e+00>, %v
597  %tmp1 = extractelement <2 x double> %tmp0, i32 1
598  %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a)
599  ret double %0
600}
601
602define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
603; CHECK-LABEL: test_vmlal_lane_s16:
604; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
605entry:
606  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
607  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
608  %add = add <4 x i32> %vmull2.i, %a
609  ret <4 x i32> %add
610}
611
612define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
613; CHECK-LABEL: test_vmlal_lane_s32:
614; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
615entry:
616  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
617  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
618  %add = add <2 x i64> %vmull2.i, %a
619  ret <2 x i64> %add
620}
621
622define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
623; CHECK-LABEL: test_vmlal_laneq_s16:
624; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
625entry:
626  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
627  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
628  %add = add <4 x i32> %vmull2.i, %a
629  ret <4 x i32> %add
630}
631
632define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
633; CHECK-LABEL: test_vmlal_laneq_s32:
634; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
635entry:
636  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
637  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
638  %add = add <2 x i64> %vmull2.i, %a
639  ret <2 x i64> %add
640}
641
642define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
643; CHECK-LABEL: test_vmlal_high_lane_s16:
644; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
645entry:
646  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
647  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
648  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
649  %add = add <4 x i32> %vmull2.i, %a
650  ret <4 x i32> %add
651}
652
653define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
654; CHECK-LABEL: test_vmlal_high_lane_s32:
655; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
656entry:
657  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
658  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
659  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
660  %add = add <2 x i64> %vmull2.i, %a
661  ret <2 x i64> %add
662}
663
664define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
665; CHECK-LABEL: test_vmlal_high_laneq_s16:
666; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
667entry:
668  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
669  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
670  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
671  %add = add <4 x i32> %vmull2.i, %a
672  ret <4 x i32> %add
673}
674
675define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
676; CHECK-LABEL: test_vmlal_high_laneq_s32:
677; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
678entry:
679  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
680  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
681  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
682  %add = add <2 x i64> %vmull2.i, %a
683  ret <2 x i64> %add
684}
685
686define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
687; CHECK-LABEL: test_vmlsl_lane_s16:
688; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
689entry:
690  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
691  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
692  %sub = sub <4 x i32> %a, %vmull2.i
693  ret <4 x i32> %sub
694}
695
696define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
697; CHECK-LABEL: test_vmlsl_lane_s32:
698; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
699entry:
700  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
701  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
702  %sub = sub <2 x i64> %a, %vmull2.i
703  ret <2 x i64> %sub
704}
705
706define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
707; CHECK-LABEL: test_vmlsl_laneq_s16:
708; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
709entry:
710  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
711  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
712  %sub = sub <4 x i32> %a, %vmull2.i
713  ret <4 x i32> %sub
714}
715
716define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
717; CHECK-LABEL: test_vmlsl_laneq_s32:
718; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
719entry:
720  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
721  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
722  %sub = sub <2 x i64> %a, %vmull2.i
723  ret <2 x i64> %sub
724}
725
726define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
727; CHECK-LABEL: test_vmlsl_high_lane_s16:
728; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
729entry:
730  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
731  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
732  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
733  %sub = sub <4 x i32> %a, %vmull2.i
734  ret <4 x i32> %sub
735}
736
737define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
738; CHECK-LABEL: test_vmlsl_high_lane_s32:
739; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
740entry:
741  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
742  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
743  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
744  %sub = sub <2 x i64> %a, %vmull2.i
745  ret <2 x i64> %sub
746}
747
748define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
749; CHECK-LABEL: test_vmlsl_high_laneq_s16:
750; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
751entry:
752  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
753  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
754  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
755  %sub = sub <4 x i32> %a, %vmull2.i
756  ret <4 x i32> %sub
757}
758
759define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
760; CHECK-LABEL: test_vmlsl_high_laneq_s32:
761; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
762entry:
763  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
764  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
765  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
766  %sub = sub <2 x i64> %a, %vmull2.i
767  ret <2 x i64> %sub
768}
769
770define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
771; CHECK-LABEL: test_vmlal_lane_u16:
772; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
773entry:
774  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
775  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
776  %add = add <4 x i32> %vmull2.i, %a
777  ret <4 x i32> %add
778}
779
780define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
781; CHECK-LABEL: test_vmlal_lane_u32:
782; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
783entry:
784  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
785  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
786  %add = add <2 x i64> %vmull2.i, %a
787  ret <2 x i64> %add
788}
789
790define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
791; CHECK-LABEL: test_vmlal_laneq_u16:
792; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
793entry:
794  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
795  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
796  %add = add <4 x i32> %vmull2.i, %a
797  ret <4 x i32> %add
798}
799
800define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
801; CHECK-LABEL: test_vmlal_laneq_u32:
802; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
803entry:
804  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
805  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
806  %add = add <2 x i64> %vmull2.i, %a
807  ret <2 x i64> %add
808}
809
810define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
811; CHECK-LABEL: test_vmlal_high_lane_u16:
812; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
813entry:
814  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
815  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
816  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
817  %add = add <4 x i32> %vmull2.i, %a
818  ret <4 x i32> %add
819}
820
821define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
822; CHECK-LABEL: test_vmlal_high_lane_u32:
823; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
824entry:
825  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
826  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
827  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
828  %add = add <2 x i64> %vmull2.i, %a
829  ret <2 x i64> %add
830}
831
832define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
833; CHECK-LABEL: test_vmlal_high_laneq_u16:
834; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
835entry:
836  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
837  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
838  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
839  %add = add <4 x i32> %vmull2.i, %a
840  ret <4 x i32> %add
841}
842
843define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
844; CHECK-LABEL: test_vmlal_high_laneq_u32:
845; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
846entry:
847  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
848  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
849  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
850  %add = add <2 x i64> %vmull2.i, %a
851  ret <2 x i64> %add
852}
853
854define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
855; CHECK-LABEL: test_vmlsl_lane_u16:
856; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
857entry:
858  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
859  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
860  %sub = sub <4 x i32> %a, %vmull2.i
861  ret <4 x i32> %sub
862}
863
864define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
865; CHECK-LABEL: test_vmlsl_lane_u32:
866; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
867entry:
868  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
869  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
870  %sub = sub <2 x i64> %a, %vmull2.i
871  ret <2 x i64> %sub
872}
873
874define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
875; CHECK-LABEL: test_vmlsl_laneq_u16:
876; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
877entry:
878  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
879  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
880  %sub = sub <4 x i32> %a, %vmull2.i
881  ret <4 x i32> %sub
882}
883
884define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
885; CHECK-LABEL: test_vmlsl_laneq_u32:
886; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
887entry:
888  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
889  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
890  %sub = sub <2 x i64> %a, %vmull2.i
891  ret <2 x i64> %sub
892}
893
894define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
895; CHECK-LABEL: test_vmlsl_high_lane_u16:
896; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
897entry:
898  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
899  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
900  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
901  %sub = sub <4 x i32> %a, %vmull2.i
902  ret <4 x i32> %sub
903}
904
905define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
906; CHECK-LABEL: test_vmlsl_high_lane_u32:
907; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
908entry:
909  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
910  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
911  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
912  %sub = sub <2 x i64> %a, %vmull2.i
913  ret <2 x i64> %sub
914}
915
916define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
917; CHECK-LABEL: test_vmlsl_high_laneq_u16:
918; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
919entry:
920  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
921  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
922  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
923  %sub = sub <4 x i32> %a, %vmull2.i
924  ret <4 x i32> %sub
925}
926
927define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
928; CHECK-LABEL: test_vmlsl_high_laneq_u32:
929; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
930entry:
931  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
932  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
933  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
934  %sub = sub <2 x i64> %a, %vmull2.i
935  ret <2 x i64> %sub
936}
937
938define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
939; CHECK-LABEL: test_vmull_lane_s16:
940; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
941entry:
942  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
943  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
944  ret <4 x i32> %vmull2.i
945}
946
947define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
948; CHECK-LABEL: test_vmull_lane_s32:
949; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
950entry:
951  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
952  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
953  ret <2 x i64> %vmull2.i
954}
955
956define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) {
957; CHECK-LABEL: test_vmull_lane_u16:
958; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
959entry:
960  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
961  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
962  ret <4 x i32> %vmull2.i
963}
964
965define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) {
966; CHECK-LABEL: test_vmull_lane_u32:
967; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
968entry:
969  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
970  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
971  ret <2 x i64> %vmull2.i
972}
973
974define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
975; CHECK-LABEL: test_vmull_high_lane_s16:
976; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
977entry:
978  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
979  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
980  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
981  ret <4 x i32> %vmull2.i
982}
983
984define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
985; CHECK-LABEL: test_vmull_high_lane_s32:
986; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
987entry:
988  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
989  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
990  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
991  ret <2 x i64> %vmull2.i
992}
993
994define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) {
995; CHECK-LABEL: test_vmull_high_lane_u16:
996; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
997entry:
998  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
999  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1000  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1001  ret <4 x i32> %vmull2.i
1002}
1003
1004define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) {
1005; CHECK-LABEL: test_vmull_high_lane_u32:
1006; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
1007entry:
1008  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1009  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1010  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1011  ret <2 x i64> %vmull2.i
1012}
1013
1014define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
1015; CHECK-LABEL: test_vmull_laneq_s16:
1016; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
1017entry:
1018  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1019  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
1020  ret <4 x i32> %vmull2.i
1021}
1022
1023define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
1024; CHECK-LABEL: test_vmull_laneq_s32:
1025; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
1026entry:
1027  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1028  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
1029  ret <2 x i64> %vmull2.i
1030}
1031
1032define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
1033; CHECK-LABEL: test_vmull_laneq_u16:
1034; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
1035entry:
1036  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1037  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
1038  ret <4 x i32> %vmull2.i
1039}
1040
1041define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
1042; CHECK-LABEL: test_vmull_laneq_u32:
1043; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
1044entry:
1045  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1046  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
1047  ret <2 x i64> %vmull2.i
1048}
1049
1050define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
1051; CHECK-LABEL: test_vmull_high_laneq_s16:
1052; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
1053entry:
1054  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1055  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1056  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1057  ret <4 x i32> %vmull2.i
1058}
1059
1060define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
1061; CHECK-LABEL: test_vmull_high_laneq_s32:
1062; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
1063entry:
1064  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1065  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1066  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1067  ret <2 x i64> %vmull2.i
1068}
1069
1070define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
1071; CHECK-LABEL: test_vmull_high_laneq_u16:
1072; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
1073entry:
1074  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1075  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1076  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1077  ret <4 x i32> %vmull2.i
1078}
1079
1080define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
1081; CHECK-LABEL: test_vmull_high_laneq_u32:
1082; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
1083entry:
1084  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1085  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1086  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1087  ret <2 x i64> %vmull2.i
1088}
1089
1090define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
1091; CHECK-LABEL: test_vqdmlal_lane_s16:
1092; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
1093entry:
1094  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1095  %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
1096  %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
1097  ret <4 x i32> %vqdmlal4.i
1098}
1099
1100define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
1101; CHECK-LABEL: test_vqdmlal_lane_s32:
1102; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
1103entry:
1104  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1105  %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
1106  %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
1107  ret <2 x i64> %vqdmlal4.i
1108}
1109
1110define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
1111; CHECK-LABEL: test_vqdmlal_high_lane_s16:
1112; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
1113entry:
1114  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1115  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1116  %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1117  %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
1118  ret <4 x i32> %vqdmlal4.i
1119}
1120
1121define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
1122; CHECK-LABEL: test_vqdmlal_high_lane_s32:
1123; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
1124entry:
1125  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1126  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1127  %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1128  %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
1129  ret <2 x i64> %vqdmlal4.i
1130}
1131
1132define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
1133; CHECK-LABEL: test_vqdmlsl_lane_s16:
1134; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
1135entry:
1136  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1137  %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
1138  %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
1139  ret <4 x i32> %vqdmlsl4.i
1140}
1141
1142define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
1143; CHECK-LABEL: test_vqdmlsl_lane_s32:
1144; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
1145entry:
1146  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1147  %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
1148  %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
1149  ret <2 x i64> %vqdmlsl4.i
1150}
1151
1152define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
1153; CHECK-LABEL: test_vqdmlsl_high_lane_s16:
1154; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
1155entry:
1156  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1157  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1158  %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1159  %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
1160  ret <4 x i32> %vqdmlsl4.i
1161}
1162
1163define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
1164; CHECK-LABEL: test_vqdmlsl_high_lane_s32:
1165; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
1166entry:
1167  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1168  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1169  %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1170  %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
1171  ret <2 x i64> %vqdmlsl4.i
1172}
1173
1174define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
1175; CHECK-LABEL: test_vqdmull_lane_s16:
1176; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
1177entry:
1178  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1179  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
1180  ret <4 x i32> %vqdmull2.i
1181}
1182
1183define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
1184; CHECK-LABEL: test_vqdmull_lane_s32:
1185; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
1186entry:
1187  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1188  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
1189  ret <2 x i64> %vqdmull2.i
1190}
1191
1192define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
1193; CHECK-LABEL: test_vqdmull_laneq_s16:
1194; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
1195entry:
1196  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1197  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
1198  ret <4 x i32> %vqdmull2.i
1199}
1200
1201define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
1202; CHECK-LABEL: test_vqdmull_laneq_s32:
1203; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
1204entry:
1205  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1206  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
1207  ret <2 x i64> %vqdmull2.i
1208}
1209
1210define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
1211; CHECK-LABEL: test_vqdmull_high_lane_s16:
1212; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
1213entry:
1214  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1215  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1216  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1217  ret <4 x i32> %vqdmull2.i
1218}
1219
1220define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
1221; CHECK-LABEL: test_vqdmull_high_lane_s32:
1222; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
1223entry:
1224  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1225  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1226  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1227  ret <2 x i64> %vqdmull2.i
1228}
1229
1230define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
1231; CHECK-LABEL: test_vqdmull_high_laneq_s16:
1232; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
1233entry:
1234  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1235  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
1236  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1237  ret <4 x i32> %vqdmull2.i
1238}
1239
1240define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
1241; CHECK-LABEL: test_vqdmull_high_laneq_s32:
1242; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
1243entry:
1244  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1245  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
1246  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1247  ret <2 x i64> %vqdmull2.i
1248}
1249
1250define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
1251; CHECK-LABEL: test_vqdmulh_lane_s16:
1252; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
1253entry:
1254  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1255  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
1256  ret <4 x i16> %vqdmulh2.i
1257}
1258
1259define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
1260; CHECK-LABEL: test_vqdmulhq_lane_s16:
1261; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
1262entry:
1263  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1264  %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
1265  ret <8 x i16> %vqdmulh2.i
1266}
1267
1268define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
1269; CHECK-LABEL: test_vqdmulh_lane_s32:
1270; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
1271entry:
1272  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1273  %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
1274  ret <2 x i32> %vqdmulh2.i
1275}
1276
1277define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
1278; CHECK-LABEL: test_vqdmulhq_lane_s32:
1279; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
1280entry:
1281  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1282  %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
1283  ret <4 x i32> %vqdmulh2.i
1284}
1285
1286define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
1287; CHECK-LABEL: test_vqrdmulh_lane_s16:
1288; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
1289entry:
1290  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1291  %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
1292  ret <4 x i16> %vqrdmulh2.i
1293}
1294
1295define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
1296; CHECK-LABEL: test_vqrdmulhq_lane_s16:
1297; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
1298entry:
1299  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
1300  %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
1301  ret <8 x i16> %vqrdmulh2.i
1302}
1303
1304define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
1305; CHECK-LABEL: test_vqrdmulh_lane_s32:
1306; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
1307entry:
1308  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
1309  %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
1310  ret <2 x i32> %vqrdmulh2.i
1311}
1312
1313define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
1314; CHECK-LABEL: test_vqrdmulhq_lane_s32:
1315; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
1316entry:
1317  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1318  %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
1319  ret <4 x i32> %vqrdmulh2.i
1320}
1321
1322define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
1323; CHECK-LABEL: test_vmul_lane_f32:
1324; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
1325; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
1326; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
1327; EXYNOSM3: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
1328entry:
1329  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
1330  %mul = fmul <2 x float> %shuffle, %a
1331  ret <2 x float> %mul
1332}
1333
1334define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) {
1335; CHECK-LABEL: test_vmul_lane_f64:
1336; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
1337entry:
1338  %0 = bitcast <1 x double> %a to <8 x i8>
1339  %1 = bitcast <8 x i8> %0 to double
1340  %extract = extractelement <1 x double> %v, i32 0
1341  %2 = fmul double %1, %extract
1342  %3 = insertelement <1 x double> undef, double %2, i32 0
1343  ret <1 x double> %3
1344}
1345
1346define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) {
1347; CHECK-LABEL: test_vmulq_lane_f32:
1348; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
1349; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
1350; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
1351; EXYNOSM3: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
1352entry:
1353  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1354  %mul = fmul <4 x float> %shuffle, %a
1355  ret <4 x float> %mul
1356}
1357
1358define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) {
1359; CHECK-LABEL: test_vmulq_lane_f64:
1360; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
1361; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
1362; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
1363; EXYNOSM3: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
1364entry:
1365  %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
1366  %mul = fmul <2 x double> %shuffle, %a
1367  ret <2 x double> %mul
1368}
1369
1370define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) {
1371; CHECK-LABEL: test_vmul_laneq_f32:
1372; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
1373; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
1374; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
1375; EXYNOSM3: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
1376entry:
1377  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
1378  %mul = fmul <2 x float> %shuffle, %a
1379  ret <2 x float> %mul
1380}
1381
1382define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) {
1383; CHECK-LABEL: test_vmul_laneq_f64:
1384; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
1385entry:
1386  %0 = bitcast <1 x double> %a to <8 x i8>
1387  %1 = bitcast <8 x i8> %0 to double
1388  %extract = extractelement <2 x double> %v, i32 1
1389  %2 = fmul double %1, %extract
1390  %3 = insertelement <1 x double> undef, double %2, i32 0
1391  ret <1 x double> %3
1392}
1393
1394define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) {
1395; CHECK-LABEL: test_vmulq_laneq_f32:
1396; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
1397; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
1398; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
1399; EXYNOSM3: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
1400entry:
1401  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1402  %mul = fmul <4 x float> %shuffle, %a
1403  ret <4 x float> %mul
1404}
1405
1406define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) {
1407; CHECK-LABEL: test_vmulq_laneq_f64:
1408; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
1409; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
1410; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
1411; EXYNOSM3: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
1412entry:
1413  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
1414  %mul = fmul <2 x double> %shuffle, %a
1415  ret <2 x double> %mul
1416}
1417
1418define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) {
1419; CHECK-LABEL: test_vmulx_lane_f32:
1420; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
1421; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
1422; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
1423; EXYNOSM3: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
1424entry:
1425  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
1426  %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
1427  ret <2 x float> %vmulx2.i
1428}
1429
1430define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) {
1431; CHECK-LABEL: test_vmulxq_lane_f32:
1432; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
1433; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
1434; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
1435; EXYNOSM3: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
1436entry:
1437  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
1438  %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
1439  ret <4 x float> %vmulx2.i
1440}
1441
1442define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
1443; CHECK-LABEL: test_vmulxq_lane_f64:
1444; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
1445; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
1446; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
1447; EXYNOSM3: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
1448entry:
1449  %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
1450  %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
1451  ret <2 x double> %vmulx2.i
1452}
1453
1454define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) {
1455; CHECK-LABEL: test_vmulx_laneq_f32:
1456; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
1457; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
1458; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
1459; EXYNOSM3: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
1460entry:
1461  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
1462  %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
1463  ret <2 x float> %vmulx2.i
1464}
1465
1466define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) {
1467; CHECK-LABEL: test_vmulxq_laneq_f32:
1468; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
1469; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
1470; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
1471; EXYNOSM3: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
1472entry:
1473  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
1474  %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
1475  ret <4 x float> %vmulx2.i
1476}
1477
1478define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
1479; CHECK-LABEL: test_vmulxq_laneq_f64:
1480; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
1481; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
1482; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
1483; EXYNOSM3: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
1484entry:
1485  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
1486  %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
1487  ret <2 x double> %vmulx2.i
1488}
1489
1490define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
1491; CHECK-LABEL: test_vmla_lane_s16_0:
1492; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
1493entry:
1494  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
1495  %mul = mul <4 x i16> %shuffle, %b
1496  %add = add <4 x i16> %mul, %a
1497  ret <4 x i16> %add
1498}
1499
1500define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
1501; CHECK-LABEL: test_vmlaq_lane_s16_0:
1502; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
1503entry:
1504  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
1505  %mul = mul <8 x i16> %shuffle, %b
1506  %add = add <8 x i16> %mul, %a
1507  ret <8 x i16> %add
1508}
1509
1510define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
1511; CHECK-LABEL: test_vmla_lane_s32_0:
1512; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1513entry:
1514  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
1515  %mul = mul <2 x i32> %shuffle, %b
1516  %add = add <2 x i32> %mul, %a
1517  ret <2 x i32> %add
1518}
1519
1520define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
1521; CHECK-LABEL: test_vmlaq_lane_s32_0:
1522; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1523entry:
1524  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
1525  %mul = mul <4 x i32> %shuffle, %b
1526  %add = add <4 x i32> %mul, %a
1527  ret <4 x i32> %add
1528}
1529
1530define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
1531; CHECK-LABEL: test_vmla_laneq_s16_0:
1532; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
1533entry:
1534  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
1535  %mul = mul <4 x i16> %shuffle, %b
1536  %add = add <4 x i16> %mul, %a
1537  ret <4 x i16> %add
1538}
1539
1540define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
1541; CHECK-LABEL: test_vmlaq_laneq_s16_0:
1542; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
1543entry:
1544  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
1545  %mul = mul <8 x i16> %shuffle, %b
1546  %add = add <8 x i16> %mul, %a
1547  ret <8 x i16> %add
1548}
1549
1550define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
1551; CHECK-LABEL: test_vmla_laneq_s32_0:
1552; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1553entry:
1554  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
1555  %mul = mul <2 x i32> %shuffle, %b
1556  %add = add <2 x i32> %mul, %a
1557  ret <2 x i32> %add
1558}
1559
1560define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
1561; CHECK-LABEL: test_vmlaq_laneq_s32_0:
1562; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1563entry:
1564  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
1565  %mul = mul <4 x i32> %shuffle, %b
1566  %add = add <4 x i32> %mul, %a
1567  ret <4 x i32> %add
1568}
1569
1570define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
1571; CHECK-LABEL: test_vmls_lane_s16_0:
1572; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
1573entry:
1574  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
1575  %mul = mul <4 x i16> %shuffle, %b
1576  %sub = sub <4 x i16> %a, %mul
1577  ret <4 x i16> %sub
1578}
1579
1580define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
1581; CHECK-LABEL: test_vmlsq_lane_s16_0:
1582; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
1583entry:
1584  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
1585  %mul = mul <8 x i16> %shuffle, %b
1586  %sub = sub <8 x i16> %a, %mul
1587  ret <8 x i16> %sub
1588}
1589
1590define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
1591; CHECK-LABEL: test_vmls_lane_s32_0:
1592; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1593entry:
1594  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
1595  %mul = mul <2 x i32> %shuffle, %b
1596  %sub = sub <2 x i32> %a, %mul
1597  ret <2 x i32> %sub
1598}
1599
1600define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
1601; CHECK-LABEL: test_vmlsq_lane_s32_0:
1602; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1603entry:
1604  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
1605  %mul = mul <4 x i32> %shuffle, %b
1606  %sub = sub <4 x i32> %a, %mul
1607  ret <4 x i32> %sub
1608}
1609
1610define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
1611; CHECK-LABEL: test_vmls_laneq_s16_0:
1612; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
1613entry:
1614  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
1615  %mul = mul <4 x i16> %shuffle, %b
1616  %sub = sub <4 x i16> %a, %mul
1617  ret <4 x i16> %sub
1618}
1619
1620define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
1621; CHECK-LABEL: test_vmlsq_laneq_s16_0:
1622; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
1623entry:
1624  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
1625  %mul = mul <8 x i16> %shuffle, %b
1626  %sub = sub <8 x i16> %a, %mul
1627  ret <8 x i16> %sub
1628}
1629
1630define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
1631; CHECK-LABEL: test_vmls_laneq_s32_0:
1632; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1633entry:
1634  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
1635  %mul = mul <2 x i32> %shuffle, %b
1636  %sub = sub <2 x i32> %a, %mul
1637  ret <2 x i32> %sub
1638}
1639
1640define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
1641; CHECK-LABEL: test_vmlsq_laneq_s32_0:
1642; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1643entry:
1644  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
1645  %mul = mul <4 x i32> %shuffle, %b
1646  %sub = sub <4 x i32> %a, %mul
1647  ret <4 x i32> %sub
1648}
1649
1650define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
1651; CHECK-LABEL: test_vmul_lane_s16_0:
1652; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
1653entry:
1654  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
1655  %mul = mul <4 x i16> %shuffle, %a
1656  ret <4 x i16> %mul
1657}
1658
1659define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
1660; CHECK-LABEL: test_vmulq_lane_s16_0:
1661; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
1662entry:
1663  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
1664  %mul = mul <8 x i16> %shuffle, %a
1665  ret <8 x i16> %mul
1666}
1667
1668define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
1669; CHECK-LABEL: test_vmul_lane_s32_0:
1670; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1671entry:
1672  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
1673  %mul = mul <2 x i32> %shuffle, %a
1674  ret <2 x i32> %mul
1675}
1676
1677define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
1678; CHECK-LABEL: test_vmulq_lane_s32_0:
1679; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1680entry:
1681  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
1682  %mul = mul <4 x i32> %shuffle, %a
1683  ret <4 x i32> %mul
1684}
1685
1686define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
1687; CHECK-LABEL: test_vmul_lane_u16_0:
1688; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
1689entry:
1690  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
1691  %mul = mul <4 x i16> %shuffle, %a
1692  ret <4 x i16> %mul
1693}
1694
1695define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
1696; CHECK-LABEL: test_vmulq_lane_u16_0:
1697; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
1698entry:
1699  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
1700  %mul = mul <8 x i16> %shuffle, %a
1701  ret <8 x i16> %mul
1702}
1703
1704define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
1705; CHECK-LABEL: test_vmul_lane_u32_0:
1706; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1707entry:
1708  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
1709  %mul = mul <2 x i32> %shuffle, %a
1710  ret <2 x i32> %mul
1711}
1712
1713define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
1714; CHECK-LABEL: test_vmulq_lane_u32_0:
1715; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1716entry:
1717  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
1718  %mul = mul <4 x i32> %shuffle, %a
1719  ret <4 x i32> %mul
1720}
1721
1722define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
1723; CHECK-LABEL: test_vmul_laneq_s16_0:
1724; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
1725entry:
1726  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
1727  %mul = mul <4 x i16> %shuffle, %a
1728  ret <4 x i16> %mul
1729}
1730
1731define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
1732; CHECK-LABEL: test_vmulq_laneq_s16_0:
1733; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
1734entry:
1735  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
1736  %mul = mul <8 x i16> %shuffle, %a
1737  ret <8 x i16> %mul
1738}
1739
1740define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
1741; CHECK-LABEL: test_vmul_laneq_s32_0:
1742; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1743entry:
1744  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
1745  %mul = mul <2 x i32> %shuffle, %a
1746  ret <2 x i32> %mul
1747}
1748
1749define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
1750; CHECK-LABEL: test_vmulq_laneq_s32_0:
1751; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1752entry:
1753  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
1754  %mul = mul <4 x i32> %shuffle, %a
1755  ret <4 x i32> %mul
1756}
1757
1758define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
1759; CHECK-LABEL: test_vmul_laneq_u16_0:
1760; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
1761entry:
1762  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
1763  %mul = mul <4 x i16> %shuffle, %a
1764  ret <4 x i16> %mul
1765}
1766
1767define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
1768; CHECK-LABEL: test_vmulq_laneq_u16_0:
1769; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
1770entry:
1771  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
1772  %mul = mul <8 x i16> %shuffle, %a
1773  ret <8 x i16> %mul
1774}
1775
1776define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
1777; CHECK-LABEL: test_vmul_laneq_u32_0:
1778; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1779entry:
1780  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
1781  %mul = mul <2 x i32> %shuffle, %a
1782  ret <2 x i32> %mul
1783}
1784
1785define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
1786; CHECK-LABEL: test_vmulq_laneq_u32_0:
1787; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1788entry:
1789  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
1790  %mul = mul <4 x i32> %shuffle, %a
1791  ret <4 x i32> %mul
1792}
1793
1794define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
1795; CHECK-LABEL: test_vfma_lane_f32_0:
1796; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1797; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
1798; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
1799; EXYNOSM3: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1800entry:
1801  %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
1802  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
1803  ret <2 x float> %0
1804}
1805
1806define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
1807; CHECK-LABEL: test_vfmaq_lane_f32_0:
1808; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1809; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
1810; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
1811; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1812entry:
1813  %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
1814  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
1815  ret <4 x float> %0
1816}
1817
1818define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
1819; CHECK-LABEL: test_vfma_laneq_f32_0:
1820; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1821; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
1822; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
1823; EXYNOSM3: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1824entry:
1825  %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
1826  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
1827  ret <2 x float> %0
1828}
1829
1830define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
1831; CHECK-LABEL: test_vfmaq_laneq_f32_0:
1832; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1833; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
1834; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
1835; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1836entry:
1837  %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
1838  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
1839  ret <4 x float> %0
1840}
1841
1842define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
1843; CHECK-LABEL: test_vfms_lane_f32_0:
1844; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1845; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
1846; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
1847; EXYNOSM3: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1848entry:
1849  %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
1850  %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer
1851  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
1852  ret <2 x float> %0
1853}
1854
1855define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
1856; CHECK-LABEL: test_vfmsq_lane_f32_0:
1857; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1858; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
1859; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
1860; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1861entry:
1862  %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
1863  %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer
1864  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
1865  ret <4 x float> %0
1866}
1867
1868define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
1869; CHECK-LABEL: test_vfms_laneq_f32_0:
1870; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1871; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
1872; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
1873; EXYNOSM3: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1874entry:
1875  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
1876  %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer
1877  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
1878  ret <2 x float> %0
1879}
1880
1881define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
1882; CHECK-LABEL: test_vfmsq_laneq_f32_0:
1883; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1884; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
1885; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
1886; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1887entry:
1888  %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
1889  %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer
1890  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
1891  ret <4 x float> %0
1892}
1893
1894define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
1895; CHECK-LABEL: test_vfmaq_laneq_f64_0:
1896; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
1897; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
1898; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
1899; EXYNOSM3: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
1900entry:
1901  %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
1902  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
1903  ret <2 x double> %0
1904}
1905
1906define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
1907; CHECK-LABEL: test_vfmsq_laneq_f64_0:
1908; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
1909; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
1910; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
1911; EXYNOSM3: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
1912entry:
1913  %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
1914  %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer
1915  %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
1916  ret <2 x double> %0
1917}
1918
1919define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
1920; CHECK-LABEL: test_vmlal_lane_s16_0:
1921; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
1922entry:
1923  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
1924  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
1925  %add = add <4 x i32> %vmull2.i, %a
1926  ret <4 x i32> %add
1927}
1928
1929define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
1930; CHECK-LABEL: test_vmlal_lane_s32_0:
1931; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1932entry:
1933  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
1934  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
1935  %add = add <2 x i64> %vmull2.i, %a
1936  ret <2 x i64> %add
1937}
1938
1939define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
1940; CHECK-LABEL: test_vmlal_laneq_s16_0:
1941; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
1942entry:
1943  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
1944  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
1945  %add = add <4 x i32> %vmull2.i, %a
1946  ret <4 x i32> %add
1947}
1948
1949define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
1950; CHECK-LABEL: test_vmlal_laneq_s32_0:
1951; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
1952entry:
1953  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
1954  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
1955  %add = add <2 x i64> %vmull2.i, %a
1956  ret <2 x i64> %add
1957}
1958
1959define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
1960; CHECK-LABEL: test_vmlal_high_lane_s16_0:
1961; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
1962entry:
1963  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1964  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
1965  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1966  %add = add <4 x i32> %vmull2.i, %a
1967  ret <4 x i32> %add
1968}
1969
1970define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
1971; CHECK-LABEL: test_vmlal_high_lane_s32_0:
1972; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1973entry:
1974  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1975  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
1976  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1977  %add = add <2 x i64> %vmull2.i, %a
1978  ret <2 x i64> %add
1979}
1980
1981define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
1982; CHECK-LABEL: test_vmlal_high_laneq_s16_0:
1983; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
1984entry:
1985  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1986  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
1987  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
1988  %add = add <4 x i32> %vmull2.i, %a
1989  ret <4 x i32> %add
1990}
1991
1992define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
1993; CHECK-LABEL: test_vmlal_high_laneq_s32_0:
1994; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
1995entry:
1996  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1997  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
1998  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
1999  %add = add <2 x i64> %vmull2.i, %a
2000  ret <2 x i64> %add
2001}
2002
2003define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
2004; CHECK-LABEL: test_vmlsl_lane_s16_0:
2005; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
2006entry:
2007  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2008  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2009  %sub = sub <4 x i32> %a, %vmull2.i
2010  ret <4 x i32> %sub
2011}
2012
2013define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
2014; CHECK-LABEL: test_vmlsl_lane_s32_0:
2015; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2016entry:
2017  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2018  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2019  %sub = sub <2 x i64> %a, %vmull2.i
2020  ret <2 x i64> %sub
2021}
2022
2023define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
2024; CHECK-LABEL: test_vmlsl_laneq_s16_0:
2025; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
2026entry:
2027  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2028  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2029  %sub = sub <4 x i32> %a, %vmull2.i
2030  ret <4 x i32> %sub
2031}
2032
2033define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
2034; CHECK-LABEL: test_vmlsl_laneq_s32_0:
2035; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2036entry:
2037  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2038  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2039  %sub = sub <2 x i64> %a, %vmull2.i
2040  ret <2 x i64> %sub
2041}
2042
2043define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
2044; CHECK-LABEL: test_vmlsl_high_lane_s16_0:
2045; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
2046entry:
2047  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2048  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2049  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2050  %sub = sub <4 x i32> %a, %vmull2.i
2051  ret <4 x i32> %sub
2052}
2053
2054define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
2055; CHECK-LABEL: test_vmlsl_high_lane_s32_0:
2056; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2057entry:
2058  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2059  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2060  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2061  %sub = sub <2 x i64> %a, %vmull2.i
2062  ret <2 x i64> %sub
2063}
2064
2065define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
2066; CHECK-LABEL: test_vmlsl_high_laneq_s16_0:
2067; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
2068entry:
2069  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2070  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2071  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2072  %sub = sub <4 x i32> %a, %vmull2.i
2073  ret <4 x i32> %sub
2074}
2075
2076define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
2077; CHECK-LABEL: test_vmlsl_high_laneq_s32_0:
2078; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2079entry:
2080  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2081  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2082  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2083  %sub = sub <2 x i64> %a, %vmull2.i
2084  ret <2 x i64> %sub
2085}
2086
2087define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
2088; CHECK-LABEL: test_vmlal_lane_u16_0:
2089; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
2090entry:
2091  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2092  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2093  %add = add <4 x i32> %vmull2.i, %a
2094  ret <4 x i32> %add
2095}
2096
2097define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
2098; CHECK-LABEL: test_vmlal_lane_u32_0:
2099; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2100entry:
2101  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2102  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2103  %add = add <2 x i64> %vmull2.i, %a
2104  ret <2 x i64> %add
2105}
2106
2107define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
2108; CHECK-LABEL: test_vmlal_laneq_u16_0:
2109; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
2110entry:
2111  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2112  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2113  %add = add <4 x i32> %vmull2.i, %a
2114  ret <4 x i32> %add
2115}
2116
2117define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
2118; CHECK-LABEL: test_vmlal_laneq_u32_0:
2119; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2120entry:
2121  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2122  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2123  %add = add <2 x i64> %vmull2.i, %a
2124  ret <2 x i64> %add
2125}
2126
2127define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
2128; CHECK-LABEL: test_vmlal_high_lane_u16_0:
2129; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
2130entry:
2131  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2132  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2133  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2134  %add = add <4 x i32> %vmull2.i, %a
2135  ret <4 x i32> %add
2136}
2137
2138define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
2139; CHECK-LABEL: test_vmlal_high_lane_u32_0:
2140; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2141entry:
2142  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2143  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2144  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2145  %add = add <2 x i64> %vmull2.i, %a
2146  ret <2 x i64> %add
2147}
2148
2149define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
2150; CHECK-LABEL: test_vmlal_high_laneq_u16_0:
2151; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
2152entry:
2153  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2154  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2155  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2156  %add = add <4 x i32> %vmull2.i, %a
2157  ret <4 x i32> %add
2158}
2159
2160define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
2161; CHECK-LABEL: test_vmlal_high_laneq_u32_0:
2162; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2163entry:
2164  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2165  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2166  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2167  %add = add <2 x i64> %vmull2.i, %a
2168  ret <2 x i64> %add
2169}
2170
2171define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
2172; CHECK-LABEL: test_vmlsl_lane_u16_0:
2173; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
2174entry:
2175  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2176  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2177  %sub = sub <4 x i32> %a, %vmull2.i
2178  ret <4 x i32> %sub
2179}
2180
2181define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
2182; CHECK-LABEL: test_vmlsl_lane_u32_0:
2183; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2184entry:
2185  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2186  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2187  %sub = sub <2 x i64> %a, %vmull2.i
2188  ret <2 x i64> %sub
2189}
2190
2191define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
2192; CHECK-LABEL: test_vmlsl_laneq_u16_0:
2193; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
2194entry:
2195  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2196  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2197  %sub = sub <4 x i32> %a, %vmull2.i
2198  ret <4 x i32> %sub
2199}
2200
2201define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
2202; CHECK-LABEL: test_vmlsl_laneq_u32_0:
2203; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2204entry:
2205  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2206  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2207  %sub = sub <2 x i64> %a, %vmull2.i
2208  ret <2 x i64> %sub
2209}
2210
2211define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
2212; CHECK-LABEL: test_vmlsl_high_lane_u16_0:
2213; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
2214entry:
2215  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2216  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2217  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2218  %sub = sub <4 x i32> %a, %vmull2.i
2219  ret <4 x i32> %sub
2220}
2221
2222define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
2223; CHECK-LABEL: test_vmlsl_high_lane_u32_0:
2224; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2225entry:
2226  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2227  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2228  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2229  %sub = sub <2 x i64> %a, %vmull2.i
2230  ret <2 x i64> %sub
2231}
2232
2233define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
2234; CHECK-LABEL: test_vmlsl_high_laneq_u16_0:
2235; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
2236entry:
2237  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2238  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2239  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2240  %sub = sub <4 x i32> %a, %vmull2.i
2241  ret <4 x i32> %sub
2242}
2243
2244define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
2245; CHECK-LABEL: test_vmlsl_high_laneq_u32_0:
2246; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2247entry:
2248  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2249  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2250  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2251  %sub = sub <2 x i64> %a, %vmull2.i
2252  ret <2 x i64> %sub
2253}
2254
2255define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
2256; CHECK-LABEL: test_vmull_lane_s16_0:
2257; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
2258entry:
2259  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2260  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
2261  ret <4 x i32> %vmull2.i
2262}
2263
2264define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
2265; CHECK-LABEL: test_vmull_lane_s32_0:
2266; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2267entry:
2268  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2269  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
2270  ret <2 x i64> %vmull2.i
2271}
2272
2273define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
2274; CHECK-LABEL: test_vmull_lane_u16_0:
2275; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
2276entry:
2277  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2278  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
2279  ret <4 x i32> %vmull2.i
2280}
2281
2282define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
2283; CHECK-LABEL: test_vmull_lane_u32_0:
2284; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2285entry:
2286  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2287  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
2288  ret <2 x i64> %vmull2.i
2289}
2290
2291define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
2292; CHECK-LABEL: test_vmull_high_lane_s16_0:
2293; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
2294entry:
2295  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2296  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2297  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2298  ret <4 x i32> %vmull2.i
2299}
2300
2301define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
2302; CHECK-LABEL: test_vmull_high_lane_s32_0:
2303; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2304entry:
2305  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2306  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2307  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2308  ret <2 x i64> %vmull2.i
2309}
2310
2311define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
2312; CHECK-LABEL: test_vmull_high_lane_u16_0:
2313; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
2314entry:
2315  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2316  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2317  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2318  ret <4 x i32> %vmull2.i
2319}
2320
2321define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
2322; CHECK-LABEL: test_vmull_high_lane_u32_0:
2323; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2324entry:
2325  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2326  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2327  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2328  ret <2 x i64> %vmull2.i
2329}
2330
2331define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
2332; CHECK-LABEL: test_vmull_laneq_s16_0:
2333; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
2334entry:
2335  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2336  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
2337  ret <4 x i32> %vmull2.i
2338}
2339
2340define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
2341; CHECK-LABEL: test_vmull_laneq_s32_0:
2342; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2343entry:
2344  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2345  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
2346  ret <2 x i64> %vmull2.i
2347}
2348
2349define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
2350; CHECK-LABEL: test_vmull_laneq_u16_0:
2351; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
2352entry:
2353  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2354  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
2355  ret <4 x i32> %vmull2.i
2356}
2357
2358define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
2359; CHECK-LABEL: test_vmull_laneq_u32_0:
2360; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2361entry:
2362  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2363  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
2364  ret <2 x i64> %vmull2.i
2365}
2366
2367define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
2368; CHECK-LABEL: test_vmull_high_laneq_s16_0:
2369; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
2370entry:
2371  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2372  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2373  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2374  ret <4 x i32> %vmull2.i
2375}
2376
2377define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
2378; CHECK-LABEL: test_vmull_high_laneq_s32_0:
2379; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2380entry:
2381  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2382  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2383  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2384  ret <2 x i64> %vmull2.i
2385}
2386
2387define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
2388; CHECK-LABEL: test_vmull_high_laneq_u16_0:
2389; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
2390entry:
2391  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2392  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2393  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2394  ret <4 x i32> %vmull2.i
2395}
2396
2397define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
2398; CHECK-LABEL: test_vmull_high_laneq_u32_0:
2399; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2400entry:
2401  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2402  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2403  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2404  ret <2 x i64> %vmull2.i
2405}
2406
2407define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
2408; CHECK-LABEL: test_vqdmlal_lane_s16_0:
2409; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
2410entry:
2411  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2412  %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2413  %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
2414  ret <4 x i32> %vqdmlal4.i
2415}
2416
2417define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
2418; CHECK-LABEL: test_vqdmlal_lane_s32_0:
2419; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2420entry:
2421  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2422  %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2423  %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
2424  ret <2 x i64> %vqdmlal4.i
2425}
2426
2427define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
2428; CHECK-LABEL: test_vqdmlal_high_lane_s16_0:
2429; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
2430entry:
2431  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2432  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2433  %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2434  %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
2435  ret <4 x i32> %vqdmlal4.i
2436}
2437
2438define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
2439; CHECK-LABEL: test_vqdmlal_high_lane_s32_0:
2440; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2441entry:
2442  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2443  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2444  %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2445  %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
2446  ret <2 x i64> %vqdmlal4.i
2447}
2448
2449define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
2450; CHECK-LABEL: test_vqdmlsl_lane_s16_0:
2451; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
2452entry:
2453  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2454  %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
2455  %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
2456  ret <4 x i32> %vqdmlsl4.i
2457}
2458
2459define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
2460; CHECK-LABEL: test_vqdmlsl_lane_s32_0:
2461; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2462entry:
2463  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2464  %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
2465  %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
2466  ret <2 x i64> %vqdmlsl4.i
2467}
2468
2469define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
2470; CHECK-LABEL: test_vqdmlsl_high_lane_s16_0:
2471; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
2472entry:
2473  %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2474  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2475  %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2476  %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
2477  ret <4 x i32> %vqdmlsl4.i
2478}
2479
2480define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
2481; CHECK-LABEL: test_vqdmlsl_high_lane_s32_0:
2482; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2483entry:
2484  %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2485  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2486  %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2487  %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
2488  ret <2 x i64> %vqdmlsl4.i
2489}
2490
2491define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
2492; CHECK-LABEL: test_vqdmull_lane_s16_0:
2493; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
2494entry:
2495  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2496  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
2497  ret <4 x i32> %vqdmull2.i
2498}
2499
2500define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
2501; CHECK-LABEL: test_vqdmull_lane_s32_0:
2502; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2503entry:
2504  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2505  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
2506  ret <2 x i64> %vqdmull2.i
2507}
2508
2509define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
2510; CHECK-LABEL: test_vqdmull_laneq_s16_0:
2511; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
2512entry:
2513  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2514  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
2515  ret <4 x i32> %vqdmull2.i
2516}
2517
2518define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
2519; CHECK-LABEL: test_vqdmull_laneq_s32_0:
2520; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2521entry:
2522  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2523  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
2524  ret <2 x i64> %vqdmull2.i
2525}
2526
2527define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
2528; CHECK-LABEL: test_vqdmull_high_lane_s16_0:
2529; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
2530entry:
2531  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2532  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2533  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2534  ret <4 x i32> %vqdmull2.i
2535}
2536
2537define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
2538; CHECK-LABEL: test_vqdmull_high_lane_s32_0:
2539; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2540entry:
2541  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2542  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2543  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2544  ret <2 x i64> %vqdmull2.i
2545}
2546
2547define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
2548; CHECK-LABEL: test_vqdmull_high_laneq_s16_0:
2549; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
2550entry:
2551  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2552  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
2553  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
2554  ret <4 x i32> %vqdmull2.i
2555}
2556
2557define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
2558; CHECK-LABEL: test_vqdmull_high_laneq_s32_0:
2559; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2560entry:
2561  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
2562  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
2563  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
2564  ret <2 x i64> %vqdmull2.i
2565}
2566
2567define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
2568; CHECK-LABEL: test_vqdmulh_lane_s16_0:
2569; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
2570entry:
2571  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2572  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
2573  ret <4 x i16> %vqdmulh2.i
2574}
2575
2576define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
2577; CHECK-LABEL: test_vqdmulhq_lane_s16_0:
2578; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
2579entry:
2580  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
2581  %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
2582  ret <8 x i16> %vqdmulh2.i
2583}
2584
2585define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
2586; CHECK-LABEL: test_vqdmulh_lane_s32_0:
2587; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2588entry:
2589  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2590  %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
2591  ret <2 x i32> %vqdmulh2.i
2592}
2593
2594define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
2595; CHECK-LABEL: test_vqdmulhq_lane_s32_0:
2596; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2597entry:
2598  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
2599  %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
2600  ret <4 x i32> %vqdmulh2.i
2601}
2602
2603define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
2604; CHECK-LABEL: test_vqrdmulh_lane_s16_0:
2605; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
2606entry:
2607  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
2608  %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
2609  ret <4 x i16> %vqrdmulh2.i
2610}
2611
2612define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
2613; CHECK-LABEL: test_vqrdmulhq_lane_s16_0:
2614; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
2615entry:
2616  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
2617  %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
2618  ret <8 x i16> %vqrdmulh2.i
2619}
2620
2621define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
2622; CHECK-LABEL: test_vqrdmulh_lane_s32_0:
2623; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2624entry:
2625  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
2626  %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
2627  ret <2 x i32> %vqrdmulh2.i
2628}
2629
2630define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
2631; CHECK-LABEL: test_vqrdmulhq_lane_s32_0:
2632; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2633entry:
2634  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
2635  %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
2636  ret <4 x i32> %vqrdmulh2.i
2637}
2638
2639define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) {
2640; CHECK-LABEL: test_vmul_lane_f32_0:
2641; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2642; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
2643; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
2644; EXYNOSM3: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2645entry:
2646  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
2647  %mul = fmul <2 x float> %shuffle, %a
2648  ret <2 x float> %mul
2649}
2650
2651define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
2652; CHECK-LABEL: test_vmulq_lane_f32_0:
2653; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2654; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
2655; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
2656; EXYNOSM3: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2657entry:
2658  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
2659  %mul = fmul <4 x float> %shuffle, %a
2660  ret <4 x float> %mul
2661}
2662
2663define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
2664; CHECK-LABEL: test_vmul_laneq_f32_0:
2665; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2666; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
2667; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
2668; EXYNOSM3: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2669entry:
2670  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
2671  %mul = fmul <2 x float> %shuffle, %a
2672  ret <2 x float> %mul
2673}
2674
2675define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
2676; CHECK-LABEL: test_vmul_laneq_f64_0:
2677; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
2678entry:
2679  %0 = bitcast <1 x double> %a to <8 x i8>
2680  %1 = bitcast <8 x i8> %0 to double
2681  %extract = extractelement <2 x double> %v, i32 0
2682  %2 = fmul double %1, %extract
2683  %3 = insertelement <1 x double> undef, double %2, i32 0
2684  ret <1 x double> %3
2685}
2686
2687define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
2688; CHECK-LABEL: test_vmulq_laneq_f32_0:
2689; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2690; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
2691; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
2692; EXYNOSM3: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2693entry:
2694  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
2695  %mul = fmul <4 x float> %shuffle, %a
2696  ret <4 x float> %mul
2697}
2698
2699define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
2700; CHECK-LABEL: test_vmulq_laneq_f64_0:
2701; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
2702; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
2703; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
2704; EXYNOSM3: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
2705entry:
2706  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
2707  %mul = fmul <2 x double> %shuffle, %a
2708  ret <2 x double> %mul
2709}
2710
2711define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) {
2712; CHECK-LABEL: test_vmulx_lane_f32_0:
2713; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2714; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
2715; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
2716; EXYNOSM3: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2717entry:
2718  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
2719  %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
2720  ret <2 x float> %vmulx2.i
2721}
2722
2723define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
2724; CHECK-LABEL: test_vmulxq_lane_f32_0:
2725; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2726; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
2727; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
2728; EXYNOSM3: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2729entry:
2730  %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
2731  %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
2732  ret <4 x float> %vmulx2.i
2733}
2734
2735define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) {
2736; CHECK-LABEL: test_vmulxq_lane_f64_0:
2737; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
2738; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
2739; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
2740; EXYNOSM3: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
2741entry:
2742  %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
2743  %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
2744  ret <2 x double> %vmulx2.i
2745}
2746
2747define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
2748; CHECK-LABEL: test_vmulx_laneq_f32_0:
2749; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2750; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
2751; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
2752; EXYNOSM3: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
2753entry:
2754  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
2755  %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
2756  ret <2 x float> %vmulx2.i
2757}
2758
2759define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
2760; CHECK-LABEL: test_vmulxq_laneq_f32_0:
2761; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2762; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
2763; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
2764; EXYNOSM3: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
2765entry:
2766  %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
2767  %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
2768  ret <4 x float> %vmulx2.i
2769}
2770
2771define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
2772; CHECK-LABEL: test_vmulxq_laneq_f64_0:
2773; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
2774; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
2775; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
2776; EXYNOSM3: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
2777entry:
2778  %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
2779  %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
2780  ret <2 x double> %vmulx2.i
2781}
2782
2783define <4 x float> @optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
2784; CHECK-LABEL: optimize_dup:
2785; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
2786; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
2787; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
2788; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
2789; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
2790; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
2791; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
2792entry:
2793  %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
2794  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
2795  %lane2 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
2796	%1 = fmul <4 x float> %lane2, %c
2797	%s = fsub <4 x float> %0, %1
2798  ret <4 x float> %s
2799}
2800
2801define <4 x float> @no_optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {
2802; CHECK-LABEL: no_optimize_dup:
2803; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
2804; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
2805; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
2806; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
2807; EXYNOSM1: dup  [[W:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
2808; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[W]].4s
2809; EXYNOSM3: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
2810; EXYNOSM3: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
2811entry:
2812  %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
2813  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a)
2814  %lane2 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2815	%1 = fmul <4 x float> %lane2, %c
2816	%s = fsub <4 x float> %0, %1
2817  ret <4 x float> %s
2818}
2819
2820define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_a57(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="cortex-a57" {
2821; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_a57:
2822; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
2823entry:
2824  %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
2825  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
2826  ret <2 x float> %0
2827}
2828
2829define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_m1(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="exynos-m1" {
2830; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m1:
2831; GENERIC: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
2832; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
2833entry:
2834  %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
2835  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
2836  ret <2 x float> %0
2837}
2838
2839define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_m3(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="exynos-m3" {
2840; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m3:
2841; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
2842entry:
2843  %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
2844  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
2845  ret <2 x float> %0
2846}
2847