• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
; Each invocation enables the bare CHECK prefix in addition to its
; configuration-specific prefix. Without it, FileCheck ignores every
; "CHECK-LABEL:" line in this file, so the per-function checks are not anchored
; to the function they sit in.
;
; CHECK-V8a        - no RDM extension, generic NEON syntax
; CHECK-V81a       - RDM available (+rdm, +v8.1a, falkor, saphira), generic NEON syntax
; CHECK-V81a-apple - +v8.1a, Apple NEON syntax
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -aarch64-neon-syntax=generic | FileCheck %s --check-prefixes=CHECK,CHECK-V8a
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+rdm -aarch64-neon-syntax=generic | FileCheck %s --check-prefixes=CHECK,CHECK-V81a
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mcpu=falkor -aarch64-neon-syntax=generic | FileCheck %s --check-prefixes=CHECK,CHECK-V81a
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefixes=CHECK,CHECK-V81a
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mcpu=saphira -aarch64-neon-syntax=generic | FileCheck %s --check-prefixes=CHECK,CHECK-V81a
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-eabi -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-V81a-apple
7
; sqrdmulh intrinsics: the multiply step used by the tests in this file.
declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)

; sqadd intrinsics: the accumulate step of the sqrdmlah tests.
declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)

; sqsub intrinsics: the accumulate step of the sqrdmlsh tests.
declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)
28
29;-----------------------------------------------------------------------------
30; RDMA Vector
31; test for SIMDThreeSameVectorSQRDMLxHTiedHS
32
; sqadd(%acc, sqrdmulh(%mhs, %rhs)) on <4 x i16>.
; CHECK-V8a expects a standalone sqrdmulh; the CHECK-V81a prefixes expect sqrdmlah.
define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v4i16:
; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.4h
; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2
  %mul = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
  %res = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %mul)
  ret <4 x i16> %res
}
42
; sqadd(%acc, sqrdmulh(%mhs, %rhs)) on <8 x i16>.
; CHECK-V8a expects a standalone sqrdmulh; the CHECK-V81a prefixes expect sqrdmlah.
define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v8i16:
; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.8h
; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
  %mul = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
  %res = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %mul)
  ret <8 x i16> %res
}
52
; sqadd(%acc, sqrdmulh(%mhs, %rhs)) on <2 x i32>.
; CHECK-V8a expects a standalone sqrdmulh; the CHECK-V81a prefixes expect sqrdmlah.
define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v2i32:
; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.2s
; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2
  %mul = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
  %res = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %mul)
  ret <2 x i32> %res
}
62
; sqadd(%acc, sqrdmulh(%mhs, %rhs)) on <4 x i32>.
; CHECK-V8a expects a standalone sqrdmulh; the CHECK-V81a prefixes expect sqrdmlah.
; The first check previously used the prefix "CHECK-V81", which no RUN line
; enables, so it was ignored. It now uses CHECK-V8a like the sibling tests.
define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_v4i32:
   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
   %retval =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.4s
; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.4s
; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2
   ret <4 x i32> %retval
}
72
; sqsub(%acc, sqrdmulh(%mhs, %rhs)) on <4 x i16>.
; CHECK-V8a expects a standalone sqrdmulh; the CHECK-V81a prefixes expect sqrdmlsh.
define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v4i16:
; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.4h
; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2
  %mul = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
  %res = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %mul)
  ret <4 x i16> %res
}
82
; sqsub(%acc, sqrdmulh(%mhs, %rhs)) on <8 x i16>.
; CHECK-V8a expects a standalone sqrdmulh; the CHECK-V81a prefixes expect sqrdmlsh.
define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v8i16:
; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.8h
; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2
  %mul = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
  %res = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %mul)
  ret <8 x i16> %res
}
92
; sqsub(%acc, sqrdmulh(%mhs, %rhs)) on <2 x i32>.
; CHECK-V8a expects a standalone sqrdmulh; the CHECK-V81a prefixes expect sqrdmlsh.
define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v2i32:
; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.2s
; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2
  %mul = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
  %res = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %mul)
  ret <2 x i32> %res
}
102
; sqsub(%acc, sqrdmulh(%mhs, %rhs)) on <4 x i32>.
; CHECK-V8a expects a standalone sqrdmulh; the CHECK-V81a prefixes expect sqrdmlsh.
define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_v4i32:
; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.4s
; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.4s
; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2
  %mul = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
  %res = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %mul)
  ret <4 x i32> %res
}
112
113;-----------------------------------------------------------------------------
114; RDMA Vector, by element
115; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
116
; sqadd(%acc, sqrdmulh(%x, splat(%v[3]))) on <4 x i16>.
; The checks expect the by-element operand form v2.h[3] / v2[3].
define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlah_lane_s16:
; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.h[3]
; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.h[3]
; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2[3]
entry:
  %splat = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %mul = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %splat)
  %res = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %mul)
  ret <4 x i16> %res
}
128
; sqadd(%acc, sqrdmulh(%x, splat(%v[2]))) on <8 x i16>.
; The checks expect the by-element operand form v2.h[2] / v2[2].
define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlahq_lane_s16:
; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.h[2]
; CHECK-V81a-apple: sqrdmlah.8h v0,    v1,    v2[2]
entry:
  %splat = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %mul = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %splat)
  %res = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %mul)
  ret <8 x i16> %res
}
140
; sqadd(%acc, sqrdmulh(%x, splat(%v[1]))) on <2 x i32>.
; The checks expect the by-element operand form v2.s[1] / v2[1].
define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlah_lane_s32:
; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.s[1]
; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2[1]
entry:
  %splat = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %mul = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %splat)
  %res = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %mul)
  ret <2 x i32> %res
}
152
; sqadd(%acc, sqrdmulh(%x, splat(%v[0]))) on <4 x i32>.
; The checks expect the by-element operand form v2.s[0] / v2[0].
define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc, <4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlahq_lane_s32:
; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.s[0]
; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2[0]
entry:
  %splat = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %mul = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %splat)
  %res = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %mul)
  ret <4 x i32> %res
}
164
; sqsub(%acc, sqrdmulh(%x, splat(%v[3]))) on <4 x i16>.
; The checks expect the by-element operand form v2.h[3] / v2[3].
define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlsh_lane_s16:
; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.h[3]
; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.h[3]
; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2[3]
entry:
  %splat = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %mul = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %splat)
  %res = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %mul)
  ret <4 x i16> %res
}
176
; sqsub(%acc, sqrdmulh(%x, splat(%v[2]))) on <8 x i16>.
; The checks expect the by-element operand form v2.h[2] / v2[2].
define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlshq_lane_s16:
; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.h[2]
; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2[2]
entry:
  %splat = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %mul = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %splat)
  %res = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %mul)
  ret <8 x i16> %res
}
188
; sqsub(%acc, sqrdmulh(%x, splat(%v[1]))) on <2 x i32>.
; The checks expect the by-element operand form v2.s[1] / v2[1].
define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlsh_lane_s32:
; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.s[1]
; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2[1]
entry:
  %splat = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
  %mul = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %splat)
  %res = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %mul)
  ret <2 x i32> %res
}
200
; sqsub(%acc, sqrdmulh(%x, splat(%v[0]))) on <4 x i32>.
; The checks expect the by-element operand form v2.s[0] / v2[0].
define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc, <4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlshq_lane_s32:
; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.s[0]
; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2[0]
entry:
  %splat = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %mul = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %splat)
  %res = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %mul)
  ret <4 x i32> %res
}
212
213;-----------------------------------------------------------------------------
214; RDMA Vector, by element, extracted
215; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
216; i32 tests are for   "def : Pat" in SIMDIndexedSQRDMLxHSDTied
217
; Scalar i16 accumulator placed in lane 0 of an undef <4 x i16>, combined via
; sqadd with sqrdmulh(%x, splat(%v[1])); lane 0 of the result is returned.
define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a:       sqrdmlah    {{v[2-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}},    v0,    v1[1]
entry:
  %splat = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %mul = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %splat)
  %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
  %res.v = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc.v, <4 x i16> %mul)
  %res = extractelement <4 x i16> %res.v, i64 0
  ret i16 %res
}
231
; Scalar i16 accumulator placed in lane 0 of an undef <8 x i16>, combined via
; sqadd with sqrdmulh(%x, splat(%v[1])); lane 0 of the result is returned.
define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a:       sqrdmlah    {{v[2-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}},    v0,    v1[1]
entry:
  %splat = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %mul = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %splat)
  %acc.v = insertelement <8 x i16> undef, i16 %acc, i64 0
  %res.v = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc.v, <8 x i16> %mul)
  %res = extractelement <8 x i16> %res.v, i64 0
  ret i16 %res
}
245
; Lane 0 of the <2 x i32> product sqrdmulh(%x, splat(%v[0])) is extracted and
; accumulated into the scalar %acc with the scalar sqadd.i32 intrinsic.
define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
; CHECK-V81a:       sqrdmlah    v2.2s, v0.2s, v1.s[0]
; CHECK-V81a-apple: sqrdmlah.2s v2,    v0,    v1[0]
entry:
  %splat = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
  %mul = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %splat)
  %mul.lane0 = extractelement <2 x i32> %mul, i64 0
  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %mul.lane0)
  ret i32 %res
}
258
; Lane 0 of the <4 x i32> product sqrdmulh(%x, splat(%v[0])) is extracted and
; accumulated into the scalar %acc with the scalar sqadd.i32 intrinsic.
define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc, <4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
; CHECK-V81a:       sqrdmlah    v2.4s, v0.4s, v1.s[0]
; CHECK-V81a-apple: sqrdmlah.4s v2,    v0,    v1[0]
entry:
  %splat = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %mul = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %splat)
  %mul.lane0 = extractelement <4 x i32> %mul, i64 0
  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %mul.lane0)
  ret i32 %res
}
271
; Scalar i16 accumulator placed in lane 0 of an undef <4 x i16>, combined via
; sqsub with sqrdmulh(%x, splat(%v[1])); lane 0 of the result is returned.
define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc, <4 x i16> %x, <4 x i16> %v) {
; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a:       sqrdmlsh    {{v[2-9]+}}.4h, v0.4h, v1.h[1]
; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}},    v0,    v1[1]
entry:
  %splat = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %mul = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %splat)
  %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
  %res.v = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc.v, <4 x i16> %mul)
  %res = extractelement <4 x i16> %res.v, i64 0
  ret i16 %res
}
285
; Scalar i16 accumulator placed in lane 0 of an undef <8 x i16>, combined via
; sqsub with sqrdmulh(%x, splat(%v[1])); lane 0 of the result is returned.
define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc, <8 x i16> %x, <8 x i16> %v) {
; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a:       sqrdmlsh    {{v[2-9]+}}.8h, v0.8h, v1.h[1]
; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}},    v0,    v1[1]
entry:
  %splat = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %mul = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %splat)
  %acc.v = insertelement <8 x i16> undef, i16 %acc, i64 0
  %res.v = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc.v, <8 x i16> %mul)
  %res = extractelement <8 x i16> %res.v, i64 0
  ret i16 %res
}
299
; Lane 0 of the <2 x i32> product sqrdmulh(%x, splat(%v[0])) is extracted and
; subtracted from the scalar %acc with the scalar sqsub.i32 intrinsic.
define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc, <2 x i32> %x, <2 x i32> %v) {
; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
; CHECK-V81a:       sqrdmlsh    v2.2s, v0.2s, v1.s[0]
; CHECK-V81a-apple: sqrdmlsh.2s v2,    v0,    v1[0]
entry:
  %splat = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
  %mul = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %splat)
  %mul.lane0 = extractelement <2 x i32> %mul, i64 0
  %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %mul.lane0)
  ret i32 %res
}
312
; Lane 0 of the <4 x i32> product sqrdmulh(%x, splat(%v[0])) is extracted and
; subtracted from the scalar %acc with the scalar sqsub.i32 intrinsic.
define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc, <4 x i32> %x, <4 x i32> %v) {
; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
; CHECK-V81a:       sqrdmlsh    v2.4s, v0.4s, v1.s[0]
; CHECK-V81a-apple: sqrdmlsh.4s v2,    v0,    v1[0]
entry:
  %splat = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  %mul = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %splat)
  %mul.lane0 = extractelement <4 x i32> %mul, i64 0
  %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %mul.lane0)
  ret i32 %res
}
325
326;-----------------------------------------------------------------------------
327; RDMA Scalar
328; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstInfo.td
329
; All three i16 scalars are inserted into lane 0 of undef <4 x i16> vectors;
; sqadd(acc, sqrdmulh(x, y)) is computed on the vectors and lane 0 returned.
define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
; CHECK-LABEL: test_sqrdmlah_v1i16:
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
  %x.v = insertelement <4 x i16> undef, i16 %x, i64 0
  %y.v = insertelement <4 x i16> undef, i16 %y, i64 0
  %mul.v = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x.v, <4 x i16> %y.v)
  %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
  %res.v = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc.v, <4 x i16> %mul.v)
  %res = extractelement <4 x i16> %res.v, i64 0
  ret i16 %res
}
343
; All three i32 scalars are inserted into lane 0 of undef <4 x i32> vectors;
; sqadd(acc, sqrdmulh(x, y)) is computed on the vectors and lane 0 returned.
define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
; CHECK-LABEL: test_sqrdmlah_v1i32:
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
  %x.v = insertelement <4 x i32> undef, i32 %x, i64 0
  %y.v = insertelement <4 x i32> undef, i32 %y, i64 0
  %mul.v = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x.v, <4 x i32> %y.v)
  %acc.v = insertelement <4 x i32> undef, i32 %acc, i64 0
  %res.v = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc.v, <4 x i32> %mul.v)
  %res = extractelement <4 x i32> %res.v, i64 0
  ret i32 %res
}
357
358
; All three i16 scalars are inserted into lane 0 of undef <4 x i16> vectors;
; sqsub(acc, sqrdmulh(x, y)) is computed on the vectors and lane 0 returned.
define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
; CHECK-LABEL: test_sqrdmlsh_v1i16:
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
  %x.v = insertelement <4 x i16> undef, i16 %x, i64 0
  %y.v = insertelement <4 x i16> undef, i16 %y, i64 0
  %mul.v = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x.v, <4 x i16> %y.v)
  %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
  %res.v = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc.v, <4 x i16> %mul.v)
  %res = extractelement <4 x i16> %res.v, i64 0
  ret i16 %res
}
372
; All three i32 scalars are inserted into lane 0 of undef <4 x i32> vectors;
; sqsub(acc, sqrdmulh(x, y)) is computed on the vectors and lane 0 returned.
define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
; CHECK-LABEL: test_sqrdmlsh_v1i32:
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}},    {{v[0-9]+}},    {{v[0-9]+}}
  %x.v = insertelement <4 x i32> undef, i32 %x, i64 0
  %y.v = insertelement <4 x i32> undef, i32 %y, i64 0
  %mul.v = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x.v, <4 x i32> %y.v)
  %acc.v = insertelement <4 x i32> undef, i32 %acc, i64 0
  %res.v = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc.v, <4 x i32> %mul.v)
  %res = extractelement <4 x i32> %res.v, i64 0
  ret i32 %res
}
; Pure scalar form: sqadd.i32(%acc, sqrdmulh.i32(%mhs, %rhs)).
; The checks expect the instruction to operate on s registers.
define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
; CHECK-LABEL: test_sqrdmlah_i32:
; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a:       sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
  %mul = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %mul)
  ret i32 %res
}
395
; Pure scalar form: sqsub.i32(%acc, sqrdmulh.i32(%mhs, %rhs)).
; The checks expect the instruction to operate on s registers.
define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
; CHECK-LABEL: test_sqrdmlsh_i32:
; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a:       sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
  %mul = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
  %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %mul)
  ret i32 %res
}
405
406;-----------------------------------------------------------------------------
407; RDMA Scalar, by element
408; i16 tests are performed via tests in above chapter, with IR in ACLE style
409; i32 tests are for i32_indexed in SIMDIndexedSQRDMLxHSDTied
410
; Scalar %x and %acc are each inserted into lane 0 of an undef <4 x i16>; the
; multiplier is lane 1 of %y_vec, splatted. Lane 0 of the sqadd result is returned.
define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
; CHECK-LABEL: test_sqrdmlah_extract_i16:
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
; CHECK-V81a:       sqrdmlah    {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}},    {{v[0-9]+}}, v0[1]
  %splat = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %x.v = insertelement <4 x i16> undef, i16 %x, i64 0
  %mul = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x.v, <4 x i16> %splat)
  %acc.v = insertelement <4 x i16> undef, i16 %acc, i64 0
  %res.v = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc.v, <4 x i16> %mul)
  %res = extractelement <4 x i16> %res.v, i32 0
  ret i16 %res
}
424
; Scalar sqadd.i32(%acc, sqrdmulh.i32(%mhs, %rhs[3])).
; The checks expect a scalar instruction with the by-element operand v0.s[3] / v0[3].
define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlah_extract_i32:
; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a:       sqrdmlah   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
  %rhs.lane3 = extractelement <4 x i32> %rhs, i32 3
  %mul = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs.lane3)
  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %mul)
  ret i32 %res
}
435
; Scalar %x and %acc are each inserted into lane 0 of an undef <8 x i16>; the
; multiplier is lane 1 of %y_vec, splatted. Lane 0 of the sqsub result is returned.
define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
; CHECK-LABEL: test_sqrdmlshq_extract_i16:
; CHECK-V8a:        sqrdmulh    {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
; CHECK-V81a:       sqrdmlsh    {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}},    {{v[0-9]+}}, v0[1]
  %splat = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %x.v = insertelement <8 x i16> undef, i16 %x, i64 0
  %mul = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x.v, <8 x i16> %splat)
  %acc.v = insertelement <8 x i16> undef, i16 %acc, i64 0
  %res.v = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc.v, <8 x i16> %mul)
  %res = extractelement <8 x i16> %res.v, i32 0
  ret i16 %res
}
449
; Scalar sqsub.i32(%acc, sqrdmulh.i32(%mhs, %rhs[3])).
; The checks expect a scalar instruction with the by-element operand v0.s[3] / v0[3].
define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_sqrdmlsh_extract_i32:
; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a:       sqrdmlsh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
  %rhs.lane3 = extractelement <4 x i32> %rhs, i32 3
  %mul = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs.lane3)
  %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %mul)
  ret i32 %res
}
460