• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -enable-misched=false | FileCheck %s
2
3define <8 x i8> @sqshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
4;CHECK-LABEL: sqshl8b:
5;CHECK: sqshl.8b
6        %tmp1 = load <8 x i8>, <8 x i8>* %A
7        %tmp2 = load <8 x i8>, <8 x i8>* %B
8        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
9        ret <8 x i8> %tmp3
10}
11
12define <4 x i16> @sqshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
13;CHECK-LABEL: sqshl4h:
14;CHECK: sqshl.4h
15        %tmp1 = load <4 x i16>, <4 x i16>* %A
16        %tmp2 = load <4 x i16>, <4 x i16>* %B
17        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
18        ret <4 x i16> %tmp3
19}
20
21define <2 x i32> @sqshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
22;CHECK-LABEL: sqshl2s:
23;CHECK: sqshl.2s
24        %tmp1 = load <2 x i32>, <2 x i32>* %A
25        %tmp2 = load <2 x i32>, <2 x i32>* %B
26        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
27        ret <2 x i32> %tmp3
28}
29
30define <8 x i8> @uqshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
31;CHECK-LABEL: uqshl8b:
32;CHECK: uqshl.8b
33        %tmp1 = load <8 x i8>, <8 x i8>* %A
34        %tmp2 = load <8 x i8>, <8 x i8>* %B
35        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
36        ret <8 x i8> %tmp3
37}
38
39define <4 x i16> @uqshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
40;CHECK-LABEL: uqshl4h:
41;CHECK: uqshl.4h
42        %tmp1 = load <4 x i16>, <4 x i16>* %A
43        %tmp2 = load <4 x i16>, <4 x i16>* %B
44        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
45        ret <4 x i16> %tmp3
46}
47
48define <2 x i32> @uqshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
49;CHECK-LABEL: uqshl2s:
50;CHECK: uqshl.2s
51        %tmp1 = load <2 x i32>, <2 x i32>* %A
52        %tmp2 = load <2 x i32>, <2 x i32>* %B
53        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
54        ret <2 x i32> %tmp3
55}
56
57define <16 x i8> @sqshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
58;CHECK-LABEL: sqshl16b:
59;CHECK: sqshl.16b
60        %tmp1 = load <16 x i8>, <16 x i8>* %A
61        %tmp2 = load <16 x i8>, <16 x i8>* %B
62        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
63        ret <16 x i8> %tmp3
64}
65
66define <8 x i16> @sqshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
67;CHECK-LABEL: sqshl8h:
68;CHECK: sqshl.8h
69        %tmp1 = load <8 x i16>, <8 x i16>* %A
70        %tmp2 = load <8 x i16>, <8 x i16>* %B
71        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
72        ret <8 x i16> %tmp3
73}
74
75define <4 x i32> @sqshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
76;CHECK-LABEL: sqshl4s:
77;CHECK: sqshl.4s
78        %tmp1 = load <4 x i32>, <4 x i32>* %A
79        %tmp2 = load <4 x i32>, <4 x i32>* %B
80        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
81        ret <4 x i32> %tmp3
82}
83
84define <2 x i64> @sqshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
85;CHECK-LABEL: sqshl2d:
86;CHECK: sqshl.2d
87        %tmp1 = load <2 x i64>, <2 x i64>* %A
88        %tmp2 = load <2 x i64>, <2 x i64>* %B
89        %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
90        ret <2 x i64> %tmp3
91}
92
93define <16 x i8> @uqshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
94;CHECK-LABEL: uqshl16b:
95;CHECK: uqshl.16b
96        %tmp1 = load <16 x i8>, <16 x i8>* %A
97        %tmp2 = load <16 x i8>, <16 x i8>* %B
98        %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
99        ret <16 x i8> %tmp3
100}
101
102define <8 x i16> @uqshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
103;CHECK-LABEL: uqshl8h:
104;CHECK: uqshl.8h
105        %tmp1 = load <8 x i16>, <8 x i16>* %A
106        %tmp2 = load <8 x i16>, <8 x i16>* %B
107        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
108        ret <8 x i16> %tmp3
109}
110
111define <4 x i32> @uqshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
112;CHECK-LABEL: uqshl4s:
113;CHECK: uqshl.4s
114        %tmp1 = load <4 x i32>, <4 x i32>* %A
115        %tmp2 = load <4 x i32>, <4 x i32>* %B
116        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
117        ret <4 x i32> %tmp3
118}
119
120define <2 x i64> @uqshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
121;CHECK-LABEL: uqshl2d:
122;CHECK: uqshl.2d
123        %tmp1 = load <2 x i64>, <2 x i64>* %A
124        %tmp2 = load <2 x i64>, <2 x i64>* %B
125        %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
126        ret <2 x i64> %tmp3
127}
128
129declare <8 x i8>  @llvm.aarch64.neon.sqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
130declare <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
131declare <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
132declare <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
133
134declare <8 x i8>  @llvm.aarch64.neon.uqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
135declare <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
136declare <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
137declare <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
138
139declare <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
140declare <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
141declare <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
142declare <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
143
144declare <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
145declare <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
146declare <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
147declare <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
148
149define <8 x i8> @srshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
150;CHECK-LABEL: srshl8b:
151;CHECK: srshl.8b
152        %tmp1 = load <8 x i8>, <8 x i8>* %A
153        %tmp2 = load <8 x i8>, <8 x i8>* %B
154        %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
155        ret <8 x i8> %tmp3
156}
157
158define <4 x i16> @srshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
159;CHECK-LABEL: srshl4h:
160;CHECK: srshl.4h
161        %tmp1 = load <4 x i16>, <4 x i16>* %A
162        %tmp2 = load <4 x i16>, <4 x i16>* %B
163        %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
164        ret <4 x i16> %tmp3
165}
166
167define <2 x i32> @srshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
168;CHECK-LABEL: srshl2s:
169;CHECK: srshl.2s
170        %tmp1 = load <2 x i32>, <2 x i32>* %A
171        %tmp2 = load <2 x i32>, <2 x i32>* %B
172        %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
173        ret <2 x i32> %tmp3
174}
175
176define <8 x i8> @urshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
177;CHECK-LABEL: urshl8b:
178;CHECK: urshl.8b
179        %tmp1 = load <8 x i8>, <8 x i8>* %A
180        %tmp2 = load <8 x i8>, <8 x i8>* %B
181        %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
182        ret <8 x i8> %tmp3
183}
184
185define <4 x i16> @urshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
186;CHECK-LABEL: urshl4h:
187;CHECK: urshl.4h
188        %tmp1 = load <4 x i16>, <4 x i16>* %A
189        %tmp2 = load <4 x i16>, <4 x i16>* %B
190        %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
191        ret <4 x i16> %tmp3
192}
193
194define <2 x i32> @urshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
195;CHECK-LABEL: urshl2s:
196;CHECK: urshl.2s
197        %tmp1 = load <2 x i32>, <2 x i32>* %A
198        %tmp2 = load <2 x i32>, <2 x i32>* %B
199        %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
200        ret <2 x i32> %tmp3
201}
202
203define <16 x i8> @srshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
204;CHECK-LABEL: srshl16b:
205;CHECK: srshl.16b
206        %tmp1 = load <16 x i8>, <16 x i8>* %A
207        %tmp2 = load <16 x i8>, <16 x i8>* %B
208        %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
209        ret <16 x i8> %tmp3
210}
211
212define <8 x i16> @srshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
213;CHECK-LABEL: srshl8h:
214;CHECK: srshl.8h
215        %tmp1 = load <8 x i16>, <8 x i16>* %A
216        %tmp2 = load <8 x i16>, <8 x i16>* %B
217        %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
218        ret <8 x i16> %tmp3
219}
220
221define <4 x i32> @srshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
222;CHECK-LABEL: srshl4s:
223;CHECK: srshl.4s
224        %tmp1 = load <4 x i32>, <4 x i32>* %A
225        %tmp2 = load <4 x i32>, <4 x i32>* %B
226        %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
227        ret <4 x i32> %tmp3
228}
229
230define <2 x i64> @srshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
231;CHECK-LABEL: srshl2d:
232;CHECK: srshl.2d
233        %tmp1 = load <2 x i64>, <2 x i64>* %A
234        %tmp2 = load <2 x i64>, <2 x i64>* %B
235        %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
236        ret <2 x i64> %tmp3
237}
238
239define <16 x i8> @urshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
240;CHECK-LABEL: urshl16b:
241;CHECK: urshl.16b
242        %tmp1 = load <16 x i8>, <16 x i8>* %A
243        %tmp2 = load <16 x i8>, <16 x i8>* %B
244        %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
245        ret <16 x i8> %tmp3
246}
247
248define <8 x i16> @urshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
249;CHECK-LABEL: urshl8h:
250;CHECK: urshl.8h
251        %tmp1 = load <8 x i16>, <8 x i16>* %A
252        %tmp2 = load <8 x i16>, <8 x i16>* %B
253        %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
254        ret <8 x i16> %tmp3
255}
256
257define <4 x i32> @urshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
258;CHECK-LABEL: urshl4s:
259;CHECK: urshl.4s
260        %tmp1 = load <4 x i32>, <4 x i32>* %A
261        %tmp2 = load <4 x i32>, <4 x i32>* %B
262        %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
263        ret <4 x i32> %tmp3
264}
265
266define <2 x i64> @urshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
267;CHECK-LABEL: urshl2d:
268;CHECK: urshl.2d
269        %tmp1 = load <2 x i64>, <2 x i64>* %A
270        %tmp2 = load <2 x i64>, <2 x i64>* %B
271        %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
272        ret <2 x i64> %tmp3
273}
274
275declare <8 x i8>  @llvm.aarch64.neon.srshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
276declare <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
277declare <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
278declare <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
279
280declare <8 x i8>  @llvm.aarch64.neon.urshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
281declare <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
282declare <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
283declare <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
284
285declare <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
286declare <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
287declare <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
288declare <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
289
290declare <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
291declare <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
292declare <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
293declare <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
294
295define <8 x i8> @sqrshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
296;CHECK-LABEL: sqrshl8b:
297;CHECK: sqrshl.8b
298        %tmp1 = load <8 x i8>, <8 x i8>* %A
299        %tmp2 = load <8 x i8>, <8 x i8>* %B
300        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
301        ret <8 x i8> %tmp3
302}
303
304define <4 x i16> @sqrshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
305;CHECK-LABEL: sqrshl4h:
306;CHECK: sqrshl.4h
307        %tmp1 = load <4 x i16>, <4 x i16>* %A
308        %tmp2 = load <4 x i16>, <4 x i16>* %B
309        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
310        ret <4 x i16> %tmp3
311}
312
313define <2 x i32> @sqrshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
314;CHECK-LABEL: sqrshl2s:
315;CHECK: sqrshl.2s
316        %tmp1 = load <2 x i32>, <2 x i32>* %A
317        %tmp2 = load <2 x i32>, <2 x i32>* %B
318        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
319        ret <2 x i32> %tmp3
320}
321
322define <8 x i8> @uqrshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
323;CHECK-LABEL: uqrshl8b:
324;CHECK: uqrshl.8b
325        %tmp1 = load <8 x i8>, <8 x i8>* %A
326        %tmp2 = load <8 x i8>, <8 x i8>* %B
327        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
328        ret <8 x i8> %tmp3
329}
330
331define <4 x i16> @uqrshl4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
332;CHECK-LABEL: uqrshl4h:
333;CHECK: uqrshl.4h
334        %tmp1 = load <4 x i16>, <4 x i16>* %A
335        %tmp2 = load <4 x i16>, <4 x i16>* %B
336        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
337        ret <4 x i16> %tmp3
338}
339
340define <2 x i32> @uqrshl2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
341;CHECK-LABEL: uqrshl2s:
342;CHECK: uqrshl.2s
343        %tmp1 = load <2 x i32>, <2 x i32>* %A
344        %tmp2 = load <2 x i32>, <2 x i32>* %B
345        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
346        ret <2 x i32> %tmp3
347}
348
349define <16 x i8> @sqrshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
350;CHECK-LABEL: sqrshl16b:
351;CHECK: sqrshl.16b
352        %tmp1 = load <16 x i8>, <16 x i8>* %A
353        %tmp2 = load <16 x i8>, <16 x i8>* %B
354        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
355        ret <16 x i8> %tmp3
356}
357
358define <8 x i16> @sqrshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
359;CHECK-LABEL: sqrshl8h:
360;CHECK: sqrshl.8h
361        %tmp1 = load <8 x i16>, <8 x i16>* %A
362        %tmp2 = load <8 x i16>, <8 x i16>* %B
363        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
364        ret <8 x i16> %tmp3
365}
366
367define <4 x i32> @sqrshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
368;CHECK-LABEL: sqrshl4s:
369;CHECK: sqrshl.4s
370        %tmp1 = load <4 x i32>, <4 x i32>* %A
371        %tmp2 = load <4 x i32>, <4 x i32>* %B
372        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
373        ret <4 x i32> %tmp3
374}
375
376define <2 x i64> @sqrshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
377;CHECK-LABEL: sqrshl2d:
378;CHECK: sqrshl.2d
379        %tmp1 = load <2 x i64>, <2 x i64>* %A
380        %tmp2 = load <2 x i64>, <2 x i64>* %B
381        %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
382        ret <2 x i64> %tmp3
383}
384
385define <16 x i8> @uqrshl16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
386;CHECK-LABEL: uqrshl16b:
387;CHECK: uqrshl.16b
388        %tmp1 = load <16 x i8>, <16 x i8>* %A
389        %tmp2 = load <16 x i8>, <16 x i8>* %B
390        %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
391        ret <16 x i8> %tmp3
392}
393
394define <8 x i16> @uqrshl8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
395;CHECK-LABEL: uqrshl8h:
396;CHECK: uqrshl.8h
397        %tmp1 = load <8 x i16>, <8 x i16>* %A
398        %tmp2 = load <8 x i16>, <8 x i16>* %B
399        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
400        ret <8 x i16> %tmp3
401}
402
403define <4 x i32> @uqrshl4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
404;CHECK-LABEL: uqrshl4s:
405;CHECK: uqrshl.4s
406        %tmp1 = load <4 x i32>, <4 x i32>* %A
407        %tmp2 = load <4 x i32>, <4 x i32>* %B
408        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
409        ret <4 x i32> %tmp3
410}
411
412define <2 x i64> @uqrshl2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
413;CHECK-LABEL: uqrshl2d:
414;CHECK: uqrshl.2d
415        %tmp1 = load <2 x i64>, <2 x i64>* %A
416        %tmp2 = load <2 x i64>, <2 x i64>* %B
417        %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
418        ret <2 x i64> %tmp3
419}
420
421declare <8 x i8>  @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
422declare <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
423declare <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
424declare <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
425
426declare <8 x i8>  @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
427declare <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
428declare <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
429declare <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
430
431declare <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
432declare <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
433declare <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
434declare <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
435
436declare <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
437declare <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
438declare <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
439declare <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
440
441define <8 x i8> @urshr8b(<8 x i8>* %A) nounwind {
442;CHECK-LABEL: urshr8b:
443;CHECK: urshr.8b
444        %tmp1 = load <8 x i8>, <8 x i8>* %A
445        %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
446        ret <8 x i8> %tmp3
447}
448
449define <4 x i16> @urshr4h(<4 x i16>* %A) nounwind {
450;CHECK-LABEL: urshr4h:
451;CHECK: urshr.4h
452        %tmp1 = load <4 x i16>, <4 x i16>* %A
453        %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
454        ret <4 x i16> %tmp3
455}
456
457define <2 x i32> @urshr2s(<2 x i32>* %A) nounwind {
458;CHECK-LABEL: urshr2s:
459;CHECK: urshr.2s
460        %tmp1 = load <2 x i32>, <2 x i32>* %A
461        %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
462        ret <2 x i32> %tmp3
463}
464
465define <16 x i8> @urshr16b(<16 x i8>* %A) nounwind {
466;CHECK-LABEL: urshr16b:
467;CHECK: urshr.16b
468        %tmp1 = load <16 x i8>, <16 x i8>* %A
469        %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
470        ret <16 x i8> %tmp3
471}
472
473define <8 x i16> @urshr8h(<8 x i16>* %A) nounwind {
474;CHECK-LABEL: urshr8h:
475;CHECK: urshr.8h
476        %tmp1 = load <8 x i16>, <8 x i16>* %A
477        %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
478        ret <8 x i16> %tmp3
479}
480
481define <4 x i32> @urshr4s(<4 x i32>* %A) nounwind {
482;CHECK-LABEL: urshr4s:
483;CHECK: urshr.4s
484        %tmp1 = load <4 x i32>, <4 x i32>* %A
485        %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
486        ret <4 x i32> %tmp3
487}
488
489define <2 x i64> @urshr2d(<2 x i64>* %A) nounwind {
490;CHECK-LABEL: urshr2d:
491;CHECK: urshr.2d
492        %tmp1 = load <2 x i64>, <2 x i64>* %A
493        %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
494        ret <2 x i64> %tmp3
495}
496
497define <8 x i8> @srshr8b(<8 x i8>* %A) nounwind {
498;CHECK-LABEL: srshr8b:
499;CHECK: srshr.8b
500        %tmp1 = load <8 x i8>, <8 x i8>* %A
501        %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
502        ret <8 x i8> %tmp3
503}
504
505define <4 x i16> @srshr4h(<4 x i16>* %A) nounwind {
506;CHECK-LABEL: srshr4h:
507;CHECK: srshr.4h
508        %tmp1 = load <4 x i16>, <4 x i16>* %A
509        %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
510        ret <4 x i16> %tmp3
511}
512
513define <2 x i32> @srshr2s(<2 x i32>* %A) nounwind {
514;CHECK-LABEL: srshr2s:
515;CHECK: srshr.2s
516        %tmp1 = load <2 x i32>, <2 x i32>* %A
517        %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
518        ret <2 x i32> %tmp3
519}
520
521define <16 x i8> @srshr16b(<16 x i8>* %A) nounwind {
522;CHECK-LABEL: srshr16b:
523;CHECK: srshr.16b
524        %tmp1 = load <16 x i8>, <16 x i8>* %A
525        %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
526        ret <16 x i8> %tmp3
527}
528
529define <8 x i16> @srshr8h(<8 x i16>* %A) nounwind {
530;CHECK-LABEL: srshr8h:
531;CHECK: srshr.8h
532        %tmp1 = load <8 x i16>, <8 x i16>* %A
533        %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
534        ret <8 x i16> %tmp3
535}
536
537define <4 x i32> @srshr4s(<4 x i32>* %A) nounwind {
538;CHECK-LABEL: srshr4s:
539;CHECK: srshr.4s
540        %tmp1 = load <4 x i32>, <4 x i32>* %A
541        %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
542        ret <4 x i32> %tmp3
543}
544
545define <2 x i64> @srshr2d(<2 x i64>* %A) nounwind {
546;CHECK-LABEL: srshr2d:
547;CHECK: srshr.2d
548        %tmp1 = load <2 x i64>, <2 x i64>* %A
549        %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
550        ret <2 x i64> %tmp3
551}
552
553define <8 x i8> @sqshlu8b(<8 x i8>* %A) nounwind {
554;CHECK-LABEL: sqshlu8b:
555;CHECK: sqshlu.8b v0, {{v[0-9]+}}, #1
556        %tmp1 = load <8 x i8>, <8 x i8>* %A
557        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
558        ret <8 x i8> %tmp3
559}
560
561define <4 x i16> @sqshlu4h(<4 x i16>* %A) nounwind {
562;CHECK-LABEL: sqshlu4h:
563;CHECK: sqshlu.4h v0, {{v[0-9]+}}, #1
564        %tmp1 = load <4 x i16>, <4 x i16>* %A
565        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
566        ret <4 x i16> %tmp3
567}
568
569define <2 x i32> @sqshlu2s(<2 x i32>* %A) nounwind {
570;CHECK-LABEL: sqshlu2s:
571;CHECK: sqshlu.2s v0, {{v[0-9]+}}, #1
572        %tmp1 = load <2 x i32>, <2 x i32>* %A
573        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
574        ret <2 x i32> %tmp3
575}
576
577define <16 x i8> @sqshlu16b(<16 x i8>* %A) nounwind {
578;CHECK-LABEL: sqshlu16b:
579;CHECK: sqshlu.16b v0, {{v[0-9]+}}, #1
580        %tmp1 = load <16 x i8>, <16 x i8>* %A
581        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
582        ret <16 x i8> %tmp3
583}
584
585define <8 x i16> @sqshlu8h(<8 x i16>* %A) nounwind {
586;CHECK-LABEL: sqshlu8h:
587;CHECK: sqshlu.8h v0, {{v[0-9]+}}, #1
588        %tmp1 = load <8 x i16>, <8 x i16>* %A
589        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
590        ret <8 x i16> %tmp3
591}
592
593define <4 x i32> @sqshlu4s(<4 x i32>* %A) nounwind {
594;CHECK-LABEL: sqshlu4s:
595;CHECK: sqshlu.4s v0, {{v[0-9]+}}, #1
596        %tmp1 = load <4 x i32>, <4 x i32>* %A
597        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
598        ret <4 x i32> %tmp3
599}
600
601define <2 x i64> @sqshlu2d(<2 x i64>* %A) nounwind {
602;CHECK-LABEL: sqshlu2d:
603;CHECK: sqshlu.2d v0, {{v[0-9]+}}, #1
604        %tmp1 = load <2 x i64>, <2 x i64>* %A
605        %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
606        ret <2 x i64> %tmp3
607}
608
609declare <8 x i8>  @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
610declare <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
611declare <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
612declare <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
613
614declare <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
615declare <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
616declare <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
617declare <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
618
619define <8 x i8> @rshrn8b(<8 x i16>* %A) nounwind {
620;CHECK-LABEL: rshrn8b:
621;CHECK: rshrn.8b v0, {{v[0-9]+}}, #1
622        %tmp1 = load <8 x i16>, <8 x i16>* %A
623        %tmp3 = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
624        ret <8 x i8> %tmp3
625}
626
627define <4 x i16> @rshrn4h(<4 x i32>* %A) nounwind {
628;CHECK-LABEL: rshrn4h:
629;CHECK: rshrn.4h v0, {{v[0-9]+}}, #1
630        %tmp1 = load <4 x i32>, <4 x i32>* %A
631        %tmp3 = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
632        ret <4 x i16> %tmp3
633}
634
635define <2 x i32> @rshrn2s(<2 x i64>* %A) nounwind {
636;CHECK-LABEL: rshrn2s:
637;CHECK: rshrn.2s v0, {{v[0-9]+}}, #1
638        %tmp1 = load <2 x i64>, <2 x i64>* %A
639        %tmp3 = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
640        ret <2 x i32> %tmp3
641}
642
643define <16 x i8> @rshrn16b(<8 x i8> *%ret, <8 x i16>* %A) nounwind {
644;CHECK-LABEL: rshrn16b:
645;CHECK: rshrn2.16b v0, {{v[0-9]+}}, #1
646        %out = load <8 x i8>, <8 x i8>* %ret
647        %tmp1 = load <8 x i16>, <8 x i16>* %A
648        %tmp3 = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
649        %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
650        ret <16 x i8> %tmp4
651}
652
653define <8 x i16> @rshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
654;CHECK-LABEL: rshrn8h:
655;CHECK: rshrn2.8h v0, {{v[0-9]+}}, #1
656        %out = load <4 x i16>, <4 x i16>* %ret
657        %tmp1 = load <4 x i32>, <4 x i32>* %A
658        %tmp3 = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
659        %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
660        ret <8 x i16> %tmp4
661}
662
663define <4 x i32> @rshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
664;CHECK-LABEL: rshrn4s:
665;CHECK: rshrn2.4s v0, {{v[0-9]+}}, #1
666        %out = load <2 x i32>, <2 x i32>* %ret
667        %tmp1 = load <2 x i64>, <2 x i64>* %A
668        %tmp3 = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
669        %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
670        ret <4 x i32> %tmp4
671}
672
673declare <8 x i8>  @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
674declare <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32>, i32) nounwind readnone
675declare <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64>, i32) nounwind readnone
676
677define <8 x i8> @shrn8b(<8 x i16>* %A) nounwind {
678;CHECK-LABEL: shrn8b:
679;CHECK: shrn.8b v0, {{v[0-9]+}}, #1
680        %tmp1 = load <8 x i16>, <8 x i16>* %A
681        %tmp2 = lshr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
682        %tmp3 = trunc <8 x i16> %tmp2 to <8 x i8>
683        ret <8 x i8> %tmp3
684}
685
686define <4 x i16> @shrn4h(<4 x i32>* %A) nounwind {
687;CHECK-LABEL: shrn4h:
688;CHECK: shrn.4h v0, {{v[0-9]+}}, #1
689        %tmp1 = load <4 x i32>, <4 x i32>* %A
690        %tmp2 = lshr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
691        %tmp3 = trunc <4 x i32> %tmp2 to <4 x i16>
692        ret <4 x i16> %tmp3
693}
694
695define <2 x i32> @shrn2s(<2 x i64>* %A) nounwind {
696;CHECK-LABEL: shrn2s:
697;CHECK: shrn.2s v0, {{v[0-9]+}}, #1
698        %tmp1 = load <2 x i64>, <2 x i64>* %A
699        %tmp2 = lshr <2 x i64> %tmp1, <i64 1, i64 1>
700        %tmp3 = trunc <2 x i64> %tmp2 to <2 x i32>
701        ret <2 x i32> %tmp3
702}
703
704define <16 x i8> @shrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
705;CHECK-LABEL: shrn16b:
706;CHECK: shrn2.16b v0, {{v[0-9]+}}, #1
707        %out = load <8 x i8>, <8 x i8>* %ret
708        %tmp1 = load <8 x i16>, <8 x i16>* %A
709        %tmp2 = lshr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
710        %tmp3 = trunc <8 x i16> %tmp2 to <8 x i8>
711        %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
712        ret <16 x i8> %tmp4
713}
714
715define <8 x i16> @shrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
716;CHECK-LABEL: shrn8h:
717;CHECK: shrn2.8h v0, {{v[0-9]+}}, #1
718        %out = load <4 x i16>, <4 x i16>* %ret
719        %tmp1 = load <4 x i32>, <4 x i32>* %A
720        %tmp2 = lshr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
721        %tmp3 = trunc <4 x i32> %tmp2 to <4 x i16>
722        %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
723        ret <8 x i16> %tmp4
724}
725
726define <4 x i32> @shrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
727;CHECK-LABEL: shrn4s:
728;CHECK: shrn2.4s v0, {{v[0-9]+}}, #1
729        %out = load <2 x i32>, <2 x i32>* %ret
730        %tmp1 = load <2 x i64>, <2 x i64>* %A
731        %tmp2 = lshr <2 x i64> %tmp1, <i64 1, i64 1>
732        %tmp3 = trunc <2 x i64> %tmp2 to <2 x i32>
733        %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
734        ret <4 x i32> %tmp4
735}
736
737declare <8 x i8>  @llvm.aarch64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone
738declare <4 x i16> @llvm.aarch64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone
739declare <2 x i32> @llvm.aarch64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone
740
741define i32 @sqshrn1s(i64 %A) nounwind {
742; CHECK-LABEL: sqshrn1s:
743; CHECK: sqshrn {{s[0-9]+}}, d0, #1
744  %tmp = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 %A, i32 1)
745  ret i32 %tmp
746}
747
748define <8 x i8> @sqshrn8b(<8 x i16>* %A) nounwind {
749;CHECK-LABEL: sqshrn8b:
750;CHECK: sqshrn.8b v0, {{v[0-9]+}}, #1
751        %tmp1 = load <8 x i16>, <8 x i16>* %A
752        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
753        ret <8 x i8> %tmp3
754}
755
756define <4 x i16> @sqshrn4h(<4 x i32>* %A) nounwind {
757;CHECK-LABEL: sqshrn4h:
758;CHECK: sqshrn.4h v0, {{v[0-9]+}}, #1
759        %tmp1 = load <4 x i32>, <4 x i32>* %A
760        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
761        ret <4 x i16> %tmp3
762}
763
764define <2 x i32> @sqshrn2s(<2 x i64>* %A) nounwind {
765;CHECK-LABEL: sqshrn2s:
766;CHECK: sqshrn.2s v0, {{v[0-9]+}}, #1
767        %tmp1 = load <2 x i64>, <2 x i64>* %A
768        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
769        ret <2 x i32> %tmp3
770}
771
772
773define <16 x i8> @sqshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
774;CHECK-LABEL: sqshrn16b:
775;CHECK: sqshrn2.16b v0, {{v[0-9]+}}, #1
776        %out = load <8 x i8>, <8 x i8>* %ret
777        %tmp1 = load <8 x i16>, <8 x i16>* %A
778        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
779        %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
780        ret <16 x i8> %tmp4
781}
782
783define <8 x i16> @sqshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
784;CHECK-LABEL: sqshrn8h:
785;CHECK: sqshrn2.8h v0, {{v[0-9]+}}, #1
786        %out = load <4 x i16>, <4 x i16>* %ret
787        %tmp1 = load <4 x i32>, <4 x i32>* %A
788        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
789        %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
790        ret <8 x i16> %tmp4
791}
792
793define <4 x i32> @sqshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
794;CHECK-LABEL: sqshrn4s:
795;CHECK: sqshrn2.4s v0, {{v[0-9]+}}, #1
796        %out = load <2 x i32>, <2 x i32>* %ret
797        %tmp1 = load <2 x i64>, <2 x i64>* %A
798        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
799        %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
800        ret <4 x i32> %tmp4
801}
802
803declare i32  @llvm.aarch64.neon.sqshrn.i32(i64, i32) nounwind readnone
804declare <8 x i8>  @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16>, i32) nounwind readnone
805declare <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32>, i32) nounwind readnone
806declare <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64>, i32) nounwind readnone
807
808define i32 @sqshrun1s(i64 %A) nounwind {
809; CHECK-LABEL: sqshrun1s:
810; CHECK: sqshrun {{s[0-9]+}}, d0, #1
811  %tmp = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 %A, i32 1)
812  ret i32 %tmp
813}
814
815define <8 x i8> @sqshrun8b(<8 x i16>* %A) nounwind {
816;CHECK-LABEL: sqshrun8b:
817;CHECK: sqshrun.8b v0, {{v[0-9]+}}, #1
818        %tmp1 = load <8 x i16>, <8 x i16>* %A
819        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
820        ret <8 x i8> %tmp3
821}
822
823define <4 x i16> @sqshrun4h(<4 x i32>* %A) nounwind {
824;CHECK-LABEL: sqshrun4h:
825;CHECK: sqshrun.4h v0, {{v[0-9]+}}, #1
826        %tmp1 = load <4 x i32>, <4 x i32>* %A
827        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
828        ret <4 x i16> %tmp3
829}
830
831define <2 x i32> @sqshrun2s(<2 x i64>* %A) nounwind {
832;CHECK-LABEL: sqshrun2s:
833;CHECK: sqshrun.2s v0, {{v[0-9]+}}, #1
834        %tmp1 = load <2 x i64>, <2 x i64>* %A
835        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
836        ret <2 x i32> %tmp3
837}
838
839define <16 x i8> @sqshrun16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
840;CHECK-LABEL: sqshrun16b:
841;CHECK: sqshrun2.16b v0, {{v[0-9]+}}, #1
842        %out = load <8 x i8>, <8 x i8>* %ret
843        %tmp1 = load <8 x i16>, <8 x i16>* %A
844        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
845        %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
846        ret <16 x i8> %tmp4
847}
848
849define <8 x i16> @sqshrun8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
850;CHECK-LABEL: sqshrun8h:
851;CHECK: sqshrun2.8h v0, {{v[0-9]+}}, #1
852        %out = load <4 x i16>, <4 x i16>* %ret
853        %tmp1 = load <4 x i32>, <4 x i32>* %A
854        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
855        %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
856        ret <8 x i16> %tmp4
857}
858
859define <4 x i32> @sqshrun4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
860;CHECK-LABEL: sqshrun4s:
861;CHECK: sqshrun2.4s v0, {{v[0-9]+}}, #1
862        %out = load <2 x i32>, <2 x i32>* %ret
863        %tmp1 = load <2 x i64>, <2 x i64>* %A
864        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
865        %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
866        ret <4 x i32> %tmp4
867}
868
869declare i32  @llvm.aarch64.neon.sqshrun.i32(i64, i32) nounwind readnone
870declare <8 x i8>  @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16>, i32) nounwind readnone
871declare <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32>, i32) nounwind readnone
872declare <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64>, i32) nounwind readnone
873
874define i32 @sqrshrn1s(i64 %A) nounwind {
875; CHECK-LABEL: sqrshrn1s:
876; CHECK: sqrshrn {{s[0-9]+}}, d0, #1
877  %tmp = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 %A, i32 1)
878  ret i32 %tmp
879}
880
881define <8 x i8> @sqrshrn8b(<8 x i16>* %A) nounwind {
882;CHECK-LABEL: sqrshrn8b:
883;CHECK: sqrshrn.8b v0, {{v[0-9]+}}, #1
884        %tmp1 = load <8 x i16>, <8 x i16>* %A
885        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
886        ret <8 x i8> %tmp3
887}
888
889define <4 x i16> @sqrshrn4h(<4 x i32>* %A) nounwind {
890;CHECK-LABEL: sqrshrn4h:
891;CHECK: sqrshrn.4h v0, {{v[0-9]+}}, #1
892        %tmp1 = load <4 x i32>, <4 x i32>* %A
893        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
894        ret <4 x i16> %tmp3
895}
896
897define <2 x i32> @sqrshrn2s(<2 x i64>* %A) nounwind {
898;CHECK-LABEL: sqrshrn2s:
899;CHECK: sqrshrn.2s v0, {{v[0-9]+}}, #1
900        %tmp1 = load <2 x i64>, <2 x i64>* %A
901        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
902        ret <2 x i32> %tmp3
903}
904
905define <16 x i8> @sqrshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
906;CHECK-LABEL: sqrshrn16b:
907;CHECK: sqrshrn2.16b v0, {{v[0-9]+}}, #1
908        %out = load <8 x i8>, <8 x i8>* %ret
909        %tmp1 = load <8 x i16>, <8 x i16>* %A
910        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
911        %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
912        ret <16 x i8> %tmp4
913}
914
915define <8 x i16> @sqrshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
916;CHECK-LABEL: sqrshrn8h:
917;CHECK: sqrshrn2.8h v0, {{v[0-9]+}}, #1
918        %out = load <4 x i16>, <4 x i16>* %ret
919        %tmp1 = load <4 x i32>, <4 x i32>* %A
920        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
921        %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
922        ret <8 x i16> %tmp4
923}
924
925define <4 x i32> @sqrshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
926;CHECK-LABEL: sqrshrn4s:
927;CHECK: sqrshrn2.4s v0, {{v[0-9]+}}, #1
928        %out = load <2 x i32>, <2 x i32>* %ret
929        %tmp1 = load <2 x i64>, <2 x i64>* %A
930        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
931        %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
932        ret <4 x i32> %tmp4
933}
934
935declare i32  @llvm.aarch64.neon.sqrshrn.i32(i64, i32) nounwind readnone
936declare <8 x i8>  @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16>, i32) nounwind readnone
937declare <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32>, i32) nounwind readnone
938declare <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64>, i32) nounwind readnone
939
940define i32 @sqrshrun1s(i64 %A) nounwind {
941; CHECK-LABEL: sqrshrun1s:
942; CHECK: sqrshrun {{s[0-9]+}}, d0, #1
943  %tmp = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 %A, i32 1)
944  ret i32 %tmp
945}
946
947define <8 x i8> @sqrshrun8b(<8 x i16>* %A) nounwind {
948;CHECK-LABEL: sqrshrun8b:
949;CHECK: sqrshrun.8b v0, {{v[0-9]+}}, #1
950        %tmp1 = load <8 x i16>, <8 x i16>* %A
951        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
952        ret <8 x i8> %tmp3
953}
954
955define <4 x i16> @sqrshrun4h(<4 x i32>* %A) nounwind {
956;CHECK-LABEL: sqrshrun4h:
957;CHECK: sqrshrun.4h v0, {{v[0-9]+}}, #1
958        %tmp1 = load <4 x i32>, <4 x i32>* %A
959        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
960        ret <4 x i16> %tmp3
961}
962
963define <2 x i32> @sqrshrun2s(<2 x i64>* %A) nounwind {
964;CHECK-LABEL: sqrshrun2s:
965;CHECK: sqrshrun.2s v0, {{v[0-9]+}}, #1
966        %tmp1 = load <2 x i64>, <2 x i64>* %A
967        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
968        ret <2 x i32> %tmp3
969}
970
971define <16 x i8> @sqrshrun16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
972;CHECK-LABEL: sqrshrun16b:
973;CHECK: sqrshrun2.16b v0, {{v[0-9]+}}, #1
974        %out = load <8 x i8>, <8 x i8>* %ret
975        %tmp1 = load <8 x i16>, <8 x i16>* %A
976        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
977        %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
978        ret <16 x i8> %tmp4
979}
980
981define <8 x i16> @sqrshrun8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
982;CHECK-LABEL: sqrshrun8h:
983;CHECK: sqrshrun2.8h v0, {{v[0-9]+}}, #1
984        %out = load <4 x i16>, <4 x i16>* %ret
985        %tmp1 = load <4 x i32>, <4 x i32>* %A
986        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
987        %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
988        ret <8 x i16> %tmp4
989}
990
991define <4 x i32> @sqrshrun4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
992;CHECK-LABEL: sqrshrun4s:
993;CHECK: sqrshrun2.4s v0, {{v[0-9]+}}, #1
994        %out = load <2 x i32>, <2 x i32>* %ret
995        %tmp1 = load <2 x i64>, <2 x i64>* %A
996        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
997        %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
998        ret <4 x i32> %tmp4
999}
1000
1001declare i32  @llvm.aarch64.neon.sqrshrun.i32(i64, i32) nounwind readnone
1002declare <8 x i8>  @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16>, i32) nounwind readnone
1003declare <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32>, i32) nounwind readnone
1004declare <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64>, i32) nounwind readnone
1005
1006define i32 @uqrshrn1s(i64 %A) nounwind {
1007; CHECK-LABEL: uqrshrn1s:
1008; CHECK: uqrshrn {{s[0-9]+}}, d0, #1
1009  %tmp = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 %A, i32 1)
1010  ret i32 %tmp
1011}
1012
1013define <8 x i8> @uqrshrn8b(<8 x i16>* %A) nounwind {
1014;CHECK-LABEL: uqrshrn8b:
1015;CHECK: uqrshrn.8b v0, {{v[0-9]+}}, #1
1016        %tmp1 = load <8 x i16>, <8 x i16>* %A
1017        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
1018        ret <8 x i8> %tmp3
1019}
1020
1021define <4 x i16> @uqrshrn4h(<4 x i32>* %A) nounwind {
1022;CHECK-LABEL: uqrshrn4h:
1023;CHECK: uqrshrn.4h v0, {{v[0-9]+}}, #1
1024        %tmp1 = load <4 x i32>, <4 x i32>* %A
1025        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
1026        ret <4 x i16> %tmp3
1027}
1028
1029define <2 x i32> @uqrshrn2s(<2 x i64>* %A) nounwind {
1030;CHECK-LABEL: uqrshrn2s:
1031;CHECK: uqrshrn.2s v0, {{v[0-9]+}}, #1
1032        %tmp1 = load <2 x i64>, <2 x i64>* %A
1033        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
1034        ret <2 x i32> %tmp3
1035}
1036
1037define <16 x i8> @uqrshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
1038;CHECK-LABEL: uqrshrn16b:
1039;CHECK: uqrshrn2.16b v0, {{v[0-9]+}}, #1
1040        %out = load <8 x i8>, <8 x i8>* %ret
1041        %tmp1 = load <8 x i16>, <8 x i16>* %A
1042        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
1043        %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1044        ret <16 x i8> %tmp4
1045}
1046
1047define <8 x i16> @uqrshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
1048;CHECK-LABEL: uqrshrn8h:
1049;CHECK: uqrshrn2.8h v0, {{v[0-9]+}}, #1
1050        %out = load <4 x i16>, <4 x i16>* %ret
1051        %tmp1 = load <4 x i32>, <4 x i32>* %A
1052        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
1053        %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1054        ret <8 x i16> %tmp4
1055}
1056
1057define <4 x i32> @uqrshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
1058;CHECK-LABEL: uqrshrn4s:
1059;CHECK: uqrshrn2.4s v0, {{v[0-9]+}}, #1
1060        %out = load <2 x i32>, <2 x i32>* %ret
1061        %tmp1 = load <2 x i64>, <2 x i64>* %A
1062        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
1063        %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1064        ret <4 x i32> %tmp4
1065}
1066
1067declare i32  @llvm.aarch64.neon.uqrshrn.i32(i64, i32) nounwind readnone
1068declare <8 x i8>  @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16>, i32) nounwind readnone
1069declare <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32>, i32) nounwind readnone
1070declare <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64>, i32) nounwind readnone
1071
1072define i32 @uqshrn1s(i64 %A) nounwind {
1073; CHECK-LABEL: uqshrn1s:
1074; CHECK: uqshrn {{s[0-9]+}}, d0, #1
1075  %tmp = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 %A, i32 1)
1076  ret i32 %tmp
1077}
1078
1079define <8 x i8> @uqshrn8b(<8 x i16>* %A) nounwind {
1080;CHECK-LABEL: uqshrn8b:
1081;CHECK: uqshrn.8b v0, {{v[0-9]+}}, #1
1082        %tmp1 = load <8 x i16>, <8 x i16>* %A
1083        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
1084        ret <8 x i8> %tmp3
1085}
1086
1087define <4 x i16> @uqshrn4h(<4 x i32>* %A) nounwind {
1088;CHECK-LABEL: uqshrn4h:
1089;CHECK: uqshrn.4h v0, {{v[0-9]+}}, #1
1090        %tmp1 = load <4 x i32>, <4 x i32>* %A
1091        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
1092        ret <4 x i16> %tmp3
1093}
1094
1095define <2 x i32> @uqshrn2s(<2 x i64>* %A) nounwind {
1096;CHECK-LABEL: uqshrn2s:
1097;CHECK: uqshrn.2s v0, {{v[0-9]+}}, #1
1098        %tmp1 = load <2 x i64>, <2 x i64>* %A
1099        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
1100        ret <2 x i32> %tmp3
1101}
1102
1103define <16 x i8> @uqshrn16b(<8 x i8>* %ret, <8 x i16>* %A) nounwind {
1104;CHECK-LABEL: uqshrn16b:
1105;CHECK: uqshrn2.16b v0, {{v[0-9]+}}, #1
1106        %out = load <8 x i8>, <8 x i8>* %ret
1107        %tmp1 = load <8 x i16>, <8 x i16>* %A
1108        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
1109        %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1110        ret <16 x i8> %tmp4
1111}
1112
1113define <8 x i16> @uqshrn8h(<4 x i16>* %ret, <4 x i32>* %A) nounwind {
1114;CHECK-LABEL: uqshrn8h:
1115;CHECK: uqshrn2.8h v0, {{v[0-9]+}}, #1
1116  %out = load <4 x i16>, <4 x i16>* %ret
1117  %tmp1 = load <4 x i32>, <4 x i32>* %A
1118  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
1119  %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1120  ret <8 x i16> %tmp4
1121}
1122
1123define <4 x i32> @uqshrn4s(<2 x i32>* %ret, <2 x i64>* %A) nounwind {
1124;CHECK-LABEL: uqshrn4s:
1125;CHECK: uqshrn2.4s v0, {{v[0-9]+}}, #1
1126  %out = load <2 x i32>, <2 x i32>* %ret
1127  %tmp1 = load <2 x i64>, <2 x i64>* %A
1128  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
1129  %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1130  ret <4 x i32> %tmp4
1131}
1132
1133declare i32  @llvm.aarch64.neon.uqshrn.i32(i64, i32) nounwind readnone
1134declare <8 x i8>  @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16>, i32) nounwind readnone
1135declare <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32>, i32) nounwind readnone
1136declare <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64>, i32) nounwind readnone
1137
1138define <8 x i16> @ushll8h(<8 x i8>* %A) nounwind {
1139;CHECK-LABEL: ushll8h:
1140;CHECK: ushll.8h v0, {{v[0-9]+}}, #1
1141        %tmp1 = load <8 x i8>, <8 x i8>* %A
1142        %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
1143        %tmp3 = shl <8 x i16> %tmp2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1144        ret <8 x i16> %tmp3
1145}
1146
1147define <4 x i32> @ushll4s(<4 x i16>* %A) nounwind {
1148;CHECK-LABEL: ushll4s:
1149;CHECK: ushll.4s v0, {{v[0-9]+}}, #1
1150        %tmp1 = load <4 x i16>, <4 x i16>* %A
1151        %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
1152        %tmp3 = shl <4 x i32> %tmp2, <i32 1, i32 1, i32 1, i32 1>
1153        ret <4 x i32> %tmp3
1154}
1155
1156define <2 x i64> @ushll2d(<2 x i32>* %A) nounwind {
1157;CHECK-LABEL: ushll2d:
1158;CHECK: ushll.2d v0, {{v[0-9]+}}, #1
1159        %tmp1 = load <2 x i32>, <2 x i32>* %A
1160        %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
1161        %tmp3 = shl <2 x i64> %tmp2, <i64 1, i64 1>
1162        ret <2 x i64> %tmp3
1163}
1164
1165define <8 x i16> @ushll2_8h(<16 x i8>* %A) nounwind {
1166;CHECK-LABEL: ushll2_8h:
1167;CHECK: ushll2.8h v0, {{v[0-9]+}}, #1
1168        %load1 = load <16 x i8>, <16 x i8>* %A
1169        %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1170        %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
1171        %tmp3 = shl <8 x i16> %tmp2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1172        ret <8 x i16> %tmp3
1173}
1174
1175define <4 x i32> @ushll2_4s(<8 x i16>* %A) nounwind {
1176;CHECK-LABEL: ushll2_4s:
1177;CHECK: ushll2.4s v0, {{v[0-9]+}}, #1
1178        %load1 = load <8 x i16>, <8 x i16>* %A
1179        %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
1180        %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
1181        %tmp3 = shl <4 x i32> %tmp2, <i32 1, i32 1, i32 1, i32 1>
1182        ret <4 x i32> %tmp3
1183}
1184
1185define <2 x i64> @ushll2_2d(<4 x i32>* %A) nounwind {
1186;CHECK-LABEL: ushll2_2d:
1187;CHECK: ushll2.2d v0, {{v[0-9]+}}, #1
1188        %load1 = load <4 x i32>, <4 x i32>* %A
1189        %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
1190        %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
1191        %tmp3 = shl <2 x i64> %tmp2, <i64 1, i64 1>
1192        ret <2 x i64> %tmp3
1193}
1194
1195define <8 x i16> @sshll8h(<8 x i8>* %A) nounwind {
1196;CHECK-LABEL: sshll8h:
1197;CHECK: sshll.8h v0, {{v[0-9]+}}, #1
1198        %tmp1 = load <8 x i8>, <8 x i8>* %A
1199        %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
1200        %tmp3 = shl <8 x i16> %tmp2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
1201        ret <8 x i16> %tmp3
1202}
1203
1204define <4 x i32> @sshll4s(<4 x i16>* %A) nounwind {
1205;CHECK-LABEL: sshll4s:
1206;CHECK: sshll.4s v0, {{v[0-9]+}}, #1
1207        %tmp1 = load <4 x i16>, <4 x i16>* %A
1208        %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
1209        %tmp3 = shl <4 x i32> %tmp2, <i32 1, i32 1, i32 1, i32 1>
1210        ret <4 x i32> %tmp3
1211}
1212
1213define <2 x i64> @sshll2d(<2 x i32>* %A) nounwind {
1214;CHECK-LABEL: sshll2d:
1215;CHECK: sshll.2d v0, {{v[0-9]+}}, #1
1216        %tmp1 = load <2 x i32>, <2 x i32>* %A
1217        %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
1218        %tmp3 = shl <2 x i64> %tmp2, <i64 1, i64 1>
1219        ret <2 x i64> %tmp3
1220}
1221
define <8 x i16> @sshll2_8h(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sshll2_8h:
;CHECK: sshll2.8h v0, {{v[0-9]+}}, #1
        %load1 = load <16 x i8>, <16 x i8>* %A
        %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
        %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
        %tmp3 = shl <8 x i16> %tmp2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
        ret <8 x i16> %tmp3
}

define <4 x i32> @sshll2_4s(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sshll2_4s:
;CHECK: sshll2.4s v0, {{v[0-9]+}}, #1
        %load1 = load <8 x i16>, <8 x i16>* %A
        %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
        %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
        %tmp3 = shl <4 x i32> %tmp2, <i32 1, i32 1, i32 1, i32 1>
        ret <4 x i32> %tmp3
}

define <2 x i64> @sshll2_2d(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sshll2_2d:
;CHECK: sshll2.2d v0, {{v[0-9]+}}, #1
        %load1 = load <4 x i32>, <4 x i32>* %A
        %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
        %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
        %tmp3 = shl <2 x i64> %tmp2, <i64 1, i64 1>
        ret <2 x i64> %tmp3
}

define <8 x i8> @sqshli8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: sqshli8b:
;CHECK: sqshl.8b v0, {{v[0-9]+}}, #1
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
        ret <8 x i8> %tmp3
}

define <4 x i16> @sqshli4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: sqshli4h:
;CHECK: sqshl.4h v0, {{v[0-9]+}}, #1
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
        ret <4 x i16> %tmp3
}

define <2 x i32> @sqshli2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: sqshli2s:
;CHECK: sqshl.2s v0, {{v[0-9]+}}, #1
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
        ret <2 x i32> %tmp3
}

define <16 x i8> @sqshli16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: sqshli16b:
;CHECK: sqshl.16b v0, {{v[0-9]+}}, #1
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
        ret <16 x i8> %tmp3
}

define <8 x i16> @sqshli8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: sqshli8h:
;CHECK: sqshl.8h v0, {{v[0-9]+}}, #1
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
        ret <8 x i16> %tmp3
}

define <4 x i32> @sqshli4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: sqshli4s:
;CHECK: sqshl.4s v0, {{v[0-9]+}}, #1
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
        ret <4 x i32> %tmp3
}

define <2 x i64> @sqshli2d(<2 x i64>* %A) nounwind {
;CHECK-LABEL: sqshli2d:
;CHECK: sqshl.2d v0, {{v[0-9]+}}, #1
        %tmp1 = load <2 x i64>, <2 x i64>* %A
        %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
        ret <2 x i64> %tmp3
}

define <8 x i8> @uqshli8b(<8 x i8>* %A) nounwind {
;CHECK-LABEL: uqshli8b:
;CHECK: uqshl.8b v0, {{v[0-9]+}}, #1
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
        ret <8 x i8> %tmp3
}

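; A shift amount of 8 is out of range for the immediate form of uqshl on 8-bit
; lanes, so the constant is expected to be materialized with movi and the
; register form of uqshl used instead.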
define <8 x i8> @uqshli8b_1(<8 x i8>* %A) nounwind {
;CHECK-LABEL: uqshli8b_1:
;CHECK: movi.8b [[REG:v[0-9]+]], #8
;CHECK: uqshl.8b v0, v0, [[REG]]
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>)
        ret <8 x i8> %tmp3
}

define <4 x i16> @uqshli4h(<4 x i16>* %A) nounwind {
;CHECK-LABEL: uqshli4h:
;CHECK: uqshl.4h v0, {{v[0-9]+}}, #1
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
        ret <4 x i16> %tmp3
}

define <2 x i32> @uqshli2s(<2 x i32>* %A) nounwind {
;CHECK-LABEL: uqshli2s:
;CHECK: uqshl.2s v0, {{v[0-9]+}}, #1
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
        ret <2 x i32> %tmp3
}

define <16 x i8> @uqshli16b(<16 x i8>* %A) nounwind {
;CHECK-LABEL: uqshli16b:
;CHECK: uqshl.16b
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
        ret <16 x i8> %tmp3
}

define <8 x i16> @uqshli8h(<8 x i16>* %A) nounwind {
;CHECK-LABEL: uqshli8h:
;CHECK: uqshl.8h v0, {{v[0-9]+}}, #1
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
        ret <8 x i16> %tmp3
}

define <4 x i32> @uqshli4s(<4 x i32>* %A) nounwind {
;CHECK-LABEL: uqshli4s:
;CHECK: uqshl.4s v0, {{v[0-9]+}}, #1
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
        ret <4 x i32> %tmp3
}

define <2 x i64> @uqshli2d(<2 x i64>* %A) nounwind {
;CHECK-LABEL: uqshli2d:
;CHECK: uqshl.2d v0, {{v[0-9]+}}, #1
        %tmp1 = load <2 x i64>, <2 x i64>* %A
        %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
        ret <2 x i64> %tmp3
}

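; urshl by a constant -1 is a rounding right shift by 1; when the result feeds
; an add, the combination is expected to fold into the accumulating ursra
; instruction.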
define <8 x i8> @ursra8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: ursra8b:
;CHECK: ursra.8b v0, {{v[0-9]+}}, #1
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
        %tmp4 = load <8 x i8>, <8 x i8>* %B
        %tmp5 = add <8 x i8> %tmp3, %tmp4
        ret <8 x i8> %tmp5
}

define <4 x i16> @ursra4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: ursra4h:
;CHECK: ursra.4h v0, {{v[0-9]+}}, #1
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
        %tmp4 = load <4 x i16>, <4 x i16>* %B
        %tmp5 = add <4 x i16> %tmp3, %tmp4
        ret <4 x i16> %tmp5
}

define <2 x i32> @ursra2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: ursra2s:
;CHECK: ursra.2s v0, {{v[0-9]+}}, #1
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
        %tmp4 = load <2 x i32>, <2 x i32>* %B
        %tmp5 = add <2 x i32> %tmp3, %tmp4
        ret <2 x i32> %tmp5
}

define <16 x i8> @ursra16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: ursra16b:
;CHECK: ursra.16b v0, {{v[0-9]+}}, #1
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
        %tmp4 = load <16 x i8>, <16 x i8>* %B
        %tmp5 = add <16 x i8> %tmp3, %tmp4
        ret <16 x i8> %tmp5
}

define <8 x i16> @ursra8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: ursra8h:
;CHECK: ursra.8h v0, {{v[0-9]+}}, #1
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
        %tmp4 = load <8 x i16>, <8 x i16>* %B
        %tmp5 = add <8 x i16> %tmp3, %tmp4
        ret <8 x i16> %tmp5
}

define <4 x i32> @ursra4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: ursra4s:
;CHECK: ursra.4s v0, {{v[0-9]+}}, #1
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
        %tmp4 = load <4 x i32>, <4 x i32>* %B
        %tmp5 = add <4 x i32> %tmp3, %tmp4
        ret <4 x i32> %tmp5
}

define <2 x i64> @ursra2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: ursra2d:
;CHECK: ursra.2d v0, {{v[0-9]+}}, #1
        %tmp1 = load <2 x i64>, <2 x i64>* %A
        %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
        %tmp4 = load <2 x i64>, <2 x i64>* %B
        %tmp5 = add <2 x i64> %tmp3, %tmp4
        ret <2 x i64> %tmp5
}

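; Same pattern as above with the signed rounding shift: srshl by -1 plus an
; add should fold into srsra.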
define <8 x i8> @srsra8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: srsra8b:
;CHECK: srsra.8b v0, {{v[0-9]+}}, #1
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
        %tmp4 = load <8 x i8>, <8 x i8>* %B
        %tmp5 = add <8 x i8> %tmp3, %tmp4
        ret <8 x i8> %tmp5
}

define <4 x i16> @srsra4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: srsra4h:
;CHECK: srsra.4h v0, {{v[0-9]+}}, #1
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
        %tmp4 = load <4 x i16>, <4 x i16>* %B
        %tmp5 = add <4 x i16> %tmp3, %tmp4
        ret <4 x i16> %tmp5
}

define <2 x i32> @srsra2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: srsra2s:
;CHECK: srsra.2s v0, {{v[0-9]+}}, #1
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
        %tmp4 = load <2 x i32>, <2 x i32>* %B
        %tmp5 = add <2 x i32> %tmp3, %tmp4
        ret <2 x i32> %tmp5
}

define <16 x i8> @srsra16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: srsra16b:
;CHECK: srsra.16b v0, {{v[0-9]+}}, #1
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
        %tmp4 = load <16 x i8>, <16 x i8>* %B
        %tmp5 = add <16 x i8> %tmp3, %tmp4
        ret <16 x i8> %tmp5
}

define <8 x i16> @srsra8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: srsra8h:
;CHECK: srsra.8h v0, {{v[0-9]+}}, #1
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
        %tmp4 = load <8 x i16>, <8 x i16>* %B
        %tmp5 = add <8 x i16> %tmp3, %tmp4
        ret <8 x i16> %tmp5
}

define <4 x i32> @srsra4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: srsra4s:
;CHECK: srsra.4s v0, {{v[0-9]+}}, #1
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
        %tmp4 = load <4 x i32>, <4 x i32>* %B
        %tmp5 = add <4 x i32> %tmp3, %tmp4
        ret <4 x i32> %tmp5
}

define <2 x i64> @srsra2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: srsra2d:
;CHECK: srsra.2d v0, {{v[0-9]+}}, #1
        %tmp1 = load <2 x i64>, <2 x i64>* %A
        %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
        %tmp4 = load <2 x i64>, <2 x i64>* %B
        %tmp5 = add <2 x i64> %tmp3, %tmp4
        ret <2 x i64> %tmp5
}

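; A plain lshr by an immediate followed by an add is expected to fold into the
; unsigned shift-right-and-accumulate instruction usra.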
define <8 x i8> @usra8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: usra8b:
;CHECK: usra.8b v0, {{v[0-9]+}}, #1
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = lshr <8 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
        %tmp4 = load <8 x i8>, <8 x i8>* %B
        %tmp5 = add <8 x i8> %tmp3, %tmp4
        ret <8 x i8> %tmp5
}

define <4 x i16> @usra4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: usra4h:
;CHECK: usra.4h v0, {{v[0-9]+}}, #1
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = lshr <4 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1>
        %tmp4 = load <4 x i16>, <4 x i16>* %B
        %tmp5 = add <4 x i16> %tmp3, %tmp4
        ret <4 x i16> %tmp5
}

define <2 x i32> @usra2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: usra2s:
;CHECK: usra.2s v0, {{v[0-9]+}}, #1
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = lshr <2 x i32> %tmp1, <i32 1, i32 1>
        %tmp4 = load <2 x i32>, <2 x i32>* %B
        %tmp5 = add <2 x i32> %tmp3, %tmp4
        ret <2 x i32> %tmp5
}

define <16 x i8> @usra16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: usra16b:
;CHECK: usra.16b v0, {{v[0-9]+}}, #1
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = lshr <16 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
        %tmp4 = load <16 x i8>, <16 x i8>* %B
        %tmp5 = add <16 x i8> %tmp3, %tmp4
        ret <16 x i8> %tmp5
}

define <8 x i16> @usra8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: usra8h:
;CHECK: usra.8h v0, {{v[0-9]+}}, #1
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = lshr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
        %tmp4 = load <8 x i16>, <8 x i16>* %B
        %tmp5 = add <8 x i16> %tmp3, %tmp4
        ret <8 x i16> %tmp5
}

define <4 x i32> @usra4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: usra4s:
;CHECK: usra.4s v0, {{v[0-9]+}}, #1
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = lshr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
        %tmp4 = load <4 x i32>, <4 x i32>* %B
        %tmp5 = add <4 x i32> %tmp3, %tmp4
        ret <4 x i32> %tmp5
}

define <2 x i64> @usra2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: usra2d:
;CHECK: usra.2d v0, {{v[0-9]+}}, #1
        %tmp1 = load <2 x i64>, <2 x i64>* %A
        %tmp3 = lshr <2 x i64> %tmp1, <i64 1, i64 1>
        %tmp4 = load <2 x i64>, <2 x i64>* %B
        %tmp5 = add <2 x i64> %tmp3, %tmp4
        ret <2 x i64> %tmp5
}

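; Likewise, an ashr by an immediate followed by an add should fold into the
; signed shift-right-and-accumulate instruction ssra.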
define <8 x i8> @ssra8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: ssra8b:
;CHECK: ssra.8b v0, {{v[0-9]+}}, #1
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp3 = ashr <8 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
        %tmp4 = load <8 x i8>, <8 x i8>* %B
        %tmp5 = add <8 x i8> %tmp3, %tmp4
        ret <8 x i8> %tmp5
}

define <4 x i16> @ssra4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: ssra4h:
;CHECK: ssra.4h v0, {{v[0-9]+}}, #1
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp3 = ashr <4 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1>
        %tmp4 = load <4 x i16>, <4 x i16>* %B
        %tmp5 = add <4 x i16> %tmp3, %tmp4
        ret <4 x i16> %tmp5
}

define <2 x i32> @ssra2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: ssra2s:
;CHECK: ssra.2s v0, {{v[0-9]+}}, #1
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp3 = ashr <2 x i32> %tmp1, <i32 1, i32 1>
        %tmp4 = load <2 x i32>, <2 x i32>* %B
        %tmp5 = add <2 x i32> %tmp3, %tmp4
        ret <2 x i32> %tmp5
}

define <16 x i8> @ssra16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: ssra16b:
;CHECK: ssra.16b v0, {{v[0-9]+}}, #1
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp3 = ashr <16 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
        %tmp4 = load <16 x i8>, <16 x i8>* %B
        %tmp5 = add <16 x i8> %tmp3, %tmp4
        ret <16 x i8> %tmp5
}

define <8 x i16> @ssra8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: ssra8h:
;CHECK: ssra.8h v0, {{v[0-9]+}}, #1
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp3 = ashr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
        %tmp4 = load <8 x i16>, <8 x i16>* %B
        %tmp5 = add <8 x i16> %tmp3, %tmp4
        ret <8 x i16> %tmp5
}

define <4 x i32> @ssra4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: ssra4s:
;CHECK: ssra.4s v0, {{v[0-9]+}}, #1
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp3 = ashr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
        %tmp4 = load <4 x i32>, <4 x i32>* %B
        %tmp5 = add <4 x i32> %tmp3, %tmp4
        ret <4 x i32> %tmp5
}

define <2 x i64> @ssra2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: ssra2d:
;CHECK: ssra.2d v0, {{v[0-9]+}}, #1
        %tmp1 = load <2 x i64>, <2 x i64>* %A
        %tmp3 = ashr <2 x i64> %tmp1, <i64 1, i64 1>
        %tmp4 = load <2 x i64>, <2 x i64>* %B
        %tmp5 = add <2 x i64> %tmp3, %tmp4
        ret <2 x i64> %tmp5
}

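; The shr_orr and shl_orr tests check the negative case: a shift combined with
; an or (rather than an add) must not be turned into an accumulate
; instruction, so a separate shift followed by orr is expected.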
define <8 x i8> @shr_orr8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: shr_orr8b:
;CHECK: shr.8b v0, {{v[0-9]+}}, #1
;CHECK-NEXT: orr.8b
;CHECK-NEXT: ret
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp4 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = lshr <8 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
        %tmp5 = or <8 x i8> %tmp3, %tmp4
        ret <8 x i8> %tmp5
}

define <4 x i16> @shr_orr4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: shr_orr4h:
;CHECK: shr.4h v0, {{v[0-9]+}}, #1
;CHECK-NEXT: orr.8b
;CHECK-NEXT: ret
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp4 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = lshr <4 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1>
        %tmp5 = or <4 x i16> %tmp3, %tmp4
        ret <4 x i16> %tmp5
}

define <2 x i32> @shr_orr2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: shr_orr2s:
;CHECK: shr.2s v0, {{v[0-9]+}}, #1
;CHECK-NEXT: orr.8b
;CHECK-NEXT: ret
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp4 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = lshr <2 x i32> %tmp1, <i32 1, i32 1>
        %tmp5 = or <2 x i32> %tmp3, %tmp4
        ret <2 x i32> %tmp5
}

define <16 x i8> @shr_orr16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: shr_orr16b:
;CHECK: shr.16b v0, {{v[0-9]+}}, #1
;CHECK-NEXT: orr.16b
;CHECK-NEXT: ret
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp4 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = lshr <16 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
        %tmp5 = or <16 x i8> %tmp3, %tmp4
        ret <16 x i8> %tmp5
}

define <8 x i16> @shr_orr8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: shr_orr8h:
;CHECK: shr.8h v0, {{v[0-9]+}}, #1
;CHECK-NEXT: orr.16b
;CHECK-NEXT: ret
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp4 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = lshr <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
        %tmp5 = or <8 x i16> %tmp3, %tmp4
        ret <8 x i16> %tmp5
}

define <4 x i32> @shr_orr4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: shr_orr4s:
;CHECK: shr.4s v0, {{v[0-9]+}}, #1
;CHECK-NEXT: orr.16b
;CHECK-NEXT: ret
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp4 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = lshr <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
        %tmp5 = or <4 x i32> %tmp3, %tmp4
        ret <4 x i32> %tmp5
}

define <2 x i64> @shr_orr2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: shr_orr2d:
;CHECK: shr.2d v0, {{v[0-9]+}}, #1
;CHECK-NEXT: orr.16b
;CHECK-NEXT: ret
        %tmp1 = load <2 x i64>, <2 x i64>* %A
        %tmp4 = load <2 x i64>, <2 x i64>* %B
        %tmp3 = lshr <2 x i64> %tmp1, <i64 1, i64 1>
        %tmp5 = or <2 x i64> %tmp3, %tmp4
        ret <2 x i64> %tmp5
}

define <8 x i8> @shl_orr8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: shl_orr8b:
;CHECK: shl.8b v0, {{v[0-9]+}}, #1
;CHECK-NEXT: orr.8b
;CHECK-NEXT: ret
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp4 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = shl <8 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
        %tmp5 = or <8 x i8> %tmp3, %tmp4
        ret <8 x i8> %tmp5
}

define <4 x i16> @shl_orr4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: shl_orr4h:
;CHECK: shl.4h v0, {{v[0-9]+}}, #1
;CHECK-NEXT: orr.8b
;CHECK-NEXT: ret
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp4 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = shl <4 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1>
        %tmp5 = or <4 x i16> %tmp3, %tmp4
        ret <4 x i16> %tmp5
}

define <2 x i32> @shl_orr2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: shl_orr2s:
;CHECK: shl.2s v0, {{v[0-9]+}}, #1
;CHECK-NEXT: orr.8b
;CHECK-NEXT: ret
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp4 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = shl <2 x i32> %tmp1, <i32 1, i32 1>
        %tmp5 = or <2 x i32> %tmp3, %tmp4
        ret <2 x i32> %tmp5
}

define <16 x i8> @shl_orr16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: shl_orr16b:
;CHECK: shl.16b v0, {{v[0-9]+}}, #1
;CHECK-NEXT: orr.16b
;CHECK-NEXT: ret
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp4 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = shl <16 x i8> %tmp1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
        %tmp5 = or <16 x i8> %tmp3, %tmp4
        ret <16 x i8> %tmp5
}

define <8 x i16> @shl_orr8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: shl_orr8h:
;CHECK: shl.8h v0, {{v[0-9]+}}, #1
;CHECK-NEXT: orr.16b
;CHECK-NEXT: ret
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp4 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = shl <8 x i16> %tmp1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
        %tmp5 = or <8 x i16> %tmp3, %tmp4
        ret <8 x i16> %tmp5
}

define <4 x i32> @shl_orr4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: shl_orr4s:
;CHECK: shl.4s v0, {{v[0-9]+}}, #1
;CHECK-NEXT: orr.16b
;CHECK-NEXT: ret
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp4 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = shl <4 x i32> %tmp1, <i32 1, i32 1, i32 1, i32 1>
        %tmp5 = or <4 x i32> %tmp3, %tmp4
        ret <4 x i32> %tmp5
}

define <2 x i64> @shl_orr2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: shl_orr2d:
;CHECK: shl.2d v0, {{v[0-9]+}}, #1
;CHECK-NEXT: orr.16b
;CHECK-NEXT: ret
        %tmp1 = load <2 x i64>, <2 x i64>* %A
        %tmp4 = load <2 x i64>, <2 x i64>* %B
        %tmp3 = shl <2 x i64> %tmp1, <i64 1, i64 1>
        %tmp5 = or <2 x i64> %tmp3, %tmp4
        ret <2 x i64> %tmp5
}

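; A zero-extend followed by a shift left by the source element width (8 or 16)
; is expected to select shll, or shll2 when only the high half is extended.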
define <8 x i16> @shll(<8 x i8> %in) {
; CHECK-LABEL: shll:
; CHECK: shll.8h v0, {{v[0-9]+}}, #8
  %ext = zext <8 x i8> %in to <8 x i16>
  %res = shl <8 x i16> %ext, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  ret <8 x i16> %res
}

define <4 x i32> @shll_high(<8 x i16> %in) {
; CHECK-LABEL: shll_high:
; CHECK: shll2.4s v0, {{v[0-9]+}}, #16
  %extract = shufflevector <8 x i16> %in, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %ext = zext <4 x i16> %extract to <4 x i32>
  %res = shl <4 x i32> %ext, <i32 16, i32 16, i32 16, i32 16>
  ret <4 x i32> %res
}

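; The remaining tests exercise the shift-left-and-insert intrinsic, which
; should map directly onto sli (or the scalar d-register form for <1 x i64>).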
define <8 x i8> @sli8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sli8b:
;CHECK: sli.8b v0, {{v[0-9]+}}, #1
        %tmp1 = load <8 x i8>, <8 x i8>* %A
        %tmp2 = load <8 x i8>, <8 x i8>* %B
        %tmp3 = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, i32 1)
        ret <8 x i8> %tmp3
}

define <4 x i16> @sli4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
;CHECK-LABEL: sli4h:
;CHECK: sli.4h v0, {{v[0-9]+}}, #1
        %tmp1 = load <4 x i16>, <4 x i16>* %A
        %tmp2 = load <4 x i16>, <4 x i16>* %B
        %tmp3 = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, i32 1)
        ret <4 x i16> %tmp3
}

define <2 x i32> @sli2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
;CHECK-LABEL: sli2s:
;CHECK: sli.2s v0, {{v[0-9]+}}, #1
        %tmp1 = load <2 x i32>, <2 x i32>* %A
        %tmp2 = load <2 x i32>, <2 x i32>* %B
        %tmp3 = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, i32 1)
        ret <2 x i32> %tmp3
}

define <1 x i64> @sli1d(<1 x i64>* %A, <1 x i64>* %B) nounwind {
;CHECK-LABEL: sli1d:
;CHECK: sli d0, {{d[0-9]+}}, #1
        %tmp1 = load <1 x i64>, <1 x i64>* %A
        %tmp2 = load <1 x i64>, <1 x i64>* %B
        %tmp3 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, i32 1)
        ret <1 x i64> %tmp3
}

define <16 x i8> @sli16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
;CHECK-LABEL: sli16b:
;CHECK: sli.16b v0, {{v[0-9]+}}, #1
        %tmp1 = load <16 x i8>, <16 x i8>* %A
        %tmp2 = load <16 x i8>, <16 x i8>* %B
        %tmp3 = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, i32 1)
        ret <16 x i8> %tmp3
}

define <8 x i16> @sli8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: sli8h:
;CHECK: sli.8h v0, {{v[0-9]+}}, #1
        %tmp1 = load <8 x i16>, <8 x i16>* %A
        %tmp2 = load <8 x i16>, <8 x i16>* %B
        %tmp3 = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, i32 1)
        ret <8 x i16> %tmp3
}

define <4 x i32> @sli4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
;CHECK-LABEL: sli4s:
;CHECK: sli.4s v0, {{v[0-9]+}}, #1
        %tmp1 = load <4 x i32>, <4 x i32>* %A
        %tmp2 = load <4 x i32>, <4 x i32>* %B
        %tmp3 = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, i32 1)
        ret <4 x i32> %tmp3
}

define <2 x i64> @sli2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
;CHECK-LABEL: sli2d:
;CHECK: sli.2d v0, {{v[0-9]+}}, #1
        %tmp1 = load <2 x i64>, <2 x i64>* %A
        %tmp2 = load <2 x i64>, <2 x i64>* %B
        %tmp3 = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, i32 1)
        ret <2 x i64> %tmp3
}

declare <8 x i8>  @llvm.aarch64.neon.vsli.v8i8(<8 x i8>, <8 x i8>, i32) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16>, <4 x i16>, i32) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32>, <2 x i32>, i32) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32) nounwind readnone

declare <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8>, <16 x i8>, i32) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16>, <8 x i16>, i32) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32) nounwind readnone

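; NEON has no variable right-shift instruction, so a variable ashr on <1 x i64>
; is expected to be lowered as a negate of the shift amount followed by sshl.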
define <1 x i64> @ashr_v1i64(<1 x i64> %a, <1 x i64> %b) {
; CHECK-LABEL: ashr_v1i64:
; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}}
; CHECK: sshl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
  %c = ashr <1 x i64> %a, %b
  ret <1 x i64> %c
}
