• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X32
3; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64
4
5;
6; 128-bit Vectors
7;
8
; unpacklo(hadd(a0,a1), hadd(a2,a3)) v4f32: the kept lanes <0,1,4,5> are all
; sourced from %a0 and %a2, so the pair of hadds folds to one vhaddps %xmm2,%xmm0.
define <4 x float> @test_unpackl_fhadd_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
; X32-LABEL: test_unpackl_fhadd_128:
; X32:       ## %bb.0:
; X32-NEXT:    vhaddps %xmm2, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_fhadd_128:
; X64:       ## %bb.0:
; X64-NEXT:    vhaddps %xmm2, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
  %2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a2, <4 x float> %a3)
  %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %3
}
24
; unpackhi(hadd(a0,a1), hadd(a2,a3)) v2f64: the kept lanes <1,3> are sourced
; only from %a1 and %a3, so this folds to a single vhaddpd %xmm3,%xmm1.
define <2 x double> @test_unpackh_fhadd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
; X32-LABEL: test_unpackh_fhadd_128:
; X32:       ## %bb.0:
; X32-NEXT:    vhaddpd %xmm3, %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_fhadd_128:
; X64:       ## %bb.0:
; X64-NEXT:    vhaddpd %xmm3, %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
  %2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a2, <2 x double> %a3)
  %3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %3
}
40
; unpacklo(hsub(a0,a1), hsub(a2,a3)) v2f64: kept lanes <0,2> come only from
; %a0 and %a2, folding to a single vhsubpd %xmm2,%xmm0.
define <2 x double> @test_unpackl_fhsub_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
; X32-LABEL: test_unpackl_fhsub_128:
; X32:       ## %bb.0:
; X32-NEXT:    vhsubpd %xmm2, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_fhsub_128:
; X64:       ## %bb.0:
; X64-NEXT:    vhsubpd %xmm2, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
  %2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a2, <2 x double> %a3)
  %3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %3
}
56
; unpackhi(hsub(a0,a1), hsub(a2,a3)) v4f32: kept lanes <2,3,6,7> come only from
; %a1 and %a3, folding to a single vhsubps %xmm3,%xmm1.
define <4 x float> @test_unpackh_fhsub_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
; X32-LABEL: test_unpackh_fhsub_128:
; X32:       ## %bb.0:
; X32-NEXT:    vhsubps %xmm3, %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_fhsub_128:
; X64:       ## %bb.0:
; X64-NEXT:    vhsubps %xmm3, %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
  %2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a2, <4 x float> %a3)
  %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x float> %3
}
72
; Integer variant: unpacklo(phaddw(a0,a1), phaddw(a2,a3)) v8i16 keeps only the
; %a0/%a2-sourced elements <0-3,8-11>, folding to one vphaddw %xmm2,%xmm0.
define <8 x i16> @test_unpackl_hadd_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; X32-LABEL: test_unpackl_hadd_128:
; X32:       ## %bb.0:
; X32-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_hadd_128:
; X64:       ## %bb.0:
; X64-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i16> %3
}
88
; unpackhi(phaddd(a0,a1), phaddd(a2,a3)) v4i32 keeps only the %a1/%a3-sourced
; elements <2,3,6,7>, folding to one vphaddd %xmm3,%xmm1.
define <4 x i32> @test_unpackh_hadd_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; X32-LABEL: test_unpackh_hadd_128:
; X32:       ## %bb.0:
; X32-NEXT:    vphaddd %xmm3, %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_hadd_128:
; X64:       ## %bb.0:
; X64-NEXT:    vphaddd %xmm3, %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a2, <4 x i32> %a3)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i32> %3
}
104
; unpacklo(phsubd(a0,a1), phsubd(a2,a3)) v4i32 keeps only the %a0/%a2-sourced
; elements <0,1,4,5>, folding to one vphsubd %xmm2,%xmm0.
define <4 x i32> @test_unpackl_hsub_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; X32-LABEL: test_unpackl_hsub_128:
; X32:       ## %bb.0:
; X32-NEXT:    vphsubd %xmm2, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_hsub_128:
; X64:       ## %bb.0:
; X64-NEXT:    vphsubd %xmm2, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a2, <4 x i32> %a3)
  %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i32> %3
}
120
; unpackhi(phsubw(a0,a1), phsubw(a2,a3)) v8i16 keeps only the %a1/%a3-sourced
; elements <4-7,12-15>, folding to one vphsubw %xmm3,%xmm1.
define <8 x i16> @test_unpackh_hsub_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; X32-LABEL: test_unpackh_hsub_128:
; X32:       ## %bb.0:
; X32-NEXT:    vphsubw %xmm3, %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_hsub_128:
; X64:       ## %bb.0:
; X64-NEXT:    vphsubw %xmm3, %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %3
}
136
; unpacklo(packsswb(a0,a1), packsswb(a2,a3)): kept bytes <0-7,16-23> are the
; packed forms of %a0 and %a2 only, folding to one vpacksswb %xmm2,%xmm0.
define <16 x i8> @test_unpackl_packss_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; X32-LABEL: test_unpackl_packss_128:
; X32:       ## %bb.0:
; X32-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_packss_128:
; X64:       ## %bb.0:
; X64-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  ret <16 x i8> %3
}
152
; unpackhi(packssdw(a0,a1), packssdw(a2,a3)): kept words <4-7,12-15> are the
; packed forms of %a1 and %a3 only, folding to one vpackssdw %xmm3,%xmm1.
define <8 x i16> @test_unpackh_packss_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; X32-LABEL: test_unpackh_packss_128:
; X32:       ## %bb.0:
; X32-NEXT:    vpackssdw %xmm3, %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_packss_128:
; X64:       ## %bb.0:
; X64-NEXT:    vpackssdw %xmm3, %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a2, <4 x i32> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %3
}
168
; Unsigned-saturation variant: unpacklo(packusdw(a0,a1), packusdw(a2,a3)) keeps
; words <0-3,8-11> (from %a0/%a2 only), folding to one vpackusdw %xmm2,%xmm0.
define <8 x i16> @test_unpackl_packus_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; X32-LABEL: test_unpackl_packus_128:
; X32:       ## %bb.0:
; X32-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_packus_128:
; X64:       ## %bb.0:
; X64-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; X64-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
  %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a2, <4 x i32> %a3)
  %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i16> %3
}
184
; unpackhi(packuswb(a0,a1), packuswb(a2,a3)): kept bytes <8-15,24-31> are the
; packed forms of %a1 and %a3 only, folding to one vpackuswb %xmm3,%xmm1.
define <16 x i8> @test_unpackh_packus_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; X32-LABEL: test_unpackh_packus_128:
; X32:       ## %bb.0:
; X32-NEXT:    vpackuswb %xmm3, %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_packus_128:
; X64:       ## %bb.0:
; X64-NEXT:    vpackuswb %xmm3, %xmm1, %xmm0
; X64-NEXT:    retq
  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
  %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
  %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %3
}
200
201;
202; 256-bit Vectors
203;
204
; 256-bit unpacklo is per-128-bit-lane (mask <0,1,8,9,4,5,12,13>); each lane
; keeps only %a0/%a2-sourced elements, so this folds to one vhaddps %ymm2,%ymm0.
define <8 x float> @test_unpackl_fhadd_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> %a3) {
; X32-LABEL: test_unpackl_fhadd_256:
; X32:       ## %bb.0:
; X32-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_fhadd_256:
; X64:       ## %bb.0:
; X64-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
  %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a2, <8 x float> %a3)
  %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
  ret <8 x float> %3
}
220
; Per-lane unpackhi (mask <1,5,3,7>) of two v4f64 hadds keeps only %a1/%a3
; elements, folding to one vhaddpd %ymm3,%ymm1.
define <4 x double> @test_unpackh_fhadd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> %a3) {
; X32-LABEL: test_unpackh_fhadd_256:
; X32:       ## %bb.0:
; X32-NEXT:    vhaddpd %ymm3, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_fhadd_256:
; X64:       ## %bb.0:
; X64-NEXT:    vhaddpd %ymm3, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
  %2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a2, <4 x double> %a3)
  %3 = shufflevector <4 x double> %1, <4 x double> %2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x double> %3
}
236
; Per-lane unpacklo (mask <0,4,2,6>) of two v4f64 hsubs keeps only %a0/%a2
; elements, folding to one vhsubpd %ymm2,%ymm0.
define <4 x double> @test_unpackl_fhsub_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> %a3) {
; X32-LABEL: test_unpackl_fhsub_256:
; X32:       ## %bb.0:
; X32-NEXT:    vhsubpd %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_fhsub_256:
; X64:       ## %bb.0:
; X64-NEXT:    vhsubpd %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  %2 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a2, <4 x double> %a3)
  %3 = shufflevector <4 x double> %1, <4 x double> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %3
}
252
; Per-lane unpackhi (mask <2,3,10,11,6,7,14,15>) of two v8f32 hsubs keeps only
; %a1/%a3 elements, folding to one vhsubps %ymm3,%ymm1.
define <8 x float> @test_unpackh_fhsub_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> %a3) {
; X32-LABEL: test_unpackh_fhsub_256:
; X32:       ## %bb.0:
; X32-NEXT:    vhsubps %ymm3, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_fhsub_256:
; X64:       ## %bb.0:
; X64-NEXT:    vhsubps %ymm3, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  %2 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a2, <8 x float> %a3)
  %3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
  ret <8 x float> %3
}
268
; Per-lane unpacklo of two v16i16 phaddw results keeps only %a0/%a2-sourced
; words, folding to one vphaddw %ymm2,%ymm0.
define <16 x i16> @test_unpackl_hadd_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; X32-LABEL: test_unpackl_hadd_256:
; X32:       ## %bb.0:
; X32-NEXT:    vphaddw %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_hadd_256:
; X64:       ## %bb.0:
; X64-NEXT:    vphaddw %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a2, <16 x i16> %a3)
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27>
  ret <16 x i16> %3
}
284
; Per-lane unpackhi of two v8i32 phaddd results keeps only %a1/%a3-sourced
; dwords, folding to one vphaddd %ymm3,%ymm1.
define <8 x i32> @test_unpackh_hadd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; X32-LABEL: test_unpackh_hadd_256:
; X32:       ## %bb.0:
; X32-NEXT:    vphaddd %ymm3, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_hadd_256:
; X64:       ## %bb.0:
; X64-NEXT:    vphaddd %ymm3, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a2, <8 x i32> %a3)
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15>
  ret <8 x i32> %3
}
300
; Per-lane unpacklo of two v8i32 phsubd results keeps only %a0/%a2-sourced
; dwords, folding to one vphsubd %ymm2,%ymm0.
define <8 x i32> @test_unpackl_hsub_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; X32-LABEL: test_unpackl_hsub_256:
; X32:       ## %bb.0:
; X32-NEXT:    vphsubd %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_hsub_256:
; X64:       ## %bb.0:
; X64-NEXT:    vphsubd %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a2, <8 x i32> %a3)
  %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
  ret <8 x i32> %3
}
316
; Per-lane unpackhi of two v16i16 phsubw results keeps only %a1/%a3-sourced
; words, folding to one vphsubw %ymm3,%ymm1.
define <16 x i16> @test_unpackh_hsub_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; X32-LABEL: test_unpackh_hsub_256:
; X32:       ## %bb.0:
; X32-NEXT:    vphsubw %ymm3, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_hsub_256:
; X64:       ## %bb.0:
; X64-NEXT:    vphsubw %ymm3, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a2, <16 x i16> %a3)
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i16> %3
}
332
; Per-lane unpacklo of two v32i8 packsswb results keeps only the %a0/%a2 packed
; bytes, folding to one vpacksswb %ymm2,%ymm0.
define <32 x i8> @test_unpackl_packss_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; X32-LABEL: test_unpackl_packss_256:
; X32:       ## %bb.0:
; X32-NEXT:    vpacksswb %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_packss_256:
; X64:       ## %bb.0:
; X64-NEXT:    vpacksswb %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a2, <16 x i16> %a3)
  %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
  ret <32 x i8> %3
}
348
; Per-lane unpackhi of two v16i16 packssdw results keeps only the %a1/%a3 packed
; words, folding to one vpackssdw %ymm3,%ymm1.
define <16 x i16> @test_unpackh_packss_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; X32-LABEL: test_unpackh_packss_256:
; X32:       ## %bb.0:
; X32-NEXT:    vpackssdw %ymm3, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_packss_256:
; X64:       ## %bb.0:
; X64-NEXT:    vpackssdw %ymm3, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a2, <8 x i32> %a3)
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i16> %3
}
364
; Per-lane unpacklo of two v16i16 packusdw results keeps only the %a0/%a2 packed
; words, folding to one vpackusdw %ymm2,%ymm0.
define <16 x i16> @test_unpackl_packus_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; X32-LABEL: test_unpackl_packus_256:
; X32:       ## %bb.0:
; X32-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackl_packus_256:
; X64:       ## %bb.0:
; X64-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
  %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a2, <8 x i32> %a3)
  %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27>
  ret <16 x i16> %3
}
380
; Per-lane unpackhi of two v32i8 packuswb results keeps only the %a1/%a3 packed
; bytes, folding to one vpackuswb %ymm3,%ymm1.
; FIX: this test previously called llvm.x86.avx2.packsswb (signed saturation),
; duplicating the packss coverage and leaving the packuswb declare unused; it
; now exercises the unsigned pack its name promises.
define <32 x i8> @test_unpackh_packus_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; X32-LABEL: test_unpackh_packus_256:
; X32:       ## %bb.0:
; X32-NEXT:    vpackuswb %ymm3, %ymm1, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_unpackh_packus_256:
; X64:       ## %bb.0:
; X64-NEXT:    vpackuswb %ymm3, %ymm1, %ymm0
; X64-NEXT:    retq
  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
  %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a2, <16 x i16> %a3)
  %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
  ret <32 x i8> %3
}
396
397declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
398declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>)
399declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>)
400declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>)
401
402declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>)
403declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>)
404declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>)
405declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>)
406
407declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
408declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
409declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
410declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)
411
412declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>)
413declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>)
414declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>)
415declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>)
416
417declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>)
418declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>)
419declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>)
420declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>)
421
422declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>)
423declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
424declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>)
425declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>)
426