• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
3; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx  | FileCheck %s --check-prefixes=AVX,AVX1
4; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
5
6; The following tests check for matching the horizontal op and, where possible, eliminating the shuffle.
7; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111
8
; Horizontal add on a single 128-bit float vector: the even lanes (0,2) and
; odd lanes (1,3) of %a are extracted as <2 x float> and added pairwise. The
; final shuffle places the two sums in lanes 2-3 of the result (lanes 0-1 are
; undef). The checks expect one haddps/vhaddps of %xmm0 with itself and no
; separate shuffle instruction.
9define <4 x float> @hadd_v4f32(<4 x float> %a) {
10; SSSE3-LABEL: hadd_v4f32:
11; SSSE3:       # %bb.0:
12; SSSE3-NEXT:    haddps %xmm0, %xmm0
13; SSSE3-NEXT:    retq
14;
15; AVX-LABEL: hadd_v4f32:
16; AVX:       # %bb.0:
17; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
18; AVX-NEXT:    retq
19  %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
20  %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
21  %hop = fadd <2 x float> %a02, %a13
22  %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
23  ret <4 x float> %shuf
24}
25
; Horizontal add of a 256-bit float vector, narrowed to 128 bits: the even
; lanes (0,2,4,6) and odd lanes (1,3,5,7) of %a are extracted as <4 x float>
; and added. The four sums are then widened back to <8 x float>, with sums 0-1
; in lanes 2-3, sums 2-3 in lanes 6-7, and the remaining lanes undef.
; The checks expect a single 128-bit haddps/vhaddps of the two halves of %a;
; a shuffle still follows it (movddup on SSSE3, vmovddup + vinsertf128 on
; AVX1, vpermpd on AVX2).
26define <8 x float> @hadd_v8f32a(<8 x float> %a) {
27; SSSE3-LABEL: hadd_v8f32a:
28; SSSE3:       # %bb.0:
29; SSSE3-NEXT:    movaps %xmm0, %xmm2
30; SSSE3-NEXT:    haddps %xmm1, %xmm2
31; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
32; SSSE3-NEXT:    movaps %xmm2, %xmm1
33; SSSE3-NEXT:    retq
34;
35; AVX1-LABEL: hadd_v8f32a:
36; AVX1:       # %bb.0:
37; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
38; AVX1-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
39; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
40; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
41; AVX1-NEXT:    retq
42;
43; AVX2-LABEL: hadd_v8f32a:
44; AVX2:       # %bb.0:
45; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
46; AVX2-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
47; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
48; AVX2-NEXT:    retq
49  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
50  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
51  %hop = fadd <4 x float> %a0, %a1
52  %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
53  ret <8 x float> %shuf
54}
55
; Horizontal add kept at 256 bits: within each 128-bit half of %a the even
; lanes (0,2 and 4,6) are added to the odd lanes (1,3 and 5,7), leaving two
; sums in the low two lanes of each half (the other lanes are undef). The
; final shuffle repeats those two sums across each half (<0,1,0,1,4,5,4,5>).
; The checks expect one haddps per xmm register on SSSE3 and a single ymm
; vhaddps on AVX, with no separate shuffle instruction.
56define <8 x float> @hadd_v8f32b(<8 x float> %a) {
57; SSSE3-LABEL: hadd_v8f32b:
58; SSSE3:       # %bb.0:
59; SSSE3-NEXT:    haddps %xmm0, %xmm0
60; SSSE3-NEXT:    haddps %xmm1, %xmm1
61; SSSE3-NEXT:    retq
62;
63; AVX-LABEL: hadd_v8f32b:
64; AVX:       # %bb.0:
65; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
66; AVX-NEXT:    retq
67  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
68  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
69  %hop = fadd <8 x float> %a0, %a1
70  %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
71  ret <8 x float> %shuf
72}
73
; Horizontal subtract on a single 128-bit float vector: the odd lanes (1,3) of
; %a are subtracted from the even lanes (0,2) as <2 x float>. The final
; shuffle repeats the two differences to fill all four result lanes
; (<0,1,0,1>). The checks expect one hsubps/vhsubps of %xmm0 with itself and
; no separate shuffle instruction.
74define <4 x float> @hsub_v4f32(<4 x float> %a) {
75; SSSE3-LABEL: hsub_v4f32:
76; SSSE3:       # %bb.0:
77; SSSE3-NEXT:    hsubps %xmm0, %xmm0
78; SSSE3-NEXT:    retq
79;
80; AVX-LABEL: hsub_v4f32:
81; AVX:       # %bb.0:
82; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
83; AVX-NEXT:    retq
84  %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
85  %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
86  %hop = fsub <2 x float> %a02, %a13
87  %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
88  ret <4 x float> %shuf
89}
90
; Horizontal subtract of a 256-bit float vector, narrowed to 128 bits: the odd
; lanes (1,3,5,7) of %a are subtracted from the even lanes (0,2,4,6) as
; <4 x float>. The four differences are then widened back to <8 x float>, with
; differences 0-1 in lanes 2-3, differences 2-3 in lanes 6-7, and the
; remaining lanes undef.
; The checks expect a single 128-bit hsubps/vhsubps of the two halves of %a;
; a shuffle still follows it (movddup on SSSE3, vmovddup + vinsertf128 on
; AVX1, vpermpd on AVX2).
91define <8 x float> @hsub_v8f32a(<8 x float> %a) {
92; SSSE3-LABEL: hsub_v8f32a:
93; SSSE3:       # %bb.0:
94; SSSE3-NEXT:    movaps %xmm0, %xmm2
95; SSSE3-NEXT:    hsubps %xmm1, %xmm2
96; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
97; SSSE3-NEXT:    movaps %xmm2, %xmm1
98; SSSE3-NEXT:    retq
99;
100; AVX1-LABEL: hsub_v8f32a:
101; AVX1:       # %bb.0:
102; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
103; AVX1-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
104; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
105; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
106; AVX1-NEXT:    retq
107;
108; AVX2-LABEL: hsub_v8f32a:
109; AVX2:       # %bb.0:
110; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
111; AVX2-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
112; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
113; AVX2-NEXT:    retq
114  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
115  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
116  %hop = fsub <4 x float> %a0, %a1
117  %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
118  ret <8 x float> %shuf
119}
120
; Horizontal subtract kept at 256 bits: within each 128-bit half of %a the odd
; lanes (1,3 and 5,7) are subtracted from the even lanes (0,2 and 4,6),
; leaving two differences in the low two lanes of each half (the other lanes
; are undef). The final shuffle repeats those two differences across each
; half (<0,1,0,1,4,5,4,5>).
; The checks expect one hsubps per xmm register on SSSE3 and a single ymm
; vhsubps on AVX, with no separate shuffle instruction.
121define <8 x float> @hsub_v8f32b(<8 x float> %a) {
122; SSSE3-LABEL: hsub_v8f32b:
123; SSSE3:       # %bb.0:
124; SSSE3-NEXT:    hsubps %xmm0, %xmm0
125; SSSE3-NEXT:    hsubps %xmm1, %xmm1
126; SSSE3-NEXT:    retq
127;
128; AVX-LABEL: hsub_v8f32b:
129; AVX:       # %bb.0:
130; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
131; AVX-NEXT:    retq
132  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
133  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
134  %hop = fsub <8 x float> %a0, %a1
135  %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
136  ret <8 x float> %shuf
137}
138
; Horizontal add of a <2 x double>: lane 0 of %a is added to lane 1 (the
; second lane of each operand is undef), and the final shuffle copies the sum
; into both result lanes (<0,0>). The checks expect one haddpd/vhaddpd of
; %xmm0 with itself and no separate shuffle instruction.
139define <2 x double> @hadd_v2f64(<2 x double> %a) {
140; SSSE3-LABEL: hadd_v2f64:
141; SSSE3:       # %bb.0:
142; SSSE3-NEXT:    haddpd %xmm0, %xmm0
143; SSSE3-NEXT:    retq
144;
145; AVX-LABEL: hadd_v2f64:
146; AVX:       # %bb.0:
147; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
148; AVX-NEXT:    retq
149  %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
150  %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
151  %hop = fadd <2 x double> %a0, %a1
152  %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 0, i32 0>
153  ret <2 x double> %shuf
154}
155
; Horizontal add of a <4 x double>: lane 0 of %a is added to lane 1 and lane 2
; to lane 3, with the sums landing in lanes 0 and 2 (lanes 1 and 3 are undef).
; The final shuffle duplicates each sum into its neighbouring lane
; (<0,0,2,2>). The checks expect one haddpd per xmm register on SSSE3 and a
; single ymm vhaddpd on AVX, with no separate shuffle instruction.
156define <4 x double> @hadd_v4f64(<4 x double> %a) {
157; SSSE3-LABEL: hadd_v4f64:
158; SSSE3:       # %bb.0:
159; SSSE3-NEXT:    haddpd %xmm0, %xmm0
160; SSSE3-NEXT:    haddpd %xmm1, %xmm1
161; SSSE3-NEXT:    retq
162;
163; AVX-LABEL: hadd_v4f64:
164; AVX:       # %bb.0:
165; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
166; AVX-NEXT:    retq
167  %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
168  %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
169  %hop = fadd <4 x double> %a0, %a1
170  %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
171  ret <4 x double> %shuf
172}
173
; Horizontal subtract of a <2 x double>: lane 1 of %a is subtracted from
; lane 0 (the second lane of each operand is undef). The final shuffle places
; the difference in result lane 1 and leaves lane 0 undef (<undef,0>).
; The checks expect one hsubpd/vhsubpd of %xmm0 with itself and no separate
; shuffle instruction.
174define <2 x double> @hsub_v2f64(<2 x double> %a) {
175; SSSE3-LABEL: hsub_v2f64:
176; SSSE3:       # %bb.0:
177; SSSE3-NEXT:    hsubpd %xmm0, %xmm0
178; SSSE3-NEXT:    retq
179;
180; AVX-LABEL: hsub_v2f64:
181; AVX:       # %bb.0:
182; AVX-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
183; AVX-NEXT:    retq
184  %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
185  %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
186  %hop = fsub <2 x double> %a0, %a1
187  %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
188  ret <2 x double> %shuf
189}
190
; Horizontal subtract of a <4 x double>: lane 1 of %a is subtracted from
; lane 0 and lane 3 from lane 2, with the differences landing in lanes 0 and 2
; (lanes 1 and 3 are undef). The final shuffle duplicates each difference into
; its neighbouring lane (<0,0,2,2>). The checks expect one hsubpd per xmm
; register on SSSE3 and a single ymm vhsubpd on AVX, with no separate shuffle
; instruction.
191define <4 x double> @hsub_v4f64(<4 x double> %a) {
192; SSSE3-LABEL: hsub_v4f64:
193; SSSE3:       # %bb.0:
194; SSSE3-NEXT:    hsubpd %xmm0, %xmm0
195; SSSE3-NEXT:    hsubpd %xmm1, %xmm1
196; SSSE3-NEXT:    retq
197;
198; AVX-LABEL: hsub_v4f64:
199; AVX:       # %bb.0:
200; AVX-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
201; AVX-NEXT:    retq
202  %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
203  %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
204  %hop = fsub <4 x double> %a0, %a1
205  %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
206  ret <4 x double> %shuf
207}
208
; Integer horizontal add on a single <4 x i32>: the even lanes (0,2) of %a are
; added to the odd lanes (1,3), with the two sums in the low two lanes of the
; intermediate vector (upper lanes undef). The final shuffle puts sum 0 in
; lane 0 and sum 1 in lane 3, leaving lanes 1-2 undef (<0,undef,undef,1>).
; The checks expect one phaddd/vphaddd of %xmm0 with itself and no separate
; shuffle instruction.
209define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
210; SSSE3-LABEL: hadd_v4i32:
211; SSSE3:       # %bb.0:
212; SSSE3-NEXT:    phaddd %xmm0, %xmm0
213; SSSE3-NEXT:    retq
214;
215; AVX-LABEL: hadd_v4i32:
216; AVX:       # %bb.0:
217; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
218; AVX-NEXT:    retq
219  %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
220  %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
221  %hop = add <4 x i32> %a02, %a13
222  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 1>
223  ret <4 x i32> %shuf
224}
225
; Integer horizontal add of a 256-bit vector, narrowed to 128 bits: the even
; lanes (0,2,4,6) and odd lanes (1,3,5,7) of %a are extracted as <4 x i32> and
; added. The four sums are then widened back to <8 x i32>, with sums 0-1 in
; lanes 2-3, sums 2-3 in lanes 6-7, and the remaining lanes undef.
; The checks expect a single 128-bit phaddd/vphaddd of the two halves of %a;
; a shuffle still follows it (pshufd on SSSE3, vpshufd + vinsertf128 on AVX1,
; vpermq on AVX2).
226define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
227; SSSE3-LABEL: hadd_v8i32a:
228; SSSE3:       # %bb.0:
229; SSSE3-NEXT:    movdqa %xmm0, %xmm2
230; SSSE3-NEXT:    phaddd %xmm1, %xmm2
231; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
232; SSSE3-NEXT:    movdqa %xmm2, %xmm1
233; SSSE3-NEXT:    retq
234;
235; AVX1-LABEL: hadd_v8i32a:
236; AVX1:       # %bb.0:
237; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
238; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
239; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
240; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
241; AVX1-NEXT:    retq
242;
243; AVX2-LABEL: hadd_v8i32a:
244; AVX2:       # %bb.0:
245; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
246; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
247; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
248; AVX2-NEXT:    retq
249  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
250  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
251  %hop = add <4 x i32> %a0, %a1
252  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
253  ret <8 x i32> %shuf
254}
255
; Integer horizontal add kept at 256 bits: within each 128-bit half of %a the
; even lanes (0,2 and 4,6) are added to the odd lanes (1,3 and 5,7), leaving
; two sums in the low two lanes of each half (the other lanes are undef). The
; final shuffle repeats those two sums across each half (<0,1,0,1,4,5,4,5>).
; The checks expect one phaddd per xmm register on SSSE3 and a single ymm
; vphaddd on AVX2. On AVX1 the checks show two 128-bit vphaddd ops recombined
; with vinsertf128 and followed by a ymm vmovddup shuffle.
256define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
257; SSSE3-LABEL: hadd_v8i32b:
258; SSSE3:       # %bb.0:
259; SSSE3-NEXT:    phaddd %xmm0, %xmm0
260; SSSE3-NEXT:    phaddd %xmm1, %xmm1
261; SSSE3-NEXT:    retq
262;
263; AVX1-LABEL: hadd_v8i32b:
264; AVX1:       # %bb.0:
265; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm1
266; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
267; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
268; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
269; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
270; AVX1-NEXT:    retq
271;
272; AVX2-LABEL: hadd_v8i32b:
273; AVX2:       # %bb.0:
274; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
275; AVX2-NEXT:    retq
276  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
277  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
278  %hop = add <8 x i32> %a0, %a1
279  %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
280  ret <8 x i32> %shuf
281}
282
; Integer horizontal subtract on a single <4 x i32>: the odd lanes (1,3) of %a
; are subtracted from the even lanes (0,2), with the two differences in the
; low two lanes of the intermediate vector (upper lanes undef). The final
; shuffle puts difference 1 in lane 1 and difference 0 in lane 2, leaving
; lanes 0 and 3 undef (<undef,1,0,undef>).
; The checks expect one phsubd/vphsubd of %xmm0 with itself and no separate
; shuffle instruction.
283define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
284; SSSE3-LABEL: hsub_v4i32:
285; SSSE3:       # %bb.0:
286; SSSE3-NEXT:    phsubd %xmm0, %xmm0
287; SSSE3-NEXT:    retq
288;
289; AVX-LABEL: hsub_v4i32:
290; AVX:       # %bb.0:
291; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
292; AVX-NEXT:    retq
293  %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
294  %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
295  %hop = sub <4 x i32> %a02, %a13
296  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 0, i32 undef>
297  ret <4 x i32> %shuf
298}
299
; Integer horizontal subtract of a 256-bit vector, narrowed to 128 bits: the
; odd lanes (1,3,5,7) of %a are subtracted from the even lanes (0,2,4,6) as
; <4 x i32>. The four differences are then widened back to <8 x i32>, with
; differences 0-1 in lanes 2-3, differences 2-3 in lanes 6-7, and the
; remaining lanes undef.
; The checks expect a single 128-bit phsubd/vphsubd of the two halves of %a;
; a shuffle still follows it (pshufd on SSSE3, vpshufd + vinsertf128 on AVX1,
; vpermq on AVX2).
300define <8 x i32> @hsub_v8i32a(<8 x i32> %a) {
301; SSSE3-LABEL: hsub_v8i32a:
302; SSSE3:       # %bb.0:
303; SSSE3-NEXT:    movdqa %xmm0, %xmm2
304; SSSE3-NEXT:    phsubd %xmm1, %xmm2
305; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
306; SSSE3-NEXT:    movdqa %xmm2, %xmm1
307; SSSE3-NEXT:    retq
308;
309; AVX1-LABEL: hsub_v8i32a:
310; AVX1:       # %bb.0:
311; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
312; AVX1-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
313; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
314; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
315; AVX1-NEXT:    retq
316;
317; AVX2-LABEL: hsub_v8i32a:
318; AVX2:       # %bb.0:
319; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
320; AVX2-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
321; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
322; AVX2-NEXT:    retq
323  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
324  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
325  %hop = sub <4 x i32> %a0, %a1
326  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
327  ret <8 x i32> %shuf
328}
329
; Integer horizontal subtract kept at 256 bits: within each 128-bit half of %a
; the odd lanes (1,3 and 5,7) are subtracted from the even lanes (0,2 and
; 4,6), leaving two differences in the low two lanes of each half (the other
; lanes are undef). The final shuffle repeats those two differences across
; each half (<0,1,0,1,4,5,4,5>).
; The checks expect one phsubd per xmm register on SSSE3 and a single ymm
; vphsubd on AVX2. On AVX1 the checks show two 128-bit vphsubd ops recombined
; with vinsertf128 and followed by a ymm vmovddup shuffle.
330define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
331; SSSE3-LABEL: hsub_v8i32b:
332; SSSE3:       # %bb.0:
333; SSSE3-NEXT:    phsubd %xmm0, %xmm0
334; SSSE3-NEXT:    phsubd %xmm1, %xmm1
335; SSSE3-NEXT:    retq
336;
337; AVX1-LABEL: hsub_v8i32b:
338; AVX1:       # %bb.0:
339; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm1
340; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
341; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
342; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
343; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
344; AVX1-NEXT:    retq
345;
346; AVX2-LABEL: hsub_v8i32b:
347; AVX2:       # %bb.0:
348; AVX2-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
349; AVX2-NEXT:    retq
350  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
351  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
352  %hop = sub <8 x i32> %a0, %a1
353  %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
354  ret <8 x i32> %shuf
355}
356
; Integer horizontal add on a single <8 x i16>: the even lanes (0,2,4,6) of %a
; are added to the odd lanes (1,3,5,7), with the four sums in the low four
; lanes of the intermediate vector (upper lanes undef). The final shuffle
; moves the four sums to lanes 4-7 and leaves lanes 0-3 undef.
; The checks expect one phaddw/vphaddw of %xmm0 with itself and no separate
; shuffle instruction.
357define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
358; SSSE3-LABEL: hadd_v8i16:
359; SSSE3:       # %bb.0:
360; SSSE3-NEXT:    phaddw %xmm0, %xmm0
361; SSSE3-NEXT:    retq
362;
363; AVX-LABEL: hadd_v8i16:
364; AVX:       # %bb.0:
365; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
366; AVX-NEXT:    retq
367  %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
368  %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
369  %hop = add <8 x i16> %a0246, %a1357
370  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
371  ret <8 x i16> %shuf
372}
373
; Integer horizontal add of a 256-bit vector, narrowed to 128 bits: the even
; lanes (0,2,...,14) and odd lanes (1,3,...,15) of %a are extracted as
; <8 x i16> and added. The eight sums are then widened back to <16 x i16>,
; with sums 0-3 in lanes 4-7, sums 4-7 in lanes 12-15, and the remaining lanes
; undef.
; The checks expect a single 128-bit phaddw/vphaddw of the two halves of %a;
; a shuffle still follows it (pshufd on SSSE3, vpshufd + vinsertf128 on AVX1,
; vpermq on AVX2).
374define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
375; SSSE3-LABEL: hadd_v16i16a:
376; SSSE3:       # %bb.0:
377; SSSE3-NEXT:    movdqa %xmm0, %xmm2
378; SSSE3-NEXT:    phaddw %xmm1, %xmm2
379; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
380; SSSE3-NEXT:    movdqa %xmm2, %xmm1
381; SSSE3-NEXT:    retq
382;
383; AVX1-LABEL: hadd_v16i16a:
384; AVX1:       # %bb.0:
385; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
386; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
387; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
388; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
389; AVX1-NEXT:    retq
390;
391; AVX2-LABEL: hadd_v16i16a:
392; AVX2:       # %bb.0:
393; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
394; AVX2-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
395; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
396; AVX2-NEXT:    retq
397  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
398  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
399  %hop = add <8 x i16> %a0, %a1
400  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
401  ret <16 x i16> %shuf
402}
403
; Integer horizontal add kept at 256 bits: within each 128-bit half of %a the
; even lanes (0,2,4,6 and 8,10,12,14) are added to the odd lanes (1,3,5,7 and
; 9,11,13,15), leaving four sums in the low four lanes of each half (the other
; lanes are undef). The final shuffle repeats those four sums across each
; half (<0,1,2,3,0,1,2,3,8,9,10,11,8,9,10,11>).
; The checks expect one phaddw per xmm register on SSSE3 and a single ymm
; vphaddw on AVX2. On AVX1 the checks show two 128-bit vphaddw ops recombined
; with vinsertf128 and followed by a ymm vmovddup shuffle.
404define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
405; SSSE3-LABEL: hadd_v16i16b:
406; SSSE3:       # %bb.0:
407; SSSE3-NEXT:    phaddw %xmm0, %xmm0
408; SSSE3-NEXT:    phaddw %xmm1, %xmm1
409; SSSE3-NEXT:    retq
410;
411; AVX1-LABEL: hadd_v16i16b:
412; AVX1:       # %bb.0:
413; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm1
414; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
415; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
416; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
417; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
418; AVX1-NEXT:    retq
419;
420; AVX2-LABEL: hadd_v16i16b:
421; AVX2:       # %bb.0:
422; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
423; AVX2-NEXT:    retq
424  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
425  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
426  %hop = add <16 x i16> %a0, %a1
427  %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
428  ret <16 x i16> %shuf
429}
430
; Integer horizontal subtract on a single <8 x i16>: the odd lanes (1,3,5,7)
; of %a are subtracted from the even lanes (0,2,4,6), with the four
; differences in the low four lanes of the intermediate vector (upper lanes
; undef). The final shuffle scatters them: differences 0 and 2 go to lanes 0
; and 2, differences 1 and 3 go to lanes 5 and 7, and the other lanes are
; undef (<0,undef,2,undef,undef,1,undef,3>).
; The checks expect one phsubw/vphsubw of %xmm0 with itself and no separate
; shuffle instruction.
431define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
432; SSSE3-LABEL: hsub_v8i16:
433; SSSE3:       # %bb.0:
434; SSSE3-NEXT:    phsubw %xmm0, %xmm0
435; SSSE3-NEXT:    retq
436;
437; AVX-LABEL: hsub_v8i16:
438; AVX:       # %bb.0:
439; AVX-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
440; AVX-NEXT:    retq
441  %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
442  %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
443  %hop = sub <8 x i16> %a0246, %a1357
444  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 1, i32 undef, i32 3>
445  ret <8 x i16> %shuf
446}
447
; Integer horizontal subtract of a 256-bit vector, narrowed to 128 bits: the
; odd lanes (1,3,...,15) of %a are subtracted from the even lanes
; (0,2,...,14) as <8 x i16>. The eight differences are then widened back to
; <16 x i16>, with differences 0-3 in lanes 4-7, differences 4-7 in lanes
; 12-15, and the remaining lanes undef.
; The checks expect a single 128-bit phsubw/vphsubw of the two halves of %a;
; a shuffle still follows it (pshufd on SSSE3, vpshufd + vinsertf128 on AVX1,
; vpermq on AVX2).
448define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
449; SSSE3-LABEL: hsub_v16i16a:
450; SSSE3:       # %bb.0:
451; SSSE3-NEXT:    movdqa %xmm0, %xmm2
452; SSSE3-NEXT:    phsubw %xmm1, %xmm2
453; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
454; SSSE3-NEXT:    movdqa %xmm2, %xmm1
455; SSSE3-NEXT:    retq
456;
457; AVX1-LABEL: hsub_v16i16a:
458; AVX1:       # %bb.0:
459; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
460; AVX1-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
461; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
462; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
463; AVX1-NEXT:    retq
464;
465; AVX2-LABEL: hsub_v16i16a:
466; AVX2:       # %bb.0:
467; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
468; AVX2-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
469; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
470; AVX2-NEXT:    retq
471  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
472  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
473  %hop = sub <8 x i16> %a0, %a1
474  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
475  ret <16 x i16> %shuf
476}
477
; Integer horizontal subtract kept at 256 bits: within each 128-bit half of %a
; the odd lanes (1,3,5,7 and 9,11,13,15) are subtracted from the even lanes
; (0,2,4,6 and 8,10,12,14), leaving four differences in the low four lanes of
; each half (the other lanes are undef). The final shuffle repeats those four
; differences across each half (<0,1,2,3,0,1,2,3,8,9,10,11,8,9,10,11>).
; The checks expect one phsubw per xmm register on SSSE3 and a single ymm
; vphsubw on AVX2. On AVX1 the checks show two 128-bit vphsubw ops recombined
; with vinsertf128 and followed by a ymm vmovddup shuffle.
478define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
479; SSSE3-LABEL: hsub_v16i16b:
480; SSSE3:       # %bb.0:
481; SSSE3-NEXT:    phsubw %xmm0, %xmm0
482; SSSE3-NEXT:    phsubw %xmm1, %xmm1
483; SSSE3-NEXT:    retq
484;
485; AVX1-LABEL: hsub_v16i16b:
486; AVX1:       # %bb.0:
487; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm1
488; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
489; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
490; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
491; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
492; AVX1-NEXT:    retq
493;
494; AVX2-LABEL: hsub_v16i16b:
495; AVX2:       # %bb.0:
496; AVX2-NEXT:    vphsubw %ymm0, %ymm0, %ymm0
497; AVX2-NEXT:    retq
498  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
499  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
500  %hop = sub <16 x i16> %a0, %a1
501  %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
502  ret <16 x i16> %shuf
503}
504