• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx  | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; The tests below check for matching the horizontal op and eliminating the shuffle.
; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111
; 128-bit fadd case. The even lanes <0,2> and odd lanes <1,3> of %a are added
; as <2 x float>, then widened to <4 x float> with the two sums placed in the
; upper lanes (lower lanes undef). The checks expect one (v)haddps with both
; operands in xmm0 and no separate shuffle.
define <4 x float> @hadd_v4f32(<4 x float> %a) {
; SSSE3-LABEL: hadd_v4f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    haddps %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %hop = fadd <2 x float> %a02, %a13
  %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  ret <4 x float> %shuf
}
; 256-bit input, 128-bit op. All even lanes and all odd lanes of %a are
; extracted as <4 x float> and added; the four sums are then widened to
; <8 x float> with mask <u,u,0,1,u,u,2,3>. The checks record one 128-bit
; (v)haddps plus the extra moves still emitted for the widening shuffle
; (movaps/movddup on SSSE3, vmovddup + vinsertf128 on AVX1, vpermpd on AVX2).
define <8 x float> @hadd_v8f32a(<8 x float> %a) {
; SSSE3-LABEL: hadd_v8f32a:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    haddps %xmm1, %xmm2
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3-NEXT:    movaps %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_v8f32a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_v8f32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = fadd <4 x float> %a0, %a1
  %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x float> %shuf
}
; 256-bit op. Within each 128-bit half, even and odd lanes are paired
; (<0,2,u,u,4,6,u,u> + <1,3,u,u,5,7,u,u>) and the two sums of each half are
; repeated across that half (<0,1,0,1,4,5,4,5>). The checks expect no
; separate shuffle: two haddps (one per xmm half) on SSSE3 and a single
; 256-bit vhaddps for both AVX runs.
define <8 x float> @hadd_v8f32b(<8 x float> %a) {
; SSSE3-LABEL: hadd_v8f32b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    haddps %xmm0, %xmm0
; SSSE3-NEXT:    haddps %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v8f32b:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = fadd <8 x float> %a0, %a1
  %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x float> %shuf
}
; 128-bit fsub case. Even lanes <0,2> minus odd lanes <1,3> of %a, computed
; as <2 x float>, then widened to <4 x float> by repeating the two
; differences (<0,1,0,1>). The checks expect one (v)hsubps with both operands
; in xmm0 and no separate shuffle.
define <4 x float> @hsub_v4f32(<4 x float> %a) {
; SSSE3-LABEL: hsub_v4f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    hsubps %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %hop = fsub <2 x float> %a02, %a13
  %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x float> %shuf
}
; 256-bit input, 128-bit op. All even lanes minus all odd lanes of %a,
; computed as <4 x float>; the four differences are then widened to
; <8 x float> with mask <u,u,0,1,u,u,2,3>. The checks record one 128-bit
; (v)hsubps plus the extra moves still emitted for the widening shuffle
; (movaps/movddup on SSSE3, vmovddup + vinsertf128 on AVX1, vpermpd on AVX2).
define <8 x float> @hsub_v8f32a(<8 x float> %a) {
; SSSE3-LABEL: hsub_v8f32a:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    hsubps %xmm1, %xmm2
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3-NEXT:    movaps %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_v8f32a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_v8f32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = fsub <4 x float> %a0, %a1
  %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x float> %shuf
}
; 256-bit op. Within each 128-bit half, even lanes minus odd lanes
; (<0,2,u,u,4,6,u,u> - <1,3,u,u,5,7,u,u>); the two differences of each half
; are repeated across that half (<0,1,0,1,4,5,4,5>). The checks expect no
; separate shuffle: two hsubps (one per xmm half) on SSSE3 and a single
; 256-bit vhsubps for both AVX runs.
define <8 x float> @hsub_v8f32b(<8 x float> %a) {
; SSSE3-LABEL: hsub_v8f32b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    hsubps %xmm0, %xmm0
; SSSE3-NEXT:    hsubps %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v8f32b:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = fsub <8 x float> %a0, %a1
  %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x float> %shuf
}
; 128-bit double fadd case. Lane 0 plus lane 1 of %a (only lane 0 of each
; operand shuffle is defined), with the single sum splatted to both result
; lanes (<0,0>). The checks expect one (v)haddpd with both operands in xmm0
; and no separate shuffle.
define <2 x double> @hadd_v2f64(<2 x double> %a) {
; SSSE3-LABEL: hadd_v2f64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    haddpd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %hop = fadd <2 x double> %a0, %a1
  %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %shuf
}
; 256-bit double fadd case. Per 128-bit half, the even lane plus the odd lane
; (<0,u,2,u> + <1,u,3,u>), with each half's sum splatted across that half
; (<0,0,2,2>). The checks expect no separate shuffle: two haddpd (one per xmm
; half) on SSSE3 and a single 256-bit vhaddpd for both AVX runs.
define <4 x double> @hadd_v4f64(<4 x double> %a) {
; SSSE3-LABEL: hadd_v4f64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    haddpd %xmm0, %xmm0
; SSSE3-NEXT:    haddpd %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
  %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
  %hop = fadd <4 x double> %a0, %a1
  %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuf
}
; 128-bit double fsub case. Lane 0 minus lane 1 of %a (only lane 0 of each
; operand shuffle is defined), with the difference placed in result lane 1
; and lane 0 left undef (<u,0>). The checks expect one (v)hsubpd with both
; operands in xmm0 and no separate shuffle.
define <2 x double> @hsub_v2f64(<2 x double> %a) {
; SSSE3-LABEL: hsub_v2f64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    hsubpd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %hop = fsub <2 x double> %a0, %a1
  %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
  ret <2 x double> %shuf
}
; 256-bit double fsub case. Per 128-bit half, the even lane minus the odd
; lane (<0,u,2,u> - <1,u,3,u>), with each half's difference splatted across
; that half (<0,0,2,2>). The checks expect no separate shuffle: two hsubpd
; (one per xmm half) on SSSE3 and a single 256-bit vhsubpd for both AVX runs.
define <4 x double> @hsub_v4f64(<4 x double> %a) {
; SSSE3-LABEL: hsub_v4f64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    hsubpd %xmm0, %xmm0
; SSSE3-NEXT:    hsubpd %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v4f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
  %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
  %hop = fsub <4 x double> %a0, %a1
  %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuf
}
; 128-bit i32 add case. Even lanes <0,2> plus odd lanes <1,3> of %a occupy
; the low two lanes of a <4 x i32> add; the result shuffle keeps sum 0 in
; lane 0 and sum 1 in lane 3, with the middle lanes undef (<0,u,u,1>). The
; checks expect one (v)phaddd with both operands in xmm0 and no separate
; shuffle.
define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
; SSSE3-LABEL: hadd_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %hop = add <4 x i32> %a02, %a13
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 1>
  ret <4 x i32> %shuf
}
; 256-bit input, 128-bit op. All even lanes plus all odd lanes of %a,
; computed as <4 x i32>; the four sums are then widened to <8 x i32> with
; mask <u,u,0,1,u,u,2,3>. The checks record one 128-bit (v)phaddd plus the
; extra moves still emitted for the widening shuffle (movdqa/pshufd on SSSE3,
; vpshufd + vinsertf128 on AVX1, vpermq on AVX2).
define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
; SSSE3-LABEL: hadd_v8i32a:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    phaddd %xmm1, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_v8i32a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_v8i32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = add <4 x i32> %a0, %a1
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x i32> %shuf
}
; 256-bit i32 add. Within each 128-bit half, even lanes plus odd lanes
; (<0,2,u,u,4,6,u,u> + <1,3,u,u,5,7,u,u>); the two sums of each half are
; repeated across that half (<0,1,0,1,4,5,4,5>). The checks expect two phaddd
; (one per xmm half) on SSSE3 and a single 256-bit vphaddd on AVX2. The AVX1
; checks record a split into two 128-bit vphaddd that are re-joined with
; vinsertf128 and followed by a vmovddup.
define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
; SSSE3-LABEL: hadd_v8i32b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    phaddd %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_v8i32b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_v8i32b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = add <8 x i32> %a0, %a1
  %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x i32> %shuf
}
; 128-bit i32 sub case. Even lanes <0,2> minus odd lanes <1,3> of %a occupy
; the low two lanes of a <4 x i32> sub; the result shuffle places difference
; 1 in lane 1 and difference 0 in lane 2, with the outer lanes undef
; (<u,1,0,u>). The checks expect one (v)phsubd with both operands in xmm0 and
; no separate shuffle.
define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
; SSSE3-LABEL: hsub_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %hop = sub <4 x i32> %a02, %a13
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 0, i32 undef>
  ret <4 x i32> %shuf
}
; 256-bit input, 128-bit op. All even lanes minus all odd lanes of %a,
; computed as <4 x i32>; the four differences are then widened to <8 x i32>
; with mask <u,u,0,1,u,u,2,3>. The checks record one 128-bit (v)phsubd plus
; the extra moves still emitted for the widening shuffle (movdqa/pshufd on
; SSSE3, vpshufd + vinsertf128 on AVX1, vpermq on AVX2).
define <8 x i32> @hsub_v8i32a(<8 x i32> %a) {
; SSSE3-LABEL: hsub_v8i32a:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    phsubd %xmm1, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_v8i32a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_v8i32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = sub <4 x i32> %a0, %a1
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x i32> %shuf
}
; 256-bit i32 sub. Within each 128-bit half, even lanes minus odd lanes
; (<0,2,u,u,4,6,u,u> - <1,3,u,u,5,7,u,u>); the two differences of each half
; are repeated across that half (<0,1,0,1,4,5,4,5>). The checks expect two
; phsubd (one per xmm half) on SSSE3 and a single 256-bit vphsubd on AVX2.
; The AVX1 checks record a split into two 128-bit vphsubd that are re-joined
; with vinsertf128 and followed by a vmovddup.
define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
; SSSE3-LABEL: hsub_v8i32b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    phsubd %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_v8i32b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_v8i32b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = sub <8 x i32> %a0, %a1
  %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x i32> %shuf
}
; 128-bit i16 add case. Even lanes <0,2,4,6> plus odd lanes <1,3,5,7> of %a
; occupy the low four lanes of an <8 x i16> add; the result shuffle moves
; the four sums to the upper four lanes and leaves the lower four undef. The
; checks expect one (v)phaddw with both operands in xmm0 and no separate
; shuffle.
define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
; SSSE3-LABEL: hadd_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = add <8 x i16> %a0246, %a1357
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i16> %shuf
}
; 256-bit input, 128-bit op. All even lanes plus all odd lanes of %a,
; computed as <8 x i16>; the eight sums are then widened to <16 x i16> with
; sums 0-3 in lanes 4-7 and sums 4-7 in lanes 12-15 (other lanes undef). The
; checks record one 128-bit (v)phaddw plus the extra moves still emitted for
; the widening shuffle (movdqa/pshufd on SSSE3, vpshufd + vinsertf128 on
; AVX1, vpermq on AVX2).
define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
; SSSE3-LABEL: hadd_v16i16a:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    phaddw %xmm1, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_v16i16a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_v16i16a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %hop = add <8 x i16> %a0, %a1
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuf
}
; 256-bit i16 add. Within each 128-bit half, the four even lanes plus the
; four odd lanes; the four sums of each half are repeated across that half
; (<0,1,2,3,0,1,2,3,8,9,10,11,8,9,10,11>). The checks expect two phaddw (one
; per xmm half) on SSSE3 and a single 256-bit vphaddw on AVX2. The AVX1
; checks record a split into two 128-bit vphaddw that are re-joined with
; vinsertf128 and followed by a vmovddup.
define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
; SSSE3-LABEL: hadd_v16i16b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    phaddw %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_v16i16b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_v16i16b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = add <16 x i16> %a0, %a1
  %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
  ret <16 x i16> %shuf
}
; 128-bit i16 sub case. Even lanes <0,2,4,6> minus odd lanes <1,3,5,7> of %a
; occupy the low four lanes of an <8 x i16> sub; the result shuffle
; (<0,u,2,u,u,1,u,3>) keeps each difference in only one of the two positions
; where it could appear, leaving the other undef. The checks expect one
; (v)phsubw with both operands in xmm0 and no separate shuffle.
define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
; SSSE3-LABEL: hsub_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = sub <8 x i16> %a0246, %a1357
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 1, i32 undef, i32 3>
  ret <8 x i16> %shuf
}
; 256-bit input, 128-bit op. All even lanes minus all odd lanes of %a,
; computed as <8 x i16>; the eight differences are then widened to
; <16 x i16> with differences 0-3 in lanes 4-7 and 4-7 in lanes 12-15 (other
; lanes undef). The checks record one 128-bit (v)phsubw plus the extra moves
; still emitted for the widening shuffle (movdqa/pshufd on SSSE3, vpshufd +
; vinsertf128 on AVX1, vpermq on AVX2).
define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
; SSSE3-LABEL: hsub_v16i16a:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    phsubw %xmm1, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_v16i16a:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_v16i16a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %hop = sub <8 x i16> %a0, %a1
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuf
}
; 256-bit i16 sub. Within each 128-bit half, the four even lanes minus the
; four odd lanes; the four differences of each half are repeated across that
; half (<0,1,2,3,0,1,2,3,8,9,10,11,8,9,10,11>). The checks expect two phsubw
; (one per xmm half) on SSSE3 and a single 256-bit vphsubw on AVX2. The AVX1
; checks record a split into two 128-bit vphsubw that are re-joined with
; vinsertf128 and followed by a vmovddup.
define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
; SSSE3-LABEL: hsub_v16i16b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubw %xmm0, %xmm0
; SSSE3-NEXT:    phsubw %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_v16i16b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_v16i16b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphsubw %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = sub <16 x i16> %a0, %a1
  %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
  ret <16 x i16> %shuf
}