; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSSE3,SSSE3_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3_FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_FAST

; The next 8 tests check for matching the horizontal op and eliminating the shuffle.
; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111

; Even/odd-lane fadd of a narrowed v2f32, re-widened by shuffle; should match a single (v)haddps.
define <4 x float> @hadd_v4f32(<4 x float> %a) {
; SSSE3-LABEL: hadd_v4f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    haddps %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %hop = fadd <2 x float> %a02, %a13
  %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  ret <4 x float> %shuf
}

; Cross-half even/odd fadd producing a v4f32, re-widened to v8f32 by shuffle.
define <8 x float> @hadd_v8f32a(<8 x float> %a) {
; SSSE3_SLOW-LABEL: hadd_v8f32a:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movaps %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    haddps %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3_SLOW-NEXT:    movaps %xmm2, %xmm1
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v8f32a:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    movaps %xmm0, %xmm2
; SSSE3_FAST-NEXT:    haddps %xmm1, %xmm2
; SSSE3_FAST-NEXT:    haddps %xmm0, %xmm0
; SSSE3_FAST-NEXT:    movaps %xmm2, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v8f32a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v8f32a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1_FAST-NEXT:    vhaddps %ymm0, %ymm1, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hadd_v8f32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = fadd <4 x float> %a0, %a1
  %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x float> %shuf
}

; Per-128-bit-lane even/odd fadd; duplicating shuffle of the result should fold into (v)haddps.
define <8 x float> @hadd_v8f32b(<8 x float> %a) {
; SSSE3-LABEL: hadd_v8f32b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    haddps %xmm0, %xmm0
; SSSE3-NEXT:    haddps %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v8f32b:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = fadd <8 x float> %a0, %a1
  %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x float> %shuf
}

; Even/odd-lane fsub of a narrowed v2f32, re-widened by shuffle; should match a single (v)hsubps.
define <4 x float> @hsub_v4f32(<4 x float> %a) {
; SSSE3-LABEL: hsub_v4f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    hsubps %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
  %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
  %hop = fsub <2 x float> %a02, %a13
  %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x float> %shuf
}

; Cross-half even/odd fsub producing a v4f32, re-widened to v8f32 by shuffle.
define <8 x float> @hsub_v8f32a(<8 x float> %a) {
; SSSE3_SLOW-LABEL: hsub_v8f32a:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movaps %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    hsubps %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3_SLOW-NEXT:    movaps %xmm2, %xmm1
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hsub_v8f32a:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    movaps %xmm0, %xmm2
; SSSE3_FAST-NEXT:    hsubps %xmm1, %xmm2
; SSSE3_FAST-NEXT:    hsubps %xmm0, %xmm0
; SSSE3_FAST-NEXT:    movaps %xmm2, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hsub_v8f32a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hsub_v8f32a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1_FAST-NEXT:    vhsubps %ymm0, %ymm1, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hsub_v8f32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = fsub <4 x float> %a0, %a1
  %shuf = shufflevector <4 x float> %hop, <4 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x float> %shuf
}

; Per-128-bit-lane even/odd fsub; duplicating shuffle of the result should fold into (v)hsubps.
define <8 x float> @hsub_v8f32b(<8 x float> %a) {
; SSSE3-LABEL: hsub_v8f32b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    hsubps %xmm0, %xmm0
; SSSE3-NEXT:    hsubps %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v8f32b:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = fsub <8 x float> %a0, %a1
  %shuf = shufflevector <8 x float> %hop, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x float> %shuf
}

; v2f64 lane0+lane1 then splat; forms (v)haddpd only with fast-hops.
define <2 x double> @hadd_v2f64(<2 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v2f64:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3_SLOW-NEXT:    addsd %xmm0, %xmm1
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v2f64:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v2f64:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v2f64:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hadd_v2f64:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hadd_v2f64:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT:    retq
  %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %hop = fadd <2 x double> %a0, %a1
  %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %shuf
}

; Scalar extract+fadd then insert+splat; forms (v)haddpd only with fast-hops.
define <2 x double> @hadd_v2f64_scalar_splat(<2 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v2f64_scalar_splat:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3_SLOW-NEXT:    addsd %xmm0, %xmm1
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v2f64_scalar_splat:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v2f64_scalar_splat:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v2f64_scalar_splat:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hadd_v2f64_scalar_splat:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hadd_v2f64_scalar_splat:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT:    retq
  %a0 = extractelement <2 x double> %a, i32 0
  %a1 = extractelement <2 x double> %a, i32 1
  %hop = fadd double %a0, %a1
  %ins = insertelement <2 x double> undef, double %hop, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> undef, <2 x i32> <i32 0, i32 0>
  ret <2 x double> %shuf
}

; Per-128-bit-lane scalar fadds splatted within each lane.
define <4 x double> @hadd_v4f64_scalar_splat(<4 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v4f64_scalar_splat:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSSE3_SLOW-NEXT:    addsd %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    movapd %xmm1, %xmm3
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSSE3_SLOW-NEXT:    addsd %xmm1, %xmm3
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm3[0,0]
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v4f64_scalar_splat:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    haddpd %xmm1, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX-LABEL: hadd_v4f64_scalar_splat:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX-NEXT:    retq
  %a0 = extractelement <4 x double> %a, i32 0
  %a1 = extractelement <4 x double> %a, i32 1
  %hop0 = fadd double %a0, %a1
  %a2 = extractelement <4 x double> %a, i32 2
  %a3 = extractelement <4 x double> %a, i32 3
  %hop1 = fadd double %a2, %a3
  %ins = insertelement <4 x double> undef, double %hop0, i32 0
  %ins2 = insertelement <4 x double> %ins,  double %hop1, i32 2
  %shuf = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuf
}

; Lane-0 scalar sum broadcast to all four lanes (hop1 result is dead in the shuffle).
define <4 x double> @hadd_v4f64_scalar_broadcast(<4 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3_SLOW-NEXT:    addsd %xmm0, %xmm1
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v4f64_scalar_broadcast:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    movapd %xmm0, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v4f64_scalar_broadcast:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hadd_v4f64_scalar_broadcast:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT:    vbroadcastsd %xmm0, %ymm0
; AVX2_FAST-NEXT:    retq
  %a0 = extractelement <4 x double> %a, i32 0
  %a1 = extractelement <4 x double> %a, i32 1
  %hop0 = fadd double %a0, %a1
  %a2 = extractelement <4 x double> %a, i32 2
  %a3 = extractelement <4 x double> %a, i32 3
  %hop1 = fadd double %a2, %a3
  %ins = insertelement <4 x double> undef, double %hop0, i32 0
  %ins2 = insertelement <4 x double> %ins,  double %hop1, i32 2
  %shuf = shufflevector <4 x double> %ins2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
  ret <4 x double> %shuf
}

; Vector v4f64 hadd-with-splat pattern; forms (v)haddpd only with fast-hops.
define <4 x double> @hadd_v4f64(<4 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v4f64:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSSE3_SLOW-NEXT:    addsd %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3_SLOW-NEXT:    movapd %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSSE3_SLOW-NEXT:    addsd %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm2[0,0]
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v4f64:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    haddpd %xmm1, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v4f64:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
; AVX1_SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v4f64:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hadd_v4f64:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
; AVX2_SLOW-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hadd_v4f64:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhaddpd %ymm0, %ymm0, %ymm0
; AVX2_FAST-NEXT:    retq
  %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
  %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
  %hop = fadd <4 x double> %a0, %a1
  %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuf
}

; v2f64 lane0-lane1 then splat (via undef lane 0); forms (v)hsubpd only with fast-hops.
define <2 x double> @hsub_v2f64(<2 x double> %a) {
; SSSE3_SLOW-LABEL: hsub_v2f64:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3_SLOW-NEXT:    subsd %xmm1, %xmm0
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hsub_v2f64:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    hsubpd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hsub_v2f64:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1_SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hsub_v2f64:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hsub_v2f64:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2_SLOW-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hsub_v2f64:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhsubpd %xmm0, %xmm0, %xmm0
; AVX2_FAST-NEXT:    retq
  %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
  %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  %hop = fsub <2 x double> %a0, %a1
  %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
  ret <2 x double> %shuf
}

; Vector v4f64 hsub-with-splat pattern; forms (v)hsubpd only with fast-hops.
define <4 x double> @hsub_v4f64(<4 x double> %a) {
; SSSE3_SLOW-LABEL: hsub_v4f64:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movapd %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSSE3_SLOW-NEXT:    subsd %xmm2, %xmm0
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3_SLOW-NEXT:    movapd %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSSE3_SLOW-NEXT:    subsd %xmm2, %xmm1
; SSSE3_SLOW-NEXT:    movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hsub_v4f64:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    hsubpd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    hsubpd %xmm1, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hsub_v4f64:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
; AVX1_SLOW-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
; AVX1_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hsub_v4f64:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: hsub_v4f64:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
; AVX2_SLOW-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
; AVX2_SLOW-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: hsub_v4f64:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhsubpd %ymm0, %ymm0, %ymm0
; AVX2_FAST-NEXT:    retq
  %a0 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
  %a1 = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
  %hop = fsub <4 x double> %a0, %a1
  %shuf = shufflevector <4 x double> %hop, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %shuf
}

; Even/odd-lane integer add; result shuffle should fold into a single (v)phaddd.
define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
; SSSE3-LABEL: hadd_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %hop = add <4 x i32> %a02, %a13
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 1>
  ret <4 x i32> %shuf
}

; Cross-half even/odd add producing a v4i32, re-widened to v8i32 by shuffle.
define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
; SSSE3_SLOW-LABEL: hadd_v8i32a:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    phaddd %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3_SLOW-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v8i32a:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_FAST-NEXT:    phaddd %xmm1, %xmm2
; SSSE3_FAST-NEXT:    phaddd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v8i32a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v8i32a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm1
; AVX1_FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hadd_v8i32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = add <4 x i32> %a0, %a1
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x i32> %shuf
}

; Per-128-bit-lane even/odd add; duplicating shuffle folds into (v)phaddd.
define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
; SSSE3-LABEL: hadd_v8i32b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm0, %xmm0
; SSSE3-NEXT:    phaddd %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_v8i32b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_v8i32b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = add <8 x i32> %a0, %a1
  %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x i32> %shuf
}

; Even/odd-lane integer sub; result shuffle should fold into a single (v)phsubd.
define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
; SSSE3-LABEL: hsub_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
  %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
  %hop = sub <4 x i32> %a02, %a13
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 0, i32 undef>
  ret <4 x i32> %shuf
}

; Cross-half even/odd sub producing a v4i32, re-widened to v8i32 by shuffle.
define <8 x i32> @hsub_v8i32a(<8 x i32> %a) {
; SSSE3_SLOW-LABEL: hsub_v8i32a:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    phsubd %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3_SLOW-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hsub_v8i32a:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_FAST-NEXT:    phsubd %xmm1, %xmm2
; SSSE3_FAST-NEXT:    phsubd %xmm0, %xmm0
; SSSE3_FAST-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hsub_v8i32a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hsub_v8i32a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_FAST-NEXT:    vphsubd %xmm1, %xmm0, %xmm1
; AVX1_FAST-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hsub_v8i32a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hop = sub <4 x i32> %a0, %a1
  %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <8 x i32> <i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef, i32 2, i32 3>
  ret <8 x i32> %shuf
}

; Per-128-bit-lane even/odd sub; duplicating shuffle folds into (v)phsubd.
define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
; SSSE3-LABEL: hsub_v8i32b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm0, %xmm0
; SSSE3-NEXT:    phsubd %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_v8i32b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphsubd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_v8i32b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphsubd %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 undef, i32 undef, i32 4, i32 6, i32 undef, i32 undef>
  %a1 = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 5, i32 7, i32 undef, i32 undef>
  %hop = sub <8 x i32> %a0, %a1
  %shuf = shufflevector <8 x i32> %hop, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
  ret <8 x i32> %shuf
}

; Even/odd-lane i16 add; result shuffle should fold into a single (v)phaddw.
define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
; SSSE3-LABEL: hadd_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = add <8 x i16> %a0246, %a1357
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i16> %shuf
}

; Cross-half even/odd i16 add producing a v8i16, re-widened to v16i16 by shuffle.
define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
; SSSE3_SLOW-LABEL: hadd_v16i16a:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    phaddw %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3_SLOW-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hadd_v16i16a:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_FAST-NEXT:    phaddw %xmm1, %xmm2
; SSSE3_FAST-NEXT:    phaddw %xmm0, %xmm0
; SSSE3_FAST-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hadd_v16i16a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hadd_v16i16a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_FAST-NEXT:    vphaddw %xmm1, %xmm0, %xmm1
; AVX1_FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hadd_v16i16a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %hop = add <8 x i16> %a0, %a1
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuf
}

; Per-128-bit-lane even/odd i16 add; duplicating shuffle folds into (v)phaddw.
define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
; SSSE3-LABEL: hadd_v16i16b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm0, %xmm0
; SSSE3-NEXT:    phaddw %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_v16i16b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_v16i16b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = add <16 x i16> %a0, %a1
  %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
  ret <16 x i16> %shuf
}

; Even-minus-odd pairwise subtract on a v8i16; the trailing shuffle is only a
; permutation of hop lanes (with undefs), so the whole pattern folds into a
; single phsubw and the shuffle is eliminated.
define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
; SSSE3-LABEL: hsub_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubw %xmm0, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = sub <8 x i16> %a0246, %a1357
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 1, i32 undef, i32 3>
  ret <8 x i16> %shuf
}
776
; Full-width even-minus-odd subtract producing a v8i16, then widened so the low
; 4 results land in lane 0 and the high 4 in lane 1 (low halves undef).
; SLOW targets keep one phsubw plus a shuffle; fast-hops targets trade the
; shuffle for a second (redundant-looking but cheaper-by-attr) phsubw.
define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
; SSSE3_SLOW-LABEL: hsub_v16i16a:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_SLOW-NEXT:    phsubw %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSSE3_SLOW-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: hsub_v16i16a:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    movdqa %xmm0, %xmm2
; SSSE3_FAST-NEXT:    phsubw %xmm1, %xmm2
; SSSE3_FAST-NEXT:    phsubw %xmm0, %xmm0
; SSSE3_FAST-NEXT:    movdqa %xmm2, %xmm1
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: hsub_v16i16a:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_SLOW-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1_SLOW-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: hsub_v16i16a:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1_FAST-NEXT:    vphsubw %xmm1, %xmm0, %xmm1
; AVX1_FAST-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
; AVX1_FAST-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1_FAST-NEXT:    retq
;
; AVX2-LABEL: hsub_v16i16a:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphsubw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %hop = sub <8 x i16> %a0, %a1
  ; Widen: hop[0..3] -> result elements 4-7, hop[4..7] -> elements 12-15.
  %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuf
}
822
; Per-128-bit-lane horizontal subtract; the result shuffle only duplicates each
; lane's low 4 differences, so the pattern folds to phsubw per 128-bit half
; (a single ymm vphsubw on AVX2).
define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
; SSSE3-LABEL: hsub_v16i16b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubw %xmm0, %xmm0
; SSSE3-NEXT:    phsubw %xmm1, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_v16i16b:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vphsubw %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_v16i16b:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphsubw %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %a0 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
  %a1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 11, i32 13, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
  %hop = sub <16 x i16> %a0, %a1
  ; Duplicate hop elements 0-3 / 8-11 into both quarters of each lane.
  %shuf = shufflevector <16 x i16> %hop, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>
  ret <16 x i16> %shuf
}
849
; Splat of element 0 of an x86 haddps intrinsic result. AVX2 selects a true
; vbroadcastss; SSSE3/AVX1 use a [0,0,2,2] duplicate (movsldup) since only
; element 0 is live.
define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) {
; SSSE3-LABEL: broadcast_haddps_v4f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    haddps %xmm0, %xmm0
; SSSE3-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: broadcast_haddps_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: broadcast_haddps_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a0)
  ; zeroinitializer mask == broadcast element 0 to all four lanes.
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %2
}
872
873declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
874
; PR34724: partial horizontal-add pattern built from narrow (v2f32) shuffles of
; two sources. Only result lanes 1,2,3 are defined: lanes 1-2 come from the
; cross-input hadd (%t2), lane 3 from b[2]+b[3] via %t4. fast-hops targets fold
; the entire dag into a single haddps; slow targets keep a scalar-ish tail add.
define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
; SSSE3_SLOW-LABEL: PR34724_1:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    haddps %xmm1, %xmm0
; SSSE3_SLOW-NEXT:    movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSSE3_SLOW-NEXT:    addps %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: PR34724_1:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    haddps %xmm1, %xmm0
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: PR34724_1:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; AVX1_SLOW-NEXT:    vaddps %xmm1, %xmm2, %xmm1
; AVX1_SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: PR34724_1:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: PR34724_1:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; AVX2_SLOW-NEXT:    vaddps %xmm1, %xmm2, %xmm1
; AVX2_SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: PR34724_1:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX2_FAST-NEXT:    retq
  ; %t2 = <a[2]+a[3], b[0]+b[1]> — cross-input pairwise adds, narrowed to v2f32.
  %t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 2, i32 4>
  %t1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 3, i32 5>
  %t2 = fadd <2 x float> %t0, %t1
  %vecinit9 = shufflevector <2 x float> %t2, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  ; %t4[3] = b[2] + b[3]: the remaining horizontal pair.
  %t3 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %t4 = fadd <4 x float> %t3, %b
  %vecinit13 = shufflevector <4 x float> %vecinit9, <4 x float> %t4, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
  ret <4 x float> %vecinit13
}
924
; PR34724 variant of PR34724_1: the same partial-hadd pattern, but the pairwise
; shuffles stay v4f32 (with undef tails) instead of narrowing to v2f32.
; Codegen must match PR34724_1 on every target.
define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
; SSSE3_SLOW-LABEL: PR34724_2:
; SSSE3_SLOW:       # %bb.0:
; SSSE3_SLOW-NEXT:    haddps %xmm1, %xmm0
; SSSE3_SLOW-NEXT:    movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; SSSE3_SLOW-NEXT:    addps %xmm1, %xmm2
; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0]
; SSSE3_SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSSE3_SLOW-NEXT:    retq
;
; SSSE3_FAST-LABEL: PR34724_2:
; SSSE3_FAST:       # %bb.0:
; SSSE3_FAST-NEXT:    haddps %xmm1, %xmm0
; SSSE3_FAST-NEXT:    retq
;
; AVX1_SLOW-LABEL: PR34724_2:
; AVX1_SLOW:       # %bb.0:
; AVX1_SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1_SLOW-NEXT:    vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; AVX1_SLOW-NEXT:    vaddps %xmm1, %xmm2, %xmm1
; AVX1_SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX1_SLOW-NEXT:    retq
;
; AVX1_FAST-LABEL: PR34724_2:
; AVX1_FAST:       # %bb.0:
; AVX1_FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX1_FAST-NEXT:    retq
;
; AVX2_SLOW-LABEL: PR34724_2:
; AVX2_SLOW:       # %bb.0:
; AVX2_SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX2_SLOW-NEXT:    vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
; AVX2_SLOW-NEXT:    vaddps %xmm1, %xmm2, %xmm1
; AVX2_SLOW-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2_SLOW-NEXT:    retq
;
; AVX2_FAST-LABEL: PR34724_2:
; AVX2_FAST:       # %bb.0:
; AVX2_FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX2_FAST-NEXT:    retq
  ; Same pairwise selection as PR34724_1 but kept at v4f32 width.
  %t0 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 4, i32 undef, i32 undef>
  %t1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 5, i32 undef, i32 undef>
  %t2 = fadd <4 x float> %t0, %t1
  %vecinit9 = shufflevector <4 x float> %t2, <4 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef>
  %t3 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %t4 = fadd <4 x float> %t3, %b
  %vecinit13 = shufflevector <4 x float> %vecinit9, <4 x float> %t4, <4 x i32> <i32 undef, i32 1, i32 2, i32 7>
  ret <4 x float> %vecinit13
}
974
975;
976; fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
977;  --> SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))).
978;
979
; HOP(LOSUBVECTOR(SHUFFLE(X)), HISUBVECTOR(SHUFFLE(X))) case: the pre-shuffle
; duplicates elements 2,3 / 6,7 within each lane, so the hadd of the extracted
; halves folds to SHUFFLE(HOP(lo(X), hi(X))) — one haddps plus a movshdup.
define <4 x float> @hadd_4f32_v8f32_shuffle(<8 x float> %a0) {
; SSSE3-LABEL: hadd_4f32_v8f32_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    haddps %xmm1, %xmm0
; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hadd_4f32_v8f32_shuffle:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %shuf256 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
  %lo = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ; Interleaved even/odd selection across lo/hi == horizontal-add operand form.
  %hadd0 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %hadd1 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hadd = fadd <4 x float> %hadd0, %hadd1
  ret <4 x float> %hadd
}
1002
; Same subvector-shuffle fold as hadd_4f32_v8f32_shuffle.
; NOTE(review): despite the "hsub" name, the op below is fadd and the generated
; checks expect haddps — this looks like a copy of the hadd test whose op was
; never flipped to fsub; confirm intent upstream before "fixing" (changing the
; op would require regenerating all CHECK lines).
define <4 x float> @hsub_4f32_v8f32_shuffle(<8 x float> %a0) {
; SSSE3-LABEL: hsub_4f32_v8f32_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    haddps %xmm1, %xmm0
; SSSE3-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT:    retq
;
; AVX-LABEL: hsub_4f32_v8f32_shuffle:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %shuf256 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
  %lo = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x float> %shuf256, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %hsub0 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %hsub1 = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hsub = fadd <4 x float> %hsub0, %hsub1
  ret <4 x float> %hsub
}
1025
; Integer version of the subvector-shuffle fold: duplicated-pair pre-shuffle of
; a v8i32 plus an even/odd hadd on the extracted halves collapses to a single
; phaddd followed by a [1,1,3,3] pshufd.
define <4 x i32> @hadd_4i32_v8i32_shuffle(<8 x i32> %a0) {
; SSSE3-LABEL: hadd_4i32_v8i32_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_4i32_v8i32_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_4i32_v8i32_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %shuf256 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
  %lo = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %hadd0 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %hadd1 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hadd = add <4 x i32> %hadd0, %hadd1
  ret <4 x i32> %hadd
}
1056
; Same fold as hadd_4i32_v8i32_shuffle.
; NOTE(review): despite the "hsub" name, the op below is add and the checks
; expect phaddd — apparently a copy of the hadd test with the op left as add;
; confirm intent upstream (flipping to sub would require regenerating checks).
define <4 x i32> @hsub_4i32_v8i32_shuffle(<8 x i32> %a0) {
; SSSE3-LABEL: hsub_4i32_v8i32_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_4i32_v8i32_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_4i32_v8i32_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %shuf256 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 6, i32 7, i32 6, i32 7>
  %lo = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x i32> %shuf256, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %hsub0 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %hsub1 = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %hsub = add <4 x i32> %hsub0, %hsub1
  ret <4 x i32> %hsub
}
1087
1088;
1089; fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) --> SHUFFLE(HOP(X,Y)).
1090;
1091
; HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)) case: the two lane-concat
; shuffles of %a0/%a1 feeding an even/odd fadd fold into a direct vhaddpd plus
; one cross-lane permute (AVX2) or a pre-permuted ymm vhaddpd (AVX1).
define <4 x double> @hadd_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) {
; SSSE3-LABEL: hadd_4f64_v4f64_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    haddpd %xmm1, %xmm0
; SSSE3-NEXT:    haddpd %xmm3, %xmm2
; SSSE3-NEXT:    movapd %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_4f64_v4f64_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vhaddpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_4f64_v4f64_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  ; shuf0 = lanes lo(a0):lo(a1); shuf1 = lanes hi(a0):hi(a1).
  %shuf0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %shuf1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %hadd0 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %hadd1 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %hadd = fadd <4 x double> %hadd0, %hadd1
  ret <4 x double> %hadd
}
1119
; Subtract twin of hadd_4f64_v4f64_shuffle: identical shuffle structure with
; fsub, folding to (v)hsubpd plus the same permutes.
define <4 x double> @hsub_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) {
; SSSE3-LABEL: hsub_4f64_v4f64_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    hsubpd %xmm1, %xmm0
; SSSE3-NEXT:    hsubpd %xmm3, %xmm2
; SSSE3-NEXT:    movapd %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_4f64_v4f64_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vhsubpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_4f64_v4f64_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %shuf1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %hadd0 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  %hadd1 = shufflevector <4 x double> %shuf0, <4 x double> %shuf1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  %hadd = fsub <4 x double> %hadd0, %hadd1
  ret <4 x double> %hadd
}
1147
; v8f32 HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) fold: lane-concat shuffles plus an
; even/odd fadd become a direct vhaddps and one lane permute on AVX2.
define <8 x float> @hadd_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
; SSSE3-LABEL: hadd_8f32_v8f32_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    haddps %xmm1, %xmm0
; SSSE3-NEXT:    haddps %xmm3, %xmm2
; SSSE3-NEXT:    movaps %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_8f32_v8f32_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vhaddps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_8f32_v8f32_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  ; shuf0 = lo(a0):lo(a1); shuf1 = hi(a0):hi(a1).
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %hadd0 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %hadd1 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %hadd = fadd <8 x float> %hadd0, %hadd1
  ret <8 x float> %hadd
}
1175
; Same fold as hadd_8f32_v8f32_shuffle.
; NOTE(review): despite the "hsub" name, the op below is fadd and the checks
; expect haddps — matches the pattern of the other mis-named hsub tests in this
; file; confirm intent upstream before changing (would invalidate checks).
define <8 x float> @hsub_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
; SSSE3-LABEL: hsub_8f32_v8f32_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    haddps %xmm1, %xmm0
; SSSE3-NEXT:    haddps %xmm3, %xmm2
; SSSE3-NEXT:    movaps %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_8f32_v8f32_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT:    vhaddps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_8f32_v8f32_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %shuf1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %hsub0 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %hsub1 = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %hsub = fadd <8 x float> %hsub0, %hsub1
  ret <8 x float> %hsub
}
1203
; v8i32 HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) fold: even/odd add over lane-concat
; shuffles becomes vphaddd + vpermq on AVX2, or per-half phaddd elsewhere.
define <8 x i32> @hadd_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
; SSSE3-LABEL: hadd_8i32_v8i32_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_8i32_v8i32_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_8i32_v8i32_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %shuf1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %hadd0 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %hadd1 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %hadd = add <8 x i32> %hadd0, %hadd1
  ret <8 x i32> %hadd
}
1233
; Subtract twin of hadd_8i32_v8i32_shuffle: same shuffle structure with sub,
; folding to vphsubd + vpermq on AVX2 (value names %hadd* are inherited from
; the hadd test; the op here is genuinely sub).
define <8 x i32> @hsub_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
; SSSE3-LABEL: hsub_8i32_v8i32_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    phsubd %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hsub_8i32_v8i32_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphsubd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hsub_8i32_v8i32_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  %shuf1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %hadd0 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %hadd1 = shufflevector <8 x i32> %shuf0, <8 x i32> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %hadd = sub <8 x i32> %hadd0, %hadd1
  ret <8 x i32> %hadd
}
1263
; v16i16 HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) fold: even/odd add over lane-concat
; shuffles becomes vphaddw + vpermq on AVX2, or per-half phaddw elsewhere.
define <16 x i16> @hadd_16i16_16i16_shuffle(<16 x i16> %a0, <16 x i16> %a1) {
; SSSE3-LABEL: hadd_16i16_16i16_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: hadd_16i16_16i16_shuffle:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_16i16_16i16_shuffle:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    retq
  ; shuf0 = lo(a0):lo(a1); shuf1 = hi(a0):hi(a1).
  %shuf0 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %shuf1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %hadd0 = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
  %hadd1 = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
  %hadd = add <16 x i16> %hadd0, %hadd1
  ret <16 x i16> %hadd
}
1293