; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX2

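; Horizontal add where each input's pairwise sums are requested in reverse
; order; per the checks, the reversal folds into word shuffles around a
; single (v)phaddw.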
define <8 x i16> @hadd_reverse_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddw %xmm1, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_reverse_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT:    retq
  %lhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 7, i32 5, i32 3, i32 1, i32 15, i32 13, i32 11, i32 9>
  %rhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 6, i32 4, i32 2, i32 0, i32 14, i32 12, i32 10, i32 8>
  %add = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %add
}

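; Same computation expressed by reversing both inputs before a regular
; horizontal add; the reversal stays as shuffles on the inputs.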
define <8 x i16> @hadd_reverse2_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; SSE-NEXT:    pshufb %xmm2, %xmm0
; SSE-NEXT:    pshufb %xmm2, %xmm1
; SSE-NEXT:    phaddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_reverse2_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuf0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %rhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %add = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %add
}

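; <8 x float> version of the reversed horizontal add.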
define <8 x float> @hadd_reverse_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0,3,2]
; SSE-NEXT:    haddps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,0,3,2]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    retq
  %lhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 7, i32 5, i32 15, i32 13, i32 3, i32 1, i32 11, i32 9>
  %rhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 6, i32 4, i32 14, i32 12, i32 2, i32 0, i32 10, i32 8>
  %add = fadd <8 x float> %lhs, %rhs
  ret <8 x float> %add
}

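; <8 x float> version with both inputs reversed before the horizontal add.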
define <8 x float> @hadd_reverse2_v8f32(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: hadd_reverse2_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,2],xmm0[1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2,1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,2,1,0]
; SSE-NEXT:    haddps %xmm2, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,2,1,0]
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
  %rhs = shufflevector <8 x float> %shuf0, <8 x float> %shuf1, <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
  %add = fadd <8 x float> %lhs, %rhs
  ret <8 x float> %add
}

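; <16 x i16> version of the reversed horizontal add.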
define <16 x i16> @hadd_reverse_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    phaddw %xmm3, %xmm1
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4]
; SSE-NEXT:    phaddw %xmm2, %xmm0
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT:    pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; AVX1-NEXT:    vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    retq
  %lhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 31, i32 29, i32 27, i32 25, i32 7, i32 5, i32 3, i32 1, i32 23, i32 21, i32 19, i32 17>
  %rhs = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 14, i32 12, i32 10, i32 8, i32 30, i32 28, i32 26, i32 24, i32 6, i32 4, i32 2, i32 0, i32 22, i32 20, i32 18, i32 16>
  %add = add <16 x i16> %lhs, %rhs
  ret <16 x i16> %add
}

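; <16 x i16> version with both inputs reversed before the horizontal add.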
define <16 x i16> @hadd_reverse2_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; SSE-NEXT:    pshufb %xmm0, %xmm4
; SSE-NEXT:    pshufb %xmm0, %xmm1
; SSE-NEXT:    pshufb %xmm0, %xmm2
; SSE-NEXT:    phaddw %xmm2, %xmm4
; SSE-NEXT:    pshufb %xmm0, %xmm3
; SSE-NEXT:    phaddw %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm4, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
; AVX1-NEXT:    vphaddw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <16 x i16> %a1, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
  %rhs = shufflevector <16 x i16> %shuf0, <16 x i16> %shuf1, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
  %add = add <16 x i16> %lhs, %rhs
  ret <16 x i16> %add
}

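; <8 x double> version of the reversed horizontal add.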
define <8 x double> @hadd_reverse_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm8
; SSE-NEXT:    movapd %xmm0, %xmm9
; SSE-NEXT:    haddpd %xmm7, %xmm3
; SSE-NEXT:    haddpd %xmm6, %xmm2
; SSE-NEXT:    haddpd %xmm5, %xmm8
; SSE-NEXT:    haddpd %xmm4, %xmm9
; SSE-NEXT:    movapd %xmm3, %xmm0
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm8, %xmm2
; SSE-NEXT:    movapd %xmm9, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vhaddpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX1-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vmovapd %ymm3, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddpd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX2-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT:    vmovapd %ymm3, %ymm0
; AVX2-NEXT:    retq
  %lhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 7, i32 15, i32 5, i32 13, i32 3, i32 11, i32 1, i32 9>
  %rhs = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 6, i32 14, i32 4, i32 12, i32 2, i32 10, i32 0, i32 8>
  %fadd = fadd <8 x double> %lhs, %rhs
  ret <8 x double> %fadd
}

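; <8 x double> version with both inputs reversed before the horizontal add.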
define <8 x double> @hadd_reverse2_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v8f64:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm1, %xmm8
; SSE-NEXT:    movapd %xmm0, %xmm9
; SSE-NEXT:    shufpd {{.*#+}} xmm9 = xmm9[1],xmm0[0]
; SSE-NEXT:    shufpd {{.*#+}} xmm8 = xmm8[1],xmm1[0]
; SSE-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[1,0]
; SSE-NEXT:    shufpd {{.*#+}} xmm3 = xmm3[1,0]
; SSE-NEXT:    shufpd {{.*#+}} xmm4 = xmm4[1,0]
; SSE-NEXT:    haddpd %xmm4, %xmm9
; SSE-NEXT:    shufpd {{.*#+}} xmm5 = xmm5[1,0]
; SSE-NEXT:    haddpd %xmm5, %xmm8
; SSE-NEXT:    shufpd {{.*#+}} xmm6 = xmm6[1,0]
; SSE-NEXT:    haddpd %xmm6, %xmm2
; SSE-NEXT:    shufpd {{.*#+}} xmm7 = xmm7[1,0]
; SSE-NEXT:    haddpd %xmm7, %xmm3
; SSE-NEXT:    movapd %xmm3, %xmm0
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    movapd %xmm8, %xmm2
; SSE-NEXT:    movapd %xmm9, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm1[1,0,3,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1]
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm1 = ymm2[1,0,3,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT:    vhaddpd %ymm1, %ymm0, %ymm1
; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm3[1,0,3,2]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vhaddpd %ymm0, %ymm4, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v8f64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm1[3,2,1,0]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm2[3,2,1,0]
; AVX2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm3[3,2,1,0]
; AVX2-NEXT:    vhaddpd %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <8 x double> %shuf0, <8 x double> %shuf1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  %rhs = shufflevector <8 x double> %shuf0, <8 x double> %shuf1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
  %fadd = fadd <8 x double> %lhs, %rhs
  ret <8 x double> %fadd
}

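; <16 x float> version of the reversed horizontal add.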
define <16 x float> @hadd_reverse_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: hadd_reverse_v16f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm5, %xmm8
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    haddps %xmm2, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0,3,2]
; SSE-NEXT:    haddps %xmm6, %xmm7
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[1,0,3,2]
; SSE-NEXT:    haddps %xmm0, %xmm5
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,0,3,2]
; SSE-NEXT:    haddps %xmm4, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[1,0,3,2]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm7, %xmm1
; SSE-NEXT:    movaps %xmm5, %xmm2
; SSE-NEXT:    movaps %xmm8, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse_v16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm2[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vhaddps %ymm0, %ymm4, %ymm2
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm3[2,3]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm2[1,0,3,2,5,4,7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse_v16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vhaddps %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm1[2,0,3,1]
; AVX2-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,0,3,1]
; AVX2-NEXT:    vmovaps %ymm3, %ymm0
; AVX2-NEXT:    retq
  %lhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 15, i32 13, i32 11, i32 9, i32 31, i32 29, i32 27, i32 25, i32 7, i32 5, i32 3, i32 1, i32 23, i32 21, i32 19, i32 17>
  %rhs = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 14, i32 12, i32 10, i32 8, i32 30, i32 28, i32 26, i32 24, i32 6, i32 4, i32 2, i32 0, i32 22, i32 20, i32 18, i32 16>
  %fadd = fadd <16 x float> %lhs, %rhs
  ret <16 x float> %fadd
}

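; <16 x float> version with both inputs reversed before the horizontal add.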
define <16 x float> @hadd_reverse2_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: hadd_reverse2_v16f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm8
; SSE-NEXT:    movaps %xmm0, %xmm9
; SSE-NEXT:    shufps {{.*#+}} xmm9 = xmm9[3,2],xmm0[1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm8 = xmm8[3,2],xmm1[1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,2,1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,2,1,0]
; SSE-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,2,1,0]
; SSE-NEXT:    haddps %xmm4, %xmm9
; SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[3,2,1,0]
; SSE-NEXT:    haddps %xmm5, %xmm8
; SSE-NEXT:    shufps {{.*#+}} xmm6 = xmm6[3,2,1,0]
; SSE-NEXT:    haddps %xmm6, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm7 = xmm7[3,2,1,0]
; SSE-NEXT:    haddps %xmm7, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    movaps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm8, %xmm2
; SSE-NEXT:    movaps %xmm9, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: hadd_reverse2_v16f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm1 = ymm2[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT:    vhaddps %ymm1, %ymm0, %ymm1
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm3[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT:    vhaddps %ymm0, %ymm4, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: hadd_reverse2_v16f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm4 = ymm1[2,3,0,1]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm2[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX2-NEXT:    vhaddps %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm3[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT:    vhaddps %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    retq
  %shuf0 = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %shuf1 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %lhs = shufflevector <16 x float> %shuf0, <16 x float> %shuf1, <16 x i32> <i32 0, i32 2, i32 16, i32 18, i32 4, i32 6, i32 20, i32 22, i32 8, i32 10, i32 24, i32 26, i32 12, i32 14, i32 28, i32 30>
  %rhs = shufflevector <16 x float> %shuf0, <16 x float> %shuf1, <16 x i32> <i32 1, i32 3, i32 17, i32 19, i32 5, i32 7, i32 21, i32 23, i32 9, i32 11, i32 25, i32 27, i32 13, i32 15, i32 29, i32 31>
  %fadd = fadd <16 x float> %lhs, %rhs
  ret <16 x float> %fadd
}