; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

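; This file checks that a chain of extractelement/insertelement feeding a
; scalar add or sub on adjacent lanes is folded into the horizontal add/sub
; instructions (haddps, hsubps, phaddd, ...) when the target supports them.
;
; As a worked example, "haddps %xmm1, %xmm0" computes:
;   xmm0[0] = xmm0[0] + xmm0[1]
;   xmm0[1] = xmm0[2] + xmm0[3]
;   xmm0[2] = xmm1[0] + xmm1[1]
;   xmm0[3] = xmm1[2] + xmm1[3]
; which is exactly the value built lane by lane in hadd_ps_test1 below.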
define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_ps_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}

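; Same sums as in hadd_ps_test1, but the insertelements build the result
; lanes out of order (lane 1 before lane 0, lane 3 before lane 2); the fold
; should not depend on the insertion order.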
define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_ps_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
  ret <4 x float> %vecinit13
}

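; hsubps subtracts within each adjacent pair:
;   [ a0-a1, a2-a3, b0-b1, b2-b3 ]
; Unlike the hadd cases, the operand order inside each pair now matters,
; so every lane below is computed as even-index minus odd-index.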
define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_ps_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
  ret <4 x float> %vecinit13
}

define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_ps_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

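; phaddd was introduced in SSSE3, so the plain SSE3 run cannot use a
; horizontal integer add: it has to expand into movd/pshufd/addl scalar
; code, which is why the SSE3 check lines below are so long.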
define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %eax, %edi
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    movd %ecx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phadd_d_test1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phadd_d_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
  ret <4 x i32> %vecinit13
}

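; Integer add is commutative, so the operands of each pair may be extracted
; in either order (here the B pairs are summed as 3+2 and 1+0) and the
; pattern should still fold to phaddd.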
define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phadd_d_test2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phadd_d_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 3
  %vecext7 = extractelement <4 x i32> %B, i32 2
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 1
  %vecext11 = extractelement <4 x i32> %B, i32 0
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
  ret <4 x i32> %vecinit13
}

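; Integer sub is not commutative: each lane must be the even-indexed
; element minus the odd-indexed one ([0]-[1], [2]-[3]) to match phsubd's
; semantics.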
define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test1:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    subl %ecx, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %ecx
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    subl %edi, %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %ecx, %xmm2
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phsub_d_test1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsub_d_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test2:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    subl %ecx, %eax
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    subl %edx, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    movd %edx, %xmm0
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    subl %esi, %edx
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movd %ecx, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: phsub_d_test2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phsubd %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX-LABEL: phsub_d_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 2
  %vecext7 = extractelement <4 x i32> %B, i32 3
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 0
  %vecext11 = extractelement <4 x i32> %B, i32 1
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
  ret <4 x i32> %vecinit13
}

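; The two-lane double-precision analogue: "haddpd %xmm1, %xmm0" computes
;   xmm0[0] = xmm0[0] + xmm0[1]
;   xmm0[1] = xmm1[0] + xmm1[1]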
define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_pd_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hadd_pd_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 1
  %vecext1 = extractelement <2 x double> %A, i32 0
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 1
  %vecext3 = extractelement <2 x double> %B, i32 0
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_pd_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
  ret <2 x double> %vecinit2
}

define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: hsub_pd_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 0
  %vecext3 = extractelement <2 x double> %A, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

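; The 256-bit AVX horizontal ops work within each 128-bit lane:
; "vhaddpd %ymm1, %ymm0, %ymm0" produces [ a0+a1, b0+b1, a2+a3, b2+b3 ].
; The IR below wants [ a0+a1, a2+a3, b0+b1, b2+b3 ], which does not match
; that in-lane layout, so the AVX output splits the vectors with
; vextractf128, does two 128-bit vhaddpd, and recombines with vinsertf128.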
define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhadd_pd_test:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm1, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhadd_pd_test:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vhaddpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vhaddpd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhsub_pd_test:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm1, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm2
; SSE-NEXT:    movapd %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhsub_pd_test:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT:    vhsubpd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT:    vhsubpd %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

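; The same in-lane restriction applies to AVX2's 256-bit vphaddd, so this
; lane layout still needs the extract/hadd/insert sequence even when AVX2
; is available.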
define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-LABEL: avx2_vphadd_d_test:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm4, %r8d
; SSE3-NEXT:    addl %ecx, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm4, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r9d
; SSE3-NEXT:    addl %edx, %r9d
; SSE3-NEXT:    movd %xmm1, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %edx, %edi
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %r11d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    addl %r11d, %eax
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %r9d, %xmm2
; SSE3-NEXT:    movd %r8d, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd %ecx, %xmm3
; SSE3-NEXT:    movd %r10d, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_vphadd_d_test:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm1, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_vphadd_d_test:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_vphadd_d_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vphaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vphaddd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %A, i32 0
  %vecext1 = extractelement <8 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %A, i32 2
  %vecext3 = extractelement <8 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %A, i32 4
  %vecext7 = extractelement <8 x i32> %A, i32 5
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %A, i32 6
  %vecext11 = extractelement <8 x i32> %A, i32 7
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %B, i32 0
  %vecext15 = extractelement <8 x i32> %B, i32 1
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %B, i32 2
  %vecext19 = extractelement <8 x i32> %B, i32 3
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %B, i32 4
  %vecext23 = extractelement <8 x i32> %B, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %B, i32 6
  %vecext27 = extractelement <8 x i32> %B, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

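; With sixteen i16 lanes, the SSE3 scalar expansion needs more temporaries
; than there are free GPRs, so it spills into callee-saved registers and
; stack slots; hence the push/pop pairs, the .cfi directives, and the
; 4-byte spill/reload annotations below.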
define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
; SSE3-LABEL: avx2_vphadd_w_test:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:    .cfi_def_cfa_offset 16
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:    .cfi_def_cfa_offset 24
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:    .cfi_def_cfa_offset 32
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:    .cfi_def_cfa_offset 40
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:    .cfi_def_cfa_offset 48
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:    .cfi_def_cfa_offset 56
; SSE3-NEXT:    .cfi_offset %rbx, -56
; SSE3-NEXT:    .cfi_offset %r12, -48
; SSE3-NEXT:    .cfi_offset %r13, -40
; SSE3-NEXT:    .cfi_offset %r14, -32
; SSE3-NEXT:    .cfi_offset %r15, -24
; SSE3-NEXT:    .cfi_offset %rbp, -16
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pextrw $1, %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm0, %eax
; SSE3-NEXT:    pextrw $3, %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $4, %xmm0, %eax
; SSE3-NEXT:    pextrw $5, %xmm0, %r11d
; SSE3-NEXT:    addl %eax, %r11d
; SSE3-NEXT:    pextrw $6, %xmm0, %eax
; SSE3-NEXT:    pextrw $7, %xmm0, %r15d
; SSE3-NEXT:    addl %eax, %r15d
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pextrw $1, %xmm1, %r13d
; SSE3-NEXT:    addl %eax, %r13d
; SSE3-NEXT:    pextrw $2, %xmm1, %eax
; SSE3-NEXT:    pextrw $3, %xmm1, %ebx
; SSE3-NEXT:    addl %eax, %ebx
; SSE3-NEXT:    pextrw $4, %xmm1, %eax
; SSE3-NEXT:    pextrw $5, %xmm1, %r8d
; SSE3-NEXT:    addl %eax, %r8d
; SSE3-NEXT:    pextrw $6, %xmm1, %eax
; SSE3-NEXT:    pextrw $7, %xmm1, %esi
; SSE3-NEXT:    addl %eax, %esi
; SSE3-NEXT:    movd %xmm2, %eax
; SSE3-NEXT:    pextrw $1, %xmm2, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pextrw $2, %xmm2, %eax
; SSE3-NEXT:    pextrw $3, %xmm2, %r14d
; SSE3-NEXT:    addl %eax, %r14d
; SSE3-NEXT:    pextrw $4, %xmm2, %eax
; SSE3-NEXT:    pextrw $5, %xmm2, %r12d
; SSE3-NEXT:    addl %eax, %r12d
; SSE3-NEXT:    pextrw $6, %xmm2, %eax
; SSE3-NEXT:    pextrw $7, %xmm2, %r9d
; SSE3-NEXT:    addl %eax, %r9d
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pextrw $1, %xmm3, %ebp
; SSE3-NEXT:    addl %eax, %ebp
; SSE3-NEXT:    pextrw $2, %xmm3, %edx
; SSE3-NEXT:    pextrw $3, %xmm3, %edi
; SSE3-NEXT:    addl %edx, %edi
; SSE3-NEXT:    pextrw $4, %xmm3, %edx
; SSE3-NEXT:    pextrw $5, %xmm3, %ecx
; SSE3-NEXT:    addl %edx, %ecx
; SSE3-NEXT:    pextrw $6, %xmm3, %edx
; SSE3-NEXT:    pextrw $7, %xmm3, %eax
; SSE3-NEXT:    addl %edx, %eax
; SSE3-NEXT:    movd %esi, %xmm8
; SSE3-NEXT:    movd %r8d, %xmm3
; SSE3-NEXT:    movd %ebx, %xmm9
; SSE3-NEXT:    movd %r13d, %xmm4
; SSE3-NEXT:    movd %r15d, %xmm10
; SSE3-NEXT:    movd %r11d, %xmm7
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm11 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    movd %ecx, %xmm6
; SSE3-NEXT:    movd %edi, %xmm13
; SSE3-NEXT:    movd %ebp, %xmm5
; SSE3-NEXT:    movd %r9d, %xmm14
; SSE3-NEXT:    movd %r12d, %xmm2
; SSE3-NEXT:    movd %r14d, %xmm15
; SSE3-NEXT:    movd %r10d, %xmm1
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT:    popq %rbx
; SSE3-NEXT:    .cfi_def_cfa_offset 48
; SSE3-NEXT:    popq %r12
; SSE3-NEXT:    .cfi_def_cfa_offset 40
; SSE3-NEXT:    popq %r13
; SSE3-NEXT:    .cfi_def_cfa_offset 32
; SSE3-NEXT:    popq %r14
; SSE3-NEXT:    .cfi_def_cfa_offset 24
; SSE3-NEXT:    popq %r15
; SSE3-NEXT:    .cfi_def_cfa_offset 16
; SSE3-NEXT:    popq %rbp
; SSE3-NEXT:    .cfi_def_cfa_offset 8
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_vphadd_w_test:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddw %xmm1, %xmm0
; SSSE3-NEXT:    phaddw %xmm3, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_vphadd_w_test:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_vphadd_w_test:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vphaddw %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vphaddw %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <16 x i16> %a, i32 0
  %vecext1 = extractelement <16 x i16> %a, i32 1
  %add = add i16 %vecext, %vecext1
  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
  %vecext4 = extractelement <16 x i16> %a, i32 2
  %vecext6 = extractelement <16 x i16> %a, i32 3
  %add8 = add i16 %vecext4, %vecext6
  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
  %vecext11 = extractelement <16 x i16> %a, i32 4
  %vecext13 = extractelement <16 x i16> %a, i32 5
  %add15 = add i16 %vecext11, %vecext13
  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
  %vecext18 = extractelement <16 x i16> %a, i32 6
  %vecext20 = extractelement <16 x i16> %a, i32 7
  %add22 = add i16 %vecext18, %vecext20
  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
  %vecext25 = extractelement <16 x i16> %a, i32 8
  %vecext27 = extractelement <16 x i16> %a, i32 9
  %add29 = add i16 %vecext25, %vecext27
  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 4
  %vecext32 = extractelement <16 x i16> %a, i32 10
  %vecext34 = extractelement <16 x i16> %a, i32 11
  %add36 = add i16 %vecext32, %vecext34
  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 5
  %vecext39 = extractelement <16 x i16> %a, i32 12
  %vecext41 = extractelement <16 x i16> %a, i32 13
  %add43 = add i16 %vecext39, %vecext41
  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 6
  %vecext46 = extractelement <16 x i16> %a, i32 14
  %vecext48 = extractelement <16 x i16> %a, i32 15
  %add50 = add i16 %vecext46, %vecext48
  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 7
  %vecext53 = extractelement <16 x i16> %b, i32 0
  %vecext55 = extractelement <16 x i16> %b, i32 1
  %add57 = add i16 %vecext53, %vecext55
  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 8
  %vecext60 = extractelement <16 x i16> %b, i32 2
  %vecext62 = extractelement <16 x i16> %b, i32 3
  %add64 = add i16 %vecext60, %vecext62
  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 9
  %vecext67 = extractelement <16 x i16> %b, i32 4
  %vecext69 = extractelement <16 x i16> %b, i32 5
  %add71 = add i16 %vecext67, %vecext69
  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 10
  %vecext74 = extractelement <16 x i16> %b, i32 6
  %vecext76 = extractelement <16 x i16> %b, i32 7
  %add78 = add i16 %vecext74, %vecext76
  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 11
  %vecext81 = extractelement <16 x i16> %b, i32 8
  %vecext83 = extractelement <16 x i16> %b, i32 9
  %add85 = add i16 %vecext81, %vecext83
  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
  %vecext88 = extractelement <16 x i16> %b, i32 10
  %vecext90 = extractelement <16 x i16> %b, i32 11
  %add92 = add i16 %vecext88, %vecext90
  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
  %vecext95 = extractelement <16 x i16> %b, i32 12
  %vecext97 = extractelement <16 x i16> %b, i32 13
  %add99 = add i16 %vecext95, %vecext97
  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
  %vecext102 = extractelement <16 x i16> %b, i32 14
  %vecext104 = extractelement <16 x i16> %b, i32 15
  %add106 = add i16 %vecext102, %vecext104
  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
  ret <16 x i16> %vecinit108
}

; Verify that we don't select horizontal subs in the following functions.

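; In not_a_hsub_1 the B pairs are subtracted in reverse order (B[1]-B[0]
; and B[3]-B[2]), which does not match phsubd's even-minus-odd lanes, so
; the subtraction must stay scalar.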
define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: not_a_hsub_1:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT:    movd %xmm2, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    subl %edx, %ecx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    movd %xmm1, %esi
; SSE-NEXT:    subl %esi, %edx
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE-NEXT:    movd %xmm0, %esi
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    movd %xmm0, %edi
; SSE-NEXT:    subl %edi, %esi
; SSE-NEXT:    movd %esi, %xmm0
; SSE-NEXT:    movd %edx, %xmm1
; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    movd %ecx, %xmm2
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    vpextrd $1, %xmm0, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpextrd $2, %xmm0, %ecx
; AVX-NEXT:    vpextrd $3, %xmm0, %edx
; AVX-NEXT:    subl %edx, %ecx
; AVX-NEXT:    vpextrd $1, %xmm1, %edx
; AVX-NEXT:    vmovd %xmm1, %esi
; AVX-NEXT:    subl %esi, %edx
; AVX-NEXT:    vpextrd $3, %xmm1, %esi
; AVX-NEXT:    vpextrd $2, %xmm1, %edi
; AVX-NEXT:    subl %edi, %esi
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $3, %esi, %xmm0, %xmm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 1
  %vecext7 = extractelement <4 x i32> %B, i32 0
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 3
  %vecext11 = extractelement <4 x i32> %B, i32 2
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}

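; Here only one pair is reversed (lane 3 holds B[3]-B[2] rather than
; B[2]-B[3]), but because fsub is not commutative that is already enough
; to block hsubps.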
define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: not_a_hsub_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3]
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    subss %xmm3, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3]
; SSE-NEXT:    movaps %xmm1, %xmm3
; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
; SSE-NEXT:    subss %xmm3, %xmm2
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT:    subss %xmm3, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT:    vsubss %xmm3, %xmm1, %xmm1
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
; AVX-NEXT:    retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 3
  %vecext7 = extractelement <4 x float> %B, i32 2
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}

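; The low lane is A[1]-A[0], the reverse of hsubpd's a0-a1, so no
; horizontal sub can be formed.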
define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: not_a_hsub_3:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE-NEXT:    subsd %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE-NEXT:    subsd %xmm0, %xmm2
; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    movapd %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: not_a_hsub_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT:    vsubsd %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT:    vsubsd %xmm0, %xmm2, %xmm0
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 1
  %vecext3 = extractelement <2 x double> %A, i32 0
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}

; Test AVX horizontal add/sub of packed single/double precision
; floating point values from 256-bit vectors.

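; For 8 x float the in-lane result of vhaddps/vhsubps,
;   [ a0+a1, a2+a3, b0+b1, b2+b3 | a4+a5, a6+a7, b4+b5, b6+b7 ],
; is exactly the layout the IR below builds, so the AVX runs can use a
; single 256-bit instruction.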
define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhadd_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    haddps %xmm2, %xmm0
; SSE-NEXT:    haddps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhadd_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %add, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %add12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %add16 = fadd float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %add16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %add20 = fadd float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %add20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %add24 = fadd float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %add24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %add28 = fadd float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
  ret <8 x float> %vecinit29
}

define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhsub_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubps %xmm2, %xmm0
; SSE-NEXT:    hsubps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_vhsub_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <8 x float> %a, i32 0
  %vecext1 = extractelement <8 x float> %a, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <8 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <8 x float> %a, i32 2
  %vecext3 = extractelement <8 x float> %a, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <8 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <8 x float> %b, i32 0
  %vecext7 = extractelement <8 x float> %b, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <8 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <8 x float> %b, i32 2
  %vecext11 = extractelement <8 x float> %b, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <8 x float> %vecinit9, float %sub12, i32 3
  %vecext14 = extractelement <8 x float> %a, i32 4
  %vecext15 = extractelement <8 x float> %a, i32 5
  %sub16 = fsub float %vecext14, %vecext15
  %vecinit17 = insertelement <8 x float> %vecinit13, float %sub16, i32 4
  %vecext18 = extractelement <8 x float> %a, i32 6
  %vecext19 = extractelement <8 x float> %a, i32 7
  %sub20 = fsub float %vecext18, %vecext19
  %vecinit21 = insertelement <8 x float> %vecinit17, float %sub20, i32 5
  %vecext22 = extractelement <8 x float> %b, i32 4
  %vecext23 = extractelement <8 x float> %b, i32 5
  %sub24 = fsub float %vecext22, %vecext23
  %vecinit25 = insertelement <8 x float> %vecinit21, float %sub24, i32 6
  %vecext26 = extractelement <8 x float> %b, i32 6
  %vecext27 = extractelement <8 x float> %b, i32 7
  %sub28 = fsub float %vecext26, %vecext27
  %vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
  ret <8 x float> %vecinit29
}

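; Likewise for 4 x double: the lanes are built as
; [ a0+a1, b0+b1, a2+a3, b2+b3 ], matching vhaddpd's in-lane behaviour, so
; (unlike avx_vhadd_pd_test above) one vhaddpd/vhsubpd suffices.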
define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hadd_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    haddpd %xmm2, %xmm0
; SSE-NEXT:    haddpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_hadd_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}

define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hsub_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    hsubpd %xmm2, %xmm0
; SSE-NEXT:    hsubpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: avx_hsub_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
  %vecext = extractelement <4 x double> %a, i32 0
  %vecext1 = extractelement <4 x double> %a, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <4 x double> %b, i32 0
  %vecext3 = extractelement <4 x double> %b, i32 1
  %sub4 = fsub double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
  %vecext6 = extractelement <4 x double> %a, i32 2
  %vecext7 = extractelement <4 x double> %a, i32 3
  %sub8 = fsub double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
  %vecext10 = extractelement <4 x double> %b, i32 2
  %vecext11 = extractelement <4 x double> %b, i32 3
  %sub12 = fsub double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
  ret <4 x double> %vecinit13
}

; Test AVX2 horizontal add of packed integer values from 256-bit vectors.

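; This layout matches the in-lane behaviour of AVX2's 256-bit vphaddd, so
; the AVX2 run emits a single instruction; AVX1 has no 256-bit integer
; operations and instead uses two 128-bit phaddd plus vinsertf128.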
define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-LABEL: avx2_hadd_d:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT:    movd %xmm4, %r8d
; SSE3-NEXT:    addl %ecx, %r8d
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE3-NEXT:    movd %xmm4, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r9d
; SSE3-NEXT:    addl %edx, %r9d
; SSE3-NEXT:    movd %xmm2, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edi
; SSE3-NEXT:    addl %edx, %edi
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movd %xmm3, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE3-NEXT:    movd %xmm0, %edx
; SSE3-NEXT:    addl %eax, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE3-NEXT:    movd %xmm0, %r11d
; SSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    addl %r11d, %eax
; SSE3-NEXT:    movd %edi, %xmm0
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movd %r9d, %xmm2
; SSE3-NEXT:    movd %r8d, %xmm0
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd %ecx, %xmm3
; SSE3-NEXT:    movd %r10d, %xmm1
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: avx2_hadd_d:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    phaddd %xmm2, %xmm0
; SSSE3-NEXT:    phaddd %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: avx2_hadd_d:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vphaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: avx2_hadd_d:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %vecext = extractelement <8 x i32> %a, i32 0
  %vecext1 = extractelement <8 x i32> %a, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <8 x i32> %a, i32 2
  %vecext3 = extractelement <8 x i32> %a, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <8 x i32> %b, i32 0
  %vecext7 = extractelement <8 x i32> %b, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <8 x i32> %b, i32 2
  %vecext11 = extractelement <8 x i32> %b, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
  %vecext14 = extractelement <8 x i32> %a, i32 4
  %vecext15 = extractelement <8 x i32> %a, i32 5
  %add16 = add i32 %vecext14, %vecext15
  %vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
  %vecext18 = extractelement <8 x i32> %a, i32 6
  %vecext19 = extractelement <8 x i32> %a, i32 7
  %add20 = add i32 %vecext18, %vecext19
  %vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
  %vecext22 = extractelement <8 x i32> %b, i32 4
  %vecext23 = extractelement <8 x i32> %b, i32 5
  %add24 = add i32 %vecext22, %vecext23
  %vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
  %vecext26 = extractelement <8 x i32> %b, i32 6
  %vecext27 = extractelement <8 x i32> %b, i32 7
  %add28 = add i32 %vecext26, %vecext27
  %vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
  ret <8 x i32> %vecinit29
}

define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
; SSE3-LABEL: avx2_hadd_w:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pushq %rbp
; SSE3-NEXT:    .cfi_def_cfa_offset 16
; SSE3-NEXT:    pushq %r15
; SSE3-NEXT:    .cfi_def_cfa_offset 24
; SSE3-NEXT:    pushq %r14
; SSE3-NEXT:    .cfi_def_cfa_offset 32
; SSE3-NEXT:    pushq %r13
; SSE3-NEXT:    .cfi_def_cfa_offset 40
; SSE3-NEXT:    pushq %r12
; SSE3-NEXT:    .cfi_def_cfa_offset 48
; SSE3-NEXT:    pushq %rbx
; SSE3-NEXT:    .cfi_def_cfa_offset 56
; SSE3-NEXT:    .cfi_offset %rbx, -56
; SSE3-NEXT:    .cfi_offset %r12, -48
; SSE3-NEXT:    .cfi_offset %r13, -40
; SSE3-NEXT:    .cfi_offset %r14, -32
; SSE3-NEXT:    .cfi_offset %r15, -24
; SSE3-NEXT:    .cfi_offset %rbp, -16
; SSE3-NEXT:    movd %xmm0, %eax
; SSE3-NEXT:    pextrw $1, %xmm0, %r10d
; SSE3-NEXT:    addl %eax, %r10d
; SSE3-NEXT:    pextrw $2, %xmm0, %eax
; SSE3-NEXT:    pextrw $3, %xmm0, %r11d
; SSE3-NEXT:    addl %eax, %r11d
; SSE3-NEXT:    pextrw $4, %xmm0, %eax
; SSE3-NEXT:    pextrw $5, %xmm0, %r12d
; SSE3-NEXT:    addl %eax, %r12d
; SSE3-NEXT:    pextrw $6, %xmm0, %eax
; SSE3-NEXT:    pextrw $7, %xmm0, %r13d
; SSE3-NEXT:    addl %eax, %r13d
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pextrw $1, %xmm1, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $2, %xmm1, %eax
; SSE3-NEXT:    pextrw $3, %xmm1, %ecx
; SSE3-NEXT:    addl %eax, %ecx
; SSE3-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE3-NEXT:    pextrw $4, %xmm1, %eax
; SSE3-NEXT:    pextrw $5, %xmm1, %r14d
; SSE3-NEXT:    addl %eax, %r14d
; SSE3-NEXT:    pextrw $6, %xmm1, %esi
; SSE3-NEXT:    pextrw $7, %xmm1, %r15d
; SSE3-NEXT:    addl %esi, %r15d
; SSE3-NEXT:    movd %xmm2, %esi
; SSE3-NEXT:    pextrw $1, %xmm2, %ebp
; SSE3-NEXT:    addl %esi, %ebp
; SSE3-NEXT:    pextrw $2, %xmm2, %esi
; SSE3-NEXT:    pextrw $3, %xmm2, %edi
; SSE3-NEXT:    addl %esi, %edi
; SSE3-NEXT:    pextrw $4, %xmm2, %esi
; SSE3-NEXT:    pextrw $5, %xmm2, %eax
; SSE3-NEXT:    addl %esi, %eax
; SSE3-NEXT:    pextrw $6, %xmm2, %esi
; SSE3-NEXT:    pextrw $7, %xmm2, %ecx
; SSE3-NEXT:    addl %esi, %ecx
; SSE3-NEXT:    movd %xmm3, %ebx
; SSE3-NEXT:    pextrw $1, %xmm3, %r9d
; SSE3-NEXT:    addl %ebx, %r9d
; SSE3-NEXT:    pextrw $2, %xmm3, %edx
; SSE3-NEXT:    pextrw $3, %xmm3, %ebx
; SSE3-NEXT:    addl %edx, %ebx
; SSE3-NEXT:    pextrw $4, %xmm3, %edx
; SSE3-NEXT:    pextrw $5, %xmm3, %esi
; SSE3-NEXT:    addl %edx, %esi
; SSE3-NEXT:    pextrw $6, %xmm3, %r8d
; SSE3-NEXT:    pextrw $7, %xmm3, %edx
; SSE3-NEXT:    addl %r8d, %edx
; SSE3-NEXT:    movd %ecx, %xmm8
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    movd %edi, %xmm9
; SSE3-NEXT:    movd %ebp, %xmm4
; SSE3-NEXT:    movd %r13d, %xmm10
; SSE3-NEXT:    movd %r12d, %xmm7
; SSE3-NEXT:    movd %r11d, %xmm11
; SSE3-NEXT:    movd %r10d, %xmm0
; SSE3-NEXT:    movd %edx, %xmm12
; SSE3-NEXT:    movd %esi, %xmm6
; SSE3-NEXT:    movd %ebx, %xmm13
; SSE3-NEXT:    movd %r9d, %xmm5
; SSE3-NEXT:    movd %r15d, %xmm14
; SSE3-NEXT:    movd %r14d, %xmm2
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm15 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT:    # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
1357; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1358; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
1359; SSE3-NEXT:    popq %rbx
1360; SSE3-NEXT:    .cfi_def_cfa_offset 48
1361; SSE3-NEXT:    popq %r12
1362; SSE3-NEXT:    .cfi_def_cfa_offset 40
1363; SSE3-NEXT:    popq %r13
1364; SSE3-NEXT:    .cfi_def_cfa_offset 32
1365; SSE3-NEXT:    popq %r14
1366; SSE3-NEXT:    .cfi_def_cfa_offset 24
1367; SSE3-NEXT:    popq %r15
1368; SSE3-NEXT:    .cfi_def_cfa_offset 16
1369; SSE3-NEXT:    popq %rbp
1370; SSE3-NEXT:    .cfi_def_cfa_offset 8
1371; SSE3-NEXT:    retq
1372;
1373; SSSE3-LABEL: avx2_hadd_w:
1374; SSSE3:       # %bb.0:
1375; SSSE3-NEXT:    phaddw %xmm2, %xmm0
1376; SSSE3-NEXT:    phaddw %xmm3, %xmm1
1377; SSSE3-NEXT:    retq
1378;
1379; AVX1-LABEL: avx2_hadd_w:
1380; AVX1:       # %bb.0:
1381; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1382; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1383; AVX1-NEXT:    vphaddw %xmm2, %xmm3, %xmm2
1384; AVX1-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
1385; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1386; AVX1-NEXT:    retq
1387;
1388; AVX2-LABEL: avx2_hadd_w:
1389; AVX2:       # %bb.0:
1390; AVX2-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
1391; AVX2-NEXT:    retq
1392  %vecext = extractelement <16 x i16> %a, i32 0
1393  %vecext1 = extractelement <16 x i16> %a, i32 1
1394  %add = add i16 %vecext, %vecext1
1395  %vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
1396  %vecext4 = extractelement <16 x i16> %a, i32 2
1397  %vecext6 = extractelement <16 x i16> %a, i32 3
1398  %add8 = add i16 %vecext4, %vecext6
1399  %vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
1400  %vecext11 = extractelement <16 x i16> %a, i32 4
1401  %vecext13 = extractelement <16 x i16> %a, i32 5
1402  %add15 = add i16 %vecext11, %vecext13
1403  %vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
1404  %vecext18 = extractelement <16 x i16> %a, i32 6
1405  %vecext20 = extractelement <16 x i16> %a, i32 7
1406  %add22 = add i16 %vecext18, %vecext20
1407  %vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
1408  %vecext25 = extractelement <16 x i16> %a, i32 8
1409  %vecext27 = extractelement <16 x i16> %a, i32 9
1410  %add29 = add i16 %vecext25, %vecext27
1411  %vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 8
1412  %vecext32 = extractelement <16 x i16> %a, i32 10
1413  %vecext34 = extractelement <16 x i16> %a, i32 11
1414  %add36 = add i16 %vecext32, %vecext34
1415  %vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 9
1416  %vecext39 = extractelement <16 x i16> %a, i32 12
1417  %vecext41 = extractelement <16 x i16> %a, i32 13
1418  %add43 = add i16 %vecext39, %vecext41
1419  %vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 10
1420  %vecext46 = extractelement <16 x i16> %a, i32 14
1421  %vecext48 = extractelement <16 x i16> %a, i32 15
1422  %add50 = add i16 %vecext46, %vecext48
1423  %vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 11
1424  %vecext53 = extractelement <16 x i16> %b, i32 0
1425  %vecext55 = extractelement <16 x i16> %b, i32 1
1426  %add57 = add i16 %vecext53, %vecext55
1427  %vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 4
1428  %vecext60 = extractelement <16 x i16> %b, i32 2
1429  %vecext62 = extractelement <16 x i16> %b, i32 3
1430  %add64 = add i16 %vecext60, %vecext62
1431  %vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 5
1432  %vecext67 = extractelement <16 x i16> %b, i32 4
1433  %vecext69 = extractelement <16 x i16> %b, i32 5
1434  %add71 = add i16 %vecext67, %vecext69
1435  %vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 6
1436  %vecext74 = extractelement <16 x i16> %b, i32 6
1437  %vecext76 = extractelement <16 x i16> %b, i32 7
1438  %add78 = add i16 %vecext74, %vecext76
1439  %vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 7
1440  %vecext81 = extractelement <16 x i16> %b, i32 8
1441  %vecext83 = extractelement <16 x i16> %b, i32 9
1442  %add85 = add i16 %vecext81, %vecext83
1443  %vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
1444  %vecext88 = extractelement <16 x i16> %b, i32 10
1445  %vecext90 = extractelement <16 x i16> %b, i32 11
1446  %add92 = add i16 %vecext88, %vecext90
1447  %vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
1448  %vecext95 = extractelement <16 x i16> %b, i32 12
1449  %vecext97 = extractelement <16 x i16> %b, i32 13
1450  %add99 = add i16 %vecext95, %vecext97
1451  %vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
1452  %vecext102 = extractelement <16 x i16> %b, i32 14
1453  %vecext104 = extractelement <16 x i16> %b, i32 15
1454  %add106 = add i16 %vecext102, %vecext104
1455  %vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
1456  ret <16 x i16> %vecinit108
1457}
1458
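; The scalar extract/add/insert chain above is one way to express this
; horizontal add in IR. A minimal sketch of an equivalent shufflevector
; formulation follows; the function @avx2_hadd_w_shuffle is illustrative
; only (it is not part of the original tests and has no autogenerated
; assertions). It gathers the even- and odd-indexed summands with two
; shuffles and adds them, reproducing the per-128-bit-lane layout of
; vphaddw: result lanes 0-3 from %a's low half, 4-7 from %b's low half,
; 8-11 from %a's high half, 12-15 from %b's high half.
define <16 x i16> @avx2_hadd_w_shuffle(<16 x i16> %a, <16 x i16> %b) {
  ; Even-indexed operand of each pairwise sum:
  ; a[0,2,4,6], b[0,2,4,6], a[8,10,12,14], b[8,10,12,14].
  %lhs = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
  ; Matching odd-indexed operand of each pairwise sum.
  %rhs = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
  %hadd = add <16 x i16> %lhs, %rhs
  ret <16 x i16> %hadd
}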