; (Removed web code-viewer navigation chrome — "Home / Line# / Scopes / Navigate /
;  Raw / Download". The remainder of this file is an LLVM IR test; each line also
;  carries a fused original line number from the viewer's extraction.)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefix=SSE
3; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX
4
5; Verify that we correctly generate 'addsub' instructions from
6; a sequence of vector extracts + float add/sub + vector inserts.
7
; test1: the canonical full pattern — fsub on even lanes (0, 2) and fadd on odd
; lanes (1, 3), rebuilt with four insertelements. Per the CHECK lines this must
; fold into a single addsubps (SSE) / vaddsubps (AVX).
8define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {
9; SSE-LABEL: test1:
10; SSE:       # BB#0:
11; SSE-NEXT:    addsubps %xmm1, %xmm0
12; SSE-NEXT:    retq
13;
14; AVX-LABEL: test1:
15; AVX:       # BB#0:
16; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
17; AVX-NEXT:    retq
18  %1 = extractelement <4 x float> %A, i32 0
19  %2 = extractelement <4 x float> %B, i32 0
20  %sub = fsub float %1, %2
21  %3 = extractelement <4 x float> %A, i32 2
22  %4 = extractelement <4 x float> %B, i32 2
23  %sub2 = fsub float %3, %4
24  %5 = extractelement <4 x float> %A, i32 1
25  %6 = extractelement <4 x float> %B, i32 1
26  %add = fadd float %5, %6
27  %7 = extractelement <4 x float> %A, i32 3
28  %8 = extractelement <4 x float> %B, i32 3
29  %add2 = fadd float %7, %8
30  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
31  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
32  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
33  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
34  ret <4 x float> %vecinsert4
35}
36
; test2: partial pattern — only lane 2 (fsub) and lane 3 (fadd) are computed;
; lanes 0 and 1 remain undef. The CHECK lines show this still folds to a single
; addsubps/vaddsubps, since the undef lanes may hold anything.
37define <4 x float> @test2(<4 x float> %A, <4 x float> %B) {
38; SSE-LABEL: test2:
39; SSE:       # BB#0:
40; SSE-NEXT:    addsubps %xmm1, %xmm0
41; SSE-NEXT:    retq
42;
43; AVX-LABEL: test2:
44; AVX:       # BB#0:
45; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
46; AVX-NEXT:    retq
47  %1 = extractelement <4 x float> %A, i32 2
48  %2 = extractelement <4 x float> %B, i32 2
49  %sub2 = fsub float %1, %2
50  %3 = extractelement <4 x float> %A, i32 3
51  %4 = extractelement <4 x float> %B, i32 3
52  %add2 = fadd float %3, %4
53  %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 2
54  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
55  ret <4 x float> %vecinsert2
56}
57
; test3: partial pattern on lanes 0 (fsub) and 3 (fadd). Note the fadd operands
; are written commuted (%4, %3 — i.e. B + A); fadd is commutative so the
; combiner must still recognize the addsub pattern.
58define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
59; SSE-LABEL: test3:
60; SSE:       # BB#0:
61; SSE-NEXT:    addsubps %xmm1, %xmm0
62; SSE-NEXT:    retq
63;
64; AVX-LABEL: test3:
65; AVX:       # BB#0:
66; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
67; AVX-NEXT:    retq
68  %1 = extractelement <4 x float> %A, i32 0
69  %2 = extractelement <4 x float> %B, i32 0
70  %sub = fsub float %1, %2
71  %3 = extractelement <4 x float> %A, i32 3
72  %4 = extractelement <4 x float> %B, i32 3
73  %add = fadd float %4, %3
74  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
75  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 3
76  ret <4 x float> %vecinsert2
77}
78
; test4: partial pattern on lanes 2 (fsub) and 1 (fadd); lanes 0 and 3 undef.
; Still expected to fold to addsubps/vaddsubps per the CHECK lines.
79define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
80; SSE-LABEL: test4:
81; SSE:       # BB#0:
82; SSE-NEXT:    addsubps %xmm1, %xmm0
83; SSE-NEXT:    retq
84;
85; AVX-LABEL: test4:
86; AVX:       # BB#0:
87; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
88; AVX-NEXT:    retq
89  %1 = extractelement <4 x float> %A, i32 2
90  %2 = extractelement <4 x float> %B, i32 2
91  %sub = fsub float %1, %2
92  %3 = extractelement <4 x float> %A, i32 1
93  %4 = extractelement <4 x float> %B, i32 1
94  %add = fadd float %3, %4
95  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
96  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 1
97  ret <4 x float> %vecinsert2
98}
99
; test5: partial pattern on the low half only — lane 0 fsub, lane 1 fadd;
; lanes 2 and 3 undef. Expected to fold to addsubps/vaddsubps.
100define <4 x float> @test5(<4 x float> %A, <4 x float> %B) {
101; SSE-LABEL: test5:
102; SSE:       # BB#0:
103; SSE-NEXT:    addsubps %xmm1, %xmm0
104; SSE-NEXT:    retq
105;
106; AVX-LABEL: test5:
107; AVX:       # BB#0:
108; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
109; AVX-NEXT:    retq
110  %1 = extractelement <4 x float> %A, i32 0
111  %2 = extractelement <4 x float> %B, i32 0
112  %sub2 = fsub float %1, %2
113  %3 = extractelement <4 x float> %A, i32 1
114  %4 = extractelement <4 x float> %B, i32 1
115  %add2 = fadd float %3, %4
116  %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 0
117  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 1
118  ret <4 x float> %vecinsert2
119}
120
; test6: same IR shape as test1 (fsub lanes 0/2, fadd lanes 1/3, four inserts);
; kept as a separate case so regressions in either copy are caught independently.
; Expected: a single addsubps/vaddsubps.
121define <4 x float> @test6(<4 x float> %A, <4 x float> %B) {
122; SSE-LABEL: test6:
123; SSE:       # BB#0:
124; SSE-NEXT:    addsubps %xmm1, %xmm0
125; SSE-NEXT:    retq
126;
127; AVX-LABEL: test6:
128; AVX:       # BB#0:
129; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
130; AVX-NEXT:    retq
131  %1 = extractelement <4 x float> %A, i32 0
132  %2 = extractelement <4 x float> %B, i32 0
133  %sub = fsub float %1, %2
134  %3 = extractelement <4 x float> %A, i32 2
135  %4 = extractelement <4 x float> %B, i32 2
136  %sub2 = fsub float %3, %4
137  %5 = extractelement <4 x float> %A, i32 1
138  %6 = extractelement <4 x float> %B, i32 1
139  %add = fadd float %5, %6
140  %7 = extractelement <4 x float> %A, i32 3
141  %8 = extractelement <4 x float> %B, i32 3
142  %add2 = fadd float %7, %8
143  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
144  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
145  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
146  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
147  ret <4 x float> %vecinsert4
148}
149
; test7: double-precision 256-bit variant — fsub lanes 0/2, fadd lanes 1/3 of a
; <4 x double>. Per the CHECK lines, SSE (128-bit regs) needs two addsubpd
; instructions, while AVX folds the whole thing into one vaddsubpd ymm.
150define <4 x double> @test7(<4 x double> %A, <4 x double> %B) {
151; SSE-LABEL: test7:
152; SSE:       # BB#0:
153; SSE-NEXT:    addsubpd %xmm2, %xmm0
154; SSE-NEXT:    addsubpd %xmm3, %xmm1
155; SSE-NEXT:    retq
156;
157; AVX-LABEL: test7:
158; AVX:       # BB#0:
159; AVX-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0
160; AVX-NEXT:    retq
161  %1 = extractelement <4 x double> %A, i32 0
162  %2 = extractelement <4 x double> %B, i32 0
163  %sub = fsub double %1, %2
164  %3 = extractelement <4 x double> %A, i32 2
165  %4 = extractelement <4 x double> %B, i32 2
166  %sub2 = fsub double %3, %4
167  %5 = extractelement <4 x double> %A, i32 1
168  %6 = extractelement <4 x double> %B, i32 1
169  %add = fadd double %5, %6
170  %7 = extractelement <4 x double> %A, i32 3
171  %8 = extractelement <4 x double> %B, i32 3
172  %add2 = fadd double %7, %8
173  %vecinsert1 = insertelement <4 x double> undef, double %add, i32 1
174  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add2, i32 3
175  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub, i32 0
176  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %sub2, i32 2
177  ret <4 x double> %vecinsert4
178}
179
; test8: minimal <2 x double> case — fsub on lane 0, fadd on lane 1.
; Expected to fold to a single addsubpd/vaddsubpd.
180define <2 x double> @test8(<2 x double> %A, <2 x double> %B) {
181; SSE-LABEL: test8:
182; SSE:       # BB#0:
183; SSE-NEXT:    addsubpd %xmm1, %xmm0
184; SSE-NEXT:    retq
185;
186; AVX-LABEL: test8:
187; AVX:       # BB#0:
188; AVX-NEXT:    vaddsubpd %xmm1, %xmm0, %xmm0
189; AVX-NEXT:    retq
190  %1 = extractelement <2 x double> %A, i32 0
191  %2 = extractelement <2 x double> %B, i32 0
192  %sub = fsub double %1, %2
193  %3 = extractelement <2 x double> %A, i32 1
194  %4 = extractelement <2 x double> %B, i32 1
195  %add = fadd double %3, %4
196  %vecinsert1 = insertelement <2 x double> undef, double %sub, i32 0
197  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add, i32 1
198  ret <2 x double> %vecinsert2
199}
200
; test9: 256-bit single-precision variant — all 8 lanes of a <8 x float>:
; fsub on even lanes (0,2,4,6), fadd on odd lanes (1,3,5,7). SSE splits into
; two 128-bit addsubps; AVX emits one vaddsubps ymm.
201define <8 x float> @test9(<8 x float> %A, <8 x float> %B) {
202; SSE-LABEL: test9:
203; SSE:       # BB#0:
204; SSE-NEXT:    addsubps %xmm2, %xmm0
205; SSE-NEXT:    addsubps %xmm3, %xmm1
206; SSE-NEXT:    retq
207;
208; AVX-LABEL: test9:
209; AVX:       # BB#0:
210; AVX-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0
211; AVX-NEXT:    retq
212  %1 = extractelement <8 x float> %A, i32 0
213  %2 = extractelement <8 x float> %B, i32 0
214  %sub = fsub float %1, %2
215  %3 = extractelement <8 x float> %A, i32 2
216  %4 = extractelement <8 x float> %B, i32 2
217  %sub2 = fsub float %3, %4
218  %5 = extractelement <8 x float> %A, i32 1
219  %6 = extractelement <8 x float> %B, i32 1
220  %add = fadd float %5, %6
221  %7 = extractelement <8 x float> %A, i32 3
222  %8 = extractelement <8 x float> %B, i32 3
223  %add2 = fadd float %7, %8
224  %9 = extractelement <8 x float> %A, i32 4
225  %10 = extractelement <8 x float> %B, i32 4
226  %sub3 = fsub float %9, %10
227  %11 = extractelement <8 x float> %A, i32 6
228  %12 = extractelement <8 x float> %B, i32 6
229  %sub4 = fsub float %11, %12
230  %13 = extractelement <8 x float> %A, i32 5
231  %14 = extractelement <8 x float> %B, i32 5
232  %add3 = fadd float %13, %14
233  %15 = extractelement <8 x float> %A, i32 7
234  %16 = extractelement <8 x float> %B, i32 7
235  %add4 = fadd float %15, %16
236  %vecinsert1 = insertelement <8 x float> undef, float %add, i32 1
237  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add2, i32 3
238  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub, i32 0
239  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %sub2, i32 2
240  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %add3, i32 5
241  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add4, i32 7
242  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub3, i32 4
243  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %sub4, i32 6
244  ret <8 x float> %vecinsert8
245}
246
247; Verify that we don't generate addsub instruction for the following
248; functions.
249
; test10 (negative): only a single fsub on lane 0 — not enough of the pattern,
; so no addsub is formed; the CHECK lines expect a scalar subss/vsubss instead.
250define <4 x float> @test10(<4 x float> %A, <4 x float> %B) {
251; SSE-LABEL: test10:
252; SSE:       # BB#0:
253; SSE-NEXT:    subss %xmm1, %xmm0
254; SSE-NEXT:    retq
255;
256; AVX-LABEL: test10:
257; AVX:       # BB#0:
258; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
259; AVX-NEXT:    retq
260  %1 = extractelement <4 x float> %A, i32 0
261  %2 = extractelement <4 x float> %B, i32 0
262  %sub = fsub float %1, %2
263  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
264  ret <4 x float> %vecinsert1
265}
266
; test11 (negative): single fsub on lane 2 only — no addsub. The CHECK lines
; expect shuffles to bring lane 2 low, a scalar subss, and a shuffle of the
; result back into place.
267define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
268; SSE-LABEL: test11:
269; SSE:       # BB#0:
270; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
271; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
272; SSE-NEXT:    subss %xmm1, %xmm0
273; SSE-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
274; SSE-NEXT:    retq
275;
276; AVX-LABEL: test11:
277; AVX:       # BB#0:
278; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
279; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
280; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
281; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
282; AVX-NEXT:    retq
283  %1 = extractelement <4 x float> %A, i32 2
284  %2 = extractelement <4 x float> %B, i32 2
285  %sub = fsub float %1, %2
286  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
287  ret <4 x float> %vecinsert1
288}
289
; test12 (negative): single fadd on lane 1 only — no addsub. Expected codegen
; is shuffle + scalar addss + shuffle, per the CHECK lines.
290define <4 x float> @test12(<4 x float> %A, <4 x float> %B) {
291; SSE-LABEL: test12:
292; SSE:       # BB#0:
293; SSE-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
294; SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
295; SSE-NEXT:    addss %xmm0, %xmm1
296; SSE-NEXT:    movsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
297; SSE-NEXT:    retq
298;
299; AVX-LABEL: test12:
300; AVX:       # BB#0:
301; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
302; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
303; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
304; AVX-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
305; AVX-NEXT:    retq
306  %1 = extractelement <4 x float> %A, i32 1
307  %2 = extractelement <4 x float> %B, i32 1
308  %add = fadd float %1, %2
309  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
310  ret <4 x float> %vecinsert1
311}
312
; test13 (negative): single fadd on lane 3 only — no addsub. Expected codegen
; is shuffle + scalar addss + shuffle, per the CHECK lines.
313define <4 x float> @test13(<4 x float> %A, <4 x float> %B) {
314; SSE-LABEL: test13:
315; SSE:       # BB#0:
316; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
317; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
318; SSE-NEXT:    addss %xmm0, %xmm1
319; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
320; SSE-NEXT:    movaps %xmm1, %xmm0
321; SSE-NEXT:    retq
322;
323; AVX-LABEL: test13:
324; AVX:       # BB#0:
325; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
326; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
327; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
328; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
329; AVX-NEXT:    retq
330  %1 = extractelement <4 x float> %A, i32 3
331  %2 = extractelement <4 x float> %B, i32 3
332  %add = fadd float %1, %2
333  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 3
334  ret <4 x float> %vecinsert1
335}
336
; test14 (negative): two fsubs (lanes 0 and 2) but no fadd — an addsub needs
; sub-in-even AND add-in-odd lanes, so none is formed; scalar subss sequence
; expected instead, per the CHECK lines.
337define <4 x float> @test14(<4 x float> %A, <4 x float> %B) {
338; SSE-LABEL: test14:
339; SSE:       # BB#0:
340; SSE-NEXT:    movaps %xmm0, %xmm2
341; SSE-NEXT:    subss %xmm1, %xmm2
342; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
343; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
344; SSE-NEXT:    subss %xmm1, %xmm0
345; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
346; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1,1,3]
347; SSE-NEXT:    movaps %xmm2, %xmm0
348; SSE-NEXT:    retq
349;
350; AVX-LABEL: test14:
351; AVX:       # BB#0:
352; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm2
353; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
354; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
355; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
356; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
357; AVX-NEXT:    retq
358  %1 = extractelement <4 x float> %A, i32 0
359  %2 = extractelement <4 x float> %B, i32 0
360  %sub = fsub float %1, %2
361  %3 = extractelement <4 x float> %A, i32 2
362  %4 = extractelement <4 x float> %B, i32 2
363  %sub2 = fsub float %3, %4
364  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
365  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %sub2, i32 2
366  ret <4 x float> %vecinsert2
367}
368
; test15 (negative): two fadds (lanes 1 and 3) but no fsub — the mirror of
; test14, so again no addsub; scalar addss sequence expected per the CHECK lines.
369define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
370; SSE-LABEL: test15:
371; SSE:       # BB#0:
372; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
373; SSE-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
374; SSE-NEXT:    addss %xmm3, %xmm2
375; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
376; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
377; SSE-NEXT:    addss %xmm0, %xmm1
378; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
379; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0,2,1]
380; SSE-NEXT:    movaps %xmm2, %xmm0
381; SSE-NEXT:    retq
382;
383; AVX-LABEL: test15:
384; AVX:       # BB#0:
385; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
386; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
387; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
388; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
389; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
390; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
391; AVX-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm2[0,0,2,2]
392; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
393; AVX-NEXT:    retq
394  %1 = extractelement <4 x float> %A, i32 1
395  %2 = extractelement <4 x float> %B, i32 1
396  %add = fadd float %1, %2
397  %3 = extractelement <4 x float> %A, i32 3
398  %4 = extractelement <4 x float> %B, i32 3
399  %add2 = fadd float %3, %4
400  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
401  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
402  ret <4 x float> %vecinsert2
403}
404
; test16 (negative): shaped like test1 but with undef as the second operand of
; the lane-0 fsub and the lane-1 fadd (the extracts from %B in those lanes are
; dead). The undef operands break the A-op-B pattern, so no addsub is formed;
; the CHECK lines expect the long scalar shuffle/sub/add sequence.
405define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
406; SSE-LABEL: test16:
407; SSE:       # BB#0:
408; SSE-NEXT:    movaps %xmm0, %xmm2
409; SSE-NEXT:    subss %xmm0, %xmm2
410; SSE-NEXT:    movaps %xmm0, %xmm3
411; SSE-NEXT:    shufpd {{.*#+}} xmm3 = xmm3[1,0]
412; SSE-NEXT:    movapd %xmm1, %xmm4
413; SSE-NEXT:    shufpd {{.*#+}} xmm4 = xmm4[1,0]
414; SSE-NEXT:    subss %xmm4, %xmm3
415; SSE-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
416; SSE-NEXT:    addss %xmm0, %xmm4
417; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
418; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
419; SSE-NEXT:    addss %xmm0, %xmm1
420; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
421; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
422; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
423; SSE-NEXT:    movaps %xmm2, %xmm0
424; SSE-NEXT:    retq
425;
426; AVX-LABEL: test16:
427; AVX:       # BB#0:
428; AVX-NEXT:    vsubss %xmm0, %xmm0, %xmm2
429; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
430; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
431; AVX-NEXT:    vsubss %xmm4, %xmm3, %xmm3
432; AVX-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
433; AVX-NEXT:    vaddss %xmm0, %xmm4, %xmm4
434; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
435; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
436; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
437; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[2,3]
438; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
439; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
440; AVX-NEXT:    retq
441  %1 = extractelement <4 x float> %A, i32 0
442  %2 = extractelement <4 x float> %B, i32 0
443  %sub = fsub float %1, undef
444  %3 = extractelement <4 x float> %A, i32 2
445  %4 = extractelement <4 x float> %B, i32 2
446  %sub2 = fsub float %3, %4
447  %5 = extractelement <4 x float> %A, i32 1
448  %6 = extractelement <4 x float> %B, i32 1
449  %add = fadd float %5, undef
450  %7 = extractelement <4 x float> %A, i32 3
451  %8 = extractelement <4 x float> %B, i32 3
452  %add2 = fadd float %7, %8
453  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
454  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
455  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
456  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
457  ret <4 x float> %vecinsert4
458}
459
; test_v2f32: sub-128-bit vector case — fsub on lane 0, fadd on lane 1 of a
; <2 x float> (fadd written commuted: %v5 + %v4). The CHECK lines show the
; widened vector still folds to a single addsubps/vaddsubps.
460define <2 x float> @test_v2f32(<2 x float> %v0, <2 x float> %v1) {
461; SSE-LABEL: test_v2f32:
462; SSE:       # BB#0:
463; SSE-NEXT:    addsubps %xmm1, %xmm0
464; SSE-NEXT:    retq
465;
466; AVX-LABEL: test_v2f32:
467; AVX:       # BB#0:
468; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
469; AVX-NEXT:    retq
470  %v2 = extractelement <2 x float> %v0, i32 0
471  %v3 = extractelement <2 x float> %v1, i32 0
472  %v4 = extractelement <2 x float> %v0, i32 1
473  %v5 = extractelement <2 x float> %v1, i32 1
474  %sub = fsub float %v2, %v3
475  %add = fadd float %v5, %v4
476  %res0 = insertelement <2 x float> undef, float %sub, i32 0
477  %res1 = insertelement <2 x float> %res0, float %add, i32 1
478  ret <2 x float> %res1
479}
480