; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,SSE2,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,SSE41,X86-SSE41
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,SSE2,X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,SSE41,X64-SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512

; Ensure that the backend no longer emits unnecessary vector insert
; instructions immediately after SSE scalar fp instructions
; like addss or mulss. A sketch of the idiom follows.

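; Illustrative sketch (comments only, not checked by FileCheck): the tests
; below hand-build the extract/op/reinsert idiom that frontends typically
; emit for scalar intrinsics such as _mm_add_ss. For the add case:
;
;   %lo.b = extractelement <4 x float> %b, i32 0
;   %lo.a = extractelement <4 x float> %a, i32 0
;   %sum  = fadd float %lo.a, %lo.b
;   %res  = insertelement <4 x float> %a, float %sum, i32 0
;
; The %lo.*/%sum/%res names are illustrative. Only lane 0 changes, so a lone
; addss/vaddss (which passes lanes 1-3 of the destination through) suffices
; and no extra insert or shuffle should remain in the selected code.
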
define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %add = fadd float %2, %1
  %3 = insertelement <4 x float> %a, float %add, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %sub = fsub float %2, %1
  %3 = insertelement <4 x float> %a, float %sub, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %mul = fmul float %2, %1
  %3 = insertelement <4 x float> %a, float %mul, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %div = fdiv float %2, %1
  %3 = insertelement <4 x float> %a, float %div, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_sqrt_ss(<4 x float> %a) {
; SSE-LABEL: test_sqrt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_sqrt_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = call float @llvm.sqrt.f32(float %1)
  %3 = insertelement <4 x float> %a, float %2, i32 0
  ret <4 x float> %3
}
declare float @llvm.sqrt.f32(float)

define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %add = fadd double %2, %1
  %3 = insertelement <2 x double> %a, double %add, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %sub = fsub double %2, %1
  %3 = insertelement <2 x double> %a, double %sub, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %mul = fmul double %2, %1
  %3 = insertelement <2 x double> %a, double %mul, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %div = fdiv double %2, %1
  %3 = insertelement <2 x double> %a, double %div, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_sqrt_sd(<2 x double> %a) {
; SSE-LABEL: test_sqrt_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_sqrt_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = call double @llvm.sqrt.f64(double %1)
  %3 = insertelement <2 x double> %a, double %2, i32 0
  ret <2 x double> %3
}
declare double @llvm.sqrt.f64(double)

define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %add = fadd float %1, %2
  %3 = insertelement <4 x float> %b, float %add, i32 0
  ret <4 x float> %3
}

define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %sub = fsub float %2, %1
  %3 = insertelement <4 x float> %b, float %sub, i32 0
  ret <4 x float> %3
}

define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %mul = fmul float %1, %2
  %3 = insertelement <4 x float> %b, float %mul, i32 0
  ret <4 x float> %3
}

define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %div = fdiv float %2, %1
  %3 = insertelement <4 x float> %b, float %div, i32 0
  ret <4 x float> %3
}

define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %add = fadd double %1, %2
  %3 = insertelement <2 x double> %b, double %add, i32 0
  ret <2 x double> %3
}

define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %sub = fsub double %2, %1
  %3 = insertelement <2 x double> %b, double %sub, i32 0
  ret <2 x double> %3
}

define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %mul = fmul double %1, %2
  %3 = insertelement <2 x double> %b, double %mul, i32 0
  ret <2 x double> %3
}

define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %div = fdiv double %2, %1
  %3 = insertelement <2 x double> %b, double %div, i32 0
  ret <2 x double> %3
}

define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_multiple_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %add = fadd float %2, %1
  %add2 = fadd float %2, %add
  %3 = insertelement <4 x float> %a, float %add2, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subss %xmm1, %xmm2
; SSE-NEXT:    subss %xmm2, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_multiple_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %sub = fsub float %2, %1
  %sub2 = fsub float %2, %sub
  %3 = insertelement <4 x float> %a, float %sub2, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_multiple_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %mul = fmul float %2, %1
  %mul2 = fmul float %2, %mul
  %3 = insertelement <4 x float> %a, float %mul2, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    divss %xmm1, %xmm2
; SSE-NEXT:    divss %xmm2, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_multiple_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %div = fdiv float %2, %1
  %div2 = fdiv float %2, %div
  %3 = insertelement <4 x float> %a, float %div2, i32 0
  ret <4 x float> %3
}

; With SSE4.1 or greater, the shuffles in the following tests may
; be lowered to X86Blendi nodes. A sketch of the idiom follows.

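; Illustrative sketch (comments only, not checked by FileCheck): in these
; tests the scalar result is inserted into an undef vector and merged back
; with %a by a shufflevector, e.g. for the <4 x float> cases:
;
;   %ins  = insertelement <4 x float> undef, float %op, i32 0
;   %shuf = shufflevector <4 x float> %ins, <4 x float> %a,
;                         <4 x i32> <i32 0, i32 5, i32 6, i32 7>
;
; On SSE4.1+ that lane-0 merge can be represented as an X86Blendi node, so
; instruction selection must still fold the whole pattern into one scalar
; fp instruction rather than emitting a separate blend.
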
define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_add_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    addss %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_add_ss:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_add_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    addss %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_add_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <4 x float> %a, i32 0
  %op = fadd float %b, %ext
  %ins = insertelement <4 x float> undef, float %op, i32 0
  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_sub_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    subss %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_sub_ss:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_sub_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    subss %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_sub_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <4 x float> %a, i32 0
  %op = fsub float %ext, %b
  %ins = insertelement <4 x float> undef, float %op, i32 0
  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_mul_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    mulss %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_mul_ss:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_mul_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    mulss %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_mul_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <4 x float> %a, i32 0
  %op = fmul float %b, %ext
  %ins = insertelement <4 x float> undef, float %op, i32 0
  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_div_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    divss %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_div_ss:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_div_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    divss %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_div_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <4 x float> %a, i32 0
  %op = fdiv float %ext, %b
  %ins = insertelement <4 x float> undef, float %op, i32 0
  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_add_sd:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT:    addsd %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_add_sd:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_add_sd:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    addsd %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_add_sd:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <2 x double> %a, i32 0
  %op = fadd double %b, %ext
  %ins = insertelement <2 x double> undef, double %op, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuf
}

define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_sub_sd:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT:    subsd %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_sub_sd:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_sub_sd:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    subsd %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_sub_sd:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <2 x double> %a, i32 0
  %op = fsub double %ext, %b
  %ins = insertelement <2 x double> undef, double %op, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuf
}

define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_mul_sd:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT:    mulsd %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_mul_sd:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_mul_sd:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    mulsd %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_mul_sd:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <2 x double> %a, i32 0
  %op = fmul double %b, %ext
  %ins = insertelement <2 x double> undef, double %op, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuf
}

define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_div_sd:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT:    divsd %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_div_sd:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_div_sd:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    divsd %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_div_sd:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <2 x double> %a, i32 0
  %op = fdiv double %ext, %b
  %ins = insertelement <2 x double> undef, double %op, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuf
}

; Ensure that the backend selects SSE/AVX scalar fp instructions
; from a packed fp instruction plus a vector insert. A sketch of
; the idiom follows.

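; Illustrative sketch (comments only, not checked by FileCheck): here the
; scalar op is written as a packed op whose upper lanes are immediately
; discarded by a lane-0 shuffle, e.g.:
;
;   %v = fadd <4 x float> %a, %b
;   %r = shufflevector <4 x float> %v, <4 x float> %a,
;                      <4 x i32> <i32 0, i32 5, i32 6, i32 7>
;
; Since only lane 0 of the packed result survives, a single scalar
; instruction (addss, vaddss, etc.) should be selected instead of the
; packed form.
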
define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %a, %b
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %a, %b
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %a, %b
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %a, %b
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
  ret <2 x double> %2
}

define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %b, %a
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %b, %a
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %b, %a
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %b, %a
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
  ret <4 x float> %2
}

define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %b, %a
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %b, %a
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %b, %a
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %b, %a
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
  ret <2 x double> %2
}

define <4 x float> @insert_test5_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test5_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test5_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test5_sub_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subps %xmm0, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test5_sub_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    subps %xmm0, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test5_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test5_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test5_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test5_div_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    divps %xmm0, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test5_div_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    divps %xmm0, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <2 x double> @insert_test5_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test5_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test5_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: insert_test5_sub_sd:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subpd %xmm0, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test5_sub_sd:
; SSE41:       # %bb.0:
; SSE41-NEXT:    subpd %xmm0, %xmm1
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubpd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test5_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test5_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test5_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: insert_test5_div_sd:
; SSE2:       # %bb.0:
; SSE2-NEXT:    divpd %xmm0, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test5_div_sd:
; SSE41:       # %bb.0:
; SSE41-NEXT:    divpd %xmm0, %xmm1
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
; X86-SSE2-LABEL: add_ss_mask:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    jne .LBB70_1
; X86-SSE2-NEXT:  # %bb.2:
; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X86-SSE2-NEXT:    retl
; X86-SSE2-NEXT:  .LBB70_1:
; X86-SSE2-NEXT:    addss %xmm0, %xmm1
; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-SSE2-NEXT:    retl
;
; X86-SSE41-LABEL: add_ss_mask:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-SSE41-NEXT:    jne .LBB70_1
; X86-SSE41-NEXT:  # %bb.2:
; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X86-SSE41-NEXT:    retl
; X86-SSE41-NEXT:  .LBB70_1:
; X86-SSE41-NEXT:    addss %xmm0, %xmm1
; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-SSE41-NEXT:    retl
;
; X86-AVX1-LABEL: add_ss_mask:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT:    je .LBB70_2
; X86-AVX1-NEXT:  # %bb.1:
; X86-AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT:  .LBB70_2:
; X86-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X86-AVX1-NEXT:    retl
;
; X86-AVX512-LABEL: add_ss_mask:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-AVX512-NEXT:    kmovw %eax, %k1
; X86-AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm2 {%k1}
; X86-AVX512-NEXT:    vmovaps %xmm2, %xmm0
; X86-AVX512-NEXT:    retl
;
; X64-SSE2-LABEL: add_ss_mask:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    testb $1, %dil
; X64-SSE2-NEXT:    jne .LBB70_1
; X64-SSE2-NEXT:  # %bb.2:
; X64-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X64-SSE2-NEXT:    retq
; X64-SSE2-NEXT:  .LBB70_1:
; X64-SSE2-NEXT:    addss %xmm0, %xmm1
; X64-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-SSE2-NEXT:    retq
;
; X64-SSE41-LABEL: add_ss_mask:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    testb $1, %dil
; X64-SSE41-NEXT:    jne .LBB70_1
; X64-SSE41-NEXT:  # %bb.2:
; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X64-SSE41-NEXT:    retq
; X64-SSE41-NEXT:  .LBB70_1:
; X64-SSE41-NEXT:    addss %xmm0, %xmm1
; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-SSE41-NEXT:    retq
;
; X64-AVX1-LABEL: add_ss_mask:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    testb $1, %dil
; X64-AVX1-NEXT:    je .LBB70_2
; X64-AVX1-NEXT:  # %bb.1:
; X64-AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT:  .LBB70_2:
; X64-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X64-AVX1-NEXT:    retq
;
; X64-AVX512-LABEL: add_ss_mask:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    kmovw %edi, %k1
; X64-AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm2 {%k1}
; X64-AVX512-NEXT:    vmovaps %xmm2, %xmm0
; X64-AVX512-NEXT:    retq
  %1 = extractelement <4 x float> %a, i64 0
  %2 = extractelement <4 x float> %b, i64 0
  %3 = fadd float %1, %2
  %4 = extractelement <4 x float> %c, i32 0
  %5 = bitcast i8 %mask to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, float %3, float %4
  %8 = insertelement <4 x float> %a, float %7, i64 0
  ret <4 x float> %8
}

define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
; X86-SSE2-LABEL: add_sd_mask:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    jne .LBB71_1
; X86-SSE2-NEXT:  # %bb.2:
; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X86-SSE2-NEXT:    retl
; X86-SSE2-NEXT:  .LBB71_1:
; X86-SSE2-NEXT:    addsd %xmm0, %xmm1
; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE2-NEXT:    retl
;
; X86-SSE41-LABEL: add_sd_mask:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-SSE41-NEXT:    jne .LBB71_1
; X86-SSE41-NEXT:  # %bb.2:
; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; X86-SSE41-NEXT:    retl
; X86-SSE41-NEXT:  .LBB71_1:
; X86-SSE41-NEXT:    addsd %xmm0, %xmm1
; X86-SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE41-NEXT:    retl
;
; X86-AVX1-LABEL: add_sd_mask:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT:    je .LBB71_2
; X86-AVX1-NEXT:  # %bb.1:
; X86-AVX1-NEXT:    vaddsd %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT:  .LBB71_2:
; X86-AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X86-AVX1-NEXT:    retl
;
; X86-AVX512-LABEL: add_sd_mask:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-AVX512-NEXT:    kmovw %eax, %k1
; X86-AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm2 {%k1}
; X86-AVX512-NEXT:    vmovapd %xmm2, %xmm0
; X86-AVX512-NEXT:    retl
;
; X64-SSE2-LABEL: add_sd_mask:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    testb $1, %dil
; X64-SSE2-NEXT:    jne .LBB71_1
; X64-SSE2-NEXT:  # %bb.2:
; X64-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X64-SSE2-NEXT:    retq
; X64-SSE2-NEXT:  .LBB71_1:
; X64-SSE2-NEXT:    addsd %xmm0, %xmm1
; X64-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-SSE2-NEXT:    retq
;
; X64-SSE41-LABEL: add_sd_mask:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    testb $1, %dil
; X64-SSE41-NEXT:    jne .LBB71_1
; X64-SSE41-NEXT:  # %bb.2:
; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; X64-SSE41-NEXT:    retq
; X64-SSE41-NEXT:  .LBB71_1:
; X64-SSE41-NEXT:    addsd %xmm0, %xmm1
; X64-SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-SSE41-NEXT:    retq
;
; X64-AVX1-LABEL: add_sd_mask:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    testb $1, %dil
; X64-AVX1-NEXT:    je .LBB71_2
; X64-AVX1-NEXT:  # %bb.1:
; X64-AVX1-NEXT:    vaddsd %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT:  .LBB71_2:
; X64-AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X64-AVX1-NEXT:    retq
;
; X64-AVX512-LABEL: add_sd_mask:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    kmovw %edi, %k1
; X64-AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm2 {%k1}
; X64-AVX512-NEXT:    vmovapd %xmm2, %xmm0
; X64-AVX512-NEXT:    retq
  %1 = extractelement <2 x double> %a, i64 0
  %2 = extractelement <2 x double> %b, i64 0
  %3 = fadd double %1, %2
  %4 = extractelement <2 x double> %c, i32 0
  %5 = bitcast i8 %mask to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %3, double %4
  %8 = insertelement <2 x double> %a, double %7, i64 0
  ret <2 x double> %8
}