; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64

define <4 x i64> @A(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: A:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl (%eax), %ecx
; X32-NEXT:    movl 4(%eax), %eax
; X32-NEXT:    vmovd %ecx, %xmm0
; X32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: A:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load i64, i64* %ptr, align 8
  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
  ret <4 x i64> %vecinit6.i
}

define <4 x i64> @A2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: A2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl (%ecx), %edx
; X32-NEXT:    movl 4(%ecx), %ecx
; X32-NEXT:    movl %ecx, 4(%eax)
; X32-NEXT:    movl %edx, (%eax)
; X32-NEXT:    vmovd %edx, %xmm0
; X32-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: A2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    vmovq %rax, %xmm0
; X64-NEXT:    movq %rax, (%rsi)
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %q = load i64, i64* %ptr, align 8
  store i64 %q, i64* %ptr2, align 8 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
  ret <4 x i64> %vecinit6.i
}

define <8 x i32> @B(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: B:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: B:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load i32, i32* %ptr, align 4
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
  ret <8 x i32> %vecinit6.i
}

define <8 x i32> @B2(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: B2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: B2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load i32, i32* %ptr, align 4
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
  %vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
  %vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
  %vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
  %vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
  ret <8 x i32> %vecinit14.i
}

define <8 x i32> @B3(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: B3:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl (%ecx), %ecx
; X32-NEXT:    vmovd %ecx, %xmm0
; X32-NEXT:    movl %ecx, (%eax)
; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: B3:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    movl %eax, (%rsi)
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %q = load i32, i32* %ptr, align 4
  store i32 %q, i32* %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
  %vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
  %vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
  %vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
  %vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
  ret <8 x i32> %vecinit14.i
}

define <4 x double> @C(double* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: C:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: C:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load double, double* %ptr, align 8
  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
  ret <4 x double> %vecinit6.i
}

define <4 x double> @C2(double* %ptr, double* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: C2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    vmovsd %xmm0, (%eax)
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: C2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    vmovsd %xmm0, (%rsi)
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %q = load double, double* %ptr, align 8
  store double %q, double* %ptr2, align 8 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
  ret <4 x double> %vecinit6.i
}

define <8 x float> @D(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: D:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: D:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
  ret <8 x float> %vecinit6.i
}

define <8 x float> @D2(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: D2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: D2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
  %vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
  %vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
  %vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
  %vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
  ret <8 x float> %vecinit14.i
}

define <8 x float> @D3(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: D3:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vmovss %xmm0, (%eax)
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: D3:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    vmovss %xmm0, (%rsi)
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  store float %q, float* %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
  %vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
  %vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
  %vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
  %vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
  ret <8 x float> %vecinit14.i
}

;;;; 128-bit versions

define <4 x float> @e(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: e:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: e:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
  ret <4 x float> %vecinit6.i
}

define <4 x float> @e2(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: e2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vmovss %xmm0, (%eax)
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: e2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    vmovss %xmm0, (%rsi)
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  store float %q, float* %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
  ret <4 x float> %vecinit6.i
}

; Don't broadcast constants on pre-AVX2 hardware.
define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: _e2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [-7.812500e-03,-7.812500e-03,-7.812500e-03,-7.812500e-03]
; X32-NEXT:    retl
;
; X64-LABEL: _e2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovaps {{.*#+}} xmm0 = [-7.812500e-03,-7.812500e-03,-7.812500e-03,-7.812500e-03]
; X64-NEXT:    retq
entry:
  %vecinit.i = insertelement <4 x float> undef, float        0xbf80000000000000, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float  0xbf80000000000000, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float 0xbf80000000000000, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float 0xbf80000000000000, i32 3
  ret <4 x float> %vecinit6.i
}


define <4 x i32> @F(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: F:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: F:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %q = load i32, i32* %ptr, align 4
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
  ret <4 x i32> %vecinit6.i
}

define <4 x i32> @F2(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: F2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl (%ecx), %ecx
; X32-NEXT:    movl %ecx, (%eax)
; X32-NEXT:    vmovd %ecx, %xmm0
; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: F2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movl (%rdi), %eax
; X64-NEXT:    movl %eax, (%rsi)
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    retq
entry:
  %q = load i32, i32* %ptr, align 4
  store i32 %q, i32* %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
  %vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
  %vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
  %vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
  ret <4 x i32> %vecinit6.i
}

; FIXME: Pointer adjusted broadcasts

define <4 x i32> @load_splat_4i32_4i32_1111(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i32_4i32_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4i32_4i32_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
; X64-NEXT:    retq
entry:
  %ld = load <4 x i32>, <4 x i32>* %ptr
  %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %ret
}

define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8i32_4i32_33333333:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss 12(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_8i32_4i32_33333333:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 12(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x i32>, <4 x i32>* %ptr
  %ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x i32> %ret
}

define <8 x i32> @load_splat_8i32_8i32_55555555(<8 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8i32_8i32_55555555:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss 20(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_8i32_8i32_55555555:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 20(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <8 x i32>, <8 x i32>* %ptr
  %ret = shufflevector <8 x i32> %ld, <8 x i32> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x i32> %ret
}

define <4 x float> @load_splat_4f32_4f32_1111(<4 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f32_4f32_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss 4(%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4f32_4f32_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 4(%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x float>, <4 x float>* %ptr
  %ret = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x float> %ret
}

define <8 x float> @load_splat_8f32_4f32_33333333(<4 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8f32_4f32_33333333:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss 12(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_8f32_4f32_33333333:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 12(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x float>, <4 x float>* %ptr
  %ret = shufflevector <4 x float> %ld, <4 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  ret <8 x float> %ret
}

define <8 x float> @load_splat_8f32_8f32_55555555(<8 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8f32_8f32_55555555:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss 20(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_8f32_8f32_55555555:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss 20(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <8 x float>, <8 x float>* %ptr
  %ret = shufflevector <8 x float> %ld, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x float> %ret
}

define <2 x i64> @load_splat_2i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_2i64_2i64_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_2i64_2i64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; X64-NEXT:    retq
entry:
  %ld = load <2 x i64>, <2 x i64>* %ptr
  %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x i64> %ret
}

define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i64_2i64_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd 8(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4i64_2i64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <2 x i64>, <2 x i64>* %ptr
  %ret = shufflevector <2 x i64> %ld, <2 x i64> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i64> %ret
}

define <4 x i64> @load_splat_4i64_4i64_2222(<4 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i64_4i64_2222:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd 16(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4i64_4i64_2222:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x i64>, <4 x i64>* %ptr
  %ret = shufflevector <4 x i64> %ld, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  ret <4 x i64> %ret
}

define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_2f64_2f64_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_2f64_2f64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
entry:
  %ld = load <2 x double>, <2 x double>* %ptr
  %ret = shufflevector <2 x double> %ld, <2 x double> undef, <2 x i32> <i32 1, i32 1>
  ret <2 x double> %ret
}

define <4 x double> @load_splat_4f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f64_2f64_1111:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd 8(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4f64_2f64_1111:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <2 x double>, <2 x double>* %ptr
  %ret = shufflevector <2 x double> %ld, <2 x double> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  ret <4 x double> %ret
}

define <4 x double> @load_splat_4f64_4f64_2222(<4 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f64_4f64_2222:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd 16(%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: load_splat_4f64_4f64_2222:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT:    retq
entry:
  %ld = load <4 x double>, <4 x double>* %ptr
  %ret = shufflevector <4 x double> %ld, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  ret <4 x double> %ret
}

; Unsupported vbroadcasts

define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: G:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl (%eax), %ecx
; X32-NEXT:    movl 4(%eax), %eax
; X32-NEXT:    vmovd %ecx, %xmm0
; X32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: G:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT:    retq
entry:
  %q = load i64, i64* %ptr, align 8
  %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
  ret <2 x i64> %vecinit2.i
}

define <2 x i64> @G2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: G2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl (%ecx), %edx
; X32-NEXT:    movl 4(%ecx), %ecx
; X32-NEXT:    movl %ecx, 4(%eax)
; X32-NEXT:    movl %edx, (%eax)
; X32-NEXT:    vmovd %edx, %xmm0
; X32-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
; X32-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: G2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    movq %rax, (%rsi)
; X64-NEXT:    vmovq %rax, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT:    retq
entry:
  %q = load i64, i64* %ptr, align 8
  store i64 %q, i64* %ptr2, align 8 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
  ret <2 x i64> %vecinit2.i
}

define <4 x i32> @H(<4 x i32> %a) {
; X32-LABEL: H:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: H:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X64-NEXT:    retq
entry:
  %x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  ret <4 x i32> %x
}

define <2 x double> @I(double* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: I:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: I:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
entry:
  %q = load double, double* %ptr, align 4
  %vecinit.i = insertelement <2 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
  ret <2 x double> %vecinit2.i
}

define <2 x double> @I2(double* %ptr, double* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: I2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT:    vmovsd %xmm0, (%eax)
; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: I2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT:    vmovsd %xmm0, (%rsi)
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    retq
entry:
  %q = load double, double* %ptr, align 4
  store double %q, double* %ptr2, align 4 ; to create a chain to prevent broadcast
  %vecinit.i = insertelement <2 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
  ret <2 x double> %vecinit2.i
}

define <4 x float> @_RR(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
; X32-LABEL: _RR:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vbroadcastss (%ecx), %xmm0
; X32-NEXT:    movl (%eax), %eax
; X32-NEXT:    movl %eax, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: _RR:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    movl (%rsi), %eax
; X64-NEXT:    movl %eax, (%rax)
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
  ; force a chain
  %j = load i32, i32* %k, align 4
  store i32 %j, i32* undef
  ret <4 x float> %vecinit6.i
}

define <4 x float> @_RR2(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
; X32-LABEL: _RR2:
; X32:       ## %bb.0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: _RR2:
; X64:       ## %bb.0: ## %entry
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
entry:
  %q = load float, float* %ptr, align 4
  %v = insertelement <4 x float> undef, float %q, i32 0
  %t = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %t
}

; These tests check that a vbroadcast instruction is used when we have a splat
; formed from a concat_vectors (via the shufflevector) of two BUILD_VECTORs
; (via the insertelements).

define <8 x float> @splat_concat1(float* %p) {
; X32-LABEL: splat_concat1:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: splat_concat1:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load float, float* %p, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %3 = insertelement <4 x float> %2, float %1, i32 1
  %4 = insertelement <4 x float> %3, float %1, i32 2
  %5 = insertelement <4 x float> %4, float %1, i32 3
  %6 = shufflevector <4 x float> %5, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %6
}

define <8 x float> @splat_concat2(float* %p) {
; X32-LABEL: splat_concat2:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastss (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: splat_concat2:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load float, float* %p, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %3 = insertelement <4 x float> %2, float %1, i32 1
  %4 = insertelement <4 x float> %3, float %1, i32 2
  %5 = insertelement <4 x float> %4, float %1, i32 3
  %6 = insertelement <4 x float> undef, float %1, i32 0
  %7 = insertelement <4 x float> %6, float %1, i32 1
  %8 = insertelement <4 x float> %7, float %1, i32 2
  %9 = insertelement <4 x float> %8, float %1, i32 3
  %10 = shufflevector <4 x float> %5, <4 x float> %9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %10
}

define <4 x double> @splat_concat3(double* %p) {
; X32-LABEL: splat_concat3:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: splat_concat3:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load double, double* %p, align 8
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = insertelement <2 x double> %2, double %1, i32 1
  %4 = shufflevector <2 x double> %3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %4
}

define <4 x double> @splat_concat4(double* %p) {
; X32-LABEL: splat_concat4:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: splat_concat4:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load double, double* %p, align 8
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = insertelement <2 x double> %2, double %1, i32 1
  %4 = insertelement <2 x double> undef, double %1, i32 0
  %5 = insertelement <2 x double> %2, double %1, i32 1
  %6 = shufflevector <2 x double> %3, <2 x double> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %6
}

; PR34041
define <4 x double> @broadcast_shuffle_1000(double* %p) {
; X32-LABEL: broadcast_shuffle_1000:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: broadcast_shuffle_1000:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load double, double* %p
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  ret <4 x double> %3
}

define <4 x double> @broadcast_shuffle1032(double* %p) {
; X32-LABEL: broadcast_shuffle1032:
; X32:       ## %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastsd (%eax), %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: broadcast_shuffle1032:
; X64:       ## %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %1 = load double, double* %p
  %2 = insertelement <2 x double> undef, double %1, i32 1
  %3 = insertelement <2 x double> undef, double %1, i32 0
  %4 = shufflevector <2 x double> %2, <2 x double> %3, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x double> %4
}

;
; When VBROADCAST replaces an existing load, ensure it still respects lifetime dependencies.
;
define float @broadcast_lifetime() nounwind {
; X32-LABEL: broadcast_lifetime:
; X32:       ## %bb.0:
; X32-NEXT:    pushl %esi
; X32-NEXT:    subl $40, %esp
; X32-NEXT:    leal {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %esi, (%esp)
; X32-NEXT:    calll _gfunc
; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp) ## 4-byte Spill
; X32-NEXT:    movl %esi, (%esp)
; X32-NEXT:    calll _gfunc
; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0 ## 4-byte Folded Reload
; X32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    flds {{[0-9]+}}(%esp)
; X32-NEXT:    addl $40, %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    retl
;
; X64-LABEL: broadcast_lifetime:
; X64:       ## %bb.0:
; X64-NEXT:    subq $40, %rsp
; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT:    callq _gfunc
; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    vmovss %xmm0, {{[0-9]+}}(%rsp) ## 4-byte Spill
; X64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
; X64-NEXT:    callq _gfunc
; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    vsubss {{[0-9]+}}(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
; X64-NEXT:    addq $40, %rsp
; X64-NEXT:    retq
  %1 = alloca <4 x float>, align 16
  %2 = alloca <4 x float>, align 16
  %3 = bitcast <4 x float>* %1 to i8*
  %4 = bitcast <4 x float>* %2 to i8*

  call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
  call void @gfunc(<4 x float>* %1)
  %5 = load <4 x float>, <4 x float>* %1, align 16
  call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)

  call void @llvm.lifetime.start.p0i8(i64 16, i8* %4)
  call void @gfunc(<4 x float>* %2)
  %6 = load <4 x float>, <4 x float>* %2, align 16
  call void @llvm.lifetime.end.p0i8(i64 16, i8* %4)

  %7 = extractelement <4 x float> %5, i32 1
  %8 = extractelement <4 x float> %6, i32 1
  %9 = fsub float %8, %7
  ret float %9
}

declare void @gfunc(<4 x float>*)
declare void @llvm.lifetime.start.p0i8(i64, i8*)
declare void @llvm.lifetime.end.p0i8(i64, i8*)