; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512

;
; Subvector Load + Broadcast
;

define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
; X86-LABEL: test_broadcast_2f64_4f64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_2f64_4f64:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
 %1 = load <2 x double>, <2 x double> *%p
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x double> %2
}

define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_2f64_8f64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_2f64_8f64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <2 x double>, <2 x double> *%p
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ret <8 x double> %2
}

define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4f64_8f64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4f64_8f64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x double>, <4 x double> *%p
 %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x double> %2
}

define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_2i64_4i64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_2i64_4i64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_4i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = load <2 x i64>, <2 x i64> *%p
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x i64> %2
}

define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_2i64_8i64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_2i64_8i64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <2 x i64>, <2 x i64> *%p
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ret <8 x i64> %2
}

define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4i64_8i64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i64_8i64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i64>, <4 x i64> *%p
 %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X86-LABEL: test_broadcast_4f32_8f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
 %1 = load <4 x float>, <4 x float> *%p
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x float> %2
}

define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4f32_16f32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4f32_16f32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x float>, <4 x float> *%p
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x float> %2
}

define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8f32_16f32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8f32_16f32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <8 x float>, <8 x float> *%p
 %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4i32_8i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_8i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i32>, <4 x i32> *%p
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %2
}

define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4i32_16i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_16i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i32>, <4 x i32> *%p
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x i32> %2
}

define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8i32_16i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8i32_16i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <8 x i32>, <8 x i32> *%p
 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8i16_16i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8i16_16i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = load <8 x i16>, <8 x i16> *%p
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i16> %2
}

define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <8 x i16>, <8 x i16> *%p
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <32 x i16> %2
}

define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_16i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_16i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <16 x i16>, <16 x i16> *%p
 %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_16i8_32i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_16i8_32i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = load <16 x i8>, <16 x i8> *%p
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i8> %2
}

define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_16i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_16i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <16 x i8>, <16 x i8> *%p
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <64 x i8> %2
}

define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_32i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_32i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_32i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <32 x i8>, <32 x i8> *%p
 %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ret <64 x i8> %2
}

;
; Subvector Load + Broadcast + Store
;

define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
; X86-LABEL: test_broadcast_2f64_4f64_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovaps (%ecx), %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_2f64_4f64_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <2 x double>, <2 x double>* %p0
 store <2 x double> %1, <2 x double>* %p1
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x double> %2
}

define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
; X86-LABEL: test_broadcast_2i64_4i64_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovaps (%ecx), %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_2i64_4i64_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <2 x i64>, <2 x i64>* %p0
 store <2 x i64> %1, <2 x i64>* %p1
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
; X86-LABEL: test_broadcast_4f32_8f32_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovaps (%ecx), %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <4 x float>, <4 x float>* %p0
 store <4 x float> %1, <4 x float>* %p1
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
; X86-LABEL: test_broadcast_4i32_8i32_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovaps (%ecx), %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_4i32_8i32_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <4 x i32>, <4 x i32>* %p0
 store <4 x i32> %1, <4 x i32>* %p1
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
; X86-LABEL: test_broadcast_8i16_16i16_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovaps (%ecx), %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_8i16_16i16_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <8 x i16>, <8 x i16> *%p0
 store <8 x i16> %1, <8 x i16>* %p1
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
; X86-LABEL: test_broadcast_16i8_32i8_reuse:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovaps (%ecx), %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_16i8_32i8_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <16 x i8>, <16 x i8> *%p0
 store <16 x i8> %1, <16 x i8>* %p1
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i8> %2
}

;
; Subvector Load + Broadcast with Separate Store
;

define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; X86-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm1, (%eax)
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    vmovaps %xmm1, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x float> zeroinitializer, <4 x float>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; X86-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm1, (%eax)
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    vmovaps %xmm1, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x float> zeroinitializer, <4 x float>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i32> %2
}

;
; subvector Load with multiple uses + broadcast
; Fallback to the broadcast should be done
;

@ga4 = global <4 x i64> zeroinitializer, align 8
@gb4 = global <8 x i64> zeroinitializer, align 8

; The same <1,2,3,4> i64 pattern is used by the <4 x i64> add and twice by the
; <8 x i64> add/and. Codegen should materialize the constant once and reuse or
; widen it in registers (e.g. AVX512 builds the zmm via vinserti64x4 from the
; ymm constant) instead of reloading it at each width.
define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
; X86-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X86-AVX1:       # %bb.0: # %entry
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,2,0]
; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm4
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [3,0,4,0]
; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
; X86-AVX1-NEXT:    vmovaps {{.*#+}} ymm6 = [1,0,2,0,3,0,4,0]
; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm7, %xmm7
; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm2, %ymm2
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm7, %xmm5
; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
; X86-AVX1-NEXT:    vandps %ymm6, %ymm1, %ymm1
; X86-AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
; X86-AVX1-NEXT:    vmovdqu %xmm0, ga4+16
; X86-AVX1-NEXT:    vmovdqu %xmm4, ga4
; X86-AVX1-NEXT:    vmovups %ymm2, gb4+32
; X86-AVX1-NEXT:    vmovups %ymm1, gb4
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X86-AVX2:       # %bb.0: # %entry
; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
; X86-AVX2-NEXT:    vmovdqu %ymm0, ga4
; X86-AVX2-NEXT:    vmovdqu %ymm2, gb4+32
; X86-AVX2-NEXT:    vmovdqu %ymm1, gb4
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
;
; X86-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X86-AVX512:       # %bb.0: # %entry
; X86-AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,0,2,0,3,0,4,0]
; X86-AVX512-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
; X86-AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT:    vmovdqu %ymm0, ga4
; X86-AVX512-NEXT:    vmovdqu64 %zmm1, gb4
; X86-AVX512-NEXT:    vzeroupper
; X86-AVX512-NEXT:    retl
;
; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,2]
; X64-AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm4
; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [3,4]
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
; X64-AVX1-NEXT:    vmovaps {{.*#+}} ymm6 = [1,2,3,4]
; X64-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm7, %xmm7
; X64-AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm2, %ymm2
; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm7, %xmm5
; X64-AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
; X64-AVX1-NEXT:    vandps %ymm6, %ymm1, %ymm1
; X64-AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
; X64-AVX1-NEXT:    vmovdqu %xmm0, ga4+{{.*}}(%rip)
; X64-AVX1-NEXT:    vmovdqu %xmm4, {{.*}}(%rip)
; X64-AVX1-NEXT:    vmovups %ymm2, gb4+{{.*}}(%rip)
; X64-AVX1-NEXT:    vmovups %ymm1, {{.*}}(%rip)
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX2:       # %bb.0: # %entry
; X64-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT:    vmovdqu %ymm0, {{.*}}(%rip)
; X64-AVX2-NEXT:    vmovdqu %ymm2, gb4+{{.*}}(%rip)
; X64-AVX2-NEXT:    vmovdqu %ymm1, {{.*}}(%rip)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX512:       # %bb.0: # %entry
; X64-AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,3,4]
; X64-AVX512-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
; X64-AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vmovdqu %ymm0, {{.*}}(%rip)
; X64-AVX512-NEXT:    vmovdqu64 %zmm1, {{.*}}(%rip)
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
entry:
  %0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
  %1 = add <8 x i64> %b, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
  %2 = and <8 x i64> %1, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
  store <4 x i64> %0, <4 x i64>* @ga4, align 8
  store <8 x i64> %2, <8 x i64>* @gb4, align 8
  ret void
}
841
842
; Store destinations for fallback_broadcast_v4f64_to_v8f64 below
; (align 8, so the checked stores are unaligned vmovupd forms).
@ga2 = global <4 x double> zeroinitializer, align 8
@gb2 = global <8 x double> zeroinitializer, align 8
845
; FP variant of the fallback test: the <1.0,2.0,3.0,4.0> pattern feeds the
; <4 x double> fadd and twice the <8 x double> fadd/fdiv. The ymm constant is
; loaded once and reused (AVX512 widens it to zmm with vinsertf64x4).
define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
; X86-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; X86-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
; X86-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
; X86-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; X86-AVX-NEXT:    vdivpd %ymm3, %ymm1, %ymm1
; X86-AVX-NEXT:    vdivpd %ymm3, %ymm2, %ymm2
; X86-AVX-NEXT:    vmovupd %ymm0, ga2
; X86-AVX-NEXT:    vmovupd %ymm2, gb2+32
; X86-AVX-NEXT:    vmovupd %ymm1, gb2
; X86-AVX-NEXT:    vzeroupper
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X86-AVX512:       # %bb.0: # %entry
; X86-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; X86-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
; X86-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
; X86-AVX512-NEXT:    vmovupd %ymm0, ga2
; X86-AVX512-NEXT:    vmovupd %zmm1, gb2
; X86-AVX512-NEXT:    vzeroupper
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT:    vdivpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT:    vdivpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT:    vmovupd %ymm0, {{.*}}(%rip)
; X64-AVX-NEXT:    vmovupd %ymm2, gb2+{{.*}}(%rip)
; X64-AVX-NEXT:    vmovupd %ymm1, {{.*}}(%rip)
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX512:       # %bb.0: # %entry
; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; X64-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
; X64-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vmovupd %ymm0, {{.*}}(%rip)
; X64-AVX512-NEXT:    vmovupd %zmm1, {{.*}}(%rip)
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
entry:
  %0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0>
  %1 = fadd <8 x double> %b, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
  %2 = fdiv <8 x double> %1, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
  store <4 x double> %0, <4 x double>* @ga2, align 8
  store <8 x double> %2, <8 x double>* @gb2, align 8
  ret void
}
906
907;
908; Subvector Broadcast from register
909;
910
; Splat a <2 x double> register into both 128-bit lanes of a ymm:
; a single vinsertf128 of xmm0 into the upper half.
define <4 x double> @reg_broadcast_2f64_4f64(<2 x double> %a0) nounwind {
; X86-LABEL: reg_broadcast_2f64_4f64:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_2f64_4f64:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x double> %1
}
926
; Splat a <2 x double> register to 512 bits: AVX builds one ymm splat and
; copies it to ymm1 (v8f64 returned in two ymms); AVX512 widens the ymm splat
; into a zmm with vinsertf64x4.
define <8 x double> @reg_broadcast_2f64_8f64(<2 x double> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_2f64_8f64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_2f64_8f64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_2f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_2f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ret <8 x double> %1
}
958
; Splat a <4 x double> register to 512 bits: AVX just duplicates ymm0 into
; ymm1; AVX512 inserts ymm0 into the upper half of zmm0.
define <8 x double> @reg_broadcast_4f64_8f64(<4 x double> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_4f64_8f64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_4f64_8f64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <4 x double> %a0, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x double> %1
}
984
; Integer variant of the 128->256 register splat; the same float-domain
; vinsertf128 is used for <2 x i64>.
define <4 x i64> @reg_broadcast_2i64_4i64(<2 x i64> %a0) nounwind {
; X86-LABEL: reg_broadcast_2i64_4i64:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_2i64_4i64:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x i64> %1
}
1000
; Splat a <2 x i64> register to 512 bits: ymm splat + ymm copy on AVX,
; vinsertf128 + vinsertf64x4 on AVX512.
define <8 x i64> @reg_broadcast_2i64_8i64(<2 x i64> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_2i64_8i64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_2i64_8i64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_2i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_2i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ret <8 x i64> %1
}
1032
; Splat a <4 x i64> register to 512 bits: ymm copy on AVX, vinsertf64x4 on
; AVX512.
define <8 x i64> @reg_broadcast_4i64_8i64(<4 x i64> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_4i64_8i64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_4i64_8i64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i64> %1
}
1058
; Splat a <4 x float> register into both 128-bit lanes of a ymm with a single
; vinsertf128.
define <8 x float> @reg_broadcast_4f32_8f32(<4 x float> %a0) nounwind {
; X86-LABEL: reg_broadcast_4f32_8f32:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_4f32_8f32:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x float> %1
}
1074
; Splat a <4 x float> register to 512 bits: ymm splat + ymm copy on AVX,
; vinsertf128 + vinsertf64x4 on AVX512.
define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_4f32_16f32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_4f32_16f32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x float> %1
}
1106
; Splat an <8 x float> register to 512 bits: ymm copy on AVX, vinsertf64x4 on
; AVX512.
define <16 x float> @reg_broadcast_8f32_16f32(<8 x float> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_8f32_16f32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_8f32_16f32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x float> %1
}
1132
; Splat a <4 x i32> register into both 128-bit lanes of a ymm with a single
; vinsertf128.
define <8 x i32> @reg_broadcast_4i32_8i32(<4 x i32> %a0) nounwind {
; X86-LABEL: reg_broadcast_4i32_8i32:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_4i32_8i32:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %1
}
1148
; Splat a <4 x i32> register to 512 bits: ymm splat + ymm copy on AVX,
; vinsertf128 + vinsertf64x4 on AVX512.
define <16 x i32> @reg_broadcast_4i32_16i32(<4 x i32> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_4i32_16i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_4i32_16i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x i32> %1
}
1180
; Splat an <8 x i32> register to 512 bits: ymm copy on AVX, vinsertf64x4 on
; AVX512.
define <16 x i32> @reg_broadcast_8i32_16i32(<8 x i32> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_8i32_16i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_8i32_16i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_8i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i32> %1
}
1206
; Splat an <8 x i16> register into both 128-bit lanes of a ymm with a single
; vinsertf128.
define <16 x i16> @reg_broadcast_8i16_16i16(<8 x i16> %a0) nounwind {
; X86-LABEL: reg_broadcast_8i16_16i16:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_8i16_16i16:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i16> %1
}
1222
; Splat an <8 x i16> register to 512 bits: ymm splat + ymm copy on AVX,
; vinsertf128 + vinsertf64x4 on AVX512.
define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_8i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_8i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <32 x i16> %1
}
1254
; Splat a <16 x i16> register to 512 bits: ymm copy on AVX, vinsertf64x4 on
; AVX512.
define <32 x i16> @reg_broadcast_16i16_32i16(<16 x i16> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_16i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_16i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i16> %1
}
1280
; Splat a <16 x i8> register into both 128-bit lanes of a ymm with a single
; vinsertf128.
define <32 x i8> @reg_broadcast_16i8_32i8(<16 x i8> %a0) nounwind {
; X86-LABEL: reg_broadcast_16i8_32i8:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_16i8_32i8:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i8> %1
}
1296
; Splat a <16 x i8> register to 512 bits: ymm splat + ymm copy on AVX,
; vinsertf128 + vinsertf64x4 on AVX512.
define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_16i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_16i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_16i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_16i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <64 x i8> %1
}
1328
; Splat a <32 x i8> register to 512 bits: ymm copy on AVX, vinsertf64x4 on
; AVX512.
define <64 x i8> @reg_broadcast_32i8_64i8(<32 x i8> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_32i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_32i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ret <64 x i8> %1
}
1354
1355;
1356; PR34394
1357;
1358
; PR34394: splatting a loaded <2 x i32> (one 64-bit element) into <4 x i32>
; should be a single vmovddup from memory.
define <4 x i32> @test_2xi32_to_4xi32_mem(<2 x i32>* %vp) {
; X86-LABEL: test_2xi32_to_4xi32_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: test_2xi32_to_4xi32_mem:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
  %vec = load <2 x i32>, <2 x i32>* %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i32> %res
}
1374
; PR34394: splatting a loaded <2 x i32> across a ymm should be a single
; 64-bit element broadcast (vbroadcastsd) from memory.
define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
; X86-LABEL: test_2xi32_to_8xi32_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_2xi32_to_8xi32_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %vec = load <2 x i32>, <2 x i32>* %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x i32> %res
}
1390
; PR34394: splatting a loaded <2 x i32> to 512 bits; AVX512 uses a single zmm
; vbroadcastsd, AVX uses a ymm vbroadcastsd duplicated into ymm1.
define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) {
; X86-AVX-LABEL: test_2xi32_to_16xi32_mem:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_2xi32_to_16xi32_mem:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastsd (%eax), %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_2xi32_to_16xi32_mem:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_2xi32_to_16xi32_mem:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastsd (%rdi), %zmm0
; X64-AVX512-NEXT:    retq
  %vec = load <2 x i32>, <2 x i32>* %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <16 x i32> %res
}
1419
1420;
1421; PR34041
1422;
1423
; PR34041: only element 0 of %vec is defined, so mask <1,0,0,0> yields
; <undef, s, s, s> and a plain vbroadcastsd of the scalar load suffices.
define <4 x double> @broadcast_v4f64_f64_u000(double* %p) {
; X86-LABEL: broadcast_v4f64_f64_u000:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v4f64_f64_u000:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %s = load double, double* %p
  %vec = insertelement <2 x double> undef, double %s, i32 0
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  ret <4 x double> %res
}
1440
; PR34041: after the select only lanes 1 (undef) and 3 (%vec[1]) come from the
; shuffle, so the load is inserted into the upper ymm lane (vinsertf128) and
; blended with %default.
define <4 x double> @broadcast_v4f64_v2f64_4u61(<2 x double>* %vp, <4 x double> %default) {
; X86-LABEL: broadcast_v4f64_v2f64_4u61:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vinsertf128 $1, (%eax), %ymm0, %ymm1
; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v4f64_v2f64_4u61:
; X64:       # %bb.0:
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; X64-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 1>
  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %default
  ret <4 x double> %res
}
1459
; Broadcast of a <2 x float> (64-bit) subvector, partially merged with a
; default vector. The select keeps %default only in lane 6 (the 'E' in the
; test name); the remaining defined lanes come from the broadcast of the two
; loaded floats. The lowering broadcasts the 64-bit pair with vbroadcastsd
; and then uses vshufpd (operating on 64-bit halves) to splice in the
; default's lane pair where needed, leaving undef lanes unconstrained.
define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(<2 x float>* %vp, <8 x float> %default) {
; X86-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm1
; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm1
; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
; X64-NEXT:    retq
  %vec = load <2 x float>, <2 x float>* %vp
  %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 0, i32 2, i32 3, i32 undef>
  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %default
  ret <8 x float> %res
}
1478
; 128-bit subvector broadcast of a <2 x double> into a <8 x double>, where
; some result lanes are unconstrained: shuffle index 3 reads the undef second
; operand and one index is explicitly undef, so the defined lanes form the
; repeating [0,1] pattern and a full subvector broadcast is still a valid
; lowering. AVX1/AVX2 build one ymm with vbroadcastf128 and copy it to the
; second return register; AVX512 emits a single zmm vbroadcastf32x4.
define <8 x double> @broadcast_v8f64_v2f64_u1u10101(<2 x double>* %vp) {
; X86-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 3, i32 1, i32 undef, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %res
}
1507
; 128-bit subvector broadcast of a <2 x double> into a <8 x double>, with a
; defined lane 0 followed by three undef lanes. All defined lanes agree with
; a repeated [0,1] subvector, so the whole shuffle lowers to a subvector
; broadcast exactly as in the u1u10101 case above: vbroadcastf128 + ymm copy
; on AVX1/AVX2, a single vbroadcastf32x4 to zmm on AVX512.
define <8 x double> @broadcast_v8f64_v2f64_0uuu0101(<2 x double>* %vp) {
; X86-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %res
}
1536