; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512F
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512BW
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512DQ

;
; Subvector Load + Broadcast
;

; Load a <2 x double> subvector and splat it into both 128-bit lanes of a
; <4 x double> result (mask repeats elements 0,1).
define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
; X32-LABEL: test_broadcast_2f64_4f64:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_2f64_4f64:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %1 = load <2 x double>, <2 x double> *%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %2
}

; Load a <2 x double> subvector and splat it into all four 128-bit lanes of a
; <8 x double> result (mask repeats elements 0,1 four times).
define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2f64_8f64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_2f64_8f64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <2 x double>, <2 x double> *%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %2
}

; Load a <4 x double> subvector and splat it into both 256-bit halves of a
; <8 x double> result (mask repeats elements 0..3 twice).
define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4f64_8f64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4f64_8f64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <4 x double>, <4 x double> *%p
  %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x double> %2
}

; Load a <2 x i64> subvector and splat it into both 128-bit lanes of a
; <4 x i64> result (mask repeats elements 0,1).
define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2i64_4i64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_2i64_4i64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_4i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64> *%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %2
}

; Load a <2 x i64> subvector and splat it into all four 128-bit lanes of a
; <8 x i64> result (mask repeats elements 0,1 four times).
define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2i64_8i64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_2i64_8i64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64> *%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x i64> %2
}

; Load a <4 x i64> subvector and splat it into both 256-bit halves of a
; <8 x i64> result (mask repeats elements 0..3 twice).
define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4i64_8i64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4i64_8i64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i64>, <4 x i64> *%p
  %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i64> %2
}

; Load a <4 x float> subvector and splat it into both 128-bit lanes of a
; <8 x float> result (mask repeats elements 0..3).
define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X32-LABEL: test_broadcast_4f32_8f32:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float> *%p
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %2
}

; Load a <4 x float> subvector and splat it into all four 128-bit lanes of a
; <16 x float> result (mask repeats elements 0..3 four times).
define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4f32_16f32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4f32_16f32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <4 x float>, <4 x float> *%p
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x float> %2
}

; Load a <8 x float> subvector and splat it into both 256-bit halves of a
; <16 x float> result (mask repeats elements 0..7 twice).
define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8f32_16f32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_8f32_16f32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <8 x float>, <8 x float> *%p
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x float> %2
}

; Load a <4 x i32> subvector and splat it into both 128-bit lanes of a
; <8 x i32> result (mask repeats elements 0..3).
define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4i32_8i32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_8i32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32> *%p
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

; Load a <4 x i32> subvector and splat it into all four 128-bit lanes of a
; <16 x i32> result (mask repeats elements 0..3 four times).
define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4i32_16i32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_16i32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32> *%p
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i32> %2
}

; Load a <8 x i32> subvector and splat it into both 256-bit halves of a
; <16 x i32> result (mask repeats elements 0..7 twice).
define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8i32_16i32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_8i32_16i32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32> *%p
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i32> %2
}

; Load a <8 x i16> subvector and splat it into both 128-bit lanes of a
; <16 x i16> result (mask repeats elements 0..7).
define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8i16_16i16:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_8i16_16i16:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16> *%p
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %2
}

; Load a <8 x i16> subvector and splat it into all four 128-bit lanes of a
; <32 x i16> result (mask repeats elements 0..7 four times). Checked per
; AVX512 subtarget because only AVX512BW has legal 512-bit i16 vectors.
define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8i16_32i16:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_8i16_32i16:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_8i16_32i16:
; X32-AVX512BW:       # %bb.0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
; X32-AVX512DQ:       # %bb.0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16> *%p
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <32 x i16> %2
}

; Load a <16 x i16> subvector and splat it into both 256-bit halves of a
; <32 x i16> result (mask repeats elements 0..15 twice). Checked per
; AVX512 subtarget because only AVX512BW has legal 512-bit i16 vectors.
define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_16i16_32i16:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_16i16_32i16:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_16i16_32i16:
; X32-AVX512BW:       # %bb.0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
; X32-AVX512DQ:       # %bb.0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
  %1 = load <16 x i16>, <16 x i16> *%p
  %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i16> %2
}

; Load a <16 x i8> subvector and splat it into both 128-bit lanes of a
; <32 x i8> result (mask repeats elements 0..15).
define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_16i8_32i8:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_16i8_32i8:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8> *%p
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i8> %2
}

; Load a <16 x i8> subvector and splat it into all four 128-bit lanes of a
; <64 x i8> result (mask repeats elements 0..15 four times). Checked per
; AVX512 subtarget because only AVX512BW has legal 512-bit i8 vectors.
define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_16i8_64i8:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_16i8_64i8:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_16i8_64i8:
; X32-AVX512BW:       # %bb.0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
; X32-AVX512DQ:       # %bb.0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8> *%p
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <64 x i8> %2
}

; Load a <32 x i8> subvector and splat it into both 256-bit halves of a
; <64 x i8> result (mask repeats elements 0..31 twice). Checked per
; AVX512 subtarget because only AVX512BW has legal 512-bit i8 vectors.
define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_32i8_64i8:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_32i8_64i8:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_32i8_64i8:
; X32-AVX512BW:       # %bb.0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
; X32-AVX512DQ:       # %bb.0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_32i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
  %1 = load <32 x i8>, <32 x i8> *%p
  %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <64 x i8> %2
}

;
; Subvector Load + Broadcast + Store
;

; Load a <2 x double>, store it back out through %p1, and also broadcast it to
; a <4 x double> — exercises reuse of the loaded value by both store and shuffle.
define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
; X32-LABEL: test_broadcast_2f64_4f64_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm0
; X32-NEXT:    vmovaps %xmm0, (%eax)
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_2f64_4f64_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = load <2 x double>, <2 x double>* %p0
  store <2 x double> %1, <2 x double>* %p1
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %2
}

; Load a <2 x i64>, store it back out through %p1, and also broadcast it to
; a <4 x i64> — exercises reuse of the loaded value by both store and shuffle.
define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
; X32-LABEL: test_broadcast_2i64_4i64_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm0
; X32-NEXT:    vmovaps %xmm0, (%eax)
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_2i64_4i64_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64>* %p0
  store <2 x i64> %1, <2 x i64>* %p1
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %2
}

; Load a <4 x float>, store it back out through %p1, and also broadcast it to
; a <8 x float> — exercises reuse of the loaded value by both store and shuffle.
define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
; X32-LABEL: test_broadcast_4f32_8f32_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm0
; X32-NEXT:    vmovaps %xmm0, (%eax)
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %p0
  store <4 x float> %1, <4 x float>* %p1
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %2
}

; Load a <4 x i32>, store it back out through %p1, and also broadcast it to
; a <8 x i32> — exercises reuse of the loaded value by both store and shuffle.
define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
; X32-LABEL: test_broadcast_4i32_8i32_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm0
; X32-NEXT:    vmovaps %xmm0, (%eax)
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_4i32_8i32_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x i32> %1, <4 x i32>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

; Load a <8 x i16>, store it back out through %p1, and also broadcast it to
; a <16 x i16> — exercises reuse of the loaded value by both store and shuffle.
define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
; X32-LABEL: test_broadcast_8i16_16i16_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm0
; X32-NEXT:    vmovaps %xmm0, (%eax)
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_8i16_16i16_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16> *%p0
  store <8 x i16> %1, <8 x i16>* %p1
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %2
}

; Load a <16 x i8>, store it back out through %p1, and also broadcast it to
; a <32 x i8> — exercises reuse of the loaded value by both store and shuffle.
define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
; X32-LABEL: test_broadcast_16i8_32i8_reuse:
; X32:       # %bb.0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm0
; X32-NEXT:    vmovaps %xmm0, (%eax)
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_16i8_32i8_reuse:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8> *%p0
  store <16 x i8> %1, <16 x i8>* %p1
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i8> %2
}

745;
746; Subvector Load + Broadcast with Separate Store
747;
748
; The intervening store writes an unrelated zero vector, so the loaded
; subvector has a single use and the load can be folded straight into a
; vbroadcastf128 / vbroadcasti128 despite the store being chained between
; the load and the shuffle.
define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; X32-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovaps %xmm1, (%eax)
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT:    vmovaps %xmm1, (%eax)
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x float> zeroinitializer, <4 x float>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}
786
; 512-bit variant of the chained-store test: the load folds into
; vbroadcasti32x4 on AVX512, or a ymm vbroadcastf128 (returned in two ymm
; registers) on plain AVX, with the unrelated zero store kept separate.
define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; X32-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovaps %xmm1, (%eax)
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    vmovaps %xmm1, (%eax)
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x float> zeroinitializer, <4 x float>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i32> %2
}
826
827;
; Subvector load with multiple uses + broadcast.
; Codegen should fall back to materializing the full-width broadcast.
830;
831
; Output globals written by @fallback_broadcast_v4i64_to_v8i64.
@ga4 = global <4 x i64> zeroinitializer, align 8
@gb4 = global <8 x i64> zeroinitializer, align 8
834
; Both the <4 x i64> add and the <8 x i64> add/and use constants built from
; the same repeating <1,2,3,4> pattern, so the constant vector has multiple
; uses; codegen is expected to fall back to materializing the full-width
; constant (e.g. vinserti64x4 of the ymm constant on X64 AVX512) rather than
; a subvector broadcast.  Results are stored to @ga4 and @gb4.
define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
; X32-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX1:       # %bb.0: # %entry
; X32-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; X32-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [3,0,4,0]
; X32-AVX1-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
; X32-AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,0,2,0]
; X32-AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
; X32-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X32-AVX1-NEXT:    vmovaps {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
; X32-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
; X32-AVX1-NEXT:    vpaddq %xmm4, %xmm6, %xmm6
; X32-AVX1-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
; X32-AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
; X32-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
; X32-AVX1-NEXT:    vpaddq %xmm4, %xmm6, %xmm4
; X32-AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
; X32-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; X32-AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
; X32-AVX1-NEXT:    vandps %ymm3, %ymm2, %ymm2
; X32-AVX1-NEXT:    vmovups %ymm0, ga4
; X32-AVX1-NEXT:    vmovups %ymm2, gb4+32
; X32-AVX1-NEXT:    vmovups %ymm1, gb4
; X32-AVX1-NEXT:    vzeroupper
; X32-AVX1-NEXT:    retl
;
; X32-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX2:       # %bb.0: # %entry
; X32-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
; X32-AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
; X32-AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; X32-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; X32-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT:    vmovdqu %ymm0, ga4
; X32-AVX2-NEXT:    vmovdqu %ymm2, gb4+32
; X32-AVX2-NEXT:    vmovdqu %ymm1, gb4
; X32-AVX2-NEXT:    vzeroupper
; X32-AVX2-NEXT:    retl
;
; X32-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX512:       # %bb.0: # %entry
; X32-AVX512-NEXT:    vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0]
; X32-AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT:    vmovdqu %ymm0, ga4
; X32-AVX512-NEXT:    vmovdqu64 %zmm1, gb4
; X32-AVX512-NEXT:    vzeroupper
; X32-AVX512-NEXT:    retl
;
; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [3,4]
; X64-AVX1-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,2]
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X64-AVX1-NEXT:    vmovaps {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
; X64-AVX1-NEXT:    vpaddq %xmm4, %xmm6, %xmm6
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
; X64-AVX1-NEXT:    vpaddq %xmm4, %xmm6, %xmm4
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; X64-AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
; X64-AVX1-NEXT:    vandps %ymm3, %ymm2, %ymm2
; X64-AVX1-NEXT:    vmovups %ymm0, {{.*}}(%rip)
; X64-AVX1-NEXT:    vmovups %ymm2, gb4+{{.*}}(%rip)
; X64-AVX1-NEXT:    vmovups %ymm1, {{.*}}(%rip)
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX2:       # %bb.0: # %entry
; X64-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT:    vmovdqu %ymm0, {{.*}}(%rip)
; X64-AVX2-NEXT:    vmovdqu %ymm2, gb4+{{.*}}(%rip)
; X64-AVX2-NEXT:    vmovdqu %ymm1, {{.*}}(%rip)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX512:       # %bb.0: # %entry
; X64-AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,3,4]
; X64-AVX512-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
; X64-AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vmovdqu %ymm0, {{.*}}(%rip)
; X64-AVX512-NEXT:    vmovdqu64 %zmm1, {{.*}}(%rip)
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
entry:
  %0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
  %1 = add <8 x i64> %b, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
  %2 = and <8 x i64> %1, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
  store <4 x i64> %0, <4 x i64>* @ga4, align 8
  store <8 x i64> %2, <8 x i64>* @gb4, align 8
  ret void
}
944
945
; Output globals written by @fallback_broadcast_v4f64_to_v8f64.
@ga2 = global <4 x double> zeroinitializer, align 8
@gb2 = global <8 x double> zeroinitializer, align 8
948
; Floating-point version of the fallback test: the <1.0,2.0,3.0,4.0> constant
; feeds both the 256-bit fadd and the repeated 512-bit fadd/fdiv operands, so
; AVX512 builds the zmm constant with vinsertf64x4 from the shared ymm value
; instead of a subvector broadcast.  Results are stored to @ga2 and @gb2.
define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
; X32-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X32-AVX:       # %bb.0: # %entry
; X32-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X32-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
; X32-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
; X32-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; X32-AVX-NEXT:    vdivpd %ymm3, %ymm1, %ymm1
; X32-AVX-NEXT:    vdivpd %ymm3, %ymm2, %ymm2
; X32-AVX-NEXT:    vmovupd %ymm0, ga2
; X32-AVX-NEXT:    vmovupd %ymm2, gb2+32
; X32-AVX-NEXT:    vmovupd %ymm1, gb2
; X32-AVX-NEXT:    vzeroupper
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X32-AVX512:       # %bb.0: # %entry
; X32-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X32-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
; X32-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT:    vmovupd %ymm0, ga2
; X32-AVX512-NEXT:    vmovupd %zmm1, gb2
; X32-AVX512-NEXT:    vzeroupper
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT:    vdivpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT:    vdivpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT:    vmovupd %ymm0, {{.*}}(%rip)
; X64-AVX-NEXT:    vmovupd %ymm2, gb2+{{.*}}(%rip)
; X64-AVX-NEXT:    vmovupd %ymm1, {{.*}}(%rip)
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX512:       # %bb.0: # %entry
; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X64-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
; X64-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vmovupd %ymm0, {{.*}}(%rip)
; X64-AVX512-NEXT:    vmovupd %zmm1, {{.*}}(%rip)
; X64-AVX512-NEXT:    vzeroupper
; X64-AVX512-NEXT:    retq
entry:
  %0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0>
  %1 = fadd <8 x double> %b, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
  %2 = fdiv <8 x double> %1, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
  store <4 x double> %0, <4 x double>* @ga2, align 8
  store <8 x double> %2, <8 x double>* @gb2, align 8
  ret void
}
1009
1010;
1011; Subvector Broadcast from register
1012;
1013
; Broadcast a <2 x double> register to <4 x double>: a single vinsertf128
; duplicates the low xmm into the upper ymm lane.
define <4 x double> @reg_broadcast_2f64_4f64(<2 x double> %a0) nounwind {
; X32-LABEL: reg_broadcast_2f64_4f64:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: reg_broadcast_2f64_4f64:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x double> %1
}
1029
; Broadcast a <2 x double> register to <8 x double>: AVX512 widens to zmm via
; vinsertf128 + vinsertf64x4; plain AVX returns the ymm duplicated in ymm0/ymm1.
define <8 x double> @reg_broadcast_2f64_8f64(<2 x double> %a0) nounwind {
; X32-AVX-LABEL: reg_broadcast_2f64_8f64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: reg_broadcast_2f64_8f64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_2f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_2f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ret <8 x double> %1
}
1061
; Broadcast a <4 x double> register to <8 x double>: AVX512 uses vinsertf64x4
; into zmm; plain AVX just copies ymm0 to ymm1 for the second return half.
define <8 x double> @reg_broadcast_4f64_8f64(<4 x double> %a0) nounwind {
; X32-AVX-LABEL: reg_broadcast_4f64_8f64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: reg_broadcast_4f64_8f64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <4 x double> %a0, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x double> %1
}
1087
; Broadcast a <2 x i64> register to <4 x i64>; the float-domain vinsertf128 is
; used even for integer data on all tested subtargets.
define <4 x i64> @reg_broadcast_2i64_4i64(<2 x i64> %a0) nounwind {
; X32-LABEL: reg_broadcast_2i64_4i64:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: reg_broadcast_2i64_4i64:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x i64> %1
}
1103
; Broadcast a <2 x i64> register to <8 x i64>: AVX512 builds the zmm with
; vinsertf128 + vinsertf64x4; plain AVX duplicates the ymm into ymm0/ymm1.
define <8 x i64> @reg_broadcast_2i64_8i64(<2 x i64> %a0) nounwind {
; X32-AVX-LABEL: reg_broadcast_2i64_8i64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: reg_broadcast_2i64_8i64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_2i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_2i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ret <8 x i64> %1
}
1135
; Broadcast a <4 x i64> register to <8 x i64>: vinsertf64x4 on AVX512, a
; simple ymm copy on plain AVX.
define <8 x i64> @reg_broadcast_4i64_8i64(<4 x i64> %a0) nounwind {
; X32-AVX-LABEL: reg_broadcast_4i64_8i64:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: reg_broadcast_4i64_8i64:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i64> %1
}
1161
; Broadcast a <4 x float> register to <8 x float> with a single vinsertf128.
define <8 x float> @reg_broadcast_4f32_8f32(<4 x float> %a0) nounwind {
; X32-LABEL: reg_broadcast_4f32_8f32:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: reg_broadcast_4f32_8f32:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x float> %1
}
1177
; Broadcast a <4 x float> register to <16 x float>: AVX512 widens to zmm via
; vinsertf128 + vinsertf64x4; plain AVX duplicates the ymm into ymm0/ymm1.
define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind {
; X32-AVX-LABEL: reg_broadcast_4f32_16f32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: reg_broadcast_4f32_16f32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x float> %1
}
1209
; Broadcast an <8 x float> register to <16 x float>: vinsertf64x4 on AVX512,
; a ymm copy on plain AVX.
define <16 x float> @reg_broadcast_8f32_16f32(<8 x float> %a0) nounwind {
; X32-AVX-LABEL: reg_broadcast_8f32_16f32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: reg_broadcast_8f32_16f32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x float> %1
}
1235
; Broadcast a <4 x i32> register to <8 x i32> with a single vinsertf128.
define <8 x i32> @reg_broadcast_4i32_8i32(<4 x i32> %a0) nounwind {
; X32-LABEL: reg_broadcast_4i32_8i32:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: reg_broadcast_4i32_8i32:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %1
}
1251
; Broadcast a <4 x i32> register to <16 x i32>: AVX512 widens to zmm via
; vinsertf128 + vinsertf64x4; plain AVX duplicates the ymm into ymm0/ymm1.
define <16 x i32> @reg_broadcast_4i32_16i32(<4 x i32> %a0) nounwind {
; X32-AVX-LABEL: reg_broadcast_4i32_16i32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: reg_broadcast_4i32_16i32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_4i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_4i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x i32> %1
}
1283
; Broadcast an <8 x i32> register to <16 x i32>: vinsertf64x4 on AVX512, a
; ymm copy on plain AVX.
define <16 x i32> @reg_broadcast_8i32_16i32(<8 x i32> %a0) nounwind {
; X32-AVX-LABEL: reg_broadcast_8i32_16i32:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: reg_broadcast_8i32_16i32:
; X32-AVX512:       # %bb.0:
; X32-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_8i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i32> %1
}
1309
; Broadcast an <8 x i16> register to <16 x i16> with a single vinsertf128.
define <16 x i16> @reg_broadcast_8i16_16i16(<8 x i16> %a0) nounwind {
; X32-LABEL: reg_broadcast_8i16_16i16:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: reg_broadcast_8i16_16i16:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <32 x i16> %1
}
1325
; Broadcast an <8 x i16> register to <32 x i16>.  Only AVX512BW has 512-bit
; i16 vectors as a legal type, so only the BW configs build a zmm; AVX, F and
; DQ configs return the broadcast in two ymm registers instead.
define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind {
; X32-AVX-LABEL: reg_broadcast_8i16_32i16:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: reg_broadcast_8i16_32i16:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: reg_broadcast_8i16_32i16:
; X32-AVX512BW:       # %bb.0:
; X32-AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: reg_broadcast_8i16_32i16:
; X32-AVX512DQ:       # %bb.0:
; X32-AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <32 x i16> %1
}
1385
; Broadcast a <16 x i16> register to <32 x i16>.  Only the AVX512BW configs
; have legal 512-bit i16 vectors and build a zmm with vinsertf64x4; AVX, F and
; DQ configs just copy ymm0 to ymm1.
define <32 x i16> @reg_broadcast_16i16_32i16(<16 x i16> %a0) nounwind {
; X32-AVX-LABEL: reg_broadcast_16i16_32i16:
; X32-AVX:       # %bb.0:
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: reg_broadcast_16i16_32i16:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: reg_broadcast_16i16_32i16:
; X32-AVX512BW:       # %bb.0:
; X32-AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X32-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: reg_broadcast_16i16_32i16:
; X32-AVX512DQ:       # %bb.0:
; X32-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX512F:       # %bb.0:
; X64-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX512BW:       # %bb.0:
; X64-AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX512DQ:       # %bb.0:
; X64-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
 %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i16> %1
}
1431
; Broadcast a <16 x i8> register to <32 x i8> with a single vinsertf128.
define <32 x i8> @reg_broadcast_16i8_32i8(<16 x i8> %a0) nounwind {
; X32-LABEL: reg_broadcast_16i8_32i8:
; X32:       # %bb.0:
; X32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: reg_broadcast_16i8_32i8:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i8> %1
}
1447
1448define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind {
1449; X32-AVX-LABEL: reg_broadcast_16i8_64i8:
1450; X32-AVX:       # %bb.0:
1451; X32-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1452; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1453; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
1454; X32-AVX-NEXT:    retl
1455;
1456; X32-AVX512F-LABEL: reg_broadcast_16i8_64i8:
1457; X32-AVX512F:       # %bb.0:
1458; X32-AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1459; X32-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1460; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
1461; X32-AVX512F-NEXT:    retl
1462;
1463; X32-AVX512BW-LABEL: reg_broadcast_16i8_64i8:
1464; X32-AVX512BW:       # %bb.0:
1465; X32-AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1466; X32-AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1467; X32-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1468; X32-AVX512BW-NEXT:    retl
1469;
1470; X32-AVX512DQ-LABEL: reg_broadcast_16i8_64i8:
1471; X32-AVX512DQ:       # %bb.0:
1472; X32-AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1473; X32-AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1474; X32-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
1475; X32-AVX512DQ-NEXT:    retl
1476;
1477; X64-AVX-LABEL: reg_broadcast_16i8_64i8:
1478; X64-AVX:       # %bb.0:
1479; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1480; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1481; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
1482; X64-AVX-NEXT:    retq
1483;
1484; X64-AVX512F-LABEL: reg_broadcast_16i8_64i8:
1485; X64-AVX512F:       # %bb.0:
1486; X64-AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1487; X64-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1488; X64-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
1489; X64-AVX512F-NEXT:    retq
1490;
1491; X64-AVX512BW-LABEL: reg_broadcast_16i8_64i8:
1492; X64-AVX512BW:       # %bb.0:
1493; X64-AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1494; X64-AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1495; X64-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1496; X64-AVX512BW-NEXT:    retq
1497;
1498; X64-AVX512DQ-LABEL: reg_broadcast_16i8_64i8:
1499; X64-AVX512DQ:       # %bb.0:
1500; X64-AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1501; X64-AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1502; X64-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
1503; X64-AVX512DQ-NEXT:    retq
; Broadcast an in-register 128-bit vector into all four quarters of a 512-bit
; result: the mask repeats lanes 0..15 four times. Only AVX512BW has a legal
; 64 x i8 type, so it builds a single zmm (vinsertf128 + vinsertf64x4); the
; other configurations return the value as a ymm0/ymm1 pair (hence the final
; vmovaps copy in those check blocks).
1504 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1505 ret <64 x i8> %1
1506}
1507
1508define <64 x i8> @reg_broadcast_32i8_64i8(<32 x i8> %a0) nounwind {
1509; X32-AVX-LABEL: reg_broadcast_32i8_64i8:
1510; X32-AVX:       # %bb.0:
1511; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
1512; X32-AVX-NEXT:    retl
1513;
1514; X32-AVX512F-LABEL: reg_broadcast_32i8_64i8:
1515; X32-AVX512F:       # %bb.0:
1516; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
1517; X32-AVX512F-NEXT:    retl
1518;
1519; X32-AVX512BW-LABEL: reg_broadcast_32i8_64i8:
1520; X32-AVX512BW:       # %bb.0:
1521; X32-AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1522; X32-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1523; X32-AVX512BW-NEXT:    retl
1524;
1525; X32-AVX512DQ-LABEL: reg_broadcast_32i8_64i8:
1526; X32-AVX512DQ:       # %bb.0:
1527; X32-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
1528; X32-AVX512DQ-NEXT:    retl
1529;
1530; X64-AVX-LABEL: reg_broadcast_32i8_64i8:
1531; X64-AVX:       # %bb.0:
1532; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
1533; X64-AVX-NEXT:    retq
1534;
1535; X64-AVX512F-LABEL: reg_broadcast_32i8_64i8:
1536; X64-AVX512F:       # %bb.0:
1537; X64-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
1538; X64-AVX512F-NEXT:    retq
1539;
1540; X64-AVX512BW-LABEL: reg_broadcast_32i8_64i8:
1541; X64-AVX512BW:       # %bb.0:
1542; X64-AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1543; X64-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1544; X64-AVX512BW-NEXT:    retq
1545;
1546; X64-AVX512DQ-LABEL: reg_broadcast_32i8_64i8:
1547; X64-AVX512DQ:       # %bb.0:
1548; X64-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
1549; X64-AVX512DQ-NEXT:    retq
; Broadcast an in-register 256-bit vector to both halves of a 512-bit result:
; the mask repeats lanes 0..31 twice. AVX512BW (legal 64 x i8) emits a single
; vinsertf64x4 into a zmm; all other configurations just duplicate ymm0 into
; ymm1 to form the two-register return value.
1550 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1551 ret <64 x i8> %1
1552}
1553